Commit d1f714ea by Ting PAN

Apply the dispatcher to RunImpl

1 parent bd84b7fd
Showing with 668 additions and 868 deletions
@@ -35,6 +35,7 @@
 #include "core/types.h"
 #include "proto/dragon.pb.h"
+#include "utils/string.h"
 #include "utils/logging.h"
 namespace dragon {
...
@@ -85,6 +85,8 @@ GraphBase* NewGraph(
 const GraphDef& def,
 Workspace* ws);
+/* Macros */
 DECLARE_REGISTRY(
 GraphRegistry,
 GraphBase,
...
@@ -43,7 +43,7 @@ class GraphGradientMaker {
 bool CheckGrad(
 const OperatorDef& forward_op,
 const Set<string>& targets,
-vector< pair<string, int> >& gen_grads);
+vector<pair<string, int>>& gen_grads);
 string GetOperatorName();
...
@@ -100,7 +100,7 @@ class OperatorBase {
 /*! \brief Return the specified argument */
 const Argument& arg(const string& name) { return *(args_[name]); }
-typedef Map<string, vector<OperatorBase*> > SubGraph;
+typedef Map<string, vector<OperatorBase*>> SubGraph;
 /*! \brief Return the recomputing subgraph of this operator */
 SubGraph& subgraph() { return subgraph_; }
@@ -221,7 +221,7 @@ OperatorBase* NewOperator(
 const OperatorDef& def,
 Workspace* ws);
-/*! Macros */
+/* Macros */
 #define OpArg OperatorBase::Arg
 #define OpArgs OperatorBase::Args
@@ -266,7 +266,7 @@ DECLARE_REGISTRY(
 const OperatorDef&,
 Workspace*);
-/*! NVIDIA's Accelerated Library - CUDNN */
+/* NVIDIA's Accelerated Library - CUDNN */
 DECLARE_REGISTRY(
 CUDNNOperatorRegistry,
@@ -274,7 +274,7 @@ DECLARE_REGISTRY(
 const OperatorDef&,
 Workspace*);
-/*! CAMBRICON's Accelerated Library - CNML */
+/* CAMBRICON's Accelerated Library - CNML */
 DECLARE_REGISTRY(
 CNMLOperatorRegistry,
@@ -282,13 +282,60 @@ DECLARE_REGISTRY(
 const OperatorDef&,
 Workspace*);
+/* Dispatcher for Runtime Typed-Implementation */
+#define XIsType(x, dtype) \
+x.template IsType<dtype>()
+template <typename... Types>
+struct TensorTypes {};
+template <typename Sizes, typename... Args>
+struct DispatchHelper;
+#define DEFINE_TENSOR_TYPES_DISPATCHER(TensorTypes, Impl) \
+template <typename T, typename... Types, typename... Args> \
+struct DispatchHelper<TensorTypes<T, Types...>, Args...> { \
+template <typename Op> \
+static void Call(Op* op, const TypeMeta& meta, string& types) { \
+if (meta.Match<T>()) return op->template Impl<T, Args...>(); \
+types += " * " + TypeToString<T>() + ",\n"; \
+return DispatchHelper<TensorTypes<Types...>, Args...> \
+::Call(op, meta, types); \
+} \
+template <typename Op> \
+static void Call(Op* op, const Tensor& tensor) { \
+string types; return Call(op, tensor.meta(), types); \
+} \
+}; \
+template <typename... Args> \
+struct DispatchHelper<TensorTypes<>, Args...> { \
+template <typename Op> \
+static void Call(Op* op, const TypeMeta& meta, string& types) { \
+LOG(FATAL) << "Unsupported DType: " \
+<< TypeMetaToString(meta) << "\n" \
+<< "<" << op->type() << "Op>" \
+<< " supports the following dtypes: {\n" \
+<< types << "}"; \
+} \
+template <typename Op> \
+static void Call(Op* op, const Tensor& tensor) { \
+return Call(op, tensor.meta(), ""); \
+} \
+};
+DEFINE_TENSOR_TYPES_DISPATCHER(TensorTypes, RunImpl);
+#undef DEFINE_TENSOR_TYPES_DISPATCHER
+/* TensorFiller */
 #define TENSOR_FILL_WITH_TYPE(tensor, shape, type) \
 if (tensor.count() == 0) { \
 CHECK(ws()->GetFiller(tensor.name())) \
 << "\nTensor(" << tensor.name() << ") is empty. \n" \
 << "may be specify a filler for it ?"; \
 tensor.Reshape(shape); \
-unique_ptr< Filler<type, Context> > filler( \
+unique_ptr<Filler<type, Context>> filler( \
 CreateFiller<type, Context>(*ws()->GetFiller(tensor.name()))); \
 filler->Fill(&tensor, ctx()); \
 } else { \
@@ -308,7 +355,7 @@ DECLARE_REGISTRY(
 << "\nTensor(" << tensor.name() << ") is empty. \n" \
 << "may be specify a filler for it ?"; \
 tensor.Reshape(shape); \
-unique_ptr< Filler<T, Context> > filler( \
+unique_ptr<Filler<T, Context>> filler( \
 CreateFiller<T, Context>(*ws()->GetFiller(tensor.name()))); \
 filler->Fill(&tensor, ctx()); \
 } else { \
@@ -322,6 +369,8 @@ DECLARE_REGISTRY(
 tensor.Reshape(shape); \
 }
+/* Shared Multiplier */
 #define DECLARE_MULTIPLIER(name, size) \
 const T* name; \
 { \
@@ -335,6 +384,8 @@ DECLARE_REGISTRY(
 name = mp->template data<T, Context>(); \
 }
+/* Dynamic Arguments */
 #define DECLARE_ARG_WITH_DESC(type, arg) \
 type arg##_; \
 string arg##_desc_; \
@@ -393,8 +444,7 @@ DECLARE_REGISTRY(
 #define GET_ARGS_SIZE(arg) \
 (int)std::max(arg##_.size(), arg##_desc_.size())
-#define XIsType(x, dtype) \
-x.template IsType<dtype>()
+/* Registers */
 #define INSTANTIATE_OPERATOR(name, context) \
 template class name##Op<context>;
...
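The block above is the core of this commit: DEFINE_TENSOR_TYPES_DISPATCHER(TensorTypes, RunImpl) expands into a DispatchHelper that walks the TensorTypes<...> pack, matches each candidate type against a tensor's TypeMeta, and forwards to the operator's typed RunImpl<T>(); if nothing matches, the empty-pack specialization aborts with the supported-dtype list assembled via the new TypeToString<T>(). A minimal sketch of the intended call pattern follows; MyOp, the Input(0) accessor, and the chosen type list are illustrative assumptions, not code from this diff.

    // Sketch only: routing RunOnDevice() through the new dispatcher.
    template <class Context>
    void MyOp<Context>::RunOnDevice() {
        // Selects RunImpl<float> or RunImpl<int64_t> from the dtype of the
        // first input; unsupported dtypes hit the LOG(FATAL) specialization.
        DispatchHelper<TensorTypes<float, int64_t>>::Call(this, Input(0));
    }

    template <class Context> template <typename T>
    void MyOp<Context>::RunImpl() {
        // The per-dtype implementation body lives here.
    }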
@@ -42,7 +42,7 @@ class OpSchema {
 return *this;
 }
-OpSchema& Inplace(set<pair<int, int> > inplace);
+OpSchema& Inplace(set<pair<int, int>> inplace);
 std::function<bool(int, int)> CheckInplace;
 bool AllowInplace() const { return allow_inplace_; }
...
@@ -73,6 +73,11 @@ inline const std::string TypeMetaToString(
 m2s_type_map[meta.id()] : "unknown";
 }
+template<typename T>
+inline const std::string TypeToString() {
+return TypeMetaToString(TypeMeta::Make<T>());
+}
 } // namespace dragon
 #endif // DRAGON_CORE_TYPES_H_
\ No newline at end of file
@@ -13,22 +13,18 @@
 #ifndef DRAGON_CORE_WORKSPACE_H_
 #define DRAGON_CORE_WORKSPACE_H_
-#include "core/common.h"
 #include "core/graph.h"
-#include "utils/string.h"
 namespace dragon {
 class Workspace {
 public:
-typedef Map<string, Map<string, int64_t> > DummyNameMap;
-typedef Map<string, unique_ptr<Tensor> > TensorMap;
+typedef Map<string, Map<string, int64_t>> DummyNameMap;
+typedef Map<string, unique_ptr<Tensor>> TensorMap;
 typedef Map<string, string> TensorAliasMap;
 typedef Map<string, TensorFillerProto> TensorFillerMap;
-typedef Map<string, unique_ptr<OperatorBase> > OperatorMap;
-typedef Map<string, unique_ptr<GraphBase> > GraphMap;
+typedef Map<string, unique_ptr<OperatorBase>> OperatorMap;
+typedef Map<string, unique_ptr<GraphBase>> GraphMap;
 /*! \brief Constructor */
 Workspace(const string& name) : name_(name) { Initialize(); }
...
@@ -28,6 +28,7 @@ class FullyConnectedOp final : public Operator<Context> {
 USE_OPERATOR_FUNCTIONS;
 void RunOnDevice();
+template <typename T> void RunImpl();
 template <typename T> void TransRunImpl();
 template <typename T> void NoTransRunImpl();
...
@@ -22,6 +22,7 @@ class MultinomialOp final : public Operator<Context> {
 public:
 MultinomialOp(const OperatorDef& def, Workspace* ws)
 : Operator<Context>(def, ws),
+eps_(OpArg<float>("eps", 0.f)),
 normalize_(OpArg<int64_t>("normalize", 0)),
 num_samples_(OpArg<int64_t>("num_samples", 1)) {}
 USE_OPERATOR_FUNCTIONS;
@@ -32,6 +33,7 @@ class MultinomialOp final : public Operator<Context> {
 template <typename T> void RunImpl();
 protected:
+float eps_;
 int64_t outer_dim_, axis_;
 int64_t normalize_, num_samples_;
 unique_ptr<OperatorBase> softmax_op_;
...
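MultinomialOp picks up both halves of the change: a unified RunImpl<T>() that the dispatcher can target, and a new "eps" argument whose docstring below reads "The prob to a uniform sampling". Read that way, eps_ mixes a uniform draw into the multinomial draw. The standalone sketch below illustrates that sampling rule only; it is an assumption about intent, not the kernel added by this commit, and SampleRow is a hypothetical helper name.

    #include <random>
    #include <vector>

    // With probability eps pick a class uniformly at random; otherwise
    // sample proportionally to the given row of probabilities.
    int64_t SampleRow(const std::vector<float>& probs, float eps, std::mt19937& rng) {
        std::uniform_real_distribution<float> coin(0.f, 1.f);
        if (coin(rng) < eps) {
            std::uniform_int_distribution<int64_t> uniform(0, (int64_t)probs.size() - 1);
            return uniform(rng);  // uniform fallback
        }
        std::discrete_distribution<int64_t> multinomial(probs.begin(), probs.end());
        return multinomial(rng);  // multinomial draw
    }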
@@ -26,22 +26,24 @@ inline void LoadCaffeModel(
 LOG(INFO) << "Restore From Model @: " << file << "......";
 LOG(INFO) << "Model Format: CaffeModel";
 for (int i = 0; i < net_param.layer_size(); i++) {
-const LayerParameter& layer = net_param.layer(i);
-const string& layer_name = layer.name();
-string prefix = layer_name + "/param:";
+const auto& layer = net_param.layer(i);
+const auto& layer_name = layer.name();
+auto prefix = layer_name + "/param:";
 for (int j = 0; j < layer.blobs_size(); j++) {
-string tensor_name = prefix + std::to_string(j);
-if (!ws->HasTensor(tensor_name))
-LOG(WARNING) << "Tensor(" << tensor_name << ") "
-<< "does not exist in any Graphs, skip.";
-else{
-BlobProto blob = layer.blobs(j);
-vector<int64_t> dims;
-for (auto dim : blob.shape().dim()) dims.push_back(dim);
-Tensor* tensor = ws->GetTensor(tensor_name);
+auto tensor_name = prefix + std::to_string(j);
+if (!ws->HasTensor(tensor_name)) {
+LOG(WARNING)
+<< "Tensor(" << tensor_name << ") "
+<< "does not exist in any Graphs, skip.";
+} else {
+auto blob = layer.blobs(j);
+vec64_t tensor_shape;
+for (auto dim : blob.shape().dim())
+tensor_shape.push_back(dim);
+auto* tensor = ws->GetTensor(tensor_name);
 std::stringstream DimString;
-if (dims.size() > 0) {
-tensor->Reshape(dims);
+if (tensor_shape.size() > 0) {
+tensor->Reshape(tensor_shape);
 CHECK_EQ(tensor->count(), blob.data_size())
 << "\nTensor(" << tensor_name << ") "
 << "failed to load, except size: "
@@ -52,9 +54,9 @@ inline void LoadCaffeModel(
 tensor->Reshape({ blob.data_size() });
 DimString << "(missing)";
 }
-float* Xdata = tensor->mutable_data<float, CPUContext>();
-for (int idx = 0; idx < blob.data_size(); idx++)
-Xdata[idx] = blob.data(idx);
+auto* x = tensor->mutable_data<float, CPUContext>();
+for (int xi = 0; xi < blob.data_size(); ++xi)
+x[xi] = blob.data(xi);
 LOG(INFO) << "Tensor(" << tensor_name << ") "
 << "loaded, shape: " << DimString.str()
 << ", size: " << blob.data_size();
@@ -66,32 +68,33 @@ inline void LoadCaffeModel(
 inline void SavaCaffeModel(
 string file,
 const vector<Tensor*>& tensors) {
-NetParameter net_param;
+int j = -1;
+NetParameter net;
 Map<string, int> layer_hash;
-int layer_idx = -1;
 for (int i = 0; i < tensors.size(); i++) {
 if (tensors[i]->count() <= 0) continue;
-vector<string> splits = str::split(
+auto splits = str::split(
 tensors[i]->name(), "/param:");
 if (layer_hash.count(splits[0]) == 0) {
-layer_hash[splits[0]] = ++layer_idx;
-LayerParameter* layer = net_param.add_layer();
+layer_hash[splits[0]] = ++j;
+auto* layer = net.add_layer();
 layer->set_name(splits[0]);
 }
-BlobProto* blob = net_param.mutable_layer(layer_idx)->add_blobs();
-for (auto dim : tensors[i]->dims()) blob->mutable_shape()->add_dim(dim);
+auto* blob = net.mutable_layer(j)->add_blobs();
+for (auto dim : tensors[i]->dims())
+blob->mutable_shape()->add_dim(dim);
 if (XIsType((*tensors[i]), float)) {
-auto* Xdata = tensors[i]->data<float, CPUContext>();
-for (int id = 0; id < tensors[i]->count(); id++)
-blob->mutable_data()->Add(Xdata[id]);
+auto* x = tensors[i]->data<float, CPUContext>();
+for (int xi = 0; xi < tensors[i]->count(); ++xi)
+blob->mutable_data()->Add(x[xi]);
 } else if (XIsType((*tensors[i]), float16)) {
-auto* Xdata = tensors[i]->data<float16, CPUContext>();
-for (int id = 0; id < tensors[i]->count(); id++)
+auto* x = tensors[i]->data<float16, CPUContext>();
+for (int xi = 0; xi < tensors[i]->count(); ++xi)
 blob->mutable_data()->Add(
-cast::to<float>(Xdata[id]));
+cast::to<float>(x[xi]));
 }
 }
-WriteProtoToBinaryFile(net_param, file.c_str());
+WriteProtoToBinaryFile(net, file.c_str());
 LOG(INFO) << "Save the model @: " << file << "......";
 LOG(INFO) << "Model format: Caffe";
 }
...
@@ -748,7 +748,7 @@ def Arange(start, stop=None, step=1, dtype='float32', **kwargs):
 @OpSchema.Inputs(1)
-def Multinomial(inputs, num_samples=1, normalize=False, **kwargs):
+def Multinomial(inputs, num_samples=1, eps=0., normalize=False, **kwargs):
 """Return a tensor where each row contains ``num_samples``,
 sampled from the multinomial distribution.
@@ -765,6 +765,8 @@ def Multinomial(inputs, num_samples=1, normalize=False, **kwargs):
 The input tensor.
 num_samples : int, optional, default=1
 The number of samples.
+eps : float, optional, default=0.
+The prob to a uniform sampling.
 normalize : boolean, optional, default=False
 Whether to normalize the inputs.
...
@@ -987,7 +987,7 @@ def one_hot(input, depth):
 return module.forward(input)
-def multinomial(input, num_samples, out=None):
+def multinomial(input, num_samples, eps=0., out=None):
 """Return a tensor where each row contains ``num_samples``,
 sampled from the multinomial distribution.
@@ -997,8 +997,8 @@ def multinomial(input, num_samples, out=None):
 The input tensor.
 num_samples : int
 The number of samples.
-normalize : boolean, optional, default=False
-Whether to normalize the inputs.
+eps : float, optional, default=0.
+The prob to a uniform sampling.
 Returns
 -------
@@ -1008,9 +1008,11 @@ def multinomial(input, num_samples, out=None):
 """
 dev = MakeDevice(inputs=[input])
 key = 'Multinomial/{}' \
-'/num_samples:{}'.format(dev, num_samples)
+'/num_samples:{}' \
+'/eps:{}'.format(dev, num_samples, eps)
 module = get_module(
 Multinomial, key, dev,
+eps=eps,
 num_samples=num_samples,
 )
 return module.forward(input, out)
...
@@ -377,6 +377,7 @@ class Cast(BaseModule):
 class Multinomial(BaseModule):
 def __init__(self, key, dev, **kwargs):
 super(Multinomial, self).__init__(key, dev, **kwargs)
+self.eps = kwargs.get('eps', 0)
 self.num_samples = kwargs.get('num_samples', 1)
 self.register_op()
@@ -384,6 +385,7 @@ class Multinomial(BaseModule):
 self.op_meta = {
 'op_type': 'Multinomial',
 'arguments': {
+'eps': float(self.eps),
 'num_samples': self.num_samples,
 'normalize': False,
 },
...
@@ -980,7 +980,7 @@ class Tensor(object):
 """
 raise NotImplementedError('Refer torch.ops.tensor.normal_')
-def multinomial(self, num_samples, normalize=False):
+def multinomial(self, num_samples, eps=0.):
 """Return a tensor where each row contains ``num_samples``,
 sampled from the multinomial distribution.
@@ -988,8 +988,8 @@
 ----------
 num_samples : int
 The number of samples.
-normalize : boolean, optional, default=False
-Whether to normalize the inputs.
+eps : float, optional, default=0.
+The prob to a uniform sampling.
 Returns
 -------
...
@@ -81,8 +81,8 @@ void _ApplyNMS(
 CUDA_CHECK(cudaMemcpy(boxes_dev, boxes,
 boxes_nbytes, cudaMemcpyHostToDevice));
 nms_mask<T>
-<< < blocks, NMS_BLOCK_SIZE,
-0, ctx->cuda_stream() >> > (num_boxes,
+<<< blocks, NMS_BLOCK_SIZE,
+0, ctx->cuda_stream() >>> (num_boxes,
 thresh, (T*)boxes_dev, (uint64_t*)mask_dev);
 ctx->FinishDeviceCompution();
...
@@ -347,7 +347,7 @@ inline void CollectRoIs(
 const int canonical_level,
 const int canonical_scale,
 const T* rois,
-vector< vector<int64_t> >& roi_bins) {
+vector<vec64_t>& roi_bins) {
 const T* roi = rois;
 for (int i = 0; i < num_rois; ++i) {
 int bin_idx = roi_level(min_level, max_level,
@@ -360,7 +360,7 @@ inline void CollectRoIs(
 template <typename T>
 inline void DistributeRoIs(
-const vector< vector<int64_t> >& roi_bins,
+const vector<vec64_t>& roi_bins,
 const T* rois,
 vector<T*> outputs) {
 for (int i = 0; i < roi_bins.size(); i++) {
...
@@ -123,7 +123,7 @@ Graph::Graph(const GraphDef& def, Workspace* ws)
 // Recomputing-aware
 if (subgraph_indices.size() > 0) {
-Map< string, vector<OperatorBase*> > subgraph;
+Map<string, vector<OperatorBase*>> subgraph;
 for (const auto& it : subgraph_indices) {
 subgraph[it.first] = vector<OperatorBase*>();
 for (const auto& idx : subgraph_indices[it.first])
...
@@ -7,7 +7,7 @@ namespace dragon {
 bool GraphGradientMaker::CheckGrad(
 const OperatorDef& forward_op,
 const Set<string>& targets,
-vector< pair<string, int> >& gen_grads) {
+vector<pair<string, int>>& gen_grads) {
 if (NoGradientRegistry()->Has(forward_op.type())) {
 for (auto& input : forward_op.input())
 blacklist_set_.insert(input);
@@ -81,7 +81,7 @@ void GraphGradientMaker::Make(
 for (int i = (int)forward_def.size() - 1; i >= 0; --i) {
 // Collect inputs & outputs, generate RAW grad ops
 const OperatorDef& op = *forward_def[i];
-vector< pair<string, int> > gen_grads;
+vector<pair<string, int>> gen_grads;
 bool is_skip = CheckGrad(op, targets_set, gen_grads);
 vector<string> g_outputs;
 for (auto& output : op.output()) {
@@ -214,7 +214,7 @@ void GraphGradientMaker::Make(
 GraphDef GraphGradientMaker::Share(const GraphDef& input_def) {
 Set<int> invalid_ops;
 Map<string, int> ref_count;
-Map< string, pair<int, string> > ssa_map;
+Map<string, pair<int, string>> ssa_map;
 // Count the refs for detecting leaf nodes
 for (int i = 0; i < input_def.op_size(); ++i) {
 const OperatorDef& op = input_def.op(i);
...
@@ -174,7 +174,7 @@ GraphDef GraphOptimizer::MirrorStage(
 const GraphDef& input_def,
 Map<string, vec32_t >& op_indices) {
 GraphDef output_def(input_def);
-Map<string, set<int> > fake_op_indices;
+Map<string, set<int>> fake_op_indices;
 Map<string, string> rename_map;
 Map<string, int> versions;
...
@@ -54,7 +54,7 @@ OpSchema& OpSchema::NumOutputs(int n) {
 return NumOutputs(n, n);
 }
-OpSchema& OpSchema::Inplace(set< pair<int, int> > inplace) {
+OpSchema& OpSchema::Inplace(set<pair<int, int>> inplace) {
 CheckInplace = [inplace](int in, int out)->bool {
 return (inplace.count(std::make_pair(in, out)) > 0);
 };
...
...@@ -37,14 +37,10 @@ template<> void Dropout<float, CUDAContext>( ...@@ -37,14 +37,10 @@ template<> void Dropout<float, CUDAContext>(
float* y, float* y,
CUDAContext* ctx) { CUDAContext* ctx) {
auto thresh = (uint32_t)(UINT_MAX * prob); auto thresh = (uint32_t)(UINT_MAX * prob);
math::RandomUniform( math::RandomUniform(count, 0.f, 1.f, mask32, ctx);
count,
0.f, (float)UINT_MAX,
mask32, ctx
);
_Dropout _Dropout
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, count,
thresh, thresh,
scale, scale,
...@@ -85,14 +81,10 @@ template<> void Dropout<float16, CUDAContext>( ...@@ -85,14 +81,10 @@ template<> void Dropout<float16, CUDAContext>(
float16* y, float16* y,
CUDAContext* ctx) { CUDAContext* ctx) {
auto thresh = (uint32_t)(UINT_MAX * prob); auto thresh = (uint32_t)(UINT_MAX * prob);
math::RandomUniform( math::RandomUniform(count, 0.f, 1.f, mask32, ctx);
count,
0.f, (float)UINT_MAX,
mask32, ctx
);
_Dropout _Dropout
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, count,
thresh, thresh,
cast::to<half>(scale), cast::to<half>(scale),
...@@ -124,8 +116,8 @@ template <> void ApplyMask<float, uint8_t, CUDAContext>( ...@@ -124,8 +116,8 @@ template <> void ApplyMask<float, uint8_t, CUDAContext>(
float* y, float* y,
CUDAContext* ctx) { CUDAContext* ctx) {
_ApplyMask _ApplyMask
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, scale, x, mask, y count, scale, x, mask, y
); );
} }
...@@ -157,8 +149,8 @@ template <> void ApplyMask<float16, uint8_t, CUDAContext>( ...@@ -157,8 +149,8 @@ template <> void ApplyMask<float16, uint8_t, CUDAContext>(
float16* y, float16* y,
CUDAContext* ctx) { CUDAContext* ctx) {
_ApplyMaskHalf _ApplyMaskHalf
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, count,
cast::to<half>(scale), cast::to<half>(scale),
reinterpret_cast<const half*>(x), reinterpret_cast<const half*>(x),
......
...@@ -44,8 +44,8 @@ template<> void DropPath<float, CUDAContext>( ...@@ -44,8 +44,8 @@ template<> void DropPath<float, CUDAContext>(
auto nthreads = rows * cols; auto nthreads = rows * cols;
auto thresh = 1.f - (1.f / scale); auto thresh = 1.f - (1.f / scale);
_DropPath _DropPath
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, cols, thresh, scale, x, mask, y nthreads, cols, thresh, scale, x, mask, y
); );
} }
...@@ -85,8 +85,8 @@ template<> void DropPath<float16, CUDAContext>( ...@@ -85,8 +85,8 @@ template<> void DropPath<float16, CUDAContext>(
auto nthreads = rows * cols; auto nthreads = rows * cols;
auto thresh = 1.f - (1.f / scale); auto thresh = 1.f - (1.f / scale);
_DropPath _DropPath
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, cols, nthreads, cols,
thresh, thresh,
cast::to<half>(scale), cast::to<half>(scale),
......
...@@ -28,8 +28,8 @@ template<> void Elu<float, CUDAContext>( ...@@ -28,8 +28,8 @@ template<> void Elu<float, CUDAContext>(
float* y, float* y,
CUDAContext* ctx) { CUDAContext* ctx) {
_Elu _Elu
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, x, alpha, y count, x, alpha, y
); );
} }
...@@ -58,8 +58,8 @@ template<> void EluGrad<float, CUDAContext>( ...@@ -58,8 +58,8 @@ template<> void EluGrad<float, CUDAContext>(
float* dx, float* dx,
CUDAContext* ctx) { CUDAContext* ctx) {
_EluGrad _EluGrad
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, alpha, dy, y, dx count, alpha, dy, y, dx
); );
} }
......
...@@ -66,21 +66,21 @@ template<> void PRelu<float, CUDAContext>( ...@@ -66,21 +66,21 @@ template<> void PRelu<float, CUDAContext>(
CUDAContext* ctx) { CUDAContext* ctx) {
if (channel_shared) { if (channel_shared) {
_PRelu _PRelu
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, channels, dim, x, w, y count, channels, dim, x, w, y
); );
} else { } else {
if (data_format == "NCHW") { if (data_format == "NCHW") {
_PReluNCHW _PReluNCHW
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, channels, dim, x, w, y count, channels, dim, x, w, y
); );
} else if (data_format == "NHWC") { } else if (data_format == "NHWC") {
_PReluNHWC _PReluNHWC
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, channels, dim, x, w, y count, channels, dim, x, w, y
); );
} else { } else {
...@@ -152,21 +152,21 @@ template<> void PReluGrad<float, CUDAContext>( ...@@ -152,21 +152,21 @@ template<> void PReluGrad<float, CUDAContext>(
CUDAContext* ctx) { CUDAContext* ctx) {
if (channel_shared) { if (channel_shared) {
_PReluGrad _PReluGrad
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, channels, dim, dy, x, w, dx count, channels, dim, dy, x, w, dx
); );
} else { } else {
if (data_format == "NCHW") { if (data_format == "NCHW") {
_PReluGradNCHW _PReluGradNCHW
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, channels, dim, dy, x, w, dx count, channels, dim, dy, x, w, dx
); );
} else if (data_format == "NHWC") { } else if (data_format == "NHWC") {
_PReluGradNHWC _PReluGradNHWC
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, channels, dim, dy, x, w, dx count, channels, dim, dy, x, w, dx
); );
} else { } else {
...@@ -210,8 +210,8 @@ template<> void PReluWGrad<float, CUDAContext>( ...@@ -210,8 +210,8 @@ template<> void PReluWGrad<float, CUDAContext>(
CUDAContext* ctx) { CUDAContext* ctx) {
auto cdim = channels * dim; auto cdim = channels * dim;
_PReluWGradBcast _PReluWGradBcast
<< < CUDA_BLOCKS(cdim), CUDA_THREADS, <<< CUDA_BLOCKS(cdim), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
cdim, rows, row_offset, dy, x, bcast_dw cdim, rows, row_offset, dy, x, bcast_dw
); );
if (channel_shared) { if (channel_shared) {
......
...@@ -35,8 +35,8 @@ template<> void Relu<float, CUDAContext>( ...@@ -35,8 +35,8 @@ template<> void Relu<float, CUDAContext>(
float* y, float* y,
CUDAContext* ctx) { CUDAContext* ctx) {
_Relu _Relu
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, slope, x, y count, slope, x, y
); );
} }
...@@ -83,8 +83,8 @@ template<> void Relu<float16, CUDAContext>( ...@@ -83,8 +83,8 @@ template<> void Relu<float16, CUDAContext>(
CUDAContext* ctx) { CUDAContext* ctx) {
if ((count & 1) == 0) { if ((count & 1) == 0) {
_Relu _Relu
<< < CUDA_BLOCKS(count >> 1), CUDA_THREADS, <<< CUDA_BLOCKS(count >> 1), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count >> 1, count >> 1,
cast::to<half2>(slope), cast::to<half2>(slope),
reinterpret_cast<const half2*>(x), reinterpret_cast<const half2*>(x),
...@@ -92,8 +92,8 @@ template<> void Relu<float16, CUDAContext>( ...@@ -92,8 +92,8 @@ template<> void Relu<float16, CUDAContext>(
); );
} else { } else {
_Relu _Relu
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, count,
cast::to<half>(slope), cast::to<half>(slope),
reinterpret_cast<const half*>(x), reinterpret_cast<const half*>(x),
...@@ -134,8 +134,8 @@ template<> void ReluGrad<float, CUDAContext>( ...@@ -134,8 +134,8 @@ template<> void ReluGrad<float, CUDAContext>(
float* dx, float* dx,
CUDAContext* ctx) { CUDAContext* ctx) {
_ReluGrad _ReluGrad
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, slope, dy, y, dx count, slope, dy, y, dx
); );
} }
...@@ -170,8 +170,8 @@ template<> void ReluGrad<float16, CUDAContext>( ...@@ -170,8 +170,8 @@ template<> void ReluGrad<float16, CUDAContext>(
float16* dx, float16* dx,
CUDAContext* ctx) { CUDAContext* ctx) {
_ReluGrad _ReluGrad
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, slope, count, slope,
reinterpret_cast<const half*>(dy), reinterpret_cast<const half*>(dy),
reinterpret_cast<const half*>(y), reinterpret_cast<const half*>(y),
......
...@@ -34,8 +34,8 @@ template<> void SElu<float, CUDAContext>( ...@@ -34,8 +34,8 @@ template<> void SElu<float, CUDAContext>(
float* y, float* y,
CUDAContext* ctx) { CUDAContext* ctx) {
_SElu _SElu
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, x, y count, x, y
); );
} }
...@@ -63,8 +63,8 @@ template<> void SElu<float16, CUDAContext>( ...@@ -63,8 +63,8 @@ template<> void SElu<float16, CUDAContext>(
float16* y, float16* y,
CUDAContext* ctx) { CUDAContext* ctx) {
_SElu _SElu
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, count,
reinterpret_cast<const half*>(x), reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y) reinterpret_cast<half*>(y)
...@@ -99,8 +99,8 @@ template<> void SEluGrad<float, CUDAContext>( ...@@ -99,8 +99,8 @@ template<> void SEluGrad<float, CUDAContext>(
float* dx, float* dx,
CUDAContext* ctx) { CUDAContext* ctx) {
_SEluGrad _SEluGrad
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, dy, y, dx count, dy, y, dx
); );
} }
...@@ -131,8 +131,8 @@ template<> void SEluGrad<float16, CUDAContext>( ...@@ -131,8 +131,8 @@ template<> void SEluGrad<float16, CUDAContext>(
float16* dx, float16* dx,
CUDAContext* ctx) { CUDAContext* ctx) {
_SEluGrad _SEluGrad
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, count,
reinterpret_cast<const half*>(dy), reinterpret_cast<const half*>(dy),
reinterpret_cast<const half*>(y), reinterpret_cast<const half*>(y),
......
...@@ -25,8 +25,8 @@ template<> void Sigmoid<float, CUDAContext>( ...@@ -25,8 +25,8 @@ template<> void Sigmoid<float, CUDAContext>(
float* y, float* y,
CUDAContext* ctx) { CUDAContext* ctx) {
_Sigmoid _Sigmoid
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, x, y count, x, y
); );
} }
...@@ -51,8 +51,8 @@ template<> void SigmoidGrad<float, CUDAContext>( ...@@ -51,8 +51,8 @@ template<> void SigmoidGrad<float, CUDAContext>(
float* dx, float* dx,
CUDAContext* ctx) { CUDAContext* ctx) {
_SigmoidGrad _SigmoidGrad
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, dy, y, dx count, dy, y, dx
); );
} }
......
...@@ -96,26 +96,26 @@ template<> void Softmax<float, CUDAContext>( ...@@ -96,26 +96,26 @@ template<> void Softmax<float, CUDAContext>(
auto num_preds = outer_dim * inner_dim; auto num_preds = outer_dim * inner_dim;
auto nelements = num_preds * axis_dim; auto nelements = num_preds * axis_dim;
_SoftmaxReduceMax _SoftmaxReduceMax
<< < CUDA_BLOCKS(num_preds), CUDA_THREADS, <<< CUDA_BLOCKS(num_preds), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
num_preds, axis_dim, inner_dim, x, scale num_preds, axis_dim, inner_dim, x, scale
); );
_SoftmaxSub _SoftmaxSub
<< < CUDA_BLOCKS(nelements), CUDA_THREADS, <<< CUDA_BLOCKS(nelements), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nelements, axis_dim, inner_dim, scale, y nelements, axis_dim, inner_dim, scale, y
); );
math::Exp(nelements, y, y, ctx); math::Exp(nelements, y, y, ctx);
_SoftmaxReduceSum _SoftmaxReduceSum
<< < CUDA_BLOCKS(num_preds), CUDA_THREADS, <<< CUDA_BLOCKS(num_preds), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
num_preds, axis_dim, inner_dim, y, scale num_preds, axis_dim, inner_dim, y, scale
); );
_SoftmaxDiv _SoftmaxDiv
<< < CUDA_BLOCKS(nelements), CUDA_THREADS, <<< CUDA_BLOCKS(nelements), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nelements, axis_dim, inner_dim, scale, y nelements, axis_dim, inner_dim, scale, y
); );
} }
...@@ -159,13 +159,13 @@ template<> void SoftmaxGrad<float, CUDAContext>( ...@@ -159,13 +159,13 @@ template<> void SoftmaxGrad<float, CUDAContext>(
auto num_preds = outer_dim * inner_dim; auto num_preds = outer_dim * inner_dim;
auto nelements = num_preds * axis_dim; auto nelements = num_preds * axis_dim;
_SoftmaxDot _SoftmaxDot
<< < CUDA_BLOCKS(num_preds), CUDA_THREADS, <<< CUDA_BLOCKS(num_preds), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
num_preds, axis_dim, inner_dim, dy, y, scale num_preds, axis_dim, inner_dim, dy, y, scale
); );
_SoftmaxSub _SoftmaxSub
<< < CUDA_BLOCKS(nelements), CUDA_THREADS, <<< CUDA_BLOCKS(nelements), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nelements, axis_dim, inner_dim, scale, dx nelements, axis_dim, inner_dim, scale, dx
); );
math::Mul(nelements, dx, y, dx, ctx); math::Mul(nelements, dx, y, dx, ctx);
......
...@@ -25,8 +25,8 @@ template<> void Tanh<float, CUDAContext>( ...@@ -25,8 +25,8 @@ template<> void Tanh<float, CUDAContext>(
float* y, float* y,
CUDAContext* ctx) { CUDAContext* ctx) {
_Tanh _Tanh
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, x, y count, x, y
); );
} }
...@@ -51,8 +51,8 @@ template<> void TanhGrad<float, CUDAContext>( ...@@ -51,8 +51,8 @@ template<> void TanhGrad<float, CUDAContext>(
float* dx, float* dx,
CUDAContext* ctx) { CUDAContext* ctx) {
_TanhGrad _TanhGrad
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, dy, y, dx count, dy, y, dx
); );
} }
......
...@@ -60,15 +60,15 @@ template<> void Affine<float, CUDAContext>( ...@@ -60,15 +60,15 @@ template<> void Affine<float, CUDAContext>(
auto nthreads = outer_dim * axis_dim * inner_dim; auto nthreads = outer_dim * axis_dim * inner_dim;
if (beta != nullptr) { if (beta != nullptr) {
_Affine _Affine
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, nthreads, axis_dim, inner_dim,
x, alpha, beta, y x, alpha, beta, y
); );
} else { } else {
_AffineNoBias _AffineNoBias
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, x, alpha, y nthreads, axis_dim, inner_dim, x, alpha, y
); );
} }
...@@ -124,8 +124,8 @@ template<> void Affine<float16, CUDAContext>( ...@@ -124,8 +124,8 @@ template<> void Affine<float16, CUDAContext>(
auto nthreads = outer_dim * axis_dim * inner_dim; auto nthreads = outer_dim * axis_dim * inner_dim;
if (beta != nullptr) { if (beta != nullptr) {
_Affine _Affine
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, nthreads, axis_dim, inner_dim,
reinterpret_cast<const half*>(x), reinterpret_cast<const half*>(x),
reinterpret_cast<const half*>(alpha), reinterpret_cast<const half*>(alpha),
...@@ -134,8 +134,8 @@ template<> void Affine<float16, CUDAContext>( ...@@ -134,8 +134,8 @@ template<> void Affine<float16, CUDAContext>(
); );
} else { } else {
_AffineNoBias _AffineNoBias
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, nthreads, axis_dim, inner_dim,
reinterpret_cast<const half*>(x), reinterpret_cast<const half*>(x),
reinterpret_cast<const half*>(alpha), reinterpret_cast<const half*>(alpha),
...@@ -156,8 +156,8 @@ template <> void AffineGrad<float, CUDAContext>( ...@@ -156,8 +156,8 @@ template <> void AffineGrad<float, CUDAContext>(
CUDAContext* ctx) { CUDAContext* ctx) {
auto nthreads = outer_dim * axis_dim * inner_dim; auto nthreads = outer_dim * axis_dim * inner_dim;
_AffineNoBias _AffineNoBias
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, dy, alpha, dx nthreads, axis_dim, inner_dim, dy, alpha, dx
); );
} }
...@@ -174,8 +174,8 @@ template <> void AffineGrad<float16, CUDAContext>( ...@@ -174,8 +174,8 @@ template <> void AffineGrad<float16, CUDAContext>(
CUDAContext* ctx) { CUDAContext* ctx) {
auto nthreads = outer_dim * axis_dim * inner_dim; auto nthreads = outer_dim * axis_dim * inner_dim;
_AffineNoBias _AffineNoBias
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, nthreads, axis_dim, inner_dim,
reinterpret_cast<const half*>(dy), reinterpret_cast<const half*>(dy),
reinterpret_cast<const half*>(alpha), reinterpret_cast<const half*>(alpha),
......
...@@ -83,8 +83,8 @@ template<> __global__ void _ClipGrad<half>( ...@@ -83,8 +83,8 @@ template<> __global__ void _ClipGrad<half>(
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
_Clip<T> \ _Clip<T> \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \ <<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \ 0, ctx->cuda_stream() >>>( \
count, \ count, \
cast::to<T>(low), \ cast::to<T>(low), \
cast::to<T>(high), \ cast::to<T>(high), \
...@@ -102,8 +102,8 @@ template<> __global__ void _ClipGrad<half>( ...@@ -102,8 +102,8 @@ template<> __global__ void _ClipGrad<half>(
T* dx, \ T* dx, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
_ClipGrad<T> \ _ClipGrad<T> \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \ <<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \ 0, ctx->cuda_stream() >>>( \
count, \ count, \
cast::to<T>(low), \ cast::to<T>(low), \
cast::to<T>(high), \ cast::to<T>(high), \
...@@ -133,8 +133,8 @@ template <> void Clip<float16, CUDAContext>( ...@@ -133,8 +133,8 @@ template <> void Clip<float16, CUDAContext>(
float16* y, float16* y,
CUDAContext* ctx) { CUDAContext* ctx) {
_Clip _Clip
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, count,
cast::to<half>(low), cast::to<half>(low),
cast::to<half>(high), cast::to<half>(high),
...@@ -152,8 +152,8 @@ template <> void ClipGrad<float16, CUDAContext>( ...@@ -152,8 +152,8 @@ template <> void ClipGrad<float16, CUDAContext>(
float16* dx, float16* dx,
CUDAContext* ctx) { CUDAContext* ctx) {
_ClipGrad _ClipGrad
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, count,
cast::to<half>(low), cast::to<half>(low),
cast::to<half>(high), cast::to<half>(high),
......
...@@ -139,8 +139,8 @@ template<> __global__ void _BroadcastMaximumGrad<half>( ...@@ -139,8 +139,8 @@ template<> __global__ void _BroadcastMaximumGrad<half>(
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
_##name \ _##name \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \ <<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \ 0, ctx->cuda_stream() >>>( \
count, x1, x2, y \ count, x1, x2, y \
); \ ); \
} }
...@@ -155,8 +155,8 @@ template<> __global__ void _BroadcastMaximumGrad<half>( ...@@ -155,8 +155,8 @@ template<> __global__ void _BroadcastMaximumGrad<half>(
T* dx2, \ T* dx2, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
_##name \ _##name \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \ <<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \ 0, ctx->cuda_stream() >>>( \
count, x1, x2, dy, dx1, dx2 \ count, x1, x2, dy, dx1, dx2 \
); \ ); \
} }
...@@ -196,8 +196,8 @@ template <> void Maximum<float16, CUDAContext>( ...@@ -196,8 +196,8 @@ template <> void Maximum<float16, CUDAContext>(
float16* y, float16* y,
CUDAContext* ctx) { CUDAContext* ctx) {
_Maximum \ _Maximum \
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, count,
reinterpret_cast<const half*>(x1), reinterpret_cast<const half*>(x1),
reinterpret_cast<const half*>(x2), reinterpret_cast<const half*>(x2),
...@@ -212,8 +212,8 @@ template <> void BroadcastMaximum<float16, CUDAContext>( ...@@ -212,8 +212,8 @@ template <> void BroadcastMaximum<float16, CUDAContext>(
float16* y, float16* y,
CUDAContext* ctx) { CUDAContext* ctx) {
_BroadcastMaximum \ _BroadcastMaximum \
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, count,
reinterpret_cast<const half*>(x1), reinterpret_cast<const half*>(x1),
cast::to<half>(x2), cast::to<half>(x2),
...@@ -230,8 +230,8 @@ template <> void MaximumGrad<float16, CUDAContext>( ...@@ -230,8 +230,8 @@ template <> void MaximumGrad<float16, CUDAContext>(
float16* dx2, float16* dx2,
CUDAContext* ctx) { CUDAContext* ctx) {
_MaximumGrad \ _MaximumGrad \
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, count,
reinterpret_cast<const half*>(x1), reinterpret_cast<const half*>(x1),
reinterpret_cast<const half*>(x2), reinterpret_cast<const half*>(x2),
...@@ -250,8 +250,8 @@ template <> void BroadcastMaximumGrad<float16, CUDAContext>( ...@@ -250,8 +250,8 @@ template <> void BroadcastMaximumGrad<float16, CUDAContext>(
float16* dx2, float16* dx2,
CUDAContext* ctx) { CUDAContext* ctx) {
_BroadcastMaximumGrad \ _BroadcastMaximumGrad \
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, count,
reinterpret_cast<const half*>(x1), reinterpret_cast<const half*>(x1),
cast::to<half>(x2), cast::to<half>(x2),
......
...@@ -139,8 +139,8 @@ template<> __global__ void _BroadcastMinimumGrad<half>( ...@@ -139,8 +139,8 @@ template<> __global__ void _BroadcastMinimumGrad<half>(
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
_##name \ _##name \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \ <<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \ 0, ctx->cuda_stream() >>>( \
count, x1, x2, y \ count, x1, x2, y \
); \ ); \
} }
...@@ -155,8 +155,8 @@ template<> __global__ void _BroadcastMinimumGrad<half>( ...@@ -155,8 +155,8 @@ template<> __global__ void _BroadcastMinimumGrad<half>(
T* dx2, \ T* dx2, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
_##name \ _##name \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \ <<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \ 0, ctx->cuda_stream() >>>( \
count, x1, x2, dy, dx1, dx2 \ count, x1, x2, dy, dx1, dx2 \
); \ ); \
} }
...@@ -196,8 +196,8 @@ template <> void Minimum<float16, CUDAContext>( ...@@ -196,8 +196,8 @@ template <> void Minimum<float16, CUDAContext>(
float16* y, float16* y,
CUDAContext* ctx) { CUDAContext* ctx) {
_Minimum \ _Minimum \
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, count,
reinterpret_cast<const half*>(x1), reinterpret_cast<const half*>(x1),
reinterpret_cast<const half*>(x2), reinterpret_cast<const half*>(x2),
...@@ -212,8 +212,8 @@ template <> void BroadcastMinimum<float16, CUDAContext>( ...@@ -212,8 +212,8 @@ template <> void BroadcastMinimum<float16, CUDAContext>(
float16* y, float16* y,
CUDAContext* ctx) { CUDAContext* ctx) {
_BroadcastMinimum \ _BroadcastMinimum \
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, count,
reinterpret_cast<const half*>(x1), reinterpret_cast<const half*>(x1),
cast::to<half>(x2), cast::to<half>(x2),
...@@ -230,8 +230,8 @@ template <> void MinimumGrad<float16, CUDAContext>( ...@@ -230,8 +230,8 @@ template <> void MinimumGrad<float16, CUDAContext>(
float16* dx2, float16* dx2,
CUDAContext* ctx) { CUDAContext* ctx) {
_MinimumGrad \ _MinimumGrad \
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, count,
reinterpret_cast<const half*>(x1), reinterpret_cast<const half*>(x1),
reinterpret_cast<const half*>(x2), reinterpret_cast<const half*>(x2),
...@@ -250,8 +250,8 @@ template <> void BroadcastMinimumGrad<float16, CUDAContext>( ...@@ -250,8 +250,8 @@ template <> void BroadcastMinimumGrad<float16, CUDAContext>(
float16* dx2, float16* dx2,
CUDAContext* ctx) { CUDAContext* ctx) {
_BroadcastMinimumGrad \ _BroadcastMinimumGrad \
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, count,
reinterpret_cast<const half*>(x1), reinterpret_cast<const half*>(x1),
cast::to<half>(x2), cast::to<half>(x2),
......
...@@ -251,8 +251,8 @@ void _Moments( ...@@ -251,8 +251,8 @@ void _Moments(
ndims, x_dims, y_dims, ndims, x_dims, y_dims,
&rows, &cols)) { &rows, &cols)) {
_ColwiseMoments _ColwiseMoments
<< < CUDA_2D_BLOCKS(rows), CUDA_THREADS, <<< CUDA_2D_BLOCKS(rows), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
rows, cols, x, mean, var rows, cols, x, mean, var
); return; ); return;
} }
...@@ -262,8 +262,8 @@ void _Moments( ...@@ -262,8 +262,8 @@ void _Moments(
ndims, x_dims, y_dims, ndims, x_dims, y_dims,
&rows, &cols)) { &rows, &cols)) {
_RowwiseMoments _RowwiseMoments
<< < CUDA_2D_BLOCKS(cols), CUDA_THREADS, <<< CUDA_2D_BLOCKS(cols), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
rows, cols, x, mean, var rows, cols, x, mean, var
); return; ); return;
} }
...@@ -294,8 +294,8 @@ void _Moments( ...@@ -294,8 +294,8 @@ void _Moments(
ctx->Memcpy<CUDAContext, CPUContext>(dbytes, YDS, dimsT.data()); ctx->Memcpy<CUDAContext, CPUContext>(dbytes, YDS, dimsT.data());
_GenericMoments _GenericMoments
<< < CUDA_2D_BLOCKS(outer_dim), CUDA_THREADS, <<< CUDA_2D_BLOCKS(outer_dim), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
ndims, outer_dim, inner_dim, ndims, outer_dim, inner_dim,
XSS, YDS, x, mean, var XSS, YDS, x, mean, var
); );
......
...@@ -30,8 +30,8 @@ __global__ void _Arange( ...@@ -30,8 +30,8 @@ __global__ void _Arange(
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
_Arange \ _Arange \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \ <<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \ 0, ctx->cuda_stream() >>>( \
count, start, step, y \ count, start, step, y \
); \ ); \
} }
...@@ -64,8 +64,8 @@ template <> void Arange<float16, CUDAContext>( ...@@ -64,8 +64,8 @@ template <> void Arange<float16, CUDAContext>(
float16* y, float16* y,
CUDAContext* ctx) { CUDAContext* ctx) {
_Arange _Arange
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, start, step, count, start, step,
reinterpret_cast<half*>(y) reinterpret_cast<half*>(y)
); );
......
...@@ -20,12 +20,12 @@ void _ArgMax( ...@@ -20,12 +20,12 @@ void _ArgMax(
for (int iix = 0; iix < inner_dim; ++iix) { for (int iix = 0; iix < inner_dim; ++iix) {
const T* X = x + (oix * axis_dim * inner_dim + iix); const T* X = x + (oix * axis_dim * inner_dim + iix);
const int y_offset = oix * top_k * inner_dim + iix; const int y_offset = oix * top_k * inner_dim + iix;
vector< pair<T, int64_t> > vec(axis_dim); vector<pair<T, int64_t>> vec(axis_dim);
for (int j = 0; j < axis_dim; ++j) for (int j = 0; j < axis_dim; ++j)
vec[j] = std::make_pair(X[j * inner_dim], j); vec[j] = std::make_pair(X[j * inner_dim], j);
std::partial_sort( std::partial_sort(
vec.begin(), vec.begin() + top_k, vec.end(), vec.begin(), vec.begin() + top_k, vec.end(),
std::greater< pair<T, int64_t> >()); std::greater<pair<T, int64_t>>());
for (int j = 0; j < top_k; ++j) { for (int j = 0; j < top_k; ++j) {
indices[y_offset + j * inner_dim] = vec[j].second; indices[y_offset + j * inner_dim] = vec[j].second;
if (values) values[y_offset + j * inner_dim] = vec[j].first; if (values) values[y_offset + j * inner_dim] = vec[j].first;
...@@ -49,7 +49,7 @@ void _ArgMin( ...@@ -49,7 +49,7 @@ void _ArgMin(
for (int iix = 0; iix < inner_dim; ++iix) { for (int iix = 0; iix < inner_dim; ++iix) {
const T* X = x + (oix * axis_dim * inner_dim + iix); const T* X = x + (oix * axis_dim * inner_dim + iix);
const int y_offset = oix * top_k * inner_dim + iix; const int y_offset = oix * top_k * inner_dim + iix;
vector< pair<T, int64_t> > vec(axis_dim); vector<pair<T, int64_t>> vec(axis_dim);
for (int j = 0; j < axis_dim; ++j) for (int j = 0; j < axis_dim; ++j)
vec[j] = std::make_pair(X[j * inner_dim], j); vec[j] = std::make_pair(X[j * inner_dim], j);
std::partial_sort(vec.begin(), vec.begin() + top_k, vec.end()); std::partial_sort(vec.begin(), vec.begin() + top_k, vec.end());
......
...@@ -133,8 +133,8 @@ template<> __global__ void _ArgMin<half>( ...@@ -133,8 +133,8 @@ template<> __global__ void _ArgMin<half>(
CHECK_EQ(top_k, 1) << "\nRequired top_k == 1."; \ CHECK_EQ(top_k, 1) << "\nRequired top_k == 1."; \
auto nthreads = outer_dim * inner_dim; \ auto nthreads = outer_dim * inner_dim; \
_##name \ _##name \
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \ <<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \ 0, ctx->cuda_stream() >>>( \
nthreads, inner_dim, axis_dim, \ nthreads, inner_dim, axis_dim, \
x, indices, values \ x, indices, values \
); \ ); \
...@@ -168,8 +168,8 @@ template<> void ArgMax<float16, CUDAContext>( ...@@ -168,8 +168,8 @@ template<> void ArgMax<float16, CUDAContext>(
CHECK_EQ(top_k, 1) << "\nRequired top_k == 1."; CHECK_EQ(top_k, 1) << "\nRequired top_k == 1.";
auto nthreads = outer_dim * inner_dim; auto nthreads = outer_dim * inner_dim;
_ArgMax _ArgMax
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, inner_dim, axis_dim, nthreads, inner_dim, axis_dim,
reinterpret_cast<const half*>(x), reinterpret_cast<const half*>(x),
indices, indices,
...@@ -189,8 +189,8 @@ template<> void ArgMin<float16, CUDAContext>( ...@@ -189,8 +189,8 @@ template<> void ArgMin<float16, CUDAContext>(
CHECK_EQ(top_k, 1) << "\nRequired top_k == 1."; CHECK_EQ(top_k, 1) << "\nRequired top_k == 1.";
auto nthreads = outer_dim * inner_dim; auto nthreads = outer_dim * inner_dim;
_ArgMin _ArgMin
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, inner_dim, axis_dim, nthreads, inner_dim, axis_dim,
reinterpret_cast<const half*>(x), reinterpret_cast<const half*>(x),
indices, indices,
......
...@@ -43,8 +43,8 @@ __global__ void _Concat( ...@@ -43,8 +43,8 @@ __global__ void _Concat(
auto cols = axis_dim * inner_dim; \ auto cols = axis_dim * inner_dim; \
auto nthreads = outer_dim * axis_dim * inner_dim; \ auto nthreads = outer_dim * axis_dim * inner_dim; \
_##name \ _##name \
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \ <<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \ 0, ctx->cuda_stream() >>>( \
nthreads, \ nthreads, \
inner_dim, \ inner_dim, \
cols, \ cols, \
......
...@@ -83,8 +83,8 @@ __global__ void _CropGrad( ...@@ -83,8 +83,8 @@ __global__ void _CropGrad(
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
_##name \ _##name \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \ <<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \ 0, ctx->cuda_stream() >>>( \
count, ndims, \ count, ndims, \
x_strides, y_dims, \ x_strides, y_dims, \
starts, x, y \ starts, x, y \
......
...@@ -115,8 +115,8 @@ template <> __global__ void _IndexSelectGrad<half>( ...@@ -115,8 +115,8 @@ template <> __global__ void _IndexSelectGrad<half>(
CUDAContext* ctx) { \ CUDAContext* ctx) { \
auto nthreads = outer_dim * num_indices * inner_dim; \ auto nthreads = outer_dim * num_indices * inner_dim; \
_IndexSelect \ _IndexSelect \
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \ <<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \ 0, ctx->cuda_stream() >>>( \
nthreads, inner_dim, \ nthreads, inner_dim, \
axis_dim, num_indices, \ axis_dim, num_indices, \
indices, x, y \ indices, x, y \
...@@ -135,8 +135,8 @@ template <> __global__ void _IndexSelectGrad<half>( ...@@ -135,8 +135,8 @@ template <> __global__ void _IndexSelectGrad<half>(
CUDAContext* ctx) { \ CUDAContext* ctx) { \
auto nthreads = outer_dim * inner_dim; \ auto nthreads = outer_dim * inner_dim; \
_IndexSelectGrad \ _IndexSelectGrad \
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \ <<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \ 0, ctx->cuda_stream() >>>( \
nthreads, inner_dim, \ nthreads, inner_dim, \
axis_dim, num_indices, \ axis_dim, num_indices, \
indices, dy, dx \ indices, dy, dx \
...@@ -170,8 +170,8 @@ template <> void IndexSelectGrad<float16, CUDAContext>( ...@@ -170,8 +170,8 @@ template <> void IndexSelectGrad<float16, CUDAContext>(
CUDAContext* ctx) { CUDAContext* ctx) {
auto nthreads = outer_dim * inner_dim; auto nthreads = outer_dim * inner_dim;
_IndexSelectGrad _IndexSelectGrad
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, inner_dim, nthreads, inner_dim,
axis_dim, num_indices, axis_dim, num_indices,
indices, indices,
......
...@@ -32,8 +32,8 @@ template <> void OneHot<float, CUDAContext>( ...@@ -32,8 +32,8 @@ template <> void OneHot<float, CUDAContext>(
float* y, float* y,
CUDAContext* ctx) { CUDAContext* ctx) {
_OneHot _OneHot
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, depth, on_value, x, y count, depth, on_value, x, y
); );
} }
...@@ -48,8 +48,8 @@ template <> void OneHot<int, CUDAContext>( ...@@ -48,8 +48,8 @@ template <> void OneHot<int, CUDAContext>(
int* y, int* y,
CUDAContext* ctx) { CUDAContext* ctx) {
_OneHot _OneHot
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, depth, on_value, x, y count, depth, on_value, x, y
); );
} }
...@@ -64,8 +64,8 @@ template <> void OneHot<int64_t, CUDAContext>( ...@@ -64,8 +64,8 @@ template <> void OneHot<int64_t, CUDAContext>(
int64_t* y, int64_t* y,
CUDAContext* ctx) { CUDAContext* ctx) {
_OneHot _OneHot
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, depth, on_value, x, y count, depth, on_value, x, y
); );
} }
......
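The three OneHot launches above differ only in the element type; the kernel itself is a one-line scatter. A sketch of what such a kernel typically looks like, assuming y holds count * depth elements pre-filled with the off value (_OneHotSketch is an illustrative name):

    template <typename T>
    __global__ void _OneHotSketch(
        const int count, const int depth,
        const int on_value, const T* x, T* y) {
        for (int i = blockIdx.x * blockDim.x + threadIdx.x;
             i < count; i += blockDim.x * gridDim.x) {
            // each input index selects one column of its output row
            y[i * depth + (int)x[i]] = (T)on_value;
        }
    }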
...@@ -130,8 +130,8 @@ __global__ void _EdgePad( ...@@ -130,8 +130,8 @@ __global__ void _EdgePad(
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
_ConstPad \ _ConstPad \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \ <<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \ 0, ctx->cuda_stream() >>>( \
count, ndims, \ count, ndims, \
x_dims, x_strides, \ x_dims, x_strides, \
y_dims, l_pads, \ y_dims, l_pads, \
...@@ -152,8 +152,8 @@ __global__ void _EdgePad( ...@@ -152,8 +152,8 @@ __global__ void _EdgePad(
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
_##name \ _##name \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \ <<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \ 0, ctx->cuda_stream() >>>( \
count, ndims, \ count, ndims, \
x_dims, x_strides, \ x_dims, x_strides, \
y_dims, l_pads, \ y_dims, l_pads, \
......
...@@ -202,8 +202,8 @@ void _ReduceSum( ...@@ -202,8 +202,8 @@ void _ReduceSum(
ndims, x_dims, y_dims, ndims, x_dims, y_dims,
&rows, &cols)) { &rows, &cols)) {
_ColwiseReduceSum _ColwiseReduceSum
<< < CUDA_2D_BLOCKS(rows), CUDA_THREADS, <<< CUDA_2D_BLOCKS(rows), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
rows, cols, scale, x, y rows, cols, scale, x, y
); return; ); return;
} }
...@@ -213,8 +213,8 @@ void _ReduceSum( ...@@ -213,8 +213,8 @@ void _ReduceSum(
ndims, x_dims, y_dims, ndims, x_dims, y_dims,
&rows, &cols)) { &rows, &cols)) {
_RowwiseReduceSum _RowwiseReduceSum
<< < CUDA_2D_BLOCKS(cols), CUDA_THREADS, <<< CUDA_2D_BLOCKS(cols), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
rows, cols, scale, x, y rows, cols, scale, x, y
); return; ); return;
} }
...@@ -245,8 +245,8 @@ void _ReduceSum( ...@@ -245,8 +245,8 @@ void _ReduceSum(
ctx->Memcpy<CUDAContext, CPUContext>(dbytes, YDS, dimsT.data()); ctx->Memcpy<CUDAContext, CPUContext>(dbytes, YDS, dimsT.data());
_GenericReduceSum _GenericReduceSum
<< < CUDA_2D_BLOCKS(outer_dim), CUDA_THREADS, <<< CUDA_2D_BLOCKS(outer_dim), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
ndims, outer_dim, inner_dim, ndims, outer_dim, inner_dim,
XSS, YDS, scale, x, y XSS, YDS, scale, x, y
); );
...@@ -372,8 +372,8 @@ template <> __global__ void _ReduceSumGrad<half>( ...@@ -372,8 +372,8 @@ template <> __global__ void _ReduceSumGrad<half>(
T* dx, \ T* dx, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
_ReduceSumGrad \ _ReduceSumGrad \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \ <<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \ 0, ctx->cuda_stream() >>>( \
count, ndim, x_dims, \ count, ndim, x_dims, \
y_dims, y_strides, \ y_dims, y_strides, \
scale, dy, dx \ scale, dy, dx \
...@@ -398,8 +398,8 @@ template<> void ReduceSumGrad<float16, CUDAContext>( ...@@ -398,8 +398,8 @@ template<> void ReduceSumGrad<float16, CUDAContext>(
float16* dx, float16* dx,
CUDAContext* ctx) { CUDAContext* ctx) {
_ReduceSumGrad _ReduceSumGrad
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, ndim, x_dims, count, ndim, x_dims,
y_dims, y_strides, y_dims, y_strides,
scale, scale,
......
...@@ -93,8 +93,8 @@ template<> __global__ void _RepeatGrad<half>( ...@@ -93,8 +93,8 @@ template<> __global__ void _RepeatGrad<half>(
auto y_inner_dim = inner_dim * repeats; \ auto y_inner_dim = inner_dim * repeats; \
auto nthreads = outer_dim * axis_dim * y_inner_dim; \ auto nthreads = outer_dim * axis_dim * y_inner_dim; \
_Repeat \ _Repeat \
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \ <<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \ 0, ctx->cuda_stream() >>>( \
nthreads, axis_dim, \ nthreads, axis_dim, \
inner_dim, y_inner_dim, \ inner_dim, y_inner_dim, \
x, y \ x, y \
...@@ -113,8 +113,8 @@ template<> __global__ void _RepeatGrad<half>( ...@@ -113,8 +113,8 @@ template<> __global__ void _RepeatGrad<half>(
auto y_inner_dim = inner_dim * repeats; \ auto y_inner_dim = inner_dim * repeats; \
auto nthreads = outer_dim * axis_dim * inner_dim; \ auto nthreads = outer_dim * axis_dim * inner_dim; \
_RepeatGrad \ _RepeatGrad \
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \ <<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \ 0, ctx->cuda_stream() >>>( \
nthreads, \ nthreads, \
axis_dim, \ axis_dim, \
inner_dim, \ inner_dim, \
...@@ -151,8 +151,8 @@ template<> void RepeatGrad<float16, CUDAContext>( ...@@ -151,8 +151,8 @@ template<> void RepeatGrad<float16, CUDAContext>(
auto y_inner_dim = inner_dim * repeats; auto y_inner_dim = inner_dim * repeats;
auto nthreads = outer_dim * axis_dim * inner_dim; auto nthreads = outer_dim * axis_dim * inner_dim;
_RepeatGrad _RepeatGrad
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \ <<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, nthreads,
axis_dim, axis_dim,
inner_dim, inner_dim,
......
...@@ -64,8 +64,8 @@ __global__ void _SliceGrad( ...@@ -64,8 +64,8 @@ __global__ void _SliceGrad(
auto cols = slice_dim * inner_dim; \ auto cols = slice_dim * inner_dim; \
auto nthreads = outer_dim * cols; \ auto nthreads = outer_dim * cols; \
_##name \ _##name \
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \ <<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \ 0, ctx->cuda_stream() >>>( \
nthreads, \ nthreads, \
inner_dim, \ inner_dim, \
axis_dim, \ axis_dim, \
...@@ -126,8 +126,8 @@ template <> void SliceGrad<float16, CUDAContext>( ...@@ -126,8 +126,8 @@ template <> void SliceGrad<float16, CUDAContext>(
auto cols = slice_dim * inner_dim; auto cols = slice_dim * inner_dim;
auto nthreads = outer_dim * cols; auto nthreads = outer_dim * cols;
_SliceGrad _SliceGrad
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, nthreads,
inner_dim, inner_dim,
axis_dim, axis_dim,
......
...@@ -98,8 +98,8 @@ template<> __global__ void _TileGrad<half>( ...@@ -98,8 +98,8 @@ template<> __global__ void _TileGrad<half>(
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
_Tile \ _Tile \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \ <<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \ 0, ctx->cuda_stream() >>>( \
count, \ count, \
ndims, \ ndims, \
x_dims, \ x_dims, \
...@@ -120,8 +120,8 @@ template<> __global__ void _TileGrad<half>( ...@@ -120,8 +120,8 @@ template<> __global__ void _TileGrad<half>(
auto nthreads = rows * cols; \ auto nthreads = rows * cols; \
auto tiled_cols = multiple * cols; \ auto tiled_cols = multiple * cols; \
_TileGrad \ _TileGrad \
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \ <<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \ 0, ctx->cuda_stream() >>>( \
nthreads, \ nthreads, \
cols, \ cols, \
tiled_cols, \ tiled_cols, \
...@@ -156,8 +156,8 @@ template<> void TileGrad<float16, CUDAContext>( ...@@ -156,8 +156,8 @@ template<> void TileGrad<float16, CUDAContext>(
auto nthreads = rows * cols; auto nthreads = rows * cols;
auto tiled_cols = multiple * cols; auto tiled_cols = multiple * cols;
_TileGrad _TileGrad
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, nthreads,
cols, cols,
tiled_cols, tiled_cols,
......
...@@ -80,8 +80,8 @@ __global__ void _TransposeGrad( ...@@ -80,8 +80,8 @@ __global__ void _TransposeGrad(
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
_##name \ _##name \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \ <<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \ 0, ctx->cuda_stream() >>>( \
count, ndims, x_strides, y_dims, x, y \ count, ndims, x_strides, y_dims, x, y \
); \ ); \
} }
......
...@@ -55,8 +55,8 @@ __global__ void _Assign( ...@@ -55,8 +55,8 @@ __global__ void _Assign(
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
_Assign \ _Assign \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \ <<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \ 0, ctx->cuda_stream() >>>( \
count, \ count, \
ndims, \ ndims, \
x_dims, \ x_dims, \
......
...@@ -153,8 +153,8 @@ __global__ void _GreaterEqualHalf( ...@@ -153,8 +153,8 @@ __global__ void _GreaterEqualHalf(
bool* y, \ bool* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
IMPL \ IMPL \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \ <<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \ 0, ctx->cuda_stream() >>>( \
count, a, b, y \ count, a, b, y \
); \ ); \
} }
...@@ -167,8 +167,8 @@ __global__ void _GreaterEqualHalf( ...@@ -167,8 +167,8 @@ __global__ void _GreaterEqualHalf(
bool* y, \ bool* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
_##OP##Half \ _##OP##Half \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \ <<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \ 0, ctx->cuda_stream() >>>( \
count, \ count, \
reinterpret_cast<const half*>(a), \ reinterpret_cast<const half*>(a), \
reinterpret_cast<const half*>(b), \ reinterpret_cast<const half*>(b), \
......
...@@ -30,8 +30,8 @@ __global__ void _MaskedAssign( ...@@ -30,8 +30,8 @@ __global__ void _MaskedAssign(
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
_MaskedAssign \ _MaskedAssign \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \ <<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \ 0, ctx->cuda_stream() >>>( \
count, mask, x, y \ count, mask, x, y \
); \ ); \
} }
......
...@@ -27,8 +27,8 @@ template<> void AbsGrad<float, CUDAContext>( ...@@ -27,8 +27,8 @@ template<> void AbsGrad<float, CUDAContext>(
float* dx, float* dx,
CUDAContext* ctx) { CUDAContext* ctx) {
_AbsGrad _AbsGrad
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, dy, dx count, dy, dx
); );
} }
......
...@@ -55,8 +55,8 @@ template <> void NLLLoss<float, float, CUDAContext>( ...@@ -55,8 +55,8 @@ template <> void NLLLoss<float, float, CUDAContext>(
CUDAContext* ctx) { CUDAContext* ctx) {
auto nthreads = outer_dim * inner_dim; auto nthreads = outer_dim * inner_dim;
_NLLLoss _NLLLoss
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, nignores, nthreads, axis_dim, inner_dim, nignores,
ignore, log_prob, target, loss, flag ignore, log_prob, target, loss, flag
); );
...@@ -77,8 +77,8 @@ template <> void NLLLoss<float, int64_t, CUDAContext>( ...@@ -77,8 +77,8 @@ template <> void NLLLoss<float, int64_t, CUDAContext>(
CUDAContext* ctx) { CUDAContext* ctx) {
auto nthreads = outer_dim * inner_dim; auto nthreads = outer_dim * inner_dim;
_NLLLoss _NLLLoss
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, nignores, nthreads, axis_dim, inner_dim, nignores,
ignore, log_prob, target, loss, flag ignore, log_prob, target, loss, flag
); );
...@@ -129,8 +129,8 @@ template<> void NLLLossGrad<float, float, CUDAContext>( ...@@ -129,8 +129,8 @@ template<> void NLLLossGrad<float, float, CUDAContext>(
CUDAContext* ctx) { CUDAContext* ctx) {
auto nthreads = outer_dim * inner_dim; auto nthreads = outer_dim * inner_dim;
_NLLLossGrad _NLLLossGrad
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, nignores, nthreads, axis_dim, inner_dim, nignores,
ignore, log_prob, target, dx, flag ignore, log_prob, target, dx, flag
); );
...@@ -151,8 +151,8 @@ template<> void NLLLossGrad<float, int64_t, CUDAContext>( ...@@ -151,8 +151,8 @@ template<> void NLLLossGrad<float, int64_t, CUDAContext>(
CUDAContext* ctx) { CUDAContext* ctx) {
auto nthreads = outer_dim * inner_dim; auto nthreads = outer_dim * inner_dim;
_NLLLossGrad _NLLLossGrad
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, nignores, nthreads, axis_dim, inner_dim, nignores,
ignore, log_prob, target, dx, flag ignore, log_prob, target, dx, flag
); );
......
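The NLLLoss kernels above take precomputed log-probabilities, so each of the outer_dim * inner_dim positions reduces to a single lookup; labels found in the ignore list are assumed to contribute zero loss and to be excluded from the flag count used for normalization. In that reading:

    \ell_{n,s} =
    \begin{cases}
      -\log p_{n,\,t_{n,s},\,s}, & t_{n,s} \notin \mathcal{I} \\
      0, & t_{n,s} \in \mathcal{I}
    \end{cases}

with n indexing outer_dim, s indexing inner_dim, t the target labels, and \mathcal{I} the ignored label set; the Grad variants scatter the corresponding derivative back to the target entry only.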
...@@ -42,8 +42,8 @@ template <> void SigmoidCrossEntropy<float, CUDAContext>( ...@@ -42,8 +42,8 @@ template <> void SigmoidCrossEntropy<float, CUDAContext>(
int* flag, int* flag,
CUDAContext* ctx) { CUDAContext* ctx) {
_SigmoidCrossEntropy _SigmoidCrossEntropy
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, logit, target, loss, flag count, logit, target, loss, flag
); );
} }
...@@ -77,8 +77,8 @@ template <> void SigmoidCrossEntropyGrad<float, CUDAContext>( ...@@ -77,8 +77,8 @@ template <> void SigmoidCrossEntropyGrad<float, CUDAContext>(
int* flag, int* flag,
CUDAContext* ctx) { CUDAContext* ctx) {
_SigmoidCrossEntropyGrad _SigmoidCrossEntropyGrad
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, logit, target, dlogit, flag count, logit, target, dlogit, flag
); );
} }
......
...@@ -71,8 +71,8 @@ template <> void SigmoidFocalLoss<float, float, CUDAContext>( ...@@ -71,8 +71,8 @@ template <> void SigmoidFocalLoss<float, float, CUDAContext>(
CUDAContext* ctx) { CUDAContext* ctx) {
auto nthreads = outer_dim * axis_dim * inner_dim; auto nthreads = outer_dim * axis_dim * inner_dim;
_SigmoidFocalLoss _SigmoidFocalLoss
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, nthreads, axis_dim, inner_dim,
pos_alpha, neg_alpha, gamma, neg_id, pos_alpha, neg_alpha, gamma, neg_id,
logits, targets, losses, flags logits, targets, losses, flags
...@@ -96,8 +96,8 @@ template <> void SigmoidFocalLoss<float, int64_t, CUDAContext>( ...@@ -96,8 +96,8 @@ template <> void SigmoidFocalLoss<float, int64_t, CUDAContext>(
CUDAContext* ctx) { CUDAContext* ctx) {
auto nthreads = outer_dim * axis_dim * inner_dim; auto nthreads = outer_dim * axis_dim * inner_dim;
_SigmoidFocalLoss _SigmoidFocalLoss
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, nthreads, axis_dim, inner_dim,
pos_alpha, neg_alpha, gamma, neg_id, pos_alpha, neg_alpha, gamma, neg_id,
logits, targets, losses, flags logits, targets, losses, flags
...@@ -171,8 +171,8 @@ template <> void SigmoidFocalLossGrad<float, float, CUDAContext>( ...@@ -171,8 +171,8 @@ template <> void SigmoidFocalLossGrad<float, float, CUDAContext>(
CUDAContext* ctx) { CUDAContext* ctx) {
auto count = outer_dim * axis_dim * inner_dim; auto count = outer_dim * axis_dim * inner_dim;
_SigmoidFocalLossGrad _SigmoidFocalLossGrad
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, axis_dim, inner_dim, count, axis_dim, inner_dim,
pos_alpha, neg_alpha, gamma, neg_id, pos_alpha, neg_alpha, gamma, neg_id,
logits, targets, dlogits, flags logits, targets, dlogits, flags
...@@ -196,8 +196,8 @@ template <> void SigmoidFocalLossGrad<float, int64_t, CUDAContext>( ...@@ -196,8 +196,8 @@ template <> void SigmoidFocalLossGrad<float, int64_t, CUDAContext>(
CUDAContext* ctx) { CUDAContext* ctx) {
auto count = outer_dim * axis_dim * inner_dim; auto count = outer_dim * axis_dim * inner_dim;
_SigmoidFocalLossGrad _SigmoidFocalLossGrad
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, axis_dim, inner_dim, count, axis_dim, inner_dim,
pos_alpha, neg_alpha, gamma, neg_id, pos_alpha, neg_alpha, gamma, neg_id,
logits, targets, dlogits, flags logits, targets, dlogits, flags
......
...@@ -33,8 +33,8 @@ template<> void SmoothL1<float, CUDAContext>( ...@@ -33,8 +33,8 @@ template<> void SmoothL1<float, CUDAContext>(
float* y, float* y,
CUDAContext* ctx) { CUDAContext* ctx) {
_SmoothL1 _SmoothL1
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, beta, x, y count, beta, x, y
); );
} }
...@@ -63,8 +63,8 @@ template<> void SmoothL1Grad<float, CUDAContext>( ...@@ -63,8 +63,8 @@ template<> void SmoothL1Grad<float, CUDAContext>(
float* dx, float* dx,
CUDAContext* ctx) { CUDAContext* ctx) {
_SmoothL1Grad _SmoothL1Grad
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, beta, dy, dx count, beta, dy, dx
); );
} }
......
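SmoothL1 and SmoothL1Grad above are elementwise in x (respectively dy) with a single beta parameter; the conventional piecewise definition, assumed here to be what the kernels implement, is:

    \mathrm{SmoothL1}_{\beta}(x) =
    \begin{cases}
      0.5\,x^{2} / \beta, & |x| < \beta \\
      |x| - 0.5\,\beta,   & \text{otherwise}
    \end{cases}
    \qquad
    \mathrm{SmoothL1}_{\beta}'(x) =
    \begin{cases}
      x / \beta,              & |x| < \beta \\
      \operatorname{sign}(x), & \text{otherwise}
    \end{cases}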
...@@ -29,8 +29,8 @@ template <> void SoftmaxCrossEntropy<float, CUDAContext>( ...@@ -29,8 +29,8 @@ template <> void SoftmaxCrossEntropy<float, CUDAContext>(
float* losses, float* losses,
CUDAContext* ctx) { CUDAContext* ctx) {
_SoftmaxCrossEntropy _SoftmaxCrossEntropy
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, prob, targets, losses count, prob, targets, losses
); );
} }
......
...@@ -67,8 +67,8 @@ template <> void SoftmaxFocalLoss<float, float, CUDAContext>( ...@@ -67,8 +67,8 @@ template <> void SoftmaxFocalLoss<float, float, CUDAContext>(
CUDAContext* ctx) { CUDAContext* ctx) {
auto num_preds = outer_dim * inner_dim; auto num_preds = outer_dim * inner_dim;
_SoftmaxFocalLoss _SoftmaxFocalLoss
<< < CUDA_BLOCKS(num_preds), CUDA_THREADS, <<< CUDA_BLOCKS(num_preds), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
num_preds, axis_dim, inner_dim, num_preds, axis_dim, inner_dim,
pos_alpha, neg_alpha, gamma, neg_id, pos_alpha, neg_alpha, gamma, neg_id,
nignores, ignores, nignores, ignores,
...@@ -95,8 +95,8 @@ template <> void SoftmaxFocalLoss<float, int64_t, CUDAContext>( ...@@ -95,8 +95,8 @@ template <> void SoftmaxFocalLoss<float, int64_t, CUDAContext>(
CUDAContext* ctx) { CUDAContext* ctx) {
auto num_preds = outer_dim * inner_dim; auto num_preds = outer_dim * inner_dim;
_SoftmaxFocalLoss _SoftmaxFocalLoss
<< < CUDA_BLOCKS(num_preds), CUDA_THREADS, <<< CUDA_BLOCKS(num_preds), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
num_preds, axis_dim, inner_dim, num_preds, axis_dim, inner_dim,
pos_alpha, neg_alpha, gamma, neg_id, pos_alpha, neg_alpha, gamma, neg_id,
nignores, ignores, nignores, ignores,
...@@ -179,8 +179,8 @@ template<> void SoftmaxFocalLossGrad<float, float, CUDAContext>( ...@@ -179,8 +179,8 @@ template<> void SoftmaxFocalLossGrad<float, float, CUDAContext>(
CUDAContext* ctx) { CUDAContext* ctx) {
auto num_preds = outer_dim * inner_dim; auto num_preds = outer_dim * inner_dim;
_SoftmaxFocalLossGrad _SoftmaxFocalLossGrad
<< < CUDA_BLOCKS(num_preds), CUDA_THREADS, <<< CUDA_BLOCKS(num_preds), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
num_preds, axis_dim, inner_dim, num_preds, axis_dim, inner_dim,
pos_alpha, neg_alpha, gamma, neg_id, pos_alpha, neg_alpha, gamma, neg_id,
nignores, ignores, nignores, ignores,
...@@ -207,8 +207,8 @@ template<> void SoftmaxFocalLossGrad<float, int64_t, CUDAContext>( ...@@ -207,8 +207,8 @@ template<> void SoftmaxFocalLossGrad<float, int64_t, CUDAContext>(
CUDAContext* ctx) { CUDAContext* ctx) {
auto num_preds = outer_dim * inner_dim; auto num_preds = outer_dim * inner_dim;
_SoftmaxFocalLossGrad _SoftmaxFocalLossGrad
<< < CUDA_BLOCKS(num_preds), CUDA_THREADS, <<< CUDA_BLOCKS(num_preds), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
num_preds, axis_dim, inner_dim, num_preds, axis_dim, inner_dim,
pos_alpha, neg_alpha, gamma, neg_id, pos_alpha, neg_alpha, gamma, neg_id,
nignores, ignores, nignores, ignores,
......
...@@ -59,8 +59,8 @@ template <> void SparseSoftmaxCrossEntropy<float, float, CUDAContext>( ...@@ -59,8 +59,8 @@ template <> void SparseSoftmaxCrossEntropy<float, float, CUDAContext>(
CUDAContext* ctx) { CUDAContext* ctx) {
auto nthreads = outer_dim * inner_dim; auto nthreads = outer_dim * inner_dim;
_SparseSoftmaxCrossEntropy _SparseSoftmaxCrossEntropy
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, nignores, nthreads, axis_dim, inner_dim, nignores,
ignore, prob, target, loss, flag ignore, prob, target, loss, flag
); );
...@@ -81,8 +81,8 @@ template <> void SparseSoftmaxCrossEntropy<float, int64_t, CUDAContext>( ...@@ -81,8 +81,8 @@ template <> void SparseSoftmaxCrossEntropy<float, int64_t, CUDAContext>(
CUDAContext* ctx) { CUDAContext* ctx) {
auto nthreads = outer_dim * inner_dim; auto nthreads = outer_dim * inner_dim;
_SparseSoftmaxCrossEntropy _SparseSoftmaxCrossEntropy
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, nignores, nthreads, axis_dim, inner_dim, nignores,
ignore, prob, target, loss, flag ignore, prob, target, loss, flag
); );
...@@ -136,8 +136,8 @@ template<> void SparseSoftmaxCrossEntropyGrad<float, float, CUDAContext>( ...@@ -136,8 +136,8 @@ template<> void SparseSoftmaxCrossEntropyGrad<float, float, CUDAContext>(
CUDAContext* ctx) { CUDAContext* ctx) {
auto nthreads = outer_dim * inner_dim; auto nthreads = outer_dim * inner_dim;
_SparseSoftmaxCrossEntropyGrad _SparseSoftmaxCrossEntropyGrad
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, nignores, nthreads, axis_dim, inner_dim, nignores,
ignore, prob, target, dx, flag ignore, prob, target, dx, flag
); );
...@@ -158,8 +158,8 @@ template<> void SparseSoftmaxCrossEntropyGrad<float, int64_t, CUDAContext>( ...@@ -158,8 +158,8 @@ template<> void SparseSoftmaxCrossEntropyGrad<float, int64_t, CUDAContext>(
CUDAContext* ctx) { CUDAContext* ctx) {
auto nthreads = outer_dim * inner_dim; auto nthreads = outer_dim * inner_dim;
_SparseSoftmaxCrossEntropyGrad _SparseSoftmaxCrossEntropyGrad
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, nignores, nthreads, axis_dim, inner_dim, nignores,
ignore, prob, target, dx, flag ignore, prob, target, dx, flag
); );
......
...@@ -26,8 +26,8 @@ __global__ void _TypeA2B( ...@@ -26,8 +26,8 @@ __global__ void _TypeA2B(
Tb* b, \ Tb* b, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
_TypeA2B \ _TypeA2B \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \ <<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \ 0, ctx->cuda_stream() >>>( \
count, a, b \ count, a, b \
); \ ); \
} }
...@@ -66,8 +66,8 @@ template <> void TypeA2B<float16, float, CUDAContext>( ...@@ -66,8 +66,8 @@ template <> void TypeA2B<float16, float, CUDAContext>(
float* b, float* b,
CUDAContext* ctx) { CUDAContext* ctx) {
_TypeA2B _TypeA2B
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, reinterpret_cast<const half*>(a), b count, reinterpret_cast<const half*>(a), b
); );
} }
...@@ -89,8 +89,8 @@ template <> void TypeA2B<float, float16, CUDAContext>( ...@@ -89,8 +89,8 @@ template <> void TypeA2B<float, float16, CUDAContext>(
float16* b, float16* b,
CUDAContext* ctx) { CUDAContext* ctx) {
_TypeA2B _TypeA2B
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, a, reinterpret_cast<half*>(b) count, a, reinterpret_cast<half*>(b)
); );
} }
...@@ -112,8 +112,8 @@ template <> void TypeA2B<float16, float16, CUDAContext>( ...@@ -112,8 +112,8 @@ template <> void TypeA2B<float16, float16, CUDAContext>(
float16* b, float16* b,
CUDAContext* ctx) { CUDAContext* ctx) {
_TypeA2B _TypeA2B
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, count,
reinterpret_cast<const half*>(a), reinterpret_cast<const half*>(a),
reinterpret_cast<half*>(b) reinterpret_cast<half*>(b)
......
...@@ -62,8 +62,8 @@ template <> __global__ void _GradientTwoSum<half2>( ...@@ -62,8 +62,8 @@ template <> __global__ void _GradientTwoSum<half2>(
T* dx, \ T* dx, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
_GradientTwoSum \ _GradientTwoSum \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \ <<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \ 0, ctx->cuda_stream() >>>( \
count, dy1, dy2, dx \ count, dy1, dy2, dx \
); \ ); \
} }
...@@ -83,8 +83,8 @@ template <> void GradientTwoSum<float16, CUDAContext>( ...@@ -83,8 +83,8 @@ template <> void GradientTwoSum<float16, CUDAContext>(
CUDAContext* ctx) { CUDAContext* ctx) {
if ((count & 1) == 0) { if ((count & 1) == 0) {
_GradientTwoSum _GradientTwoSum
<< < CUDA_BLOCKS(count >> 2), CUDA_THREADS, <<< CUDA_BLOCKS(count >> 2), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count >> 2, count >> 2,
reinterpret_cast<const half2*>(dy1), reinterpret_cast<const half2*>(dy1),
reinterpret_cast<const half2*>(dy2), reinterpret_cast<const half2*>(dy2),
...@@ -92,8 +92,8 @@ template <> void GradientTwoSum<float16, CUDAContext>( ...@@ -92,8 +92,8 @@ template <> void GradientTwoSum<float16, CUDAContext>(
); );
} else { } else {
_GradientTwoSum _GradientTwoSum
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, count,
reinterpret_cast<const half*>(dy1), reinterpret_cast<const half*>(dy1),
reinterpret_cast<const half*>(dy2), reinterpret_cast<const half*>(dy2),
......
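The float16 GradientTwoSum above takes a packed path when the length is even, reinterpreting the buffers as half2 so one instruction adds two fp16 lanes. A sketch of such a packed kernel, assuming compute capability 5.3+ for __hadd2 and with n2 denoting the number of half2 packs:

    #include <cuda_fp16.h>

    __global__ void _TwoSumHalf2Sketch(
        const int n2,                  // number of half2 packs
        const half2* dy1, const half2* dy2, half2* dx) {
        for (int i = blockIdx.x * blockDim.x + threadIdx.x;
             i < n2; i += blockDim.x * gridDim.x) {
            // one __hadd2 adds both fp16 lanes of the pack at once
            dx[i] = __hadd2(dy1[i], dy2[i]);
        }
    }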
...@@ -76,14 +76,14 @@ template <> void ImageData<float, float, CUDAContext>( ...@@ -76,14 +76,14 @@ template <> void ImageData<float, float, CUDAContext>(
auto nthreads = N * C * H * W; auto nthreads = N * C * H * W;
if (data_format == "NCHW") { if (data_format == "NCHW") {
_ImageDataNCHW _ImageDataNCHW
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, C, H, W, mean, std, x, y nthreads, C, H, W, mean, std, x, y
); );
} else if (data_format == "NHWC") { } else if (data_format == "NHWC") {
_ImageDataNHWC _ImageDataNHWC
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, C, H, W, mean, std, x, y nthreads, C, H, W, mean, std, x, y
); );
} else { } else {
...@@ -107,14 +107,14 @@ template <> void ImageData<uint8_t, float, CUDAContext>( ...@@ -107,14 +107,14 @@ template <> void ImageData<uint8_t, float, CUDAContext>(
auto nthreads = N * C * H * W; auto nthreads = N * C * H * W;
if (data_format == "NCHW") { if (data_format == "NCHW") {
_ImageDataNCHW _ImageDataNCHW
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, C, H, W, mean, std, x, y nthreads, C, H, W, mean, std, x, y
); );
} else if (data_format == "NHWC") { } else if (data_format == "NHWC") {
_ImageDataNHWC _ImageDataNHWC
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, C, H, W, mean, std, x, y nthreads, C, H, W, mean, std, x, y
); );
} else { } else {
...@@ -191,15 +191,15 @@ template <> void ImageData<float, float16, CUDAContext>( ...@@ -191,15 +191,15 @@ template <> void ImageData<float, float16, CUDAContext>(
auto nthreads = N * C * H * W; auto nthreads = N * C * H * W;
if (data_format == "NCHW") { if (data_format == "NCHW") {
_ImageDataHalfNCHW _ImageDataHalfNCHW
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, C, H, W, mean, std, nthreads, C, H, W, mean, std,
x, reinterpret_cast<half*>(y) x, reinterpret_cast<half*>(y)
); );
} else if (data_format == "NHWC") { } else if (data_format == "NHWC") {
_ImageDataHalfNHWC _ImageDataHalfNHWC
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, C, H, W, mean, std, nthreads, C, H, W, mean, std,
x, reinterpret_cast<half*>(y) x, reinterpret_cast<half*>(y)
); );
...@@ -222,15 +222,15 @@ template <> void ImageData<uint8_t, float16, CUDAContext>( ...@@ -222,15 +222,15 @@ template <> void ImageData<uint8_t, float16, CUDAContext>(
auto nthreads = N * C * H * W; auto nthreads = N * C * H * W;
if (data_format == "NCHW") { if (data_format == "NCHW") {
_ImageDataHalfNCHW _ImageDataHalfNCHW
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, C, H, W, mean, std, nthreads, C, H, W, mean, std,
x, reinterpret_cast<half*>(y) x, reinterpret_cast<half*>(y)
); );
} else if (data_format == "NHWC") { } else if (data_format == "NHWC") {
_ImageDataHalfNHWC _ImageDataHalfNHWC
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, C, H, W, mean, std, nthreads, C, H, W, mean, std,
x, reinterpret_cast<half*>(y) x, reinterpret_cast<half*>(y)
); );
......
...@@ -190,27 +190,27 @@ __global__ void _BatchNormInferenceGrad( ...@@ -190,27 +190,27 @@ __global__ void _BatchNormInferenceGrad(
auto nthreads = N * C * S; \ auto nthreads = N * C * S; \
if (data_format == "NCHW") { \ if (data_format == "NCHW") { \
_BatchNormInternalGrad<Tx, Tp, StorageOrder::NCHW> \ _BatchNormInternalGrad<Tx, Tp, StorageOrder::NCHW> \
<< < CUDA_2D_BLOCKS(C), CUDA_THREADS, \ <<< CUDA_2D_BLOCKS(C), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \ 0, ctx->cuda_stream() >>>( \
N, C, S, x, mu, rsig, gamma, \ N, C, S, x, mu, rsig, gamma, \
dy, ds, db, dgamma, dbeta \ dy, ds, db, dgamma, dbeta \
); \ ); \
_BatchNormTrainingGrad<Tx, Tp, StorageOrder::NCHW> \ _BatchNormTrainingGrad<Tx, Tp, StorageOrder::NCHW> \
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \ <<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \ 0, ctx->cuda_stream() >>>( \
nthreads, N, C, S, x, mu, \ nthreads, N, C, S, x, mu, \
rsig, gamma, ds, db, dy, dx \ rsig, gamma, ds, db, dy, dx \
); \ ); \
} else if (data_format == "NHWC") { \ } else if (data_format == "NHWC") { \
_BatchNormInternalGrad<Tx, Tp, StorageOrder::NHWC> \ _BatchNormInternalGrad<Tx, Tp, StorageOrder::NHWC> \
<< < CUDA_2D_BLOCKS(C), CUDA_THREADS, \ <<< CUDA_2D_BLOCKS(C), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \ 0, ctx->cuda_stream() >>>( \
N, C, S, x, mu, rsig, gamma, \ N, C, S, x, mu, rsig, gamma, \
dy, ds, db, dgamma, dbeta \ dy, ds, db, dgamma, dbeta \
); \ ); \
_BatchNormTrainingGrad<Tx, Tp, StorageOrder::NHWC> \ _BatchNormTrainingGrad<Tx, Tp, StorageOrder::NHWC> \
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \ <<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \ 0, ctx->cuda_stream() >>>( \
nthreads, N, C, S, x, mu, \ nthreads, N, C, S, x, mu, \
rsig, gamma, ds, db, dy, dx \ rsig, gamma, ds, db, dy, dx \
); \ ); \
...@@ -234,24 +234,24 @@ __global__ void _BatchNormInferenceGrad( ...@@ -234,24 +234,24 @@ __global__ void _BatchNormInferenceGrad(
if (data_format == "NCHW") { \ if (data_format == "NCHW") { \
if (dgamma != nullptr) { \ if (dgamma != nullptr) { \
_BatchNormWGrad<Tx, Tp, StorageOrder::NCHW> \ _BatchNormWGrad<Tx, Tp, StorageOrder::NCHW> \
<< < CUDA_2D_BLOCKS(C), CUDA_THREADS, \ <<< CUDA_2D_BLOCKS(C), CUDA_THREADS, \
0, ctx->cuda_stream() >> > \ 0, ctx->cuda_stream() >>> \
(N, C, S, x, mu, rsig, dy, dgamma, dbeta); \ (N, C, S, x, mu, rsig, dy, dgamma, dbeta); \
} \ } \
_BatchNormInferenceGrad<Tx, Tp, StorageOrder::NCHW> \ _BatchNormInferenceGrad<Tx, Tp, StorageOrder::NCHW> \
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \ <<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >> > \ 0, ctx->cuda_stream() >>> \
(nthreads, C, S, rsig, gamma, dy, dx); \ (nthreads, C, S, rsig, gamma, dy, dx); \
} else if (data_format == "NHWC") { \ } else if (data_format == "NHWC") { \
if (dgamma != nullptr) { \ if (dgamma != nullptr) { \
_BatchNormWGrad<Tx, Tp, StorageOrder::NHWC> \ _BatchNormWGrad<Tx, Tp, StorageOrder::NHWC> \
<< < CUDA_2D_BLOCKS(C), CUDA_THREADS, \ <<< CUDA_2D_BLOCKS(C), CUDA_THREADS, \
0, ctx->cuda_stream() >> > \ 0, ctx->cuda_stream() >>> \
(N, C, S, x, mu, rsig, dy, dgamma, dbeta); \ (N, C, S, x, mu, rsig, dy, dgamma, dbeta); \
} \ } \
_BatchNormInferenceGrad<Tx, Tp, StorageOrder::NHWC> \ _BatchNormInferenceGrad<Tx, Tp, StorageOrder::NHWC> \
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \ <<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >> > \ 0, ctx->cuda_stream() >>> \
(nthreads, C, S, rsig, gamma, dy, dx); \ (nthreads, C, S, rsig, gamma, dy, dx); \
} \ } \
} }
......
...@@ -408,20 +408,20 @@ __global__ void _GroupNormGradHalf( ...@@ -408,20 +408,20 @@ __global__ void _GroupNormGradHalf(
CUDAContext* ctx) { \ CUDAContext* ctx) { \
const int C = G * D; \ const int C = G * D; \
_GroupNormFusedParams<Tp> \ _GroupNormFusedParams<Tp> \
<< < CUDA_2D_BLOCKS(N * G), CUDA_THREADS, \ <<< CUDA_2D_BLOCKS(N * G), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \ 0, ctx->cuda_stream() >>>( \
N, G, D, mu, rsig, gamma, beta, scale, bias \ N, G, D, mu, rsig, gamma, beta, scale, bias \
); \ ); \
if (data_format == "NCHW") { \ if (data_format == "NCHW") { \
_GroupNormForwardNCHW<Tx, Tp> \ _GroupNormForwardNCHW<Tx, Tp> \
<< < CUDA_2D_BLOCKS(N * C), CUDA_THREADS, \ <<< CUDA_2D_BLOCKS(N * C), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \ 0, ctx->cuda_stream() >>>( \
N, C, S, x, scale, bias, y \ N, C, S, x, scale, bias, y \
); \ ); \
} else if (data_format == "NHWC") { \ } else if (data_format == "NHWC") { \
_GroupNormForwardNHWC<Tx, Tp> \ _GroupNormForwardNHWC<Tx, Tp> \
<< < CUDA_2D_BLOCKS(N * C), CUDA_THREADS, \ <<< CUDA_2D_BLOCKS(N * C), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \ 0, ctx->cuda_stream() >>>( \
N, C, S, x, scale, bias, y \ N, C, S, x, scale, bias, y \
); \ ); \
} \ } \
...@@ -448,35 +448,35 @@ __global__ void _GroupNormGradHalf( ...@@ -448,35 +448,35 @@ __global__ void _GroupNormGradHalf(
auto nthreads = N * G * D * S; \ auto nthreads = N * G * D * S; \
if (data_format == "NCHW") { \ if (data_format == "NCHW") { \
_GroupNormWGrad<Tx, Tp, StorageOrder::NCHW> \ _GroupNormWGrad<Tx, Tp, StorageOrder::NCHW> \
<< < CUDA_2D_BLOCKS(G * D), CUDA_THREADS, \ <<< CUDA_2D_BLOCKS(G * D), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \ 0, ctx->cuda_stream() >>>( \
N, G, D, S, x, mu, rsig, dy, dgamma, dbeta \ N, G, D, S, x, mu, rsig, dy, dgamma, dbeta \
); \ ); \
_GroupNormInternalGrad<Tx, Tp, StorageOrder::NCHW> \ _GroupNormInternalGrad<Tx, Tp, StorageOrder::NCHW> \
<< < CUDA_2D_BLOCKS(N * G), CUDA_THREADS, \ <<< CUDA_2D_BLOCKS(N * G), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \ 0, ctx->cuda_stream() >>>( \
N, G, D, S, x, gamma, dy, ds, db \ N, G, D, S, x, gamma, dy, ds, db \
); \ ); \
_GroupNormGrad<Tx, Tp, StorageOrder::NCHW> \ _GroupNormGrad<Tx, Tp, StorageOrder::NCHW> \
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \ <<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \ 0, ctx->cuda_stream() >>>( \
nthreads, G, D, S, x, mu, rsig, \ nthreads, G, D, S, x, mu, rsig, \
gamma, ds, db, dy, dx \ gamma, ds, db, dy, dx \
); \ ); \
} else if (data_format == "NHWC") { \ } else if (data_format == "NHWC") { \
_GroupNormWGrad<Tx, Tp, StorageOrder::NHWC> \ _GroupNormWGrad<Tx, Tp, StorageOrder::NHWC> \
<< < CUDA_2D_BLOCKS(G * D), CUDA_THREADS, \ <<< CUDA_2D_BLOCKS(G * D), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \ 0, ctx->cuda_stream() >>>( \
N, G, D, S, x, mu, rsig, dy, dgamma, dbeta \ N, G, D, S, x, mu, rsig, dy, dgamma, dbeta \
); \ ); \
_GroupNormInternalGrad<Tx, Tp, StorageOrder::NHWC> \ _GroupNormInternalGrad<Tx, Tp, StorageOrder::NHWC> \
<< < CUDA_2D_BLOCKS(N * G), CUDA_THREADS, \ <<< CUDA_2D_BLOCKS(N * G), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \ 0, ctx->cuda_stream() >>>( \
N, G, D, S, x, gamma, dy, ds, db \ N, G, D, S, x, gamma, dy, ds, db \
); \ ); \
_GroupNormGrad<Tx, Tp, StorageOrder::NHWC> \ _GroupNormGrad<Tx, Tp, StorageOrder::NHWC> \
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \ <<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >> > ( \ 0, ctx->cuda_stream() >>> ( \
nthreads, G, D, S, x, mu, rsig, \ nthreads, G, D, S, x, mu, rsig, \
gamma, ds, db, dy, dx \ gamma, ds, db, dy, dx \
); \ ); \
...@@ -503,14 +503,14 @@ template <> void GroupNormForward<float16, float, CUDAContext>( ...@@ -503,14 +503,14 @@ template <> void GroupNormForward<float16, float, CUDAContext>(
CUDAContext* ctx) { CUDAContext* ctx) {
const int C = G * D; const int C = G * D;
_GroupNormFusedParams<float> _GroupNormFusedParams<float>
<< < CUDA_2D_BLOCKS(N * G), CUDA_THREADS, <<< CUDA_2D_BLOCKS(N * G), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
N, G, D, mu, rsig, gamma, beta, scale, bias N, G, D, mu, rsig, gamma, beta, scale, bias
); );
if (data_format == "NCHW") { if (data_format == "NCHW") {
_GroupNormForwardNCHW<half, float> _GroupNormForwardNCHW<half, float>
<< < CUDA_2D_BLOCKS(N * C), CUDA_THREADS, <<< CUDA_2D_BLOCKS(N * C), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
N, C, S, N, C, S,
reinterpret_cast<const half*>(x), reinterpret_cast<const half*>(x),
scale, bias, scale, bias,
...@@ -518,8 +518,8 @@ template <> void GroupNormForward<float16, float, CUDAContext>( ...@@ -518,8 +518,8 @@ template <> void GroupNormForward<float16, float, CUDAContext>(
); );
} else if (data_format == "NHWC") { } else if (data_format == "NHWC") {
_GroupNormForwardNHWC<half, float> _GroupNormForwardNHWC<half, float>
<< < CUDA_2D_BLOCKS(N * C), CUDA_THREADS, <<< CUDA_2D_BLOCKS(N * C), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
N, C, S, N, C, S,
reinterpret_cast<const half*>(x), reinterpret_cast<const half*>(x),
scale, bias, scale, bias,
...@@ -548,8 +548,8 @@ template <> void GroupNormBackward<float16, float, CUDAContext>( ...@@ -548,8 +548,8 @@ template <> void GroupNormBackward<float16, float, CUDAContext>(
auto nthreads = N * G * D * S; auto nthreads = N * G * D * S;
if (data_format == "NCHW") { if (data_format == "NCHW") {
_GroupNormWGradHalf<StorageOrder::NCHW> _GroupNormWGradHalf<StorageOrder::NCHW>
<< < CUDA_2D_BLOCKS(G * D), CUDA_THREADS, <<< CUDA_2D_BLOCKS(G * D), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
N, G, D, S, N, G, D, S,
reinterpret_cast<const half*>(x), reinterpret_cast<const half*>(x),
mu, rsig, mu, rsig,
...@@ -557,8 +557,8 @@ template <> void GroupNormBackward<float16, float, CUDAContext>( ...@@ -557,8 +557,8 @@ template <> void GroupNormBackward<float16, float, CUDAContext>(
dgamma, dbeta dgamma, dbeta
); );
_GroupNormInternalGradHalf<StorageOrder::NCHW> _GroupNormInternalGradHalf<StorageOrder::NCHW>
<< < CUDA_2D_BLOCKS(N * G), CUDA_THREADS, <<< CUDA_2D_BLOCKS(N * G), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
N, G, D, S, N, G, D, S,
reinterpret_cast<const half*>(x), reinterpret_cast<const half*>(x),
gamma, gamma,
...@@ -566,8 +566,8 @@ template <> void GroupNormBackward<float16, float, CUDAContext>( ...@@ -566,8 +566,8 @@ template <> void GroupNormBackward<float16, float, CUDAContext>(
ds, db ds, db
); );
_GroupNormGradHalf<StorageOrder::NCHW> _GroupNormGradHalf<StorageOrder::NCHW>
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, G, D, S, nthreads, G, D, S,
reinterpret_cast<const half*>(x), reinterpret_cast<const half*>(x),
mu, rsig, gamma, ds, db, mu, rsig, gamma, ds, db,
...@@ -576,8 +576,8 @@ template <> void GroupNormBackward<float16, float, CUDAContext>( ...@@ -576,8 +576,8 @@ template <> void GroupNormBackward<float16, float, CUDAContext>(
); );
} else if (data_format == "NHWC") { \ } else if (data_format == "NHWC") { \
_GroupNormWGradHalf<StorageOrder::NHWC> _GroupNormWGradHalf<StorageOrder::NHWC>
<< < CUDA_2D_BLOCKS(G * D), CUDA_THREADS, <<< CUDA_2D_BLOCKS(G * D), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
N, G, D, S, N, G, D, S,
reinterpret_cast<const half*>(x), reinterpret_cast<const half*>(x),
mu, rsig, mu, rsig,
...@@ -585,8 +585,8 @@ template <> void GroupNormBackward<float16, float, CUDAContext>( ...@@ -585,8 +585,8 @@ template <> void GroupNormBackward<float16, float, CUDAContext>(
dgamma, dbeta dgamma, dbeta
); );
_GroupNormInternalGradHalf<StorageOrder::NHWC> _GroupNormInternalGradHalf<StorageOrder::NHWC>
<< < CUDA_2D_BLOCKS(N * G), CUDA_THREADS, <<< CUDA_2D_BLOCKS(N * G), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
N, G, D, S, N, G, D, S,
reinterpret_cast<const half*>(x), reinterpret_cast<const half*>(x),
gamma, gamma,
...@@ -594,8 +594,8 @@ template <> void GroupNormBackward<float16, float, CUDAContext>( ...@@ -594,8 +594,8 @@ template <> void GroupNormBackward<float16, float, CUDAContext>(
ds, db ds, db
); );
_GroupNormGradHalf<StorageOrder::NHWC> _GroupNormGradHalf<StorageOrder::NHWC>
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, G, D, S, nthreads, G, D, S,
reinterpret_cast<const half*>(x), reinterpret_cast<const half*>(x),
mu, rsig, gamma, ds, db, mu, rsig, gamma, ds, db,
......
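The GroupNorm forward above first launches _GroupNormFusedParams and then a single affine pass over the data; the natural reading (an assumption about what the fused-params kernel computes) is that gamma/beta are folded with the per-group statistics into per-(n, c) scale and bias:

    y_{n,c,s} = \gamma_{c}\,\frac{x_{n,c,s} - \mu_{n,g(c)}}{\sigma_{n,g(c)}} + \beta_{c}
              = \underbrace{\gamma_{c}\,r_{n,g(c)}}_{\text{scale}_{n,c}}\, x_{n,c,s}
                + \underbrace{\beta_{c} - \gamma_{c}\,\mu_{n,g(c)}\,r_{n,g(c)}}_{\text{bias}_{n,c}}

where g(c) maps channel c to its group and \mu, r = rsig are the per-(n, group) mean and reciprocal standard deviation; the backward launches first form d\gamma, d\beta and the per-group ds/db partial sums before the final dx kernel.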
...@@ -58,13 +58,13 @@ template <> void LSTMCell<float, CUDAContext>( ...@@ -58,13 +58,13 @@ template <> void LSTMCell<float, CUDAContext>(
auto o_offset = 2 * C, c_offset = 3 * C, auto o_offset = 2 * C, c_offset = 3 * C,
x_offset = 4 * C, NC = N * C; x_offset = 4 * C, NC = N * C;
_LSTMCellAct _LSTMCellAct
<< < CUDA_BLOCKS(NC * 4), CUDA_THREADS, <<< CUDA_BLOCKS(NC * 4), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
NC * 4, c_offset, x_offset, actx NC * 4, c_offset, x_offset, actx
); );
_LSTMCellGate _LSTMCellGate
<< < CUDA_BLOCKS(NC), CUDA_THREADS, <<< CUDA_BLOCKS(NC), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
NC, C, o_offset, c_offset, NC, C, o_offset, c_offset,
x_offset, cx, actx, c, h x_offset, cx, actx, c, h
); );
...@@ -138,14 +138,14 @@ template <> void LSTMCellGrad<float, CUDAContext>( ...@@ -138,14 +138,14 @@ template <> void LSTMCellGrad<float, CUDAContext>(
auto o_offset = 2 * C, c_offset = 3 * C, auto o_offset = 2 * C, c_offset = 3 * C,
x_offset = 4 * C, NC = N * C; x_offset = 4 * C, NC = N * C;
_LSTMCellGateGrad _LSTMCellGateGrad
<< < CUDA_BLOCKS(NC), CUDA_THREADS, <<< CUDA_BLOCKS(NC), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
NC, C, o_offset, c_offset, x_offset, NC, C, o_offset, c_offset, x_offset,
cx, actx, c, dc, dh, dcx, dx cx, actx, c, dc, dh, dcx, dx
); );
_LSTMCellActGrad _LSTMCellActGrad
<< < CUDA_BLOCKS(NC * 4), CUDA_THREADS, <<< CUDA_BLOCKS(NC * 4), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
NC * 4, c_offset, x_offset, actx, dx NC * 4, c_offset, x_offset, actx, dx
); );
} }
......
...@@ -39,8 +39,8 @@ template <> void AdamUpdate<float, CUDAContext>( ...@@ -39,8 +39,8 @@ template <> void AdamUpdate<float, CUDAContext>(
float* v, float* v,
CUDAContext* ctx) { CUDAContext* ctx) {
_AdamUpdate _AdamUpdate
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, lr, beta1, beta2, eps, g, m, v count, lr, beta1, beta2, eps, g, m, v
); );
} }
......
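AdamUpdate above is a single elementwise launch over (g, m, v). One common per-element formulation, sketched under two assumptions: any bias correction is already folded into lr, and the scaled step is written back into g for the caller to apply:

    __global__ void _AdamUpdateSketch(
        const int n, const float lr,
        const float beta1, const float beta2, const float eps,
        float* g, float* m, float* v) {
        for (int i = blockIdx.x * blockDim.x + threadIdx.x;
             i < n; i += blockDim.x * gridDim.x) {
            const float gi = g[i];
            const float mi = m[i] = beta1 * m[i] + (1.f - beta1) * gi;
            const float vi = v[i] = beta2 * v[i] + (1.f - beta2) * gi * gi;
            g[i] = lr * mi / (sqrtf(vi) + eps);   // step consumed by the updater
        }
    }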
...@@ -29,8 +29,8 @@ template <> void MixedPrecL2Decay<float16, CUDAContext>( ...@@ -29,8 +29,8 @@ template <> void MixedPrecL2Decay<float16, CUDAContext>(
float* dx, float* dx,
CUDAContext* ctx) { CUDAContext* ctx) {
_MixedPrecL2DecayHalf _MixedPrecL2DecayHalf
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, count,
alpha, alpha,
reinterpret_cast<const half*>(w), reinterpret_cast<const half*>(w),
...@@ -58,8 +58,8 @@ template <> void MixedPrecUpdate<float16, CUDAContext>( ...@@ -58,8 +58,8 @@ template <> void MixedPrecUpdate<float16, CUDAContext>(
float16* w, float16* w,
CUDAContext* ctx) { CUDAContext* ctx) {
_MixedPrecUpdateHalf _MixedPrecUpdateHalf
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, count,
updates, updates,
reinterpret_cast<half*>(w) reinterpret_cast<half*>(w)
......
...@@ -32,8 +32,8 @@ template <> void NesterovUpdate<float, CUDAContext>( ...@@ -32,8 +32,8 @@ template <> void NesterovUpdate<float, CUDAContext>(
float* h, float* h,
CUDAContext* ctx) { CUDAContext* ctx) {
_NesterovUpdate _NesterovUpdate
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, lr, momentum, g, h count, lr, momentum, g, h
); );
} }
......
...@@ -34,8 +34,8 @@ template <> void RMSPropUpdate<float, CUDAContext>( ...@@ -34,8 +34,8 @@ template <> void RMSPropUpdate<float, CUDAContext>(
float* h, float* h,
CUDAContext* ctx) { CUDAContext* ctx) {
_RMSPropUpdate _RMSPropUpdate
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, lr, decay, eps, g, h count, lr, decay, eps, g, h
); );
} }
......
...@@ -31,8 +31,8 @@ template <> void SGDUpdate<float, CUDAContext>( ...@@ -31,8 +31,8 @@ template <> void SGDUpdate<float, CUDAContext>(
float* h, float* h,
CUDAContext* ctx) { CUDAContext* ctx) {
_SGDUpdate _SGDUpdate
<< < CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
count, lr, momentum, g, h count, lr, momentum, g, h
); );
} }
......
...@@ -52,14 +52,14 @@ template<> void BiasAdd<float, CUDAContext>( ...@@ -52,14 +52,14 @@ template<> void BiasAdd<float, CUDAContext>(
auto nthreads = outer_dim * axis_dim * inner_dim; auto nthreads = outer_dim * axis_dim * inner_dim;
if (data_format == "NCHW") { if (data_format == "NCHW") {
_BiasAddNCHW _BiasAddNCHW
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, bias, y nthreads, axis_dim, inner_dim, bias, y
); );
} else if (data_format == "NHWC") { } else if (data_format == "NHWC") {
_BiasAddNHWC _BiasAddNHWC
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, axis_dim, bias, y nthreads, axis_dim, bias, y
); );
} else { } else {
......
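The two BiasAdd launches differ only in how a flat index maps to a channel: NCHW keeps inner_dim contiguous per channel (so that launch passes inner_dim), while NHWC keeps channels innermost (so it does not). A sketch of both index computations, assuming y already contains the input; the *Sketch names are illustrative:

    __global__ void _BiasAddNCHWSketch(
        const int nthreads, const int axis_dim,
        const int inner_dim, const float* bias, float* y) {
        for (int i = blockIdx.x * blockDim.x + threadIdx.x;
             i < nthreads; i += blockDim.x * gridDim.x) {
            y[i] += bias[(i / inner_dim) % axis_dim];  // channel is the middle axis
        }
    }

    __global__ void _BiasAddNHWCSketch(
        const int nthreads, const int axis_dim,
        const float* bias, float* y) {
        for (int i = blockIdx.x * blockDim.x + threadIdx.x;
             i < nthreads; i += blockDim.x * gridDim.x) {
            y[i] += bias[i % axis_dim];                // channel is the innermost axis
        }
    }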
...@@ -109,15 +109,15 @@ template <> void BilinearResize<float, CUDAContext>( ...@@ -109,15 +109,15 @@ template <> void BilinearResize<float, CUDAContext>(
auto scale_w = (float)W / (float)out_w; auto scale_w = (float)W / (float)out_w;
if (data_format == "NCHW") { if (data_format == "NCHW") {
_BilinearResizeNCHW _BilinearResizeNCHW
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, C, H, W, out_h, out_w, nthreads, C, H, W, out_h, out_w,
scale_h, scale_w, x, y scale_h, scale_w, x, y
); );
} else if(data_format == "NHWC") { } else if(data_format == "NHWC") {
_BilinearResizeNHWC _BilinearResizeNHWC
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, C, H, W, out_h, out_w, nthreads, C, H, W, out_h, out_w,
scale_h, scale_w, x, y scale_h, scale_w, x, y
); );
...@@ -224,15 +224,15 @@ template <> void BilinearResizeGrad<float, CUDAContext>( ...@@ -224,15 +224,15 @@ template <> void BilinearResizeGrad<float, CUDAContext>(
auto scale_w = (float)W / (float)out_w; auto scale_w = (float)W / (float)out_w;
if (data_format == "NCHW") { if (data_format == "NCHW") {
_BilinearResizeGradNCHW _BilinearResizeGradNCHW
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, C, H, W, out_h, out_w, nthreads, C, H, W, out_h, out_w,
scale_h, scale_w, dy, dx scale_h, scale_w, dy, dx
); );
} else if(data_format == "NHWC") { } else if(data_format == "NHWC") {
_BilinearResizeGradNHWC _BilinearResizeGradNHWC
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, C, H, W, out_h, out_w, nthreads, C, H, W, out_h, out_w,
scale_h, scale_w, dy, dx scale_h, scale_w, dy, dx
); );
......
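BilinearResize computes scale_h = H / out_h and scale_w = W / out_w on the host and passes them to the kernels; each output pixel is then assumed to be sampled at the scaled source coordinate and blended from its four neighbors (one common convention, without a half-pixel offset):

    h_0 = \lfloor y\,s_h \rfloor,\quad w_0 = \lfloor x\,s_w \rfloor,\quad
    u = y\,s_h - h_0,\quad v = x\,s_w - w_0

    \mathrm{out}(y, x) = (1-u)(1-v)\,I(h_0, w_0) + (1-u)\,v\,I(h_0, w_0{+}1)
                       + u\,(1-v)\,I(h_0{+}1, w_0) + u\,v\,I(h_0{+}1, w_0{+}1)

with the neighbor indices clamped to the input bounds; the Grad variants scatter the same four weights back into dx.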
...@@ -123,8 +123,8 @@ template <> void Im2Col2d<float, CUDAContext>( ...@@ -123,8 +123,8 @@ template <> void Im2Col2d<float, CUDAContext>(
auto nthreads = C * out_h * out_w; auto nthreads = C * out_h * out_w;
if (data_format == "NCHW") { if (data_format == "NCHW") {
_Im2Col2dNCHW _Im2Col2dNCHW
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, nthreads,
H, W, H, W,
out_h, out_w, out_h, out_w,
...@@ -136,8 +136,8 @@ template <> void Im2Col2d<float, CUDAContext>( ...@@ -136,8 +136,8 @@ template <> void Im2Col2d<float, CUDAContext>(
); );
} else if (data_format == "NHWC") { } else if (data_format == "NHWC") {
_Im2Col2dNHWC _Im2Col2dNHWC
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, nthreads,
C, H, W, C, H, W,
out_h, out_w, out_h, out_w,
...@@ -286,8 +286,8 @@ template <> void Col2Im2d<float, CUDAContext>( ...@@ -286,8 +286,8 @@ template <> void Col2Im2d<float, CUDAContext>(
const int nthreads = C * H * W; const int nthreads = C * H * W;
if (data_format == "NCHW") { if (data_format == "NCHW") {
_Col2Im2dNCHW _Col2Im2dNCHW
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, nthreads,
H, W, H, W,
out_h, out_w, out_h, out_w,
...@@ -299,8 +299,8 @@ template <> void Col2Im2d<float, CUDAContext>( ...@@ -299,8 +299,8 @@ template <> void Col2Im2d<float, CUDAContext>(
); );
} else if (data_format == "NHWC") { } else if (data_format == "NHWC") {
_Col2Im2dNHWC _Col2Im2dNHWC
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, nthreads,
C, H, W, C, H, W,
out_h, out_w, out_h, out_w,
......
...@@ -144,8 +144,8 @@ template <> void DepthwiseConv2d<float, CUDAContext>( ...@@ -144,8 +144,8 @@ template <> void DepthwiseConv2d<float, CUDAContext>(
if (data_format == "NCHW") { if (data_format == "NCHW") {
if (kernel_h == 3 && kernel_w == 3) { if (kernel_h == 3 && kernel_w == 3) {
_DepthwiseConv2dNCHW<float, 3, 3> _DepthwiseConv2dNCHW<float, 3, 3>
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, nthreads,
C, H, W, C, H, W,
out_h, out_w, out_h, out_w,
...@@ -157,8 +157,8 @@ template <> void DepthwiseConv2d<float, CUDAContext>( ...@@ -157,8 +157,8 @@ template <> void DepthwiseConv2d<float, CUDAContext>(
); );
} else if (kernel_h == 5 && kernel_w == 5) { } else if (kernel_h == 5 && kernel_w == 5) {
_DepthwiseConv2dNCHW<float, 5, 5> _DepthwiseConv2dNCHW<float, 5, 5>
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, nthreads,
C, H, W, C, H, W,
out_h, out_w, out_h, out_w,
...@@ -170,8 +170,8 @@ template <> void DepthwiseConv2d<float, CUDAContext>( ...@@ -170,8 +170,8 @@ template <> void DepthwiseConv2d<float, CUDAContext>(
); );
} else if (kernel_h == 7 && kernel_w == 7) { } else if (kernel_h == 7 && kernel_w == 7) {
_DepthwiseConv2dNCHW<float, 7, 7> _DepthwiseConv2dNCHW<float, 7, 7>
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, nthreads,
C, H, W, C, H, W,
out_h, out_w, out_h, out_w,
...@@ -183,8 +183,8 @@ template <> void DepthwiseConv2d<float, CUDAContext>( ...@@ -183,8 +183,8 @@ template <> void DepthwiseConv2d<float, CUDAContext>(
); );
} else { } else {
_DepthwiseConv2dNCHW<float, -1, -1> _DepthwiseConv2dNCHW<float, -1, -1>
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, nthreads,
C, H, W, C, H, W,
out_h, out_w, out_h, out_w,
...@@ -198,8 +198,8 @@ template <> void DepthwiseConv2d<float, CUDAContext>( ...@@ -198,8 +198,8 @@ template <> void DepthwiseConv2d<float, CUDAContext>(
} else if (data_format == "NHWC") { } else if (data_format == "NHWC") {
if (kernel_h == 3 && kernel_w == 3) { if (kernel_h == 3 && kernel_w == 3) {
_DepthwiseConv2dNHWC<float, 3, 3> _DepthwiseConv2dNHWC<float, 3, 3>
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, nthreads,
C, H, W, C, H, W,
out_h, out_w, out_h, out_w,
...@@ -211,8 +211,8 @@ template <> void DepthwiseConv2d<float, CUDAContext>( ...@@ -211,8 +211,8 @@ template <> void DepthwiseConv2d<float, CUDAContext>(
); );
} else if (kernel_h == 5 && kernel_w == 5) { } else if (kernel_h == 5 && kernel_w == 5) {
_DepthwiseConv2dNHWC<float, 5, 5> _DepthwiseConv2dNHWC<float, 5, 5>
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, nthreads,
C, H, W, C, H, W,
out_h, out_w, out_h, out_w,
...@@ -224,8 +224,8 @@ template <> void DepthwiseConv2d<float, CUDAContext>( ...@@ -224,8 +224,8 @@ template <> void DepthwiseConv2d<float, CUDAContext>(
); );
} else if (kernel_h == 7 && kernel_w == 7) { } else if (kernel_h == 7 && kernel_w == 7) {
_DepthwiseConv2dNHWC<float, 7, 7> _DepthwiseConv2dNHWC<float, 7, 7>
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, nthreads,
C, H, W, C, H, W,
out_h, out_w, out_h, out_w,
...@@ -237,8 +237,8 @@ template <> void DepthwiseConv2d<float, CUDAContext>( ...@@ -237,8 +237,8 @@ template <> void DepthwiseConv2d<float, CUDAContext>(
); );
} else { } else {
_DepthwiseConv2dNHWC<float, -1, -1> _DepthwiseConv2dNHWC<float, -1, -1>
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, nthreads,
C, H, W, C, H, W,
out_h, out_w, out_h, out_w,
...@@ -394,8 +394,8 @@ template <> void DepthwiseConv2dGrad<float, CUDAContext>( ...@@ -394,8 +394,8 @@ template <> void DepthwiseConv2dGrad<float, CUDAContext>(
if (data_format == "NCHW") { if (data_format == "NCHW") {
if (kernel_h == 3 && kernel_w == 3) { if (kernel_h == 3 && kernel_w == 3) {
_DepthwiseConv2dGradNCHW<float, 3, 3> _DepthwiseConv2dGradNCHW<float, 3, 3>
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, nthreads,
C, H, W, C, H, W,
out_h, out_w, out_h, out_w,
...@@ -407,8 +407,8 @@ template <> void DepthwiseConv2dGrad<float, CUDAContext>( ...@@ -407,8 +407,8 @@ template <> void DepthwiseConv2dGrad<float, CUDAContext>(
); );
} else if (kernel_h == 5 && kernel_w == 5) { } else if (kernel_h == 5 && kernel_w == 5) {
_DepthwiseConv2dGradNCHW<float, 5, 5> _DepthwiseConv2dGradNCHW<float, 5, 5>
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, nthreads,
C, H, W, C, H, W,
out_h, out_w, out_h, out_w,
...@@ -420,8 +420,8 @@ template <> void DepthwiseConv2dGrad<float, CUDAContext>( ...@@ -420,8 +420,8 @@ template <> void DepthwiseConv2dGrad<float, CUDAContext>(
); );
} else if (kernel_h == 7 && kernel_w == 7) { } else if (kernel_h == 7 && kernel_w == 7) {
_DepthwiseConv2dGradNCHW<float, 7, 7> _DepthwiseConv2dGradNCHW<float, 7, 7>
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, nthreads,
C, H, W, C, H, W,
out_h, out_w, out_h, out_w,
...@@ -433,8 +433,8 @@ template <> void DepthwiseConv2dGrad<float, CUDAContext>( ...@@ -433,8 +433,8 @@ template <> void DepthwiseConv2dGrad<float, CUDAContext>(
); );
} else { } else {
_DepthwiseConv2dGradNCHW<float, -1, -1> _DepthwiseConv2dGradNCHW<float, -1, -1>
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, nthreads,
C, H, W, C, H, W,
out_h, out_w, out_h, out_w,
...@@ -448,8 +448,8 @@ template <> void DepthwiseConv2dGrad<float, CUDAContext>( ...@@ -448,8 +448,8 @@ template <> void DepthwiseConv2dGrad<float, CUDAContext>(
} else if (data_format == "NHWC") { } else if (data_format == "NHWC") {
if (kernel_h == 3 && kernel_w == 3) { if (kernel_h == 3 && kernel_w == 3) {
_DepthwiseConv2dGradNHWC<float, 3, 3> _DepthwiseConv2dGradNHWC<float, 3, 3>
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, nthreads,
C, H, W, C, H, W,
out_h, out_w, out_h, out_w,
...@@ -461,8 +461,8 @@ template <> void DepthwiseConv2dGrad<float, CUDAContext>( ...@@ -461,8 +461,8 @@ template <> void DepthwiseConv2dGrad<float, CUDAContext>(
); );
} else if (kernel_h == 5 && kernel_w == 5) { } else if (kernel_h == 5 && kernel_w == 5) {
_DepthwiseConv2dGradNHWC<float, 5, 5> _DepthwiseConv2dGradNHWC<float, 5, 5>
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, nthreads,
C, H, W, C, H, W,
out_h, out_w, out_h, out_w,
...@@ -474,8 +474,8 @@ template <> void DepthwiseConv2dGrad<float, CUDAContext>( ...@@ -474,8 +474,8 @@ template <> void DepthwiseConv2dGrad<float, CUDAContext>(
); );
} else if (kernel_h == 7 && kernel_w == 7) { } else if (kernel_h == 7 && kernel_w == 7) {
_DepthwiseConv2dGradNHWC<float, 7, 7> _DepthwiseConv2dGradNHWC<float, 7, 7>
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, nthreads,
C, H, W, C, H, W,
out_h, out_w, out_h, out_w,
...@@ -487,8 +487,8 @@ template <> void DepthwiseConv2dGrad<float, CUDAContext>( ...@@ -487,8 +487,8 @@ template <> void DepthwiseConv2dGrad<float, CUDAContext>(
); );
} else { } else {
_DepthwiseConv2dGradNHWC<float, -1, -1> _DepthwiseConv2dGradNHWC<float, -1, -1>
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, nthreads,
C, H, W, C, H, W,
out_h, out_w, out_h, out_w,
...@@ -634,8 +634,8 @@ template <> void DepthwiseConv2dWGrad<float, CUDAContext>( ...@@ -634,8 +634,8 @@ template <> void DepthwiseConv2dWGrad<float, CUDAContext>(
auto nblocks = C * kernel_h * kernel_w; auto nblocks = C * kernel_h * kernel_w;
if (data_format == "NCHW") { if (data_format == "NCHW") {
_DepthwiseConv2dWGradNCHW _DepthwiseConv2dWGradNCHW
<< < nblocks, nthreads, <<< nblocks, nthreads,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
N, C, H, W, N, C, H, W,
out_h, out_w, out_h, out_w,
kernel_h, kernel_w, kernel_h, kernel_w,
...@@ -646,8 +646,8 @@ template <> void DepthwiseConv2dWGrad<float, CUDAContext>( ...@@ -646,8 +646,8 @@ template <> void DepthwiseConv2dWGrad<float, CUDAContext>(
); );
} else if (data_format == "NHWC") { } else if (data_format == "NHWC") {
_DepthwiseConv2dWGradNHWC _DepthwiseConv2dWGradNHWC
<< < nblocks, nthreads, <<< nblocks, nthreads,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
N, C, H, W, N, C, H, W,
out_h, out_w, out_h, out_w,
kernel_h, kernel_w, kernel_h, kernel_w,
......
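Note: the kernel hunks above only change the launch spelling from "<< < ... >> >" to the conventional "<<< ... >>>"; the launch configuration itself (grid, block, dynamic shared memory, stream) is unchanged. Below is a minimal standalone CUDA sketch of that four-argument launch pattern; MY_THREADS, MY_BLOCKS and the _Scale kernel are illustrative stand-ins, not the repository's CUDA_BLOCKS/CUDA_THREADS helpers.

#include <cuda_runtime.h>

#define MY_THREADS 256
#define MY_BLOCKS(n) (((n) + MY_THREADS - 1) / MY_THREADS)

// A trivial elementwise kernel standing in for the conv/pool kernels above.
__global__ void _Scale(const int nthreads, const float alpha,
                       const float* x, float* y) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < nthreads) y[i] = alpha * x[i];
}

int main() {
  const int n = 1 << 20;
  float *x = nullptr, *y = nullptr;
  cudaMalloc(&x, n * sizeof(float));
  cudaMalloc(&y, n * sizeof(float));
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  // grid, block, 0 bytes of dynamic shared memory, explicit stream --
  // the same configuration shape as "<<< CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream() >>>".
  _Scale<<<MY_BLOCKS(n), MY_THREADS, 0, stream>>>(n, 2.f, x, y);
  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
  cudaFree(x);
  cudaFree(y);
  return 0;
}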
...@@ -77,16 +77,12 @@ template <> void DropBlock2d<CUDAContext>( ...@@ -77,16 +77,12 @@ template <> void DropBlock2d<CUDAContext>(
int* mask, int* mask,
CUDAContext* ctx) { CUDAContext* ctx) {
auto nthreads = N * C * seed_h * seed_w; auto nthreads = N * C * seed_h * seed_w;
math::RandomUniform( math::RandomUniform(nthreads, 0.f, 1.f, seed, ctx);
nthreads,
0.f, float(UINT_MAX),
seed, ctx
);
auto mask_thresh = (uint32_t)(UINT_MAX * gamma); auto mask_thresh = (uint32_t)(UINT_MAX * gamma);
if (data_format == "NCHW") { if (data_format == "NCHW") {
_DropBlock2dNCHW _DropBlock2dNCHW
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, nthreads,
C, H, W, C, H, W,
seed_h, seed_w, seed_h, seed_w,
...@@ -96,8 +92,8 @@ template <> void DropBlock2d<CUDAContext>( ...@@ -96,8 +92,8 @@ template <> void DropBlock2d<CUDAContext>(
); );
} else if(data_format == "NHWC") { } else if(data_format == "NHWC") {
_DropBlock2dNHWC _DropBlock2dNHWC
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, nthreads,
C, H, W, C, H, W,
seed_h, seed_w, seed_h, seed_w,
......
...@@ -81,15 +81,15 @@ template <> void NNResize<float, CUDAContext>( ...@@ -81,15 +81,15 @@ template <> void NNResize<float, CUDAContext>(
auto scale_w = (float)W / (float)out_w; auto scale_w = (float)W / (float)out_w;
if (data_format == "NCHW") { if (data_format == "NCHW") {
_NNResizeNCHW _NNResizeNCHW
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, C, H, W, out_h, out_w, nthreads, C, H, W, out_h, out_w,
scale_h, scale_w, x, y scale_h, scale_w, x, y
); );
} else if(data_format == "NHWC") { } else if(data_format == "NHWC") {
_NNResizeNHWC _NNResizeNHWC
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, C, H, W, out_h, out_w, nthreads, C, H, W, out_h, out_w,
scale_h, scale_w, x, y scale_h, scale_w, x, y
); );
...@@ -116,8 +116,8 @@ template <> void NNResize<float16, CUDAContext>( ...@@ -116,8 +116,8 @@ template <> void NNResize<float16, CUDAContext>(
auto scale_w = (float)W / (float)out_w; auto scale_w = (float)W / (float)out_w;
if (data_format == "NCHW") { if (data_format == "NCHW") {
_NNResizeNCHW _NNResizeNCHW
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, C, H, W, nthreads, C, H, W,
out_h, out_w, scale_h, scale_w, out_h, out_w, scale_h, scale_w,
reinterpret_cast<const half*>(x), reinterpret_cast<const half*>(x),
...@@ -125,8 +125,8 @@ template <> void NNResize<float16, CUDAContext>( ...@@ -125,8 +125,8 @@ template <> void NNResize<float16, CUDAContext>(
); );
} else if(data_format == "NHWC") { } else if(data_format == "NHWC") {
_NNResizeNHWC _NNResizeNHWC
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, C, H, W, nthreads, C, H, W,
out_h, out_w, scale_h, scale_w, out_h, out_w, scale_h, scale_w,
reinterpret_cast<const half*>(x), reinterpret_cast<const half*>(x),
...@@ -209,15 +209,15 @@ template <> void NNResizeGrad<float, CUDAContext>( ...@@ -209,15 +209,15 @@ template <> void NNResizeGrad<float, CUDAContext>(
auto scale_w = (float)W / (float)out_w; auto scale_w = (float)W / (float)out_w;
if (data_format == "NCHW") { if (data_format == "NCHW") {
_NNResizeGradNCHW _NNResizeGradNCHW
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, C, H, W, out_h, out_w, nthreads, C, H, W, out_h, out_w,
scale_h, scale_w, dy, dx scale_h, scale_w, dy, dx
); );
} else if(data_format == "NHWC") { } else if(data_format == "NHWC") {
_NNResizeGradNHWC _NNResizeGradNHWC
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, C, H, W, out_h, out_w, nthreads, C, H, W, out_h, out_w,
scale_h, scale_w, dy, dx scale_h, scale_w, dy, dx
); );
......
...@@ -120,8 +120,8 @@ template<> void MaxPool2d<float, CUDAContext>( ...@@ -120,8 +120,8 @@ template<> void MaxPool2d<float, CUDAContext>(
auto nthreads = N * C * pool_h * pool_w; auto nthreads = N * C * pool_h * pool_w;
if (data_format == "NCHW") { if (data_format == "NCHW") {
_MaxPool2dNCHW _MaxPool2dNCHW
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, nthreads,
C, H, W, C, H, W,
pool_h, pool_w, pool_h, pool_w,
...@@ -132,8 +132,8 @@ template<> void MaxPool2d<float, CUDAContext>( ...@@ -132,8 +132,8 @@ template<> void MaxPool2d<float, CUDAContext>(
); );
} else if (data_format == "NHWC") { } else if (data_format == "NHWC") {
_MaxPool2dNHWC _MaxPool2dNHWC
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, nthreads,
C, H, W, C, H, W,
pool_h, pool_w, pool_h, pool_w,
...@@ -256,8 +256,8 @@ template<> void AvgPool2d<float, CUDAContext>( ...@@ -256,8 +256,8 @@ template<> void AvgPool2d<float, CUDAContext>(
auto nthreads = N * C * pool_h * pool_w; auto nthreads = N * C * pool_h * pool_w;
if (data_format == "NCHW") { if (data_format == "NCHW") {
_AvgPool2dNCHW _AvgPool2dNCHW
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, nthreads,
C, H, W, C, H, W,
pool_h, pool_w, pool_h, pool_w,
...@@ -268,8 +268,8 @@ template<> void AvgPool2d<float, CUDAContext>( ...@@ -268,8 +268,8 @@ template<> void AvgPool2d<float, CUDAContext>(
); );
} else if (data_format == "NHWC") { } else if (data_format == "NHWC") {
_AvgPool2dNHWC _AvgPool2dNHWC
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, nthreads,
C, H, W, C, H, W,
pool_h, pool_w, pool_h, pool_w,
...@@ -392,8 +392,8 @@ template<> void MaxPool2dGrad<float, CUDAContext>( ...@@ -392,8 +392,8 @@ template<> void MaxPool2dGrad<float, CUDAContext>(
auto nthreads = N * C * H * W; auto nthreads = N * C * H * W;
if (data_format == "NCHW") { if (data_format == "NCHW") {
_MaxPool2dGrad_NCHW _MaxPool2dGrad_NCHW
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, nthreads,
C, H, W, C, H, W,
pool_h, pool_w, pool_h, pool_w,
...@@ -404,8 +404,8 @@ template<> void MaxPool2dGrad<float, CUDAContext>( ...@@ -404,8 +404,8 @@ template<> void MaxPool2dGrad<float, CUDAContext>(
); );
} else if (data_format == "NHWC") { } else if (data_format == "NHWC") {
_MaxPool2dGradNHWC _MaxPool2dGradNHWC
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, nthreads,
C, H, W, C, H, W,
pool_h, pool_w, pool_h, pool_w,
...@@ -531,8 +531,8 @@ template<> void AvgPool2dGrad<float, CUDAContext>( ...@@ -531,8 +531,8 @@ template<> void AvgPool2dGrad<float, CUDAContext>(
auto nthreads = N * C * H * W; auto nthreads = N * C * H * W;
if (data_format == "NCHW") { if (data_format == "NCHW") {
_AvgPool2dGradNCHW _AvgPool2dGradNCHW
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, nthreads,
C, H, W, C, H, W,
pool_h, pool_w, pool_h, pool_w,
...@@ -543,8 +543,8 @@ template<> void AvgPool2dGrad<float, CUDAContext>( ...@@ -543,8 +543,8 @@ template<> void AvgPool2dGrad<float, CUDAContext>(
); );
} else if (data_format == "NHWC") { } else if (data_format == "NHWC") {
_AvgPool2dGradNHWC _AvgPool2dGradNHWC
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, nthreads,
C, H, W, C, H, W,
pool_h, pool_w, pool_h, pool_w,
......
...@@ -132,8 +132,8 @@ template<> void ROIAlign<float, CUDAContext>( ...@@ -132,8 +132,8 @@ template<> void ROIAlign<float, CUDAContext>(
CUDAContext* ctx) { CUDAContext* ctx) {
auto nthreads = num_rois * C * pool_h * pool_w; auto nthreads = num_rois * C * pool_h * pool_w;
_ROIAlign _ROIAlign
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, nthreads,
C, H, W, C, H, W,
pool_h, pool_w, pool_h, pool_w,
...@@ -283,8 +283,8 @@ template<> void ROIAlignGrad<float, CUDAContext>( ...@@ -283,8 +283,8 @@ template<> void ROIAlignGrad<float, CUDAContext>(
CUDAContext* ctx) { CUDAContext* ctx) {
auto nthreads = num_rois * C * pool_h * pool_w; auto nthreads = num_rois * C * pool_h * pool_w;
_ROIAlignGrad _ROIAlignGrad
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, nthreads,
C, H, W, C, H, W,
pool_h, pool_w, pool_h, pool_w,
......
...@@ -134,8 +134,8 @@ template<> void ROIAlign<float16, CUDAContext>( ...@@ -134,8 +134,8 @@ template<> void ROIAlign<float16, CUDAContext>(
CUDAContext* ctx) { CUDAContext* ctx) {
auto nthreads = num_rois * C * pool_h * pool_w; auto nthreads = num_rois * C * pool_h * pool_w;
_ROIAlignHalf _ROIAlignHalf
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> > 0, ctx->cuda_stream() >>>
(nthreads, C, H, W, pool_h, pool_w, (nthreads, C, H, W, pool_h, pool_w,
sampling_ratio, spatial_scale, sampling_ratio, spatial_scale,
reinterpret_cast<const half*>(x), rois, reinterpret_cast<const half*>(x), rois,
......
...@@ -92,8 +92,8 @@ template<> void ROIPool<float, CUDAContext>( ...@@ -92,8 +92,8 @@ template<> void ROIPool<float, CUDAContext>(
CUDAContext* ctx) { CUDAContext* ctx) {
auto nthreads = num_rois * C * pool_h * pool_w; auto nthreads = num_rois * C * pool_h * pool_w;
_ROIPool _ROIPool
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, nthreads,
C, H, W, C, H, W,
pool_h, pool_w, pool_h, pool_w,
...@@ -185,8 +185,8 @@ template<> void ROIPool<float16, CUDAContext>( ...@@ -185,8 +185,8 @@ template<> void ROIPool<float16, CUDAContext>(
CUDAContext* ctx) { CUDAContext* ctx) {
auto nthreads = num_rois * C * pool_h * pool_w; auto nthreads = num_rois * C * pool_h * pool_w;
_ROIPoolHalf _ROIPoolHalf
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, nthreads,
C, H, W, C, H, W,
pool_h, pool_w, pool_h, pool_w,
...@@ -286,8 +286,8 @@ template<> void ROIPoolGrad<float, CUDAContext>( ...@@ -286,8 +286,8 @@ template<> void ROIPoolGrad<float, CUDAContext>(
CUDAContext* ctx) { CUDAContext* ctx) {
auto nthreads = N * C * H * W; auto nthreads = N * C * H * W;
_ROIPoolGrad _ROIPoolGrad
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >>>(
nthreads, nthreads,
num_rois, num_rois,
C, H, W, C, H, W,
......
...@@ -180,9 +180,9 @@ ONNXBackend::get_special_nodes() const { ...@@ -180,9 +180,9 @@ ONNXBackend::get_special_nodes() const {
}; return kSpecialNodes; }; return kSpecialNodes;
} }
const Map< string, Map<string, string> >& const Map<string, Map<string, string>>&
ONNXBackend::get_node_renamed_attrs() const { ONNXBackend::get_node_renamed_attrs() const {
const static Map< string, Map<string, string> > const static Map<string, Map<string, string>>
kPerNodeRenamedAttrs = { kPerNodeRenamedAttrs = {
{ "Gemm", { { "transB", "transW" } } }, { "Gemm", { { "transB", "transW" } } },
{ "BatchNormalization", { { "epsilon", "eps" } } }, { "BatchNormalization", { { "epsilon", "eps" } } },
......
...@@ -221,7 +221,7 @@ class ONNXBackend { ...@@ -221,7 +221,7 @@ class ONNXBackend {
const Map<string, SpecialNodeConverter>& get_special_nodes() const; const Map<string, SpecialNodeConverter>& get_special_nodes() const;
const Map<string, string>& get_renamed_attrs() const; const Map<string, string>& get_renamed_attrs() const;
const Map< string, Map<string, string> >& get_node_renamed_attrs() const; const Map<string, Map<string, string>>& get_node_renamed_attrs() const;
}; };
} // namespace onnx } // namespace onnx
......
...@@ -77,15 +77,8 @@ template <class Context> ...@@ -77,15 +77,8 @@ template <class Context>
void CuDNNDropoutOp<Context>::RunOnDevice() { void CuDNNDropoutOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0)); Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) { DispatchHelper<TensorTypes
RunImpl<float>(); <float, float16>>::Call(this, X(0));
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
} }
template <class Context> template <typename T> template <class Context> template <typename T>
...@@ -147,15 +140,8 @@ template <class Context> ...@@ -147,15 +140,8 @@ template <class Context>
void CuDNNDropoutGradientOp<Context>::RunOnDevice() { void CuDNNDropoutGradientOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0)); Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) { DispatchHelper<TensorTypes
RunImpl<float>(); <float, float16>>::Call(this, X(0));
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
} }
DEPLOY_CUDNN(Dropout); DEPLOY_CUDNN(Dropout);
......
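Note: from this file on, the commit replaces the hand-written per-dtype if/else chains with a single DispatchHelper<TensorTypes<...>>::Call(this, X(0)) that picks the matching RunImpl<T>() at runtime. A minimal standalone C++ sketch of that dispatch pattern follows; ToyOp, MyTensorTypes and MyDispatch are illustrative names, and std::type_index stands in for the real TypeMeta-based matching.

#include <iostream>
#include <typeindex>
#include <typeinfo>

template <typename... Types> struct MyTensorTypes {};

template <typename Sizes> struct MyDispatch;

// Peel one candidate type at a time; call RunImpl<T>() on the first match.
template <typename T, typename... Rest>
struct MyDispatch<MyTensorTypes<T, Rest...>> {
  template <typename Op>
  static void Call(Op* op, const std::type_index& type) {
    if (type == std::type_index(typeid(T))) return op->template RunImpl<T>();
    MyDispatch<MyTensorTypes<Rest...>>::Call(op, type);
  }
};

// Exhausted the candidate list: the runtime type is unsupported.
template <>
struct MyDispatch<MyTensorTypes<>> {
  template <typename Op>
  static void Call(Op*, const std::type_index&) {
    std::cerr << "Unsupported tensor type.\n";
  }
};

struct ToyOp {
  template <typename T>
  void RunImpl() { std::cout << "RunImpl<" << typeid(T).name() << ">()\n"; }
};

int main() {
  ToyOp op;
  // Dispatch on a runtime type id (e.g. the dtype stored in a tensor's meta).
  MyDispatch<MyTensorTypes<float, double>>::Call(&op, std::type_index(typeid(float)));
  return 0;
}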
...@@ -26,15 +26,8 @@ template <class Context> ...@@ -26,15 +26,8 @@ template <class Context>
void CuDNNEluOp<Context>::RunOnDevice() { void CuDNNEluOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0)); Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) { DispatchHelper<TensorTypes
RunImpl<float>(); <float, float16>>::Call(this, X(0));
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
} }
template <class Context> template <typename T> template <class Context> template <typename T>
...@@ -60,15 +53,8 @@ template <class Context> ...@@ -60,15 +53,8 @@ template <class Context>
void CuDNNEluGradientOp<Context>::RunOnDevice() { void CuDNNEluGradientOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0)); Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) { DispatchHelper<TensorTypes
RunImpl<float>(); <float, float16>>::Call(this, X(0));
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
} }
DEPLOY_CUDNN(Elu); DEPLOY_CUDNN(Elu);
......
...@@ -40,15 +40,8 @@ void CuDNNReluOp<Context>::RunOnDevice() { ...@@ -40,15 +40,8 @@ void CuDNNReluOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0)); Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) { DispatchHelper<TensorTypes
RunImpl<float>(); <float, float16>>::Call(this, X(0));
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
} }
template <class Context> template <typename T> template <class Context> template <typename T>
...@@ -92,15 +85,8 @@ void CuDNNReluGradientOp<Context>::RunOnDevice() { ...@@ -92,15 +85,8 @@ void CuDNNReluGradientOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0)); Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) { DispatchHelper<TensorTypes
RunImpl<float>(); <float, float16>>::Call(this, X(0));
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
} }
DEPLOY_CUDNN(Relu); DEPLOY_CUDNN(Relu);
......
...@@ -35,15 +35,8 @@ template <class Context> ...@@ -35,15 +35,8 @@ template <class Context>
void CuDNNSigmoidOp<Context>::RunOnDevice() { void CuDNNSigmoidOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0)); Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) { DispatchHelper<TensorTypes
RunImpl<float>(); <float, float16>>::Call(this, X(0));
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
} }
template <class Context> template <typename T> template <class Context> template <typename T>
...@@ -82,15 +75,8 @@ template <class Context> ...@@ -82,15 +75,8 @@ template <class Context>
void CuDNNSigmoidGradientOp<Context>::RunOnDevice() { void CuDNNSigmoidGradientOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0)); Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) { DispatchHelper<TensorTypes
RunImpl<float>(); <float, float16>>::Call(this, X(0));
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
} }
DEPLOY_CUDNN(Sigmoid); DEPLOY_CUDNN(Sigmoid);
......
...@@ -45,15 +45,8 @@ void CuDNNSoftmaxOp<Context>::RunOnDevice() { ...@@ -45,15 +45,8 @@ void CuDNNSoftmaxOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0)); Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) { DispatchHelper<TensorTypes
RunImpl<float>(); <float, float16>>::Call(this, X(0));
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
} }
template <class Context> template <typename T> template <class Context> template <typename T>
...@@ -91,15 +84,8 @@ void CuDNNSoftmaxGradientOp<Context>::RunOnDevice() { ...@@ -91,15 +84,8 @@ void CuDNNSoftmaxGradientOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0)); Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) { DispatchHelper<TensorTypes
RunImpl<float>(); <float, float16>>::Call(this, X(0));
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
} }
DEPLOY_CUDNN(Softmax); DEPLOY_CUDNN(Softmax);
......
...@@ -35,15 +35,8 @@ template <class Context> ...@@ -35,15 +35,8 @@ template <class Context>
void CuDNNTanhOp<Context>::RunOnDevice() { void CuDNNTanhOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0)); Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) { DispatchHelper<TensorTypes
RunImpl<float>(); <float, float16>>::Call(this, X(0));
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
} }
template <class Context> template <typename T> template <class Context> template <typename T>
...@@ -82,15 +75,8 @@ template <class Context> ...@@ -82,15 +75,8 @@ template <class Context>
void CuDNNTanhGradientOp<Context>::RunOnDevice() { void CuDNNTanhGradientOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0)); Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) { DispatchHelper<TensorTypes
RunImpl<float>(); <float, float16>>::Call(this, X(0));
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
} }
DEPLOY_CUDNN(Tanh); DEPLOY_CUDNN(Tanh);
......
...@@ -44,15 +44,8 @@ template <class Context> ...@@ -44,15 +44,8 @@ template <class Context>
void DropoutOp<Context>::RunOnDevice() { void DropoutOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0)); Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) { DispatchHelper<TensorTypes
RunImpl<float>(); <float, float16>>::Call(this, X(0));
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
} }
template <class Context> template <typename T> template <class Context> template <typename T>
...@@ -83,15 +76,8 @@ template <class Context> ...@@ -83,15 +76,8 @@ template <class Context>
void DropoutGradientOp<Context>::RunOnDevice() { void DropoutGradientOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0)); Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) { DispatchHelper<TensorTypes
RunImpl<float>(); <float, float16>>::Call(this, X(0));
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
} }
DEPLOY_CPU(Dropout); DEPLOY_CPU(Dropout);
......
...@@ -52,15 +52,8 @@ void DropPathOp<Context>::RunOnDevice() { ...@@ -52,15 +52,8 @@ void DropPathOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0)); Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) { DispatchHelper<TensorTypes
RunImpl<float>(); <float, float16>>::Call(this, X(0));
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
} }
template <class Context> template <typename T> template <class Context> template <typename T>
...@@ -97,15 +90,8 @@ void DropPathGradientOp<Context>::RunOnDevice() { ...@@ -97,15 +90,8 @@ void DropPathGradientOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0)); Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) { DispatchHelper<TensorTypes
RunImpl<float>(); <float, float16>>::Call(this, X(0));
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
} }
DEPLOY_CPU(DropPath); DEPLOY_CPU(DropPath);
......
...@@ -20,13 +20,8 @@ template <class Context> ...@@ -20,13 +20,8 @@ template <class Context>
void EluOp<Context>::RunOnDevice() { void EluOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0)); Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) { DispatchHelper<TensorTypes
RunImpl<float>(); <float>>::Call(this, X(0));
} else {
LOG(FATAL) << DTypeString(
X(0), { "float32" }
);
}
} }
template <class Context> template <typename T> template <class Context> template <typename T>
...@@ -46,13 +41,8 @@ template <class Context> ...@@ -46,13 +41,8 @@ template <class Context>
void EluGradientOp<Context>::RunOnDevice() { void EluGradientOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0)); Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) { DispatchHelper<TensorTypes
RunImpl<float>(); <float>>::Call(this, X(0));
} else {
LOG(FATAL) << DTypeString(
X(0), { "float32" }
);
}
} }
DEPLOY_CPU(Elu); DEPLOY_CPU(Elu);
......
...@@ -40,13 +40,8 @@ void PReluOp<Context>::RunOnDevice() { ...@@ -40,13 +40,8 @@ void PReluOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0)); Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) { DispatchHelper<TensorTypes
RunImpl<float>(); <float>>::Call(this, X(0));
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32" }
);
}
} }
template <class Context> template <typename T> template <class Context> template <typename T>
...@@ -98,13 +93,8 @@ void PReluGradientOp<Context>::RunOnDevice() { ...@@ -98,13 +93,8 @@ void PReluGradientOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0)); Y(0)->ReshapeLike(X(0));
Y(1)->ReshapeLike(X(1)); Y(1)->ReshapeLike(X(1));
if (XIsType(X(0), float)) { DispatchHelper<TensorTypes
RunImpl<float>(); <float>>::Call(this, X(0));
} else {
LOG(FATAL) << DTypeString(
X(0), { "float32" }
);
}
} }
DEPLOY_CPU(PRelu); DEPLOY_CPU(PRelu);
......
...@@ -20,15 +20,8 @@ template <class Context> ...@@ -20,15 +20,8 @@ template <class Context>
void ReluOp<Context>::RunOnDevice() { void ReluOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0)); Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) { DispatchHelper<TensorTypes
RunImpl<float>(); <float, float16>>::Call(this, X(0));
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
} }
template <class Context> template <typename T> template <class Context> template <typename T>
...@@ -48,15 +41,8 @@ template <class Context> ...@@ -48,15 +41,8 @@ template <class Context>
void ReluGradientOp<Context>::RunOnDevice() { void ReluGradientOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0)); Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) { DispatchHelper<TensorTypes
RunImpl<float>(); <float, float16>>::Call(this, X(0));
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
} }
DEPLOY_CPU(Relu); DEPLOY_CPU(Relu);
......
...@@ -19,15 +19,8 @@ template <class Context> ...@@ -19,15 +19,8 @@ template <class Context>
void SEluOp<Context>::RunOnDevice() { void SEluOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0)); Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) { DispatchHelper<TensorTypes
RunImpl<float>(); <float, float16>>::Call(this, X(0));
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
} }
template <class Context> template <typename T> template <class Context> template <typename T>
...@@ -47,15 +40,8 @@ template <class Context> ...@@ -47,15 +40,8 @@ template <class Context>
void SEluGradientOp<Context>::RunOnDevice() { void SEluGradientOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0)); Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) { DispatchHelper<TensorTypes
RunImpl<float>(); <float, float16>>::Call(this, X(0));
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
} }
DEPLOY_CPU(SElu); DEPLOY_CPU(SElu);
......
...@@ -15,13 +15,8 @@ template <class Context> ...@@ -15,13 +15,8 @@ template <class Context>
void SigmoidOp<Context>::RunOnDevice() { void SigmoidOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0)); Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) { DispatchHelper<TensorTypes
RunImpl<float>(); <float>>::Call(this, X(0));
} else {
LOG(FATAL) << DTypeString(
X(0), { "float32" }
);
}
} }
template <class Context> template <typename T> template <class Context> template <typename T>
...@@ -41,13 +36,8 @@ template <class Context> ...@@ -41,13 +36,8 @@ template <class Context>
void SigmoidGradientOp<Context>::RunOnDevice() { void SigmoidGradientOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0)); Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) { DispatchHelper<TensorTypes
RunImpl<float>(); <float>>::Call(this, X(0));
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32" }
);
}
} }
DEPLOY_CPU(Sigmoid); DEPLOY_CPU(Sigmoid);
......
...@@ -43,13 +43,8 @@ void SoftmaxOp<Context>::RunOnDevice() { ...@@ -43,13 +43,8 @@ void SoftmaxOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0)); Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) { DispatchHelper<TensorTypes
RunImpl<float>(); <float>>::Call(this, X(0));
} else {
LOG(FATAL) << DTypeString(
X(0), { "float32" }
);
}
} }
template <class Context> template <typename T> template <class Context> template <typename T>
...@@ -86,13 +81,8 @@ void SoftmaxGradientOp<Context>::RunOnDevice() { ...@@ -86,13 +81,8 @@ void SoftmaxGradientOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0)); Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) { DispatchHelper<TensorTypes
RunImpl<float>(); <float>>::Call(this, X(0));
} else {
LOG(FATAL) << DTypeString(
X(0), { "float32" }
);
}
} }
DEPLOY_CPU(Softmax); DEPLOY_CPU(Softmax);
......
...@@ -15,13 +15,8 @@ template <class Context> ...@@ -15,13 +15,8 @@ template <class Context>
void TanhOp<Context>::RunOnDevice() { void TanhOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0)); Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) { DispatchHelper<TensorTypes
RunImpl<float>(); <float>>::Call(this, X(0));
} else {
LOG(FATAL) << DTypeString(
X(0), { "float32" }
);
}
} }
template <class Context> template <typename T> template <class Context> template <typename T>
...@@ -41,13 +36,8 @@ template <class Context> ...@@ -41,13 +36,8 @@ template <class Context>
void TanhGradientOp<Context>::RunOnDevice() { void TanhGradientOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0)); Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) { DispatchHelper<TensorTypes
RunImpl<float>(); <float>>::Call(this, X(0));
} else {
LOG(FATAL) << DTypeString(
X(0), { "float32" }
);
}
} }
DEPLOY_CPU(Tanh); DEPLOY_CPU(Tanh);
......
...@@ -46,15 +46,8 @@ void AffineOp<Context>::RunOnDevice() { ...@@ -46,15 +46,8 @@ void AffineOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0)); Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) { DispatchHelper<TensorTypes
RunImpl<float>(); <float, float16>>::Call(this, X(0));
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
} }
template <class Context> template <typename T> template <class Context> template <typename T>
...@@ -111,9 +104,7 @@ void AffineGradientOp<Context>::RunImpl() { ...@@ -111,9 +104,7 @@ void AffineGradientOp<Context>::RunImpl() {
} }
template <class Context> template <typename T> template <class Context> template <typename T>
void AffineGradientOp<Context>::Reduce( void AffineGradientOp<Context>::Reduce(T* x, T* y) {
T* x,
T* y) {
vec32_t dims = { vec32_t dims = {
(int)outer_dim_, (int)outer_dim_,
(int)scale_dim_, (int)scale_dim_,
...@@ -138,15 +129,8 @@ void AffineGradientOp<Context>::RunOnDevice() { ...@@ -138,15 +129,8 @@ void AffineGradientOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(-1)); Y(0)->ReshapeLike(X(-1));
if (XIsType(X(-1), float)) { DispatchHelper<TensorTypes
RunImpl<float>(); <float, float16>>::Call(this, X(-1));
} else if (XIsType(X(-1), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(-1),
{ "float32", "float16" }
);
}
} }
DEPLOY_CPU(Affine); DEPLOY_CPU(Affine);
......
...@@ -108,13 +108,6 @@ void CuDNNAffineOp<Context>::RunOnDevice() { ...@@ -108,13 +108,6 @@ void CuDNNAffineOp<Context>::RunOnDevice() {
template <class Context> template <typename DT, typename CT> template <class Context> template <typename DT, typename CT>
void CuDNNAffineGradientOp<Context>::RunImpl() { void CuDNNAffineGradientOp<Context>::RunImpl() {
this->template ResetDesc<DT>(X(-1)); this->template ResetDesc<DT>(X(-1));
scale_dim_ = X(1).count();
outer_dim_ = X(-1).count(0, axis_);
inner_dim_ = X(-1).count(axis_ + num_axes_);
dim_ = scale_dim_ * inner_dim_;
reduce_dim_ = std::max(outer_dim_, inner_dim_);
Y(0)->ReshapeLike(X(-1));
auto* alpha = X(1).template data<DT, Context>(); auto* alpha = X(1).template data<DT, Context>();
auto* dy = X(-1).template mutable_data<DT, Context>(); auto* dy = X(-1).template mutable_data<DT, Context>();
...@@ -230,9 +223,7 @@ void CuDNNAffineGradientOp<Context>::CuDNNReduce( ...@@ -230,9 +223,7 @@ void CuDNNAffineGradientOp<Context>::CuDNNReduce(
} }
template <class Context> template <typename T> template <class Context> template <typename T>
void CuDNNAffineGradientOp<Context>::Reduce( void CuDNNAffineGradientOp<Context>::Reduce(T* x, T* y) {
T* x,
T* y) {
vec32_t dims = { vec32_t dims = {
(int)outer_dim_, (int)outer_dim_,
(int)scale_dim_, (int)scale_dim_,
...@@ -248,6 +239,14 @@ void CuDNNAffineGradientOp<Context>::Reduce( ...@@ -248,6 +239,14 @@ void CuDNNAffineGradientOp<Context>::Reduce(
template <class Context> template <class Context>
void CuDNNAffineGradientOp<Context>::RunOnDevice() { void CuDNNAffineGradientOp<Context>::RunOnDevice() {
scale_dim_ = X(1).count();
outer_dim_ = X(-1).count(0, axis_);
inner_dim_ = X(-1).count(axis_ + num_axes_);
dim_ = scale_dim_ * inner_dim_;
reduce_dim_ = std::max(outer_dim_, inner_dim_);
Y(0)->ReshapeLike(X(-1));
if (XIsType(X(-1), float)) { if (XIsType(X(-1), float)) {
RunImpl<float, float>(); RunImpl<float, float>();
} else if (XIsType(X(-1), float16)) { } else if (XIsType(X(-1), float16)) {
......
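Note: the Affine hunks also hoist the type-independent setup (counting scale_dim_/outer_dim_/inner_dim_ and reshaping the output) out of the typed RunImpl and into RunOnDevice, so the typed body only touches data. A toy C++ sketch of that split, with illustrative names and a hard-coded float call where the real code dispatches on the dtype:

#include <cstdio>
#include <vector>

struct ToyAffineOp {
  std::vector<int> x_dims{2, 3, 4};  // toy input shape
  int axis = 1, num_axes = 1;
  int outer_dim = 0, scale_dim = 0, inner_dim = 0;

  template <typename T>
  void RunImpl() {
    // Typed computation only; all shape bookkeeping happened in RunOnDevice().
    std::printf("affine over %d x %d x %d\n", outer_dim, scale_dim, inner_dim);
  }

  void RunOnDevice() {
    // Type-independent setup done once, before dispatching on the dtype.
    outer_dim = x_dims[0];
    scale_dim = x_dims[axis];
    inner_dim = x_dims[axis + num_axes];
    RunImpl<float>();  // a real dispatcher would pick T from the tensor's dtype
  }
};

int main() { ToyAffineOp().RunOnDevice(); return 0; }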
...@@ -36,6 +36,13 @@ void EltwiseOp<Context>::ProdRunImpl() { ...@@ -36,6 +36,13 @@ void EltwiseOp<Context>::ProdRunImpl() {
template <class Context> template <typename T> template <class Context> template <typename T>
void EltwiseOp<Context>::RunImpl() { void EltwiseOp<Context>::RunImpl() {
if (operation_ == "SUM") SumRunImpl<T>();
else if (operation_ == "PROD") ProdRunImpl<T>();
else LOG(FATAL) << "Unknown Operation: " << operation_;
}
template <class Context>
void EltwiseOp<Context>::RunOnDevice() {
for (int i = 1; i < XSize(); i++) { for (int i = 1; i < XSize(); i++) {
CHECK(X(i).dims() == X(0).dims()) CHECK(X(i).dims() == X(0).dims())
<< "\nExcepted Input(" << i << ")'s dims as " << "\nExcepted Input(" << i << ")'s dims as "
...@@ -45,33 +52,10 @@ void EltwiseOp<Context>::RunImpl() { ...@@ -45,33 +52,10 @@ void EltwiseOp<Context>::RunImpl() {
Y(0)->ReshapeLike(X(0)); Y(0)->ReshapeLike(X(0));
if (operation_ == "SUM") SumRunImpl<T>(); DispatchHelper<TensorTypes
else if (operation_ == "PROD") ProdRunImpl<T>(); <int8_t, uint8_t, int, int64_t,
else LOG(FATAL) << "Unknown Operation: " << operation_; float16, float, double>
} >::Call(this, X(0));
template <class Context>
void EltwiseOp<Context>::RunOnDevice() {
if (XIsType(X(0), int8_t)) {
RunImpl<int8_t>();
} else if (XIsType(X(0), uint8_t)) {
RunImpl<uint8_t>();
} else if (XIsType(X(0), int)) {
RunImpl<int>();
} else if (XIsType(X(0), int64_t)) {
RunImpl<int64_t>();
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else if (XIsType(X(0), float)) {
RunImpl<float>();
} else if (XIsType(X(0), double)) {
RunImpl<double>();
} else {
LOG(FATAL) << DTypeString(X(0), {
"int8", "uint8", "int32", "int64",
"float16", "float32", "float64",
});
}
} }
template <class Context> template <typename T> template <class Context> template <typename T>
...@@ -133,26 +117,10 @@ void EltwiseGradientOp<Context>::RunImpl() { ...@@ -133,26 +117,10 @@ void EltwiseGradientOp<Context>::RunImpl() {
template <class Context> template <class Context>
void EltwiseGradientOp<Context>::RunOnDevice() { void EltwiseGradientOp<Context>::RunOnDevice() {
if (XIsType(X(0), int8_t)) { DispatchHelper<TensorTypes
RunImpl<int8_t>(); <int8_t, uint8_t, int, int64_t,
} else if (XIsType(X(0), uint8_t)) { float16, float, double>
RunImpl<uint8_t>(); >::Call(this, X(0));
} else if (XIsType(X(0), int)) {
RunImpl<int>();
} else if (XIsType(X(0), int64_t)) {
RunImpl<int64_t>();
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else if (XIsType(X(0), float)) {
RunImpl<float>();
} else if (XIsType(X(0), double)) {
RunImpl<double>();
} else {
LOG(FATAL) << DTypeString(X(0), {
"int8", "uint8", "int32", "int64",
"float16", "float32", "float64",
});
}
} }
DEPLOY_CPU(Eltwise); DEPLOY_CPU(Eltwise);
......
...@@ -15,17 +15,9 @@ template <class Context> ...@@ -15,17 +15,9 @@ template <class Context>
void ExpOp<Context>::RunOnDevice() { void ExpOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0)); Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float16)) { DispatchHelper<TensorTypes
RunImpl<float16>(); <float, float16, double>
} else if (XIsType(X(0), float)) { >::Call(this, X(0));
RunImpl<float>();
} else if (XIsType(X(0), double)) {
RunImpl<double>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float16", "float32", "float64" }
);
}
} }
template <class Context> template <typename T> template <class Context> template <typename T>
...@@ -40,17 +32,9 @@ template <class Context> ...@@ -40,17 +32,9 @@ template <class Context>
void ExpGradientOp<Context>::RunOnDevice() { void ExpGradientOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0)); Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float16)) { DispatchHelper<TensorTypes
RunImpl<float16>(); <float, float16, double>
} else if (XIsType(X(0), float)) { >::Call(this, X(0));
RunImpl<float>();
} else if (XIsType(X(0), double)) {
RunImpl<double>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float16", "float32", "float64" }
);
}
} }
DEPLOY_CPU(Exp); DEPLOY_CPU(Exp);
......