SeetaResearch / Dragon
Commit d1f714ea, authored May 15, 2019 by Ting PAN
Apply the dispatcher to RunImpl
1 parent bd84b7fd
Showing 159 changed files with 1208 additions and 2363 deletions
Dragon/include/core/common.h
Dragon/include/core/graph.h
Dragon/include/core/graph_gradient.h
Dragon/include/core/operator.h
Dragon/include/core/operator_schema.h
Dragon/include/core/types.h
Dragon/include/core/workspace.h
Dragon/include/operators/arithmetic/fully_connected_op.h
Dragon/include/operators/array/multinomial_op.h
Dragon/include/utils/caffemodel.h
Dragon/python/dragon/operators/array.py
Dragon/python/dragon/vm/torch/ops/builtin.py
Dragon/python/dragon/vm/torch/ops/modules/array.py
Dragon/python/dragon/vm/torch/tensor.py
Dragon/src/contrib/rcnn/bbox_utils.cu
Dragon/src/contrib/rcnn/bbox_utils.h
Dragon/src/core/graph.cc
Dragon/src/core/graph_gradient.cc
Dragon/src/core/graph_optimizer.cc
Dragon/src/core/operator_schema.cc
Dragon/src/kernels/activation/dropout_op_kernel.cu
Dragon/src/kernels/activation/droppath_op_kernel.cu
Dragon/src/kernels/activation/elu_op_kernel.cu
Dragon/src/kernels/activation/prelu_op_kernel.cu
Dragon/src/kernels/activation/relu_op_kernel.cu
Dragon/src/kernels/activation/selu_op_kernel.cu
Dragon/src/kernels/activation/sigmoid_op_kernel.cu
Dragon/src/kernels/activation/softmax_op_kernel.cu
Dragon/src/kernels/activation/tanh_op_kernel.cu
Dragon/src/kernels/arithmetic/affine_op_kernel.cu
Dragon/src/kernels/arithmetic/clip_op_kernel.cu
Dragon/src/kernels/arithmetic/maximum_op_kernel.cu
Dragon/src/kernels/arithmetic/minimum_op_kernel.cu
Dragon/src/kernels/arithmetic/moments_op_kernel.cu
Dragon/src/kernels/array/arange_op_kernel.cu
Dragon/src/kernels/array/argreduce_op_kernel.cc
Dragon/src/kernels/array/argreduce_op_kernel.cu
Dragon/src/kernels/array/concat_op_kernel.cu
Dragon/src/kernels/array/crop_op_kernel.cu
Dragon/src/kernels/array/index_select_op_kernel.cu
Dragon/src/kernels/array/one_hot_op_kernel.cu
Dragon/src/kernels/array/pad_op_kernel.cu
Dragon/src/kernels/array/reduce_sum_op_kernel.cu
Dragon/src/kernels/array/repeat_op_kernel.cu
Dragon/src/kernels/array/slice_op_kernel.cu
Dragon/src/kernels/array/tile_op_kernel.cu
Dragon/src/kernels/array/transpose_op_kernel.cu
Dragon/src/kernels/control_flow/assign_op_kernel.cu
Dragon/src/kernels/control_flow/compare_op_kernel.cu
Dragon/src/kernels/control_flow/masked_assign_op_kernel.cu
Dragon/src/kernels/loss/l1_loss_op_kernel.cu
Dragon/src/kernels/loss/nll_loss_op_kernel.cu
Dragon/src/kernels/loss/sigmoid_ce_loss_op_kernel.cu
Dragon/src/kernels/loss/sigmoid_focal_loss_op_kernel.cu
Dragon/src/kernels/loss/smooth_l1_loss_op_kernel.cu
Dragon/src/kernels/loss/softmax_ce_loss_op_kernel.cu
Dragon/src/kernels/loss/softmax_focal_loss_op_kernel.cu
Dragon/src/kernels/loss/sparse_softmax_ce_loss_op_kernel.cu
Dragon/src/kernels/misc/astype_op_kernel.cu
Dragon/src/kernels/misc/gradient_op_kernel.cu
Dragon/src/kernels/misc/image_data_op_kernel.cu
Dragon/src/kernels/norm/batch_norm_op_kernel.cu
Dragon/src/kernels/norm/group_norm_op_kernel.cu
Dragon/src/kernels/recurrent/lstm_cell_op_kernel.cu
Dragon/src/kernels/update/adam_update_op_kernel.cu
Dragon/src/kernels/update/mprec_update_op_kerne.cu
Dragon/src/kernels/update/nesterov_update_op_kernel.cu
Dragon/src/kernels/update/rmsprop_update_op_kernel.cu
Dragon/src/kernels/update/sgd_update_op_kernel.cu
Dragon/src/kernels/vision/bias_add_op_kernel.cu
Dragon/src/kernels/vision/bilinear_resize_op_kernel.cu
Dragon/src/kernels/vision/conv_op_kernel.cu
Dragon/src/kernels/vision/depthwise_conv_op_kernel.cu
Dragon/src/kernels/vision/drop_block_op_kernel.cu
Dragon/src/kernels/vision/nn_resize_op_kernel.cu
Dragon/src/kernels/vision/pool_op_kernel.cu
Dragon/src/kernels/vision/roi_align_op_kernel.cu
Dragon/src/kernels/vision/roi_align_op_kernel.fp16.cu
Dragon/src/kernels/vision/roi_pool_op_kernel.cu
Dragon/src/onnx/onnx_backend.cc
Dragon/src/onnx/onnx_backend.h
Dragon/src/operators/activation/cudnn_dropout_op.cc
Dragon/src/operators/activation/cudnn_elu_op.cc
Dragon/src/operators/activation/cudnn_relu_op.cc
Dragon/src/operators/activation/cudnn_sigmoid_op.cc
Dragon/src/operators/activation/cudnn_softmax_op.cc
Dragon/src/operators/activation/cudnn_tanh_op.cc
Dragon/src/operators/activation/dropout_op.cc
Dragon/src/operators/activation/droppath_op.cc
Dragon/src/operators/activation/elu_op.cc
Dragon/src/operators/activation/prelu_op.cc
Dragon/src/operators/activation/relu_op.cc
Dragon/src/operators/activation/selu_op.cc
Dragon/src/operators/activation/sigmoid_op.cc
Dragon/src/operators/activation/softmax_op.cc
Dragon/src/operators/activation/tanh_op.cc
Dragon/src/operators/arithmetic/affine_op.cc
Dragon/src/operators/arithmetic/cudnn_affine_op.cc
Dragon/src/operators/arithmetic/eltwise_op.cc
Dragon/src/operators/arithmetic/exp_op.cc
Dragon/src/operators/arithmetic/fully_connected_op.cc
Dragon/src/operators/arithmetic/gram_matrix_op.cc
Dragon/src/operators/arithmetic/log_op.cc
Dragon/src/operators/arithmetic/matmul_op.cc
Dragon/src/operators/arithmetic/maximum_op.cc
Dragon/src/operators/arithmetic/minimum_op.cc
Dragon/src/operators/arithmetic/moments_op.cc
Dragon/src/operators/arithmetic/pow_op.cc
Dragon/src/operators/arithmetic/sqrt_op.cc
Dragon/src/operators/arithmetic/square_op.cc
Dragon/src/operators/array/arange_op.cc
Dragon/src/operators/array/argreduce_op.cc
Dragon/src/operators/array/concat_op.cc
Dragon/src/operators/array/crop_op.cc
Dragon/src/operators/array/index_select_op.cc
Dragon/src/operators/array/multinomial_op.cc
Dragon/src/operators/array/one_hot_op.cc
Dragon/src/operators/array/pad_op.cc
Dragon/src/operators/array/reduce_op.cc
Dragon/src/operators/array/repeat_op.cc
Dragon/src/operators/array/slice_op.cc
Dragon/src/operators/array/stack_op.cc
Dragon/src/operators/array/tile_op.cc
Dragon/src/operators/array/transpose_op.cc
Dragon/src/operators/control_flow/assign_op.cc
Dragon/src/operators/control_flow/copy_op.cc
Dragon/src/operators/control_flow/masked_assign_op.cc
Dragon/src/operators/loss/ctc_loss_op.cc
Dragon/src/operators/loss/l1_loss_op.cc
Dragon/src/operators/loss/l2_loss_op.cc
Dragon/src/operators/loss/sigmoid_ce_loss_op.cc
Dragon/src/operators/loss/smooth_l1_loss_op.cc
Dragon/src/operators/loss/softmax_ce_loss_op.cc
Dragon/src/operators/misc/accuracy_op.cc
Dragon/src/operators/misc/gradient_op.cc
Dragon/src/operators/misc/initialize_op.cc
Dragon/src/operators/misc/python_op.cc
Dragon/src/operators/mpi/mpi_broadcast_op.cc
Dragon/src/operators/mpi/mpi_gather_op.cc
Dragon/src/operators/norm/l2_norm_op.cc
Dragon/src/operators/recurrent/cudnn_recurrent_op.cc
Dragon/src/operators/recurrent/rnn_param_op.cc
Dragon/src/operators/vision/bias_add_op.cc
Dragon/src/operators/vision/bilinear_resize_op.cc
Dragon/src/operators/vision/conv2d_op.cc
Dragon/src/operators/vision/conv2d_transpose_op.cc
Dragon/src/operators/vision/cudnn_bias_add_op.cc
Dragon/src/operators/vision/cudnn_conv2d_op.cc
Dragon/src/operators/vision/cudnn_conv2d_transpose_op.cc
Dragon/src/operators/vision/cudnn_depthwise_conv2d_op.cc
Dragon/src/operators/vision/cudnn_lrn_op.cc
Dragon/src/operators/vision/cudnn_pool2d_op.cc
Dragon/src/operators/vision/depthwise_conv2d_op.cc
Dragon/src/operators/vision/drop_block2d_op.cc
Dragon/src/operators/vision/nn_resize_op.cc
Dragon/src/operators/vision/roi_align_op.cc
Dragon/src/operators/vision/roi_pool_op.cc
Dragon/src/utils/math_functions.cu
Dragon/src/utils/math_functions.fp16.cu

Dragon/include/core/common.h
@@ -35,6 +35,7 @@
 #include "core/types.h"
 #include "proto/dragon.pb.h"
 #include "utils/string.h"
+#include "utils/logging.h"

 namespace dragon {

Dragon/include/core/graph.h
@@ -85,6 +85,8 @@ GraphBase* NewGraph(
     const GraphDef& def,
     Workspace* ws);

+/* Macros */
+
 DECLARE_REGISTRY(
     GraphRegistry,
     GraphBase,

Dragon/include/core/graph_gradient.h
@@ -43,7 +43,7 @@ class GraphGradientMaker {
     bool CheckGrad(
         const OperatorDef& forward_op,
         const Set<string>& targets,
-        vector<pair<string, int> >& gen_grads);
+        vector<pair<string, int>>& gen_grads);

     string GetOperatorName();
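
Most of the mechanical churn in this commit is the same C++11 cleanup applied file by file: before C++11, the closing ">>" of a nested template-id lexed as the right-shift operator, so a separating space was mandatory; C++11 removed the ambiguity. A minimal illustration:

    // Both declarations are valid C++11; the spaced form below was the
    // only legal spelling before C++11, which is why it appears on the
    // removed side of these hunks.
    #include <string>
    #include <utility>
    #include <vector>

    int main() {
        std::vector<std::pair<std::string, int> > old_style;  // pre-C++11 spelling
        std::vector<std::pair<std::string, int>> new_style;   // C++11 spelling
        old_style.swap(new_style);
        return 0;
    }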

Dragon/include/core/operator.h
@@ -100,7 +100,7 @@ class OperatorBase {
     /*! \brief Return the specified argument */
     const Argument& arg(const string& name) { return *(args_[name]); }

-    typedef Map<string, vector<OperatorBase*> > SubGraph;
+    typedef Map<string, vector<OperatorBase*>> SubGraph;

     /*! \brief Return the recomputing subgraph of this operator */
     SubGraph& subgraph() { return subgraph_; }

@@ -221,7 +221,7 @@ OperatorBase* NewOperator(
     const OperatorDef& def,
     Workspace* ws);

-/*! Macros */
+/* Macros */

 #define OpArg OperatorBase::Arg
 #define OpArgs OperatorBase::Args

@@ -266,7 +266,7 @@ DECLARE_REGISTRY(
     const OperatorDef&,
     Workspace*);

-/*! NVIDIA's Accelerated Library - CUDNN */
+/* NVIDIA's Accelerated Library - CUDNN */

 DECLARE_REGISTRY(
     CUDNNOperatorRegistry,

@@ -274,7 +274,7 @@ DECLARE_REGISTRY(
     const OperatorDef&,
     Workspace*);

-/*! CAMBRICON's Accelerated Library - CNML */
+/* CAMBRICON's Accelerated Library - CNML */

 DECLARE_REGISTRY(
     CNMLOperatorRegistry,

@@ -282,13 +282,60 @@ DECLARE_REGISTRY(
     const OperatorDef&,
     Workspace*);

+/* Dispatcher for Runtime Typed-Implementation */
+
+#define XIsType(x, dtype) \
+    x.template IsType<dtype>()
+
+template <typename... Types>
+struct TensorTypes {};
+
+template <typename Sizes, typename... Args>
+struct DispatchHelper;
+
+#define DEFINE_TENSOR_TYPES_DISPATCHER(TensorTypes, Impl) \
+    template <typename T, typename... Types, typename... Args> \
+    struct DispatchHelper<TensorTypes<T, Types...>, Args...> { \
+        template <typename Op> \
+        static void Call(Op* op, const TypeMeta& meta, string& types) { \
+            if (meta.Match<T>()) return op->template Impl<T, Args...>(); \
+            types += " * " + TypeToString<T>() + ",\n"; \
+            return DispatchHelper<TensorTypes<Types...>, Args...> \
+                ::Call(op, meta, types); \
+        } \
+        template <typename Op> \
+        static void Call(Op* op, const Tensor& tensor) { \
+            string types; return Call(op, tensor.meta(), types); \
+        } \
+    }; \
+    template <typename... Args> \
+    struct DispatchHelper<TensorTypes<>, Args...> { \
+        template <typename Op> \
+        static void Call(Op* op, const TypeMeta& meta, string& types) { \
+            LOG(FATAL) << "Unsupported DType: " \
+                << TypeMetaToString(meta) << "\n" \
+                << "<" << op->type() << "Op>" \
+                << " supports the following dtypes: {\n" \
+                << types << "}"; \
+        } \
+        template <typename Op> \
+        static void Call(Op* op, const Tensor& tensor) { \
+            return Call(op, tensor.meta(), ""); \
+        } \
+    };
+
+DEFINE_TENSOR_TYPES_DISPATCHER(TensorTypes, RunImpl);
+#undef DEFINE_TENSOR_TYPES_DISPATCHER
+
 /* TensorFiller */

 #define TENSOR_FILL_WITH_TYPE(tensor, shape, type) \
     if (tensor.count() == 0) { \
         CHECK(ws()->GetFiller(tensor.name())) \
             << "\nTensor(" << tensor.name() << ") is empty. \n" \
             << "may be specify a filler for it ?"; \
         tensor.Reshape(shape); \
-        unique_ptr< Filler<type, Context> > filler( \
+        unique_ptr<Filler<type, Context>> filler( \
             CreateFiller<type, Context>(*ws()->GetFiller(tensor.name()))); \
         filler->Fill(&tensor, ctx()); \
     } else { \

@@ -308,7 +355,7 @@ DECLARE_REGISTRY(
             << "\nTensor(" << tensor.name() << ") is empty. \n" \
             << "may be specify a filler for it ?"; \
         tensor.Reshape(shape); \
-        unique_ptr< Filler<T, Context> > filler( \
+        unique_ptr<Filler<T, Context>> filler( \
             CreateFiller<T, Context>(*ws()->GetFiller(tensor.name()))); \
         filler->Fill(&tensor, ctx()); \
     } else { \

@@ -322,6 +369,8 @@ DECLARE_REGISTRY(
         tensor.Reshape(shape); \
     }

+/* Shared Multiplier */
+
 #define DECLARE_MULTIPLIER(name, size) \
     const T* name; \
     { \

@@ -335,6 +384,8 @@ DECLARE_REGISTRY(
         name = mp->template data<T, Context>(); \
     }

+/* Dynamic Arguments */
+
 #define DECLARE_ARG_WITH_DESC(type, arg) \
     type arg##_; \
     string arg##_desc_; \

@@ -393,8 +444,7 @@ DECLARE_REGISTRY(
 #define GET_ARGS_SIZE(arg) \
     (int)std::max(arg##_.size(), arg##_desc_.size())

-#define XIsType(x, dtype) \
-    x.template IsType<dtype>()
+/* Registers */

 #define INSTANTIATE_OPERATOR(name, context) \
     template class name##Op<context>;
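
The commit title, "Apply the dispatcher to RunImpl", refers to this block: DispatchHelper walks a TensorTypes<...> list, matches the tensor's TypeMeta at runtime, and forwards to the operator's RunImpl<T>(); the empty-list specialization turns an unsupported dtype into a fatal log listing the supported ones. A self-contained miniature of the same pattern (names and the type-tag mechanism here are ours, not Dragon's):

    // Miniature of the dispatch pattern added above: walk a compile-time
    // type list, match a runtime type id, call the typed implementation.
    #include <cstdio>
    #include <typeindex>
    #include <typeinfo>

    struct MyOp {
        std::type_index meta;                   // runtime dtype tag
        template <typename T> void RunImpl() {  // typed implementation
            std::printf("RunImpl<%s>\n", typeid(T).name());
        }
    };

    template <typename... Types> struct TensorTypes {};

    template <typename List> struct DispatchHelper;

    template <typename T, typename... Rest>
    struct DispatchHelper<TensorTypes<T, Rest...>> {
        static void Call(MyOp* op) {
            if (op->meta == std::type_index(typeid(T)))
                return op->template RunImpl<T>();
            DispatchHelper<TensorTypes<Rest...>>::Call(op);  // try next type
        }
    };

    template <> struct DispatchHelper<TensorTypes<>> {
        static void Call(MyOp*) { std::printf("unsupported dtype\n"); }
    };

    int main() {
        MyOp op{std::type_index(typeid(float))};
        DispatchHelper<TensorTypes<int, float, double>>::Call(&op);  // -> RunImpl<float>
        return 0;
    }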

Dragon/include/core/operator_schema.h
@@ -42,7 +42,7 @@ class OpSchema {
         return *this;
     }

-    OpSchema& Inplace(set<pair<int, int> > inplace);
+    OpSchema& Inplace(set<pair<int, int>> inplace);
     std::function<bool(int, int)> CheckInplace;
     bool AllowInplace() const { return allow_inplace_; }

Dragon/include/core/types.h
@@ -73,6 +73,11 @@ inline const std::string TypeMetaToString(
         m2s_type_map[meta.id()] : "unknown";
 }

+template <typename T>
+inline const std::string TypeToString() {
+    return TypeMetaToString(TypeMeta::Make<T>());
+}
+
 }  // namespace dragon

 #endif  // DRAGON_CORE_TYPES_H_
\ No newline at end of file

Dragon/include/core/workspace.h
@@ -13,22 +13,18 @@
 #ifndef DRAGON_CORE_WORKSPACE_H_
 #define DRAGON_CORE_WORKSPACE_H_

-#include "core/common.h"
 #include "core/graph.h"
-#include "utils/string.h"

 namespace dragon {

 class Workspace {
  public:
-    typedef Map<string, Map<string, int64_t> > DummyNameMap;
-    typedef Map<string, unique_ptr<Tensor> > TensorMap;
+    typedef Map<string, Map<string, int64_t>> DummyNameMap;
+    typedef Map<string, unique_ptr<Tensor>> TensorMap;
     typedef Map<string, string> TensorAliasMap;
     typedef Map<string, TensorFillerProto> TensorFillerMap;
-    typedef Map<string, unique_ptr<OperatorBase> > OperatorMap;
-    typedef Map<string, unique_ptr<GraphBase> > GraphMap;
+    typedef Map<string, unique_ptr<OperatorBase>> OperatorMap;
+    typedef Map<string, unique_ptr<GraphBase>> GraphMap;

     /*! \brief Constructor */
     Workspace(const string& name) : name_(name) { Initialize(); }

Dragon/include/operators/arithmetic/fully_connected_op.h
@@ -28,6 +28,7 @@ class FullyConnectedOp final : public Operator<Context> {
     USE_OPERATOR_FUNCTIONS;

     void RunOnDevice();
+    template <typename T> void RunImpl();
     template <typename T> void TransRunImpl();
     template <typename T> void NoTransRunImpl();

Dragon/include/operators/array/multinomial_op.h
@@ -22,6 +22,7 @@ class MultinomialOp final : public Operator<Context> {
 public:
     MultinomialOp(const OperatorDef& def, Workspace* ws)
         : Operator<Context>(def, ws),
+          eps_(OpArg<float>("eps", 0.f)),
           normalize_(OpArg<int64_t>("normalize", 0)),
           num_samples_(OpArg<int64_t>("num_samples", 1)) {}
     USE_OPERATOR_FUNCTIONS;

@@ -32,6 +33,7 @@ class MultinomialOp final : public Operator<Context> {
     template <typename T> void RunImpl();

 protected:
+    float eps_;
     int64_t outer_dim_, axis_;
     int64_t normalize_, num_samples_;
     unique_ptr<OperatorBase> softmax_op_;
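
Per the new docstrings ("The prob to a uniform sampling"), eps_ reads as the probability of replacing a multinomial draw with a uniform one. The operator's kernel is not part of this view, so the following is only a sketch of that contract, not the committed implementation:

    // Sketch under the stated assumption: with probability eps, sample a
    // class uniformly; otherwise sample from the given distribution.
    #include <random>
    #include <vector>

    int SampleOnce(const std::vector<double>& prob, float eps, std::mt19937& rng) {
        std::uniform_real_distribution<double> coin(0.0, 1.0);
        if (coin(rng) < eps) {
            // Uniform branch: every class equally likely.
            std::uniform_int_distribution<int> u(0, (int)prob.size() - 1);
            return u(rng);
        }
        // Multinomial branch: draw according to prob.
        std::discrete_distribution<int> d(prob.begin(), prob.end());
        return d(rng);
    }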

Dragon/include/utils/caffemodel.h
@@ -26,22 +26,24 @@ inline void LoadCaffeModel(
     LOG(INFO) << "Restore From Model @: " << file << "......";
     LOG(INFO) << "Model Format: CaffeModel";
     for (int i = 0; i < net_param.layer_size(); i++) {
-        const LayerParameter& layer = net_param.layer(i);
-        const string& layer_name = layer.name();
-        string prefix = layer_name + "/param:";
+        const auto& layer = net_param.layer(i);
+        const auto& layer_name = layer.name();
+        auto prefix = layer_name + "/param:";
         for (int j = 0; j < layer.blobs_size(); j++) {
-            string tensor_name = prefix + std::to_string(j);
-            if (!ws->HasTensor(tensor_name))
+            auto tensor_name = prefix + std::to_string(j);
+            if (!ws->HasTensor(tensor_name)) {
                 LOG(WARNING) << "Tensor(" << tensor_name << ") "
                     << "does not exist in any Graphs, skip.";
-            else {
-                BlobProto blob = layer.blobs(j);
-                vector<int64_t> dims;
-                for (auto dim : blob.shape().dim()) dims.push_back(dim);
-                Tensor* tensor = ws->GetTensor(tensor_name);
+            } else {
+                auto blob = layer.blobs(j);
+                vec64_t tensor_shape;
+                for (auto dim : blob.shape().dim()) tensor_shape.push_back(dim);
+                auto* tensor = ws->GetTensor(tensor_name);
                 std::stringstream DimString;
-                if (dims.size() > 0) {
-                    tensor->Reshape(dims);
+                if (tensor_shape.size() > 0) {
+                    tensor->Reshape(tensor_shape);
                     CHECK_EQ(tensor->count(), blob.data_size())
                         << "\nTensor(" << tensor_name << ") "
                         << "failed to load, except size: "

@@ -52,9 +54,9 @@ inline void LoadCaffeModel(
                     tensor->Reshape({ blob.data_size() });
                     DimString << "(missing)";
                 }
-                float* Xdata = tensor->mutable_data<float, CPUContext>();
-                for (int idx = 0; idx < blob.data_size(); idx++)
-                    Xdata[idx] = blob.data(idx);
+                auto* x = tensor->mutable_data<float, CPUContext>();
+                for (int xi = 0; xi < blob.data_size(); ++xi)
+                    x[xi] = blob.data(xi);
                 LOG(INFO) << "Tensor(" << tensor_name << ") "
                     << "loaded, shape: " << DimString.str()
                     << ", size: " << blob.data_size();

@@ -66,32 +68,33 @@ inline void LoadCaffeModel(
 inline void SavaCaffeModel(
     string file,
     const vector<Tensor*>& tensors) {
-    NetParameter net_param;
+    int j = -1;
+    NetParameter net;
     Map<string, int> layer_hash;
-    int layer_idx = -1;
     for (int i = 0; i < tensors.size(); i++) {
         if (tensors[i]->count() <= 0) continue;
-        vector<string> splits = str::split(
+        auto splits = str::split(
             tensors[i]->name(), "/param:");
         if (layer_hash.count(splits[0]) == 0) {
-            layer_hash[splits[0]] = ++layer_idx;
-            LayerParameter* layer = net_param.add_layer();
+            layer_hash[splits[0]] = ++j;
+            auto* layer = net.add_layer();
             layer->set_name(splits[0]);
         }
-        BlobProto* blob = net_param.mutable_layer(layer_idx)->add_blobs();
-        for (auto dim : tensors[i]->dims()) blob->mutable_shape()->add_dim(dim);
+        auto* blob = net.mutable_layer(j)->add_blobs();
+        for (auto dim : tensors[i]->dims()) blob->mutable_shape()->add_dim(dim);
         if (XIsType((*tensors[i]), float)) {
-            auto* Xdata = tensors[i]->data<float, CPUContext>();
-            for (int id = 0; id < tensors[i]->count(); id++)
-                blob->mutable_data()->Add(Xdata[id]);
+            auto* x = tensors[i]->data<float, CPUContext>();
+            for (int xi = 0; xi < tensors[i]->count(); ++xi)
+                blob->mutable_data()->Add(x[xi]);
         } else if (XIsType((*tensors[i]), float16)) {
-            auto* Xdata = tensors[i]->data<float16, CPUContext>();
-            for (int id = 0; id < tensors[i]->count(); id++)
-                blob->mutable_data()->Add(cast::to<float>(Xdata[id]));
+            auto* x = tensors[i]->data<float16, CPUContext>();
+            for (int xi = 0; xi < tensors[i]->count(); ++xi)
+                blob->mutable_data()->Add(cast::to<float>(x[xi]));
         }
     }
-    WriteProtoToBinaryFile(net_param, file.c_str());
+    WriteProtoToBinaryFile(net, file.c_str());
     LOG(INFO) << "Save the model @: " << file << "......";
     LOG(INFO) << "Model format: Caffe";
 }
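
The renames above (Xdata to x, net_param to net, dims to tensor_shape) do not change the persistence scheme: each blob is keyed by its layer name plus a "/param:" suffix and blob index, which is what the loader builds and the saver splits on. A minimal sketch of that key construction (the helper name is ours, not Dragon's):

    // The key scheme shared by LoadCaffeModel and SavaCaffeModel:
    // "<layer_name>/param:<blob_index>".
    #include <string>

    std::string BlobKey(const std::string& layer_name, int blob_index) {
        return layer_name + "/param:" + std::to_string(blob_index);
    }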

Dragon/python/dragon/operators/array.py
@@ -748,7 +748,7 @@ def Arange(start, stop=None, step=1, dtype='float32', **kwargs):

 @OpSchema.Inputs(1)
-def Multinomial(inputs, num_samples=1, normalize=False, **kwargs):
+def Multinomial(inputs, num_samples=1, eps=0., normalize=False, **kwargs):
     """Return a tensor where each row contains ``num_samples``,
     sampled from the multinomial distribution.

@@ -765,6 +765,8 @@ def Multinomial(inputs, num_samples=1, normalize=False, **kwargs):
         The input tensor.
     num_samples : int, optional, default=1
         The number of samples.
+    eps : float, optional, default=0.
+        The prob to a uniform sampling.
     normalize : boolean, optional, default=False
         Whether to normalize the inputs.

Dragon/python/dragon/vm/torch/ops/builtin.py
@@ -987,7 +987,7 @@ def one_hot(input, depth):
     return module.forward(input)

-def multinomial(input, num_samples, out=None):
+def multinomial(input, num_samples, eps=0., out=None):
     """Return a tensor where each row contains ``num_samples``,
     sampled from the multinomial distribution.

@@ -997,8 +997,8 @@ def multinomial(input, num_samples, out=None):
         The input tensor.
     num_samples : int
         The number of samples.
-    normalize : boolean, optional, default=False
-        Whether to normalize the inputs.
+    eps : float, optional, default=0.
+        The prob to a uniform sampling.

     Returns
     -------

@@ -1008,9 +1008,11 @@ def multinomial(input, num_samples, out=None):
     """
     dev = MakeDevice(inputs=[input])
     key = 'Multinomial/{}' \
-          '/num_samples:{}'.format(dev, num_samples)
+          '/num_samples:{}' \
+          '/eps:{}'.format(dev, num_samples, eps)
     module = get_module(
         Multinomial, key, dev,
+        eps=eps,
         num_samples=num_samples,
     )
     return module.forward(input, out)

Dragon/python/dragon/vm/torch/ops/modules/array.py
@@ -377,6 +377,7 @@ class Cast(BaseModule):
 class Multinomial(BaseModule):
     def __init__(self, key, dev, **kwargs):
         super(Multinomial, self).__init__(key, dev, **kwargs)
+        self.eps = kwargs.get('eps', 0)
         self.num_samples = kwargs.get('num_samples', 1)
         self.register_op()

@@ -384,6 +385,7 @@ class Multinomial(BaseModule):
         self.op_meta = {
             'op_type': 'Multinomial',
             'arguments': {
+                'eps': float(self.eps),
                 'num_samples': self.num_samples,
                 'normalize': False,
             },

Dragon/python/dragon/vm/torch/tensor.py
@@ -980,7 +980,7 @@ class Tensor(object):
         """
         raise NotImplementedError('Refer torch.ops.tensor.normal_')

-    def multinomial(self, num_samples, normalize=False):
+    def multinomial(self, num_samples, eps=0.):
         """Return a tensor where each row contains ``num_samples``,
         sampled from the multinomial distribution.

@@ -988,8 +988,8 @@ class Tensor(object):
         ----------
         num_samples : int
             The number of samples.
-        normalize : boolean, optional, default=False
-            Whether to normalize the inputs.
+        eps : float, optional, default=0.
+            The prob to a uniform sampling.

         Returns
         -------

Dragon/src/contrib/rcnn/bbox_utils.cu
@@ -81,8 +81,8 @@ void _ApplyNMS(
     CUDA_CHECK(cudaMemcpy(boxes_dev, boxes,
         boxes_nbytes, cudaMemcpyHostToDevice));
     nms_mask<T>
-        << < blocks, NMS_BLOCK_SIZE,
-            0, ctx->cuda_stream() >> > (num_boxes,
+        <<< blocks, NMS_BLOCK_SIZE,
+            0, ctx->cuda_stream() >>> (num_boxes,
                 thresh, (T*)boxes_dev, (uint64_t*)mask_dev);
     ctx->FinishDeviceCompution();

Dragon/src/contrib/rcnn/bbox_utils.h
@@ -347,7 +347,7 @@ inline void CollectRoIs(
     const int canonical_level,
     const int canonical_scale,
     const T* rois,
-    vector<vector<int64_t> >& roi_bins) {
+    vector<vec64_t>& roi_bins) {
     const T* roi = rois;
     for (int i = 0; i < num_rois; ++i) {
         int bin_idx = roi_level(min_level, max_level,

@@ -360,7 +360,7 @@ inline void CollectRoIs(
 template <typename T>
 inline void DistributeRoIs(
-    const vector<vector<int64_t> >& roi_bins,
+    const vector<vec64_t>& roi_bins,
     const T* rois,
     vector<T*> outputs) {
     for (int i = 0; i < roi_bins.size(); i++) {
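
vec64_t here stands in for the nested vector<vector<int64_t>> element type; the alias itself is defined elsewhere in Dragon's headers, so the following reconstruction is from usage, not from this diff:

    // Reconstructed from usage in this commit; vec64_t abbreviates the
    // int64_t dimension/index vector used across the codebase (vec32_t
    // appears analogously in graph_optimizer.cc below).
    #include <cstdint>
    #include <vector>

    using vec64_t = std::vector<int64_t>;

    // So the two CollectRoIs signatures are equivalent:
    //   vector<vector<int64_t> >& roi_bins   (old)
    //   vector<vec64_t>& roi_bins            (new)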

Dragon/src/core/graph.cc
@@ -123,7 +123,7 @@ Graph::Graph(const GraphDef& def, Workspace* ws)
     // Recomputing-aware
     if (subgraph_indices.size() > 0) {
-        Map<string, vector<OperatorBase*> > subgraph;
+        Map<string, vector<OperatorBase*>> subgraph;
         for (const auto& it : subgraph_indices) {
             subgraph[it.first] = vector<OperatorBase*>();
             for (const auto& idx : subgraph_indices[it.first])

Dragon/src/core/graph_gradient.cc
@@ -7,7 +7,7 @@ namespace dragon {
 bool GraphGradientMaker::CheckGrad(
     const OperatorDef& forward_op,
     const Set<string>& targets,
-    vector<pair<string, int> >& gen_grads) {
+    vector<pair<string, int>>& gen_grads) {
     if (NoGradientRegistry()->Has(forward_op.type())) {
         for (auto& input : forward_op.input())
             blacklist_set_.insert(input);

@@ -81,7 +81,7 @@ void GraphGradientMaker::Make(
     for (int i = (int)forward_def.size() - 1; i >= 0; --i) {
         // Collect inputs & outputs, generate RAW grad ops
         const OperatorDef& op = *forward_def[i];
-        vector<pair<string, int> > gen_grads;
+        vector<pair<string, int>> gen_grads;
         bool is_skip = CheckGrad(op, targets_set, gen_grads);
         vector<string> g_outputs;
         for (auto& output : op.output()) {

@@ -214,7 +214,7 @@ GraphDef
 GraphDef GraphGradientMaker::Share(const GraphDef& input_def) {
     Set<int> invalid_ops;
     Map<string, int> ref_count;
-    Map<string, pair<int, string> > ssa_map;
+    Map<string, pair<int, string>> ssa_map;
     // Count the refs for detecting leaf nodes
     for (int i = 0; i < input_def.op_size(); ++i) {
         const OperatorDef& op = input_def.op(i);

Dragon/src/core/graph_optimizer.cc
@@ -174,7 +174,7 @@ GraphDef GraphOptimizer::MirrorStage(
     const GraphDef& input_def,
     Map<string, vec32_t>& op_indices) {
     GraphDef output_def(input_def);
-    Map<string, set<int> > fake_op_indices;
+    Map<string, set<int>> fake_op_indices;
     Map<string, string> rename_map;
     Map<string, int> versions;

Dragon/src/core/operator_schema.cc
@@ -54,7 +54,7 @@ OpSchema& OpSchema::NumOutputs(int n) {
     return NumOutputs(n, n);
 }

-OpSchema& OpSchema::Inplace(set<pair<int, int> > inplace) {
+OpSchema& OpSchema::Inplace(set<pair<int, int>> inplace) {
     CheckInplace = [inplace](int in, int out) -> bool {
         return (inplace.count(std::make_pair(in, out)) > 0);
     };

Dragon/src/kernels/activation/dropout_op_kernel.cu
@@ -37,14 +37,10 @@ template<> void Dropout<float, CUDAContext>(
     float* y,
     CUDAContext* ctx) {
     auto thresh = (uint32_t)(UINT_MAX * prob);
-    math::RandomUniform(
-        count,
-        0.f, (float)UINT_MAX,
-        mask32, ctx
-    );
+    math::RandomUniform(count, 0.f, 1.f, mask32, ctx);
     _Dropout
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count,
         thresh,
         scale,

@@ -85,14 +81,10 @@ template<> void Dropout<float16, CUDAContext>(
     float16* y,
     CUDAContext* ctx) {
     auto thresh = (uint32_t)(UINT_MAX * prob);
-    math::RandomUniform(
-        count,
-        0.f, (float)UINT_MAX,
-        mask32, ctx
-    );
+    math::RandomUniform(count, 0.f, 1.f, mask32, ctx);
     _Dropout
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count,
         thresh,
         cast::to<half>(scale),

@@ -124,8 +116,8 @@ template <> void ApplyMask<float, uint8_t, CUDAContext>(
     float* y,
     CUDAContext* ctx) {
     _ApplyMask
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count, scale, x, mask, y
     );
 }

@@ -157,8 +149,8 @@ template <> void ApplyMask<float16, uint8_t, CUDAContext>(
     float16* y,
     CUDAContext* ctx) {
     _ApplyMaskHalf
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count,
         cast::to<half>(scale),
         reinterpret_cast<const half*>(x),
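
Every kernel file below gets the same mechanical fix: the spaced launch form << < ... >> > (a workaround from the pre-C++11 ">>" lexing era) becomes the standard <<< ... >>>. A self-contained illustration of the corrected syntax (kernel and sizes are ours, not from this commit):

    // Minimal CUDA program using the standard triple-chevron launch
    // that these hunks restore; _Scale is illustrative.
    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void _Scale(const int n, const float alpha, float* y) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) y[i] *= alpha;
    }

    int main() {
        const int n = 1024;
        float* y = nullptr;
        cudaMalloc(&y, n * sizeof(float));
        cudaMemset(y, 0, n * sizeof(float));
        _Scale<<<(n + 255) / 256, 256, 0, 0>>>(n, 2.f, y);  // grid, block, smem, stream
        cudaDeviceSynchronize();
        cudaFree(y);
        std::printf("ok\n");
        return 0;
    }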

Dragon/src/kernels/activation/droppath_op_kernel.cu
@@ -44,8 +44,8 @@ template<> void DropPath<float, CUDAContext>(
     auto nthreads = rows * cols;
     auto thresh = 1.f - (1.f / scale);
     _DropPath
-        << < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         nthreads, cols, thresh, scale, x, mask, y
     );
 }

@@ -85,8 +85,8 @@ template<> void DropPath<float16, CUDAContext>(
     auto nthreads = rows * cols;
     auto thresh = 1.f - (1.f / scale);
     _DropPath
-        << < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         nthreads, cols,
         thresh,
         cast::to<half>(scale),

Dragon/src/kernels/activation/elu_op_kernel.cu
@@ -28,8 +28,8 @@ template<> void Elu<float, CUDAContext>(
     float* y,
     CUDAContext* ctx) {
     _Elu
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count, x, alpha, y
     );
 }

@@ -58,8 +58,8 @@ template<> void EluGrad<float, CUDAContext>(
     float* dx,
     CUDAContext* ctx) {
     _EluGrad
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count, alpha, dy, y, dx
     );
 }

Dragon/src/kernels/activation/prelu_op_kernel.cu
@@ -66,21 +66,21 @@ template<> void PRelu<float, CUDAContext>(
     CUDAContext* ctx) {
     if (channel_shared) {
         _PRelu
-            << < CUDA_BLOCKS(count), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(count), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             count, channels, dim, x, w, y
         );
     } else {
         if (data_format == "NCHW") {
             _PReluNCHW
-                << < CUDA_BLOCKS(count), CUDA_THREADS,
-                    0, ctx->cuda_stream() >> >(
+                <<< CUDA_BLOCKS(count), CUDA_THREADS,
+                    0, ctx->cuda_stream() >>>(
                 count, channels, dim, x, w, y
             );
         } else if (data_format == "NHWC") {
             _PReluNHWC
-                << < CUDA_BLOCKS(count), CUDA_THREADS,
-                    0, ctx->cuda_stream() >> >(
+                <<< CUDA_BLOCKS(count), CUDA_THREADS,
+                    0, ctx->cuda_stream() >>>(
                 count, channels, dim, x, w, y
             );
         } else {

@@ -152,21 +152,21 @@ template<> void PReluGrad<float, CUDAContext>(
     CUDAContext* ctx) {
     if (channel_shared) {
         _PReluGrad
-            << < CUDA_BLOCKS(count), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(count), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             count, channels, dim, dy, x, w, dx
         );
     } else {
         if (data_format == "NCHW") {
             _PReluGradNCHW
-                << < CUDA_BLOCKS(count), CUDA_THREADS,
-                    0, ctx->cuda_stream() >> >(
+                <<< CUDA_BLOCKS(count), CUDA_THREADS,
+                    0, ctx->cuda_stream() >>>(
                 count, channels, dim, dy, x, w, dx
            );
         } else if (data_format == "NHWC") {
             _PReluGradNHWC
-                << < CUDA_BLOCKS(count), CUDA_THREADS,
-                    0, ctx->cuda_stream() >> >(
+                <<< CUDA_BLOCKS(count), CUDA_THREADS,
+                    0, ctx->cuda_stream() >>>(
                 count, channels, dim, dy, x, w, dx
             );
         } else {

@@ -210,8 +210,8 @@ template<> void PReluWGrad<float, CUDAContext>(
     CUDAContext* ctx) {
     auto cdim = channels * dim;
     _PReluWGradBcast
-        << < CUDA_BLOCKS(cdim), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(cdim), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         cdim, rows, row_offset, dy, x, bcast_dw
     );
     if (channel_shared) {

Dragon/src/kernels/activation/relu_op_kernel.cu
@@ -35,8 +35,8 @@ template<> void Relu<float, CUDAContext>(
     float* y,
     CUDAContext* ctx) {
     _Relu
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count, slope, x, y
     );
 }

@@ -83,8 +83,8 @@ template<> void Relu<float16, CUDAContext>(
     CUDAContext* ctx) {
     if ((count & 1) == 0) {
         _Relu
-            << < CUDA_BLOCKS(count >> 1), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(count >> 1), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             count >> 1,
             cast::to<half2>(slope),
             reinterpret_cast<const half2*>(x),

@@ -92,8 +92,8 @@ template<> void Relu<float16, CUDAContext>(
         );
     } else {
         _Relu
-            << < CUDA_BLOCKS(count), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(count), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             count,
             cast::to<half>(slope),
             reinterpret_cast<const half*>(x),

@@ -134,8 +134,8 @@ template<> void ReluGrad<float, CUDAContext>(
     float* dx,
     CUDAContext* ctx) {
     _ReluGrad
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count, slope, dy, y, dx
     );
 }

@@ -170,8 +170,8 @@ template<> void ReluGrad<float16, CUDAContext>(
     float16* dx,
     CUDAContext* ctx) {
     _ReluGrad
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count, slope,
         reinterpret_cast<const half*>(dy),
         reinterpret_cast<const half*>(y),

Dragon/src/kernels/activation/selu_op_kernel.cu
@@ -34,8 +34,8 @@ template<> void SElu<float, CUDAContext>(
     float* y,
     CUDAContext* ctx) {
     _SElu
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count, x, y
     );
 }

@@ -63,8 +63,8 @@ template<> void SElu<float16, CUDAContext>(
     float16* y,
     CUDAContext* ctx) {
     _SElu
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count,
         reinterpret_cast<const half*>(x),
         reinterpret_cast<half*>(y)

@@ -99,8 +99,8 @@ template<> void SEluGrad<float, CUDAContext>(
     float* dx,
     CUDAContext* ctx) {
     _SEluGrad
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count, dy, y, dx
     );
 }

@@ -131,8 +131,8 @@ template<> void SEluGrad<float16, CUDAContext>(
     float16* dx,
     CUDAContext* ctx) {
     _SEluGrad
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count,
         reinterpret_cast<const half*>(dy),
         reinterpret_cast<const half*>(y),

Dragon/src/kernels/activation/sigmoid_op_kernel.cu
@@ -25,8 +25,8 @@ template<> void Sigmoid<float, CUDAContext>(
     float* y,
     CUDAContext* ctx) {
     _Sigmoid
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count, x, y
     );
 }

@@ -51,8 +51,8 @@ template<> void SigmoidGrad<float, CUDAContext>(
     float* dx,
     CUDAContext* ctx) {
     _SigmoidGrad
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count, dy, y, dx
     );
 }

Dragon/src/kernels/activation/softmax_op_kernel.cu
@@ -96,26 +96,26 @@ template<> void Softmax<float, CUDAContext>(
     auto num_preds = outer_dim * inner_dim;
     auto nelements = num_preds * axis_dim;
     _SoftmaxReduceMax
-        << < CUDA_BLOCKS(num_preds), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(num_preds), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         num_preds, axis_dim, inner_dim, x, scale
     );
     _SoftmaxSub
-        << < CUDA_BLOCKS(nelements), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(nelements), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         nelements, axis_dim, inner_dim, scale, y
     );
     math::Exp(nelements, y, y, ctx);
     _SoftmaxReduceSum
-        << < CUDA_BLOCKS(num_preds), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(num_preds), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         num_preds, axis_dim, inner_dim, y, scale
     );
     _SoftmaxDiv
-        << < CUDA_BLOCKS(nelements), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(nelements), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         nelements, axis_dim, inner_dim, scale, y
     );
 }

@@ -159,13 +159,13 @@ template<> void SoftmaxGrad<float, CUDAContext>(
     auto num_preds = outer_dim * inner_dim;
     auto nelements = num_preds * axis_dim;
     _SoftmaxDot
-        << < CUDA_BLOCKS(num_preds), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(num_preds), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         num_preds, axis_dim, inner_dim, dy, y, scale
     );
     _SoftmaxSub
-        << < CUDA_BLOCKS(nelements), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(nelements), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         nelements, axis_dim, inner_dim, scale, dx
     );
     math::Mul(nelements, dx, y, dx, ctx);
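
The four launches above are the usual numerically stable softmax, staged as reduce-max, subtract, exp, reduce-sum, divide. A single-row CPU sketch of the same pipeline:

    // CPU analogue of the _SoftmaxReduceMax / _SoftmaxSub / Exp /
    // _SoftmaxReduceSum / _SoftmaxDiv sequence, for one row.
    #include <algorithm>
    #include <cmath>
    #include <vector>

    void SoftmaxRow(std::vector<float>& v) {
        const float m = *std::max_element(v.begin(), v.end());  // reduce max
        float sum = 0.f;
        for (auto& x : v) { x = std::exp(x - m); sum += x; }    // sub + exp + reduce sum
        for (auto& x : v) x /= sum;                             // div
    }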

Dragon/src/kernels/activation/tanh_op_kernel.cu
@@ -25,8 +25,8 @@ template<> void Tanh<float, CUDAContext>(
     float* y,
     CUDAContext* ctx) {
     _Tanh
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count, x, y
     );
 }

@@ -51,8 +51,8 @@ template<> void TanhGrad<float, CUDAContext>(
     float* dx,
     CUDAContext* ctx) {
     _TanhGrad
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count, dy, y, dx
     );
 }

Dragon/src/kernels/arithmetic/affine_op_kernel.cu
@@ -60,15 +60,15 @@ template<> void Affine<float, CUDAContext>(
     auto nthreads = outer_dim * axis_dim * inner_dim;
     if (beta != nullptr) {
         _Affine
-            << < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             nthreads, axis_dim, inner_dim,
             x, alpha, beta, y
         );
     } else {
         _AffineNoBias
-            << < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             nthreads, axis_dim, inner_dim, x, alpha, y
         );
     }

@@ -124,8 +124,8 @@ template<> void Affine<float16, CUDAContext>(
     auto nthreads = outer_dim * axis_dim * inner_dim;
     if (beta != nullptr) {
         _Affine
-            << < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             nthreads, axis_dim, inner_dim,
             reinterpret_cast<const half*>(x),
             reinterpret_cast<const half*>(alpha),

@@ -134,8 +134,8 @@ template<> void Affine<float16, CUDAContext>(
         );
     } else {
         _AffineNoBias
-            << < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             nthreads, axis_dim, inner_dim,
             reinterpret_cast<const half*>(x),
             reinterpret_cast<const half*>(alpha),

@@ -156,8 +156,8 @@ template <> void AffineGrad<float, CUDAContext>(
     CUDAContext* ctx) {
     auto nthreads = outer_dim * axis_dim * inner_dim;
     _AffineNoBias
-        << < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         nthreads, axis_dim, inner_dim, dy, alpha, dx
     );
 }

@@ -174,8 +174,8 @@ template <> void AffineGrad<float16, CUDAContext>(
     CUDAContext* ctx) {
     auto nthreads = outer_dim * axis_dim * inner_dim;
     _AffineNoBias
-        << < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         nthreads, axis_dim, inner_dim,
         reinterpret_cast<const half*>(dy),
         reinterpret_cast<const half*>(alpha),

Dragon/src/kernels/arithmetic/clip_op_kernel.cu
@@ -83,8 +83,8 @@ template<> __global__ void _ClipGrad<half>(
     T* y, \
     CUDAContext* ctx) { \
     _Clip<T> \
-        << < CUDA_BLOCKS(count), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(count), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         count, \
         cast::to<T>(low), \
         cast::to<T>(high), \

@@ -102,8 +102,8 @@ template<> __global__ void _ClipGrad<half>(
     T* dx, \
     CUDAContext* ctx) { \
     _ClipGrad<T> \
-        << < CUDA_BLOCKS(count), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(count), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         count, \
         cast::to<T>(low), \
         cast::to<T>(high), \

@@ -133,8 +133,8 @@ template <> void Clip<float16, CUDAContext>(
     float16* y,
     CUDAContext* ctx) {
     _Clip
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count,
         cast::to<half>(low),
         cast::to<half>(high),

@@ -152,8 +152,8 @@ template <> void ClipGrad<float16, CUDAContext>(
     float16* dx,
     CUDAContext* ctx) {
     _ClipGrad
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count,
         cast::to<half>(low),
         cast::to<half>(high),

Dragon/src/kernels/arithmetic/maximum_op_kernel.cu
@@ -139,8 +139,8 @@ template<> __global__ void _BroadcastMaximumGrad<half>(
     T* y, \
     CUDAContext* ctx) { \
     _##name \
-        << < CUDA_BLOCKS(count), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(count), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         count, x1, x2, y \
     ); \
 }

@@ -155,8 +155,8 @@ template<> __global__ void _BroadcastMaximumGrad<half>(
     T* dx2, \
     CUDAContext* ctx) { \
     _##name \
-        << < CUDA_BLOCKS(count), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(count), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         count, x1, x2, dy, dx1, dx2 \
     ); \
 }

@@ -196,8 +196,8 @@ template <> void Maximum<float16, CUDAContext>(
     float16* y,
     CUDAContext* ctx) {
     _Maximum \
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count,
         reinterpret_cast<const half*>(x1),
         reinterpret_cast<const half*>(x2),

@@ -212,8 +212,8 @@ template <> void BroadcastMaximum<float16, CUDAContext>(
     float16* y,
     CUDAContext* ctx) {
     _BroadcastMaximum \
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count,
         reinterpret_cast<const half*>(x1),
         cast::to<half>(x2),

@@ -230,8 +230,8 @@ template <> void MaximumGrad<float16, CUDAContext>(
     float16* dx2,
     CUDAContext* ctx) {
     _MaximumGrad \
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count,
         reinterpret_cast<const half*>(x1),
         reinterpret_cast<const half*>(x2),

@@ -250,8 +250,8 @@ template <> void BroadcastMaximumGrad<float16, CUDAContext>(
     float16* dx2,
     CUDAContext* ctx) {
     _BroadcastMaximumGrad \
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count,
         reinterpret_cast<const half*>(x1),
         cast::to<half>(x2),

Dragon/src/kernels/arithmetic/minimum_op_kernel.cu
@@ -139,8 +139,8 @@ template<> __global__ void _BroadcastMinimumGrad<half>(
     T* y, \
     CUDAContext* ctx) { \
     _##name \
-        << < CUDA_BLOCKS(count), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(count), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         count, x1, x2, y \
     ); \
 }

@@ -155,8 +155,8 @@ template<> __global__ void _BroadcastMinimumGrad<half>(
     T* dx2, \
     CUDAContext* ctx) { \
     _##name \
-        << < CUDA_BLOCKS(count), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(count), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         count, x1, x2, dy, dx1, dx2 \
     ); \
 }

@@ -196,8 +196,8 @@ template <> void Minimum<float16, CUDAContext>(
     float16* y,
     CUDAContext* ctx) {
     _Minimum \
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count,
         reinterpret_cast<const half*>(x1),
         reinterpret_cast<const half*>(x2),

@@ -212,8 +212,8 @@ template <> void BroadcastMinimum<float16, CUDAContext>(
     float16* y,
     CUDAContext* ctx) {
     _BroadcastMinimum \
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count,
         reinterpret_cast<const half*>(x1),
         cast::to<half>(x2),

@@ -230,8 +230,8 @@ template <> void MinimumGrad<float16, CUDAContext>(
     float16* dx2,
     CUDAContext* ctx) {
     _MinimumGrad \
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count,
         reinterpret_cast<const half*>(x1),
         reinterpret_cast<const half*>(x2),

@@ -250,8 +250,8 @@ template <> void BroadcastMinimumGrad<float16, CUDAContext>(
     float16* dx2,
     CUDAContext* ctx) {
     _BroadcastMinimumGrad \
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count,
         reinterpret_cast<const half*>(x1),
         cast::to<half>(x2),

Dragon/src/kernels/arithmetic/moments_op_kernel.cu
@@ -251,8 +251,8 @@ void _Moments(
         ndims, x_dims, y_dims,
         &rows, &cols)) {
         _ColwiseMoments
-            << < CUDA_2D_BLOCKS(rows), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_2D_BLOCKS(rows), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             rows, cols, x, mean, var
         ); return;
     }

@@ -262,8 +262,8 @@ void _Moments(
         ndims, x_dims, y_dims,
         &rows, &cols)) {
         _RowwiseMoments
-            << < CUDA_2D_BLOCKS(cols), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_2D_BLOCKS(cols), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             rows, cols, x, mean, var
         ); return;
     }

@@ -294,8 +294,8 @@ void _Moments(
     ctx->Memcpy<CUDAContext, CPUContext>(dbytes, YDS, dimsT.data());
     _GenericMoments
-        << < CUDA_2D_BLOCKS(outer_dim), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_2D_BLOCKS(outer_dim), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         ndims, outer_dim, inner_dim,
         XSS, YDS, x, mean, var
     );

Dragon/src/kernels/array/arange_op_kernel.cu
@@ -30,8 +30,8 @@ __global__ void _Arange(
     T* y, \
     CUDAContext* ctx) { \
     _Arange \
-        << < CUDA_BLOCKS(count), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(count), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         count, start, step, y \
     ); \
 }

@@ -64,8 +64,8 @@ template <> void Arange<float16, CUDAContext>(
     float16* y,
     CUDAContext* ctx) {
     _Arange
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count, start, step,
         reinterpret_cast<half*>(y)
     );

Dragon/src/kernels/array/argreduce_op_kernel.cc
@@ -20,12 +20,12 @@ void _ArgMax(
     for (int iix = 0; iix < inner_dim; ++iix) {
         const T* X = x + (oix * axis_dim * inner_dim + iix);
         const int y_offset = oix * top_k * inner_dim + iix;
-        vector<pair<T, int64_t> > vec(axis_dim);
+        vector<pair<T, int64_t>> vec(axis_dim);
         for (int j = 0; j < axis_dim; ++j)
             vec[j] = std::make_pair(X[j * inner_dim], j);
         std::partial_sort(
             vec.begin(), vec.begin() + top_k, vec.end(),
-            std::greater<pair<T, int64_t> >());
+            std::greater<pair<T, int64_t>>());
         for (int j = 0; j < top_k; ++j) {
             indices[y_offset + j * inner_dim] = vec[j].second;
             if (values) values[y_offset + j * inner_dim] = vec[j].first;

@@ -49,7 +49,7 @@ void _ArgMin(
     for (int iix = 0; iix < inner_dim; ++iix) {
         const T* X = x + (oix * axis_dim * inner_dim + iix);
         const int y_offset = oix * top_k * inner_dim + iix;
-        vector<pair<T, int64_t> > vec(axis_dim);
+        vector<pair<T, int64_t>> vec(axis_dim);
         for (int j = 0; j < axis_dim; ++j)
             vec[j] = std::make_pair(X[j * inner_dim], j);
         std::partial_sort(
             vec.begin(), vec.begin() + top_k, vec.end());

Dragon/src/kernels/array/argreduce_op_kernel.cu
@@ -133,8 +133,8 @@ template<> __global__ void _ArgMin<half>(
     CHECK_EQ(top_k, 1) << "\nRequired top_k == 1."; \
     auto nthreads = outer_dim * inner_dim; \
     _##name \
-        << < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         nthreads, inner_dim, axis_dim, \
         x, indices, values \
     ); \

@@ -168,8 +168,8 @@ template<> void ArgMax<float16, CUDAContext>(
     CHECK_EQ(top_k, 1) << "\nRequired top_k == 1.";
     auto nthreads = outer_dim * inner_dim;
     _ArgMax
-        << < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         nthreads, inner_dim, axis_dim,
         reinterpret_cast<const half*>(x),
         indices,

@@ -189,8 +189,8 @@ template<> void ArgMin<float16, CUDAContext>(
     CHECK_EQ(top_k, 1) << "\nRequired top_k == 1.";
     auto nthreads = outer_dim * inner_dim;
     _ArgMin
-        << < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         nthreads, inner_dim, axis_dim,
         reinterpret_cast<const half*>(x),
         indices,

Dragon/src/kernels/array/concat_op_kernel.cu
@@ -43,8 +43,8 @@ __global__ void _Concat(
     auto cols = axis_dim * inner_dim; \
     auto nthreads = outer_dim * axis_dim * inner_dim; \
     _##name \
-        << < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         nthreads, \
         inner_dim, \
         cols, \

Dragon/src/kernels/array/crop_op_kernel.cu
@@ -83,8 +83,8 @@ __global__ void _CropGrad(
     T* y, \
     CUDAContext* ctx) { \
     _##name \
-        << < CUDA_BLOCKS(count), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(count), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         count, ndims, \
         x_strides, y_dims, \
         starts, x, y \

Dragon/src/kernels/array/index_select_op_kernel.cu
@@ -115,8 +115,8 @@ template <> __global__ void _IndexSelectGrad<half>(
     CUDAContext* ctx) { \
     auto nthreads = outer_dim * num_indices * inner_dim; \
     _IndexSelect \
-        << < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         nthreads, inner_dim, \
         axis_dim, num_indices, \
         indices, x, y \

@@ -135,8 +135,8 @@ template <> __global__ void _IndexSelectGrad<half>(
     CUDAContext* ctx) { \
     auto nthreads = outer_dim * inner_dim; \
     _IndexSelectGrad \
-        << < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         nthreads, inner_dim, \
         axis_dim, num_indices, \
         indices, dy, dx \

@@ -170,8 +170,8 @@ template <> void IndexSelectGrad<float16, CUDAContext>(
     CUDAContext* ctx) {
     auto nthreads = outer_dim * inner_dim;
     _IndexSelectGrad
-        << < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         nthreads, inner_dim,
         axis_dim, num_indices,
         indices,

Dragon/src/kernels/array/one_hot_op_kernel.cu
@@ -32,8 +32,8 @@ template <> void OneHot<float, CUDAContext>(
     float* y,
     CUDAContext* ctx) {
     _OneHot
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count, depth, on_value, x, y
     );
 }

@@ -48,8 +48,8 @@ template <> void OneHot<int, CUDAContext>(
     int* y,
     CUDAContext* ctx) {
     _OneHot
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count, depth, on_value, x, y
     );
 }

@@ -64,8 +64,8 @@ template <> void OneHot<int64_t, CUDAContext>(
     int64_t* y,
     CUDAContext* ctx) {
     _OneHot
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count, depth, on_value, x, y
     );
 }

Dragon/src/kernels/array/pad_op_kernel.cu
@@ -130,8 +130,8 @@ __global__ void _EdgePad(
     T* y, \
     CUDAContext* ctx) { \
     _ConstPad \
-        << < CUDA_BLOCKS(count), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(count), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         count, ndims, \
         x_dims, x_strides, \
         y_dims, l_pads, \

@@ -152,8 +152,8 @@ __global__ void _EdgePad(
     T* y, \
     CUDAContext* ctx) { \
     _##name \
-        << < CUDA_BLOCKS(count), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(count), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         count, ndims, \
         x_dims, x_strides, \
         y_dims, l_pads, \

Dragon/src/kernels/array/reduce_sum_op_kernel.cu
@@ -202,8 +202,8 @@ void _ReduceSum(
         ndims, x_dims, y_dims,
         &rows, &cols)) {
         _ColwiseReduceSum
-            << < CUDA_2D_BLOCKS(rows), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_2D_BLOCKS(rows), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             rows, cols, scale, x, y
         ); return;
     }

@@ -213,8 +213,8 @@ void _ReduceSum(
         ndims, x_dims, y_dims,
         &rows, &cols)) {
         _RowwiseReduceSum
-            << < CUDA_2D_BLOCKS(cols), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_2D_BLOCKS(cols), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             rows, cols, scale, x, y
         ); return;
     }

@@ -245,8 +245,8 @@ void _ReduceSum(
     ctx->Memcpy<CUDAContext, CPUContext>(dbytes, YDS, dimsT.data());
     _GenericReduceSum
-        << < CUDA_2D_BLOCKS(outer_dim), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_2D_BLOCKS(outer_dim), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         ndims, outer_dim, inner_dim,
         XSS, YDS, scale, x, y
     );

@@ -372,8 +372,8 @@ template <> __global__ void _ReduceSumGrad<half>(
     T* dx, \
     CUDAContext* ctx) { \
     _ReduceSumGrad \
-        << < CUDA_BLOCKS(count), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(count), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         count, ndim, x_dims, \
         y_dims, y_strides, \
         scale, dy, dx \

@@ -398,8 +398,8 @@ template<> void ReduceSumGrad<float16, CUDAContext>(
     float16* dx,
     CUDAContext* ctx) {
     _ReduceSumGrad
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count, ndim, x_dims,
         y_dims, y_strides,
         scale,

Dragon/src/kernels/array/repeat_op_kernel.cu
@@ -93,8 +93,8 @@ template<> __global__ void _RepeatGrad<half>(
     auto y_inner_dim = inner_dim * repeats; \
     auto nthreads = outer_dim * axis_dim * y_inner_dim; \
     _Repeat \
-        << < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         nthreads, axis_dim, \
         inner_dim, y_inner_dim, \
         x, y \

@@ -113,8 +113,8 @@ template<> __global__ void _RepeatGrad<half>(
     auto y_inner_dim = inner_dim * repeats; \
     auto nthreads = outer_dim * axis_dim * inner_dim; \
     _RepeatGrad \
-        << < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         nthreads, \
         axis_dim, \
         inner_dim, \

@@ -151,8 +151,8 @@ template<> void RepeatGrad<float16, CUDAContext>(
     auto y_inner_dim = inner_dim * repeats;
     auto nthreads = outer_dim * axis_dim * inner_dim;
     _RepeatGrad
-        << < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>(
         nthreads,
         axis_dim,
         inner_dim,

Dragon/src/kernels/array/slice_op_kernel.cu
@@ -64,8 +64,8 @@ __global__ void _SliceGrad(
     auto cols = slice_dim * inner_dim; \
     auto nthreads = outer_dim * cols; \
     _##name \
-        << < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         nthreads, \
         inner_dim, \
         axis_dim, \

@@ -126,8 +126,8 @@ template <> void SliceGrad<float16, CUDAContext>(
     auto cols = slice_dim * inner_dim;
     auto nthreads = outer_dim * cols;
     _SliceGrad
-        << < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         nthreads,
         inner_dim,
         axis_dim,

Dragon/src/kernels/array/tile_op_kernel.cu
@@ -98,8 +98,8 @@ template<> __global__ void _TileGrad<half>(
     T* y, \
     CUDAContext* ctx) { \
     _Tile \
-        << < CUDA_BLOCKS(count), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(count), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         count, \
         ndims, \
         x_dims, \

@@ -120,8 +120,8 @@ template<> __global__ void _TileGrad<half>(
     auto nthreads = rows * cols; \
     auto tiled_cols = multiple * cols; \
     _TileGrad \
-        << < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         nthreads, \
         cols, \
         tiled_cols, \

@@ -156,8 +156,8 @@ template<> void TileGrad<float16, CUDAContext>(
     auto nthreads = rows * cols;
     auto tiled_cols = multiple * cols;
     _TileGrad
-        << < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         nthreads,
         cols,
         tiled_cols,

Dragon/src/kernels/array/transpose_op_kernel.cu
@@ -80,8 +80,8 @@ __global__ void _TransposeGrad(
     T* y, \
     CUDAContext* ctx) { \
     _##name \
-        << < CUDA_BLOCKS(count), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(count), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         count, ndims, x_strides, y_dims, x, y \
     ); \
 }
Dragon/src/kernels/control_flow/assign_op_kernel.cu
View file @
d1f714e
...
...
@@ -55,8 +55,8 @@ __global__ void _Assign(
T* y, \
CUDAContext* ctx) { \
_Assign \
-<< < CUDA_BLOCKS(count), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> >( \
+<<< CUDA_BLOCKS(count), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>>( \
count, \
ndims, \
x_dims, \
...
...
Dragon/src/kernels/control_flow/compare_op_kernel.cu
...
...
@@ -153,8 +153,8 @@ __global__ void _GreaterEqualHalf(
bool* y, \
CUDAContext* ctx) { \
IMPL \
-<< < CUDA_BLOCKS(count), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> >( \
+<<< CUDA_BLOCKS(count), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>>( \
count, a, b, y \
); \
}
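The `IMPL` and `_##OP##Half` parameters above show how these launchers are stamped out by the preprocessor, one per operator. A hedged sketch of that token-pasting pattern (the macro and kernel names here are hypothetical, reusing the helpers sketched earlier):

// Hypothetical DEFINE_LAUNCHER: pastes the kernel name from the
// operator name and emits the host-side wrapper.
#define DEFINE_LAUNCHER(name)                                      \
    void name(int count, const float* a, const float* b, bool* y,  \
              cudaStream_t stream) {                               \
        _##name<<<NumBlocks(count), kThreads, 0, stream>>>(        \
            count, a, b, y);                                       \
    }

__global__ void _Equal(
    int count, const float* a, const float* b, bool* y) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < count) y[i] = (a[i] == b[i]);
}

DEFINE_LAUNCHER(Equal)  // expands to void Equal(...) calling _Equal
#undef DEFINE_LAUNCHER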
...
...
@@ -167,8 +167,8 @@ __global__ void _GreaterEqualHalf(
bool* y, \
CUDAContext* ctx) { \
_##OP##Half \
-<< < CUDA_BLOCKS(count), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> >( \
+<<< CUDA_BLOCKS(count), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>>( \
count, \
reinterpret_cast<const half*>(a), \
reinterpret_cast<const half*>(b), \
...
...
Dragon/src/kernels/control_flow/masked_assign_op_kernel.cu
...
...
@@ -30,8 +30,8 @@ __global__ void _MaskedAssign(
T* y, \
CUDAContext* ctx) { \
_MaskedAssign \
-<< < CUDA_BLOCKS(count), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> >( \
+<<< CUDA_BLOCKS(count), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>>( \
count, mask, x, y \
); \
}
...
...
Dragon/src/kernels/loss/l1_loss_op_kernel.cu
...
...
@@ -27,8 +27,8 @@ template<> void AbsGrad<float, CUDAContext>(
float* dx,
CUDAContext* ctx) {
_AbsGrad
-<< < CUDA_BLOCKS(count), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(count), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
count, dy, dx
);
}
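`_AbsGrad` is launched over `CUDA_BLOCKS(count)` one-dimensional blocks; a common companion idiom in such kernels is the grid-stride loop, which stays correct even if the grid is clamped. A sketch, assuming (as the L1-loss usage suggests) that `dx` carries the forward residual on entry:

// Grid-stride sketch of an abs-gradient kernel. Assumption: dx
// holds the forward residual, whose sign scales the incoming dy.
__global__ void _AbsGradSketch(int count, const float* dy, float* dx) {
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < count;
         i += blockDim.x * gridDim.x) {
        float sign = dx[i] > 0.f ? 1.f : (dx[i] < 0.f ? -1.f : 0.f);
        dx[i] = sign * dy[i];  // d|x|/dx = sign(x)
    }
}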
...
...
Dragon/src/kernels/loss/nll_loss_op_kernel.cu
...
...
@@ -55,8 +55,8 @@ template <> void NLLLoss<float, float, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = outer_dim * inner_dim;
_NLLLoss
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, nignores,
ignore, log_prob, target, loss, flag
);
...
...
@@ -77,8 +77,8 @@ template <> void NLLLoss<float, int64_t, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = outer_dim * inner_dim;
_NLLLoss
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, nignores,
ignore, log_prob, target, loss, flag
);
...
...
@@ -129,8 +129,8 @@ template<> void NLLLossGrad<float, float, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = outer_dim * inner_dim;
_NLLLossGrad
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, nignores,
ignore, log_prob, target, dx, flag
);
...
...
@@ -151,8 +151,8 @@ template<> void NLLLossGrad<float, int64_t, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = outer_dim * inner_dim;
_NLLLossGrad
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, nignores,
ignore, log_prob, target, dx, flag
);
...
...
Dragon/src/kernels/loss/sigmoid_ce_loss_op_kernel.cu
...
...
@@ -42,8 +42,8 @@ template <> void SigmoidCrossEntropy<float, CUDAContext>(
int* flag,
CUDAContext* ctx) {
_SigmoidCrossEntropy
-<< < CUDA_BLOCKS(count), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(count), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
count, logit, target, loss, flag
);
}
...
...
@@ -77,8 +77,8 @@ template <> void SigmoidCrossEntropyGrad<float, CUDAContext>(
int* flag,
CUDAContext* ctx) {
_SigmoidCrossEntropyGrad
-<< < CUDA_BLOCKS(count), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(count), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
count, logit, target, dlogit, flag
);
}
...
...
Dragon/src/kernels/loss/sigmoid_focal_loss_op_kernel.cu
...
...
@@ -71,8 +71,8 @@ template <> void SigmoidFocalLoss<float, float, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = outer_dim * axis_dim * inner_dim;
_SigmoidFocalLoss
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim,
pos_alpha, neg_alpha, gamma, neg_id,
logits, targets, losses, flags
...
...
@@ -96,8 +96,8 @@ template <> void SigmoidFocalLoss<float, int64_t, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = outer_dim * axis_dim * inner_dim;
_SigmoidFocalLoss
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim,
pos_alpha, neg_alpha, gamma, neg_id,
logits, targets, losses, flags
...
...
@@ -171,8 +171,8 @@ template <> void SigmoidFocalLossGrad<float, float, CUDAContext>(
CUDAContext* ctx) {
auto count = outer_dim * axis_dim * inner_dim;
_SigmoidFocalLossGrad
-<< < CUDA_BLOCKS(count), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(count), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
count, axis_dim, inner_dim,
pos_alpha, neg_alpha, gamma, neg_id,
logits, targets, dlogits, flags
...
...
@@ -196,8 +196,8 @@ template <> void SigmoidFocalLossGrad<float, int64_t, CUDAContext>(
CUDAContext* ctx) {
auto count = outer_dim * axis_dim * inner_dim;
_SigmoidFocalLossGrad
-<< < CUDA_BLOCKS(count), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(count), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
count, axis_dim, inner_dim,
pos_alpha, neg_alpha, gamma, neg_id,
logits, targets, dlogits, flags
...
...
Dragon/src/kernels/loss/smooth_l1_loss_op_kernel.cu
...
...
@@ -33,8 +33,8 @@ template<> void SmoothL1<float, CUDAContext>(
float* y,
CUDAContext* ctx) {
_SmoothL1
-<< < CUDA_BLOCKS(count), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(count), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
count, beta, x, y
);
}
...
...
@@ -63,8 +63,8 @@ template<> void SmoothL1Grad<float, CUDAContext>(
float* dx,
CUDAContext* ctx) {
_SmoothL1Grad
-<< < CUDA_BLOCKS(count), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(count), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
count, beta, dy, dx
);
}
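For reference, the conventional smooth-L1 definition behind the `(count, beta, x, y)` signature, quadratic inside `|x| < beta` and linear outside; a sketch, not Dragon's verbatim kernel body:

// Smooth-L1 value and gradient with transition point beta.
__device__ float SmoothL1Val(float x, float beta) {
    float ax = fabsf(x);
    return ax < beta ? 0.5f * x * x / beta : ax - 0.5f * beta;
}

__device__ float SmoothL1Dx(float x, float beta) {
    // x / beta in the quadratic zone, sign(x) in the linear zone.
    return fabsf(x) < beta ? x / beta : (x > 0.f ? 1.f : -1.f);
}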
...
...
Dragon/src/kernels/loss/softmax_ce_loss_op_kernel.cu
...
...
@@ -29,8 +29,8 @@ template <> void SoftmaxCrossEntropy<float, CUDAContext>(
float* losses,
CUDAContext* ctx) {
_SoftmaxCrossEntropy
-<< < CUDA_BLOCKS(count), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(count), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
count, prob, targets, losses
);
}
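The `(count, prob, targets, losses)` signature reads as the elementwise cross-entropy term, with the per-class sums reduced elsewhere; a sketch under that reading:

#include <cfloat>  // FLT_MIN, to keep logf finite

// One cross-entropy term per (prob, target) pair; a separate
// reduction sums them. Assumes dense soft targets.
__global__ void _SoftmaxCESketch(
    int count, const float* prob, const float* target, float* loss) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < count) loss[i] = -target[i] * logf(fmaxf(prob[i], FLT_MIN));
}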
...
...
Dragon/src/kernels/loss/softmax_focal_loss_op_kernel.cu
...
...
@@ -67,8 +67,8 @@ template <> void SoftmaxFocalLoss<float, float, CUDAContext>(
CUDAContext* ctx) {
auto num_preds = outer_dim * inner_dim;
_SoftmaxFocalLoss
-<< < CUDA_BLOCKS(num_preds), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(num_preds), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
num_preds, axis_dim, inner_dim,
pos_alpha, neg_alpha, gamma, neg_id,
nignores, ignores,
...
...
@@ -95,8 +95,8 @@ template <> void SoftmaxFocalLoss<float, int64_t, CUDAContext>(
CUDAContext* ctx) {
auto num_preds = outer_dim * inner_dim;
_SoftmaxFocalLoss
-<< < CUDA_BLOCKS(num_preds), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(num_preds), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
num_preds, axis_dim, inner_dim,
pos_alpha, neg_alpha, gamma, neg_id,
nignores, ignores,
...
...
@@ -179,8 +179,8 @@ template<> void SoftmaxFocalLossGrad<float, float, CUDAContext>(
CUDAContext* ctx) {
auto num_preds = outer_dim * inner_dim;
_SoftmaxFocalLossGrad
-<< < CUDA_BLOCKS(num_preds), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(num_preds), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
num_preds, axis_dim, inner_dim,
pos_alpha, neg_alpha, gamma, neg_id,
nignores, ignores,
...
...
@@ -207,8 +207,8 @@ template<> void SoftmaxFocalLossGrad<float, int64_t, CUDAContext>(
CUDAContext* ctx) {
auto num_preds = outer_dim * inner_dim;
_SoftmaxFocalLossGrad
-<< < CUDA_BLOCKS(num_preds), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(num_preds), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
num_preds, axis_dim, inner_dim,
pos_alpha, neg_alpha, gamma, neg_id,
nignores, ignores,
...
...
Dragon/src/kernels/loss/sparse_softmax_ce_loss_op_kernel.cu
...
...
@@ -59,8 +59,8 @@ template <> void SparseSoftmaxCrossEntropy<float, float, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = outer_dim * inner_dim;
_SparseSoftmaxCrossEntropy
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, nignores,
ignore, prob, target, loss, flag
);
...
...
@@ -81,8 +81,8 @@ template <> void SparseSoftmaxCrossEntropy<float, int64_t, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = outer_dim * inner_dim;
_SparseSoftmaxCrossEntropy
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, nignores,
ignore, prob, target, loss, flag
);
...
...
@@ -136,8 +136,8 @@ template<> void SparseSoftmaxCrossEntropyGrad<float, float, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = outer_dim * inner_dim;
_SparseSoftmaxCrossEntropyGrad
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, nignores,
ignore, prob, target, dx, flag
);
...
...
@@ -158,8 +158,8 @@ template<> void SparseSoftmaxCrossEntropyGrad<float, int64_t, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = outer_dim * inner_dim;
_SparseSoftmaxCrossEntropyGrad
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, nignores,
ignore, prob, target, dx, flag
);
...
...
Dragon/src/kernels/misc/astype_op_kernel.cu
...
...
@@ -26,8 +26,8 @@ __global__ void _TypeA2B(
Tb* b, \
CUDAContext* ctx) { \
_TypeA2B \
-<< < CUDA_BLOCKS(count), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> >( \
+<<< CUDA_BLOCKS(count), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>>( \
count, a, b \
); \
}
...
...
@@ -66,8 +66,8 @@ template <> void TypeA2B<float16, float, CUDAContext>(
float* b,
CUDAContext* ctx) {
_TypeA2B
-<< < CUDA_BLOCKS(count), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(count), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
count, reinterpret_cast<const half*>(a), b
);
}
...
...
@@ -89,8 +89,8 @@ template <> void TypeA2B<float, float16, CUDAContext>(
float16* b,
CUDAContext* ctx) {
_TypeA2B
-<< < CUDA_BLOCKS(count), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(count), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
count, a, reinterpret_cast<half*>(b)
);
}
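The recurring `reinterpret_cast<const half*>(...)` exists because the public `float16` type is a plain 16-bit wrapper with the same layout as CUDA's `half`; the cast re-types the pointer without converting anything. A sketch (the `float16` struct here is an assumed stand-in, not Dragon's definition):

#include <cuda_fp16.h>

struct float16 { unsigned short x; };  // assumed: same layout as __half

__global__ void _HalfToFloat(int n, const half* a, float* b) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) b[i] = __half2float(a[i]);
}

void HalfToFloat(int n, const float16* a, float* b, cudaStream_t s) {
    // No data conversion here: only the pointer type changes.
    _HalfToFloat<<<(n + 255) / 256, 256, 0, s>>>(
        n, reinterpret_cast<const half*>(a), b);
}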
...
...
@@ -112,8 +112,8 @@ template <> void TypeA2B<float16, float16, CUDAContext>(
float16* b,
CUDAContext* ctx) {
_TypeA2B
-<< < CUDA_BLOCKS(count), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(count), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
count,
reinterpret_cast<const half*>(a),
reinterpret_cast<half*>(b)
...
...
Dragon/src/kernels/misc/gradient_op_kernel.cu
...
...
@@ -62,8 +62,8 @@ template <> __global__ void _GradientTwoSum<half2>(
T* dx, \
CUDAContext* ctx) { \
_GradientTwoSum \
-<< < CUDA_BLOCKS(count), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> >( \
+<<< CUDA_BLOCKS(count), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>>( \
count, dy1, dy2, dx \
); \
}
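The float16 specialization just below branches on `(count & 1) == 0` so that even-length buffers can be processed as packed `half2` pairs, two fp16 lanes per 32-bit register. A sketch of that idea with the straightforward count/2 pairing (the indexing in the file itself may differ):

#include <cuda_fp16.h>

// Packed path: one __hadd2 adds both fp16 lanes at once.
__global__ void _TwoSumHalf2(
    int n2, const half2* a, const half2* b, half2* y) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n2) y[i] = __hadd2(a[i], b[i]);
}

void TwoSumFp16(int count, const half* a, const half* b, half* y,
                cudaStream_t s) {
    if ((count & 1) == 0) {  // even: reinterpret as half2 pairs
        int n2 = count >> 1;
        _TwoSumHalf2<<<(n2 + 255) / 256, 256, 0, s>>>(
            n2, reinterpret_cast<const half2*>(a),
            reinterpret_cast<const half2*>(b),
            reinterpret_cast<half2*>(y));
    }
    // Odd counts would fall back to a scalar half kernel (omitted).
}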
...
...
@@ -83,8 +83,8 @@ template <> void GradientTwoSum<float16, CUDAContext>(
CUDAContext* ctx) {
if ((count & 1) == 0) {
_GradientTwoSum
-<< < CUDA_BLOCKS(count >> 2), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(count >> 2), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
count >> 2,
reinterpret_cast<const half2*>(dy1),
reinterpret_cast<const half2*>(dy2),
...
...
@@ -92,8 +92,8 @@ template <> void GradientTwoSum<float16, CUDAContext>(
);
} else {
_GradientTwoSum
-<< < CUDA_BLOCKS(count), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(count), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
count,
reinterpret_cast<const half*>(dy1),
reinterpret_cast<const half*>(dy2),
...
...
Dragon/src/kernels/misc/image_data_op_kernel.cu
...
...
@@ -76,14 +76,14 @@ template <> void ImageData<float, float, CUDAContext>(
auto nthreads = N * C * H * W;
if (data_format == "NCHW") {
_ImageDataNCHW
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, C, H, W, mean, std, x, y
);
} else if (data_format == "NHWC") {
_ImageDataNHWC
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, C, H, W, mean, std, x, y
);
} else {
...
...
@@ -107,14 +107,14 @@ template <> void ImageData<uint8_t, float, CUDAContext>(
auto nthreads = N * C * H * W;
if (data_format == "NCHW") {
_ImageDataNCHW
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, C, H, W, mean, std, x, y
);
} else if (data_format == "NHWC") {
_ImageDataNHWC
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, C, H, W, mean, std, x, y
);
} else {
...
...
@@ -191,15 +191,15 @@ template <> void ImageData<float, float16, CUDAContext>(
auto nthreads = N * C * H * W;
if (data_format == "NCHW") {
_ImageDataHalfNCHW
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, C, H, W, mean, std,
x, reinterpret_cast<half*>(y)
);
} else if (data_format == "NHWC") {
_ImageDataHalfNHWC
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, C, H, W, mean, std,
x, reinterpret_cast<half*>(y)
);
...
...
@@ -222,15 +222,15 @@ template <> void ImageData<uint8_t, float16, CUDAContext>(
auto nthreads = N * C * H * W;
if (data_format == "NCHW") {
_ImageDataHalfNCHW
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, C, H, W, mean, std,
x, reinterpret_cast<half*>(y)
);
} else if (data_format == "NHWC") {
_ImageDataHalfNHWC
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, C, H, W, mean, std,
x, reinterpret_cast<half*>(y)
);
...
...
Dragon/src/kernels/norm/batch_norm_op_kernel.cu
...
...
@@ -190,27 +190,27 @@ __global__ void _BatchNormInferenceGrad(
auto nthreads = N * C * S; \
if (data_format == "NCHW") { \
_BatchNormInternalGrad<Tx, Tp, StorageOrder::NCHW> \
-<< < CUDA_2D_BLOCKS(C), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> >( \
+<<< CUDA_2D_BLOCKS(C), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>>( \
N, C, S, x, mu, rsig, gamma, \
dy, ds, db, dgamma, dbeta \
); \
_BatchNormTrainingGrad<Tx, Tp, StorageOrder::NCHW> \
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> >( \
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>>( \
nthreads, N, C, S, x, mu, \
rsig, gamma, ds, db, dy, dx \
); \
} else if (data_format == "NHWC") { \
_BatchNormInternalGrad<Tx, Tp, StorageOrder::NHWC> \
-<< < CUDA_2D_BLOCKS(C), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> >( \
+<<< CUDA_2D_BLOCKS(C), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>>( \
N, C, S, x, mu, rsig, gamma, \
dy, ds, db, dgamma, dbeta \
); \
_BatchNormTrainingGrad<Tx, Tp, StorageOrder::NHWC> \
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> >( \
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>>( \
nthreads, N, C, S, x, mu, \
rsig, gamma, ds, db, dy, dx \
); \
...
...
@@ -234,24 +234,24 @@ __global__ void _BatchNormInferenceGrad(
if (data_format == "NCHW") { \
if (dgamma != nullptr) { \
_BatchNormWGrad<Tx, Tp, StorageOrder::NCHW> \
-<< < CUDA_2D_BLOCKS(C), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> > \
+<<< CUDA_2D_BLOCKS(C), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>> \
(N, C, S, x, mu, rsig, dy, dgamma, dbeta); \
} \
_BatchNormInferenceGrad<Tx, Tp, StorageOrder::NCHW> \
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> > \
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>> \
(nthreads, C, S, rsig, gamma, dy, dx); \
} else if (data_format == "NHWC") { \
if (dgamma != nullptr) { \
_BatchNormWGrad<Tx, Tp, StorageOrder::NHWC> \
-<< < CUDA_2D_BLOCKS(C), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> > \
+<<< CUDA_2D_BLOCKS(C), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>> \
(N, C, S, x, mu, rsig, dy, dgamma, dbeta); \
} \
_BatchNormInferenceGrad<Tx, Tp, StorageOrder::NHWC> \
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> > \
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>> \
(nthreads, C, S, rsig, gamma, dy, dx); \
} \
}
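The batch-norm backward here is two launches: a per-channel pass over `CUDA_2D_BLOCKS(C)` blocks that reduces the `ds`/`db` (and `dgamma`/`dbeta`) statistics, then an elementwise pass over all N*C*S values. A sketch of the per-channel shape using CUB's block reduce; the reduction strategy is an assumption, not necessarily Dragon's:

#include <cub/block/block_reduce.cuh>

// One block per channel c (NCHW); threads stride the N*S positions
// of that channel, then a block-wide reduction finishes the sum.
template <int kBlockThreads>
__global__ void _ChannelSum(
    int N, int C, int S, const float* dy, float* db) {
    typedef cub::BlockReduce<float, kBlockThreads> BlockReduce;
    __shared__ typename BlockReduce::TempStorage storage;
    const int c = blockIdx.x;
    float sum = 0.f;
    for (int j = threadIdx.x; j < N * S; j += blockDim.x)
        sum += dy[(j / S * C + c) * S + j % S];
    sum = BlockReduce(storage).Sum(sum);
    if (threadIdx.x == 0) db[c] = sum;
}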
...
...
Dragon/src/kernels/norm/group_norm_op_kernel.cu
...
...
@@ -408,20 +408,20 @@ __global__ void _GroupNormGradHalf(
CUDAContext* ctx) { \
const int C = G * D; \
_GroupNormFusedParams<Tp> \
-<< < CUDA_2D_BLOCKS(N * G), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> >( \
+<<< CUDA_2D_BLOCKS(N * G), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>>( \
N, G, D, mu, rsig, gamma, beta, scale, bias \
); \
if (data_format == "NCHW") { \
_GroupNormForwardNCHW<Tx, Tp> \
-<< < CUDA_2D_BLOCKS(N * C), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> >( \
+<<< CUDA_2D_BLOCKS(N * C), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>>( \
N, C, S, x, scale, bias, y \
); \
} else if (data_format == "NHWC") { \
_GroupNormForwardNHWC<Tx, Tp> \
-<< < CUDA_2D_BLOCKS(N * C), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> >( \
+<<< CUDA_2D_BLOCKS(N * C), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>>( \
N, C, S, x, scale, bias, y \
); \
} \
...
...
@@ -448,35 +448,35 @@ __global__ void _GroupNormGradHalf(
auto nthreads = N * G * D * S; \
if (data_format == "NCHW") { \
_GroupNormWGrad<Tx, Tp, StorageOrder::NCHW> \
-<< < CUDA_2D_BLOCKS(G * D), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> >( \
+<<< CUDA_2D_BLOCKS(G * D), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>>( \
N, G, D, S, x, mu, rsig, dy, dgamma, dbeta \
); \
_GroupNormInternalGrad<Tx, Tp, StorageOrder::NCHW> \
-<< < CUDA_2D_BLOCKS(N * G), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> >( \
+<<< CUDA_2D_BLOCKS(N * G), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>>( \
N, G, D, S, x, gamma, dy, ds, db \
); \
_GroupNormGrad<Tx, Tp, StorageOrder::NCHW> \
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> >( \
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>>( \
nthreads, G, D, S, x, mu, rsig, \
gamma, ds, db, dy, dx \
); \
} else if (data_format == "NHWC") { \
_GroupNormWGrad<Tx, Tp, StorageOrder::NHWC> \
-<< < CUDA_2D_BLOCKS(G * D), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> >( \
+<<< CUDA_2D_BLOCKS(G * D), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>>( \
N, G, D, S, x, mu, rsig, dy, dgamma, dbeta \
); \
_GroupNormInternalGrad<Tx, Tp, StorageOrder::NHWC> \
-<< < CUDA_2D_BLOCKS(N * G), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> >( \
+<<< CUDA_2D_BLOCKS(N * G), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>>( \
N, G, D, S, x, gamma, dy, ds, db \
); \
_GroupNormGrad<Tx, Tp, StorageOrder::NHWC> \
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> > ( \
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>> ( \
nthreads, G, D, S, x, mu, rsig, \
gamma, ds, db, dy, dx \
); \
...
...
@@ -503,14 +503,14 @@ template <> void GroupNormForward<float16, float, CUDAContext>(
CUDAContext* ctx) {
const int C = G * D;
_GroupNormFusedParams<float>
-<< < CUDA_2D_BLOCKS(N * G), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_2D_BLOCKS(N * G), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
N, G, D, mu, rsig, gamma, beta, scale, bias
);
if (data_format == "NCHW") {
_GroupNormForwardNCHW<half, float>
-<< < CUDA_2D_BLOCKS(N * C), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_2D_BLOCKS(N * C), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
N, C, S,
reinterpret_cast<const half*>(x),
scale, bias,
...
...
@@ -518,8 +518,8 @@ template <> void GroupNormForward<float16, float, CUDAContext>(
);
} else if (data_format == "NHWC") {
_GroupNormForwardNHWC<half, float>
-<< < CUDA_2D_BLOCKS(N * C), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_2D_BLOCKS(N * C), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
N, C, S,
reinterpret_cast<const half*>(x),
scale, bias,
...
...
@@ -548,8 +548,8 @@ template <> void GroupNormBackward<float16, float, CUDAContext>(
auto nthreads = N * G * D * S;
if (data_format == "NCHW") {
_GroupNormWGradHalf<StorageOrder::NCHW>
-<< < CUDA_2D_BLOCKS(G * D), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_2D_BLOCKS(G * D), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
N, G, D, S,
reinterpret_cast<const half*>(x),
mu, rsig,
...
...
@@ -557,8 +557,8 @@ template <> void GroupNormBackward<float16, float, CUDAContext>(
dgamma, dbeta
);
_GroupNormInternalGradHalf<StorageOrder::NCHW>
-<< < CUDA_2D_BLOCKS(N * G), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_2D_BLOCKS(N * G), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
N, G, D, S,
reinterpret_cast<const half*>(x),
gamma,
...
...
@@ -566,8 +566,8 @@ template <> void GroupNormBackward<float16, float, CUDAContext>(
ds, db
);
_GroupNormGradHalf<StorageOrder::NCHW>
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, G, D, S,
reinterpret_cast<const half*>(x),
mu, rsig, gamma, ds, db,
...
...
@@ -576,8 +576,8 @@ template <> void GroupNormBackward<float16, float, CUDAContext>(
);
} else if (data_format == "NHWC") { \
_GroupNormWGradHalf<StorageOrder::NHWC>
-<< < CUDA_2D_BLOCKS(G * D), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_2D_BLOCKS(G * D), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
N, G, D, S,
reinterpret_cast<const half*>(x),
mu, rsig,
...
...
@@ -585,8 +585,8 @@ template <> void GroupNormBackward<float16, float, CUDAContext>(
dgamma, dbeta
);
_GroupNormInternalGradHalf<StorageOrder::NHWC>
-<< < CUDA_2D_BLOCKS(N * G), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_2D_BLOCKS(N * G), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
N, G, D, S,
reinterpret_cast<const half*>(x),
gamma,
...
...
@@ -594,8 +594,8 @@ template <> void GroupNormBackward<float16, float, CUDAContext>(
ds, db
);
_GroupNormGradHalf<StorageOrder::NHWC>
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, G, D, S,
reinterpret_cast<const half*>(x),
mu, rsig, gamma, ds, db,
...
...
Dragon/src/kernels/recurrent/lstm_cell_op_kernel.cu
...
...
@@ -58,13 +58,13 @@ template <> void LSTMCell<float, CUDAContext>(
auto o_offset = 2 * C, c_offset = 3 * C,
x_offset = 4 * C, NC = N * C;
_LSTMCellAct
-<< < CUDA_BLOCKS(NC * 4), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(NC * 4), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
NC * 4, c_offset, x_offset, actx
);
_LSTMCellGate
-<< < CUDA_BLOCKS(NC), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(NC), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
NC, C, o_offset, c_offset,
x_offset, cx, actx, c, h
);
...
...
@@ -138,14 +138,14 @@ template <> void LSTMCellGrad<float, CUDAContext>(
auto o_offset = 2 * C, c_offset = 3 * C,
x_offset = 4 * C, NC = N * C;
_LSTMCellGateGrad
-<< < CUDA_BLOCKS(NC), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(NC), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
NC, C, o_offset, c_offset, x_offset,
cx, actx, c, dc, dh, dcx, dx
);
_LSTMCellActGrad
-<< < CUDA_BLOCKS(NC * 4), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(NC * 4), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
NC * 4, c_offset, x_offset, actx, dx
);
}
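The offsets above (`o_offset = 2 * C`, `c_offset = 3 * C`, `x_offset = 4 * C`) imply a pre-activation buffer of four concatenated C-wide gates per sample. For orientation, the conventional cell math under the gate order those offsets suggest; a sketch, not Dragon's kernel:

// Conventional LSTM cell, assuming gate order [i, f, o, g] and an
// already-activated buffer act of shape N x 4C.
__global__ void _LSTMCellSketch(
    int NC, int C, const float* cx, const float* act,
    float* c, float* h) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= NC) return;
    const float* g4 = act + (idx / C) * 4 * C;
    int j = idx % C;
    float i = g4[j], f = g4[C + j], o = g4[2 * C + j], g = g4[3 * C + j];
    c[idx] = f * cx[idx] + i * g;   // new cell state
    h[idx] = o * tanhf(c[idx]);     // exposed hidden state
}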
...
...
Dragon/src/kernels/update/adam_update_op_kernel.cu
...
...
@@ -39,8 +39,8 @@ template <> void AdamUpdate<float, CUDAContext>(
float* v,
CUDAContext* ctx) {
_AdamUpdate
-<< < CUDA_BLOCKS(count), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(count), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
count, lr, beta1, beta2, eps, g, m, v
);
}
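The `(count, lr, beta1, beta2, eps, g, m, v)` signature matches the textbook Adam step; a sketch assuming bias correction is folded into `lr` and that `g` is rewritten in place with the final update, as in-place updaters typically do:

// Textbook Adam step. Assumptions: lr already bias-corrected,
// g overwritten with the step to apply.
__global__ void _AdamUpdateSketch(
    int n, float lr, float beta1, float beta2, float eps,
    float* g, float* m, float* v) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;
    float gi = g[i];
    m[i] = beta1 * m[i] + (1.f - beta1) * gi;       // 1st moment
    v[i] = beta2 * v[i] + (1.f - beta2) * gi * gi;  // 2nd moment
    g[i] = lr * m[i] / (sqrtf(v[i]) + eps);
}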
...
...
Dragon/src/kernels/update/mprec_update_op_kerne.cu
...
...
@@ -29,8 +29,8 @@ template <> void MixedPrecL2Decay<float16, CUDAContext>(
float* dx,
CUDAContext* ctx) {
_MixedPrecL2DecayHalf
-<< < CUDA_BLOCKS(count), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(count), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
count,
alpha,
reinterpret_cast<const half*>(w),
...
...
@@ -58,8 +58,8 @@ template <> void MixedPrecUpdate<float16, CUDAContext>(
float16* w,
CUDAContext* ctx) {
_MixedPrecUpdateHalf
-<< < CUDA_BLOCKS(count), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(count), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
count,
updates,
reinterpret_cast<half*>(w)
...
...
Dragon/src/kernels/update/nesterov_update_op_kernel.cu
...
...
@@ -32,8 +32,8 @@ template <> void NesterovUpdate<float, CUDAContext>(
float* h,
CUDAContext* ctx) {
_NesterovUpdate
-<< < CUDA_BLOCKS(count), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(count), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
count, lr, momentum, g, h
);
}
...
...
Dragon/src/kernels/update/rmsprop_update_op_kernel.cu
...
...
@@ -34,8 +34,8 @@ template <> void RMSPropUpdate<float, CUDAContext>(
float* h,
CUDAContext* ctx) {
_RMSPropUpdate
-<< < CUDA_BLOCKS(count), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(count), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
count, lr, decay, eps, g, h
);
}
...
...
Dragon/src/kernels/update/sgd_update_op_kernel.cu
...
...
@@ -31,8 +31,8 @@ template <> void SGDUpdate<float, CUDAContext>(
float* h,
CUDAContext* ctx) {
_SGDUpdate
-<< < CUDA_BLOCKS(count), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(count), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
count, lr, momentum, g, h
);
}
...
...
Dragon/src/kernels/vision/bias_add_op_kernel.cu
...
...
@@ -52,14 +52,14 @@ template<> void BiasAdd<float, CUDAContext>(
auto nthreads = outer_dim * axis_dim * inner_dim;
if (data_format == "NCHW") {
_BiasAddNCHW
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, bias, y
);
} else if (data_format == "NHWC") {
_BiasAddNHWC
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, axis_dim, bias, y
);
} else {
...
...
Dragon/src/kernels/vision/bilinear_resize_op_kernel.cu
...
...
@@ -109,15 +109,15 @@ template <> void BilinearResize<float, CUDAContext>(
auto scale_w = (float)W / (float)out_w;
if (data_format == "NCHW") {
_BilinearResizeNCHW
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, C, H, W, out_h, out_w,
scale_h, scale_w, x, y
);
} else if(data_format == "NHWC") {
_BilinearResizeNHWC
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, C, H, W, out_h, out_w,
scale_h, scale_w, x, y
);
...
...
@@ -224,15 +224,15 @@ template <> void BilinearResizeGrad<float, CUDAContext>(
auto scale_w = (float)W / (float)out_w;
if (data_format == "NCHW") {
_BilinearResizeGradNCHW
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, C, H, W, out_h, out_w,
scale_h, scale_w, dy, dx
);
} else if(data_format == "NHWC") {
_BilinearResizeGradNHWC
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, C, H, W, out_h, out_w,
scale_h, scale_w, dy, dx
);
...
...
Dragon/src/kernels/vision/conv_op_kernel.cu
...
...
@@ -123,8 +123,8 @@ template <> void Im2Col2d<float, CUDAContext>(
auto nthreads = C * out_h * out_w;
if (data_format == "NCHW") {
_Im2Col2dNCHW
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
H, W,
out_h, out_w,
...
...
@@ -136,8 +136,8 @@ template <> void Im2Col2d<float, CUDAContext>(
);
} else if (data_format == "NHWC") {
_Im2Col2dNHWC
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
...
...
@@ -286,8 +286,8 @@ template <> void Col2Im2d<float, CUDAContext>(
const int nthreads = C * H * W;
if (data_format == "NCHW") {
_Col2Im2dNCHW
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
H, W,
out_h, out_w,
...
...
@@ -299,8 +299,8 @@ template <> void Col2Im2d<float, CUDAContext>(
);
} else if (data_format == "NHWC") {
_Col2Im2dNHWC
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
...
...
Dragon/src/kernels/vision/depthwise_conv_op_kernel.cu
...
...
@@ -144,8 +144,8 @@ template <> void DepthwiseConv2d<float, CUDAContext>(
if (data_format == "NCHW") {
if (kernel_h == 3 && kernel_w == 3) {
_DepthwiseConv2dNCHW<float, 3, 3>
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
...
...
@@ -157,8 +157,8 @@ template <> void DepthwiseConv2d<float, CUDAContext>(
);
} else if (kernel_h == 5 && kernel_w == 5) {
_DepthwiseConv2dNCHW<float, 5, 5>
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
...
...
@@ -170,8 +170,8 @@ template <> void DepthwiseConv2d<float, CUDAContext>(
);
} else if (kernel_h == 7 && kernel_w == 7) {
_DepthwiseConv2dNCHW<float, 7, 7>
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
...
...
@@ -183,8 +183,8 @@ template <> void DepthwiseConv2d<float, CUDAContext>(
);
} else {
_DepthwiseConv2dNCHW<float, -1, -1>
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
...
...
@@ -198,8 +198,8 @@ template <> void DepthwiseConv2d<float, CUDAContext>(
} else if (data_format == "NHWC") {
if (kernel_h == 3 && kernel_w == 3) {
_DepthwiseConv2dNHWC<float, 3, 3>
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
...
...
@@ -211,8 +211,8 @@ template <> void DepthwiseConv2d<float, CUDAContext>(
);
} else if (kernel_h == 5 && kernel_w == 5) {
_DepthwiseConv2dNHWC<float, 5, 5>
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
...
...
@@ -224,8 +224,8 @@ template <> void DepthwiseConv2d<float, CUDAContext>(
);
} else if (kernel_h == 7 && kernel_w == 7) {
_DepthwiseConv2dNHWC<float, 7, 7>
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
...
...
@@ -237,8 +237,8 @@ template <> void DepthwiseConv2d<float, CUDAContext>(
);
} else {
_DepthwiseConv2dNHWC<float, -1, -1>
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
...
...
@@ -394,8 +394,8 @@ template <> void DepthwiseConv2dGrad<float, CUDAContext>(
if (data_format == "NCHW") {
if (kernel_h == 3 && kernel_w == 3) {
_DepthwiseConv2dGradNCHW<float, 3, 3>
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
...
...
@@ -407,8 +407,8 @@ template <> void DepthwiseConv2dGrad<float, CUDAContext>(
);
} else if (kernel_h == 5 && kernel_w == 5) {
_DepthwiseConv2dGradNCHW<float, 5, 5>
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
...
...
@@ -420,8 +420,8 @@ template <> void DepthwiseConv2dGrad<float, CUDAContext>(
);
} else if (kernel_h == 7 && kernel_w == 7) {
_DepthwiseConv2dGradNCHW<float, 7, 7>
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
...
...
@@ -433,8 +433,8 @@ template <> void DepthwiseConv2dGrad<float, CUDAContext>(
);
} else {
_DepthwiseConv2dGradNCHW<float, -1, -1>
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
...
...
@@ -448,8 +448,8 @@ template <> void DepthwiseConv2dGrad<float, CUDAContext>(
} else if (data_format == "NHWC") {
if (kernel_h == 3 && kernel_w == 3) {
_DepthwiseConv2dGradNHWC<float, 3, 3>
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
...
...
@@ -461,8 +461,8 @@ template <> void DepthwiseConv2dGrad<float, CUDAContext>(
);
} else if (kernel_h == 5 && kernel_w == 5) {
_DepthwiseConv2dGradNHWC<float, 5, 5>
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
...
...
@@ -474,8 +474,8 @@ template <> void DepthwiseConv2dGrad<float, CUDAContext>(
);
} else if (kernel_h == 7 && kernel_w == 7) {
_DepthwiseConv2dGradNHWC<float, 7, 7>
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
...
...
@@ -487,8 +487,8 @@ template <> void DepthwiseConv2dGrad<float, CUDAContext>(
);
} else {
_DepthwiseConv2dGradNHWC<float, -1, -1>
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
...
...
@@ -634,8 +634,8 @@ template <> void DepthwiseConv2dWGrad<float, CUDAContext>(
auto nblocks = C * kernel_h * kernel_w;
if (data_format == "NCHW") {
_DepthwiseConv2dWGradNCHW
-<< < nblocks, nthreads,
-    0, ctx->cuda_stream() >> >(
+<<< nblocks, nthreads,
+    0, ctx->cuda_stream() >>>(
N, C, H, W,
out_h, out_w,
kernel_h, kernel_w,
...
...
@@ -646,8 +646,8 @@ template <> void DepthwiseConv2dWGrad<float, CUDAContext>(
);
} else if (data_format == "NHWC") {
_DepthwiseConv2dWGradNHWC
-<< < nblocks, nthreads,
-    0, ctx->cuda_stream() >> >(
+<<< nblocks, nthreads,
+    0, ctx->cuda_stream() >>>(
N, C, H, W,
out_h, out_w,
kernel_h, kernel_w,
...
...
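The depthwise kernels above are instantiated as `_DepthwiseConv2dNCHW<float, 3, 3>`, `<float, 5, 5>`, `<float, 7, 7>`, with `<float, -1, -1>` as the runtime-sized fallback: a static filter size lets the tap loops unroll completely. A 1-D sketch of that dispatch (names hypothetical; assumes `x` has `n + k - 1` valid entries, i.e. no padding handling):

// Compile-time tap count K: the loop bound is a constant when
// K > 0 and unrolls fully; K == -1 uses the runtime value.
template <int K>
__global__ void _Conv1dSketch(
    int n, int runtime_k, const float* x, const float* w, float* y) {
    const int k = K > 0 ? K : runtime_k;
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;
    float acc = 0.f;
    for (int t = 0; t < k; ++t) acc += w[t] * x[i + t];
    y[i] = acc;
}

// Host dispatch mirrors the kernel_h == 3 && kernel_w == 3 tests:
//   if (k == 3) _Conv1dSketch<3><<<g, b, 0, s>>>(n, k, x, w, y);
//   else        _Conv1dSketch<-1><<<g, b, 0, s>>>(n, k, x, w, y);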
Dragon/src/kernels/vision/drop_block_op_kernel.cu
...
...
@@ -77,16 +77,12 @@ template <> void DropBlock2d<CUDAContext>(
int* mask,
CUDAContext* ctx) {
auto nthreads = N * C * seed_h * seed_w;
-math::RandomUniform(
-    nthreads,
-    0.f, float(UINT_MAX),
-    seed, ctx
-);
+math::RandomUniform(nthreads, 0.f, 1.f, seed, ctx);
auto mask_thresh = (uint32_t)(UINT_MAX * gamma);
if (data_format == "NCHW") {
_DropBlock2dNCHW
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
seed_h, seed_w,
...
...
@@ -96,8 +92,8 @@ template <> void DropBlock2d<CUDAContext>(
);
} else if(data_format == "NHWC") {
_DropBlock2dNHWC
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
seed_h, seed_w,
...
...
Dragon/src/kernels/vision/nn_resize_op_kernel.cu
...
...
@@ -81,15 +81,15 @@ template <> void NNResize<float, CUDAContext>(
auto scale_w = (float)W / (float)out_w;
if (data_format == "NCHW") {
_NNResizeNCHW
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, C, H, W, out_h, out_w,
scale_h, scale_w, x, y
);
} else if(data_format == "NHWC") {
_NNResizeNHWC
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, C, H, W, out_h, out_w,
scale_h, scale_w, x, y
);
...
...
@@ -116,8 +116,8 @@ template <> void NNResize<float16, CUDAContext>(
auto scale_w = (float)W / (float)out_w;
if (data_format == "NCHW") {
_NNResizeNCHW
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, C, H, W,
out_h, out_w, scale_h, scale_w,
reinterpret_cast<const half*>(x),
...
...
@@ -125,8 +125,8 @@ template <> void NNResize<float16, CUDAContext>(
);
} else if(data_format == "NHWC") {
_NNResizeNHWC
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, C, H, W,
out_h, out_w, scale_h, scale_w,
reinterpret_cast<const half*>(x),
...
...
@@ -209,15 +209,15 @@ template <> void NNResizeGrad<float, CUDAContext>(
auto scale_w = (float)W / (float)out_w;
if (data_format == "NCHW") {
_NNResizeGradNCHW
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, C, H, W, out_h, out_w,
scale_h, scale_w, dy, dx
);
} else if(data_format == "NHWC") {
_NNResizeGradNHWC
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, C, H, W, out_h, out_w,
scale_h, scale_w, dy, dx
);
...
...
Dragon/src/kernels/vision/pool_op_kernel.cu
...
...
@@ -120,8 +120,8 @@ template<> void MaxPool2d<float, CUDAContext>(
auto nthreads = N * C * pool_h * pool_w;
if (data_format == "NCHW") {
_MaxPool2dNCHW
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
pool_h, pool_w,
...
...
@@ -132,8 +132,8 @@ template<> void MaxPool2d<float, CUDAContext>(
);
} else if (data_format == "NHWC") {
_MaxPool2dNHWC
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
pool_h, pool_w,
...
...
@@ -256,8 +256,8 @@ template<> void AvgPool2d<float, CUDAContext>(
auto nthreads = N * C * pool_h * pool_w;
if (data_format == "NCHW") {
_AvgPool2dNCHW
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
pool_h, pool_w,
...
...
@@ -268,8 +268,8 @@ template<> void AvgPool2d<float, CUDAContext>(
);
} else if (data_format == "NHWC") {
_AvgPool2dNHWC
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
pool_h, pool_w,
...
...
@@ -392,8 +392,8 @@ template<> void MaxPool2dGrad<float, CUDAContext>(
auto nthreads = N * C * H * W;
if (data_format == "NCHW") {
_MaxPool2dGrad_NCHW
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
pool_h, pool_w,
...
...
@@ -404,8 +404,8 @@ template<> void MaxPool2dGrad<float, CUDAContext>(
);
} else if (data_format == "NHWC") {
_MaxPool2dGradNHWC
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
pool_h, pool_w,
...
...
@@ -531,8 +531,8 @@ template<> void AvgPool2dGrad<float, CUDAContext>(
auto nthreads = N * C * H * W;
if (data_format == "NCHW") {
_AvgPool2dGradNCHW
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
pool_h, pool_w,
...
...
@@ -543,8 +543,8 @@ template<> void AvgPool2dGrad<float, CUDAContext>(
);
} else if (data_format == "NHWC") {
_AvgPool2dGradNHWC
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
pool_h, pool_w,
...
...
Dragon/src/kernels/vision/roi_align_op_kernel.cu
...
...
@@ -132,8 +132,8 @@ template<> void ROIAlign<float, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = num_rois * C * pool_h * pool_w;
_ROIAlign
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
pool_h, pool_w,
...
...
@@ -283,8 +283,8 @@ template<> void ROIAlignGrad<float, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = num_rois * C * pool_h * pool_w;
_ROIAlignGrad
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
pool_h, pool_w,
...
...
Dragon/src/kernels/vision/roi_align_op_kernel.fp16.cu
...
...
@@ -134,8 +134,8 @@ template<> void ROIAlign<float16, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = num_rois * C * pool_h * pool_w;
_ROIAlignHalf
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>
(nthreads, C, H, W, pool_h, pool_w,
sampling_ratio, spatial_scale,
reinterpret_cast<const half*>(x), rois,
...
...
Dragon/src/kernels/vision/roi_pool_op_kernel.cu
...
...
@@ -92,8 +92,8 @@ template<> void ROIPool<float, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = num_rois * C * pool_h * pool_w;
_ROIPool
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
pool_h, pool_w,
...
...
@@ -185,8 +185,8 @@ template<> void ROIPool<float16, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = num_rois * C * pool_h * pool_w;
_ROIPoolHalf
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
pool_h, pool_w,
...
...
@@ -286,8 +286,8 @@ template<> void ROIPoolGrad<float, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = N * C * H * W;
_ROIPoolGrad
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
num_rois,
C, H, W,
...
...
Dragon/src/onnx/onnx_backend.cc
...
...
@@ -180,9 +180,9 @@ ONNXBackend::get_special_nodes() const {
    };
    return kSpecialNodes;
}

const Map<string, Map<string, string>>&
ONNXBackend::get_node_renamed_attrs() const {
    const static Map<string, Map<string, string>>
        kPerNodeRenamedAttrs = {
            { "Gemm", { { "transB", "transW" } } },
            { "BatchNormalization", { { "epsilon", "eps" } } },
...
...
Dragon/src/onnx/onnx_backend.h
...
...
@@ -221,7 +221,7 @@ class ONNXBackend {
    const Map<string, SpecialNodeConverter>& get_special_nodes() const;
    const Map<string, string>& get_renamed_attrs() const;
    const Map<string, Map<string, string>>& get_node_renamed_attrs() const;
};

} // namespace onnx
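`get_node_renamed_attrs()` layers per-op-type attribute renames (e.g. Gemm's `transB` -> `transW`) over the global table from `get_renamed_attrs()`. A hedged sketch of how a converter might consult the two maps; the helper and the `Map` alias are hypothetical:

#include <string>
#include <unordered_map>
using std::string;
template <class K, class V>
using Map = std::unordered_map<K, V>;  // assumed alias

// Per-node renames win; otherwise fall back to the global table,
// then to the attribute name itself.
string RenameAttr(const string& op_type, const string& attr,
                  const Map<string, Map<string, string>>& per_node,
                  const Map<string, string>& global) {
    auto it = per_node.find(op_type);
    if (it != per_node.end()) {
        auto jt = it->second.find(attr);
        if (jt != it->second.end()) return jt->second;
    }
    auto kt = global.find(attr);
    return kt != global.end() ? kt->second : attr;
}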
...
...
Dragon/src/operators/activation/cudnn_dropout_op.cc
...
...
@@ -77,15 +77,8 @@ template <class Context>
void CuDNNDropoutOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else if (XIsType(X(0), float16)) {
-        RunImpl<float16>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

template <class Context>
template <typename T>
...
...
@@ -147,15 +140,8 @@ template <class Context>
void CuDNNDropoutGradientOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else if (XIsType(X(0), float16)) {
-        RunImpl<float16>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

DEPLOY_CUDNN(Dropout);
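This hunk is the commit's actual subject: every hand-rolled `if (XIsType(...)) RunImpl<T>(); ... else LOG(FATAL)` chain collapses into one `DispatchHelper<TensorTypes<...>>::Call(this, X(0))`. A minimal sketch of how such a dispatcher can be built by recursing over the type list; Dragon's real implementation surely differs in detail (the `meta()` accessor here is an assumption):

#include <stdexcept>
#include <typeindex>

template <typename... Types> struct TensorTypes {};

// Try each T in order; on a runtime type match, call
// op->template RunImpl<T>() and stop.
template <typename Sizes> struct DispatchHelper;

template <typename T, typename... Rest>
struct DispatchHelper<TensorTypes<T, Rest...>> {
    template <typename Op, typename Tensor>
    static void Call(Op* op, const Tensor& x) {
        if (x.meta() == std::type_index(typeid(T)))
            return op->template RunImpl<T>();
        DispatchHelper<TensorTypes<Rest...>>::Call(op, x);
    }
};

template <>
struct DispatchHelper<TensorTypes<>> {
    template <typename Op, typename Tensor>
    static void Call(Op*, const Tensor&) {  // exhausted: unsupported
        throw std::runtime_error("Unsupported tensor type.");
    }
};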
...
...
Dragon/src/operators/activation/cudnn_elu_op.cc
...
...
@@ -26,15 +26,8 @@ template <class Context>
void CuDNNEluOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else if (XIsType(X(0), float16)) {
-        RunImpl<float16>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

template <class Context>
template <typename T>
...
...
@@ -60,15 +53,8 @@ template <class Context>
void CuDNNEluGradientOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else if (XIsType(X(0), float16)) {
-        RunImpl<float16>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

DEPLOY_CUDNN(Elu);
...
...
Dragon/src/operators/activation/cudnn_relu_op.cc
...
...
@@ -40,15 +40,8 @@ void CuDNNReluOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else if (XIsType(X(0), float16)) {
-        RunImpl<float16>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

template <class Context>
template <typename T>
...
...
@@ -92,15 +85,8 @@ void CuDNNReluGradientOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else if (XIsType(X(0), float16)) {
-        RunImpl<float16>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

DEPLOY_CUDNN(Relu);
...
...
Dragon/src/operators/activation/cudnn_sigmoid_op.cc
...
...
@@ -35,15 +35,8 @@ template <class Context>
void CuDNNSigmoidOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else if (XIsType(X(0), float16)) {
-        RunImpl<float16>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

template <class Context>
template <typename T>
...
...
@@ -82,15 +75,8 @@ template <class Context>
void CuDNNSigmoidGradientOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else if (XIsType(X(0), float16)) {
-        RunImpl<float16>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

DEPLOY_CUDNN(Sigmoid);
...
...
Dragon/src/operators/activation/cudnn_softmax_op.cc
...
...
@@ -45,15 +45,8 @@ void CuDNNSoftmaxOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else if (XIsType(X(0), float16)) {
-        RunImpl<float16>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

template <class Context>
template <typename T>
...
...
@@ -91,15 +84,8 @@ void CuDNNSoftmaxGradientOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else if (XIsType(X(0), float16)) {
-        RunImpl<float16>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

DEPLOY_CUDNN(Softmax);
...
...
Dragon/src/operators/activation/cudnn_tanh_op.cc
...
...
@@ -35,15 +35,8 @@ template <class Context>
void CuDNNTanhOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else if (XIsType(X(0), float16)) {
-        RunImpl<float16>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

template <class Context>
template <typename T>
...
...
@@ -82,15 +75,8 @@ template <class Context>
void CuDNNTanhGradientOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else if (XIsType(X(0), float16)) {
-        RunImpl<float16>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

DEPLOY_CUDNN(Tanh);
...
...
Dragon/src/operators/activation/dropout_op.cc
...
...
@@ -44,15 +44,8 @@ template <class Context>
void DropoutOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else if (XIsType(X(0), float16)) {
-        RunImpl<float16>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

template <class Context>
template <typename T>
...
...
@@ -83,15 +76,8 @@ template <class Context>
void DropoutGradientOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else if (XIsType(X(0), float16)) {
-        RunImpl<float16>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

DEPLOY_CPU(Dropout);
...
...
Dragon/src/operators/activation/droppath_op.cc
...
...
@@ -52,15 +52,8 @@ void DropPathOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else if (XIsType(X(0), float16)) {
-        RunImpl<float16>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

template <class Context>
template <typename T>
...
...
@@ -97,15 +90,8 @@ void DropPathGradientOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else if (XIsType(X(0), float16)) {
-        RunImpl<float16>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

DEPLOY_CPU(DropPath);
...
...
Dragon/src/operators/activation/elu_op.cc
...
...
@@ -20,13 +20,8 @@ template <class Context>
void EluOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32" });
-    }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
}

template <class Context>
template <typename T>
...
...
@@ -46,13 +41,8 @@ template <class Context>
void EluGradientOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32" });
-    }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
}

DEPLOY_CPU(Elu);
...
...
Dragon/src/operators/activation/prelu_op.cc
...
...
@@ -40,13 +40,8 @@ void PReluOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32" });
-    }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
}

template <class Context>
template <typename T>
...
...
@@ -98,13 +93,8 @@ void PReluGradientOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
    Y(1)->ReshapeLike(X(1));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32" });
-    }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
}

DEPLOY_CPU(PRelu);
...
...
Dragon/src/operators/activation/relu_op.cc
...
...
@@ -20,15 +20,8 @@ template <class Context>
void ReluOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else if (XIsType(X(0), float16)) {
-        RunImpl<float16>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

template <class Context>
template <typename T>
...
...
@@ -48,15 +41,8 @@ template <class Context>
void ReluGradientOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else if (XIsType(X(0), float16)) {
-        RunImpl<float16>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

DEPLOY_CPU(Relu);
...
...
Dragon/src/operators/activation/selu_op.cc
...
...
@@ -19,15 +19,8 @@ template <class Context>
void SEluOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else if (XIsType(X(0), float16)) {
-        RunImpl<float16>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

template <class Context>
template <typename T>
...
...
@@ -47,15 +40,8 @@ template <class Context>
void SEluGradientOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else if (XIsType(X(0), float16)) {
-        RunImpl<float16>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

DEPLOY_CPU(SElu);
...
...
Dragon/src/operators/activation/sigmoid_op.cc
...
...
@@ -15,13 +15,8 @@ template <class Context>
void SigmoidOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32" });
-    }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
}

template <class Context>
template <typename T>
...
...
@@ -41,13 +36,8 @@ template <class Context>
void SigmoidGradientOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32" });
-    }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
}

DEPLOY_CPU(Sigmoid);
...
...
Dragon/src/operators/activation/softmax_op.cc
...
...
@@ -43,13 +43,8 @@ void SoftmaxOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32" });
-    }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
}

template <class Context>
template <typename T>
...
...
@@ -86,13 +81,8 @@ void SoftmaxGradientOp<Context>::RunOnDevice() {
Y
(
0
)
->
ReshapeLike
(
X
(
0
));
if
(
XIsType
(
X
(
0
),
float
))
{
RunImpl
<
float
>
();
}
else
{
LOG
(
FATAL
)
<<
DTypeString
(
X
(
0
),
{
"float32"
}
);
}
DispatchHelper
<
TensorTypes
<
float
>>::
Call
(
this
,
X
(
0
));
}
DEPLOY_CPU
(
Softmax
);
...
...
Dragon/src/operators/activation/tanh_op.cc
...
@@ -15,13 +15,8 @@ template <class Context>
 void TanhOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0), { "float32" });
-    }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -41,13 +36,8 @@ template <class Context>
 void TanhGradientOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0), { "float32" });
-    }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 DEPLOY_CPU(Tanh);
...
Dragon/src/operators/arithmetic/affine_op.cc
...
@@ -46,15 +46,8 @@ void AffineOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -111,9 +104,7 @@ void AffineGradientOp<Context>::RunImpl() {
 }

 template <class Context> template <typename T>
-void AffineGradientOp<Context>::Reduce(
-    T* x, T* y) {
+void AffineGradientOp<Context>::Reduce(T* x, T* y) {
     vec32_t dims = {
         (int)outer_dim_,
         (int)scale_dim_,
...
@@ -138,15 +129,8 @@ void AffineGradientOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(-1));
-    if (XIsType(X(-1), float)) { RunImpl<float>(); }
-    else if (XIsType(X(-1), float16)) { RunImpl<float16>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(-1),
-            { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(-1));
 }

 DEPLOY_CPU(Affine);
...
Dragon/src/operators/arithmetic/cudnn_affine_op.cc
...
@@ -108,13 +108,6 @@ void CuDNNAffineOp<Context>::RunOnDevice() {
 template <class Context> template <typename DT, typename CT>
 void CuDNNAffineGradientOp<Context>::RunImpl() {
     this->template ResetDesc<DT>(X(-1));
-    scale_dim_ = X(1).count();
-    outer_dim_ = X(-1).count(0, axis_);
-    inner_dim_ = X(-1).count(axis_ + num_axes_);
-    dim_ = scale_dim_ * inner_dim_;
-    reduce_dim_ = std::max(outer_dim_, inner_dim_);
-    Y(0)->ReshapeLike(X(-1));
     auto* alpha = X(1).template data<DT, Context>();
     auto* dy = X(-1).template mutable_data<DT, Context>();
...
@@ -230,9 +223,7 @@ void CuDNNAffineGradientOp<Context>::CuDNNReduce(
 }

 template <class Context> template <typename T>
-void CuDNNAffineGradientOp<Context>::Reduce(
-    T* x, T* y) {
+void CuDNNAffineGradientOp<Context>::Reduce(T* x, T* y) {
     vec32_t dims = {
         (int)outer_dim_,
         (int)scale_dim_,
...
@@ -248,6 +239,14 @@ void CuDNNAffineGradientOp<Context>::Reduce(
 template <class Context>
 void CuDNNAffineGradientOp<Context>::RunOnDevice() {
+    scale_dim_ = X(1).count();
+    outer_dim_ = X(-1).count(0, axis_);
+    inner_dim_ = X(-1).count(axis_ + num_axes_);
+    dim_ = scale_dim_ * inner_dim_;
+    reduce_dim_ = std::max(outer_dim_, inner_dim_);
+    Y(0)->ReshapeLike(X(-1));
     if (XIsType(X(-1), float)) {
         RunImpl<float, float>();
     } else if (XIsType(X(-1), float16)) {
...
Dragon/src/operators/arithmetic/eltwise_op.cc
...
@@ -36,6 +36,13 @@ void EltwiseOp<Context>::ProdRunImpl() {
+template <class Context> template <typename T>
+void EltwiseOp<Context>::RunImpl() {
+    if (operation_ == "SUM") SumRunImpl<T>();
+    else if (operation_ == "PROD") ProdRunImpl<T>();
+    else LOG(FATAL) << "Unknwon Operation: " << operation_;
+}
+
 template <class Context>
 void EltwiseOp<Context>::RunOnDevice() {
     for (int i = 1; i < XSize(); i++) {
         CHECK(X(i).dims() == X(0).dims())
             << "\nExcepted Input(" << i << ")'s dims as "
...
@@ -45,33 +52,10 @@ void EltwiseOp<Context>::RunImpl() {
     Y(0)->ReshapeLike(X(0));
-    if (operation_ == "SUM") SumRunImpl<T>();
-    else if (operation_ == "PROD") ProdRunImpl<T>();
-    else LOG(FATAL) << "Unknwon Operation: " << operation_;
-}
-
-template <class Context>
-void EltwiseOp<Context>::RunOnDevice() {
-    if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -133,26 +117,10 @@ void EltwiseGradientOp<Context>::RunImpl() {
 template <class Context>
 void EltwiseGradientOp<Context>::RunOnDevice() {
-    if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(Eltwise);
...
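Note the design move in the first eltwise hunk: the runtime operation_ branch is hoisted into the templated RunImpl<T>, so the type dispatcher only needs one entry point per element type. A toy illustration of the shape of that pattern (a stand-in class, not Dragon's code):

    #include <cstdio>
    #include <string>

    struct ToyEltwiseOp {
        std::string operation_ = "SUM";

        template <typename T> void SumRunImpl()  { std::puts("sum");  }
        template <typename T> void ProdRunImpl() { std::puts("prod"); }

        // One typed entry point; the string branch lives inside it.
        template <typename T>
        void RunImpl() {
            if (operation_ == "SUM") SumRunImpl<T>();
            else if (operation_ == "PROD") ProdRunImpl<T>();
            else std::printf("Unknown operation: %s\n", operation_.c_str());
        }
    };

    int main() {
        ToyEltwiseOp op;
        op.RunImpl<float>();  // dispatched by type once, branched by name inside
    }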
Dragon/src/operators/arithmetic/exp_op.cc
...
@@ -15,17 +15,9 @@ template <class Context>
 void ExpOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "float16", "float32", "float64" });
-    }
+    DispatchHelper<TensorTypes<float, float16, double>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -40,17 +32,9 @@ template <class Context>
 void ExpGradientOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "float16", "float32", "float64" });
-    }
+    DispatchHelper<TensorTypes<float, float16, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(Exp);
...
Dragon/src/operators/arithmetic/fully_connected_op.cc
...
@@ -84,6 +84,12 @@ void FullyConnectedOp<Context>::NoTransRunImpl() {
     }
 }

+template <class Context> template <typename T>
+void FullyConnectedOp<Context>::RunImpl() {
+    if (transW_) TransRunImpl<T>();
+    else NoTransRunImpl<T>();
+}
+
 template <class Context>
 void FullyConnectedOp<Context>::RunOnDevice() {
     DETERMINE_RUNTIME_ARGS(X(0));
...
@@ -101,31 +107,12 @@ void FullyConnectedOp<Context>::RunOnDevice() {
     for (int i = 0; i < axis_ + 1; i++) {
         out_shape[i] = i < axis_ ? X(0).dim(i) : N_;
     }
     Y(0)->Reshape(out_shape);
-    if (XIsType(X(0), float16)) {
-        if (transW_) { TransRunImpl<float16>(); }
-        else { NoTransRunImpl<float16>(); }
-    } else if (XIsType(X(0), float)) {
-        if (transW_) { TransRunImpl<float>(); }
-        else { NoTransRunImpl<float>(); }
-    } else if (XIsType(X(0), double)) {
-        if (transW_) { TransRunImpl<double>(); }
-        else { NoTransRunImpl<double>(); }
-    } else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "float16", "float32", "float64" });
-    }
+    DispatchHelper<TensorTypes<float, float16, double>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -209,17 +196,9 @@ void FullyConnectedGradientOp<Context>::RunOnDevice() {
             << X(1).DimString();
     }
-    if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "float16", "float32", "float64" });
-    }
+    DispatchHelper<TensorTypes<float, float16, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(FullyConnected);
...
Dragon/src/operators/arithmetic/gram_matrix_op.cc
...
@@ -35,17 +35,9 @@ void GramMatrixOp<Context>::RunOnDevice() {
         { outer_dim_, axis_dim_, axis_dim_ });
-    if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "float16", "float32", "float64" });
-    }
+    DispatchHelper<TensorTypes<float, float16, double>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -79,17 +71,9 @@ void GramMatrixGradientOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "float16", "float32", "float64" });
-    }
+    DispatchHelper<TensorTypes<float, float16, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(GramMatrix);
...
Dragon/src/operators/arithmetic/log_op.cc
...
@@ -14,17 +14,9 @@ template <class Context>
 void LogOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "float16", "float32", "float64" });
-    }
+    DispatchHelper<TensorTypes<float, float16, double>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -39,17 +31,9 @@ template <class Context>
 void LogGradientOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "float16", "float32", "float64" });
-    }
+    DispatchHelper<TensorTypes<float, float16, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(Log);
...
Dragon/src/operators/arithmetic/matmul_op.cc
...
@@ -65,17 +65,9 @@ void MatmulOp<Context>::RunOnDevice() {
     Y(0)->Reshape(out_shape);
-    if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "float16", "float32", "float64" });
-    }
+    DispatchHelper<TensorTypes<float, float16, double>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -182,17 +174,9 @@ void MatmulGradientOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
     Y(1)->ReshapeLike(X(1));
-    if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "float16", "float32", "float64" });
-    }
+    DispatchHelper<TensorTypes<float, float16, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(Matmul);
...
Dragon/src/operators/arithmetic/maximum_op.cc
...
@@ -58,26 +58,10 @@ void MaximumOp<Context>::RunImpl() {
 template <class Context>
 void MaximumOp<Context>::RunOnDevice() {
-    if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -135,9 +119,6 @@ void MaximumGradientOp<Context>::BroadcastRunImpl() {
 template <class Context> template <typename T>
 void MaximumGradientOp<Context>::RunImpl() {
-    Y(0)->ReshapeLike(X(0));
-    Y(1)->ReshapeLike(X(1));
     if (X(0).dims() == X(1).dims()) {
         EltwiseRunImpl<T>();
     } else {
...
@@ -147,26 +128,13 @@ void MaximumGradientOp<Context>::RunImpl() {
 template <class Context>
 void MaximumGradientOp<Context>::RunOnDevice() {
-    if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    Y(0)->ReshapeLike(X(0));
+    Y(1)->ReshapeLike(X(1));
+    DispatchHelper<TensorTypes
+        <int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(Maximum);
...
Dragon/src/operators/arithmetic/minimum_op.cc
...
@@ -58,26 +58,10 @@ void MinimumOp<Context>::RunImpl() {
 template <class Context>
 void MinimumOp<Context>::RunOnDevice() {
-    if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -135,9 +119,6 @@ void MinimumGradientOp<Context>::BroadcastRunImpl() {
 template <class Context> template <typename T>
 void MinimumGradientOp<Context>::RunImpl() {
-    Y(0)->ReshapeLike(X(0));
-    Y(1)->ReshapeLike(X(1));
     if (X(0).dims() == X(1).dims()) {
         EltwiseRunImpl<T>();
     } else {
...
@@ -147,26 +128,13 @@ void MinimumGradientOp<Context>::RunImpl() {
 template <class Context>
 void MinimumGradientOp<Context>::RunOnDevice() {
-    if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    Y(0)->ReshapeLike(X(0));
+    Y(1)->ReshapeLike(X(1));
+    DispatchHelper<TensorTypes
+        <int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(Minimum);
...
Dragon/src/operators/arithmetic/moments_op.cc
...
@@ -7,6 +7,31 @@ namespace dragon {
 template <class Context> template <typename Tx, typename Ty>
+void MomentsOp<Context>::RunImpl() {
+    auto* x = X(0).template data<Tx, Context>();
+    auto* mean = Y(0)->template mutable_data<Ty, Context>();
+    auto* var = Y(1)->template mutable_data<Ty, Context>();
+    if (X(0).count() == 1) {
+        kernel::TypeA2B(Y(0)->count(), x, mean, ctx());
+        math::Set(Y(0)->count(),
+            cast::to<Ty>(0.f), var, ctx());
+    } else {
+        kernel::Moments(
+            (int)dims32_.size(), dims32_.data(),
+            (int)axes32_.size(), axes32_.data(),
+            x, mean, var, ctx());
+    }
+}
+
+template <class Context>
 void MomentsOp<Context>::RunOnDevice() {
+    dims_ = X(0).dims();
+    axes32_.clear();
+    dims32_.assign(dims_.begin(), dims_.end());
+    axes32_.assign(axes_.begin(), axes_.end());
...
@@ -35,31 +60,6 @@ void MomentsOp<Context>::RunImpl() {
     Y(0)->Reshape(out_shape);
     Y(1)->Reshape(out_shape);
-    auto* x = X(0).template data<Tx, Context>();
-    auto* mean = Y(0)->template mutable_data<Ty, Context>();
-    auto* var = Y(1)->template mutable_data<Ty, Context>();
-    if (X(0).count() == 1) {
-        kernel::TypeA2B(Y(0)->count(), x, mean, ctx());
-        math::Set(Y(0)->count(),
-            cast::to<Ty>(0.f), var, ctx());
-    } else {
-        kernel::Moments(
-            (int)dims32_.size(), dims32_.data(),
-            (int)axes32_.size(), axes32_.data(),
-            x, mean, var, ctx());
-    }
-}
-
-template <class Context>
-void MomentsOp<Context>::RunOnDevice() {
-    if (XIsType(X(0), int8_t)) {
-        RunImpl<int8_t, float>();
-    } else if (XIsType(X(0), uint8_t)) {
...
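The count() == 1 early-out above computes the moments of a single element without launching a reduction kernel: the mean is the value itself and the variance is zero. A minimal scalar sketch of that semantics (plain C++, not Dragon's kernel::Moments):

    #include <cstdio>
    #include <vector>

    // Mean and (population) variance over all elements, with the
    // single-element shortcut from the hunk above.
    void Moments(const std::vector<float>& x, float* mean, float* var) {
        if (x.size() == 1) {
            *mean = x[0];   // plays the role of kernel::TypeA2B (copy/cast)
            *var = 0.f;     // plays the role of math::Set (zero the variance)
            return;
        }
        float sum = 0.f, sq = 0.f;
        for (float v : x) { sum += v; sq += v * v; }
        *mean = sum / x.size();
        *var = sq / x.size() - (*mean) * (*mean);
    }

    int main() {
        std::vector<float> x{2.f};
        float m, v;
        Moments(x, &m, &v);
        std::printf("mean=%.1f var=%.1f\n", m, v);  // mean=2.0 var=0.0
    }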
Dragon/src/operators/arithmetic/pow_op.cc
...
@@ -32,17 +32,9 @@ template <class Context>
 void PowOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "float16", "float32", "float64" });
-    }
+    DispatchHelper<TensorTypes<float, float16, double>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -88,17 +80,9 @@ template <class Context>
 void PowGradientOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "float16", "float32", "float64" });
-    }
+    DispatchHelper<TensorTypes<float, float16, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(Pow);
...
Dragon/src/operators/arithmetic/sqrt_op.cc
...
@@ -14,17 +14,9 @@ template <class Context>
 void SqrtOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "float16", "float32", "float64" });
-    }
+    DispatchHelper<TensorTypes<float, float16, double>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -40,17 +32,9 @@ template <class Context>
 void SqrtGradientOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "float16", "float32", "float64" });
-    }
+    DispatchHelper<TensorTypes<float, float16, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(Sqrt);
...
Dragon/src/operators/arithmetic/square_op.cc
...
@@ -14,26 +14,10 @@ template <class Context>
 void SquareOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -49,26 +33,10 @@ template <class Context>
 void SquareGradientOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(Square);
...
Dragon/src/operators/array/arange_op.cc
...
@@ -6,20 +6,24 @@ namespace dragon {
 template <class Context> template <typename T>
 void ArangeOp<Context>::RunImpl() {
-    astart_ = start(), astop_ = stop(), astep_ = step();
-    if (astop_ == 0) { astop_ = astart_; astart_ = 0; }
-    dim_ = (astop_ - astart_ - 1) / astep_ + 1;
-    CHECK_GT(dim_, 0) << "\nInvalid arguments:\n"
-        << "start = " << start() << ", "
-        << "stop = " << stop() << ", "
-        << "step = " << step() << ".";
-    Y(0)->Reshape({ dim_ });
     auto* y = Y(0)->template mutable_data<T, Context>();
     kernel::Arange(dim_, astart_, astep_, y, ctx());
 }

 template <class Context>
 void ArangeOp<Context>::RunOnDevice() {
+    astart_ = start(), astop_ = stop(), astep_ = step();
+    if (astop_ == 0) { astop_ = astart_; astart_ = 0; }
+    dim_ = (astop_ - astart_ - 1) / astep_ + 1;
+    CHECK_GT(dim_, 0) << "\nInvalid arguments:\n"
+        << "start = " << start() << ", "
+        << "stop = " << stop() << ", "
+        << "step = " << step() << ".";
+    Y(0)->Reshape({ dim_ });
     if (dtype() == "int8") { RunImpl<int8_t>(); }
     else if (dtype() == "uint8") {
...
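A quick standalone check of the length formula dim_ = (astop_ - astart_ - 1) / astep_ + 1 used in both functions above (assuming a positive step, as the CHECK_GT enforces):

    #include <cassert>
    #include <cstdio>

    // For positive step, this counts the values start, start+step, ... < stop
    // using integer division.
    int ArangeDim(int start, int stop, int step) {
        return (stop - start - 1) / step + 1;
    }

    int main() {
        assert(ArangeDim(0, 10, 3) == 4);  // 0, 3, 6, 9
        assert(ArangeDim(2, 3, 1) == 1);   // 2
        std::printf("ok\n");
    }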
Dragon/src/operators/array/argreduce_op.cc
...
@@ -101,28 +101,10 @@ void ArgReduceOp<Context>::RunOnDevice() {
     Y(0)->Reshape(out_shape);
     Y(1)->Reshape(out_shape);
-    if (XIsType(X(0), bool)) { RunImpl<bool>(); }
-    else if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "bool", "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(ArgReduce);
...
Dragon/src/operators/array/concat_op.cc
...
@@ -56,28 +56,10 @@ void ConcatOp<Context>::RunOnDevice() {
     Y(0)->Reshape(out_shape);
-    if (XIsType(X(0), bool)) { RunImpl<bool>(); }
-    else if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "bool", "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -116,28 +98,10 @@ void ConcatGradientOp<Context>::RunOnDevice() {
     for (int i = 0; i < YSize(); i++)
         Y(i)->ReshapeLike(X(i));
-    if (XIsType(X(0), bool)) { RunImpl<bool>(); }
-    else if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "bool", "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(Concat);
...
Dragon/src/operators/array/crop_op.cc
...
@@ -145,28 +145,10 @@ void CropOp<Context>::RunOnDevice() {
     TENSOR_FROM_VEC(X_strides_, X(0).strides(), int);
     TENSOR_FROM_VEC(Y_dims_, Y_dims, int);
-    if (XIsType(X(0), bool)) { RunImpl<bool>(); }
-    else if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "bool", "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -209,28 +191,10 @@ void CropGradientOp<Context>::RunOnDevice() {
     TENSOR_FROM_VEC(X_strides_, X(0).strides(), int);
     TENSOR_FROM_VEC(Y_dims_, Y_dims, int);
-    if (XIsType(X(0), bool)) { RunImpl<bool>(); }
-    else if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "bool", "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(Crop);
...
Dragon/src/operators/array/index_select_op.cc
...
@@ -55,28 +55,10 @@ void IndexSelectOp<Context>::RunOnDevice() {
     CHECK(X(1).template IsType<int64_t>())
         << "\nThe type of indices should be int64.";
-    if (XIsType(X(0), bool)) { RunImpl<bool>(); }
-    else if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "bool", "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -115,26 +97,10 @@ void IndexSelectGradientOp<Context>::RunOnDevice() {
     CHECK(X(1).template IsType<int64_t>())
         << "\nThe type of indices should be int64.";
-    if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(IndexSelect);
...
Dragon/src/operators/array/multinomial_op.cc
...
@@ -35,17 +35,26 @@ void MultinomialOp<Context>::RunImpl() {
     double running_total, r;
     int yi = 0, num_classes = X(0).dim(axis_);
+    double uniform_p = 1. / (double)num_classes;
+    auto* rng = ctx()->rand_generator();
+    std::uniform_real_distribution<float> eps_dist;
     for (int i = 0; i < outer_dim_; ++i) {
         running_total = 0.;
-        for (int j = 0; j < num_classes; ++j) {
-            running_total += (double)x[j];
-            cdf[j] = running_total;
+        if (eps_ > 0.f && eps_dist(*rng) < eps_) {
+            for (int j = 0; j < num_classes; ++j) {
+                running_total += uniform_p;
+                cdf[j] = running_total;
+            }
+        } else {
+            for (int j = 0; j < num_classes; ++j) {
+                running_total += (double)x[j];
+                cdf[j] = running_total;
+            }
         }
         std::uniform_real_distribution<double>
-            dist(0.f, running_total);
+            dist(0., running_total);
         for (int j = 0; j < (int)num_samples_; ++j) {
             r = dist(*rng);
             auto found_iter = std::upper_bound(
...
@@ -75,24 +84,10 @@ void MultinomialOp<Context>::RunOnDevice() {
     // Normalize the logits if necessary
     if (normalize_) SoftmaxRun();
-    if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "int8", "uint8", "int32", "int64",
-              "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <int8_t, uint8_t, int, int64_t,
+         float, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(Multinomial);
...
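For context, the new sampling path above mixes in a uniform CDF with probability eps_ before drawing with std::upper_bound. A standalone, CPU-only sketch of that logic, using a toy probability row and a fixed seed (not Dragon's kernel):

    #include <algorithm>
    #include <cstdio>
    #include <random>
    #include <vector>

    int main() {
        std::vector<double> x = {0.1, 0.7, 0.2};  // one row of probabilities
        const double eps = 0.1;
        std::mt19937 rng(0);
        std::uniform_real_distribution<float> eps_dist;

        // Build the cumulative distribution, occasionally substituting a
        // uniform one (the exploration step added by this commit).
        std::vector<double> cdf(x.size());
        double running_total = 0.;
        if (eps > 0. && eps_dist(rng) < eps) {
            const double uniform_p = 1. / (double)x.size();
            for (size_t j = 0; j < x.size(); ++j)
                cdf[j] = (running_total += uniform_p);
        } else {
            for (size_t j = 0; j < x.size(); ++j)
                cdf[j] = (running_total += x[j]);
        }

        // Draw samples by inverting the CDF with a binary search.
        std::uniform_real_distribution<double> dist(0., running_total);
        for (int s = 0; s < 5; ++s) {
            double r = dist(rng);
            auto it = std::upper_bound(cdf.begin(), cdf.end(), r);
            std::printf("sample %d -> class %ld\n",
                        s, (long)(it - cdf.begin()));
        }
    }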
Dragon/src/operators/array/one_hot_op.cc
...
@@ -29,17 +29,9 @@ void OneHotOp<Context>::RunOnDevice() {
     Y(0)->Reshape(out_shape);
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "float32", "int32", "int64" });
-    }
+    DispatchHelper<TensorTypes<float, int, int64_t>>::Call(this, X(0));
 }

 DEPLOY_CPU(OneHot);
...
Dragon/src/operators/array/pad_op.cc
...
@@ -112,28 +112,10 @@ void PadOp<Context>::RunOnDevice() {
     TENSOR_FROM_VEC(X_strides_, X(0).strides(), int);
     TENSOR_FROM_VEC(Y_dims_, Y_dims, int);
-    if (XIsType(X(0), bool)) { RunImpl<bool>(); }
-    else if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "bool", "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -213,28 +195,10 @@ void PadGradientOp<Context>::RunOnDevice() {
     TENSOR_FROM_VEC(Y_strides_, X(0).strides(), int);
     TENSOR_FROM_VEC(X_dims_, X_dims, int);
-    if (XIsType(X(0), bool)) { RunImpl<bool>(); }
-    else if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "bool", "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(Pad);
...
Dragon/src/operators/array/reduce_op.cc
...
@@ -14,33 +14,6 @@ namespace dragon {
 template <class Context> template <typename T>
 void ReduceOp<Context>::RunImpl() {
-    dims_ = X(0).dims();
-    dims32_.assign(dims_.begin(), dims_.end());
-    axes32_.assign(axes_.begin(), axes_.end());
-    if (axes32_.empty()) {
-        // Reduce to a Scalar if missing axes
-        for (int i = 0; i < X(0).ndim(); ++i)
-            axes32_.push_back(i);
-    }
-    for (int i = 0; i < axes32_.size(); i++) {
-        int axis = axes32_[i];
-        axes32_[i] = axis < 0 ? axis + X(0).ndim() : axis;
-        CHECK(axes32_[i] >= 0 && axes32_[i] < X(0).ndim())
-            << "\nExcepted the axis in [-" << X(0).ndim()
-            << ", " << X(0).ndim() << "), got " << axis << ".";
-        dims_[axes32_[i]] = 1;
-    }
-    vec64_t out_shape;
-    for (const auto& dim : dims_) {
-        if (dim != 1 || keep_dims_)
-            out_shape.emplace_back(dim);
-    }
-    Y(0)->Reshape(out_shape);
     auto* x = X(0).template data<T, Context>();
     auto* y = Y(0)->template mutable_data<T, Context>();
...
@@ -64,31 +37,8 @@ void ReduceOp<Context>::RunImpl() {
 template <class Context>
 void ReduceOp<Context>::RunOnDevice() {
-    if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
-}
-
-template <class Context> template <typename T>
-void ReduceGradientOp<Context>::RunImpl() {
-    y_dims_ = X(0).dims();
     dims_ = X(0).dims();
     dims32_.assign(dims_.begin(), dims_.end());
     axes32_.assign(axes_.begin(), axes_.end());
     if (axes32_.empty()) {
...
@@ -103,11 +53,25 @@ void ReduceGradientOp<Context>::RunImpl() {
         CHECK(axes32_[i] >= 0 && axes32_[i] < X(0).ndim())
             << "\nExcepted the axis in [-" << X(0).ndim()
             << ", " << X(0).ndim() << "), got " << axis << ".";
-        y_dims_[axes32_[i]] = 1;
+        dims_[axes32_[i]] = 1;
     }
-    Y(0)->ReshapeLike(X(0));
+    vec64_t out_shape;
+    for (const auto& dim : dims_) {
+        if (dim != 1 || keep_dims_)
+            out_shape.emplace_back(dim);
+    }
+    Y(0)->Reshape(out_shape);
+    DispatchHelper<TensorTypes
+        <int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
+}
+
+template <class Context> template <typename T>
+void ReduceGradientOp<Context>::RunImpl() {
+    auto* dy = X(1).template data<T, Context>();
+    auto* dx = Y(0)->template mutable_data<T, Context>();
...
@@ -153,26 +117,30 @@ void ReduceGradientOp<Context>::RunImpl() {
 template <class Context>
 void ReduceGradientOp<Context>::RunOnDevice() {
-    if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    y_dims_ = X(0).dims();
+    axes32_.assign(axes_.begin(), axes_.end());
+    if (axes32_.empty()) {
+        // Reduce to a Scalar if missing axes
+        for (int i = 0; i < X(0).ndim(); ++i)
+            axes32_.push_back(i);
+    }
+    for (int i = 0; i < axes32_.size(); i++) {
+        int axis = axes32_[i];
+        axes32_[i] = axis < 0 ? axis + X(0).ndim() : axis;
+        CHECK(axes32_[i] >= 0 && axes32_[i] < X(0).ndim())
+            << "\nExcepted the axis in [-" << X(0).ndim()
+            << ", " << X(0).ndim() << "), got " << axis << ".";
+        y_dims_[axes32_[i]] = 1;
+    }
+    Y(0)->ReshapeLike(X(0));
+    DispatchHelper<TensorTypes
+        <int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(Reduce);
...
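Both reduce hunks normalize negative axes with axis < 0 ? axis + ndim : axis before the CHECK clamps the result. That convention in isolation (a tiny standalone check, not Dragon's code):

    #include <cassert>

    // A negative axis counts from the end, so -1 on a 4-d tensor means 3.
    int NormalizeAxis(int axis, int ndim) {
        return axis < 0 ? axis + ndim : axis;
    }

    int main() {
        assert(NormalizeAxis(-1, 4) == 3);
        assert(NormalizeAxis(2, 4) == 2);
    }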
Dragon/src/operators/array/repeat_op.cc
...
@@ -45,28 +45,10 @@ void RepeatOp<Context>::RunOnDevice() {
         Y(0)->Reshape(out_shape);
     }
-    if (XIsType(X(0), bool)) { RunImpl<bool>(); }
-    else if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "bool", "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -98,26 +80,10 @@ void RepeatGradientOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(Repeat);
...
Dragon/src/operators/array/slice_op.cc
...
@@ -65,28 +65,10 @@ void SliceOp<Context>::RunOnDevice() {
     outer_dim_ = X(0).count(0, axis_);
     inner_dim_ = X(0).count(axis_ + 1);
-    if (XIsType(X(0), bool)) { RunImpl<bool>(); }
-    else if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "bool", "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -136,28 +118,10 @@ void SliceGradientOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), bool)) { RunImpl<bool>(); }
-    else if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "bool", "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(Slice);
...
Dragon/src/operators/array/stack_op.cc
...
@@ -51,28 +51,10 @@ void StackOp<Context>::RunOnDevice() {
     Y(0)->Reshape(out_shape);
-    if (XIsType(X(0), bool)) { RunImpl<bool>(); }
-    else if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "bool", "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -104,28 +86,10 @@ void StackGradientOp<Context>::RunOnDevice() {
     for (int i = 0; i < YSize(); i++)
         Y(i)->ReshapeLike(X(i));
-    if (XIsType(X(0), bool)) { RunImpl<bool>(); }
-    else if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "bool", "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(Stack);
...
Dragon/src/operators/array/tile_op.cc
...
@@ -60,28 +60,10 @@ void TileOp<Context>::RunOnDevice() {
     TENSOR_FROM_VEC(X_dims_, X(0).dims(), int);
     TENSOR_FROM_VEC(Y_dims_, Y_dims, int);
-    if (XIsType(X(0), bool)) { RunImpl<bool>(); }
-    else if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "bool", "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -99,7 +81,7 @@ void TileGradientOp<Context>::RunImpl() {
 template <class Context>
 void TileGradientOp<Context>::RunOnDevice() {
     // Add the axes
-    vector< pair<int, int> > dispatch_axes;
+    vector<pair<int, int>> dispatch_axes;
     for (int i = 0; i < X(0).ndim(); i++) {
         auto m = multiples(i);
         if (m > 1) { dispatch_axes.push_back({ m, i }); }
...
@@ -128,26 +110,11 @@ void TileGradientOp<Context>::RunOnDevice() {
         rows_ = dst_->count(0, axis_);
         cols_ = dst_->count(axis_);
-        if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-        else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-        else if (XIsType(X(0), int)) { RunImpl<int>(); }
-        else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-        else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-        else if (XIsType(X(0), float)) { RunImpl<float>(); }
-        else if (XIsType(X(0), double)) { RunImpl<double>(); }
-        else {
-            LOG(FATAL) << DTypeString(X(0),
-                { "int8", "uint8", "int32", "int64",
-                  "float16", "float32", "float64", });
-        }
-        ctx()->FinishDeviceCompution();
+        DispatchHelper<TensorTypes
+            <int8_t, uint8_t, int, int64_t,
+             float16, float, double>>::Call(this, X(0));
+        ctx()->FinishDeviceCompution();
         // Protect X if num_axes >= 2
         std::swap(src_, dst_);
...
Dragon/src/operators/array/transpose_op.cc
...
@@ -54,28 +54,10 @@ void TransposeOp<Context>::RunOnDevice() {
     Y(0)->Reshape(out_shape);
-    if (XIsType(X(0), bool)) { RunImpl<bool>(); }
-    else if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "bool", "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -125,28 +107,10 @@ void TransposeGradientOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), bool)) { RunImpl<bool>(); }
-    else if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "bool", "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(Transpose);
...
Dragon/src/operators/control_flow/assign_op.cc
...
@@ -116,28 +116,10 @@ void AssignOp<Context>::RunOnDevice() {
     TENSOR_FROM_VECTOR(X_dims_, X_dims, int);
     TENSOR_FROM_VECTOR(Y_strides_, Y(0)->strides(), int);
-    if (XIsType(X(0), bool)) { RunImpl<bool>(); }
-    else if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "bool", "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(Assign);
...
Dragon/src/operators/control_flow/copy_op.cc
...
@@ -14,28 +14,10 @@ template <class Context>
 void CopyOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), bool)) { RunImpl<bool>(); }
-    else if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "bool", "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(Copy);
...
Dragon/src/operators/control_flow/masked_assign_op.cc
...
@@ -65,28 +65,10 @@ void MaskedAssignOp<Context>::RunOnDevice() {
     CHECK(XIsType(X(1), bool) || XIsType(X(1), uint8_t))
         << "\nExcepted bool or uint8 mask.";
-    if (XIsType(X(0), bool)) { RunImpl<bool>(); }
-    else if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "bool", "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(MaskedAssign);
...
Dragon/src/operators/loss/ctc_loss_op.cc
...
@@ -23,13 +23,8 @@ void CTCLossGradientOp<Context>::RunImpl() {
 template <class Context>
 void CTCLossGradientOp<Context>::RunOnDevice() {
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0), { "float32" });
-    }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 DEPLOY_CPU(CTCLoss);
...
Dragon/src/operators/loss/l1_loss_op.cc
...
@@ -15,9 +15,7 @@ void L1LossOp<Context>::RunImpl() {
         ->ReshapeLike(X(0))
         ->template mutable_data<T, Context>();
-    auto* y = Y(0)->Reshape({})
-        ->template mutable_data<T, Context>();
+    auto* y = Y(0)->template mutable_data<T, Context>();
     if (XSize() > 1) {
         auto* target = X(1).template data<T, Context>();
...
@@ -53,13 +51,10 @@ void L1LossOp<Context>::RunOnDevice() {
             << "while " << X(0).DimString()
            << " is required.";
     }
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0), { "float32" });
-    }
+    Y(0)->Reshape({});
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -104,13 +99,8 @@ void L1LossGradientOp<Context>::RunImpl() {
 template <class Context>
 void L1LossGradientOp<Context>::RunOnDevice() {
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0), { "float32" });
-    }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 DEPLOY_CPU(L1Loss);
...
Dragon/src/operators/loss/l2_loss_op.cc
...
@@ -14,9 +14,7 @@ void L2LossOp<Context>::RunImpl() {
         ->ReshapeLike(X(0))
         ->template mutable_data<T, Context>();
-    auto* y = Y(0)->Reshape({})
-        ->template mutable_data<float, Context>();
+    auto* y = Y(0)->template mutable_data<float, Context>();
     if (XSize() > 1) {
         auto* target = X(1).template data<T, Context>();
...
@@ -56,15 +54,10 @@ void L2LossOp<Context>::RunOnDevice() {
             << "while " << X(0).DimString()
             << " is required.";
     }
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "float32", "float16" });
-    }
+    Y(0)->Reshape({});
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -107,15 +100,8 @@ void L2LossGradientOp<Context>::RunImpl() {
 template <class Context>
 void L2LossGradientOp<Context>::RunOnDevice() {
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
 }

 DEPLOY_CPU(L2Loss);
...
Dragon/src/operators/loss/sigmoid_ce_loss_op.cc
...
@@ -50,13 +50,8 @@ void SigmoidCrossEntropyOp<Context>::RunOnDevice() {
     loss_.ReshapeLike(X(0));
     flag_.ReshapeLike(X(0));
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0), { "float32" });
-    }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -113,13 +108,8 @@ void SigmoidCrossEntropyGradientOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
     flag_.ReshapeLike(X(0));
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0), { "float32" });
-    }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 DEPLOY_CPU(SigmoidCrossEntropy);
...
Dragon/src/operators/loss/smooth_l1_loss_op.cc
...
@@ -44,7 +44,6 @@ void SmoothL1LossOp<Context>::RunImpl() {
         normalizer = X(0).count();
     }
-    Y(0)->Reshape({});
     auto* y = Y(0)->template mutable_data<T, Context>();
     math::Sum(nelements, 1. / normalizer, err, y, ctx());
 }
...
@@ -53,13 +52,10 @@ template <class Context>
 void SmoothL1LossOp<Context>::RunOnDevice() {
     CHECK(X(0).count() == X(1).count());
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32" }); }
+    Y(0)->Reshape({});
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -110,13 +106,8 @@ void SmoothL1LossGradientOp<Context>::RunImpl() {
 template <class Context>
 void SmoothL1LossGradientOp<Context>::RunOnDevice() {
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32" }); }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 DEPLOY_CPU(SmoothL1Loss);
...
...
Dragon/src/operators/loss/softmax_ce_loss_op.cc
...
@@ -89,13 +89,8 @@ void SoftmaxCrossEntropyOp<Context>::RunOnDevice() {
     SoftmaxRun();
     loss_.ReshapeLike(X(0));
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32" }); }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -161,13 +156,8 @@ void SoftmaxCrossEntropyGradientOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32" }); }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 DEPLOY_CPU(SoftmaxCrossEntropy);
...
...
Dragon/src/operators/misc/accuracy_op.cc
...
@@ -24,7 +24,7 @@ void AccuracyOp<Context>::RunImpl() {
             const int label = target[i * inner_dim_ + j];
             for (int k = 0; k < ignore_.count(); k++)
                 if (label == ignore[k]) continue;
-            vector<pair<Tx, int> > vec;
+            vector<pair<Tx, int>> vec;
             for (int k = 0; k < axis_dim_; k++)
                 vec.push_back(std::make_pair(
...
@@ -35,7 +35,7 @@ void AccuracyOp<Context>::RunImpl() {
                 vec.begin(), vec.begin() + top_k_, vec.end(),
-                std::greater<pair<Tx, int> >()
+                std::greater<pair<Tx, int>>()
             );
             for (int k = 0; k < top_k_; k++) {
                 if (vec[k].second == label) { acc++; break; }
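For reference, the top-k test these accuracy hunks touch works by partially sorting (score, index) pairs so only the k largest are ordered, which is O(n log k) rather than a full sort. A standalone C++ sketch with hypothetical names, not Dragon's operator code:

#include <algorithm>
#include <functional>
#include <utility>
#include <vector>

// Returns true if `label` is among the top_k highest scores.
bool HitTopK(const std::vector<float>& scores, int label, int top_k) {
    std::vector<std::pair<float, int>> vec;
    vec.reserve(scores.size());
    for (int k = 0; k < (int)scores.size(); k++)
        vec.push_back(std::make_pair(scores[k], k));
    std::partial_sort(
        vec.begin(), vec.begin() + top_k, vec.end(),
        std::greater<std::pair<float, int>>());  // highest scores first
    for (int k = 0; k < top_k; k++)
        if (vec[k].second == label) return true;
    return false;
}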
...
...
Dragon/src/operators/misc/gradient_op.cc
...
@@ -18,43 +18,19 @@ void GradientGenerateOp<Context>::RunImpl() {
 template <class Context>
 void GradientGenerateOp<Context>::RunOnDevice() {
-    if (XIsType(X(0), bool)) { RunImpl<bool>(); }
-    else if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0), {
-            "bool", "int8", "uint8", "int32", "int64",
-            "float16", "float32", "float64",
-        });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>
+    >::Call(this, X(0));
 }

-DEPLOY_CPU(GradientGenerate);
-#ifdef WITH_CUDA
-DEPLOY_CUDA(GradientGenerate);
-#endif
-OPERATOR_SCHEMA(GradientGenerate);
-
 template <class Context> template <typename T>
 void GradientGatherOp<Context>::RunImpl() {
     int64_t count = Y(0)->count();
     auto* y = Y(0)->template mutable_data<T, Context>();
     if (indices.size() == 1) {
         auto* x = X(indices[0]).template data<T, Context>();
-        ctx()->template Copy<T, Context, Context>(count, y, x);
+        math::Copy(count, x, y, ctx());
     } else if (indices.size() == 2) {
         CHECK_EQ(count, X(indices[1]).count());
         auto* a = X(indices[0]).template data<T, Context>();
...
@@ -63,7 +39,7 @@ void GradientGatherOp<Context>::RunImpl() {
     } else {
         size_t i = 1;
         auto* x = X(indices[0]).template data<T, Context>();
-        ctx()->template Copy<T, Context, Context>(count, y, x);
+        math::Copy(count, x, y, ctx());
         while (i < indices.size()) {
             if (indices.size() - i >= 2) {
                 auto* a = X(indices[i]).template data<T, Context>();
...
@@ -84,34 +60,12 @@ void GradientGatherOp<Context>::RunOnDevice() {
     auto& Xi = X(indices[0]);
     Y(0)->ReshapeLike(Xi);
-    if (XIsType(Xi, int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(Xi, uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(Xi, int)) { RunImpl<int>(); }
-    else if (XIsType(Xi, int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(Xi, float16)) { RunImpl<float16>(); }
-    else if (XIsType(Xi, float)) { RunImpl<float>(); }
-    else if (XIsType(Xi, double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(Xi, {
-            "int8", "uint8", "int32", "int64",
-            "float16", "float32", "float64",
-        });
-    }
+    DispatchHelper<TensorTypes
+        <int8_t, uint8_t, int, int64_t,
+         float16, float, double>
+    >::Call(this, Xi);
 }

-DEPLOY_CPU(GradientGather);
-#ifdef WITH_CUDA
-DEPLOY_CUDA(GradientGather);
-#endif
-OPERATOR_SCHEMA(GradientGather).NumOutputs(1);
-
 template <class Context> template <typename T>
 void GradientAddOp<Context>::RunImpl() {
     auto* x = X(1).template data<T, Context>();
...
@@ -124,37 +78,12 @@ void GradientAddOp<Context>::RunOnDevice() {
     CHECK_EQ(X(0).name(), Y(0)->name())
         << "\nRequires X(0) == Y(0).";
-    if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0), {
-            "int8", "uint8", "int32", "int64",
-            "float16", "float32", "float64",
-        });
-    }
+    DispatchHelper<TensorTypes
+        <int8_t, uint8_t, int, int64_t,
+         float16, float, double>
+    >::Call(this, X(0));
 }

-DEPLOY_CPU(GradientAdd);
-#ifdef WITH_CUDA
-DEPLOY_CUDA(GradientAdd);
-#endif
-OPERATOR_SCHEMA(GradientAdd)
-    .NumInputs(2).NumOutputs(1)
-    .Inplace({ { 0, 0 } });
-
 template <class Context>
 void StopGradientOp<Context>::RunOnDevice() {
     if (Y(0)->name() != X(0).name()) {
...
@@ -163,14 +92,53 @@ void StopGradientOp<Context>::RunOnDevice() {
     }
 }

+DEPLOY_CPU(GradientGenerate);
+#ifdef WITH_CUDA
+DEPLOY_CUDA(GradientGenerate);
+#endif
+
+DEPLOY_CPU(GradientGather);
+#ifdef WITH_CUDA
+DEPLOY_CUDA(GradientGather);
+#endif
+
+DEPLOY_CPU(GradientAdd);
+#ifdef WITH_CUDA
+DEPLOY_CUDA(GradientAdd);
+#endif
+
 DEPLOY_CPU(StopGradient);
 #ifdef WITH_CUDA
 DEPLOY_CUDA(StopGradient);
 #endif

+OPERATOR_SCHEMA(GradientGenerate)
+     /* X(0), ... */
+    .NumInputs(1, INT_MAX)
+     /* Y(0), ... */
+    .NumOutputs(1, INT_MAX);
+
+OPERATOR_SCHEMA(GradientGather)
+     /* X(0), ... */
+    .NumInputs(1, INT_MAX)
+     /* Y */
+    .NumOutputs(1);
+
+OPERATOR_SCHEMA(GradientAdd)
+     /* X(0), X(1) */
+    .NumInputs(2)
+     /* Y */
+    .NumOutputs(1)
+     /* X(0) => Y */
+    .Inplace({ { 0, 0 } });
+
 OPERATOR_SCHEMA(StopGradient)
-    .NumInputs(1).NumOutputs(1)
-    .Inplace({ { 0, 0 } });;
+     /* X */
+    .NumInputs(1)
+     /* Y */
+    .NumOutputs(1)
+     /* X => Y */
+    .Inplace({ { 0, 0 } });

 NO_GRADIENT(StopGradient);
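Beyond the dispatcher, the gather hunks above also swap the copy primitive: the old context method ctx()->Copy<T, Context, Context>(count, y, x) took the destination before the source, while the new math::Copy(count, x, y, ctx()) reads source-first. A hedged CPU-only stand-in for the free-function form (the real routine is templated over Context and would issue a device memcpy or kernel instead):

#include <algorithm>
#include <cstdint>

namespace math {

// Source-first signature: copies n elements from x into y.
// A real Context would select memcpy, cudaMemcpyAsync, etc.
template <typename T, class Context>
void Copy(int64_t n, const T* x, T* y, Context* /* ctx */) {
    std::copy(x, x + n, y);
}

}  // namespace math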
...
...
Dragon/src/operators/misc/initialize_op.cc
...
@@ -5,7 +5,7 @@ namespace dragon {
 template <class Context> template <typename T>
 void InitializeOp<Context>::RunImpl() {
-    unique_ptr<Filler<T, Context> > f;
+    unique_ptr<Filler<T, Context>> f;
     f.reset(CreateFiller<T, Context>(proto_));
     f->Fill(Y(0), ctx());
 }
...
...
Dragon/src/operators/misc/python_op.cc
...
@@ -152,9 +152,11 @@ DEPLOY_CUDA(TemplateGradient);
 #endif

 OPERATOR_SCHEMA(TemplateGradient);

-class GetTemplateGradient final : public GradientMakerBase {
+namespace {
+
+class GradientMaker final : public GradientMakerBase {
  public:
-    GRADIENT_MAKER_CTOR(GetTemplateGradient);
+    GRADIENT_MAKER_CTOR(GradientMaker);
     vector<OperatorDef> MakeDef() override {
         vector<string> inputs, outputs;
         for (auto input : def.input())
             inputs.push_back(input);
...
@@ -164,7 +166,9 @@ class GetTemplateGradient final : public GradientMakerBase {
     }
 };

-REGISTER_GRADIENT(Template, GetTemplateGradient);
+}  // namespace
+
+REGISTER_GRADIENT(Template, GradientMaker);

 }  // namespace dragon
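The python_op change is a small but repeatable cleanup: the gradient maker loses its per-op class name and moves into an anonymous namespace, so every .cc file can reuse the same local name GradientMaker without violating the one-definition rule. A minimal sketch of the pattern (GradientMakerBase, GRADIENT_MAKER_CTOR, and REGISTER_GRADIENT are Dragon's; the body here is illustrative only):

namespace {

class GradientMaker final : public GradientMakerBase {
 public:
    GRADIENT_MAKER_CTOR(GradientMaker);
    vector<OperatorDef> MakeDef() override {
        // ... assemble the gradient OperatorDefs from `def`, as in the diff ...
        return vector<OperatorDef>();
    }
};

}  // namespace

REGISTER_GRADIENT(Template, GradientMaker);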
...
...
Dragon/src/operators/mpi/mpi_broadcast_op.cc
...
@@ -35,22 +35,10 @@ void MPIBroadcastOp<Context>::RunOnDevice() {
     BCast(dims.data(), ndim);
     Y(0)->Reshape(dims);
-    if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0), {
-            "int8", "int32", "int64",
-            "float16", "float32",
-        });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>
+    >::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -74,22 +62,10 @@ template <class Context>
 void MPIBroadcastGradientOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(-1));
-    if (XIsType(X(-1), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(-1), int)) { RunImpl<int>(); }
-    else if (XIsType(X(-1), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(-1), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(-1), float)) { RunImpl<float>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(-1), {
-            "int8", "int32", "int64",
-            "float16", "float32",
-        });
-    }
+    DispatchHelper<TensorTypes
+        <int8_t, uint8_t, int, int64_t,
+         float16, float, double>
+    >::Call(this, X(-1));
 }

 DEPLOY_CPU(MPIBroadcast);
...
...
Dragon/src/operators/mpi/mpi_gather_op.cc
...
@@ -50,22 +50,10 @@ void MPIGatherOp<Context>::RunOnDevice() {
         }
     }
-    if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0), {
-            "int8", "int32", "int64",
-            "float16", "float32",
-        });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>
+    >::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -88,22 +76,10 @@ template <class Context>
 void MPIGatherGradientOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0), {
-            "int8", "int32", "int64",
-            "float16", "float32",
-        });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>
+    >::Call(this, X(0));
 }

 DEPLOY_CPU(MPIGather);
...
...
Dragon/src/operators/norm/l2_norm_op.cc
...
@@ -73,17 +73,9 @@ void L2NormOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "float16", "float32", "float64" });
-    }
+    DispatchHelper<TensorTypes
+        <float, float16, double>
+    >::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -177,17 +169,9 @@ void L2NormGradientOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "float16", "float32", "float64" });
-    }
+    DispatchHelper<TensorTypes
+        <float, float16, double>
+    >::Call(this, X(0));
 }

 DEPLOY_CPU(L2Norm);
...
...
Dragon/src/operators/recurrent/cudnn_recurrent_op.cc
...
@@ -211,15 +211,8 @@ void CuDNNRecurrentOp<Context>::RunImpl() {
 template <class Context>
 void CuDNNRecurrentOp<Context>::RunOnDevice() {
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32", "float16" }); }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -313,15 +306,8 @@ void CuDNNRecurrentGradientOp<Context>::RunOnDevice() {
     Y(2)->ReshapeLike(X(2));  // dHx
     Y(3)->ReshapeLike(X(3));  // dCx
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32", "float16" }); }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
 }

 DEPLOY_CUDNN(Recurrent);
...
...
Dragon/src/operators/recurrent/rnn_param_op.cc
...
@@ -52,15 +52,8 @@ void RNNParamSetOp<Context>::RunImpl() {
 template <class Context>
 void RNNParamSetOp<Context>::RunOnDevice() {
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32", "float16" }); }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
 }

 DEPLOY_CPU(RNNParamSet);
...
...
Dragon/src/operators/vision/bias_add_op.cc
...
@@ -43,13 +43,8 @@ void BiasAddOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32" }); }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -102,13 +97,8 @@ void BiasAddGradientOp<Context>::RunOnDevice() {
     Y(1)->ReshapeLike(X(0));
-    if (XIsType(X(-1), float)) { RunImpl<float>(); }
-    else { LOG(FATAL) << DTypeString(X(-1), { "float32" }); }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(-1));
 }

 DEPLOY_CPU(BiasAdd);
...
...
Dragon/src/operators/vision/bilinear_resize_op.cc
...
@@ -48,14 +48,9 @@ void BilinearResizeOp<Context>::RunOnDevice() {
     }
     Y(0)->Reshape(out_shape);
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32" }); }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -90,14 +85,9 @@ void BilinearResizeGradientOp<Context>::RunImpl() {
 template <class Context>
 void BilinearResizeGradientOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32" }); }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 DEPLOY_CPU(BilinearResize);
...
...
Dragon/src/operators/vision/conv2d_op.cc
...
@@ -27,13 +27,8 @@ void Conv2dOp<Context>::RunOnDevice() {
     if (data_format() == "NHWC" && group_ != 1)
         LOG(FATAL) << "GroupConv(NHWC) is not supported.";
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32" }); }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -66,13 +61,8 @@ void Conv2dGradientOp<Context>::RunOnDevice() {
     if (data_format() == "NHWC" && group_ != 1)
         LOG(FATAL) << "GroupConv(NHWC) is not supported.";
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32" }); }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 DEPLOY_CPU(Conv2d);
...
...
Dragon/src/operators/vision/conv2d_transpose_op.cc
...
@@ -32,13 +32,8 @@ void ConvTranspose2dOp<Context>::RunOnDevice() {
     for (int i = 0; i < num_axes_; i++)
         out_shape_[i] = X(0).dim(axis_ + i);
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32" }); }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -76,13 +71,8 @@ void ConvTranspose2dGradientOp<Context>::RunOnDevice() {
     for (int i = 0; i < num_axes_; i++)
         out_shape_[i] = X(0).dim(axis_ + i);
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32" }); }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 DEPLOY_CPU(ConvTranspose2d);
...
...
Dragon/src/operators/vision/cudnn_bias_add_op.cc
...
@@ -62,15 +62,8 @@ void CuDNNBiasAddOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32", "float16" }); }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -129,15 +122,8 @@ void CuDNNBiasAddGradientOp<Context>::RunOnDevice() {
     Y(1)->ReshapeLike(X(0));
-    if (XIsType(X(-1), float)) { RunImpl<float>(); }
-    else if (XIsType(X(-1), float16)) { RunImpl<float16>(); }
-    else { LOG(FATAL) << DTypeString(X(-1), { "float32", "float16" }); }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(-1));
 }

 DEPLOY_CUDNN(BiasAdd);
...
...
Dragon/src/operators/vision/cudnn_conv2d_op.cc
...
@@ -216,15 +216,8 @@ void CuDNNConv2dOp<Context>::RunOnDevice() {
 #endif
     ConvOpBase<Context>::Reshape();
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32", "float16" }); }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
 }

 template <class Context>
...
@@ -474,15 +467,8 @@ void CuDNNConv2dGradientOp<Context>::RunOnDevice() {
 #endif
     ConvOpBase<Context>::Reshape(true);
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32", "float16" }); }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
 }

 DEPLOY_CUDNN(Conv2d);
...
...
Dragon/src/operators/vision/cudnn_conv2d_transpose_op.cc
...
@@ -214,15 +214,8 @@ void CuDNNConvTranspose2dOp<Context>::RunOnDevice() {
 #endif
     ConvOpBase<Context>::Reshape();
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32", "float16" }); }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
 }

 DEPLOY_CUDNN(ConvTranspose2d);
...
@@ -471,15 +464,8 @@ void CuDNNConvTranspose2dGradientOp<Context>::RunOnDevice() {
 #endif
     ConvOpBase<Context>::Reshape(true);
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32", "float16" }); }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
 }

 DEPLOY_CUDNN(ConvTranspose2dGradient);
...
...
Dragon/src/operators/vision/cudnn_depthwise_conv2d_op.cc
...
@@ -68,13 +68,8 @@ void CuDNNDepthwiseConv2dOp<Context>::RunOnDevice() {
         << "\nExcepted in/out channels unchanged.";
     ConvOpBase<Context>::Reshape();
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32" }); }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -152,13 +147,8 @@ void CuDNNDepthwiseConv2dGradientOp<Context>::RunOnDevice() {
         == "NCHW" ? X(0).dim(1) : X(0).dim(-1);
     ConvOpBase<Context>::Reshape(true);
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32" }); }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 DEPLOY_CUDNN(DepthwiseConv2d);
...
...
Dragon/src/operators/vision/cudnn_lrn_op.cc
...
@@ -14,10 +14,13 @@ void CuDNNLRNOp<Context>::RunImpl() {
         auto* y = Y(0)->template mutable_data<T, Context>();
         CUDNN_CHECK(cudnnLRNCrossChannelForward(
-            ctx()->cudnn_handle(), lrn_desc_,
+            ctx()->cudnn_handle(),
+            lrn_desc_,
             CUDNN_LRN_CROSS_CHANNEL_DIM1,
-            CuDNNType<T>::one, input_desc_, x,
-            CuDNNType<T>::zero, output_desc_, y
+            CuDNNType<T>::one,
+            input_desc_, x,
+            CuDNNType<T>::zero,
+            output_desc_, y
         ));
     } else {
         LOG(FATAL) << "Unknown DataFormat: " << data_format();
...
@@ -29,15 +32,8 @@ void CuDNNLRNOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
     if (this->mode_ == "ACROSS_CHANNELS") {
-        if (XIsType(X(0), float)) { RunImpl<float>(); }
-        else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-        else { LOG(FATAL) << DTypeString(X(0), { "float32", "float16" }); }
+        DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
     } else if (this->mode_ == "WITHIN_CHANNEL") {
         LRNOp<Context>::RunOnDevice();
     } else {
...
@@ -57,7 +53,8 @@ void CuDNNLRNGradientOp<Context>::RunImpl() {
         auto* dx = Y(0)->template mutable_data<T, Context>();
         CUDNN_CHECK(cudnnLRNCrossChannelBackward(
-            ctx()->cudnn_handle(), lrn_desc_,
+            ctx()->cudnn_handle(),
+            lrn_desc_,
             CUDNN_LRN_CROSS_CHANNEL_DIM1,
             CuDNNType<T>::one, input_desc_, y,
             input_desc_, dy,
...
@@ -76,15 +73,8 @@ void CuDNNLRNGradientOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
     if (this->mode_ == "ACROSS_CHANNELS") {
-        if (XIsType(X(0), float)) { RunImpl<float>(); }
-        else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-        else { LOG(FATAL) << DTypeString(X(0), { "float32", "float16" }); }
+        DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
     } else if (this->mode_ == "WITHIN_CHANNEL") {
         LRNGradientOp<Context>::RunOnDevice();
     } else {
...
...
Dragon/src/operators/vision/cudnn_pool2d_op.cc
...
@@ -39,9 +39,12 @@ void CuDNNPool2dOp<Context>::RunImpl() {
     auto* y = Y(0)->template mutable_data<T, Context>();
     CUDNN_CHECK(cudnnPoolingForward(
-        ctx()->cudnn_handle(), pool_desc_,
-        CuDNNType<T>::one, input_desc_, x,
-        CuDNNType<T>::zero, output_desc_, y
+        ctx()->cudnn_handle(),
+        pool_desc_,
+        CuDNNType<T>::one,
+        input_desc_, x,
+        CuDNNType<T>::zero,
+        output_desc_, y
     ));
 }
...
@@ -49,15 +52,8 @@ template <class Context>
 void CuDNNPool2dOp<Context>::RunOnDevice() {
     Pool2dOp<Context>::Reshape();
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32", "float16" }); }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -97,7 +93,8 @@ void CuDNNPool2dGradientOp<Context>::RunImpl() {
     auto* dx = Y(0)->template mutable_data<T, Context>();
     CUDNN_CHECK(cudnnPoolingBackward(
-        ctx()->cudnn_handle(), pool_desc_,
+        ctx()->cudnn_handle(),
+        pool_desc_,
         CuDNNType<T>::one, input_desc_, y,
         input_desc_, dy,
...
@@ -111,15 +108,8 @@ template <class Context>
 void CuDNNPool2dGradientOp<Context>::RunOnDevice() {
     Pool2dGradientOp<Context>::Reshape();
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32", "float16" }); }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
 }

 DEPLOY_CUDNN(Pool2d);
...
...
Dragon/src/operators/vision/depthwise_conv2d_op.cc
...
@@ -40,13 +40,8 @@ void DepthwiseConv2dOp<Context>::RunOnDevice() {
         << "\nExcepted in/out channels unchanged.";
     ConvOpBase<Context>::Reshape();
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32" }); }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -98,13 +93,8 @@ void DepthwiseConv2dGradientOp<Context>::RunOnDevice() {
         X(0).dim(1) : X(0).dim(-1);
     ConvOpBase<Context>::Reshape(true);
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32" }); }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 DEPLOY_CPU(DepthwiseConv2d);
...
...
Dragon/src/operators/vision/drop_block2d_op.cc
...
@@ -102,15 +102,8 @@ template <class Context>
 void DropBlock2dOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32", "float16" }); }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -144,15 +137,8 @@ template <class Context>
 void DropBlock2dGradientOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32", "float16" }); }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
 }

 DEPLOY_CPU(DropBlock2d);
...
...
Dragon/src/operators/vision/nn_resize_op.cc
...
@@ -49,15 +49,8 @@ void NNResizeOp<Context>::RunOnDevice() {
     Y(0)->Reshape(out_shape);
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32", "float16" }); }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
...
Dragon/src/operators/vision/roi_align_op.cc
...
@@ -33,15 +33,8 @@ void ROIAlignOp<Context>::RunOnDevice() {
         pool_w_  /* feature_w */
     });
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32", "float16" }); };
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
...
Dragon/src/operators/vision/roi_pool_op.cc
...
@@ -37,15 +37,8 @@ void ROIPoolOp<Context>::RunOnDevice() {
         pool_w_  /* feature_w */
     });
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32", "float16" }); };
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
...
Dragon/src/utils/math_functions.cu
...
@@ -111,8 +111,8 @@ DEFINE_COPY_FUNC(double);
         T* y, \
         CUDAContext* ctx) { \
         _##name \
-            << < CUDA_BLOCKS(n), CUDA_THREADS, \
-                0, ctx->cuda_stream() >> >( \
+            <<< CUDA_BLOCKS(n), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
             n, x, y \
         ); \
     }
...
@@ -245,8 +245,8 @@ __global__ void _AddScalar(
             sizeof(T) * n, ctx->cuda_stream())); \
     } else { \
         _Set \
-            << < CUDA_BLOCKS(n), CUDA_THREADS, \
-                0, ctx->cuda_stream() >> >( \
+            <<< CUDA_BLOCKS(n), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
             n, alpha, y \
         ); \
     } \
...
@@ -273,15 +273,15 @@ DEFINE_SET_FUNC(double);
     if (type == 0) { \
         /*! Row - BroadcastX */ \
         _RowBroadcastSet \
-            << < CUDA_BLOCKS(n), CUDA_THREADS, \
-                0, ctx->cuda_stream() >> >( \
+            <<< CUDA_BLOCKS(n), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
             n, cols, x, y \
         ); \
     } else if (type == 1) { \
         /*! Col - BroadcastX */ \
         _ColBroadcastSet \
-            << < CUDA_BLOCKS(n), CUDA_THREADS, \
-                0, ctx->cuda_stream() >> >( \
+            <<< CUDA_BLOCKS(n), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
             n, cols, x, y \
         ); \
     } \
...
@@ -307,8 +307,8 @@ DEFINE_BROADCAST_SET_FUNC(double);
         T* y, \
         CUDAContext* ctx) { \
         _Pow \
-            << < CUDA_BLOCKS(n), CUDA_THREADS, \
-                0, ctx->cuda_stream() >> >( \
+            <<< CUDA_BLOCKS(n), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
             n, cast::to<T>(exp), x, y \
         ); \
     }
...
@@ -337,8 +337,8 @@ DEFINE_POWX_FUNC(double);
         } return; \
     } \
     _Scale \
-        << < CUDA_BLOCKS(n), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(n), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         n, _alpha_, x, y \
     ); \
 }
...
@@ -386,8 +386,8 @@ DEFINE_CUBLAS_SCALE_FUNC(double, cublasDscal_v2);
         T* y, \
         CUDAContext* ctx) { \
         _Axpy \
-            << < CUDA_BLOCKS(n), CUDA_THREADS, \
-                0, ctx->cuda_stream() >> >( \
+            <<< CUDA_BLOCKS(n), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
             n, cast::to<T>(alpha), x, y \
         ); \
     }
...
@@ -434,8 +434,8 @@ DEFINE_AXPY_FUNC(int64_t);
         T* y, \
         CUDAContext* ctx) { \
         _Axpby \
-            << < CUDA_BLOCKS(n), CUDA_THREADS, \
-                0, ctx->cuda_stream() >> >( \
+            <<< CUDA_BLOCKS(n), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
             n, \
             cast::to<T>(alpha), x, \
             cast::to<T>(beta), y \
...
@@ -461,8 +461,8 @@ DEFINE_AXPBY_FUNC(double);
         T _alpha_ = (T)alpha; \
         if (_alpha_ == T(0)) return; \
         _AddScalar \
-            << < CUDA_BLOCKS(n), CUDA_THREADS, \
-                0, ctx->cuda_stream() >> >( \
+            <<< CUDA_BLOCKS(n), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
             n, _alpha_, y \
         ); \
     }
...
@@ -506,8 +506,8 @@ __global__ void _InvStd(
         T* y, \
         CUDAContext* ctx) { \
         _InvStd \
-            << < CUDA_BLOCKS(n), CUDA_THREADS, \
-                0, ctx->cuda_stream() >> >( \
+            <<< CUDA_BLOCKS(n), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
             n, cast::to<T>(eps), x, y \
         ); \
     }
...
@@ -598,8 +598,8 @@ template <> float ASum<float, CUDAContext>(
         T* y, \
         CUDAContext* ctx) { \
         _##name \
-            << < CUDA_BLOCKS(n), CUDA_THREADS, \
-                0, ctx->cuda_stream() >> >( \
+            <<< CUDA_BLOCKS(n), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
             n, a, b, y \
         ); \
     }
...
@@ -829,29 +829,29 @@ __global__ void _ColBroadcastDiv(
     if (type == 0) { \
         /*! Row - BroadcastB */ \
         _RowBroadcast##name<T, false> \
-            << < CUDA_BLOCKS(n), CUDA_THREADS, \
-                0, ctx->cuda_stream() >> >( \
+            <<< CUDA_BLOCKS(n), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
             n, cols, a, b, y \
         ); \
     } else if (type == 1) { \
         /*! Col - BroadcastB */ \
         _ColBroadcast##name<T, false> \
-            << < CUDA_BLOCKS(n), CUDA_THREADS, \
-                0, ctx->cuda_stream() >> >( \
+            <<< CUDA_BLOCKS(n), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
             n, cols, a, b, y \
         ); \
     } else if (type == 2) { \
         /*! Row - BroadcastA */ \
         _RowBroadcast##name<T, true> \
-            << < CUDA_BLOCKS(n), CUDA_THREADS, \
-                0, ctx->cuda_stream() >> >( \
+            <<< CUDA_BLOCKS(n), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
             n, cols, a, b, y \
         ); \
     } else if (type == 3) { \
         /*! Col - BroadcastA */ \
         _ColBroadcast##name<T, true> \
-            << < CUDA_BLOCKS(n), CUDA_THREADS, \
-                0, ctx->cuda_stream() >> >( \
+            <<< CUDA_BLOCKS(n), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
             n, cols, a, b, y \
         ); \
     } else { \
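These math_functions.cu hunks are more than cosmetic: the CUDA launch chevrons must appear as the single tokens <<< and >>>, so a wrapped macro line that leaves "<< <" or ">> >" behind is tokenized as a shift operator plus a comparison and fails to parse as a kernel launch. A self-contained sketch of the corrected form, with a hypothetical kernel and launch sizes standing in for Dragon's macro-generated ones:

#include <cuda_runtime.h>

__global__ void _Scale(int n, float alpha, const float* x, float* y) {
    // One thread per element, grid-strided indexing omitted for brevity.
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) y[i] = alpha * x[i];
}

void Scale(int n, float alpha, const float* x, float* y, cudaStream_t stream) {
    const int kThreads = 256;
    const int kBlocks = (n + kThreads - 1) / kThreads;
    // The launch configuration must use the single <<< ... >>> tokens.
    _Scale<<<kBlocks, kThreads, 0, stream>>>(n, alpha, x, y);
}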
...
...
Dragon/src/utils/math_functions.fp16.cu
...
@@ -48,16 +48,16 @@ template <> void Exp<float16, CUDAContext>(
     CUDAContext* ctx) {
     if ((n & 1) == 0) {
         _ExpHalf2
-            << < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(n >> 1), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             n >> 1,
             reinterpret_cast<const half2*>(x),
             reinterpret_cast<half2*>(y)
         );
     } else {
         _ExpHalf
-            << < CUDA_BLOCKS(n), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(n), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             n,
             reinterpret_cast<const half*>(x),
             reinterpret_cast<half*>(y)
...
@@ -94,16 +94,16 @@ template <> void Log<float16, CUDAContext>(
     CUDAContext* ctx) {
     if ((n & 1) == 0) {
         _LogHalf2
-            << < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(n >> 1), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             n >> 1,
             reinterpret_cast<const half2*>(x),
             reinterpret_cast<half2*>(y)
         );
     } else {
         _LogHalf
-            << < CUDA_BLOCKS(n), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(n), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             n,
             reinterpret_cast<const half*>(x),
             reinterpret_cast<half*>(y)
...
@@ -140,16 +140,16 @@ template <> void Inv<float16, CUDAContext>(
     CUDAContext* ctx) {
     if ((n & 1) == 0) {
         _InvHalf2
-            << < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(n >> 1), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             n >> 1,
             reinterpret_cast<const half2*>(x),
             reinterpret_cast<half2*>(y)
         );
     } else {
         _InvHalf
-            << < CUDA_BLOCKS(n), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(n), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             n,
             reinterpret_cast<const half*>(x),
             reinterpret_cast<half*>(y)
...
@@ -186,16 +186,16 @@ template <> void Sqrt<float16, CUDAContext>(
     CUDAContext* ctx) {
     if ((n & 1) == 0) {
         _SqrtHalf2
-            << < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(n >> 1), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             n >> 1,
             reinterpret_cast<const half2*>(x),
             reinterpret_cast<half2*>(y)
         );
     } else {
         _SqrtHalf
-            << < CUDA_BLOCKS(n), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(n), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             n,
             reinterpret_cast<const half*>(x),
             reinterpret_cast<half*>(y)
...
@@ -232,16 +232,16 @@ template <> void RSqrt<float16, CUDAContext>(
     CUDAContext* ctx) {
     if ((n & 1) == 0) {
         _RSqrtHalf2
-            << < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(n >> 1), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             n >> 1,
             reinterpret_cast<const half2*>(x),
             reinterpret_cast<half2*>(y)
         );
     } else {
         _RSqrtHalf
-            << < CUDA_BLOCKS(n), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(n), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             n,
             reinterpret_cast<const half*>(x),
             reinterpret_cast<half*>(y)
...
@@ -278,16 +278,16 @@ template <> void Square<float16, CUDAContext>(
     CUDAContext* ctx) {
     if ((n & 1) == 0) {
         _SquareHalf2
-            << < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(n >> 1), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             n >> 1,
             reinterpret_cast<const half2*>(x),
             reinterpret_cast<half2*>(y)
         );
     } else {
         _SquareHalf
-            << < CUDA_BLOCKS(n), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(n), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             n,
             reinterpret_cast<const half*>(x),
             reinterpret_cast<half*>(y)
...
@@ -330,16 +330,16 @@ template <> void Set<float16, CUDAContext>(
     }
     if ((n & 1) == 0) {
         _SetHalf<half2>
-            << < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(n >> 1), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             n >> 1,
             cast::to<half2>(alpha),
             reinterpret_cast<half2*>(y)
         );
     } else {
         _SetHalf<float16>
-            << < CUDA_BLOCKS(n), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(n), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             n, alpha, y
         );
     }
...
@@ -380,8 +380,8 @@ template <> void Pow<float16, CUDAContext>(
     CHECK(alpha == 2.f) << "\nRequired power = 2";
     if ((n & 1) == 0) {
         _PowHalf2
-            << < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(n >> 1), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             n >> 1,
             alpha,
             reinterpret_cast<const half2*>(x),
...
@@ -389,8 +389,8 @@ template <> void Pow<float16, CUDAContext>(
         );
     } else {
         _PowHalf
-            << < CUDA_BLOCKS(n), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(n), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             n,
             alpha,
             reinterpret_cast<const half*>(x),
...
@@ -487,16 +487,16 @@ template <> void AddScalar<float16, CUDAContext>(
     CUDAContext* ctx) {
     if ((n & 1) == 0) {
         _AddScalarHalf2
-            << < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(n >> 1), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             n >> 1,
             cast::to<half2>(alpha),
             reinterpret_cast<half2*>(y)
         );
     } else {
         _AddScalarHalf
-            << < CUDA_BLOCKS(n), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(n), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             n,
             cast::to<half>(alpha),
             reinterpret_cast<half*>(y)
...
@@ -546,8 +546,8 @@ template <> void InvStd<float16, CUDAContext>(
     CUDAContext* ctx) {
     if ((n & 1) == 0) {
         _InvStdHalf2
-            << < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(n >> 1), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             n >> 1,
             cast::to<half2>(eps),
             reinterpret_cast<const half2*>(x),
...
@@ -555,8 +555,8 @@ template <> void InvStd<float16, CUDAContext>(
         );
     } else {
         _InvStdHalf
-            << < CUDA_BLOCKS(n), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(n), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             n,
             cast::to<half>(eps),
             reinterpret_cast<const half*>(x),
...
@@ -668,8 +668,8 @@ __global__ void _DivHalf(
     CUDAContext* ctx) { \
     if ((n & 1) == 0) { \
         _##name##Half2 \
-            << < CUDA_BLOCKS(n >> 1), CUDA_THREADS, \
-                0, ctx->cuda_stream() >> >( \
+            <<< CUDA_BLOCKS(n >> 1), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
             n >> 1, \
             reinterpret_cast<const half2*>(a), \
             reinterpret_cast<const half2*>(b), \
...
@@ -677,8 +677,8 @@ __global__ void _DivHalf(
         ); \
     } else { \
         _##name##Half \
-            << < CUDA_BLOCKS(n), CUDA_THREADS, \
-                0, ctx->cuda_stream() >> >( \
+            <<< CUDA_BLOCKS(n), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
             n, \
             reinterpret_cast<const half*>(a), \
             reinterpret_cast<const half*>(b), \
...
@@ -699,8 +699,8 @@ template <> void Div<float16, CUDAContext>(
     float16* y,
     CUDAContext* ctx) {
     _DivHalf
-        << < CUDA_BLOCKS(n), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(n), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         n,
         reinterpret_cast<const half*>(a),
         reinterpret_cast<const half*>(b),
...
@@ -884,8 +884,8 @@ __global__ void _ColBroadcastDivHalf(
     if (type == 0) { \
         /*! Row - BroadcastB */ \
         _RowBroadcast##name##Half<false> \
-            << < CUDA_BLOCKS(n), CUDA_THREADS, \
-                0, ctx->cuda_stream() >> >( \
+            <<< CUDA_BLOCKS(n), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
             n, cols, \
             reinterpret_cast<const half*>(a), \
             reinterpret_cast<const half*>(b), \
...
@@ -894,8 +894,8 @@ __global__ void _ColBroadcastDivHalf(
     } else if (type == 1) { \
         /*! Col - BroadcastB */ \
         _ColBroadcast##name##Half<false> \
-            << < CUDA_BLOCKS(n), CUDA_THREADS, \
-                0, ctx->cuda_stream() >> >( \
+            <<< CUDA_BLOCKS(n), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
             n, cols, \
             reinterpret_cast<const half*>(a), \
             reinterpret_cast<const half*>(b), \
...
@@ -904,8 +904,8 @@ __global__ void _ColBroadcastDivHalf(
     } else if (type == 2) { \
         /*! Row - BroadcastA */ \
         _RowBroadcast##name##Half<true> \
-            << < CUDA_BLOCKS(n), CUDA_THREADS, \
-                0, ctx->cuda_stream() >> >( \
+            <<< CUDA_BLOCKS(n), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
             n, cols, \
             reinterpret_cast<const half*>(a), \
             reinterpret_cast<const half*>(b), \
...
@@ -914,8 +914,8 @@ __global__ void _ColBroadcastDivHalf(
     } else if (type == 3) { \
         /*! Col - BroadcastA */ \
         _ColBroadcast##name##Half<true> \
-            << < CUDA_BLOCKS(n), CUDA_THREADS, \
-                0, ctx->cuda_stream() >> >( \
+            <<< CUDA_BLOCKS(n), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
             n, cols, \
             reinterpret_cast<const half*>(a), \
             reinterpret_cast<const half*>(b), \
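The fp16 file follows one template throughout: when n is even, the buffers are reinterpreted as packed half2 pairs so each thread handles two elements per instruction, otherwise a scalar half kernel runs. A hedged sketch of that even/odd split (kernel names and the 256-thread block size are illustrative; __hmul2/__hmul require compute capability 5.3+):

#include <cuda_fp16.h>

__global__ void _SquareHalf2(int m, const half2* x, half2* y) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < m) y[i] = __hmul2(x[i], x[i]);  // two fp16 squares at once
}

__global__ void _SquareHalf(int n, const half* x, half* y) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) y[i] = __hmul(x[i], x[i]);
}

void Square(int n, const half* x, half* y, cudaStream_t stream) {
    const int kThreads = 256;
    if ((n & 1) == 0) {  // even: process n/2 packed half2 lanes
        int m = n >> 1;
        _SquareHalf2<<<(m + kThreads - 1) / kThreads, kThreads, 0, stream>>>(
            m, reinterpret_cast<const half2*>(x), reinterpret_cast<half2*>(y));
    } else {             // odd: fall back to one element per thread
        _SquareHalf<<<(n + kThreads - 1) / kThreads, kThreads, 0, stream>>>(
            n, x, y);
    }
}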