Commit 79a52211 by Ting PAN

Dismantle FP16 & TensorCore Support

1 parent bab9931e
Showing with 1886 additions and 1292 deletions
......@@ -133,6 +133,9 @@ class CUDAContext {
} else {
DeviceGuard gurad(gpu_id_);
CUBLAS_CHECK(cublasCreate_v2(&handle));
#if CUDA_VERSION >= 9000
if (TENSOR_CORE_AVAILABLE()) cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);
#endif
return handle;
}
}
......
......@@ -9,17 +9,17 @@
//
// ------------------------------------------------------------
#ifndef DRAGON_OPERATORS_ARITHMETIC_SCALE_OP_H_
#define DRAGON_OPERATORS_ARITHMETIC_SCALE_OP_H_
#ifndef DRAGON_OPERATORS_ARITHMETIC_AFFINE_OP_H_
#define DRAGON_OPERATORS_ARITHMETIC_AFFINE_OP_H_
#include "core/operator.h"
namespace dragon {
template <class Context>
class ScaleOp : public Operator<Context> {
class AffineOp : public Operator<Context> {
public:
ScaleOp(const OperatorDef& op_def, Workspace* ws)
AffineOp(const OperatorDef& op_def, Workspace* ws)
: Operator<Context>(op_def, ws),
axis(OperatorBase::GetSingleArg<int>("axis", 1)),
num_axes(OperatorBase::GetSingleArg<int>("num_axes", 1)) {}
......@@ -30,14 +30,14 @@ class ScaleOp : public Operator<Context> {
protected:
TIndex axis, start_axis, num_axes;
TIndex inner_dim;
TIndex outer_dim, scale_dim, inner_dim;
Tensor* bias_multiplier;
};
template <class Context>
class ScaleGradientOp final : public Operator<Context> {
class AffineGradientOp final : public Operator<Context> {
public:
ScaleGradientOp(const OperatorDef& op_def, Workspace* ws)
AffineGradientOp(const OperatorDef& op_def, Workspace* ws)
: Operator<Context>(op_def, ws),
axis(OperatorBase::GetSingleArg<int>("axis", 1)),
num_axes(OperatorBase::GetSingleArg<int>("num_axes", -1)) {}
......@@ -57,4 +57,4 @@ class ScaleGradientOp final : public Operator<Context> {
} // namespace dragon
#endif // DRAGON_OPERATORS_ARITHMETIC_SCALE_OP_H_
\ No newline at end of file
#endif // DRAGON_OPERATORS_ARITHMETIC_AFFINE_OP_H_
\ No newline at end of file
......@@ -33,7 +33,7 @@ class AccuracyOp final: public Operator<Context> {
USE_OPERATOR_FUNCTIONS(Context);
void RunOnDevice() override;
template <typename T> void RunWithType();
template <typename Tx, typename Ty> void RunWithType();
protected:
TIndex top_k, axis, outer_dim, inner_dim, num_classes;
......
......@@ -9,8 +9,8 @@
//
// -------------------------------------------------------------
#ifndef DRAGON_OPERATORS_MISC_GRADIENT_GENERATE_OP_H_
#define DRAGON_OPERATORS_MISC_GRADIENT_GENERATE_OP_H_
#ifndef DRAGON_OPERATORS_MISC_GRADIENT_OP_H_
#define DRAGON_OPERATORS_MISC_GRADIENT_OP_H_
#include "core/operator.h"
......@@ -62,4 +62,4 @@ class StopGradientOp final : public Operator<Context> {
} // namespace dragon
#endif // DRAGON_OPERATORS_MISC_GRADIENT_GENERATE_OP_H_
\ No newline at end of file
#endif // DRAGON_OPERATORS_MISC_GRADIENT_OP_H_
\ No newline at end of file
......@@ -56,11 +56,13 @@ template <class Context>
class CuDNNConv2dOp : public Conv2dOp<Context> {
public:
CuDNNConv2dOp(const OperatorDef& def, Workspace* ws)
: Conv2dOp<Context>(def, ws) {
: Conv2dOp<Context>(def, ws), enable_tensor_core(true) {
#if CUDNN_VERSION_MIN(7, 0, 0)
cudnn_group = 1;
enable_tensor_core &= TENSOR_CORE_AVAILABLE();
#else
cudnn_group = this->group;
enable_tensor_core = false;
#endif
handle = new cudnnHandle_t[cudnn_group];
stream = new cudaStream_t[cudnn_group];
......@@ -109,17 +111,20 @@ class CuDNNConv2dOp : public Conv2dOp<Context> {
size_t workspace_fwd_data_size;
TIndex bias_offset, cudnn_group;
vector<TIndex> input_dims;
bool enable_tensor_core;
};
template <class Context>
class CuDNNConv2dGradientOp : public Conv2dGradientOp<Context> {
public:
CuDNNConv2dGradientOp(const OperatorDef& def, Workspace* ws)
: Conv2dGradientOp<Context>(def, ws) {
: Conv2dGradientOp<Context>(def, ws), enable_tensor_core(true) {
#if CUDNN_VERSION_MIN(7, 0, 0)
cudnn_group = 1;
enable_tensor_core &= TENSOR_CORE_AVAILABLE();
#else
cudnn_group = this->group;
enable_tensor_core = false;
#endif
handle = new cudnnHandle_t[cudnn_group * 3];
stream = new cudaStream_t[cudnn_group * 3];
......@@ -168,6 +173,7 @@ class CuDNNConv2dGradientOp : public Conv2dGradientOp<Context> {
size_t workspace_bwd_filter_size, workspace_bwd_data_size;
TIndex bias_offset, cudnn_group;
vector<TIndex> input_dims;
bool enable_tensor_core;
};
#endif // WITH_CUDNN
......
......@@ -60,11 +60,13 @@ template <class Context>
class CuDNNConv2dTransposeOp : public Conv2dTransposeOp<Context> {
public:
CuDNNConv2dTransposeOp(const OperatorDef& def, Workspace* ws)
: Conv2dTransposeOp<Context>(def, ws) {
: Conv2dTransposeOp<Context>(def, ws), enable_tensor_core(true) {
#if CUDNN_VERSION_MIN(7, 0, 0)
cudnn_group = 1;
enable_tensor_core &= TENSOR_CORE_AVAILABLE();
#else
cudnn_group = this->group;
enable_tensor_core = false;
#endif
handle = new cudnnHandle_t[cudnn_group];
stream = new cudaStream_t[cudnn_group];
......@@ -112,17 +114,20 @@ class CuDNNConv2dTransposeOp : public Conv2dTransposeOp<Context> {
size_t workspace_fwd_data_size;
TIndex bias_offset, cudnn_group;
vector<TIndex> input_dims;
bool enable_tensor_core;
};
template <class Context>
class CuDNNConv2dTransposeGradientOp : public Conv2dTransposeGradientOp<Context> {
public:
CuDNNConv2dTransposeGradientOp(const OperatorDef& def, Workspace* ws)
: Conv2dTransposeGradientOp<Context>(def, ws) {
: Conv2dTransposeGradientOp<Context>(def, ws), enable_tensor_core(true) {
#if CUDNN_VERSION_MIN(7, 0, 0)
cudnn_group = 1;
enable_tensor_core &= TENSOR_CORE_AVAILABLE();
#else
cudnn_group = this->group;
enable_tensor_core = false;
#endif
handle = new cudnnHandle_t[cudnn_group * 3];
stream = new cudaStream_t[cudnn_group * 3];
......@@ -171,6 +176,7 @@ public:
size_t workspace_bwd_filter_size, workspace_bwd_data_size;
TIndex bias_offset, cudnn_group;
vector<TIndex> input_dims;
bool enable_tensor_core;
};
#endif // WITH_CUDNN
......
......@@ -26,9 +26,9 @@ template<> inline int dragon_cast<int, float>(float val) {
return static_cast<int>(val);
}
template<> inline float dragon_cast<float, float>(float val) {
return val;
}
template<> inline float dragon_cast<float, float>(float val) { return val; }
template<> inline float16 dragon_cast<float16, float16>(float16 val) { return val; }
template<> inline float16 dragon_cast<float16, float>(float val) {
float16 ret;
......
......@@ -77,7 +77,7 @@ inline int GET_BLOCKS(const int N) {
#define __hdiv hdiv
#endif
inline int NUM_DEVICES() {
inline int CUDA_NUM_DEVICES() {
static int count = -1;
if (count < 0) {
auto err = cudaGetDeviceCount(&count);
......@@ -86,21 +86,47 @@ inline int NUM_DEVICES() {
return count;
}
inline int CURRENT_DEVICE() {
inline int CUDA_CURRENT_DEVICE() {
int gpu_id;
cudaGetDevice(&gpu_id);
return gpu_id;
}
inline int POINTER_DEVICE(const void* ptr) {
inline int CUDA_POINTER_DEVICE(const void* ptr) {
cudaPointerAttributes attr;
CUDA_CHECK(cudaPointerGetAttributes(&attr, ptr));
return attr.device;
}
struct CUDADeviceProps {
CUDADeviceProps() : props(CUDA_NUM_DEVICES()) {
for (int i = 0; i < CUDA_NUM_DEVICES(); ++i)
CUDA_CHECK(cudaGetDeviceProperties(&props[i], i));
}
vector<cudaDeviceProp> props;
};
inline const cudaDeviceProp& GetDeviceProperty(const int device_id) {
static CUDADeviceProps props;
CHECK_LT(device_id, (int)props.props.size())
<< "Invalid device id: " << device_id
<< "\nDetected " << props.props.size() << " eligible cuda devices.";
return props.props[device_id];
}
inline bool TENSOR_CORE_AVAILABLE() {
#if CUDA_VERSION < 9000
return false;
#else
int device = CUDA_CURRENT_DEVICE();
auto& prop = GetDeviceProperty(device);
return prop.major >= 7;
#endif
}
class DeviceGuard {
public:
DeviceGuard(int newDevice) : previous_(CURRENT_DEVICE()) {
DeviceGuard(int newDevice) : previous_(CUDA_CURRENT_DEVICE()) {
if (previous_ != newDevice)
CUDA_CHECK(cudaSetDevice(newDevice));
}
......
......@@ -150,6 +150,28 @@ void Tanh(const int count, const T* x, T* y);
template <typename T, class Context>
void TanhGrad(const int count, const T* dy, const T* y, T* dx);
/******************** arithmetic.affine ********************/
template <typename T, class Context>
void Affine(const int count,
const int outer_dim,
const int scale_dim,
const int inner_dim,
const T* x,
const T* alpha,
const T* beta,
const T* beta_multiplier,
T* y);
template <typename T, class Context>
void AffineGrad(const int count,
const int outer_dim,
const int scale_dim,
const int inner_dim,
const T* dy,
const T* alpha,
T* dx);
/******************** arithmetic.bias_add ********************/
template <typename T, class Context>
......@@ -172,24 +194,6 @@ void Clip(const int count,
T* mask,
T* y);
/******************** arithmetic.scale ********************/
template <typename T, class Context>
void Scale(const int axis,
Tensor* x,
Tensor* gamma,
Tensor* beta,
Tensor* BMul,
Tensor* y);
template <typename T, class Context>
void ScaleGrad(const int axis, Tensor* dy, Tensor* gamma, Tensor* dx);
/******************** cast.float2half ********************/
template <typename T, class Context>
void Float2Half(const int count, const float* x, float16* y);
/******************** control_flow.compare ********************/
template <typename T, class Context>
......@@ -286,7 +290,7 @@ void SparseSoftmaxFocalLossGrad(const int count,
Tensor* ignore,
T* dx);
/******************** misc.dtype ********************/
/******************** misc.astype ********************/
template <typename Ta, typename Tb, class Context>
void TypeA2B(const int count, const Ta* a, Tb* b);
......
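Note: a rough NumPy sketch of what the new kernel::Affine / kernel::AffineGrad signatures compute, assuming x is viewed as (outer_dim, scale_dim, inner_dim) and alpha/beta carry scale_dim elements; the helper names below are illustrative, not part of the commit.
import numpy as np
def affine_reference(x, alpha, beta, outer_dim, scale_dim, inner_dim):
    # y = alpha * x (+ beta), alpha/beta broadcast over outer_dim and inner_dim
    x = x.reshape(outer_dim, scale_dim, inner_dim)
    y = x * alpha.reshape(1, scale_dim, 1)
    if beta is not None:
        # beta_multiplier in the signature is just a ones vector used to tile beta
        y = y + beta.reshape(1, scale_dim, 1)
    return y.reshape(-1)
def affine_grad_reference(dy, alpha, outer_dim, scale_dim, inner_dim):
    # dx = alpha * dy; the reductions for d_alpha / d_beta are handled elsewhere
    dy = dy.reshape(outer_dim, scale_dim, inner_dim)
    return (dy * alpha.reshape(1, scale_dim, 1)).reshape(-1)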
......@@ -88,7 +88,7 @@ std::string CreateGraph(const std::string& graph_file, const Device& device, Wor
// overwritten device options
DeviceOption* device_option = meta_graph.mutable_device_option();
device_option->set_device_type((DeviceType)device.device_type());
device_option->set_gpu_id(device.device_id());
device_option->set_device_id(device.device_id());
device_option->set_engine("CUDNN");
dragon::GraphBase* graph = ws->CreateGraph(meta_graph);
if (!graph) LOG(FATAL) << "Can not create the graph.";
......
......@@ -80,7 +80,8 @@ def FromShape(shape, dtype='float32', ctx=None, name=None):
The wrapper of ``TensorFromShapeCC``.
"""
tensor = Tensor(name)
if name is None: tensor = Tensor(name=name)
else: tensor = Tensor(_name=name)
if not isinstance(shape, (tuple, list)):
raise TypeError('The shape should be a tuple or list.')
if ctx is None: ctx = MakeDeviceOption(0, 0) # CPUContext
......@@ -134,7 +135,8 @@ def FromPyArray(array, name=None):
The wrapper of ``TensorFromPyArrayCC``.
"""
tensor = Tensor(name)
if name is None: tensor = Tensor(name=name)
else: tensor = Tensor(_name=name)
if not isinstance(array, np.ndarray):
raise TypeError('The given nd-array should be numpy.ndarray.')
TensorFromPyArrayCC(_stringify_tensor(tensor), array)
......@@ -258,15 +260,18 @@ def GetTensorInfo(tensor, stream=1):
The string info contains following fields:
stream #1: ``dtype``, ``from_numpy``, ``init``
``mem``, ``mem_at``, ``device_id``
stream #1: ``dtype``, ``from_numpy``, ``init``, ``mem``, ``mem_at``, ``device_id``
stream #2: ``shape``
stream #3: #1 + #2
Parameters
----------
tensor : Tensor or str
The input tensor.
stream : int
The stream id.
Returns
-------
......
......@@ -14,7 +14,6 @@ Common
operators/ndarray
operators/control_flow
operators/misc
operators/cast
operators/mpi
=================================== =====================================================================
......@@ -26,7 +25,6 @@ List Brief
`dragon.operators.ndarray`_ The ndarray operators.
`dragon.operators.control_flow`_ The control flow operators.
`dragon.operators.misc`_ The misc operators.
`dragon.operators.cast`_ The cast operators.
`dragon.operators.mpi`_ The MPI operators.
=================================== =====================================================================
......@@ -95,7 +93,6 @@ List Brief
.. _dragon.operators.ndarray: operators/ndarray.html
.. _dragon.operators.control_flow: operators/control_flow.html
.. _dragon.operators.misc: operators/misc.html
.. _dragon.operators.cast: operators/cast.html
.. _dragon.operators.mpi: operators/mpi.html
.. _dragon.operators.activation: operators/activation.html
.. _dragon.operators.vision: operators/vision.html
......
......@@ -101,7 +101,7 @@ List Brief
`Matmul`_ Matrix Multiplication.
`InnerProduct`_ InnerProduct Function.
`Eltwise`_ Eltwise Sum/Product Function.
`Scale`_ Scale Function.
`Affine`_ Calculate ``y = Ax + b`` along the given range of axes.
`GramMatrix`_ Calculate the gram matrix, introduced by `[Gatys et.al, 2016] <https://www.cv-foundation.org/openaccess/content_cvpr_2016/papers/Gatys_Image_Style_Transfer_CVPR_2016_paper.pdf>`_.
=============== ======================================================================
......@@ -244,7 +244,7 @@ List Brief
.. _Dot: operators/arithmetic.html#dragon.operators.arithmetic.Dot
.. _InnerProduct: operators/arithmetic.html#dragon.operators.arithmetic.InnerProduct
.. _Eltwise: operators/arithmetic.html#dragon.operators.arithmetic.Eltwise
.. _Scale: operators/arithmetic.html#dragon.operators.arithmetic.Scale
.. _Affine: operators/arithmetic.html#dragon.operators.arithmetic.Affine
.. _GramMatrix: operators/arithmetic.html#dragon.operators.arithmetic.GramMatrix
.. _BatchNorm: operators/norm.html#dragon.operators.norm.BatchNorm
......
......@@ -39,9 +39,9 @@ class DataBatch(object):
source : str
The path of database.
multiple_nodes: boolean
Whether to split data for multiple parallel nodes.
Whether to split data for multiple parallel nodes. Default is ``False``.
shuffle : boolean
Whether to shuffle the data.
Whether to shuffle the data. Default is ``False``.
num_chunks : int
The number of chunks to split. Default is ``2048``.
chunk_size : int
......
......@@ -36,11 +36,9 @@ class DataReader(Process):
source : str
The path of database.
multiple_nodes: boolean
Whether to split data for multiple parallel nodes.
Whether to split data for multiple parallel nodes. Default is ``False``.
shuffle : boolean
Whether to shuffle the data.
instance_chunk : boolean
Whether to limit each chunk with at most 1 instance.
Whether to shuffle the data. Default is ``False``.
num_chunks : int
The number of chunks to split. Default is ``2048``.
chunk_size : int
......@@ -51,7 +49,6 @@ class DataReader(Process):
self._source = kwargs.get('source', '')
self._multiple_nodes = kwargs.get('multiple_nodes', False)
self._use_shuffle = kwargs.get('shuffle', False)
self._use_instance_chunk = kwargs.get('instance_chunk', False)
self._num_chunks = kwargs.get('num_chunks', 2048)
self._chunk_size = kwargs.get('chunk_size', -1)
......@@ -172,12 +169,13 @@ class DataReader(Process):
self._db_zfill = int(self._db.get('zfill'))
self._epoch_size = int(self._db_size / self._num_parts + 1)
if self._use_instance_chunk:
self._chunk_size = 1
if self._use_shuffle:
if self._chunk_size == 1:
# each chunk has at most 1 record [For Fully Shuffle]
self._num_shuffle_parts = int(self._db_size / self._chunk_size / self._num_parts) + 1
else:
# search a optimal chunk size by chunks
if self._chunk_size == -1:
if self._use_shuffle and self._chunk_size == -1:
# search an optimal chunk size by chunks [For Chunk Shuffle]
max_chunk_size = self._db._total_size / ((self._num_chunks * (1 << 20)))
min_chunk_size = 1
while min_chunk_size * 2 < max_chunk_size: min_chunk_size *= 2
......@@ -185,6 +183,13 @@ class DataReader(Process):
self._num_shuffle_parts = int(math.ceil(self._db._total_size * 1.1 /
(self._num_parts * self._chunk_size << 20)))
self._chunk_size = int(self._db_size / self._num_shuffle_parts / self._num_parts + 1)
else:
# each chunk has at most K records [For Multiple Nodes]
# note that if ``shuffle`` and ``multiple_nodes`` are both ``False``,
# ``chunk_size`` and ``num_shuffle_parts`` are meaningless
self._chunk_size = int(self._db_size / self._num_parts) + 1
self._num_shuffle_parts = 1
self._perm = np.arange(self._num_shuffle_parts)
# init env
......
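Note: a condensed sketch of the three chunking modes selected by the branches above (fully shuffle, chunk shuffle, multiple nodes); the function and argument names are illustrative stand-ins for the reader's private attributes, and sizes are taken in MB rather than the raw byte shifts used above.
import math
def plan_chunks(db_size, total_size_mb, num_parts,
                num_chunks=2048, chunk_size=-1, shuffle=False):
    if shuffle and chunk_size == 1:
        # fully shuffle: each chunk holds a single record
        num_shuffle_parts = db_size // num_parts + 1
    elif shuffle and chunk_size == -1:
        # chunk shuffle: grow a power-of-two chunk size (MB) from the db size
        chunk_size = 1
        while chunk_size * 2 < total_size_mb / num_chunks:
            chunk_size *= 2
        num_shuffle_parts = int(math.ceil(
            total_size_mb * 1.1 / (num_parts * chunk_size)))
        chunk_size = db_size // (num_shuffle_parts * num_parts) + 1
    else:
        # multiple nodes (or no shuffle): one shuffle part, one chunk per node
        chunk_size = db_size // num_parts + 1
        num_shuffle_parts = 1
    return chunk_size, num_shuffle_parts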
......@@ -428,8 +428,8 @@ def Sqrt(inputs, **kwargs):
return output
def Scale(inputs, axis=1, num_axes=1, **kwargs):
"""Scale Function.
def Affine(inputs, axis=1, num_axes=1, **kwargs):
"""Calculate ``y = Ax + b`` along the given range of axes.
The number of inputs varies from ``2`` to ``3`` (without or with ``bias``).
......@@ -457,7 +457,7 @@ def Scale(inputs, axis=1, num_axes=1, **kwargs):
CheckInputs(inputs, 2, 3)
arguments = ParseArguments(locals())
output = Tensor.CreateOperator(nout=1, op_type='Scale', **arguments)
output = Tensor.CreateOperator(nout=1, op_type='Affine', **arguments)
if inputs[0].shape is not None:
output.shape = inputs[0].shape[:]
......
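Note: a minimal usage sketch of the renamed Python operator (``Scale`` -> ``Affine``); the ``dragon.ops`` re-export appears in the next hunk, while the ``Tensor`` import path and constructor usage here are assumptions for illustration.
import dragon.ops as ops
from dragon.core.tensor import Tensor  # assumed import path
x = Tensor('x')          # e.g. an NCHW feature map
alpha = Tensor('alpha')  # per-channel scale, shape = dims[axis:axis + num_axes]
beta = Tensor('beta')    # optional per-channel shift
y = ops.Affine([x, alpha, beta], axis=1, num_axes=1)  # y = alpha * x + beta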
......@@ -90,7 +90,7 @@ Square = math.Square
Sqrt = math.Sqrt
InnerProduct = math.InnerProduct
Eltwise = math.Eltwise
Scale = math.Scale
Affine = math.Affine
GramMatrix = math.GramMatrix
# normalization
......
......@@ -494,7 +494,7 @@ class ScaleLayer(Layer):
def Setup(self, bottom):
super(ScaleLayer, self).Setup(bottom)
return ops.Scale(bottom + [blob['data'] for blob in self._blobs], **self._param)
return ops.Affine(bottom + [blob['data'] for blob in self._blobs], **self._param)
class BNLayer(Layer):
......@@ -606,10 +606,12 @@ class NormalizeLayer(Layer):
def __init__(self, LayerParameter):
super(NormalizeLayer, self).__init__(LayerParameter)
param = LayerParameter.normalize_param
self._l2norm_param = {'axis': 1,
self._l2norm_param = {
'axis': 1,
'num_axes': -1 if param.across_spatial else 1,
'eps': param.eps}
self._scale_param = {'axis': 1,
self._scale_param = {
'axis': 1,
'num_axes': 0 if param.channel_shared else 1}
scope = LayerParameter.name
scale = Tensor(scope + '/param:0')
......@@ -622,7 +624,7 @@ class NormalizeLayer(Layer):
def Setup(self, bottom):
super(NormalizeLayer, self).Setup(bottom)
norm_out = [ops.L2Norm(bottom[0], **self._l2norm_param)]
scale_out = ops.Scale(norm_out + [blob['data'] for blob in self.scale_blobs],
scale_out = ops.Affine(norm_out + [blob['data'] for blob in self.scale_blobs],
**self._scale_param)
return scale_out
......
......@@ -71,7 +71,7 @@ class DataLayer(Layer):
'min_random_scale': transform_param.min_random_scale,
'max_random_scale': transform_param.max_random_scale,
'shuffle': parallel_param.shuffle,
'node_step': parallel_param.node_step,
'multiple_nodes': parallel_param.multiple_nodes,
'partition': parallel_param.partition}
def Setup(self, bottom):
......
......@@ -112,6 +112,9 @@ class Module(object):
module.state_dict(destination, prefix + name + '.', to_numpy=to_numpy)
return destination
def _load_state_dict_key_mismatch(self, full_name, name, is_missing):
pass
def load_state_dict(self, state_dict, strict=True):
logger.info('Load the state dict from numpy arrays.')
def submodule_key_mismatch(full_name, is_missing):
......@@ -159,7 +162,7 @@ class Module(object):
', '.join('"{}"'.format(k) for k in unexpected))
if len(missing) > 0:
error_msg += 'Missing key(s) in state_dict: {}. '.format(
', '.join('"{}"'.format(k) for k in unexpected))
', '.join('"{}"'.format(k) for k in missing))
if len(error_msg) > 0:
raise KeyError(error_msg)
......@@ -210,9 +213,9 @@ class Module(object):
def __call__(self, *args, **kwargs):
with dg.name_scope(get_module_name(self)):
return self.forward(*args)
return self.forward(*args, **kwargs)
def forward(self, *inputs):
def forward(self, *inputs, **kwargs):
raise NotImplementedError('The base module can not be called.')
def name_scope(self, remove_separator=True):
......
......@@ -16,12 +16,13 @@ as it will be reused by the ``torch.ops``.
from dragon.vm.torch.module import Module
from dragon.vm.torch.tensor import Parameter
from .modules.conv import Conv2d
from .modules.conv import Conv2d, ConvTranspose2d
from .modules.pooling import MaxPool2d, AvgPool2d
from .modules.activation import ReLU, Softmax
from .modules.activation import ReLU, Sigmoid, Softmax
from .modules.linear import Linear
from .modules.loss import CrossEntropyLoss
from .modules.container import Container, Sequential, ModuleList
from .modules.batchnorm import BatchNorm1d, BatchNorm2d, BatchNorm3d
from .modules.affine import Affine
from .modules.dropout import Dropout, Dropout2d, Dropout3d
from . import init
\ No newline at end of file
......@@ -35,6 +35,25 @@ class ReLU(Module):
return self.run(inputs, outputs)
class Sigmoid(Module):
def __init__(self, inplace=False):
super(Sigmoid, self).__init__()
self._inplace = inplace
self.register_op()
def register_op(self):
self.op_meta = {
'op_type': 'Sigmoid',
'n_inputs': 1, 'n_outputs': 1,
'arguments': {}
}
def forward(self, x):
inputs = [x]; self.unify_devices(inputs)
outputs = [x if self._inplace else self.register_output(x.dtype)]
return self.run(inputs, outputs)
class Softmax(Module):
def __init__(self, dim=None):
super(Softmax, self).__init__()
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.torch.nn import Module, Parameter
from dragon.vm.torch.ops.creation import zeros, ones
class Affine(Module):
def __init__(self, num_features, bias=True, fix_weight=False, fix_bias=False):
super(Affine, self).__init__()
self.num_features = num_features
self.weight = Parameter(ones(num_features), requires_grad=not fix_weight)
if bias:
self.bias = Parameter(zeros(num_features), requires_grad=not fix_bias)
else:
self.bias = None
self.inputs = [self.weight, self.bias] if bias else [self.weight]
self.register_op()
def register_op(self):
self.op_meta = {
'op_type': 'Affine',
'n_inputs': 3 if self.bias else 2, 'n_outputs': 1,
'arguments': {
'axis': 1, # Data format: NCHW
'num_axes': 1,
}
}
def forward(self, input):
inputs = [input] + self.inputs
self.unify_devices(inputs)
outputs = [self.register_output(input.dtype)]
return self.run(inputs, outputs)
\ No newline at end of file
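Note: a minimal usage sketch of the module above; ``ones`` comes from the creation ops imported in this file, the multi-dimensional call is assumed to accept positional dims, and the shapes are illustrative.
from dragon.vm.torch.nn import Affine
from dragon.vm.torch.ops.creation import ones
m = Affine(num_features=256, bias=True)
x = ones(4, 256, 14, 14)   # NCHW input; the module scales/shifts axis 1
y = m(x)                   # y = weight.reshape(1, 256, 1, 1) * x + bias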
......@@ -20,8 +20,8 @@ from dragon.vm.torch.module import RunOperator
class _BatchNorm(Module):
def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True,
track_running_stats=True):
def __init__(self, num_features, eps=1e-5, momentum=0.1,
affine=True, track_running_stats=True):
super(_BatchNorm, self).__init__()
self.num_features = num_features
self.eps = eps
......@@ -103,6 +103,10 @@ class _BatchNorm(Module):
self.unify_devices(inputs)
outputs = [self.register_output(input.dtype)]
phase = 'TRAIN' if input.requires_grad else 'TEST'
# ALWAYS normalize the input with the batch statistics
# Note that the moving-average update then becomes meaningless
# (it cannot be disabled; ask NVIDIA and cuDNN why :-)
if not self.track_running_stats: phase = 'TRAIN'
meta = ['PERSISTENT',] + self.make_meta_from_phase(phase)
return RunOperator(inputs, outputs, meta)
......
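Note: a usage sketch of the behavior described by the comment above; with ``track_running_stats=False`` the op always runs in the 'TRAIN' phase, i.e. it normalizes with batch statistics even at inference time. The tensor factory call is an assumption for illustration.
from dragon.vm.torch.nn import BatchNorm2d
from dragon.vm.torch.ops.creation import ones
bn = BatchNorm2d(num_features=64, track_running_stats=False)
x = ones(2, 64, 8, 8)   # assumed multi-dim factory; shape is illustrative
y = bn(x)               # normalized with the statistics of this very batch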
......@@ -52,7 +52,8 @@ class _ConvNd(Module):
def register_op(self):
self.op_meta = {
'op_type': 'Conv2d',
'op_type': 'Conv{}d{}'.format(len(self.kernel_size),
'Transpose' if self.transposed else ''),
'n_inputs': 3 if self.bias else 2, 'n_outputs': 1,
'arguments': {
'num_output': self.weight.shape[0],
......@@ -106,3 +107,21 @@ class Conv2d(_ConvNd):
self.unify_devices(inputs)
outputs = [self.register_output(input.dtype)]
return self.run(inputs, outputs)
class ConvTranspose2d(_ConvNd):
def __init__(self, in_channels, out_channels, kernel_size, stride=1,
padding=0, output_padding=0, groups=1, bias=True, dilation=1):
kernel_size = _pair(kernel_size)
stride = _pair(stride)
padding = _pair(padding)
dilation = _pair(dilation)
super(ConvTranspose2d, self).__init__(
in_channels, out_channels, kernel_size, stride, padding, dilation,
True, _pair(0), groups, bias)
def forward(self, input):
inputs = [input, self.weight] + ([self.bias] if self.bias else [])
self.unify_devices(inputs)
outputs = [self.register_output(input.dtype)]
return self.run(inputs, outputs)
\ No newline at end of file
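Note: a short usage sketch of the new ConvTranspose2d wrapper; the constructor arguments follow the definition above, while the tensor factory call and the 2x upsampling shape are illustrative assumptions.
from dragon.vm.torch.nn import ConvTranspose2d
from dragon.vm.torch.ops.creation import ones
deconv = ConvTranspose2d(in_channels=64, out_channels=32,
                         kernel_size=4, stride=2, padding=1, bias=False)
x = ones(1, 64, 16, 16)   # NCHW input
y = deconv(x)             # expected spatial size: 32 x 32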
......@@ -21,3 +21,7 @@ from .arithmetic import (
from .ndarray import (
sum, mean, argmin, argmax, max, topk, cat
)
from .vision import (
nn_resize, bilinear_resize, roi_pool, roi_align
)
\ No newline at end of file
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.torch.ops.modules.base import BaseModule
class Resize2d(BaseModule):
def __init__(self, key, ctx, **kwargs):
super(Resize2d, self).__init__(key, ctx, **kwargs)
self.op_type = kwargs.get('op_type', 'NNResize')
self.dsize = kwargs.get('dsize', None)
self.fx = kwargs.get('fx', None)
self.fy = kwargs.get('fy', None)
self.register_arguments()
self.register_op()
def register_arguments(self):
if self.dsize:
self.dsize = [self.register_argument('dsize[{}]'.format(i))
for i in range(2)]
def register_op(self):
self.op_meta = {
'op_type': self.op_type,
'n_inputs': 1, 'n_outputs': 1,
'arguments': {
'dsize_desc': [d for d in self.dsize] if self.dsize else None,
'fx': self.fx, 'fy': self.fy,
'data_format': 'NCHW',
}
}
def forward(self, input, dsize=None):
inputs = [input]; self.unify_devices(inputs)
outputs = [self.register_output(input.dtype)]
if dsize is not None:
for ix, d in enumerate(dsize):
self.set_argument_i(self.dsize[ix], d)
return self.run(inputs, outputs)
class RoIPool(BaseModule):
def __init__(self, key, ctx, **kwargs):
super(RoIPool, self).__init__(key, ctx, **kwargs)
self.pool_h = kwargs.get('pooled_h', 0)
self.pool_w = kwargs.get('pooled_w', 0)
self.spatial_scale = kwargs.get('spatial_scale', 1.0)
self.register_arguments()
self.register_op()
def register_arguments(self):
"""No arguments for roi-pool op."""
pass
def register_op(self):
self.op_meta = {
'op_type': 'ROIPooling',
'n_inputs': 2, 'n_outputs': 1,
'arguments': {
'pool_h': self.pool_h, 'pool_w': self.pool_w,
'spatial_scale': self.spatial_scale,
}
}
def forward(self, feature, rois, dsize=None):
inputs = [feature, rois]; self.unify_devices(inputs)
outputs = [self.register_output(feature.dtype)]
return self.run(inputs, outputs)
class RoIAlign(BaseModule):
def __init__(self, key, ctx, **kwargs):
super(RoIAlign, self).__init__(key, ctx, **kwargs)
self.pool_h = kwargs.get('pooled_h', 0)
self.pool_w = kwargs.get('pooled_w', 0)
self.spatial_scale = kwargs.get('spatial_scale', 1.0)
self.sampling_ratio = kwargs.get('sampling_ratio', 2)
self.register_arguments()
self.register_op()
def register_arguments(self):
"""No arguments for roi-pool op."""
pass
def register_op(self):
self.op_meta = {
'op_type': 'ROIAlign',
'n_inputs': 2, 'n_outputs': 1,
'arguments': {
'pool_h': self.pool_h, 'pool_w': self.pool_w,
'spatial_scale': self.spatial_scale,
'sampling_ratio': self.sampling_ratio,
}
}
def forward(self, feature, rois, dsize=None):
inputs = [feature, rois]; self.unify_devices(inputs)
outputs = [self.register_output(feature.dtype)]
return self.run(inputs, outputs)
\ No newline at end of file
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.torch.ops.primitive import MakeContext
from dragon.vm.torch.ops.factory import get_module
from dragon.vm.torch.ops.modules.vision import Resize2d
from dragon.vm.torch.ops.modules.vision import RoIPool, RoIAlign
def _resize_2d(input, op_type, dsize, fx, fy):
if dsize is None:
if fx < 0 or fy < 0:
raise ValueError('Set fx and fy if dsize is None.')
else:
if len(dsize) != 2:
raise ValueError('The dsize should be a list with 2 elements.')
if dsize is None and (fy == -1.0 or fx == -1.0):
raise RuntimeError('Either dsize or fx/fy should be specified.')
ctx = MakeContext(inputs=[input])
key = 'torch/ops/{}/{}:{}/dsize:{}/fx:{}/fy:{}'.format(
op_type.lower(), ctx[0].lower(), ctx[1], '2' if dsize else 'none', fx, fy)
module = get_module(Resize2d, key, ctx,
op_type=op_type, dsize=dsize, fx=fx, fy=fy)
return module.forward(input, dsize)
def nn_resize(input, dsize, fx=-1.0, fy=-1.0):
return _resize_2d(input, 'NNResize', dsize, fx, fy)
def bilinear_resize(input, dsize, fx=-1.0, fy=-1.0):
return _resize_2d(input, 'BilinearResize', dsize, fx, fy)
def roi_pool(feature, rois, pooled_h, pooled_w, spatial_scale):
ctx = MakeContext(inputs=[feature])
key = 'torch/ops/roi_pool/{}:{}/pool_h:{}/pool_w:{}/spatial_scale:{}'.format(
ctx[0].lower(), ctx[1], pooled_h, pooled_w, spatial_scale)
module = get_module(RoIPool, key, ctx, pooled_h=pooled_h,
pooled_w=pooled_w, spatial_scale=spatial_scale)
return module.forward(feature, rois)
def roi_align(feature, rois, pooled_h, pooled_w,
spatial_scale, sampling_ratio=2):
ctx = MakeContext(inputs=[feature])
key = 'torch/ops/roi_align/{}:{}/pool_h:{}/pool_w:{}/' \
'spatial_scale:{}/sampling_ratio:{}'.format(
ctx[0].lower(), ctx[1], pooled_h, pooled_w, spatial_scale, sampling_ratio)
module = get_module(RoIAlign, key, ctx, pooled_h=pooled_h,
pooled_w=pooled_w, spatial_scale=spatial_scale, sampling_ratio=sampling_ratio)
return module.forward(feature, rois)
\ No newline at end of file
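Note: a usage sketch of the functional wrappers above; the import path assumes they are re-exported by the ops package __init__ shown earlier in this diff, and the tensors/values are illustrative.
from dragon.vm.torch.ops import nn_resize, roi_align  # assumed re-export path
from dragon.vm.torch.ops.creation import ones
feature = ones(1, 256, 32, 32)              # NCHW feature map
up = nn_resize(feature, dsize=(64, 64))     # nearest-neighbor resize
rois = ones(8, 5)                           # rows: [batch_idx, x1, y1, x2, y2]
pooled = roi_align(feature, rois, pooled_h=7, pooled_w=7,
                   spatial_scale=1 / 16.0)  # sampling_ratio defaults to 2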
......@@ -110,7 +110,7 @@ class Optimizer(object):
_update(p, g, op_type=self._update_type,
slot=group['slot'],
lr_mult=group.get('lr_mult', 1.0),
decay_mult=group.get('lr_mult', 1.0))
decay_mult=group.get('decay_mult', 1.0))
def zero_grad(self):
"""Set all gradients to zeros.
......
......@@ -144,6 +144,17 @@ class Tensor(object):
"Use .cpu() to move the tensor to host memory first.")
return tensor_utils.ToPyArray(self._dg_tensor)
def numpy_ex(self):
"""Create a numpy const nd-array sharing this tensor.
Returns
-------
numpy.ndarray
The numpy nd-array.
"""
return tensor_utils.ToPyArrayEx(self._dg_tensor)
def dragon(self):
"""Create a dragon tensor sharing this tensor.
......@@ -533,7 +544,7 @@ class Tensor(object):
Parameters
----------
args : tuple
args : tuple or int
The new size.
Returns
......@@ -606,6 +617,21 @@ class Tensor(object):
"""
self.fill_(0.)
def one_(self):
"""Fills self tensor with ones.
Parameters
----------
value : numerical type
Returns
-------
vm.torch.Tensor
The self.
"""
self.fill_(1.)
def uniform_(self, low=0, high=1):
"""Fill self tensor with the specified uniform distribution.
......
......@@ -34,7 +34,7 @@ class _DataLoaderIter(object):
class DataLoader(object):
def __init__(self, dataset, batch_size=1, shuffle=False,
partition=False, multiple_nodes=False, instance_chunk=False):
partition=False, multiple_nodes=False, num_chunks=2048, chunk_size=-1):
"""A MPI-Aware DataLoader. Forked from ``dragon.io``.
Parameters
......@@ -43,16 +43,16 @@ class DataLoader(object):
The dataset.
batch_size : int
The batch size. Divided by n mpi-nodes if ``partition`` is True.
instance_chunk : boolean
Whether to limit each chunk with at most 1 instance.
shuffle : boolean
Whether to shuffle the data.
partition : boolean
Whether to partition batch. Default is ``False``.
multiple_nodes: boolean
Whether to split data for multiple parallel nodes.
instance_chunk : boolean
Whether to limit each chunk with at most 1 instance.
num_chunks : int
The number of chunks to split. Default is ``2048``.
chunk_size : int
The size (MB) of each chunk. Default is ``-1`` (refer to ``num_chunks``).
"""
self.dataset = dataset
......@@ -65,7 +65,8 @@ class DataLoader(object):
'source': dataset.database,
'multiple_nodes': multiple_nodes,
'shuffle': shuffle,
'instance_chunk': instance_chunk,
'num_chunks': num_chunks,
'chunk_size': chunk_size,
'batch_size': batch_size,
'partition': partition,
'transform': dataset.transform,
......
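Note: a usage sketch of the updated DataLoader signature; the module path and the iteration protocol (via the _DataLoaderIter above) are assumptions, and ``dataset`` stands for any object exposing the ``database`` / ``transform`` attributes read by the constructor.
from dragon.vm.torch.utils.data import DataLoader  # assumed module path
# ``dataset`` is a hypothetical dataset exposing .database and .transform
loader = DataLoader(dataset, batch_size=64, shuffle=True,
                    num_chunks=2048, chunk_size=-1)  # -1: derive size from num_chunks
for batch in loader:                                 # assumed iteration protocol
    break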
......@@ -39,11 +39,9 @@ class DataBatch(object):
source : str
The path of database.
multiple_nodes: boolean
Whether to split data for multiple parallel nodes.
Whether to split data for multiple parallel nodes. Default is ``False``.
shuffle : boolean
Whether to shuffle the data.
instance_chunk : boolean
Whether to limit each chunk with at most 1 instance.
Whether to shuffle the data. Default is ``False``.
num_chunks : int
The number of chunks to split. Default is ``2048``.
chunk_size : int
......
......@@ -36,11 +36,9 @@ class DataReader(Process):
source : str
The path of database.
multiple_nodes: boolean
Whether to split data for multiple parallel nodes.
Whether to split data for multiple parallel nodes. Default is ``False``.
shuffle : boolean
Whether to shuffle the data.
instance_chunk : boolean
Whether to limit each chunk with at most 1 instance.
Whether to shuffle the data. Default is ``False``.
num_chunks : int
The number of chunks to split. Default is ``2048``.
chunk_size : int
......@@ -51,7 +49,6 @@ class DataReader(Process):
self._source = kwargs.get('source', '')
self._multiple_nodes = kwargs.get('multiple_nodes', False)
self._use_shuffle = kwargs.get('shuffle', False)
self._use_instance_chunk = kwargs.get('instance_chunk', False)
self._num_chunks = kwargs.get('num_chunks', 2048)
self._chunk_size = kwargs.get('chunk_size', -1)
......@@ -172,12 +169,13 @@ class DataReader(Process):
self._db_zfill = int(self._db.get('zfill'))
self._epoch_size = int(self._db_size / self._num_parts + 1)
if self._use_instance_chunk:
self._chunk_size = 1
if self._use_shuffle:
if self._chunk_size == 1:
# each chunk has at most 1 record [For Fully Shuffle]
self._num_shuffle_parts = int(self._db_size / self._chunk_size / self._num_parts) + 1
else:
# search a optimal chunk size by chunks
if self._chunk_size == -1:
if self._use_shuffle and self._chunk_size == -1:
# search an optimal chunk size by chunks [For Chunk Shuffle]
max_chunk_size = self._db._total_size / ((self._num_chunks * (1 << 20)))
min_chunk_size = 1
while min_chunk_size * 2 < max_chunk_size: min_chunk_size *= 2
......@@ -185,6 +183,13 @@ class DataReader(Process):
self._num_shuffle_parts = int(math.ceil(self._db._total_size * 1.1 /
(self._num_parts * self._chunk_size << 20)))
self._chunk_size = int(self._db_size / self._num_shuffle_parts / self._num_parts + 1)
else:
# each chunk has at most K records [For Multiple Nodes]
# note that if ``shuffle`` and ``multiple_nodes`` are both ``False``,
# ``chunk_size`` and ``num_shuffle_parts`` are meaningless
self._chunk_size = int(self._db_size / self._num_parts) + 1
self._num_shuffle_parts = 1
self._perm = np.arange(self._num_shuffle_parts)
# init env
......
......@@ -13,7 +13,7 @@
#
# ------------------------------------------------------------
import torch
import dragon.vm.torch as torch
import hashlib
import os
......@@ -44,28 +44,6 @@ HASH_REGEX = re.compile(r'-([a-f0-9]*)\.')
def load_url(url, model_dir=None, map_location=None, progress=True):
r"""Loads the Torch serialized object at the given URL.
If the object is already present in `model_dir`, it's deserialized and
returned. The filename part of the URL should follow the naming convention
``filename-<sha256>.ext`` where ``<sha256>`` is the first eight or more
digits of the SHA256 hash of the contents of the file. The hash is used to
ensure unique names and to verify the contents of the file.
The default value of `model_dir` is ``$TORCH_HOME/models`` where
``$TORCH_HOME`` defaults to ``~/.torch``. The default directory can be
overriden with the ``$TORCH_MODEL_ZOO`` environment variable.
Args:
url (string): URL of the object to download
model_dir (string, optional): directory in which to save the object
map_location (optional): a function or a dict specifying how to remap storage locations (see torch.load)
progress (bool, optional): whether or not to display a progress bar to stderr
Example:
>>> state_dict = torch.utils.model_zoo.load_url('https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth')
"""
if model_dir is None:
torch_home = os.path.expanduser(os.getenv('TORCH_HOME', '~/.torch'))
model_dir = os.getenv('TORCH_MODEL_ZOO', os.path.join(torch_home, 'models'))
......
......@@ -36,7 +36,7 @@ find_packages('dragon')
find_modules()
setup(name = 'dragon',
version='0.2.2.1',
version='0.2.2.2',
description = 'Dragon: A Computation Graph Virtual Machine Based Deep Learning Framework',
url='https://github.com/seetaresearch/Dragon',
author='Ting Pan',
......
......@@ -12,7 +12,7 @@ CUDAObject CUDAContext::cuda_object_;
template<> void CPUContext::Memcpy<CPUContext, CUDAContext>(
size_t nbytes, void* dst, const void* src) {
#ifdef WITH_CUDA
CUDAContext ctx(POINTER_DEVICE(src));
CUDAContext ctx(CUDA_POINTER_DEVICE(src));
ctx.Memcpy<CPUContext, CUDAContext>(nbytes, dst, src);
#else
LOG(FATAL) << "CUDA was not compiled.";
......@@ -23,7 +23,7 @@ template<> void CPUContext::Memcpy<CPUContext, CUDAContext>(
template<> void CPUContext::Memcpy<CUDAContext, CPUContext>(
size_t nbytes, void* dst, const void* src) {
#ifdef WITH_CUDA
CUDAContext ctx(POINTER_DEVICE(dst));
CUDAContext ctx(CUDA_POINTER_DEVICE(dst));
ctx.Memcpy<CUDAContext, CPUContext>(nbytes, dst, src);
#else
LOG(FATAL) << "CUDA was not compiled.";
......
......@@ -128,8 +128,8 @@ MixedMemory::~MixedMemory() {
void MixedMemory::SwitchToDevice() {
if (cuda_ptr_) {
#ifdef WITH_CUDA
int ptr_device = POINTER_DEVICE(cuda_ptr_);
int cur_device = CURRENT_DEVICE();
int ptr_device = CUDA_POINTER_DEVICE(cuda_ptr_);
int cur_device = CUDA_CURRENT_DEVICE();
if (ptr_device != cur_device) state_ = SWITCHED;
#endif
}
......@@ -139,7 +139,7 @@ void MixedMemory::SwitchToCUDADevice(int device_id) {
#ifdef WITH_CUDA
DeviceGuard gurad(device_id);
if (cuda_ptr_) {
int ptr_device = POINTER_DEVICE(cuda_ptr_);
int ptr_device = CUDA_POINTER_DEVICE(cuda_ptr_);
if (ptr_device != device_id) state_ = SWITCHED;
}
ToCUDA();
......@@ -164,7 +164,7 @@ const Map<string, string> MixedMemory::info() const {
}
s2s["mem_at"] = _state_;
if (cpu_ptr_) s2s["CPU"] = "0";
if (cuda_ptr_) s2s["CUDA"] = dragon_cast<string, int>(POINTER_DEVICE(cuda_ptr_));
if (cuda_ptr_) s2s["CUDA"] = dragon_cast<string, int>(CUDA_POINTER_DEVICE(cuda_ptr_));
return s2s;
}
......
#include "operators/arithmetic/scale_op.h"
#include "operators/arithmetic/affine_op.h"
#include "core/workspace.h"
#include "utils/filler.h"
#include "utils/op_kernel.h"
......@@ -6,7 +6,7 @@
namespace dragon {
template <class Context> template <typename T>
void ScaleOp<Context>::RunWithType() {
void AffineOp<Context>::RunWithType() {
start_axis = axis;
if (start_axis < 0) start_axis += (int)Input(0).ndim();
if (num_axes == -1) num_axes = (int)Input(0).ndim() - start_axis;
......@@ -18,27 +18,29 @@ void ScaleOp<Context>::RunWithType() {
const vector<TIndex>::const_iterator& dim_start = Input(0).dims().begin() + start_axis;
const vector<TIndex>::const_iterator& dim_end = dim_start + num_axes;
vector<TIndex> param_dims(dim_start, dim_end);
TENSOR_FILL(Input(1), param_dims);
TENSOR_FILL(Input(1), param_dims);;
outer_dim = Input(0).count(0, start_axis);
inner_dim = Input(0).count(start_axis + num_axes);
scale_dim = Input(1).count();
if (InputSize() > 2) {
TENSOR_FILL(Input(2), param_dims);
inner_dim = Input(0).count(start_axis + num_axes);
INIT_MULTIPLIER(bias_multiplier, inner_dim);
}
if (InputSize() > 2) {
kernel::Scale<T, Context>(start_axis, &Input(0), &Input(1),
&Input(2), bias_multiplier,
Output(0));
} else {
kernel::Scale<T, Context>(start_axis, &Input(0), &Input(1),
nullptr, nullptr,
Output(0));
}
auto* Xdata = Input(0).template data<T, Context>();
auto* Adata = Input(1).template data<T, Context>();
auto* Bdata = InputSize() > 2 ? Input(2).template data<T, Context>() : nullptr;
auto* BMdata = InputSize() > 2 ? bias_multiplier->template data<T, Context>() : nullptr;
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::Affine<T, Context>(Output(0)->count(),
outer_dim, scale_dim, inner_dim,
Xdata, Adata, Bdata, BMdata,
Ydata);
}
template <class Context>
void ScaleOp<Context>::RunOnDevice() {
void AffineOp<Context>::RunOnDevice() {
Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) RunWithType<float>();
......@@ -46,14 +48,14 @@ void ScaleOp<Context>::RunOnDevice() {
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
}
DEPLOY_CPU(Scale);
DEPLOY_CPU(Affine);
#ifdef WITH_CUDA
DEPLOY_CUDA(Scale);
DEPLOY_CUDA(Affine);
#endif
OPERATOR_SCHEMA(Scale).NumInputs(2, 3).NumOutputs(1);
OPERATOR_SCHEMA(Affine).NumInputs(2, 3).NumOutputs(1);
template <class Context> template <typename T>
void ScaleGradientOp<Context>::BiasRunWithType() {
void AffineGradientOp<Context>::BiasRunWithType() {
Output(2)->ReshapeLike(Input(1));
INIT_MULTIPLIER(bias_multiplier, inner_dim);
auto* BMul_data = this->bias_multiplier->template data<T, Context>();
......@@ -71,7 +73,7 @@ void ScaleGradientOp<Context>::BiasRunWithType() {
}
template <class Context> template <typename T>
void ScaleGradientOp<Context>::ScaleRunWithType() {
void AffineGradientOp<Context>::ScaleRunWithType() {
Output(0)->ReshapeLike(Input(0));
Output(1)->ReshapeLike(Input(1));
INIT_MULTIPLIER(sum_multiplier, sum_dim);
......@@ -119,15 +121,20 @@ void ScaleGradientOp<Context>::ScaleRunWithType() {
}
template <class Context> template <typename T>
void ScaleGradientOp<Context>::RunWithType() {
void AffineGradientOp<Context>::RunWithType() {
Output(0)->ReshapeLike(Input(0));
kernel::ScaleGrad<T, Context>(start_axis,
&Input(-1), &Input(1), Output(0));
auto* dYdata = Input(-1).template data<T, Context>();
auto* Adata = Input(1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
kernel::AffineGrad<T, Context>(Output(0)->count(),
outer_dim, scale_dim, inner_dim,
dYdata, Adata, dXdata);
}
template <class Context>
void ScaleGradientOp<Context>::RunOnDevice() {
void AffineGradientOp<Context>::RunOnDevice() {
start_axis = axis;
if (start_axis < 0) start_axis += (int)Input(0).ndim();
if (num_axes == -1) num_axes = (int)Input(0).ndim() - start_axis;
......@@ -151,21 +158,21 @@ void ScaleGradientOp<Context>::RunOnDevice() {
}
}
DEPLOY_CPU(ScaleGradient);
DEPLOY_CPU(AffineGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(ScaleGradient);
DEPLOY_CUDA(AffineGradient);
#endif
OPERATOR_SCHEMA(ScaleGradient).NumInputs(3).NumOutputs(3);
OPERATOR_SCHEMA(AffineGradient).NumInputs(3).NumOutputs(3);
class GetScaleGradient final : public GradientMakerBase {
class GetAffineGradient final : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GetScaleGradient);
GRADIENT_MAKER_CTOR(GetAffineGradient);
vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "",
vector<string> {I(0), I(1), GO(0)},
vector<string> {GI(0), GI(1), GI(2)});
}
};
REGISTER_GRADIENT(Scale, GetScaleGradient);
REGISTER_GRADIENT(Affine, GetAffineGradient);
} // namespace dragon
\ No newline at end of file
......@@ -3,17 +3,18 @@
#include "utils/math_functions.h"
namespace dragon {
template <class Context> template <typename T>
template <class Context> template <typename Tx, typename Ty>
void AccuracyOp<Context>::RunWithType() {
if (OutputSize() > 1) {
math::Set<T, CPUContext>(num_classes, 0,
Output(1)->template mutable_data<T, CPUContext>());
math::Set<float, CPUContext>(num_classes, 0,
Output(1)->template mutable_data<float, CPUContext>());
}
Map<int, int> num_per_class;
Map<int, TIndex> num_per_class;
T acc = 0, count = 0;
auto* Xdata = Input(0).template data<T, CPUContext>();
auto* labels = Input(1).template data<T, CPUContext>();
TIndex acc = 0, count = 0;
auto* Xdata = Input(0).template data<Tx, CPUContext>();
auto* labels = Input(1).template data<Ty, CPUContext>();
auto* ignores = ignore_labels.count() > 0 ?
ignore_labels.data<int, CPUContext>() : nullptr;
const TIndex dim = Input(0).count() / outer_dim;
......@@ -23,14 +24,14 @@ void AccuracyOp<Context>::RunWithType() {
for (int k = 0; k < ignore_labels.count(); k++)
if (label == ignores[k]) continue;
if (OutputSize() > 1) num_per_class[label]++;
vector<pair<T, int> > vec;
vector<pair<Tx, int> > vec;
for (int k = 0; k < num_classes; k++)
vec.push_back(std::make_pair(Xdata[i * dim + k * inner_dim + j], k));
std::partial_sort(vec.begin(), vec.begin() + top_k, vec.end(), std::greater<pair<T, int> >());
std::partial_sort(vec.begin(), vec.begin() + top_k, vec.end(), std::greater<pair<Tx, int> >());
for (int k = 0; k < top_k; k++) {
if (vec[k].second == label) {
if (OutputSize() > 1)
Output(1)->template mutable_data<T, CPUContext>()[label]++;
Output(1)->template mutable_data<float, CPUContext>()[label]++;
acc++;
break;
}
......@@ -39,11 +40,12 @@ void AccuracyOp<Context>::RunWithType() {
} // end inner_dim
} // end outer_dim
Output(0)->template mutable_data<T, CPUContext>()[0] = acc / count;
Output(0)->template mutable_data<float, CPUContext>()[0] = (float)acc / count;
if (OutputSize() > 1) {
auto* acc_per_class = Output(1)->template mutable_data<T, CPUContext>();
auto* acc_per_class = Output(1)->template mutable_data<float, CPUContext>();
for (int i = 0; i < num_classes; i++)
acc_per_class[i] = num_per_class[i] == 0 ? 0 : acc_per_class[i] / acc_per_class[i];
acc_per_class[i] = num_per_class[i] == 0 ?
0 : acc_per_class[i] / num_per_class[i];
}
}
......@@ -58,8 +60,11 @@ void AccuracyOp<Context>::RunOnDevice() {
Output(0)->Reshape(vector<TIndex>(1, 1));
if (OutputSize() > 1) Output(1)->Reshape(vector<TIndex>(1, num_classes));
if (XIsType(Input(0), float)) RunWithType<float>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
if (XIsType(Input(0), float)) {
if (XIsType(Input(1), float)) RunWithType<float, float>();
else if(XIsType(Input(1), int64_t)) RunWithType<float, int64_t>();
else LOG(FATAL) << DTypeHelper(Input(1), { "float32", "int64" });
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
}
DEPLOY_CPU(Accuracy);
......
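Note: a NumPy sketch of the top-k accuracy the operator computes after this change (float32 logits with float32 or int64 labels); the helper and its flattened (N, C) layout are illustrative, and ignored labels are skipped as intended.
import numpy as np
def topk_accuracy(logits, labels, top_k=1, ignore=()):
    acc = count = 0
    for scores, label in zip(logits, labels.astype('int64')):
        if label in ignore:
            continue
        topk_idx = np.argsort(-scores)[:top_k]  # indices of the k largest scores
        acc += int(label in topk_idx)
        count += 1
    return float(acc) / max(count, 1)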
......@@ -128,6 +128,8 @@ void CuDNNConv2dOp<Context>::RunOnDevice() {
#endif
#if CUDNN_VERSION_MIN(7, 0, 0)
CUDNN_CHECK(cudnnSetConvolutionGroupCount(conv_desc, this->group));
if (enable_tensor_core)
CUDNN_CHECK(cudnnSetConvolutionMathType(conv_desc, CUDNN_TENSOR_OP_MATH));
#endif
RunWithType<float>();
} else if (XIsType(Input(0), float16)) {
......@@ -148,6 +150,8 @@ void CuDNNConv2dOp<Context>::RunOnDevice() {
#endif
#if CUDNN_VERSION_MIN(7, 0, 0)
CUDNN_CHECK(cudnnSetConvolutionGroupCount(conv_desc, this->group));
if (enable_tensor_core)
CUDNN_CHECK(cudnnSetConvolutionMathType(conv_desc, CUDNN_TENSOR_OP_MATH));
#endif
RunWithType<float16>();
#endif // WITH_CUDA_FP16
......@@ -306,6 +310,8 @@ void CuDNNConv2dGradientOp<Context>::RunOnDevice() {
#endif
#if CUDNN_VERSION_MIN(7, 0, 0)
CUDNN_CHECK(cudnnSetConvolutionGroupCount(conv_desc, this->group));
if (enable_tensor_core)
CUDNN_CHECK(cudnnSetConvolutionMathType(conv_desc, CUDNN_TENSOR_OP_MATH));
#endif
RunWithType<float>();
} else if (XIsType(Input(0), float16)) {
......@@ -325,6 +331,8 @@ void CuDNNConv2dGradientOp<Context>::RunOnDevice() {
#endif
#if CUDNN_VERSION_MIN(7, 0, 0)
CUDNN_CHECK(cudnnSetConvolutionGroupCount(conv_desc, this->group));
if (enable_tensor_core)
CUDNN_CHECK(cudnnSetConvolutionMathType(conv_desc, CUDNN_TENSOR_OP_MATH));
#endif
RunWithType<float16>();
#endif // WITH_CUDA_FP16
......
......@@ -130,6 +130,8 @@ void CuDNNConv2dTransposeOp<Context>::RunOnDevice() {
#endif
#if CUDNN_VERSION_MIN(7, 0, 0)
CUDNN_CHECK(cudnnSetConvolutionGroupCount(conv_desc, this->group));
if (enable_tensor_core)
CUDNN_CHECK(cudnnSetConvolutionMathType(conv_desc, CUDNN_TENSOR_OP_MATH));
#endif
RunWithType<float>();
} else if (XIsType(Input(0), float16)) {
......@@ -150,6 +152,8 @@ void CuDNNConv2dTransposeOp<Context>::RunOnDevice() {
#endif
#if CUDNN_VERSION_MIN(7, 0, 0)
CUDNN_CHECK(cudnnSetConvolutionGroupCount(conv_desc, this->group));
if (enable_tensor_core)
CUDNN_CHECK(cudnnSetConvolutionMathType(conv_desc, CUDNN_TENSOR_OP_MATH));
#endif
RunWithType<float16>();
#endif // WITH_CUDA_FP16
......@@ -310,6 +314,8 @@ void CuDNNConv2dTransposeGradientOp<Context>::RunOnDevice() {
#endif
#if CUDNN_VERSION_MIN(7, 0, 0)
CUDNN_CHECK(cudnnSetConvolutionGroupCount(conv_desc, this->group));
if (enable_tensor_core)
CUDNN_CHECK(cudnnSetConvolutionMathType(conv_desc, CUDNN_TENSOR_OP_MATH));
#endif
RunWithType<float>();
} else if (XIsType(Input(0), float16)) {
......@@ -330,6 +336,8 @@ void CuDNNConv2dTransposeGradientOp<Context>::RunOnDevice() {
#endif
#if CUDNN_VERSION_MIN(7, 0, 0)
CUDNN_CHECK(cudnnSetConvolutionGroupCount(conv_desc, this->group));
if (enable_tensor_core)
CUDNN_CHECK(cudnnSetConvolutionMathType(conv_desc, CUDNN_TENSOR_OP_MATH));
#endif
RunWithType<float16>();
#endif // WITH_CUDA_FP16
......
......@@ -42,22 +42,6 @@ template <> void Set<int, CUDAContext>(const int n,
CUDA_POST_KERNEL_CHECK;
}
template <> void Set<float16, CUDAContext>(const int n,
const float16 alpha,
float16* x) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0)
_Set<half2> << <GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
dragon_cast<half2, float16>(alpha),
reinterpret_cast<half2*>(x));
else
_Set<float16> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, alpha, x);
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <> void RandomUniform<uint32_t, CUDAContext>(const int n,
const float low,
const float high,
......@@ -67,13 +51,6 @@ template <> void RandomUniform<uint32_t, CUDAContext>(const int n,
CURAND_CHECK(curandGenerate(curand_generator(), x, n));
}
template <> void RandomUniform<float16, CUDAContext>(const int n,
const float low,
const float high,
float16* x) {
NOT_IMPLEMENTED;
}
template <> void RandomNormal<float, CUDAContext>(const int n,
const float mu,
const float sigma,
......@@ -81,13 +58,6 @@ template <> void RandomNormal<float, CUDAContext>(const int n,
CURAND_CHECK(curandGenerateNormal(curand_generator(), x, n, mu, sigma));
}
template <> void RandomNormal<float16, CUDAContext>(const int n,
const float mu,
const float sigma,
float16* x) {
NOT_IMPLEMENTED;
}
template <> void RandomBernoulli<float, CUDAContext>(const int n,
const float p,
unsigned int* x) {
......@@ -113,46 +83,6 @@ template <> void Add<float, CUDAContext>(int n,
CUDA_POST_KERNEL_CHECK;
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _AddHalf(const int n, const half* a, const half* b, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hadd(a[idx], b[idx]);
#endif
}
}
template <typename T>
__global__ void _AddHalf2(const int n, const half2* a, const half2* b, half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hadd2(a[idx], b[idx]);
#endif
}
}
#endif
template <> void Add<float16, CUDAContext>(int n,
const float16* a,
const float16* b,
float16* y) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0)
_AddHalf2<half2> << <GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
reinterpret_cast<const half2*>(a),
reinterpret_cast<const half2*>(b),
reinterpret_cast<half2*>(y));
else _AddHalf<half> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <typename T>
__global__ void _Sub(const int n, const T* a, const T *b, T* y) {
CUDA_KERNEL_LOOP(idx, n) {
......@@ -168,46 +98,6 @@ template <> void Sub<float, CUDAContext>(int n,
CUDA_POST_KERNEL_CHECK;
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _SubHalf(const int n, const half* a, const half* b, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hsub(a[idx], b[idx]);
#endif
}
}
template <typename T>
__global__ void _SubHalf2(const int n, const half2* a, const half2* b, half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hsub2(a[idx], b[idx]);
#endif
}
}
#endif
template <> void Sub<float16, CUDAContext>(int n,
const float16* a,
const float16* b,
float16* y) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0)
_SubHalf2<half2> << <GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
reinterpret_cast<const half2*>(a),
reinterpret_cast<const half2*>(b),
reinterpret_cast<half2*>(y));
else _SubHalf<half> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <typename T>
__global__ void _Mul(const int n, const T* a, const T* b, T* y) {
CUDA_KERNEL_LOOP(idx, n) {
......@@ -223,46 +113,6 @@ template <> void Mul<float, CUDAContext>(int n,
CUDA_POST_KERNEL_CHECK;
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _MulHalf(const int n, const half* a, const half* b, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul(a[idx], b[idx]);
#endif
}
}
template <typename T>
__global__ void _MulHalf2(const int n, const half2* a, const half2* b, half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul2(a[idx], b[idx]);
#endif
}
}
#endif
template <> void Mul<float16, CUDAContext>(int n,
const float16* a,
const float16* b,
float16* y) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0)
_MulHalf2<half2> << <GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
reinterpret_cast<const half2*>(a),
reinterpret_cast<const half2*>(b),
reinterpret_cast<half2*>(y));
else _MulHalf<half> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <typename T>
__global__ void _Div(const int n, const T* a, const T* b, T* y) {
CUDA_KERNEL_LOOP(idx, n) {
......@@ -278,32 +128,6 @@ template <> void Div<float, CUDAContext>(int n,
CUDA_POST_KERNEL_CHECK;
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _DivHalf(const int n, const half* a, const half* b, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hdiv(a[idx], b[idx]);
#endif
}
}
#endif
template <> void Div<float16, CUDAContext>(int n,
const float16* a,
const float16* b,
float16* y) {
#ifdef WITH_CUDA_FP16
_DivHalf<half> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <typename T>
__global__ void _Clip(const int n, const T low, const T high, T* x) {
CUDA_KERNEL_LOOP(idx, n) {
......@@ -358,44 +182,6 @@ template <> void Square<float, CUDAContext>(int n,
CUDA_POST_KERNEL_CHECK;
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _SquareHalf(const int n, const half* x, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul(x[idx], x[idx]);
#endif
}
}
template <typename T>
__global__ void _SquareHalf2(const int n, const half2* x, half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul2(x[idx], x[idx]);
#endif
}
}
#endif
template <> void Square<float16, CUDAContext>(int n,
const float16* x,
float16* y) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0)
_SquareHalf2<half2> << < GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y));
else _SquareHalf<half> << < GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <typename T>
__global__ void _Sqrt(const int n, const T* x, T* y) {
CUDA_KERNEL_LOOP(idx, n) {
......@@ -410,43 +196,6 @@ template <> void Sqrt<float, CUDAContext>(int n,
CUDA_POST_KERNEL_CHECK;
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _SqrtHalf(const int n, const half* x, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = hsqrt(x[idx]);
#endif
}
}
template <typename T>
__global__ void _SqrtHalf2(const int n, const half2* x, half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = h2sqrt(x[idx]);
#endif
}
}
#endif
template <> void Sqrt<float16, CUDAContext>(int n,
const float16* x,
float16* y) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0)
_SqrtHalf2<half2> << < GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y));
else _SqrtHalf<half> << < GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <typename T>
__global__ void _Pow(const int n, const T alpha, const T* a, T* y) {
CUDA_KERNEL_LOOP(idx, n) {
......@@ -462,48 +211,6 @@ template <> void Pow<float, CUDAContext>(int n,
CUDA_POST_KERNEL_CHECK;
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _PowHalf(const int n, const float alpha, const half* a, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul(a[idx], a[idx]);
#endif
}
}
template <typename T>
__global__ void _PowHalf2(const int n, const float alpha, const half2* a, half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul2(a[idx], a[idx]);
#endif
}
}
#endif
template <> void Pow<float16, CUDAContext>(int n,
const float alpha,
const float16* x,
float16* y) {
#ifdef WITH_CUDA_FP16
    CHECK(alpha == float(2)) << "fp16 only supports the power of 2";
if (n % 2 == 0)
_PowHalf2<half2> << < GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
alpha,
reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y));
else _PowHalf<half> << < GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
alpha,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <typename T>
__global__ void _Inv(const int n, const float numerator, const T* x, T* y) {
CUDA_KERNEL_LOOP(idx, n) {
......@@ -518,47 +225,6 @@ template <> void Inv<float, CUDAContext>(const int n,
_Inv<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, numerator, x, y);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _InvHalf(const int n, const half numerator, const half* x, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul(hrcp(x[idx]), numerator);
#endif
}
}
template <typename T>
__global__ void _InvHalf2(const int n, const half2 numerator, const half2* x, half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul2(h2rcp(x[idx]), numerator);
#endif
}
}
#endif
template <> void Inv<float16, CUDAContext>(const int n,
const float numerator,
const float16* x,
float16* y) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0)
_InvHalf2<half2> << < GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
dragon_cast<half2, float>(numerator),
reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y));
else
_InvHalf<half> << < GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
dragon_cast<half, float>(numerator),
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
/******************** Level-2 ********************/
template <> void Scal<float, CUDAContext>(const int n, const float alpha, float* y) {
......@@ -568,18 +234,6 @@ template <> void Scal<float, CUDAContext>(const int n, const float alpha, float*
y, 1));
}
template <> void Scal<float16, CUDAContext>(const int n, const float alpha, float16* y) {
#ifdef WITH_CUDA_FP16
CUBLAS_CHECK(cublasScalEx(cublas_handle(),
n,
&alpha, CUDA_R_32F,
y, CUDA_R_16F, 1,
CUDA_R_32F));
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <> void Scale<float, CUDAContext>(const int n,
const float alpha,
const float* x,
......@@ -588,23 +242,13 @@ template <> void Scale<float, CUDAContext>(const int n,
CUBLAS_CHECK(cublasSscal_v2(cublas_handle(), n, &alpha, y, 1));
}
template <> void Scale<float16, CUDAContext>(const int n,
const float alpha,
const float16* x,
float16* y) {
CUDAContext ctx;
ctx.Copy<float16, CUDAContext, CUDAContext>(n, y, x);
Scal<float16, CUDAContext>(n, alpha, y);
}
template <> float StridedDot<float, CUDAContext>(const int n,
const float* a,
const int incx,
const float* b,
const int incy) {
float result;
CUBLAS_CHECK(cublasSdot_v2(cublas_handle(),
n,
CUBLAS_CHECK(cublasSdot_v2(cublas_handle(), n,
a, incx,
b, incy,
&result));
......@@ -615,22 +259,6 @@ template <> float Dot<float, CUDAContext>(int n, const float* a, const float* b)
return StridedDot<float, CUDAContext>(n, a, 1, b, 1);
}
template <> float Dot<float16, CUDAContext>(int n, const float16* a, const float16* b) {
#ifdef WITH_CUDA_FP16
float16 result;
CUBLAS_CHECK(cublasDotEx(cublas_handle(),
n,
        a, CUDA_R_16F, 1,
        b, CUDA_R_16F, 1,
&result, CUDA_R_16F,
CUDA_R_32F));
return dragon_cast<float, float16>(result);
#else
CUDA_FP16_NOT_COMPILED;
return 0.;
#endif
}
template <> float ASum<float, CUDAContext>(const int n, const float* x) {
return cublasSasum(n, x, 1);
}
......@@ -646,43 +274,6 @@ template <> void AddScalar<float, CUDAContext>(const int n, const float alpha, f
_AddScalar<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, alpha, y);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _AddScalarHalf(const int n, half alpha, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hadd(y[idx], alpha);
#endif
}
}
template <typename T>
__global__ void _AddScalarHalf2(const int n, half2 alpha, half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hadd2(y[idx], alpha);
#endif
}
}
#endif
template <> void AddScalar<float16, CUDAContext>(const int n, const float alpha, float16* y) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0) {
_AddScalarHalf2<half2> << <GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
dragon_cast<half2, float>(alpha),
reinterpret_cast<half2*>(y));
} else {
_AddScalarHalf<half> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
dragon_cast<half, float>(alpha),
reinterpret_cast<half*>(y));
}
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <typename T>
__global__ void _MulScalar(const int n, T alpha, T* y) {
CUDA_KERNEL_LOOP(idx, n) {
......@@ -694,69 +285,15 @@ template <> void MulScalar<float, CUDAContext>(const int n, const float alpha, f
_MulScalar<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, alpha, y);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _MulScalarHalf(const int n, half alpha, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul(y[idx], alpha);
#endif
}
}
template <typename T>
__global__ void _MulScalarHalf2(const int n, half2 alpha, half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul2(y[idx], alpha);
#endif
}
}
#endif
template <> void MulScalar<float16, CUDAContext>(const int n, const float alpha, float16* y) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0) {
_MulScalarHalf2<half2> << <GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
dragon_cast<half2, float>(alpha),
reinterpret_cast<half2*>(y));
} else {
_MulScalarHalf<half> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
dragon_cast<half, float>(alpha),
reinterpret_cast<half*>(y));
}
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <> void Axpy<float, CUDAContext>(const int n,
float alpha,
const float* x,
float* y) {
CUBLAS_CHECK(cublasSaxpy_v2(cublas_handle(),
n,
CUBLAS_CHECK(cublasSaxpy_v2(cublas_handle(), n,
&alpha, x, 1,
y, 1));
}
template <> void Axpy<float16, CUDAContext>(const int n,
float alpha,
const float16* x,
float16* y) {
#ifdef WITH_CUDA_FP16
CUBLAS_CHECK(cublasAxpyEx(cublas_handle(),
n,
&alpha, CUDA_R_32F,
x, CUDA_R_16F, 1,
y, CUDA_R_16F, 1,
CUDA_R_32F));
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <> void Axpby<float, CUDAContext>(const int n,
float alpha,
const float* x,
......@@ -766,17 +303,6 @@ template <> void Axpby<float, CUDAContext>(const int n,
Axpy<float, CUDAContext>(n, alpha, x, y);
}
template <> void Axpby<float16, CUDAContext>(const int n,
float alpha,
const float16* x,
float beta,
float16* y) {
Scal<float16, CUDAContext>(n, beta, y);
Axpy<float16, CUDAContext>(n, alpha, x, y);
}
/******************** Level-3 ********************/
template <> void RandomUniform<float, CUDAContext>(const int n,
const float low,
const float high,
......@@ -787,6 +313,8 @@ template <> void RandomUniform<float, CUDAContext>(const int n,
if (low != float(0)) AddScalar<float, CUDAContext>(n, low, x);
}
/******************** Level-3 ********************/
template <> void Gemm<float, CUDAContext>(const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB,
const int M,
......@@ -813,51 +341,6 @@ template <> void Gemm<float, CUDAContext>(const CBLAS_TRANSPOSE transA,
C, N));
}
template <> void Gemm<float16, CUDAContext>(const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB,
const int M,
const int N,
const int K,
const float alpha,
const float16* A,
const float16* B,
const float beta,
float16 *C,
TensorProto_DataType math_type) {
#ifdef WITH_CUDA_FP16
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
cublasOperation_t cuTransA = (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB = (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
if (math_type == TensorProto_DataType_FLOAT) {
const float _alpha_ = alpha, _beta_ = beta;
CUBLAS_CHECK(cublasSgemmEx(cublas_handle(),
cuTransB, cuTransA,
N, M, K,
&_alpha_,
B, CUDA_R_16F, ldb,
A, CUDA_R_16F, lda,
&_beta_,
C, CUDA_R_16F, N));
} else if (math_type == TensorProto_DataType_FLOAT16) {
const half _alpha_ = dragon_cast<half, float>(alpha);
const half _beta_ = dragon_cast<half, float>(beta);
CUBLAS_CHECK(cublasHgemm(cublas_handle(),
cuTransB, cuTransA,
N, M, K,
&_alpha_,
reinterpret_cast<const half*>(B), ldb,
reinterpret_cast<const half*>(A), lda,
&_beta_,
reinterpret_cast<half*>(C), N));
} else {
LOG(FATAL) << "Unsupported math type";
}
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <> void Gemv<float, CUDAContext>(const CBLAS_TRANSPOSE transA,
const int M, const int N,
const float alpha,
......@@ -878,50 +361,6 @@ template <> void Gemv<float, CUDAContext>(const CBLAS_TRANSPOSE transA,
y, 1));
}
template <> void Gemv<float16, CUDAContext>(const CBLAS_TRANSPOSE transA,
const int M,
const int N,
const float alpha,
const float16* A,
const float16* x,
const float beta,
float16* y,
TensorProto_DataType math_type) {
#ifdef WITH_CUDA_FP16
cublasOperation_t cuTransA = (transA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
int m = (cuTransA == CUBLAS_OP_N) ? N : M;
int k = (cuTransA == CUBLAS_OP_N) ? M : N;
int LDA = (cuTransA == CUBLAS_OP_N) ? m : k;
int LDC = m;
const float _alpha_ = alpha, _beta_ = beta;
if (math_type == TensorProto_DataType_FLOAT) {
CUBLAS_CHECK(cublasSgemmEx(cublas_handle(),
cuTransA, CUBLAS_OP_N,
m, 1, k,
&_alpha_,
A, CUDA_R_16F, LDA,
x, CUDA_R_16F, k,
&_beta_,
y, CUDA_R_16F, LDC));
} else if (math_type == TensorProto_DataType_FLOAT16) {
const half _alpha_ = dragon_cast<half, float>(alpha);
const half _beta_ = dragon_cast<half, float>(beta);
CUBLAS_CHECK(cublasHgemm(cublas_handle(),
cuTransA, CUBLAS_OP_N,
m, 1, k,
&_alpha_,
reinterpret_cast<const half*>(A), LDA,
reinterpret_cast<const half*>(x), k,
&_beta_,
reinterpret_cast<half*>(y), LDC));
} else {
LOG(FATAL) << "Unsupported math type";
}
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
} // namespace math
} // namespace dragon
......
#ifdef WITH_CUDA
#include <cmath>
#include "core/context_cuda.h"
#include "utils/cuda_device.h"
#include "utils/math_functions.h"
#include "utils/cast.h"
namespace dragon {
namespace math {
/******************** Level-0 ********************/
template <typename T>
__global__ void _SetHalf(const int n, const T alpha, T* x) {
CUDA_KERNEL_LOOP(idx, n) {
x[idx] = alpha;
}
}
template <> void Set<float16, CUDAContext>(const int n,
const float16 alpha,
float16* x) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0)
_SetHalf<half2> << <GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
dragon_cast<half2, float16>(alpha),
reinterpret_cast<half2*>(x));
else
_SetHalf<float16> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, alpha, x);
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <> void RandomUniform<float16, CUDAContext>(const int n,
const float low,
const float high,
float16* x) {
NOT_IMPLEMENTED;
}
template <> void RandomNormal<float16, CUDAContext>(const int n,
const float mu,
const float sigma,
float16* x) {
NOT_IMPLEMENTED;
}
/******************** Level-1 ********************/
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _AddHalf(const int n, const half* a, const half* b, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hadd(a[idx], b[idx]);
#endif
}
}
template <typename T>
__global__ void _AddHalf2(const int n, const half2* a, const half2* b, half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hadd2(a[idx], b[idx]);
#endif
}
}
#endif
template <> void Add<float16, CUDAContext>(int n,
const float16* a,
const float16* b,
float16* y) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0)
_AddHalf2<half2> << <GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
reinterpret_cast<const half2*>(a),
reinterpret_cast<const half2*>(b),
reinterpret_cast<half2*>(y));
else _AddHalf<half> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
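// A minimal standalone sketch of the half2 packing used by the fp16
// element-wise paths in this file: when n is even, the buffers are
// reinterpreted as n/2 half2 values so a single __hadd2 handles two
// elements per thread; otherwise the scalar half path is taken.
// Assumes compute capability >= 5.3; VecAddHalf2 / VecAddHalf / VecAdd
// are illustrative names, not part of Dragon.
#include <cuda_fp16.h>

__global__ void VecAddHalf2(const int n2, const half2* a, const half2* b, half2* y) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n2) y[i] = __hadd2(a[i], b[i]);    // two fp16 adds per lane
}

__global__ void VecAddHalf(const int n, const half* a, const half* b, half* y) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) y[i] = __hadd(a[i], b[i]);      // scalar fp16 fallback
}

void VecAdd(const int n, const half* a, const half* b, half* y) {
    const int threads = 256;
    if (n % 2 == 0) {
        const int n2 = n / 2;
        VecAddHalf2<<<(n2 + threads - 1) / threads, threads>>>(n2,
            reinterpret_cast<const half2*>(a),
            reinterpret_cast<const half2*>(b),
            reinterpret_cast<half2*>(y));
    } else {
        VecAddHalf<<<(n + threads - 1) / threads, threads>>>(n, a, b, y);
    }
}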
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _SubHalf(const int n, const half* a, const half* b, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hsub(a[idx], b[idx]);
#endif
}
}
template <typename T>
__global__ void _SubHalf2(const int n, const half2* a, const half2* b, half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hsub2(a[idx], b[idx]);
#endif
}
}
#endif
template <> void Sub<float16, CUDAContext>(int n,
const float16* a,
const float16* b,
float16* y) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0)
_SubHalf2<half2> << <GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
reinterpret_cast<const half2*>(a),
reinterpret_cast<const half2*>(b),
reinterpret_cast<half2*>(y));
else _SubHalf<half> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _MulHalf(const int n, const half* a, const half* b, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul(a[idx], b[idx]);
#endif
}
}
template <typename T>
__global__ void _MulHalf2(const int n, const half2* a, const half2* b, half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul2(a[idx], b[idx]);
#endif
}
}
#endif
template <> void Mul<float16, CUDAContext>(int n,
const float16* a,
const float16* b,
float16* y) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0)
_MulHalf2<half2> << <GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
reinterpret_cast<const half2*>(a),
reinterpret_cast<const half2*>(b),
reinterpret_cast<half2*>(y));
else _MulHalf<half> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _DivHalf(const int n, const half* a, const half* b, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hdiv(a[idx], b[idx]);
#endif
}
}
#endif
template <> void Div<float16, CUDAContext>(int n,
const float16* a,
const float16* b,
float16* y) {
#ifdef WITH_CUDA_FP16
_DivHalf<half> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _SquareHalf(const int n, const half* x, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul(x[idx], x[idx]);
#endif
}
}
template <typename T>
__global__ void _SquareHalf2(const int n, const half2* x, half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul2(x[idx], x[idx]);
#endif
}
}
#endif
template <> void Square<float16, CUDAContext>(int n,
const float16* x,
float16* y) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0)
_SquareHalf2<half2> << < GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y));
else _SquareHalf<half> << < GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _SqrtHalf(const int n, const half* x, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = hsqrt(x[idx]);
#endif
}
}
template <typename T>
__global__ void _SqrtHalf2(const int n, const half2* x, half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = h2sqrt(x[idx]);
#endif
}
}
#endif
template <> void Sqrt<float16, CUDAContext>(int n,
const float16* x,
float16* y) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0)
_SqrtHalf2<half2> << < GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y));
else _SqrtHalf<half> << < GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _PowHalf(const int n, const float alpha, const half* a, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul(a[idx], a[idx]);
#endif
}
}
template <typename T>
__global__ void _PowHalf2(const int n, const float alpha, const half2* a, half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul2(a[idx], a[idx]);
#endif
}
}
#endif
template <> void Pow<float16, CUDAContext>(int n,
const float alpha,
const float16* x,
float16* y) {
#ifdef WITH_CUDA_FP16
    CHECK(alpha == float(2)) << "fp16 only supports the power of 2";
if (n % 2 == 0)
_PowHalf2<half2> << < GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
alpha,
reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y));
else _PowHalf<half> << < GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
alpha,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _InvHalf(const int n, const half numerator, const half* x, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul(hrcp(x[idx]), numerator);
#endif
}
}
template <typename T>
__global__ void _InvHalf2(const int n, const half2 numerator, const half2* x, half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul2(h2rcp(x[idx]), numerator);
#endif
}
}
#endif
template <> void Inv<float16, CUDAContext>(const int n,
const float numerator,
const float16* x,
float16* y) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0)
_InvHalf2<half2> << < GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
dragon_cast<half2, float>(numerator),
reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y));
else
_InvHalf<half> << < GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
dragon_cast<half, float>(numerator),
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
/******************** Level-2 ********************/
template <> void Scal<float16, CUDAContext>(const int n, const float alpha, float16* y) {
#ifdef WITH_CUDA_FP16
CUBLAS_CHECK(cublasScalEx(cublas_handle(), n,
&alpha, CUDA_R_32F,
y, CUDA_R_16F, 1,
CUDA_R_32F));
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <> void Scale<float16, CUDAContext>(const int n,
const float alpha,
const float16* x,
float16* y) {
CUDAContext ctx;
ctx.Copy<float16, CUDAContext, CUDAContext>(n, y, x);
Scal<float16, CUDAContext>(n, alpha, y);
}
template <> float Dot<float16, CUDAContext>(int n, const float16* a, const float16* b) {
#ifdef WITH_CUDA_FP16
float16 result;
CUBLAS_CHECK(cublasDotEx(cublas_handle(),
n,
        a, CUDA_R_16F, 1,
        b, CUDA_R_16F, 1,
&result, CUDA_R_16F,
CUDA_R_32F));
return dragon_cast<float, float16>(result);
#else
CUDA_FP16_NOT_COMPILED;
return 0.;
#endif
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _AddScalarHalf(const int n, half alpha, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hadd(y[idx], alpha);
#endif
}
}
template <typename T>
__global__ void _AddScalarHalf2(const int n, half2 alpha, half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hadd2(y[idx], alpha);
#endif
}
}
#endif
template <> void AddScalar<float16, CUDAContext>(const int n, const float alpha, float16* y) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0) {
_AddScalarHalf2<half2> << <GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
dragon_cast<half2, float>(alpha),
reinterpret_cast<half2*>(y));
} else {
_AddScalarHalf<half> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
dragon_cast<half, float>(alpha),
reinterpret_cast<half*>(y));
}
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _MulScalarHalf(const int n, half alpha, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul(y[idx], alpha);
#endif
}
}
template <typename T>
__global__ void _MulScalarHalf2(const int n, half2 alpha, half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul2(y[idx], alpha);
#endif
}
}
#endif
template <> void MulScalar<float16, CUDAContext>(const int n, const float alpha, float16* y) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0) {
_MulScalarHalf2<half2> << <GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
dragon_cast<half2, float>(alpha),
reinterpret_cast<half2*>(y));
} else {
_MulScalarHalf<half> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
dragon_cast<half, float>(alpha),
reinterpret_cast<half*>(y));
}
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <> void Axpy<float16, CUDAContext>(const int n,
float alpha,
const float16* x,
float16* y) {
#ifdef WITH_CUDA_FP16
CUBLAS_CHECK(cublasAxpyEx(cublas_handle(), n,
&alpha, CUDA_R_32F,
x, CUDA_R_16F, 1,
y, CUDA_R_16F, 1,
CUDA_R_32F));
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <> void Axpby<float16, CUDAContext>(const int n,
float alpha,
const float16* x,
float beta,
float16* y) {
Scal<float16, CUDAContext>(n, beta, y);
Axpy<float16, CUDAContext>(n, alpha, x, y);
}
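// Written out element-wise, the fp16 Axpby above computes
// y[i] = alpha * x[i] + beta * y[i], realized as a Scal (y *= beta)
// followed by an Axpy (y += alpha * x). A minimal host reference of that
// semantics; axpby_ref is an illustrative name only.
static void axpby_ref(const int n, const float alpha, const float* x,
                      const float beta, float* y) {
    for (int i = 0; i < n; ++i) y[i] = alpha * x[i] + beta * y[i];
}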
/******************** Level-3 ********************/
template <> void Gemm<float16, CUDAContext>(const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB,
const int M,
const int N,
const int K,
const float alpha,
const float16* A,
const float16* B,
const float beta,
float16 *C,
TensorProto_DataType math_type) {
#ifdef WITH_CUDA_FP16
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
cublasOperation_t cuTransA = (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB = (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
if (math_type == TensorProto_DataType_FLOAT) {
const float _alpha_ = alpha, _beta_ = beta;
#if CUDA_VERSION >= 9000
if (TENSOR_CORE_AVAILABLE()) {
// GEMM + MATH32 + TENSOR-CORE
CUBLAS_CHECK(cublasGemmEx(cublas_handle(),
cuTransB, cuTransA,
N, M, K,
&_alpha_,
B, CUDA_R_16F, ldb,
A, CUDA_R_16F, lda,
&_beta_,
C, CUDA_R_16F, N,
CUDA_R_32F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
} else {
// GEMM + MATH32 + DEFAULT
CUBLAS_CHECK(cublasSgemmEx(cublas_handle(),
cuTransB, cuTransA,
N, M, K,
&_alpha_,
B, CUDA_R_16F, ldb,
A, CUDA_R_16F, lda,
&_beta_,
C, CUDA_R_16F, N));
}
#else
CUBLAS_CHECK(cublasSgemmEx(cublas_handle(),
cuTransB, cuTransA,
N, M, K,
&_alpha_,
B, CUDA_R_16F, ldb,
A, CUDA_R_16F, lda,
&_beta_,
C, CUDA_R_16F, N));
#endif
} else if (math_type == TensorProto_DataType_FLOAT16) {
const half _alpha_ = dragon_cast<half, float>(alpha);
const half _beta_ = dragon_cast<half, float>(beta);
#if CUDA_VERSION >= 9000
if (TENSOR_CORE_AVAILABLE()) {
// GEMM + MATH16 + TENSOR-CORE
CUBLAS_CHECK(cublasGemmEx(cublas_handle(),
cuTransB, cuTransA,
N, M, K,
&_alpha_,
B, CUDA_R_16F, ldb,
A, CUDA_R_16F, lda,
&_beta_,
C, CUDA_R_16F, N,
CUDA_R_16F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
} else {
// GEMM + MATH16 + DEFAULT
CUBLAS_CHECK(cublasHgemm(cublas_handle(),
cuTransB, cuTransA,
N, M, K,
&_alpha_,
reinterpret_cast<const half*>(B), ldb,
reinterpret_cast<const half*>(A), lda,
&_beta_,
reinterpret_cast<half*>(C), N));
}
#else
CUBLAS_CHECK(cublasHgemm(cublas_handle(),
cuTransB, cuTransA,
N, M, K,
&_alpha_,
reinterpret_cast<const half*>(B), ldb,
reinterpret_cast<const half*>(A), lda,
&_beta_,
reinterpret_cast<half*>(C), N));
#endif
} else {
LOG(FATAL) << "Unsupported math type";
}
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
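// The fp16 Gemm above keeps the row-major convention of the float path:
// A is M x K, B is K x N, C is M x N, all row-major. Because cuBLAS is
// column-major, the call is issued as C^T = op(B)^T * op(A)^T, which is why
// B and its leading dimension are passed before A, with ldc = N. A naive host
// reference for the no-transpose case pins down that convention
// (gemm_ref is an illustrative name only):
static void gemm_ref(const int M, const int N, const int K,
                     const float alpha, const float* A, const float* B,
                     const float beta, float* C) {
    for (int m = 0; m < M; ++m) {
        for (int n = 0; n < N; ++n) {
            float acc = 0.f;
            for (int k = 0; k < K; ++k)
                acc += A[m * K + k] * B[k * N + n];    // row-major indexing
            C[m * N + n] = alpha * acc + beta * C[m * N + n];
        }
    }
}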
template <> void Gemv<float16, CUDAContext>(const CBLAS_TRANSPOSE transA,
const int M,
const int N,
const float alpha,
const float16* A,
const float16* x,
const float beta,
float16* y,
TensorProto_DataType math_type) {
#ifdef WITH_CUDA_FP16
cublasOperation_t cuTransA = (transA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
int m = (cuTransA == CUBLAS_OP_N) ? N : M;
int k = (cuTransA == CUBLAS_OP_N) ? M : N;
int LDA = (cuTransA == CUBLAS_OP_N) ? m : k;
int LDC = m;
const float _alpha_ = alpha, _beta_ = beta;
if (math_type == TensorProto_DataType_FLOAT) {
#if CUDA_VERSION >= 9000
if (TENSOR_CORE_AVAILABLE()) {
// GEMV + MATH32 + TENSOR-CORE
CUBLAS_CHECK(cublasGemmEx(cublas_handle(),
cuTransA, CUBLAS_OP_N,
m, 1, k,
&_alpha_,
A, CUDA_R_16F, LDA,
x, CUDA_R_16F, k,
&_beta_,
y, CUDA_R_16F, LDC,
CUDA_R_32F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
} else {
// GEMV + MATH32 + DEFAULT
CUBLAS_CHECK(cublasSgemmEx(cublas_handle(),
cuTransA, CUBLAS_OP_N,
m, 1, k,
&_alpha_,
A, CUDA_R_16F, LDA,
x, CUDA_R_16F, k,
&_beta_,
y, CUDA_R_16F, LDC));
}
#else
CUBLAS_CHECK(cublasSgemmEx(cublas_handle(),
cuTransA, CUBLAS_OP_N,
m, 1, k,
&_alpha_,
A, CUDA_R_16F, LDA,
x, CUDA_R_16F, k,
&_beta_,
y, CUDA_R_16F, LDC));
#endif
} else if (math_type == TensorProto_DataType_FLOAT16) {
const half _alpha_ = dragon_cast<half, float>(alpha);
const half _beta_ = dragon_cast<half, float>(beta);
#if CUDA_VERSION >= 9000
if (TENSOR_CORE_AVAILABLE()) {
// GEMV + MATH16 + TENSOR-CORE
CUBLAS_CHECK(cublasGemmEx(cublas_handle(),
cuTransA, CUBLAS_OP_N,
m, 1, k,
&_alpha_,
A, CUDA_R_16F, LDA,
x, CUDA_R_16F, k,
&_beta_,
y, CUDA_R_16F, LDC,
CUDA_R_16F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
} else {
// GEMV + MATH16 + DEFAULT
CUBLAS_CHECK(cublasHgemm(cublas_handle(),
cuTransA, CUBLAS_OP_N,
m, 1, k,
&_alpha_,
reinterpret_cast<const half*>(A), LDA,
reinterpret_cast<const half*>(x), k,
&_beta_,
reinterpret_cast<half*>(y), LDC));
}
#else
CUBLAS_CHECK(cublasHgemm(cublas_handle(),
cuTransA, CUBLAS_OP_N,
m, 1, k,
&_alpha_,
reinterpret_cast<const half*>(A), LDA,
reinterpret_cast<const half*>(x), k,
&_beta_,
reinterpret_cast<half*>(y), LDC));
#endif
} else {
LOG(FATAL) << "Unsupported math type";
}
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
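// The fp16 Gemv above is issued as a one-column GEMM: with A stored
// row-major as M x N and transA == CblasNoTrans, it computes
// y[m] = alpha * sum_n A[m, n] * x[n] + beta * y[m]. A short host
// reference of that shape mapping (gemv_ref is an illustrative name only):
static void gemv_ref(const int M, const int N, const float alpha,
                     const float* A, const float* x,
                     const float beta, float* y) {
    for (int m = 0; m < M; ++m) {
        float acc = 0.f;
        for (int n = 0; n < N; ++n) acc += A[m * N + n] * x[n];
        y[m] = alpha * acc + beta * y[m];
    }
}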
} // namespace math
} // namespace dragon
#endif // WITH_CUDA
\ No newline at end of file
......@@ -6,6 +6,7 @@
#include "utils/omp_alternative.h"
#include "utils/sse_alternative.h"
#include "utils/math_functions.h"
#include "utils/cast.h"
bool judge(int a, int b) { return unsigned(a) < unsigned(b); }
......@@ -359,6 +360,70 @@ template<> void TanhGrad<float, CPUContext>(const int count,
}
}
/******************** arithmetic.affine ********************/
template<> void Affine<float, CPUContext>(const int count,
const int outer_dim,
const int scale_dim,
const int inner_dim,
const float* x,
const float* alpha,
const float* beta,
const float* beta_multiplier,
float* y) {
// Ax
auto* Xdata = x; auto* Ydata = y;
for (int n = 0; n < outer_dim; ++n) {
for (int d = 0; d < scale_dim; ++d) {
math::Scale<float, CPUContext>(inner_dim, alpha[d], Xdata, Ydata);
Xdata += inner_dim;
Ydata += inner_dim;
}
}
// Pb
if (beta != nullptr && beta_multiplier != nullptr) {
int dim = scale_dim * inner_dim;
Ydata = y;
for (int n = 0; n < outer_dim; ++n) {
math::Gemm<float, CPUContext>(CblasNoTrans, CblasNoTrans,
scale_dim, inner_dim, 1,
1.0,
beta, beta_multiplier,
1.0,
Ydata);
Ydata += dim;
}
}
}
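// Element-wise, the Affine above treats x as (outer_dim, scale_dim, inner_dim)
// and computes y[n][d][i] = alpha[d] * x[n][d][i] + beta[d]; the bias term is
// added through the rank-1 Gemm with beta_multiplier, assumed here to be a
// vector of ones of length inner_dim. A minimal reference sketch under that
// assumption (affine_ref is an illustrative name only):
static void affine_ref(const int outer_dim, const int scale_dim, const int inner_dim,
                       const float* x, const float* alpha, const float* beta,
                       float* y) {
    for (int n = 0; n < outer_dim; ++n)
        for (int d = 0; d < scale_dim; ++d)
            for (int i = 0; i < inner_dim; ++i) {
                const int idx = (n * scale_dim + d) * inner_dim + i;
                y[idx] = alpha[d] * x[idx] + (beta != nullptr ? beta[d] : 0.f);
            }
}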
template<> void Affine<float16, CPUContext>(const int count,
const int outer_dim,
const int scale_dim,
const int inner_dim,
const float16* x,
const float16* alpha,
const float16* beta,
const float16* beta_multiplier,
float16* y) {
LOG(FATAL) << "float16 is unsupported for CPUContext.";
}
template <> void AffineGrad<float, CPUContext>(const int count,
const int outer_dim,
const int scale_dim,
const int inner_dim,
const float* dy,
const float* alpha,
float* dx) {
auto* dYdata = dy; auto* dXdata = dx;
for (int n = 0; n < outer_dim; ++n) {
for (int d = 0; d < scale_dim; ++d) {
math::Scale<float, CPUContext>(inner_dim, alpha[d], dYdata, dXdata);
dYdata += inner_dim; dXdata += inner_dim;
}
}
}
/******************** arithmetic.bias_add ********************/
template<> void BiasAdd<float, CPUContext>(const int count,
......@@ -408,77 +473,6 @@ template <> void Clip<float, CPUContext>(const int count,
}
}
/******************** arithmetic.scale ********************/
template<> void Scale<float, CPUContext>(const int axis,
Tensor* x,
Tensor* gamma,
Tensor* beta,
Tensor* BMul,
Tensor* y) {
int outer_dim = x->count(0, axis);
int inner_dim = x->count(axis + gamma->ndim());
int scale_dim = gamma->count();
auto* Xdata = x->data<float, CPUContext>();
auto* Ydata = y->mutable_data<float, CPUContext>();
auto* Sdata = gamma->data<float, CPUContext>();
auto* Bdata = beta != nullptr ?
beta->data<float, CPUContext>() :
nullptr;
auto* BMul_data = BMul != nullptr ?
BMul->data<float, CPUContext>() :
nullptr;
for (int n = 0; n < outer_dim; ++n) {
for (int d = 0; d < scale_dim; ++d) {
const float factor = Sdata[d];
math::Scale<float, CPUContext>(inner_dim, factor, Xdata, Ydata);
Xdata += inner_dim;
Ydata += inner_dim;
}
}
if (Bdata != nullptr) {
int dim = scale_dim * inner_dim;
Ydata = y->mutable_data<float, CPUContext>();
for (int n = 0; n < outer_dim; ++n) {
math::Gemm<float, CPUContext>(CblasNoTrans, CblasNoTrans,
scale_dim, inner_dim, 1,
1.0,
Bdata, BMul_data,
1.0,
Ydata);
Ydata += dim;
}
}
}
template<> void Scale<float16, CPUContext>(const int axis,
Tensor* x,
Tensor* gamma,
Tensor* beta,
Tensor* BMul,
Tensor* y) {
LOG(FATAL) << "float16 is unsupported for CPUContext.";
}
template <> void ScaleGrad<float, CPUContext>(const int axis,
Tensor* dy,
Tensor* gamma,
Tensor* dx) {
int outer_dim = dx->count(0, axis);
int inner_dim = dx->count(axis + gamma->ndim());
int scale_dim = gamma->count();
auto* dYdata = dy->data<float, CPUContext>();
auto* dXdata = dx->mutable_data<float, CPUContext>();
auto* Sdata = gamma->data<float, CPUContext>();
for (int n = 0; n < outer_dim; ++n) {
for (int d = 0; d < scale_dim; ++d) {
const float factor = Sdata[d];
math::Scale<float, CPUContext>(inner_dim, factor, dYdata, dXdata);
dYdata += inner_dim; dXdata += inner_dim;
}
}
}
/******************** control_flow.compare ********************/
template <> void Equal<float, CPUContext>(const int count,
......@@ -809,7 +803,7 @@ template<> void SparseSoftmaxFocalLossGrad<float, CPUContext>(const int count,
}
}
/******************** misc.dtype ********************/
/******************** misc.astype ********************/
template <typename Ta, typename Tb>
void _TypeA2B(const int count, const Ta* a, Tb* b) {
......@@ -819,6 +813,14 @@ void _TypeA2B(const int count, const Ta* a, Tb* b) {
for (int i = 0; i < count; ++i) b[i] = a[i];
}
template <typename Ta, typename Tb>
void _TypeA2B_v2(const int count, const Ta* a, Tb* b) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) b[i] = dragon_cast<Tb, Ta>(a[i]);
}
#define DEFINE_TYPE_A2B(type_a, type_b) \
template <> void TypeA2B<type_a, type_b, CPUContext>(const int count, \
const type_a* a, \
......@@ -826,6 +828,13 @@ void _TypeA2B(const int count, const Ta* a, Tb* b) {
_TypeA2B<type_a, type_b>(count, a, b); \
}
#define DEFINE_TYPE_A2B_V2(type_a, type_b) \
template <> void TypeA2B<type_a, type_b, CPUContext>(const int count, \
const type_a* a, \
type_b* b) { \
_TypeA2B_v2<type_a, type_b>(count, a, b); \
}
#define DEFINE_TYPE_DISABLE_FP16(type) \
template <> void TypeA2B<float16, type, CPUContext>(const int count, \
const float16* a, \
......@@ -845,13 +854,10 @@ void _TypeA2B(const int count, const Ta* a, Tb* b) {
DEFINE_TYPE_A2B(type_a, int64_t); \
DEFINE_TYPE_A2B(type_a, uint8_t);
template <> void TypeA2B<float16, float16, CPUContext>(const int count,
const float16* a,
float16* b) {
LOG(FATAL) << "float16 is unsupported for CPUContext.";
}
DEFINE_TYPE_A2ALL(float); DEFINE_TYPE_DISABLE_FP16(float);
DEFINE_TYPE_A2B_V2(float16, float);
DEFINE_TYPE_A2B_V2(float, float16);
DEFINE_TYPE_A2B_V2(float16, float16);
DEFINE_TYPE_A2ALL(float);
DEFINE_TYPE_A2ALL(double); DEFINE_TYPE_DISABLE_FP16(double);
DEFINE_TYPE_A2ALL(int); DEFINE_TYPE_DISABLE_FP16(int);
DEFINE_TYPE_A2ALL(int64_t); DEFINE_TYPE_DISABLE_FP16(int64_t);
......
......@@ -14,18 +14,13 @@ namespace dragon {
namespace kernel {
template <typename T>
__global__ void _Empty() { }
__global__ void _Empty() {}
template<> void Empty<float, CUDAContext>() {
_Empty<float> << <1, 1 >> >();
CUDA_POST_KERNEL_CHECK;
}
template<> void Empty<float16, CUDAContext>() {
_Empty<float16> << <1, 1 >> >();
CUDA_POST_KERNEL_CHECK;
}
/******************** activation.dropout ********************/
template<typename T>
......@@ -357,49 +352,6 @@ template<> void Relu<float, CUDAContext>(const int count,
CUDA_POST_KERNEL_CHECK;
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _ReluHalf(const int count, const half* x, const half slope, half* y) {
const half kZero = __float2half(0.f);
CUDA_KERNEL_LOOP(idx, count) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hgt(x[idx], kZero) ? x[idx] : __hmul(x[idx], slope);
#endif
}
}
template <typename T>
__global__ void _ReluHalf2(const int count, const half2* x, const half2 slope, half2* y) {
const half2 kZero = __float2half2_rn(0.f);
CUDA_KERNEL_LOOP(idx, count) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hbgt2(x[idx], kZero) ? x[idx] : __hmul2(x[idx], slope);
#endif
}
}
#endif
template<> void Relu<float16, CUDAContext>(const int count,
const float16* x,
const float slope,
float16* y) {
#ifdef WITH_CUDA_FP16
if (count % 2 == 0)
_ReluHalf2<half2> << < GET_BLOCKS(count), CUDA_NUM_THREADS >> > (count / 2,
reinterpret_cast<const half2*>(x),
dragon_cast<half2, float>(slope),
reinterpret_cast<half2*>(y));
else
_ReluHalf<half> << < GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
reinterpret_cast<const half*>(x),
dragon_cast<half, float>(slope),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <typename T>
__global__ void _ReluGrad(const int count,
const T* dy,
......@@ -668,6 +620,69 @@ template<> void TanhGrad<float, CUDAContext>(const int count,
CUDA_POST_KERNEL_CHECK;
}
/******************** arithmetic.affine ********************/
template <typename T>
__global__ void _AffineWithOBias(const int count,
const int scale_dim,
const int inner_dim,
const T* x,
const T* alpha,
T* y) {
CUDA_KERNEL_LOOP(idx, count) {
const int scale_idx = (idx / inner_dim) % scale_dim;
y[idx] = alpha[scale_idx] * x[idx];
}
}
template <typename T>
__global__ void _AffineWithBias(const int count,
const int scale_dim,
const int inner_dim,
const T* x,
const T* alpha,
const T* beta,
T* y) {
CUDA_KERNEL_LOOP(idx, count) {
const int scale_idx = (idx / inner_dim) % scale_dim;
y[idx] = alpha[scale_idx] * x[idx] + beta[scale_idx];
}
}
template<> void Affine<float, CUDAContext>(const int count,
const int outer_dim,
const int scale_dim,
const int inner_dim,
const float* x,
const float* alpha,
const float* beta,
const float* beta_multiplier,
float* y) {
if (beta != nullptr) {
_AffineWithBias<float> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
scale_dim, inner_dim,
x, alpha, beta, y);
} else {
_AffineWithOBias<float> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
scale_dim, inner_dim,
x, alpha, y);
}
CUDA_POST_KERNEL_CHECK;
}
template <> void AffineGrad<float, CUDAContext>(const int count,
const int outer_dim,
const int scale_dim,
const int inner_dim,
const float* dy,
const float* alpha,
float* dx) {
_AffineWithOBias<float> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
scale_dim, inner_dim,
dy, alpha, dx);
CUDA_POST_KERNEL_CHECK;
}
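// The CUDA affine kernels above recover the broadcast channel from the flat
// index as scale_idx = (idx / inner_dim) % scale_dim, with idx enumerating a
// (outer_dim, scale_dim, inner_dim) volume in row-major order. A tiny host
// check of that decomposition (check_scale_idx is an illustrative name only):
#include <cassert>

static void check_scale_idx(const int outer_dim, const int scale_dim, const int inner_dim) {
    for (int n = 0; n < outer_dim; ++n)
        for (int d = 0; d < scale_dim; ++d)
            for (int i = 0; i < inner_dim; ++i) {
                const int idx = (n * scale_dim + d) * inner_dim + i;
                assert((idx / inner_dim) % scale_dim == d);
            }
}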
/******************** arithmetic.bias_add ********************/
template <typename T>
......@@ -747,172 +762,6 @@ template <> void Clip<float, CUDAContext>(const int count,
y);
}
/******************** arithmetic.scale ********************/
template <typename T>
__global__ void _ScaleWithoutBias(const int n,
const T* x,
const T* scale,
const int scale_dim,
const int inner_dim,
T* y) {
CUDA_KERNEL_LOOP(idx, n) {
const int scale_idx = (idx / inner_dim) % scale_dim;
y[idx] = x[idx] * scale[scale_idx];
}
}
template <typename T>
__global__ void _ScaleWithBias(const int n,
const T* x,
const T* scale,
const T* bias,
const int scale_dim,
const int inner_dim,
T* y) {
CUDA_KERNEL_LOOP(idx, n) {
const int scale_idx = (idx / inner_dim) % scale_dim;
y[idx] = x[idx] * scale[scale_idx] + bias[scale_idx];
}
}
template<> void Scale<float, CUDAContext>(const int axis,
Tensor* x,
Tensor* gamma,
Tensor* beta,
Tensor* BMul,
Tensor* y) {
const int count = x->count();
const int inner_dim = x->count(axis + gamma->ndim());
const int scale_dim = gamma->count();
auto* Xdata = x->data<float, CUDAContext>();
auto* Ydata = y->mutable_data<float, CUDAContext>();
auto* Sdata = gamma->data<float, CUDAContext>();
auto* Bdata = beta != nullptr ?
beta->data<float, CUDAContext>() :
nullptr;
if (Bdata != nullptr)
_ScaleWithBias<float> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
Xdata,
Sdata,
Bdata,
scale_dim,
inner_dim,
Ydata);
else _ScaleWithoutBias<float> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
Xdata,
Sdata,
scale_dim,
inner_dim,
Ydata);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _ScaleWithoutBiasHalf(const int n,
const half* x,
const half* scale,
const int scale_dim,
const int inner_dim,
half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
const int scale_idx = (idx / inner_dim) % scale_dim;
y[idx] = __hmul(x[idx], scale[scale_idx]);
#endif
}
}
template <typename T>
__global__ void _ScaleWithBiasHalf(const int n,
const half* x,
const half* scale,
const half* bias,
const int scale_dim,
const int inner_dim,
half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
const int scale_idx = (idx / inner_dim) % scale_dim;
y[idx] = __hadd(__hmul(x[idx], scale[scale_idx]), bias[scale_idx]);
#endif
}
}
#endif
template<> void Scale<float16, CUDAContext>(const int axis,
Tensor* x,
Tensor* gamma,
Tensor* beta,
Tensor* BMul,
Tensor* y) {
#ifdef WITH_CUDA_FP16
const int count = x->count();
const int inner_dim = x->count(axis + gamma->ndim());
const int scale_dim = gamma->count();
auto* Xdata = x->data<float16, CUDAContext>();
auto* Ydata = y->mutable_data<float16, CUDAContext>();
auto* Sdata = gamma->data<float16, CUDAContext>();
auto* Bdata = beta != nullptr ?
beta->data<float16, CUDAContext>() :
nullptr;
if (Bdata != nullptr)
_ScaleWithBiasHalf<half> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
reinterpret_cast<const half*>(Xdata),
reinterpret_cast<const half*>(Sdata),
reinterpret_cast<const half*>(Bdata),
scale_dim,
inner_dim,
reinterpret_cast<half*>(Ydata));
else _ScaleWithoutBiasHalf<half> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
reinterpret_cast<const half*>(Xdata),
reinterpret_cast<const half*>(Sdata),
scale_dim,
inner_dim,
reinterpret_cast<half*>(Ydata));
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <> void ScaleGrad<float, CUDAContext>(const int axis,
Tensor* dy,
Tensor* gamma,
Tensor* dx) {
const int count = dx->count();
const int inner_dim = dx->count(axis + gamma->ndim());
const int scale_dim = gamma->count();
auto* dYdata = dy->data<float, CUDAContext>();
auto* dXdata = dx->mutable_data<float, CUDAContext>();
auto* Sdata = gamma->data<float, CUDAContext>();
_ScaleWithoutBias<float> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
dYdata,
Sdata,
scale_dim,
inner_dim,
dXdata);
}
/******************** cast.float2half ********************/
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _FloatToHalfKernel(const int count, const float* x, half* y) {
CUDA_KERNEL_LOOP(idx, count) {
y[idx] = __float2half(x[idx]);
}
}
template <> void Float2Half<float, CUDAContext>(const int count,
const float* x,
float16* y) {
_FloatToHalfKernel<float> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
x,
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
}
#endif
/******************** control_flow.compare ********************/
template <typename T>
......@@ -1387,7 +1236,7 @@ template<> void SparseSoftmaxFocalLossGrad<float, CUDAContext>(const int count,
CUDA_POST_KERNEL_CHECK;
}
/******************** misc.dtype ********************/
/******************** misc.astype ********************/
template <typename Ta, typename Tb>
__global__ void _TypeA2B(const int count, const Ta* a, Tb* b) {
......@@ -1396,19 +1245,6 @@ __global__ void _TypeA2B(const int count, const Ta* a, Tb* b) {
}
}
#ifdef WITH_CUDA_FP16
__global__ void _TypeHalf2Float(const int count, const half* a, float* b) {
CUDA_KERNEL_LOOP(idx, count) {
b[idx] = __half2float(a[idx]);
}
}
__global__ void _TypeFloat2Half(const int count, const float* a, half* b) {
CUDA_KERNEL_LOOP(idx, count) {
b[idx] = __float2half(a[idx]);
}
}
#endif
#define DEFINE_TYPE_A2B(type_a, type_b) \
template <> void TypeA2B<type_a, type_b, CUDAContext>(const int count, \
const type_a* a, \
......@@ -1416,34 +1252,6 @@ __global__ void _TypeFloat2Half(const int count, const float* a, half* b) {
_TypeA2B<type_a, type_b> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count, a, b); \
}
#define DEFINE_TYPE_DISABLE_FP16(type) \
template <> void TypeA2B<float16, type, CUDAContext>(const int count, \
const float16* a, \
type* b) { \
LOG(FATAL) << "CUDAContext has not implemented: float16 -> " \
<< TypeMetaToString(TypeMeta::Make<type>()); \
} \
template <> void TypeA2B<type, float16, CUDAContext>(const int count, \
const type* a, \
float16* b) { \
LOG(FATAL) << "CUDAContext has not implemented: " \
<< TypeMetaToString(TypeMeta::Make<type>()) << " -> float16"; \
}
#define DEFINE_TYPE_ENABLE_FP16_FP32 \
template <> void TypeA2B<float16, float, CUDAContext>(const int count, \
const float16* a, \
float* b) { \
_TypeHalf2Float << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count, \
reinterpret_cast<const half*>(a), b); \
} \
template <> void TypeA2B<float, float16, CUDAContext>(const int count, \
const float* a, \
float16* b) { \
_TypeFloat2Half << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count, \
a, reinterpret_cast<half*>(b)); \
}
#define DEFINE_TYPE_A2ALL(type_a) \
DEFINE_TYPE_A2B(type_a, float); \
DEFINE_TYPE_A2B(type_a, double); \
......@@ -1457,32 +1265,6 @@ DEFINE_TYPE_A2ALL(int);
DEFINE_TYPE_A2ALL(int64_t);
DEFINE_TYPE_A2ALL(uint8_t);
#ifdef WITH_CUDA_FP16
template <> void TypeA2B<float16, float16, CUDAContext>(const int count,
const float16* a,
float16* b) {
_TypeA2B<half, half> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
reinterpret_cast<const half*>(a),
reinterpret_cast<half*>(b));
}
DEFINE_TYPE_ENABLE_FP16_FP32;
DEFINE_TYPE_DISABLE_FP16(double);
DEFINE_TYPE_DISABLE_FP16(int);
DEFINE_TYPE_DISABLE_FP16(int64_t);
DEFINE_TYPE_DISABLE_FP16(uint8_t);
#else
template <> void TypeA2B<float16, float16, CUDAContext>(const int count,
const float16* a,
float16* b) {
LOG(FATAL) << "CUDAContext has not implemented: float16 -> float16";
}
DEFINE_TYPE_DISABLE_FP16(float);
DEFINE_TYPE_DISABLE_FP16(double);
DEFINE_TYPE_DISABLE_FP16(int);
DEFINE_TYPE_DISABLE_FP16(int64_t);
DEFINE_TYPE_DISABLE_FP16(uint8_t);
#endif
/******************** misc.image_data ********************/
template <typename Tx, typename Ty>
......@@ -1522,43 +1304,6 @@ __global__ void _ImageData_NHWC(const int count,
}
}
template <typename Tx, typename Ty>
__global__ void _ImageDataHalf_NCHW(const int count,
const int N, const int C,
const int H, const int W,
const float* mean_values,
const float* std_values,
const Tx* x,
Ty* y) {
CUDA_KERNEL_LOOP(idx, count) {
const int w = idx % W;
const int h = (idx / W) % H;
const int c = (idx / W / H) % C;
const int n = idx / W / H / C;
float raw_value = x[((n * H + h) * W + w) * C + c];
if (mean_values != nullptr) raw_value -= mean_values[c];
if (std_values != nullptr) raw_value /= std_values[c];
y[idx] = __float2half(raw_value);
}
}
template <typename Tx, typename Ty>
__global__ void _ImageDataHalf_NHWC(const int count,
const int N, const int C,
const int H, const int W,
const float* mean_values,
const float* std_values,
const Tx* x,
Ty* y) {
CUDA_KERNEL_LOOP(idx, count) {
const int c = idx % C;
float raw_value = x[idx];
if (mean_values != nullptr) raw_value -= mean_values[c];
if (std_values != nullptr) raw_value /= std_values[c];
y[idx] = __float2half(raw_value);
}
}
template <> void ImageData<float, float, CUDAContext>(const int count,
const int N, const int C,
const int H, const int W,
......@@ -1613,67 +1358,7 @@ template <> void ImageData<uint8_t, float, CUDAContext>(const int count,
CUDA_POST_KERNEL_CHECK;
}
template <> void ImageData<float, float16, CUDAContext>(const int count,
const int N, const int C,
const int H, const int W,
const float* mean_values,
const float* std_values,
const string& data_format,
const float* x,
float16* y) {
#ifdef WITH_CUDA_FP16
if (data_format == "NCHW") {
_ImageDataHalf_NCHW<float, half> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
N, C, H, W,
mean_values,
std_values,
x,
reinterpret_cast<half*>(y));
} else if (data_format == "NHWC") {
_ImageDataHalf_NHWC<float, half> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
N, C, H, W,
mean_values,
std_values,
x,
reinterpret_cast<half*>(y));
} else LOG(FATAL) << "Unknown data format: " << data_format;
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <> void ImageData<uint8_t, float16, CUDAContext>(const int count,
const int N, const int C,
const int H, const int W,
const float* mean_values,
const float* std_values,
const string& data_format,
const uint8_t* x,
float16* y) {
#ifdef WITH_CUDA_FP16
if (data_format == "NCHW") {
_ImageDataHalf_NCHW<uint8_t, half> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
N, C, H, W,
mean_values,
std_values,
x,
reinterpret_cast<half*>(y));
} else if (data_format == "NHWC") {
_ImageDataHalf_NHWC<uint8_t, half> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
N, C, H, W,
mean_values,
std_values,
x,
reinterpret_cast<half*>(y));
} else LOG(FATAL) << "Unknown data format: " << data_format;
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
/******************** ndarray.argmax ********************/
......@@ -1998,30 +1683,6 @@ template <> void Concat<float, CUDAContext>(const int count,
CUDA_POST_KERNEL_CHECK;
}
template <> void Concat<float16, CUDAContext>(const int count,
const int outer_dim,
const int inner_dim,
const int x_concat_dim,
const int y_concat_dim,
const int concat_offset,
const float16* x,
float16* y,
CUDAContext* context) {
#ifdef WITH_CUDA_FP16
_Concat<half> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
outer_dim,
inner_dim,
x_concat_dim,
y_concat_dim,
concat_offset,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <typename T>
__global__ void _ConcatGrad(const int count,
const int outer_dim,
......@@ -2061,29 +1722,7 @@ template <> void ConcatGrad<float, CUDAContext>(const int count,
CUDA_POST_KERNEL_CHECK;
}
template <> void ConcatGrad<float16, CUDAContext>(const int count,
const int outer_dim,
const int inner_dim,
const int x_concat_dim,
const int y_concat_dim,
const int concat_offset,
const float16* dy,
float16* dx,
CUDAContext* context) {
#ifdef WITH_CUDA_FP16
_ConcatGrad<half> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
outer_dim,
inner_dim,
x_concat_dim,
y_concat_dim,
concat_offset,
reinterpret_cast<const half*>(dy),
reinterpret_cast<half*>(dx));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
/******************** ndarray.crop ********************/
......@@ -2746,28 +2385,6 @@ template <> void Transpose<float, CUDAContext>(const int count,
CUDA_POST_KERNEL_CHECK;
}
template <> void Transpose<float16, CUDAContext>(const int count,
const int ndim,
const int* order,
const int* old_steps,
const int* new_steps,
const float16* x,
float16* y) {
#ifdef WITH_CUDA_FP16
_Transpose<half> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
ndim,
order,
old_steps,
new_steps,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <typename T>
__global__ void _TransposeGrad(const int count,
const int ndim,
......@@ -2804,27 +2421,6 @@ template <> void TransposeGrad<float, CUDAContext>(const int count,
CUDA_POST_KERNEL_CHECK;
}
template <> void TransposeGrad<float16, CUDAContext>(const int count,
const int ndim,
const int* order,
const int* old_steps,
const int* new_steps,
const float16* dy,
float16* dx) {
#ifdef WITH_CUDA_FP16
_TransposeGrad<half> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
ndim,
order,
old_steps,
new_steps,
reinterpret_cast<const half*>(dy),
reinterpret_cast<half*>(dx));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
/******************** recurrent.lstm_uint ********************/
template <typename T>
......
#ifdef WITH_CUDA
#include <cmath>
#include "core/context_cuda.h"
#include "core/tensor.h"
#include "utils/cuda_device.h"
#include "utils/op_kernel.h"
#include "utils/math_functions.h"
#include "utils/cast.h"
namespace dragon {
namespace kernel {
template <typename T>
__global__ void _EmptyHalf() {}
template<> void Empty<float16, CUDAContext>() {
_EmptyHalf<float16> << <1, 1 >> >();
CUDA_POST_KERNEL_CHECK;
}
/******************** activation.relu ********************/
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _ReluHalf(const int count, const half* x, const half slope, half* y) {
const half kZero = __float2half(0.f);
CUDA_KERNEL_LOOP(idx, count) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hgt(x[idx], kZero) ? x[idx] : __hmul(x[idx], slope);
#endif
}
}
template <typename T>
__global__ void _ReluHalf2(const int count, const half2* x, const half2 slope, half2* y) {
const half2 kZero = __float2half2_rn(0.f);
CUDA_KERNEL_LOOP(idx, count) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hbgt2(x[idx], kZero) ? x[idx] : __hmul2(x[idx], slope);
#endif
}
}
#endif
template<> void Relu<float16, CUDAContext>(const int count,
const float16* x,
const float slope,
float16* y) {
#ifdef WITH_CUDA_FP16
if (count % 2 == 0)
_ReluHalf2<half2> << < GET_BLOCKS(count), CUDA_NUM_THREADS >> > (count / 2,
reinterpret_cast<const half2*>(x),
dragon_cast<half2, float>(slope),
reinterpret_cast<half2*>(y));
else
_ReluHalf<half> << < GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
reinterpret_cast<const half*>(x),
dragon_cast<half, float>(slope),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
/******************** arithmetic.affine ********************/
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _AffineWithOBiasHalf(const int count,
const int scale_dim,
const int inner_dim,
const half* x,
const half* alpha,
half* y) {
CUDA_KERNEL_LOOP(idx, count) {
#if __CUDA_ARCH__ >= 530
const int scale_idx = (idx / inner_dim) % scale_dim;
y[idx] = __hmul(alpha[scale_idx], x[idx]);
#endif
}
}
template <typename T>
__global__ void _AffineWithBiasHalf(const int count,
const int scale_dim,
const int inner_dim,
const half* x,
const half* alpha,
const half* beta,
half* y) {
CUDA_KERNEL_LOOP(idx, count) {
#if __CUDA_ARCH__ >= 530
const int scale_idx = (idx / inner_dim) % scale_dim;
y[idx] = __hadd(__hmul(alpha[scale_idx], x[idx]), beta[scale_idx]);
#endif
}
}
#endif
template<> void Affine<float16, CUDAContext>(const int count,
const int outer_dim,
const int scale_dim,
const int inner_dim,
const float16* x,
const float16* alpha,
const float16* beta,
const float16* beta_multiplier,
float16* y) {
#ifdef WITH_CUDA_FP16
if (beta != nullptr) {
        _AffineWithBiasHalf<half> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
scale_dim, inner_dim,
reinterpret_cast<const half*>(x),
reinterpret_cast<const half*>(alpha),
reinterpret_cast<const half*>(beta),
reinterpret_cast<half*>(y));
} else {
        _AffineWithOBiasHalf<half> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
scale_dim, inner_dim,
reinterpret_cast<const half*>(x),
reinterpret_cast<const half*>(alpha),
reinterpret_cast<half*>(y));
}
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
/******************** misc.astype ********************/
#ifdef WITH_CUDA_FP16
__global__ void _TypeHalf2Float(const int count, const half* a, float* b) {
CUDA_KERNEL_LOOP(idx, count) {
b[idx] = __half2float(a[idx]);
}
}
__global__ void _TypeFloat2Half(const int count, const float* a, half* b) {
CUDA_KERNEL_LOOP(idx, count) {
b[idx] = __float2half(a[idx]);
}
}
__global__ void _TypeHalf2Half(const int count, const half* a, half* b) {
CUDA_KERNEL_LOOP(idx, count) {
b[idx] = a[idx];
}
}
#endif
#define DEFINE_TYPE_DISABLE_FP16(type) \
template <> void TypeA2B<float16, type, CUDAContext>(const int count, \
const float16* a, \
type* b) { \
LOG(FATAL) << "CUDAContext has not implemented: float16 -> " \
<< TypeMetaToString(TypeMeta::Make<type>()); \
} \
template <> void TypeA2B<type, float16, CUDAContext>(const int count, \
const type* a, \
float16* b) { \
LOG(FATAL) << "CUDAContext has not implemented: " \
<< TypeMetaToString(TypeMeta::Make<type>()) << " -> float16"; \
}
#define DEFINE_TYPE_ENABLE_FP16_FP32 \
template <> void TypeA2B<float16, float, CUDAContext>(const int count, \
const float16* a, \
float* b) { \
_TypeHalf2Float << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count, \
reinterpret_cast<const half*>(a), b); \
CUDA_POST_KERNEL_CHECK; \
} \
template <> void TypeA2B<float, float16, CUDAContext>(const int count, \
const float* a, \
float16* b) { \
_TypeFloat2Half << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count, \
a, reinterpret_cast<half*>(b)); \
CUDA_POST_KERNEL_CHECK; \
}
#ifdef WITH_CUDA_FP16
template <> void TypeA2B<float16, float16, CUDAContext>(const int count,
const float16* a,
float16* b) {
_TypeHalf2Half << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
reinterpret_cast<const half*>(a),
reinterpret_cast<half*>(b));
CUDA_POST_KERNEL_CHECK;
}
DEFINE_TYPE_ENABLE_FP16_FP32;
DEFINE_TYPE_DISABLE_FP16(double);
DEFINE_TYPE_DISABLE_FP16(int);
DEFINE_TYPE_DISABLE_FP16(int64_t);
DEFINE_TYPE_DISABLE_FP16(uint8_t);
#else
template <> void TypeA2B<float16, float16, CUDAContext>(const int count,
const float16* a,
float16* b) {
LOG(FATAL) << "CUDAContext has not implemented: float16 -> float16";
}
DEFINE_TYPE_DISABLE_FP16(float);
DEFINE_TYPE_DISABLE_FP16(double);
DEFINE_TYPE_DISABLE_FP16(int);
DEFINE_TYPE_DISABLE_FP16(int64_t);
DEFINE_TYPE_DISABLE_FP16(uint8_t);
#endif
/******************** misc.image_data ********************/
#ifdef WITH_CUDA_FP16
template <typename Tx, typename Ty>
__global__ void _ImageDataHalf_NCHW(const int count,
const int N, const int C,
const int H, const int W,
const float* mean_values,
const float* std_values,
const Tx* x,
Ty* y) {
CUDA_KERNEL_LOOP(idx, count) {
const int w = idx % W;
const int h = (idx / W) % H;
const int c = (idx / W / H) % C;
const int n = idx / W / H / C;
float raw_value = x[((n * H + h) * W + w) * C + c];
if (mean_values != nullptr) raw_value -= mean_values[c];
if (std_values != nullptr) raw_value /= std_values[c];
y[idx] = __float2half(raw_value);
}
}
template <typename Tx, typename Ty>
__global__ void _ImageDataHalf_NHWC(const int count,
const int N, const int C,
const int H, const int W,
const float* mean_values,
const float* std_values,
const Tx* x,
Ty* y) {
CUDA_KERNEL_LOOP(idx, count) {
const int c = idx % C;
float raw_value = x[idx];
if (mean_values != nullptr) raw_value -= mean_values[c];
if (std_values != nullptr) raw_value /= std_values[c];
y[idx] = __float2half(raw_value);
}
}
#endif
template <> void ImageData<float, float16, CUDAContext>(const int count,
const int N, const int C,
const int H, const int W,
const float* mean_values,
const float* std_values,
const string& data_format,
const float* x,
float16* y) {
#ifdef WITH_CUDA_FP16
if (data_format == "NCHW") {
_ImageDataHalf_NCHW<float, half> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
N, C, H, W,
mean_values,
std_values,
x,
reinterpret_cast<half*>(y));
} else if (data_format == "NHWC") {
_ImageDataHalf_NHWC<float, half> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
N, C, H, W,
mean_values,
std_values,
x,
reinterpret_cast<half*>(y));
} else LOG(FATAL) << "Unknown data format: " << data_format;
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <> void ImageData<uint8_t, float16, CUDAContext>(const int count,
const int N, const int C,
const int H, const int W,
const float* mean_values,
const float* std_values,
const string& data_format,
const uint8_t* x,
float16* y) {
#ifdef WITH_CUDA_FP16
if (data_format == "NCHW") {
_ImageDataHalf_NCHW<uint8_t, half> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
N, C, H, W,
mean_values,
std_values,
x,
reinterpret_cast<half*>(y));
} else if (data_format == "NHWC") {
_ImageDataHalf_NHWC<uint8_t, half> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
N, C, H, W,
mean_values,
std_values,
x,
reinterpret_cast<half*>(y));
} else LOG(FATAL) << "Unknown data format: " << data_format;
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
/******************** ndarray.concat ********************/
template <typename T>
__global__ void _ConcatHalf(const int count,
const int outer_dim,
const int inner_dim,
const int x_concat_dim,
const int y_concat_dim,
const int concat_offset,
const T* x,
T* y) {
CUDA_KERNEL_LOOP(idx, count) {
const int tmp = x_concat_dim * inner_dim;
const int outer_idx = idx / tmp;
const int concat_idx = idx % tmp;
const int y_idx = (outer_idx * y_concat_dim + concat_offset)
* inner_dim + concat_idx;
y[y_idx] = x[idx];
}
}
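// A minimal host-side sketch (hypothetical shapes): concatenating two NCHW
// tensors of shape (2, 3, 4, 4) and (2, 5, 4, 4) along axis 1 would call this
// with outer_dim = 2, inner_dim = 16, y_concat_dim = 8, and x_concat_dim /
// concat_offset of 3 / 0 for the first input and 5 / 3 for the second,
// with count equal to each input's element count.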
template <> void Concat<float16, CUDAContext>(const int count,
const int outer_dim,
const int inner_dim,
const int x_concat_dim,
const int y_concat_dim,
const int concat_offset,
const float16* x,
float16* y,
CUDAContext* context) {
#ifdef WITH_CUDA_FP16
_ConcatHalf<half> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
outer_dim,
inner_dim,
x_concat_dim,
y_concat_dim,
concat_offset,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
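// Inverse of the copy above: gathers the gradient slice belonging to this
// input, reading from the offset region of dy and writing dx densely.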
template <typename T>
__global__ void _ConcatGradHalf(const int count,
const int outer_dim,
const int inner_dim,
const int x_concat_dim,
const int y_concat_dim,
const int concat_offset,
const T* dy,
T* dx) {
CUDA_KERNEL_LOOP(idx, count) {
const int tmp = x_concat_dim * inner_dim;
const int outer_idx = idx / tmp;
const int concat_idx = idx % tmp;
const int y_idx = (outer_idx * y_concat_dim + concat_offset)
* inner_dim + concat_idx;
dx[idx] = dy[y_idx];
}
}
template <> void ConcatGrad<float16, CUDAContext>(const int count,
const int outer_dim,
const int inner_dim,
const int x_concat_dim,
const int y_concat_dim,
const int concat_offset,
const float16* dy,
float16* dx,
CUDAContext* context) {
#ifdef WITH_CUDA_FP16
_ConcatGradHalf<half> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
outer_dim,
inner_dim,
x_concat_dim,
y_concat_dim,
concat_offset,
reinterpret_cast<const half*>(dy),
reinterpret_cast<half*>(dx));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
/******************** ndarray.transpose ********************/
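// Generic permutation: for each output index, walk the output strides
// (new_steps) to recover the multi-index, then rebuild the source offset from
// the permuted input strides (old_steps[order[j]]). For example (hypothetical
// values), transposing NCHW -> NHWC with shape (1, 3, 4, 4) would use
// order = {0, 2, 3, 1}, old_steps = {48, 16, 4, 1} (input strides) and
// new_steps = {48, 12, 3, 1} (output strides).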
template <typename T>
__global__ void _TransposeHalf(const int count,
const int ndim,
const int* order,
const int* old_steps,
const int* new_steps,
const T* x,
T* y) {
CUDA_KERNEL_LOOP(idx, count) {
int x_idx = 0, y_idx = idx;
for (int j = 0; j < ndim; ++j) {
int k = order[j];
x_idx += (y_idx / new_steps[j]) * old_steps[k];
y_idx %= new_steps[j];
}
y[idx] = x[x_idx];
}
}
template <> void Transpose<float16, CUDAContext>(const int count,
const int ndim,
const int* order,
const int* old_steps,
const int* new_steps,
const float16* x,
float16* y) {
#ifdef WITH_CUDA_FP16
_TransposeHalf<half> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
ndim,
order,
old_steps,
new_steps,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
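// Gradient of the permutation: the same index mapping is reused, but dy is
// read densely and scattered back into dx at the recovered source offsets.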
template <typename T>
__global__ void _TransposeGradHalf(const int count,
const int ndim,
const int* order,
const int* old_steps,
const int* new_steps,
const T* dy,
T* dx) {
CUDA_KERNEL_LOOP(idx, count) {
int x_idx = 0, y_idx = idx;
for (int j = 0; j < ndim; ++j) {
int k = order[j];
x_idx += (y_idx / new_steps[j]) * old_steps[k];
y_idx %= new_steps[j];
}
dx[x_idx] = dy[idx];
}
}
template <> void TransposeGrad<float16, CUDAContext>(const int count,
const int ndim,
const int* order,
const int* old_steps,
const int* new_steps,
const float16* dy,
float16* dx) {
#ifdef WITH_CUDA_FP16
_TransposeGradHalf<half> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
ndim,
order,
old_steps,
new_steps,
reinterpret_cast<const half*>(dy),
reinterpret_cast<half*>(dx));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
} // namespace kernel
} // namespace dragon
#endif // WITH_CUDA
\ No newline at end of file