SeetaResearch / Dragon
Commit fe161546 authored Jun 11, 2018 by Ting PAN
Preliminary FP16 Training Support
1 parent 4f4ac2ef
Showing 27 changed files with 441 additions and 39 deletions
Dragon/include/core/common.h
Dragon/include/operators/loss/sparse_softmax_cross_entropy_op.h
Dragon/include/operators/update/adam_update_op.h
Dragon/include/operators/update/nesterov_update_op.h
Dragon/include/operators/update/rmsprop_update_op.h
Dragon/include/operators/update/sgd_update_op.h
Dragon/include/operators/update/update_op_base.h
Dragon/include/operators/vision/conv_op.h
Dragon/include/operators/vision/conv_transpose_op.h
Dragon/include/utils/cuda_device.h
Dragon/python/dragon/__init__.py
Dragon/python/dragon/version.py
Dragon/python/setup.py
Dragon/src/operators/loss/sparse_softmax_cross_entropy_op.cc
Dragon/src/operators/misc/accuracy_op.cc
Dragon/src/operators/misc/astype_op.cc
Dragon/src/operators/update/adam_update_op.cc
Dragon/src/operators/update/nesterov_update_op.cc
Dragon/src/operators/update/rmsprop_update_op.cc
Dragon/src/operators/update/sgd_update_op.cc
Dragon/src/operators/update/update_op_base.cc
Dragon/src/operators/vision/cudnn_conv2d_op.cc
Dragon/src/operators/vision/cudnn_conv2d_transpose_op.cc
Dragon/src/utils/math_functions_fp16.cu
Dragon/src/utils/op_kernel.cc
Dragon/src/utils/op_kernel.cu
Dragon/src/utils/op_kernel_fp16.cu
Dragon/include/core/common.h
...
...
@@ -49,6 +49,8 @@ using Map = std::unordered_map<Key, Value>;
 template <typename Value>
 using Set = std::unordered_set<Value>;

+#define DRAGON_VERSION 2204
+
 #define CONCATENATE_IMPL(s1, s2) s1##s2
 #define CONCATENATE(s1, s2) CONCATENATE_IMPL(s1,s2)
 #define ANONYMOUS_VARIABLE(str) CONCATENATE(str, __LINE__)
...
...
Dragon/include/operators/loss/sparse_softmax_cross_entropy_op.h
...
...
@@ -33,6 +33,8 @@ class SparseSoftmaxCrossEntropyOp : public Operator<Context> {
     USE_OPERATOR_FUNCTIONS(Context);

     void SoftmaxRun();
+    void SoftmaxRunFP16();
     void RunOnDevice() override;
     template <typename Tx, typename Ty> void RunWithType();
...
...
Dragon/include/operators/update/adam_update_op.h
...
...
@@ -25,6 +25,7 @@ class AdamUpdateOp final : public UpdateOpBase<Context> {
     USE_UPDATER_FUNCTIONS(Context);

     void ComputeRunWithFloat() override;
+    void ComputeRunWithFloat16() override;

  protected:
     int t;
     float lr, beta1, beta2, eps;
...
...
Dragon/include/operators/update/nesterov_update_op.h
...
...
@@ -25,6 +25,7 @@ class NesterovUpdateOp final : public UpdateOpBase<Context> {
     USE_UPDATER_FUNCTIONS(Context);

     void ComputeRunWithFloat() override;
+    void ComputeRunWithFloat16() override;

  protected:
     float lr, momentum;
...
...
Dragon/include/operators/update/rmsprop_update_op.h
...
...
@@ -25,6 +25,7 @@ class RMSPropUpdateOp final : public UpdateOpBase<Context> {
     USE_UPDATER_FUNCTIONS(Context);

     void ComputeRunWithFloat() override;
+    void ComputeRunWithFloat16() override;

  protected:
     float lr, decay, eps;
...
...
Dragon/include/operators/update/sgd_update_op.h
...
...
@@ -26,6 +26,7 @@ class SGDUpdateOp final : public UpdateOpBase<Context> {
     USE_UPDATER_FUNCTIONS(Context);

     void ComputeRunWithFloat() override;
+    void ComputeRunWithFloat16() override;

  protected:
     float old_lr, lr, momentum, correction;
...
...
Dragon/include/operators/update/update_op_base.h
...
...
@@ -35,6 +35,7 @@ class UpdateOpBase : public Operator<Context> {
     void RunOnDevice() override;
     template <typename T> void PreprocessRunWithType();
     virtual void ComputeRunWithFloat() = 0;
+    virtual void ComputeRunWithFloat16() { LOG(FATAL) << "This Updater does not support FP16."; }
     template <typename T> void UpdateRunWithType();

  protected:
...
...
Dragon/include/operators/vision/conv_op.h
...
...
@@ -103,6 +103,7 @@ class CuDNNConv2dOp : public Conv2dOp<Context> {
  protected:
     cudnnHandle_t* handle;
     cudaStream_t* stream;
+    cudnnDataType_t compute_type;
     cudnnTensorFormat_t format;
     cudnnConvolutionFwdAlgo_t fwd_algo;
     cudnnTensorDescriptor_t input_desc, output_desc, bias_desc;
...
...
@@ -164,6 +165,7 @@ class CuDNNConv2dGradientOp : public Conv2dGradientOp<Context> {
  protected:
     cudnnHandle_t* handle;
     cudaStream_t* stream;
+    cudnnDataType_t compute_type;
     cudnnTensorFormat_t format;
     cudnnConvolutionBwdFilterAlgo_t bwd_filter_algo;
     cudnnConvolutionBwdDataAlgo_t bwd_data_algo;
...
...
Dragon/include/operators/vision/conv_transpose_op.h
...
...
@@ -106,6 +106,7 @@ class CuDNNConv2dTransposeOp : public Conv2dTransposeOp<Context> {
  protected:
     cudnnHandle_t* handle;
     cudaStream_t* stream;
+    cudnnDataType_t compute_type;
     cudnnTensorFormat_t format;
     cudnnConvolutionBwdDataAlgo_t fwd_algo;
     cudnnTensorDescriptor_t input_desc, output_desc, bias_desc;
...
...
@@ -167,6 +168,7 @@ public:
  protected:
     cudnnHandle_t* handle;
     cudaStream_t* stream;
+    cudnnDataType_t compute_type;
     cudnnTensorFormat_t format;
     cudnnConvolutionBwdFilterAlgo_t bwd_filter_algo;
     cudnnConvolutionFwdAlgo_t bwd_data_algo;
...
...
Dragon/include/utils/cuda_device.h
...
...
@@ -116,6 +116,12 @@ inline const cudaDeviceProp& GetDeviceProperty(const int device_id) {
     return props.props[device_id];
 }

+inline bool CUDA_TRUE_FP16_AVAILABLE() {
+    int device = CUDA_CURRENT_DEVICE();
+    auto& prop = GetDeviceProperty(device);
+    return prop.major >= 6;
+}
+
 inline bool TENSOR_CORE_AVAILABLE() {
 #if CUDA_VERSION < 9000
     return false;
...
...
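The helper added here gates true FP16 arithmetic on compute capability 6.0 (Pascal) or newer. A minimal sketch of how the cuDNN convolution hunks below consume it, using only names from this diff:

    // Request native half math only where the device supports it;
    // otherwise fall back to FP32 accumulation over FP16 tensors.
    cudnnDataType_t compute_type = CUDA_TRUE_FP16_AVAILABLE() ?
        CUDNN_DATA_HALF : CUDNN_DATA_FLOAT;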
Dragon/python/dragon/__init__.py
...
...
@@ -34,3 +34,7 @@ from dragon.core.scope import TensorScope as name_scope
 from dragon.core.scope import PhaseScope as phase_scope
 from dragon.core.scope import DeviceScope as device_scope
+
+# version
+from dragon.version import version
+__version__ = version
Dragon/python/dragon/version.py
0 → 100644
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

version = '0.2.2'
full_version = '0.2.2.4'
release = False

if not release:
    version = full_version
\ No newline at end of file
Dragon/python/setup.py
...
...
@@ -42,7 +42,7 @@ find_modules()
 setup(name = 'dragon',
-      version = '0.2.2.3',
+      version = '0.2.2.4',
       description = 'Dragon: A Computation Graph Virtual Machine Based Deep Learning Framework',
       url = 'https://github.com/seetaresearch/Dragon',
       author = 'Ting Pan',
...
...
Dragon/src/operators/loss/sparse_softmax_cross_entropy_op.cc
...
...
@@ -20,6 +20,24 @@ void SparseSoftmaxCrossEntropyOp<Context>::SoftmaxRun() {
     softmax_op->Run();
 }

+template <class Context>
+void SparseSoftmaxCrossEntropyOp<Context>::SoftmaxRunFP16() {
+    Tensor* XF32 = ws()->CreateTensor("/mnt/" + anchor() + "/softmax/xf32");
+    XF32->ReshapeLike(Input(0));
+    auto* XdataF16 = Input(0).template data<float16, Context>();
+    auto* XdataF32 = XF32->template mutable_data<float, Context>();
+    kernel::TypeA2B<float16, float, Context>(Input(0).count(), XdataF16, XdataF32);
+    OperatorDef softmax_def = MakeOperatorDef("Softmax", "",
+        vector<string>({ XF32->name() }),
+        vector<string>({ "/mnt/" + anchor() + "/softmax/prob" }));
+    softmax_def.add_arg()->CopyFrom(this->arg("axis"));
+    if (op_def().has_device_option())
+        softmax_def.mutable_device_option()->CopyFrom(op_def().device_option());
+    if (!softmax_op) softmax_op.reset(CreateOperator(softmax_def, ws()));
+    else softmax_op->MutableOp(softmax_def);
+    softmax_op->Run();
+}
+
 template <class Context> template <typename Tx, typename Ty>
 void SparseSoftmaxCrossEntropyOp<Context>::RunWithType() {
     auto* prob_data = prob->template data<Tx, Context>();
...
...
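Note that SoftmaxRunFP16 does not add a half-precision softmax kernel: it casts the FP16 input into a scratch FP32 tensor and reuses the existing FP32 Softmax operator. A condensed sketch of that staging step, with a hypothetical helper name (StageToFP32 is not part of the commit):

    // Hypothetical helper mirroring the fp16 -> fp32 staging in
    // SoftmaxRunFP16 above and in AccuracyOp::RunWithType below.
    template <class Context>
    const float* StageToFP32(Tensor& X, Tensor* scratch) {
        scratch->ReshapeLike(X);  // same shape, fp32 storage
        auto* src = X.template data<float16, Context>();
        auto* dst = scratch->template mutable_data<float, Context>();
        kernel::TypeA2B<float16, float, Context>(X.count(), src, dst);
        return dst;
    }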
@@ -59,11 +77,11 @@ void SparseSoftmaxCrossEntropyOp<Context>::RunOnDevice() {
         << "\nNumber of predictions must match the number of labels.";
     valid.Reshape(vector<TIndex>(1, outer_dim * inner_dim));
     losses.Reshape(vector<TIndex>(1, outer_dim * inner_dim));
-    ws()->CreateTensor("/mnt/" + anchor() + "/softmax/prob");
-    SoftmaxRun();
-    prob = ws()->GetTensor("/mnt/" + anchor() + "/softmax/prob");
+    prob = ws()->CreateTensor("/mnt/" + anchor() + "/softmax/prob");
-    if (XIsType(Input(0), float)) {
+    if (XIsType(Input(0), float) || XIsType(Input(0), float16)) {
+        if (XIsType(Input(0), float16)) SoftmaxRunFP16();
+        else SoftmaxRun();
         if (XIsType(Input(1), float)) RunWithType<float, float>();
         else if (XIsType(Input(1), int64_t)) RunWithType<float, int64_t>();
         else LOG(FATAL) << DTypeHelper(Input(1), { "float32", "int64" });
...
...
@@ -118,11 +136,17 @@ void SparseSoftmaxCrossEntropyGradientOp<Context>::RunOnDevice() {
     Output(0)->ReshapeLike(Input(0));
     valid.Reshape(vector<TIndex>(1, outer_dim * inner_dim));
-    if (XIsType(Input(0), float)) {
+    if (XIsType(Input(0), float) || XIsType(Input(0), float16)) {
         if (XIsType(Input(1), float)) RunWithType<float, float>();
         else if (XIsType(Input(1), int64_t)) RunWithType<float, int64_t>();
         else LOG(FATAL) << DTypeHelper(Input(1), { "float32", "int64" });
-    } else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
+        if (XIsType(Input(0), float16)) {
+            auto* dXdataF32 = Output(0)->template data<float, Context>();
+            auto* dXdataF16 = prob->template mutable_data<float16, Context>();
+            kernel::TypeA2B<float, float16, Context>(Output(0)->count(),
+                dXdataF32, dXdataF16);
+            Output(0)->template Copy<Context, Context>(*prob);
+        }
+    } else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
 }

 DEPLOY_CPU(SparseSoftmaxCrossEntropyGradient);
...
...
Dragon/src/operators/misc/accuracy_op.cc
#include <algorithm>
#include "operators/misc/accuracy_op.h"
#include "core/workspace.h"
#include "utils/op_kernel.h"
#include "utils/math_functions.h"
namespace dragon {

template <class Context> template <typename Tx, typename Ty>
...
...
@@ -13,7 +16,17 @@ void AccuracyOp<Context>::RunWithType() {
     Map<int, TIndex> num_per_class;

     TIndex acc = 0, count = 0;
-    auto* Xdata = Input(0).template data<Tx, CPUContext>();
+    const Tx* Xdata;
+    if (XIsType(Input(0), float16)) {
+        Tensor* XF32 = ws()->CreateTensor("/mnt/" + anchor() + "/accuracy/xf32");
+        XF32->ReshapeLike(Input(0));
+        auto* XdataF16 = Input(0).template data<float16, CPUContext>();
+        auto* XdataF32 = XF32->template mutable_data<float, CPUContext>();
+        kernel::TypeA2B<float16, float, CPUContext>(Input(0).count(),
+            XdataF16, XdataF32);
+        Xdata = XdataF32;
+    } else Xdata = Input(0).template data<Tx, CPUContext>();
     auto* labels = Input(1).template data<Ty, CPUContext>();
     auto* ignores = ignore_labels.count() > 0 ?
         ignore_labels.data<int, CPUContext>() : nullptr;
...
...
@@ -60,11 +73,11 @@ void AccuracyOp<Context>::RunOnDevice() {
     Output(0)->Reshape(vector<TIndex>(1, 1));
     if (OutputSize() > 1) Output(1)->Reshape(vector<TIndex>(1, num_classes));
-    if (XIsType(Input(0), float)) {
+    if (XIsType(Input(0), float) || XIsType(Input(0), float16)) {
         if (XIsType(Input(1), float)) RunWithType<float, float>();
         else if (XIsType(Input(1), int64_t)) RunWithType<float, int64_t>();
         else LOG(FATAL) << DTypeHelper(Input(1), { "float32", "int64" });
-    } else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
+    } else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
 }

 DEPLOY_CPU(Accuracy);
...
...
Dragon/src/operators/misc/astype_op.cc
...
...
@@ -22,6 +22,7 @@ namespace dragon {
auto* Ydata = buffer->template mutable_data<type_b, Context>(); \
kernel::TypeA2B<type_a, type_b, Context>(Output(0)->count(), Xdata, Ydata); \
Output(0)->template Copy<Context, Context>(*buffer); \
ws()->ReleaseBuffer(buffer); \
} \
return; \
}
...
...
Dragon/src/operators/update/adam_update_op.cc
...
...
@@ -22,6 +22,24 @@ void AdamUpdateOp<Context>::ComputeRunWithFloat() {
         lr, beta1, beta2, eps, dXdata, Mdata, Vdata);
 }

+template <class Context>
+void AdamUpdateOp<Context>::ComputeRunWithFloat16() {
+    Tensor* m = ws()->CreateTensor("/mnt/" + Slot() + "/adam/m");
+    Tensor* v = ws()->CreateTensor("/mnt/" + Slot() + "/adam/v");
+    m->ReshapeLike(Input(0));
+    v->ReshapeLike(Input(0));
+
+    t++;
+    beta1 = Param("beta1"), beta2 = Param("beta2"), eps = Param("eps");
+    float coeff = sqrt(1. - pow(beta2, t)) / (1. - pow(beta1, t));
+    lr = Param("base_lr") * coeff * this->lr_mult;
+    auto* dXdata = Input(0).template mutable_data<float16, Context>();
+    auto* Mdata = m->mutable_data<float16, Context>();
+    auto* Vdata = v->mutable_data<float16, Context>();
+    kernel::AdamUpdate<float16, Context>(Input(0).count(),
+        lr, beta1, beta2, eps, dXdata, Mdata, Vdata);
+}
+
 DEPLOY_CPU(AdamUpdate);
 #ifdef WITH_CUDA
 DEPLOY_CUDA(AdamUpdate);
...
...
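As in the FP32 path, the bias correction is folded into the learning rate on the host, so the FP16 kernel only applies the raw moment updates. In the notation of the code (t is incremented once per call), the rule computed by _AdamUpdateHalf later in this commit is:

\hat{\alpha}_t = \alpha \, \frac{\sqrt{1 - \beta_2^{\,t}}}{1 - \beta_1^{\,t}}, \qquad
m_t = \beta_1 m_{t-1} + (1 - \beta_1)\, g_t, \qquad
v_t = \beta_2 v_{t-1} + (1 - \beta_2)\, g_t^2, \qquad
g_t \leftarrow \frac{\hat{\alpha}_t\, m_t}{\sqrt{v_t} + \epsilon}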
Dragon/src/operators/update/nesterov_update_op.cc
...
...
@@ -17,6 +17,18 @@ void NesterovUpdateOp<Context>::ComputeRunWithFloat() {
         lr, momentum, dXdata, Hdata);
 }

+template <class Context>
+void NesterovUpdateOp<Context>::ComputeRunWithFloat16() {
+    Tensor* h = ws()->CreateTensor("/mnt/" + Slot() + "/nesterov/h");
+    h->ReshapeLike(Input(0));
+
+    lr = Param("base_lr") * this->lr_mult, momentum = Param("momentum");
+    auto* dXdata = Input(0).template mutable_data<float16, Context>();
+    auto* Hdata = h->template mutable_data<float16, Context>();
+    kernel::NesterovUpdate<float16, Context>(Input(0).count(),
+        lr, momentum, dXdata, Hdata);
+}
+
 DEPLOY_CPU(NesterovUpdate);
 #ifdef WITH_CUDA
 DEPLOY_CUDA(NesterovUpdate);
...
...
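For reference, the rule applied by the matching _NesterovUpdateHalf kernel later in this commit, with h the history buffer, \mu the momentum, and \alpha the learning rate:

h_t = \mu\, h_{t-1} + \alpha\, g_t, \qquad
g_t \leftarrow (1 + \mu)\, h_t - \mu\, h_{t-1}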
Dragon/src/operators/update/rmsprop_update_op.cc
...
...
@@ -17,6 +17,19 @@ void RMSPropUpdateOp<Context>::ComputeRunWithFloat() {
         lr, decay, eps, dXdata, Hdata);
 }

+template <class Context>
+void RMSPropUpdateOp<Context>::ComputeRunWithFloat16() {
+    Tensor* h = ws()->CreateTensor("/mnt/" + Slot() + "/rmsprop/h");
+    h->ReshapeLike(Input(0));
+
+    lr = Param("base_lr") * this->lr_mult;
+    decay = Param("decay"), eps = Param("eps");
+    auto* dXdata = Input(0).template mutable_data<float16, Context>();
+    auto* Hdata = h->template mutable_data<float16, Context>();
+    kernel::RMSPropUpdate<float16, Context>(Input(0).count(),
+        lr, decay, eps, dXdata, Hdata);
+}
+
 DEPLOY_CPU(RMSPropUpdate);
 #ifdef WITH_CUDA
 DEPLOY_CUDA(RMSPropUpdate);
...
...
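The matching _RMSPropUpdateHalf kernel applies the standard RMSProp rule, with h the running mean of squared gradients and d the decay:

h_t = d\, h_{t-1} + (1 - d)\, g_t^2, \qquad
g_t \leftarrow \frac{\alpha\, g_t}{\sqrt{h_t} + \epsilon}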
Dragon/src/operators/update/sgd_update_op.cc
...
...
@@ -19,6 +19,19 @@ void SGDUpdateOp<Context>::ComputeRunWithFloat() {
         lr, momentum * correction, dXdata, Hdata);
 }

+template <class Context>
+void SGDUpdateOp<Context>::ComputeRunWithFloat16() {
+    Tensor* h = ws()->CreateTensor("/mnt/" + Slot() + "/sgd/h");
+    h->ReshapeLike(Input(0));
+
+    lr = Param("base_lr") * this->lr_mult, momentum = Param("momentum");
+    if (old_lr > 0) { correction = lr / old_lr; } old_lr = lr;
+    auto* dXdata = Input(0).template mutable_data<float16, Context>();
+    auto* Hdata = h->template mutable_data<float16, Context>();
+    kernel::SGDUpdate<float16, Context>(Input(0).count(),
+        lr, momentum * correction, dXdata, Hdata);
+}
+
 DEPLOY_CPU(SGDUpdate);
 #ifdef WITH_CUDA
 DEPLOY_CUDA(SGDUpdate);
...
...
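The correction factor compensates for learning-rate changes between steps: the history buffer stores lr-scaled momentum, so when the schedule moves \alpha_{t-1} to \alpha_t the stale history is rescaled through the momentum term. Equivalently:

c_t = \frac{\alpha_t}{\alpha_{t-1}}, \qquad
h_t = (\mu\, c_t)\, h_{t-1} + \alpha_t\, g_t, \qquad
g_t \leftarrow h_t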
Dragon/src/operators/update/update_op_base.cc
#include "operators/update/update_op_base.h"
#include "core/workspace.h"
#include "utils/math_functions.h"
#include "utils/cast.h"
namespace dragon {
...
...
@@ -27,10 +28,10 @@ void UpdateOpBase<Context>::PreprocessRunWithType() {
     clip_thresh = Param("clip_gradient");
     if (clip_thresh > 0) {
         auto* dXdata = Input(0).template mutable_data<T, Context>();
-        T sumsq_grad = math::Dot<T, Context>(Input(0).count(), dXdata, dXdata);
-        const T l2norm = sqrt(sumsq_grad);
+        float sumsq_grad = math::Dot<T, Context>(Input(0).count(), dXdata, dXdata);
+        const float l2norm = sqrt(sumsq_grad);
         if (l2norm > clip_thresh) {
-            T factor = clip_thresh / l2norm;
+            float factor = clip_thresh / l2norm;
             math::Scal<T, Context>(Input(0).count(), factor, dXdata);
         }
     }
...
...
@@ -48,7 +49,8 @@ void UpdateOpBase<Context>::UpdateRunWithType() {
     auto* dXdata = Input(0).template mutable_data<T, Context>();
     auto* Xdata = Output(0)->template mutable_data<T, Context>();
     math::Axpy<T, Context>(Output(0)->count(), -1.0, dXdata, Xdata);
-    if (zero_grad) math::Set<T, Context>(Input(0).count(), 0, dXdata);
+    T zeroT = dragon_cast<T, float>(0.f);
+    if (zero_grad) math::Set<T, Context>(Input(0).count(), zeroT, dXdata);
 }

 template <class Context>
...
...
@@ -62,7 +64,11 @@ void UpdateOpBase<Context>::RunOnDevice() {
         PreprocessRunWithType<float>();
         ComputeRunWithFloat();
         UpdateRunWithType<float>();
-    } else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
+    } else if (XIsType(Input(0), float16)) {
+        PreprocessRunWithType<float16>();
+        ComputeRunWithFloat16();
+        UpdateRunWithType<float16>();
+    } else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
 }

 template class UpdateOpBase<CPUContext>;
...
...
Dragon/src/operators/vision/cudnn_conv2d_op.cc
...
...
@@ -135,12 +135,14 @@ void CuDNNConv2dOp<Context>::RunOnDevice() {
     } else if (XIsType(Input(0), float16)) {
 #ifdef WITH_CUDA_FP16
 #if CUDNN_VERSION_MIN(6, 0, 0)
+        compute_type = CUDA_TRUE_FP16_AVAILABLE() ?
+            CUDNN_DATA_HALF : CUDNN_DATA_FLOAT;
         CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc,
             this->pad[0], this->pad[1],
             this->stride[0], this->stride[1],
             this->dilation[0], this->dilation[1],
-            CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));
+            CUDNN_CROSS_CORRELATION, compute_type));
 #else
         CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc,
             this->pad[0], this->pad[1],
...
...
@@ -317,12 +319,17 @@ void CuDNNConv2dGradientOp<Context>::RunOnDevice() {
     } else if (XIsType(Input(0), float16)) {
 #ifdef WITH_CUDA_FP16
 #if CUDNN_VERSION_MIN(6, 0, 0)
+        // may encounter CUDNN_STATUS_BAD_PARAM if using CUDNN_DATA_HALF
+        // keep it before cuDNN fix this bug
+        // compute_type = CUDA_TRUE_FP16_AVAILABLE() ?
+        //     CUDNN_DATA_HALF : CUDNN_DATA_FLOAT;
+        compute_type = CUDNN_DATA_FLOAT;
         CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc,
             this->pad[0], this->pad[1],
             this->stride[0], this->stride[1],
             this->dilation[0], this->dilation[1],
-            CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));
+            CUDNN_CROSS_CORRELATION, compute_type));
 #else
         CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc,
             this->pad[0], this->pad[1],
...
...
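Read together, the two hunks above give an asymmetric compute-type policy: the forward convolution may use true FP16 math on Pascal or newer, while the gradient pass stays in FP32 accumulation until the cuDNN CUDNN_STATUS_BAD_PARAM issue is fixed. A condensed sketch (an interpretation of the diff, not code from it):

    // Forward: native half math where available (compute capability >= 6.0).
    cudnnDataType_t fwd_compute_type = CUDA_TRUE_FP16_AVAILABLE() ?
        CUDNN_DATA_HALF : CUDNN_DATA_FLOAT;
    // Backward: pinned to FP32 as a workaround (see the commented-out code).
    cudnnDataType_t bwd_compute_type = CUDNN_DATA_FLOAT;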
Dragon/src/operators/vision/cudnn_conv2d_transpose_op.cc
...
...
@@ -137,12 +137,14 @@ void CuDNNConv2dTransposeOp<Context>::RunOnDevice() {
     } else if (XIsType(Input(0), float16)) {
 #ifdef WITH_CUDA_FP16
 #if CUDNN_VERSION_MIN(6, 0, 0)
+        compute_type = CUDA_TRUE_FP16_AVAILABLE() ?
+            CUDNN_DATA_HALF : CUDNN_DATA_FLOAT;
         CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc,
             this->pad[0], this->pad[1],
             this->stride[0], this->stride[1],
             this->dilation[0], this->dilation[1],
-            CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));
+            CUDNN_CROSS_CORRELATION, compute_type));
 #else
         CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc,
             this->pad[0], this->pad[1],
...
...
@@ -321,12 +323,14 @@ void CuDNNConv2dTransposeGradientOp<Context>::RunOnDevice() {
     } else if (XIsType(Input(0), float16)) {
 #ifdef WITH_CUDA_FP16
 #if CUDNN_VERSION_MIN(6, 0, 0)
+        compute_type = CUDA_TRUE_FP16_AVAILABLE() ?
+            CUDNN_DATA_HALF : CUDNN_DATA_FLOAT;
         CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc,
             this->pad[0], this->pad[1],
             this->stride[0], this->stride[1],
             this->dilation[0], this->dilation[1],
-            CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));
+            CUDNN_CROSS_CORRELATION, compute_type));
 #else
         CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc,
             this->pad[0], this->pad[1],
...
...
Dragon/src/utils/math_functions_fp16.cu
...
...
@@ -20,9 +20,7 @@ __global__ void _SetHalf(const int n, const T alpha, T* x) {
}
}
-template <> void Set<float16, CUDAContext>(const int n,
-                                           const float16 alpha,
-                                           float16* x) {
+template <> void Set<float16, CUDAContext>(const int n, const float16 alpha, float16* x) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0)
_SetHalf<half2> << <GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
...
...
@@ -36,18 +34,29 @@ template <> void Set<float16, CUDAContext>(const int n,
#endif
}
-template <> void RandomUniform<float16, CUDAContext>(const int n,
-                                                     const float low,
-                                                     const float high,
-                                                     float16* x) {
-    NOT_IMPLEMENTED;
-}

+#ifdef WITH_CUDA_FP16
+__global__ void _TypeFloat2Half(const int n, const float* a, half* b) {
+    CUDA_KERNEL_LOOP(idx, n) {
+        b[idx] = __float2half(a[idx]);
+    }
+}
+#endif

 template <> void RandomNormal<float16, CUDAContext>(const int n,
                                                     const float mu,
                                                     const float sigma,
                                                     float16* x) {
-    NOT_IMPLEMENTED;
+#ifdef WITH_CUDA_FP16
+    float* xf32 = (float*)CUDAContext::New(n * sizeof(float));
+    CURAND_CHECK(curandGenerateNormal(curand_generator(), xf32, n, mu, sigma));
+    _TypeFloat2Half << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
+        xf32, reinterpret_cast<half*>(x));
+    CUDA_POST_KERNEL_CHECK;
+    CUDAContext::Delete(xf32);
+#else
+    CUDA_FP16_NOT_COMPILED;
+#endif
 }
/******************** Level-1 ********************/
...
...
@@ -380,12 +389,11 @@ template <> void Scale<float16, CUDAContext>(const int n,
template <> float Dot<float16, CUDAContext>(int n, const float16* a, const float16* b) {
#ifdef WITH_CUDA_FP16
float16 result;
-    CUBLAS_CHECK(cublasDotEx(cublas_handle(),
-        n,
-        &a, CUDA_R_16F, 1,
-        &b, CUDA_R_16F, 1,
-        &result, CUDA_R_16F,
-        CUDA_R_32F));
+    CUBLAS_CHECK(cublasDotEx(cublas_handle(), n,
+        a, CUDA_R_16F, 1,
+        b, CUDA_R_16F, 1,
+        &result, CUDA_R_16F,
+        CUDA_R_32F));
return dragon_cast<float, float16>(result);
#else
CUDA_FP16_NOT_COMPILED;
...
...
@@ -491,6 +499,26 @@ template <> void Axpby<float16, CUDAContext>(const int n,
Axpy<float16, CUDAContext>(n, alpha, x, y);
}
template <> void RandomUniform<float16, CUDAContext>(const int n,
const float low,
const float high,
float16* x) {
#ifdef WITH_CUDA_FP16
float* xf32 = (float*)CUDAContext::New(n * sizeof(float));
CURAND_CHECK(curandGenerateUniform(curand_generator(), xf32, n));
_TypeFloat2Half << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
xf32,
reinterpret_cast<half*>(x));
CUDA_POST_KERNEL_CHECK;
float range = high - low;
if (range != float(1)) Scal<float16, CUDAContext>(n, range, x);
if (low != float(0)) AddScalar<float16, CUDAContext>(n, low, x);
CUDAContext::Delete(xf32);
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
/******************** Level-3 ********************/
template <> void Gemm<float16, CUDAContext>(const CBLAS_TRANSPOSE transA,
...
...
Dragon/src/utils/op_kernel.cc
...
...
@@ -1838,6 +1838,15 @@ template <> void AdamUpdate<float, CPUContext>(const int count,
     _AdamUpdate<float>(count, lr, beta1, beta2, eps, g, m, v);
 }

+template <> void AdamUpdate<float16, CPUContext>(const int count,
+                                                 const float lr,
+                                                 const float beta1,
+                                                 const float beta2,
+                                                 const float eps,
+                                                 float16* g, float16* m, float16* v) {
+    LOG(FATAL) << "float16 is unsupported for CPUContext.";
+}
+
 /******************** update.nesterov_update ********************/

 template <typename T>
...
...
@@ -1862,6 +1871,13 @@ template <> void NesterovUpdate<float, CPUContext>(const int count,
     _NesterovUpdate<float>(count, lr, momentum, g, h);
 }

+template <> void NesterovUpdate<float16, CPUContext>(const int count,
+                                                     const float lr,
+                                                     const float momentum,
+                                                     float16* g, float16* h) {
+    LOG(FATAL) << "float16 is unsupported for CPUContext.";
+}
+
 /******************** update.rmsprop_update ********************/

 template <typename T>
...
...
@@ -1888,6 +1904,14 @@ template <> void RMSPropUpdate<float, CPUContext>(const int count,
     _RMSPropUpdate<float>(count, lr, decay, eps, g, h);
 }

+template <> void RMSPropUpdate<float16, CPUContext>(const int count,
+                                                    const float lr,
+                                                    const float decay,
+                                                    const float eps,
+                                                    float16* g, float16* h) {
+    LOG(FATAL) << "float16 is unsupported for CPUContext.";
+}
+
 /******************** update.sgd_update ********************/

 template <typename T>
...
...
@@ -1911,6 +1935,13 @@ template <> void SGDUpdate<float, CPUContext>(const int count,
     _SGDUpdate<float>(count, lr, momentum, g, h);
 }

+template <> void SGDUpdate<float16, CPUContext>(const int count,
+                                                const float lr,
+                                                const float momentum,
+                                                float16* g, float16* h) {
+    LOG(FATAL) << "float16 is unsupported for CPUContext.";
+}
+
 /******************** vision.bilinear_resize ********************/

 template <typename T>
...
...
Dragon/src/utils/op_kernel.cu
...
...
@@ -2667,9 +2667,9 @@ __global__ void _SGDUpdate(const int count,
}
template <> void SGDUpdate<float, CUDAContext>(const int count,
                                               const float lr,
                                               const float momentum,
                                               float* g, float* h) {
_SGDUpdate<float> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
lr, momentum, g, h);
CUDA_POST_KERNEL_CHECK;
...
...
Dragon/src/utils/op_kernel_fp16.cu
...
...
@@ -482,6 +482,193 @@ template <> void TransposeGrad<float16, CUDAContext>(const int count,
#endif
}
/******************** update.adam_update ********************/
#ifdef WITH_CUDA_FP16
__global__ void _AdamUpdateHalf(const int count,
const half lr,
const half beta1,
const half beta2,
const half eps,
half* g, half* m, half* v) {
CUDA_KERNEL_LOOP(i, count) {
#if __CUDA_ARCH__ >= 530
half gi = g[i];
half kOne = __float2half(1.f);
half mi = m[i] = __hadd(__hmul(m[i], beta1), __hmul(gi, __hsub(kOne, beta1)));
half vi = v[i] = __hadd(__hmul(v[i], beta2), __hmul(gi, __hmul(gi, __hsub(kOne, beta2))));
g[i] = __hdiv(__hmul(lr, mi), __hadd(hsqrt(vi), eps));
#endif
}
}
#endif
template <> void AdamUpdate<float16, CUDAContext>(const int count,
const float lr,
const float beta1,
const float beta2,
const float eps,
float16* g, float16* m, float16* v) {
#ifdef WITH_CUDA_FP16
_AdamUpdateHalf << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
dragon_cast<half, float>(lr),
dragon_cast<half, float>(beta1),
dragon_cast<half, float>(beta2),
dragon_cast<half, float>(eps),
reinterpret_cast<half*>(g),
reinterpret_cast<half*>(m),
reinterpret_cast<half*>(v));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
/******************** update.nesterov_update ********************/
#ifdef WITH_CUDA_FP16
__global__ void _NesterovUpdateHalf(const int count,
const half lr,
const half momentum,
half* g, half* h) {
CUDA_KERNEL_LOOP(i, count) {
#if __CUDA_ARCH__ >= 530
half hi = h[i];
half hi_new = h[i] = __hadd(__hmul(momentum, hi), __hmul(lr, g[i]));
half kOne = __float2half(1.f);
g[i] = __hsub(__hmul(__hadd(kOne, momentum), hi_new), __hmul(momentum, hi));
#endif
}
}
__global__ void _NesterovUpdateHalf2(const int count,
const half2 lr,
const half2 momentum,
half2* g, half2* h) {
CUDA_KERNEL_LOOP(i, count) {
#if __CUDA_ARCH__ >= 530
half2 hi = h[i];
half2 hi_new = h[i] = __hadd2(__hmul2(momentum, hi), __hmul2(lr, g[i]));
half2 kOne = __float2half2_rn(1.f);
g[i] = __hsub2(__hmul2(__hadd2(kOne, momentum), hi_new), __hmul2(momentum, hi));
#endif
}
}
#endif
template <> void NesterovUpdate<float16, CUDAContext>(const int count,
const float lr,
const float momentum,
float16* g, float16* h) {
#ifdef WITH_CUDA_FP16
if (count % 2 == 0) {
_NesterovUpdateHalf2 << <GET_BLOCKS(count / 2), CUDA_NUM_THREADS >> >(count / 2,
dragon_cast<half2, float>(lr),
dragon_cast<half2, float>(momentum),
reinterpret_cast<half2*>(g),
reinterpret_cast<half2*>(h));
} else {
_NesterovUpdateHalf << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
dragon_cast<half, float>(lr),
dragon_cast<half, float>(momentum),
reinterpret_cast<half*>(g),
reinterpret_cast<half*>(h));
}
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
/******************** update.rmsprop_update ********************/
#ifdef WITH_CUDA_FP16
__global__ void _RMSPropUpdateHalf(const int count,
const half lr,
const half decay,
const half eps,
half* g, half* h) {
CUDA_KERNEL_LOOP(i, count) {
#if __CUDA_ARCH__ >= 530
half gi = g[i];
half kOne = __float2half(1.f);
half hi = h[i] = __hadd(__hmul(decay, h[i]), __hmul(__hmul(__hsub(kOne, decay), gi), gi));
g[i] = __hdiv(__hmul(lr, g[i]), __hadd(hsqrt(hi), eps));
#endif
}
}
#endif
template <> void RMSPropUpdate<float16, CUDAContext>(const int count,
const float lr,
const float decay,
const float eps,
float16* g, float16* h) {
#ifdef WITH_CUDA_FP16
_RMSPropUpdateHalf << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
dragon_cast<half, float>(lr),
dragon_cast<half, float>(decay),
dragon_cast<half, float>(eps),
reinterpret_cast<half*>(g),
reinterpret_cast<half*>(h));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
/******************** update.sgd_update ********************/
#ifdef WITH_CUDA_FP16
__global__ void _SGDUpdateHalf(const int count,
const half lr,
const half momentum,
half* g, half* h) {
CUDA_KERNEL_LOOP(i, count) {
#if __CUDA_ARCH__ >= 530
half hi = h[i];
g[i] = h[i] = __hadd(__hmul(momentum, hi), __hmul(lr, g[i]));
#endif
}
}
__global__ void _SGDUpdateHalf2(const int count,
const half2 lr,
const half2 momentum,
half2* g, half2* h) {
CUDA_KERNEL_LOOP(i, count) {
#if __CUDA_ARCH__ >= 530
half2 hi = h[i];
g[i] = h[i] = __hadd2(__hmul2(momentum, hi), __hmul2(lr, g[i]));
#endif
}
}
#endif
template <> void SGDUpdate<float16, CUDAContext>(const int count,
const float lr,
const float momentum,
float16* g, float16* h) {
#ifdef WITH_CUDA_FP16
if (count % 2 == 0) {
_SGDUpdateHalf2 << <GET_BLOCKS(count / 2), CUDA_NUM_THREADS >> >(count / 2,
dragon_cast<half2, float>(lr),
dragon_cast<half2, float>(momentum),
reinterpret_cast<half2*>(g),
reinterpret_cast<half2*>(h));
} else {
_SGDUpdateHalf << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
dragon_cast<half, float>(lr),
dragon_cast<half, float>(momentum),
reinterpret_cast<half*>(g),
reinterpret_cast<half*>(h));
}
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
} // namespace kernel
} // namespace dragon
...
...
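A note on the even-count fast path in the SGD and Nesterov kernels above: when count is even, g and h are reinterpreted as half2 so each thread updates two packed 16-bit lanes with a single __hadd2/__hmul2, and dragon_cast<half2, float> is assumed to broadcast the scalar into both lanes. The equivalent CUDA intrinsic (the half arithmetic itself requires sm_53 or newer):

    // Sketch: both lanes of the returned half2 hold (half)v, matching what
    // the packed kernels expect for lr and momentum.
    __device__ __half2 BroadcastHalf2(float v) {
        return __float2half2_rn(v);
    }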