Commit 96f7277e by Ting PAN

Add Cambricon's CNML Context

1 parent 5cd0761b
Showing with 2334 additions and 418 deletions
------------------------------------------------------------------------
The list of the most significant changes made to Dragon over time.
Dragon 0.2.2.12 (20181120)
DRAGON_VERSION == 2212
Changes (w.r.t. Dragon 0.2.2.11):
Preview Features:
- Added Cambricon's CNML context.
- Added support for the Int8 (char) Tensor.
- Removed the CUDA device id query from pointers.
- Added ``DropBlock2dOp``.
- Added ``MaximumOp``, ``MinimumOp``, ``NLLLossOp``.
- Added CuDNN support for ``BiasAddOp``.
- Optimized memory usage of ``DropoutOp``.
- Replaced ``thread_local`` with a platform-specific TLS solution.
- Changed the default norm eps from 1e-3 to 1e-5,
affected: ``BatchNorm``, ``BatchRenorm``, ``GroupNorm``, ``InstanceNorm``, ``L2Norm``.
- Enforced CUDA FP16 support (i.e., removed ``WITH_CUDA_FP16``).
- [PyTorch] Added ``torch.one_hot``.
- [PyTorch] Added ``torch.log``, ``Tensor.log``, ``torch.exp`` and ``Tensor.exp``.
- [PyTorch] Added ``torch.minimum``, ``torch.maximum``,
``torch.clamp``, ``Tensor.clamp``, ``Tensor.clamp_``.
- [PyTorch] Added ``nn.ELU`` and ``nn.SELU``.
- [PyTorch] Added ``nn.GroupNorm``.
- [PyTorch] Added ``nn.NLLLoss``, ``nn.BCEWithLogitsLoss``,
``nn.L1Loss``, ``nn.MSELoss``, ``nn.SmoothL1Loss``.
- [PyTorch] Added ``nn.DropBlock2d``.
- [PyTorch] Added ``train`` and ``eval`` modes for Module,
affected: ``nn.BatchNorm``, ``nn.Dropout``.
- [PyTorch] Deprecated ``size_average`` and ``reduce`` in
``nn.Loss``; added ``reduction`` instead.
- [PyTorch] ``torch.save`` can now save both ``torch.Tensor`` and other picklable values.
- [PyCaffe] Added ``DropBlockLayer``.
Bugs fixed:
- Fixed the uncomputed output in ``BiasAddGradientOp``.
- Fixed the incorrect gradients of ``ClipGradientOp``.
- Fixed incorrect results from ``math::Inv`` under ``CPUContext``.
- Fixed an issue where the default device was used when initializing NCCL.
- Removed the strict shape check in ``SmoothL1Op``.
- Fixed incorrect CXX API exports under Win32.
- [PyTorch] Fixed an issue where multiple ``GradientGather`` ops were triggered by one Operator.
- [PyTorch] Fixed the schema check for in-place fundamental ops.
- [PyTorch] Fixed the missing shape and dtype after ``Tensor.copy_``.
- [PyTorch] Fixed an issue where ``Tensor.fill_`` and ``Tensor.zero_``
would change the data type of a non-empty Tensor.
- [PyTorch] Fixed the Python 2 integer check.
------------------------------------------------------------------------
\ No newline at end of file
......@@ -8,10 +8,14 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
unzip \
ssh \
vim \
libtbb-dev \
libsdl2-dev \
libnuma-dev \
libprotobuf-dev \
protobuf-compiler \
libopencv-dev \
libopenblas-dev \
libboost-all-dev \
python3-pip \
python3-dev \
python3-pyqt4 \
......@@ -40,3 +44,5 @@ RUN git clone https://github.com/seetaresearch/Dragon.git && \
wget http://dragon.seetatech.com/download/docker/ubuntu-16.04-cpu-openblas/CMakeLists.txt && \
mkdir build && cd build && cmake .. && make install -j8 && cd .. && rm -rf build && \
cd python && python3 setup.py install
RUN rm /usr/bin/python && ln -s /usr/bin/python3 /usr/bin/python && ln -s /usr/bin/pip3 /usr/bin/pip
\ No newline at end of file
......@@ -9,10 +9,14 @@ RUN rm /etc/apt/sources.list.d/cuda.list && rm /etc/apt/sources.list.d/nvidia-ml
unzip \
ssh \
vim \
libtbb-dev \
libsdl2-dev \
libnuma-dev \
libprotobuf-dev \
protobuf-compiler \
libopencv-dev \
libopenblas-dev \
libboost-all-dev \
libnccl2 \
libnccl-dev \
python3-pip \
......@@ -43,3 +47,5 @@ RUN git clone https://github.com/seetaresearch/Dragon.git && \
wget http://dragon.seetatech.com/download/docker/ubuntu-16.04-cuda9.0-cudnn7/CMakeLists.txt && \
mkdir build && cd build && cmake .. && make install -j8 && cd .. && rm -rf build && \
cd python && python3 setup.py install
RUN rm /usr/bin/python && ln -s /usr/bin/python3 /usr/bin/python && ln -s /usr/bin/pip3 /usr/bin/pip
\ No newline at end of file
......@@ -17,7 +17,6 @@ option(WITH_SSE "Set ON to use SSE 4.1" ON)
option(WITH_MPI "Set ON to use MPI" OFF)
option(WITH_MPI_CUDA "Set ON to use MPI-CUDA" OFF)
option(WITH_MPI_NCCL "Set ON to use MPI-NCCL" OFF)
option(WITH_CUDA_FP16 "Set ON to use FP16" ON)
# Set your 3rdparty
set(3RDPARTY_DIR ${PROJECT_SOURCE_DIR}/../3rdparty)
......@@ -163,10 +162,6 @@ if (WITH_MPI_NCCL)
ADD_DEFINITIONS(-DWITH_MPI_NCCL)
message(STATUS "Use MPI-NCCL [Optional]")
endif()
if (WITH_CUDA_FP16)
ADD_DEFINITIONS(-DWITH_CUDA_FP16)
message(STATUS "Use CUDA FP16 [Optional]")
endif()
# ---[ Flags
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${CUDA_ARCH}")
......
......@@ -13,6 +13,7 @@
#define DRAGON_CORE_COMMON_H_
#include <ctime>
#include <random>
#include <climits>
#include <memory>
#include <string>
......@@ -49,25 +50,35 @@ using Map = std::unordered_map<Key, Value>;
template <typename Value>
using Set = std::unordered_set<Value>;
/*
* Define the Kernel version.
*
* | Major(2) | Minor(2) | Patch(11) |
*/
#define DRAGON_VERSION 2211
/* * * * * * * * * * * * * * * * * * * * *
* *
* Kernel Version *
* *
* Major(2) | Minor(2) | Patch(12) *
* *
* * * * * * * * * * * * * * * * * * * * */
#define DRAGON_VERSION 2212
/* * * * * * * * * * * * * * * * * * * * *
* *
* Default Random Seed *
* *
* * * * * * * * * * * * * * * * * * * * */
/*
* Define the default random seed.
*/
#define DEFAULT_RNG_SEED 3
/*
* Define the common marcos.
*/
#ifdef _MSC_VER
#if _MSC_VER < 1900
#define thread_local __declspec(thread)
#endif
/* * * * * * * * * * * * * * * * * * * * *
* *
* Macros *
* *
* * * * * * * * * * * * * * * * * * * * */
// Avoid using "thread_local" on VS2013 or older Xcode
#if defined(__clang__) || defined(__GNUC__)
#define TLS_OBJECT __thread
#else
#define TLS_OBJECT __declspec(thread)
#endif
#define CONCATENATE_IMPL(s1, s2) s1##s2
......
......@@ -12,15 +12,8 @@
#ifndef DRAGON_CORE_CONTEXT_H_
#define DRAGON_CORE_CONTEXT_H_
#include <random>
#include <ctime>
#include "core/common.h"
#ifdef WITH_CUDA
#include "utils/cuda_device.h"
#endif
namespace dragon {
class CPUContext {
......@@ -45,7 +38,7 @@ class CPUContext {
#else
data = malloc(nbytes);
#endif
CHECK(data) << "Malloc mem: " << nbytes << " bytes failed.";
CHECK(data) << "\nMalloc mem: " << nbytes << " bytes failed.";
return data;
}
......
// ------------------------------------------------------------
// Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
//
// Licensed under the BSD 2-Clause License.
// You should have received a copy of the BSD 2-Clause License
// along with the software. If not, See,
//
// <https://opensource.org/licenses/BSD-2-Clause>
//
// ------------------------------------------------------------
#ifndef DRAGON_CORE_CONTEXT_CNML_H_
#define DRAGON_CORE_CONTEXT_CNML_H_
/* CAMBRICON's CNRT & CNML Environment */
#include "core/common.h"
struct cnrtStream;
struct cnmlCpuTensor;
struct cnmlTensor;
struct cnmlFusionOp;
typedef struct cnrtStream* cnrtStream_t;
typedef struct cnmlCpuTensor* cnmlCpuTensor_t;
typedef struct cnmlTensor* cnmlTensor_t;
typedef struct cnmlFusionOp* cnmlFusionOp_t;
namespace dragon {
class CNRTObject;
class CNMLContext {
public:
CNMLContext(const DeviceOption& option)
: device_id_(option.device_id()),
random_seed_(option.has_random_seed() ?
option.random_seed() : DEFAULT_RNG_SEED) {
CHECK_EQ(option.device_type(), CNML);
}
CNMLContext(const int device_id = 0)
: device_id_(device_id),
random_seed_(DEFAULT_RNG_SEED) {}
void SwitchToDevice(int stream_id);
inline void SwitchToDevice() { SwitchToDevice(1); }
void FinishDeviceCompution();
static void* New(size_t nbytes);
static void Memset(
size_t nbytes,
void* ptr);
inline void MemsetAsync(
size_t nbytes,
void* ptr) {
Memset(nbytes, ptr);
}
template<class DstContext, class SrcContext>
static void Memcpy(
size_t nbytes,
void* dst,
const void* src);
template<class DstContext, class SrcContext>
inline void MemcpyAsync(
size_t nbytes,
void* dst,
const void* src) {
Memcpy<DstContext, SrcContext>(nbytes, dst, src);
}
static void Delete(void* data);
inline int device_id() const { return device_id_; }
inline void set_stream_id(int stream_id) { stream_id_ = stream_id; }
inline cnrtStream_t cnrt_stream() {
return cnrt_stream(device_id_, stream_id_);
}
static cnrtStream_t cnrt_stream(
int device_id,
int stream_id);
static std::mutex& mutex() { static std::mutex m; return m; }
static thread_local CNRTObject cnrt_object_;
private:
int device_id_, stream_id_ = 1, random_seed_;
unique_ptr<std::mt19937> rand_generator_;
};
} // namespace dragon
#endif // DRAGON_CORE_CONTEXT_CNML_H_
\ No newline at end of file
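As a reading aid (not part of the commit): a minimal sketch of how the
allocator/copy API declared above would be driven, assuming a working CNRT
runtime; device setup and error handling are omitted.

    #include <vector>
    #include "core/context.h"
    #include "core/context_cnml.h"

    void CopyFromMLU() {
        // Allocate 4KB on the MLU, stage it back to the host, then free it.
        void* mlu_data = dragon::CNMLContext::New(4096);
        std::vector<char> host(4096);
        dragon::CNMLContext::Memcpy<dragon::CPUContext, dragon::CNMLContext>(
            4096, host.data(), mlu_data);  // (nbytes, dst, src)
        dragon::CNMLContext::Delete(mlu_data);
    }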
......@@ -12,8 +12,9 @@
#ifndef DRAGON_CORE_CONTEXT_CUDA_H_
#define DRAGON_CORE_CONTEXT_CUDA_H_
/* NVIDIA's CUDA Environment */
#include "core/common.h"
#include "core/context.h"
#include "utils/cuda_device.h"
#include "utils/cudnn_device.h"
......@@ -52,13 +53,13 @@ class CUDAObject {
}
// follow the caffe2,
// each device takes a group of non-bl0cking streams
// each device takes a group of non-blocking streams
// the stream 0 is reserved for default stream,
// as some computations really require it,
// e.g. cublas.asum() and mixed cpu/cuda operations
// besides, some calls, such as cudnn.conv() and cudnn.rnn(),
// produce wrong results if running them on non-blocking streams
// note that caffe2 also use default streams (within CuDNNState)
// note that caffe2 also uses default streams (within CuDNNState)
cudaStream_t GetStream(int device_id, int stream_id) {
vector<cudaStream_t>& dev_streams = cuda_streams[device_id];
if (dev_streams.size() <= (unsigned)stream_id)
......@@ -140,7 +141,7 @@ class CUDAContext {
inline static void* New(size_t nbytes) {
void* data;
cudaMalloc(&data, nbytes);
CHECK(data) << "Malloc cuda mem: "
CHECK(data) << "\nMalloc cuda mem: "
<< nbytes << " bytes failed.";
return data;
}
......@@ -199,11 +200,11 @@ class CUDAContext {
static cudaStream_t cuda_stream(
int device_id,
int stream_id) {
return cuda_object_.GetStream(device_id, stream_id);
return cuda_object()->GetStream(device_id, stream_id);
}
cublasHandle_t cublas_handle() {
return cuda_object_.GetCuBLASHandle(device_id_, stream_id_);
return cuda_object()->GetCuBLASHandle(device_id_, stream_id_);
}
inline std::mt19937* rand_generator() {
......@@ -227,13 +228,17 @@ class CUDAContext {
#ifdef WITH_CUDNN
cudnnHandle_t cudnn_handle() {
return cuda_object_.GetCuDNNHandle(device_id_, stream_id_);
return cuda_object()->GetCuDNNHandle(device_id_, stream_id_);
}
#endif
static std::mutex& mutex() { static std::mutex m; return m; }
static thread_local CUDAObject cuda_object_;
static CUDAObject* cuda_object() {
static TLS_OBJECT CUDAObject* cuda_object_;
if (!cuda_object_) cuda_object_ = new CUDAObject();
return cuda_object_;
}
private:
int device_id_, stream_id_ = 1, random_seed_;
......
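The cuda_object() accessor above shows the new TLS pattern: because
__thread and __declspec(thread) support neither constructors nor
destructors, the old thread_local member becomes a lazily-new'd,
deliberately leaked per-thread pointer. A self-contained sketch of the
same idiom, with a hypothetical payload type:

    #include <vector>
    #include "core/common.h"  // for TLS_OBJECT

    struct PerThreadScratch { std::vector<char> buffer; };

    PerThreadScratch* per_thread_scratch() {
        // Zero-initialized per thread; each thread news its own instance
        // on first use and never frees it.
        static TLS_OBJECT PerThreadScratch* scratch_;
        if (!scratch_) scratch_ = new PerThreadScratch();
        return scratch_;
    }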
......@@ -48,10 +48,10 @@ class GraphBase {
Workspace* ws_;
};
class Graph final : public GraphBase {
class Graph : public GraphBase {
public:
Graph(const GraphDef& meta_graph, Workspace* ws);
~Graph() { for (auto* op : ops_) delete op; }
virtual ~Graph() { for (auto* op : ops_) delete op; }
bool Create(
const GraphDef& optimized_graph,
......@@ -73,7 +73,7 @@ class Graph final : public GraphBase {
inline Workspace* ws() const { return ws_; }
private:
protected:
void ForwardShareDyeing(string u, string ancestor);
void ForwardPruneDyeing(
string u,
......@@ -98,6 +98,9 @@ DECLARE_REGISTRY(
const GraphDef&,
Workspace*);
#define REGISTER_GRAPH(name, ...) \
REGISTER_CLASS(GraphRegistry, name, __VA_ARGS__)
} // namespace dragon
#endif // DRAGON_CORE_GRAPH_H_
\ No newline at end of file
......@@ -12,30 +12,49 @@
#ifndef DRAGON_CORE_MIXEDMEM_H_
#define DRAGON_CORE_MIXEDMEM_H_
#include "context.h"
#include "context_cuda.h"
#include "core/context.h"
#include "core/context_cuda.h"
#include "core/context_cnml.h"
namespace dragon {
typedef enum {
NCHW,
NHWC,
} DataOrder;
class MixedMemory {
public:
enum State {
typedef enum {
UNINITIALIZED,
STATE_AT_CPU,
STATE_AT_CUDA,
STATE_AT_CNML,
SWITCHED,
SYNCED };
SYNCED,
} State;
MixedMemory() : cpu_ptr_(nullptr), cuda_ptr_(nullptr) {}
MixedMemory() : cpu_ptr_(nullptr),
cuda_ptr_(nullptr), cnml_ptr_(nullptr) {}
MixedMemory(const TypeMeta& meta, const size_t nbytes)
: meta_(meta), nbytes_(nbytes),
cpu_ptr_(nullptr), cuda_ptr_(nullptr) {}
: meta_(meta), nbytes_(nbytes), cpu_ptr_(nullptr),
cuda_ptr_(nullptr), cnml_ptr_(nullptr) {}
~MixedMemory();
const void* cpu_data();
const void* cuda_data();
const void* cnml_data();
void* mutable_cpu_data();
void* mutable_cuda_data();
void* mutable_cnml_data();
void* malloc_cnml_data();
void fetch_cnml_data(void** data);
cnmlCpuTensor_t& cnml_cpu_tensor();
cnmlTensor_t& cnml_mlu_tensor();
void set_cpu_data(void* cpu_ptr, size_t nbytes);
void SwitchToDevice();
......@@ -43,23 +62,35 @@ class MixedMemory {
inline size_t nbytes() const { return nbytes_; }
inline void* cpu_ptr() { state_ = STATE_AT_CPU; return cpu_ptr_; }
inline void* cuda_ptr() { state_ = STATE_AT_CUDA; return cuda_ptr_; }
inline size_t nchunks() const { return nchunks_; }
void set_nchunks(size_t nchunks) { nchunks_ = nchunks; }
inline State state() const { return state_; }
inline DataOrder order() const { return order_; }
inline void set_order(DataOrder order) { order_ = order; }
const Map<string, string> info() const;
void ToCUDA();
void ToCPU();
void ToCUDA();
private:
void* cpu_ptr_, *cuda_ptr_;
bool own_cpu_ptr_ = true;
State state_ = UNINITIALIZED;
size_t nbytes_ = 0;
TypeMeta meta_;
size_t nbytes_ = 0, nchunks_ = 1;
DataOrder order_ = NCHW;
State state_ = UNINITIALIZED;
void* cpu_ptr_, *cuda_ptr_, *cnml_ptr_;
int own_cpu_ptr_ = 1, ptr_device_ = 0;
/* For CAMBRICON's CNML Environment */
cnmlCpuTensor_t cnml_cpu_tensor_ = nullptr;
cnmlTensor_t cnml_mlu_tensor_ = nullptr;
};
} // namespace dragon
#endif
\ No newline at end of file
#endif // DRAGON_CORE_MIXEDMEM_H_
\ No newline at end of file
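A usage sketch of the MixedMemory state machine declared above, assuming a
CUDA build (the CNML path is symmetric via mutable_cnml_data() and
cnml_data()); the transitions are inferred from the accessors and may
differ in detail from the implementation.

    #include "core/mixedmem.h"

    void RoundTrip() {
        dragon::MixedMemory mem(
            dragon::TypeMeta::Make<float>(), 4 * sizeof(float));
        // First host write: allocates CPU memory, state -> STATE_AT_CPU.
        float* host = static_cast<float*>(mem.mutable_cpu_data());
        host[0] = 1.f;
        // First device read: copies host -> device and marks both in sync.
        const void* dev = mem.cuda_data();
        // A later mutable device access would invalidate the host copy.
        (void)dev;
    }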
......@@ -44,7 +44,9 @@ class OperatorBase {
const string& anchor);
inline void SwitchToPhase(const string& phase) { phase_ = phase; }
virtual void Run(int stream_id = 1) { NOT_IMPLEMENTED; }
virtual void Fusion(void* graph) { NOT_IMPLEMENTED; }
inline const string& name() const { return def_.name(); }
inline const string& type() const { return def_.type(); }
......@@ -186,12 +188,22 @@ DECLARE_REGISTRY(
const OperatorDef&,
Workspace*);
/* NVIDIA's Accelerated Library - CUDNN */
DECLARE_REGISTRY(
CUDNNOperatorRegistry,
OperatorBase,
const OperatorDef&,
Workspace*);
/* CAMBRICON's Accelerated Library - CNML */
DECLARE_REGISTRY(
CNMLOperatorRegistry,
OperatorBase,
const OperatorDef&,
Workspace*);
#define TENSOR_FILL_WITH_TYPE(tensor, shape, type) \
if (tensor.count() == 0) { \
CHECK(ws()->GetFiller(tensor.name())) \
......@@ -310,6 +322,9 @@ DECLARE_REGISTRY(
#define INSTANTIATE_CUDNN_OPERATOR(name) \
template class CuDNN##name##Op<CUDAContext>;
#define INSTANTIATE_CNML_OPERATOR(name) \
template class CnML##name##Op<CNMLContext>;
#define REGISTER_CPU_OPERATOR(name, ...) \
REGISTER_CLASS(CPUOperatorRegistry, name, __VA_ARGS__)
......@@ -319,6 +334,9 @@ DECLARE_REGISTRY(
#define REGISTER_CUDNN_OPERATOR(name, ...) \
REGISTER_CLASS(CUDNNOperatorRegistry, name, __VA_ARGS__)
#define REGISTER_CNML_OPERATOR(name, ...) \
REGISTER_CLASS(CNMLOperatorRegistry, name, __VA_ARGS__)
#define DEPLOY_CPU(name) \
REGISTER_CPU_OPERATOR(name, name##Op<CPUContext>); \
INSTANTIATE_OPERATOR(name, CPUContext);
......@@ -336,6 +354,10 @@ DECLARE_REGISTRY(
REGISTER_CUDNN_OPERATOR(name, CuDNN##name##Op<CUDAContext>); \
INSTANTIATE_CUDNN_OPERATOR(name);
#define DEPLOY_CNML(name) \
REGISTER_CNML_OPERATOR(name, CnML##name##Op<CNMLContext>); \
INSTANTIATE_CNML_OPERATOR(name);
} // namespace dragon
#endif // DRAGON_CORE_OPERATOR_H_
\ No newline at end of file
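To show how the new registry plumbs together (illustrative only; the CNML
operators actually ported by this commit live under src/): a hypothetical
CnMLReluOp and its deployment. DEPLOY_CNML(Relu) expands to a registration
of CnMLReluOp<CNMLContext> under the key "Relu" plus an explicit template
instantiation.

    template <class Context>
    class CnMLReluOp final : public Operator<Context> {
     public:
        USE_SIMPLE_CTOR_DTOR(CnMLReluOp);
        USE_OPERATOR_FUNCTIONS;
        // Would build a cnmlFusionOp once, then enqueue it per run.
        void RunOnDevice() override { /* ... */ }
    };

    DEPLOY_CNML(Relu);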
......@@ -10,7 +10,7 @@
// ------------------------------------------------------------
#ifndef DRAGON_CORE_TENSOR_H_
#define DRAONG_CORE_TENSOR_H_
#define DRAGON_CORE_TENSOR_H_
#include "core/common.h"
#include "core/mixedmem.h"
......@@ -103,16 +103,20 @@ class Tensor {
return offset;
}
inline string DimString() const {
if (ndim() == 0) return "(0,)";
static inline string DimString(
const vector<TIndex>& dims) {
if (dims.size() == 0) return "(0,)";
std::stringstream ss;
ss << "(";
for (int i = 0; i < ndim() - 1; i++) ss << dim(i) << ",";
if (ndim() == 1) ss << dim(0) << ",)";
else ss << dim(ndim() - 1) << ")";
for (int i = 0; i < dims.size() - 1; i++)
ss << dims[i] << ",";
if (dims.size() == 1) ss << dims[0] << ",)";
else ss << dims.back() << ")";
return ss.str();
}
inline string DimString() const { return DimString(dims_); }
inline bool is_corrupted() const { return is_corrupted_; }
inline void Corrupt() { is_corrupted_ = true; }
......@@ -156,9 +160,12 @@ class Tensor {
} else if (TypeMeta::Id<Context>() ==
TypeMeta::Id<CUDAContext>()) {
*data_ptr = mem->mutable_cuda_data();
} else if (TypeMeta::Id<Context>() ==
TypeMeta::Id<CNMLContext>()) {
*data_ptr = mem->mutable_cnml_data();
} else {
LOG(FATAL) << "Unknown memory type. "
<< "Only CPU or CUDA is supported.";
LOG(FATAL) << "Unknown memory type.\n"
<< "Only CPU, CUDA and CNML are supported.";
}
}
}
......@@ -173,9 +180,12 @@ class Tensor {
} else if (TypeMeta::Id<Context>() ==
TypeMeta::Id<CUDAContext>()) {
return mem->cuda_data();
} else if (TypeMeta::Id<Context>() ==
TypeMeta::Id<CNMLContext>()) {
return mem->cnml_data();
} else {
LOG(FATAL) << "Unknown memory type. "
<< "Only CPU or CUDA are supported.";
LOG(FATAL) << "Unknown memory type.\n"
<< "Only CPU, CUDA, and CNML are supported.";
return nullptr;
}
}
......@@ -295,4 +305,4 @@ class Tensor {
} // namespace dragon
#endif // DRAONG_CORE_TENSOR_H_
\ No newline at end of file
#endif // DRAGON_CORE_TENSOR_H_
\ No newline at end of file
......@@ -18,6 +18,9 @@
namespace dragon {
typedef char int8;
typedef unsigned char uint8;
#ifdef _MSC_VER
typedef struct __declspec(align(2)) {
......@@ -49,8 +52,8 @@ inline const TypeMeta& TypeStringToMeta(
{ "int64", TypeMeta::Make<int64_t>() },
{ "float64", TypeMeta::Make<double>() },
{ "float16", TypeMeta::Make<float16>() },
{ "uint8", TypeMeta::Make<uint8_t>() },
{ "int8", TypeMeta::Make<char>() },
{ "uint8", TypeMeta::Make<uint8>() },
{ "int8", TypeMeta::Make<int8>() },
};
static TypeMeta unknown_type;
return s2m_type_map.count(str_type) ?
......@@ -66,8 +69,8 @@ inline const std::string TypeMetaToString(
{ TypeMeta::Id<int64_t>(), "int64" },
{ TypeMeta::Id<double>(), "float64", },
{ TypeMeta::Id<float16>(), "float16" },
{ TypeMeta::Id<uint8_t>(), "uint8" },
{ TypeMeta::Id<char>(), "int8" }
{ TypeMeta::Id<uint8>(), "uint8" },
{ TypeMeta::Id<int8>(), "int8" }
};
return m2s_type_map.count(meta.id()) ?
m2s_type_map[meta.id()] : "unknown";
......
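A quick round-trip over the remapped aliases above (illustrative; CHECK
macros from utils/logging.h): "int8" now resolves through the int8 typedef
rather than a bare char.

    void CheckInt8Alias() {
        const dragon::TypeMeta& meta = dragon::TypeStringToMeta("int8");
        CHECK_EQ(meta.id(), dragon::TypeMeta::Id<dragon::int8>());
        CHECK_EQ(dragon::TypeMetaToString(meta), "int8");
    }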
......@@ -47,8 +47,8 @@ class Workspace {
recompute_flag->Reshape({ 1 });
recompute_flag->mutable_data<bool, CPUContext>()[0] = false;
for (int i = 0; i < WORKSPACE_MAX_CORRUPTED_SIZE; i++) {
string name = "/opt/mirror_stage/buffer_" +
dragon_cast<string, int>(i);
string name = "/opt/mirror_stage/buffer_"
+ std::to_string(i);
Tensor* buffer = CreateTensor(name);
head->mutable_data<string, CPUContext>()[i] = "";
}
......@@ -277,7 +277,8 @@ class Workspace {
inline bool SetProxy(
const string& key,
const string& proxy) {
if (proxy_map_.count(key))
if (key == proxy) return false;
if (proxy_map_.count(key) > 0)
return proxy_map_[key] == proxy;
proxy_map_[key] = proxy;
return true;
......
......@@ -23,7 +23,7 @@ class DropoutOp final : public Operator<Context> {
DropoutOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
use_scale(OperatorBase::Arg<bool>("scale", true)) {
GET_ARGUMENT_WITH_DESC(float, prob, 0.5);
GET_ARGUMENT_WITH_DESC(float, prob, 0.5f);
SwitchToPhase(OperatorBase::Arg<string>("phase", ""));
}
USE_OPERATOR_FUNCTIONS;
......@@ -42,7 +42,7 @@ class DropoutGradientOp final : public Operator<Context> {
DropoutGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
use_scale(OperatorBase::Arg<bool>("scale", true)) {
GET_ARGUMENT_WITH_DESC(float, prob, 0.5);
GET_ARGUMENT_WITH_DESC(float, prob, 0.5f);
SwitchToPhase(OperatorBase::Arg<string>("phase", ""));
}
USE_OPERATOR_FUNCTIONS;
......@@ -53,7 +53,6 @@ class DropoutGradientOp final : public Operator<Context> {
protected:
DECLARE_ARGUMENT_WITH_DESC(float, prob);
bool use_scale;
Tensor* mask;
};
DEFINE_ARGUMENT_WITH_DESC(float, DropoutOp, prob);
......@@ -70,7 +69,7 @@ public:
: Operator<Context>(def, ws), states_initialized(false),
use_scale(OperatorBase::Arg<bool>("scale", true)),
random_seed(DEFAULT_RNG_SEED) {
GET_ARGUMENT_WITH_DESC(float, prob, 0.5);
GET_ARGUMENT_WITH_DESC(float, prob, 0.5f);
SwitchToPhase(OperatorBase::Arg<string>("phase", ""));
CUDNN_CHECK(cudnnCreateTensorDescriptor(&input_desc));
CUDNN_CHECK(cudnnCreateDropoutDescriptor(&dropout_desc));
......@@ -101,7 +100,7 @@ public:
: Operator<Context>(def, ws), states_initialized(false),
use_scale(OperatorBase::Arg<bool>("scale", true)),
random_seed(DEFAULT_RNG_SEED) {
GET_ARGUMENT_WITH_DESC(float, prob, 0.5);
GET_ARGUMENT_WITH_DESC(float, prob, 0.5f);
SwitchToPhase(OperatorBase::Arg<string>("phase", ""));
CUDNN_CHECK(cudnnCreateTensorDescriptor(&input_desc));
CUDNN_CHECK(cudnnCreateDropoutDescriptor(&dropout_desc));
......
......@@ -21,7 +21,7 @@ class ReluOp : public Operator<Context> {
public:
ReluOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
slope(OperatorBase::Arg<float>("slope", 0.0)) {}
slope(OperatorBase::Arg<float>("slope", 0.f)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......@@ -36,7 +36,7 @@ class ReluGradientOp : public Operator<Context> {
public:
ReluGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
slope(OperatorBase::Arg<float>("slope", 0.0)) {}
slope(OperatorBase::Arg<float>("slope", 0.f)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......
......@@ -48,8 +48,6 @@ class SoftmaxGradientOp final : public Operator<Context> {
#ifdef WITH_CUDNN
#include "utils/cudnn_device.h"
template <class Context>
class CuDNNSoftmaxOp final : public Operator<Context> {
public:
......@@ -70,8 +68,7 @@ class CuDNNSoftmaxOp final : public Operator<Context> {
template <typename T> void RunWithType();
protected:
int axis;
TIndex outer_dim, inner_dim;
TIndex axis, outer_dim, inner_dim;
cudnnTensorDescriptor_t input_desc, output_desc;
};
......@@ -95,8 +92,7 @@ class CuDNNSoftmaxGradientOp final : public Operator<Context> {
template <typename T> void RunWithType();
protected:
int axis;
TIndex outer_dim, inner_dim;
TIndex axis, outer_dim, inner_dim;
cudnnTensorDescriptor_t input_desc, output_desc;
};
......
......@@ -55,7 +55,7 @@ class AffineGradientOp final : public Operator<Context> {
#ifdef WITH_CUDNN
#include "utils/cudnn_device.h"
#if CUDNN_VERSION_MIN(6, 0, 0)
template <class Context>
class CuDNNAffineOpBase : public Operator<Context> {
......@@ -152,6 +152,8 @@ protected:
Tensor sum_result;
};
#endif
#endif // WITH_CUDNN
} // namespace dragon
......
......@@ -36,11 +36,17 @@ class ClipOp final : public Operator<Context> {
template <class Context>
class ClipGradientOp final : public Operator<Context> {
public:
USE_SIMPLE_CTOR_DTOR(ClipGradientOp);
ClipGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
low(OperatorBase::Arg<float>("low", -FLT_MAX)),
high(OperatorBase::Arg<float>("high", FLT_MAX)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T> void RunWithType();
protected:
float low, high;
};
} // namespace dragon
......
// ------------------------------------------------------------
// Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
//
// Licensed under the BSD 2-Clause License.
// You should have received a copy of the BSD 2-Clause License
// along with the software. If not, See,
//
// <https://opensource.org/licenses/BSD-2-Clause>
//
// ------------------------------------------------------------
#ifndef DRAGON_OPERATORS_ARITHMETIC_MAXIMUM_OP_H_
#define DRAGON_OPERATORS_ARITHMETIC_MAXIMUM_OP_H_
#include "core/operator.h"
namespace dragon {
template <class Context>
class MaximumOp final : public Operator<Context> {
public:
USE_SIMPLE_CTOR_DTOR(MaximumOp);
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T> void EltwiseRunWithType();
template <typename T> void BroadcastRunWithType();
};
template <class Context>
class MaximumGradientOp final : public Operator<Context> {
public:
USE_SIMPLE_CTOR_DTOR(MaximumGradientOp);
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T> void EltwiseRunWithType();
template <typename T> void BroadcastRunWithType();
};
} // namespace dragon
#endif // DRAGON_OPERATORS_ARITHMETIC_MAXIMUM_OP_H_
\ No newline at end of file
// ------------------------------------------------------------
// Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
//
// Licensed under the BSD 2-Clause License.
// You should have received a copy of the BSD 2-Clause License
// along with the software. If not, See,
//
// <https://opensource.org/licenses/BSD-2-Clause>
//
// ------------------------------------------------------------
#ifndef DRAGON_OPERATORS_ARITHMETIC_MINIMUM_OP_H_
#define DRAGON_OPERATORS_ARITHMETIC_MINIMUM_OP_H_
#include "core/operator.h"
namespace dragon {
template <class Context>
class MinimumOp final : public Operator<Context> {
public:
USE_SIMPLE_CTOR_DTOR(MinimumOp);
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T> void EltwiseRunWithType();
template <typename T> void BroadcastRunWithType();
};
template <class Context>
class MinimumGradientOp final : public Operator<Context> {
public:
USE_SIMPLE_CTOR_DTOR(MinimumGradientOp);
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T> void EltwiseRunWithType();
template <typename T> void BroadcastRunWithType();
};
} // namespace dragon
#endif // DRAGON_OPERATORS_ARITHMETIC_MINIMUM_OP_H_
\ No newline at end of file
......@@ -43,8 +43,6 @@ public:
#if CUDNN_VERSION_MIN(7, 0, 0)
#include "utils/cudnn_device.h"
template <class Context>
class CuDNNCTCLossOp final : public Operator<Context> {
public:
......
// ------------------------------------------------------------
// Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
//
// Licensed under the BSD 2-Clause License.
// You should have received a copy of the BSD 2-Clause License
// along with the software. If not, See,
//
// <https://opensource.org/licenses/BSD-2-Clause>
//
// ------------------------------------------------------------
#ifndef DRAGON_OPERATORS_LOSS_NLL_LOSS_OP_H_
#define DRAGON_OPERATORS_LOSS_NLL_LOSS_OP_H_
#include "core/operator.h"
namespace dragon {
template <class Context>
class NLLLossOp : public Operator<Context> {
public:
NLLLossOp(
const OperatorDef& def,
Workspace* ws)
: Operator<Context>(def, ws),
axis(OperatorBase::Arg<int>("axis", 1)),
normalization(OperatorBase::Arg<string>(
"normalization", "VALID")) {
auto xs = OperatorBase::Args<int>("ignore_labels");
if (xs.size()) {
ignores.Reshape({ (TIndex)xs.size() });
auto* Idata = ignores.mutable_data<int, CPUContext>();
for (int i = 0; i < xs.size(); i++) Idata[i] = xs[i];
}
}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename Tx, typename Ty> void RunWithType();
protected:
TIndex axis, outer_dim, inner_dim;
Tensor losses, flags, ignores;
string normalization;
};
template <class Context>
class NLLLossGradientOp : public Operator<Context> {
public:
NLLLossGradientOp(
const OperatorDef& def,
Workspace* ws)
: Operator<Context>(def, ws),
axis(OperatorBase::Arg<int>("axis", 1)),
normalization(OperatorBase::Arg<string>(
"normalization", "VALID")) {
auto xs = OperatorBase::Args<int>("ignore_labels");
if (xs.size()) {
ignores.Reshape({ (TIndex)xs.size() });
auto* Idata = ignores.mutable_data<int, CPUContext>();
for (int i = 0; i < xs.size(); i++) Idata[i] = xs[i];
}
}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename Tx, typename Ty> void RunWithType();
protected:
TIndex axis, outer_dim, inner_dim;
Tensor ignores, flags;
string normalization;
};
} // namespace dragon
#endif // DRAGON_OPERATORS_LOSS_NLL_LOSS_OP_H_
\ No newline at end of file
......@@ -22,7 +22,8 @@ class InitializeOp : public Operator<Context> {
public:
InitializeOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
shape_desc(OperatorBase::Arg<string>("shape", "")) {
shape_desc(OperatorBase::Arg<string>("shape", "")),
dtype(OperatorBase::Arg<string>("dtype", "float32")) {
GET_ARGUMENTS_WITH_DESC(int, dims);
}
USE_OPERATOR_FUNCTIONS;
......@@ -32,19 +33,29 @@ class InitializeOp : public Operator<Context> {
protected:
DECLARE_ARGUMENTS_WITH_DESC(int, dims);
string shape_desc;
string shape_desc, dtype;
TensorFiller filler;
};
template <class Context>
class FillOp final : public InitializeOp<Context> {
class FillOp final : public Operator<Context> {
public:
FillOp(const OperatorDef& def, Workspace* ws)
: InitializeOp<Context>(def, ws) {
this->filler.set_type("constant");
this->filler.set_value(OperatorBase::Arg<float>("value", 0.0));
: Operator<Context>(def, ws),
shape_desc(OperatorBase::Arg<string>("shape", "")),
dtype(OperatorBase::Arg<string>("dtype", "float32")),
value(OperatorBase::Arg<float>("value", 0.0)) {
GET_ARGUMENTS_WITH_DESC(int, dims);
}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T> void RunWithType();
protected:
DECLARE_ARGUMENTS_WITH_DESC(int, dims);
string shape_desc, dtype;
float value;
};
template <class Context>
......@@ -130,6 +141,7 @@ public:
};
DEFINE_ARGUMENTS_WITH_DESC(int, InitializeOp, dims);
DEFINE_ARGUMENTS_WITH_DESC(int, FillOp, dims);
} // namespace
......
......@@ -25,7 +25,7 @@ class BatchNormOp final : public Operator<Context> {
: Operator<Context>(def, ws),
axis(OperatorBase::Arg<int>("axis", -1)),
momentum(OperatorBase::Arg<float>("momentum", 0.9f)),
eps(OperatorBase::Arg<float>("eps", 1e-3f)),
eps(OperatorBase::Arg<float>("eps", 1e-5f)),
use_stats(OperatorBase::Arg<int>("use_stats", -1)),
mode(OperatorBase::Arg<string>("mode", "DEFAULT")) {
if (axis != -1)
......@@ -81,7 +81,7 @@ class FusedBatchNormOp : public Operator<Context> {
: Operator<Context>(def, ws),
axis(OperatorBase::Arg<int>("axis", -1)),
momentum(OperatorBase::Arg<float>("momentum", 0.9f)),
eps(OperatorBase::Arg<float>("eps", 1e-3f)),
eps(OperatorBase::Arg<float>("eps", 1e-5f)),
use_stats(OperatorBase::Arg<int>("use_stats", -1)) {}
USE_OPERATOR_FUNCTIONS;
......@@ -105,7 +105,7 @@ class FusedBatchNormGradientOp : public Operator<Context> {
FusedBatchNormGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
axis(OperatorBase::Arg<int>("axis", -1)),
eps(OperatorBase::Arg<float>("eps", 1e-3f)),
eps(OperatorBase::Arg<float>("eps", 1e-5f)),
use_stats(OperatorBase::Arg<int>("use_stats", -1)) {}
USE_OPERATOR_FUNCTIONS;
......@@ -127,14 +127,13 @@ class FusedBatchNormGradientOp : public Operator<Context> {
#if CUDNN_VERSION_MIN(5, 0, 0)
#include "utils/cudnn_device.h"
template <class Context>
class CuDNNBatchNormOp final : public FusedBatchNormOp<Context> {
class CuDNNBatchNormOp final
: public FusedBatchNormOp<Context> {
public:
CuDNNBatchNormOp(const OperatorDef& def, Workspace* ws)
: FusedBatchNormOp<Context>(def, ws),
eps64(OperatorBase::Arg<float>("eps", 1e-3f)) {
eps64(OperatorBase::Arg<float>("eps", 1e-5f)) {
CUDNN_CHECK(cudnnCreateTensorDescriptor(&input_desc));
CUDNN_CHECK(cudnnCreateTensorDescriptor(&output_desc));
CUDNN_CHECK(cudnnCreateTensorDescriptor(&bn_desc));
......@@ -167,11 +166,12 @@ class CuDNNBatchNormOp final : public FusedBatchNormOp<Context> {
};
template <class Context>
class CuDNNBatchNormGradientOp final : public FusedBatchNormGradientOp<Context> {
class CuDNNBatchNormGradientOp final
: public FusedBatchNormGradientOp<Context> {
public:
CuDNNBatchNormGradientOp(const OperatorDef& def, Workspace* ws)
: FusedBatchNormGradientOp<Context>(def, ws),
eps64(OperatorBase::Arg<float>("eps", 1e-3f)) {
eps64(OperatorBase::Arg<float>("eps", 1e-5f)) {
CUDNN_CHECK(cudnnCreateTensorDescriptor(&input_desc));
CUDNN_CHECK(cudnnCreateTensorDescriptor(&output_desc));
CUDNN_CHECK(cudnnCreateTensorDescriptor(&bn_desc));
......
......@@ -23,7 +23,7 @@ class BatchRenormOp final : public Operator<Context> {
: Operator<Context>(def, ws),
axis(OperatorBase::Arg<int>("axis", -1)),
momentum(OperatorBase::Arg<float>("momentum", 0.9f)),
eps(OperatorBase::Arg<float>("eps", 1e-3f)),
eps(OperatorBase::Arg<float>("eps", 1e-5f)),
r_max(OperatorBase::Arg<float>("r_max", 3.f)),
d_max(OperatorBase::Arg<float>("d_max", 5.f)),
t_delta(OperatorBase::Arg<float>("t_delta", 1.f)),
......
......@@ -23,7 +23,7 @@ class GroupNormOp final : public Operator<Context> {
: Operator<Context>(def, ws),
group(OperatorBase::Arg<int>("group", 32)),
axis(OperatorBase::Arg<int>("axis", -1)),
eps(OperatorBase::Arg<float>("eps", 1e-3f)) {
eps(OperatorBase::Arg<float>("eps", 1e-5f)) {
if (axis != -1)
CHECK_EQ(axis, 1)
<< "\nThe axis can only be set to 1.";
......@@ -73,7 +73,7 @@ class FusedGroupNormOp final : public Operator<Context> {
: Operator<Context>(def, ws),
group(OperatorBase::Arg<int>("group", 32)),
axis(OperatorBase::Arg<int>("axis", -1)),
eps(OperatorBase::Arg<float>("eps", 1e-3f)) {}
eps(OperatorBase::Arg<float>("eps", 1e-5f)) {}
USE_OPERATOR_FUNCTIONS;
void Setup();
......
......@@ -22,9 +22,10 @@ class InstanceNormOp final : public Operator<Context> {
InstanceNormOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
axis(OperatorBase::Arg<int>("axis", -1)),
eps(OperatorBase::Arg<float>("eps", 1e-3f)) {
eps(OperatorBase::Arg<float>("eps", 1e-5f)) {
if (axis != -1)
CHECK_EQ(axis, 1) << "\nThe axis can only be set to 1.";
CHECK_EQ(axis, 1)
<< "\nThe axis can only be set to 1.";
}
USE_OPERATOR_FUNCTIONS;
......@@ -47,7 +48,8 @@ class InstanceNormGradientOp final : public Operator<Context> {
: Operator<Context>(def, ws),
axis(OperatorBase::Arg<int>("axis", -1)) {
if (axis != -1)
CHECK_EQ(axis, 1) << "\nThe axis can only be set to 1.";
CHECK_EQ(axis, 1)
<< "\nThe axis can only be set to 1.";
}
USE_OPERATOR_FUNCTIONS;
......
......@@ -23,7 +23,7 @@ class L2NormOp final : public Operator<Context> {
: Operator<Context>(def, ws),
axis(OperatorBase::Arg<int>("axis", 0)),
num_axes(OperatorBase::Arg<int>("num_axes", -1)),
eps(OperatorBase::Arg<float>("eps", 1e-3f)),
eps(OperatorBase::Arg<float>("eps", 1e-5f)),
mode(OperatorBase::Arg<string>("mode", "SUM")) {}
USE_OPERATOR_FUNCTIONS;
......
......@@ -20,8 +20,6 @@ namespace dragon {
#if CUDNN_VERSION_MIN(5, 0, 0)
#include "utils/cudnn_device.h"
class cudnnTensorDescriptors {
public:
cudnnTensorDescriptors(const int num_descs) {
......
......@@ -21,7 +21,8 @@ class BiasAddOp final : public Operator<Context> {
public:
BiasAddOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
data_format(OperatorBase::Arg<string>("data_format", "NCHW")) {}
data_format(OperatorBase::Arg<string>(
"data_format", "NCHW")) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......@@ -37,7 +38,8 @@ class BiasAddGradientOp final : public Operator<Context> {
public:
BiasAddGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
data_format(OperatorBase::Arg<string>("data_format", "NCHW")) {}
data_format(OperatorBase::Arg<string>(
"data_format", "NCHW")) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......@@ -48,6 +50,62 @@ class BiasAddGradientOp final : public Operator<Context> {
string data_format;
};
#ifdef WITH_CUDNN
template <class Context>
class CuDNNBiasAddOp final : public Operator<Context> {
public:
CuDNNBiasAddOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
data_format(OperatorBase::Arg<string>(
"data_format", "NCHW")) {
CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc));
CUDNN_CHECK(cudnnCreateTensorDescriptor(&output_desc));
}
USE_OPERATOR_FUNCTIONS;
~CuDNNBiasAddOp() {
CUDNN_CHECK(cudnnDestroyTensorDescriptor(bias_desc));
CUDNN_CHECK(cudnnDestroyTensorDescriptor(output_desc));
}
void RunOnDevice() override;
template <typename T> void RunWithType();
protected:
TIndex outer_dim, dim, inner_dim;
string data_format;
cudnnTensorDescriptor_t bias_desc, output_desc;
};
template <class Context>
class CuDNNBiasAddGradientOp final : public Operator<Context> {
public:
CuDNNBiasAddGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
data_format(OperatorBase::Arg<string>(
"data_format", "NCHW")) {
CUDNN_CHECK(cudnnCreateTensorDescriptor(&input_desc));
CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc));
}
USE_OPERATOR_FUNCTIONS;
~CuDNNBiasAddGradientOp() {
CUDNN_CHECK(cudnnDestroyTensorDescriptor(input_desc));
CUDNN_CHECK(cudnnDestroyTensorDescriptor(bias_desc));
}
void RunOnDevice() override;
template <typename T> void RunWithType();
protected:
TIndex outer_dim, dim, inner_dim;
string data_format;
cudnnTensorDescriptor_t input_desc, bias_desc;
};
#endif // WITH_CUDNN
} // namespace dragon
#endif // DRAGON_OPERATORS_VISION_BIAS_ADD_OP_H_
\ No newline at end of file
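A plausible forward pass for CuDNNBiasAddOp, sketched against the public
cuDNN API (cudnnAddTensor accumulates a broadcastable bias into the output
with the given scaling factors); the shipped implementation lives in
src/operators/vision/bias_add_op.cc and may differ in detail.

    template <class Context> template <typename T>
    void CuDNNBiasAddOp<Context>::RunWithType() {
        // Broadcast the bias as (1, C, 1, 1) against the output, which is
        // assumed to have been pre-filled with the input.
        cudnnSetTensor4dDesc<T>(&bias_desc, data_format,
            std::vector<int64_t>({ 1, dim, 1, 1 }));
        cudnnSetTensorDesc<T>(&output_desc, Output(0));
        CUDNN_CHECK(cudnnAddTensor(ctx()->cudnn_handle(),
            CUDNNType<T>::one, bias_desc,
                Input(1).template data<T, Context>(),
            CUDNNType<T>::one, output_desc,
                Output(0)->template mutable_data<T, Context>()));
    }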
......@@ -50,8 +50,6 @@ class Conv2dGradientOp : public Conv2dOp<Context> {
#ifdef WITH_CUDNN
#include "utils/cudnn_device.h"
template <class Context>
class CuDNNConv2dOp final : public Conv2dOp<Context> {
public:
......@@ -97,7 +95,7 @@ class CuDNNConv2dOp final : public Conv2dOp<Context> {
cudnnConvolutionDescriptor_t conv_desc;
cudnnFilterDescriptor_t filter_desc;
size_t fwd_data_size;
TIndex bias_offset, cudnn_group;
TIndex cudnn_group;
vector<TIndex> input_dims;
bool enable_tensor_core;
};
......@@ -148,7 +146,7 @@ class CuDNNConv2dGradientOp final : public Conv2dGradientOp<Context> {
cudnnConvolutionDescriptor_t conv_desc;
cudnnFilterDescriptor_t filter_desc;
size_t bwd_filter_size, bwd_data_size;
TIndex bias_offset, cudnn_group;
TIndex cudnn_group;
vector<TIndex> input_dims;
bool enable_tensor_core;
};
......
......@@ -84,6 +84,7 @@ class ConvOpBase : public Operator<Context> {
ctx());
} else LOG(FATAL) << "ConvNd has not been implemented yet";
}
template <typename T> void Col2Im(const T* col, T* im) {
if (Input(0).ndim() == 4) {
kernel::Col2Im2d<T, Context>(conv_in_channels,
......
......@@ -54,8 +54,6 @@ class Conv2dTransposeGradientOp : public Conv2dTransposeOp<Context> {
#ifdef WITH_CUDNN
#include "utils/cudnn_device.h"
template <class Context>
class CuDNNConv2dTransposeOp final : public Conv2dTransposeOp<Context> {
public:
......@@ -100,7 +98,7 @@ class CuDNNConv2dTransposeOp final : public Conv2dTransposeOp<Context> {
cudnnConvolutionDescriptor_t conv_desc;
cudnnFilterDescriptor_t filter_desc;
size_t fwd_data_size;
TIndex bias_offset, cudnn_group;
TIndex cudnn_group;
vector<TIndex> input_dims;
bool enable_tensor_core;
};
......@@ -150,7 +148,7 @@ public:
cudnnConvolutionDescriptor_t conv_desc;
cudnnFilterDescriptor_t filter_desc;
size_t bwd_filter_size, bwd_data_size;
TIndex bias_offset, cudnn_group;
TIndex cudnn_group;
vector<TIndex> input_dims;
bool enable_tensor_core;
};
......
// ------------------------------------------------------------
// Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
//
// Licensed under the BSD 2-Clause License.
// You should have received a copy of the BSD 2-Clause License
// along with the software. If not, See,
//
// <https://opensource.org/licenses/BSD-2-Clause>
//
// ------------------------------------------------------------
#ifndef DRAGON_OPERATORS_VISION_DROP_BLOCK_OP_H_
#define DRAGON_OPERATORS_VISION_DROP_BLOCK_OP_H_
#include "core/operator.h"
#include "utils/math_functions.h"
namespace dragon {
template <class Context>
class DropBlock2dOp final : public Operator<Context> {
public:
DropBlock2dOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
block_size(OperatorBase::Arg<int>("block_size", 7)),
alpha(OperatorBase::Arg<float>("alpha", 1.f)),
decrement(OperatorBase::Arg<float>("decrement", 0.f)),
data_format(OperatorBase::Arg<string>("data_format", "NCHW")) {
GET_ARGUMENT_WITH_DESC(float, keep_prob, 0.9f);
SwitchToPhase(OperatorBase::Arg<string>("phase", ""));
}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T> void RunWithType();
protected:
DECLARE_ARGUMENT_WITH_DESC(float, keep_prob);
TIndex block_size, seed_h, seed_w;
TIndex n, c, h, w;
float alpha, decrement, apply_prob = 1., gamma;
string data_format;
vector<TIndex> seed_dims;
};
template <class Context>
class DropBlock2dGradientOp final : public Operator<Context> {
public:
DropBlock2dGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws) {
SwitchToPhase(OperatorBase::Arg<string>("phase", ""));
}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T> void RunWithType();
};
DEFINE_ARGUMENT_WITH_DESC(float, DropBlock2dOp, keep_prob);
} // namespace dragon
#endif // DRAGON_OPERATORS_VISION_DROP_BLOCK_OP_H_
\ No newline at end of file
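For reference, the seeding rate gamma used by DropBlock (Ghiasi et al.,
2018) scales the drop rate by the ratio of the feature area to the valid
seed area, divided by the block area; a sketch of that computation (the
operator's actual RunWithType may organize it differently):

    inline float ComputeGamma(
        float keep_prob, int block_size, int feat_h, int feat_w) {
        const float block_area = float(block_size * block_size);
        const float feat_area = float(feat_h * feat_w);
        const float seed_area = float((feat_h - block_size + 1) *
                                      (feat_w - block_size + 1));
        // gamma = (1 - keep_prob) / block_size^2 * feat_area / seed_area
        return (1.f - keep_prob) / block_area * feat_area / seed_area;
    }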
......@@ -16,7 +16,10 @@
namespace dragon {
enum LRNMode { ACROSS_CHANNELS, WITHIN_CHANNEL };
typedef enum {
ACROSS_CHANNELS,
WITHIN_CHANNEL,
} LRNMode;
template <class Context>
class LRNOp : public Operator<Context> {
......@@ -82,8 +85,6 @@ class LRNGradientOp : public Operator<Context> {
#ifdef WITH_CUDNN
#include "utils/cudnn_device.h"
template <class Context>
class CuDNNLRNOp final : public LRNOp<Context> {
public:
......
......@@ -73,7 +73,7 @@ inline void LoadCaffeModel(
const string& layer_name = layer.name();
string prefix = layer_name + "/param:";
for (int j = 0; j < layer.blobs_size(); j++) {
string tensor_name = prefix + dragon_cast<string, int>(j);
string tensor_name = prefix + std::to_string(j);
if (!ws->HasTensor(tensor_name))
LOG(WARNING) << "Tensor(" << tensor_name << ") "
<< "does not exist in any Graphs, skip.";
......@@ -114,7 +114,7 @@ inline void SavaCaffeModel(
int layer_idx = -1;
for (int i = 0; i < tensors.size(); i++) {
if (tensors[i]->count() <= 0) continue;
vector<string> splits = SplitString(
vector<string> splits = str::split(
tensors[i]->name(), "/param:");
if (layer_hash.count(splits[0]) == 0) {
layer_hash[splits[0]] = ++layer_idx;
......
......@@ -28,6 +28,10 @@ template<> inline int dragon_cast<int, float>(float val) {
return static_cast<int>(val);
}
template<> inline int64_t dragon_cast<int64_t, float>(float val) {
return static_cast<int64_t>(val);
}
template<> inline float dragon_cast<float, float>(float val) {
return val;
}
......@@ -127,7 +131,7 @@ template<> inline float32 dragon_cast<float32, float>(float val) {
return dragon_cast<float32, float16>(t);
}
#ifdef WITH_CUDA_FP16
#ifdef WITH_CUDA
template<> inline half dragon_cast<half, float>(float val) {
#if CUDA_VERSION_MIN(9, 0, 0)
......@@ -165,7 +169,7 @@ template<> inline half2 dragon_cast<half2, float16>(float16 val) {
}
#endif // WITH_CUDA_FP16
#endif // WITH_CUDA
} // namespace dragon
......
......@@ -101,16 +101,10 @@ inline int CUDA_NUM_DEVICES() {
return count;
}
inline int CUDA_DEVICE() {
int gpu_id;
cudaGetDevice(&gpu_id);
return gpu_id;
}
inline int CUDA_DEVICE(const void* ptr) {
cudaPointerAttributes attr;
CUDA_CHECK(cudaPointerGetAttributes(&attr, ptr));
return attr.device;
inline int CUDA_GET_DEVICE() {
int device_id;
cudaGetDevice(&device_id);
return device_id;
}
struct CUDADeviceProps {
......@@ -132,7 +126,7 @@ inline const cudaDeviceProp& GetDeviceProperty(
}
inline bool CUDA_TRUE_FP16_AVAILABLE() {
int device = CUDA_DEVICE();
int device = CUDA_GET_DEVICE();
auto& prop = GetDeviceProperty(device);
return prop.major >= 6;
}
......@@ -141,7 +135,7 @@ inline bool TENSOR_CORE_AVAILABLE() {
#if CUDA_VERSION < 9000
return false;
#else
int device = CUDA_DEVICE();
int device = CUDA_GET_DEVICE();
auto& prop = GetDeviceProperty(device);
return prop.major >= 7;
#endif
......@@ -149,23 +143,16 @@ inline bool TENSOR_CORE_AVAILABLE() {
class DeviceGuard {
public:
DeviceGuard(int newDevice)
: previous_(CUDA_DEVICE()) {
if (previous_ != newDevice)
CUDA_CHECK(cudaSetDevice(newDevice));
DeviceGuard(int new_id) : prev_id(CUDA_GET_DEVICE()) {
if (prev_id != new_id) CUDA_CHECK(cudaSetDevice(new_id));
}
~DeviceGuard() {
CUDA_CHECK(cudaSetDevice(previous_));
}
~DeviceGuard() { CUDA_CHECK(cudaSetDevice(prev_id)); }
private:
int previous_;
int prev_id;
};
#define CUDA_FP16_NOT_COMPILED \
LOG(FATAL) << "CUDA-FP16 was not compiled."
#else
#define CUDA_NOT_COMPILED \
......
......@@ -55,7 +55,6 @@ template<> class CUDNNType<double> {
typedef double BNParamType;
};
#ifdef WITH_CUDA_FP16
template<> class CUDNNType<float16> {
public:
static const cudnnDataType_t type = CUDNN_DATA_HALF;
......@@ -63,37 +62,63 @@ template<> class CUDNNType<float16> {
static const void *one, *zero;
typedef float BNParamType;
};
#endif
template <typename T>
void cudnnSetTensorDesc(cudnnTensorDescriptor_t* desc, Tensor* tensor);
void cudnnSetTensorDesc(
cudnnTensorDescriptor_t* desc,
Tensor* tensor);
template <typename T>
void cudnnSetTensor4dDesc(cudnnTensorDescriptor_t* desc, const string& data_format, Tensor* tensor);
void cudnnSetTensor4dDesc(
cudnnTensorDescriptor_t* desc,
const string& data_format,
Tensor* tensor);
template <typename T>
void cudnnSetTensor5dDesc(cudnnTensorDescriptor_t* desc, const string& data_format, Tensor* tensor);
void cudnnSetTensor5dDesc(
cudnnTensorDescriptor_t* desc,
const string& data_format,
Tensor* tensor);
template <typename T>
void cudnnSetTensor3dDesc(cudnnTensorDescriptor_t* desc, const string& data_format, Tensor* tensor);
void cudnnSetTensor3dDesc(
cudnnTensorDescriptor_t* desc,
const string& data_format,
Tensor* tensor);
template <typename T>
void cudnnSetTensorDesc(cudnnTensorDescriptor_t* desc, const std::vector<int64_t>& dims);
void cudnnSetTensorDesc(
cudnnTensorDescriptor_t* desc,
const std::vector<int64_t>& dims);
template <typename T>
void cudnnSetTensor4dDesc(cudnnTensorDescriptor_t* desc, const string& data_format, const std::vector<int64_t>& dims);
void cudnnSetTensor4dDesc(
cudnnTensorDescriptor_t* desc,
const string& data_format,
const std::vector<int64_t>& dims);
template <typename T>
void cudnnSetTensor4dDescWithGroup(cudnnTensorDescriptor_t* desc, const string& data_format, const std::vector<int64_t>& dims, const int64_t group);
void cudnnSetTensor4dDescWithGroup(
cudnnTensorDescriptor_t* desc,
const string& data_format,
const std::vector<int64_t>& dims,
const int64_t group);
template <typename T>
void cudnnSetTensor5dDesc(cudnnTensorDescriptor_t* desc, const string& data_format, const std::vector<int64_t>& dims);
void cudnnSetTensor5dDesc(
cudnnTensorDescriptor_t* desc,
const string& data_format,
const std::vector<int64_t>& dims);
template <typename T>
void cudnnSetTensor3dDesc(cudnnTensorDescriptor_t* desc, const string& data_format, const std::vector<int64_t>& dims);
void cudnnSetTensor3dDesc(
cudnnTensorDescriptor_t* desc,
const string& data_format,
const std::vector<int64_t>& dims);
template <typename T>
void cudnnSetTensorDesc(cudnnTensorDescriptor_t* desc,
void cudnnSetTensorDesc(
cudnnTensorDescriptor_t* desc,
const std::vector<int64_t>& dims,
const std::vector<int64_t>& strides);
......
......@@ -69,7 +69,7 @@ template <typename T, class Context>
void RandomBernoulli(
const int n,
const float p,
uint32_t* x,
T* x,
Context* ctx);
/******************** Level-1 ********************/
......
......@@ -25,21 +25,21 @@ typedef int64_t TIndex;
template <typename T, class Context>
void Dropout(
const int count,
T prob,
T scale,
float prob,
float scale,
const T* x,
uint32_t* mask,
uint32_t* mask32,
uint8_t* mask8,
T* y,
Context* ctx);
template <typename T, class Context>
void DropoutGrad(
template <typename Tx, typename Tm, class Context>
void ApplyMask(
const int count,
T prob,
T scale,
const T* dy,
const uint32_t* mask,
T* dx,
const float scale,
const Tx* x,
const Tm* mask,
Tx* y,
Context* ctx);
/******************** activation.elu ********************/
......@@ -234,10 +234,95 @@ void Clip(
const float low,
const float high,
const T* x,
T* mask,
T* y,
Context* ctx);
template <typename T, class Context>
void ClipGrad(
const int count,
const float low,
const float high,
const T* x,
const T* dy,
T* dx,
Context* ctx);
/******************** arithmetic.maximum ********************/
template <typename T, class Context>
void MaximumE(
const int count,
const T* x1,
const T* x2,
T* y,
Context* ctx);
template <typename T, class Context>
void MaximumB(
const int count,
const T* x1,
const T x2,
T* y,
Context* ctx);
template <typename T, class Context>
void MaximumEGrad(
const int count,
const T* x1,
const T* x2,
const T* dy,
T* dx1,
T* dx2,
Context* ctx);
template <typename T, class Context>
void MaximumBGrad(
const int count,
const T* x1,
const T x2,
const T* dy,
T* dx1,
/* T* dx2, */
Context* ctx);
/******************** arithmetic.minimum ********************/
template <typename T, class Context>
void MinimumE(
const int count,
const T* x1,
const T* x2,
T* y,
Context* ctx);
template <typename T, class Context>
void MinimumB(
const int count,
const T* x1,
const T x2,
T* y,
Context* ctx);
template <typename T, class Context>
void MinimumEGrad(
const int count,
const T* x1,
const T* x2,
const T* dy,
T* dx1,
T* dx2,
Context* ctx);
template <typename T, class Context>
void MinimumBGrad(
const int count,
const T* x1,
const T x2,
const T* dy,
T* dx1,
/* T* dx2, */
Context* ctx);
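A CPU reference for the eltwise variants declared above (the registered
CPUContext/CUDAContext specializations live in src/utils/); the gradient
routes dy to whichever input won the forward comparison, and Minimum
mirrors this with the comparison flipped.

    template <typename T>
    void MaximumE_Ref(const int count, const T* x1, const T* x2, T* y) {
        for (int i = 0; i < count; ++i)
            y[i] = x1[i] > x2[i] ? x1[i] : x2[i];
    }

    template <typename T>
    void MaximumEGrad_Ref(const int count, const T* x1, const T* x2,
                          const T* dy, T* dx1, T* dx2) {
        for (int i = 0; i < count; ++i) {
            const bool take1 = x1[i] > x2[i];
            dx1[i] = take1 ? dy[i] : T(0);
            dx2[i] = take1 ? T(0) : dy[i];
        }
    }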
/******************** control_flow.compare ********************/
template <typename T, class Context>
......@@ -257,6 +342,34 @@ void AbsGrad(
T* dx,
Context* ctx);
/******************** loss.nll_loss ********************/
template <typename Tx, typename Ty, class Context>
void NLLLoss(
const int outer_dim,
const int axis_dim,
const int inner_dim,
const Tx* log_prob,
const Ty* labels,
const int* ignores,
const int num_ignores,
float* losses,
float* flags,
Context* ctx);
template <typename Tx, typename Ty, class Context>
void NLLLossGrad(
const int outer_dim,
const int axis_dim,
const int inner_dim,
const Tx* prob,
const Ty* labels,
const int* ignores,
const int num_ignores,
Tx* dx,
float* flags,
Context* ctx);
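A CPU reference for the NLLLoss semantics declared above, assuming labels
index the axis dimension of a log-probability tensor shaped
(outer_dim, axis_dim, inner_dim); ignored labels contribute neither loss
nor normalization (flags = 0). Illustrative only.

    template <typename Tx, typename Ty>
    void NLLLoss_Ref(const int outer_dim, const int axis_dim,
                     const int inner_dim, const Tx* log_prob,
                     const Ty* labels, const int* ignores,
                     const int num_ignores, float* losses, float* flags) {
        for (int o = 0; o < outer_dim; ++o)
        for (int i = 0; i < inner_dim; ++i) {
            const int idx = o * inner_dim + i;
            const int label = (int)labels[idx];
            bool ignored = false;
            for (int k = 0; k < num_ignores; ++k)
                if (label == ignores[k]) { ignored = true; break; }
            if (ignored) { losses[idx] = flags[idx] = 0.f; continue; }
            losses[idx] = -(float)log_prob[
                (o * axis_dim + label) * inner_dim + i];
            flags[idx] = 1.f;
        }
    }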
/******************** loss.sigmoid_cross_entropy ********************/
template <typename T, class Context>
......@@ -902,6 +1015,23 @@ void Col2Im2d(
T* im,
Context* ctx);
/******************** vision.drop_block ********************/
template <class Context>
void DropBlock2d(
const int N,
const int C,
const int H,
const int W,
const int seed_h,
const int seed_w,
const int block_size,
const float gamma,
const string& data_format,
uint32_t* seed,
int* mask,
Context* ctx);
/******************** vision.nn_resize ********************/
template <typename T, class Context>
......
......@@ -111,7 +111,7 @@ void Axpby(
const T beta,
T* y);
} // namespace ssd
} // namespace sse
} // namespace dragon
......
......@@ -18,11 +18,11 @@
#include <iostream>
#include <cstdlib>
#include "utils/cast.h"
namespace dragon {
inline std::vector<std::string> SplitString(
namespace str {
inline std::vector<std::string> split(
const std::string& str,
const std::string& c) {
std::vector<std::string> ret;
......@@ -36,17 +36,7 @@ inline std::vector<std::string> SplitString(
return ret;
}
#define DEFINE_NUMBER2STRING(T) \
template<> inline std::string dragon_cast<std::string, T>(T val) { \
std::stringstream ss; ss << val; return ss.str(); \
}
DEFINE_NUMBER2STRING(int);
DEFINE_NUMBER2STRING(unsigned long long);
template<> inline int dragon_cast<int, std::string>(std::string val) {
return atoi(val.c_str());
}
} // namespace str
} // namespace dragon
......
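The helper now lives in a nested namespace; for example, splitting a
parameter name the way the caffemodel loader above does:

    void SplitExample() {
        // Splits into { "conv1", "0" }, as the loader's splits[0] expects.
        auto parts = dragon::str::split("conv1/param:0", "/param:");
    }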
......@@ -2,6 +2,7 @@ message(STATUS "Found CXX Module: ${CMAKE_CURRENT_LIST_DIR}")
FILE(GLOB_RECURSE MODULE_FILES *.h *.hpp *.c *.cpp *.cu *.cc)
FILE(GLOB_RECURSE SRC_FILES ../../src/*.c ../../src/*.cpp ../../src/*.cu ../../src/*.cc)
LIST(REMOVE_ITEM SRC_FILES ${CMAKE_CURRENT_SOURCE_DIR}/../../src/operators/misc/python_op.cc)
# ---[ Target
if (WITH_CUDA)
......@@ -36,7 +37,9 @@ if(WIN32)
TARGET_LINK_LIBRARIES(${PROJECT_NAME}_cxx shlwapi.lib)
endif()
SET_TARGET_PROPERTIES(${PROJECT_NAME}_cxx PROPERTIES OUTPUT_NAME dragon_cxx)
SET_TARGET_PROPERTIES(${PROJECT_NAME}_cxx PROPERTIES OUTPUT_NAME dragon)
SET_TARGET_PROPERTIES(${PROJECT_NAME}_cxx PROPERTIES DEFINE_SYMBOL DRAGON_CXX_EXPORTS)
# ---[ Install
INSTALL(TARGETS ${PROJECT_NAME}_cxx DESTINATION ${PROJECT_BINARY_DIR}/../lib)
\ No newline at end of file
INSTALL(TARGETS ${PROJECT_NAME}_cxx DESTINATION ${PROJECT_BINARY_DIR}/../api/lib)
FILE(INSTALL dragon.h DESTINATION ${PROJECT_BINARY_DIR}/../api/include)
\ No newline at end of file
#include "dragon.h"
#include "core/common.h"
#include "utils/logging.h"
namespace dragon {
......
......@@ -12,6 +12,12 @@
namespace dragon {
/* * * * * * * * * * * * * * * * * * * * *
* *
* Workspace *
* *
* * * * * * * * * * * * * * * * * * * * */
Map<string, unique_ptr < Workspace > > g_workspaces;
Map<string, vector<string> > sub_workspaces;
std::mutex g_mutex;
......@@ -29,7 +35,8 @@ Workspace* CreateWorkspace(const std::string& name){
Workspace* ResetWorkspace(const std::string& name) {
std::unique_lock<std::mutex> lock(g_mutex);
CHECK(g_workspaces.count(name))
<< "\nWorkspace(" << name << ") does not exist, can not be reset.";
<< "\nWorkspace(" << name << ") does not exist."
<< "\nCan not be reset.";
LOG(INFO) << "Reset the Workspace(" << name << ").";
g_workspaces[name].reset(new Workspace(name));
for (auto& sub_workspace : sub_workspaces[name]) {
......@@ -43,7 +50,8 @@ Workspace* ResetWorkspace(const std::string& name) {
void ReleaseWorkspace(const std::string& name) {
std::unique_lock<std::mutex> lock(g_mutex);
CHECK(g_workspaces.count(name))
<< "\nWorkspace(" << name << ") does not exist, can not be released.";
<< "\nWorkspace(" << name << ") does not exist."
<< "\nCan not be released.";
LOG(INFO) << "Release the Workspace(" << name << ").";
g_workspaces[name].reset();
g_workspaces.erase(name);
......@@ -61,6 +69,12 @@ void MoveWorkspace(
<< "into the Workspace(" << target_ws->name() << ").";
}
/* * * * * * * * * * * * * * * * * * * * *
* *
* Graph *
* *
* * * * * * * * * * * * * * * * * * * * */
std::string CreateGraph(
const std::string& graph_file,
Workspace* ws) {
......@@ -102,6 +116,19 @@ std::string CreateGraph(
return meta_graph.name();
}
void RunGraph(
const std::string& graph_name,
Workspace* ws,
const int stream_id) {
ws->RunGraph(graph_name, "", "", stream_id);
}
/* * * * * * * * * * * * * * * * * * * * *
* *
* Tensor *
* *
* * * * * * * * * * * * * * * * * * * * */
void CreateTensor(
const std::string& name,
Workspace* ws) {
......@@ -109,6 +136,32 @@ void CreateTensor(
}
template <typename T>
T* FetchTensor(
const std::string& name,
vector<TIndex>& shape,
Workspace* ws){
if (!ws->HasTensor(name)){
LOG(FATAL) << "Tensor(" << name << ")"
<< " doesn't exist, try create it before.";
}
Tensor* tensor = ws->GetTensor(name);
if (tensor->meta().id() == 0){
LOG(FATAL) << "Tensor(" << name << ")"
<< " has not been computed yet";
}
shape = tensor->dims();
void* data = malloc(tensor->nbytes());
if (tensor->memory_state() == MixedMemory::STATE_AT_CUDA) {
CUDAContext::Memcpy<CPUContext, CUDAContext>(
tensor->nbytes(), data, tensor->raw_data<CUDAContext>());
} else {
CPUContext::Memcpy<CPUContext, CPUContext>(
tensor->nbytes(), data, tensor->raw_data<CPUContext>());
}
return static_cast<T*>(data);
}
template <typename T>
void FeedTensor(
const std::string& name,
const vector<TIndex>& shape,
......@@ -135,6 +188,12 @@ void FeedTensor(
}
}
/* * * * * * * * * * * * * * * * * * * * *
* *
* I / O *
* *
* * * * * * * * * * * * * * * * * * * * */
void TransplantCaffeModel(
const std::string& input_model,
const std::string& output_model) {
......@@ -146,7 +205,7 @@ void TransplantCaffeModel(
const string& layer_name = layer.name();
string prefix = layer_name + "/param:";
for (int j = 0; j < layer.blobs_size(); j++) {
string tensor_name = prefix + dragon_cast<string, int>(j);
string tensor_name = prefix + std::to_string(j);
BlobProto blob = layer.blobs(j);
TensorProto* proto = protos.add_protos();
proto->set_data_type(TensorProto_DataType_FLOAT);
......@@ -218,7 +277,7 @@ void LoadCaffemodel(
const string& layer_name = layer.name();
string prefix = scope + layer_name + "/param:";
for (int j = 0; j < layer.blobs_size(); j++){
string tensor_name = prefix + dragon_cast<string, int>(j);
string tensor_name = prefix + std::to_string(j);
if (!ws->HasTensor(tensor_name))
ws->CreateTensor(tensor_name);
BlobProto blob = layer.blobs(j);
......@@ -248,63 +307,54 @@ void LoadCaffemodel(
}
}
/* * * * * * * * * * * * * * * * * * * * *
* *
* Config *
* *
* * * * * * * * * * * * * * * * * * * * */
void SetLogLevel(const std::string& level) {
SetLogDestination(StrToLogSeverity(level));
}
/* * * * * * * * * * * * * * * * * * * * *
* *
* Template *
* *
* * * * * * * * * * * * * * * * * * * * */
template DRAGON_API float* FetchTensor<float>(
const std::string&,
std::vector<TIndex>&,
Workspace*);
template DRAGON_API float16* FetchTensor<float16>(
const std::string&,
std::vector<TIndex>&,
Workspace*);
template DRAGON_API void FeedTensor<float>(
const std::string&,
const std::vector<TIndex>&,
const float*,
const Device&,
Workspace*);
template DRAGON_API void FeedTensor<float16>(
const std::string&,
const std::vector<TIndex>&,
const float16*,
const Device&,
Workspace*);
template DRAGON_API void FeedTensor<int>(
const std::string&,
const std::vector<TIndex>&,
const int*,
const Device&,
Workspace*);
template DRAGON_API void FeedTensor<uint8_t>(
const std::string&,
const std::vector<TIndex>&,
const uint8_t*,
......
......@@ -16,10 +16,28 @@
#include <cstdint>
#include <vector>
#ifdef _MSC_VER
#ifdef DRAGON_CXX_EXPORTS
#define DRAGON_API __declspec(dllexport)
#else
#define DRAGON_API __declspec(dllimport)
#endif
#else
#define DRAGON_API
#endif
/* * * * * * * * * * * * * * * * * * * * *
* *
* Internal Headers *
* *
* * * * * * * * * * * * * * * * * * * * */
#ifdef DRAGON_CXX_EXPORTS
#include "core/types.h"
#else
namespace dragon {
struct float16;
}
#endif
namespace dragon {
......@@ -28,72 +46,102 @@ typedef int64_t TIndex;
class Workspace;
class DRAGON_API Device {
public:
Device();
explicit Device(std::string device_type);
Device(std::string device_type, int device_id);
const int& device_type() const { return device_type_; }
const int device_id() const { return device_id_; }
private:
int device_type_, device_id_;
};
/* * * * * * * * * * * * * * * * * * * * *
* *
* Workspace *
* *
* * * * * * * * * * * * * * * * * * * * */
DRAGON_API Workspace* CreateWorkspace(const std::string& name);
DRAGON_API Workspace* ResetWorkspace(const std::string& name);
DRAGON_API void ReleaseWorkspace(const std::string& name);
DRAGON_API void MoveWorkspace(Workspace* main, Workspace* sub);
/* * * * * * * * * * * * * * * * * * * * *
* *
* Graph *
* *
* * * * * * * * * * * * * * * * * * * * */
DRAGON_API std::string CreateGraph(
const std::string& graph_file,
Workspace* ws);
DRAGON_API std::string CreateGraph(
const std::string& graph_file,
const Device& device,
Workspace* ws);
DRAGON_API void RunGraph(
const std::string& graph_name,
Workspace* ws,
const int stream_id = 1);
/* * * * * * * * * * * * * * * * * * * * *
* *
* Tensor *
* *
* * * * * * * * * * * * * * * * * * * * */
DRAGON_API void CreateTensor(
const std::string& name,
Workspace* ws);
template <typename T>
DRAGON_API T* FetchTensor(
const std::string& name,
std::vector<TIndex>& shape,
Workspace* ws);
template <typename T>
DRAGON_API void FeedTensor(
const std::string& name,
const std::vector<TIndex>& shape,
const T* data,
const Device& device,
Workspace* ws);
/* * * * * * * * * * * * * * * * * * * * *
* *
* I / O *
* *
* * * * * * * * * * * * * * * * * * * * */
DRAGON_API void LoadCaffemodel(
const std::string& model_file,
Workspace* ws);
DRAGON_API void TransplantCaffeModel(
const std::string& input_model,
const std::string& output_model);
DRAGON_API void LoadDragonmodel(
const std::string& model_file,
Workspace* ws);
/* * * * * * * * * * * * * * * * * * * * *
* *
* Config *
* *
* * * * * * * * * * * * * * * * * * * * */
DRAGON_API void SetLogLevel(const std::string& level);
} // namespace dragon
......
......@@ -19,7 +19,8 @@ Workspace* ws() { return g_workspace; }
TypeId CTypeToFetcher(TypeId type) {
static Map<TypeId,TypeId> c_type_map {
{ TypeMeta::Id<int8>(), TypeMeta::Id<NumpyFetcher>() },
{ TypeMeta::Id<uint8>(), TypeMeta::Id<NumpyFetcher>() },
{ TypeMeta::Id<int>(), TypeMeta::Id<NumpyFetcher>() },
{ TypeMeta::Id<int64_t>(), TypeMeta::Id<NumpyFetcher>() },
{ TypeMeta::Id<float>(), TypeMeta::Id<NumpyFetcher>() },
......@@ -197,6 +198,11 @@ inline PyObject* FeedTensorCC(PyObject* self, PyObject* args) {
}
}
inline PyObject* OnModuleExitCC(PyObject* self, PyObject* args) {
g_workspaces.clear();
Py_RETURN_TRUE;
}
#define PYFUNC(name) {#name, name, METH_VARARGS, ""}
#define PYENDFUNC {nullptr, nullptr, 0, nullptr}
......@@ -255,6 +261,7 @@ PyMethodDef* GetAllMethods() {
PYFUNC(SnapshotCC),
/**** Config ****/
PYFUNC(SetLogLevelCC),
PYFUNC(OnModuleExitCC),
PYENDFUNC,
};
return g_python_methods;
......@@ -272,9 +279,11 @@ void common_init() {
}
#ifdef WITH_PYTHON3
static struct PyModuleDef libdragon = {
PyModuleDef_HEAD_INIT,
"libdragon", "", -1,
GetAllMethods()
};
PyMODINIT_FUNC PyInit_libdragon(void) {
PyObject* module = PyModule_Create(&libdragon);
......@@ -285,7 +294,8 @@ PyMODINIT_FUNC PyInit_libdragon(void) {
#else // WITH_PYTHON2
PyMODINIT_FUNC initlibdragon(void) {
PyObject* moudle = Py_InitModule("libdragon", GetAllMethods());
PyObject* module = Py_InitModule(
"libdragon", GetAllMethods());
if (module == nullptr) return;
common_init();
}
......
......@@ -31,7 +31,8 @@ class TensorFetcherBase {
class TensorFeederBase {
public:
virtual ~TensorFeederBase() {}
virtual PyObject* Feed(
const DeviceOption& option,
PyArrayObject* array,
Tensor* tensor) = 0;
};
......@@ -61,7 +62,7 @@ class NumpyFetcher : public TensorFetcherBase {
PyErr_SetString(PyExc_RuntimeError, s.c_str());
return nullptr;
}
// create an empty array with the same shape
PyObject* array = PyArray_SimpleNew(
tensor.ndim(), npy_dims.data(), npy_type);
// copy the tensor data to the numpy array
......@@ -88,7 +89,8 @@ class StringFetcher : public TensorFetcherBase {
class NumpyFeeder : public TensorFeederBase {
public:
PyObject* Feed(
const DeviceOption& option,
PyArrayObject* original_array,
Tensor* tensor) override {
PyArrayObject* array = PyArray_GETCONTIGUOUS(original_array);
......@@ -100,7 +102,6 @@ class NumpyFeeder : public TensorFeederBase {
if (meta.id() != tensor->meta().id() && tensor->meta().id() != 0)
LOG(WARNING) << "Feed Tensor(" << tensor->name() << ")"
<< " with different data type from original one.";
int ndim = PyArray_NDIM(array);
npy_intp* npy_dims = PyArray_DIMS(array);
vector<TIndex> dims;
......@@ -110,16 +111,16 @@ class NumpyFeeder : public TensorFeederBase {
#ifdef WITH_CUDA
CUDAContext context(option);
context.SwitchToDevice();
auto* data = tensor->raw_mutable_data<CUDAContext>(meta);
context.Memcpy<CUDAContext, CPUContext>(tensor->nbytes(),
data, static_cast<void*>(PyArray_DATA(array)));
#else
LOG(FATAL) << "CUDA was not compiled.";
#endif
} else {
auto* data = tensor->raw_mutable_data<CPUContext>(meta);
CPUContext::Memcpy<CPUContext, CPUContext>(tensor->nbytes(),
data, static_cast<void*>(PyArray_DATA(array)));
}
Py_XDECREF(array);
Py_RETURN_TRUE;
......
......@@ -25,4 +25,4 @@ inline PyObject* IsCUDADriverSufficientCC(PyObject* self, PyObject* args) {
#endif
}
#endif // DRAGON_PYTHON_PY_CUDA_H_
\ No newline at end of file
......@@ -94,7 +94,6 @@ PyObject* TensorFromShapeCC(PyObject* self, PyObject* args) {
if (meta.id() != tensor->meta().id() && tensor->meta().id() != 0)
LOG(WARNING) << "Set Tensor(" << tensor->name() << ")"
<< " with different data type from original one.";
int ndim = PyList_Size(shape);
CHECK_GT(ndim, 0)
<< "\nThe len of shape should be greater than 1. Got " << ndim << ".";
......@@ -112,9 +111,9 @@ PyObject* TensorFromShapeCC(PyObject* self, PyObject* args) {
if (dev_opt.device_type() == CUDA) {
CUDAContext ctx(dev_opt);
ctx.SwitchToDevice();
tensor->raw_mutable_data<CUDAContext>(meta);
} else {
tensor->raw_mutable_data<CPUContext>(meta);
}
Py_RETURN_TRUE;
}
......@@ -173,19 +172,19 @@ PyObject* TensorFromTensorCC(PyObject* self, PyObject* args) {
Tensor* srcT = ws()->GetTensor(src_name);
Tensor* dstT = ws()->CreateTensor(dst_name);
dstT->ReshapeLike(*srcT);
const TypeMeta& meta = srcT->meta();
if (dst_ctx.device_type() == DeviceType::CUDA) {
if (src_ctx.device_type() == DeviceType::CUDA) {
// CUDA <- CUDA
CUDAContext::Memcpy<CUDAContext, CUDAContext>(
srcT->nbytes(),
dstT->raw_mutable_data<CUDAContext>(meta),
srcT->raw_data<CUDAContext>());
} else {
// CUDA <- CPU
CUDAContext::Memcpy<CUDAContext, CUDAContext>(
srcT->nbytes(),
dstT->raw_mutable_data<CUDAContext>(meta),
srcT->raw_data<CPUContext>());
}
} else {
......@@ -193,13 +192,13 @@ PyObject* TensorFromTensorCC(PyObject* self, PyObject* args) {
// CPU <- CUDA
CUDAContext::Memcpy<CUDAContext, CUDAContext>(
srcT->nbytes(),
dstT->raw_mutable_data<CPUContext>(meta),
srcT->raw_data<CUDAContext>());
} else {
// CPU <- CPU
CUDAContext::Memcpy<CUDAContext, CUDAContext>(
srcT->nbytes(),
dstT->raw_mutable_data<CPUContext>(meta),
srcT->raw_data<CPUContext>());
}
}
......
......@@ -23,8 +23,8 @@ inline const int TypeMetaToNPY(const TypeMeta& meta) {
{ TypeMeta::Id<int64_t>(), NPY_INT64 },
{ TypeMeta::Id<double>(), NPY_FLOAT64 },
{ TypeMeta::Id<float16>(), NPY_FLOAT16 },
{ TypeMeta::Id<uint8>(), NPY_UINT8 },
{ TypeMeta::Id<int8>(), NPY_INT8 }
};
return m2npy_type_map.count(meta.id()) ? m2npy_type_map[meta.id()] : -1;
}
......@@ -36,11 +36,12 @@ inline const TypeMeta& TypeNPYToMeta(int npy_type) {
{ NPY_INT64, TypeMeta::Make<int64_t>() },
{ NPY_FLOAT64, TypeMeta::Make<double>() },
{ NPY_FLOAT16, TypeMeta::Make<float16>() },
{ NPY_UINT8, TypeMeta::Make<uint8>() },
{ NPY_INT8, TypeMeta::Make<int8>() },
};
static TypeMeta unknown_type;
return npy2m_type_map.count(npy_type) ?
npy2m_type_map[npy_type] : unknown_type;
}
#endif // DRAGON_PYTHON_PY_TYPES_H_
\ No newline at end of file
......@@ -26,11 +26,11 @@ option = {}
REGISTERED_OPERATORS = set(s for s in RegisteredOperatorsCC())
NO_GRADIENT_OPERATORS = set(s for s in NoGradientOperatorsCC())
# The current device, 'CPU', 'CUDA' or 'CNML'
option['device'] = 'CPU'
# The device id
option['device_id'] = 0
# Whether to use cuDNN if possible
option['use_cudnn'] = False
......@@ -44,6 +44,9 @@ option['debug_mode'] = False
# Whether to share grads
option['share_grads'] = True
# Optional graph type
option['graph_type'] = ''
# Whether to log the meta graphs
option['log_meta_graph'] = False
......@@ -84,7 +87,7 @@ def IsCUDADriverSufficient():
def EnableCUDA(gpu_id=0, use_cudnn=True):
"""Enable CUDA mode globally.
"""Enable NVIDIA's CUDA mode globally.
Parameters
----------
......@@ -100,9 +103,28 @@ def EnableCUDA(gpu_id=0, use_cudnn=True):
"""
global option
option['device'] = 'CUDA'
option['device_id'] = gpu_id
option['use_cudnn'] = use_cudnn
def EnableCNML(mlu_id=0):
"""Enable Cambricon's CNML mode globally.
Parameters
----------
mlu_id : int
The id of MLU to use.
Returns
-------
None
"""
global option
option['device'] = 'CNML'
option['device_id'] = mlu_id
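For reference, a minimal usage sketch of the two device switches above (both funnel into the shared ``option['device_id']``; a build with the matching backend is assumed):

    import dragon.config as config

    # Target NVIDIA GPU 0, preferring cuDNN kernels where available.
    config.EnableCUDA(gpu_id=0, use_cudnn=True)

    # Or target Cambricon MLU 0 instead (requires a CNML build).
    config.EnableCNML(mlu_id=0)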
# TODO(PhyscalX): please do not use @setter
# TODO(PhyscalX): it seems that it can't change the global value
......@@ -133,7 +155,6 @@ def GetRandomSeed():
The global random seed.
"""
return option['random_seed']
......@@ -151,7 +172,7 @@ def SetGPU(id):
"""
global option
option['device_id'] = id
def GetGPU():
......@@ -163,8 +184,7 @@ def GetGPU():
The global id of GPU.
"""
return option['device_id']
def SetDebugMode(enabled=True):
......@@ -186,6 +206,25 @@ def SetDebugMode(enabled=True):
option['debug_mode'] = enabled
def SetGraphType(graph_type=''):
"""Set the graph type.
If empty, the default DAG graph will be used.
Parameters
----------
graph_type : str
The graph type.
Returns
-------
None
"""
global option
option['graph_type'] = graph_type
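A quick sketch of the new option; an empty string keeps the default DAG graph, and the value is later copied into ``meta_graph.graph_type`` during graph optimization:

    import dragon.config as config

    config.SetGraphType('')  # '' selects the default DAG graph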
def LogMetaGraph(enabled=True):
"""Enable to log meta graph globally.
......
......@@ -737,7 +737,7 @@ class Tensor(object):
Parameters
----------
new_value : number, list or numpy.ndarray
The values to set.
Returns
......
......@@ -325,5 +325,7 @@ def GetTensorInfo(tensor, stream=1):
info['mem'].append('CPU'); info['device_id'] = 0
if 'CUDA' in info:
info['mem'].append('CUDA'); info['device_id'] = int(info['CUDA'])
if 'CNML' in info:
info['mem'].append('CNML'); info['device_id'] = int(info['CNML'])
info['init'] = len(info['mem']) > 0
return info
\ No newline at end of file
......@@ -439,7 +439,7 @@ def FetchTensor(tensor):
Returns
-------
ndarray
The values copied from the backend.
References
......@@ -457,7 +457,7 @@ def FeedTensor(tensor, array, force_cpu=False, dtype=None):
----------
tensor : Tensor or str
The tensor to feed.
ndarray : number, list or ndarray
The values to feed.
force_cpu : boolean
Whether force to feed to cpu context.
......@@ -488,25 +488,23 @@ def FeedTensor(tensor, array, force_cpu=False, dtype=None):
"""
name = tensor.name if hasattr(tensor, 'name') else str(tensor)
dev = None
if force_cpu is True:
dev = utils.MakeDeviceOption(0, 0)
else:
from dragon.core.scope import _DEVICE_SCOPE
if _DEVICE_SCOPE != '':
supports = {'/cpu': 0, '/gpu': 1, '/mlu': 2}
dev = pb.DeviceOption()
dev.device_type = supports[_DEVICE_SCOPE.split(':')[0]]
dev.device_id = int(_DEVICE_SCOPE.split(':')[1])
else:
from dragon.config import option
if option['device'] == 'CUDA':
dev = utils.MakeDeviceOption(1, option['device_id'])
else:
dev = utils.MakeDeviceOption(0, 0)
if not isinstance(array, np.ndarray):
auto_data_type = np.float32 if dtype is None else dtype
else:
auto_data_type = array.dtype if dtype is None else dtype
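Given the resolution order above (``force_cpu`` first, then the device scope, then the global option), a small usage sketch of the updated ``FeedTensor``:

    import numpy as np
    import dragon.core.workspace as ws

    # A python scalar defaults to float32 unless ``dtype`` overrides it.
    ws.FeedTensor('x', 2.5, force_cpu=True)

    # An ndarray keeps its own data type by default.
    ws.FeedTensor('y', np.ones((2, 3), dtype='int32'))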
......@@ -573,8 +571,8 @@ def RunGraph(graph_name, inputs=(), outputs=[], stage=None, return_outputs=True)
Returns
-------
None, ndarray or list of ndarray
The outputs, formatted as ndarray.
See Also
--------
......
......@@ -42,6 +42,7 @@ List Brief
`BilinearResize`_ Resize the image with Bi-linear method.
`BiasAdd`_ Add the bias across channels to a ``NCHW`` or ``NHWC`` input.
`DenseConcat`_ Memory-efficient concatenation for DenseNet. `[Huang et.al, 2017] <http://arxiv.org/abs/1608.06993>`_.
`DropBlock2d`_ Randomly drop the outputs according to the spatial blocks. `[Ghiasi et.al, 2018] <https://arxiv.org/abs/1810.12890>`_.
=================== ======================================================================
Recurrent
......@@ -76,6 +77,7 @@ Loss
============================= ======================================================================
List Brief
============================= ======================================================================
`NLLLoss`_ Negative log-likelihood loss with sparse labels.
`SparseSoftmaxCrossEntropy`_ SoftmaxCrossEntropy with sparse labels.
`SigmoidCrossEntropy`_ SigmoidCrossEntropy.
`SoftmaxCrossEntropy`_ SoftmaxCrossEntropy with dense(one-hot) labels.
......@@ -102,6 +104,8 @@ List Brief
`Exp`_ Calculate the exponential of input.
`Square`_ Calculate the square of input.
`Sqrt`_ Calculate the sqrt of input.
`Maximum`_ Return the max value of the two given inputs.
`Minimum`_ Return the min value of the two given inputs.
`Clip`_ Clip the input to be between lower and higher bounds.
`Matmul`_ Matrix Multiplication.
`InnerProduct`_ InnerProduct Function.
......@@ -215,6 +219,7 @@ List Brief
.. _BilinearResize: operators/vision.html#dragon.operators.vision.BilinearResize
.. _BiasAdd: operators/vision.html#dragon.operators.vision.BiasAdd
.. _DenseConcat: operators/vision.html#dragon.operators.vision.DenseConcat
.. _DropBlock2d: operators/vision.html#dragon.operators.vision.DropBlock2d
.. _RNN: operators/recurrent.html#dragon.operators.recurrent.RNN
.. _LSTM: operators/recurrent.html#dragon.operators.recurrent.LSTM
......@@ -231,6 +236,7 @@ List Brief
.. _Softmax: operators/activation.html#dragon.operators.activation.Softmax
.. _Dropout: operators/activation.html#dragon.operators.activation.Dropout
.. _NLLLoss: operators/loss.html#dragon.operators.loss.NLLLoss
.. _SparseSoftmaxCrossEntropy: operators/loss.html#dragon.operators.loss.SparseSoftmaxCrossEntropy
.. _SigmoidCrossEntropy: operators/loss.html#dragon.operators.loss.SigmoidCrossEntropy
.. _SoftmaxCrossEntropy: operators/loss.html#dragon.operators.loss.SoftmaxCrossEntropy
......@@ -246,6 +252,8 @@ List Brief
.. _Mul: operators/arithmetic.html#dragon.operators.arithmetic.Mul
.. _Div: operators/arithmetic.html#dragon.operators.arithmetic.Div
.. _Clip: operators/arithmetic.html#dragon.operators.arithmetic.Clip
.. _Maximum: operators/arithmetic.html#dragon.operators.arithmetic.Maximum
.. _Minimum: operators/arithmetic.html#dragon.operators.arithmetic.Minimum
.. _Pow: operators/arithmetic.html#dragon.operators.arithmetic.Pow
.. _Log: operators/arithmetic.html#dragon.operators.arithmetic.Log
.. _Exp: operators/arithmetic.html#dragon.operators.arithmetic.Exp
......
......@@ -32,6 +32,7 @@ List Brief
`LRNLayer`_ The implementation of ``LRNLayer``.
`NNResizeLayer`_ The implementation of ``NNResizeLayer``.
`BilinearResizeLayer`_ The implementation of ``BilinearResizeLayer``.
`DropBlockLayer`_ The implementation of ``DropBlockLayer``.
====================== =============================================================================
......@@ -160,6 +161,7 @@ API Reference
.. _LRNLayer: #dragon.vm.caffe.layers.vision.LRNLayer
.. _NNResizeLayer: #dragon.vm.caffe.layers.vision.NNResizeLayer
.. _BilinearResizeLayer: #dragon.vm.caffe.layers.vision.BilinearResizeLayer
.. _DropBlockLayer: #dragon.vm.caffe.layers.vision.DropBlockLayer
.. _ReLULayer: #dragon.vm.caffe.layers.neuron.ReLULayer
.. _PReLULayer: #dragon.vm.caffe.layers.neuron.PReLULayer
......
......@@ -15,6 +15,7 @@ from __future__ import print_function
import sys
import logging
import atexit
try:
from dragon.libdragon import *
......@@ -22,3 +23,5 @@ except ImportError as e:
logging.critical(
'Cannot import dragon. Error: {0}'.format(str(e)))
sys.exit(1)
atexit.register(OnModuleExitCC)
\ No newline at end of file
......@@ -101,7 +101,8 @@ class DataTransformer(Process):
im = im.reshape((datum.height, datum.width, datum.channels))
# random scale
random_scale = npr.uniform() * (
self._max_random_scale - self._min_random_scale) \
+ self._min_random_scale
if random_scale != 1.0:
if sys.version_info >= (3, 0):
......@@ -110,7 +111,9 @@ class DataTransformer(Process):
else:
# opencv-python2 has a long-standing bug that leads to
# duplicate CUDA handles created at gpu:0
new_shape = (
int(np.ceil(im.shape[1] * random_scale)),
int(np.ceil(im.shape[0] * random_scale)))
im = PIL.Image.fromarray(im)
im = im.resize(new_shape, PIL.Image.BILINEAR)
im = np.array(im)
......
......@@ -9,10 +9,12 @@
#
# ------------------------------------------------------------
import numpy as np
from dragon.core.tensor import Tensor
INT_MAX = 2147483647
def CheckInputs(inputs, *args):
def Verify(inputs, min_num, max_num):
# type checking
......@@ -44,6 +46,17 @@ def ParseArguments(locals):
return dict(__all__, **kwargs)
def WrapConstants(constants, dtype='float32'):
if not isinstance(constants, Tensor):
if not isinstance(constants, np.ndarray):
constants = np.array(constants, dtype=dtype)
tensor = Tensor()
tensor.set_value(constants)
tensor.shape = constants.shape
constants = tensor
return constants
def AddArgumentWithDesc(arguments, property, name, as_target=True):
if isinstance(property, Tensor):
if as_target:
......
......@@ -115,6 +115,70 @@ def Div(inputs, **kwargs):
return output
def Maximum(inputs, **kwargs):
"""Return the max value of given two inputs.
Parameters
----------
inputs : list
The input tensors, A and B.
Returns
-------
Tensor
The output tensor.
"""
inputs[0] = WrapConstants(inputs[0], dtype='float32')
inputs[1] = WrapConstants(inputs[1], dtype='float32')
CheckInputs(inputs, 2)
arguments = ParseArguments(locals())
output = Tensor.CreateOperator(nout=1, op_type='Maximum', **arguments)
if inputs[0].shape is not None and \
inputs[1].shape is not None:
output.shape = inputs[0].shape[:]
if output.shape != inputs[1].shape and \
len(output.shape) < len(inputs[1].shape):
output.shape = inputs[1].shape
return output
def Minimum(inputs, **kwargs):
"""Return the min value of given two inputs.
Parameters
----------
inputs : list
The input tensors, A and B.
Returns
-------
Tensor
The output tensor.
"""
inputs[0] = WrapConstants(inputs[0], dtype='float32')
inputs[1] = WrapConstants(inputs[1], dtype='float32')
CheckInputs(inputs, 2)
arguments = ParseArguments(locals())
output = Tensor.CreateOperator(nout=1, op_type='Minimum', **arguments)
if inputs[0].shape is not None and \
inputs[1].shape is not None:
output.shape = inputs[0].shape[:]
if output.shape != inputs[1].shape and \
len(output.shape) < len(inputs[1].shape):
output.shape = inputs[1].shape
return output
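Both ops accept python scalars on either side, since ``WrapConstants`` above lifts non-tensor inputs to float32 tensors first. A sketch:

    import dragon.ops as ops
    from dragon.core.tensor import Tensor

    a = Tensor('a', dtype='float32').Variable()
    relu_like = ops.Maximum([a, 0.0])  # element-wise max against a constant
    capped = ops.Minimum([a, 1.0])     # element-wise min against a constant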
def Clip(inputs, low=None, high=None, **kwargs):
"""Clip the input to be between lower and higher bounds.
......
......@@ -36,15 +36,19 @@ def _wrap_output_shape(output, shape):
return output
def Fill(shape, value=0, dtype='float32', **kwargs):
"""Return a Tensor with specific value filled.
If ``dtype`` is None, the values will be filled as ``float32``.
Parameters
----------
shape : list, tuple or Tensor
The output shape.
value : basic numerical type
The value to fill.
dtype : str
The optional data type.
Returns
-------
......
......@@ -19,6 +19,46 @@ from . import *
from .activation import Softmax
def NLLLoss(inputs, axis=1, normalization='VALID', ignore_labels=(), **kwargs):
"""Negative likelihood loss with sparse labels.
Parameters
----------
inputs : list of Tensor
The inputs, represent [input, sparse_labels].
axis : int
The axis of softmax function.
normalization : str
The normalization, ``UNIT``, ``FULL``, ``VALID``, ``BATCH_SIZE`` or ``NONE``.
ignore_labels : tuple or list
The label id to ignore. Default is ``empty``.
Returns
-------
Tensor
The loss.
Notes
-----
Setting the normalization to ``UNIT`` will return unreduced losses.
"""
CheckInputs(inputs, 2)
arguments = ParseArguments(locals())
output = Tensor.CreateOperator(nout=1, op_type='NLLLoss', **arguments)
if inputs[0].shape is not None:
if normalization != 'UNIT': output.shape = [1]
elif all(dim is not None for dim in inputs[0].shape):
outer_dim = int(np.prod(inputs[0].shape[0 : axis]))
inner_dim = int(np.prod(inputs[0].shape[axis + 1 :]))
output.shape = [outer_dim * inner_dim]
else: output.shape = [None]
return output
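A sketch of calling the op directly; ``log_prob`` is assumed to follow the torch convention of ``(N, C)`` log-probabilities, with ``label`` holding ``(N,)`` class ids:

    import dragon.ops as ops
    from dragon.core.tensor import Tensor

    log_prob = Tensor('log_prob', dtype='float32').Variable()
    label = Tensor('label', dtype='int64').Variable()

    # 'VALID' averages over non-ignored labels; 'UNIT' keeps unreduced losses.
    loss = ops.NLLLoss([log_prob, label], axis=1, normalization='VALID')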
def SparseSoftmaxCrossEntropy(inputs, axis=1, normalization='VALID', ignore_labels=(), **kwargs):
"""SoftmaxCrossEntropy with sparse labels.
......
......@@ -16,8 +16,10 @@ from __future__ import print_function
from . import *
def BatchNorm(
inputs, axis=-1, momentum=0.9, eps=1e-5,
use_stats=-1, mode='DEFAULT', **kwargs
):
"""Batch Normalization. `[Ioffe & Szegedy, 2015] <https://arxiv.org/abs/1502.03167>`_.
It follows the implementation of `Caffe`_, that scale procedure is moved to `ops.Scale(*args, **kwargs)`_.
......@@ -70,9 +72,11 @@ def BatchNorm(inputs, axis=-1, momentum=0.9, eps=1e-3,
return output
def BatchRenorm(
inputs, axis=-1, momentum=0.9, eps=1e-5,
r_max=3.0, d_max=5.0, t_delta=0.001,
use_stats=-1, mode='DEFAULT', **kwargs
):
"""Batch Renormalization. `[Ioffe, 2017] <https://arxiv.org/abs/1702.03275>`_.
It follows the implementation of `Caffe`_, that scale procedure is moved to `ops.Scale(*args, **kwargs)`_.
......@@ -131,7 +135,10 @@ def BatchRenorm(inputs, axis=-1, momentum=0.9, eps=1e-3,
return output
def FusedBatchNorm(
inputs, axis=-1, momentum=0.9, eps=1e-5,
use_stats=-1, **kwargs
):
"""Batch Normalization, with scale procedure after normalization.
Parameters
......@@ -170,7 +177,7 @@ def FusedBatchNorm(inputs, axis=-1, momentum=0.9, eps=1e-3, use_stats=-1, **kwar
return output
def GroupNorm(inputs, group=32, axis=-1, eps=1e-5, **kwargs):
"""Group Normalization. `[Wu & He, 2018] <https://arxiv.org/abs/1803.08494>`_.
Parameters
......@@ -203,7 +210,7 @@ def GroupNorm(inputs, group=32, axis=-1, eps=1e-3, **kwargs):
return output
def FusedGroupNorm(inputs, group=32, axis=-1, eps=1e-5, **kwargs):
"""Group Normalization, with scale procedure after normalization.
Parameters
......@@ -236,7 +243,7 @@ def FusedGroupNorm(inputs, group=32, axis=-1, eps=1e-3, **kwargs):
return output
def InstanceNorm(inputs, axis=-1, eps=1e-5, **kwargs):
"""Instance Normalization. `[Ulyanov et.al, 2016] <https://arxiv.org/abs/1607.08022>`_
Parameters
......
......@@ -630,3 +630,44 @@ def DenseConcat(inputs, growth_rate=0, axis=1, **kwargs):
output.shape[axis] += inputs[i].shape[axis]
return output
def DropBlock2d(inputs, block_size=7, keep_prob=0.9,
alpha=1., decrement=0., data_format='NCHW', **kwargs):
"""Randomly drop the outputs according to the spatial blocks. `[Ghiasi et.al, 2018] <https://arxiv.org/abs/1810.12890>`_.
Set the ``decrement`` to schedule ``keep_prob`` for each iteration.
Set the ``alpha`` to decrease ``gamma`` for different stages.
Parameters
----------
inputs : Tensor
The input tensor.
block_size : int
The size of dropping block.
keep_prob : float or Tensor
The prob of keeping. Default is ``0.9``.
alpha : float
The scale factor to gamma.
decrement : float
The decrement to keep prob.
data_format : str
The data format, ``NCHW`` or ``NHWC``.
Returns
-------
Tensor
The output tensor.
"""
CheckInputs(inputs, 1)
arguments = ParseArguments(locals())
arguments = AddArgumentWithDesc(arguments, keep_prob, 'keep_prob', as_target=False)
output = Tensor.CreateOperator(nout=1, op_type='DropBlock2d', **arguments)
if inputs.shape is not None:
output.shape = inputs.shape[:]
return output
\ No newline at end of file
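The ``decrement`` schedule is linear: with ``keep_prob=0.9`` and ``decrement=1e-6``, the effective keep prob after k iterations is ``0.9 - k * 1e-6`` (how the backend bounds it below is not shown here). A call sketch:

    import dragon.ops as ops
    from dragon.core.tensor import Tensor

    x = Tensor('x', dtype='float32').Variable()

    # Drop 7x7 spatial blocks; keep_prob decays by ``decrement`` per iteration.
    y = ops.DropBlock2d(x, block_size=7, keep_prob=0.9,
                        alpha=1.0, decrement=1e-6, data_format='NCHW')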
......@@ -51,6 +51,7 @@ NNResize = vision.NNResize
BilinearResize = vision.BilinearResize
BiasAdd = vision.BiasAdd
DenseConcat = vision.DenseConcat
DropBlock2d = vision.DropBlock2d
# recurrent
LSTMCell = recurrent.LSTMCell
......@@ -70,6 +71,7 @@ Softmax = act.Softmax
Dropout = act.Dropout
# loss
NLLLoss = loss.NLLLoss
SparseSoftmaxCrossEntropy = loss.SparseSoftmaxCrossEntropy
SigmoidCrossEntropy = loss.SigmoidCrossEntropy
SoftmaxCrossEntropy = loss.SoftmaxCrossEntropy
......@@ -85,6 +87,8 @@ Add = math.Add
Sub = math.Sub
Mul = math.Mul
Div = math.Div
Maximum = math.Maximum
Minimum = math.Minimum
Clip = math.Clip
Matmul = math.Matmul
Pow = math.Pow
......
......@@ -35,7 +35,11 @@ message Argument {
repeated string strings=7;
}
enum DeviceType {
CPU = 0;
CUDA = 1;
CNML = 2;
}
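From python, the new enum value is exposed on the generated module (a sketch; field names follow the ``DeviceOption`` message below):

    import dragon.protos.dragon_pb2 as pb

    opt = pb.DeviceOption()
    opt.device_type = pb.CNML  # CPU = 0, CUDA = 1, CNML = 2
    opt.device_id = 0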
message DeviceOption {
optional DeviceType device_type = 1 [default = CPU];
......
......@@ -19,7 +19,7 @@ _sym_db = _symbol_database.Default()
DESCRIPTOR = _descriptor.FileDescriptor(
name='dragon.proto',
package='dragon',
serialized_pb=_b('\n\x0c\x64ragon.proto\x12\x06\x64ragon\"\xfe\x01\n\x0bTensorProto\x12\x0c\n\x04\x64ims\x18\x01 \x03(\x05\x12\x36\n\tdata_type\x18\x02 \x01(\x0e\x32\x1c.dragon.TensorProto.DataType:\x05\x46LOAT\x12\x16\n\nfloat_data\x18\x03 \x03(\x02\x42\x02\x10\x01\x12\x16\n\nint32_data\x18\x04 \x03(\x05\x42\x02\x10\x01\x12\x11\n\tbyte_data\x18\x05 \x01(\x0c\x12\x13\n\x0bstring_data\x18\x06 \x03(\x0c\x12\x0c\n\x04name\x18\x07 \x01(\t\"C\n\x08\x44\x61taType\x12\t\n\x05\x46LOAT\x10\x01\x12\t\n\x05INT32\x10\x02\x12\x08\n\x04\x42YTE\x10\x03\x12\n\n\x06STRING\x10\x04\x12\x0b\n\x07\x46LOAT16\x10\x0c\"3\n\x0cTensorProtos\x12#\n\x06protos\x18\x01 \x03(\x0b\x32\x13.dragon.TensorProto\"\x80\x01\n\x08\x41rgument\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\t\n\x01\x66\x18\x02 \x01(\x02\x12\t\n\x01i\x18\x03 \x01(\x05\x12\x0b\n\x03i64\x18\t \x01(\x03\x12\t\n\x01s\x18\x04 \x01(\t\x12\t\n\x01\x62\x18\x08 \x01(\x08\x12\x0e\n\x06\x66loats\x18\x05 \x03(\x02\x12\x0c\n\x04ints\x18\x06 \x03(\x05\x12\x0f\n\x07strings\x18\x07 \x03(\t\"z\n\x0c\x44\x65viceOption\x12,\n\x0b\x64\x65vice_type\x18\x01 \x01(\x0e\x32\x12.dragon.DeviceType:\x03\x43PU\x12\x14\n\tdevice_id\x18\x02 \x01(\x05:\x01\x30\x12\x16\n\x0brandom_seed\x18\x03 \x01(\r:\x01\x33\x12\x0e\n\x06\x65ngine\x18\x04 \x01(\t\"\x94\x01\n\x0bOperatorDef\x12\r\n\x05input\x18\x01 \x03(\t\x12\x0e\n\x06output\x18\x02 \x03(\t\x12\x0c\n\x04name\x18\x03 \x01(\t\x12\x0c\n\x04type\x18\x04 \x01(\t\x12\x1d\n\x03\x61rg\x18\x05 \x03(\x0b\x32\x10.dragon.Argument\x12+\n\rdevice_option\x18\x06 \x01(\x0b\x32\x14.dragon.DeviceOption\"=\n\x0eGradientTarget\x12\x0c\n\x04\x63ost\x18\x01 \x01(\t\x12\x0b\n\x03wrt\x18\x02 \x01(\t\x12\x10\n\x08\x65xternal\x18\x03 \x01(\t\"Y\n\x0cUpdateTarget\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04type\x18\x02 \x01(\t\x12\x0e\n\x06tensor\x18\x03 \x03(\t\x12\x1d\n\x03\x61rg\x18\x04 \x03(\x0b\x32\x10.dragon.Argument\"\x94\x02\n\x0cTensorFiller\x12\x0e\n\x06tensor\x18\x01 \x01(\t\x12\x16\n\x04type\x18\x02 \x01(\t:\x08\x63onstant\x12\x10\n\x05value\x18\x03 \x01(\x02:\x01\x30\x12\x0e\n\x03low\x18\x04 \x01(\x02:\x01\x30\x12\x0f\n\x04high\x18\x05 \x01(\x02:\x01\x31\x12\x0f\n\x04mean\x18\x06 \x01(\x02:\x01\x30\x12\x0e\n\x03std\x18\x07 \x01(\x02:\x01\x31\x12\x10\n\x05scale\x18\x08 \x01(\x02:\x01\x33\x12@\n\rvariance_norm\x18\t \x01(\x0e\x32!.dragon.TensorFiller.VarianceNorm:\x06\x46\x41N_IN\"4\n\x0cVarianceNorm\x12\n\n\x06\x46\x41N_IN\x10\x00\x12\x0b\n\x07\x46\x41N_OUT\x10\x01\x12\x0b\n\x07\x46\x41N_AVG\x10\x02\"\xfb\x01\n\x08GraphDef\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1f\n\x02op\x18\x02 \x03(\x0b\x32\x13.dragon.OperatorDef\x12\x12\n\ngraph_type\x18\x03 \x01(\t\x12+\n\rdevice_option\x18\x05 \x01(\x0b\x32\x14.dragon.DeviceOption\x12\x1d\n\x03\x61rg\x18\x06 \x03(\x0b\x32\x10.dragon.Argument\x12\x0e\n\x06target\x18\x07 \x03(\t\x12(\n\x08g_target\x18\x08 \x03(\x0b\x32\x16.dragon.GradientTarget\x12&\n\x08u_target\x18\t \x03(\x0b\x32\x14.dragon.UpdateTarget*)\n\nDeviceType\x12\x07\n\x03\x43PU\x10\x00\x12\x08\n\x04\x43UDA\x10\x01\x12\x08\n\x04\x43NML\x10\x02')
)
_sym_db.RegisterFileDescriptor(DESCRIPTOR)
......@@ -38,21 +38,21 @@ _DEVICETYPE = _descriptor.EnumDescriptor(
options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='CNML', index=2, number=2,
options=None,
type=None),
],
containing_type=None,
options=None,
serialized_start=1427,
serialized_end=1468,
)
_sym_db.RegisterEnumDescriptor(_DEVICETYPE)
DeviceType = enum_type_wrapper.EnumTypeWrapper(_DEVICETYPE)
CPU = 0
CUDA = 1
CNML = 2
_TENSORPROTO_DATATYPE = _descriptor.EnumDescriptor(
......
......@@ -14,7 +14,6 @@ from __future__ import division
from __future__ import print_function
import pprint
import dragon.core.workspace as ws
from dragon.core.tensor import Tensor
......@@ -43,7 +42,7 @@ class BaseUpdater(object):
self._defaults = {
'scale_gradient': scale_gradient,
'clip_gradient': clip_gradient,
'l2_decay': l2_decay,
}
self._param_group = []
self._slot = slot
......@@ -77,7 +76,7 @@ class BaseUpdater(object):
defaults = self.__dict__.get('_defaults')
if item in defaults:
if self._registered:
return ws.FetchTensor(self._slot + '/' + item)
else: return defaults[item]
return self.__dict__[item]
......@@ -85,9 +84,8 @@ class BaseUpdater(object):
defaults = self.__dict__.get('_defaults')
if defaults is not None and key in defaults:
if self._registered:
ws.FeedTensor(self._slot + '/' + key, value,
dtype='float32', force_cpu=True)
else:
self._defaults[key] = value
else:
......@@ -96,8 +94,8 @@ class BaseUpdater(object):
def register_in_workspace(self):
if not self._registered:
for k, v in self._defaults.items():
ws.FeedTensor(self._slot + "/" + k, v,
dtype='float32', force_cpu=True)
self._registered = True
if self._verbose:
from dragon.config import logger
......
......@@ -14,7 +14,7 @@ from __future__ import division
from __future__ import print_function
version = '0.2.2'
full_version = '0.2.2.13'
release = False
if not release:
......
......@@ -19,7 +19,8 @@ from .vision import ConvolutionLayer, \
ROIPoolingLayer, \
ROIAlignLayer, \
NNResizeLayer, \
BilinearResizeLayer, \
DropBlockLayer
from .neuron import ReLULayer, \
PReLULayer, \
......
......@@ -446,10 +446,13 @@ class InstanceNormLayer(Layer):
The implementation of ``InstanceNormLayer``.
Introduced by `[Ulyanov et.al, 2016] <https://arxiv.org/abs/1607.08022>`_
"""
def __init__(self, LayerParameter):
super(InstanceNormLayer, self).__init__(LayerParameter)
param = LayerParameter.instance_norm_param
self._param = {'eps': param.eps,
'axis': 1}
def Setup(self, bottom):
super(InstanceNormLayer, self).Setup(bottom)
......
......@@ -250,7 +250,7 @@ class NNResizeLayer(Layer):
Parameters
----------
shape : caffe_pb2.BlobShape
The output shape. Refer `ResizeParameter.shape`_.
fx : float
The scale factor of height. Refer `ResizeParameter.fx`_.
......@@ -283,7 +283,7 @@ class BilinearResizeLayer(Layer):
Parameters
----------
shape : caffe_pb2.BlobShape
The output shape. Refer `ResizeParameter.shape`_.
fx : float
The scale factor of height. Refer `ResizeParameter.fx`_.
......@@ -309,3 +309,33 @@ class BilinearResizeLayer(Layer):
raise ValueError('The second bottom should be provided to determine the shape.')
self._param['shape_like'] = bottom[1]
return ops.BilinearResize(input, **self._param)
class DropBlockLayer(Layer):
"""The implementation of ``DropBlock2dLayer``.
Parameters
----------
block_size : int
The size of dropping block. Refer ``DropBlockParameter.block_size``.
keep_prob : float
The prob of keeping. Refer ``DropBlockParameter.keep_prob``.
alpha : float
The scale factor to gamma. Refer ``DropBlockParameter.alpha``.
decrement : float
The decrement to keep prob. Refer ``DropBlockParameter.decrement``.
"""
def __init__(self, LayerParameter):
super(DropBlockLayer, self).__init__(LayerParameter)
param = LayerParameter.drop_block_param
self._param = {'block_size': param.block_size,
'keep_prob': param.keep_prob,
'alpha': param.alpha,
'decrement': param.decrement,
'data_format': 'NCHW'}
def Setup(self, bottom):
super(DropBlockLayer, self).Setup(bottom)
input = bottom[0] if isinstance(bottom, list) else bottom
return ops.DropBlock2d(input, **self._param)
\ No newline at end of file
......@@ -424,7 +424,9 @@ message LayerParameter {
optional DenseConcatParameter dense_concat_param = 163;
optional FocalLossParameter focal_loss_param = 164;
optional GatherParameter gather_param = 165;
optional InstanceNormParameter instance_norm_param = 166;
optional GroupNormParameter group_norm_param = 167;
optional DropBlockParameter drop_block_param = 168;
}
// Message that stores parameters used to apply transformation
......@@ -537,7 +539,7 @@ message BatchNormParameter {
optional float moving_average_fraction = 2 [default = 0.9];
// Small value to add to the variance estimate so that we don't divide by
// zero.
optional float eps = 3 [default = 1e-5];
}
message BiasParameter {
......@@ -595,7 +597,7 @@ message ConvolutionParameter {
repeated uint32 stride = 6; // The stride; defaults to 1
// Factor used to dilate the kernel, (implicitly) zero-filling the resulting
// holes. (Kernel dilation is sometimes referred to by its use in the
// algorithme à trous from Holschneider et al. 1987.)
repeated uint32 dilation = 18; // The dilation; defaults to 1
// For 2D convolution only, the *_h and *_w versions may also be used to
......@@ -1456,7 +1458,7 @@ message NormalizeParameter {
// Whether or not scale parameters are shared across channels.
optional bool channel_shared = 3 [default = true];
// Epsilon for not dividing by zero while normalizing variance
optional float eps = 4 [default = 1e-5];
}
message ParallelParameter {
......@@ -1492,7 +1494,7 @@ message ProposalParameter {
message BatchRenormParameter {
optional bool use_global_stats = 1;
optional float moving_average_fraction = 2 [default = 0.9];
optional float eps = 3 [default = 1e-5];
optional float r_max = 4 [default = 3.0];
optional float d_max = 5 [default = 5.0];
optional float t_delta = 6 [default = 0.001];
......@@ -1513,17 +1515,18 @@ message GatherParameter {
optional int32 axis = 1 [default = 0];
}
message InstanceNormParameter {
optional float eps = 1 [default = 1e-5];
}
message GroupNormParameter {
optional float eps = 1 [default = 1e-5];
optional int32 group = 2 [default = 32]; // The group size
}
message DropBlockParameter {
optional int32 block_size = 1 [default = 7];
optional float keep_prob = 2 [default = 0.9];
optional float alpha = 3 [default = 1.0];
optional float decrement = 4 [default = 0.0];
}
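A sketch of filling the new message from python; the generated-module path is an assumption based on the caffe vm layout:

    from dragon.vm.caffe.proto import caffe_pb2  # assumed import path

    layer = caffe_pb2.LayerParameter()
    layer.drop_block_param.block_size = 7
    layer.drop_block_param.keep_prob = 0.9
    layer.drop_block_param.alpha = 1.0
    layer.drop_block_param.decrement = 0.0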
......@@ -24,7 +24,7 @@ def convert_to_tensor(value, dtype=None, name=None, **kwargs):
Parameters
----------
value : number, list or numpy.ndarray
The value to convert.
dtype : Dtype or None
The data type. If ``None``, inferred from the type of `value`.
......
......@@ -15,6 +15,7 @@ import numpy as np
import dragon.core.mpi as mpi
import dragon.core.workspace as ws
import dragon.protos.dragon_pb2 as pb
from dragon.core.utils import MakeArgument
from dragon.core.gradient_maker import GraphGradientMaker
from dragon.core.scope import GetOperatorName, GetTensorName
......@@ -156,6 +157,7 @@ def GraphDef_Opt(meta_graph):
OX = 3 if option['share_grads'] else 2
if option['debug_mode']: OX = 1
meta_graph.arg.add().CopyFrom(MakeArgument('optimization_level', OX))
meta_graph.graph_type = option['graph_type']
def GraphDef_Device(meta_graph):
......@@ -181,11 +183,12 @@ def GraphDef_Device(meta_graph):
"""
from dragon.config import option
if option['device'] != 'None':
supports = {'CPU': 0, 'CUDA': 1, 'CNML': 2}
device_option = pb.DeviceOption()
device_option.device_type = supports[option['device']]
device_option.device_id = option['device_id']
device_option.random_seed = option['random_seed']
if option['device'] == 'CUDA':
if option['use_cudnn']: device_option.engine = 'CUDNN'
meta_graph.device_option.CopyFrom(device_option)
......@@ -217,16 +220,16 @@ def function(inputs=None, outputs=None, givens=None, updater=None):
Examples
--------
>>> x = Tensor('x', dtype='float32').Variable()
>>> y = x * 2
>>> f = function(outputs=y)
>>> x.set_value(np.ones((2, 3)))
>>> print(f())
>>> [[ 2. 2. 2.]
[ 2. 2. 2.]]
>>> f = function(inputs=x, outputs=y)
>>> print(f(np.ones((2, 3))))
>>> [[ 2. 2. 2.]
[ 2. 2. 2.]]
......@@ -339,13 +342,15 @@ def eval(self, feed_dict=None):
raise TypeError('The key of feed_dict should be a Tensor.')
if key.shape is not None:
if len(key.shape) != len(value.shape):
raise RuntimeError(
'The Tensor({}) was limited to {} dimensions, \
while feeding a value with {} dimensions.'.format(
key.name, len(key.shape), len(value.shape)))
for i in range(len(key.shape)):
if key.shape[i] is None: continue
if key.shape[i] != value.shape[i]:
raise RuntimeError(
'The shape of Tensor({}) was limited as ('.format(key.name) +
','.join([str(dim) for dim in key.shape]) + '), ' +
'while feeding a value with (' + ','.join([str(dim) for dim in value.shape]) + ').')
return self._eval_func(*feed_dict.values())
......
......@@ -20,7 +20,7 @@ def shared(value, name=None, **kwargs):
Parameters
----------
value : number, list or numpy.ndarray
The numerical values.
name : str
The name of tensor.
......
......@@ -42,6 +42,7 @@ class Module(object):
self._buffers = OrderedDict()
self._persistent_key = self._op = None
self._ctx = ('CPU', 0)
self.training = True
def __getattr__(self, item):
if '_parameters' in self.__dict__:
......@@ -363,3 +364,12 @@ class Module(object):
def run(self, inputs, outputs, auto_grad=True):
meta = ('PERSISTENT', self.persistent_key, self.op)
return RunOperator(inputs, outputs, meta, auto_grad=auto_grad)
def train(self, mode=True):
self.training = mode
for module in self.children():
module.train(mode)
return self
def eval(self):
return self.train(False)
\ No newline at end of file
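A sketch of the new mode switch; ``train``/``eval`` recurse into children, which is what ``nn.Dropout`` and the ``nn.BatchNorm*`` modules now key off instead of ``requires_grad``:

    import dragon.vm.torch.nn as nn

    m = nn.Sequential(
        nn.Linear(128, 64),
        nn.Dropout(0.5),
    )
    m.train()  # training=True on every child; Dropout is active
    m.eval()   # training=False; Dropout becomes an identity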
......@@ -10,20 +10,35 @@
# ------------------------------------------------------------
"""We move the Module & Parameter to ``torch`` instead of ``torch.nn``,
as they will be reused by ``torch.ops``.
"""
from dragon.vm.torch.module import Module
from dragon.vm.torch.tensor import Parameter
from .modules.conv import Conv2d, ConvTranspose2d
from .modules.pooling import MaxPool2d, AvgPool2d
from .modules.activation import (
ReLU, LeakyReLU, ELU, SELU,
Sigmoid, Softmax,
)
from .modules.linear import Linear
from .modules.loss import (
BCEWithLogitsLoss,
NLLLoss, CrossEntropyLoss,
L1Loss, MSELoss, SmoothL1Loss,
)
from .modules.container import Container, Sequential, ModuleList
from .modules.batchnorm import BatchNorm1d, BatchNorm2d, BatchNorm3d
from .modules.groupnorm import GroupNorm1d, GroupNorm2d, GroupNorm3d
from .modules.affine import Affine
from .modules.dropout import Dropout, Dropout2d, Dropout3d
from .modules.dropblock import DropBlock2d
from .modules.rnn import RNNBase, RNN, LSTM, GRU
from . import init
\ No newline at end of file
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# Codes are based on:
#
# <https://github.com/pytorch/pytorch/blob/master/torch/nn/functional.py>
#
# ------------------------------------------------------------
import warnings
class _Reduction:
@staticmethod
def get_enum(reduction):
if reduction == 'none':
return 0
if reduction == 'elementwise_mean':
return 1
if reduction == 'sum':
return 2
raise ValueError(reduction + " is not a valid value for reduction")
# In order to support previous versions, accept boolean size_average and reduce
# and convert them into the new constants for now
# We use these functions in torch/legacy as well, in which case we'll silence the warning
@staticmethod
def legacy_get_string(size_average, reduce, emit_warning=True):
warning = "size_average and reduce args will be deprecated, please use reduction='{}' instead."
if size_average is None:
size_average = True
if reduce is None:
reduce = True
if size_average and reduce:
ret = 'elementwise_mean'
elif reduce:
ret = 'sum'
else:
ret = 'none'
if emit_warning:
warnings.warn(warning.format(ret))
return ret
@staticmethod
def legacy_get_enum(size_average, reduce, emit_warning=True):
return _Reduction.get_enum(_Reduction.legacy_get_string(size_average, reduce, emit_warning))
\ No newline at end of file
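The legacy mapping above is: both flags truthy (or unset) -> 'elementwise_mean'; ``reduce`` alone -> 'sum'; otherwise -> 'none'. A quick check sketch:

    from dragon.vm.torch.nn.functional import _Reduction

    assert _Reduction.legacy_get_string(None, None, emit_warning=False) == 'elementwise_mean'
    assert _Reduction.legacy_get_string(False, True, emit_warning=False) == 'sum'
    assert _Reduction.legacy_get_string(True, False, emit_warning=False) == 'none'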
......@@ -55,6 +55,47 @@ class LeakyReLU(Module):
return self.run(inputs, outputs)
class ELU(Module):
def __init__(self, alpha=1.0, inplace=False):
super(ELU, self).__init__()
self.alpha = alpha
self._inplace = inplace
self.register_op()
def register_op(self):
self.op_meta = {
'op_type': 'Elu',
'n_inputs': 1, 'n_outputs': 1,
'arguments': {
'alpha': self.alpha,
}
}
def forward(self, x):
inputs = [x]; self.unify_devices(inputs)
outputs = [x if self._inplace else self.register_output(x.dtype)]
return self.run(inputs, outputs)
class SELU(Module):
def __init__(self, inplace=False):
super(SELU, self).__init__()
self._inplace = inplace
self.register_op()
def register_op(self):
self.op_meta = {
'op_type': 'SElu',
'n_inputs': 1, 'n_outputs': 1,
'arguments': {}
}
def forward(self, x):
inputs = [x]; self.unify_devices(inputs)
outputs = [x if self._inplace else self.register_output(x.dtype)]
return self.run(inputs, outputs)
class Sigmoid(Module):
def __init__(self, inplace=False):
super(Sigmoid, self).__init__()
......
......@@ -102,7 +102,7 @@ class _BatchNorm(Module):
inputs = [input] + self.inputs
self.unify_devices(inputs)
outputs = [self.register_output(input.dtype)]
phase = 'TRAIN' if self.training else 'TEST'
# Normalize the input by using batch stats ALWAYS
# Note that the update of the moving average is meaningless
# (we cannot remove it; ask NVIDIA and cuDNN why :-)
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.torch.nn import Module
class DropBlock2d(Module):
def __init__(self, block_size=7, kp=0.9,
alpha=1., decrement=0., inplace=False):
super(DropBlock2d, self).__init__()
self.kp = kp
self.block_size = block_size
self.alpha = alpha
self.decrement = decrement
self.inplace = inplace
self.register_op()
def register_op(self):
self.op_meta = {
'op_type': 'DropBlock2d',
'n_inputs': 1, 'n_outputs': 1,
'arguments': {
'block_size': self.block_size,
'keep_prob': self.kp,
'alpha': self.alpha,
'decrement': self.decrement,
'data_format': 'NCHW',
'phase': 'TRAIN',
}
}
def forward(self, input):
if not self.training: return input
inputs = [input]
self.unify_devices(inputs)
outputs = [input if self.inplace else self.register_output(input.dtype)]
return self.run(inputs, outputs)
\ No newline at end of file
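A usage sketch; in ``eval()`` mode the forward above returns its input untouched:

    import dragon.vm.torch.nn as nn

    drop = nn.DropBlock2d(block_size=7, kp=0.9, decrement=1e-6)
    drop.train()  # blocks are dropped; keep_prob decays each iteration
    drop.eval()   # identity: forward() returns the input unchanged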
......@@ -34,7 +34,7 @@ class Dropout(Module):
}
def forward(self, input):
if not self.training: return input
inputs = [input]
self.unify_devices(inputs)
outputs = [input if self.inplace else self.register_output(input.dtype)]
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.torch.tensor import Tensor
from dragon.vm.torch.nn import Module, Parameter
from dragon.vm.torch.ops.creation import zeros, ones
from dragon.vm.torch.module import RunOperator
class _GroupNorm(Module):
def __init__(self, num_features, group=32,
eps=1e-5, affine=True):
super(_GroupNorm, self).__init__()
self.num_features = num_features
self.group = group
self.eps = eps
self.affine = affine
if self.affine:
self.weight = Parameter(Tensor(num_features))
self.bias = Parameter(Tensor(num_features))
else:
self.weight = self.bias = None
self.inputs = [self.weight, self.bias] if self.affine else []
self.reset_parameters()
self.register_op()
def reset_parameters(self):
if self.affine:
self.weight.data.uniform_()
self.bias.data.zero_()
def register_op(self):
self.op_meta = {
'op_type': 'FusedGroupNorm' if self.affine else 'GroupNorm',
'n_inputs': 3 if self.affine else 1, 'n_outputs': 1,
'arguments': {
'group': self.group,
'axis': 1, # Data format: NCHW
'eps': self.eps,
}
}
def forward(self, input):
inputs = [input] + self.inputs
self.unify_devices(inputs)
outputs = [self.register_output(input.dtype)]
return self.run(inputs, outputs)
class GroupNorm1d(_GroupNorm):
"""Dragon does not use separate backend functions."""
pass
class GroupNorm2d(_GroupNorm):
"""Dragon does not use separate backend functions."""
pass
class GroupNorm3d(_GroupNorm):
"""Dragon does not use separate backend functions."""
pass
\ No newline at end of file
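The three aliases all share ``_GroupNorm``; a construction sketch:

    import dragon.vm.torch.nn as nn

    # 32 groups over 64 channels; affine=True adds the fused weight/bias pair.
    gn = nn.GroupNorm2d(num_features=64, group=32, eps=1e-5, affine=True)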
......@@ -18,50 +18,176 @@ from __future__ import division
from __future__ import print_function
from dragon.vm.torch.nn import Module
from dragon.vm.torch.nn.functional import _Reduction
class _Loss(Module):
def __init__(self, size_average=None, reduce=None, reduction='elementwise_mean'):
super(_Loss, self).__init__()
if size_average is not None or reduce is not None:
self.reduction = _Reduction.legacy_get_string(size_average, reduce)
else:
self.reduction = reduction
class _WeightedLoss(_Loss):
def __init__(self, weight=None, size_average=None, reduce=None, reduction='elementwise_mean'):
super(_WeightedLoss, self).__init__(size_average, reduce, reduction)
self.weight = weight
# TODO(PhyscalX): Dragon will support it later :).
if weight is not None:
raise NotImplementedError('WeightedLoss has not been implemented yet.')
class NLLLoss(_WeightedLoss):
def __init__(self, weight=None, size_average=None, ignore_index=-100,
reduce=None, reduction='elementwise_mean'):
super(NLLLoss, self).__init__(weight, size_average, reduce, reduction)
self.ignore_index = ignore_index
self.normalization = {
'elementwise_mean': 'VALID',
'sum': 'None',
'none': 'UNIT'}[self.reduction]
self.register_op()
def register_op(self):
self.op_meta = {
'op_type': 'NLLLoss',
'n_inputs': 2, 'n_outputs': 1,
'arguments': {
'axis': 1,
'normalization': self.normalization,
'ignore_labels': () if self.ignore_index < 0 else (self.ignore_index,),
}
}
def forward(self, input, target):
inputs = [input, target]; self.unify_devices(inputs)
outputs = [self.register_output(input.dtype)]
return self.run(inputs, outputs)
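A usage sketch: log-probabilities are expected along axis 1 and the targets are sparse class indices (``torch`` is the ``dragon.vm.torch`` alias, as above; integer label handling is assumed):

    criterion = NLLLoss(ignore_index=-100)   # reduction='elementwise_mean' by default
    log_probs = torch.randn(8, 10)           # e.g. the output of a log-softmax
    labels = torch.zeros(8)                  # class indices, here all zero
    loss = criterion(log_probs, labels)      # averaged over the valid labels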
class BCEWithLogitsLoss(_WeightedLoss):
def __init__(self, weight=None, size_average=None, reduce=None,
reduction='elementwise_mean', pos_weight=None):
super(BCEWithLogitsLoss, self).__init__(weight, size_average, reduce, reduction)
if pos_weight is not None:
raise NotImplementedError('Positive weight has not been implemented yet.')
self.normalization = {
'elementwise_mean': 'VALID',
'sum': 'NONE',
'none': 'UNIT'}[self.reduction]
self.register_op()
def register_op(self):
self.op_meta = {
'op_type': 'SigmoidCrossEntropy',
'n_inputs': 2, 'n_outputs': 1,
'arguments': {
'normalization': self.normalization,
}
}
def forward(self, input, target):
inputs = [input, target]; self.unify_devices(inputs)
outputs = [self.register_output(input.dtype)]
return self.run(inputs, outputs)
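As the ``SigmoidCrossEntropy`` op type suggests, the sigmoid is fused into the loss, so raw logits are passed in; targets are {0, 1} values of the same shape. A sketch:

    criterion = BCEWithLogitsLoss(reduction='sum')
    logits = torch.randn(4, 3)
    targets = torch.zeros(4, 3)           # binary targets in {0, 1}
    loss = criterion(logits, targets)     # 'sum' skips the averaging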
class CrossEntropyLoss(_WeightedLoss):
def __init__(self, weight=None, size_average=True, ignore_index=-100, reduce=True):
super(CrossEntropyLoss, self).__init__(weight, size_average)
def __init__(self, weight=None, size_average=None, ignore_index=-100,
reduce=None, reduction='elementwise_mean'):
super(CrossEntropyLoss, self).__init__(weight, size_average, reduce, reduction)
self.ignore_index = ignore_index
self.reduce = reduce
self.normalization = {
'elementwise_mean': 'VALID',
'sum': 'NONE',
'none': 'UNIT'}[self.reduction]
self.register_op()
def register_op(self):
self.op_meta = {
'op_type': 'SparseSoftmaxCrossEntropy' if self.reduce else 'SoftmaxCrossEntropy',
'op_type': 'SparseSoftmaxCrossEntropy',
'n_inputs': 2, 'n_outputs': 1,
'arguments': {
'axis': 1,
'normalization': 'VALID' if self.size_average else 'NONE',
'normalization': self.normalization,
'ignore_labels': () if self.ignore_index < 0 else (self.ignore_index,),
}
}
def forward(self, input, target):
_assert_no_grad(target)
inputs = [input, target]; self.unify_devices(inputs)
outputs = [self.register_output(input.dtype)]
return self.run(inputs, outputs)
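The rewrite always emits ``SparseSoftmaxCrossEntropy``: the softmax is fused in and the targets stay sparse indices rather than one-hot vectors. A sketch:

    criterion = CrossEntropyLoss(ignore_index=255)  # label 255 is masked out
    scores = torch.randn(8, 10)                     # raw, un-normalized logits
    labels = torch.zeros(8)                         # sparse class indices
    loss = criterion(scores, labels)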
class L1Loss(_Loss):
def __init__(self, size_average=None, reduce=None, reduction='elementwise_mean'):
super(L1Loss, self).__init__(size_average, reduce, reduction)
self.normalization = {
'elementwise_mean': 'BATCH_SIZE',
'sum': 'NONE'}[self.reduction]
self.register_op()
def register_op(self):
self.op_meta = {
'op_type': 'L1Loss',
'n_inputs': 2, 'n_outputs': 1,
'arguments': {
'normalization': self.normalization,
}
}
def forward(self, input, target):
inputs = [input, target]; self.unify_devices(inputs)
outputs = [self.register_output(input.dtype)]
return self.run(inputs, outputs)
class MSELoss(_Loss):
def __init__(self, size_average=None, reduce=None, reduction='elementwise_mean'):
super(MSELoss, self).__init__(size_average, reduce, reduction)
self.normalization = {
'elementwise_mean': 'BATCH_SIZE',
'sum': 'NONE'}[self.reduction]
self.register_op()
def register_op(self):
self.op_meta = {
'op_type': 'L2Loss',
'n_inputs': 2, 'n_outputs': 1,
'arguments': {
'normalization': self.normalization,
}
}
def forward(self, input, target):
inputs = [input, target]; self.unify_devices(inputs)
outputs = [self.register_output(input.dtype)]
return self.run(inputs, outputs)
class SmoothL1Loss(_Loss):
def __init__(self, size_average=None, beta=1.0,
reduce=None, reduction='elementwise_mean'):
super(SmoothL1Loss, self).__init__(size_average, reduce, reduction)
self.normalization = {
'elementwise_mean': 'BATCH_SIZE',
'sum': 'NONE'}[self.reduction]
self.beta = beta
self.register_op()
def register_op(self):
self.op_meta = {
'op_type': 'SmoothL1Loss',
'n_inputs': 2, 'n_outputs': 1,
'arguments': {
'beta': self.beta,
'normalization': self.normalization,
}
}
def forward(self, input, target):
inputs = [input, target]; self.unify_devices(inputs)
outputs = [self.register_output(input.dtype)]
return self.run(inputs, outputs)
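Note that the three regression losses above map 'elementwise_mean' onto Dragon's BATCH_SIZE normalization (divide by N rather than by the full element count), and that only 'elementwise_mean' and 'sum' are accepted; 'none' would raise a KeyError in these lookup tables. A sketch:

    l1 = L1Loss()                      # -> normalization 'BATCH_SIZE'
    mse = MSELoss(reduction='sum')     # -> no averaging at all
    huber = SmoothL1Loss(beta=1.0)     # beta is the L1/L2 switch point
    pred, truth = torch.randn(4, 3), torch.zeros(4, 3)
    losses = [f(pred, truth) for f in (l1, mse, huber)]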
......@@ -11,11 +11,12 @@
from .creation import (
zeros, zeros_like, ones, ones_like,
rand, randn
one_hot, rand, randn,
)
from .arithmetic import (
add, sub, mul, div,
add, sub, mul, div, log, exp,
maximum, minimum, clamp,
)
from .ndarray import (
......
......@@ -16,14 +16,15 @@ from __future__ import print_function
from dragon.vm.torch.tensor import Tensor
from dragon.vm.torch.ops.primitive import MakeContext, WrapScalar
from dragon.vm.torch.ops.factory import get_module
from dragon.vm.torch.ops.modules.arithmetic import Fundamental
from dragon.vm.torch.ops.modules.arithmetic import (
Fundamental, Log, Exp,
Maximum, Minimum, Clamp,
)
def _fundamental(input, value, op='Add', out=None):
if not isinstance(value, Tensor):
if not isinstance(value, (int, float)):
raise TypeError('Type of value should be numerical, got {}.'
.format(type(value)))
value = WrapScalar(value, input._dtype, input._ctx)
ctx = MakeContext(inputs=[input, value])
key = 'torch/ops/{}/{}:{}'.format(op.lower(), ctx[0].lower(), ctx[1])
......@@ -33,17 +34,63 @@ def _fundamental(input, value, op='Add', out=None):
def _rfundamental(input, value, op='RAdd', out=None):
if not isinstance(value, Tensor):
if not isinstance(value, (int, float)):
raise TypeError('Type of value should be numerical, got {}.'
.format(type(value)))
value = WrapScalar(value, input._dtype, input._ctx)
ctx = MakeContext(inputs=[input, value])
key = 'torch/ops/{}/{}:{}'.format(op.lower(), ctx[0].lower(), ctx[1])
module = get_module(Fundamental, key, ctx, op_type=op)
return module.forward(value, input, out)
def _maximum(input, other, out=None):
if not isinstance(input, Tensor):
input = WrapScalar(input, 'float32', other._ctx)
dtype = other._dtype
elif not isinstance(other, Tensor):
other = WrapScalar(other, 'float32', input._ctx)
dtype = input._dtype
else: dtype = input._dtype
ctx = MakeContext(inputs=[input])
key = 'torch/ops/maximum/{}:{}'.format(ctx[0].lower(), ctx[1])
module = get_module(Maximum, key, ctx)
return module.forward(input, other, out, dtype)
def _minimum(input, other, out=None):
if not isinstance(input, Tensor):
input = WrapScalar(input, 'float32', other._ctx)
dtype = other._dtype
elif not isinstance(other, Tensor):
other = WrapScalar(other, 'float32', input._ctx)
dtype = input._dtype
else: dtype = input._dtype
ctx = MakeContext(inputs=[input])
key = 'torch/ops/minimum/{}:{}'.format(ctx[0].lower(), ctx[1])
module = get_module(Minimum, key, ctx)
return module.forward(input, other, out, dtype)
def _clamp(input, min=None, max=None, out=None):
ctx = MakeContext(inputs=[input])
key = 'torch/ops/clamp/{}:{}/min:{}/max:{}'.format(
ctx[0].lower(), ctx[1], min, max)
module = get_module(Clamp, key, ctx, min=min, max=max)
return module.forward(input, out)
def _exp(input, out=None):
ctx = MakeContext(inputs=[input])
key = 'torch/ops/exp/{}:{}'.format(ctx[0].lower(), ctx[1])
module = get_module(Exp, key, ctx)
return module.forward(input, out)
def _log(input, out=None):
ctx = MakeContext(inputs=[input])
key = 'torch/ops/log/{}:{}'.format(ctx[0].lower(), ctx[1])
module = get_module(Log, key, ctx)
return module.forward(input, out)
def add(input, value, out=None):
"""Add the ``input`` and ``value`` into the output tensor.
......@@ -126,3 +173,106 @@ def div(input, value, out=None):
"""
return _fundamental(input, value, out=out, op='Div')
def maximum(input, other, out=None):
"""Return the max value of given two tensors.
Parameters
----------
input : vm.torch.Tensor
The input tensor.
other : vm.torch.Tensor
The tensor to compare with.
out : vm.torch.Tensor or None
The output tensor.
Returns
-------
vm.torch.Tensor
The output tensor.
"""
return _maximum(input, other, out)
def minimum(input, other, out=None):
"""Return the min value of given two tensors.
Parameters
----------
input : vm.torch.Tensor
The input tensor.
other : vm.torch.Tensor
The tensor to compare with.
out : vm.torch.Tensor or None
The output tensor.
Returns
-------
vm.torch.Tensor
The output tensor.
"""
return _minimum(input, other, out)
def clamp(input, min=None, max=None, out=None):
"""Clamp all elements into the range [min, max].
Parameters
----------
input : vm.torch.Tensor
The input tensor.
min : numerical or None
The min value.
max : numerical or None
The max value.
out : vm.torch.Tensor or None
The output tensor.
Returns
-------
vm.torch.Tensor
The output tensor.
"""
return _clamp(input, min, max, out)
def log(input, out=None):
"""Compute the natural logarithm of input.
Parameters
----------
input : vm.torch.Tensor
The input tensor.
out : vm.torch.Tensor or None
The output tensor.
Returns
-------
vm.torch.Tensor
The output tensor.
"""
return _log(input, out)
def exp(input, out=None):
"""Compute the exponential of input.
Parameters
----------
input : vm.torch.Tensor
The input tensor.
out : vm.torch.Tensor or None
The output tensor.
Returns
-------
vm.torch.Tensor
The output tensor.
"""
return _exp(input, out)
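Taken together, the new functional ops accept tensor/tensor or tensor/scalar operands; per ``_maximum``/``_minimum`` above, a scalar operand is wrapped as a float32 constant on the other operand's device. A sketch:

    x = torch.randn(3, 3)
    y = torch.maximum(x, 0.0)                   # element-wise max against a scalar
    z = torch.minimum(x, y)                     # element-wise min of two tensors
    w = torch.clamp(x, min=-1.0, max=1.0)       # either bound may be left as None
    p = torch.exp(torch.log(torch.ones(3, 3)))  # identity on positive inputs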
......@@ -21,12 +21,18 @@ from dragon.vm.torch.execute_engine import RunOperator
from dragon.vm.torch.ops.factory import get_module
from dragon.vm.torch.autograd.grad_mode import no_grad
from dragon.vm.torch.ops.primitive import MakeContext
from dragon.vm.torch.ops.arithmetic import _fundamental, _rfundamental
from dragon.vm.torch.ops.arithmetic import (
_fundamental, _rfundamental, _log, _exp,
_clamp,
)
from dragon.vm.torch.ops.ndarray import (
reshape, squeeze, unsqueeze,
_permute, _repeat, _crop,
_fill, _reduce, _arg_reduce,
)
from dragon.vm.torch.ops.modules.dtype import AsType
......@@ -53,9 +59,14 @@ def copy_(self, src, non_blocking=False):
The ``self`` tensor.
"""
# Copy memory
FromTensor(
src, CTX_TO_DEVICE_OPTION[tuple(src._ctx)],
self.name, CTX_TO_DEVICE_OPTION[tuple(self._ctx)])
self._dtype = src._dtype
# Transfer the static shape if necessary
self._static_shape = src.size() \
if self._static_shape else None
return self
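With this change ``copy_`` also carries over the source dtype, and the static shape when the destination has one, matching the changelog fix for the shape/dtype loss after ``Tensor.copy_``. A sketch:

    src = torch.ones(2, 2)
    dst = torch.zeros(2, 2)
    dst.copy_(src)   # dst now reports src's dtype as well as its values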
......@@ -295,6 +306,76 @@ def rdiv(self, value):
return _rfundamental(self, value, op='RDiv')
def clamp(self, min=None, max=None):
"""Return a tensor that all elements are clamped into the range [min, max].
Parameters
----------
min : numerical or None
The min value.
max : numerical or None
The max value.
Returns
-------
vm.torch.Tensor
The output tensor.
"""
return _clamp(self, min, max)
def clamp_(self, min=None, max=None):
"""Clamp all elements are clamped into the range [min, max].
Parameters
----------
min : numerical or None
The min value.
max : numerical or None
The max value.
Returns
-------
vm.torch.Tensor
The output tensor.
"""
return _clamp(self, min, max, self)
def log(self):
"""Compute the natural logarithm of this tensor.
Parameters
----------
None
Returns
-------
vm.torch.Tensor
The log tensor.
"""
return _log(self)
def exp(self):
"""Compute the exponential of this tensor.
Parameters
----------
None
Returns
-------
vm.torch.Tensor
The exp tensor.
"""
return _exp(self)
Tensor.add = add
Tensor.add_ = add_
Tensor.__radd__ = radd
......@@ -308,6 +389,10 @@ Tensor.div = div
Tensor.div_ = div_
Tensor.__rdiv__ = rdiv
Tensor.__rtruediv__ = rdiv
Tensor.clamp = clamp
Tensor.clamp_ = clamp_
Tensor.log = log
Tensor.exp = exp
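With these registrations the same ops are reachable as tensor methods. A sketch:

    x = torch.rand(3)
    x.clamp_(0.0, 1.0)      # in-place, returns x itself
    y = x.add(1.0).log()    # chained out-of-place ops
    z = y.exp()             # recovers x + 1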
##############################################
......@@ -387,16 +472,12 @@ def _unsqueeze_(self, dim=None):
def view(self, *args):
if self._static_shape:
raise RuntimeError('Cannot view a leaf variable; it owns static sizes.')
return reshape(self, shape=args)
def view_as(self, other):
if not isinstance(other, Tensor):
raise ValueError('The other should be a torch tensor.')
if self._static_shape:
raise RuntimeError('Cannot view a leaf variable; it owns static sizes.')
return reshape(self, shape=None, shape_like=other)
......
......@@ -13,14 +13,20 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.torch.ops.primitive import MakeContext, CanonicalAxis
from dragon.vm.torch.ops.factory import get_module
from dragon.vm.torch.tensor import LeafTensor
from dragon.vm.torch.execute_engine import RunOperator
from dragon.vm.torch.ops.primitive import MakeContext
from dragon.vm.torch.ops.factory import get_module
from dragon.vm.torch.ops.modules.creation import OneHot
__all__ = [
'zeros', 'zeros_like', 'ones', 'ones_like',
'rand', 'randn',
'one_hot', 'rand', 'randn',
]
......@@ -180,3 +186,26 @@ def randn(*sizes, **kwargs):
inputs = []; outputs = [out]; ctx = MakeContext(inputs, outputs)
meta = ('ONCE', 'RandomNormal', ctx)
return RunOperator(inputs, outputs, meta, **arguments)
def one_hot(input, depth):
"""Return a ont hot tensor according to given input.
Parameters
----------
input : vm.torch.Tensor
The input tensor.
depth : int
The depth, i.e. the number of classes.
Returns
-------
vm.torch.FloatTensor
The output tensor.
"""
ctx = MakeContext(inputs=[input])
key = 'torch/ops/one_hot/{}:{}/depth:{}'.format(
ctx[0].lower(), ctx[1], depth)
module = get_module(OneHot, key, ctx, depth=depth)
return module.forward(input)
\ No newline at end of file
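A sketch of the new creation op; the input is assumed to hold integral class indices in [0, depth):

    labels = torch.zeros(4)                   # four indices, here all class 0
    table = torch.one_hot(labels, depth=10)   # shape (4, 10); 1.0 at each index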
......@@ -38,3 +38,125 @@ class Fundamental(BaseModule):
inputs = [x1, x2]; self.unify_devices(inputs)
outputs = [y] if y else [self.register_output(x1.dtype)]
return self.run(inputs, outputs)
class Maximum(BaseModule):
def __init__(self, key, ctx, **kwargs):
super(Maximum, self).__init__(key, ctx, **kwargs)
self.register_arguments()
self.register_op()
def register_arguments(self):
"""No arguments for minimum op."""
pass
def register_op(self):
self.op_meta = {
'op_type': 'Maximum',
'n_inputs': 2, 'n_outputs': 1,
'arguments': {}
}
def forward(self, x1, x2, y, dtype):
inputs = [x1, x2]; self.unify_devices(inputs)
outputs = [y] if y else [self.register_output(dtype)]
return self.run(inputs, outputs)
class Minimum(BaseModule):
def __init__(self, key, ctx, **kwargs):
super(Minimum, self).__init__(key, ctx, **kwargs)
self.register_arguments()
self.register_op()
def register_arguments(self):
"""No arguments for minimum op."""
pass
def register_op(self):
self.op_meta = {
'op_type': 'Minimum',
'n_inputs': 2, 'n_outputs': 1,
'arguments': {}
}
def forward(self, x1, x2, y, dtype):
inputs = [x1, x2]; self.unify_devices(inputs)
outputs = [y] if y else [self.register_output(dtype)]
return self.run(inputs, outputs)
class Clamp(BaseModule):
def __init__(self, key, ctx, **kwargs):
super(Clamp, self).__init__(key, ctx, **kwargs)
self.min = kwargs.get('min', None)
self.max = kwargs.get('max', None)
if self.min is not None: self.min = float(self.min)
if self.max is not None: self.max = float(self.max)
self.register_arguments()
self.register_op()
def register_arguments(self):
"""No arguments clamp op."""
pass
def register_op(self):
self.op_meta = {
'op_type': 'Clip',
'n_inputs': 1, 'n_outputs': 1,
'arguments': {
'low': self.min,
'high': self.max,
}
}
def forward(self, x, y):
inputs = [x]; self.unify_devices(inputs)
outputs = [y] if y else [self.register_output(x.dtype)]
return self.run(inputs, outputs)
class Log(BaseModule):
def __init__(self, key, ctx, **kwargs):
super(Log, self).__init__(key, ctx, **kwargs)
self.register_arguments()
self.register_op()
def register_arguments(self):
"""No arguments for Log op."""
pass
def register_op(self):
self.op_meta = {
'op_type': 'Log',
'n_inputs': 1, 'n_outputs': 1,
'arguments': {}
}
def forward(self, x, y):
inputs = [x]; self.unify_devices(inputs)
outputs = [y] if y else [self.register_output(x.dtype)]
return self.run(inputs, outputs)
class Exp(BaseModule):
def __init__(self, key, ctx, **kwargs):
super(Exp, self).__init__(key, ctx, **kwargs)
self.register_arguments()
self.register_op()
def register_arguments(self):
"""No arguments for Log op."""
pass
def register_op(self):
self.op_meta = {
'op_type': 'Exp',
'n_inputs': 1, 'n_outputs': 1,
'arguments': {}
}
def forward(self, x, y):
inputs = [x]; self.unify_devices(inputs)
outputs = [y] if y else [self.register_output(x.dtype)]
return self.run(inputs, outputs)
\ No newline at end of file
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.torch.ops.modules.base import BaseModule
class OneHot(BaseModule):
def __init__(self, key, ctx, **kwargs):
super(OneHot, self).__init__(key, ctx, **kwargs)
self.depth = kwargs.get('depth', 1)
self.register_arguments()
self.register_op()
def register_arguments(self):
"""No Arguments for concat op."""
pass
def register_op(self):
self.op_meta = {
'op_type': 'OneHot',
'n_inputs': 1, 'n_outputs': 1,
'arguments': {
'depth': self.depth,
}
}
def forward(self, x):
inputs = [x]; self.unify_devices(inputs)
outputs = [self.register_output(x.dtype)]
return self.run(inputs, outputs)
\ No newline at end of file
......@@ -22,6 +22,7 @@ class Fill(BaseModule):
super(Fill, self).__init__(key, ctx, **kwargs)
self.len_shape = kwargs.get('len_shape', 0)
self.value = kwargs.get('value', 0.0)
self.dtype = kwargs.get('dtype', 'float32')
self.register_arguments()
self.register_op()
......@@ -34,6 +35,7 @@ class Fill(BaseModule):
'op_type': 'Fill',
'n_inputs': 0, 'n_outputs': 1,
'arguments': {
'dtype': self.dtype,
'value': float(self.value),
'dims_desc': [d for d in self.shape] if len(self.shape) > 0 else None,
}
......
......@@ -62,9 +62,10 @@ def _repeat(input, times):
def _fill(input, shape, value):
ctx = MakeContext(inputs=[input]); len_shape = len(shape)
key = 'torch/ops/fill/{}:{}/ndims:#{}/value:{}'.format(
ctx[0].lower(), ctx[1], len_shape, value)
module = get_module(Fill, key, ctx, len_shape=len_shape, value=value)
key = 'torch/ops/fill/{}:{}/dtype:{}/ndims:#{}/value:{}'.format(
ctx[0].lower(), ctx[1], input._dtype, len_shape, value)
module = get_module(Fill, key, ctx, len_shape=len_shape,
value=value, dtype=input._dtype)
return module.forward(input, shape)
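Threading ``input._dtype`` through to the Fill op, and into the module cache key, is what fixes the changelog bug where ``Tensor.fill_``/``Tensor.zero_`` silently turned a non-empty tensor into float32. A sketch, with ``.type('int8')`` as an assumed cast via the ``AsType`` module imported above:

    x = torch.ones(2, 2).type('int8')   # hypothetically an int8 tensor now
    x.fill_(1)                          # stays int8 instead of becoming float32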
......
......@@ -35,7 +35,7 @@ def _update(param, grad, op_type, slot,
lr_mult=1.0, decay_mult=1.0):
ctx = MakeContext(inputs=[param])
key = 'torch/ops/{}/{}:{}/{}/{}'.format(op_type.lower(),
ctx[0].lower(),ctx[1], slot, param.name)
ctx[0].lower(), ctx[1], slot, param.name)
module = get_module(Update, key, ctx, op_type=op_type,
lr_mult=lr_mult, decay_mult=decay_mult, slot=slot)
return module.forward(param, grad)
\ No newline at end of file
......@@ -72,10 +72,9 @@ class Optimizer(object):
param_temp = group['slot'] + '/{}'
for k, v in group.items():
if k in self._mutable_parameters:
# convert all defaults to float32 for convenience
dg.workspace.FeedTensor(param_temp.format(
self._mutable_parameters[k]),
np.array([v], dtype=np.float32))
self._mutable_parameters[k]), v,
dtype='float32', force_cpu=True)
def _run_update_ops(self, group):
"""Generate & Run UpdateOps.
......@@ -107,10 +106,12 @@ class Optimizer(object):
# Run regular update ops
for p, g in zip(params, grads):
_update(p, g, op_type=self._update_type,
_update(p, g,
op_type=self._update_type,
slot=group['slot'],
lr_mult=group.get('lr_mult', 1.0),
decay_mult=group.get('decay_mult', 1.0))
decay_mult=group.get('decay_mult', 1.0)
)
def zero_grad(self):
"""Set all gradients to zeros.
......
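End to end, the hyper-parameters fed above are consumed by the per-group update ops. A sketch modeled on the PyTorch API this vm mirrors (``model``, ``criterion`` and the ``optim.SGD``/``backward`` entry points are assumptions, not shown in this diff):

    opt = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
    opt.zero_grad()
    loss = criterion(model(x), y)
    loss.backward()
    opt.step()   # feeds lr etc. as float32 CPU tensors, then runs the update ops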