Commit b7365da6 by Ting PAN

Dragon 0.2.1 preview

1 parent 6eeac5fe
Showing with 2652 additions and 371 deletions
# ---------------- Welcome To Use Dragon ----------------
PROJECT(dragon)
CMAKE_MINIMUM_REQUIRED(VERSION 2.8.0)
CMAKE_MINIMUM_REQUIRED(VERSION 3.0.0)
# ---------------- Welcome To Use Dragon ----------------
......@@ -12,7 +12,7 @@ option(WITH_PYTHON3 "Set ON to use PYTHON3 otherwise PYTHON2" OF
option(WITH_CUDA "Set ON to use CUDA" ON)
option(WITH_CUDNN "Set ON to use CUDNN" OFF)
option(WITH_BLAS "Set ON to use BLAS" OFF)
option(WITH_OMP "Set ON to use OpenMP" OFF)
option(WITH_OMP "Set ON to use OpenMP" ON)
option(WITH_SSE "Set ON to use SSE 4.1" ON)
option(WITH_MPI "Set ON to use MPI" OFF)
option(WITH_MPI_CUDA "Set ON to use MPI-CUDA" OFF)
......@@ -23,14 +23,13 @@ option(WITH_CUDA_FP16 "Set ON to use FP16" ON)
set(3RDPARTY_DIR ${PROJECT_SOURCE_DIR}/../3rdparty)
# set your python environment
set(PYTHON_DIR /usr/include/python2.7) # preferred
#set(PYTHON_DIR /usr/include/python3.x) # optional, set specific version
#set(ANACONDA_DIR /xxx/anaconda) # optional, root folder of anaconda, preset for 2.7, 3.5, and 3.6
set(NUMPY_DIR /xxx/numpy) # required, root folder of numpy package
set(PYTHON_INCLUDE_DIR /usr/include/python2.7) # preferred
#set(PYTHON_INCLUDE_DIR /usr/include/python3.x) # optional, set specific version
#set(ANACONDA_ROOT_DIR /xxx/anaconda) # optional, preset for 2.7, 3.5, and 3.6
set(NUMPY_ROOT_DIR /xxx/numpy) # required
# set CUDA compiling architecture
set(CUDA_ARCH -gencode arch=compute_20,code=sm_20
-gencode arch=compute_30,code=sm_30
set(CUDA_ARCH -gencode arch=compute_30,code=sm_30
-gencode arch=compute_35,code=sm_35
-gencode arch=compute_50,code=sm_50
-gencode arch=compute_60,code=sm_60)
......@@ -81,19 +80,20 @@ include_directories(${3RDPARTY_DIR}/include)
include_directories(${3RDPARTY_DIR}/include/mpi)
include_directories(${CUDA_INCLUDE_DIRS})
include_directories(${PROJECT_SOURCE_DIR}/src)
include_directories(${NUMPY_DIR}/core/include)
include_directories(${NUMPY_DIR})
include_directories(${NUMPY_DIR}/numpy)
include_directories(${ANACONDA_DIR}/include/python2.7)
include_directories(${ANACONDA_DIR}/include/python3.5)
include_directories(${ANACONDA_DIR}/include/python3.6)
include_directories(${PYTHON_DIR})
include_directories(${ANACONDA_DIR}/include)
include_directories(${PYTHON_INCLUDE_DIR})
include_directories(${ANACONDA_ROOT_DIR}/include)
include_directories(${ANACONDA_ROOT_DIR}/include/python2.7)
include_directories(${ANACONDA_ROOT_DIR}/include/python3.5)
include_directories(${ANACONDA_ROOT_DIR}/include/python3.6)
include_directories(${NUMPY_ROOT_DIR}/core/include)
include_directories(${NUMPY_ROOT_DIR}/include)
include_directories(${NUMPY_ROOT_DIR})
# ---[ libs
set(3RDPARTY_LIBS ${3RDPARTY_DIR}/lib)
set(UNIX_CUDNN_LIBS /usr/local/cuda/lib64)
link_directories(${3RDPARTY_LIBS})
link_directories(/usr/local/cuda/lib64)
link_directories(${UNIX_CUDNN_LIBS})
# ---[ Install
set(CMAKE_INSTALL_PREFIX ${PROJECT_SOURCE_DIR} CACHE STRING "set install prefix" FORCE)
......@@ -166,6 +166,12 @@ endif()
# ---[ Warnings
# ---[ execute
set (PROTOS_DIR ${PROJECT_SOURCE_DIR}/src/protos)
message(STATUS "Generate Protobuf Files")
execute_process(COMMAND protoc -I=${PROTOS_DIR} --cpp_out=${PROTOS_DIR} ${PROTOS_DIR}/caffemodel.proto)
execute_process(COMMAND protoc -I=${PROTOS_DIR} --cpp_out=${PROTOS_DIR} ${PROTOS_DIR}/dragon.proto)
# ---[ Subdirectories
add_subdirectory(modules/python)
......
......@@ -45,7 +45,7 @@ using Set = std::unordered_set<Value> ;
#define CONCATENATE_IMPL(s1, s2) s1##s2
#define CONCATENATE(s1, s2) CONCATENATE_IMPL(s1,s2)
#define ANONYMOUS_VARIABLE(str) CONCATENATE(str, __LINE__)
#define NOT_IMPLEMENTED LOG(FATAL) << "this moudle is not implemented"
#define NOT_IMPLEMENTED LOG(FATAL) << "This module has not been implemented yet."
} // namespace dragon
......
......@@ -42,7 +42,7 @@ class CPUContext{
#else
data = malloc(nbytes);
#endif
CHECK(data) << "malloc mem: " << nbytes << " bytes failed.";
CHECK(data) << "Malloc mem: " << nbytes << " bytes failed.";
return data;
}
......
......@@ -91,13 +91,13 @@ class CUDAContext {
cudaStreamSynchronize(cudaStreamDefault);
cudaError_t error = cudaGetLastError();
CHECK_EQ(error, cudaSuccess)
<< "cuda error: " << cudaGetErrorString(error);
<< "CUDA Error: " << cudaGetErrorString(error);
}
inline static void* New(size_t nbytes) {
void* data;
cudaMalloc(&data, nbytes);
CHECK(data) << "malloc cuda mem: " << nbytes << " bytes failed.";
CHECK(data) << "Malloc cuda mem: " << nbytes << " bytes failed.";
return data;
}
......@@ -190,12 +190,12 @@ static inline cudnnHandle_t& cudnn_handle() {
#else // WITH_CUDA
class CUDAContext{
public:
CUDAContext(const DeviceOption& option) { LOG(FATAL) << "CUDA is not compilied."; }
CUDAContext(const int gpu_id = 0) { LOG(FATAL) << "CUDA is not compilied."; }
CUDAContext(const DeviceOption& option) { LOG(FATAL) << "CUDA was not compiled."; }
CUDAContext(const int gpu_id = 0) { LOG(FATAL) << "CUDA was not compiled."; }
template<class DstContext, class SrcContext>
static void Memcpy(size_t nbytes, void* dst, const void* src) {
LOG(FATAL) << "CUDA is not compilied.";
LOG(FATAL) << "CUDA was not compiled.";
}
};
#endif // WITH_CUDA
......
......@@ -61,12 +61,17 @@ class OperatorBase{
inline const Map<std::string, const Argument*>& args() { return args_; }
inline const Argument& arg(const string& name) { return *(args_[name]); }
typedef Map<string, vector<OperatorBase*> > RecomputeMap;
inline RecomputeMap& recompute_map() { return recompute_map_; }
void set_recompute_map(RecomputeMap recompute_map) { recompute_map_ = recompute_map; }
inline const OperatorDef& op_def() const { return op_def_; }
inline const string debug_string() const { return op_def_.DebugString(); }
protected:
string phase_;
Map<std::string, const Argument*> args_;
Map<string, vector<OperatorBase*> > recompute_map_;
vector<Tensor*> inputs_, outputs_;
OperatorDef op_def_;
Workspace* ws_;
......@@ -158,10 +163,10 @@ DECLARE_REGISTRY(CUDNNOperatorRegistry, OperatorBase, const OperatorDef&, Worksp
TIndex count = 1; \
for(int i = 0; i < shape.size(); i++) count *= shape[i]; \
CHECK_EQ(count, tensor.count()) \
<< "\nmodel request " << "Tensor(" << tensor.name() << ")'s " \
<< "size is " << count << "\n" \
<< "but now is " << tensor.count() << "\n" \
<< "may be feed the incorrect Tensor before ?"; \
<< "\nModel request " << "Tensor(" << tensor.name() << ")'s " \
<< "size is " << count << ", \n" \
<< "but now is " << tensor.count() << ", " \
<< "did you feed the incorrect Tensor before ?"; \
tensor.Reshape(shape); \
}
......
......@@ -23,6 +23,8 @@ class OpSchema{
bool Verify(const OperatorDef& def) const;
inline OpSchema& IgnoreVerify() { ignore_verify_ = true; return *this; }
OpSchema& Inplace(set<pair<int, int> > inplace);
std::function<bool(int, int)> CheckInplace;
inline bool AllowInplace() const { return allow_inplace_; }
......@@ -38,14 +40,13 @@ class OpSchema{
min_input_ = min_output_= 0;
max_input_ = max_output_ = std::numeric_limits<int>::max();
CheckInplace = [](int, int) { return false; };
allow_inplace_ = false;
ignore_verify_ = allow_inplace_ = false;
}
string op_type_, file_;
int line_, min_input_, max_input_;
int min_output_, max_output_;
bool allow_inplace_;
bool allow_inplace_, ignore_verify_;
};
class OpSchemaRegistry {
......
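For context, the CheckInplace hook added above is a predicate that defaults to rejecting in-place execution. A minimal standalone sketch of how Inplace(...) might install a pair-based predicate follows; the real definition is not part of this hunk, so the installed lambda is only an assumption for illustration.
#include <cstdio>
#include <functional>
#include <set>
#include <utility>

int main() {
    // Default hook, as in the hunk above: nothing may run in-place.
    std::function<bool(int, int)> CheckInplace = [](int, int) { return false; };
    // Hypothetical schema declaring that output 0 may share memory with input 0.
    std::set<std::pair<int, int>> inplace = { {0, 0} };
    CheckInplace = [inplace](int in, int out) {
        return inplace.count(std::make_pair(in, out)) > 0;
    };
    std::printf("%d %d\n", (int)CheckInplace(0, 0), (int)CheckInplace(1, 0));  // prints "1 0"
    return 0;
}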
......@@ -30,11 +30,20 @@ class Tensor {
CHECK_GT(d, 0);
new_size *= d;
}
if (size_ != new_size && own_mem_ &&
if (own_mem_) {
if (size_ != new_size &&
capacity_ < TIndex(new_size * meta_.itemsize())) {
memory_.reset();
capacity_ = 0;
}
} else {
if (ex_memory_ && TIndex(ex_memory_->nbytes()) <
TIndex(new_size * meta_.itemsize())) {
delete ex_memory_;
ex_memory_ = nullptr;
capacity_ = 0;
}
}
size_ = new_size;
}
......@@ -96,7 +105,7 @@ class Tensor {
MixedMemory* memory() const { return own_mem_ ? memory_.get() : ex_memory_; }
MixedMemory::State memory_state() const {
MixedMemory* mem = memory();
CHECK(mem) << "memory access before allowcating.";
CHECK(mem) << "Memory access before allowcating.";
return memory()->state();
}
......@@ -120,7 +129,7 @@ class Tensor {
} else if (TypeMeta::Id<Context>() == TypeMeta::Id<CUDAContext>()) {
*data_ptr = mem->mutable_cuda_data();
} else {
LOG(FATAL) << "unknown memory type access. only CPU or CUDA are supported.";
LOG(FATAL) << "Unknown memory type. Only CPU or CUDA is supported.";
}
}
}
......@@ -142,35 +151,17 @@ class Tensor {
template <class Context>
void* raw_mutable_data(const TypeMeta& meta) {
void* data_ptr;
if (own_mem_) {
mutable_data_ptr<Context>(&data_ptr);
if (meta_ == meta && data_ptr) {
return data_ptr;
} else {
if (meta_ == meta && data_ptr) return data_ptr;
meta_ = meta;
CHECK_GT(size_, 0);
memory_.reset(new MixedMemory(meta, size_* meta_.itemsize()));
mutable_data_ptr<Context>(&data_ptr); // malloc
if (meta.ctor()) meta_.ctor()(data_ptr, size_);
}
capacity_ = size_ * meta_.itemsize();
return data_ptr;
} else {
meta_ = meta;
CHECK_GT(size_, 0);
TIndex ex_capacity_ = ex_memory_->nbytes();
if (ex_capacity_ >= TIndex(size_ * meta.itemsize())) {
mutable_data_ptr<Context>(&data_ptr);
} else {
delete ex_memory_;
ex_memory_ = new MixedMemory(meta, size_* meta_.itemsize());
mutable_data_ptr<Context>(&data_ptr); // malloc
if (meta.ctor()) meta_.ctor()(data_ptr, size_);
if (own_mem_) memory_.reset(new MixedMemory(meta, size_* meta_.itemsize()));
else ex_memory_ = new MixedMemory(meta, size_* meta_.itemsize());
mutable_data_ptr<Context>(&data_ptr); // malloc memory
if (meta.ctor()) meta_.ctor()(data_ptr, size_); // call the constructor
capacity_ = size_ * meta.itemsize();
}
return data_ptr;
}
}
template <class Context>
void* raw_mutable_data() {
......@@ -181,7 +172,9 @@ class Tensor {
}
template <class Context>
const void* raw_data() const { return const_data_ptr<Context>(); }
const void* raw_data() const {
return const_data_ptr<Context>();
}
template <typename T, class Context>
T* mutable_data() {
......
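A condensed, standalone sketch of the reshape rule in the tensor.h hunks above, with types simplified and no real allocation: storage (owned or external) is dropped only when the requested byte size no longer fits the held capacity.
#include <cstdio>

struct TensorSketch {
    long long size = 0, capacity = 0, itemsize = 4;
    bool own_mem = true, has_external = false;
    long long external_nbytes = 0;

    void Reshape(long long new_size) {
        if (own_mem) {
            if (size != new_size && capacity < new_size * itemsize) {
                capacity = 0;                      // memory_.reset() in the real code
            }
        } else if (has_external && external_nbytes < new_size * itemsize) {
            has_external = false;                  // delete ex_memory_ in the real code
            capacity = 0;
        }
        size = new_size;
    }
};

int main() {
    TensorSketch t;
    t.capacity = 64;
    t.Reshape(20);                                 // 20 * 4 = 80 > 64, so capacity is dropped
    std::printf("size=%lld capacity=%lld\n", t.size, t.capacity);
    return 0;
}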
......@@ -25,10 +25,8 @@ class Workspace{
typedef Map<string, unique_ptr<GraphBase> > GraphMap;
typedef Map<string, TensorFiller> FillerMap;
typedef Map<string, string> RenameMap;
typedef Map<string, vector<OperatorBase*> > RecomputeMap;
Workspace(): root_folder_(".") { init(); }
Workspace(string root_folder) : root_folder_(root_folder) { init(); }
Workspace() { init(); }
~Workspace();
void init() {
......@@ -122,7 +120,7 @@ class Workspace{
buffer_map_[category].pop();
return GetTensor(name);
}
LOG(FATAL) << "buffers of [" << category << "] "
LOG(FATAL) << "Buffers of [" << category << "] "
<< "are not enough, add more if necessary.";
return nullptr;
}
......@@ -162,28 +160,11 @@ class Workspace{
/******************** Utility ********************/
inline const string& GetRootFolder() const { return root_folder_; }
inline void CreateRename(const string& old_tensor,
const string& new_tensor) {
rename_map_[old_tensor] = new_tensor;
}
inline void AddRecompute(const string& tensor, OperatorBase* op) {
if (!recompute_map_.count(tensor)) {
recompute_map_[tensor] = vector<OperatorBase*>();
}
recompute_map_[tensor].push_back(op);
}
inline vector<OperatorBase*> GetRecompute(const string& tensor) {
if (recompute_map_.count(tensor)) {
return recompute_map_[tensor];
} else {
return vector<OperatorBase*>();
}
}
private:
TensorMap tensor_map_;
BufferMap buffer_map_;
......@@ -191,8 +172,6 @@ class Workspace{
GraphMap graph_map_;
FillerMap filler_map_;
RenameMap rename_map_;
RecomputeMap recompute_map_;
string root_folder_;
};
} // namespace dragon
......
......@@ -17,7 +17,7 @@ class DropoutOp final : public Operator<Context> {
public:
DropoutOp(const OperatorDef& op_def, Workspace* ws)
: Operator<Context>(op_def, ws),
prob(OperatorBase::GetSingleArg<float>("prob", 0)) {
prob(OperatorBase::GetSingleArg<float>("prob", 0.5)) {
bool use_scale = OperatorBase::GetSingleArg<bool>("scale", true);
threshold = static_cast<unsigned int>(UINT_MAX * prob);
if (use_scale) scale = 1.0 / (1.0 - prob);
......@@ -38,7 +38,7 @@ class DropoutGradientOp final : public Operator<Context> {
public:
DropoutGradientOp(const OperatorDef& op_def, Workspace* ws)
: Operator<Context>(op_def, ws),
prob(OperatorBase::GetSingleArg<float>("prob", 0)) {
prob(OperatorBase::GetSingleArg<float>("prob", 0.5)) {
bool use_scale = OperatorBase::GetSingleArg<bool>("scale", true);
threshold = static_cast<unsigned int>(UINT_MAX * prob);
if (use_scale) scale = 1.0 / (1.0 - prob);
......
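The dropout hunk above changes the default "prob" argument from 0 to 0.5. A minimal sketch of the constructor math shown there, recomputed standalone: with prob = 0.5, inverted-dropout scaling becomes 1 / (1 - 0.5) = 2, and the random-mask threshold is UINT_MAX * 0.5.
#include <climits>
#include <cstdio>

int main() {
    float prob = 0.5f;                                   // new default from this commit
    unsigned int threshold = static_cast<unsigned int>(UINT_MAX * prob);
    float scale = 1.0f / (1.0f - prob);                  // only applied when the "scale" arg is true
    std::printf("threshold=%u scale=%.1f\n", threshold, scale);
    return 0;
}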
// --------------------------------------------------------
// Dragon
// Copyright(c) 2017 SeetaTech
// Written by Ting Pan
// --------------------------------------------------------
#ifndef DRAGON_OPERATORS_ACTIVATION_ELU_OP_H_
#define DRAGON_OPERATORS_ACTIVATION_ELU_OP_H_
#include "core/operator.h"
namespace dragon {
template <class Context>
class EluOp : public Operator<Context> {
public:
EluOp(const OperatorDef& op_def, Workspace* ws)
: Operator<Context>(op_def, ws),
alpha(OperatorBase::GetSingleArg<float>("alpha", 1.0)) {}
void RunOnDevice() override;
template <typename T> void RunWithType();
protected:
float alpha;
};
template <class Context>
class EluGradientOp : public Operator<Context> {
public:
EluGradientOp(const OperatorDef& op_def, Workspace* ws)
: Operator<Context>(op_def, ws),
alpha(OperatorBase::GetSingleArg<float>("alpha", 1.0)) {
DISABLE_SHARE_GRADIENT;
}
void RunOnDevice() override;
template <typename T> void RunWithType();
protected:
float alpha;
};
#ifdef WITH_CUDNN
#if CUDNN_VERSION_MIN(6, 0, 0)
template <class Context>
class CuDNNEluOp final : public EluOp<Context> {
public:
CuDNNEluOp(const OperatorDef& op_def, Workspace* ws)
: EluOp<Context>(op_def, ws) {
CUDNN_CHECK(cudnnCreateTensorDescriptor(&input_desc));
CUDNN_CHECK(cudnnCreateTensorDescriptor(&output_desc));
CUDNN_CHECK(cudnnCreateActivationDescriptor(&act_desc));
CUDNN_CHECK(cudnnSetActivationDescriptor(act_desc,
CUDNN_ACTIVATION_ELU, CUDNN_PROPAGATE_NAN, this->alpha));
}
void RunOnDevice() override;
template <typename T> void RunWithType();
protected:
cudnnTensorDescriptor_t input_desc, output_desc;
cudnnActivationDescriptor_t act_desc;
};
template <class Context>
class CuDNNEluGradientOp final : public EluGradientOp<Context> {
public:
CuDNNEluGradientOp(const OperatorDef& op_def, Workspace* ws)
: EluGradientOp<Context>(op_def, ws) {
CUDNN_CHECK(cudnnCreateTensorDescriptor(&input_desc));
CUDNN_CHECK(cudnnCreateTensorDescriptor(&output_desc));
CUDNN_CHECK(cudnnCreateActivationDescriptor(&act_desc));
CUDNN_CHECK(cudnnSetActivationDescriptor(act_desc,
CUDNN_ACTIVATION_ELU, CUDNN_PROPAGATE_NAN, this->alpha));
}
void RunOnDevice() override;
template <typename T> void RunWithType();
protected:
cudnnTensorDescriptor_t input_desc, output_desc;
cudnnActivationDescriptor_t act_desc;
};
#endif
#endif // WITH_CUDNN
} // namespace dragon
#endif // DRAGON_OPERATORS_ACTIVATION_ELU_OP_H_
\ No newline at end of file
......@@ -12,7 +12,7 @@
namespace dragon {
template <class Context>
class SigmoidOp final : public Operator<Context> {
class SigmoidOp : public Operator<Context> {
public:
USE_SIMPLE_CTOR_DTOR(SigmoidOp);
......@@ -21,7 +21,7 @@ class SigmoidOp final : public Operator<Context> {
};
template <class Context>
class SigmoidGradientOp final : public Operator<Context> {
class SigmoidGradientOp : public Operator<Context> {
public:
SigmoidGradientOp(const OperatorDef& op_def, Workspace* ws)
: Operator<Context>(op_def, ws) {
......@@ -32,6 +32,48 @@ class SigmoidGradientOp final : public Operator<Context> {
template <typename T> void RunWithType();
};
#ifdef WITH_CUDNN
template <class Context>
class CuDNNSigmoidOp final : public SigmoidOp<Context> {
public:
CuDNNSigmoidOp(const OperatorDef& op_def, Workspace* ws)
: SigmoidOp<Context>(op_def, ws) {
CUDNN_CHECK(cudnnCreateTensorDescriptor(&input_desc));
CUDNN_CHECK(cudnnCreateTensorDescriptor(&output_desc));
CUDNN_CHECK(cudnnCreateActivationDescriptor(&act_desc));
CUDNN_CHECK(cudnnSetActivationDescriptor(act_desc,
CUDNN_ACTIVATION_SIGMOID, CUDNN_PROPAGATE_NAN, 0));
}
void RunOnDevice() override;
template <typename T> void RunWithType();
protected:
cudnnTensorDescriptor_t input_desc, output_desc;
cudnnActivationDescriptor_t act_desc;
};
template <class Context>
class CuDNNSigmoidGradientOp final : public SigmoidGradientOp<Context> {
public:
CuDNNSigmoidGradientOp(const OperatorDef& op_def, Workspace* ws)
: SigmoidGradientOp<Context>(op_def, ws) {
CUDNN_CHECK(cudnnCreateTensorDescriptor(&input_desc));
CUDNN_CHECK(cudnnCreateTensorDescriptor(&output_desc));
CUDNN_CHECK(cudnnCreateActivationDescriptor(&act_desc));
CUDNN_CHECK(cudnnSetActivationDescriptor(act_desc,
CUDNN_ACTIVATION_SIGMOID, CUDNN_PROPAGATE_NAN, 0));
}
void RunOnDevice() override;
template <typename T> void RunWithType();
protected:
cudnnTensorDescriptor_t input_desc, output_desc;
cudnnActivationDescriptor_t act_desc;
};
#endif // WITH_CUDNN
} // namespace dragon
#endif // DRAGON_OPERATORS_ACTIVATION_SIGMOID_OP_HPP
\ No newline at end of file
......@@ -12,7 +12,7 @@
namespace dragon {
template <class Context>
class TanhOp final : public Operator<Context> {
class TanhOp : public Operator<Context> {
public:
USE_SIMPLE_CTOR_DTOR(TanhOp);
......@@ -21,7 +21,7 @@ class TanhOp final : public Operator<Context> {
};
template <class Context>
class TanhGradientOp final : public Operator<Context> {
class TanhGradientOp : public Operator<Context> {
public:
TanhGradientOp(const OperatorDef& op_def, Workspace* ws)
: Operator<Context>(op_def, ws) {
......@@ -32,6 +32,48 @@ class TanhGradientOp final : public Operator<Context> {
template <typename T> void RunWithType();
};
#ifdef WITH_CUDNN
template <class Context>
class CuDNNTanhOp final : public TanhOp<Context> {
public:
CuDNNTanhOp(const OperatorDef& op_def, Workspace* ws)
: TanhOp<Context>(op_def, ws) {
CUDNN_CHECK(cudnnCreateTensorDescriptor(&input_desc));
CUDNN_CHECK(cudnnCreateTensorDescriptor(&output_desc));
CUDNN_CHECK(cudnnCreateActivationDescriptor(&act_desc));
CUDNN_CHECK(cudnnSetActivationDescriptor(act_desc,
CUDNN_ACTIVATION_TANH, CUDNN_PROPAGATE_NAN, 0));
}
void RunOnDevice() override;
template <typename T> void RunWithType();
protected:
cudnnTensorDescriptor_t input_desc, output_desc;
cudnnActivationDescriptor_t act_desc;
};
template <class Context>
class CuDNNTanhGradientOp final : public TanhGradientOp<Context> {
public:
CuDNNTanhGradientOp(const OperatorDef& op_def, Workspace* ws)
: TanhGradientOp<Context>(op_def, ws) {
CUDNN_CHECK(cudnnCreateTensorDescriptor(&input_desc));
CUDNN_CHECK(cudnnCreateTensorDescriptor(&output_desc));
CUDNN_CHECK(cudnnCreateActivationDescriptor(&act_desc));
CUDNN_CHECK(cudnnSetActivationDescriptor(act_desc,
CUDNN_ACTIVATION_TANH, CUDNN_PROPAGATE_NAN, 0));
}
void RunOnDevice() override;
template <typename T> void RunWithType();
protected:
cudnnTensorDescriptor_t input_desc, output_desc;
cudnnActivationDescriptor_t act_desc;
};
#endif // WITH_CUDNN
} // namespace dragon
#endif // DRAGON_OPERATORS_ACTIVATION_TANH_OP_H_
\ No newline at end of file
......@@ -38,6 +38,33 @@ class AddGradientOp final : public Operator<Context> {
Tensor* bcast_multiplier;
};
template <class Context>
class RAddOp final : public Operator<Context> {
public:
USE_SIMPLE_CTOR_DTOR(RAddOp);
void RunOnDevice() override;
template <typename T> void EltwiseRunWithType();
template <typename T> void BroadcastRunWithType(int type);
protected:
Tensor* bcast_multiplier;
};
template <class Context>
class RAddGradientOp final : public Operator<Context> {
public:
USE_SIMPLE_CTOR_DTOR(RAddGradientOp);
void ShareGradient() override;
void RunOnDevice() override;
template <typename T> void EltwiseRunWithType();
template <typename T> void BroadcastRunWithType(int type);
protected:
Tensor* bcast_multiplier;
};
} // namespace dragon
#endif // DRAGON_OPERATORS_ARITHMETIC_ADD_OP_H_
\ No newline at end of file
......@@ -38,6 +38,33 @@ class DivGradientOp final : public Operator<Context> {
Tensor* bcast_multiplier;
};
template <class Context>
class RDivOp final : public Operator<Context> {
public:
USE_SIMPLE_CTOR_DTOR(RDivOp);
void RunOnDevice() override;
template <typename T> void EltwiseRunWithType();
template <typename T> void BroadcastRunWithType(int type);
protected:
Tensor* bcast_multiplier;
};
template <class Context>
class RDivGradientOp final : public Operator<Context> {
public:
USE_SIMPLE_CTOR_DTOR(RDivGradientOp);
void ShareGradient() override;
void RunOnDevice() override;
template <typename T> void EltwiseRunWithType();
template <typename T> void BroadcastRunWithType(int type);
protected:
Tensor* bcast_multiplier;
};
} // namespace dragon
#endif // DRAGON_OPERATORS_ARITHMETIC_DIV_OP_H_
\ No newline at end of file
......@@ -20,7 +20,7 @@ class EltwiseOp final : public Operator<Context> {
coeffs(OperatorBase::GetRepeatedArg<float>("coeffs")) {
if (coeffs.size() > 0) {
CHECK_EQ(coeffs.size(), InputSize())
<< "\nop has " << InputSize() << " inputs, "
<< "\nOp has " << InputSize() << " inputs, "
<< "but provided " << coeffs.size() << " coeffs.";
} else coeffs.resize(InputSize(), float(1));
}
......
......@@ -38,6 +38,33 @@ class MulGradientOp final : public Operator<Context> {
Tensor* bcast_multiplier;
};
template <class Context>
class RMulOp final : public Operator<Context> {
public:
USE_SIMPLE_CTOR_DTOR(RMulOp);
void RunOnDevice() override;
template <typename T> void EltwiseRunWithType();
template <typename T> void BroadcastRunWithType(int type);
protected:
Tensor* bcast_multiplier;
};
template <class Context>
class RMulGradientOp final : public Operator<Context> {
public:
USE_SIMPLE_CTOR_DTOR(RMulGradientOp);
void ShareGradient() override;
void RunOnDevice() override;
template <typename T> void EltwiseRunWithType();
template <typename T> void BroadcastRunWithType(int type);
protected:
Tensor* bcast_multiplier;
};
} // namespace dragon
#endif // DRAGON_OPERATORS_ARITHMETIC_MUL_OP_H_
\ No newline at end of file
......@@ -38,6 +38,33 @@ class SubGradientOp final : public Operator<Context> {
Tensor* bcast_multiplier;
};
template <class Context>
class RSubOp final : public Operator<Context> {
public:
USE_SIMPLE_CTOR_DTOR(RSubOp);
void RunOnDevice() override;
template <typename T> void EltwiseRunWithType();
template <typename T> void BroadcastRunWithType(int type);
protected:
Tensor* bcast_multiplier;
};
template <class Context>
class RSubGradientOp final : public Operator<Context> {
public:
USE_SIMPLE_CTOR_DTOR(RSubGradientOp);
void ShareGradient() override;
void RunOnDevice() override;
template <typename T> void EltwiseRunWithType();
template <typename T> void BroadcastRunWithType(int type);
protected:
Tensor* bcast_multiplier;
};
} // namespace dragon
#endif // DRAGON_OPERATORS_ARITHMETIC_SUB_OP_H_
\ No newline at end of file
......@@ -4,8 +4,8 @@
// Written by Ting Pan
// --------------------------------------------------------
#ifndef DRAGON_OPERATORS_UTILS_CAST_OP_H_
#define DRAGON_OPERATORS_UTILS_CAST_OP_H_
#ifndef DRAGON_OPERATORS_CAST_FLOAT2HALF_OP_H_
#define DRAGON_OPERATORS_CAST_FLOAT2HALF_OP_H_
#include "core/operator.h"
......@@ -20,4 +20,4 @@ class FloatToHalfOp final : public Operator<Context> {
} // namespace dragon
#endif // DRAGON_OPERATORS_UTILS_CAST_OP_H_
\ No newline at end of file
#endif // DRAGON_OPERATORS_CAST_FLOAT2HALF_OP_H_
\ No newline at end of file
......@@ -4,8 +4,8 @@
// Written by Ting Pan
// --------------------------------------------------------
#ifndef DRAGON_OPERATORS_UTILS_COMPARE_OP_H_
#define DRAGON_OPERATORS_UTILS_COMPARE_OP_H_
#ifndef DRAGON_OPERATORS_CONTROL_FLOW_COMPARE_OP_H_
#define DRAGON_OPERATORS_CONTROL_FLOW_COMPARE_OP_H_
#include "core/operator.h"
......@@ -27,4 +27,4 @@ class CompareOp final : public Operator<Context> {
} // namespace dragon
#endif // DRAGON_OPERATORS_UTILS_COMPARE_OP_H_
\ No newline at end of file
#endif // DRAGON_OPERATORS_CONTROL_FLOW_COMPARE_OP_H_
\ No newline at end of file
......@@ -4,8 +4,8 @@
// Written by Ting Pan
// --------------------------------------------------------
#ifndef DRAGON_OPERATORS_UTILS_COPY_OP_H_
#define DRAGON_OPERATORS_UTILS_COPY_OP_H_
#ifndef DRAGON_OPERATORS_CONTROL_FLOW_COPY_OP_H_
#define DRAGON_OPERATORS_CONTROL_FLOW_COPY_OP_H_
#include "core/operator.h"
......@@ -21,4 +21,4 @@ class CopyOp final : public Operator<Context> {
} // namespace dragon
#endif // DRAGON_OPERATORS_UTILS_COPY_OP_H_
\ No newline at end of file
#endif // DRAGON_OPERATORS_CONTROL_FLOW_COPY_OP_H_
\ No newline at end of file
......@@ -4,8 +4,8 @@
// Written by Ting Pan
// --------------------------------------------------------
#ifndef DRAGON_OPERATORS_COMMON_SCAN_OP_H_
#define DRAGON_OPERATORS_COMMON_SCAN_OP_H_
#ifndef DRAGON_OPERATORS_CONTROL_FLOW_SCAN_OP_H_
#define DRAGON_OPERATORS_CONTROL_FLOW_SCAN_OP_H_
#include "core/operator.h"
......@@ -80,4 +80,4 @@ class ScanGradientOp final: public Operator<Context> {
} // namespace dragon
#endif // DRAGON_OPERATORS_COMMON_SCAN_OP_H_
\ No newline at end of file
#endif // DRAGON_OPERATORS_CONTROL_FLOW_SCAN_OP_H_
\ No newline at end of file
......@@ -4,8 +4,8 @@
// Written by Ting Pan
// --------------------------------------------------------
#ifndef DRAGON_OPERATORS_UTILS_ACCURACY_OP_H_
#define DRAGON_OPERATORS_UTILS_ACCURACY_OP_H_
#ifndef DRAGON_OPERATORS_MISC_ACCURACY_OP_H_
#define DRAGON_OPERATORS_MISC_ACCURACY_OP_H_
#include "core/operator.h"
......@@ -16,7 +16,8 @@ class AccuracyOp final: public Operator<Context> {
public:
AccuracyOp(const OperatorDef& op_def, Workspace* ws)
: Operator<Context>(op_def, ws),
top_k(OperatorBase::GetSingleArg<int>("top_k", 1)) {
top_k(OperatorBase::GetSingleArg<int>("top_k", 1)),
axis(OperatorBase::GetSingleArg<int>("axis", 1)) {
vector<int> args = OperatorBase::GetRepeatedArg<int>("ignore_labels");
if (args.size()) {
ignore_labels.Reshape(vector<TIndex>(1, args.size()));
......@@ -29,10 +30,10 @@ class AccuracyOp final: public Operator<Context> {
template <typename T> void RunWithType();
protected:
TIndex top_k, outer_num, inner_num, classes;
TIndex top_k, axis, outer_dim, inner_dim, num_classes;
Tensor ignore_labels;
};
} // namespace dragon
#endif // DRAGON_OPERATORS_UTILS_ACCURACY_OP_H_
\ No newline at end of file
#endif // DRAGON_OPERATORS_MISC_ACCURACY_OP_H_
\ No newline at end of file
......@@ -4,8 +4,8 @@
// Written by Ting Pan
// --------------------------------------------------------
#ifndef DRAGON_OPERATORS_UTILS_GRADIENT_GENERATE_OP_H_
#define DRAGON_OPERATORS_UTILS_GRADIENT_GENERATE_OP_H_
#ifndef DRAGON_OPERATORS_MISC_GRADIENT_GENERATE_OP_H_
#define DRAGON_OPERATORS_MISC_GRADIENT_GENERATE_OP_H_
#include "core/operator.h"
......@@ -59,4 +59,4 @@ class StopGradientOp final : public Operator<Context> {
} // namespace dragon
#endif // DRAGON_OPERATORS_UTILS_GRADIENT_GENERATE_OP_H_
\ No newline at end of file
#endif // DRAGON_OPERATORS_MISC_GRADIENT_GENERATE_OP_H_
\ No newline at end of file
......@@ -4,8 +4,8 @@
// Written by Ting Pan
// --------------------------------------------------------
#ifndef DRAGON_OPERATORS_UTILS_INITIALIZE_OP_H_
#define DRAGON_OPERATORS_UTILS_INITIALIZE_OP_H_
#ifndef DRAGON_OPERATORS_MISC_INITIALIZE_OP_H_
#define DRAGON_OPERATORS_MISC_INITIALIZE_OP_H_
#include "core/operator.h"
#include "utils/filler.h"
......@@ -67,10 +67,12 @@ public:
TruncatedNormalOp(const OperatorDef& op_def, Workspace* ws)
: InitializeOp<Context>(op_def, ws) {
this->filler.set_type("truncated_normal");
this->filler.set_mean(OperatorBase::GetSingleArg<float>("mean", 0.0));
this->filler.set_std(OperatorBase::GetSingleArg<float>("std", 1.0));
this->filler.set_low(OperatorBase::GetSingleArg<float>("low", -2.0));
this->filler.set_high(OperatorBase::GetSingleArg<float>("high", 2.0));
float mu = OperatorBase::GetSingleArg<float>("mean", 0.0);
float sigma = OperatorBase::GetSingleArg<float>("std", 1.0);
this->filler.set_mean(mu);
this->filler.set_std(sigma);
this->filler.set_low(mu - 2 * sigma);
this->filler.set_high(mu + 2 * sigma);
}
};
......@@ -116,4 +118,4 @@ public:
} // namespace
#endif // DRAGON_OPERATORS_UTILS_INITIALIZE_OP_H_
#endif // DRAGON_OPERATORS_MISC_INITIALIZE_OP_H_
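The TruncatedNormalOp hunk above stops reading independent "low"/"high" arguments and instead derives the truncation bounds from the mean and standard deviation. A minimal sketch of the new bounds, using the defaults visible in the constructor:
#include <cstdio>

int main() {
    float mu = 0.0f, sigma = 1.0f;     // the defaults read from "mean" and "std"
    float low = mu - 2 * sigma;        // -2.0
    float high = mu + 2 * sigma;       //  2.0
    std::printf("low=%.1f high=%.1f\n", low, high);
    return 0;
}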
......@@ -4,8 +4,8 @@
// Written by Ting Pan
// --------------------------------------------------------
#ifndef DRAGON_OPERATORS_UTILS_MEMORY_DATA_OP_H_
#define DRAGON_OPERATORS_UTILS_MEMORY_DATA_OP_H_
#ifndef DRAGON_OPERATORS_MISC_MEMORY_DATA_OP_H_
#define DRAGON_OPERATORS_MISC_MEMORY_DATA_OP_H_
#include "core/operator.h"
......@@ -30,4 +30,4 @@ class MemoryDataOp final : public Operator<Context> {
} // namespace dragon
#endif // DRAGON_OPERATORS_UTILS_MEMORY_DATA_OP_H_
\ No newline at end of file
#endif // DRAGON_OPERATORS_MISC_MEMORY_DATA_OP_H_
\ No newline at end of file
......@@ -4,8 +4,8 @@
// Written by Ting Pan
// --------------------------------------------------------
#ifndef DRAGON_OPERATORS_UTILS_PROPOSAL_OP_H_
#define DRAGON_OPERATORS_UTILS_PROPOSAL_OP_H_
#ifndef DRAGON_OPERATORS_MISC_PROPOSAL_OP_H_
#define DRAGON_OPERATORS_MISC_PROPOSAL_OP_H_
#include "core/operator.h"
......@@ -37,4 +37,4 @@ class ProposalOp final : public Operator<Context> {
} // namespace dragon
#endif // DRAGON_OPERATORS_UTILS_COMPARE_OP_H_
\ No newline at end of file
#endif // DRAGON_OPERATORS_MISC_PROPOSAL_OP_H_
\ No newline at end of file
......@@ -4,8 +4,8 @@
// Written by Ting Pan
// --------------------------------------------------------
#ifndef DRAGON_OPERATORS_COMMON_PYTHON_OP_H_
#define DRAGON_OPERATORS_COMMON_PYTHON_OP_H_
#ifndef DRAGON_OPERATORS_MISC_PYTHON_OP_H_
#define DRAGON_OPERATORS_MISC_PYTHON_OP_H_
#include <Python.h>
......@@ -52,4 +52,4 @@ public:
} // namespace dragon
#endif // DRAGON_OPERATORS_COMMON_PYTHON_OP_H_
\ No newline at end of file
#endif // DRAGON_OPERATORS_MISC_PYTHON_OP_H_
\ No newline at end of file
......@@ -33,7 +33,7 @@ class ModelMPIBase : public Operator<Context> {
int world_root = OperatorBase::GetSingleArg<int>("root", 0);
MPI_Group_translate_ranks(world_group, 1, &world_root, group, &comm_root);
CHECK(comm_root != MPI_UNDEFINED) << "mpi root is not included in layer group.";
CHECK(comm_root != MPI_UNDEFINED) << "MPI root is not included in layer group.";
}
protected:
......
// --------------------------------------------------------
// Dragon
// Copyright(c) 2017 SeetaTech
// Written by Ting Pan
// --------------------------------------------------------
#ifndef DRAGON_OPERATORS_NDARRAY_ARANGE_OP_H_
#define DRAGON_OPERATORS_NDARRAY_ARANGE_OP_H_
#include "core/operator.h"
namespace dragon {
template <class Context>
class ArangeOp final : public Operator<Context> {
public:
ArangeOp(const OperatorDef& op_def, Workspace* ws)
: Operator<Context>(op_def, ws),
start(OperatorBase::GetSingleArg<int>("static_start", 0)),
stop(OperatorBase::GetSingleArg<int>("static_stop", -1)),
step(OperatorBase::GetSingleArg<int>("static_step", 1)),
dtype(OperatorBase::GetSingleArg<string>("dtype", "FLOAT32")) {
dynamic_start_ = OperatorBase::GetSingleArg<string>("dynamic_start", "");
dynamic_stop_ = OperatorBase::GetSingleArg<string>("dynamic_stop", "");
dynamic_step_ = OperatorBase::GetSingleArg<string>("dynamic_step", "");
}
void RunOnDevice() override;
void Reshape();
template <typename T> void RunWithType();
protected:
TIndex start, stop, step, count;
Tensor* dynamic_start, *dynamic_stop, *dynamic_step;
string dynamic_start_, dynamic_stop_, dynamic_step_;
string dtype;
};
} // namespace dragon
#endif // DRAGON_OPERATORS_NDARRAY_ARANGE_OP_H_
\ No newline at end of file
......@@ -4,8 +4,8 @@
// Written by Ting Pan
// --------------------------------------------------------
#ifndef DRAGON_OPERATORS_COMMON_ARGMAX_OP_H_
#define DRAGON_OPERATORS_COMMON_ARGMAX_OP_H_
#ifndef DRAGON_OPERATORS_NDARRAY_ARGMAX_OP_H_
#define DRAGON_OPERATORS_NDARRAY_ARGMAX_OP_H_
#include "core/operator.h"
......@@ -16,16 +16,18 @@ class ArgmaxOp final : public Operator<Context> {
public:
ArgmaxOp(const OperatorDef& op_def, Workspace* ws)
: Operator<Context>(op_def, ws),
axis(OperatorBase::GetSingleArg<int>("axis", 0)),
axis(OperatorBase::GetSingleArg<int>("axis", -1)),
keep_dims(OperatorBase::GetSingleArg<bool>("keep_dims", false)),
top_k(OperatorBase::GetSingleArg<int>("top_k", 1)) {}
void RunOnDevice() override;
template <typename T> void RunWithType();
protected:
TIndex axis, top_k, count, inner_dim;
TIndex axis, axis_dim, top_k, count, inner_dim;
bool keep_dims;
};
} // namespace dragon
#endif // DRAGON_OPERATORS_COMMON_ARGMAX_OP_H_
\ No newline at end of file
#endif // DRAGON_OPERATORS_NDARRAY_ARGMAX_OP_H_
\ No newline at end of file
// --------------------------------------------------------
// Dragon
// Copyright(c) 2017 SeetaTech
// Written by Ting Pan
// --------------------------------------------------------
#ifndef DRAGON_OPERATORS_NDARRAY_ARGMIN_OP_H_
#define DRAGON_OPERATORS_NDARRAY_ARGMIN_OP_H_
#include "core/operator.h"
namespace dragon {
template <class Context>
class ArgminOp final : public Operator<Context> {
public:
ArgminOp(const OperatorDef& op_def, Workspace* ws)
: Operator<Context>(op_def, ws),
axis(OperatorBase::GetSingleArg<int>("axis", -1)),
keep_dims(OperatorBase::GetSingleArg<bool>("keep_dims", false)),
top_k(OperatorBase::GetSingleArg<int>("top_k", 1)) {}
void RunOnDevice() override;
template <typename T> void RunWithType();
protected:
TIndex axis, axis_dim, top_k, count, inner_dim;
bool keep_dims;
};
} // namespace dragon
#endif // DRAGON_OPERATORS_NDARRAY_ARGMIN_OP_H_
\ No newline at end of file
......@@ -4,8 +4,8 @@
// Written by Ting Pan
// --------------------------------------------------------
#ifndef DRAGON_OPERATORS_COMMON_AT_OP_H_
#define DRAGON_OPERATORS_COMMON_AT_OP_H_
#ifndef DRAGON_OPERATORS_NDARRAY_AT_OP_H_
#define DRAGON_OPERATORS_NDARRAY_AT_OP_H_
#include "core/operator.h"
......@@ -44,4 +44,4 @@ class AtGradientOp final : public Operator<Context> {
} // namespace dragon
#endif // DRAGON_OPERATORS_COMMON_AT_OP_H_
\ No newline at end of file
#endif // DRAGON_OPERATORS_NDARRAY_AT_OP_H_
\ No newline at end of file
......@@ -4,8 +4,8 @@
// Written by Ting Pan
// --------------------------------------------------------
#ifndef DRAGON_OPERATORS_COMMON_CONCAT_OP_H_
#define DRAGON_OPERATORS_COMMON_CONCAT_OP_H_
#ifndef DRAGON_OPERATORS_NDARRAY_CONCAT_OP_H_
#define DRAGON_OPERATORS_NDARRAY_CONCAT_OP_H_
#include "core/operator.h"
......@@ -48,4 +48,4 @@ class ConcatGradientOp : public Operator<Context> {
} // namespace dragon
#endif // DRAGON_OPERATORS_COMMON_CONCAT_OP_H_
\ No newline at end of file
#endif // DRAGON_OPERATORS_NDARRAY_CONCAT_OP_H_
\ No newline at end of file
......@@ -4,8 +4,8 @@
// Written by Ting Pan
// --------------------------------------------------------
#ifndef DRAGON_OPERATORS_COMMON_CROP_OP_H_
#define DRAGON_OPERATORS_COMMON_CROP_OP_H_
#ifndef DRAGON_OPERATORS_NDARRAY_CROP_OP_H_
#define DRAGON_OPERATORS_NDARRAY_CROP_OP_H_
#include "core/operator.h"
......@@ -21,9 +21,9 @@ class CropOp: public Operator<Context> {
shape(OperatorBase::GetRepeatedArg<int>("shape")),
shape_like(OperatorBase::GetSingleArg<string>("shape_like", "")) {
CHECK(shape.size() * shape_like.size() == 0)
<< "\ncan not set shape and shape_like both.";
<< "\nCan not set shape and shape_like both.";
CHECK(shape.size() + shape_like.size() != 0)
<< "\nmust set shape and shape_like either.";
<< "\nMust set shape and shape_like either.";
}
void ComputeOutputShape();
......@@ -73,4 +73,4 @@ class CropGradientOp final : public Operator<Context > {
} // namespace dragon
#endif // DRAGON_OPERATORS_COMMON_CROP_OP_H_
\ No newline at end of file
#endif // DRAGON_OPERATORS_NDARRAY_CROP_OP_H_
\ No newline at end of file
......@@ -4,8 +4,8 @@
// Written by Ting Pan
// --------------------------------------------------------
#ifndef DRAGON_OPERATORS_COMMON_EXPAND_DIMS_OP_H_
#define DRAGON_OPERATORS_COMMON_EXPAND_DIMS_OP_H_
#ifndef DRAGON_OPERATORS_NDARRAY_EXPAND_DIMS_OP_H_
#define DRAGON_OPERATORS_NDARRAY_EXPAND_DIMS_OP_H_
#include "core/operator.h"
......@@ -36,4 +36,4 @@ class ExpandDimsGradientOp final : public Operator<Context> {
} // namespace dragon
#endif // DRAGON_OPERATORS_COMMON_EXPAND_DIMS_OP_H_
#endif // DRAGON_OPERATORS_NDARRAY_EXPAND_DIMS_OP_H_
......@@ -4,8 +4,8 @@
// Written by Ting Pan
// --------------------------------------------------------
#ifndef DRAGON_OPERATORS_COMMON_FLATTEN_OP_H_
#define DRAGON_OPERATORS_COMMON_FLATTEN_OP_H_
#ifndef DRAGON_OPERATORS_NDARRAY_FLATTEN_OP_H_
#define DRAGON_OPERATORS_NDARRAY_FLATTEN_OP_H_
#include "core/operator.h"
......@@ -17,12 +17,15 @@ class FlattenOp final : public Operator<Context> {
FlattenOp(const OperatorDef& op_def, Workspace* ws)
: Operator<Context>(op_def, ws),
axis(OperatorBase::GetSingleArg<int>("axis", 0)),
num_axes(OperatorBase::GetSingleArg<int>("num_axes", -1)) {}
num_axes(OperatorBase::GetSingleArg<int>("num_axes", -1)),
keep_axes(OperatorBase::GetSingleArg<int>("keep_axes", INT_MAX)) {}
void RunOnDevice() override;
void SqueezeRun();
void KeepRun();
protected:
TIndex axis, num_axes;
TIndex axis, num_axes, keep_axes;
};
template <class Context>
......@@ -37,4 +40,4 @@ class FlattenGradientOp final : public Operator<Context> {
} // namespace dragon
#endif // DRAGON_OPERATORS_COMMON_FLATTEN_OP_H_
\ No newline at end of file
#endif // DRAGON_OPERATORS_NDARRAY_FLATTEN_OP_H_
\ No newline at end of file
......@@ -4,8 +4,8 @@
// Written by Ting Pan
// --------------------------------------------------------
#ifndef DRAGON_OPERATORS_UTILS_ONE_HOT_OP_H_
#define DRAGON_OPERATORS_UTILS_ONE_HOT_OP_H_
#ifndef DRAGON_OPERATORS_NDARRAY_ONE_HOT_OP_H_
#define DRAGON_OPERATORS_NDARRAY_ONE_HOT_OP_H_
#include "core/operator.h"
......@@ -29,4 +29,4 @@ class OneHotOp final : public Operator < Context > {
} // namespace dragon
#endif // DRAGON_OPERATORS_UTILS_ONE_HOT_OP_H_
\ No newline at end of file
#endif // DRAGON_OPERATORS_NDARRAY_ONE_HOT_OP_H_
\ No newline at end of file
// --------------------------------------------------------
// Dragon
// Copyright(c) 2017 SeetaTech
// Written by Ting Pan
// --------------------------------------------------------
#ifndef DRAGON_OPERATORS_NDARRAY_RANDOM_PICK_OP_H_
#define DRAGON_OPERATORS_NDARRAY_RANDOM_PICK_OP_H_
#include "core/operator.h"
namespace dragon {
template <class Context>
class RandomPickOp : public Operator<Context> {
public:
RandomPickOp(const OperatorDef& op_def, Workspace* ws) :
Operator<Context>(op_def, ws),
axis(OperatorBase::GetSingleArg<int>("axis", 0)),
max_samples(OperatorBase::GetSingleArg<int>("max_samples", 1)) {}
void RunOnDevice() override;
template <typename T> void RunWithType();
protected:
TIndex axis, max_samples;
TIndex outer_dim, inner_dim, x_slice_dim, y_slice_dim;
vector<TIndex> output_dims;
Tensor* pick_indices;
};
template <class Context>
class RandomPickGradientOp final : public Operator<Context> {
public:
RandomPickGradientOp(const OperatorDef& op_def, Workspace* ws)
: Operator<Context>(op_def, ws),
axis(OperatorBase::GetSingleArg<int>("axis", 0)) {}
void RunOnDevice() override;
template <typename T> void RunWithType();
protected:
TIndex axis;
TIndex outer_dim, inner_dim, x_slice_dim, y_slice_dim;
Tensor* pick_indices;
};
} // namespace dragon
#endif // DRAGON_OPERATORS_NDARRAY_RANDOM_PICK_OP_H_
\ No newline at end of file
......@@ -4,8 +4,8 @@
// Written by Ting Pan
// --------------------------------------------------------
#ifndef DRAGON_OPERATORS_COMMON_REDUCE_OP_H_
#define DRAGON_OPERATORS_COMMON_REDUCE_OP_H_
#ifndef DRAGON_OPERATORS_NDARRAY_REDUCE_OP_H_
#define DRAGON_OPERATORS_NDARRAY_REDUCE_OP_H_
#include "core/operator.h"
......@@ -50,4 +50,4 @@ class ReduceGradientOp final : public Operator<Context> {
} // namespace dragon
#endif // DRAGON_OPERATORS_COMMON_REDUCE_OP_H_
\ No newline at end of file
#endif // DRAGON_OPERATORS_NDARRAY_REDUCE_OP_H_
\ No newline at end of file
// --------------------------------------------------------
// Dragon
// Copyright(c) 2017 SeetaTech
// Written by Ting Pan
// --------------------------------------------------------
#ifndef DRAGON_OPERATORS_NDARRAY_REPEAT_OP_H_
#define DRAGON_OPERATORS_NDARRAY_REPEAT_OP_H_
#include "core/operator.h"
namespace dragon {
template <class Context>
class RepeatOp : public Operator<Context> {
public:
RepeatOp(const OperatorDef& op_def, Workspace* ws)
: Operator<Context>(op_def, ws),
axis(OperatorBase::GetSingleArg<int>("axis", -1)),
repeats(OperatorBase::GetSingleArg<int>("repeats", 1)) {}
void RunOnDevice() override;
template<typename T> void RunWithType();
protected:
TIndex axis, repeats, outer_dim, dim, inner_dim;
};
template <class Context>
class RepeatGradientOp : public Operator<Context> {
public:
RepeatGradientOp(const OperatorDef& op_def, Workspace* ws)
: Operator<Context>(op_def, ws),
axis(OperatorBase::GetSingleArg<int>("axis", -1)),
repeats(OperatorBase::GetSingleArg<int>("repeats", 1)) {}
void RunOnDevice() override;
template<typename T> void RunWithType();
protected:
TIndex axis, repeats, outer_dim, dim, inner_dim;
};
} // namespace dragon
#endif // DRAGON_OPERATORS_NDARRAY_REPEAT_OP_H_
\ No newline at end of file
......@@ -4,8 +4,8 @@
// Written by Ting Pan
// --------------------------------------------------------
#ifndef DRAGON_OPERATORS_COMMON_RESHAPE_OP_H_
#define DRAGON_OPERATORS_COMMON_RESHAPE_OP_H_
#ifndef DRAGON_OPERATORS_NDARRAY_RESHAPE_OP_H_
#define DRAGON_OPERATORS_NDARRAY_RESHAPE_OP_H_
#include "core/operator.h"
......@@ -39,4 +39,4 @@ class ReshapeGradientOp final : public Operator<Context> {
} // namespace dragon
#endif // DRAGON_OPERATORS_COMMON_RESHAPE_OP_H_
\ No newline at end of file
#endif // DRAGON_OPERATORS_NDARRAY_RESHAPE_OP_H_
\ No newline at end of file
......@@ -4,8 +4,8 @@
// Written by Ting Pan
// --------------------------------------------------------
#ifndef DRAGON_OPERATORS_UTILS_SHAPE_OP_H_
#define DRAGON_OPERATORS_UTILS_SHAPE_OP_H_
#ifndef DRAGON_OPERATORS_NDARRAY_SHAPE_OP_H_
#define DRAGON_OPERATORS_NDARRAY_SHAPE_OP_H_
#include "core/operator.h"
......@@ -20,4 +20,4 @@ class ShapeOp final : public Operator<Context> {
} // namespace dragon
#endif //DRAGON_OPERATORS_UTILS_SHAPE_OP_H_
\ No newline at end of file
#endif //DRAGON_OPERATORS_NDARRAY_SHAPE_OP_H_
\ No newline at end of file
......@@ -4,8 +4,8 @@
// Written by Ting Pan
// --------------------------------------------------------
#ifndef DRAGON_OPERATORS_COMMON_SLICE_OP_H_
#define DRAGON_OPERATORS_COMMON_SLICE_OP_H_
#ifndef DRAGON_OPERATORS_NDARRAY_SLICE_OP_H_
#define DRAGON_OPERATORS_NDARRAY_SLICE_OP_H_
#include "core/operator.h"
......@@ -51,4 +51,4 @@ class SliceGradientOp final : public Operator<Context> {
} // namespace dragon
#endif // #define DRAGON_OPERATORS_COMMON_SLICE_OP_H_
\ No newline at end of file
#endif // DRAGON_OPERATORS_NDARRAY_SLICE_OP_H_
\ No newline at end of file
// --------------------------------------------------------
// Dragon
// Copyright(c) 2017 SeetaTech
// Written by Ting Pan
// --------------------------------------------------------
#ifndef DRAGON_OPERATORS_NDARRAY_STACK_OP_H_
#define DRAGON_OPERATORS_NDARRAY_STACK_OP_H_
#include "core/operator.h"
namespace dragon {
template <class Context>
class StackOp : public Operator<Context> {
public:
StackOp(const OperatorDef& op_def, Workspace* ws)
: Operator<Context>(op_def, ws),
axis(OperatorBase::GetSingleArg<int>("axis", 0)),
nin(OperatorBase::GetSingleArg<int>("num_input", 1)) {}
void RunOnDevice() override;
template <typename T> void RunWithType();
protected:
TIndex axis, nin, outer_dim, inner_dim, x_concat_dim, y_concat_dim;
TIndex x_offset, y_offset, concat_offset;
vector<TIndex> stack_dims, concat_dims;
};
template <class Context>
class StackGradientOp : public Operator<Context> {
public:
StackGradientOp(const OperatorDef& op_def, Workspace* ws)
: Operator<Context>(op_def, ws),
axis(OperatorBase::GetSingleArg<int>("axis", 0)),
nin(OperatorBase::GetSingleArg<int>("num_input", 1)) {}
void ShareGradient() override;
void RunOnDevice() override;
template <typename T> void RunWithType();
protected:
TIndex axis, nin, outer_dim, inner_dim, x_concat_dim, y_concat_dim;
TIndex x_offset, y_offset, concat_offset;
vector<TIndex> concat_dims;
};
} // namespace dragon
#endif // DRAGON_OPERATORS_NDARRAY_STACK_OP_H_
\ No newline at end of file
......@@ -4,8 +4,8 @@
// Written by Ting Pan
// --------------------------------------------------------
#ifndef DRAGON_OPERATORS_COMMON_TILE_OP_H_
#define DRAGON_OPERATORS_COMMON_TILE_OP_H_
#ifndef DRAGON_OPERATORS_NDARRAY_TILE_OP_H_
#define DRAGON_OPERATORS_NDARRAY_TILE_OP_H_
#include "core/operator.h"
......@@ -28,7 +28,7 @@ class TileOp : public Operator<Context> {
protected:
vector<int> multiples;
vector< pair<int, int> > process_axes;
TIndex axis, multiple, outer_dim, dim, inner_dim;
TIndex axis, multiple, outer_dim, ex_inner_dim;
Tensor* dest, *source;
};
......@@ -49,10 +49,10 @@ class TileGradientOp : public Operator<Context> {
protected:
vector<int> multiples;
vector< pair<int, int> > process_axes;
TIndex axis, multiple, outer_dim, dim, inner_dim;
TIndex axis, multiple, outer_dim, ex_inner_dim;
Tensor* dest, *source;
};
} // namespace dragon
#endif // DRAGON_OPERATORS_COMMON_TILE_OP_H_
\ No newline at end of file
#endif // DRAGON_OPERATORS_NDARRAY_TILE_OP_H_
\ No newline at end of file
......@@ -4,8 +4,8 @@
// Written by Ting Pan
// --------------------------------------------------------
#ifndef DRAGON_OPERATORS_COMMON_TRANSPOSE_OP_H_
#define DRAGON_OPERATORS_COMMON_TRANSPOSE_OP_H_
#ifndef DRAGON_OPERATORS_NDARRAY_TRANSPOSE_OP_H_
#define DRAGON_OPERATORS_NDARRAY_TRANSPOSE_OP_H_
#include "core/operator.h"
......@@ -16,13 +16,17 @@ class TransposeOp final: public Operator<Context> {
public:
TransposeOp(const OperatorDef& op_def, Workspace* ws)
: Operator<Context>(op_def, ws),
perm(OperatorBase::GetRepeatedArg<int>("perm")) {}
perms(OperatorBase::GetRepeatedArg<int>("perms")) {
if (perms.size() > 0) reverse_dims = false;
else reverse_dims = true;
}
void RunOnDevice() override;
template <typename T> void RunWithType();
protected:
vector<int> perm;
vector<int> perms;
bool reverse_dims;
Tensor* order, *old_steps, *new_steps;
};
......@@ -42,4 +46,4 @@ class TransposeGradientOp final : public Operator<Context> {
} // namespace dragon
#endif // DRAGON_OPERATORS_COMMON_TRANSPOSE_OP_H_
\ No newline at end of file
#endif // DRAGON_OPERATORS_NDARRAY_TRANSPOSE_OP_H_
\ No newline at end of file
// --------------------------------------------------------
// Dragon
// Copyright(c) 2017 SeetaTech
// Written by Ting Pan
// --------------------------------------------------------
#ifndef DRAGON_OPERATORS_UPDATE_ASYNC_UPDATE_OP_H_
#define DRAGON_OPERATORS_UPDATE_ASYNC_UPDATE_OP_H_
#ifdef WITH_MPI
#include "operators/update/update_op_base.h"
#include "utils/thread.h"
namespace dragon {
template <class Context>
class AsyncUpdateOp final: public UpdateOpBase<Context> {
public:
AsyncUpdateOp(const OperatorDef& op_def, Workspace* ws);
int GetDelay(int tag);
void UpdateTimestamp(int tag);
void RunOnDevice() override;
void ComputeRunWithFloat() override { /* do nothing */ }
template <typename T> void RootRunWithType();
template <typename T> void ThreadRunWithType();
protected:
string mode;
unique_ptr<Tensor> recv_buffer;
Tensor** acc_buffers;
string* tags;
TIndex update_count;
int node_id, nsync, max_recv;
Map<int, int> local_timestamp;
std::unique_ptr<std::thread> thread;
#ifdef WITH_MPI_CUDA
cudaStream_t stream;
cublasHandle_t handle;
#endif
};
} // namespace dragon
#endif // WITH_MPI
#endif // DRAGON_OPERATORS_UPDATE_ASYNC_UPDATE_OP_H_
\ No newline at end of file
// --------------------------------------------------------
// Dragon
// Copyright(c) 2017 SeetaTech
// Written by Ting Pan
// --------------------------------------------------------
#ifndef DRAGON_OPERATORS_UPDATE_COLLECTIVE_UPDATE_OP_H_
#define DRAGON_OPERATORS_UPDATE_COLLECTIVE_UPDATE_OP_H_
#include "core/operator.h"
namespace dragon {
#ifdef WITH_MPI
template <class Context>
class CollectiveUpdateOp : public Operator<Context> {
public:
CollectiveUpdateOp(const OperatorDef& op_def, Workspace* ws)
: Operator<Context>(op_def, ws),
mode(OperatorBase::GetSingleArg<string>("mode", "UNKNOWN")) {
InitMPI();
if (mode.find("NCCL") != string::npos) InitNCCL();
}
void InitMPI();
void InitNCCL();
void RunOnDevice() override;
void MPIAllReduceWithFloat();
void NCCLAllReduceWithFloat();
void MPIBcastWithFloat();
void NCCLBcastWithFloat();
protected:
int comm_size, comm_rank, comm_root;
int world_size, world_rank;
Tensor* buffer;
string mode;
MPI_Comm comm;
MPI_Group group;
#ifdef WITH_MPI_NCCL
ncclComm_t nccl_comm;
cudaStream_t stream;
#endif
};
#endif // WITH_MPI
} // namespace dragon
#endif // DRAGON_OPERATORS_UPDATE_COLLECTIVE_UPDATE_OP_H_
\ No newline at end of file
......@@ -26,8 +26,6 @@ class MovingAverageOp final : public Operator<Context> {
};
} // namespace dragon
......
......@@ -16,43 +16,21 @@ class UpdateOpBase : public Operator<Context> {
public:
UpdateOpBase(const OperatorDef& op_def, Workspace* ws)
: Operator<Context>(op_def, ws),
allow_parallel(false),
async_tag(-1),
lr_mult(OperatorBase::GetSingleArg<float>("lr_mult", 1.0)),
decay_mult(OperatorBase::GetSingleArg<float>("decay_mult", 1.0)),
domain(OperatorBase::GetSingleArg<string>("domain", "_")),
mode(OperatorBase::GetSingleArg<string>("mode", "Sync")) { InitMPI(); }
domain(OperatorBase::GetSingleArg<string>("domain", "_")) {}
float param(const string& name) const;
void InitMPI();
void RunOnDevice() override;
template <typename T> void ReduceRunWithType();
template <typename T> void PreprocessRunWithType();
virtual void ComputeRunWithFloat() = 0;
template <typename T> void UpdateRunWithType();
template <typename T> void RecvRunWithType();
protected:
float lr_mult, decay_mult;
float l2_decay, clip_thresh, scale_factor;
int comm_size, comm_rank, comm_root;
int world_size, world_rank;
bool allow_parallel;
int async_tag;
Tensor* buffer;
string domain, mode;
#ifdef WITH_MPI
MPI_Comm comm;
MPI_Group group;
#endif // WITH_MPI
#ifdef WITH_MPI_NCCL
ncclComm_t nccl_comm;
cudaStream_t stream;
#endif // WITH_MPI_NCCL
string domain;
};
} // namespace dragon
......
// --------------------------------------------------------
// Dragon
// Copyright(c) 2017 SeetaTech
// Written by Ting Pan
// --------------------------------------------------------
#ifndef DRAGON_OPERATORS_VISION_BILINEAR_RESIZE_OP_H_
#define DRAGON_OPERATORS_VISION_BILINEAR_RESIZE_OP_H_
#include "core/operator.h"
namespace dragon {
template <class Context>
class BilinearResizeOp : public Operator<Context> {
public:
BilinearResizeOp(const OperatorDef& op_def, Workspace* ws)
: Operator<Context>(op_def, ws),
static_dsize(OperatorBase::GetRepeatedArg<int>("static_dsize")),
dynamic_dsize(OperatorBase::GetRepeatedArg<string>("dynamic_dsize")),
fy(OperatorBase::GetSingleArg<float>("fy", -1.0)),
fx(OperatorBase::GetSingleArg<float>("fx", -1.0)) {}
void RunOnDevice() override;
template <typename T> void RunWithType();
protected:
vector<int> static_dsize;
vector<string> dynamic_dsize;
vector<TIndex> dims;
float h_scale, w_scale, fy, fx;
};
template <class Context>
class BilinearResizeGradientOp : public Operator<Context> {
public:
BilinearResizeGradientOp(const OperatorDef& op_def, Workspace* ws)
: Operator<Context>(op_def, ws) {}
void RunOnDevice() override;
template <typename T> void RunWithType();
};
} // namespace dragon
#endif // DRAGON_OPERATORS_VISION_BILINEAR_RESIZE_OP_H_
\ No newline at end of file
......@@ -7,7 +7,7 @@
#ifndef DRAGON_OPERATORS_VISION_DENSE_CONCAT_OP_H_
#define DRAGON_OPERATORS_VISION_DENSE_CONCAT_OP_H_
#include "operators/common/concat_op.h"
#include "operators/ndarray/concat_op.h"
namespace dragon {
......
......@@ -16,7 +16,8 @@ class NNResizeOp : public Operator<Context> {
public:
NNResizeOp(const OperatorDef& op_def, Workspace* ws)
: Operator<Context>(op_def, ws),
dsize(OperatorBase::GetRepeatedArg<int>("dsize")),
static_dsize(OperatorBase::GetRepeatedArg<int>("static_dsize")),
dynamic_dsize(OperatorBase::GetRepeatedArg<string>("dynamic_dsize")),
fy(OperatorBase::GetSingleArg<float>("fy", -1.0)),
fx(OperatorBase::GetSingleArg<float>("fx", -1.0)) {}
......@@ -24,7 +25,8 @@ class NNResizeOp : public Operator<Context> {
template <typename T> void RunWithType();
protected:
vector<int> dsize;
vector<int> static_dsize;
vector<string> dynamic_dsize;
vector<TIndex> dims;
float h_scale, w_scale, fy, fx;
};
......
......@@ -17,21 +17,23 @@ template <class Context>
class PoolingOp: public Operator <Context> {
public:
PoolingOp(const OperatorDef& op_def, Workspace* ws)
: Operator<Context>(op_def, ws) {
: Operator<Context>(op_def, ws),
mode(PoolingMode(OperatorBase::GetSingleArg<int>("mode", MAX_POOLING))),
global_pooling(OperatorBase::GetSingleArg<bool>("global_pooling", false)) {
vector<int> ks = OperatorBase::GetRepeatedArg<int>("kernel_size");
for (int i = 0; i < 2; i++)
kernel_size.push_back(i < ks.size() ? ks[i] : ks[0]);
vector<int> s = OperatorBase::GetRepeatedArg<int>("stride");
for (int i = 0; i < 2; i++)
stride.push_back(i < s.size() ? s[i] : s[0]);
vector<int> p = OperatorBase::GetRepeatedArg<int>("pad");
for (int i = 0; i < 2; i++)
for (int i = 0; i < 2; i++) {
if (global_pooling) {
kernel_size.push_back(-1);
stride.push_back(1);
pad.push_back(0);
} else {
kernel_size.push_back(i < ks.size() ? ks[i] : ks[0]);
stride.push_back(i < s.size() ? s[i] : s[0]);
pad.push_back(i < p.size() ? p[i] : p[0]);
mode = PoolingMode(OperatorBase::GetSingleArg<int>("mode", MAX_POOLING));
}
}
}
void Reshape();
......@@ -45,27 +47,30 @@ class PoolingOp: public Operator <Context> {
PoolingMode mode;
TIndex num, channels, height, width;
TIndex pool_height, pool_width;
bool global_pooling;
};
template <class Context>
class PoolingGradientOp: public Operator<Context> {
public:
PoolingGradientOp(const OperatorDef& op_def, Workspace* ws)
: Operator<Context>(op_def, ws) {
: Operator<Context>(op_def, ws),
mode(PoolingMode(OperatorBase::GetSingleArg<int>("mode", MAX_POOLING))),
global_pooling(OperatorBase::GetSingleArg<bool>("global_pooling", false)) {
vector<int> ks = OperatorBase::GetRepeatedArg<int>("kernel_size");
for (int i = 0; i < 2; i++)
kernel_size.push_back(i < ks.size() ? ks[i] : ks[0]);
vector<int> s = OperatorBase::GetRepeatedArg<int>("stride");
for (int i = 0; i < 2; i++)
stride.push_back(i < s.size() ? s[i] : s[0]);
vector<int> p = OperatorBase::GetRepeatedArg<int>("pad");
for (int i = 0; i < 2; i++)
for (int i = 0; i < 2; i++) {
if (global_pooling) {
kernel_size.push_back(-1);
stride.push_back(1);
pad.push_back(0);
} else {
kernel_size.push_back(i < ks.size() ? ks[i] : ks[0]);
stride.push_back(i < s.size() ? s[i] : s[0]);
pad.push_back(i < p.size() ? p[i] : p[0]);
mode = PoolingMode(OperatorBase::GetSingleArg<int>("mode", MAX_POOLING));
}
}
}
void Reshape();
......@@ -79,6 +84,7 @@ class PoolingGradientOp: public Operator<Context> {
PoolingMode mode;
TIndex num, channels, height, width;
TIndex pool_height, pool_width;
bool global_pooling;
};
#ifdef WITH_CUDNN
......
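The pooling hunks above add a "global_pooling" argument: when it is enabled, the spatial hyper-parameters are forced instead of being read from the args. A minimal standalone sketch of that branch, with hypothetical operator args standing in for GetRepeatedArg:
#include <cstdio>
#include <vector>

int main() {
    bool global_pooling = true;
    std::vector<int> ks = {3}, s = {2}, p = {1};   // hypothetical "kernel_size"/"stride"/"pad" args
    std::vector<int> kernel_size, stride, pad;
    for (int i = 0; i < 2; i++) {
        if (global_pooling) {
            kernel_size.push_back(-1);             // -1 marks "pool over the whole plane"
            stride.push_back(1);
            pad.push_back(0);
        } else {
            kernel_size.push_back(i < (int)ks.size() ? ks[i] : ks[0]);
            stride.push_back(i < (int)s.size() ? s[i] : s[0]);
            pad.push_back(i < (int)p.size() ? p[i] : p[0]);
        }
    }
    std::printf("kernel=%d stride=%d pad=%d\n", kernel_size[0], stride[0], pad[0]);
    return 0;
}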
......@@ -49,7 +49,7 @@ inline bool ReadProtoFromBinaryFile(const char* filename, Message* proto) {
return success;
}
inline void LoadCaffeModel(string file, string scope, Workspace* ws) {
inline void LoadCaffeModel(string file, Workspace* ws) {
NetParameter net_param;
ReadProtoFromBinaryFile(file.c_str(), &net_param);
LOG(INFO) << "Restore From Model @: " << file << "......";
......@@ -57,7 +57,7 @@ inline void LoadCaffeModel(string file, string scope, Workspace* ws) {
for (int i = 0; i < net_param.layer_size(); i++) {
const LayerParameter& layer = net_param.layer(i);
const string& layer_name = layer.name();
string prefix = scope + layer_name + "@param";
string prefix = layer_name + "@param";
for (int j = 0; j < layer.blobs_size(); j++) {
string tensor_name = prefix + dragon_cast<string, int>(j);
if (!ws->HasTensor(tensor_name))
......@@ -111,8 +111,8 @@ inline void SavaCaffeModel(string file, const vector<Tensor*>& tensors) {
}
std::fstream output(file, std::ios::out | std::ios::trunc | std::ios::binary);
CHECK(net_param.SerializeToOstream(&output));
LOG(INFO) << "save the model @: " << file << "......";
LOG(INFO) << "model format: caffemodel";
LOG(INFO) << "Save the model @: " << file << "......";
LOG(INFO) << "Model format: caffemodel";
}
} // namespace dragon
......
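The caffemodel hunk above drops the "scope" prefix, so a layer's j-th blob now maps to "<layer_name>@param<j>". A small sketch of the resulting names; std::to_string stands in here for dragon_cast<string, int>, and the layer name is hypothetical.
#include <cstdio>
#include <string>

int main() {
    std::string layer_name = "conv1";              // hypothetical layer
    std::string prefix = layer_name + "@param";
    for (int j = 0; j < 2; j++) {
        std::string tensor_name = prefix + std::to_string(j);
        std::printf("%s\n", tensor_name.c_str());  // conv1@param0, conv1@param1
    }
    return 0;
}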
......@@ -8,7 +8,9 @@
#define DRAGON_UTILS_CAST_H_
#include <cstring>
#include "core/types.h"
#include "utils/cuda_device.h"
namespace dragon {
......@@ -113,6 +115,45 @@ template<> inline float32 dragon_cast<float32, float>(float val) {
return dragon_cast<float32, float16>(t);
}
#ifdef WITH_CUDA_FP16
template<> inline half dragon_cast<half, float>(float val) {
#if CUDA_VERSION_MIN(9, 0, 0)
__half_raw fp16_raw;
fp16_raw.x = dragon_cast<float16, float>(val).x;
return half(fp16_raw);
#else
half fp16;
fp16.x = dragon_cast<float16, float>(val).x;
return fp16;
#endif
}
template<> inline half2 dragon_cast<half2, float>(float val) {
#if CUDA_VERSION_MIN(9, 0, 0)
half fp16 = dragon_cast<half, float>(val);
return half2(fp16, fp16);
#else
half2 fp32;
fp32.x = dragon_cast<float32, float>(val).x;
return fp32;
#endif
}
template<> inline half2 dragon_cast<half2, float16>(float16 val) {
#if CUDA_VERSION_MIN(9, 0, 0)
__half_raw fp16_raw;
fp16_raw.x = val.x;
return half2(half(fp16_raw), half(fp16_raw));
#else
half2 fp32;
fp32.x = dragon_cast<float32, float16>(val).x;
return fp32;
#endif
}
#endif // WITH_CUDA_FP16
} // namespace dragon
......
......@@ -15,7 +15,7 @@
#include <cuda.h>
#ifdef WITH_MPI_NCCL
#include <nccl/nccl.h>
#include <nccl.h>
#endif // WITH_MPI_NCCL
#include "core/common.h"
......@@ -25,6 +25,12 @@ namespace dragon {
static const int CUDA_NUM_THREADS = 1024;
#define MAX_GPUS 8
#define CUDA_VERSION_MIN(major, minor, patch) \
(CUDA_VERSION >= (major * 1000 + minor * 100 + patch))
#define CUDA_VERSION_MAX(major, minor, patch) \
(CUDA_VERSION < (major * 1000 + minor * 100 + patch))
#define CUDA_CHECK(condition) \
do { \
cudaError_t error = condition; \
......@@ -61,6 +67,10 @@ inline int GET_BLOCKS(const int N) {
#define CUDA_POST_KERNEL_CHECK CUDA_CHECK(cudaPeekAtLastError())
#if CUDA_VERSION_MAX(9, 0, 0)
#define __hdiv hdiv
#endif
inline int NUM_DEVICES() {
static int count = -1;
if (count < 0) {
......
......@@ -107,7 +107,7 @@ template<typename T, class Context>
void AddScalar(const int n, const float alpha, T* y);
template<typename T, class Context>
void MulScalar(const int n, const T alpha, T* y);
void MulScalar(const int n, const float alpha, T* y);
template<typename T, class Context>
void Axpy(const int n, float alpha, const T* x, T *y);
......
......@@ -14,7 +14,7 @@
namespace dragon {
#define OMP_MIN_ITERATORS_PER_CORE 256
#define OMP_MIN_ITERATORS_PER_CORE 200000
inline int GET_OMP_THREADS(const int N) {
int threads = std::max(N / OMP_MIN_ITERATORS_PER_CORE, 1);
......
......@@ -23,6 +23,9 @@ endif()
if (UNIX AND WITH_BLAS)
TARGET_LINK_LIBRARIES(${PROJECT_NAME}_python openblas)
endif()
if (UNIX AND WITH_MPI_NCCL)
TARGET_LINK_LIBRARIES(${PROJECT_NAME}_python nccl)
endif()
# ---[ link platforms
if(UNIX)
......
......@@ -96,9 +96,9 @@ class NumpyFetcher : public TensorFetcherBase {
CHECK_GT(tensor.count(), 0);
vector<npy_intp> npy_dims;
for (const auto dim : tensor.dims()) npy_dims.push_back(dim);
int numpy_type = DragonToNumpyType(tensor.meta()); // translate a Meta to a int
int numpy_type = DragonToNumpyType(tensor.meta());
if (numpy_type == -1) {
string s = "Tensor(" + tensor.name() + "): unknown type yet, really run the net?";
string s = "The data type of Tensor(" + tensor.name() + ") is unknown. Have you solved it ?";
PyErr_SetString(PyExc_RuntimeError, s.c_str());
return nullptr;
}
......@@ -134,12 +134,12 @@ class NumpyFeeder : public TensorFeederBase {
PyArrayObject* array = PyArray_GETCONTIGUOUS(original_array);
const TypeMeta& meta = NumpyTypeToDragon(PyArray_TYPE(array));
if (meta.id() == 0) {
PyErr_SetString(PyExc_TypeError, "numpy data type is not supported.");
PyErr_SetString(PyExc_TypeError, "Unsupported data type.");
return nullptr;
}
if (meta.id() != tensor->meta().id() && tensor->meta().id() != 0)
LOG(WARNING) << "feed Tensor(" << tensor->name() << ")"
<< " with different dtype from original's.";
LOG(WARNING) << "Feed Tensor(" << tensor->name() << ")"
<< " with different data type from original one.";
tensor->SetMeta(meta);
int ndim = PyArray_NDIM(array);
npy_intp* npy_dims = PyArray_DIMS(array);
......@@ -154,7 +154,7 @@ class NumpyFeeder : public TensorFeederBase {
tensor->raw_mutable_data<CUDAContext>(),
static_cast<void*>(PyArray_DATA(array)));
#else
LOG(FATAL) << "CUDA is not compilied.";
LOG(FATAL) << "CUDA was not compiled.";
#endif
} else{
CPUContext::Memcpy<CPUContext, CPUContext>(tensor->nbytes(),
......
......@@ -24,7 +24,7 @@ inline PyObject* MPIInitCC(PyObject* self, PyObject* args) {
int thread_type;
MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &thread_type);
CHECK_EQ(thread_type, MPI_THREAD_MULTIPLE)
<< "require enable <MPI_THREAD_MULTIPLE> support.";
<< "\nRequire to enable <MPI_THREAD_MULTIPLE> support.";
Py_RETURN_TRUE;
}
......@@ -72,7 +72,7 @@ inline PyObject* MPICreateGroupCC(PyObject* self, PyObject* args) {
all_ranks.insert(ranks[i]);
}
err_code = MPI_Group_incl(world_group, size, ranks, &local_group);
CHECK(err_code == MPI_SUCCESS) << "failed to create mpi group.";
CHECK(err_code == MPI_SUCCESS) << "\nFail to create mpi group.";
}
// check exclude ranks
......@@ -88,11 +88,11 @@ inline PyObject* MPICreateGroupCC(PyObject* self, PyObject* args) {
for (int i = 0; i < world_size; i++)
if (!tmp.count(i)) all_ranks.insert(i);
err_code = MPI_Group_excl(world_group, size, ranks, &local_group);
CHECK(err_code == MPI_SUCCESS) << "failed to create mpi group.";
CHECK(err_code == MPI_SUCCESS) << "Fail to create mpi group.";
}
err_code = MPI_Comm_create(MPI_COMM_WORLD, local_group, &local_comm);
CHECK(err_code == MPI_SUCCESS) << "failed to create mpi group.";
CHECK(err_code == MPI_SUCCESS) << "Fail to create mpi group.";
if (local_comm != MPI_COMM_NULL) {
int world_rank, local_size;
......@@ -120,7 +120,7 @@ inline PyObject* MPICreateGroupCC(PyObject* self, PyObject* args) {
#else // WITH_MPI
#define MPI_NOT_IMPLEMENTED \
LOG(FATAL) << "MPI is not compilied."; \
LOG(FATAL) << "MPI was not compiled."; \
Py_RETURN_TRUE
inline PyObject* MPIInitCC(PyObject* self, PyObject* args) { MPI_NOT_IMPLEMENTED; }
......
......@@ -16,21 +16,52 @@ option = {}
REGISTERED_OPERATORS = set(s for s in RegisteredOperatorsCC())
NO_GRADIENT_OPERATORS = set(s for s in NoGradientOperatorsCC())
# The current device, 'CPU' or 'CUDA'
option['device'] = 'CPU'
# The device id
option['gpu_id'] = 0
# Whether to use cuDNN if possible
option['use_cudnn'] = False
# The global random seed
option['random_seed'] = 3
# if True, disable Dragon-Memonger
# Disable the memonger if true
option['debug_mode'] = False
option['share_grads'] = False # set it by Dragon-Memonger
option['allow_mirrow_stage'] = True # default
# Set it by the memonger
option['share_grads'] = False
def EnableCPU():
"""Enable CPU mode globally.
Returns
-------
None
"""
global option
option['device'] = 'CPU'
def EnableCUDA(gpu_id=0, use_cudnn=True):
"""Enable CUDA mode globally.
Parameters
----------
gpu_id : int
The id of GPU to use.
use_cudnn : boolean
Whether to use cuDNN if available.
Returns
-------
None
"""
global option
option['device'] = 'CUDA'
option['gpu_id'] = gpu_id
......@@ -39,32 +70,99 @@ def EnableCUDA(gpu_id=0, use_cudnn=True):
# TODO(PhyscalX): please do not use @setter
# TODO(PhyscalX): seems that it can't change the global value
def SetRandomSeed(seed):
"""Set the global random seed.
Parameters
----------
seed : int
The seed to use.
Returns
-------
None
"""
global option
option['random_seed'] = seed
def GetRandomSeed():
"""Get the global random seed.
Returns
-------
int
The global random seed.
"""
global option
return option['random_seed']
def SetGPU(id):
"""Set the global id GPU.
Parameters
----------
id : int
The id of GPU to use.
Returns
-------
None
"""
global option
option['gpu_id'] = id
def GetGPU():
"""Get the global id of GPU.
Returns
-------
int
The global id of GPU.
"""
global option
return option['gpu_id']
def SetDebugMode(mode):
def SetDebugMode(enabled=True):
"""Enable Debug mode globally.
It will disable all memory sharing optimizations.
Parameters
----------
enabled : boolean
Whether to enable debug mode.
Returns
-------
None
"""
global option
option['debug_mode'] = mode
option['debug_mode'] = enabled
def SetLoggingLevel(level):
"""
set the minimum level of logging
:param level: a str of DEBUG, INFO(default), WARNING, ERROR, FATAL
"""
"""Set the minimum level of Logging.
Parameters
----------
level : str
The level, ``DEBUG``, ``INFO``, ``WARNING``, ``ERROR`` or ``FATAL``.
Notes
-----
The default level is ``INFO``.
"""
SetLogLevelCC(level)
global logger
logger.setLevel({
......
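For quick reference, a minimal usage sketch of the reworked config API shown in this hunk. It calls only the functions documented above (EnableCPU/EnableCUDA, SetRandomSeed, SetDebugMode, SetLoggingLevel), and the dragon.config import path follows the API reference later in this commit.

import dragon.config as config

config.EnableCUDA(gpu_id=0, use_cudnn=True)  # switch the global device to CUDA
config.SetRandomSeed(1337)                   # fix the global random seed
config.SetDebugMode(True)                    # disable memory sharing optimizations while debugging
config.SetLoggingLevel('INFO')               # one of DEBUG, INFO, WARNING, ERROR, FATAL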
......@@ -13,10 +13,34 @@ from dragon.__init__ import *
from .scope import GetOperatorName
class GraphGradientMaker(object):
"""
GraphGradientMaker is designed to generate gradient operators automatically.
It relies on the generating rules defined in the C++ backend.
"""
@classmethod
def CreateGradientForOp(cls, op_def, g_output):
""" parse ops from string """
g_ops, g_inputs, defaults = CreateGradientDefsCC(op_def.SerializeToString(), g_output)
def CreateGradientForOp(cls, forward_op, g_output):
"""Generate the OperatorDef for ``BackwardOp`` by ``ForwardOp``.
Parameters
----------
forward_op : dragon_pb2.OperatorDef
The OperatorDef of ``ForwardOp``.
g_output : list of str
The inputs of ``BackwardOp`` (Precomputed Grads).
Returns
-------
tuple
The OpDef, outputs and defaults of ``BackwardOp``.
References
----------
The wrapper of ``CreateGradientDefsCC``.
"""
g_ops, g_inputs, defaults = \
CreateGradientDefsCC(forward_op.SerializeToString(), g_output)
for idx, g_op in enumerate(g_ops):
new_def = pb.OperatorDef()
new_def.ParseFromString(g_op)
......@@ -24,9 +48,28 @@ class GraphGradientMaker(object):
g_ops[idx] = new_def
return g_ops, g_inputs, defaults
@classmethod
def CheckMissingGrad(cls, forward_op, inputs_to_grads, blacklist, targets):
""" check the missing grads, if True, skip this op """
"""Check if missing Grads. If True, skip this Op.
Parameters
----------
forward_op : dragon_pb2.OperatorDef
The OperatorDef of ``ForwardOp``.
inputs_to_grads : dict
The dict of <input, g_input>.
blacklist : set of str
The set of ``NoGradient`` tensors.
targets : list of str
The solving targets.
Returns
-------
tuple
The result of checking and generated filling grads.
"""
if forward_op.type in config.NO_GRADIENT_OPERATORS:
for input in forward_op.input: blacklist.add(input)
return (True, None)
......@@ -49,22 +92,44 @@ class GraphGradientMaker(object):
# check pass, even if missing some grads
return (False, gen_grads)
@classmethod
def Make(cls, ops, targets):
def Make(cls, forward_ops, targets):
"""Make ``BackwardOps`` based on ``ForwardOps``.
Parameters
----------
forward_ops : list of dragon_pb2.OperatorDef
The operators of ``ForwardOp``.
targets : list of str
The solving targets.
Returns
-------
tuple
The ``ForwardOps`` and ``BackwardOps``.
See Also
--------
`theano.function(*args, **kwargs)`_ - How to make a graph. [**Theano Style**]
"""
inputs_to_grads = {}
inputs_count = defaultdict(int)
grads_count = defaultdict(int)
all_g_ops = []
all_split_grads = set()
blacklist = set()
backward_ops = []
# PLAY for the forward
for op in ops:
if op.type in config.NO_GRADIENT_OPERATORS: continue
for input in op.input: inputs_count[input] += 1
for forward_op in forward_ops:
if forward_op.type in config.NO_GRADIENT_OPERATORS: continue
for input in forward_op.input: inputs_count[input] += 1
# PLAY for the backward
for forward_op in ops[::-1]:
for forward_op in forward_ops[::-1]:
is_skip, gen_grads = cls.CheckMissingGrad(forward_op, inputs_to_grads, blacklist, targets)
g_outputs = list(inputs_to_grads.get(name, None) for name in forward_op.output)
g_ops, g_inputs, defaults = cls.CreateGradientForOp(forward_op, g_outputs)
......@@ -81,8 +146,8 @@ class GraphGradientMaker(object):
GetOperatorName()[1], defaults=values)
if forward_op.HasField('device_option'):
gen_op.device_option.CopyFrom(forward_op.device_option)
all_g_ops.append(gen_op)
for g_op in g_ops: all_g_ops.append(g_op)
backward_ops.append(gen_op)
for g_op in g_ops: backward_ops.append(g_op)
# split & gather grads for multi-used input
for g_op in g_ops:
......@@ -107,11 +172,11 @@ class GraphGradientMaker(object):
if g_op.HasField('device_option'):
gather_op.device_option.CopyFrom(g_op.device_option)
_, gather_op.name = GetOperatorName()
all_g_ops.append(gather_op)
backward_ops.append(gather_op)
g_op.output[g_output_idx] = split_name
# done
if not is_skip:
for name, grad in zip(forward_op.input, g_inputs):
if grad != '': inputs_to_grads[name] = grad
return ops, all_g_ops
\ No newline at end of file
return forward_ops, backward_ops
\ No newline at end of file
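As orientation for the renamed arguments, a small sketch of driving the maker; it assumes forward_ops is an existing list of dragon_pb2.OperatorDef collected from the forward graph (as the docstring states) and that 'loss' is the name of a solving target.

from dragon.core.gradient_maker import GraphGradientMaker

# forward_ops: an existing list of dragon_pb2.OperatorDef (assumption, see the docstring above)
forward_ops, backward_ops = GraphGradientMaker.Make(forward_ops, targets=['loss'])
for op in backward_ops:
    print(op.type)  # gradient ops, plus the generated filling / gather ops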
......@@ -13,43 +13,162 @@ from dragon import MPIInitCC, MPIRankCC, MPISizeCC, \
_is_init = False
_snapshot_ranks = []
_parallel_groups = []
_parallel_mode = 'Sync'
_parallel_mode = 'MPI'
def init():
__all__ = [
'Init',
'Is_Init',
'Rank',
'Size',
'CreateGroup',
'Snapshot',
'AllowSnapshot',
'Parallel',
'AllowParallel',
'SetParallelMode',
'GetParallelMode',
'Finalize'
]
def _check_init():
global _is_init
if _is_init is False: Init()
def Init():
"""Init the MPI env.
Returns
-------
None
Notes
-----
This function can only be called once.
References
----------
The wrapper of ``MPIInitCC``
"""
MPIInitCC()
global _is_init
global _snapshot_ranks
_is_init = True
_snapshot_ranks = [i for i in xrange(size())]
_snapshot_ranks = [i for i in xrange(Size())]
def check_init():
global _is_init
if _is_init is False: init()
def is_init():
def Is_Init():
"""Whether the MPI env has initialized.
Returns
-------
boolean
"""
return _is_init
def rank():
check_init()
def Rank():
"""The world rank of current MPI node.
Returns
-------
int
The world rank.
References
----------
The wrapper of ``MPIRankCC``.
"""
_check_init()
return MPIRankCC()
def size():
check_init()
def Size():
"""The world size of current MPI env.
Returns
-------
int
The world size.
References
----------
The wrapper of ``MPISizeCC``.
"""
_check_init()
return MPISizeCC()
def group(root=0, incl=[], excl=[]):
check_init()
def CreateGroup(root=0, incl=[], excl=[]):
"""Construct a ``MPIGroup`` with specific members.
Parameters
----------
root : int
The root of this group.
incl : list
The include nodes.
excl: list
The exclude nodes.
Returns
-------
tuple
The local common and group id.
References
----------
The wrapper of ``MPICreateGroupCC``.
"""
_check_init()
comm, group = MPICreateGroupCC(root, incl, excl)
return np.int64(comm), np.int64(group)
def snapshot(incl):
check_init()
def Snapshot(incl):
"""Set the specific MPI nodes to snapshot.
The exclude nodes will not snapshot through `workspace.Snapshot(*args, **kwargs)`_.
Parameters
----------
incl : int or list
Returns
-------
None
"""
_check_init()
if not isinstance(incl, list): incl = [incl]
global _snapshot_ranks
_snapshot_ranks = incl
def parallel(conf):
check_init()
def Parallel(conf):
"""Set the specific MPI nodes for data parallelism.
Parameters
----------
conf : list
The list of configurations. Each configuration should also be a list.
Returns
-------
None
Examples
--------
>>> mpi.Parallel([0, 1]) # rank(0, 1) will be placed into one parallel group.
>>> mpi.Parallel([[0, 1], [2, 3]]) # rank(0, 1) and rank(2, 3) will be placed into two parallel groups.
"""
_check_init()
if not isinstance(conf[0], list): conf = [conf]
for ele in conf:
if not isinstance(ele, list):
......@@ -57,28 +176,81 @@ def parallel(conf):
global _parallel_groups
_parallel_groups = conf
def allow_snapshot():
def AllowSnapshot():
"""Whether this node can snapshot.
Returns
-------
boolean
"""
global _snapshot_ranks
return rank() in _snapshot_ranks
return Rank() in _snapshot_ranks
def allow_parallel():
def AllowParallel():
"""Whether this node was set for data parallelism.
Returns
-------
boolean
"""
global _parallel_groups
world_rank = rank()
world_rank = Rank()
for idx, g in enumerate(_parallel_groups):
if world_rank in g: return idx, g
return -1, []
def set_parallel_mode(mode):
assert mode == 'Sync' or \
mode == 'Async' \
or mode == 'Async_No_Lock'
def SetParallelMode(mode):
"""Set the mode of data parallelism.
Parameters
----------
mode : str
The mode, ``MPI``, ``NCCL`` or ``MIXED``.
Returns
-------
None
Notes
-----
The default mode is ``MPI``.
"""
assert mode == 'MPI' or \
mode == 'NCCL' \
or mode == 'MIXED'
global _parallel_mode
_parallel_mode = mode
def get_parallel_mode():
def GetParallelMode():
"""Get the current mode of data parallelism.
Returns
-------
str
The mode, ``MPI``, ``NCCL`` or ``MIXED``.
"""
global _parallel_mode
return _parallel_mode
def finalize():
check_init()
def Finalize():
"""Finalize the MPI env.
Returns
-------
None
Notes
-----
This function should be called to close the initialized MPI env.
"""
_check_init()
MPIFinalizeCC()
\ No newline at end of file
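A usage sketch of the renamed MPI helpers, assembled only from the functions defined above; the ranks and group layout are illustrative.

import dragon.core.mpi as mpi

mpi.Init()                                          # may only be called once
rank, size = mpi.Rank(), mpi.Size()                 # world rank / world size
comm, group = mpi.CreateGroup(root=0, incl=[0, 1])  # local comm and group id
mpi.Parallel([[0, 1], [2, 3]])                      # two data-parallel groups
mpi.SetParallelMode('NCCL')                         # 'MPI', 'NCCL' or 'MIXED'
mpi.Snapshot(0)                                     # only rank 0 will snapshot
mpi.Finalize()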
......@@ -11,34 +11,100 @@ PHASE_SCOPE = ''
DEVICE_SCOPE = ''
ENGINE_SCOPE = ''
SEPARATOR = '/'
_CURRENT_OP_IDX = 0
_SCOPE_TENSOR_IDX = defaultdict(int)
__all__ = [
'GetTensorIdx',
'GetTensorName',
'GetOperatorIdx',
'GetOperatorName',
'TensorScope',
'PhaseScope',
'DeviceScope'
]
def GetOperatorIdx():
"""Get the available operator index.
Returns
-------
int
The operator index.
"""
global _CURRENT_OP_IDX
_CURRENT_OP_IDX = _CURRENT_OP_IDX + 1
return _CURRENT_OP_IDX - 1
def GetTensorIdx():
"""Get the available tensor index.
Returns
-------
int
The tensor index.
"""
global _SCOPE_TENSOR_IDX
_SCOPE_TENSOR_IDX[TENSOR_SCOPE] += 1
return _SCOPE_TENSOR_IDX[TENSOR_SCOPE] - 1
def GetOperatorName(name=None):
"""Get the available operator name.
Parameters
----------
name : str
The optional name to use.
Returns
-------
str
The operator name.
"""
op_idx = GetOperatorIdx()
if name is None:
return op_idx, 'Op_' + str(op_idx)
else: return op_idx, name
def GetTensorName():
"""Get the available tensor name.
Returns
-------
str
The tensor name.
"""
return 'Tensor_' + str(GetTensorIdx())
class TensorScope(object):
SEPARATOR = '/'
"""TensorScope is the basic variable scope.
Examples
--------
>>> with TensorScope('conv1'): a = Tensor('weights')
>>> a.name
conv1/weights
>>> import dragon
>>> with dragon.name_scope('conv1'): a = Tensor('weights')
>>> a.name
conv1/weights
"""
def __init__(self, prefix):
assert isinstance(prefix, type('str')), \
"TensorScope takes in a string as its argument."
self.prefix = prefix + TensorScope.SEPARATOR
self.prefix = prefix + SEPARATOR
def __enter__(self):
global TENSOR_SCOPE
......@@ -49,7 +115,20 @@ class TensorScope(object):
assert TENSOR_SCOPE.endswith(self.prefix)
TENSOR_SCOPE = TENSOR_SCOPE[:-len(self.prefix)]
class PhaseScope(object):
"""PhaseScope is a auxiliary to assign the specific phase.
Examples
--------
>>> import dragon.vm.theano as theano
>>> a = ops.RandomUniform([2, 3])
>>> with PhaseScope(phase='train'): f = theano.function(outputs=a)
>>> import dragon
>>> with dragon.phase_scope(phase='test'): f = theano.function(outputs=a)
"""
def __init__(self, phase):
assert isinstance(phase, type('str')), \
"PhaseScope takes in a string as its argument."
......@@ -64,7 +143,18 @@ class PhaseScope(object):
assert PHASE_SCOPE == self.phase
PHASE_SCOPE = ''
class DeviceScope(object):
"""DeviceScope is a auxiliary to assign the specific device.
Examples
--------
>>> with DeviceScope(device='cpu'): a = ops.RandomUniform([2, 3])
>>> import dragon
>>> with dragon.device_scope(device='gpu', id=0, use_cudnn=True): a = ops.RandomUniform([2, 3])
"""
def __init__(self, device, id=0, use_cudnn=True):
self.device = device.lower()
self.engine = 'CUDNN' if use_cudnn else 'DRAGON'
......@@ -77,7 +167,6 @@ class DeviceScope(object):
DEVICE_SCOPE = '/' + self.device + ':' + str(self.id)
ENGINE_SCOPE = self.engine
def __exit__(self, type, value, traceback):
global DEVICE_SCOPE, ENGINE_SCOPE
DEVICE_SCOPE = ''
......
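The three scopes compose naturally; a short sketch reusing only the forms quoted in the docstrings above (including the scoped name 'conv1/weights' produced by TensorScope).

from dragon.core.tensor import Tensor
from dragon.core.scope import TensorScope, PhaseScope, DeviceScope

with TensorScope('conv1'):                          # names become 'conv1/...'
    with DeviceScope('gpu', id=0, use_cudnn=True):
        w = Tensor('weights')                       # w.name == 'conv1/weights'

with PhaseScope('train'):
    pass                                            # graphs compiled here are bound to the 'train' phase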
# Makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
PAPER =
BUILDDIR = _build
# User-friendly check for sphinx-build
ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
endif
# Internal variables.
PAPEROPT_a4 = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
# the i18n builder cannot share the environment and doctrees with the others
I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext
help:
@echo "Please use \`make <target>' where <target> is one of"
@echo " html to make standalone HTML files"
@echo " dirhtml to make HTML files named index.html in directories"
@echo " singlehtml to make a single large HTML file"
@echo " pickle to make pickle files"
@echo " json to make JSON files"
@echo " htmlhelp to make HTML files and a HTML help project"
@echo " qthelp to make HTML files and a qthelp project"
@echo " applehelp to make an Apple Help Book"
@echo " devhelp to make HTML files and a Devhelp project"
@echo " epub to make an epub"
@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
@echo " latexpdf to make LaTeX files and run them through pdflatex"
@echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
@echo " text to make text files"
@echo " man to make manual pages"
@echo " texinfo to make Texinfo files"
@echo " info to make Texinfo files and run them through makeinfo"
@echo " gettext to make PO message catalogs"
@echo " changes to make an overview of all changed/added/deprecated items"
@echo " xml to make Docutils-native XML files"
@echo " pseudoxml to make pseudoxml-XML files for display purposes"
@echo " linkcheck to check all external links for integrity"
@echo " doctest to run all doctests embedded in the documentation (if enabled)"
@echo " coverage to run coverage check of the documentation (if enabled)"
clean:
rm -rf $(BUILDDIR)/*
html:
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
dirhtml:
$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
singlehtml:
$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
@echo
@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
pickle:
$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
@echo
@echo "Build finished; now you can process the pickle files."
json:
$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
@echo
@echo "Build finished; now you can process the JSON files."
htmlhelp:
$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
@echo
@echo "Build finished; now you can run HTML Help Workshop with the" \
".hhp project file in $(BUILDDIR)/htmlhelp."
qthelp:
$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
@echo
@echo "Build finished; now you can run "qcollectiongenerator" with the" \
".qhcp project file in $(BUILDDIR)/qthelp, like this:"
@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/Dragon.qhcp"
@echo "To view the help file:"
@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/Dragon.qhc"
applehelp:
$(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp
@echo
@echo "Build finished. The help book is in $(BUILDDIR)/applehelp."
@echo "N.B. You won't be able to view it unless you put it in" \
"~/Library/Documentation/Help or install it in your application" \
"bundle."
devhelp:
$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
@echo
@echo "Build finished."
@echo "To view the help file:"
@echo "# mkdir -p $$HOME/.local/share/devhelp/Dragon"
@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/Dragon"
@echo "# devhelp"
epub:
$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
@echo
@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
latex:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo
@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
@echo "Run \`make' in that directory to run these through (pdf)latex" \
"(use \`make latexpdf' here to do that automatically)."
latexpdf:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo "Running LaTeX files through pdflatex..."
$(MAKE) -C $(BUILDDIR)/latex all-pdf
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
latexpdfja:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo "Running LaTeX files through platex and dvipdfmx..."
$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
text:
$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
@echo
@echo "Build finished. The text files are in $(BUILDDIR)/text."
man:
$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
@echo
@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
texinfo:
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
@echo
@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
@echo "Run \`make' in that directory to run these through makeinfo" \
"(use \`make info' here to do that automatically)."
info:
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
@echo "Running Texinfo files through makeinfo..."
make -C $(BUILDDIR)/texinfo info
@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
gettext:
$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
@echo
@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
changes:
$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
@echo
@echo "The overview file is in $(BUILDDIR)/changes."
linkcheck:
$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
@echo
@echo "Link check complete; look for any errors in the above output " \
"or in $(BUILDDIR)/linkcheck/output.txt."
doctest:
$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
@echo "Testing of doctests in the sources finished, look at the " \
"results in $(BUILDDIR)/doctest/output.txt."
coverage:
$(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage
@echo "Testing of coverage in the sources finished, look at the " \
"results in $(BUILDDIR)/coverage/python.txt."
xml:
$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
@echo
@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
pseudoxml:
$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
@echo
@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
"""Sphinx extension provide a new directive *mathmacro*.
This extension has to be added after the other math extension since it
redefined the math directive and the math role. For example, like this
(in the conf.py file)::
extensions = [
'sphinx.ext.autodoc', 'sphinx.ext.doctest',
'sphinx.ext.mathjax',
'sphinx.ext.viewcode', 'sphinx.ext.autosummary',
'numpydoc',
'mathmacro']
"""
from __future__ import print_function
import re
from docutils.parsers.rst.directives.misc import Replace
from sphinx.ext.mathbase import MathDirective
from sphinx.ext.mathbase import math_role
def multiple_replacer(replace_dict):
"""Return a function replacing doing multiple replacements.
The produced function replace `replace_dict.keys()` by
`replace_dict.values`, respectively.
"""
def replacement_function(match):
s = match.group(0)
end = s[-1]
if re.match(r'[\W_]', end):
return replace_dict[s[:-1]]+end
else:
return replace_dict[s]
pattern = "|".join([re.escape(k)+r'[\W_]'
for k in replace_dict.keys()])
pattern = re.compile(pattern, re.M)
return lambda string: pattern.sub(replacement_function, string)
def multiple_replace(string, replace_dict):
mreplace = multiple_replacer(replace_dict)
return mreplace(string)
class MathMacro(Replace):
"""Directive defining a math macro."""
def run(self):
if not hasattr(self.state.document, 'math_macros'):
self.state.document.math_macros = {}
latex_key = '\\'+self.state.parent.rawsource.split('|')[1]
self.state.document.math_macros[latex_key] = ''.join(self.content)
self.content[0] = ':math:`'+self.content[0]
self.content[-1] = self.content[-1]+'`'
return super(MathMacro, self).run()
class NewMathDirective(MathDirective):
"""New math block directive parsing the latex code."""
def run(self):
try:
math_macros = self.state.document.math_macros
except AttributeError:
pass
else:
if math_macros:
multiple_replace = multiple_replacer(math_macros)
for i, c in enumerate(self.content):
self.content[i] = multiple_replace(c)
for i, a in enumerate(self.arguments):
self.arguments[i] = multiple_replace(a)
return super(NewMathDirective, self).run()
def new_math_role(role, rawtext, text, lineno, inliner,
options={}, content=[]):
"""New math role parsing the latex code."""
try:
math_macros = inliner.document.math_macros
except AttributeError:
pass
else:
if math_macros:
rawtext = multiple_replace(rawtext, math_macros)
text = rawtext.split('`')[1]
return math_role(role, rawtext, text, lineno, inliner,
options=options, content=content)
def setup(app):
app.add_role('math', new_math_role)
app.add_directive('math', NewMathDirective)
app.add_directive('mathmacro', MathMacro)
\ No newline at end of file
{% extends "basic/layout.html" %}
<!----------------------- Defines ----------------------->
{% if theme_bootstrap_version == "3" %}
{% set bootstrap_version, navbar_version = "3.3.7", "" %}
{% set bs_span_prefix = "col-md-" %}
{% else %}
{% set bootstrap_version, navbar_version = "2.3.2", "-2" %}
{% set bs_span_prefix = "span" %}
{% endif %}
{%- set render_sidebar = (not embedded) and (not theme_nosidebar|tobool) and sidebars %}
{%- set bs_content_width = render_sidebar and "9" or "12"%}
<!----------------------- CSS/JS ----------------------->
{% set css_files = css_files + [
'_static/css/dragon.css',
'_static/fonts/font-awesome/css/font-awesome.min.css',
'_static/fonts/lato/css/latofonts.css'
]
%}
{% set script_files = script_files + [
'_static/js/jquery-1.11.0.min.js',
'_static/js/jquery-fix.js',
'_static/bootstrap-' + bootstrap_version + '/js/bootstrap.min.js',
'_static/js/bootstrap-sphinx.js',
]
%}
{%- if render_sidebar %}
{% set script_files = script_files + ['_static/js/sidebar.js']%}
{%- endif %}
<!----------------------- Macros ----------------------->
{%- macro bsidebar() %}
{%- if render_sidebar %}
<div class="sphinxsidebar leftsidebar" role="navigation" aria-label="main navigation">
<div class="sphinxsidebarwrapper">
{%- block sidebartoc %}
{%- include "localtoc.html" %}
{%- endblock %}
</div>
</div>
{%- endif %}
{%- endmacro %}
<!----------------------- Blocks ----------------------->
{%- block doctype -%}
<!DOCTYPE html>
{%- endblock %}
{%- block extrahead %}
<meta charset='utf-8'>
<meta http-equiv='X-UA-Compatible' content='IE=edge,chrome=1'>
<meta name='viewport' content='width=device-width, initial-scale=1.0, maximum-scale=1'>
<meta name="apple-mobile-web-app-capable" content="yes">
{% endblock %}
{# Silence the sidebar's, relbar's #}
{% block header %}{% endblock %}
{% block relbar1 %}{% endblock %}
{% block relbar2 %}{% endblock %}
{% block sidebarsourcelink %}{% endblock %}
{% block content %}
{% include "navbar.html" %}
<div class="container doc-container">
<div class="row">
{% block sidebar1 %}{{ bsidebar() }}{% endblock %}
<div class="content">
{% block body %}{% endblock %}
</div>
</div>
</div>
{% endblock %}
{{ toctree(maxdepth=theme_globaltoc_depth|toint, collapse=True,includehidden=theme_globaltoc_includehidden|tobool) }}
\ No newline at end of file
{# Import the theme's layout. #}
<div id="navbar" class="{{ theme_navbar_class }} navbar-default {% if theme_navbar_fixed_top|tobool -%} navbar-fixed-top{%- endif -%}">
<div class="container">
<div class="navbar-header">
<button type="button" class="navbar-toggle" data-toggle="collapse" data-target=".nav-collapse">
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
</button>
<a class="navbar-brand" href="{{ pathto("../../index") }}">
{%- block sidebarlogo %}
{%- if logo %}<span><img src="{{ pathto('_static/images/' + logo, 1) }}"></span>{%- endif %}
{%- endblock %}
{% if theme_navbar_title -%}{{ theme_navbar_title|e }}{%- else -%}{{ project|e }}{%- endif -%}
</a>
</div>
<div class="collapse navbar-collapse nav-collapse">
<ul class="nav navbar-nav">
<!-- Install -->
<li><a href="{{ pathto("../../helper/install") }}">Install</a></li>
<!-- Github -->
<li><a href="https://github.com/neopenx/Dragon">Github</a></li>
<!-- API -->
<li class="dropdown globaltoc-container">
<a role="button" id="dLabelGlobalToc" data-toggle="dropdown" data-target="#" href="#" aria-expanded="true">API<b class="caret"></b></a>
<ul class="dropdown-menu globaltoc" role="menu" aria-labelledby="dLabelGlobalToc">
<li class="toctree-l1 dropdown-submenu"><a class="reference internal" href="{{ pathto(master_doc) }}" tabindex="-1">Python</a><li>
</ul>
</li>
</ul>
{% block navbarsearch %}
{% include "navbarsearchbox.html" %}
{% endblock %}
</div>
</div>
</div>
{%- extends "layout.html" %}
{% if theme_bootstrap_version == "3" %}
{% set bootstrap_version, navbar_version = "3.3.7", "" %}
{% set bs_span_prefix = "col-md-" %}
{% else %}
{% set bootstrap_version, navbar_version = "2.3.2", "-2" %}
{% set bs_span_prefix = "span" %}
{% endif %}
{%- set render_sidebar = False %}
{% set title = _('Search') %}
{% set script_files = script_files + ['_static/searchtools.js'] %}
{% block extrahead %}
<script type="text/javascript" xmlns="http://www.w3.org/1999/html">
jQuery(function () { Search.loadIndex("{{ pathto('searchindex.js', 1) }}"); });
</script>
{# this is used when loading the search index using $.ajax fails,
such as on Chrome for documents on localhost #}
<script type="text/javascript" id="searchindexloader"></script>
{{ super() }}
{% endblock %}
{% block content %}
{% include "navbar.html" %}
<div class="container doc-container">
<div class="col-lg-2"></div>
<div class="col-lg-8">
{% block sidebar1 %}{{ bsidebar() }}{% endblock %}
<div class="content" style="width: 100%">
<h1 id="search-documentation">{{ _('Search') }}</h1>
<div id="fallback" class="admonition warning">
<script type="text/javascript">$('#fallback').hide();</script>
<p>
{% trans %}Please activate JavaScript to enable the search
functionality.{% endtrans %}
</p>
</div>
<p>
From here you can search these documents. Enter your search words into the box below and click <strong>Search</strong>.
</p>
{% if theme_bootstrap_version == "3" %}
<form class="form-inline" action="" method="get">
<div class="form-group">
<input type="text" class="form-control" name="q" value="" />
</div>
<input type="submit" class="btn btn-default" value="Search" />
<span id="search-progress" style="padding-left: 10px"></span>
</form>
{% else %}
<form class="form-search">
<input type="text" class="input-medium search-query" name="q" value="" />
<input type="submit" class="btn btn-default" value="{{ _('search') }}" />
<span id="search-progress" style="padding-left: 10px"></span>
</form>
{% endif %}
{% if search_performed %}
<h2>{{ _('Search Results') }}</h2>
{% if not search_results %}
<p>{{ _('Your search did not match any documents. Please make sure that all words are spelled correctly and that you\'ve selected enough categories.') }}</p>
{% endif %}
{% endif %}
<div id="search-results">
{% if search_results %}
<ul>
{% for href, caption, context in search_results %}
<li>
<a href="{{ pathto(item.href) }}">{{ caption }}</a>
<div class="context">{{ context|e }}</div>
</li>
{% endfor %}
</ul>
{% endif %}
</div>
</div>
</div>
<div class="col-lg-2"></div>
</div>
{% endblock %}
# --------------------------------------------------------
# Dragon
# Copyright(c) 2017 SeetaTech
# Written by Ting Pan
# --------------------------------------------------------
import sys
import os
import sphinx_bootstrap_theme
# basic
html_static_path = ['_static']
templates_path = ['_templates']
exclude_patterns = ['_build']
source_suffix = '.rst'
master_doc = 'index'
pygments_style = 'sphinx'
todo_include_todos = True
# extensions
sys.path.insert(0, os.path.abspath('_extensions'))
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.mathjax',
'sphinx.ext.napoleon',
#'sphinx.ext.viewcode',
'mathmacro',
]
# project
project = ''
copyright = '2017, Ting Pan'
author = 'Ting Pan'
html_logo = "dragon.png"
html_title = ""
html_short_title = ""
html_favicon = 'images/favicon.png'
version = ''
release = ''
language = None
# theme
html_theme = 'bootstrap'
html_theme_path = sphinx_bootstrap_theme.get_html_theme_path()
html_show_sourcelink = False
html_show_sphinx = False
html_show_copyright = False
html_theme_options = {
'globaltoc_depth': -1,
'navbar_class': "navbar navbar-inverse",
'navbar_fixed_top': "true",
'bootswatch_theme': "yeti",
}
html_sidebars = {'index': ['localtoc.html'],
'install': ['localtoc.html'],
'contents/**': ['localtoc.html']}
# overloads
def setup(app):
app.config.values['autodoc_member_order'] = ('bysource', True)
\ No newline at end of file
====================
:mod:`dragon.config`
====================
.. toctree::
:hidden:
Quick Shortcut
--------------
==================== =============================================================================
List Brief
==================== =============================================================================
`EnableCPU`_ Enable CPU mode globally.
`EnableCUDA`_ Enable CUDA mode globally.
`SetRandomSeed`_ Set the global random seed.
`GetRandomSeed`_ Get the global random seed.
`SetGPU`_ Set the global id of GPU.
`GetGPU`_ Get the global id of GPU.
`SetDebugMode`_ Enable Debug mode globally.
`SetLoggingLevel`_ Set the minimum level of Logging.
==================== =============================================================================
API Reference
-------------
.. automodule:: dragon.config
:members:
.. _EnableCPU: #dragon.config.EnableCPU
.. _EnableCUDA: #dragon.config.EnableCUDA
.. _SetRandomSeed: #dragon.config.SetRandomSeed
.. _GetRandomSeed: #dragon.config.GetRandomSeed
.. _SetGPU: #dragon.config.SetGPU
.. _GetGPU: #dragon.config.GetGPU
.. _SetDebugMode: #dragon.config.SetDebugMode
.. _SetLoggingLevel: #dragon.config.SetLoggingLevel
\ No newline at end of file
==================
:mod:`dragon.core`
==================
Data Structure
--------------
.. toctree::
:hidden:
core/tensor
core/scope
============================== =======================================================================
List Brief
============================== =======================================================================
`dragon.core.scope`_ The Scope and Namespace.
`dragon.core.tensor`_ The basic structure of VM.
============================== =======================================================================
C++ Binding Wrapper
-------------------
.. toctree::
:hidden:
core/workspace
core/mpi
core/gradient_maker
============================== =======================================================================
List Brief
============================== =======================================================================
`dragon.core.workspace`_ The interfaces of Workspace, mostly are the wrappers of C++.
`dragon.core.gradient_maker`_ The generator of GradientOps.
`dragon.core.mpi`_ The MPI utilities.
============================== =======================================================================
.. _dragon.core.mpi: core/mpi.html
.. _dragon.core.scope: core/scope.html
.. _dragon.core.tensor: core/tensor.html
.. _dragon.core.workspace: core/workspace.html
.. _dragon.core.gradient_maker: core/gradient_maker.html
\ No newline at end of file
====================
:mod:`GradientMaker`
====================
.. toctree::
:hidden:
.. currentmodule:: dragon.core.gradient_maker
.. autoclass:: GraphGradientMaker
:members:
.. _theano.function(*args, **kwargs): ../vm/theano/compile.html#dragon.vm.theano.compile.function.function
\ No newline at end of file
==========
:mod:`MPI`
==========
.. toctree::
:hidden:
Basic
-----
============================== =============================================================================
List Brief
============================== =============================================================================
`Init`_ Init the MPI env.
`Is_Init`_ Whether the MPI env has initialized.
`Rank`_ The world rank of current MPI node.
`Size`_ The world size of current MPI env.
`CreateGroup`_ Construct an MPIGroup with specific members.
`Finalize`_ Finalize the MPI env.
============================== =============================================================================
Parallelism
-----------
============================== =============================================================================
List Brief
============================== =============================================================================
`Snapshot`_ Set the specific MPI nodes to snapshot.
`Parallel`_ Set the specific MPI nodes for data parallelism.
`AllowSnapshot`_ Whether this node can snapshot.
`AllowParallel`_ Whether this node was set for data parallelism.
`SetParallelMode`_ Set the mode of data parallelism.
`GetParallelMode`_ Get the current mode of data parallelism.
============================== =============================================================================
.. automodule:: dragon.core.mpi
:members:
.. _Init: #dragon.core.mpi.Init
.. _Is_Init: #dragon.core.mpi.Is_Init
.. _Rank: #dragon.core.mpi.Rank
.. _Size: #dragon.core.mpi.Size
.. _CreateGroup: #dragon.core.mpi.CreateGroup
.. _Finalize: #dragon.core.mpi.Finalize
.. _Snapshot: #dragon.core.mpi.Snapshot
.. _Parallel: #dragon.core.mpi.Parallel
.. _AllowSnapshot: #dragon.core.mpi.AllowSnapshot
.. _AllowParallel: #dragon.core.mpi.AllowParallel
.. _SetParallelMode: #dragon.core.mpi.SetParallelMode
.. _GetParallelMode: #dragon.core.mpi.GetParallelMode
.. _workspace.Snapshot(*args, **kwargs): workspace.html#dragon.core.workspace.Snapshot
\ No newline at end of file
============
:mod:`Scope`
============
.. toctree::
:hidden:
.. currentmodule:: dragon.core.scope
.. autoclass:: dragon.core.scope.TensorScope
:members:
.. autoclass:: dragon.core.scope.PhaseScope
:members:
.. autoclass:: dragon.core.scope.DeviceScope
:members:
=============
:mod:`Tensor`
=============
.. toctree::
:hidden:
Quick Shortcut
--------------
============================== =============================================================================
List Brief
============================== =============================================================================
`Tensor.name`_ Return or Set the name.
`Tensor.shape`_ Return or Set the shape.
`Tensor.dtype`_ Return or Set the data type.
`Tensor.set_value`_ Feed the values to C++ backend.
`Tensor.get_value`_ Fetch the values from C++ backend.
`Tensor.copy`_ Return a Tensor with same content.
`Tensor.reshape`_ Reshape the dimensions of input.
`Tensor.dimshuffle`_ Shuffle the dimensions.
`Tensor.CreateOperator`_ Construct a new Tensor with specific operator descriptor.
`Tensor.Fill`_ Fill self with the specific type of filler.
`Tensor.PrintExpressions`_ Return the stringified internal expressions.
============================== =============================================================================
Register
--------
============================== =============================================================================
List Brief
============================== =============================================================================
`Tensor.Variable`_ Register as an empty variable.
`Tensor.Placeholder`_ Register as a placeholder.
`Tensor.Constant`_ Register as a variable with constant initializer.
`Tensor.Uniform`_ Register as a variable with uniform initializer.
`Tensor.Normal`_ Register as a variable with normal initializer.
`Tensor.TruncatedNormal`_ Register as a variable with truncated normal initializer.
`Tensor.Gaussian`_ Register as a variable with gaussian initializer.
`Tensor.Xavier`_ Register as a variable with xavier initializer.
`Tensor.MSRA`_ Register as a variable with msra initializer.
`Tensor.GlorotUniform`_ Register as a variable with glorot uniform initializer.
`Tensor.GlorotNormal`_ Register as a variable with glorot normal initializer.
============================== =============================================================================
Override
--------
============================== =============================================================================
List Brief
============================== =============================================================================
`Tensor.__add__`_ x.__add__(y) <=> x + y
`Tensor.__radd__`_ x.__radd__(y) <=> y + x
`Tensor.__sub__`_ x.__sub__(y) <=> x - y
`Tensor.__rsub__`_ x.__rsub__(y) <=> y - x
`Tensor.__mul__`_ x.__mul__(y) <=> x * y
`Tensor.__rmul__`_ x.__rmul__(y) <=> y * x
`Tensor.__div__`_ x.__div__(y) <=> x / y
`Tensor.__rdiv__`_ x.__rdiv__(y) <=> y / x
`Tensor.__neg__`_ x.__neg__() <=> -x
`Tensor.__str__`_ Return the information (name/shape).
`Tensor.__getitem__`_ Return a Tensor with specific indices.
`Tensor.__call__`_ Print the expressions.
============================== =============================================================================
API Reference
-------------
.. currentmodule:: dragon.core.tensor
.. autoclass:: Tensor
:members:
.. automethod:: __init__
.. _Tensor.Variable: #dragon.core.tensor.Tensor.Variable
.. _Tensor.Placeholder: #dragon.core.tensor.Tensor.Placeholder
.. _Tensor.Constant: #dragon.core.tensor.Tensor.Constant
.. _Tensor.Uniform: #dragon.core.tensor.Tensor.Uniform
.. _Tensor.Normal: #dragon.core.tensor.Tensor.Normal
.. _Tensor.TruncatedNormal: #dragon.core.tensor.Tensor.TruncatedNormal
.. _Tensor.Gaussian: #dragon.core.tensor.Tensor.Gaussian
.. _Tensor.Xavier: #dragon.core.tensor.Tensor.Xavier
.. _Tensor.MSRA: #dragon.core.tensor.Tensor.MSRA
.. _Tensor.GlorotUniform: #dragon.core.tensor.Tensor.GlorotUniform
.. _Tensor.GlorotNormal: #dragon.core.tensor.Tensor.GlorotNormal
.. _Tensor.__add__: #dragon.core.tensor.Tensor.__add__
.. _Tensor.__radd__: #dragon.core.tensor.Tensor.__radd__
.. _Tensor.__sub__: #dragon.core.tensor.Tensor.__sub__
.. _Tensor.__rsub__: #dragon.core.tensor.Tensor.__rsub__
.. _Tensor.__mul__: #dragon.core.tensor.Tensor.__mul__
.. _Tensor.__rmul__: #dragon.core.tensor.Tensor.__rmul__
.. _Tensor.__div__: #dragon.core.tensor.Tensor.__div__
.. _Tensor.__rdiv__: #dragon.core.tensor.Tensor.__rdiv__
.. _Tensor.__neg__: #dragon.core.tensor.Tensor.__neg__
.. _Tensor.__str__: #dragon.core.tensor.Tensor.__str__
.. _Tensor.__getattr__: #dragon.core.tensor.Tensor.__getattr__
.. _Tensor.__getitem__: #dragon.core.tensor.Tensor.__getitem__
.. _Tensor.__call__: #dragon.core.tensor.Tensor.__call__
.. _Tensor.name: #dragon.core.tensor.Tensor.name
.. _Tensor.shape: #dragon.core.tensor.Tensor.shape
.. _Tensor.dtype: #dragon.core.tensor.Tensor.dtype
.. _Tensor.set_value: #dragon.core.tensor.Tensor.set_value
.. _Tensor.get_value: #dragon.core.tensor.Tensor.get_value
.. _Tensor.copy: #dragon.core.tensor.Tensor.copy
.. _Tensor.reshape: #dragon.core.tensor.Tensor.reshape
.. _Tensor.dimshuffle: #dragon.core.tensor.Tensor.dimshuffle
.. _Tensor.CreateOperator: #dragon.core.tensor.Tensor.CreateOperator
.. _Tensor.Fill: #dragon.core.tensor.Tensor.Fill
.. _Tensor.PrintExpressions: #dragon.core.tensor.Tensor.PrintExpressions
.. _workspace.FeedTensor(*args, **kwargs): workspace.html#dragon.core.workspace.FeedTensor
.. _workspace.FetchTensor(*args, **kwargs): workspace.html#dragon.core.workspace.FetchTensor
.. _ops.Copy(*args, **kwargs): ../operators/control_flow.html#dragon.operators.control_flow.Copy
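To round off the summary above, a small sketch of the symbolic front end; the filler methods are called without arguments here, which is an assumption since their signatures are not part of this page.

from dragon.core.tensor import Tensor

a = Tensor('a')
a.Variable()                 # register as an empty variable
b = Tensor('b')
b.Uniform()                  # register with a uniform initializer (default arguments assumed)
c = a + b * 2.0              # the overridden __add__ / __mul__ build the expressions
print(c.PrintExpressions())  # the stringified internal expressions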
================
:mod:`Workspace`
================
.. toctree::
:hidden:
Tensor
------
============================== =============================================================================
List Brief
============================== =============================================================================
`HasTensor`_ Query whether the tensor has been registered in the current workspace.
`GetTensorName`_ Query the name represented in the current workspace.
`CreateFiller`_ Create the filler in the backend.
`FetchTensor`_ Fetch the values of given tensor.
`FeedTensor`_ Feed the values to the given tensor.
============================== =============================================================================
Graph
-----
============================== =============================================================================
List Brief
============================== =============================================================================
`CreateGraph`_ Create the graph in the backend.
`RunGraph`_ Run the specific graph.
============================== =============================================================================
Misc
----
============================== =============================================================================
List Brief
============================== =============================================================================
`Snapshot`_ Snapshot tensors into a binary file.
`Restore`_ Restore tensors from a binary file.
`SwitchWorkspace`_ Switch to the specific Workspace.
`PrintRawGraphDef`_ Print the raw prototxt.
`PrintOptimizedGraph`_ Print the optimized prototxt.
`WriteOptimizedGraph`_ Generate the optimized prototxt into a file.
============================== =============================================================================
API Reference
-------------
.. automodule:: dragon.core.workspace
:members:
:undoc-members:
:show-inheritance:
.. _SwitchWorkspace: #dragon.core.workspace.SwitchWorkspace
.. _CreateGraph: #dragon.core.workspace.CreateGraph
.. _HasTensor: #dragon.core.workspace.HasTensor
.. _GetTensorName: #dragon.core.workspace.GetTensorName
.. _CreateFiller: #dragon.core.workspace.CreateFiller
.. _FetchTensor: #dragon.core.workspace.FetchTensor
.. _FeedTensor: #dragon.core.workspace.FeedTensor
.. _RunGraph: #dragon.core.workspace.RunGraph
.. _Snapshot: #dragon.core.workspace.Snapshot
.. _Restore: #dragon.core.workspace.Restore
.. _PrintRawGraphDef: #dragon.core.workspace.PrintRawGraphDef
.. _PrintOptimizedGraph: #dragon.core.workspace.PrintOptimizedGraph
.. _WriteOptimizedGraph: #dragon.core.workspace.WriteOptimizedGraph
.. _theano.function(*args, **kwargs): ../vm/theano/compile.html#dragon.vm.theano.compile.function.function
\ No newline at end of file
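A hedged sketch of the feed/fetch round trip listed above; the exact signatures of FeedTensor and FetchTensor are not shown on this page, so the positional (name, ndarray) form is an assumption.

import numpy as np
import dragon.core.workspace as ws

ws.FeedTensor('data', np.ones((2, 3), dtype=np.float32))  # assumed form: (name, ndarray)
value = ws.FetchTensor('data')                            # assumed to return an ndarray copy
print(value.shape)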
================
:mod:`dragon.io`
================
Wrapper
-------
.. toctree::
:hidden:
io/data_batch
========================== =====================================================================
List Brief
========================== =====================================================================
`dragon.io.data_batch`_ Efficient I/O based on `LMDB`_.
========================== =====================================================================
Component
---------
.. toctree::
:hidden:
io/data_reader
io/data_transformer
io/blob_fetcher
============================== =====================================================================
List Brief
============================== =====================================================================
`dragon.io.data_reader`_ Queue encoded string from `LMDB`_.
`dragon.io.data_transformer`_ Queue transformed images from `DataReader`_.
`dragon.io.blob_fetcher`_ Queue blobs from `DataTransformer`_.
============================== =====================================================================
.. _LMDB: http://lmdb.readthedocs.io/en/release
.. _DataReader: io/data_reader.html#dragon.io.data_reader
.. _DataTransformer: io/data_transformer.html#dragon.io.data_transformer
.. _dragon.io.data_batch: io/data_batch.html
.. _dragon.io.data_reader: io/data_reader.html
.. _dragon.io.data_transformer: io/data_transformer.html
.. _dragon.io.blob_fetcher: io/blob_fetcher.html
\ No newline at end of file
==================
:mod:`BlobFetcher`
==================
.. toctree::
:hidden:
.. currentmodule:: dragon.io.blob_fetcher
.. autoclass:: BlobFetcher
:members:
.. automethod:: __init__
.. _DataTransformer: data_transformer.html#dragon.io.data_transformer
\ No newline at end of file
================
:mod:`DataBatch`
================
.. toctree::
:hidden:
.. currentmodule:: dragon.io.data_batch
.. autoclass:: DataBatch
:members:
.. automethod:: __init__
\ No newline at end of file
=================
:mod:`DataReader`
=================
.. toctree::
:hidden:
.. currentmodule:: dragon.io.data_reader
.. autoclass:: DataReader
:members:
.. automethod:: __init__
.. _LMDB: http://lmdb.readthedocs.io/en/release
\ No newline at end of file
======================
:mod:`DataTransformer`
======================
.. toctree::
:hidden:
.. currentmodule:: dragon.io.data_transformer
.. autoclass:: DataTransformer
:members:
.. automethod:: __init__
.. _DataReader: data_reader.html#dragon.io.data_reader
\ No newline at end of file
======================
:mod:`dragon.memonger`
======================
.. toctree::
:hidden:
Quick Shortcut
--------------
==================== =============================================================================
List Brief
==================== =============================================================================
`ShareGrads`_ Enable gradients sharing globally.
`Drop`_ Drop(Share) the inputs for outputs.
==================== =============================================================================
API Reference
-------------
.. automodule:: dragon.memonger
:members:
.. _ShareGrads: #dragon.memonger.ShareGrads
.. _Drop: #dragon.memonger.Drop
\ No newline at end of file
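And a one-line sketch for the memonger entry point; only ShareGrads is exercised here because Drop's signature is not visible on this page.

import dragon.memonger as memonger

memonger.ShareGrads()  # enable gradient sharing globally (presumably flips option['share_grads'])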
=======================
:mod:`dragon.operators`
=======================
Common
------
.. toctree::
:hidden:
operators/data
operators/initializer
operators/arithmetic
operators/ndarray
operators/control_flow
operators/misc
operators/cast
operators/mpi
=================================== =====================================================================
List Brief
=================================== =====================================================================
`dragon.operators.data`_ The data preparing operators.
`dragon.operators.initializer`_ The initializing operators.
`dragon.operators.arithmetic`_ The arithmetic operators.
`dragon.operators.ndarray`_ The ndarray operators.
`dragon.operators.control_flow`_ The control flow operators.
`dragon.operators.misc`_ The misc operators.
`dragon.operators.cast`_ The cast operators.
`dragon.operators.mpi`_ The MPI operators.
=================================== =====================================================================
Neural Networks
---------------
.. toctree::
:hidden:
operators/norm
operators/activation
operators/vision
operators/recurrent
operators/loss
=================================== =====================================================================
List Brief
=================================== =====================================================================
`dragon.operators.norm`_ The normalization operators.
`dragon.operators.activation`_ The activation operators.
`dragon.operators.vision`_ The vision operators.
`dragon.operators.recurrent`_ The recurrent operators.
`dragon.operators.loss`_ The loss operators.
=================================== =====================================================================
Custom
------
.. toctree::
:hidden:
operators/custom/minibatch
operators/custom/data_process
operators/custom/vec_mult
========================================= =====================================================================
List Brief
========================================= =====================================================================
`dragon.operators.custom.minibatch`_ How to form a minibatch based on the `dragon.io`_ package.
`dragon.operators.custom.data_process`_ How to customize a RunOp for data processing.
`dragon.operators.custom.vec_mult`_ How to customize a TemplateOp for vector multiplication.
========================================= =====================================================================
.. _dragon.operators.data: operators/data.html
.. _dragon.operators.initializer: operators/initializer.html
.. _dragon.operators.arithmetic: operators/arithmetic.html
.. _dragon.operators.ndarray: operators/ndarray.html
.. _dragon.operators.control_flow: operators/control_flow.html
.. _dragon.operators.misc: operators/misc.html
.. _dragon.operators.cast: operators/cast.html
.. _dragon.operators.mpi: operators/mpi.html
.. _dragon.operators.activation: operators/activation.html
.. _dragon.operators.vision: operators/vision.html
.. _dragon.operators.recurrent: operators/recurrent.html
.. _dragon.operators.loss: operators/loss.html
.. _dragon.operators.norm: operators/norm.html
.. _dragon.io: io.html
.. _dragon.operators.custom.minibatch: operators/custom/minibatch.html
.. _dragon.operators.custom.data_process: operators/custom/data_process.html
.. _dragon.operators.custom.vec_mult: operators/custom/vec_mult.html
\ No newline at end of file
=================
:mod:`Activation`
=================
.. toctree::
:hidden:
.. automodule:: dragon.operators.activation
:members:
.. |sigmoid_function| mathmacro:: \, y = \frac{1}{1 + {e}^{-x}}
.. |tanh_function| mathmacro:: \, y = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}
.. |relu_function| mathmacro:: \, y = \max(0, x)
.. |elu_function| mathmacro:: \, y = \left\{ \begin{array} \\ x & & (x > 0) \\ Alpha * (e^{x} - 1) & & (x <= 0) \\ \end{array} \right.
.. |leaky_relu_function| mathmacro:: \, y = \max(x, 0) + Slope * \min(x, 0)
.. |dropout_function| mathmacro:: \, y = x * Bernoulli(p=1 - prob)
.. |softmax_function| mathmacro:: \, y = \frac{e^{x_{i}}}{\sum e^{x_{j}}}
=================
:mod:`Arithmetic`
=================
.. toctree::
:hidden:
.. automodule:: dragon.operators.arithmetic
:members:
.. |power_function| mathmacro:: \\ \, y = [(Scale * x) + Shift]^{Power}
.. |scale_function| mathmacro:: \\ [Axis, Axis + NumAxes)
.. |gram_matrix_function| mathmacro:: \\ \, y = xx^{T}
\ No newline at end of file
===========
:mod:`Cast`
===========
.. toctree::
:hidden:
.. automodule:: dragon.operators.cast
:members:
\ No newline at end of file
===================
:mod:`Control Flow`
===================
.. toctree::
:hidden:
.. automodule:: dragon.operators.control_flow
:members:
\ No newline at end of file
==================
:mod:`DataProcess`
==================
.. toctree::
:hidden:
.. currentmodule:: dragon.operators.custom.data_process
.. autoclass:: DataProcessOp
:members:
\ No newline at end of file
================
:mod:`MiniBatch`
================
.. toctree::
:hidden:
.. currentmodule:: dragon.operators.custom.minibatch
.. autoclass:: MiniBatchOp
:members:
.. _dragon.io: ../../io.html
\ No newline at end of file
==============
:mod:`VecMult`
==============
.. toctree::
:hidden:
.. currentmodule:: dragon.operators.custom.vec_mult
.. autoclass:: VecMultOp
:members:
\ No newline at end of file
===========
:mod:`Data`
===========
.. toctree::
:hidden:
.. automodule:: dragon.operators.data
:members:
.. _LMDB: http://lmdb.readthedocs.io/en/release
.. _DataBatch: ../io/data_batch.html#dragon.io.data_batch
.. _DataReader: ../io/data_reader.html#dragon.io.data_reader
.. _DataTransformer: ../io/data_transformer.html#dragon.io.data_transformer
.. _BlobFetcher: ../io/blob_fetcher.html#dragon.io.blob_fetcher
\ No newline at end of file