Commit 40e94d24 by Ting PAN

Export Workspace for PyModule

1 parent b35f9320
Showing with 3518 additions and 3153 deletions
------------------------------------------------------------------------
The list of most significant changes made over time in Dragon.
Dragon 0.3.0.0 (20190309)
Dragon 0.3.0.0 (20190402)
DRAGON_VERSION == 3000
Changes (w.r.t. Dragon 0.2.2.13):
......@@ -36,6 +36,8 @@ Preview Features:
- The behavior of accumulating gradients has been canceled.
- The Python module is now in charge of the ``Workspace`` (see the sketch below).
Bugs fixed:
......
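A minimal sketch of what the re-assigned ``Workspace`` looks like from Python, using only the names this commit adds to ``dragon/__init__.py`` later in this diff; the exact behavior of these wrappers is assumed rather than shown here.

    # Hedged sketch: Workspace management from the Python side.
    # `Workspace`, `get_default_workspace` and `reset_default_workspace` are the
    # imports added to dragon/__init__.py by this commit; their semantics are
    # assumed to mirror the C++ Workspace they wrap.
    import dragon

    scratch = dragon.Workspace('scratch')      # assumed: mirrors Workspace(const string&)
    default = dragon.get_default_workspace()   # the workspace the module now manages
    default.MergeFrom(scratch)                 # assumed pass-through to Workspace::MergeFrom
    dragon.reset_default_workspace()           # drop and recreate the default workspace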
......@@ -22,17 +22,15 @@ class GraphBase {
public:
/*! \brief Default constructor */
GraphBase(
const GraphDef& meta_graph,
const GraphDef& def,
Workspace* ws);
/*! \brief Default destructor */
virtual ~GraphBase() {}
GraphDef BuildUpdateOps(const GraphDef& input_def);
/*! \brief Create a graph from the optimized def */
virtual bool Create(
const GraphDef& optimized_graph,
const GraphDef& def,
Workspace* ws) = 0;
/*! \brief Run the graph once synchronously */
......@@ -58,14 +56,14 @@ class GraphBase {
class Graph : public GraphBase {
public:
/*! \brief Default constructor */
Graph(const GraphDef& meta_graph, Workspace* ws);
Graph(const GraphDef& def, Workspace* ws);
/*! \brief Default destructor */
virtual ~Graph() { for (auto* op : ops_) delete op; }
/*! \brief Create a graph from the optimized def */
bool Create(
const GraphDef& optimized_graph,
const GraphDef& def,
Workspace* ws) override;
/*! \brief Run the graph once synchronously */
......
......@@ -31,7 +31,7 @@ class GraphGradientMaker {
const GraphDef& forward_def,
GraphDef& backward_def);
void Share(GraphDef& graph);
GraphDef Share(const GraphDef& input_def);
void SetTerms(const Map<string, string>& terms) { terms_ = terms; }
void SetOperatorPrefix(const string& prefix) { op_prefix_ = prefix; }
......
......@@ -42,9 +42,9 @@ class Tensor {
d = dims[i]; strides_[i] = (int64_t)new_size;
CHECK_GE(d, 0);
if (d > 0) new_size *= d;
} if (own_mem_) {
if (size_ != new_size &&
capacity_ < new_size * meta_.itemsize()) {
}
if (own_mem_) {
if (capacity_ < new_size * meta_.itemsize()) {
memory_.reset();
capacity_ = 0;
}
......
......@@ -29,23 +29,28 @@ class Workspace {
typedef Map<string, unique_ptr<OperatorBase> > OperatorMap;
typedef Map<string, unique_ptr<GraphBase> > GraphMap;
typedef Map<string, Workspace*> WorkspaceMap;
/*! \brief Constructor */
Workspace(const string& name) : name_(name) { InitWorkspace(); }
Workspace(const string& name) : name_(name) { Initialize(); }
/*! \brief Return the name of this workspace */
const string& name() { return name_; }
/*! \brief Create some internal tensors */
void InitWorkspace();
/*! \brief Return the names of the stored tensors */
vector<string> tensors() const;
/*! \brief Return the names of the stored graphs */
vector<string> graphs() const;
/*! \brief Move an external workspace into this workspace */
Workspace* Move(Workspace* ws);
/*! \brief Create some internal tensors */
void Initialize();
/*! \brief Destroy all the tensors */
void Clear();
/*! \brief Merge from an external workspace */
void MergeFrom(Workspace* ws);
/*! \brief Query the real name of specified tensor */
string GetTensorName(const string& name) const;
......@@ -66,14 +71,11 @@ class Workspace {
/*! \brief Reset the specified tensor */
void ResetTensor(const string& name);
/*! \brief Return all the stored tensor names */
vector<string> GetTensors() const;
/*! \brief Whether the specified filler is in this workspace */
bool HasFiller(const string& name, bool use_remote = true) const;
/*! \brief Create the specified filler */
void CreateFiller(const TensorFillerProto filler);
void CreateFiller(const TensorFillerProto& filler);
/*! \brief Return the specified filler */
const TensorFillerProto* GetFiller(const string& name) const;
......@@ -82,27 +84,26 @@ class Workspace {
template <class Context>
vector<void*> caches(const vector<size_t>& segments) {
int64_t nbytes = 0;
vector<void*> ret(segments.size());
for (auto& segment : segments) nbytes += (int64_t)segment;
Tensor* cache_t = CreateTensor("/share/cache");
cache_t->Reshape({ nbytes });
vector<void*> Bcaches(segments.size());
Bcaches[0] = cache_t->template mutable_data<uint8_t, Context>();
auto* T = CreateTensor("/share/cache")->Reshape({ nbytes });
ret[0] = T->template mutable_data<uint8_t, Context>();
for (int i = 1; i < segments.size(); i++)
Bcaches[i] = (uint8_t*)Bcaches[i - 1] + segments[i - 1];
return Bcaches;
ret[i] = (uint8_t*)ret[i - 1] + segments[i - 1];
return ret;
}
/*! \brief Create temporal cache segments with the specified type */
template <typename T, class Context>
vector<T*> caches(const vector<int64_t>& segments) {
vector<size_t> Tsegments;
for (auto& segment : segments)
Tsegments.emplace_back(segment * sizeof(T));
vector<void*> Bcaches = caches<Context>(Tsegments);
vector<T*> Tcaches(segments.size());
vector<size_t> segments_in_byte;
vector<T*> ret(segments.size());
for (const auto& e : segments)
segments_in_byte.emplace_back(e * sizeof(T));
auto ret_in_byte = caches<Context>(segments_in_byte);
for (int i = 0; i < segments.size(); i++)
Tcaches[i] = (T*)Bcaches[i];
return Tcaches;
ret[i] = (T*)ret_in_byte[i];
return ret;
}
/*! \brief Create an operator in this workspace */
......@@ -124,9 +125,6 @@ class Workspace {
const string& exclude,
int stream_id = 0);
/*! \brief Return all the stored graph names */
vector<string> GetGraphs() const;
/*! \brief Set an alias for the tensor */
bool SetTensorAlias(const string& name, const string& alias);
......@@ -160,7 +158,7 @@ class Workspace {
GraphMap graph_map_;
/*! \brief Store the remote workspaces */
WorkspaceMap workspace_map_;
vector<Workspace*> remote_workspaces_;
};
} // namespace dragon
......
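The Workspace interface above (``tensors()``, ``graphs()``, ``MergeFrom()``, ``Clear()``) is what the Python bindings later in this diff expose one-to-one. A small hedged sketch of those calls from Python, assuming the compiled class is reachable through dragon.import_c_api as the other bindings in this diff are.

    # Hedged sketch of the Workspace methods declared above, driven through the
    # pybind11 bindings added later in this diff. The import path is an assumption.
    from dragon import import_c_api as _C

    main = _C.Workspace('main')
    scratch = _C.Workspace('scratch')
    scratch.CreateTensor('buffer')     # bound as .def("CreateTensor", ...)
    main.MergeFrom(scratch)            # proxy the tensors of 'scratch' into 'main'
    print(main.tensors)                # may now include the merged tensor names
    main.Clear()                       # destroy the tensors owned by 'main'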
......@@ -40,8 +40,11 @@ class GradientGatherOp final : public Operator<Context> {
public:
GradientGatherOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws) {
for (int i = 0; i < InputSize(); i++)
if (Input(i).name() != "NULL") indices.push_back(i);
for (int i = 0; i < InputSize(); i++) {
if (Input(i).name() != "NULL") {
indices.push_back(i);
}
}
}
USE_OPERATOR_FUNCTIONS;
......@@ -53,6 +56,16 @@ class GradientGatherOp final : public Operator<Context> {
};
template <class Context>
class GradientAddOp final : public Operator<Context> {
public:
USE_SIMPLE_CTOR_DTOR(GradientAddOp);
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T> void RunWithType();
};
template <class Context>
class StopGradientOp final : public Operator<Context> {
public:
USE_SIMPLE_CTOR_DTOR(StopGradientOp);
......
......@@ -1033,7 +1033,6 @@ void MixedPrecisionUpdate(
template <typename T, class Context>
void BiasAdd(
const int count,
const int outer_dim,
const int dim,
const int inner_dim,
......
......@@ -38,7 +38,7 @@ Workspace* ResetWorkspace(const std::string& name) {
g_workspaces[name].reset(new Workspace(name));
for (auto& sub_workspace : sub_workspaces[name]) {
if (g_workspaces.count(sub_workspace) > 0)
g_workspaces[name]->Move(
g_workspaces[name]->MergeFrom(
g_workspaces[sub_workspace].get());
}
return g_workspaces[name].get();
......@@ -55,7 +55,7 @@ void MoveWorkspace(
std::unique_lock<std::mutex> lock(g_mutex);
CHECK(src) << "\nGiven source workspace is invalid.";
CHECK(dst) << "\nGiven destination workspace is invalid.";
dst->Move(src);
dst->MergeFrom(src);
sub_workspaces[dst->name()].push_back(src->name());
LOG(INFO) << "Move the Workspace(" << src->name() << ") "
<< "into the Workspace(" << dst->name() << ").";
......
......@@ -36,29 +36,6 @@ void AddGradientMethods(pybind11::module& m) {
vector<pybind11::bytes>, vector<string>, vector<float>
>(grad_ops, grad.g_inputs, grad.defaults);
});
m.def("FlowGradients", [](
const vector<OperatorDef*>& forward_ops,
const vector<string>& targets,
const vector<string>& input_grads,
const vector<string>& ignore_grads,
const bool is_sharing,
const bool verbose) {
// Make => Optimize => Run
GraphDef backward_ops;
GraphGradientMaker maker;
for (auto& grad : input_grads) maker.AddExternalGrad(grad);
for (auto& grad : ignore_grads) maker.AddIgnoreGrad(grad);
maker.Make(forward_ops, targets, backward_ops);
if (is_sharing) maker.Share(backward_ops);
pybind11::gil_scoped_release g;
for (auto& op : backward_ops.op()) {
if (op.type().empty()) continue;
if (verbose) std::cout << op.DebugString() << std::endl;
if (op.has_uid()) ws()->RunOperator(op);
else ws()->RunOperatorOnce(op);
}
});
}
} // namespace python
......
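FlowGradients is removed here; the same make-optimize-run flow is re-bound later in this diff as the Backward method of the exported Workspace. A hedged sketch of the replacement call, with the forward defs and targets elided so the sketch stays self-contained.

    # Hedged sketch: the gradient flow formerly run by FlowGradients, now invoked
    # through Workspace.Backward (bound later in this diff).
    from dragon import import_c_api as _C

    ws = _C.Workspace('autograd-demo')
    forward_defs = []     # _C.OperatorDef objects describing the forward pass (elided)
    targets = []          # names of the tensors to differentiate (elided)
    ws.Backward(
        forward_defs,
        targets,
        [],               # external input grads
        [],               # grads to ignore
        True,             # is_sharing: share buffers among gradients
        False)            # verbose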
......@@ -16,15 +16,17 @@
#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
#include "py_types.h"
#include "core/common.h"
#include "core/registry.h"
#include "core/context.h"
#include "core/context_cuda.h"
#include "core/operator.h"
#include "core/operator_gradient.h"
#include "core/graph_gradient.h"
#include "core/registry.h"
#include "core/workspace.h"
#include "core/context_cuda.h"
#include "core/graph_gradient.h"
#include "core/operator_gradient.h"
#include "utils/caffemodel.h"
#include "onnx/onnx_backend.h"
#include <pybind11/stl.h>
#include <pybind11/pybind11.h>
......@@ -136,8 +138,6 @@ class NumpyFeeder : public TensorFeederBase {
}
};
Workspace* ws();
} // namespace python
} // namespace dragon
......
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef DRAGON_PYTHON_PY_GRAPH_H_
#define DRAGON_PYTHON_PY_GRAPH_H_
#include "py_dragon.h"
namespace dragon {
namespace python {
void AddGraphMethods(pybind11::module& m) {
/*! \brief Create a graph from the serialized def */
m.def("CreateGraph", [](
const string& serialized,
const bool verbose) {
GraphDef graph_def;
if (!graph_def.ParseFromString(serialized))
LOG(FATAL) << "Failed to parse the GraphDef.";
auto* graph = ws()->CreateGraph(graph_def);
if (verbose) {
// It is not a good design to print the debug string
auto* graph_tensor = ws()->CreateTensor(
"/graph_def/optimized/" + graph->name());
if (graph_tensor->count() > 0) {
auto* data = graph_tensor->mutable_data<string, CPUContext>();
std::cout << data[0] << std::endl;
}
}
// The returned graph name may differ from the name in the def;
// a unique dummy name is generated when the graph is created
return graph->name();
});
/*! \brief Run an existing graph */
m.def("RunGraph", [](
const string& name,
const string& include,
const string& exclude) {
pybind11::gil_scoped_release g;
ws()->RunGraph(name, include, exclude);
});
/*! \brief List all of the existing graphs */
m.def("Graphs", []() { ws()->GetGraphs(); });
}
} // namespace python
} // namespace dragon
#endif // DRAGON_PYTHON_PY_GRAPH_H_
\ No newline at end of file
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef DRAGON_PYTHON_PY_IO_H_
#define DRAGON_PYTHON_PY_IO_H_
#include "py_dragon.h"
namespace dragon {
namespace python {
void AddIOMethods(pybind11::module& m) {
m.def("Snapshot", [](
const string& filename,
vector<string>& names,
const int format) {
vector<Tensor*> tensors;
switch (format) {
case 0: // Pickle
LOG(FATAL) << "Format depends on Pickle. "
"Can't be used in C++.";
break;
case 1: // CaffeModel
for (const auto& e : names)
tensors.emplace_back(ws()->GetTensor(e));
SavaCaffeModel(filename, tensors);
break;
default:
LOG(FATAL) << "Unknwon format, code: " << format;
}
});
m.def("Restore", [](
const string& filename,
const int format) {
switch (format) {
case 0: // Pickle
LOG(FATAL) << "Format depends on Pickle. "
"Can't be used in C++.";
break;
case 1: // CaffeModel
LoadCaffeModel(filename, ws());
break;
default:
LOG(FATAL) << "Unknwon format, code: " << format;
}
});
}
} // namespace python
} // namespace dragon
#endif // DRAGON_PYTHON_PY_IO_H_
\ No newline at end of file
#include "py_graph.h"
#include "py_autograd.h"
#include "py_operator.h"
#include "py_tensor.h"
#include "py_cuda.h"
#include "py_mpi.h"
#include "py_io.h"
#include "py_onnx.h"
#include "py_config.h"
#include "py_proto.h"
......@@ -16,13 +13,6 @@ namespace python {
DEFINE_TYPED_REGISTRY(TensorFetcherRegistry, TypeId, TensorFetcherBase);
DEFINE_TYPED_REGISTRY(TensorFeederRegistry, TypeId, TensorFeederBase);
Map<string, unique_ptr < Workspace > > g_workspaces;
Map<string, vector<string> > sub_workspaces;
Workspace* g_workspace;
string g_current_workspace;
Workspace* ws() { return g_workspace; }
TypeId CTypeToFetcher(TypeId type) {
static Map<TypeId,TypeId> c_type_map {
{ TypeMeta::Id<bool>(), TypeMeta::Id<NumpyFetcher>() },
......@@ -41,116 +31,230 @@ REGISTER_TENSOR_FETCHER(TypeMeta::Id<NumpyFetcher>(), NumpyFetcher);
REGISTER_TENSOR_FETCHER(TypeMeta::Id<StringFetcher>(), StringFetcher);
REGISTER_TENSOR_FEEDER(TypeMeta::Id<NumpyFeeder>(), NumpyFeeder);
void SwitchWorkspace(
const string& name,
const bool create_if_missing = true) {
if (g_workspaces.count(name)) {
g_current_workspace = name;
g_workspace = g_workspaces[name].get();
} else if (create_if_missing) {
unique_ptr<Workspace> new_workspace(new Workspace(name));
g_workspace = new_workspace.get();
g_workspaces[name] = std::move(new_workspace);
sub_workspaces[name] = vector<string>();
g_current_workspace = name;
} else {
LOG(FATAL) << "Workspace of the given name does not exist."
"\nAnd, it is not allowed to create. (Try to alllow?)";
}
}
void OnImportModule() {
[]() { import_array1(); }();
static bool initialized = false;
if (initialized) return;
SwitchWorkspace("default", true);
g_current_workspace = "default";
initialized = true;
}
void OnImportModule() { []() { import_array1(); }(); }
PYBIND11_MODULE(libdragon, m) {
/*! \brief Export the Workspace class */
pybind11::class_<Workspace>(m, "Workspace")
.def(pybind11::init<const string&>())
/*! \brief Switch to the specific workspace */
m.def("SwitchWorkspace", &SwitchWorkspace);
/*! \brief Return the name of this workspace */
.def_property_readonly("name", &Workspace::name)
/*! \brief Return the current active workspace */
m.def("CurrentWorkspace", []() {
return g_current_workspace;
});
/*! \brief Return the names of the stored tensors */
.def_property_readonly("tensors", &Workspace::tensors)
/*! \brief List all of the existing workspaces */
m.def("Workspaces", []() -> vector<string> {
vector<string> names;
for (auto const& it : g_workspaces)
names.emplace_back(it.first);
return names;
});
/*! \brief Return the names of the stored graphs */
.def_property_readonly("graphs", &Workspace::graphs)
/*! \brief Move the source workspace into the target */
m.def("MoveWorkspace", [](
const string& target,
const string& source) {
CHECK(g_workspaces.count(source))
<< "\nSource Workspace(" << source << ") does not exist.";
CHECK(g_workspaces.count(target))
<< "\nTarget Workspace(" << target << ") does not exist.";
g_workspaces[target]->Move(g_workspaces[source].get());
sub_workspaces[target].push_back(source);
LOG(INFO) << "Move the Workspace(" << source << ") "
<< "into the Workspace(" << target << ").";
});
/*! \brief Destroy all the tensors */
.def("Clear", &Workspace::Clear)
/*! \brief Merge an external workspace into self */
.def("MergeFrom", &Workspace::MergeFrom)
/*! \brief Return a unique dummy name */
.def("GetDummyName", &Workspace::GetDummyName)
/*! \brief Return the unique name of the given tensor */
.def("GetTensorName", &Workspace::GetTensorName)
/*! \brief Reset the specific workspace */
m.def("ResetWorkspace", [](const string& name) {
string target_workspace = g_current_workspace;
if (!name.empty()) target_workspace = name;
CHECK(g_workspaces.count(target_workspace))
<< "\nWorkspace(" << target_workspace
<< ") does not exist, can not be reset.";
LOG(INFO) << "Reset the Workspace(" << target_workspace << ")";
g_workspaces[target_workspace].reset(new Workspace(target_workspace));
g_workspace = g_workspaces[target_workspace].get();
for (auto& sub_workspace : sub_workspaces[target_workspace]) {
if (g_workspaces.count(sub_workspace) > 0)
g_workspace->Move(g_workspaces[sub_workspace].get());
/*! \brief Reset a tensor with the given name */
.def("ResetTensor", &Workspace::ResetTensor)
/*! \brief Indicate whether the given tensor exists */
.def("HasTensor", [](
Workspace* self,
const string& name) {
return self->HasTensor(name);
})
/*! \brief Create a tensor with the given name */
.def("CreateTensor", [](
Workspace* self,
const string& name) {
self->CreateTensor(name);
})
/*! \brief Create a tensor from the specified filler */
.def("CreateFiller", [](
Workspace* self,
const string& serialized) {
TensorFillerProto filler_proto;
if (!filler_proto.ParseFromString(serialized))
LOG(FATAL) << "Failed to parse the TensorFiller.";
self->CreateFiller(filler_proto);
self->CreateTensor(filler_proto.tensor());
})
/*! \brief Create a tensor with the given shape */
.def("TensorFromShape", [](
Workspace* self,
const string& name,
const vector<int64_t>& shape,
const string& dtype) {
const TypeMeta& meta = TypeStringToMeta(dtype);
CHECK(meta.id() != 0)
<< "\nUnsupported data type: " + dtype + ".";
Tensor* tensor = self->CreateTensor(name);
tensor->Reshape(shape);
tensor->raw_mutable_data<CPUContext>(meta);
})
/*! \brief Create a tensor with the given array */
.def("TensorFromArray", [](
Workspace* self,
const string& name,
pybind11::object object) {
PyArrayObject* array = PyArray_GETCONTIGUOUS(
reinterpret_cast<PyArrayObject*>(object.ptr()));
const TypeMeta& meta = TypeNPYToMeta(PyArray_TYPE(array));
if (meta.id() == 0) LOG(FATAL) << "Unsupported data type.";
Tensor* tensor = self->CreateTensor(name);
tensor->SetMeta(meta);
int ndim = PyArray_NDIM(array);
npy_intp* npy_dims = PyArray_DIMS(array);
vector<int64_t> dims;
for (int i = 0; i < ndim; i++) dims.push_back(npy_dims[i]);
tensor->Reshape(dims);
auto* data = static_cast<void*>(PyArray_DATA(array));
if (!tensor->has_memory()) {
MixedMemory* memory(new MixedMemory());
memory->set_cpu_data(data, tensor->nbytes());
tensor->set_memory(memory);
} else {
if (tensor->DECREFPyArray) tensor->DECREFPyArray();
tensor->memory()->set_cpu_data(data, tensor->nbytes());
}
});
// Following PyTorch, we bind the DECREF to the Tensor;
// ResetTensor() or ResetWorkspace() can trigger it
tensor->DECREFPyArray = [array]()->void { Py_XDECREF(array); };
})
/*! \brief Release the memory of tensors */
m.def("ClearWorkspace", [](const string& name) {
string target_workspace = g_current_workspace;
if (!name.empty()) target_workspace = name;
CHECK(g_workspaces.count(target_workspace))
<< "\nWorkspace(" << target_workspace
<< ") does not exist, can not be reset.";
LOG(INFO) << "Clear the Workspace(" << target_workspace << ")";
g_workspaces[target_workspace]->Clear();
});
/*! \brief Create a tensor copied from an existing one */
.def("TensorFromTensor", [](
Workspace* self,
const string& name,
const string& other,
const string& dev1,
const string& dev2) {
DeviceOption dst_ctx, src_ctx;
dst_ctx.ParseFromString(dev1);
src_ctx.ParseFromString(dev2);
Tensor* srcT = self->GetTensor(other);
Tensor* dstT = self->CreateTensor(name);
dstT->ReshapeLike(*srcT);
const TypeMeta& meta = srcT->meta();
if (dst_ctx.device_type() == PROTO_CUDA) {
if (src_ctx.device_type() == PROTO_CUDA) {
// CUDA <- CUDA
CUDAContext::MemcpyEx<CUDAContext, CUDAContext>(
srcT->nbytes(),
dstT->raw_mutable_data<CUDAContext>(meta),
srcT->raw_data<CUDAContext>(),
src_ctx.device_id());
} else {
// CUDA <- CPU
CUDAContext::MemcpyEx<CUDAContext, CPUContext>(
srcT->nbytes(),
dstT->raw_mutable_data<CUDAContext>(meta),
srcT->raw_data<CPUContext>(),
dst_ctx.device_id());
}
} else {
if (src_ctx.device_type() == PROTO_CUDA) {
// CPU <- CUDA
CUDAContext::MemcpyEx<CPUContext, CUDAContext>(
srcT->nbytes(),
dstT->raw_mutable_data<CPUContext>(meta),
srcT->raw_data<CUDAContext>(),
src_ctx.device_id());
} else {
// CPU <- CPU
CPUContext::Memcpy<CUDAContext, CUDAContext>(
srcT->nbytes(),
dstT->raw_mutable_data<CPUContext>(meta),
srcT->raw_data<CPUContext>());
}
}
})
/*! \brief Return an array zero-copied from an existing tensor */
.def("TensorToArray", [](
Workspace* self,
const string& name,
const bool readonly) {
Tensor* tensor = self->GetTensor(name);
CHECK_GT(tensor->count(), 0);
vector<npy_intp> dims;
for (const auto dim : tensor->dims()) dims.push_back(dim);
int npy_type = TypeMetaToNPY(tensor->meta());
if (npy_type == -1) {
LOG(FATAL) << "Tensor(" + tensor->name() + ") "
"with dtype." + TypeMetaToString(tensor->meta()) +
" is not supported by numpy.";
}
auto* data = readonly ?
const_cast<void*>(tensor->raw_data<CPUContext>()) :
tensor->raw_mutable_data<CPUContext>();
PyObject* array = PyArray_SimpleNewFromData(
tensor->ndim(), dims.data(), npy_type, data);
return pybind11::reinterpret_steal<pybind11::object>(array);
})
/*! \brief Return the CXX Tensor reference */
.def("GetTensor", [](
Workspace* self,
const string& name) {
return self->GetTensor(name);
}, pybind11::return_value_policy::reference_internal)
/*! \brief Copy the array data to the tensor */
m.def("FeedTensor", [](
/*! \brief Return the filler type of a tensor */
.def("GetFillerType", [](
Workspace* self,
const string& name) {
return self->GetFiller(name)->type();
})
/*! \brief Set an alias for the tensor */
.def("SetTensorAlias", [](
Workspace* self,
const string& name,
const string& alias) {
CHECK(self->HasTensor(name))
<< "\nTensor(" + name << ") has not been "
<< "registered in the current workspace.";
self->SetTensorAlias(name, alias);
})
/*! \brief Copy the array data to the tensor */
.def("FeedTensor", [](
Workspace* self,
const string& name,
pybind11::object value,
const string& device_option) {
const string& ctx) {
DeviceOption dev;
if (!device_option.empty()) {
if (!dev.ParseFromString(device_option)) {
LOG(FATAL) << "Failed to parse the DeviceOption.";
}
if (!ctx.empty()) {
CHECK(dev.ParseFromString(ctx))
<< "\nFailed to parse the DeviceOption.";
}
Tensor* tensor = g_workspace->CreateTensor(name);
unique_ptr<TensorFeederBase> feeder(TensorFeederRegistry()
->Create(TypeMeta::Id<NumpyFeeder>()));
feeder->Feed(dev, reinterpret_cast<
PyArrayObject*>(value.ptr()), tensor);
});
Tensor* tensor = self->CreateTensor(name);
unique_ptr<TensorFeederBase> feeder(
TensorFeederRegistry()->Create(
TypeMeta::Id<NumpyFeeder>()));
feeder->Feed(dev, reinterpret_cast
<PyArrayObject*>(value.ptr()), tensor);
})
/*! \brief Copy the tensor data to the array */
m.def("FetchTensor", [](const string& name) {
if (!g_workspace->HasTensor(name))
LOG(FATAL) << "Tensor(" + name + ") "
"does not exist. Have you registered it?";
Tensor* tensor = g_workspace->GetTensor(name);
.def("FetchTensor", [](
Workspace* self,
const string& name) {
CHECK(self->HasTensor(name))
<< "\nTensor(" + name + ") does not exist.\n"
<< "Have you registered it?";
Tensor* tensor = self->GetTensor(name);
TypeId type_id = CTypeToFetcher(tensor->meta().id());
CHECK(type_id != 0)
<< "\nTensor(" << tensor->name()
......@@ -163,30 +267,153 @@ PYBIND11_MODULE(libdragon, m) {
LOG(FATAL) << name << " is not a C++ native type.";
return pybind11::object();
}
});
})
/*! \brief Return a unique dummy name */
m.def("GetDummyName", [](
const string& basename,
const string& suffix,
const string& domain,
const bool zero_based) {
return ws()->GetDummyName(
basename, suffix, domain, zero_based);
/*! \brief Run an operator from the def reference */
.def("RunOperator", [](
Workspace* self,
OperatorDef* def,
const bool verbose) {
pybind11::gil_scoped_release g;
if (verbose) {
// It is not a good design to print the debug string
std::cout << def->DebugString() << std::endl;
}
self->RunOperator(*def);
})
/*! \brief Run an operator from the serialized def */
.def("RunOperator", [](
Workspace* self,
const string& serialized,
const bool verbose) {
OperatorDef def;
CHECK(def.ParseFromString(serialized));
pybind11::gil_scoped_release g;
if (verbose) {
// It is not a good design to print the debug string
std::cout << def.DebugString() << std::endl;
}
self->RunOperatorOnce(def);
})
/*! \brief Create a graph from the serialized def */
.def("CreateGraph", [](
Workspace* self,
const string& serialized,
const bool verbose) {
GraphDef graph_def;
CHECK(graph_def.ParseFromString(serialized))
<< "\nFailed to parse the GraphDef.";
auto* graph = self->CreateGraph(graph_def);
if (verbose) {
// It is not a good design to print the debug string
auto* T = self->CreateTensor(
"/graph_def/optimized/" + graph->name());
if (T->count() > 0) {
auto* data = T->mutable_data<string, CPUContext>();
std::cout << data[0] << std::endl;
}
}
// The returned graph name may differ from the name in the def;
// a unique dummy name is generated when the graph is created
return graph->name();
})
/*! \brief Run an existing graph */
.def("RunGraph", [](
Workspace* self,
const string& name,
const string& include,
const string& exclude) {
pybind11::gil_scoped_release g;
self->RunGraph(name, include, exclude);
})
.def("Backward", [](
Workspace* self,
const vector<OperatorDef*>& forward_ops,
const vector<string>& targets,
const vector<string>& input_grads,
const vector<string>& ignore_grads,
const bool is_sharing,
const bool verbose) {
// Make => Optimize => Run
GraphDef backward_ops;
GraphGradientMaker maker;
for (auto& e : input_grads) maker.AddExternalGrad(e);
for (auto& e : ignore_grads) maker.AddIgnoreGrad(e);
maker.Make(forward_ops, targets, backward_ops);
pybind11::gil_scoped_release g;
if (is_sharing) backward_ops = maker.Share(backward_ops);
for (auto& op : backward_ops.op()) {
if (verbose) std::cout << op.DebugString() << std::endl;
if (op.has_uid()) self->RunOperator(op);
else self->RunOperatorOnce(op);
}
})
/*! \brief Serialize tensors into a binary file */
.def("Snapshot", [](
Workspace* self,
const string& filename,
const vector<string>& tensors,
const int format) {
vector<Tensor*> refs;
switch (format) {
case 0: // Pickle
LOG(FATAL) << "Format depends on Pickle. "
"Can't be used in C++.";
break;
case 1: // CaffeModel
for (const auto& e : tensors)
refs.emplace_back(self->GetTensor(e));
SavaCaffeModel(filename, refs);
break;
default:
LOG(FATAL) << "Unknwon format, code: " << format;
}
})
/*! \brief Load tensors from a binary file */
.def("Restore", [](
Workspace* self,
const string& filename,
const int format) {
switch (format) {
case 0: // Pickle
LOG(FATAL) << "Format depends on Pickle. "
"Can't be used in C++.";
break;
case 1: // CaffeModel
LoadCaffeModel(filename, self);
break;
default:
LOG(FATAL) << "Unknwon format, code: " << format;
}
})
/*! \brief Load tensors and graph from a ONNX model */
.def("ImportONNXModel", [](
Workspace* self,
const string& model_path) {
GraphDef init_graph, pred_graph;
onnx::ONNXBackend onnx_backend;
onnx_backend.Prepare(model_path, &init_graph, &pred_graph);
// Serializing to Python is intractable
// We should apply the initializer immediately
self->RunGraph(self->CreateGraph(init_graph)->name(), "", "");
return pybind11::bytes(pred_graph.SerializeAsString());
});
AddIOMethods(m);
AddMPIMethods(m);
AddCUDAMethods(m);
AddProtoMethods(m);
AddGraphMethods(m);
AddTensorMethods(m);
AddConfigMethods(m);
AddGradientMethods(m);
AddOperatorMethods(m);
OnImportModule();
m.def("OnModuleExit", []() { g_workspaces.clear(); });
}
} // namespace python
......
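Taken together, the bindings above move feeding, fetching, and graph execution onto the exported Workspace object. A hedged end-to-end sketch following the .def(...) signatures shown above; the import path of the compiled module is an assumption.

    # Hedged sketch of the Workspace object exported above.
    import numpy
    from dragon import import_c_api as _C

    ws = _C.Workspace('session')
    ws.FeedTensor('x', numpy.arange(6, dtype='float32').reshape(2, 3), '')  # '' -> default DeviceOption
    assert ws.HasTensor('x')
    y = ws.FetchTensor('x')                   # copy the data back as a numpy array
    print(ws.GetTensorName('x'), y.sum())     # resolved name and 15.0
    ws.SetTensorAlias('x', 'x/alias')         # the alias must refer to an existing tensor
    ws.ResetTensor('x')                       # release the tensor's memory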
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef DRAGON_PYTHON_PY_ONNX_H_
#define DRAGON_PYTHON_PY_ONNX_H_
#include "onnx/onnx_backend.h"
#include "py_dragon.h"
namespace dragon {
namespace python {
void AddONNXMethods(pybind11::module& m) {
m.def("ImportONNXModel", [](
const string& model_path) {
GraphDef init_graph, pred_graph;
onnx::ONNXBackend onnx_backend;
onnx_backend.Prepare(model_path, &init_graph, &pred_graph);
// Serializing to Python is intractable
// We should apply the initializer immediately
ws()->CreateGraph(init_graph);
ws()->RunGraph(init_graph.name(), "", "");
return pybind11::bytes(pred_graph.SerializeAsString());
});
}
} // namespace python
} // namespace dragon
#endif // DRAGON_PYTHON_PY_ONNX_H_
\ No newline at end of file
......@@ -20,36 +20,14 @@ namespace dragon {
namespace python {
void AddOperatorMethods(pybind11::module& m) {
/*! \brief Return all the registered operators */
m.def("RegisteredOperators", []() { return CPUOperatorRegistry()->keys(); });
/*! \brief Return all the operators without gradients */
m.def("NoGradientOperators", []() { return NoGradientRegistry()->keys(); });
/*! \brief Run an operator from the def reference */
m.def("RunOperator", [](
OperatorDef* def,
const bool verbose) {
pybind11::gil_scoped_release g;
if (verbose) {
// It is not a good design to print the debug string
std::cout << def->DebugString() << std::endl;
}
ws()->RunOperator(*def);
/*! \brief Return the registered operators */
m.def("RegisteredOperators", []() {
return CPUOperatorRegistry()->keys();
});
/*! \brief Run an operator from the serialized def */
m.def("RunOperator", [](
const string& serialized,
const bool verbose) {
OperatorDef def;
CHECK(def.ParseFromString(serialized));
pybind11::gil_scoped_release g;
if (verbose) {
// It is not a good design to print the debug string
std::cout << def.DebugString() << std::endl;
}
ws()->RunOperatorOnce(def);
/*! \brief Return the non-gradient operators */
m.def("NoGradientOperators", []() {
return NoGradientRegistry()->keys();
});
}
......
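Only the registry queries remain in this file; running operators is now a Workspace method. A small sketch of the two queries kept above.

    # Sketch: querying the operator registries exposed above.
    from dragon import import_c_api as _C

    ops = _C.RegisteredOperators()      # keys of CPUOperatorRegistry
    no_grad = _C.NoGradientOperators()  # keys of NoGradientRegistry
    print(len(ops), 'operators,', len(no_grad), 'without gradients')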
......@@ -22,208 +22,51 @@ namespace python {
void AddTensorMethods(pybind11::module& m) {
/*! \brief Export the Tensor class */
pybind11::class_<Tensor>(m, "Tensor")
/*! \brief Return the number of dimensions */
.def_property_readonly("ndim", &Tensor::ndim)
/*! \brief Return all the dimensions */
.def_property_readonly("dims", &Tensor::dims)
/*! \brief Return the total number of elements */
.def_property_readonly("size", &Tensor::size)
/*! \brief Return the data type */
.def_property_readonly("dtype", [](Tensor* self) {
return TypeMetaToString(self->meta());
}).def_property_readonly("device", [](Tensor* self) {
})
/*! \brief Return the device information */
.def_property_readonly("device", [](Tensor* self) {
if (self->has_memory()) {
Map<string, string> mem_info = self->memory()->info();
auto mem_info = self->memory()->info();
return std::tuple<string, int>(
mem_info["device_type"], atoi(
mem_info["device_id"].c_str()));
} else {
return std::tuple<string, int>("Unknown", 0);
}
}).def("ToCPU", [](Tensor* self) {
CHECK(self->has_memory()) << "\nTensor(" << self->name()
<< ") does not initialize or had been reset.";
})
/*! \brief Switch the memory to the cpu context */
.def("ToCPU", [](Tensor* self) {
CHECK(self->has_memory())
<< "\nTensor(" << self->name() << ") "
<< "does not initialize or had been reset.";
self->memory()->ToCPU();
}).def("ToCUDA", [](Tensor* self, const int device_id) {
})
/*! \brief Switch the memory to the cuda context */
.def("ToCUDA", [](Tensor* self, int device_id) {
#ifdef WITH_CUDA
CHECK(self->has_memory()) << "\nTensor(" << self->name()
<< ") does not initialize or had been reset.";
CHECK(self->has_memory())
<< "\nTensor(" << self->name() << ") "
<< "does not initialize or had been reset.";
self->memory()->SwitchToCUDADevice(device_id);
#else
CUDA_NOT_COMPILED;
#endif
});
/*! \brief List all the existing tensors */
m.def("Tensors", []() { return ws()->GetTensors(); });
/*! \brief Indicate whether the given tensor exists */
m.def("HasTensor", [](
const string& name) -> bool {
return ws()->HasTensor(name);
});
/*! \brief Return the unique name of the given tensor */
m.def("GetTensorName", [](
const string& name) -> string {
return ws()->GetTensorName(name);
});
/*! \brief Create a tensor with the given name */
m.def("CreateTensor", [](
const string& name) -> void {
ws()->CreateTensor(name);
});
/*! \brief Reset the tensor with the given name */
m.def("ResetTensor", [](
const string& name) -> void {
ws()->ResetTensor(name);
});
/*! \brief Create a tensor with the given shape */
m.def("TensorFromShape", [](
const string& name,
const vector<int64_t>& shape,
const string& dtype) {
const TypeMeta& meta = TypeStringToMeta(dtype);
if (meta.id() == 0) {
LOG(FATAL) << "Unsupported data type: " + dtype + ".";
}
Tensor* tensor = ws()->CreateTensor(name);
if (meta.id() != tensor->meta().id() && tensor->meta().id() != 0)
LOG(WARNING) << "Set Tensor(" << tensor->name() << ")"
<< " with different data type from original one.";
tensor->Reshape(shape);
tensor->raw_mutable_data<CPUContext>(meta);
});
/*! \brief Create a tensor with the given array */
m.def("TensorFromPyArray", [](
const string& name,
pybind11::object py_array) {
PyArrayObject* array = PyArray_GETCONTIGUOUS(
reinterpret_cast<PyArrayObject*>(py_array.ptr()));
const TypeMeta& meta = TypeNPYToMeta(PyArray_TYPE(array));
if (meta.id() == 0) LOG(FATAL) << "Unsupported data type.";
Tensor* tensor = ws()->CreateTensor(name);
tensor->SetMeta(meta);
int ndim = PyArray_NDIM(array);
npy_intp* npy_dims = PyArray_DIMS(array);
vector<int64_t> dims;
for (int i = 0; i < ndim; i++) dims.push_back(npy_dims[i]);
tensor->Reshape(dims);
auto* data = static_cast<void*>(PyArray_DATA(array));
if (!tensor->has_memory()) {
MixedMemory* memory(new MixedMemory());
memory->set_cpu_data(data, tensor->nbytes());
tensor->set_memory(memory);
} else {
if (tensor->DECREFPyArray) tensor->DECREFPyArray();
tensor->memory()->set_cpu_data(data, tensor->nbytes());
}
// Following PyTorch, we bind the DECREF to the Tensor;
// ResetTensor() or ResetWorkspace() can trigger it
tensor->DECREFPyArray = [array]()->void { Py_XDECREF(array); };
});
/*! \brief Create a tensor copied from an existing one */
m.def("TensorFromTensor", [](
const string& name,
const string& other,
const string& dev1,
const string& dev2) {
DeviceOption dst_ctx, src_ctx;
dst_ctx.ParseFromString(dev1);
src_ctx.ParseFromString(dev2);
Tensor* srcT = ws()->GetTensor(other);
Tensor* dstT = ws()->CreateTensor(name);
dstT->ReshapeLike(*srcT);
const TypeMeta& meta = srcT->meta();
if (dst_ctx.device_type() == PROTO_CUDA) {
if (src_ctx.device_type() == PROTO_CUDA) {
// CUDA <- CUDA
CUDAContext::MemcpyEx<CUDAContext, CUDAContext>(
srcT->nbytes(),
dstT->raw_mutable_data<CUDAContext>(meta),
srcT->raw_data<CUDAContext>(),
src_ctx.device_id());
} else {
// CUDA <- CPU
CUDAContext::MemcpyEx<CUDAContext, CPUContext>(
srcT->nbytes(),
dstT->raw_mutable_data<CUDAContext>(meta),
srcT->raw_data<CPUContext>(),
dst_ctx.device_id());
}
} else {
if (src_ctx.device_type() == PROTO_CUDA) {
// CPU <- CUDA
CUDAContext::MemcpyEx<CPUContext, CUDAContext>(
srcT->nbytes(),
dstT->raw_mutable_data<CPUContext>(meta),
srcT->raw_data<CUDAContext>(),
src_ctx.device_id());
} else {
// CPU <- CPU
CPUContext::Memcpy<CUDAContext, CUDAContext>(
srcT->nbytes(),
dstT->raw_mutable_data<CPUContext>(meta),
srcT->raw_data<CPUContext>());
}
}
});
/*! \brief Return an array zero-copied from an existing tensor */
m.def("TensorToPyArray", [](
const string& name,
const bool readonly) {
Tensor* tensor = ws()->GetTensor(name);
CHECK_GT(tensor->count(), 0);
vector<npy_intp> dims;
for (const auto dim : tensor->dims()) dims.push_back(dim);
int npy_type = TypeMetaToNPY(tensor->meta());
if (npy_type == -1) {
LOG(FATAL) << "Tensor(" + tensor->name() + ") "
"with dtype." + TypeMetaToString(tensor->meta()) +
" is not supported by numpy.";
}
auto* data = readonly ?
const_cast<void*>(tensor->raw_data<CPUContext>()) :
tensor->raw_mutable_data<CPUContext>();
PyObject* array = PyArray_SimpleNewFromData(
tensor->ndim(), dims.data(), npy_type, data);
return pybind11::reinterpret_steal<pybind11::object>(array);
});
/*! \brief Create a tensor from the specified filler */
m.def("CreateFiller", [](
const string& serialized) {
TensorFillerProto filler_proto;
if (!filler_proto.ParseFromString(serialized))
LOG(FATAL) << "Failed to parse the TensorFiller.";
ws()->CreateFiller(filler_proto);
ws()->CreateTensor(filler_proto.tensor());
});
/*! \brief Return the filler type of a tensor */
m.def("GetFillerType", [](const string& name) {
return ws()->GetFiller(name)->type();
});
/*! \brief Set an alias for the tensor */
m.def("SetTensorAlias", [](
const string& name,
const string& alias) {
if (!ws()->HasTensor(name)) {
LOG(FATAL) << "Tensor(" + name << ") has not "
"been registered in the current workspace.";
}
ws()->SetTensorAlias(name, alias);
});
/*! \brief Return the CXX Tensor reference */
m.def("GetTensor", [](
const string& name) {
return ws()->GetTensor(name);
}, pybind11::return_value_policy::reference_internal);
}
} // namespace python
......
......@@ -22,6 +22,9 @@ import dragon.config as config
# Core
from dragon.core.tensor import Tensor
import dragon.core.workspace as workspace
from dragon.core.workspace import Workspace
from dragon.core.workspace import get_default_workspace
from dragon.core.workspace import reset_default_workspace
import dragon.core.tensor_utils as tensor_utils
import dragon.core.mpi as mpi
import dragon.core.cuda as cuda
......@@ -41,7 +44,6 @@ from dragon.vm.theano.tensor import grad as grad
from dragon.core.scope import name_scope, get_default_name_scope
from dragon.core.scope import phase_scope, get_default_phase
from dragon.core.scope import device_scope, get_default_device
from dragon.core.scope import WorkspaceScope as ws_scope
# Version
from dragon.version import version
......
......@@ -15,8 +15,9 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon.import_c_api as C
import dragon.core.logging as logging
from dragon import import_c_api as _C
from dragon.core import logging as _logging
option = {}
......@@ -290,12 +291,12 @@ def SetLoggingLevel(level):
The default level is *INFO*.
"""
C.SetLoggingLevel(level)
logging.set_verbosity({
'DEBUG': logging.DEBUG,
'INFO': logging.INFO,
'WARNING': logging.WARN,
'ERROR': logging.ERROR,
'FATAL': logging.FATAL,
_C.SetLoggingLevel(level)
_logging.set_verbosity({
'DEBUG': _logging.DEBUG,
'INFO': _logging.INFO,
'WARNING': _logging.WARN,
'ERROR': _logging.ERROR,
'FATAL': _logging.FATAL,
}[level]
)
\ No newline at end of file
......@@ -15,7 +15,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon.import_c_api as _C
from dragon import import_c_api as _C
def IsCUDADriverSufficient():
......
......@@ -30,11 +30,10 @@ from __future__ import print_function
from collections import defaultdict
import dragon.proto.dragon_pb2 as pb
import dragon.import_c_api as C
from dragon.core.helper import OperatorHelper
from dragon.core.proto_utils import MakeOperatorDef
from dragon import import_c_api as _C
from dragon.core import helper as _helper
from dragon.proto import dragon_pb2 as _proto_def
from dragon.core import proto_utils as _proto_utils
class GraphGradientMaker(object):
......@@ -62,16 +61,22 @@ class GraphGradientMaker(object):
The OpDef, outputs and defaults of ``BackwardOp``.
"""
g_ops, g_inputs, defaults = C.CreateGradientDefs(
g_ops, g_inputs, defaults = _C.CreateGradientDefs(
forward_op.SerializeToString(), g_outputs)
for idx, g_op in enumerate(g_ops):
new_def = pb.OperatorDef()
new_def = _proto_def.OperatorDef()
new_def.ParseFromString(g_op)
g_ops[idx] = new_def
return g_ops, g_inputs, defaults
@classmethod
def CheckGrad(cls, forward_op, inputs_to_grads, blacklist, targets):
def CheckGrad(
cls,
forward_op,
inputs_to_grads,
blacklist,
targets,
):
"""Check if missing Grads. If True, skip this Op.
Parameters
......@@ -91,7 +96,7 @@ class GraphGradientMaker(object):
The result of the check and the generated filling grads.
"""
if forward_op.type in C.NO_GRADIENT_OPERATORS:
if forward_op.type in _C.NO_GRADIENT_OPERATORS:
for input in forward_op.input: blacklist.add(input)
return True, None
......@@ -114,7 +119,13 @@ class GraphGradientMaker(object):
return False, gen_grads
@classmethod
def Make(cls, forward_ops, targets, input_grads=None, auto_names=True):
def Make(
cls,
forward_ops,
targets,
input_grads=None,
auto_names=True,
):
"""Make ``BackwardOps`` based on ``ForwardOps``.
Parameters
......@@ -149,7 +160,7 @@ class GraphGradientMaker(object):
# PLAY for the forward
for forward_op in forward_ops:
if forward_op.type in C.NO_GRADIENT_OPERATORS: continue
if forward_op.type in _C.NO_GRADIENT_OPERATORS: continue
outputs = [o for o in forward_op.output]
for input in forward_op.input:
if input not in outputs:
......@@ -176,14 +187,17 @@ class GraphGradientMaker(object):
op_inputs.append(item[0])
op_outputs.append(item[0] + '_grad')
values.append(defaults[item[1]])
gen_op = MakeOperatorDef('GradientGenerate', op_inputs, op_outputs, defaults=values)
gen_op.name = OperatorHelper.get_name() if auto_names else 'runtime'
gen_op = _proto_utils.MakeOperatorDef(
'GradientGenerate', op_inputs, op_outputs, defaults=values)
gen_op.name = _helper.OperatorHelper. \
get_name() if auto_names else 'runtime'
if forward_op.HasField('device_option'):
gen_op.device_option.CopyFrom(forward_op.device_option)
backward_ops.append(gen_op)
# GradientOp
for g_op in g_ops:
g_op.name = OperatorHelper.get_name() if auto_names else 'runtime'
g_op.name = _helper.OperatorHelper. \
get_name() if auto_names else 'runtime'
backward_ops.append(g_op)
# Split & Gather grads for multi-used input
......@@ -208,10 +222,12 @@ class GraphGradientMaker(object):
for idx in range(grads_count[g_output]):
if '%s_autosplit_%d' % (g_output, idx) in all_split_grads:
split_inputs.append('%s_autosplit_%d' % (g_output, idx))
gather_op = MakeOperatorDef('GradientGather', split_inputs, [g_output])
gather_op = _proto_utils.MakeOperatorDef(
'GradientGather', split_inputs, [g_output])
if g_op.HasField('device_option'):
gather_op.device_option.CopyFrom(g_op.device_option)
gather_op.name = OperatorHelper.get_name() if auto_names else 'runtime'
gather_op.name = _helper.OperatorHelper. \
get_name() if auto_names else 'runtime'
backward_ops.append(gather_op)
g_op.output[g_output_idx] = split_name
......
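CreateGradientDefs is the C++ hook the maker relies on above. A hedged sketch of calling it directly for a single forward op, mirroring the parsing loop in this hunk; the op type and tensor names are illustrative.

    # Hedged sketch: generating gradient defs for one forward op, as the maker does.
    from dragon import import_c_api as _C
    from dragon.core import proto_utils as _proto_utils
    from dragon.proto import dragon_pb2 as _proto_def

    fwd = _proto_utils.MakeOperatorDef('Add', ['a', 'b'], ['c'], name='runtime')
    g_ops, g_inputs, defaults = _C.CreateGradientDefs(
        fwd.SerializeToString(), ['c_grad'])   # grads of the forward outputs
    for i, g_op in enumerate(g_ops):           # each def comes back serialized
        new_def = _proto_def.OperatorDef()
        new_def.ParseFromString(g_op)
        g_ops[i] = new_def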
......@@ -17,7 +17,8 @@ from __future__ import print_function
import math
import numpy
import dragon
from dragon.core import workspace as _workspace
class OperatorHelper(object):
......@@ -39,11 +40,11 @@ class OperatorHelper(object):
@classmethod
def get_index_and_name(cls, prefix='Op'):
name = dragon.workspace.GetDummyName(prefix, domain='Operator')
name = _workspace.GetDummyName(prefix, domain='Operator')
try:
_, op_idx = name.split('_')
except:
name = dragon.workspace.GetDummyName(prefix, domain='Operator')
name = _workspace.GetDummyName(prefix, domain='Operator')
_, op_idx = name.split('_')
return int(op_idx), name
......
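get_index_and_name above leans on GetDummyName returning names of the form '&lt;prefix&gt;_&lt;index&gt;'. A tiny sketch of that contract; the exact index returned depends on the workspace state.

    # Sketch: the naming contract get_index_and_name depends on.
    from dragon.core import workspace as _workspace

    name = _workspace.GetDummyName('Op', domain='Operator')  # e.g. 'Op_1'
    _, op_idx = name.split('_')
    print(int(op_idx), name)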
......@@ -15,7 +15,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon.import_c_api as _C
from dragon import import_c_api as _C
_GLOBAL_MPI_IS_INIT = False
......
......@@ -9,7 +9,7 @@
#
# ------------------------------------------------------------
"""Define some helpful protobuf makers here."""
"""Define some helpful protocol buffer makers here."""
from __future__ import absolute_import
from __future__ import division
......@@ -17,28 +17,28 @@ from __future__ import print_function
import sys
import copy
import numpy as np
from google.protobuf.message import Message
import numpy
import dragon.config as cfg
import dragon.import_c_api as _C
from dragon.proto import dragon_pb2 as pb
from dragon.core.scope import get_default_device
from dragon import config as _cfg
from dragon import import_c_api as _C
from dragon.core import scope as _scope
from dragon.proto import dragon_pb2 as _proto_def
from google.protobuf.message import Message as _Message
if sys.version_info >= (3,0):
def MakeArgument(key, value):
argument = pb.Argument()
argument = _proto_def.Argument()
argument.name = key
if type(value) is float: argument.f = value
elif type(value) in (bool, int, np.int64) : argument.i = value
elif type(value) in (bool, int, numpy.int64) : argument.i = value
elif type(value) is bytes: argument.s = value
elif type(value) is str: argument.s = str.encode(value)
elif isinstance(value, Message): argument.s = value.SerializeToString()
elif isinstance(value, _Message): argument.s = value.SerializeToString()
elif all(type(v) is float for v in value): argument.floats.extend(value)
elif all(type(v) is int for v in value): argument.ints.extend(value)
elif all(type(v) is str for v in value): argument.strings.extend([str.encode(v) for v in value])
elif all(isinstance(v, Message) for v in value):
elif all(isinstance(v, _Message) for v in value):
argument.strings.extend([v.SerializeToString() for v in value])
else:
raise ValueError(
......@@ -47,20 +47,20 @@ if sys.version_info >= (3,0):
return argument
else:
def MakeArgument(key, value):
argument = pb.Argument()
argument = _proto_def.Argument()
argument.name = key
if type(value) is float: argument.f = value
elif type(value) in (bool, int, long, np.int64) : argument.i = value
elif type(value) in (bool, int, long, numpy.int64) : argument.i = value
elif type(value) is str: argument.s = value
elif type(value) is unicode: argument.s = str(value)
elif isinstance(value, Message): argument.s = value.SerializeToString()
elif isinstance(value, _Message): argument.s = value.SerializeToString()
elif all(type(v) is float for v in value): argument.floats.extend(value)
elif all(type(v) is int for v in value): argument.ints.extend(value)
elif all(type(v) is long for v in value): argument.ints.extend(value)
elif all(type(v) is str for v in value): argument.strings.extend(value)
elif all(type(v) is unicode for v in value):
argument.strings.extend([str(v) for v in value])
elif all(isinstance(v, Message) for v in value):
elif all(isinstance(v, _Message) for v in value):
argument.strings.extend([v.SerializeToString() for v in value])
else:
raise ValueError(
......@@ -70,10 +70,16 @@ else:
def MakeOperatorDef(
op_type, inputs=(), outputs=(),
name='', uid=None, device_option=None,
arg=None, **kwargs):
operator = pb.OperatorDef()
op_type,
inputs=(),
outputs=(),
name='',
uid=None,
device_option=None,
arg=None,
**kwargs
):
operator = _proto_def.OperatorDef()
operator.type = op_type
operator.name = name
operator.input.extend([str(tensor) for tensor in inputs])
......@@ -92,9 +98,15 @@ def MakeOperatorDef(
def MakeCXXOperatorDef(
op_type, inputs=(), outputs=(),
name='', uid=None, device_option=None,
arg=None, **kwargs):
op_type,
inputs=(),
outputs=(),
name='',
uid=None,
device_option=None,
arg=None,
**kwargs
):
c_def = _C.OperatorDef()
py_def = MakeOperatorDef(
op_type, inputs, outputs, name, uid,
......@@ -104,7 +116,7 @@ def MakeCXXOperatorDef(
def MakeDeviceOption(device_type, device_id, rng_seed=None):
option = pb.DeviceOption()
option = _proto_def.DeviceOption()
option.device_type = device_type
option.device_id = device_id
if rng_seed is not None: option.random_seed = rng_seed
......@@ -133,7 +145,7 @@ def GetDeviceOption(device_type, device_id=0, rng_seed=None):
def GetDefaultDeviceOption():
device_info = get_default_device()
device_info = _scope.get_default_device()
if device_info is not None:
return GetDeviceOption(
device_info['device_type'],
......@@ -142,10 +154,10 @@ def GetDefaultDeviceOption():
def GetGlobalDeviceOption():
option = cfg.GetGlobalOptions()
options = _cfg.GetGlobalOptions()
return GetDeviceOption(
option['device'],
option['device_id'])
options['device'],
options['device_id'])
# Fix the python stdout
......@@ -159,6 +171,5 @@ class Unbuffered(object):
return getattr(self.stream, attr)
# Clear the stdout buffer for mpi(C++ && Python)
import sys
# Clear the stdout buffer for mpi
sys.stdout = Unbuffered(sys.stdout)
\ No newline at end of file
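A hedged sketch of the def makers in this file; 'Relu' and the tensor names are illustrative, and GetDeviceOption('cpu') follows the usage shown elsewhere in this diff.

    # Hedged sketch: building an OperatorDef with the helpers above.
    from dragon.core import proto_utils as _proto_utils

    dev = _proto_utils.GetDeviceOption('cpu')
    op = _proto_utils.MakeOperatorDef(
        'Relu', inputs=['x'], outputs=['y'],
        name='runtime', device_option=dev)
    print(op.type, list(op.input), list(op.output))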
......@@ -13,92 +13,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import threading
import dragon.import_c_api as _C
from contextlib import contextmanager
__all__ = [
'name_scope',
'phase_scope',
'device_scope',
'get_default_phase',
'get_default_device',
'get_default_name_scope',
'WorkspaceScope',
]
class _ThreadLocalStack(threading.local):
def __init__(self):
super(_ThreadLocalStack, self).__init__()
self._enforce_nesting = True
self.stack = []
def get_default(self):
return self.stack[-1] if len(self.stack) >= 1 else None
def is_cleared(self):
return not self.stack
@property
def enforce_nesting(self):
return self._enforce_nesting
@enforce_nesting.setter
def enforce_nesting(self, value):
self._enforce_nesting = value
@contextmanager
def get_controller(self, default):
"""A context manager for manipulating a default stack."""
self.stack.append(default)
try:
yield default
finally:
# stack may be empty if reset() was called
if self.stack:
if self._enforce_nesting:
if self.stack[-1] is not default:
raise AssertionError(
"Nesting violated for default stack of %s objects" %
type(default))
self.stack.pop()
else:
self.stack.remove(default)
class WorkspaceScope(object):
"""WorkspaceScope is a auxiliary to assign the specific workspace.
Examples
--------
>>> import dragon as dg
>>> with WorkspaceScope('session1'): pass
>>> with dg.ws_scope('session2'): pass
"""
def __init__(self, ws_name):
assert isinstance(ws_name, type('str')), \
'WorkspaceScope takes in a string as its argument.'
assert ws_name != '', \
'The workspace name should not be empty.'
self.ws = ws_name
self.prev = 'default'
def __enter__(self):
self.prev = _C.CurrentWorkspace()
_C.SwitchWorkspace(self.ws, True)
def __exit__(self, type, value, traceback):
_C.SwitchWorkspace(self.prev, True)
_GLOBAL_TENSOR_STACK = _ThreadLocalStack()
_GLOBAL_PHASE_STACK = _ThreadLocalStack()
_GLOBAL_DEVICE_STACK = _ThreadLocalStack()
_PREDEFINED_SCOPE_SEPARATOR = '/'
from dragon.core import tls as _tls
def name_scope(name):
......@@ -140,7 +55,7 @@ def device_scope(device_type, device_id=0):
"""
device_type, device_id = device_type.lower(), device_id
assert device_type in ['cpu', 'gpu', 'cuda', 'cnml']
assert device_type in ('cpu', 'gpu', 'cuda', 'cnml')
# Default names
if device_type == 'gpu': device_type = 'cuda'
return _GLOBAL_DEVICE_STACK.get_controller({
......@@ -213,3 +128,9 @@ def get_default_device():
"""
return _GLOBAL_DEVICE_STACK.get_default()
_GLOBAL_TENSOR_STACK = _tls.Stack()
_GLOBAL_PHASE_STACK = _tls.Stack()
_GLOBAL_DEVICE_STACK = _tls.Stack()
_PREDEFINED_SCOPE_SEPARATOR = '/'
\ No newline at end of file
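The scopes kept by this file are now plain thread-local stacks from dragon.core.tls. A hedged sketch of the two most common scopes; the exact return values are assumptions based on how they are consumed elsewhere in this diff.

    # Hedged sketch of the scope helpers retained above.
    from dragon.core import scope as _scope

    with _scope.name_scope('block1'):
        print(_scope.get_default_name_scope())     # e.g. 'block1/'
    with _scope.device_scope('gpu', device_id=0):  # 'gpu' is normalized to 'cuda'
        print(_scope.get_default_device())         # e.g. {'device_type': 'cuda', 'device_id': 0}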
......@@ -22,14 +22,13 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import numpy
import dragon.core.workspace as ws
import dragon.proto.dragon_pb2 as pb
from dragon.core.proto_utils import MakeOperatorDef, GetDefaultDeviceOption
from dragon.core.scope import get_default_name_scope
from dragon.core.helper import OperatorHelper, GradientHelper
from dragon.core import scope as _scope
from dragon.core import helper as _helper
from dragon.core import workspace as _workspace
from dragon.proto import dragon_pb2 as _proto_def
from dragon.core import proto_utils as _proto_utils
class Tensor(object):
......@@ -59,7 +58,7 @@ class Tensor(object):
"""
self.name, self.shape, self.dtype = name, shape, dtype
self.gradient = GradientHelper(self)
self.gradient = _helper.GradientHelper(self)
##############################################
# #
......@@ -258,8 +257,8 @@ class Tensor(object):
@name.setter
def name(self, value):
if value != '':
self._name = ws.GetDummyName(
get_default_name_scope() + value
self._name = _workspace.GetDummyName(
_scope.get_default_name_scope() + value
if value else 'Tensor', domain='Tensor')
else:
# Set it manually for some cases
......@@ -506,15 +505,15 @@ class Tensor(object):
existing_outputs=[self], starts=starts, sizes=sizes)
def _from_constants(self, value):
if not isinstance(value, np.ndarray):
if not isinstance(value, numpy.ndarray):
try:
value = np.array(value, dtype=self.dtype
value = numpy.array(value, dtype=self.dtype
if self.dtype else 'float32')
except:
raise TypeError(
'Can not convert the value to Tensor or numpy array.')
ref_tensor = Tensor.Ref(
name=ws.GetDummyName('Constant',
name=_workspace.GetDummyName('Constant',
domain='Tensor', zero_based=False),
shape=list(value.shape), dtype=str(value.dtype))
ref_tensor.set_value(value)
......@@ -798,14 +797,16 @@ class Tensor(object):
Returns
-------
None
Tensor
The self.
See Also
--------
`workspace.FeedTensor(*args, **kwargs)`_ - How to feed a Tensor.
"""
ws.FeedTensor(self, new_value)
_workspace.FeedTensor(self, new_value)
return self
def get_value(self):
"""Fetch the values from C++ backend. [**Theano Style**]
......@@ -820,7 +821,7 @@ class Tensor(object):
`workspace.FetchTensor(*args, **kwargs)`_ - How to fetch a Tensor.
"""
return ws.FetchTensor(self)
return _workspace.FetchTensor(self)
def copy(self):
"""Return a Tensor with same content. [**Theano Style**]
......@@ -835,7 +836,7 @@ class Tensor(object):
`ops.Copy(*args, **kwargs)`_ - How to copy A to B.
"""
new_tensor = Tensor(self.name + '_copy')
new_tensor = Tensor.Ref(self.name + '_copy')
arguments = {'inputs': self, 'existing_outputs': new_tensor}
return self.CreateOperator('Copy', **arguments)
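set_value() now returns the tensor itself, so a feed can be chained straight into a fetch. A hedged sketch; the tensor name and values are illustrative and a default workspace is assumed to exist.

    # Hedged sketch: chaining set_value() into get_value() after this change.
    import numpy
    from dragon.core.tensor import Tensor

    t = Tensor.Ref('t', shape=[3], dtype='float32')
    values = t.set_value(numpy.ones(3, 'float32')).get_value()
    print(values)    # [1. 1. 1.]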
......@@ -906,7 +907,7 @@ class Tensor(object):
if self.shape is not None:
output.shape = input_shape[:]
output.shape.insert(axis, np.long(1))
output.shape.insert(axis, 1)
return output
......@@ -924,17 +925,8 @@ class Tensor(object):
TensorShape
The shape description.
Examples
--------
>>> a = Tensor(shape=[1, 2, 3, 4])
>>> print a.get_shape()
>>> TensorShape([Dimension(1), Dimension(2), Dimension(3), Dimension(4)])
>>> print a.get_shape().as_list()
>>> [1, 2, 3, 4]
"""
raise NotImplementedError('Implemented in <vm.tensorflow.framework.tensor_shape>')
raise NotImplementedError('')
def eval(self, feed_dict=None):
"""Run and return the computing results of this tensor.
......@@ -950,7 +942,7 @@ class Tensor(object):
The values of this tensor in the backend.
"""
raise NotImplementedError('Try "import dragon.vm.tensorflow" to load this dynamic methods.')
raise NotImplementedError('')
############################################
# #
......@@ -984,26 +976,32 @@ class Tensor(object):
return ref_tensor
@classmethod
def CreateOperator(cls, op_type, inputs,
num_outputs=1, existing_outputs=None,
extra_inputs=None, name=None, **kwargs):
def CreateOperator(
cls,
op_type,
inputs,
num_outputs=1,
existing_outputs=None,
extra_inputs=None,
name=None,
**kwargs
):
"""Construct a new Tensor with specific operator descriptor.
Parameters
----------
inputs : list of Tensor or Tensor
The inputs for this operator.
op_type : str
The operator type.
num_outputs : int, optional
The type of operator.
inputs : sequence of Tensor
The inputs for this operator.
num_outputs : int, optional, default=1
The number of outputs to return.
Discarded if ``existing_outputs`` is not None.
existing_outputs : sequence of Tensor, optional
The existing outputs for this operator.
extra_inputs : sequence of Tensor, optional
The inputs that should be attached to solving targets, e.g. dynamic shape.
The inputs that should be attached to solving targets.
name : str, optional
The optional name to use. ``Op_xxx`` will be used automatically if it is None.
The optional name.
Returns
-------
......@@ -1049,10 +1047,10 @@ class Tensor(object):
# 2. Generate outputs
outputs = []
if existing_outputs is None:
name_scope = get_default_name_scope()
name_scope = _scope.get_default_name_scope()
for idx in range(num_outputs):
outputs.append(Tensor.Ref(
ws.GetDummyName(name_scope +
_workspace.GetDummyName(name_scope +
(name if name else op_type),
suffix=':{}'.format(idx),
domain='Tensor')))
......@@ -1066,11 +1064,10 @@ class Tensor(object):
# 3. Construct OperatorDef
inputs_name = [input.name for input in inputs]
outputs_name = [output.name for output in outputs]
op_idx, op_name = OperatorHelper.get_index_and_name()
device_option = GetDefaultDeviceOption()
op_idx, op_name = _helper.OperatorHelper.get_index_and_name()
device_option = _proto_utils.GetDefaultDeviceOption()
op_def = MakeOperatorDef(op_type,
op_def = _proto_utils.MakeOperatorDef(op_type,
inputs_name, outputs_name, op_name,
device_option=device_option, **kwargs)
......@@ -1089,7 +1086,7 @@ class Tensor(object):
output.extra_targets.add(input.name)
# 5. Refine the shape and data type
outputs = OperatorHelper.apply(op_type,
outputs = _helper.OperatorHelper.apply(op_type,
arguments=kwargs, inputs=inputs, outputs=outputs)
# 6. Returns
......@@ -1097,42 +1094,6 @@ class Tensor(object):
elif num_outputs == 1: return outputs[0]
else: return None
@classmethod
def Convert(cls, value, dtype='float32'):
"""Convert the given value to a tensor.
Parameters
----------
value : number or Tensor
The value to convert.
dtype : str, optional, default='float32'
The data type of the tensor.
Returns
-------
Tensor
The tensor converted with given value.
"""
if isinstance(value, Tensor): return value
else:
if not isinstance(value, np.ndarray):
try:
if dtype:
value = np.array(value, dtype=dtype)
else:
value = np.array(value)
except:
raise TypeError('{} value can not be '
'converted to Tensor.'.format(
type(value).__name__))
ref_tensor = Tensor.Ref(
name=ws.GetDummyName('Constant',
domain='Tensor', zero_based=False),
shape=list(value.shape), dtype=str(value.dtype))
ref_tensor.set_value(value)
return ref_tensor
def Fill(self, type, **kwargs):
"""Fill self with the specific type of filler.
......@@ -1147,11 +1108,12 @@ class Tensor(object):
Self, with filler registered implicitly in the backend.
"""
filler = pb.TensorFillerProto()
filler = _proto_def.TensorFillerProto()
filler.tensor = self.name
filler.type = type.lower()
if filler.type in ['placeholder', 'variable']: pass
if filler.type in ['placeholder', 'variable']:
pass
elif filler.type == 'constant':
filler.value = kwargs['value'] if 'value' in kwargs else 0
elif filler.type in ['normal', 'gaussian']:
......@@ -1180,39 +1142,5 @@ class Tensor(object):
else:
raise ValueError('Unknown filler type: {}'.format(filler.type))
ws.CreateFiller(filler)
_workspace.CreateFiller(filler)
return self
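For reference, a short sketch of the ``constant`` filler handled above (the shape and value are placeholders; the filler is only registered, not computed, until the backend runs):
>>> bias = Tensor(shape=[32]).Fill('constant', value=0.)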
\ No newline at end of file
def debug_expressions(self):
"""Return the internal expressions for displaying.
Returns
-------
str
The internal expressions.
"""
external_inputs = set()
outputs = set()
ordered_exprs = sorted(self.expressions.items(), key=lambda d: d[0])
buffer0 = '-------------------Expressions-------------------\n'
buffer1 = ''; buffer2 = 'Inputs: ['
for k, v in ordered_exprs:
buffer1 = buffer1 + '>>> ' + str(k).zfill(3) + '. ('
for input in v.input:
if input not in outputs:
external_inputs.add(input)
buffer1 = buffer1 + input + ', '
buffer1 = buffer1 + 'None, ' if len(v.input) == 0 else buffer1
buffer1 = buffer1[0:-2] + ') -> ' + v.type + ' -> ('
for output in v.output:
outputs.add(output)
buffer1 = buffer1 + output + ', '
buffer1 = buffer1[0:-2] + ') \n'
buffer1 = buffer1 + 'Target: ' + self._name + '\n'
for ex_input in external_inputs:
buffer2 = buffer2 + ex_input + ', '
buffer2 = buffer2 + ']\n'
return buffer0 + buffer2 + buffer1 + buffer0
\ No newline at end of file
......@@ -16,10 +16,10 @@ from __future__ import division
from __future__ import print_function
import numpy
import dragon
from dragon.core.tensor import Tensor
from dragon.core.proto_utils import GetDeviceOption
from dragon.core import workspace as _workspace
from dragon.core import proto_utils as _proto_utils
from dragon.core.tensor import Tensor as _Tensor
def FromShape(shape, dtype='float32', name=None):
......@@ -47,9 +47,8 @@ def FromShape(shape, dtype='float32', name=None):
tensor.shape = list(shape)
if not isinstance(shape, (tuple, list)):
raise TypeError('The shape should be a tuple or list.')
dragon.C.TensorFromShape(
_stringify_tensor(tensor),
list(shape), dtype)
_get_workspace().TensorFromShape(
_stringify_tensor(tensor), list(shape), dtype)
return tensor
......@@ -70,7 +69,8 @@ def SetShape(tensor, shape, dtype='float32'):
None
"""
dragon.C.TensorFromShape(_stringify_tensor(tensor), shape, dtype)
_get_workspace().TensorFromShape(
_stringify_tensor(tensor), shape, dtype)
def FromTensor(src, src_ctx=None, name=None, ctx=None):
......@@ -97,15 +97,17 @@ def FromTensor(src, src_ctx=None, name=None, ctx=None):
"""
tensor = _try_get_tensor(name)
if src_ctx is None: src_ctx = GetDeviceOption('cpu')
if ctx is None: ctx = GetDeviceOption('cpu')
dragon.C.TensorFromTensor(
_stringify_tensor(tensor), _stringify_tensor(src),
_stringify_proto(ctx), _stringify_proto(src_ctx))
if src_ctx is None: src_ctx = _proto_utils.GetDeviceOption('cpu')
if ctx is None: ctx = _proto_utils.GetDeviceOption('cpu')
_get_workspace().TensorFromTensor(
_stringify_tensor(tensor),
_stringify_tensor(src),
_stringify_proto(ctx),
_stringify_proto(src_ctx))
return tensor
def FromPyArray(array, name=None):
def FromArray(array, name=None):
"""Create a Tensor from a existing Array.
Note that memory of Tensor are ``zero-copied``.
......@@ -128,12 +130,13 @@ def FromPyArray(array, name=None):
"""
tensor = _try_get_tensor(name)
if not isinstance(array, numpy.ndarray):
raise TypeError('The given nd-array should be numpy.ndarray.')
dragon.C.TensorFromPyArray(_stringify_tensor(tensor), array)
raise TypeError('Expected a numpy.ndarray.')
_get_workspace().TensorFromArray(
_stringify_tensor(tensor), array)
return tensor
def SetPyArray(tensor, array):
def SetArray(tensor, array):
"""Set a Tensor from a existing Array.
Note that memory of Tensor are ``zero-copied``.
......@@ -149,15 +152,12 @@ def SetPyArray(tensor, array):
-------
None
References
----------
The wrapper of ``TensorFromPyArrayCC``.
"""
dragon.C.TensorFromPyArray(_stringify_tensor(tensor), array)
_get_workspace().TensorFromArray(
_stringify_tensor(tensor), array)
def ToPyArray(tensor, readonly=False):
def ToArray(tensor, readonly=False):
"""Create a Array from a existing Tensor.
Note that memory of Array are *zero-copied*.
......@@ -175,7 +175,8 @@ def ToPyArray(tensor, readonly=False):
The array sharing the memory with the original tensor.
"""
return dragon.C.TensorToPyArray(_stringify_tensor(tensor), readonly)
return _get_workspace().TensorToArray(
_stringify_tensor(tensor), readonly)
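A small zero-copy sketch, assuming the default workspace is active (the tensor name 'x' is a placeholder):
>>> import numpy
>>> x = FromArray(numpy.ones((2, 3), 'float32'), name='x')
>>> array = ToArray(x)
>>> array[0, 0] = 5.  # the change is visible from the backend, since the memory is shared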
def GetStorage(tensor):
......@@ -193,8 +194,8 @@ def GetStorage(tensor):
"""
tensor = _stringify_tensor(tensor)
if not dragon.workspace.HasTensor(tensor): return None
return dragon.C.GetTensor(tensor)
if not _get_workspace().HasTensor(tensor): return None
return _get_workspace().GetTensor(tensor)
def _stringify_proto(obj):
......@@ -210,5 +211,10 @@ def _stringify_tensor(obj):
def _try_get_tensor(name=None):
"""Try to create or get a tensor"""
if name is None or name == '': return Tensor()
else: return Tensor.Ref(name)
\ No newline at end of file
if name is None or name == '': return _Tensor()
else: return _Tensor.Ref(name)
def _get_workspace():
"""Get the current default workspace."""
return _workspace.get_default_workspace()
\ No newline at end of file
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Define the common thread local structures."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import threading
import contextlib
class Constant(threading.local):
def __init__(self, **attrs):
super(Constant, self).__init__()
self.__dict__.update(attrs)
class Stack(threading.local):
def __init__(self):
super(Stack, self).__init__()
self._enforce_nesting = True
self.stack = []
def get_default(self):
return self.stack[-1] if len(self.stack) >= 1 else None
def reset(self):
self.stack = []
def is_cleared(self):
return not self.stack
@property
def enforce_nesting(self):
return self._enforce_nesting
@enforce_nesting.setter
def enforce_nesting(self, value):
self._enforce_nesting = value
@contextlib.contextmanager
def get_controller(self, default):
"""A context manager for manipulating a default stack."""
self.stack.append(default)
try:
yield default
finally:
# stack may be empty if reset() was called
if self.stack:
if self._enforce_nesting:
if self.stack[-1] is not default:
raise AssertionError(
"Nesting violated for default stack of %s objects" %
type(default))
self.stack.pop()
else:
self.stack.remove(default)
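A minimal sketch of how the controller is intended to be used (the default object here is an arbitrary placeholder):
>>> stack = Stack()
>>> with stack.get_controller('my-default') as default:
...     assert stack.get_default() is default
>>> assert stack.get_default() is None  # popped once the context exits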
\ No newline at end of file
......@@ -9,14 +9,10 @@
#
# ------------------------------------------------------------
"""A Wrapper for the C++ backend Workspace.
"""Wrappers for the Workspace of C++ backend.
Note that a default workspace is switched globally,
so these C++ calls are safe and deterministic.
See the documentation to learn how to switch between workspaces:
<http://dragon.seetatech.com/api/python/contents/core/workspace.html>
A flexible API is provided to manage the global resources
across Python threads (quite different from C++).
"""
......@@ -25,112 +21,219 @@ from __future__ import division
from __future__ import print_function
import os
import re
import numpy
import threading
import contextlib
import six.moves.cPickle as pickle
from collections import defaultdict, deque
import dragon.import_c_api as _C
import dragon.core.logging as logging
import dragon.proto.dragon_pb2 as pb
from dragon import config as _cfg
from dragon import import_c_api as _C
from dragon.core import tls as _tls
from dragon.core import mpi as _mpi
from dragon.core import logging as _logging
from dragon.core import mapping as _mapping
from dragon.proto import dragon_pb2 as _proto_def
from dragon.core import proto_utils as _proto_utils
from dragon.config import GetGlobalOptions
from dragon.core import mpi, mapping, proto_utils
class TensorPool(object):
"""We apply the TensorPool to manage the reused tensors.
def CurrentWorkspace():
"""Return the current active workspace.
Tensors with the same scope in the pool will be reused in turn,
which speeds up the whole system by reducing unnecessary deconstruction.
Returns
-------
str
The workspace name.
Heuristically, we use 5 pools with different scopes:
* scope(Leaf): A pool to reuse leaf tensors.
* scope(NumPy): A pool to reuse leaf tensors from numpy.
* scope(Join): A pool to reuse RT(runtime) tensors required by forward-backward.
* scope(Detach): A pool to reuse RT(runtime) tensors required by forward only.
* scope(Reference): A pool to reuse reshaped tensors (sharing contents).
"""
return _C.CurrentWorkspace()
def __init__(self):
# deque provides much higher performance than Queue
self._scope2keys = defaultdict(deque)
def get(self, scope='${DETACH}'):
try:
return self._scope2keys[scope].popleft()
except IndexError:
self._scope2keys[scope].append(
GetDummyName(
'${POOL}/%s/Tensor' % scope,
domain='Tensor', zero_based=False))
return self._scope2keys[scope].popleft()
def SwitchWorkspace(workspace_name, create_if_missing=True):
"""Switch to the specific workspace.
def put(self, name):
if '${POOL}' in name:
scope, _ = name[8:].split('/')
self._scope2keys[scope].append(name)
return True
else: return False
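A small sketch of the get/put cycle (the exact generated name depends on the dummy-name counter and is illustrative only):
>>> pool = TensorPool()
>>> name = pool.get(scope='Detach')  # e.g. '${POOL}/Detach/Tensor_1'
>>> pool.put(name)                   # recycled instead of deconstructed
>>> assert pool.get(scope='Detach') == name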
Parameters
----------
workspace_name : str
The name of the specific workspace.
create_if_missing : boolean
Whether to create the specific workspace if it does not exist.
Returns
-------
None
class OperatorPool(object):
"""Operators whose gradients is required will hold a resource handle,
which is also called ``Anchor`` in the backend.
We apply this pool to collect the handles according to the type of operator,
as the mem size of temporal resources varies greatly.
The resource handle will be released after the gradient flow automatically.
"""
def __init__(self):
# deque provides much higher performance than Queue
self._type2keys = defaultdict(deque)
def get(self, op_type):
try:
return self._type2keys[op_type].popleft()
except IndexError:
self._type2keys[op_type].append(
GetDummyName(
'${POOL}/%s' % op_type,
domain='Operator', zero_based=False))
return self._type2keys[op_type].popleft()
def put(self, op_name):
op_type, _ = op_name[8:].split('_')
self._type2keys[op_type].append(op_name)
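A similar sketch for operator handles (the generated handle format is illustrative):
>>> pool = OperatorPool()
>>> handle = pool.get('Conv2d')  # e.g. '${POOL}/Conv2d_1'
>>> pool.put(handle)             # returned once the gradient flow has run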
class Workspace(_C.Workspace):
"""A wrapper for the C implemented workspace.
This class is a fusion of *Workspace*, *Pool* and *tf.Graph*.
We find that they work in a similar way while being named differently.
"""
if workspace_name == '':
raise ValueError('The workspace name should not be empty.')
_C.SwitchWorkspace(workspace_name, create_if_missing)
def __init__(self, name=''):
super(Workspace, self).__init__(name)
self._ref_objects = []
self._collections = {}
self.tensor_pool = TensorPool()
self.operator_pool = OperatorPool()
def get_collection_ref(self, name):
coll_list = self._collections.get(name, None)
if coll_list is None:
coll_list = []
self._collections[name] = coll_list
return coll_list
def get_collection(self, name, scope=None):
coll_list = self._collections.get(name, None)
if coll_list is None:
return []
if scope is None:
return list(coll_list)
else:
filter_coll_list = []
regex = re.compile(scope)
for item in coll_list:
if hasattr(item, "name") and regex.match(item.name):
filter_coll_list.append(item)
return filter_coll_list
def add_to_collection(self, name, value):
if name not in self._collections:
self._collections[name] = [value]
else:
self._collections[name].append(value)
def add_to_collections(self, names, value):
for name in names:
self.add_to_collection(name, value)
def merge_from(self, other):
"""Merge a external workspace into ``self``.
def MoveWorkspace(target_ws, source_ws):
"""Move the source workspace into the target workspace.
The ``other`` will not be reset until ``self`` is reset.
Take care when associating the two workspaces.
Parameters
----------
target_ws : str
The name of the target workspace.
source_ws : str
The name of the source workspace.
other : Workspace
The given external workspace.
Returns
-------
None
Workspace
The ``self``.
"""
if target_ws == '' or source_ws == '':
raise ValueError('The target or source name can not be empty.')
_C.MoveWorkspace(target_ws, source_ws)
self.MergeFrom(other)
self._ref_objects.append(other)
return self
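A minimal sketch (the workspace names are placeholders):
>>> ws_a, ws_b = Workspace('A'), Workspace('B')
>>> ws_a.merge_from(ws_b)  # tensors stored in B are now visible from A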
def as_default(self):
"""Switch ``self`` as the default workspace.
def ResetWorkspace(workspace_name=''):
"""Reset the specific workspace.
Call this method with the *with* keyword.
Remove all resources of given workspace.
Once the *with* block exits, the previous default will be restored.
If workspace name is empty, the current workspace will be modified.
Returns
-------
Workspace
The ``self``.
Parameters
----------
workspace_name : str
The name of the specific workspace.
"""
return _GLOBAL_DEFAULT_WORKSPACE_STACK.get_controller(self)
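A minimal sketch (the workspace name is a placeholder):
>>> ws = Workspace('my-workspace')
>>> with ws.as_default():
...     assert get_default_workspace() is ws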
def clear(self):
"""Remove all the tensors.
Optionally call this method to free the memory.
Returns
-------
None
"""
_C.ResetWorkspace(workspace_name)
self.Clear()
def get_default_workspace():
"""Return the current default workspace.
Returns
-------
Workspace
The default workspace.
def ClearWorkspace(workspace_name=''):
"""Clear the specific workspace.
"""
return _GLOBAL_DEFAULT_WORKSPACE_STACK.get_default()
You may need to clear the workspace when sharing grads.
If workspace name is empty, the current workspace will be modified.
def reset_default_workspace():
"""Reset the global default workspace.
Parameters
----------
workspace_name : str
The name of the specific workspace.
Do not call this method to reset any instances.
Returns
-------
None
"""
_C.ClearWorkspace(workspace_name)
if not _GLOBAL_DEFAULT_WORKSPACE_STACK.is_cleared():
raise AssertionError(
"Do not use reset_default_workspace() to clear "
"nested workspaces.\nIf you need a cleared workspace, "
"exit the nesting and create a new workspace.")
_GLOBAL_DEFAULT_WORKSPACE_STACK.reset()
def CreateGraph(graph_def):
"""Create the graph in the VM backend.
"""Create the graph in current workspace.
Parameters
----------
......@@ -143,17 +246,16 @@ def CreateGraph(graph_def):
The graph name to run.
"""
option = GetGlobalOptions()
LogMetaGraph(graph_def)
ExportMetaGraph(graph_def)
return _C.CreateGraph(
options = _cfg.GetGlobalOptions()
return get_default_workspace().CreateGraph(
_stringify_proto(graph_def),
option['log_optimized_graph'],
)
options['log_optimized_graph'])
def RunOperator(op_def, verbose=False):
"""Run the operator in the VM backend.
"""Run the operator.
Parameters
----------
......@@ -167,9 +269,9 @@ def RunOperator(op_def, verbose=False):
None
"""
if isinstance(op_def, pb.OperatorDef):
if isinstance(op_def, _proto_def.OperatorDef):
op_def = op_def.SerializeToString()
_C.RunOperator(op_def, verbose)
get_default_workspace().RunOperator(op_def, verbose)
def HasTensor(tensor):
......@@ -186,7 +288,8 @@ def HasTensor(tensor):
The query result.
"""
return _C.HasTensor(_stringify_tensor(tensor))
tensor = _stringify_tensor(tensor)
return get_default_workspace().HasTensor(tensor)
def CreateTensor(tensor):
......@@ -202,7 +305,8 @@ def CreateTensor(tensor):
None
"""
return _C.CreateTensor(_stringify_tensor(tensor))
tensor = _stringify_tensor(tensor)
get_default_workspace().CreateTensor(tensor)
def CreateFiller(filler_def):
......@@ -225,7 +329,7 @@ def CreateFiller(filler_def):
"""
filler_def = filler_def if isinstance(filler_def, str) \
else filler_def.SerializePartialToString()
_C.CreateFiller(filler_def)
get_default_workspace().CreateFiller(filler_def)
def GetFillerType(tensor):
......@@ -246,7 +350,8 @@ def GetFillerType(tensor):
The filler type.
"""
return _C.GetFillerType(_stringify_tensor(tensor))
tensor = _stringify_tensor(tensor)
return get_default_workspace().GetFillerType(tensor)
def GetTensorName(tensor):
......@@ -267,7 +372,8 @@ def GetTensorName(tensor):
The query result may be different from the one used in the frontend.
"""
return _C.GetTensorName(_stringify_tensor(tensor))
tensor = _stringify_tensor(tensor)
return get_default_workspace().GetTensorName(tensor)
def SetTensorAlias(tensor, alias):
......@@ -285,7 +391,8 @@ def SetTensorAlias(tensor, alias):
None
"""
return _C.SetTensorAlias(_stringify_tensor(tensor), alias)
tensor = _stringify_tensor(tensor)
get_default_workspace().SetTensorAlias(tensor, alias)
def FetchTensor(tensor):
......@@ -302,10 +409,16 @@ def FetchTensor(tensor):
The values copied from the backend.
"""
return _C.FetchTensor(_stringify_tensor(tensor))
tensor = _stringify_tensor(tensor)
return get_default_workspace().FetchTensor(tensor)
def FeedTensor(tensor, array, force_cpu=False, dtype=None):
def FeedTensor(
tensor,
array,
force_cpu=False,
dtype=None,
):
"""Feed the values to the given tensor.
Parameters
......@@ -314,10 +427,10 @@ def FeedTensor(tensor, array, force_cpu=False, dtype=None):
The tensor to feed.
array : number, list, tuple, or numpy.ndarray
The values to feed.
force_cpu : boolean
force_cpu : boolean, optional, default=False
Whether force to feed to cpu context.
dtype : str
The data type. If ``None``, ``float32`` will be used instead.
dtype : str, optional
The optional data type.
Returns
-------
......@@ -340,36 +453,29 @@ def FeedTensor(tensor, array, force_cpu=False, dtype=None):
"""
name = tensor.name if hasattr(tensor, 'name') else str(tensor)
if force_cpu is True:
dev = proto_utils.GetDeviceOption('cpu')
dev = _proto_utils.GetDeviceOption('cpu')
else:
dev = proto_utils.GetDefaultDeviceOption()
if dev is None: dev = proto_utils.GetGlobalDeviceOption()
dev = _proto_utils.GetDefaultDeviceOption()
if dev is None: dev = _proto_utils.GetGlobalDeviceOption()
if not isinstance(array, numpy.ndarray):
auto_data_type = numpy.float32 if dtype is None else dtype
dtype = 'float32' if dtype is None else dtype
else:
auto_data_type = array.dtype if dtype is None else dtype
dtype = array.dtype if dtype is None else dtype
if hasattr(tensor, 'dtype') and tensor.dtype is not None:
if tensor.dtype not in mapping.TENSOR_TYPE_TO_NP_TYPE:
if tensor.dtype not in _mapping.TENSOR_TYPE_TO_NP_TYPE:
raise TypeError('Unsupported data type: {}'.format(tensor.dtype))
preset_data_type = mapping.TENSOR_TYPE_TO_NP_TYPE[tensor.dtype]
if dtype is not None:
if dtype != preset_data_type:
raise TypeError(
'The preset data type is {}, but force to {}'.
format(preset_data_type, dtype))
auto_data_type = preset_data_type
dtype = _mapping.TENSOR_TYPE_TO_NP_TYPE[tensor.dtype]
nd_array = numpy.array(array, dtype=auto_data_type, copy=False)
_C.FeedTensor(name, nd_array, _stringify_proto(dev))
dev = _stringify_proto(dev)
array = numpy.array(array, dtype=dtype, copy=False)
get_default_workspace().FeedTensor(name, array, dev)
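A small sketch, assuming the default workspace is active (the tensor name 'data' is a placeholder):
>>> import numpy
>>> FeedTensor('data', numpy.ones((2, 3), 'float32'), force_cpu=True)
>>> print(FetchTensor('data'))  # a (2, 3) array of ones copied back from the backend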
def ResetTensor(tensor):
"""Reset the memory of given tensor.
Note that the tensor will not be ``DELETE`` for the workspace.
Parameters
----------
tensor : Tensor or str
......@@ -380,12 +486,16 @@ def ResetTensor(tensor):
None
"""
return _C.ResetTensor(_stringify_tensor(tensor))
tensor = _stringify_tensor(tensor)
return get_default_workspace().ResetTensor(tensor)
def RunGraph(
graph_name, inputs=(), outputs=[],
stage=None, return_outputs=True,
graph_name,
inputs=(),
outputs=[],
stage=None,
return_outputs=True,
):
"""Run the specific graph.
......@@ -424,7 +534,8 @@ def RunGraph(
# Run the graph according to the specified include/exclude rule
runtime_stage = stage if stage else 'default'
rule = _PREDEFINED_GRAPH_RUNTIME_STAGES[runtime_stage]
_C.RunGraph(str(graph_name), str(rule['include']), str(rule['exclude']))
get_default_workspace().RunGraph(
graph_name, rule['include'], rule['exclude'])
# Try to return the outputs
# Force to return may lead to asserts if outputs are not computed
......@@ -434,18 +545,23 @@ def RunGraph(
else: return [outputs[i].get_value() for i in range(len(outputs))]
def FlowGradients(inputs, targets, input_grads=None, ignored_grads=None):
def Backward(
forward_ops,
targets,
input_grads=None,
ignored_grads=None,
):
"""Compute the gradients of given input flows.
Parameters
----------
forward_ops : sequence of OperatorDef
The referring flows to generate gradient flows.
The referring ops to generate gradients.
targets : sequence or str
The solving targets, generate grads automatically.
input_grads : sequence of str or None
The input grads.
ignored_grads : sequence of str or None
The solving targets.
input_grads : sequence of str, optional
The external input grads.
ignored_grads : sequence of str, optional
The grads that are explicitly ignored.
Returns
......@@ -453,17 +569,17 @@ def FlowGradients(inputs, targets, input_grads=None, ignored_grads=None):
None
"""
option = GetGlobalOptions()
options = _cfg.GetGlobalOptions()
required_logging = True \
if (option['log_optimized_graph'] or
option['log_meta_graph']) else False
if (options['log_optimized_graph'] or
options['log_meta_graph']) else False
_C.FlowGradients(
inputs, targets,
get_default_workspace().Backward(
forward_ops, targets,
input_grads if input_grads else [],
ignored_grads if ignored_grads else [],
option['share_grads'], required_logging)
options['share_grads'], required_logging)
def LogMetaGraph(graph_def):
......@@ -479,8 +595,8 @@ def LogMetaGraph(graph_def):
None
"""
option = GetGlobalOptions()
if option['log_meta_graph']: print(graph_def)
options = _cfg.GetGlobalOptions()
if options['log_meta_graph']: print(graph_def)
def ExportMetaGraph(graph_def):
......@@ -498,28 +614,34 @@ def ExportMetaGraph(graph_def):
None
"""
option = GetGlobalOptions()
if option['export_meta_graph']:
if not os.path.exists(option['export_meta_graph']):
options = _cfg.GetGlobalOptions()
if options['export_meta_graph']:
if not os.path.exists(options['export_meta_graph']):
try:
os.makedirs(option['export_meta_graph'])
os.makedirs(options['export_meta_graph'])
except Exception:
raise ValueError('The given prefix is invalid.')
path = os.path.join(
option['export_meta_graph'],
options['export_meta_graph'],
graph_def.name + '.metatxt')
with open(path, 'w') as f: f.write(str(graph_def))
logging.info('Export meta graph into: {}'.format(path))
_logging.info('Export meta graph into: {}'.format(path))
def Snapshot(
tensors, filename,
prefix='', suffix='.bin',
format='default',
tensors,
filename,
prefix='',
suffix='.bin',
format='pickle',
):
"""Snapshot tensors into a binary file.
"""Serialize tensors into a binary file.
The filename is formatted as:
``prefix`` + ``filename`` + ``suffix``
Parameters
----------
......@@ -527,11 +649,11 @@ def Snapshot(
The tensors to be written.
filename : str
The name of this binary file.
prefix : str
prefix : str, optional, default=''
The prefix of this binary file.
suffix : str
suffix : str, optional, default='.bin'
The suffix of this binary file.
format : str
format : {'pickle', 'caffe'}, optional
The format of this binary file.
Returns
......@@ -540,72 +662,66 @@ def Snapshot(
Notes
-----
The full file path will be: ``prefix`` + ``filename`` + ``suffix``.
Available formats: ['default', 'caffe'].
"""
file_path = prefix + filename + suffix
if mpi.Is_Init():
if not mpi.AllowSnapshot(): return
file_path = file_path + '.rank.{}'.format(mpi.Rank())
if _mpi.Is_Init():
if not _mpi.AllowSnapshot(): return
file_path = file_path + '.rank.{}'.format(_mpi.Rank())
dir = os.path.split(file_path)[0]
if len(dir) > 0 and not os.path.exists(dir): os.makedirs(dir)
if format == 'default':
if format == 'pickle':
state_dict = {}
for tensor in tensors:
state_dict[tensor.name] = FetchTensor(tensor)
with open(file_path, 'wb') as f:
pickle.dump(state_dict, f, pickle.HIGHEST_PROTOCOL)
logging.info('Snapshot Model@: ' + file_path)
logging.info('Model Format: Pickle')
elif format is 'caffe':
_logging.info('Snapshot Model@: ' + file_path)
_logging.info('Model Format: Pickle')
elif format == 'caffe':
names = [tensor.name for tensor in tensors]
_C.Snapshot(file_path, names, 1)
else: raise TypeError('Unknown binary format: {}'.format(format))
get_default_workspace().Snapshot(file_path, names, 1)
else:
raise TypeError('Unknown binary format: ' + format)
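A call sketch for the pickle format (the tensor name, values, and paths are placeholders; the tensor must already exist in the current workspace):
>>> import numpy
>>> from dragon.core.tensor import Tensor
>>> FeedTensor('conv1/weights', numpy.zeros((8, 3, 3, 3), 'float32'))
>>> Snapshot([Tensor.Ref('conv1/weights')], filename='model_at_iter_0', prefix='checkpoints/')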
def Restore(binary_file, format='default'):
def Restore(binary_file, format='pickle'):
"""Restore tensors from a binary file.
Parameters
----------
binary_file : str
The path of binary file.
format : str
format : {'pickle', 'caffe'}, optional
The format of this binary file.
Returns
-------
None
Notes
-----
Available formats: ['default', 'caffe'].
"""
assert os.path.exists(binary_file), \
'Binary file({}) does not exist.'.format(binary_file)
if format == 'default':
if format == 'pickle':
try:
state_dict = pickle.load(open(binary_file, 'rb'))
except UnicodeDecodeError:
state_dict = pickle.load(open(binary_file, 'rb'), encoding='iso-8859-1')
logging.info('Restore From Model@: ' + binary_file)
logging.info('Model Format: Pickle')
state_dict = pickle.load(
open(binary_file, 'rb'), encoding='iso-8859-1')
_logging.info('Restore From Model@: ' + binary_file)
_logging.info('Model Format: Pickle')
for k, v in state_dict.items():
if HasTensor(k):
FeedTensor(k, v)
logging.info('[Info]: Tensor({}) is restored.'.format(k))
_logging.info('Tensor({}) is restored.'.format(k))
elif format == 'caffe':
# Caffe models can't save the tensor name
# We simply use "layer_name/param:X"
_C.Restore(binary_file, 1)
get_default_workspace().Restore(binary_file, 1)
else:
raise TypeError('Unknown binary format: {}'.format(format))
raise TypeError('Unknown binary format: ' + format)
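The matching restore sketch for the file written in the snapshot example above:
>>> Restore('checkpoints/model_at_iter_0.bin', format='pickle')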
def GetDummyName(basename, suffix='', domain='', zero_based=True):
......@@ -633,7 +749,8 @@ def GetDummyName(basename, suffix='', domain='', zero_based=True):
The unique dummy name.
"""
return _C.GetDummyName(basename, suffix, domain, zero_based)
return get_default_workspace().GetDummyName(
basename, suffix, domain, zero_based)
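A small sketch showing that successive calls with the same basename never collide (the basename and domain are placeholders):
>>> name_a = GetDummyName('data', domain='Tensor')
>>> name_b = GetDummyName('data', domain='Tensor')
>>> assert name_a != name_b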
def _stringify_proto(obj):
......@@ -647,8 +764,38 @@ def _stringify_tensor(obj):
else: return str(obj)
# Define a global lock to lock the current workspace
_GLOBAL_WORKSPACE_LOCK = threading.Lock()
class _DefaultWorkspaceStack(_tls.Stack):
"""A thread-local stack of objects for
providing an implicit default workspace."""
def __init__(self):
super(_DefaultWorkspaceStack, self).__init__()
self._global_default_workspace = None
def get_default(self):
"""Override that returns a global default if the stack is empty."""
ret = super(_DefaultWorkspaceStack, self).get_default()
if ret is None: ret = self._get_default_workspace()
return ret
def _get_default_workspace(self):
if self._global_default_workspace is None:
self._global_default_workspace = Workspace()
return self._global_default_workspace
def reset(self):
super(_DefaultWorkspaceStack, self).reset()
self._global_default_workspace = None
@contextlib.contextmanager
def get_controller(self, default):
with super(_DefaultWorkspaceStack, self) \
.get_controller(default) as g:
yield g
# Define a global stack to store the workspaces of the current thread
_GLOBAL_DEFAULT_WORKSPACE_STACK = _DefaultWorkspaceStack()
# Define some useful runtime stages
_PREDEFINED_GRAPH_RUNTIME_STAGES = {
......
......@@ -23,7 +23,6 @@ from __future__ import print_function
import sys
import logging as _logging
import atexit
try:
from dragon.libdragon import *
......@@ -32,9 +31,5 @@ except ImportError as e:
'Cannot import dragon. Error: {0}'.format(str(e)))
sys.exit(1)
REGISTERED_OPERATORS = set(s for s in RegisteredOperators())
NO_GRADIENT_OPERATORS = set(s for s in NoGradientOperators())
\
atexit.register(OnModuleExit)
\ No newline at end of file
......@@ -15,6 +15,8 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon import config as _cfg
def ShareGrads(enabled=True):
"""Enable gradients sharing globally.
......@@ -34,8 +36,8 @@ def ShareGrads(enabled=True):
>>> opt.ShareGrads()
"""
from dragon.config import option
option['share_grads'] = enabled
options = _cfg.GetGlobalOptions()
options['share_grads'] = enabled
def IsGradsShared():
......@@ -47,8 +49,8 @@ def IsGradsShared():
``True`` if sharing grads else ``False``.
"""
from dragon.config import option
return option['share_grads']
options = _cfg.GetGlobalOptions()
return options['share_grads']
def Drop(op_func, *args, **kwargs):
......
......@@ -13,8 +13,8 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon
import dragon.utils.vision
from dragon.utils import vision as _vision
from dragon.core import workspace as _workspace
class MiniBatchOp(object):
......@@ -36,7 +36,7 @@ class MiniBatchOp(object):
"""
kwargs = eval(self.param_str)
self._data_batch = dragon.utils.vision.DataBatch(**kwargs)
self._data_batch = _vision.DataBatch(**kwargs)
def run(self, inputs, outputs):
"""Run method, i.e., forward pass.
......@@ -55,4 +55,4 @@ class MiniBatchOp(object):
"""
blobs = self._data_batch.get()
for idx, blob in enumerate(blobs):
dragon.workspace.FeedTensor(outputs[idx], blob)
\ No newline at end of file
_workspace.FeedTensor(outputs[idx], blob)
\ No newline at end of file
......@@ -15,149 +15,149 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from .operators import initializer as init_ops
from .operators import vision as vision_ops
from .operators import loss as loss_ops
from .operators import data as data_ops
from .operators import activation as active_ops
from .operators import arithmetic as math_ops
from .operators import control_flow as control_flow_ops
from .operators import misc as misc_ops
from .operators import mpi as mpi_ops
from .operators import array as array_ops
from .operators import norm as norm_ops
from .operators import recurrent as recurrent_ops
from .operators import contrib as contrib_ops
from .operators import initializer as _init_ops
from .operators import vision as _vision_ops
from .operators import loss as _loss_ops
from .operators import data as _data_ops
from .operators import activation as _active_ops
from .operators import arithmetic as _math_ops
from .operators import control_flow as _control_flow_ops
from .operators import misc as _misc_ops
from .operators import mpi as _mpi_ops
from .operators import array as _array_ops
from .operators import norm as _norm_ops
from .operators import recurrent as _recurrent_ops
from .operators import contrib as _contrib_ops
# Data
LMDBData = data_ops.LMDBData
ImageData = data_ops.ImageData
LMDBData = _data_ops.LMDBData
ImageData = _data_ops.ImageData
# Initializer
Fill = init_ops.Fill
RandomUniform = init_ops.RandomUniform
RandomNormal = init_ops.RandomNormal
TruncatedNormal = init_ops.TruncatedNormal
GlorotUniform = init_ops.GlorotUniform
GlorotNormal = init_ops.GlorotNormal
Fill = _init_ops.Fill
RandomUniform = _init_ops.RandomUniform
RandomNormal = _init_ops.RandomNormal
TruncatedNormal = _init_ops.TruncatedNormal
GlorotUniform = _init_ops.GlorotUniform
GlorotNormal = _init_ops.GlorotNormal
# Vision
Conv2d = vision_ops.Conv2d
DepthwiseConv2d = vision_ops.DepthwiseConv2d
ConvTranspose2d = DeConv2d = Conv2dTranspose = vision_ops.ConvTranspose2d
Pool2d = vision_ops.Pool2d
ROIPool = vision_ops.ROIPool
ROIAlign = vision_ops.ROIAlign
LRN = vision_ops.LRN
NNResize = vision_ops.NNResize
BilinearResize = vision_ops.BilinearResize
BiasAdd = vision_ops.BiasAdd
DropBlock2d = vision_ops.DropBlock2d
Conv2d = _vision_ops.Conv2d
DepthwiseConv2d = _vision_ops.DepthwiseConv2d
ConvTranspose2d = DeConv2d = Conv2dTranspose = _vision_ops.ConvTranspose2d
Pool2d = _vision_ops.Pool2d
ROIPool = _vision_ops.ROIPool
ROIAlign = _vision_ops.ROIAlign
LRN = _vision_ops.LRN
NNResize = _vision_ops.NNResize
BilinearResize = _vision_ops.BilinearResize
BiasAdd = _vision_ops.BiasAdd
DropBlock2d = _vision_ops.DropBlock2d
# Recurrent
LSTMCell = recurrent_ops.LSTMCell
RNN = recurrent_ops.RNN
LSTM = recurrent_ops.LSTM
GRU = recurrent_ops.GRU
LSTMCell = _recurrent_ops.LSTMCell
RNN = _recurrent_ops.RNN
LSTM = _recurrent_ops.LSTM
GRU = _recurrent_ops.GRU
# Activation
Sigmoid = active_ops.Sigmoid
Tanh = active_ops.Tanh
Relu = active_ops.Relu
LRelu = active_ops.LRelu
PRelu = active_ops.PRelu
Elu = active_ops.Elu
SElu = active_ops.SElu
Softmax = active_ops.Softmax
Dropout = active_ops.Dropout
Sigmoid = _active_ops.Sigmoid
Tanh = _active_ops.Tanh
Relu = _active_ops.Relu
LRelu = _active_ops.LRelu
PRelu = _active_ops.PRelu
Elu = _active_ops.Elu
SElu = _active_ops.SElu
Softmax = _active_ops.Softmax
Dropout = _active_ops.Dropout
# Loss
NLLLoss = loss_ops.NLLLoss
SparseSoftmaxCrossEntropy = loss_ops.SparseSoftmaxCrossEntropy
SigmoidCrossEntropy = loss_ops.SigmoidCrossEntropy
SoftmaxCrossEntropy = loss_ops.SoftmaxCrossEntropy
SmoothL1Loss = loss_ops.SmoothL1Loss
L1Loss = loss_ops.L1Loss
L2Loss = loss_ops.L2Loss
SigmoidFocalLoss = loss_ops.SigmoidFocalLoss
SoftmaxFocalLoss = loss_ops.SoftmaxFocalLoss
CTCLoss = loss_ops.CTCLoss
NLLLoss = _loss_ops.NLLLoss
SparseSoftmaxCrossEntropy = _loss_ops.SparseSoftmaxCrossEntropy
SigmoidCrossEntropy = _loss_ops.SigmoidCrossEntropy
SoftmaxCrossEntropy = _loss_ops.SoftmaxCrossEntropy
SmoothL1Loss = _loss_ops.SmoothL1Loss
L1Loss = _loss_ops.L1Loss
L2Loss = _loss_ops.L2Loss
SigmoidFocalLoss = _loss_ops.SigmoidFocalLoss
SoftmaxFocalLoss = _loss_ops.SoftmaxFocalLoss
CTCLoss = _loss_ops.CTCLoss
# Arithmetic
Add = math_ops.Add
Sub = math_ops.Sub
Mul = math_ops.Mul
Div = math_ops.Div
Maximum = math_ops.Maximum
Minimum = math_ops.Minimum
Moments = math_ops.Moments
Clip = math_ops.Clip
Matmul = math_ops.Matmul
Pow = math_ops.Pow
Dot = math_ops.Dot
Log = math_ops.Log
Exp = math_ops.Exp
Square = math_ops.Square
Sqrt = math_ops.Sqrt
FullyConnected = math_ops.FullyConnected
Eltwise = math_ops.Eltwise
Affine = math_ops.Affine
GramMatrix = math_ops.GramMatrix
Accumulate = math_ops.Accumulate
MovingAverage = math_ops.MovingAverage
Add = _math_ops.Add
Sub = _math_ops.Sub
Mul = _math_ops.Mul
Div = _math_ops.Div
Maximum = _math_ops.Maximum
Minimum = _math_ops.Minimum
Moments = _math_ops.Moments
Clip = _math_ops.Clip
Matmul = _math_ops.Matmul
Pow = _math_ops.Pow
Dot = _math_ops.Dot
Log = _math_ops.Log
Exp = _math_ops.Exp
Square = _math_ops.Square
Sqrt = _math_ops.Sqrt
FullyConnected = _math_ops.FullyConnected
Eltwise = _math_ops.Eltwise
Affine = _math_ops.Affine
GramMatrix = _math_ops.GramMatrix
Accumulate = _math_ops.Accumulate
MovingAverage = _math_ops.MovingAverage
# Normalization
BatchNorm = norm_ops.BatchNorm
GroupNorm = norm_ops.GroupNorm
LayerNorm = norm_ops.LayerNorm
InstanceNorm = norm_ops.InstanceNorm
L2Norm = norm_ops.L2Norm
BatchNorm = _norm_ops.BatchNorm
GroupNorm = _norm_ops.GroupNorm
LayerNorm = _norm_ops.LayerNorm
InstanceNorm = _norm_ops.InstanceNorm
L2Norm = _norm_ops.L2Norm
# NDArray
Gather = array_ops.Gather
Crop = array_ops.Crop
Reduce = array_ops.Reduce
Sum = array_ops.Sum
Mean = array_ops.Mean
Max = array_ops.Max
ArgMax = array_ops.ArgMax
Min = array_ops.Min
ArgMin = array_ops.ArgMin
Slice = array_ops.Slice
Stack = array_ops.Stack
Concat = array_ops.Concat
Transpose = array_ops.Transpose
Repeat = array_ops.Repeat
Tile = array_ops.Tile
Pad = array_ops.Pad
OneHot = array_ops.OneHot
Flatten = array_ops.Flatten
Reshape = array_ops.Reshape
ExpandDims = array_ops.ExpandDims
Squeeze = array_ops.Squeeze
Shape = array_ops.Shape
Arange = array_ops.Arange
Multinomial = array_ops.Multinomial
Gather = _array_ops.Gather
Crop = _array_ops.Crop
Reduce = _array_ops.Reduce
Sum = _array_ops.Sum
Mean = _array_ops.Mean
Max = _array_ops.Max
ArgMax = _array_ops.ArgMax
Min = _array_ops.Min
ArgMin = _array_ops.ArgMin
Slice = _array_ops.Slice
Stack = _array_ops.Stack
Concat = _array_ops.Concat
Transpose = _array_ops.Transpose
Repeat = _array_ops.Repeat
Tile = _array_ops.Tile
Pad = _array_ops.Pad
OneHot = _array_ops.OneHot
Flatten = _array_ops.Flatten
Reshape = _array_ops.Reshape
ExpandDims = _array_ops.ExpandDims
Squeeze = _array_ops.Squeeze
Shape = _array_ops.Shape
Arange = _array_ops.Arange
Multinomial = _array_ops.Multinomial
# Control Flow
Copy = control_flow_ops.Copy
Assign = control_flow_ops.Assign
Equal = control_flow_ops.Equal
Less = control_flow_ops.Less
LessEqual = control_flow_ops.LessEqual
Greater = control_flow_ops.Greater
GreaterEqual = control_flow_ops.GreaterEqual
Copy = _control_flow_ops.Copy
Assign = _control_flow_ops.Assign
Equal = _control_flow_ops.Equal
Less = _control_flow_ops.Less
LessEqual = _control_flow_ops.LessEqual
Greater = _control_flow_ops.Greater
GreaterEqual = _control_flow_ops.GreaterEqual
# Misc
Cast = AsType = misc_ops.Cast
Run = misc_ops.Run
Template = misc_ops.Template
Accuracy = misc_ops.Accuracy
StopGradient = misc_ops.StopGradient
Cast = AsType = _misc_ops.Cast
Run = _misc_ops.Run
Template = _misc_ops.Template
Accuracy = _misc_ops.Accuracy
StopGradient = _misc_ops.StopGradient
# MPI
MPIBroadcast = mpi_ops.MPIBroadcast
MPIGather = mpi_ops.MPIGather
MPIBroadcast = _mpi_ops.MPIBroadcast
MPIGather = _mpi_ops.MPIGather
# Contrib
Proposal = contrib_ops.Proposal # R-CNN
\ No newline at end of file
Proposal = _contrib_ops.Proposal # R-CNN
\ No newline at end of file
......@@ -145,18 +145,6 @@ message GradientProto {
optional string external = 3;
}
// Record the updater information
message UpdaterProto {
// The operator name to use.
optional string name = 1;
// The operator type.
optional string type = 2;
// The tensor to update.
repeated string tensor = 3;
// The arguments.
repeated Argument arg = 4;
}
// Graph Definition
message GraphDef {
// The graph name.
......@@ -181,6 +169,4 @@ message GraphDef {
// The gradients information.
repeated GradientProto gradient = 9;
// The updaters information.
repeated UpdaterProto updater = 10;
}
\ No newline at end of file
......@@ -22,8 +22,8 @@ from __future__ import print_function
import pprint
from dragon.core import workspace
from dragon.core.tensor import Tensor
from dragon.core import workspace as _workspace
from dragon.core.tensor import Tensor as _Tensor
class BaseUpdater(object):
......@@ -32,12 +32,14 @@ class BaseUpdater(object):
# Store the global unique slot index
_DEFAULT_UNIQUE_SLOT_ID = 0
def __init__(self,
def __init__(
self,
scale_gradient=1.0,
clip_gradient=-1.0,
l2_decay=-1.0,
slot=None,
verbose=True):
verbose=True,
):
"""Construct a Updater to optimize the objectives.
Parameters
......@@ -84,7 +86,7 @@ class BaseUpdater(object):
None
"""
pair = (tensor.name if isinstance(tensor, Tensor) \
pair = (tensor.name if isinstance(tensor, _Tensor) \
else tensor for tensor in pair)
self._param_group.append((pair,
{'lr_mult': lr_mult, 'decay_mult': decay_mult}))
......@@ -93,7 +95,8 @@ class BaseUpdater(object):
defaults = self.__dict__.get('_defaults')
if item in defaults:
if self._registered:
return workspace.FetchTensor(self._slot + '/' + item)
return _workspace.FetchTensor(
self._slot + '/' + item)
else: return defaults[item]
return self.__dict__[item]
......@@ -101,7 +104,8 @@ class BaseUpdater(object):
defaults = self.__dict__.get('_defaults')
if defaults is not None and key in defaults:
if self._registered:
workspace.FeedTensor(self._slot + '/' + key, value,
_workspace.FeedTensor(
self._slot + '/' + key, value,
dtype='float32', force_cpu=True)
else:
self._defaults[key] = value
......@@ -111,7 +115,8 @@ class BaseUpdater(object):
def register_in_workspace(self):
if not self._registered:
for k, v in self._defaults.items():
workspace.FeedTensor(self._slot + "/" + k, v,
_workspace.FeedTensor(
self._slot + "/" + k, v,
dtype='float32', force_cpu=True)
self._registered = True
if self._verbose:
......@@ -206,8 +211,14 @@ class AdamUpdater(BaseUpdater):
Introduced by `[Kingma & Ba, 2014] <https://arxiv.org/abs/1412.6980>`_.
"""
def __init__(self, base_lr=0.01, beta1=0.9,
beta2=0.999, eps=1e-8, **kwargs):
def __init__(
self,
base_lr=0.01,
beta1=0.9,
beta2=0.999,
eps=1e-8,
**kwargs
):
"""Construct a Adam Updater to optimize the objectives.
Parameters
......@@ -222,7 +233,7 @@ class AdamUpdater(BaseUpdater):
The eps.
"""
super(AdamUpdater, self).__init__(**kwargs )
super(AdamUpdater, self).__init__(**kwargs)
self._defaults = dict({
'base_lr': base_lr,
'beta1': beta1,
......
......@@ -13,11 +13,11 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from multiprocessing import Process
import numpy
import multiprocessing
class BlobFetcher(Process):
class BlobFetcher(multiprocessing.Process):
"""BlobFetcher is deployed to queue blobs from `DataTransformer`_.
It supports forming *NHWC* image blobs and *1-d* label blobs.
......@@ -37,10 +37,9 @@ class BlobFetcher(Process):
"""
super(BlobFetcher, self).__init__()
self._batch_size = kwargs.get('batch_size', 100)
self._batch_size = kwargs.get('batch_size', 128)
self._partition = kwargs.get('partition', False)
if self._partition:
self._batch_size = self._batch_size // kwargs['group_size']
if self._partition: self._batch_size //= kwargs['group_size']
self.Q_in = self.Q_out = None
self.daemon = True
......@@ -54,9 +53,9 @@ class BlobFetcher(Process):
"""
im, labels = self.Q_in.get()
im_blob = np.zeros(shape=([self._batch_size] + list(im.shape)), dtype=np.uint8)
label_blob = np.zeros((self._batch_size, len(labels)), dtype=np.int64)
for ix in range(0, self._batch_size):
im_blob = numpy.zeros(shape=([self._batch_size] + list(im.shape)), dtype='uint8')
label_blob = numpy.zeros((self._batch_size, len(labels)), dtype='int64')
for ix in range(self._batch_size):
im_blob[ix, :, :, :], label_blob[ix, :] = im, labels
if ix != self._batch_size - 1: im, labels = self.Q_in.get()
return im_blob, label_blob
......
......@@ -14,11 +14,10 @@ from __future__ import division
from __future__ import print_function
import time
import pprint
from multiprocessing import Queue
import multiprocessing
import dragon.core.mpi as mpi
import dragon.core.logging as logging
from dragon.core import mpi as _mpi
from dragon.core import logging as _logging
from .data_reader import DataReader
from .data_transformer import DataTransformer
......@@ -77,10 +76,11 @@ class DataBatch(object):
super(DataBatch, self).__init__()
# Init mpi
global_rank = 0; local_rank = 0; group_size = 1
if mpi.Is_Init():
idx, group = mpi.AllowParallel()
if idx != -1: # DataParallel
global_rank = mpi.Rank()
if _mpi.Is_Init() and kwargs.get(
'phase', 'TRAIN') == 'TRAIN':
rank, group = _mpi.AllowParallel()
if rank != -1: # DataParallel
global_rank = _mpi.Rank()
group_size = len(group)
for i, node in enumerate(group):
if global_rank == node: local_rank = i
......@@ -105,7 +105,7 @@ class DataBatch(object):
self._num_transformers += 1
# Add 1 transformer for random crop
if kwargs.get('crop_size', 0) > 0 and \
kwargs.get('phase', 'TEST') == 'TRAIN':
kwargs.get('phase', 'TRAIN') == 'TRAIN':
self._num_transformers += 1
self._num_transformers = min(self._num_transformers, self._max_transformers)
......@@ -115,9 +115,12 @@ class DataBatch(object):
self._batch_size = int(self._batch_size / kwargs['group_size'])
# Init queues
self.Q_level_1 = Queue(self._prefetch * self._num_readers * self._batch_size)
self.Q_level_2 = Queue(self._prefetch * self._num_readers * self._batch_size)
self.Q_level_3 = Queue(self._prefetch * self._num_readers)
self.Q_level_1 = multiprocessing.Queue(
self._prefetch * self._num_readers * self._batch_size)
self.Q_level_2 = multiprocessing.Queue(
self._prefetch * self._num_readers * self._batch_size)
self.Q_level_3 = multiprocessing.Queue(
self._prefetch * self._num_readers)
# Init readers
self._readers = []
......@@ -167,11 +170,11 @@ class DataBatch(object):
process.terminate()
process.join()
terminate(self._fetchers)
if local_rank == 0: logging.info('Terminating BlobFetcher ......')
if local_rank == 0: _logging.info('Terminate BlobFetcher.')
terminate(self._transformers)
if local_rank == 0: logging.info('Terminating DataTransformer ......')
if local_rank == 0: _logging.info('Terminate DataTransformer.')
terminate(self._readers)
if local_rank == 0: logging.info('Terminating DataReader......')
if local_rank == 0: _logging.info('Terminate DataReader.')
import atexit
atexit.register(cleanup)
......
......@@ -14,15 +14,14 @@ from __future__ import division
from __future__ import print_function
import math
import numpy as np
import numpy.random as npr
from multiprocessing import Process
import numpy
import multiprocessing
import dragon.config as config
from dragon.tools.db import LMDB
from dragon import config as _cfg
from dragon.tools import db as _db
class DataReader(Process):
class DataReader(multiprocessing.Process):
"""DataReader is deployed to queue encoded str from `LMDB`_.
It supports adaptively partitioning and shuffling records over all distributed nodes.
......@@ -55,7 +54,7 @@ class DataReader(Process):
self._part_idx, self._num_parts = 0, 1
self._cur_idx, self._cur_chunk_idx = 0, 0
self._random_seed = config.GetRandomSeed()
self._random_seed = _cfg.GetRandomSeed()
self.Q_out = None
self.daemon = True
......@@ -106,7 +105,9 @@ class DataReader(Process):
"""
if self._multiple_nodes or self._use_shuffle:
if self._use_shuffle: self._perm = npr.permutation(self._num_shuffle_parts)
if self._use_shuffle:
self._perm = numpy.random.permutation(
self._num_shuffle_parts)
self._cur_chunk_idx = 0
self._start_idx = int(self._part_idx * self._num_shuffle_parts + self._perm[self._cur_chunk_idx])
self._start_idx = int(self._start_idx * self._chunk_size)
......@@ -158,23 +159,23 @@ class DataReader(Process):
"""
# fix seed
npr.seed(self._random_seed)
numpy.random.seed(self._random_seed)
# init db
self._db = LMDB()
self._db = _db.LMDB()
self._db.open(self._source)
self._zfill = self._db.zfill()
self._num_entries = self._db.num_entries()
self._epoch_size = int(self._num_entries/ self._num_parts + 1)
self._epoch_size = int(self._num_entries / self._num_parts + 1)
if self._use_shuffle:
if self._chunk_size == 1:
# Each chunk has at most 1 record [For Fully Shuffle]
# Each chunk has at most 1 record (Naive Shuffle)
self._chunk_size, self._num_shuffle_parts = \
1, int(self._num_entries / self._num_parts) + 1
else:
if self._use_shuffle and self._chunk_size == -1:
# Search a optimal chunk size by chunks [For Chunk Shuffle]
# Search an optimal chunk size by chunks (Chunk Shuffle)
max_chunk_size = self._db._total_size / ((self._num_chunks * (1 << 20)))
min_chunk_size = 1
while min_chunk_size * 2 < max_chunk_size: min_chunk_size *= 2
......@@ -184,17 +185,17 @@ class DataReader(Process):
self._chunk_size = int(self._num_entries / self._num_shuffle_parts / self._num_parts + 1)
limit = (self._num_parts - 0.5) * self._num_shuffle_parts * self._chunk_size
if self._num_entries <= limit:
# Roll back to fully shuffle
# Roll back to naive shuffle
self._chunk_size, self._num_shuffle_parts = \
1, int(self._num_entries / self._num_parts) + 1
else:
# Each chunk has at most K records [For Multiple Nodes]
# Note that if ``shuffle`` and ``multiple_nodes`` are all ``False``,
# Each chunk has at most K records
# Note that if ``shuffle`` and ``multiple_nodes`` are all *False*,
# ``chunk_size`` and ``num_shuffle_parts`` are meaningless
self._chunk_size = int(self._num_entries / self._num_parts) + 1
self._num_shuffle_parts = 1
self._perm = np.arange(self._num_shuffle_parts)
self._perm = numpy.arange(self._num_shuffle_parts)
# Init env
self.reset()
......
......@@ -13,12 +13,11 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import numpy.random as npr
from multiprocessing import Process
import numpy
import multiprocessing
import dragon.config as config
import dragon.vm.caffe.proto.caffe_pb2 as pb
from dragon import config as _cfg
from dragon.vm.caffe.proto import caffe_pb2 as _proto_def
try:
import cv2
......@@ -31,7 +30,7 @@ except ImportError as e:
print("Failed to import PIL. \nIt's OK if disabling color augmentation.".format(str(e)))
class DataTransformer(Process):
class DataTransformer(multiprocessing.Process):
"""DataTransformer is deployed to queue transformed images from `DataReader`_.
Nearly all common image augmentation methods are supported.
......@@ -72,7 +71,7 @@ class DataTransformer(Process):
self._max_random_scale = kwargs.get('max_random_scale', 1.0)
self._force_color = kwargs.get('force_color', False)
self._phase = kwargs.get('phase', 'TRAIN')
self._random_seed = config.GetRandomSeed()
self._random_seed = _cfg.GetRandomSeed()
self.Q_in = self.Q_out = None
self.daemon = True
......@@ -91,16 +90,16 @@ class DataTransformer(Process):
"""
# decode
datum = pb.Datum()
datum = _proto_def.Datum()
datum.ParseFromString(serialized)
im = np.fromstring(datum.data, np.uint8)
im = numpy.fromstring(datum.data, numpy.uint8)
if datum.encoded is True:
im = cv2.imdecode(im, -1)
else:
im = im.reshape((datum.height, datum.width, datum.channels))
# Random scale
random_scale = npr.uniform() * (
random_scale = numpy.random.uniform() * (
self._max_random_scale - self._min_random_scale) \
+ self._min_random_scale
if random_scale != 1.0:
......@@ -109,7 +108,7 @@ class DataTransformer(Process):
# Padding
if self._padding > 0:
pad_img = np.empty((
pad_img = numpy.empty((
im.shape[0] + 2 * self._padding,
im.shape[1] + 2 * self._padding, im.shape[2]), dtype=im.dtype)
pad_img.fill(self._fill_value)
......@@ -120,8 +119,8 @@ class DataTransformer(Process):
# Random crop
if self._crop_size > 0:
if self._phase == 'TRAIN':
h_off = npr.randint(im.shape[0] - self._crop_size + 1)
w_off = npr.randint(im.shape[1] - self._crop_size + 1)
h_off = numpy.random.randint(im.shape[0] - self._crop_size + 1)
w_off = numpy.random.randint(im.shape[1] - self._crop_size + 1)
else:
h_off = int((im.shape[0] - self._crop_size) / 2)
w_off = int((im.shape[1] - self._crop_size) / 2)
......@@ -130,28 +129,28 @@ class DataTransformer(Process):
# Random mirror
if self._mirror:
if npr.randint(0, 2) > 0:
if numpy.random.randint(0, 2) > 0:
im = im[:, ::-1, :]
# Gray Transformation
if self._force_color:
if im.shape[2] == 1:
# duplicate to 3 channels
im = np.concatenate([im, im, im], axis=2)
im = numpy.concatenate([im, im, im], axis=2)
# Color Augmentation
if self._color_aug:
im = PIL.Image.fromarray(im)
delta_brightness = npr.uniform(-0.4, 0.4) + 1.0
delta_contrast = npr.uniform(-0.4, 0.4) + 1.0
delta_saturation = npr.uniform(-0.4, 0.4) + 1.0
delta_brightness = numpy.random.uniform(-0.4, 0.4) + 1.0
delta_contrast = numpy.random.uniform(-0.4, 0.4) + 1.0
delta_saturation = numpy.random.uniform(-0.4, 0.4) + 1.0
im = PIL.ImageEnhance.Brightness(im)
im = im.enhance(delta_brightness)
im = PIL.ImageEnhance.Contrast(im)
im = im.enhance(delta_contrast)
im = PIL.ImageEnhance.Color(im)
im = im.enhance(delta_saturation)
im = np.array(im)
im = numpy.array(im)
# Extract Labels
labels = []
......@@ -169,7 +168,7 @@ class DataTransformer(Process):
"""
# Fix the random seed
npr.seed(self._random_seed)
numpy.random.seed(self._random_seed)
# Run!
while True:
......
......@@ -16,8 +16,8 @@ import shutil
import argparse
import cv2
from dragon.tools.db import LMDB
from dragon.vm.caffe.proto import caffe_pb2
from dragon.tools import db as _db
from dragon.vm.caffe.proto import caffe_pb2 as _proto_def
def resize_image(im, resize):
......@@ -37,11 +37,10 @@ def resize_image(im, resize):
"""
if im.shape[0] > im.shape[1]:
newsize = (resize, im.shape[0] * resize / im.shape[1])
new_size = (resize, im.shape[0] * resize // im.shape[1])
else:
newsize = (im.shape[1] * resize / im.shape[0], resize)
im = cv2.resize(im, newsize)
return im
new_size = (im.shape[1] * resize // im.shape[0], resize)
return cv2.resize(im, new_size, interpolation=cv2.INTER_LINEAR)
def make_db(args):
......@@ -72,7 +71,7 @@ def make_db(args):
print('start time: ', time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime()))
db = LMDB(max_commit=10000)
db = _db.LMDB(max_commit=10000)
db.open(args.database, mode='w')
total_line = sum(1 for line in open(args.list))
......@@ -106,7 +105,7 @@ def make_db(args):
img = resize_image(img, args.resize)
result, imgencode = cv2.imencode('.jpg', img, encode_param)
datum = caffe_pb2.Datum()
datum = _proto_def.Datum()
datum.height, datum.width, datum.channels = img.shape
datum.label = int(label)
datum.encoded = True
......
......@@ -15,7 +15,8 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon
from dragon.core import scope as _scope
from dragon.core.tensor import Tensor as _Tensor
class Layer(object):
......@@ -74,12 +75,12 @@ class Layer(object):
# Note that a non-empty tensor scope will make it
# impossible to load/save caffe models. You should use
# a new workspace instead of the terrible name scope
scoped_name = dragon.get_default_name_scope() + self._name
scoped_name = _scope.get_default_name_scope() + self._name
param_name = scoped_name + '/param:{}'.format(len(self._blobs))
# Set the name explicitly
variable = dragon.Tensor.Ref(param_name)
variable_grad = dragon.Tensor.Ref(param_name + '_grad')
variable = _Tensor.Ref(param_name)
variable_grad = _Tensor.Ref(param_name + '_grad')
if filler is not None:
variable.Fill(**filler)
......
......@@ -15,11 +15,11 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon
from ..layer import Layer
from dragon import ops as _ops
from ..layer import Layer as _Layer
class InnerProductLayer(Layer):
class InnerProductLayer(_Layer):
"""The implementation of ``InnerProductLayer``.
Parameters
......@@ -28,9 +28,9 @@ class InnerProductLayer(Layer):
The output dim. Refer `InnerProductParameter.num_output`_.
bias_term : boolean
Whether to use bias. Refer `InnerProductParameter.bias_term`_.
weight_filler : caffe_pb2.FillerParameter
weight_filler : FillerParameter
The filler of weight. Refer `InnerProductParameter.weight_filler`_.
bias_filler : caffe_pb2.FillerParameter
bias_filler : FillerParameter
The filler of bias. Refer `InnerProductParameter.bias_filler`_.
axis : int
The start axis to calculate. Refer `InnerProductParameter.axis`_.
......@@ -53,10 +53,10 @@ class InnerProductLayer(Layer):
def LayerSetup(self, bottom):
inputs = [bottom] + [blob['data'] for blob in self._blobs]
return dragon.ops.FullyConnected(inputs, **self.arguments)
return _ops.FullyConnected(inputs, **self.arguments)
class AccuracyLayer(Layer):
class AccuracyLayer(_Layer):
"""The implementation of ``AccuracyLayer``.
Parameters
......@@ -79,10 +79,10 @@ class AccuracyLayer(Layer):
}
def LayerSetup(self, bottom):
return dragon.ops.Accuracy(bottom, **self.arguments)
return _ops.Accuracy(bottom, **self.arguments)
class PythonLayer(Layer):
class PythonLayer(_Layer):
"""The implementation of ``PythonLayer``.
Parameters
......@@ -106,10 +106,10 @@ class PythonLayer(Layer):
}
def LayerSetup(self, bottom):
return dragon.ops.Run(bottom, **self.arguments)
return _ops.Run(bottom, **self.arguments)
class EltwiseLayer(Layer):
class EltwiseLayer(_Layer):
"""The implementation of ``EltwiseLayer``.
Parameters
......@@ -130,20 +130,20 @@ class EltwiseLayer(Layer):
}
def LayerSetup(self, bottom):
return dragon.ops.Eltwise(bottom, **self.arguments)
return _ops.Eltwise(bottom, **self.arguments)
class AddLayer(Layer):
class AddLayer(_Layer):
"""The extended implementation of ``EltwiseLayer``."""
def __init__(self, LayerParameter):
super(AddLayer, self).__init__(LayerParameter)
def LayerSetup(self, bottom):
return dragon.ops.Add(bottom, **self.arguments)
return _ops.Add(bottom, **self.arguments)
class ConcatLayer(Layer):
class ConcatLayer(_Layer):
"""The implementation of ``ConcatLayer``.
Parameters
......@@ -157,10 +157,10 @@ class ConcatLayer(Layer):
self.arguments = {'axis': LayerParameter.concat_param.axis}
def LayerSetup(self, bottom):
return dragon.ops.Concat(bottom, **self.arguments)
return _ops.Concat(bottom, **self.arguments)
class SliceLayer(Layer):
class SliceLayer(_Layer):
"""The implementation of ``SliceLayer``.
Parameters
......@@ -181,17 +181,17 @@ class SliceLayer(Layer):
}
def LayerSetup(self, bottom):
return dragon.ops.Slice(bottom, **self.arguments)
return _ops.Slice(bottom, **self.arguments)
class CropLayer(Layer):
class CropLayer(_Layer):
"""The implementation of ``CropLayer``.
Parameters
----------
axis : int
The start axis. Refer `CropParameter.axis`_.
offset : list of int
offset : sequence of int
The offsets. Refer `CropParameter.offset`_.
"""
......@@ -208,15 +208,15 @@ class CropLayer(Layer):
raise ValueError('Expected two bottom blobs.')
self.arguments['shape_like'] = bottom[1]
self.arguments['starts'] = self.arguments['sizes'] = None
return dragon.ops.Crop(bottom[0], **self.arguments)
return _ops.Crop(bottom[0], **self.arguments)
class ReshapeLayer(Layer):
class ReshapeLayer(_Layer):
"""The implementation of ``ReshapeLayer``.
Parameters
----------
shape : list of int
shape : sequence of int
The output shape. Refer `ReshapeParameter.shape`_.
"""
......@@ -226,15 +226,15 @@ class ReshapeLayer(Layer):
in LayerParameter.reshape_param.shape.dim]}
def LayerSetup(self, bottom):
return dragon.ops.Reshape(bottom, **self.arguments)
return _ops.Reshape(bottom, **self.arguments)
class PermuteLayer(Layer):
class PermuteLayer(_Layer):
"""The implementation of ``PermuteLayer``.
Parameters
----------
order : list of int
order : sequence of int
The permutation. Refer `PermuteParameter.order`_.
"""
......@@ -244,10 +244,10 @@ class PermuteLayer(Layer):
in LayerParameter.permute_param.order]}
def LayerSetup(self, bottom):
return dragon.ops.Transpose(bottom, **self.arguments)
return _ops.Transpose(bottom, **self.arguments)
class FlattenLayer(Layer):
class FlattenLayer(_Layer):
"""The implementation of ``FlattenLayer``.
Parameters
......@@ -266,10 +266,10 @@ class FlattenLayer(Layer):
self.arguments = {'axis': axis, 'num_axes': num_axes}
def LayerSetup(self, bottom):
return dragon.ops.Flatten(bottom, **self.arguments)
return _ops.Flatten(bottom, **self.arguments)
class GatherLayer(Layer):
class GatherLayer(_Layer):
"""The extended implementation of ``GatherOp``.
Parameters
......@@ -285,10 +285,10 @@ class GatherLayer(Layer):
def LayerSetup(self, bottom):
if not isinstance(bottom, (tuple, list)) or len(bottom) != 2:
raise ValueError('Expected two bottom blobs.')
return dragon.ops.Gather(bottom[0], indices=bottom[1], **self.arguments)
return _ops.Gather(bottom[0], indices=bottom[1], **self.arguments)
class SoftmaxLayer(Layer):
class SoftmaxLayer(_Layer):
"""The implementation of ``SoftmaxLayer``.
Parameters
......@@ -302,10 +302,10 @@ class SoftmaxLayer(Layer):
self.arguments = {'axis': LayerParameter.softmax_param.axis}
def LayerSetup(self, bottom):
return dragon.ops.Softmax(bottom, **self.arguments)
return _ops.Softmax(bottom, **self.arguments)
class ArgMaxLayer(Layer):
class ArgMaxLayer(_Layer):
"""The implementation of ``ArgMaxLayer``.
Parameters
......@@ -326,10 +326,10 @@ class ArgMaxLayer(Layer):
}
def LayerSetup(self, bottom):
return dragon.ops.ArgMax(bottom, **self.arguments)
return _ops.ArgMax(bottom, **self.arguments)
class BatchNormLayer(Layer):
class BatchNormLayer(_Layer):
"""The implementation of ``BatchNormLayer``.
Parameters
......@@ -359,10 +359,10 @@ class BatchNormLayer(Layer):
def LayerSetup(self, bottom):
inputs = [bottom] + [blob['data'] for blob in self._blobs]
return dragon.ops.BatchNorm(inputs, **self.arguments)
return _ops.BatchNorm(inputs, **self.arguments)
class GroupNormLayer(Layer):
class GroupNormLayer(_Layer):
"""The implementation of ``GroupNormLayer``.
Parameters
......@@ -386,10 +386,10 @@ class GroupNormLayer(Layer):
def LayerSetup(self, bottom):
inputs = [bottom] + [blob['data'] for blob in self._blobs]
return dragon.ops.GroupNorm(inputs, **self.arguments)
return _ops.GroupNorm(inputs, **self.arguments)
class InstanceNormLayer(Layer):
class InstanceNormLayer(_Layer):
"""The implementation of ``InstanceNormLayer``.
Introduced by `[Ulyanov et.al, 2016] <https://arxiv.org/abs/1607.08022>`_
......@@ -405,10 +405,10 @@ class InstanceNormLayer(Layer):
self.arguments = {'eps': LayerParameter.instance_norm_param.eps, 'axis': 1}
def LayerSetup(self, bottom):
return dragon.ops.InstanceNorm(bottom, **self.arguments)
return _ops.InstanceNorm(bottom, **self.arguments)
class ScaleLayer(Layer):
class ScaleLayer(_Layer):
"""The implementation of ``ScaleLayer``.
Parameters
......@@ -439,10 +439,10 @@ class ScaleLayer(Layer):
def LayerSetup(self, bottom):
inputs = [bottom] + [blob['data'] for blob in self._blobs]
return dragon.ops.Affine(inputs, **self.arguments)
return _ops.Affine(inputs, **self.arguments)
class BNLayer(Layer):
class BNLayer(_Layer):
"""The implementation of ``BNLayer``.
Parameters
......@@ -477,10 +477,10 @@ class BNLayer(Layer):
def LayerSetup(self, bottom):
inputs = [bottom] + [blob['data'] for blob in self._blobs]
return dragon.ops.BatchNorm(inputs, **self.arguments)
return _ops.BatchNorm(inputs, **self.arguments)
class GNLayer(Layer):
class GNLayer(_Layer):
"""The implementation of ``GNLayer``.
Parameters
......@@ -509,10 +509,10 @@ class GNLayer(Layer):
def LayerSetup(self, bottom):
inputs = [bottom] + [blob['data'] for blob in self._blobs]
return dragon.ops.GroupNorm(inputs, **self.arguments)
return _ops.GroupNorm(inputs, **self.arguments)
class NormalizeLayer(Layer):
class NormalizeLayer(_Layer):
"""The implementation of ``NormalizeLayer``.
Parameters
......@@ -542,13 +542,13 @@ class NormalizeLayer(Layer):
self.AddBlob(filler=self.GetFiller(param, 'scale_filler'), value=1) # scale
def LayerSetup(self, bottom):
norm_out = [dragon.ops.L2Norm(bottom, **self.l2norm_arguments)]
return dragon.ops.Affine(
norm_out = [_ops.L2Norm(bottom, **self.l2norm_arguments)]
return _ops.Affine(
norm_out + [blob['data'] for blob in self._blobs],
**self.affine_arguments)
class TileLayer(Layer):
class TileLayer(_Layer):
"""The extended implementation of ``TileLayer``.
Parameters
......@@ -565,10 +565,10 @@ class TileLayer(Layer):
}
def LayerSetup(self, bottom):
return dragon.ops.Tile(bottom, **self.arguments)
return _ops.Tile(bottom, **self.arguments)
class ReductionLayer(Layer):
class ReductionLayer(_Layer):
"""The extended implementation of ``ReductionLayer``.
Parameters
......@@ -591,10 +591,10 @@ class ReductionLayer(Layer):
}
def LayerSetup(self, bottom):
return dragon.ops.Reduce(bottom, **self.arguments)
return _ops.Reduce(bottom, **self.arguments)
class ExpandDimsLayer(Layer):
class ExpandDimsLayer(_Layer):
"""The implementation of ``ExpandDimsLayer``.
Parameters
......@@ -608,29 +608,29 @@ class ExpandDimsLayer(Layer):
self.arguments = {'axis': LayerParameter.expand_dims_param.axis}
def LayerSetup(self, bottom):
return dragon.ops.ExpandDims(bottom, **self.arguments)
return _ops.ExpandDims(bottom, **self.arguments)
class StopGradientLayer(Layer):
class StopGradientLayer(_Layer):
"""The implementation of ``StopGradientLayer``."""
def __init__(self, LayerParameter):
super(StopGradientLayer, self).__init__(LayerParameter)
def LayerSetup(self, bottom):
return dragon.ops.StopGradient(bottom, **self.arguments)
return _ops.StopGradient(bottom, **self.arguments)
class ProposalLayer(Layer):
class ProposalLayer(_Layer):
"""The implementation of ``ProposalLayer``.
Parameters
----------
stride : list of int
stride : sequence of int
The stride of anchors. Refer ``ProposalParameter.stride``.
scale : list of float
scale : sequence of float
The scales of anchors. Refer `ProposalParameter.scale`_.
ratio : list of float
ratio : sequence of float
The ratios of anchors. Refer `ProposalParameter.ratio`_.
pre_nms_top_n : int
The number of anchors kept before NMS. Refer `ProposalParameter.pre_nms_topn`_.
......@@ -668,10 +668,10 @@ class ProposalLayer(Layer):
}
def LayerSetup(self, bottom):
return dragon.ops.Proposal(bottom, **self.arguments)
return _ops.Proposal(bottom, **self.arguments)
class CastLayer(Layer):
class CastLayer(_Layer):
"""The implementation of ``CastLayer``.
Parameters
......@@ -686,4 +686,4 @@ class CastLayer(Layer):
self.arguments = {'dtype': param.dtype.lower()}
def LayerSetup(self, bottom):
return dragon.ops.Cast(bottom, **self.arguments)
\ No newline at end of file
return _ops.Cast(bottom, **self.arguments)
\ No newline at end of file
......@@ -15,13 +15,12 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon
from ..layer import Layer
from dragon import ops as _ops
from ..layer import Layer as _Layer
class DataLayer(Layer):
"""
The implementation of ``DataLayer``.
class DataLayer(_Layer):
"""The implementation of ``DataLayer``.
Different from ``Caffe``, we force the use of the `LMDB`_ backend.
......@@ -33,7 +32,7 @@ class DataLayer(Layer):
The prefetch count. Refer `DataParameter.prefetch`_.
batch_size : int
The size of a mini-batch. Refer `DataParameter.batch_size`_.
phase : caffe_pb2.Phase
phase : Phase
The phase of layer. Refer `LayerParameter.phase`_.
mirror : boolean
Whether to randomly mirror. Refer `TransformationParameter.mirror`_.
......@@ -49,9 +48,9 @@ class DataLayer(Layer):
The min scale of the images. Extension of `TransformationParameter`_.
max_random_scale : float
The max scale of the images. Extension of `TransformationParameter`_.
dtype : caffe_pb2.MemoryDataParameter.DataType
The output data type. ``FLOAT32`` or ``FLOAT16``.
mean_value : list of float
dtype : MemoryDataParameter.DataType
The output data type. *FLOAT32* or *FLOAT16*.
mean_value : sequence of float
The mean of each channel. Refer `TransformationParameter.mean_value`_.
scale : float
The scaling factor. Refer `TransformationParameter.scale`_.
......@@ -93,20 +92,20 @@ class DataLayer(Layer):
[1. / transform_param.scale] * 3
def LayerSetup(self, bottom):
data, label = dragon.ops.LMDBData(**self.arguments)
return dragon.ops.ImageData(data, **self.arguments), label
data, label = _ops.LMDBData(**self.arguments)
return _ops.ImageData(data, **self.arguments), label
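For context, the two ops above form one pipeline: LMDBData produces raw (data, label) pairs and ImageData applies the mean/scale transform. A hedged sketch follows; the argument names and values are hypothetical stand-ins for what DataLayer.__init__ assembles from DataParameter/TransformationParameter.

from dragon import ops as _ops

# Hypothetical arguments; DataLayer builds the real dict from the prototxt.
arguments = {
    'source': '/data/train_lmdb',
    'batch_size': 64,
    'prefetch': 5,
    'mean_values': [104., 117., 123.],
    'dtype': 'float32',
}
data, label = _ops.LMDBData(**arguments)      # raw samples from the LMDB backend
image = _ops.ImageData(data, **arguments)     # normalized images, ready as the top blob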
class MemoryDataLayer(Layer):
class MemoryDataLayer(_Layer):
"""The implementation of ``MemoryDataLayer``.
We extend it with ``FP16`` and ``NHWC => NCHW``.
Parameters
----------
dtype : caffe_pb2.MemoryDataParameter.DataType
dtype : MemoryDataParameter.DataType
The output data type. *FLOAT32* or *FLOAT16*.
mean_value : list of float
mean_value : sequence of float
The mean of each channel. Refer `TransformationParameter.mean_value`_.
scale : float
The scaling factor. Refer `TransformationParameter.scale`_.
......@@ -131,4 +130,4 @@ class MemoryDataLayer(Layer):
[1. / transform_param.scale] * 3
def LayerSetup(self, bottom):
return dragon.ops.ImageData(bottom, **self.arguments)
\ No newline at end of file
return _ops.ImageData(bottom, **self.arguments)
\ No newline at end of file
......@@ -15,11 +15,11 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon
from ..layer import Layer
from dragon import ops as _ops
from ..layer import Layer as _Layer
class SoftmaxWithLossLayer(Layer):
class SoftmaxWithLossLayer(_Layer):
"""The implementation of ``SoftmaxWithLossLayer``.
Parameters
......@@ -52,12 +52,12 @@ class SoftmaxWithLossLayer(Layer):
}
def LayerSetup(self, bottom):
loss = dragon.ops.SparseSoftmaxCrossEntropy(bottom, **self.arguments)
loss = _ops.SparseSoftmaxCrossEntropy(bottom, **self.arguments)
if self._loss_weight is not None: loss *= self._loss_weight
return loss
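Every loss layer in this file follows the same pattern: build the loss op, then scale it by the optional loss weight. A small sketch under these assumptions (the bottom names and weight value are hypothetical, and default op arguments are assumed to suffice):

from dragon import ops as _ops
from dragon.core.tensor import Tensor as _Tensor

logits = _Tensor('ip2').Variable()     # hypothetical bottom blobs
labels = _Tensor('label').Variable()
loss_weight = 2.0                      # from LayerParameter.loss_weight

loss = _ops.SparseSoftmaxCrossEntropy([logits, labels])
if loss_weight is not None:
    loss = loss * loss_weight          # same scaling as LayerSetup above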
class SigmoidCrossEntropyLossLayer(Layer):
class SigmoidCrossEntropyLossLayer(_Layer):
"""The implementation of ``SigmoidCrossEntropyLossLayer``.
Parameters
......@@ -79,12 +79,12 @@ class SigmoidCrossEntropyLossLayer(Layer):
self.arguments = {'normalization': normalization}
def LayerSetup(self, bottom):
loss = dragon.ops.SigmoidCrossEntropy(bottom, **self.arguments)
loss = _ops.SigmoidCrossEntropy(bottom, **self.arguments)
if self._loss_weight is not None: loss *= self._loss_weight
return loss
class L2LossLayer(Layer):
class L2LossLayer(_Layer):
"""The implementation of ``L2LossLayer``.
Parameters
......@@ -106,12 +106,12 @@ class L2LossLayer(Layer):
self.arguments = {'normalization': normalization}
def LayerSetup(self, bottom):
loss = dragon.ops.L2Loss(bottom, **self.arguments)
loss = _ops.L2Loss(bottom, **self.arguments)
if self._loss_weight is not None: loss *= self._loss_weight
return loss
class SmoothL1LossLayer(Layer):
class SmoothL1LossLayer(_Layer):
"""The implementation of ``SmoothL1LossLayer``.
Parameters
......@@ -140,12 +140,12 @@ class SmoothL1LossLayer(Layer):
}
def LayerSetup(self, bottom):
loss = dragon.ops.SmoothL1Loss(bottom, **self.arguments)
loss = _ops.SmoothL1Loss(bottom, **self.arguments)
if self._loss_weight is not None: loss *= self._loss_weight
return loss
class SigmoidWithFocalLossLayer(Layer):
class SigmoidWithFocalLossLayer(_Layer):
"""The implementation of ``SigmoidWithFocalLossLayer``.
Parameters
......@@ -183,12 +183,12 @@ class SigmoidWithFocalLossLayer(Layer):
}
def LayerSetup(self, bottom):
loss = dragon.ops.SigmoidFocalLoss(bottom, **self.arguments)
loss = _ops.SigmoidFocalLoss(bottom, **self.arguments)
if self._loss_weight is not None: loss *= self._loss_weight
return loss
class SoftmaxWithFocalLossLayer(Layer):
class SoftmaxWithFocalLossLayer(_Layer):
"""The implementation of ``SoftmaxWithFocalLossLayer``.
Parameters
......@@ -227,6 +227,6 @@ class SoftmaxWithFocalLossLayer(Layer):
}
def LayerSetup(self, bottom):
loss = dragon.ops.SoftmaxFocalLoss(bottom, **self.arguments)
loss = _ops.SoftmaxFocalLoss(bottom, **self.arguments)
if self._loss_weight is not None: loss *= self._loss_weight
return loss
\ No newline at end of file
......@@ -15,11 +15,11 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon
from ..layer import Layer
from dragon import ops as _ops
from ..layer import Layer as _Layer
class MPIBroadcastLayer(Layer):
class MPIBroadcastLayer(_Layer):
"""The implementation of ``MPIBroadcastLayer``.
Parameters
......@@ -33,10 +33,10 @@ class MPIBroadcastLayer(Layer):
self.arguments = {'root': LayerParameter.mpi_param.root}
def LayerSetup(self, bottom):
return dragon.ops.MPIBroadcast(bottom, **self.arguments)
return _ops.MPIBroadcast(bottom, **self.arguments)
class MPIGatherLayer(Layer):
class MPIGatherLayer(_Layer):
"""The implementation of ``MPIGatherLayer``.
Parameters
......@@ -53,4 +53,4 @@ class MPIGatherLayer(Layer):
}
def LayerSetup(self, bottom):
return dragon.ops.MPIGather(bottom, **self.arguments)
\ No newline at end of file
return _ops.MPIGather(bottom, **self.arguments)
\ No newline at end of file
......@@ -15,11 +15,11 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon
from ..layer import Layer
from dragon import ops as _ops
from ..layer import Layer as _Layer
class ReLULayer(Layer):
class ReLULayer(_Layer):
"""The implementation of ``ReLULayer``.
Parameters
......@@ -35,10 +35,10 @@ class ReLULayer(Layer):
self.arguments = {'slope': param.negative_slope}
def LayerSetup(self, bottom):
return dragon.ops.Relu(bottom, **self.arguments)
return _ops.Relu(bottom, **self.arguments)
class PReLULayer(Layer):
class PReLULayer(_Layer):
"""The implementation of ``PReLULayer``.
Parameters
......@@ -61,10 +61,10 @@ class PReLULayer(Layer):
def LayerSetup(self, bottom):
inputs = [bottom] + [blob['data'] for blob in self._blobs]
return dragon.ops.PRelu(inputs, **self.arguments)
return _ops.PRelu(inputs, **self.arguments)
class ELULayer(Layer):
class ELULayer(_Layer):
"""The implementation of ``ELULayer``.
Parameters
......@@ -78,40 +78,40 @@ class ELULayer(Layer):
self.arguments = {'alpha': float(LayerParameter.elu_param.alpha)}
def LayerSetup(self, bottom):
return dragon.ops.Elu(bottom, **self.arguments)
return _ops.Elu(bottom, **self.arguments)
class SELULayer(Layer):
class SELULayer(_Layer):
"""The implementation of ``SELULayer``."""
def __init__(self, LayerParameter):
super(SELULayer, self).__init__(LayerParameter)
def LayerSetup(self, bottom):
return dragon.ops.SElu(bottom, **self.arguments)
return _ops.SElu(bottom, **self.arguments)
class SigmoidLayer(Layer):
class SigmoidLayer(_Layer):
"""The implementation of ``SigmoidLayer``."""
def __init__(self, LayerParameter):
super(SigmoidLayer, self).__init__(LayerParameter)
def LayerSetup(self, bottom):
return dragon.ops.Sigmoid(bottom, **self.arguments)
return _ops.Sigmoid(bottom, **self.arguments)
class TanHLayer(Layer):
class TanHLayer(_Layer):
"""The implementation of ``TanHLayer``."""
def __init__(self, LayerParameter):
super(TanHLayer, self).__init__(LayerParameter)
def LayerSetup(self, bottom):
return dragon.ops.Tanh(bottom, **self.arguments)
return _ops.Tanh(bottom, **self.arguments)
class DropoutLayer(Layer):
class DropoutLayer(_Layer):
"""The implementation of ``DropoutLayer``.
Parameters
......@@ -132,10 +132,10 @@ class DropoutLayer(Layer):
}
def LayerSetup(self, bottom):
return dragon.ops.Dropout(bottom, **self.arguments)
return _ops.Dropout(bottom, **self.arguments)
class PowerLayer(Layer):
class PowerLayer(_Layer):
"""The implementation of ``PowerLayer``.
Parameters
......@@ -158,4 +158,4 @@ class PowerLayer(Layer):
}
def LayerSetup(self, bottom):
return dragon.ops.Pow(bottom, **self.arguments)
\ No newline at end of file
return _ops.Pow(bottom, **self.arguments)
\ No newline at end of file
......@@ -15,11 +15,11 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon
from ..layer import Layer
from dragon import ops as _ops
from ..layer import Layer as _Layer
class ConvolutionLayer(Layer):
class ConvolutionLayer(_Layer):
"""The implementation of ``ConvolutionLayer``.
Parameters
......@@ -28,19 +28,19 @@ class ConvolutionLayer(Layer):
The output channels. Refer `ConvolutionParameter.num_output`_.
bias_term : boolean
Whether to use bias. Refer `ConvolutionParameter.bias_term`_.
pad : list of int
pad : sequence of int
The zero padding size(s). Refer `ConvolutionParameter.pad`_.
kernel_size : sequence of int
The kernel size(s). Refer `ConvolutionParameter.kernel_size`_.
stride : list of int
stride : sequence of int
The stride(s). Refer `ConvolutionParameter.stride`_.
dilation : list of int
dilation : sequence of int
The dilation(s). Refer `ConvolutionParameter.dilation`_.
group : int
The group size. Refer `ConvolutionParameter.group`_.
weight_filler : FillerParameter
The filler of weights. Refer `ConvolutionParameter.weight_filler`_.
bias_filler : FillerParameters
bias_filler : FillerParameter
The filler of bias. Refer `ConvolutionParameter.bias_filler`_.
"""
......@@ -76,10 +76,10 @@ class ConvolutionLayer(Layer):
def LayerSetup(self, bottom):
inputs = [bottom] + [blob['data'] for blob in self._blobs]
return dragon.ops.Conv2d(inputs, **self.arguments)
return _ops.Conv2d(inputs, **self.arguments)
class DepthwiseConvolutionLayer(Layer):
class DepthwiseConvolutionLayer(_Layer):
"""The implementation of ``DepthwiseConvolutionLayer``.
Parameters
......@@ -88,15 +88,15 @@ class DepthwiseConvolutionLayer(Layer):
The output channels. Refer `ConvolutionParameter.num_output`_.
bias_term : boolean
Whether to use bias. Refer `ConvolutionParameter.bias_term`_.
pad : list of int
pad : sequence of int
The zero padding size(s). Refer `ConvolutionParameter.pad`_.
kernel_size : list of int
kernel_size : sequence of int
The kernel size(s). Refer `ConvolutionParameter.kernel_size`_.
stride : list of int
stride : sequence of int
The stride(s). Refer `ConvolutionParameter.stride`_.
weight_filler : FillerParameter
The filler of weights. Refer `ConvolutionParameter.weight_filler`_.
bias_filler : FillerParameters
bias_filler : FillerParameter
The filler of bias. Refer `ConvolutionParameter.bias_filler`_.
"""
......@@ -130,7 +130,7 @@ class DepthwiseConvolutionLayer(Layer):
def LayerSetup(self, bottom):
inputs = [bottom] + [blob['data'] for blob in self._blobs]
return dragon.ops.DepthwiseConv2d(inputs, **self.arguments)
return _ops.DepthwiseConv2d(inputs, **self.arguments)
class DeconvolutionLayer(ConvolutionLayer):
......@@ -142,19 +142,19 @@ class DeconvolutionLayer(ConvolutionLayer):
The output channels. Refer `ConvolutionParameter.num_output`_.
bias_term : boolean
Whether to use bias. Refer `ConvolutionParameter.bias_term`_.
pad : list of int
pad : sequence of int
The zero padding size(s). Refer `ConvolutionParameter.pad`_.
kernel_size : list of int
kernel_size : sequence of int
The kernel size(s). Refer `ConvolutionParameter.kernel_size`_.
stride : list of int
stride : sequence of int
The stride(s). Refer `ConvolutionParameter.stride`_.
dilation : list of int
dilation : sequence of int
The dilation(s). Refer `ConvolutionParameter.dilation`_.
group : int
The group size. Refer `ConvolutionParameter.group`_.
weight_filler : FillerParameter
The filler of weights. Refer `ConvolutionParameter.weight_filler`_.
bias_filler : FillerParameters
bias_filler : FillerParameter
The filler of bias. Refer `ConvolutionParameter.bias_filler`_.
"""
......@@ -163,29 +163,29 @@ class DeconvolutionLayer(ConvolutionLayer):
def LayerSetup(self, bottom):
inputs = [bottom] + [blob['data'] for blob in self._blobs]
return dragon.ops.ConvTranspose2d(inputs, **self.arguments)
return _ops.ConvTranspose2d(inputs, **self.arguments)
class PoolingLayer(Layer):
class PoolingLayer(_Layer):
"""The implementation of ``PoolingLayer``.
Parameters
----------
pool : PoolMethod
The method. Refer `PoolingParameter.pool`_.
pad : list of int
pad : sequence of int
The zero padding size(s). Refer `PoolingParameter.pad`_.
pad_h : int
The padding size of height. Refer `PoolingParameter.pad_h`_.
pad_w : int
The padding size of width. Refer `PoolingParameter.pad_w`_.
kernel_size : list of int
kernel_size : sequence of int
The kernel size(s). Refer `PoolingParameter.kernel_size`_.
kernel_h : int
The kernel size of height. Refer `PoolingParameter.kernel_h`_.
kernel_w : int
The kernel size of width. Refer `PoolingParameter.kernel_w`_.
stride : list of int
stride : sequence of int
The strides. Refer `PoolingParameter.stride`_.
stride_h : int
The stride of height. Refer `PoolingParameter.stride_h`_.
......@@ -212,10 +212,10 @@ class PoolingLayer(Layer):
else: self.arguments['strides'] = [param.stride_h, param.stride_w]
def LayerSetup(self, bottom):
return dragon.ops.Pool2d(bottom, **self.arguments)
return _ops.Pool2d(bottom, **self.arguments)
class ROIPoolingLayer(Layer):
class ROIPoolingLayer(_Layer):
"""The implementation of ``ROIPoolingLayer``.
Parameters
......@@ -238,10 +238,10 @@ class ROIPoolingLayer(Layer):
}
def LayerSetup(self, bottom):
return dragon.ops.ROIPool(bottom, **self.arguments)
return _ops.ROIPool(bottom, **self.arguments)
class ROIAlignLayer(Layer):
class ROIAlignLayer(_Layer):
"""The implementation of ``ROIAlignLayer``.
Parameters
......@@ -264,10 +264,10 @@ class ROIAlignLayer(Layer):
}
def LayerSetup(self, bottom):
return dragon.ops.ROIAlign(bottom, **self.arguments)
return _ops.ROIAlign(bottom, **self.arguments)
class LRNLayer(Layer):
class LRNLayer(_Layer):
"""The implementation of ``LRNLayer``.
Parameters
......@@ -296,15 +296,15 @@ class LRNLayer(Layer):
}
def LayerSetup(self, bottom):
return dragon.ops.LRN(bottom, **self.arguments)
return _ops.LRN(bottom, **self.arguments)
class NNResizeLayer(Layer):
class NNResizeLayer(_Layer):
"""The implementation of ``NNResizeLayer``.
Parameters
----------
shape : caffe_pb2.BlobShape
shape : BlobShape
The output shape. Refer `ResizeParameter.shape`_.
fx : float
The scale factor of height. Refer `ResizeParameter.fx`_.
......@@ -330,15 +330,15 @@ class NNResizeLayer(Layer):
raise ValueError('The second bottom should be provided to determine the shape.')
self.arguments['shape_like'] = bottom[1]
bottom = bottom[0]
return dragon.ops.NNResize(bottom, **self.arguments)
return _ops.NNResize(bottom, **self.arguments)
class BilinearResizeLayer(Layer):
class BilinearResizeLayer(_Layer):
"""The implementation of ``BilinearResizeLayer``.
Parameters
----------
shape : caffe_pb2.BlobShape
shape : BlobShape
The output shape. Refer `ResizeParameter.shape`_.
fx : float
The scale factor of height. Refer `ResizeParameter.fx`_.
......@@ -364,10 +364,10 @@ class BilinearResizeLayer(Layer):
raise ValueError('The second bottom should be provided to determine the shape.')
self.arguments['shape_like'] = bottom[1]
bottom = bottom[0]
return dragon.ops.BilinearResize(bottom, **self.arguments)
return _ops.BilinearResize(bottom, **self.arguments)
class DropBlockLayer(Layer):
class DropBlockLayer(_Layer):
"""The implementation of ``DropBlock2dLayer``.
Parameters
......@@ -394,4 +394,4 @@ class DropBlockLayer(Layer):
}
def LayerSetup(self, bottom):
return dragon.ops.DropBlock2d(bottom, **self.arguments)
\ No newline at end of file
return _ops.DropBlock2d(bottom, **self.arguments)
\ No newline at end of file
......@@ -15,10 +15,10 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon
from dragon import config as _cfg
_GLOBAL_ROOT_CAFFE_SOLVER = True
_GLOBAL_CAFFE_ROOT_SOLVER = True
def set_mode_cpu():
......@@ -33,7 +33,7 @@ def set_mode_cpu():
The implementation of `set_mode_cpu(_caffe.cpp, L51)`_.
"""
dragon.config.EnableCPU()
_cfg.EnableCPU()
def set_mode_gpu():
......@@ -48,7 +48,7 @@ def set_mode_gpu():
The implementation of `set_mode_gpu(_caffe.cpp, L52)`_.
"""
dragon.config.EnableCUDA()
_cfg.EnableCUDA()
def set_device(device):
......@@ -63,7 +63,7 @@ def set_device(device):
The implementation of `SetDevice(common.cpp, L65)`_.
"""
dragon.config.SetGPU(device)
_cfg.SetGPU(device)
def set_random_seed(seed):
......@@ -83,7 +83,7 @@ def set_random_seed(seed):
The implementation of `set_random_seed(_caffe.cpp, L71)`_.
"""
dragon.config.SetRandomSeed(seed)
_cfg.SetRandomSeed(seed)
def root_solver():
......@@ -99,7 +99,7 @@ def root_solver():
The implementation of `root_solver(common.hpp, L164)`_.
"""
return _GLOBAL_ROOT_CAFFE_SOLVER
return _GLOBAL_CAFFE_ROOT_SOLVER
def set_root_solver(val):
......@@ -115,5 +115,5 @@ def set_root_solver(val):
The implementation of `set_root_solver(common.hpp, L165)`_.
"""
global _GLOBAL_ROOT_CAFFE_SOLVER
_GLOBAL_ROOT_CAFFE_SOLVER = val
\ No newline at end of file
global _GLOBAL_CAFFE_ROOT_SOLVER
_GLOBAL_CAFFE_ROOT_SOLVER = val
\ No newline at end of file
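A short usage sketch of the wrappers above, assuming they are imported directly from this module (dragon.vm.caffe.misc) and that device id 0 exists:

from dragon.vm.caffe.misc import set_mode_gpu, set_device, set_random_seed

set_mode_gpu()        # -> _cfg.EnableCUDA()
set_device(0)         # -> _cfg.SetGPU(0)
set_random_seed(3)    # -> _cfg.SetRandomSeed(3)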
......@@ -15,12 +15,16 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon
from collections import OrderedDict
from google.protobuf.text_format import Parse as parse_text_proto
from dragon.vm.caffe import layers as layer_factory
from dragon.vm.caffe.proto import caffe_pb2 as pb
from google.protobuf.text_format import Parse as _parse_text_proto
from dragon.core.tensor import Tensor as _Tensor
from dragon.core import workspace as _workspace
from dragon.vm.theano.gradient import grad as _Grad
from dragon.vm.theano.compile.function import function as _Function
from dragon.vm.caffe import layers as _layer_factory
from dragon.vm.caffe.proto import caffe_pb2 as _proto_def
class Blob(object):
......@@ -89,8 +93,8 @@ class Net(object):
The implementation of `Net_Init(_caffe.cpp, L109)`_.
"""
self._net = pb.NetParameter()
parse_text_proto(open(proto_txt,'r').read(), self._net)
self._net = _proto_def.NetParameter()
_parse_text_proto(open(proto_txt,'r').read(), self._net)
self._phase = phase
self._layers = []
self._inputs_to_tensors = {}
......@@ -100,16 +104,17 @@ class Net(object):
if len(self._net.input) > 0:
for input in self._net.input:
if not input in self._blobs:
variable = dragon.Tensor(input).Variable()
variable = _Tensor(input).Variable()
self._blobs[input] = {
'data': variable,
'diff': dragon.Tensor.Ref(variable.name + '_grad'),
'diff': _Tensor.Ref(variable.name + '_grad'),
}
self._inputs_to_tensors[input] = self._blobs[input]['data']
for layer in self._net.layer:
if not self.FilterLayer(layer): continue
self._layers.append(getattr(layer_factory, layer.type + 'Layer')(layer))
self._layers.append(getattr(
_layer_factory, layer.type + 'Layer')(layer))
self.Setup()
......@@ -199,7 +204,7 @@ class Net(object):
for idx, top in enumerate(layer._top):
self._blobs[top] = {
'data': outputs[idx],
'diff': dragon.Tensor.Ref(outputs[idx].name + '_grad'),
'diff': _Tensor.Ref(outputs[idx].name + '_grad'),
}
self._net_outputs.add(top)
......@@ -271,14 +276,14 @@ class Net(object):
for loss in self.losses:
for var in self.trainable_variables:
dragon.grad(loss, var)
_Grad(loss, var)
self._function = dragon.function(
self._function = _Function(
outputs=[self.blobs[key].data
for key in self.outputs])
if hasattr(self, '_model'):
dragon.workspace.Restore(self._model, format='caffe')
_workspace.Restore(self._model, format='caffe')
return self._function
......@@ -299,7 +304,7 @@ class Net(object):
The implementation of `CopyTrainedLayersFromBinaryProto(net.cpp, L780)`_.
"""
dragon.workspace.Restore(model, format='caffe')
_workspace.Restore(model, format='caffe')
def forward(self, **kwargs):
"""Forward pass. [**PyCaffe Style**]
......@@ -322,11 +327,11 @@ class Net(object):
def GetOutputs(net, net_outputs):
ret = {}
for output in net_outputs:
ret[output] = dragon.workspace.FetchTensor(net.blobs[output].data)
ret[output] = net.blobs[output].data.get_value()
return ret
for name, blob in kwargs.items():
dragon.workspace.FeedTensor(self._inputs_to_tensors[name], blob)
_workspace.FeedTensor(self._inputs_to_tensors[name], blob)
self.function()(return_outputs=False, stage='forward')
......@@ -347,7 +352,7 @@ class Net(object):
"""
for name, blob in kwargs.items():
dragon.workspace.FeedTensor(self._inputs_to_tensors[name], blob)
_workspace.FeedTensor(self._inputs_to_tensors[name], blob)
self.function()(return_outputs=False, stage='forward')
def backward(self, **kwargs):
......@@ -368,7 +373,7 @@ class Net(object):
"""
for name, blob in kwargs.items():
dragon.workspace.FeedTensor(self.blobs[name].diff, blob)
_workspace.FeedTensor(self.blobs[name].diff, blob)
self.function()(return_outputs=False, stage='backward')
def save(self, filename):
......@@ -399,7 +404,7 @@ class Net(object):
if param.data.name not in keys:
tensors.append(param.data)
keys.add(param.data.name)
dragon.workspace.Snapshot(tensors, filename, suffix='', format='caffe')
_workspace.Snapshot(tensors, filename, suffix='', format='caffe')
@property
def blobs(self):
......
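A hedged end-to-end sketch of the refactored Net: construction parses the prototxt with _parse_text_proto, forward() feeds inputs by name, and outputs are fetched via get_value(). The file name and the 'data'/'prob' blob names are hypothetical, and forward() is assumed to return the dict built by GetOutputs.

import numpy
from dragon.vm.caffe.net import Net

net = Net('net.prototxt', 'TEST')     # hypothetical prototxt path
outputs = net.forward(data=numpy.zeros((1, 3, 224, 224), 'float32'))
prob = outputs['prob']                # == net.blobs['prob'].data.get_value()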
......@@ -16,12 +16,16 @@ from __future__ import division
from __future__ import print_function
import time
import dragon
from google.protobuf.text_format import Parse as parse_text_proto
from dragon.vm.caffe.misc import root_solver
from dragon.vm.caffe.net import Net
from dragon.vm.caffe.proto import caffe_pb2 as pb
from dragon import updaters as _updaters
from dragon.core import mpi as _mpi
from dragon.core import workspace as _workspace
from google.protobuf.text_format import Parse as _parse_text_proto
from dragon.vm.caffe.net import Net as _Net
from dragon.vm.caffe.proto import caffe_pb2 as _proto_def
from dragon.vm.caffe.misc import root_solver as _root_solver
from dragon.vm.theano.compile.function import function as _Function
class Solver(object):
......@@ -48,8 +52,8 @@ class Solver(object):
>>> solver = Solver('solver.prototxt')
"""
self._param = pb.SolverParameter()
parse_text_proto(open(proto_txt, 'r').read(), self._param)
self._param = _proto_def.SolverParameter()
_parse_text_proto(open(proto_txt, 'r').read(), self._param)
if self._param.iter_size > 1:
raise NotImplementedError('Gradient accumulation is deprecated.')
self._net = None
......@@ -75,12 +79,12 @@ class Solver(object):
"""
if self._param.HasField('net'):
self._net = Net(self._param.net, "TRAIN")
self._net = _Net(self._param.net, "TRAIN")
if self._param.HasField('train_net'):
if self._net is not None:
raise RuntimeError('net and train_net cannot both be specified.')
self._net = Net(self._param.train_net, "TRAIN")
self._net = _Net(self._param.train_net, "TRAIN")
def InitTestNets(self):
"""Initialize the test nets.
......@@ -94,10 +98,10 @@ class Solver(object):
The implementation of `InitTestNets(solver.cpp, L104)`_.
"""
if dragon.mpi.Is_Init():
idx, group = dragon.mpi.AllowParallel()
if _mpi.Is_Init():
rank, group = _mpi.AllowParallel()
# Only the root in a parallel group can test
if idx != -1 and dragon.mpi.Rank() != group[0]: return
if rank != -1 and _mpi.Rank() != group[0]: return
num_test_net = len(self._param.test_iter)
if num_test_net > 0:
......@@ -106,12 +110,12 @@ class Solver(object):
if len(self._param.test_net) > 0:
for test_net in self._param.test_net:
self._test_nets.append(Net(test_net, "TEST"))
self._test_nets.append(_Net(test_net, "TEST"))
num_test_net -= len(self._param.test_net)
# Consider generic_net
if num_test_net > 0:
self._test_nets.append(Net(self._param.net, "TEST"))
self._test_nets.append(_Net(self._param.net, "TEST"))
def BuildNets(self):
"""Build the nets.
......@@ -164,7 +168,7 @@ class Solver(object):
blob.decay_multiplier)
# Compile
self.update = dragon.function(updater=self.optimizer)
self.update = _Function(updater=self.optimizer)
def GetLearningRate(self):
"""Get learning rate based on the preset policy.
......@@ -244,7 +248,7 @@ class Solver(object):
for iter in range(test_iter):
self.tests[test_idx](return_outputs=False)
if not root_solver(): continue
if not _root_solver(): continue
if iter == 0:
for key in net.outputs:
values = net.blobs[key].data.get_value().flatten()
......@@ -259,7 +263,7 @@ class Solver(object):
test_score[i] += value
i += 1
if not root_solver(): return
if not _root_solver(): return
print('Iteration {}, Test net #{}'.format(self.iter, test_idx))
for idx, score in enumerate(test_score):
......@@ -299,12 +303,12 @@ class Solver(object):
loss = 0.0
for i in range(self._param.iter_size):
self.train(return_outputs=False)
if root_solver():
if _root_solver():
for e in self.net.losses:
values = e.get_value().flatten()
for v in values: loss += v
if root_solver():
if _root_solver():
loss /= self._param.iter_size
if len(loss_vec) < self._param.average_loss:
loss_vec.append(loss)
......@@ -319,7 +323,7 @@ class Solver(object):
self.update()
# Display
if root_solver() and self._param.display:
if _root_solver() and self._param.display:
if self.iter % self._param.display == 0:
base_lr = self.optimizer.base_lr
print('Iteration %d, lr = %s, loss = %f, time = %.2fs' % \
......@@ -410,7 +414,7 @@ class Solver(object):
"""
tensors = [blob.data for blob in self._layer_blobs]
filename = "_iter_" + str(self.iter)
dragon.workspace.Snapshot(tensors, filename,
_workspace.Snapshot(tensors, filename,
prefix=self._param.snapshot_prefix,
suffix='.caffemodel', format='caffe')
......@@ -492,7 +496,7 @@ class SGDSolver(Solver):
"""
def __init__(self, proto_txt):
super(SGDSolver, self).__init__(proto_txt=proto_txt)
self.optimizer = dragon.updaters.SGDUpdater(**self._optimizer_arguments)
self.optimizer = _updaters.SGDUpdater(**self._optimizer_arguments)
self.BuildOptimizer()
def ParseOptimizerArguments(self):
......@@ -514,7 +518,7 @@ class NesterovSolver(Solver):
"""
def __init__(self, proto_txt):
super(NesterovSolver, self).__init__(proto_txt=proto_txt)
self.optimizer = dragon.updaters.NesterovUpdater(**self._optimizer_arguments)
self.optimizer = _updaters.NesterovUpdater(**self._optimizer_arguments)
self.BuildOptimizer()
def ParseOptimizerArguments(self):
......@@ -538,7 +542,7 @@ class RMSPropSolver(Solver):
"""
def __init__(self, proto_txt):
super(RMSPropSolver, self).__init__(proto_txt=proto_txt)
self.optimizer = dragon.updaters.RMSPropUpdater(**self._optimizer_arguments)
self.optimizer = _updaters.RMSPropUpdater(**self._optimizer_arguments)
self.BuildOptimizer()
def ParseOptimizerArguments(self):
......@@ -565,7 +569,7 @@ class AdamSolver(Solver):
"""
def __init__(self, proto_txt):
super(AdamSolver, self).__init__(proto_txt=proto_txt)
self.optimizer = dragon.updaters.AdamUpdater(**self._optimizer_arguments)
self.optimizer = _updaters.AdamUpdater(**self._optimizer_arguments)
self.BuildOptimizer()
def ParseOptimizerArguments(self):
......
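A hedged sketch of driving one of these solvers; only the constructor usage is shown verbatim in the docstring above, so the module path and the step()/snapshot() calls are assumptions that mirror PyCaffe:

from dragon.vm.caffe.solver import SGDSolver   # module path is an assumption

solver = SGDSolver('solver.prototxt')   # builds the train net and an SGDUpdater
solver.step(100)                        # assumed PyCaffe-style: run 100 iterations
solver.snapshot()                       # writes <snapshot_prefix>_iter_<iter>.caffemodel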
......@@ -17,17 +17,18 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy
import itertools
import numpy as np
from collections import defaultdict
from onnx import (checker, mapping, numpy_helper, GraphProto, OperatorSetIdProto)
from onnx import checker, mapping, numpy_helper, GraphProto, OperatorSetIdProto
from onnx.helper import make_tensor_value_info, make_model, printable_graph
from dragon.vm.onnx.helper import \
(extract_initializer, extract_leaf_tensors,
native_run_graph, fetch_initializer,)
from dragon.core import workspace as _workspace
from dragon.vm.onnx.helper import native_run_graph
from dragon.vm.onnx.helper import fetch_initializer
from dragon.vm.onnx.helper import extract_initializer
from dragon.vm.onnx.helper import extract_leaf_tensors
from dragon.vm.onnx.nodes.factory import get_nodes_def
......@@ -104,15 +105,22 @@ class DragonFrontend(object):
if run_native_graph and not enforce_no_running:
inputs = {}
for name, (elem_type, shape) in value_info.items():
inputs[name] = np.random.randn(*shape).astype(
inputs[name] = numpy.random.randn(*shape).astype(
mapping.TENSOR_TYPE_TO_NP_TYPE[elem_type])
ws, outputs, initializer = native_run_graph(
graph_def, inputs, initializer, init_func)
for name in graph_def.output:
output = outputs[name]
elem_type = mapping.NP_TYPE_TO_TENSOR_TYPE[output.dtype]
shape = output.shape
value_info[name] = (elem_type, shape)
if enforce_no_running:
# In some cases (e.g. PyTorch), we have already run the graph,
# so the outputs are already in ``value_info``.
import dragon.core.workspace as ws
ws = _workspace.get_default_workspace()
initializer = fetch_initializer(initializer)
# Prepare to make the graph
......
......@@ -21,8 +21,8 @@ import sys
from onnx.backend.base import namedtupledict
from onnx import numpy_helper
import dragon as dg
from dragon.vm.onnx.workspace import Workspace
from dragon.core import workspace as _workspace
from dragon.core.tensor import Tensor as _Tensor
INITIALIZER_TAG = {
......@@ -65,7 +65,7 @@ def fetch_initializer(initializer):
# Fetch the initializer
return [
numpy_helper.from_array(
dg.workspace.FetchTensor(name), name=name)
_workspace.FetchTensor(name), name=name)
for name in initializer
]
......@@ -87,32 +87,32 @@ def native_run_graph(graph_def, inputs, initializer, init_func=None):
graph_def.arg[i].i = 0
# Create an anonymous workspace
ws = Workspace()
ws = _workspace.Workspace()
with dg.ws_scope(ws.name):
with ws.as_default():
# Register all the initializer before feeding them
for name in initializer:
dg.Tensor(name=name).Variable()
_Tensor(name=name).Variable()
# Feed the given values if necessary
if init_func: init_func()
# Feed the external inputs
for name, blob in inputs.items():
dg.workspace.FeedTensor(name, blob)
_workspace.FeedTensor(name, blob)
# Create and Run the graph
graph_name = dg.workspace.CreateGraph(graph_def)
dg.workspace.RunGraph(graph_name, return_outputs=False)
graph_name = _workspace.CreateGraph(graph_def)
_workspace.RunGraph(graph_name, return_outputs=False)
# Fetch the outputs
output_names = graph_def.output
output_values = [dg.workspace.FetchTensor(name) for name in output_names]
output_values = [_workspace.FetchTensor(name) for name in output_names]
# Fetch the initializer
initializer = [
numpy_helper.from_array(
dg.workspace.FetchTensor(name), name=name)
_workspace.FetchTensor(name), name=name)
for name in initializer
]
......
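The pattern above, an anonymous Workspace activated via as_default() instead of the old dg.ws_scope, is the core of this change. A minimal sketch of the idiom; the tensor name and values are illustrative:

import numpy
from dragon.core import workspace as _workspace
from dragon.core.tensor import Tensor as _Tensor

ws = _workspace.Workspace()              # anonymous, isolated workspace
with ws.as_default():
    _Tensor('x').Variable()              # register the tensor before feeding
    _workspace.FeedTensor('x', numpy.ones((2, 3), 'float32'))
    print(_workspace.FetchTensor('x'))   # fetched from ``ws``, not the global workspace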
......@@ -16,12 +16,12 @@ from __future__ import division
from __future__ import print_function
import os
import numpy as np
from onnx import mapping
from google.protobuf.text_format import Parse as parse_text_proto
import numpy
import dragon.proto.dragon_pb2 as pb
import dragon.import_c_api as C
from onnx import mapping as _mapping
from dragon.core import workspace as _workspace
from dragon.proto import dragon_pb2 as _proto_def
from google.protobuf.text_format import Parse as _parse_text_proto
from dragon.vm.theano.compile.function import Function
from dragon.vm.onnx.frontend import graph_def_to_onnx_model
......@@ -119,8 +119,8 @@ def export_from_graph_text(
"""
with open(text_file, 'r') as rf:
graph_def = pb.GraphDef()
parse_text_proto(rf.read(), graph_def)
graph_def = _proto_def.GraphDef()
_parse_text_proto(rf.read(), graph_def)
export_from_graph_def(
graph_def=graph_def,
......@@ -148,8 +148,10 @@ def import_to_graph_def(model_path):
"""
if not os.path.exists(model_path):
raise ValueError('Given model({}) does not exist.'.format(model_path))
graph_def = pb.GraphDef()
serialized_proto = C.ImportONNXModel(model_path)
graph_def = _proto_def.GraphDef()
serialized_proto = _workspace \
.get_default_workspace() \
.ImportONNXModel(model_path)
graph_def.ParseFromString(serialized_proto)
return graph_def
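A minimal sketch of the import path above; the package-level import and 'model.onnx' are assumptions, since this hunk does not show the module's file name:

from dragon.vm.onnx import import_to_graph_def   # import path is an assumption

graph_def = import_to_graph_def('model.onnx')    # raises ValueError if the file is missing
print(graph_def)                                 # a dragon GraphDef parsed from the ONNX model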
......@@ -238,4 +240,4 @@ def surgery_on_graph_def(
def make_value_info(shape, dtype='float32'):
return mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(dtype)], shape
\ No newline at end of file
return _mapping.NP_TYPE_TO_TENSOR_TYPE[numpy.dtype(dtype)], shape
\ No newline at end of file
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# Codes are based on:
#
# <https://github.com/pytorch/pytorch/blob/master/caffe2/python/onnx/workspace.py>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import uuid
import dragon as dg
class Workspace(object):
def __init__(self):
self.name = 'onnx/' + str(uuid.uuid4())
def __getattr__(self, attr):
def f(*args, **kwargs):
with dg.ws_scope(self.name, ):
return getattr(dg.workspace, attr)(*args, **kwargs)
return f
def __del__(self):
self.ResetWorkspace(self.name)
\ No newline at end of file
......@@ -16,45 +16,42 @@ from __future__ import print_function
import warnings
from collections import defaultdict
import dragon
from dragon.core import workspace as _workspace
from dragon.core.tensor import Tensor as _Tensor
from dragon.vm.theano.compile import function as _Function
from dragon.vm.tensorflow.protobuf import config_pb2
from dragon.vm.tensorflow.training.optimizer import Optimizer
from dragon.vm.tensorflow.ops.variables import VariablesInitializer
from dragon.vm.tensorflow.framework import ops
_GLOBAL_DATA_FLOW_KEYS = defaultdict(dict)
class _DataFlow(object):
"""DataFlow takes a group of expressions and
the specified output tensors.
We store the flows that requiring the same output names,
i.e., those flows can be reused and should not to create a new graph.
i.e., those flows can be reused and should not be created again.
"""
def __init__(self, functions):
self.functions = functions
def run(self, feed_dict=None):
for i, function in enumerate(self.functions):
for i, func in enumerate(self.functions):
if i == 0 and feed_dict is not None:
for tensor, value in feed_dict.items():
dragon.workspace.FeedTensor(tensor, value)
function(return_outputs=False)
_workspace.FeedTensor(tensor, value)
func(return_outputs=False)
@classmethod
def try_get(cls, workspace, flow_key):
global _GLOBAL_DATA_FLOW_KEYS
if flow_key in _GLOBAL_DATA_FLOW_KEYS[workspace]:
return _GLOBAL_DATA_FLOW_KEYS[workspace][flow_key]
def try_get(cls, graph_id, flow_key):
if flow_key in _GLOBAL_DATA_FLOWS[graph_id]:
return _GLOBAL_DATA_FLOWS[graph_id][flow_key]
@classmethod
def try_add(cls, workspace, flow_key, flow):
global _GLOBAL_DATA_FLOW_KEYS
_GLOBAL_DATA_FLOW_KEYS[workspace][flow_key] = flow
def try_add(cls, graph_id, flow_key, flow):
global _GLOBAL_DATA_FLOWS
_GLOBAL_DATA_FLOWS[graph_id][flow_key] = flow
class BaseSession(object):
......@@ -115,7 +112,7 @@ class BaseSession(object):
for e in fetches:
if isinstance(e, Optimizer): optimizers.append(e)
elif isinstance(e, VariablesInitializer): tensors.extend(e.var_list)
elif isinstance(e, dragon.Tensor): tensors.append(e)
elif isinstance(e, _Tensor): tensors.append(e)
# Find minimum solving targets
targets = set()
......@@ -124,24 +121,23 @@ class BaseSession(object):
for t in optimizer._targets: targets.add(t)
targets = list(targets)
gen_flow_key = tuple(e.name for e in targets)
flow_key = tuple(e.name for e in targets)
# Does this data flow exist already?
data_flow = _DataFlow.try_get(
self._graph._workspace, gen_flow_key)
flow = _DataFlow.try_get(id(self._graph), flow_key)
# Run by feeding
if feed_dict is not None:
# Check the feed dict
for key, value in feed_dict.items():
if not isinstance(key, dragon.Tensor):
raise TypeError('The key of feed_dict key should be a Tensor.')
if not isinstance(key, _Tensor):
raise TypeError('The key of ``feed_dict`` should be a Tensor.')
if key.shape is not None:
# Align the number of dimensions
if len(key.shape) != len(value.shape):
raise RuntimeError(
'The Tensor({}) was limited to {} dimensions, \
while feed a value with {} dimensions.'
'The Tensor({}) was limited to {} dimensions, '\
'while feed a value with {} dimensions.'
.format(key.name, len(key.shape), len(value.shape)))
# Verify each dimension
for i in range(len(key.shape)):
......@@ -150,19 +146,20 @@ class BaseSession(object):
raise RuntimeError(
'The shape of Tensor({}) was limited as ('.format(key.name) +
','.join([str(dim) for dim in key.shape]) + '), ' +
'while feed a value with (' + ','.join([str(dim) for dim in value.shape]) + ').')
'while feed a value with (' +
','.join([str(dim) for dim in value.shape]) + ').')
# Create a new data flow if necessary
if data_flow is None:
functions = [dragon.function(outputs=targets)]
if flow is None:
functions = [_Function(outputs=targets)]
for optimizer in optimizers:
functions.append(dragon.function(
functions.append(_Function(
updater=optimizer.updater))
data_flow = _DataFlow(functions)
_DataFlow.try_add(self.graph._workspace, gen_flow_key, data_flow)
flow = _DataFlow(functions)
_DataFlow.try_add(id(self._graph), flow_key, flow)
# Run this data flow
data_flow.run(feed_dict)
flow.run(feed_dict)
# Fetch after running
returns = []
......@@ -234,3 +231,8 @@ class InteractiveSession(BaseSession):
@staticmethod
def reset(target, containers=None, config=None):
pass
# Store the flows for different graphs
# ThreadLocal is not necessary
_GLOBAL_DATA_FLOWS = defaultdict(dict)
\ No newline at end of file
......@@ -13,8 +13,11 @@ from dragon.vm.tensorflow.framework import ops
from dragon.vm.tensorflow.ops import var_scope as variable_scope
def get_variables(scope=None, suffix=None,
collection=ops.GraphKeys.GLOBAL_VARIABLES):
def get_variables(
scope=None,
suffix=None,
collection=ops.GraphKeys.GLOBAL_VARIABLES,
):
if isinstance(scope, variable_scope.VariableScope):
scope = scope.name
if suffix is not None:
......
......@@ -19,25 +19,31 @@ from __future__ import print_function
import math
from dragon.vm.tensorflow.framework import dtypes
from dragon.vm.tensorflow.ops import random_ops
from dragon.vm.tensorflow.framework import dtypes
__all__ = ['xavier_initializer',
'xavier_initializer_conv2d',
'variance_scaling_initializer']
def xavier_initializer(uniform=True, seed=None, dtype=dtypes.float32):
return variance_scaling_initializer(factor=1.0, mode='FAN_AVG',
uniform=uniform, seed=seed, dtype=dtype)
xavier_initializer_conv2d = xavier_initializer
def xavier_initializer(
uniform=True,
seed=None,
dtype=dtypes.float32,
):
return variance_scaling_initializer(
factor=1.0,
mode='FAN_AVG',
uniform=uniform,
seed=seed,
dtype=dtype,
)
def variance_scaling_initializer(factor=2.0, mode='FAN_IN', uniform=False,
seed=None, dtype=dtypes.float32):
def variance_scaling_initializer(
factor=2.0,
mode='FAN_IN',
uniform=False,
seed=None,
dtype=dtypes.float32,
):
if not dtype.is_floating:
raise TypeError('Cannot create initializer for non-floating point type.')
if mode not in ['FAN_IN', 'FAN_OUT', 'FAN_AVG']:
......@@ -79,3 +85,7 @@ def variance_scaling_initializer(factor=2.0, mode='FAN_IN', uniform=False,
seed=seed)
return _initializer
# Alias
xavier_initializer_conv2d = xavier_initializer
\ No newline at end of file
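A short usage sketch of the reformatted initializers; the arguments mirror the signatures above, and the returned callable is consumed by the variable-creation machinery, so it is not invoked here:

from dragon.vm.tensorflow.contrib.layers import initializers
from dragon.vm.tensorflow.framework import dtypes

# xavier == variance_scaling(factor=1.0, mode='FAN_AVG')
xavier = initializers.xavier_initializer(uniform=True, dtype=dtypes.float32)
msra = initializers.variance_scaling_initializer(factor=2.0, mode='FAN_IN')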
......@@ -17,20 +17,14 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import defaultdict
import dragon.ops as op_lib
import dragon.vm.tensorflow.framework.ops as ops
from dragon.vm.tensorflow.framework import ops
from dragon.vm.tensorflow.contrib.layers import initializers
from dragon.vm.tensorflow.ops import init_ops
from dragon.vm.tensorflow.ops import nn
from dragon.vm.tensorflow.ops import var_scope as vs
from dragon.vm.tensorflow.layers import layers
from dragon.ops import Flatten as _FlattenOp
__all__ = ['flatten']
_LAYERS_UID_DICT = defaultdict(int)
DATA_FORMAT_NCHW = 'NCHW'
DATA_FORMAT_NHWC = 'NHWC'
......@@ -38,53 +32,52 @@ DATA_FORMAT_NCDHW = 'NCDHW'
DATA_FORMAT_NDHWC = 'NDHWC'
def _default_scope(scope, key, indicator):
if scope is None:
return indicator
# global _LAYERS_UID_DICT
# _LAYERS_UID_DICT[key] += 1
# return '{}{}'.format(indicator, _LAYERS_UID_DICT[key])
else:
return scope
def avg_pool2d(inputs,
def avg_pool2d(
inputs,
kernel_size,
stride=2,
padding='VALID',
data_format=DATA_FORMAT_NHWC,
outputs_collections=None,
scope=None):
scope=None,
):
if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC):
raise ValueError('data_format has to be either NCHW or NHWC.')
df = ('channels_first' if data_format and data_format.startswith('NC')
else 'channels_last')
return layers.average_pooling2d(inputs=inputs,
return layers.average_pooling2d(
inputs=inputs,
pool_size=kernel_size,
strides=stride,
padding=padding,
data_format=df)
data_format=df,
)
def max_pool2d(inputs,
def max_pool2d(
inputs,
kernel_size,
stride=2,
padding='VALID',
data_format=DATA_FORMAT_NHWC,
outputs_collections=None,
scope=None):
scope=None,
):
if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC):
raise ValueError('data_format has to be either NCHW or NHWC.')
df = ('channels_first' if data_format and data_format.startswith('NC')
else 'channels_last')
return layers.max_pooling2d(inputs=inputs,
return layers.max_pooling2d(
inputs=inputs,
pool_size=kernel_size,
strides=stride,
padding=padding,
data_format=df)
data_format=df,
)
def convolution(inputs,
def convolution(
inputs,
num_outputs,
kernel_size,
stride=1,
......@@ -102,8 +95,9 @@ def convolution(inputs,
variables_collections=None,
outputs_collections=None,
trainable=True,
scope=None):
scope = _default_scope(scope, 'CONVOLUTION', 'Conv')
scope=None,
):
scope = _default_scope(scope, 'Conv')
if data_format not in [None, 'NHWC', 'NCHW']:
raise ValueError('Invalid data_format: %r' % (data_format,))
data_format = 'channels_first' if data_format == 'NCHW' else 'channels_last'
......@@ -126,7 +120,8 @@ def convolution(inputs,
bias_regularizer=biases_regularizer,
activity_regularizer=None,
trainable=trainable,
reuse=reuse)
reuse=reuse,
)
# Simple alias.
......@@ -134,7 +129,8 @@ convolution2d = convolution
conv2d = convolution2d
def fully_connected(inputs,
def fully_connected(
inputs,
num_outputs,
activation_fn=nn.relu,
normalizer_fn=None,
......@@ -147,8 +143,9 @@ def fully_connected(inputs,
variables_collections=None,
outputs_collections=None,
trainable=True,
scope=None):
scope = _default_scope(scope, 'FULLY_CONNECTED', 'fully_connected')
scope=None,
):
scope = _default_scope(scope, 'fully_connected')
with vs.variable_scope(scope, reuse=reuse) as sc:
return layers.dense(
inputs=inputs,
......@@ -160,10 +157,12 @@ def fully_connected(inputs,
bias_regularizer=biases_regularizer,
activity_regularizer=None,
trainable=trainable,
reuse=reuse)
reuse=reuse,
)
def batch_norm(inputs,
def batch_norm(
inputs,
decay=0.999,
center=True,
scale=False,
......@@ -184,8 +183,9 @@ def batch_norm(inputs,
scope=None,
renorm=False,
renorm_clipping=None,
renorm_decay=0.99):
scope = _default_scope(scope, 'BATCH_NORM', 'BatchNorm')
renorm_decay=0.99,
):
scope = _default_scope(scope, 'BatchNorm')
if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC):
raise ValueError('data_format has to be either NCHW or NHWC.')
axis = 1 if data_format == DATA_FORMAT_NCHW else -1
......@@ -193,10 +193,14 @@ def batch_norm(inputs,
with vs.variable_scope(scope, reuse=reuse) as sc:
if not param_initializers:
param_initializers = {}
beta_initializer = param_initializers.get('beta', init_ops.zeros_initializer())
gamma_initializer = param_initializers.get('gamma', init_ops.ones_initializer())
moving_mean_initializer = param_initializers.get('moving_mean', init_ops.zeros_initializer())
moving_variance_initializer = param_initializers.get('moving_variance', init_ops.ones_initializer())
beta_initializer = param_initializers.get(
'beta', init_ops.zeros_initializer())
gamma_initializer = param_initializers.get(
'gamma', init_ops.ones_initializer())
moving_mean_initializer = param_initializers.get(
'moving_mean', init_ops.zeros_initializer())
moving_variance_initializer = param_initializers.get(
'moving_variance', init_ops.ones_initializer())
if not param_regularizers:
param_regularizers = {}
......@@ -222,11 +226,19 @@ def batch_norm(inputs,
renorm_clipping=renorm_clipping,
renorm_momentum=renorm_decay,
fused=fused,
training=is_training)
training=is_training,
)
def flatten(inputs,
def flatten(
inputs,
outputs_collections=None,
scope=None):
return op_lib.Flatten(inputs, axis=0, keep_axes=2)
scope=None,
):
return _FlattenOp(inputs, axis=0, keep_axes=2)
def _default_scope(scope, indicator):
"""Return the default scope."""
if scope is None: return indicator
else: return scope
\ No newline at end of file
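Two small behaviors above are easy to overlook: the NC*/N*C data_format strings map onto the keras-style values passed to ``layers.*``, and ``_default_scope`` now takes only (scope, indicator). A self-contained sketch of both; the names below are local stand-ins, not this module's API:

def _to_channels(data_format):
    # Mirrors the translation done in avg_pool2d / max_pool2d above.
    return 'channels_first' if data_format and data_format.startswith('NC') \
        else 'channels_last'

def _default_scope(scope, indicator):
    # Same logic as the helper defined at the end of the module.
    return indicator if scope is None else scope

assert _to_channels('NCHW') == 'channels_first'
assert _to_channels('NHWC') == 'channels_last'
assert _default_scope(None, 'Conv') == 'Conv'
assert _default_scope('conv1', 'Conv') == 'conv1'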
......@@ -13,60 +13,68 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon
import numpy as np
import numpy
from dragon.core import scope as _scope
from dragon.core import workspace as _workspace
from dragon.core.tensor import Tensor as _Tensor
def constant(value, dtype=None, shape=None, name=None, verify_shape=False):
def constant(
value,
dtype=None,
shape=None,
name=None,
verify_shape=False,
):
if dtype is not None:
if isinstance(value, np.ndarray):
feed = value.astype(dtype.as_numpy_dtype)
elif isinstance(value, list):
feed = np.array(value, dtype.as_numpy_dtype)
else:
feed = np.array([value], dtype.as_numpy_dtype)
if isinstance(value, numpy.ndarray):
value = value.astype(dtype.as_numpy_dtype)
else:
if isinstance(value, np.ndarray): feed = value
value = numpy.array(value, dtype.as_numpy_dtype)
else:
feed = np.array(value)
if not isinstance(value, numpy.ndarray):
value = numpy.array(value)
# Discard the default float64
if feed.dtype == np.float64:
feed = feed.astype(np.float32)
if value.dtype == numpy.float64:
value = value.astype(numpy.float32)
# Determine the shape
if shape is not None:
if feed.size == 1:
if value.size == 1:
# Case 1: Broadcast with scalar value
c = feed.flatten()[0]
feed = np.zeros(shape, feed.dtype)
feed.fill(c)
scalar = value.flatten()[0]
value = numpy.empty(shape, value.dtype)
value.fill(scalar)
else:
# Case 2: Reshape directly
if verify_shape:
if shape is not None:
if len(shape) != len(value.shape):
raise RuntimeError(
'The constant was limited to {} dimensions, \
while feed a value with {} dimensions.'.
format(len(shape), len(value.shape)))
'The constant was limited to {} dimensions, ' \
'while feed a value with {} dimensions.'
.format(len(shape), len(value.shape)))
for i in range(len(shape)):
if shape[i] is None: continue
if shape[i] != value.shape[i]:
raise RuntimeError(
'The shape of constant was limited as (' +
','.join([str(dim) for dim in shape]) + '), ' +
'while feed a value with (' + ','.join([str(dim) for dim in value.shape]) + ').')
feed = feed.reshape(shape)
'while feed a value with (' +
','.join([str(dim) for dim in value.shape]) + ').')
value = value.reshape(shape)
defined_name = dragon.workspace.GetDummyName(
dragon.get_default_name_scope() +
# Get an available name
defined_name = \
_workspace.GetDummyName(
basename=_scope.get_default_name_scope() +
(name if name else 'Const'),
suffix=':0', domain='Tensor')
# Feed into the workspace
tensor = dragon.Tensor.Ref(
return _Tensor.Ref(
name=defined_name,
shape=list(feed.shape),
dtype=str(feed.dtype))
tensor.set_value(feed)
return tensor
\ No newline at end of file
shape=list(value.shape),
dtype=str(value.dtype)
).set_value(value)
\ No newline at end of file
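A self-contained sketch that mirrors the shape handling above with plain numpy: scalar values are broadcast to the requested shape, everything else is reshaped (dtype conversion and verify_shape checks are omitted):

import numpy

def _materialize(value, shape=None):
    """Mirror of the shape logic in constant(): broadcast scalars, reshape otherwise."""
    value = numpy.array(value)
    if value.dtype == numpy.float64:
        value = value.astype(numpy.float32)   # discard the default float64
    if shape is not None:
        if value.size == 1:
            scalar = value.flatten()[0]       # Case 1: broadcast a scalar
            value = numpy.empty(shape, value.dtype)
            value.fill(scalar)
        else:
            value = value.reshape(shape)      # Case 2: reshape directly
    return value

assert _materialize(1.0, (2, 3)).shape == (2, 3)
assert _materialize([1, 2, 3, 4], (2, 2)).shape == (2, 2)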
......@@ -13,9 +13,10 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.tensorflow.framework.ops import Graph
# The Graph (backed by Workspace)
from dragon.core.workspace import Workspace as Graph
# Utilities used when building a Graph.
# Utilities used when building a Graph
from dragon.vm.tensorflow.framework.ops import device
from dragon.vm.tensorflow.framework.ops import name_scope
from dragon.vm.tensorflow.framework.ops import get_default_graph
......@@ -27,5 +28,6 @@ from dragon.vm.tensorflow.framework.ops import GraphKeys
from dragon.vm.tensorflow.framework.constant_op import *
from dragon.vm.tensorflow.framework.dtypes import *
# Utilities used to represent a Tensor
from dragon.vm.tensorflow.framework.tensor_shape import Dimension
from dragon.vm.tensorflow.framework.tensor_shape import TensorShape
\ No newline at end of file
......@@ -17,16 +17,20 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import re
import uuid
import threading
import dragon
from dragon.core import tls as _tls
from dragon.core import scope as _scope
from dragon.core import workspace as _workspace
from dragon.core.tensor import Tensor as _Tensor
from dragon.vm.tensorflow.framework import constant_op
from dragon.vm.tensorflow.util import tf_contextlib
def convert_to_tensor(value, dtype=None, name=None, preferred_dtype=None):
def convert_to_tensor(
value,
dtype=None,
name=None,
preferred_dtype=None,
):
"""Converts the given value to a Tensor.
Parameters
......@@ -46,73 +50,10 @@ def convert_to_tensor(value, dtype=None, name=None, preferred_dtype=None):
The output tensor.
"""
if isinstance(value, dragon.Tensor): return value
if isinstance(value, _Tensor): return value
return constant_op.constant(value, dtype=dtype, name=name)
class Graph(object):
"""A wrapper to connect ``Function`` to ``Workspace``.
Note that official TensorFlow trace the expressions explicitly
in this class, while we have done in the virtual stack.
Besides, organizing a ``Flow``, i.e., expressions with specified
outputs should also be done here.
"""
def __init__(self):
self._collections = {}
self._workspace = 'tf/graph/' + str(uuid.uuid4())
def get_collection_ref(self, name):
coll_list = self._collections.get(name, None)
if coll_list is None:
coll_list = []
self._collections[name] = coll_list
return coll_list
def get_collection(self, name, scope=None):
coll_list = self._collections.get(name, None)
if coll_list is None:
return []
if scope is None:
return list(coll_list)
else:
filter_coll_list = []
regex = re.compile(scope)
for item in coll_list:
if hasattr(item, "name") and regex.match(item.name):
filter_coll_list.append(item)
return filter_coll_list
def add_to_collection(self, name, value):
if name not in self._collections:
self._collections[name] = [value]
else:
self._collections[name].append(value)
def add_to_collections(self, names, value):
for name in names:
self.add_to_collection(name, value)
def device(self, device_name_or_function):
if not isinstance(device_name_or_function, str):
raise TypeError('The device function should be a str.')
device_and_id = device_name_or_function.split('/')[1]
device, id = device_and_id.split(':')
if device not in ['cpu', 'gpu']:
raise ValueError('The device should either be cpu or gpu.')
try:
id = int(id)
except Exception as e:
raise ValueError('The device id should be a integer.')
return dragon.device_scope(device, device_id=id)
def as_default(self):
return _default_graph_stack.get_controller(self)
class GraphKeys(object):
GLOBAL_VARIABLES = "variables"
# Key to collect local variables that are local to the machine and are not
......@@ -202,112 +143,15 @@ def add_to_collections(names, value):
def name_scope(name, default_name=None, values=None):
name = default_name if name is None else name
name = '' if name is None else name
return dragon.name_scope(name)
##############################################
# #
# Default Stack #
# #
##############################################
class _DefaultStack(threading.local):
"""A thread-local stack of objects for providing implicit defaults."""
def __init__(self):
super(_DefaultStack, self).__init__()
self._enforce_nesting = True
self.stack = []
def get_default(self):
return self.stack[-1] if len(self.stack) >= 1 else None
def reset(self):
self.stack = []
def is_cleared(self):
return not self.stack
@property
def enforce_nesting(self):
return self._enforce_nesting
@enforce_nesting.setter
def enforce_nesting(self, value):
self._enforce_nesting = value
@tf_contextlib.contextmanager
def get_controller(self, default):
"""A context manager for manipulating a default stack."""
self.stack.append(default)
try:
yield default
finally:
# stack may be empty if reset() was called
if self.stack:
if self._enforce_nesting:
if self.stack[-1] is not default:
raise AssertionError(
"Nesting violated for default stack of %s objects" %
type(default))
self.stack.pop()
else:
self.stack.remove(default)
class _DefaultGraphStack(_DefaultStack):
"""A thread-local stack of objects for providing an implicit default graph."""
def __init__(self):
super(_DefaultGraphStack, self).__init__()
self._global_default_graph = None
def get_default(self):
"""Override that returns a global default if the stack is empty."""
ret = super(_DefaultGraphStack, self).get_default()
if ret is None:
ret = self._GetGlobalDefaultGraph()
return ret
def _GetGlobalDefaultGraph(self):
if self._global_default_graph is None:
# TODO(mrry): Perhaps log that the default graph is being used, or set
# provide some other feedback to prevent confusion when a mixture of
# the global default graph and an explicit graph are combined in the
# same process.
self._global_default_graph = Graph()
# Rewritten the random workspace name
self._global_default_graph._workspace = 'default'
return self._global_default_graph
def reset(self):
super(_DefaultGraphStack, self).reset()
# We should call dragon api to reset the workspace
dragon.workspace.ResetWorkspace(self._global_default_graph._workspace)
self._global_default_graph = None
@tf_contextlib.contextmanager
def get_controller(self, default):
with super(_DefaultGraphStack, self).get_controller(default) as g:
with dragon.ws_scope(g._workspace):
yield g
_default_graph_stack = _DefaultGraphStack()
_default_session_stack = _DefaultStack()
return _scope.name_scope(name)
def get_default_graph():
return _default_graph_stack.get_default()
return _workspace.get_default_workspace()
def reset_default_graph():
if not _default_graph_stack.is_cleared():
raise AssertionError("Do not use tf.reset_default_graph() to clear "
"nested graphs. If you need a cleared graph, "
"exit the nesting and create a new graph.")
_default_graph_stack.reset()
_workspace.reset_default_workspace()
def default_session(session):
......@@ -319,7 +163,17 @@ def get_default_session():
def device(device_name_or_function):
return get_default_graph().device(device_name_or_function)
if not isinstance(device_name_or_function, str):
raise TypeError('The device function should be a str.')
device_and_id = device_name_or_function.split('/')[1]
device, id = device_and_id.split(':')
if device not in ['cpu', 'gpu']:
raise ValueError('The device should either be cpu or gpu.')
try:
id = int(id)
except Exception as _:
raise ValueError('The device id should be an integer.')
return _scope.device_scope(device, device_id=id)
def _eval_using_default_session(tensors, feed_dict, session=None):
......@@ -333,6 +187,10 @@ def _eval_using_default_session(tensors, feed_dict, session=None):
return session.run(tensors, feed_dict)
_default_session_stack = _tls.Stack()
# The Monkey Patching
# Require "import dragon.vm.tensorflow"
dragon.Tensor.eval = lambda self, feed_dict=None, session=None : \
_Tensor.eval = lambda self, feed_dict=None, session=None : \
_eval_using_default_session(self, feed_dict, session)
\ No newline at end of file
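A hedged sketch of the Workspace-backed helpers above; the ``tf`` alias is an assumption, and importing the module is what installs the monkey patches.
import dragon.vm.tensorflow as tf  # assumed alias; the import applies the patches above

ws = tf.get_default_graph()        # now the default dragon Workspace rather than a tf.Graph
with tf.device('/gpu:0'):          # parsed into _scope.device_scope('gpu', device_id=0)
    x = tf.constant([1., 2., 3.])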
......@@ -13,7 +13,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.core.tensor import Tensor
from dragon.core.tensor import Tensor as _Tensor
class Dimension(object):
......@@ -114,4 +114,5 @@ def get_shape(self):
return TensorShape(self.shape)
Tensor.get_shape = get_shape
\ No newline at end of file
# The Monkey Patching
_Tensor.get_shape = get_shape
\ No newline at end of file
......@@ -28,7 +28,13 @@ from dragon.vm.tensorflow.util import nest
class Layer(object):
def __init__(self, trainable=True, name=None, dtype=dtypes.float32, **kwargs):
def __init__(
self,
trainable=True,
name=None,
dtype=dtypes.float32,
**kwargs
):
allowed_kwargs = {'_scope', '_reuse'}
for kwarg in kwargs:
if kwarg not in allowed_kwargs:
......@@ -79,13 +85,15 @@ class Layer(object):
_add_elements_to_collection(self.updates, ops.GraphKeys.UPDATE_OPS)
return outputs
def add_variable(self,
def add_variable(
self,
name,
shape,
dtype=None,
trainable=True,
initializer=None,
regularizer=None):
regularizer=None,
):
if dtype is None: dtype = self.dtype
variable = vs.get_variable(
name,
......@@ -93,7 +101,8 @@ class Layer(object):
initializer=initializer,
regularizer=regularizer,
dtype=dtypes.as_dtype(dtype),
trainable=trainable and self.trainable)
trainable=trainable and self.trainable,
)
if trainable:
self._trainable_weights.append(variable)
else:
......@@ -105,9 +114,14 @@ class Layer(object):
class InputSpec(object):
def __init__(self,
dtype=None, shape=None, ndim=None,
max_ndim=None, min_ndim=None, axes=None
def __init__(
self,
dtype=None,
shape=None,
ndim=None,
max_ndim=None,
min_ndim=None,
axes=None,
):
self.dtype = dtype
self.shape = shape
......@@ -125,9 +139,6 @@ def _to_snake_case(name):
return 'private' + insecure
PER_GRAPH_LAYER_NAME_UIDS = weakref.WeakKeyDictionary()
def _unique_layer_name(name):
global PER_GRAPH_LAYER_NAME_UIDS
graph = ops.get_default_graph()
......@@ -153,3 +164,6 @@ def _add_elements_to_collection(elements, collection_list):
for element in elements:
if element not in collection_set:
collection.append(element)
PER_GRAPH_LAYER_NAME_UIDS = weakref.WeakKeyDictionary()
\ No newline at end of file
......@@ -20,7 +20,8 @@ from dragon.vm.tensorflow.ops import nn
class _Conv(base.Layer):
def __init__(self,
def __init__(
self,
rank,
filters,
kernel_size,
......@@ -37,7 +38,8 @@ class _Conv(base.Layer):
activity_regularizer=None,
trainable=True,
name=None,
**kwargs):
**kwargs
):
super(_Conv, self).__init__(trainable=trainable, name=name, **kwargs)
self.rank = rank
self.filters = filters
......@@ -82,7 +84,8 @@ class _Conv(base.Layer):
shape=kernel_shape,
initializer=self.kernel_initializer,
regularizer=self.kernel_regularizer,
dtype=self.dtype)
dtype=self.dtype,
)
if self.use_bias:
self.bias = self.add_variable(
......@@ -90,7 +93,8 @@ class _Conv(base.Layer):
shape=(self.filters,),
initializer=self.bias_initializer,
regularizer=self.bias_regularizer,
dtype=self.dtype)
dtype=self.dtype,
)
else:
self.bias = None
......@@ -108,10 +112,15 @@ class _Conv(base.Layer):
dilation_rate=self.dilation_rate,
strides=self.strides,
padding=self.padding.upper(),
data_format=tf_data_format)
data_format=tf_data_format,
)
if self.bias is not None:
outputs = nn.bias_add(outputs, self.bias, data_format=tf_data_format)
outputs = nn.bias_add(
outputs,
self.bias,
data_format=tf_data_format,
)
if self.activation is not None:
return self.activation(outputs)
......@@ -119,7 +128,9 @@ class _Conv(base.Layer):
class Conv2D(_Conv):
def __init__(self, filters,
def __init__(
self,
filters,
kernel_size,
strides=(1, 1),
padding='valid',
......@@ -134,7 +145,8 @@ class Conv2D(_Conv):
activity_regularizer=None,
trainable=True,
name=None,
**kwargs):
**kwargs
):
super(Conv2D, self).__init__(
rank=2,
filters=filters,
......@@ -154,7 +166,8 @@ class Conv2D(_Conv):
name=name, **kwargs)
def conv2d(inputs,
def conv2d(
inputs,
filters,
kernel_size,
strides=(1, 1),
......@@ -170,7 +183,8 @@ def conv2d(inputs,
activity_regularizer=None,
trainable=True,
name=None,
reuse=None):
reuse=None,
):
return Conv2D(
filters=filters,
kernel_size=kernel_size,
......@@ -188,4 +202,5 @@ def conv2d(inputs,
trainable=trainable,
name=name,
_reuse=reuse,
_scope=name).apply(inputs)
\ No newline at end of file
_scope=name,
).apply(inputs)
\ No newline at end of file
......@@ -24,7 +24,8 @@ from dragon.vm.tensorflow.ops import standard_ops
class Dense(base.Layer):
def __init__(self,
def __init__(
self,
units,
activation=None,
use_bias=True,
......@@ -35,7 +36,8 @@ class Dense(base.Layer):
activity_regularizer=None,
trainable=True,
name=None,
**kwargs):
**kwargs
):
super(Dense, self).__init__(trainable=trainable, name=name, **kwargs)
self.units = units
self.activation = activation
......@@ -61,7 +63,8 @@ class Dense(base.Layer):
shape=[input_shape[-1].value, self.units],
initializer=self.kernel_initializer,
regularizer=self.kernel_regularizer,
dtype=self.dtype)
dtype=self.dtype,
)
if self.use_bias:
self.bias = self.add_variable(
......@@ -69,7 +72,8 @@ class Dense(base.Layer):
shape=[self.units,],
initializer=self.bias_initializer,
regularizer=self.bias_regularizer,
dtype=self.dtype)
dtype=self.dtype,
)
else:
self.bias = None
self.built = True
......@@ -83,7 +87,8 @@ class Dense(base.Layer):
return outputs
def dense(inputs,
def dense(
inputs,
units,
activation=None,
use_bias=True,
......@@ -94,7 +99,8 @@ def dense(inputs,
activity_regularizer=None,
trainable=True,
name=None,
reuse=None):
reuse=None,
):
return Dense(
units,
activation=activation,
......@@ -107,4 +113,5 @@ def dense(inputs,
trainable=trainable,
name=name,
_scope=name,
_reuse=reuse).apply(inputs)
\ No newline at end of file
_reuse=reuse,
).apply(inputs)
\ No newline at end of file
......@@ -13,20 +13,20 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.tensorflow.layers.convolutional import (
from .convolutional import (
conv2d, Conv2D,
)
from dragon.vm.tensorflow.layers.core import (
from .core import (
dense, Dense,
)
from dragon.vm.tensorflow.layers.normalization import (
from .normalization import (
batch_normalization, BatchNormalization,
batch_norm, BatchNorm,
)
from dragon.vm.tensorflow.layers.pooling import (
from .pooling import (
average_pooling2d, AveragePooling2D,
max_pooling2d, MaxPooling2D,
)
......@@ -17,7 +17,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon
from dragon.ops import BatchNorm as _BatchNormOp
from dragon.vm.tensorflow.framework import tensor_shape
from dragon.vm.tensorflow.layers import base
......@@ -25,7 +25,8 @@ from dragon.vm.tensorflow.ops import init_ops
class BatchNormalization(base.Layer):
def __init__(self,
def __init__(
self,
axis=-1,
momentum=0.99,
epsilon=1e-3,
......@@ -43,8 +44,10 @@ class BatchNormalization(base.Layer):
fused=None,
trainable=True,
name=None,
**kwargs):
super(BatchNormalization, self).__init__(trainable=trainable, name=name, **kwargs)
**kwargs
):
super(BatchNormalization, self).__init__(
trainable=trainable, name=name, **kwargs)
self.axis = axis
self.momentum = momentum
self.epsilon = epsilon
......@@ -92,33 +95,37 @@ class BatchNormalization(base.Layer):
name='moving_mean',
shape=(param_dim.value,),
initializer=self.moving_mean_initializer,
trainable=False)
trainable=False,
)
self.moving_variance = self.add_variable(
name='moving_variance',
shape=(param_dim.value,),
initializer=self.moving_variance_initializer,
trainable=False)
trainable=False,
)
self.gamma = self.add_variable(
name='gamma',
shape=(param_dim.value,),
initializer=self.gamma_initializer,
regularizer=self.gamma_regularizer,
trainable=self.scale)
trainable=self.scale,
)
self.beta = self.add_variable(
name='beta',
shape=(param_dim.value,),
initializer=self.beta_initializer,
regularizer=self.beta_regularizer,
trainable=self.center)
trainable=self.center,
)
self.built = True
def call(self, inputs, training=False, *args, **kwargs):
use_stats = 0 if training else 1
return dragon.ops.BatchNorm([
return _BatchNormOp([
inputs,
self.moving_mean,
self.moving_variance,
......@@ -127,7 +134,8 @@ class BatchNormalization(base.Layer):
axis=self.axis,
momentum=self.momentum,
eps=self.epsilon,
use_stats=use_stats)
use_stats=use_stats,
)
def batch_normalization(
......@@ -170,7 +178,8 @@ def batch_normalization(
trainable=trainable,
name=name,
_reuse=reuse,
_scope=name).apply(inputs, training=training)
_scope=name,
).apply(inputs, training=training)
# Aliases
......
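A hedged usage sketch of the layer above: ``training`` flips ``use_stats`` between the batch statistics (0) and the moving averages (1). The ``tf.layers`` path and the NCHW placeholder are assumptions.
import dragon.vm.tensorflow as tf  # assumed alias

x = tf.placeholder(tf.float32, shape=[None, 64, 32, 32])  # NCHW
y = tf.layers.batch_normalization(
    x, axis=1, momentum=0.99, epsilon=1e-3,
    training=True,  # -> use_stats=0; False would select the moving averages
)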
......@@ -22,9 +22,16 @@ from dragon.vm.tensorflow.layers import base, utils
class _Pooling2D(base.Layer):
def __init__(self, pool_function, pool_size, strides,
padding='valid', data_format='channels_last',
name=None, **kwargs):
def __init__(
self,
pool_function,
pool_size,
strides,
padding='valid',
data_format='channels_last',
name=None,
**kwargs
):
super(_Pooling2D, self).__init__(name=name, **kwargs)
self.pool_function = pool_function
self.pool_size = utils.normalize_tuple(pool_size, 2, 'pool_size')
......@@ -40,19 +47,25 @@ class _Pooling2D(base.Layer):
else:
pool_shape = (1, 1) + self.pool_size
strides = (1, 1) + self.strides
outputs = self.pool_function(
return self.pool_function(
inputs,
ksize=pool_shape,
strides=strides,
padding=self.padding.upper(),
data_format=utils.convert_data_format(self.data_format, 4))
return outputs
data_format=utils.convert_data_format(self.data_format, 4),
)
class MaxPooling2D(_Pooling2D):
def __init__(self, pool_size, strides,
padding='valid', data_format='channels_last',
name=None, **kwargs):
def __init__(
self,
pool_size,
strides,
padding='valid',
data_format='channels_last',
name=None,
**kwargs
):
super(MaxPooling2D, self).__init__(
nn.max_pool,
pool_size=pool_size,
......@@ -63,9 +76,15 @@ class MaxPooling2D(_Pooling2D):
class AveragePooling2D(_Pooling2D):
def __init__(self, pool_size, strides,
padding='valid', data_format='channels_last',
name=None, **kwargs):
def __init__(
self,
pool_size,
strides,
padding='valid',
data_format='channels_last',
name=None,
**kwargs
):
super(AveragePooling2D, self).__init__(
nn.avg_pool,
pool_size=pool_size,
......@@ -76,22 +95,34 @@ class AveragePooling2D(_Pooling2D):
def max_pooling2d(
inputs, pool_size, strides, padding='valid',
data_format='channels_last', name=None):
inputs,
pool_size,
strides,
padding='valid',
data_format='channels_last',
name=None,
):
return MaxPooling2D(
pool_size=pool_size,
strides=strides,
padding=padding,
data_format=data_format,
name=name).apply(inputs)
name=name,
).apply(inputs)
def average_pooling2d(
inputs, pool_size, strides, padding='valid',
data_format='channels_last', name=None):
inputs,
pool_size,
strides,
padding='valid',
data_format='channels_last',
name=None,
):
return AveragePooling2D(
pool_size=pool_size,
strides=strides,
padding=padding,
data_format=data_format,
name=name).apply(inputs)
\ No newline at end of file
name=name,
).apply(inputs)
\ No newline at end of file
......@@ -13,8 +13,10 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon
from dragon import ops as _ops
from dragon.core import scope as _scope
from dragon.core import workspace as _workspace
from dragon.core.tensor import Tensor as _Tensor
from dragon.vm.tensorflow.framework import dtypes
......@@ -23,19 +25,19 @@ def expand_dims(input, axis=None, name=None, dim=None):
if axis is not None:
raise ValueError("cannot specify both 'axis' and 'dim'.")
axis = dim
return dragon.ops.ExpandDims(input, axis=axis, name=name)
return _ops.ExpandDims(input, axis=axis, name=name)
def shape(input, name=None, out_type=dtypes.float32):
return dragon.ops.Shape(input, name=name)
def shape(input, name=None, out_type=dtypes.int64):
return _ops.Shape(input, name=name)
def zeros(shape, dtype=dtypes.float32, name=None):
return dragon.ops.Fill(shape, value=0.0, dtype=dtype.name, name=name)
return _ops.Fill(shape, value=0.0, dtype=dtype.name, name=name)
def ones(shape, dtype=dtypes.float32, name=None):
return dragon.ops.Fill(shape, value=1.0, dtype=dtype.name, name=name)
return _ops.Fill(shape, value=1.0, dtype=dtype.name, name=name)
def placeholder(dtype, shape=None, name=None):
......@@ -45,29 +47,41 @@ def placeholder(dtype, shape=None, name=None):
raise TypeError('The dtype should be a valid tensorflow data type.')
# Construct a tensor from the explicit name
return dragon.Tensor.Ref(
dragon.workspace.GetDummyName(
dragon.get_default_name_scope() + name
return _Tensor.Ref(
_workspace.GetDummyName(
_scope.get_default_name_scope() + name
if name else 'Placeholder',
suffix=':0', domain='Tensor'),
dtype=dtype.name, shape=shape).Placeholder()
def concat(values, axis, name=None):
return dragon.ops.Concat(values, axis=axis, name=name)
return _ops.Concat(values, axis=axis, name=name)
def transpose(a, perm=None, name=None):
return dragon.ops.Transpose(a, perm=perm, name=name)
return _ops.Transpose(a, perm=perm, name=name)
def tile(input, multiples, name=None):
return dragon.ops.Tile(input, multiples=multiples, name=name)
return _ops.Tile(input, multiples=multiples, name=name)
def pad(tensor, paddings, mode="CONSTANT", name=None, constant_values=0):
return dragon.ops.Pad(tensor, paddings, mode=mode, name=name, value=constant_values)
def pad(
tensor,
paddings,
mode="CONSTANT",
name=None,
constant_values=0,
):
return _ops.Pad(
tensor,
paddings,
mode=mode,
name=name,
value=constant_values,
)
def reshape(tensor, shape, name=None):
return dragon.ops.Reshape(tensor, shape=shape, name=name)
return _ops.Reshape(tensor, shape=shape, name=name)
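A short hedged sketch of the array helpers above; the ``tf`` alias and the top-level re-exports are assumptions.
import dragon.vm.tensorflow as tf  # assumed alias

x = tf.placeholder(tf.float32, shape=[None, 4], name='x')  # registered under a dummy name
y = tf.expand_dims(x, axis=0)                              # _ops.ExpandDims
z = tf.reshape(y, shape=[-1, 2])                           # _ops.Reshape; symbolic until a graph runs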
......@@ -13,8 +13,17 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon
from dragon import ops as _ops
def equal(a, b, name=None):
return dragon.ops.Equal([a, b], name=name)
\ No newline at end of file
return _ops.Equal([a, b], name=name)
def greater(a, b, name=None):
return _ops.Greater([a, b], name=name)
def less(a, b, name=None):
return _ops.Less([a, b], name=name)
......@@ -13,7 +13,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon
from dragon.vm.theano.gradient import grad as _Grad
def gradients(ys, xs, **kwargs):
......@@ -34,5 +34,5 @@ def gradients(ys, xs, **kwargs):
"""
dxs = []
if not isinstance(ys, list): ys = [ys]
for y in ys: dxs.append(dragon.grad(y, xs))
for y in ys: dxs.append(_Grad(y, xs))
if len(dxs) == 1: return dxs[0]
\ No newline at end of file
......@@ -13,8 +13,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon
from dragon import ops as _ops
from dragon.vm.tensorflow.framework import dtypes
......@@ -59,7 +58,7 @@ class Zeros(Initializer):
def __call__(self, shape, dtype=None, **kwargs):
if dtype is None: dtype = self.dtype
return dragon.ops.Fill(shape, value=0, dtype=dtype.name)
return _ops.Fill(shape, value=0, dtype=dtype.name)
class Ones(Initializer):
......@@ -83,7 +82,7 @@ class Ones(Initializer):
def __call__(self, shape, dtype=None, **kwargs):
if dtype is None: dtype = self.dtype
return dragon.ops.Fill(shape, value=1, dtype=dtype.name)
return _ops.Fill(shape, value=1, dtype=dtype.name)
class Constant(Initializer):
......@@ -93,7 +92,7 @@ class Constant(Initializer):
def __call__(self, shape, dtype=None, **kwargs):
if dtype is None: dtype = self.dtype
return dragon.ops.Fill(shape, value=self.value, dtype=dtype.name)
return _ops.Fill(shape, value=self.value, dtype=dtype.name)
class RandomUniform(Initializer):
......@@ -104,8 +103,12 @@ class RandomUniform(Initializer):
def __call__(self, shape, dtype=None, **kwargs):
if dtype is None: dtype = self.dtype
return dragon.ops.RandomUniform(
shape, self.minval, self.maxval, dtype=dtype.name)
return _ops.RandomUniform(
shape=shape,
low=self.minval,
high=self.maxval,
dtype=dtype.name,
)
class RandomNormal(Initializer):
......@@ -117,8 +120,12 @@ class RandomNormal(Initializer):
def __call__(self, shape, dtype=None, **kwargs):
if dtype is None: dtype = self.dtype
return dragon.ops.RandomNormal(
shape, self.mean, self.stddev, dtype=dtype.name)
return _ops.RandomNormal(
shape=shape,
mean=self.mean,
std=self.stddev,
dtype=dtype.name,
)
class TruncatedNormal(Initializer):
......@@ -130,15 +137,21 @@ class TruncatedNormal(Initializer):
def __call__(self, shape, dtype=None, **kwargs):
if dtype is None: dtype = self.dtype
return dragon.ops.TruncatedNormal(
shape, self.mean, self.stddev, dtype=dtype.name)
return _ops.TruncatedNormal(
shape=shape,
mean=self.mean,
std=self.stddev,
dtype=dtype.name,
)
class VarianceScaling(Initializer):
def __init__(self,
scale=1.0, mode="fan_in",
def __init__(
self,
scale=1.0,
mode="fan_in",
distribution="normal",
dtype=dtypes.float32
dtype=dtypes.float32,
):
if scale <= 0.:
raise ValueError("`scale` must be positive float.")
......@@ -159,13 +172,40 @@ class VarianceScaling(Initializer):
def __call__(self, shape, dtype=None, **kwargs):
if dtype is None: dtype = self.dtype
if self.distribution == "normal":
return dragon.ops.GlorotNormal(shape=shape, scale=self.scale * 2.,
mode=self.mode, dtype=dtype.name)
return _ops.GlorotNormal(
shape=shape,
scale=self.scale * 2.,
mode=self.mode,
dtype=dtype.name,
)
else:
return dragon.ops.GlorotUniform(shape=shape, scale=self.scale * 3.,
mode=self.mode, dtype=dtype.name)
return _ops.GlorotUniform(
shape=shape,
scale=self.scale * 3.,
mode=self.mode,
dtype=dtype.name,
)
def glorot_uniform_initializer(dtype=dtypes.float32):
return variance_scaling_initializer(
scale=1.0,
mode='fan_avg',
distribution='uniform',
dtype=dtype,
)
def glorot_normal_initializer(dtype=dtypes.float32):
return variance_scaling_initializer(
scale=1.0,
mode='fan_avg',
distribution='normal',
dtype=dtype,
)
# Aliases
zeros_initializer = Zeros
ones_initializer = Ones
constant_initializer = Constant
......@@ -173,13 +213,3 @@ random_uniform_initializer = RandomUniform
random_normal_initializer = RandomNormal
truncated_normal_initializer = TruncatedNormal
variance_scaling_initializer = VarianceScaling
\ No newline at end of file
def glorot_uniform_initializer(dtype=dtypes.float32):
return variance_scaling_initializer(scale=1.0,
mode='fan_avg', distribution='uniform', dtype=dtype)
def glorot_normal_initializer(dtype=dtypes.float32):
return variance_scaling_initializer(scale=1.0,
mode='fan_avg', distribution='normal', dtype=dtype)
\ No newline at end of file
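A hedged sketch of the glorot helpers, which are thin wrappers over variance_scaling_initializer with ``fan_avg`` mode; the ``tf`` alias is an assumption.
import dragon.vm.tensorflow as tf  # assumed alias

w_init = tf.glorot_uniform_initializer()  # VarianceScaling(scale=1., mode='fan_avg', distribution='uniform')
w = tf.get_variable('conv1/w', shape=[3, 3, 3, 64], initializer=w_init)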
......@@ -17,8 +17,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon
from dragon import ops as _ops
from dragon.vm.tensorflow.framework import ops
......@@ -34,8 +33,8 @@ class Reduction(object):
* `SUM_OVER_NONZERO_WEIGHTS`: Scalar `SUM` divided by number of non-zero
weights. DEPRECATED.
* `SUM_BY_NONZERO_WEIGHTS`: Same as `SUM_OVER_NONZERO_WEIGHTS`.
"""
"""
NONE = "none"
SUM = "weighted_sum"
SUM_OVER_BATCH_SIZE = "weighted_sum_over_batch_size"
......@@ -51,7 +50,8 @@ class Reduction(object):
cls.MEAN,
cls.SUM_OVER_BATCH_SIZE,
cls.SUM_OVER_NONZERO_WEIGHTS,
cls.SUM_BY_NONZERO_WEIGHTS)
cls.SUM_BY_NONZERO_WEIGHTS,
)
@classmethod
def validate(cls, key):
......@@ -59,10 +59,43 @@ class Reduction(object):
raise ValueError("Invalid Reduction Key %s." % key)
def softmax_cross_entropy(
onehot_labels,
logits,
weights=1.,
label_smoothing=0,
scope=None,
loss_collection=ops.GraphKeys.LOSSES,
reduction=Reduction.SUM_BY_NONZERO_WEIGHTS,
):
if onehot_labels is None: raise ValueError("onehot_labels must not be None.")
if logits is None: raise ValueError("logits must not be None.")
normalization = None
if reduction == Reduction.NONE: normalization = 'UNIT'
elif reduction == Reduction.MEAN: normalization = 'FULL'
elif reduction == Reduction.SUM_BY_NONZERO_WEIGHTS or \
reduction == Reduction.SUM_OVER_NONZERO_WEIGHTS:
normalization = 'NONE'
elif reduction == Reduction.SUM_OVER_BATCH_SIZE:
normalization = 'BATCH_SIZE'
loss = _ops.SoftmaxCrossEntropy(
[logits, onehot_labels],
normalization=normalization,
name=scope,
)
if weights != 1.0: loss = weights * loss
ops.add_to_collection(loss_collection, loss)
return loss
def sparse_softmax_cross_entropy(
labels, logits, weights=1.0, scope=None,
labels,
logits,
weights=1.,
scope=None,
loss_collection=ops.GraphKeys.LOSSES,
reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
reduction=Reduction.SUM_BY_NONZERO_WEIGHTS,
):
if labels is None: raise ValueError("labels must not be None.")
if logits is None: raise ValueError("logits must not be None.")
normalization = None
......@@ -70,13 +103,14 @@ def sparse_softmax_cross_entropy(
elif reduction == Reduction.MEAN: normalization = 'FULL'
elif reduction == Reduction.SUM_BY_NONZERO_WEIGHTS or \
reduction == Reduction.SUM_OVER_NONZERO_WEIGHTS:
normalization = 'VALID'
normalization = 'NONE'
elif reduction == Reduction.SUM_OVER_BATCH_SIZE:
normalization = 'BATCH_SIZE'
loss = dragon.ops.SparseSoftmaxCrossEntropy([logits, labels],
normalization=normalization, name=scope)
loss = _ops.SparseSoftmaxCrossEntropy(
[logits, labels],
normalization=normalization,
name=scope,
)
if weights != 1.0: loss = weights * loss
ops.add_to_collection(loss_collection, loss)
return loss
\ No newline at end of file
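A hedged usage sketch showing how a Reduction key is mapped onto Dragon's normalization modes; the ``tf.losses`` path is an assumption.
import dragon.vm.tensorflow as tf  # assumed alias

logits = tf.placeholder(tf.float32, shape=[None, 10])
labels = tf.placeholder(tf.int64, shape=[None])
loss = tf.losses.sparse_softmax_cross_entropy(
    labels, logits,
    reduction=tf.losses.Reduction.MEAN,  # mapped to normalization='FULL' above
)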
......@@ -13,31 +13,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon
__all__ = [
'argmax',
'argmin',
'matmul',
'add',
'subtract',
'multiply',
'divide',
'sub',
'mul',
'div',
'cast',
'log',
'exp',
'square',
'sqrt',
'reduce_sum',
'reduce_mean',
'sigmoid',
'tanh',
'add_n',
]
from dragon import ops as _ops
def argmax(input, axis=None, name=None, dimension=None):
......@@ -46,7 +22,7 @@ def argmax(input, axis=None, name=None, dimension=None):
raise ValueError("cannot specify both 'axis' and 'dimension'.")
axis = dimension
elif axis is None: axis = 0
return dragon.ops.ArgMax(input, axis=axis, name=name)
return _ops.ArgMax(input, axis=axis, name=name)
def argmin(input, axis=None, name=None, dimension=None):
......@@ -55,31 +31,38 @@ def argmin(input, axis=None, name=None, dimension=None):
raise ValueError("cannot specify both 'axis' and 'dimension'.")
axis = dimension
elif axis is None: axis = 0
return dragon.ops.ArgMin(input, axis=axis, name=name)
return _ops.ArgMin(input, axis=axis, name=name)
def matmul(a,
def matmul(
a,
b,
transpose_a=False,
transpose_b=False,
name=None):
return dragon.ops.Matmul([a, b], transA=transpose_a, transB=transpose_b, name=name)
name=None,
):
return _ops.Matmul(
[a, b],
transA=transpose_a,
transB=transpose_b,
name=name,
)
def add(x, y, name=None):
return dragon.ops.Add([x, y], name=name)
return _ops.Add([x, y], name=name)
def subtract(x, y, name=None):
return dragon.ops.Sub([x, y], name=name)
return _ops.Sub([x, y], name=name)
def multiply(x, y, name=None):
return dragon.ops.Mul([x, y], name=name)
return _ops.Mul([x, y], name=name)
def divide(x, y, name=None):
return dragon.ops.Div([x, y], name=name)
return _ops.Div([x, y], name=name)
def mul(x, y, name=None):
......@@ -95,27 +78,27 @@ def div(x, y, name=None):
def cast(x, dtype, name=None):
return dragon.ops.Cast(x, dtype=dtype, name=name)
return _ops.Cast(x, dtype=dtype, name=name)
def log(x, name=None):
return dragon.ops.Log(x, name=name)
return _ops.Log(x, name=name)
def exp(x, name=None):
return dragon.ops.Exp(x, name=name)
return _ops.Exp(x, name=name)
def square(x, name=None):
return dragon.ops.Square(x, name=name)
return _ops.Square(x, name=name)
def sqrt(x, name=None):
return dragon.ops.Pow(x, power=0.5, name=name)
return _ops.Pow(x, power=0.5, name=name)
def pow(x, power, name=None):
return dragon.ops.Pow(x, power=power, name=name)
return _ops.Pow(x, power=power, name=name)
def reduce_sum(
......@@ -123,13 +106,19 @@ def reduce_sum(
axis=None,
keep_dims=False,
name=None,
reduction_indices=None
reduction_indices=None,
):
if reduction_indices is not None:
if axis is not None:
raise ValueError("cannot specify both 'axis' and 'reduction_indices'.")
raise ValueError(
"Cannot specify both 'axis' and 'reduction_indices'.")
axis = reduction_indices
return dragon.ops.Sum(input_tensor, axes=axis, keep_dims=keep_dims, nama=name)
return _ops.Sum(
input_tensor,
axes=axis,
keep_dims=keep_dims,
name=name,
)
def reduce_mean(
......@@ -137,22 +126,28 @@ def reduce_mean(
axis=None,
keep_dims=False,
name=None,
reduction_indices=None
reduction_indices=None,
):
if reduction_indices is not None:
if axis is not None:
raise ValueError("cannot specify both 'axis' and 'reduction_indices'.")
raise ValueError(
"cannot specify both 'axis' and 'reduction_indices'.")
axis = reduction_indices
return dragon.ops.Mean(input_tensor, axes=axis, keep_dims=keep_dims, nama=name)
return _ops.Mean(
input_tensor,
axes=axis,
keep_dims=keep_dims,
name=name,
)
def sigmoid(x, name=None):
return dragon.ops.Sigmoid(x, name=name)
return _ops.Sigmoid(x, name=name)
def tanh(x, name=None):
return dragon.ops.Tanh(x, name=name)
return _ops.Tanh(x, name=name)
def add_n(inputs, name=None):
return dragon.ops.Eltwise(inputs, operation='SUM', name=name)
return _ops.Eltwise(inputs, operation='SUM', name=name)
......@@ -13,21 +13,25 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon
from dragon import ops as _ops
def batch_normalization(x,
def batch_normalization(
x,
mean,
variance,
offset,
scale,
decay=0.9,
variance_epsilon=1e-3,
name=None):
raise NotImplementedError('Deprecated. Use tf.layer.batch_normalization.')
name=None,
):
raise NotImplementedError(
'Deprecated. Use tf.layers.batch_normalization.')
def batch_norm_with_global_normalization(t,
def batch_norm_with_global_normalization(
t,
m,
v,
beta,
......@@ -35,12 +39,22 @@ def batch_norm_with_global_normalization(t,
decay=0.9,
variance_epsilon=1e-3,
scale_after_normalization=True,
name=None):
raise NotImplementedError('Deprecated. Use tf.layer.batch_normalization.')
name=None,
):
raise NotImplementedError(
'Deprecated. Use tf.layers.batch_normalization.')
def l2_normalize(x,
def l2_normalize(
x,
dim,
epsilon=1e-12,
name=None):
return dragon.ops.L2Norm(x, axis=dim, num_axes=1, eps=epsilon, name=name)
name=None,
):
return _ops.L2Norm(
x,
axis=dim,
num_axes=1,
eps=epsilon,
name=name,
)
......@@ -13,31 +13,18 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon
from dragon.core.tensor import Tensor
__all__ = [
'convolution',
'relu',
'softmax',
'conv2d',
'conv2d_transpose',
'avg_pool',
'max_pool',
'xw_plus_b',
'bias_add',
'dropout',
'sigmoid_cross_entropy_with_logits',
'softmax_cross_entropy_with_logits',
'sparse_softmax_cross_entropy_with_logits',
'l2_loss',
]
from dragon import ops as _ops
def convolution(
input, filter, padding, strides=None,
dilation_rate=None, name=None, data_format=None):
input,
filter,
padding,
strides=None,
dilation_rate=None,
name=None,
data_format=None,
):
num_total_dims = filter.get_shape().ndims
if num_total_dims is None:
num_total_dims = input.get_shape().ndims
......@@ -71,24 +58,37 @@ def convolution(
if num_spatial_dims == 2:
return conv2d(
input, filter, strides, padding,
dilation_rate, data_format, name)
input,
filter,
strides,
padding,
dilation_rate,
data_format,
name,
)
else:
raise NotImplementedError(
'conv{}d is not implemented.'.format(num_spatial_dims))
def relu(features, name=None):
return dragon.ops.Relu(features, name=name)
return _ops.Relu(features, name=name)
def softmax(logits, dim=-1, name=None):
return dragon.ops.Softmax(logits, axis=dim, name=name)
return _ops.Softmax(logits, axis=dim, name=name)
def conv2d(
input, filter, strides, padding,
dilation_rate=None, data_format='NHWC', name=None, **kwargs):
input,
filter,
strides,
padding,
dilation_rate=None,
data_format='NHWC',
name=None,
**kwargs
):
"""Compute 2D convolution according to the given 4D ``input`` and ``filter``.
For the **NHWC** format, ``filter`` should have the shape ``[filter_height, filter_width, in_channels, out_channels]``.
......@@ -130,30 +130,41 @@ def conv2d(
raise ValueError('dilation_rate must be a list with length 4.')
if data_format == 'NHWC':
return dragon.ops.Conv2d([input, filter],
return _ops.Conv2d(
[input, filter],
num_output=filter.shape[3],
kernel_shape=filter.shape[0:2],
strides=strides[1:3],
dilations=dilation_rate[1:3] if dilation_rate is not None else 1,
padding=padding,
data_format=data_format,
name=name)
name=name,
)
elif data_format == 'NCHW':
return dragon.ops.Conv2d([input, filter],
return _ops.Conv2d(
[input, filter],
num_output=filter.shape[0],
kernel_shape=filter.shape[2:4],
strides=strides[2:4],
dilations=dilation_rate[2:4] if dilation_rate is not None else 1,
padding=padding,
data_format=data_format,
name=name)
name=name,
)
else:
raise ValueError('Unknown data format: {}'.format(data_format))
raise ValueError('Unknown data format: ' + data_format)
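A hedged sketch of the NHWC branch above: the filter is laid out as [filter_height, filter_width, in_channels, out_channels] and the op is delegated to dragon's Conv2d. The ``tf.nn`` path is an assumption.
import dragon.vm.tensorflow as tf  # assumed alias

x = tf.placeholder(tf.float32, shape=[None, 32, 32, 3])      # NHWC input
w = tf.get_variable('conv/w', shape=[3, 3, 3, 64])           # 3x3 kernel, 3 -> 64 channels
y = tf.nn.conv2d(x, w, strides=[1, 1, 1, 1], padding='SAME')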
def conv2d_transpose(
value, filter, output_shape, strides,
padding='SAME', data_format='NHWC', name=None, **kwargs):
value,
filter,
output_shape,
strides,
padding='SAME',
data_format='NHWC',
name=None,
**kwargs
):
"""Compute 2D deconvolution according to the given 4D ``input`` and ``filter``.
For the **NHWC** format, ``filter`` should have the shape ``[filter_height, filter_width, out_channels, in_channels]``.
......@@ -199,28 +210,39 @@ def conv2d_transpose(
raise ValueError('output_shape should be a list with length 4.')
if data_format == 'NHWC':
return dragon.ops.ConvTranspose2d([value, filter],
return _ops.ConvTranspose2d(
[value, filter],
num_output=filter.shape[2],
kernel_shape=filter.shape[0:2],
strides=strides[1:3],
padding=padding,
data_format=data_format,
output_shape=output_shape,
name=name)
name=name,
)
elif data_format == 'NCHW':
return dragon.ops.Conv2dTranspose([value, filter],
return _ops.Conv2dTranspose(
[value, filter],
num_output=filter.shape[1],
kernel_shape=filter.shape[2:4],
strides=strides[2:4],
padding=padding,
data_format=data_format,
output_shape=output_shape,
name=name)
name=name,
)
else:
raise ValueError('Unknown data format: {}'.format(data_format))
raise ValueError('Unknown data format: ' + data_format)
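For the transposed case the filter layout flips to [filter_height, filter_width, out_channels, in_channels] and an explicit output_shape is required; a hedged sketch, again assuming the ``tf.nn`` path.
import dragon.vm.tensorflow as tf  # assumed alias

x = tf.placeholder(tf.float32, shape=[8, 32, 32, 64])  # NHWC input
w = tf.get_variable('deconv/w', shape=[4, 4, 32, 64])  # 64 -> 32 channels
y = tf.nn.conv2d_transpose(
    x, w, output_shape=[8, 64, 64, 32],
    strides=[1, 2, 2, 1], padding='SAME')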
def avg_pool(value, ksize, strides, padding, data_format='NHWC', name=None):
def avg_pool(
value,
ksize,
strides,
padding,
data_format='NHWC',
name=None,
):
"""Perform avg pooling on spatial axes.
Parameters
......@@ -252,31 +274,40 @@ def avg_pool(value, ksize, strides, padding, data_format='NHWC', name=None):
if data_format == 'NHWC':
if ksize[0] != 1 or ksize[3] != 1 or strides[0] != 1 or strides[3] != 1:
raise ValueError('The pooling can only be performed on spatial axes.')
return dragon.ops.Pool2d(
return _ops.Pool2d(
value,
kernel_shape=[ksize[1], ksize[2]],
strides=[strides[1], strides[2]],
padding=padding,
data_format=data_format,
mode='AVG',
name=name)
name=name,
)
if data_format == 'NCHW':
if ksize[0] != 1 or ksize[1] != 1 or strides[0] != 1 or strides[1] != 1:
raise ValueError('The pooling can only be performed on spatial axes.')
return dragon.ops.Pool2d(
return _ops.Pool2d(
value,
kernel_shape=[ksize[2], ksize[3]],
strides=[strides[2], strides[3]],
padding=padding,
data_format=data_format,
mode='AVG',
name=name)
name=name,
)
else:
raise NotImplementedError(
'Pool{}d has not been implemented yet.'.format(len(ksize) - 2))
def max_pool(value, ksize, strides, padding, data_format='NHWC', name=None):
def max_pool(
value,
ksize,
strides,
padding,
data_format='NHWC',
name=None,
):
"""Perform max pooling on spatial axes.
Parameters
......@@ -308,25 +339,27 @@ def max_pool(value, ksize, strides, padding, data_format='NHWC', name=None):
if data_format == 'NHWC':
if ksize[0] != 1 or ksize[3] != 1 or strides[0] != 1 or strides[3] != 1:
raise ValueError('The pooling can only be performed on spatial axes.')
return dragon.ops.Pool2d(
return _ops.Pool2d(
value,
kernel_shape=[ksize[1], ksize[2]],
strides=[strides[1], strides[2]],
padding=padding,
data_format=data_format,
mode='MAX',
name=name)
name=name,
)
if data_format == 'NCHW':
if ksize[0] != 1 or ksize[1] != 1 or strides[0] != 1 or strides[1] != 1:
raise ValueError('The pooling can only be performed on spatial axes.')
return dragon.ops.Pool2d(
return _ops.Pool2d(
value,
kernel_shape=[ksize[2], ksize[3]],
strides=[strides[2], strides[3]],
padding=padding,
data_format=data_format,
mode='MAX',
name=name)
name=name,
)
else:
raise NotImplementedError(
'Pool{}d has not been implemented yet.'.format(len(ksize) - 2))
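A hedged sketch of the pooling wrappers: ksize and strides must keep the batch and channel axes at 1, otherwise the ValueError above is raised. The ``tf.nn`` path is an assumption.
import dragon.vm.tensorflow as tf  # assumed alias

x = tf.placeholder(tf.float32, shape=[None, 32, 32, 64])  # NHWC
y = tf.nn.max_pool(
    x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1],
    padding='SAME', data_format='NHWC')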
......@@ -347,30 +380,63 @@ def xw_plus_b(x, weights, biases, name=None):
if weights.shape[1] != biases.shape[0]:
raise ValueError('The shapes of weights and biases are incompatible.')
return dragon.ops.FullyConnected([x, weights, biases], num_output=weights.shape[1], transW=False, name=name)
return _ops.FullyConnected(
[x, weights, biases],
num_output=weights.shape[1],
transW=False,
name=name,
)
def bias_add(value, bias, data_format='NHWC', name=None):
return dragon.ops.BiasAdd([value, bias], data_format=data_format, name=name)
return _ops.BiasAdd(
[value, bias],
data_format=data_format,
name=name,
)
def sigmoid_cross_entropy_with_logits(logits, targets, name=None):
return dragon.ops.SigmoidCrossEntropy([logits, targets], normalization='UNIT', name=name)
def softmax_cross_entropy_with_logits(_sentinel=None, labels=None, logits=None, dim=-1, name=None):
return dragon.ops.SoftmaxCrossEntropy([logits, labels],
axis=dim, normalization='UNIT', name=name)
def sparse_softmax_cross_entropy_with_logits(_sentinel=None, labels=None, logits=None, dim=-1, name=None):
return dragon.ops.SparseSoftmaxCrossEntropy([logits, labels],
axis=dim, normalization='UNIT', name=name)
return _ops.SigmoidCrossEntropy(
[logits, targets],
normalization='UNIT',
name=name,
)
def softmax_cross_entropy_with_logits(
_sentinel=None,
labels=None,
logits=None,
dim=-1,
name=None,
):
return _ops.SoftmaxCrossEntropy(
[logits, labels],
axis=dim,
normalization='UNIT',
name=name,
)
def sparse_softmax_cross_entropy_with_logits(
_sentinel=None,
labels=None,
logits=None,
dim=-1,
name=None,
):
return _ops.SparseSoftmaxCrossEntropy(
[logits, labels],
axis=dim,
normalization='UNIT',
name=name,
)
def l2_loss(t, name=None):
return dragon.ops.L2Loss(t, normalization='NONE', name=name)
return _ops.L2Loss(t, normalization='NONE', name=name)
def dropout(x, keep_prob, name=None):
return dragon.ops.Dropout(x, 1 - keep_prob, name=name)
return _ops.Dropout(x, 1. - keep_prob, name=name)
......@@ -13,34 +13,38 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon
from dragon import ops as _ops
from dragon.vm.tensorflow.framework import dtypes
__all__ = [
'random_normal',
'truncated_normal',
'random_uniform'
]
def random_normal(
shape, mean=0.0, stddev=1.0,
shape,
mean=0.0,
stddev=1.0,
dtype=dtypes.float32,
seed=None, name=None):
return dragon.ops.RandomNormal(shape, mean, stddev, name=name)
seed=None,
name=None,
):
return _ops.RandomNormal(shape, mean, stddev, name=name)
def truncated_normal(
shape, mean=0.0, stddev=1.0,
shape,
mean=0.0,
stddev=1.0,
dtype=dtypes.float32,
seed=None, name=None):
return dragon.ops.TruncatedNormal(shape, mean, stddev, name=name)
seed=None,
name=None,
):
return _ops.TruncatedNormal(shape, mean, stddev, name=name)
def random_uniform(
shape, minval=0, maxval=None,
shape,
minval=0,
maxval=None,
dtype=dtypes.float32,
seed=None, name=None):
return dragon.ops.RandomUniform(shape, minval, maxval, name=name)
\ No newline at end of file
seed=None,
name=None,
):
return _ops.RandomUniform(shape, minval, maxval, name=name)
\ No newline at end of file
......@@ -13,12 +13,13 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon
import threading
from dragon.core import tls as _tls
from dragon.core import scope as _scope
from dragon.vm.tensorflow.framework import dtypes, ops
from dragon.vm.tensorflow.ops.variables import Variable
from dragon.vm.tensorflow.framework.ops import _DefaultStack
from dragon.vm.tensorflow.ops import init_ops
......@@ -87,7 +88,8 @@ class VariableScope(object):
"""
return self._vars
def get_variable(self,
def get_variable(
self,
name,
shape=None,
dtype=None,
......@@ -95,7 +97,8 @@ class VariableScope(object):
regularizer=None,
trainable=True,
collections=None,
validate_shape=True):
validate_shape=True,
):
excepted_name = self.name + name
if not excepted_name in self._vars:
# Create a new variable
......@@ -112,7 +115,8 @@ class VariableScope(object):
collections=collections,
validate_shape=validate_shape,
name_from_variable_scope=excepted_name,
dtype=dtype)
dtype=dtype,
)
self._vars[excepted_name] = variable
return variable
else:
......@@ -132,7 +136,12 @@ class VariableScope(object):
get_variable_scope_store().close()
self._name_scope_ctx.__exit__(type, value, traceback)
def _get_default_initializer(self, name, shape=None, dtype=dtypes.float32):
def _get_default_initializer(
self,
name,
shape=None,
dtype=dtypes.float32,
):
# Defaults: float32
if dtype is None:
dtype = dtypes.float32
......@@ -162,15 +171,16 @@ def variable_scope(name_or_scope, reuse=None, **kwargs):
prefix = name_or_scope + '/' if name_or_scope != '' else ''
vs_store = get_variable_scope_store()
vs_name = vs_store.current_scope.name + prefix
original_name_scope = dragon.get_default_name_scope() + prefix
original_name_scope = _scope.get_default_name_scope() + prefix
vs = VariableScope(reuse, name=vs_name, name_scope=original_name_scope)
# Store the ctx manager instead of returning
# As we should return a VariableScope
vs._name_scope_ctx = dragon.name_scope(name_or_scope)
vs._name_scope_ctx = _scope.name_scope(name_or_scope)
return vs
def get_variable(name,
def get_variable(
name,
shape=None,
dtype=None,
initializer=None,
......@@ -178,16 +188,18 @@ def get_variable(name,
trainable=True,
collections=None,
validate_shape=True,
**kwargs):
**kwargs
):
return get_variable_scope().get_variable(
name, shape=shape, dtype=dtype,
initializer=initializer, regularizer=regularizer,
trainable=trainable, collections=collections,
validate_shape=validate_shape)
_GLOBAL_VARIABLE_SCOPE_STORE_KEY = ("__varscope",)
_GLOBAL_VARIABLE_SCOPE_STACK = _DefaultStack()
name,
shape=shape,
dtype=dtype,
initializer=initializer,
regularizer=regularizer,
trainable=trainable,
collections=collections,
validate_shape=validate_shape,
)
class _VariableScopeStore(threading.local):
......@@ -220,3 +232,7 @@ def get_variable_scope_store():
def get_variable_scope():
"""Returns the current variable scope."""
return get_variable_scope_store().current_scope
_GLOBAL_VARIABLE_SCOPE_STORE_KEY = ("__varscope",)
_GLOBAL_VARIABLE_SCOPE_STACK = _tls.Stack()
\ No newline at end of file
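A hedged sketch of the scope machinery: variables created through get_variable are prefixed by the enclosing scope and cached in the thread-local scope store. The ``tf`` alias is an assumption.
import dragon.vm.tensorflow as tf  # assumed alias

with tf.variable_scope('block1'):
    w = tf.get_variable(
        'w', shape=[64, 64],
        initializer=tf.glorot_uniform_initializer())
print(w.name)  # something like 'block1/w:0'; the exact suffix depends on the dummy-name scheme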
......@@ -14,16 +14,21 @@ from __future__ import division
from __future__ import print_function
import copy
import dragon
from dragon.core import scope as _scope
from dragon.core import workspace as _workspace
from dragon.core.tensor import Tensor as _Tensor
from dragon.vm.theano.compile import function as _Function
from dragon.vm.tensorflow.framework import ops, constant_op
from dragon.vm.tensorflow.util.deprecation import deprecated
class Variable(dragon.Tensor):
class Variable(_Tensor):
"""Construct a Variable."""
def __init__(self,
def __init__(
self,
initial_value=None,
trainable=True,
collections=None,
......@@ -31,7 +36,8 @@ class Variable(dragon.Tensor):
name=None,
dtype=None,
regularizer=None,
**kwargs):
**kwargs
):
super(Variable, self).__init__()
if initial_value is None:
......@@ -50,35 +56,39 @@ class Variable(dragon.Tensor):
if name is not None:
# Get a known name from the name scope
defined_name = dragon.get_default_name_scope() + name
defined_name = _scope.get_default_name_scope() + name
else:
if 'name_from_variable_scope' in kwargs:
# Has a name from the variable scope
defined_name = kwargs['name_from_variable_scope']
else:
# Get an auto name from the name scope
defined_name = dragon.get_default_name_scope() + 'Variable'
defined_name = _scope.get_default_name_scope() + 'Variable'
# Set the name explicitly
self.set_name(dragon.workspace.GetDummyName(
self.set_name(_workspace.GetDummyName(
defined_name, suffix=':0', domain='Tensor'))
# Initializer
if isinstance(initial_value, dragon.Tensor) and \
if isinstance(initial_value, _Tensor) and \
len(initial_value.expressions) == 1:
# From an initializing op
self.shape, self.dtype = initial_value.shape[:], initial_value.dtype
self.shape, self.dtype = \
initial_value.shape[:], \
initial_value.dtype
init_expr = copy.deepcopy(initial_value.expressions)
for k, v in init_expr.items():
init_expr[k].output[0] = self.name
self.__init_expr__ = init_expr
else:
# From a const tensor
if not isinstance(initial_value, dragon.Tensor):
if not isinstance(initial_value, _Tensor):
initial_value = constant_op.constant(
initial_value, name=name, dtype=dtype)
self.set_value(initial_value.get_value())
self.shape, self.dtype = initial_value.shape, initial_value.dtype
self.shape, self.dtype = \
initial_value.shape, \
initial_value.dtype
# Regularizer
self.__regularizer__ = regularizer
......@@ -121,7 +131,8 @@ class VariablesInitializer(object):
def run(self):
if not hasattr(self, '_init_func'):
self._init_func = dragon.function(outputs=self.var_list) \
self._init_func = _Function(
outputs=self.var_list) \
if len(self.var_list) > 0 else None
if self._init_func: self._init_func()
......
......@@ -14,9 +14,10 @@ from __future__ import division
from __future__ import print_function
import math
import dragon
import numpy as np
import numpy
from dragon.ops import Run as _RunOp
from dragon.core import workspace as _workspace
from dragon.vm.tensorflow.framework import ops
......@@ -25,11 +26,11 @@ class _DecayBase(object):
self.param_str = ''
def set(self, tensor, value, dtype=None):
dragon.workspace.FeedTensor(tensor,
_workspace.FeedTensor(tensor,
value, dtype=dtype, force_cpu=True)
def get(self, tensor):
return dragon.workspace.FetchTensor(tensor)
return _workspace.FetchTensor(tensor)
class _PiecewiseConstant(_DecayBase):
......@@ -120,8 +121,9 @@ class _CosineDecayRestarts(_DecayBase):
def run(self, inputs, outputs):
gs = self.get(inputs[0])
global_step = min(gs - self.last_steps, self.decay_steps)
cosine_decay = 0.5 * (1 + math.cos(math.pi * global_step / self.decay_steps))
global_step = gs - self.last_steps
cosine_decay = 0.5 * (1. + math.cos(
math.pi * global_step / self.decay_steps))
decayed = (1. - self.alpha) * cosine_decay + self.alpha
new_lr = self.learning_rate * decayed
# Restarts
......@@ -132,84 +134,110 @@ class _CosineDecayRestarts(_DecayBase):
self.set(outputs[0], new_lr, dtype='float32')
def piecewise_constant(x, boundaries, values, name=None):
def piecewise_constant(
x,
boundaries,
values,
name=None,
):
if len(values) != len(boundaries) + 1:
raise ValueError('Expected {} values, got {}.'.format(
len(boundaries) + 1, len(values)))
lr = dragon.ops.Run([ops.convert_to_tensor(x)],
lr = _RunOp(
inputs=[ops.convert_to_tensor(x)],
module=__name__,
op='_PiecewiseConstant',
param_str=str({
'boundaries': boundaries,
'values': values}),
name=name)
lr.set_value(np.array(values[0], dtype=np.float32))
'values': values,
}),
name=name,
)
lr.set_value(numpy.array(values[0], dtype='float32'))
return lr
def exponential_decay(learning_rate,
def exponential_decay(
learning_rate,
global_step,
decay_steps,
decay_rate,
staircase=False,
name=None):
lr = dragon.ops.Run([ops.convert_to_tensor(global_step)],
name=None,
):
lr = _RunOp(
inputs=[ops.convert_to_tensor(global_step)],
module=__name__,
op='_ExponentialDecay',
param_str=str({
'learning_rate': learning_rate,
'decay_steps': decay_steps,
'decay_rate': decay_rate,
'staircase': staircase}),
name=name)
lr.set_value(np.array(learning_rate, dtype=np.float32))
'staircase': staircase,
}),
name=name,
)
lr.set_value(numpy.array(learning_rate, dtype='float32'))
return lr
def natural_exp_decay(learning_rate,
def natural_exp_decay(
learning_rate,
global_step,
decay_steps,
decay_rate,
staircase=False,
name=None):
lr = dragon.ops.Run([ops.convert_to_tensor(global_step)],
name=None,
):
lr = _RunOp(
inputs=[ops.convert_to_tensor(global_step)],
module=__name__,
op='_NaturalExpDecay',
param_str=str({
'learning_rate': learning_rate,
'decay_steps': decay_steps,
'decay_rate': decay_rate,
'staircase': staircase}),
name=name)
lr.set_value(np.array(learning_rate, dtype=np.float32))
'staircase': staircase,
}),
name=name,
)
lr.set_value(numpy.array(learning_rate, dtype='float32'))
return lr
def cosine_decay(learning_rate,
def cosine_decay(
learning_rate,
global_step,
decay_steps,
alpha=0.0,
name=None):
lr = dragon.ops.Run([ops.convert_to_tensor(global_step)],
name=None,
):
lr = _RunOp(
inputs=[ops.convert_to_tensor(global_step)],
module=__name__,
op='_CosineDecay',
param_str=str({
'learning_rate': learning_rate,
'decay_steps': decay_steps,
'alpha': alpha}),
name=name)
lr.set_value(np.array(learning_rate, dtype=np.float32))
'alpha': alpha,
}),
name=name,
)
lr.set_value(numpy.array(learning_rate, dtype='float32'))
return lr
def cosine_decay_restarts(learning_rate,
def cosine_decay_restarts(
learning_rate,
global_step,
first_decay_steps,
t_mul=2.0,
m_mul=1.0,
alpha=0.0,
name=None):
lr = dragon.ops.Run([ops.convert_to_tensor(global_step)],
name=None,
):
lr = _RunOp(
inputs=[ops.convert_to_tensor(global_step)],
module=__name__,
op='_CosineDecayRestarts',
param_str=str({
......@@ -217,9 +245,11 @@ def cosine_decay_restarts(learning_rate,
'first_decay_steps': first_decay_steps,
't_mul': t_mul,
'm_mul': m_mul,
'alpha': alpha}),
name=name)
lr.set_value(np.array(learning_rate, dtype=np.float32))
'alpha': alpha
}),
name=name,
)
lr.set_value(numpy.array(learning_rate, dtype='float32'))
return lr
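A hedged usage sketch: each schedule is realized as a dragon Run op whose Python module reads the global-step tensor and feeds the new learning rate back. The ``tf``/``tf.train`` paths are assumptions.
import dragon.vm.tensorflow as tf  # assumed alias

global_step = tf.Variable(0, trainable=False, name='global_step')
lr = tf.train.exponential_decay(
    0.1, global_step,
    decay_steps=1000, decay_rate=0.96, staircase=True)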
......
......@@ -13,10 +13,13 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon
from dragon import updaters as _updaters
from dragon.core import workspace as _workspace
from dragon.core.tensor import Tensor as _Tensor
from dragon.vm.tensorflow.framework import ops
from dragon.vm.tensorflow.ops import variables
from dragon.vm.tensorflow.ops.gradients_impl import gradients
class Optimizer(object):
......@@ -34,16 +37,16 @@ class Optimizer(object):
self.updater = self.train = self.update = None
def _set_dynamic_lr(self, learning_rate):
if isinstance(learning_rate, dragon.Tensor):
if isinstance(learning_rate, _Tensor):
self._targets.append(learning_rate)
internal_lr = self.updater._slot + '/base_lr'
dragon.workspace.SetTensorAlias(learning_rate.name, internal_lr)
_workspace.SetTensorAlias(learning_rate, internal_lr)
self.updater.base_lr = float(learning_rate.get_value())
def _inc_global_step(self):
if self._global_step is not None:
gs = self._global_step.get_value()
self._global_step.set_value((gs + 1).astype(gs.dtype))
v = self._global_step.get_value() + 1
_workspace.FeedTensor(self._global_step, v, True)
def get_name(self):
return self._name
......@@ -57,7 +60,7 @@ class Optimizer(object):
if var_list is None:
var_list = variables.trainable_variables() + \
ops.get_collection(ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)
grads = dragon.grad(loss, var_list)
grads = gradients(loss, var_list)
grads_and_vars = list(zip(grads, var_list))
return grads_and_vars
......@@ -95,34 +98,66 @@ class Optimizer(object):
class GradientDescentOptimizer(Optimizer):
def __init__(self, learning_rate, use_locking=False, name='GradientDescent'):
def __init__(
self,
learning_rate,
use_locking=False,
name='GradientDescent',
):
super(GradientDescentOptimizer, self).__init__(use_locking, name)
self.updater = dragon.updaters.SGDUpdater(learning_rate, 0.0)
self.updater = _updaters.SGDUpdater(learning_rate, 0.)
self._set_dynamic_lr(learning_rate)
class MomentumOptimizer(Optimizer):
def __init__(self, learning_rate, momentum,
use_locking=False, name='Momentum', use_nesterov=False):
def __init__(
self,
learning_rate,
momentum,
use_locking=False,
name='Momentum',
use_nesterov=False,
):
super(MomentumOptimizer, self).__init__(use_locking, name)
if not use_nesterov:
self.updater = dragon.updaters.SGDUpdater(learning_rate, momentum)
self.updater = _updaters.SGDUpdater(learning_rate, momentum)
else:
self.updater = dragon.updaters.NesterovUpdater(learning_rate, momentum)
self.updater = _updaters.NesterovUpdater(learning_rate, momentum)
self._set_dynamic_lr(learning_rate)
class AdamOptimizer(Optimizer):
def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
use_locking=False, name='Adam'):
def __init__(
self,
learning_rate=0.001,
beta1=0.9,
beta2=0.999,
epsilon=1e-8,
use_locking=False,
name='Adam',
):
super(AdamOptimizer, self).__init__(use_locking, name)
self.updater = dragon.updaters.AdamUpdater(learning_rate, beta1, beta2, epsilon)
self.updater = _updaters.AdamUpdater(
learning_rate, beta1, beta2, epsilon)
self._set_dynamic_lr(learning_rate)
class RMSPropOptimizer(Optimizer):
def __init__(self, learning_rate, decay=0.9, momentum=0.0, epsilon=1e-10,
use_locking=False, centered=False, name='RMSProp'):
def __init__(
self,
learning_rate,
decay=0.9,
momentum=0.0,
epsilon=1e-10,
use_locking=False,
centered=False,
name='RMSProp',
):
super(RMSPropOptimizer, self).__init__(use_locking, name)
self.updater = dragon.updaters.RMSPropUpdater(learning_rate, decay, epsilon)
if momentum > 0.:
self.updater = _updaters.AdamUpdater(
learning_rate, momentum, decay, epsilon)
else:
self.updater = _updaters.RMSPropUpdater(
learning_rate, decay, epsilon)
self._set_dynamic_lr(learning_rate)
\ No newline at end of file
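A hedged end-to-end sketch of the optimizers above; the ``tf.train`` path and the elided apply_gradients()/minimize() API are assumptions.
import dragon.vm.tensorflow as tf  # assumed alias

x = tf.placeholder(tf.float32, shape=[None, 10])
labels = tf.placeholder(tf.int64, shape=[None])
logits = tf.layers.dense(x, 2)
loss = tf.losses.sparse_softmax_cross_entropy(labels, logits)
opt = tf.train.MomentumOptimizer(learning_rate=0.01, momentum=0.9)
grads_and_vars = opt.compute_gradients(loss)    # dragon gradients of loss w.r.t. the trainables
train_op = opt.apply_gradients(grads_and_vars)  # apply_gradients is assumed from the elided hunk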
......@@ -13,14 +13,14 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.tensorflow.training.optimizer import (
from .optimizer import (
GradientDescentOptimizer,
MomentumOptimizer,
RMSPropOptimizer,
AdamOptimizer,
)
from dragon.vm.tensorflow.training.learning_rate_decay import (
from .learning_rate_decay import (
piecewise_constant,
piecewise_constant_decay,
exponential_decay,
......
......@@ -9,9 +9,12 @@
#
# ------------------------------------------------------------
from .compile import (
function,
scan,
shared)
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from .configdefaults import config
\ No newline at end of file
from dragon.vm.theano import tensor
from dragon.vm.theano.compile import scan
from dragon.vm.theano.compile import shared
from dragon.vm.theano.compile import function
from dragon.vm.theano.configdefaults import config
......@@ -9,24 +9,27 @@
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import copy
import numpy as np
import dragon.core.mpi as mpi
import dragon.core.workspace as ws
import dragon.core.logging as logging
import dragon.proto.dragon_pb2 as pb
from dragon.core.proto_utils import MakeArgument
from dragon.core.helper import OperatorHelper
from dragon.core.gradient_maker import GraphGradientMaker
from dragon.core.scope import get_default_phase
from dragon.core.tensor import Tensor
from dragon import config as _cfg
from dragon.core.tensor import Tensor as _Tensor
from dragon.core import mpi as _mpi
from dragon.core import scope as _scope
from dragon.core import helper as _helper
from dragon.core import logging as _logging
from dragon.core import workspace as _workspace
from dragon.proto import dragon_pb2 as _proto_def
from dragon.core import proto_utils as _proto_utils
from dragon.core import gradient_maker as _gradient_maker
def GraphDef_Grad(graph_def, targets):
"""Inject the gradient targets into GraphDef.
def _inject_gradients(graph_def, targets):
"""Inject the gradients into GraphDef.
Parameters
----------
......@@ -44,18 +47,18 @@ def GraphDef_Grad(graph_def, targets):
`T.grad(*args, **kwargs)`_ - How to generate gradient targets.
"""
all_pairs = set()
gradients = set()
for target in targets:
all_pairs.update(target.gradient.make_pairs())
gradients.update(target.gradient.make_pairs())
for pair in all_pairs:
gradient = pb.GradientProto()
gradient.cost, gradient.wrt = str(pair[0]), str(pair[1])
for (cost, wrt) in gradients:
gradient = _proto_def.GradientProto()
gradient.cost, gradient.wrt = str(cost), str(wrt)
graph_def.gradient.extend([gradient])
def GraphDef_Phase(graph_def, targets):
"""Inject the phase into GraphDef.
def _inject_phase(graph_def, targets):
"""Inject the phase info into GraphDef.
If any gradients exist, we assume the phase should be ``TRAIN``, and vice versa.
......@@ -71,18 +74,20 @@ def GraphDef_Phase(graph_def, targets):
None
"""
phase = get_default_phase()
phase = _scope.get_default_phase()
if phase is None:
phase = 'TEST'
for target in targets:
if target.gradient.required():
phase = 'TRAIN'
break
graph_def.arg.extend([MakeArgument('phase', phase)])
graph_def.arg.extend([
_proto_utils.MakeArgument(
'phase', phase)])
def GraphDef_Update(graph_def, updater):
"""Inject the update targets into GraphDef.
def _inject_update_ops(graph_def, updater):
"""Inject the update ops GraphDef.
The ``updater`` should generate update targets before.
......@@ -99,43 +104,61 @@ def GraphDef_Update(graph_def, updater):
"""
if updater is None: return
updater.register_in_workspace()
grads, update_ops = [], []
extra_arguments = updater._extra_kwargs
extra_arguments['slot'] = updater._slot
parallel_arguments = {}
updater.register_in_workspace()
# Check data parallel if necessary
if mpi.Is_Init():
idx, group = mpi.AllowParallel()
if idx != -1:
parallel_arguments['parallel_mode'] = mpi.GetParallelMode()
parallel_arguments['comm'], parallel_arguments['group'] \
= mpi.CreateGroup(root=group[0], incl=group)
parallel_arguments['root'] = group[0]
for k, v in parallel_arguments.items():
graph_def.arg.add().CopyFrom(MakeArgument(k, v))
# Build update ops according to the updater
for e in updater._param_group:
pair, arguments = e
kwargs = dict(arguments, **extra_arguments)
u_target = pb.UpdaterProto()
u_target.type = updater.type()
u_target.name = OperatorHelper.get_name()
u_target.tensor.extend(pair)
for k, v in kwargs.items():
u_target.arg.add().CopyFrom(MakeArgument(k, v))
graph_def.updater.extend([u_target])
(param, grad), arguments = e
if _workspace.HasTensor(grad):
grads.append(grad)
arguments = dict(arguments, **extra_arguments)
update_ops.append(
_proto_utils.
MakeOperatorDef(
op_type=updater.type(),
inputs=[grad],
outputs=[param],
name=_helper.OperatorHelper.get_name(),
**arguments
)
)
else:
_logging.info('Skip updating Tensor({}).'.format(param))
def GraphDef_Opt(graph_def):
"""Inject the optimization options into GraphDef.
# Check data parallel if necessary
if _mpi.Is_Init():
(rank, group), arguments = _mpi.AllowParallel(), {}
if rank != -1:
arguments['parallel_mode'] = _mpi.GetParallelMode()
arguments['root'], (arguments['comm'], arguments['group']) \
= group[0], _mpi.CreateGroup(root=group[0], incl=group)
update_ops.insert(
0, _proto_utils.
MakeOperatorDef(
op_type='CollectiveUpdate',
inputs=grads,
outputs=grads,
name=_helper.OperatorHelper.get_name(),
**arguments
)
)
graph_def.op.extend(update_ops)
def _inject_optimization(graph_def, opt_level=None):
"""Inject the optimization info into GraphDef.
Parameters
----------
graph_def : GraphDef
The definition of graph.
opt_level : int, optional
The optimization level.
Returns
-------
......@@ -148,15 +171,19 @@ def GraphDef_Opt(graph_def):
`memonger.share_grads(*args, **kwargs)`_ - How to enable gradient sharing.
"""
from dragon.config import option
OX = option['graph_optimization_level']
if not option['share_grads'] and OX >= 3: OX = 2
graph_def.arg.add().CopyFrom(MakeArgument('optimization_level', OX))
graph_def.graph_type = option['graph_type']
options = _cfg.GetGlobalOptions()
if opt_level is None:
opt_level = options['graph_optimization_level']
if not options['share_grads'] and \
opt_level >= 3: opt_level = 2
graph_def.arg.add().CopyFrom(
_proto_utils.MakeArgument(
'optimization_level', opt_level))
graph_def.graph_type = options['graph_type']
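# A standalone sketch (not part of the commit) of the capping rule applied
# above; the assumed semantics are that level 3 relies on gradient sharing,
# so it is lowered to 2 whenever the 'share_grads' option is disabled.
def _resolve_opt_level(opt_level, share_grads):
    if not share_grads and opt_level >= 3:
        return 2
    return opt_level

assert _resolve_opt_level(3, share_grads=False) == 2
assert _resolve_opt_level(2, share_grads=False) == 2
assert _resolve_opt_level(3, share_grads=True) == 3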
def GraphDef_Device(graph_def):
"""Inject the device option into GraphDef.
def _inject_device(graph_def):
"""Inject the device info into GraphDef.
Parameters
----------
......@@ -176,13 +203,13 @@ def GraphDef_Device(graph_def):
`config.SetRandomSeed(*args, **kwargs)`_ - How to set random seed.
"""
from dragon.config import option
if option['device'] is not 'None':
options = _cfg.GetGlobalOptions()
if options['device'] != 'none':
supports = {'cpu': 0, 'cuda': 1, 'cnml': 2}
device_option = pb.DeviceOption()
device_option.device_type = supports[option['device']]
device_option.device_id = option['device_id']
device_option.random_seed = option['random_seed']
device_option = _proto_def.DeviceOption()
device_option.device_type = supports[options['device']]
device_option.device_id = options['device_id']
device_option.random_seed = options['random_seed']
graph_def.device_option.CopyFrom(device_option)
......@@ -194,7 +221,7 @@ class Function(object):
"""
def __init__(self, name=None):
self.callback = None
self.meta_graph = pb.GraphDef()
self.meta_graph = _proto_def.GraphDef()
self.meta_graph.name = name if name else 'Graph'
self.graph_name = None # Determined after creating
......@@ -237,7 +264,7 @@ class Function(object):
external_input_expressions = {}
# Extract new ops
for old_tensor, new_tensor in givens.items():
if isinstance(new_tensor, Tensor):
if isinstance(new_tensor, _Tensor):
name_dict[old_tensor.name] = new_tensor.name
external_input_expressions.update(new_tensor.expressions)
else:
......@@ -259,7 +286,8 @@ class Function(object):
targets = [output.name for output in outputs]
targets.extend(all_extra_targets)
forward_ops, grad_ops, _ = \
GraphGradientMaker.Make(forward_ops, targets)
_gradient_maker.GraphGradientMaker \
.Make(forward_ops, targets)
else:
grad_ops = []
......@@ -276,26 +304,29 @@ class Function(object):
self.inputs, self.outputs = inputs, outputs
# Write Misc
# Inject arguments based on global options
if len(outputs) > 0:
GraphDef_Device(meta_graph)
GraphDef_Opt(meta_graph)
GraphDef_Grad(meta_graph, outputs)
GraphDef_Phase(meta_graph, outputs)
_inject_device(meta_graph)
_inject_optimization(meta_graph)
_inject_gradients(meta_graph, outputs)
_inject_phase(meta_graph, outputs)
elif updater is not None:
GraphDef_Device(meta_graph)
GraphDef_Opt(meta_graph)
GraphDef_Update(meta_graph, updater)
_inject_device(meta_graph)
_inject_optimization(meta_graph, opt_level=0)
_inject_update_ops(meta_graph, updater)
# Call c api to create graph
self.graph_name = ws.CreateGraph(meta_graph)
self.graph_name = _workspace.CreateGraph(meta_graph)
# Bind a lambda callback to run this graph
self.callback = lambda *args, **kwargs: \
ws.RunGraph(self.graph_name, (inputs, args), outputs, **kwargs)
_workspace.RunGraph(
graph_name=self.graph_name,
inputs=(inputs, args),
outputs=outputs, **kwargs)
# Self return
# Return self
return self
def export_to(self, name=None, export_dir='./'):
......@@ -320,7 +351,7 @@ class Function(object):
meta_graph_copy.name = self.meta_graph.name if name is None else name
file = os.path.join(export_dir, meta_graph_copy.name + '.metatxt')
with open(file, 'w') as f: f.write(str(meta_graph_copy))
logging.info('Export meta graph into: {}'.format(file))
_logging.info('Export meta graph into: {}'.format(file))
def import_from(self, graph_def, explicit_inputs=False):
"""Import the defined function from a graph def.
......@@ -342,25 +373,28 @@ class Function(object):
The self.
"""
self.inputs = [Tensor(name=input).Variable() for input in graph_def.input]
self.outputs = [Tensor(name=output) for output in graph_def.output]
self.inputs = [_Tensor(input).Variable() for input in graph_def.input]
self.outputs = [_Tensor(output) for output in graph_def.output]
GraphDef_Device(graph_def)
GraphDef_Opt(graph_def)
GraphDef_Phase(graph_def, self.outputs)
_inject_device(graph_def)
_inject_optimization(graph_def)
_inject_phase(graph_def, self.outputs)
# Store for future development
self.meta_graph = graph_def
# Call c api to create graph
self.graph_name = ws.CreateGraph(graph_def)
self.graph_name = _workspace.CreateGraph(graph_def)
# Bind a lambda callback to run this graph
callback_inputs = self.inputs if explicit_inputs else []
self.callback = lambda *args, **kwargs: \
ws.RunGraph(self.graph_name, (callback_inputs, args), self.outputs, **kwargs)
_workspace.RunGraph(
self.graph_name,
(callback_inputs, args),
self.outputs, **kwargs)
# Self return
# Return self
return self
def __call__(self, *args, **kwargs):
......@@ -396,16 +430,17 @@ def function(inputs=None, outputs=None, givens=None, updater=None):
Examples
--------
>>> x = Tensor('x', dtype='float32').Variable()
>>> import numpy, dragon
>>> x = dragon.Tensor('x', dtype='float32').Variable()
>>> y = x * 2
>>> f = function(outputs=y)
>>> x.set_value(np.ones((2, 3)))
>>> x.set_value(numpy.ones((2, 3)))
>>> print(f())
>>> [[ 2. 2. 2.]
[ 2. 2. 2.]]
>>> f = function(inputs=x, outputs=y)
>>> print(f(np.ones((2, 3)))
>>> print(f(numpy.ones((2, 3))))
>>> [[ 2. 2. 2.]
[ 2. 2. 2.]]
......
......@@ -9,8 +9,11 @@
#
# ------------------------------------------------------------
import numpy as np
import dragon as dg
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.core.tensor import Tensor as _Tensor
def shared(value, name=None, **kwargs):
......@@ -18,10 +21,10 @@ def shared(value, name=None, **kwargs):
Parameters
----------
value : number, list or numpy.ndarray
value : number, sequence or numpy.ndarray
The numerical values.
name : str
The name of tensor.
name : str, optional
The optional name.
Returns
-------
......@@ -29,8 +32,4 @@ def shared(value, name=None, **kwargs):
The initialized tensor.
"""
if not isinstance(value, (int, float, list, np.ndarray)):
raise TypeError("Unsupported type of value: {}".format(type(value)))
tensor = dg.Tensor(name).Variable()
dg.workspace.FeedTensor(tensor, value)
return tensor
\ No newline at end of file
return _Tensor(name).set_value(value)
\ No newline at end of file
......@@ -9,6 +9,11 @@
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
class TheanoConfig(object):
floatX = 'float32'
......
......@@ -9,7 +9,12 @@
#
# ------------------------------------------------------------
import dragon as dg
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.core.tensor import Tensor as _Tensor
from dragon.ops import StopGradient as _StopGradientOp
def grad(cost, wrt, **kwargs):
......@@ -44,7 +49,7 @@ def grad(cost, wrt, **kwargs):
for w in wrt:
cost.gradient.add_wrt(w.name)
w.gradient.add_cost(cost)
grads.append(dg.Tensor.Ref(
grads.append(_Tensor.Ref(
name=w.name + '_grad',
shape=w.shape, dtype=w.dtype))
if len(grads) == 1: return grads[0]
......@@ -67,4 +72,4 @@ def disconnected_grad(x):
The identity of input.
"""
return dg.ops.StopGradient(x)
return _StopGradientOp(x)
......@@ -9,9 +9,11 @@
#
# ------------------------------------------------------------
from .basic import *
from .extra_ops import *
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from . import nnet
from ..gradient import grad, disconnected_grad
\ No newline at end of file
from dragon.vm.theano.tensor.basic import *
from dragon.vm.theano.tensor.extra_ops import *
from dragon.vm.theano.tensor import nnet
from dragon.vm.theano.gradient import grad, disconnected_grad
\ No newline at end of file
......@@ -9,21 +9,13 @@
#
# ------------------------------------------------------------
import numpy as np
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.core.tensor import Tensor
import dragon.ops as ops
from ..configdefaults import config
_DATA_TYPES = {
'int32': np.int32,
'int64': np.int64,
'uint8': np.uint8,
'float16': np.float16,
'float32': np.float32,
'float64': np.float64,
}
from dragon import ops as _ops
from dragon.core.tensor import Tensor as _Tensor
from dragon.vm.theano.configdefaults import config as _cfg
def scalar(name=None, dtype=None):
......@@ -44,8 +36,8 @@ def scalar(name=None, dtype=None):
The scalar variable.
"""
if dtype is None: dtype = config.floatX
return Tensor(name=name, dtype=dtype)
if dtype is None: dtype = _cfg.floatX
return _Tensor(name=name, dtype=dtype)
def iscalar(name=None):
......@@ -65,43 +57,10 @@ def iscalar(name=None):
return scalar(name, 'int32')
def constant(x, name=None, shape=None, dtype=None):
"""Initialize a tensor with constant value.
If dtype is ``None``, use ``config.floatX``.
Parameters
----------
x : basic numerical type
The constant value.
name : str, optional
The name of Tensor.
shape : sequence of int, optional
The shape of Tensor.
dtype : str, optional
The data type of Tensor.
Returns
-------
Tensor
The initialized tensor.
"""
if dtype is None: dtype = config.floatX
else:
if dtype not in _DATA_TYPES.keys():
raise TypeError("Unsupported data type: {}".format(dtype))
if shape is None: shape = ()
np_value = x * np.ones(shape, dtype=_DATA_TYPES[dtype])
output = Tensor(name=name, shape=shape, dtype=dtype)
output.set_value(np_value)
return output
def zeros(shape, dtype=None):
"""Initialize a tensor with zeros.
If dtype is ``None``, use ``config.floatX``.
If dtype is *None*, use *config.floatX*.
Parameters
----------
......@@ -116,14 +75,8 @@ def zeros(shape, dtype=None):
The initialized tensor.
"""
if dtype is None: dtype = config.floatX
else:
if dtype not in _DATA_TYPES.keys():
raise TypeError("Unsupported data type: {}".format(dtype))
np_value = np.zeros(shape, dtype=_DATA_TYPES[dtype])
output = Tensor(shape=shape, dtype=dtype)
output.set_value(np_value)
return output
if dtype is None: dtype = _cfg.floatX
return _ops.Fill(shape=shape, value=0, dtype=dtype)
def zeros_like(model, dtype=None, **kwargs):
......@@ -131,13 +84,13 @@ def zeros_like(model, dtype=None, **kwargs):
The values can be accessed only after the graph is run.
If dtype is ``None``, use ``config.floatX``.
If dtype is *None*, use *config.floatX*.
Parameters
----------
model : Tensor
The tensor to refer shape.
dtype : str
dtype : str, optional
The data type of Tensor.
Returns
......@@ -146,16 +99,13 @@ def zeros_like(model, dtype=None, **kwargs):
The initialized tensor.
"""
if dtype is None: dtype = config.floatX
else:
raise TypeError("Unsupported data type: {}".format(dtype))
return ops.Fill(shape=ops.Shape(model), value=0)
return zeros(shape=model.shape, dtype=dtype)
def ones(shape, dtype=None):
"""Initialize a tensor with ones.
If dtype is ``None``, use ``config.floatX``.
If dtype is *None*, use *config.floatX*.
Parameters
----------
......@@ -170,14 +120,8 @@ def ones(shape, dtype=None):
The initialized tensor.
"""
if dtype is None: dtype = config.floatX
else:
if dtype not in _DATA_TYPES.keys():
raise TypeError("Unsupported data type: {}".format(dtype))
np_value = np.ones(shape, dtype=_DATA_TYPES[dtype])
output = Tensor(shape=shape, dtype=dtype)
output.set_value(np_value)
return output
if dtype is None: dtype = _cfg.floatX
return _ops.Fill(shape=shape, value=1, dtype=dtype)
def ones_like(model, dtype=None, **kwargs):
......@@ -185,7 +129,7 @@ def ones_like(model, dtype=None, **kwargs):
The values can be accessed only after the graph is run.
If dtype is ``None``, use ``config.floatX``.
If dtype is *None*, use *config.floatX*.
Parameters
----------
......@@ -200,16 +144,13 @@ def ones_like(model, dtype=None, **kwargs):
The initialized tensor.
"""
if dtype is None: dtype = config.floatX
else:
raise TypeError("Unsupported data type: {}".format(dtype))
return ops.Fill(shape=ops.Shape(model), value=1)
return ones(shape=model.shape, dtype=dtype)
def cast(x, dtype):
"""Cast input to the tensor of specific data type.
If dtype is ``None``, use ``config.floatX``.
If dtype is *None*, use *config.floatX*.
Parameters
----------
......@@ -224,8 +165,8 @@ def cast(x, dtype):
The output tensor.
"""
if dtype is None: dtype = config.floatX
raise NotImplementedError()
if dtype is None: dtype = _cfg.floatX
return x.astype(dtype)
def dot(a, b):
......@@ -246,7 +187,7 @@ def dot(a, b):
The output tensor.
"""
return ops.Dot([a, b])
return _ops.Dot([a, b])
def batched_tensordot(x, y, axes=2):
......@@ -269,7 +210,7 @@ def transpose(x, axes=None):
The output tensor.
"""
return ops.Transpose(x, perm=axes)
return _ops.Transpose(x, perm=axes)
def max(x, axis=None, keepdims=False):
......@@ -291,7 +232,7 @@ def max(x, axis=None, keepdims=False):
"""
if axis is None: axis = -1
return ops.Max(x, axis=axis, keep_dims=keepdims)
return _ops.Max(x, axis=axis, keep_dims=keepdims)
def min(x, axis=None, keepdims=False):
......@@ -313,7 +254,7 @@ def min(x, axis=None, keepdims=False):
"""
if axis is None: axis = -1
return ops.Min(x, axis=axis, keep_dims=keepdims)
return _ops.Min(x, axis=axis, keep_dims=keepdims)
def sum(input, axis=None, keepdims=False, **kwargs):
......@@ -335,7 +276,7 @@ def sum(input, axis=None, keepdims=False, **kwargs):
"""
if axis is None: axis = -1
return ops.Sum(input, axis=axis, keep_dims=keepdims)
return _ops.Sum(input, axis=axis, keep_dims=keepdims)
def mean(input, axis=None, keepdims=False, **kwargs):
......@@ -357,7 +298,7 @@ def mean(input, axis=None, keepdims=False, **kwargs):
"""
if axis is None: axis = -1
return ops.Mean(input, axis=axis, keep_dims=keepdims)
return _ops.Mean(input, axis=axis, keep_dims=keepdims)
def prod(input, axis=None, keepdims=False, **kwargs):
......@@ -401,7 +342,7 @@ def argmax(x, axis=None, keepdims=False):
"""
if axis is None: axis = -1
return ops.ArgMax(x, axis=axis, keep_dims=keepdims)
return _ops.ArgMax(x, axis=axis, keep_dims=keepdims)
def argmin(x, axis=None, keepdims=False):
......@@ -423,7 +364,7 @@ def argmin(x, axis=None, keepdims=False):
"""
if axis is None: axis = -1
return ops.ArgMin(x, axis=axis, keep_dims=keepdims)
return _ops.ArgMin(x, axis=axis, keep_dims=keepdims)
def square(a):
......@@ -440,7 +381,7 @@ def square(a):
The square result.
"""
return ops.Square(a)
return _ops.Square(a)
def sqrt(a):
......@@ -457,7 +398,7 @@ def sqrt(a):
The sqrt result.
"""
return ops.Sqrt(a)
return _ops.Sqrt(a)
def pow(a, power):
......@@ -474,7 +415,7 @@ def pow(a, power):
The pow result.
"""
return ops.Pow(a, power)
return _ops.Pow(a, power)
def exp(a):
......@@ -491,7 +432,7 @@ def exp(a):
The exponential result.
"""
return ops.Exp(a)
return _ops.Exp(a)
def log(a):
......@@ -508,7 +449,7 @@ def log(a):
The logarithm result.
"""
return ops.Log(a)
return _ops.Log(a)
def clip(x, min=None, max=None):
......@@ -529,7 +470,7 @@ def clip(x, min=None, max=None):
The clip result.
"""
return ops.Clip(x, low=min, high=max)
return _ops.Clip(x, low=min, high=max)
def join(axis, *tensors_list):
......@@ -548,7 +489,7 @@ def join(axis, *tensors_list):
The output tensor.
"""
return ops.Concat(list(tensors_list), axis=axis)
return _ops.Concat(list(tensors_list), axis=axis)
def stack(*tensors, **kwargs):
......@@ -573,7 +514,7 @@ def stack(*tensors, **kwargs):
"""
if not 'axis' in kwargs: axis = 0
else: axis = kwargs['axis']
return ops.Stack(list(tensors), axis=axis)
return _ops.Stack(list(tensors), axis=axis)
def concatenate(tensor_list, axis=0):
......@@ -594,7 +535,7 @@ def concatenate(tensor_list, axis=0):
The output tensor.
"""
return ops.Concat(tensor_list, axis=axis)
return _ops.Concat(tensor_list, axis=axis)
def reshape(x, newshape, **kwargs):
......@@ -613,7 +554,7 @@ def reshape(x, newshape, **kwargs):
The output tensor.
"""
return ops.Reshape(x, shape=newshape)
return _ops.Reshape(x, shape=newshape)
def flatten(x, outdim=1):
......@@ -632,7 +573,7 @@ def flatten(x, outdim=1):
The output tensor.
"""
return ops.Flatten(x, keep_axes=outdim)
return _ops.Flatten(x, keep_axes=outdim)
def repeat(x, repeats, axis=None):
......@@ -654,7 +595,7 @@ def repeat(x, repeats, axis=None):
"""
if axis is None: axis = -1
return ops.Repeat(x, axis=axis, repeats=repeats)
return _ops.Repeat(x, axis=axis, repeats=repeats)
def tile(x, reps, **kwargs):
......@@ -673,7 +614,7 @@ def tile(x, reps, **kwargs):
The output tensor.
"""
return ops.Tile(x, multiples=reps)
return _ops.Tile(x, multiples=reps)
def arange(start, stop=None, step=1, dtype=None):
......@@ -698,4 +639,4 @@ def arange(start, stop=None, step=1, dtype=None):
The vector.
"""
return ops.Arange(start=start, stop=stop, step=1, dtype=dtype.upper())
\ No newline at end of file
return _ops.Arange(start=start, stop=stop, step=step, dtype=dtype)
\ No newline at end of file
......@@ -9,8 +9,11 @@
#
# ------------------------------------------------------------
from dragon.core.tensor import Tensor
import dragon.ops as ops
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon import ops as _ops
def cumsum(x, axis=None):
......@@ -20,8 +23,8 @@ def cumsum(x, axis=None):
----------
x : Tensor
The input tensor.
axis : int
The axis to sum. Default is ``None`` (Along all axes).
axis : int, optional
The axis to sum.
"""
raise NotImplementedError()
......@@ -34,8 +37,8 @@ def cumprod(x, axis=None):
----------
x : Tensor
The input tensor.
axis : int
The axis to sum. Default is ``None`` (Along all axes).
axis : int, optional
The axis to accumulate the product along.
"""
raise NotImplementedError()
......@@ -59,5 +62,5 @@ def to_one_hot(y, nb_class, **kwargs):
The one hot matrix.
"""
flat_y = ops.Flatten(y, keep_axes=1)
return ops.OneHot(flat_y, depth=nb_class)
flat_y = _ops.Flatten(y, keep_axes=1)
return _ops.OneHot(flat_y, depth=nb_class)
......@@ -9,8 +9,12 @@
#
# ------------------------------------------------------------
from dragon.core.tensor import Tensor
import dragon.ops as ops
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon import ops as _ops
from dragon.core.tensor import Tensor as _Tensor
def batch_normalization(inputs, gamma, beta, mean, var, **kwargs):
......@@ -35,7 +39,7 @@ def batch_normalization(inputs, gamma, beta, mean, var, **kwargs):
The output tensor.
"""
return ops.BatchNorm([inputs, mean, var, gamma, beta])
return _ops.BatchNorm([inputs, mean, var, gamma, beta])
def relu(x, alpha=0):
......@@ -54,8 +58,8 @@ def relu(x, alpha=0):
The output tensor.
"""
if alpha == 0: return ops.Relu(x)
else: return ops.LRelu(x, slope=alpha)
if alpha == 0: return _ops.Relu(x)
else: return _ops.LRelu(x, slope=alpha)
def softmax(c):
......@@ -74,7 +78,7 @@ def softmax(c):
The output tensor.
"""
return ops.Softmax(c, axis=1)
return _ops.Softmax(c, axis=1)
def categorical_crossentropy(coding_dist, true_dist, axis=1):
......@@ -95,7 +99,7 @@ def categorical_crossentropy(coding_dist, true_dist, axis=1):
The categorical cross-entropy.
"""
return -ops.Sum(true_dist * ops.Log(coding_dist), axis=axis)
return -_ops.Sum(true_dist * _ops.Log(coding_dist), axis=axis)
def sigmoid(x):
......@@ -112,7 +116,7 @@ def sigmoid(x):
The output tensor.
"""
return ops.Sigmoid(x)
return _ops.Sigmoid(x)
def tanh(x):
......@@ -129,7 +133,7 @@ def tanh(x):
The output tensor.
"""
return ops.Tanh(x)
return _ops.Tanh(x)
def binary_crossentropy(output, target):
......@@ -148,7 +152,7 @@ def binary_crossentropy(output, target):
The binary cross-entropy.
"""
return -(target * ops.Log(output) + (1.0 - target) * ops.Log(1.0 - output))
return -(target * _ops.Log(output) + (1. - target) * _ops.Log(1. - output))
......
......@@ -9,5 +9,11 @@
#
# ------------------------------------------------------------
from .variable import Variable
from .grad_mode import no_grad, enable_grad, set_grad_enabled
\ No newline at end of file
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.torch.autograd.variable import Variable
from dragon.vm.torch.autograd.grad_mode import no_grad
from dragon.vm.torch.autograd.grad_mode import enable_grad
from dragon.vm.torch.autograd.grad_mode import set_grad_enabled
\ No newline at end of file
......@@ -17,16 +17,13 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.core import tls as _tls
__all__ = [
'is_grad_enabled',
'no_grad',
'enable_grad',
'set_grad_enabled',
]
grad_option = {'enable_grad': True}
def _set_grad_enabled(enabled=True):
"""Set the status of grad option."""
global _GLOBAL_GRAD_OPTION
_GLOBAL_GRAD_OPTION.enabled = enabled
def is_grad_enabled():
......@@ -38,14 +35,7 @@ def is_grad_enabled():
``True`` if enabling auto-grad.
"""
global grad_option
return grad_option['enable_grad']
def _set_grad_enabled(enabled=True):
global grad_option
grad_option['enable_grad'] = enabled
return _GLOBAL_GRAD_OPTION.enabled
class no_grad(object):
......@@ -97,3 +87,6 @@ class set_grad_enabled(object):
def __exit__(self, *args):
_set_grad_enabled(self.prev)
return False
_GLOBAL_GRAD_OPTION = _tls.Constant(enabled=True)
\ No newline at end of file
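# A minimal, self-contained sketch of the flag-plus-context-manager pattern
# used above. Assumptions: `_tls.Constant` behaves like a per-thread attribute
# holder, mimicked here with `threading.local`; this is an illustration, not
# Dragon's implementation.
import threading

_STATE = threading.local()
_STATE.enabled = True

def sketch_is_grad_enabled():
    # Threads that never touched the flag fall back to the default.
    return getattr(_STATE, 'enabled', True)

class sketch_no_grad(object):
    def __enter__(self):
        self.prev = sketch_is_grad_enabled()
        _STATE.enabled = False
    def __exit__(self, *args):
        _STATE.enabled = self.prev
        return False

with sketch_no_grad():
    assert not sketch_is_grad_enabled()
assert sketch_is_grad_enabled()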
......@@ -15,11 +15,12 @@ from __future__ import print_function
import warnings
import dragon.core.tensor_utils as tensor_utils
import dragon.core.workspace as ws
from dragon.core import tensor_utils as _tensor_utils
from dragon.core.workspace import Backward as _backward_impl
from dragon.vm.torch.tensor import Tensor
from dragon.vm.torch.pool import TensorPool, OperatorPool
from dragon.vm.torch.c_api import _get_tensor_pool
from dragon.vm.torch.c_api import _get_operator_pool
from dragon.vm.torch.tensor import Tensor as _Tensor
def Variable(tensor, requires_grad=False, volatile=False):
......@@ -44,32 +45,32 @@ def backward(self, gradient=None):
raise RuntimeError('This variable does not require grads.'
'\nCan not backward from this variable.')
# 1. Expressions -> Forward-Ops
# We should sort out the topology of these operators before using
# 1) expressions -> forward_ops
# We should sort out the topology before dispatching the ops
all_expressions = sorted(self.__jit_recorder__.ops.items(), key=lambda d: d[0])
forward_ops = [v for k, v in all_expressions]
# 2. Forward-Ops + Targets + InputGrads + IgnoredGrads -> Backward-Ops
targets = [self.name]; input_grads = []
# 2) forward_ops + targets + input_grads + ignored_grads -> backward_ops
targets, input_grads = [self.name], []
ignored_grads = list(self._ignored_grads) if self._ignored_grads else []
if gradient is not None:
if not isinstance(gradient, Tensor):
if not isinstance(gradient, _Tensor):
raise TypeError('gradients can be either Tensors, Variables or None,'
' but got {}'.format(type(gradient)))
tensor_utils.FromPyArray(gradient.cpu().numpy(), self.name + '_grad')
_tensor_utils.FromArray(gradient.numpy(True), self.name + '_grad')
input_grads.append(self.name + '_grad')
# 3. Flow or Flow or Flow
ws.FlowGradients(forward_ops, targets, input_grads, ignored_grads)
# 3. Dispatch the backward ops
_backward_impl(forward_ops, targets, input_grads, ignored_grads)
# 4. Release resources
# We should release both the operator handles and tensors
for forward_op in forward_ops:
OperatorPool.put(forward_op.name)
_get_operator_pool().put(forward_op.name)
for output in forward_op.output:
if output not in forward_op.input:
TensorPool.put(output)
_get_tensor_pool().put(output)
Tensor.backward = backward
Tensor.volatile = volatile
\ No newline at end of file
_Tensor.backward = backward
_Tensor.volatile = volatile
\ No newline at end of file
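# A hypothetical usage sketch of the backward pipeline above (the names `x`
# and `y` are placeholders, assuming `y` was produced from `x` with the JIT
# recorder attached and `requires_grad=True`):
#
#   y.backward()                 # 1-3) sort the recorded ops, then dispatch Backward
#   print(x.grad.numpy(True))    # read the computed gradient
#   # 4) the pooled operator/tensor names are recycled automatically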
......@@ -17,7 +17,9 @@ import copy
import numpy
import importlib
from dragon.core import mapping, tensor_utils
from dragon.core import mapping as _mapping
from dragon.core import workspace as _workspace
from dragon.core import tensor_utils as _tensor_utils
class Size(tuple):
......@@ -65,10 +67,10 @@ def from_numpy(data):
"""
if not isinstance(data, numpy.ndarray):
raise TypeError('The data should be a numpy.ndarray.')
if str(data.dtype) not in mapping.TENSOR_TYPE_TO_TORCH_TENSOR:
if str(data.dtype) not in _mapping.TENSOR_TYPE_TO_TORCH_TENSOR:
raise ValueError('Unsupported type({}) to torch tensor.'.format(data.dtype))
module = importlib.import_module('dragon.vm.torch.tensor')
return getattr(module, mapping.TENSOR_TYPE_TO_TORCH_TENSOR[str(data.dtype)])(data)
return getattr(module, _mapping.TENSOR_TYPE_TO_TORCH_TENSOR[str(data.dtype)])(data)
def from_dragon(tensor, own_storage=False):
......@@ -89,10 +91,20 @@ def from_dragon(tensor, own_storage=False):
The torch tensor.
"""
storage = tensor_utils.GetStorage(tensor)
storage = _tensor_utils.GetStorage(tensor)
if storage is None: return None
module = importlib.import_module('dragon.vm.torch.tensor')
T = getattr(module, mapping.TENSOR_TYPE_TO_TORCH_TENSOR[storage.dtype])()
T = getattr(module, _mapping.TENSOR_TYPE_TO_TORCH_TENSOR[storage.dtype])()
T._storage, T._own_storage, T._tensor = storage, own_storage, tensor
T._device = device(*storage.device)
return T
def _get_tensor_pool():
"""Return the tensor pool of current workspace."""
return _workspace.get_default_workspace().tensor_pool
def _get_operator_pool():
"""Return the operator pool of current workspace."""
return _workspace.get_default_workspace().operator_pool
\ No newline at end of file
......@@ -28,21 +28,23 @@ from __future__ import division
from __future__ import print_function
import six
import dragon as dg
import dragon.import_c_api as C
from dragon.config import option
from .c_api import device as _Device
from .jit import JITRecorder, is_jit_enforced
from .autograd.grad_mode import is_grad_enabled
from .tensor import _RuntimeTensor
from .pool import TensorPool
from dragon import import_c_api as _C
from dragon.config import option as _options
from dragon.core import workspace as _workspace
from dragon.vm.torch.c_api import _get_tensor_pool
from dragon.vm.torch.c_api import device as _Device
from dragon.vm.torch.jit import JITRecorder, is_jit_enforced
from dragon.vm.torch.autograd.grad_mode import is_grad_enabled
from dragon.vm.torch.tensor import _RuntimeTensor
def RunOperator(
inputs, outputs, meta,
inputs,
outputs,
meta,
auto_grad=True,
callback_on_run=None):
callback_on_run=None,
):
if not isinstance(inputs, list): inputs = [inputs]
if not isinstance(outputs, list): outputs = [outputs]
if len(outputs) == 0:
......@@ -67,14 +69,15 @@ def RunOperator(
else:
# Legacy mode, a torch tensor is expected
if isinstance(output, _Device):
name = TensorPool.get('${JOIN}' if requires_grad else '${DETACH}')
name = _get_tensor_pool().get(
'${JOIN}' if requires_grad else '${DETACH}')
outputs[ix] = _RuntimeTensor(name, device=output)
outputs_name.append(outputs[ix].name)
# Key + Inputs + Outputs => Op
op_name = 'runtime'
persistent_key, meta_op = meta
op = C.OperatorDef(); op.CopyFrom(meta_op)
op = _C.OperatorDef(); op.CopyFrom(meta_op)
op.input, op.output = inputs_name, outputs_name
# Auto-Grad
......@@ -106,9 +109,9 @@ def RunOperator(
if callback_on_run: callback_on_run(op_name)
# Run
dg.workspace.RunOperator(op,
verbose=option['log_optimized_graph'] or
option['log_meta_graph'])
_workspace.RunOperator(op,
verbose=_options['log_optimized_graph'] or
_options['log_meta_graph'])
# Returns
if len(outputs) > 1: return outputs
......
......@@ -15,10 +15,8 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.torch.pool import OperatorPool
_ENFORCE_JIT_TRACER = False
from dragon.core import tls as _tls
from dragon.vm.torch.c_api import _get_operator_pool
def _Incrementer():
......@@ -38,7 +36,7 @@ class JITRecorder(object):
def append(self, op):
uid = next(self.UID_GENERATOR)
op_name = OperatorPool.get(op.type)
op_name = _get_operator_pool().get(op.type)
self.ops[uid] = op
self.ops[uid].name = op_name
return op_name
......@@ -70,6 +68,11 @@ class JITRecorder(object):
return buffer0 + buffer2 + buffer1 + buffer0
def is_jit_enforced():
"""Whether jit tracer is enforced."""
return _GLOBAL_ENFORCE_JIT_TRACER.enabled
class enforce_jit(object):
"""Context-manager that enforce the jit tracer."""
......@@ -77,13 +80,12 @@ class enforce_jit(object):
self.prev = is_jit_enforced()
def __enter__(self):
global _ENFORCE_JIT_TRACER
_ENFORCE_JIT_TRACER = True
global _GLOBAL_ENFORCE_JIT_TRACER
_GLOBAL_ENFORCE_JIT_TRACER.enabled = True
def __exit__(self, *args):
global _ENFORCE_JIT_TRACER
_ENFORCE_JIT_TRACER = self.prev
global _GLOBAL_ENFORCE_JIT_TRACER
_GLOBAL_ENFORCE_JIT_TRACER.enabled = self.prev
def is_jit_enforced():
return _ENFORCE_JIT_TRACER
\ No newline at end of file
_GLOBAL_ENFORCE_JIT_TRACER = _tls.Constant(enabled=False)
\ No newline at end of file
......@@ -24,10 +24,12 @@ import dragon
import warnings
from collections import OrderedDict
from dragon.core import proto_utils, logging
from dragon.core.scope import get_default_name_scope
from dragon.core import scope as _scope
from dragon.core import logging as _logging
from dragon.core import proto_utils as _proto_utils
from dragon.core import tensor_utils as _tensor_utils
from dragon.vm.torch.c_api import device as Device
from dragon.vm.torch.c_api import device as _Device
from dragon.vm.torch.tensor import Tensor, Parameter
from dragon.vm.torch.execution import RunOperator
from dragon.vm.torch.environ import add_submodule, get_module_name
......@@ -38,7 +40,7 @@ class Module(object):
self._modules = OrderedDict()
self._parameters = OrderedDict()
self._buffers = OrderedDict()
self._device = Device()
self._device = _Device()
self._module_key = None
self._module_def = None
self.training = True
......@@ -107,7 +109,7 @@ class Module(object):
return destination
def load_state_dict(self, state_dict, strict=True, verbose=True):
if verbose: logging.info('Load the state dict.')
if verbose: _logging.info('Load the state dict.')
unexpected = []
own_state = self.state_dict()
for name, param in state_dict.items():
......@@ -122,12 +124,12 @@ class Module(object):
if isinstance(param, Tensor):
own_state[name].copy_(param)
elif isinstance(param, numpy.ndarray):
dragon.tensor_utils.SetPyArray(own_state[name], param)
_tensor_utils.SetArray(own_state[name], param)
else:
raise ValueError('Expected the type of source state to be either '
'dragon.vm.torch.Tensor or numpy.ndarray, got {}.'.format(type(param)))
if verbose:
logging.info('Tensor({}) loaded, Size: ({})'.format(name,
_logging.info('Tensor({}) loaded, Size: ({})'.format(name,
', '.join([str(d) for d in param_shape])))
else:
unexpected.append(name)
......@@ -192,7 +194,7 @@ class Module(object):
raise NotImplementedError('The base module can not be called.')
def name_scope(self, remove_separator=True):
scope = get_default_name_scope()
scope = _scope.get_default_name_scope()
if remove_separator and \
len(scope) > 0 and \
scope[-1] == '/':
......@@ -268,7 +270,7 @@ class Module(object):
return self
def cpu(self):
self._device = Device()
self._device = _Device()
# Remove key and op to re-create a one with new device
self._module_key = self._module_def = None
return self._apply(lambda t: t.cpu(),
......@@ -276,7 +278,7 @@ class Module(object):
def cuda(self, device=None):
if device is None: device = dragon.config.GetGPU()
self._device = Device('cuda', device)
self._device = _Device('cuda', device)
# Remove key and op to re-create a one with new device
self._module_key = self._module_def = None
return self._apply(lambda t: t.cuda(device),
......@@ -309,11 +311,11 @@ class Module(object):
def _gen_module_def(self):
self._module_def = \
proto_utils.MakeCXXOperatorDef(
_proto_utils.MakeCXXOperatorDef(
name='runtime',
uid=self.module_key,
op_type=self.op_meta['op_type'],
device_option=proto_utils.
device_option=_proto_utils.
GetDeviceOption(
self._device.type,
self._device.index),
......
......@@ -13,6 +13,10 @@
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import warnings
......
......@@ -19,6 +19,7 @@ from __future__ import print_function
import math
import warnings
from dragon.vm.torch.autograd.grad_mode import no_grad
......
......@@ -13,10 +13,10 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import dragon as dg
import numpy
from dragon.core import proto_utils
from dragon.core import proto_utils as _proto_utils
from dragon.core import workspace as _workspace
from dragon.vm.torch.module import Module
......@@ -25,9 +25,10 @@ class BaseModule(Module):
super(BaseModule, self).__init__()
self._module_key = key
self._device = dev
self._args_dev = proto_utils.\
self._args_dev = _proto_utils.\
GetDeviceOption('cpu').SerializeToString()
def set_argument_i64(self, name, value):
dg.C.FeedTensor(name, np.array(
value, dtype=np.int64), self._args_dev)
\ No newline at end of file
_workspace.get_default_workspace()\
.FeedTensor(name, numpy.array(
value, dtype=numpy.int64), self._args_dev)
\ No newline at end of file
......@@ -13,10 +13,10 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import dragon as dg
import numpy
from dragon.vm.torch.tensor import *
from dragon.core import workspace as _workspace
from dragon.vm.torch.tensor import Tensor as _Tensor
from dragon.vm.torch.c_api import device as _Device
......@@ -49,8 +49,8 @@ def WrapScalar(scalar, dtype, device):
if 'float' in dtype: scalar = float(scalar)
if 'int' in dtype: scalar = int(scalar)
name = '/share/scalar/{}/{}'.format(dtype, str(scalar))
if not dg.workspace.HasTensor(name):
dg.workspace.FeedTensor(name, np.array(scalar, dtype=dtype))
t = Tensor(name=name, dtype=dtype, device=device, own_storage=False)
if not _workspace.HasTensor(name):
_workspace.FeedTensor(name, numpy.array(scalar, dtype=dtype))
t = _Tensor(name=name, dtype=dtype, device=device, own_storage=False)
t.requires_grad = False
return t
\ No newline at end of file
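# A standalone sketch (not Dragon code) of the caching scheme above: a scalar
# is keyed by '/share/scalar/<dtype>/<value>', fed once, and reused afterwards.
# The module-level dict stands in for the workspace.
import numpy

_SCALAR_CACHE = {}

def sketch_wrap_scalar(scalar, dtype):
    scalar = float(scalar) if 'float' in dtype else int(scalar)
    name = '/share/scalar/{}/{}'.format(dtype, str(scalar))
    if name not in _SCALAR_CACHE:
        _SCALAR_CACHE[name] = numpy.array(scalar, dtype=dtype)
    return name, _SCALAR_CACHE[name]

name1, _ = sketch_wrap_scalar(2, 'float32')
name2, _ = sketch_wrap_scalar(2.0, 'float32')
assert name1 == name2 == '/share/scalar/float32/2.0'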
......@@ -9,7 +9,11 @@
#
# ------------------------------------------------------------
from .adam import Adam
from .sgd import SGD
from .rmsprop import RMSprop
from .optimizer import Optimizer
\ No newline at end of file
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.torch.optim.adam import Adam
from dragon.vm.torch.optim.sgd import SGD
from dragon.vm.torch.optim.rmsprop import RMSprop
from dragon.vm.torch.optim.optimizer import Optimizer
\ No newline at end of file
......@@ -21,21 +21,38 @@ from dragon.vm.torch.optim.optimizer import Optimizer
class Adam(Optimizer):
def __init__(self, params, lr=1e-3, beta1=0.9, beta2=0.999, eps=1e-8,
weight_decay=0, amsgrad=False, scale_gradient=1.0, clip_gradient=-1.0):
if not 0.0 <= lr:
def __init__(
self,
params,
lr=1e-3,
beta1=0.9,
beta2=0.999,
eps=1e-8,
weight_decay=0,
amsgrad=False,
scale_gradient=1.,
clip_gradient=-1.,
):
if not 0. <= lr:
raise ValueError("Invalid learning rate: {}".format(lr))
if not 0.0 <= eps:
if not 0. <= eps:
raise ValueError("Invalid epsilon value: {}".format(eps))
if not 0.0 <= beta1 < 1.0:
if not 0. <= beta1 < 1.:
raise ValueError("Invalid beta parameter at index 0: {}".format(beta1))
if not 0.0 <= beta2 < 1.0:
if not 0. <= beta2 < 1.:
raise ValueError("Invalid beta parameter at index 1: {}".format(beta2))
if amsgrad:
raise NotImplementedError()
defaults = dict(lr=lr, beta1=beta1, beta2=beta2, eps=eps,
weight_decay=weight_decay, amsgrad=amsgrad,
scale_gradient=scale_gradient, clip_gradient=clip_gradient)
defaults = dict(
lr=lr,
beta1=beta1,
beta2=beta2,
eps=eps,
weight_decay=weight_decay,
amsgrad=amsgrad,
scale_gradient=scale_gradient,
clip_gradient=clip_gradient,
)
super(Adam, self).__init__(params, defaults)
self._update_type = 'AdamUpdate'
self._mutable_parameters = {
......
......@@ -17,14 +17,14 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon
from collections import defaultdict
from dragon.vm.torch.tensor import Tensor
from dragon.vm.torch.ops.builtin import (
_accumulate, _allreduce, _update,
)
from dragon.core import mpi as _mpi
from dragon.core import workspace as _workspace
from dragon.vm.torch.tensor import Tensor as _Tensor
from dragon.vm.torch.ops.builtin import _update
from dragon.vm.torch.ops.builtin import _allreduce
from dragon.vm.torch.ops.builtin import _accumulate
# A simple parameter flag
......@@ -37,7 +37,7 @@ class Optimizer(object):
def __init__(self, params, defaults):
self.defaults = defaults
if isinstance(params, Tensor):
if isinstance(params, _Tensor):
raise TypeError("params argument given to the optimizer should be "
"an iterable of Variables or dicts, but got " +
str(type(params)))
......@@ -52,9 +52,9 @@ class Optimizer(object):
self.add_param_group(param_group)
self._update_type = None
self._allow_parallel = False
if dragon.mpi.Is_Init():
local_rank, _ = dragon.mpi.AllowParallel()
if local_rank != -1: self._allow_parallel = True
if _mpi.Is_Init():
rank, _ = _mpi.AllowParallel()
if rank != -1: self._allow_parallel = True
self._mutable_parameters = {}
def __repr__(self):
......@@ -72,7 +72,7 @@ class Optimizer(object):
template = group['slot'] + '/{}'
for k, v in group.items():
if k in self._mutable_parameters:
dragon.workspace.FeedTensor(
_workspace.FeedTensor(
template.format(self._mutable_parameters[k]),
v, dtype='float32', force_cpu=True)
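# A toy, self-contained illustration of the feeding scheme above: each mutable
# hyper-parameter is renamed via `_mutable_parameters` and fed under
# '<slot>/<backend name>'. The slot string 'SGDUpdate/1' is a hypothetical value.
mutable = {'lr': 'base_lr'}  # taken from the mapping shown in this file
group = {'slot': 'SGDUpdate/1', 'lr': 0.1, 'params': []}
template = group['slot'] + '/{}'
feeds = {template.format(mutable[k]): v for k, v in group.items() if k in mutable}
assert feeds == {'SGDUpdate/1/base_lr': 0.1}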
......@@ -80,8 +80,8 @@ class Optimizer(object):
grad_name = param.name + (
'_grad[acc]' if accumulating
else '_grad')
if dragon.workspace.HasTensor(grad_name):
return Tensor(
if _workspace.HasTensor(grad_name):
return _Tensor(
name=grad_name,
own_storage=False,
device=param.device)
......@@ -172,7 +172,7 @@ class Optimizer(object):
params = param_group['params']
if isinstance(params, Tensor):
if isinstance(params, _Tensor):
param_group['params'] = [params]
elif isinstance(params, set):
raise TypeError('Optimizer parameters need to be organized in ordered collections,'
......
......@@ -21,21 +21,49 @@ from dragon.vm.torch.optim.optimizer import Optimizer
class RMSprop(Optimizer):
def __init__(self, params, lr=1e-2, alpha=0.99, eps=1e-8, weight_decay=0,
momentum=0, centered=False, scale_gradient=1.0, clip_gradient=-1.0):
if not 0.0 <= lr:
def __init__(
self,
params,
lr=1e-2,
alpha=0.99,
eps=1e-8,
weight_decay=0,
momentum=0,
centered=False,
scale_gradient=1.,
clip_gradient=-1.,
):
if not 0. <= lr:
raise ValueError("Invalid learning rate: {}".format(lr))
if not 0.0 <= eps:
if not 0. <= eps:
raise ValueError("Invalid epsilon value: {}".format(eps))
if momentum != 0:
raise NotImplementedError()
if not 0.0 <= alpha:
if momentum < 0.:
raise ValueError("Invalid momentum value: {}".format(momentum))
if not 0. <= alpha:
raise ValueError("Invalid alpha value: {}".format(alpha))
defaults = dict(lr=lr, momentum=momentum, alpha=alpha, eps=eps,
centered=centered, weight_decay=weight_decay,
scale_gradient=scale_gradient, clip_gradient=clip_gradient)
defaults = dict(
lr=lr,
momentum=momentum,
alpha=alpha,
eps=eps,
centered=centered,
weight_decay=weight_decay,
scale_gradient=scale_gradient,
clip_gradient=clip_gradient,
)
super(RMSprop, self).__init__(params, defaults)
if momentum != 0.:
self._update_type = 'AdamUpdate'
self._mutable_parameters = {
'lr': 'base_lr',
'momentum': 'beta1',
'alpha': 'beta2',
'eps': 'eps',
'weight_decay': 'l2_decay',
'clip_gradient': 'clip_gradient',
'scale_gradient': 'scale_gradient',
}
else:
self._update_type = 'RMSPropUpdate'
self._mutable_parameters = {
'lr': 'base_lr',
......
......@@ -21,17 +21,32 @@ from dragon.vm.torch.optim.optimizer import Optimizer, required
class SGD(Optimizer):
def __init__(self, params, lr=required, momentum=0, dampening=0,
weight_decay=-1.0, nesterov=False, scale_gradient=1.0, clip_gradient=-1.0):
if lr is not required and lr < 0.0:
def __init__(
self,
params,
lr=required,
momentum=0,
dampening=0,
weight_decay=-1.,
nesterov=False,
scale_gradient=1.,
clip_gradient=-1.,
):
if lr is not required and lr < 0.:
raise ValueError("Invalid learning rate: {}".format(lr))
if momentum < 0.0:
if momentum < 0.:
raise ValueError("Invalid momentum value: {}".format(momentum))
defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
weight_decay=weight_decay, nesterov=nesterov,
scale_gradient=scale_gradient, clip_gradient=clip_gradient)
if nesterov and (momentum <= 0 or dampening != 0):
raise ValueError("Nesterov momentum requires a momentum and zero dampening")
defaults = dict(
lr=lr,
momentum=momentum,
dampening=dampening,
weight_decay=weight_decay,
nesterov=nesterov,
scale_gradient=scale_gradient,
clip_gradient=clip_gradient,
)
if nesterov and (momentum <= 0. or dampening != 0.):
raise ValueError("Nesterov momentum requires a momentum and zero dampening.")
super(SGD, self).__init__(params, defaults)
self._update_type = 'NesterovUpdate' if nesterov else 'SGDUpdate'
self._mutable_parameters = {
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Implement some resource pools based on the dummy name. """
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon
from collections import defaultdict, deque
class _TensorPool(object):
"""We apply the TensorPool to manage the reused tensors.
Tensors with the same scope in the pool will be reused in turn,
which speeds up the whole system by avoiding unnecessary deallocation.
Heuristically, we have used 5 pools with different scopes:
* scope(Leaf): A pool to reuse leaf tensors.
* scope(NumPy): A pool to reuse leaf tensors from numpy.
* scope(Join): A pool to reuse RT(runtime) tensors required by forward-backward.
* scope(Detach): A pool to reuse RT(runtime) tensors required by forward only.
* scope(Reference): A pool to reuse reshaped tensors(sharing contents).
"""
def __init__(self):
# deque provides much higher performance than Queue
self._scope2keys = defaultdict(deque)
def get(self, scope='${DETACH}'):
try:
return self._scope2keys[scope].popleft()
except IndexError:
self._scope2keys[scope].append(
dragon.workspace.GetDummyName(
'${POOL}/%s/Tensor' % scope,
domain='Tensor', zero_based=False))
return self._scope2keys[scope].popleft()
def put(self, name):
if '${POOL}' in name:
scope, _ = name[8:].split('/')
self._scope2keys[scope].append(name)
return True
else: return False
class _OperatorPool(object):
"""Operators whose gradients is required will hold a resource handle,
which is also called ``Anchor`` in the backend.
We apply this pool to collect the handles according to the type of operator,
as the mem size of temporal resources varies greatly.
The resource handle will be released after the gradient flow automatically.
"""
def __init__(self):
# deque provides much higher performance than Queue
self._type2keys = defaultdict(deque)
def get(self, op_type):
try:
return self._type2keys[op_type].popleft()
except IndexError:
self._type2keys[op_type].append(
dragon.workspace.GetDummyName(
'${POOL}/%s' % op_type,
domain='Operator', zero_based=False))
return self._type2keys[op_type].popleft()
def put(self, op_name):
op_type, _ = op_name[8:].split('_')
self._type2keys[op_type].append(op_name)
# Define the global pools
TensorPool = _TensorPool()
OperatorPool = _OperatorPool()
\ No newline at end of file
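# A self-contained mimic (not Dragon code) of the reuse pattern above: getting
# a name draws from the per-scope deque or mints a new dummy name (here a plain
# counter stands in for `GetDummyName`), and putting it back makes the name
# available for the next request.
import itertools
from collections import defaultdict, deque

class SketchPool(object):
    def __init__(self):
        self._scope2keys = defaultdict(deque)
        self._counter = itertools.count()

    def get(self, scope='${DETACH}'):
        try:
            return self._scope2keys[scope].popleft()
        except IndexError:
            self._scope2keys[scope].append(
                '${POOL}/%s/Tensor:%d' % (scope, next(self._counter)))
            return self._scope2keys[scope].popleft()

    def put(self, name):
        if '${POOL}' in name:
            scope, _ = name[8:].split('/', 1)
            self._scope2keys[scope].append(name)
            return True
        return False

pool = SketchPool()
a = pool.get('Leaf')
pool.put(a)
assert pool.get('Leaf') == a  # the released name is reused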
......@@ -18,7 +18,7 @@ from __future__ import division
from __future__ import print_function
import os, sys, io
from dragon.core.tensor_utils import ToPyArray
from dragon.core.tensor_utils import ToArray as _to_array
if sys.version_info[0] == 2:
import cPickle as pickle
......@@ -67,7 +67,7 @@ def _save_dict(obj):
py_dict = type(obj)()
for k, v in obj.items():
if isinstance(v, dict): py_dict[k] = _save_dict(v)
elif hasattr(v, 'name'): py_dict[k] = ToPyArray(v, True)
elif hasattr(v, 'name'): py_dict[k] = _to_array(v, True)
else: py_dict[k] = v
return py_dict
......@@ -79,7 +79,7 @@ def _save(obj, f, pickle_module, pickle_protocol):
py_dict = type(obj)()
for k, v in obj.items():
if isinstance(v, dict): py_dict[k] = _save_dict(v)
elif hasattr(v, 'name'): py_dict[k] = ToPyArray(v, True)
elif hasattr(v, 'name'): py_dict[k] = _to_array(v, True)
else: py_dict[k] = v
pickle_module.dump(py_dict, f, pickle_protocol)
......
......@@ -15,12 +15,18 @@ from __future__ import print_function
import six
import numpy
import dragon
from dragon.core import mapping, tensor_utils, proto_utils
from dragon.vm.torch.pool import TensorPool
from dragon.vm.torch.c_api import Size, from_dragon
from dragon import config as _cfg
from dragon.core import mapping as _mapping
from dragon.core.tensor import Tensor as _Tensor
from dragon.core import proto_utils as _proto_utils
from dragon.core import tensor_utils as _tensor_utils
from dragon import get_default_workspace as _workspace
from dragon.vm.torch.c_api import Size as _Size
from dragon.vm.torch.c_api import device as _Device
from dragon.vm.torch.c_api import _get_tensor_pool
from dragon.vm.torch.c_api import from_dragon as _from_dragon
class Tensor(object):
......@@ -44,7 +50,7 @@ class Tensor(object):
if len(args) == 0:
# + empty tensor, not leaf
if self._tensor is not None:
dragon.C.CreateTensor(self._tensor)
_workspace().CreateTensor(self._tensor)
elif len(args) == 1:
if isinstance(args[0], (list, tuple)):
# + torch.Tensor(sequence)
......@@ -65,23 +71,23 @@ class Tensor(object):
self._init_from_shape(args, kwargs.get('dtype', 'float32'))
# Store the reference of backend
self._storage = dragon.C.GetTensor(self.name) \
if self.name is not None else None
self._storage = _workspace().GetTensor(
self.name) if self.name is not None else None
def _init_from_numpy(self, array):
self._static_shape = Size(array.shape)
self._static_shape = _Size(array.shape)
# We use the scope of ``numpy`` instead of ``leaf``
# As it is costly to switch memory between ``copy`` and ``zero-copy``
self._tensor = tensor_utils.FromPyArray(
array, TensorPool.get('${NUMPY}'))
self._tensor = _tensor_utils.FromArray(
array, _get_tensor_pool().get('${NUMPY}'))
self._ignored_grads = {self.name + '_grad'} \
if not self._requires_grad else None
def _init_from_shape(self, shape, dtype):
if isinstance(shape, six.integer_types): shape = [shape]
self._static_shape = Size(shape)
self._tensor = tensor_utils.FromShape(
shape, dtype, TensorPool.get('${LEAF}'))
self._static_shape = _Size(shape)
self._tensor = _tensor_utils.FromShape(
shape, dtype, _get_tensor_pool().get('${LEAF}'))
self._ignored_grads = {self.name + '_grad'} \
if not self._requires_grad else None
......@@ -137,7 +143,7 @@ class Tensor(object):
The self.
"""
if device is None: device = dragon.config.GetGPU()
if device is None: device = _cfg.GetGPU()
self._storage.ToCUDA(device)
self._device.type, self._device.index = 'cuda', device
return self
......@@ -156,7 +162,7 @@ class Tensor(object):
The numpy array.
"""
return tensor_utils.ToPyArray(self._tensor, readonly)
return _tensor_utils.ToArray(self._tensor, readonly)
def dragon(self):
"""Create a dragon tensor sharing this tensor.
......@@ -168,7 +174,7 @@ class Tensor(object):
"""
if isinstance(self._tensor, str):
return dragon.Tensor.Ref(self._tensor,
return _Tensor.Ref(self._tensor,
shape=self.shape, dtype=self.dtype)
else: return self._tensor
......@@ -453,8 +459,8 @@ class Tensor(object):
The float value.
"""
if self.numel() == 1: return float(str(self.data.squeeze()))
raise TypeError('Only size-1 arrays can be converted to Python scalars')
if self.numel() == 1: return float(self.numpy(readonly=True))
raise TypeError('Only size-1 arrays can be converted to Python scalars.')
def __int__(self):
"""Return a int Python scalar of size-1 tensor.
......@@ -473,7 +479,7 @@ class Tensor(object):
# Always reuse the leaf variables or
# tensors that do not require grad
# PyGC will detect them automatically
TensorPool.put(self.name)
_get_tensor_pool().put(self.name)
def _process_indices(self, item):
if not isinstance(item, (slice, tuple)):
......@@ -570,7 +576,7 @@ class Tensor(object):
The size.
"""
s = Size(self._storage.dims)
s = _Size(self._storage.dims)
return s[axis] if axis is not None else s
@property
......@@ -851,10 +857,10 @@ class Tensor(object):
"""
# Copy memory
tensor_utils.FromTensor(
src, proto_utils.GetDeviceOption(
_tensor_utils.FromTensor(
src, _proto_utils.GetDeviceOption(
src.device.type, src.device.index),
self.name, proto_utils.GetDeviceOption(
self.name, _proto_utils.GetDeviceOption(
self.device.type, self.device.index))
# Transfer the static shape if necessary
self._static_shape = src.size() \
......@@ -1484,7 +1490,7 @@ class Tensor(object):
@property
def grad(self):
g = from_dragon(self.name + '_grad', False)
g = _from_dragon(self.name + '_grad', False)
if g: g._static_shape = self.shape
return g
......@@ -1512,7 +1518,7 @@ class Tensor(object):
##############################################
def _type2str(self):
return mapping.TENSOR_TYPE_TO_TORCH_TENSOR[self.dtype]
return _mapping.TENSOR_TYPE_TO_TORCH_TENSOR[self.dtype]
def CharTensor(*args, **kwargs):
......@@ -1556,7 +1562,7 @@ def _LeafTensor(shape, dtype='float32', device=_Device(), requires_grad=False):
Commonly used to create leaf variables, i.e., the parameters or placeholders.
"""
constructor = globals()[mapping.TENSOR_TYPE_TO_TORCH_TENSOR[dtype]]
constructor = globals()[_mapping.TENSOR_TYPE_TO_TORCH_TENSOR[dtype]]
return constructor(*shape, device=device, requires_grad=requires_grad)
......@@ -1567,7 +1573,7 @@ def _RuntimeTensor(name, dtype='float32', device=_Device()):
i.e., the shape is computed by the backend automatically.
"""
constructor = globals()[mapping.TENSOR_TYPE_TO_TORCH_TENSOR[dtype]]
constructor = globals()[_mapping.TENSOR_TYPE_TO_TORCH_TENSOR[dtype]]
return constructor(name=name, device=device)
......@@ -1578,8 +1584,8 @@ def _ReferenceTensor(src):
i.e., view, squeeze, and unsqueeze.
"""
constructor = globals()[mapping.TENSOR_TYPE_TO_TORCH_TENSOR[src.dtype]]
T = constructor(name=TensorPool.get('${REFERENCE}'), device=src.device)
constructor = globals()[_mapping.TENSOR_TYPE_TO_TORCH_TENSOR[src.dtype]]
T = constructor(name=_get_tensor_pool().get('${REFERENCE}'), device=src.device)
T._ref_objects.append(src)
return T
......
......@@ -7,9 +7,9 @@ namespace dragon {
/*! Default constructor of <GraphBase> */
GraphBase::GraphBase(const GraphDef& meta_graph, Workspace* ws)
: name_(meta_graph.name()), ws_(ws) {
for (auto arg : meta_graph.arg()) {
GraphBase::GraphBase(const GraphDef& def, Workspace* ws)
: name_(def.name()), ws_(ws) {
for (auto arg : def.arg()) {
CHECK_GT(arg.name().size(), 0);
CHECK_EQ(args_.count(arg.name()), 0);
args_[arg.name()] = arg;
......@@ -18,7 +18,7 @@ GraphBase::GraphBase(const GraphDef& meta_graph, Workspace* ws)
Set<string> known_tensors;
// Topo-check for a graph
for (const auto& op : meta_graph.op()) {
for (const auto& op : def.op()) {
// Check inputs
for (const auto& in : op.input())
CHECK(known_tensors.count(in) || ws_->HasTensor(in))
......@@ -30,7 +30,7 @@ GraphBase::GraphBase(const GraphDef& meta_graph, Workspace* ws)
// Check for all solving targets
Set<string> objective_targets;
for (const auto& target : meta_graph.output()) {
for (const auto& target : def.output()) {
CHECK(known_tensors.count(target) ||
ws_->HasTensor(target))
<< "\nTarget: " << target
......@@ -39,7 +39,7 @@ GraphBase::GraphBase(const GraphDef& meta_graph, Workspace* ws)
}
// Check for all gradients
for (const auto& gradient : meta_graph.gradient()) {
for (const auto& gradient : def.gradient()) {
const auto& cost = gradient.cost();
const auto& wrt = gradient.wrt();
CHECK(known_tensors.count(cost) || ws_->HasTensor(cost))
......@@ -55,91 +55,23 @@ GraphBase::GraphBase(const GraphDef& meta_graph, Workspace* ws)
}
}
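As a plain-C++ reference for the check above: every op input must either be produced by an earlier op or already live in the workspace, otherwise graph construction fails. A minimal stand-alone sketch with a simplified Op struct (the real code walks protobuf OperatorDef messages and CHECKs instead of returning a bool; names here are illustrative only):

#include <iostream>
#include <set>
#include <string>
#include <vector>

struct Op {
    std::vector<std::string> inputs, outputs;
};

// Returns true iff every input is either produced by an earlier op
// or already present in the workspace tensor set.
bool TopoCheck(const std::vector<Op>& ops,
               const std::set<std::string>& workspace) {
    std::set<std::string> known = workspace;
    for (const auto& op : ops) {
        for (const auto& in : op.inputs)
            if (!known.count(in)) return false;  // unresolved input
        for (const auto& out : op.outputs)
            known.insert(out);                   // producible from now on
    }
    return true;
}

int main() {
    std::vector<Op> ops = {
        {{"data"}, {"conv1"}},
        {{"conv1"}, {"relu1"}},
    };
    std::cout << TopoCheck(ops, {"data"}) << "\n";  // 1
    std::cout << TopoCheck(ops, {}) << "\n";        // 0: "data" is missing
    return 0;
}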
/*! Build the update operators from the def */
GraphDef GraphBase::BuildUpdateOps(const GraphDef& input_def) {
OperatorDef collective_op;
collective_op.set_type("CollectiveUpdate");
// Generate Update Ops
vector<OperatorDef> update_ops;
for (const auto& updater : input_def.updater()) {
vector<string> missing_tensors;
for (const auto& tensor : updater.tensor()) {
if (!ws_->HasTensor(tensor)) {
LOG(INFO) << "Missing Tensor: " << tensor;
missing_tensors.push_back(tensor);
}
}
if (missing_tensors.size() == 0) {
vector<Argument> args;
for (const auto& arg : updater.arg()) args.push_back(arg);
OperatorDef op_def = MakeOperatorDef(updater.type(),
updater.name(),
vector<string>({ updater.tensor(1) }), // dX
vector<string>({ updater.tensor(0) })); // X
collective_op.add_input(updater.tensor(1));
collective_op.add_output(updater.tensor(1));
op_def.mutable_arg()->CopyFrom(updater.arg());
update_ops.push_back(op_def);
} else {
LOG(INFO) << "Missing tensors. Skip the update to Tensor("
<< updater.tensor(0) << ")";
}
}
// Generate Collective Ops if necessary
vector<OperatorDef> collective_ops;
if (args_.count("parallel_mode")) {
if (args_["parallel_mode"].s() == "MPI" ||
args_["parallel_mode"].s() == "NCCL") {
OperatorDef op_def;
op_def.CopyFrom(collective_op);
Argument collective_mode;
collective_mode.set_name("mode");
collective_mode.set_s(
args_["parallel_mode"].s() + "_ALLREDUCE");
op_def.add_arg()->CopyFrom(collective_mode);
if (args_.count("comm") &&
args_.count("group") &&
args_.count("root")) {
op_def.add_arg()->CopyFrom(args_["comm"]);
op_def.add_arg()->CopyFrom(args_["group"]);
op_def.add_arg()->CopyFrom(args_["root"]);
} else {
LOG(FATAL) << "MPI was not initialized.";
}
collective_ops.push_back(op_def);
}
}
// Generate graph
GraphDef update_graph(input_def);
update_graph.clear_updater();
for (const auto& op : collective_ops) update_graph.add_op()->CopyFrom(op);
for (const auto& op : update_ops) update_graph.add_op()->CopyFrom(op);
return update_graph;
}
/*! Create a graph from the optimized def */
bool Graph::Create(
const GraphDef& optimized_graph,
Workspace* ws) {
bool has_device_option = optimized_graph.has_device_option();
for (int i = 0; i < optimized_graph.op_size(); i++) {
OperatorDef op_def(optimized_graph.op(i));
bool Graph::Create(const GraphDef& def, Workspace* ws) {
bool has_device_option = def.has_device_option();
for (int i = 0; i < def.op_size(); i++) {
OperatorDef op_def(def.op(i));
LOG(DEBUG) << "Create Operator " << op_def.name()
<< ": " << op_def.type();
// Inherit device option if necessary
if (!op_def.has_device_option() && has_device_option)
op_def.mutable_device_option()->CopyFrom(
optimized_graph.device_option());
op_def.mutable_device_option()
->CopyFrom(def.device_option());
// For the static graph, mark ops as recomputing-aware
Argument arg; arg.set_name("allow_recomputing");
arg.set_i(1); op_def.add_arg()->CopyFrom(arg);
// For the last operator, enforce the synchronization
if (i == optimized_graph.op_size() - 1) {
if (i == def.op_size() - 1) {
arg.set_name("do_sync");
arg.set_i(1); op_def.add_arg()->CopyFrom(arg);
}
......@@ -151,53 +83,43 @@ bool Graph::Create(
/*! Default constructor of <Graph> */
Graph::Graph(const GraphDef& meta_graph, Workspace* ws)
: GraphBase(meta_graph, ws) {
GraphDef optimized_graph;
Graph::Graph(const GraphDef& def, Workspace* ws)
: GraphBase(def, ws) {
// Apply the optimizations
GraphDef opt_def = def;
GraphOptimizer graph_optim(ws);
GraphGradientMaker gradient_maker;
Map< string, vector<int> > subgraph_indices;
if (meta_graph.updater_size() > 0) {
/*!
* Check if existing any updaters.
*
* Note that the graph with update ops is not a dag,
* we should handle them independently.
*/
optimized_graph = this->BuildUpdateOps(meta_graph);
} else {
int OX = 3; // defaults: O3
int opt = 3; // defaults: O3
if (this->args_.count("optimization_level"))
OX = this->args_["optimization_level"].i();
optimized_graph = meta_graph;
GraphOptimizer optimizer(ws);
GraphGradientMaker gradient_maker;
if (OX >= 1) optimized_graph = optimizer.PruneNodes(meta_graph);
if (OX >= 2) optimized_graph = optimizer.AddInplace(optimized_graph);
if (OX >= 3) {
opt = this->args_["optimization_level"].i();
if (opt >= 1) opt_def = graph_optim.PruneNodes(def);
if (opt >= 2) opt_def = graph_optim.AddInplace(opt_def);
if (opt >= 3) {
if (this->args_["phase"].s() == "TRAIN") {
optimized_graph = optimizer.MirrorStage(
optimized_graph, subgraph_indices);
gradient_maker.Share(optimized_graph);
opt_def = graph_optim.MirrorStage(
opt_def, subgraph_indices);
opt_def = gradient_maker.Share(opt_def);
} else {
optimized_graph = optimizer.SimulateGC(optimized_graph);
}
opt_def = graph_optim.SimulateGC(opt_def);
}
}
// Try to store the final graph as a tensor for visualization
bool could_be_serialized = true;
for (auto& op : optimized_graph.op())
for (auto& op : opt_def.op())
if (op.type() == "GivenTensorFill")
could_be_serialized = false;
if (could_be_serialized) {
auto* T = ws_->CreateTensor(
"/graph_def/optimized/" +
meta_graph.name())->Reshape({ 1 });
opt_def.name())->Reshape({ 1 });
T->mutable_data<string, CPUContext>()[0]
= optimized_graph.DebugString();
= opt_def.DebugString();
}
// Create
Create(optimized_graph, ws);
Create(opt_def, ws);
// Recomputing-aware
if (subgraph_indices.size() > 0) {
......
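For reference, the pass scheduling above reduces to a small gate on the optimization level: O1 prunes dead nodes, O2 adds in-place rewrites, and O3 either applies mirror-stage plus gradient sharing (TRAIN) or simulates GC (inference). A rough sketch with stand-in pass functions, not the real GraphOptimizer / GraphGradientMaker:

#include <iostream>
#include <string>

struct GraphDef { std::string repr; };

// Stand-ins for the real optimization passes.
GraphDef PruneNodes(const GraphDef& g)  { return { g.repr + "+prune" }; }
GraphDef AddInplace(const GraphDef& g)  { return { g.repr + "+inplace" }; }
GraphDef MirrorStage(const GraphDef& g) { return { g.repr + "+mirror" }; }
GraphDef ShareGrads(const GraphDef& g)  { return { g.repr + "+share" }; }
GraphDef SimulateGC(const GraphDef& g)  { return { g.repr + "+gc" }; }

GraphDef Optimize(GraphDef def, int opt, const std::string& phase) {
    if (opt >= 1) def = PruneNodes(def);
    if (opt >= 2) def = AddInplace(def);
    if (opt >= 3) {
        if (phase == "TRAIN") def = ShareGrads(MirrorStage(def));
        else                  def = SimulateGC(def);
    }
    return def;
}

int main() {
    std::cout << Optimize({"graph"}, 3, "TRAIN").repr << "\n";  // +prune+inplace+mirror+share
    std::cout << Optimize({"graph"}, 3, "TEST").repr  << "\n";  // +prune+inplace+gc
    return 0;
}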
......@@ -168,7 +168,7 @@ void GraphGradientMaker::Make(
OperatorDef generate_op = MakeOperatorDef(
"GradientGenerate", GetOperatorName(),
op_inputs, op_outputs,
vector<Argument>(1, arg_defaults));
vector<Argument>({ arg_defaults }));
if (op.has_device_option())
generate_op.mutable_device_option()
->CopyFrom(op.device_option());
......@@ -211,25 +211,65 @@ void GraphGradientMaker::Make(
} \
*op->mutable_output(ix) = temp_grad;}
void GraphGradientMaker::Share(GraphDef& graph) {
GraphDef GraphGradientMaker::Share(const GraphDef& input_def) {
Set<int> invalid_ops;
Map<string, int> ref_count;
Map< string, pair<int, string> > ssa_map;
// Count the refs for detecting leaf nodes
for (int i = 0; i < graph.op_size(); ++i) {
const OperatorDef& op = graph.op(i);
for (int i = 0; i < input_def.op_size(); ++i) {
const OperatorDef& op = input_def.op(i);
// Ignore the non-gradient ops
if (op.type().find("Gradient") == string::npos) continue;
if (op.type() == "GradientGather" &&
ignore_grads_.count(op.output(0))) {
for (auto& input : op.input())
if (op.type() == "GradientGather") {
invalid_ops.insert(i);
if (ignore_grads_.count(op.output(0))) {
for (const auto& input : op.input())
ignore_grads_.insert(input);
invalid_ops.insert(i); continue;
continue;
} else {
string head;
for (const auto& input : op.input()) {
if (input != "NULL") {
if (head.empty()) head = input;
ssa_map[input] = { i, head };
}
}
}
for (auto& input : op.input())
}
for (const auto& input : op.input())
if (input.find("grad") != string::npos)
ref_count[input] += 1;
}
// Decompose the GradientGather in SSA format
GraphDef output_def(input_def); output_def.clear_op();
for (int i = 0; i < input_def.op_size(); ++i) {
if (invalid_ops.count(i)) continue;
const OperatorDef& op = input_def.op(i);
output_def.add_op()->CopyFrom(op);
if (op.type().find("Gradient") == string::npos) continue;
for (const auto& output : op.output()) {
const auto& find_iter = ssa_map.find(output);
if (find_iter != ssa_map.end()) {
const OperatorDef& gather_op =
input_def.op(find_iter->second.first);
OperatorDef acc_op(gather_op);
acc_op.clear_input();
if (output != find_iter->second.second) {
acc_op.set_type("GradientAdd");
// Fake an inplace to avoid a new buffer
acc_op.add_input(gather_op.output(0));
const auto& ref_iter = ref_count.find(
gather_op.output(0));
if (ref_iter != ref_count.end())
ref_iter->second++;
}
acc_op.add_input(output);
output_def.add_op()->CopyFrom(acc_op);
}
}
}
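The loop above turns each multi-input GradientGather into an SSA-style chain: the head input initializes the accumulator, and every later input becomes an in-place GradientAdd. A simplified sketch of that rewrite (the real code interleaves the emitted ops right after their producers; names here are illustrative):

#include <iostream>
#include <string>
#include <vector>

struct Op { std::string type; std::vector<std::string> inputs, outputs; };

// Rewrite: acc = gather(g0, g1, ..., gN-1)
//   -> gather(acc <- g0); acc = add(acc, g1); ...; acc = add(acc, gN-1)
std::vector<Op> Decompose(const Op& gather) {
    std::vector<Op> ops;
    const std::string& acc = gather.outputs[0];
    bool first = true;
    for (const auto& g : gather.inputs) {
        if (g == "NULL") continue;
        if (first) {
            ops.push_back({"GradientGather", {g}, {acc}});  // initializes acc
            first = false;
        } else {
            // Fake an in-place input so the accumulator buffer is reused
            ops.push_back({"GradientAdd", {acc, g}, {acc}});
        }
    }
    return ops;
}

int main() {
    Op gather{"GradientGather",
        {"w_grad/a", "NULL", "w_grad/b", "w_grad/c"}, {"w_grad"}};
    for (const auto& op : Decompose(gather)) {
        std::cout << op.type << " -> " << op.outputs[0] << " (";
        for (const auto& in : op.inputs) std::cout << " " << in;
        std::cout << " )\n";
    }
    return 0;
}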
// Prepare the Gradients Pool
int temporary_idx = 0;
Map<string, string> temporary_grads;
......@@ -240,7 +280,7 @@ void GraphGradientMaker::Share(GraphDef& graph) {
std::to_string(temporary_idx++);
} else {
/*!
* *LIFO* is more memory efficent than *FIFO* usually,
* LIFO is usually more memory efficient than FIFO,
* because the larger gradients are brought out later.
*
* Memory distribution turns out to be uniform,
......@@ -252,12 +292,10 @@ void GraphGradientMaker::Share(GraphDef& graph) {
}
};
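The pool above hands out shared gradient buffers by name; freed names are pushed to the back and reused from the back, giving the LIFO behavior noted in the comment. A minimal sketch of such a pool (the buffer-name prefix is made up here):

#include <deque>
#include <iostream>
#include <string>

class GradsPool {
 public:
    std::string Get() {
        if (pool_.empty())
            return "/share/buffer/grad:" + std::to_string(index_++);
        std::string name = pool_.back();  // LIFO: take the newest release
        pool_.pop_back();
        return name;
    }
    void Put(const std::string& name) { pool_.push_back(name); }

 private:
    std::deque<std::string> pool_;
    int index_ = 0;
};

int main() {
    GradsPool pool;
    auto a = pool.Get(), b = pool.Get();  // grad:0, grad:1
    pool.Put(a); pool.Put(b);
    std::cout << pool.Get() << "\n";      // grad:1 is reused first
    return 0;
}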
for (int i = 0; i < graph.op_size(); ++i) {
OperatorDef* op = graph.mutable_op(i);
for (int i = 0; i < output_def.op_size(); ++i) {
OperatorDef* op = output_def.mutable_op(i);
// Ignore the non-gradient ops
if (op->type().find("Gradient") == string::npos) continue;
// Ignore the invalid ops
if (invalid_ops.count(i)) { op->mutable_type()->clear(); continue; }
// GC to store the grads whose lifecycle has finished
vector<string> GC;
// Inplace-aware
......@@ -284,9 +322,12 @@ void GraphGradientMaker::Share(GraphDef& graph) {
// Determine the scanning order
bool left = true;
static Set<string> ROrderOps = {
"ConcatGradient", "StackGradient",
"RAddGradient", "RSubGradient",
"RMulGradient", "RDivGradient",
"RAddGradient",
"RSubGradient",
"RMulGradient",
"RDivGradient",
"StackGradient",
"ConcatGradient",
};
if (ROrderOps.count(op->type())) left = false;
// Check output grads, left order
......@@ -296,6 +337,7 @@ void GraphGradientMaker::Share(GraphDef& graph) {
// Update the pool from GC
for (auto& e : GC) grads_pool.emplace_back(e);
}
return output_def;
}
} // namespace dragon
\ No newline at end of file
......@@ -16,7 +16,7 @@ GraphDef GraphOptimizer::PruneNodes(const GraphDef& input_def) {
const OperatorDef& op = input_def.op(i);
for (const auto& v : op.output()) {
vector<string> sp_u;
if (!op.input_size()) sp_u.resize(op.output_size(), "");
if (!op.input_size()) sp_u.resize(op.output_size());
else sp_u.assign(op.input().begin(), op.input().end());
for (const auto& u : sp_u) {
if (u == "NULL") continue;
......@@ -55,7 +55,7 @@ GraphDef GraphOptimizer::PruneNodes(const GraphDef& input_def) {
// Remove the tensors that cannot be produced (redundant)
Set<string> outputs;
// Check whether there are fed tensors
for (const auto& e : ws_->GetTensors()) outputs.insert(e);
for (const auto& e : ws_->tensors()) outputs.insert(e);
// Note that we use map to keep topo-order
map<int, OperatorDef> final_sequence;
......@@ -114,7 +114,7 @@ GraphDef GraphOptimizer::AddInplace(const GraphDef& input_def) {
const OperatorDef& op = input_def.op(i);
for (const auto& v : op.output()) {
vector<string> sp_u;
if (!op.input_size()) sp_u.resize(op.output_size(), "");
if (!op.input_size()) sp_u.resize(op.output_size());
else sp_u.assign(op.input().begin(), op.input().end());
for (const auto& u : sp_u) {
if (u == "NULL") continue;
......@@ -224,7 +224,6 @@ GraphDef GraphOptimizer::MirrorStage(
}
CHECK(!v2_name.empty()) << "\nNot enough buffers for outputs.";
ws_->CreateTensor(v2_name)->set_version(0);
if (!versions.count(v2_name)) versions[v2_name] = 0;
version_name = "/ver:" + std::to_string(versions[v2_name]++);
*op_v2->mutable_output(j) = rename_map[op.output(j)] =
v2_name + version_name;
......@@ -248,8 +247,6 @@ GraphDef GraphOptimizer::MirrorStage(
set<int> minimum_ops = {i};
for (int j = 0; j < input_op.input_size(); ++j) {
if (input_op.input(j) != output_op.input(j)) {
if (!fake_op_indices.count(input_op.input(j)))
fake_op_indices[input_op.input(j)] = set<int>();
for (auto idx : fake_op_indices[input_op.input(j)])
minimum_ops.insert(idx);
}
......@@ -262,7 +259,6 @@ GraphDef GraphOptimizer::MirrorStage(
// Bind to the renamed tensors
for (const auto& it : rename_map) {
op_indices[it.second] = vector<int>();
for (auto op_idx : fake_op_indices[it.first])
op_indices[it.second].push_back(op_idx);
}
......
......@@ -6,7 +6,7 @@ namespace dragon {
/*! Create some internal tensors */
void Workspace::InitWorkspace() {
void Workspace::Initialize() {
CreateTensor("NULL");
Tensor* recomputing_flag = CreateTensor(
"/opt/recomputing_flag")->Reshape({ 1 });
......@@ -14,21 +14,18 @@ void Workspace::InitWorkspace() {
<bool, CPUContext>()[0] = false;
}
/*! Move a external workspace into this workspace */
/*! Destroy all the tensors */
Workspace* Workspace::Move(Workspace* ws) {
CHECK(ws) << "The given Workspace is invalid.";
if (workspace_map_.count(ws->name()))
return workspace_map_[ws->name()];
return workspace_map_[ws->name()] = ws;
void Workspace::Clear() {
// Remove and Initialize again
tensor_map_.clear(); Initialize();
}
/*! Destory all the tensors */
/*! Merge from an external workspace */
void Workspace::Clear() {
// Clear tensors, then re-initialization
for (auto& kv : tensor_map_) kv.second->Reset();
InitWorkspace();
void Workspace::MergeFrom(Workspace* ws) {
CHECK(ws) << "\nThe given Workspace is invalid.";
remote_workspaces_.emplace_back(ws);
}
/*! Query the real name of specified tensor */
......@@ -53,9 +50,9 @@ Tensor* Workspace::TryGetTensor(
if (use_remote) {
// Search the remote workspaces
for (auto& it : workspace_map_) {
if (it.second->HasTensor(query))
return it.second->GetTensor(query);
for (auto* ws : remote_workspaces_) {
if (ws->HasTensor(query))
return ws->GetTensor(query);
}
}
return nullptr;
......@@ -66,7 +63,8 @@ Tensor* Workspace::TryGetTensor(
Tensor* Workspace::CreateTensor(const string& name) {
Tensor* tensor = TryGetTensor(name);
if (!tensor) {
tensor_map_[name] = unique_ptr<Tensor>(new Tensor(name));
tensor_map_[name] = unique_ptr
<Tensor>(new Tensor(name));
return tensor_map_[name].get();
}
return tensor;
......@@ -78,8 +76,8 @@ Tensor* Workspace::GetTensor(
const string& name,
bool use_remote) const {
Tensor* tensor = TryGetTensor(name, use_remote);
CHECK(tensor) << "\nTensor(" << name << ") does not exist "
<< "in current workspace or sub-workspace.";
CHECK(tensor) << "\nTensor(" << name << ") does not "
<< "exist in current workspace.";
return tensor;
}
......@@ -88,22 +86,23 @@ Tensor* Workspace::GetTensor(
void Workspace::ResetTensor(const string& name) {
Tensor* tensor = TryGetTensor(name, false);
CHECK(tensor) << "\nTensor(" << name << ") does not "
<< "belong to current workspace, could not be reset.";
<< "belong to current workspace.";
tensor->Reset();
}
/*! Return all the stored tensor names */
/*! Return the name of stored tensors */
vector<string> Workspace::GetTensors() const {
vector<string> Workspace::tensors() const {
vector<string> locals;
// Search the local workspace
for (const auto& it : tensor_map_)
locals.push_back(it.first);
// Search the remote workspaces
for (const auto& it : workspace_map_) {
vector<string> remotes = it.second->GetTensors();
locals.insert(locals.end(), remotes.begin(), remotes.end());
for (auto* ws : remote_workspaces_) {
vector<string> remotes = ws->tensors();
locals.insert(locals.end(),
remotes.begin(), remotes.end());
}
return locals;
}
......@@ -118,14 +117,14 @@ bool Workspace::HasFiller(
if (!use_remote) return result;
// Search the remote workspaces
for (auto& it : workspace_map_)
result |= it.second->HasFiller(name);
for (auto* ws : remote_workspaces_)
result |= ws->HasFiller(name);
return result;
}
/*! Create the specified filler */
void Workspace::CreateFiller(
const TensorFillerProto filler) {
const TensorFillerProto& filler) {
CHECK_GT(filler.tensor().size(), 0)
<< "\nTensor with an empty name can not be filled.";
if (HasFiller(filler.tensor())) return;
......@@ -141,9 +140,9 @@ const TensorFillerProto* Workspace::GetFiller(
if (it != tensor_filler_map_.end()) return &it->second;
// Search the remote workspaces
for (const auto& it : workspace_map_) {
if (it.second->HasFiller(name))
return it.second->GetFiller(name);
for (auto* ws : remote_workspaces_) {
if (ws->HasFiller(name))
return ws->GetFiller(name);
}
return nullptr;
}
......@@ -153,7 +152,6 @@ const TensorFillerProto* Workspace::GetFiller(
OperatorBase* Workspace::CreateOperator(const OperatorDef& def) {
const auto& it = operator_map_.find(def.uid());
if (it == operator_map_.end()) {
for (auto& input : def.input()) CreateTensor(input);
auto* new_op = NewOperator(def, this);
operator_map_[def.uid()] = unique_ptr<
OperatorBase>(new_op); return new_op;
......@@ -209,9 +207,9 @@ void Workspace::RunGraph(
graph_map_[graph_name]->Run(include, exclude, stream_id);
}
/*! Return all the stored graph names */
/*! Return the name of stored graphs */
vector<string> Workspace::GetGraphs() const {
vector<string> Workspace::graphs() const {
vector<string> names;
for (const auto& it : graph_map_) {
names.push_back(it.first);
......@@ -237,17 +235,20 @@ string Workspace::GetDummyName(
const string& suffix,
const string& domain,
const bool zero_based) {
string required_name = base_name + suffix;
if (dummy_name_map_.count(domain) == 0) {
dummy_name_map_[domain] = Map<string, int64_t>();
}
auto& map_this_domain = dummy_name_map_[domain];
int64_t index = map_this_domain[required_name]++;
return index ? base_name + "_" +
string accepted_name; int64_t index;
const auto required_name = base_name + suffix;
auto& dmap = dummy_name_map_[domain];
while (1) {
index = dmap[required_name]++;
accepted_name = index ? base_name + "_" +
std::to_string(index) + suffix :
zero_based ? required_name :
base_name + "_" + std::to_string(
map_this_domain[required_name]++) + suffix;
dmap[required_name]++) + suffix;
if (remote_workspaces_.empty()) break;
if (!HasTensor(accepted_name)) break;
}
return accepted_name;
}
} // namespace dragon
\ No newline at end of file
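GetDummyName above keeps a per-domain counter for base_name + suffix and, once remote workspaces have been merged in, keeps advancing the counter until the candidate no longer collides with an existing tensor. A simplified sketch of that scheme; unlike the real code it always checks for collisions, and the class and names are illustrative:

#include <cstdint>
#include <iostream>
#include <set>
#include <string>
#include <unordered_map>

class DummyNamer {
 public:
    // zero_based: the very first request may keep "base + suffix" unchanged.
    std::string Get(const std::string& base, const std::string& suffix,
                    bool zero_based = true) {
        const std::string key = base + suffix;
        while (true) {
            int64_t index = counters_[key]++;
            std::string name = index
                ? base + "_" + std::to_string(index) + suffix
                : (zero_based ? key : base + "_" +
                      std::to_string(counters_[key]++) + suffix);
            if (!existing_.count(name)) {
                existing_.insert(name);
                return name;
            }
        }
    }
    void Reserve(const std::string& name) { existing_.insert(name); }

 private:
    std::unordered_map<std::string, int64_t> counters_;
    std::set<std::string> existing_;
};

int main() {
    DummyNamer namer;
    namer.Reserve("conv_1/W");  // pretend a merged workspace already owns it
    std::cout << namer.Get("conv", "/W") << "\n";  // conv/W
    std::cout << namer.Get("conv", "/W") << "\n";  // conv_2/W (conv_1/W is taken)
    return 0;
}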
......@@ -117,17 +117,19 @@ template<> void PReluWGrad<float, CPUContext>(
}
}
if (channel_shared) {
math::Dot<float, CPUContext>(channels * dim,
math::Dot(channels * dim,
bcast_dw, multiplier, dw, ctx);
} else {
if (data_format == "NCHW") {
math::Gemv<float, CPUContext>(
CblasNoTrans, channels, dim,
math::Gemv(
CblasNoTrans,
channels, dim,
1.f, bcast_dw, multiplier,
0.f, dw, ctx);
} else if (data_format == "NHWC") {
math::Gemv<float, CPUContext>(
CblasTrans, dim, channels,
math::Gemv(
CblasTrans,
dim, channels,
1.f, bcast_dw, multiplier,
0.f, dw, ctx);
} else LOG(FATAL) << "Unknown data format: " << data_format;
......
......@@ -204,17 +204,19 @@ template<> void PReluWGrad<float, CUDAContext>(
0, ctx->cuda_stream() >> >
(cdim, rows, row_offset, dy, x, bcast_dw);
if (channel_shared) {
math::Dot<float, CUDAContext>(channels * dim,
math::Dot(channels * dim,
bcast_dw, multiplier, dw, ctx);
} else {
if (data_format == "NCHW") {
math::Gemv<float, CUDAContext>(
CblasNoTrans, channels, dim,
math::Gemv(
CblasNoTrans,
channels, dim,
1.f, bcast_dw, multiplier,
0.f, dw, ctx);
} else if (data_format == "NHWC") {
math::Gemv<float, CUDAContext>(
CblasTrans, dim, channels,
CblasTrans,
dim, channels,
1.f, bcast_dw, multiplier,
0.f, dw, ctx);
} else LOG(FATAL) << "Unknown data format: " << data_format;
......
......@@ -28,17 +28,20 @@ template<> void Softmax<float, CPUContext>(
scale[k], x[i * dim + j * inner_dim + k]
);
}
math::Gemm<float, CPUContext>(
CblasNoTrans, CblasNoTrans,
math::Gemm(
CblasNoTrans,
CblasNoTrans,
classes, inner_dim, 1,
-1.f, sum_multiplier, scale, 1.f, y, ctx);
math::Exp<float, CPUContext>(dim, y, y, ctx);
math::Gemv<float, CPUContext>(
CblasTrans, classes, inner_dim,
-1.f, sum_multiplier, scale,
1.f, y, ctx);
math::Exp(dim, y, y, ctx);
math::Gemv(
CblasTrans,
classes, inner_dim,
1.f, y, sum_multiplier,
0.f, scale, ctx);
for (int j = 0; j < classes; ++j) {
math::Div<float, CPUContext>(inner_dim, y, scale, y, ctx);
math::Div(inner_dim, y, scale, y, ctx);
y += inner_dim;
}
}
......
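The GEMM/GEMV calls above implement the usual numerically stable softmax over the class axis of an (outer, classes, inner) layout: subtract the per-position max, exponentiate, then normalize by the per-position sum. A plain scalar sketch of the same computation:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <limits>
#include <vector>

void Softmax(int outer, int classes, int inner, const float* x, float* y) {
    for (int i = 0; i < outer; ++i) {
        for (int k = 0; k < inner; ++k) {
            float max_val = -std::numeric_limits<float>::infinity();
            for (int j = 0; j < classes; ++j)
                max_val = std::max(max_val, x[(i * classes + j) * inner + k]);
            float sum = 0.f;
            for (int j = 0; j < classes; ++j) {
                int idx = (i * classes + j) * inner + k;
                y[idx] = std::exp(x[idx] - max_val);  // shift for stability
                sum += y[idx];
            }
            for (int j = 0; j < classes; ++j)
                y[(i * classes + j) * inner + k] /= sum;
        }
    }
}

int main() {
    std::vector<float> x = {1.f, 2.f, 3.f}, y(3);
    Softmax(1, 3, 1, x.data(), y.data());
    for (float v : y) std::printf("%.4f ", v);  // 0.0900 0.2447 0.6652
    std::printf("\n");
    return 0;
}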
......@@ -8,7 +8,6 @@ namespace kernel {
/*! BiasAdd <T = float32, Device = CPU> */
template<> void BiasAdd<float, CPUContext>(
const int count,
const int outer_dim,
const int dim,
const int inner_dim,
......
......@@ -11,38 +11,37 @@ namespace kernel {
template <typename T>
__global__ void _BiasAdd_NCHW(
const int count,
const int nthreads,
const int dim,
const int inner_dim,
const T* bias,
T* y) {
CUDA_1D_KERNEL_LOOP(idx, count) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 350
y[idx] += __ldg(bias + ((idx / inner_dim) % dim));
y[i] += __ldg(bias + ((i / inner_dim) % dim));
#else
y[idx] += bias[(idx / inner_dim) % dim];
y[i] += bias[(i / inner_dim) % dim];
#endif
}
}
template <typename T>
__global__ void _BiasAdd_NHWC(
const int count,
const int nthreads,
const int dim,
const int inner_dim,
const T* bias,
T* y) {
CUDA_1D_KERNEL_LOOP(idx, count) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 350
y[idx] += __ldg(bias + (idx % dim));
y[i] += __ldg(bias + (i % dim));
#else
y[idx] += bias[idx % dim];
y[i] += bias[i % dim];
#endif
}
}
template<> void BiasAdd<float, CUDAContext>(
const int count,
const int outer_dim,
const int dim,
const int inner_dim,
......@@ -51,16 +50,17 @@ template<> void BiasAdd<float, CUDAContext>(
const float* bias_multiplier,
float* y,
CUDAContext* ctx) {
auto nthreads = outer_dim * dim * inner_dim;
if (data_format == "NCHW") {
_BiasAdd_NCHW<float>
<< < CUDA_BLOCKS(count), CUDA_THREADS,
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >
(count, dim, inner_dim, bias, y);
(nthreads, dim, inner_dim, bias, y);
} else if (data_format == "NHWC") {
_BiasAdd_NHWC<float>
<< < CUDA_BLOCKS(count), CUDA_THREADS,
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >
(count, dim, inner_dim, bias, y);
(nthreads, dim, inner_dim, bias, y);
} else LOG(FATAL) << "Unknown data format: " << data_format;
}
......
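The kernels above flatten the output to nthreads = outer_dim * dim * inner_dim elements and recover the channel of flat index i as (i / inner_dim) % dim for NCHW and i % dim for NHWC, where channels are innermost. A CPU sketch of the same index math:

#include <cstdio>
#include <string>
#include <vector>

void BiasAdd(int outer_dim, int dim, int inner_dim,
             const std::string& data_format,
             const float* bias, float* y) {
    int nthreads = outer_dim * dim * inner_dim;
    for (int i = 0; i < nthreads; ++i) {
        int c = data_format == "NCHW" ? (i / inner_dim) % dim : i % dim;
        y[i] += bias[c];
    }
}

int main() {
    // 1 image, 2 channels, 3 spatial positions
    std::vector<float> y_nchw(6, 0.f), y_nhwc(6, 0.f), bias = {1.f, 10.f};
    BiasAdd(1, 2, 3, "NCHW", bias.data(), y_nchw.data());
    BiasAdd(1, 2, 3, "NHWC", bias.data(), y_nhwc.data());
    for (float v : y_nchw) std::printf("%g ", v);  // 1 1 1 10 10 10
    std::printf("\n");
    for (float v : y_nhwc) std::printf("%g ", v);  // 1 10 1 10 1 10
    std::printf("\n");
    return 0;
}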
......@@ -21,8 +21,10 @@ void CuDNNSoftmaxOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnSoftmaxForward(ctx()->cudnn_handle(),
CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL,
CUDNN_CHECK(cudnnSoftmaxForward(
ctx()->cudnn_handle(),
CUDNN_SOFTMAX_ACCURATE,
CUDNN_SOFTMAX_MODE_CHANNEL,
CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::zero, output_desc, Ydata));
}
......@@ -52,8 +54,10 @@ void CuDNNSoftmaxGradientOp<Context>::RunWithType() {
auto* dYdata = Input(-1).template data<T, Context>();
auto* Ydata = Input(0).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnSoftmaxBackward(ctx()->cudnn_handle(),
CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL,
CUDNN_CHECK(cudnnSoftmaxBackward(
ctx()->cudnn_handle(),
CUDNN_SOFTMAX_ACCURATE,
CUDNN_SOFTMAX_MODE_CHANNEL,
CUDNNType<T>::one, input_desc, Ydata, input_desc, dYdata,
CUDNNType<T>::zero, output_desc, dXdata));
}
......
......@@ -107,14 +107,16 @@ void AffineGradientOp<Context>::ComputeScaleGradient(
dA : ws()->template caches<T, Context>(
{ outer_dim * scale_dim })[0];
math::Gemv(
CblasNoTrans, outer_dim * scale_dim, inner_dim,
CblasNoTrans,
outer_dim * scale_dim, inner_dim,
1.f, dYxX, multiplier,
0.f, SRes_data, ctx());
}
// Reduce outer dimensions
if (outer_dim != 1) {
math::Gemv(
CblasTrans, outer_dim, scale_dim,
CblasTrans,
outer_dim, scale_dim,
1.f, SRes_data, multiplier,
0.f, dA, ctx());
}
......
......@@ -153,7 +153,8 @@ void CuDNNAffineGradientOp<Context>::ComputeScaleGradient(
CUDNN_CHECK(cudnnSetReduceTensorDescriptor(
reduce_desc, CUDNN_REDUCE_TENSOR_ADD,
CUDNNType<CT>::type, CUDNN_PROPAGATE_NAN,
CUDNN_REDUCE_TENSOR_NO_INDICES, CUDNN_32BIT_INDICES));
CUDNN_REDUCE_TENSOR_NO_INDICES,
CUDNN_32BIT_INDICES));
size_t workspace_size = 0;
CUDNN_CHECK(cudnnGetReductionWorkspaceSize(
ctx()->cudnn_handle(), reduce_desc,
......@@ -181,14 +182,16 @@ void CuDNNAffineGradientOp<Context>::ComputeScaleGradient_v2(
dA : ws()->template caches<T, Context>(
{ outer_dim * scale_dim })[0];
math::Gemv(
CblasNoTrans, outer_dim * scale_dim, inner_dim,
CblasNoTrans,
outer_dim * scale_dim, inner_dim,
1.f, dYxX, multiplier,
0.f, SRes_data, ctx());
}
// Reduce outer dimensions
if (outer_dim != 1) {
math::Gemv(
CblasTrans, outer_dim, scale_dim,
CblasTrans,
outer_dim, scale_dim,
1.f, SRes_data, multiplier,
0.f, dA, ctx());
}
......
......@@ -65,8 +65,10 @@ void DotOp<Context>::GemvRunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Gemv(
transA ? CblasTrans : CblasNoTrans, M1, N1,
1.f, X1data, X2data, 0.f, Ydata, ctx());
transA ? CblasTrans : CblasNoTrans,
M1, N1,
1.f, X1data, X2data,
0.f, Ydata, ctx());
}
template <class Context>
......@@ -149,13 +151,15 @@ void DotGradientOp<Context>::GemmRunWithType() {
auto* dX1data = Output(0)->template mutable_data<T, Context>();
if (transA) {
math::Gemm(
transB ? CblasTrans : CblasNoTrans, CblasTrans,
transB ? CblasTrans : CblasNoTrans,
CblasTrans,
K1, M, N,
1.f, X2data, dYdata,
0.f, dX1data, ctx());
} else {
math::Gemm(
CblasNoTrans, transB ? CblasNoTrans : CblasTrans,
CblasNoTrans,
transB ? CblasNoTrans : CblasTrans,
M, K1, N,
1.f, dYdata, X2data,
0.f, dX1data, ctx());
......@@ -166,13 +170,15 @@ void DotGradientOp<Context>::GemmRunWithType() {
auto* dX2data = Output(1)->template mutable_data<T, Context>();
if (transB) {
math::Gemm(
CblasTrans, transA ? CblasTrans : CblasNoTrans,
CblasTrans,
transA ? CblasTrans : CblasNoTrans,
N, K1, M,
1.f, dYdata, X1data,
0.f, dX2data, ctx());
} else {
math::Gemm(
transA ? CblasNoTrans : CblasTrans, CblasNoTrans,
transA ? CblasNoTrans : CblasTrans,
CblasNoTrans,
K1, N, M,
1.f, X1data, dYdata,
0.f, dX2data, ctx());
......@@ -197,7 +203,8 @@ void DotGradientOp<Context>::GemvRunWithType() {
auto* dX2data = Output(1)->template mutable_data<T, Context>();
math::Gemm(
CblasNoTrans, CblasNoTrans,
CblasNoTrans,
CblasNoTrans,
M, N, 1,
1.f, dYdata, X2data,
0.f, dX1data, ctx());
......
......@@ -28,7 +28,8 @@ void FullyConnectedOp<Context>::TransRunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Gemm(
CblasNoTrans, CblasTrans,
CblasNoTrans,
CblasTrans,
M, N, K,
1.f, Xdata, Wdata,
0.f, Ydata, ctx());
......@@ -37,7 +38,8 @@ void FullyConnectedOp<Context>::TransRunWithType() {
DECLARE_MULTIPLIER(multiplier, M);
auto* Bdata = Input(2).template data<T, Context>();
math::Gemm(
CblasNoTrans, CblasNoTrans,
CblasNoTrans,
CblasNoTrans,
M, N, 1,
1.f, multiplier, Bdata,
1.f, Ydata, ctx());
......@@ -61,7 +63,8 @@ void FullyConnectedOp<Context>::NoTransRunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Gemm(
CblasNoTrans, CblasNoTrans,
CblasNoTrans,
CblasNoTrans,
M, N, K,
1.f, Xdata, Wdata,
0.f, Ydata, ctx());
......@@ -70,7 +73,8 @@ void FullyConnectedOp<Context>::NoTransRunWithType() {
DECLARE_MULTIPLIER(multiplier, M);
auto* Bdata = Input(2).template data<T, Context>();
math::Gemm(
CblasNoTrans, CblasNoTrans,
CblasNoTrans,
CblasNoTrans,
M, N, 1,
1.f, multiplier, Bdata,
1.f, Ydata, ctx());
......@@ -127,13 +131,15 @@ void FullyConnectedGradientOp<Context>::RunWithType() {
auto* dWdata = Output(1)->template mutable_data<T, Context>();
if (transW) {
math::Gemm(
CblasTrans, CblasNoTrans,
CblasTrans,
CblasNoTrans,
N, K, M,
1.f, dYdata, Xdata,
0.f, dWdata, ctx());
} else {
math::Gemm(
CblasTrans, CblasNoTrans,
CblasTrans,
CblasNoTrans,
K, N, M,
1.f, Xdata, dYdata,
0.f, dWdata, ctx());
......@@ -145,7 +151,8 @@ void FullyConnectedGradientOp<Context>::RunWithType() {
Output(2)->Reshape({ N });
auto* dBdata = Output(2)->template mutable_data<T, Context>();
math::Gemv(
CblasTrans, M, N,
CblasTrans,
M, N,
1.f, dYdata, multiplier,
0.f, dBdata, ctx());
}
......@@ -155,13 +162,15 @@ void FullyConnectedGradientOp<Context>::RunWithType() {
auto* dXdata = Output(0)->template mutable_data<T, Context>();
if (transW) {
math::Gemm(
CblasNoTrans, CblasNoTrans,
CblasNoTrans,
CblasNoTrans,
M, K, N,
1.f, dYdata, Wdata,
0.f, dXdata, ctx());
} else {
math::Gemm(
CblasNoTrans, CblasTrans,
CblasNoTrans,
CblasTrans,
M, K, N,
1.f, dYdata, Wdata,
0.f, dXdata, ctx());
......
......@@ -9,7 +9,8 @@ void GramMatrixOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
for (int i = 0; i < outer_dim; i++) {
math::Gemm(
CblasNoTrans, CblasTrans,
CblasNoTrans,
CblasTrans,
dim, dim, inner_dim,
1.f, Xdata, Xdata,
0.f, Ydata, ctx());
......@@ -44,7 +45,8 @@ void GramMatrixGradientOp<Context>::RunWithType() {
auto* dXdata = Output(0)->template mutable_data<T, Context>();
for (int i = 0; i < outer_dim; i++) {
math::Gemm(
CblasNoTrans, CblasNoTrans,
CblasNoTrans,
CblasNoTrans,
dim, inner_dim, dim,
2.f, dYdata, Xdata,
0.f, dXdata, ctx());
......
......@@ -33,6 +33,7 @@ void MultinomialOp<Context>::RunWithType() {
double running_total, r;
int idx = 0, num_classes = Input(0).dim(axis);
auto* rng = ctx()->rand_generator();
for (int i = 0; i < outer_dim; ++i) {
......@@ -47,7 +48,9 @@ void MultinomialOp<Context>::RunWithType() {
r = dist(*rng);
auto found_iter = std::upper_bound(
Sdata, Sdata + num_classes, r);
Ydata[idx++] = std::distance(Sdata, found_iter);
Ydata[idx++] = std::min(
(int)std::distance(Sdata,
found_iter), num_classes - 1);
}
Xdata += num_classes;
}
......
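The sampling above is plain inverse-CDF sampling: accumulate the class weights, draw r uniformly below the running total, locate the class with std::upper_bound, and clamp to num_classes - 1 so round-off at the upper edge cannot yield an out-of-range index. A stand-alone sketch:

#include <algorithm>
#include <cstdio>
#include <numeric>
#include <random>
#include <vector>

int SampleMultinomial(const std::vector<double>& weights, std::mt19937& rng) {
    std::vector<double> cdf(weights.size());
    std::partial_sum(weights.begin(), weights.end(), cdf.begin());
    std::uniform_real_distribution<double> dist(0.0, cdf.back());
    double r = dist(rng);
    auto it = std::upper_bound(cdf.begin(), cdf.end(), r);
    int idx = (int)std::distance(cdf.begin(), it);
    return std::min(idx, (int)weights.size() - 1);  // guard the upper edge
}

int main() {
    std::mt19937 rng(0);
    std::vector<double> weights = {0.1, 0.2, 0.7};
    std::vector<int> hist(3, 0);
    for (int i = 0; i < 10000; ++i) ++hist[SampleMultinomial(weights, rng)];
    for (int c = 0; c < 3; ++c)
        std::printf("class %d: %d\n", c, hist[c]);  // roughly 1000 / 2000 / 7000
    return 0;
}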
......@@ -10,9 +10,9 @@ void GradientGenerateOp<Context>::RunWithType() {
for (int i = 0; i < OutputSize(); i++) {
if (Output(i)->name() == "NULL") continue;
Output(i)->ReshapeLike(Input(i));
auto* dXdata = Output(0)->template mutable_data<T, Context>();
math::Set(Output(0)->count(),
cast::to<T>(defaults[i]), dXdata, ctx());
auto v = cast::to<T>(defaults[i]);
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Set(Output(0)->count(), v, Ydata, ctx());
}
}
......@@ -40,30 +40,29 @@ OPERATOR_SCHEMA(GradientGenerate);
template <class Context> template <typename T>
void GradientGatherOp<Context>::RunWithType() {
auto* dXdata = Output(0)->template mutable_data<T, Context>();
int64_t count = Output(0)->count();
auto* Y = Output(0)->template mutable_data<T, Context>();
if (indices.size() == 1) {
auto* dYdata = Input(indices[0]).template data<T, Context>();
ctx()->template Copy<T, Context, Context>(count, dXdata, dYdata);
auto* X = Input(indices[0]).template data<T, Context>();
ctx()->template Copy<T, Context, Context>(count, Y, X);
} else if(indices.size() == 2) {
CHECK_EQ(count, Input(indices[1]).count());
auto* dY1data = Input(indices[0]).template data<T, Context>();
auto* dY2data = Input(indices[1]).template data<T, Context>();
math::Add(count, dY1data, dY2data, dXdata, ctx());
auto* X1 = Input(indices[0]).template data<T, Context>();
auto* X2 = Input(indices[1]).template data<T, Context>();
math::Add(count, X1, X2, Y, ctx());
} else {
size_t dy_idx = 1;
auto* dYdata = Input(indices[0]).template data<T, Context>();
ctx()->template Copy<T, Context, Context>(count, dXdata, dYdata);
while (dy_idx < indices.size()) {
if (indices.size() - dy_idx >= 2) {
auto* dY1data = Input(indices[dy_idx]).template data<T, Context>();
auto* dY2data = Input(indices[dy_idx + 1]).template data<T, Context>();
kernel::GradientTwoSum(count, dY1data, dY2data, dXdata, ctx());
dy_idx += 2;
size_t index = 1;
auto* X = Input(indices[0]).template data<T, Context>();
ctx()->template Copy<T, Context, Context>(count, Y, X);
while (index < indices.size()) {
if (indices.size() - index >= 2) {
auto* X1 = Input(indices[index]).template data<T, Context>();
auto* X2 = Input(indices[index + 1]).template data<T, Context>();
kernel::GradientTwoSum(count, X1, X2, Y, ctx());
index += 2;
} else {
dYdata = Input(indices[dy_idx]).template data<T, Context>();
math::Add(count, dXdata, dYdata, dXdata, ctx());
dy_idx += 1;
X = Input(indices[index]).template data<T, Context>();
math::Add(count, Y, X, Y, ctx()); break;
}
}
}
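The branches above reduce to: copy the first gradient into the output, then fold in the remaining ones two at a time (the fused GradientTwoSum path), with a single add for a trailing odd input. A scalar sketch of that accumulation order:

#include <cstdio>
#include <vector>

using Grad = std::vector<float>;

void Gather(const std::vector<Grad>& inputs, Grad& out) {
    out = inputs[0];                              // copy the first gradient
    size_t index = 1;
    while (index < inputs.size()) {
        if (inputs.size() - index >= 2) {         // fused two-way add
            for (size_t k = 0; k < out.size(); ++k)
                out[k] += inputs[index][k] + inputs[index + 1][k];
            index += 2;
        } else {                                  // trailing single add
            for (size_t k = 0; k < out.size(); ++k)
                out[k] += inputs[index][k];
            break;
        }
    }
}

int main() {
    std::vector<Grad> grads = {{1, 1}, {2, 2}, {3, 3}, {4, 4}};
    Grad sum;
    Gather(grads, sum);
    std::printf("%g %g\n", sum[0], sum[1]);  // 10 10
    return 0;
}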
......@@ -92,7 +91,39 @@ DEPLOY_CPU(GradientGather);
DEPLOY_CUDA(GradientGather);
#endif
OPERATOR_SCHEMA(GradientGather).NumOutputs(1);
NO_GRADIENT(GradientGather);
template <class Context> template <typename T>
void GradientAddOp<Context>::RunWithType() {
auto* X = Input(1).template data<T, Context>();
auto* Y = Output(0)->template mutable_data<T, Context>();
math::Add(Output(0)->count(), Y, X, Y, ctx());
}
template <class Context>
void GradientAddOp<Context>::RunOnDevice() {
CHECK_EQ(Input(0).name(), Output(0)->name())
<< "\nRequires X(0) == Y(0).";
if (XIsType(Input(0), int8_t)) RunWithType<int8_t>();
else if (XIsType(Input(0), uint8_t)) RunWithType<uint8_t>();
else if (XIsType(Input(0), int)) RunWithType<int>();
else if (XIsType(Input(0), int64_t)) RunWithType<int64_t>();
else if (XIsType(Input(0), float16)) RunWithType<float16>();
else if (XIsType(Input(0), float)) RunWithType<float>();
else if (XIsType(Input(0), double)) RunWithType<double>();
else LOG(FATAL) << DTypeHelper(Input(0), {
"int8", "uint8", "int32", "int64",
"float16", "float32", "float64",
});
}
DEPLOY_CPU(GradientAdd);
#ifdef WITH_CUDA
DEPLOY_CUDA(GradientAdd);
#endif
OPERATOR_SCHEMA(GradientAdd)
.NumInputs(2).NumOutputs(1)
.Inplace({ { 0, 0 } });
template <class Context>
void StopGradientOp<Context>::RunOnDevice() {
......
......@@ -37,19 +37,20 @@ void L2NormOp<Context>::RunWithType() {
math::Square(buffer.count(), Xdata, Bdata, ctx());
// Compute T1 = \sum_{i} x_{i,j}^{2}
math::Gemv(
CblasTrans, dim, inner_dim,
CblasTrans,
dim, inner_dim,
mode == "MEAN" ? 1.f / dim : 1.f, Bdata, Dmult,
1.f, Ndata, ctx());
// Compute T2 = \sqrt{T1}
math::Sqrt(inner_dim, Ndata, Ndata, ctx());
// Compute T3 = x / [(T2)]_{dim}
math::Gemm(
CblasNoTrans, CblasNoTrans,
CblasNoTrans,
CblasNoTrans,
dim, inner_dim, 1,
1.f, Dmult, Ndata,
0.f, Bdata, ctx());
math::Div(buffer.count(),
Xdata, Bdata, Ydata, ctx());
math::Div(buffer.count(), Xdata, Bdata, Ydata, ctx());
Ndata += inner_dim;
Xdata += buffer.count();
Ydata += buffer.count();
......@@ -101,12 +102,14 @@ void L2NormGradientOp<Context>::RunWithType() {
// Compute \sum_{i} x_{i, j}dy_{i, j}
math::Mul(buffer.count(), Xdata, dYdata, Bdata, ctx());
math::Gemv(
CblasTrans, dim, inner_dim,
CblasTrans,
dim, inner_dim,
mode == "MEAN" ? 1.f / dim : 1.f, Bdata, Dmult,
0.f, BInnerdata, ctx());
// Compute T1 = x[(\sum_{i} x_{i, j}dy_{i, j})]_{dim}
math::Gemm(
CblasNoTrans, CblasNoTrans,
CblasNoTrans,
CblasNoTrans,
dim, inner_dim, 1,
1.f, Dmult, BInnerdata,
0.f, Bdata, ctx());
......@@ -114,7 +117,8 @@ void L2NormGradientOp<Context>::RunWithType() {
// Compute T2 = T1 / Normalizer^{2}
math::Square(inner_dim, Ndata, BInnerdata, ctx());
math::Gemm(
CblasNoTrans, CblasNoTrans,
CblasNoTrans,
CblasNoTrans,
dim, inner_dim, 1,
1.f, Dmult, BInnerdata,
0.f, Bdata, ctx());
......@@ -122,7 +126,8 @@ void L2NormGradientOp<Context>::RunWithType() {
// Compute T3 = (dy - T2) / Normalizer
math::Sub(buffer.count(), dYdata, dXdata, dXdata, ctx());
math::Gemm(
CblasNoTrans, CblasNoTrans,
CblasNoTrans,
CblasNoTrans,
dim, inner_dim, 1,
1.f, Dmult, Ndata,
0.f, Bdata, ctx());
......
......@@ -48,18 +48,25 @@ void CuDNNRecurrentOpBase<Context>::ResetDesc() {
// Setup RNN
#if CUDNN_VERSION_MIN(7, 0, 0)
CUDNN_CHECK(cudnnSetRNNDescriptor(
ctx()->cudnn_handle(), rnn_desc,
hidden_size, num_layers,
ctx()->cudnn_handle(),
rnn_desc,
hidden_size,
num_layers,
dropout_desc,
rnn_input_mode, rnn_direction, rnn_mode,
rnn_input_mode,
rnn_direction,
rnn_mode,
CUDNN_RNN_ALGO_STANDARD,
CUDNNType<T>::type));
#else
CUDNN_CHECK(cudnnSetRNNDescriptor(
rnn_desc,
hidden_size, num_layers,
hidden_size,
num_layers,
dropout_desc,
rnn_input_mode, rnn_direction, rnn_mode,
rnn_input_mode,
rnn_direction,
rnn_mode,
CUDNNType<T>::type));
#endif
......@@ -68,8 +75,6 @@ void CuDNNRecurrentOpBase<Context>::ResetDesc() {
xs_desc->Set<T>({ batch_size, input_dim, 1 }, { input_dim, 1, 1 });
ys_desc.reset(new cudnnTensorDescriptors(seq_length));
ys_desc->Set<T>({ batch_size, output_dim, 1 }, { output_dim, 1, 1 });
CUDNN_CHECK(cudnnGetRNNWorkspaceSize(ctx()->cudnn_handle(),
rnn_desc, seq_length, xs_desc->descs(), &workspace_size));
output_dims = { seq_length, batch_size, output_dim };
// Setup Hx & Cx & Hy & Cy
......@@ -82,8 +87,10 @@ void CuDNNRecurrentOpBase<Context>::ResetDesc() {
// Setup packed weights
size_t weights_size; int64_t weights_count;
CUDNN_CHECK(cudnnGetRNNParamsSize(
ctx()->cudnn_handle(), rnn_desc, xs_desc->descs()[0],
&weights_size, CUDNNType<T>::type));
ctx()->cudnn_handle(),
rnn_desc, xs_desc->descs()[0],
&weights_size,
CUDNNType<T>::type));
weights_count = (int64_t)weights_size / sizeof(T);
CHECK_EQ(weights_count, Input(1).count())
<< "\nModel request " << "Tensor(" << Input(1).name() << ")'s "
......@@ -96,8 +103,11 @@ void CuDNNRecurrentOpBase<Context>::ResetDesc() {
// Determine the RNN workspace
CUDNN_CHECK(cudnnGetRNNWorkspaceSize(
ctx()->cudnn_handle(), rnn_desc, seq_length,
xs_desc->descs(), &workspace_size));
ctx()->cudnn_handle(),
rnn_desc,
seq_length,
xs_desc->descs(),
&workspace_size));
}
template <class Context> template <typename T>
......@@ -125,8 +135,9 @@ void CuDNNRecurrentOp<Context>::RunWithType() {
auto handle = ctx()->cudnn_handle();
if (phase() == "TRAIN") {
CUDNN_CHECK(cudnnGetRNNTrainingReserveSize(handle,
rnn_desc, seq_length, xs_desc->descs(), &reserve_size));
CUDNN_CHECK(cudnnGetRNNTrainingReserveSize(
handle, rnn_desc, seq_length,
xs_desc->descs(), &reserve_size));
auto* reserveT = ws()->CreateTensor(mount_name(
"rnn/reserve"))->Reshape({ (int64_t)reserve_size });
auto* RSdata = reserveT->template mutable_data<uint8_t, Context>();
......@@ -182,8 +193,9 @@ void CuDNNRecurrentGradientOp<Context>::RunWithType() {
auto* WSdata = ws()->template caches<Context>({ workspace_size })[0];
// Check the ReserveSpace
CUDNN_CHECK(cudnnGetRNNTrainingReserveSize(ctx()->cudnn_handle(),
rnn_desc, seq_length, xs_desc->descs(), &reserve_size));
CUDNN_CHECK(cudnnGetRNNTrainingReserveSize(
ctx()->cudnn_handle(), rnn_desc, seq_length,
xs_desc->descs(), &reserve_size));
auto* reserveT = ws()->GetTensor(mount_name("rnn/reserve"));
CHECK_EQ(reserve_size, reserveT->nbytes());
#if CUDNN_VERSION_MIN(6,0,0)
......@@ -215,6 +227,12 @@ void CuDNNRecurrentGradientOp<Context>::RunWithType() {
}
if (Output(1)->name() != "NULL") {
math::Set(
Output(1)->count(),
cast::to<T>(0.f),
YsData(1),
ctx()
); // CuDNN accumulates the gradient of weights
CUDNN_CHECK(cudnnRNNBackwardWeights(handle, rnn_desc,
seq_length,
xs_desc->descs(), XsData(0), // X
......
......@@ -6,10 +6,10 @@ namespace dragon {
template <class Context>
void AdamUpdateOp<Context>::ComputeUpdates(Tensor* dX) {
Tensor* m = ws()->CreateTensor(
auto* M = ws()->CreateTensor(
"/mnt/" + Slot() + "/adam/m")
->ReshapeLike(*dX);
Tensor* v = ws()->CreateTensor(
auto* V = ws()->CreateTensor(
"/mnt/" + Slot() + "/adam/v")
->ReshapeLike(*dX);
......@@ -18,8 +18,8 @@ void AdamUpdateOp<Context>::ComputeUpdates(Tensor* dX) {
float coeff = sqrt(1. - pow(beta2, t)) / (1. - pow(beta1, t));
lr = Param("base_lr") * coeff * this->lr_mult;
auto* dXdata = dX->template mutable_data<float, Context>();
auto* Mdata = m->mutable_data<float, Context>();
auto* Vdata = v->mutable_data<float, Context>();
auto* Mdata = M->template mutable_data<float, Context>();
auto* Vdata = V->template mutable_data<float, Context>();
kernel::AdamUpdate(dX->count(), lr, beta1,
beta2, eps, dXdata, Mdata, Vdata, ctx());
......
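For reference, the coeff above folds Adam's bias correction into the learning rate, lr_t = base_lr * sqrt(1 - beta2^t) / (1 - beta1^t), before the kernel applies the usual moment updates. A minimal scalar sketch under that assumption:

#include <cmath>
#include <cstdio>

struct AdamState { double m = 0.0, v = 0.0; long t = 0; };

// Returns the value to subtract from the weight.
double AdamStep(double g, AdamState& s, double base_lr = 1e-3,
                double beta1 = 0.9, double beta2 = 0.999, double eps = 1e-8) {
    s.t += 1;
    double coeff = std::sqrt(1.0 - std::pow(beta2, s.t)) /
                   (1.0 - std::pow(beta1, s.t));
    double lr = base_lr * coeff;                  // bias correction folded in
    s.m = beta1 * s.m + (1.0 - beta1) * g;
    s.v = beta2 * s.v + (1.0 - beta2) * g * g;
    return lr * s.m / (std::sqrt(s.v) + eps);
}

int main() {
    AdamState s;
    double w = 1.0;
    for (int i = 0; i < 3; ++i) {
        double g = 2.0 * w;                       // gradient of w^2
        w -= AdamStep(g, s);
        std::printf("step %d: w = %.6f\n", i + 1, w);
    }
    return 0;
}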
......@@ -7,13 +7,13 @@ namespace dragon {
template <class Context>
void NesterovUpdateOp<Context>::ComputeUpdates(Tensor* dX) {
Tensor* h = ws()->CreateTensor(
auto* H = ws()->CreateTensor(
"/mnt/" + Slot() + "/nesterov/h")
->ReshapeLike(*dX);
lr = Param("base_lr") * this->lr_mult, momentum = Param("momentum");
auto* dXdata = dX->template mutable_data<float, Context>();
auto* Hdata = h->template mutable_data<float, Context>();
auto* Hdata = H->template mutable_data<float, Context>();
kernel::NesterovUpdate(dX->count(), lr,
momentum, dXdata, Hdata, ctx());
......
......@@ -6,14 +6,14 @@ namespace dragon {
template <class Context>
void RMSPropUpdateOp<Context>::ComputeUpdates(Tensor* dX) {
Tensor* h = ws()->CreateTensor(
auto* H = ws()->CreateTensor(
"/mnt/" + Slot() + "/rmsprop/h")
->ReshapeLike(*dX);
lr = Param("base_lr") * this->lr_mult;
decay = Param("decay"), eps = Param("eps");
auto* dXdata = dX->template mutable_data<float, Context>();
auto* Hdata = h->template mutable_data<float, Context>();
auto* Hdata = H->template mutable_data<float, Context>();
kernel::RMSPropUpdate(dX->count(), lr,
decay, eps, dXdata, Hdata, ctx());
......
......@@ -7,7 +7,7 @@ namespace dragon {
template <class Context>
void SGDUpdateOp<Context>::ComputeUpdates(Tensor* dX) {
Tensor* h = ws()->CreateTensor(
auto* H = ws()->CreateTensor(
"/mnt/" + Slot() + "/sgd/h")
->ReshapeLike(*dX);
......@@ -15,7 +15,7 @@ void SGDUpdateOp<Context>::ComputeUpdates(Tensor* dX) {
// Momentum Correction, See arXiv:1706.02677
if (old_lr > 0) { correction = lr / old_lr; } old_lr = lr;
auto* dXdata = dX->template mutable_data<float, Context>();
auto* Hdata = h->template mutable_data<float, Context>();
auto* Hdata = H->template mutable_data<float, Context>();
kernel::SGDUpdate(dX->count(), lr,
momentum * correction, dXdata, Hdata, ctx());
......
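The correction above rescales the momentum history whenever the learning rate changes (lr / old_lr, per arXiv:1706.02677). A scalar sketch, assuming the conventional history update h = momentum * h + lr * g that the kernel (not shown in this diff) is expected to perform:

#include <cstdio>

struct SGDState { double h = 0.0, old_lr = -1.0; };

// Returns the value to subtract from the weight.
double SGDStep(double g, double lr, SGDState& s, double momentum = 0.9) {
    double correction = 1.0;
    if (s.old_lr > 0.0) correction = lr / s.old_lr;  // momentum correction
    s.old_lr = lr;
    s.h = momentum * correction * s.h + lr * g;
    return s.h;
}

int main() {
    SGDState s;
    double w = 1.0;
    double lrs[] = {0.1, 0.1, 0.01};  // learning rate drops at step 3
    for (int i = 0; i < 3; ++i) {
        double g = 2.0 * w;           // gradient of w^2
        w -= SGDStep(g, lrs[i], s);
        std::printf("step %d: w = %.6f\n", i + 1, w);
    }
    return 0;
}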
......@@ -8,7 +8,7 @@ namespace dragon {
template <class Context> template <typename T>
void BiasAddOp<Context>::RunWithType() {
TENSOR_FILL(Input(1), vector<int64_t>(1, dim));
TENSOR_FILL(Input(1), vector<int64_t>({ dim }));
DECLARE_MULTIPLIER(multiplier, inner_dim);
auto* Bdata = Input(1).template data<T, Context>();
......@@ -17,7 +17,7 @@ void BiasAddOp<Context>::RunWithType() {
// Copy X to Y firstly if necessary
Output(0)->template CopyFrom<Context>(Input(0), ctx());
kernel::BiasAdd(Output(0)->count(), outer_dim, dim, inner_dim,
kernel::BiasAdd(outer_dim, dim, inner_dim,
data_format, Bdata, multiplier, Ydata, ctx());
}
......
......@@ -30,9 +30,9 @@ void BilinearResizeOp<Context>::RunOnDevice() {
for (int i = 0; i < 2; i++)
dims[spatial_axis + i] = dsize(i);
} else if (!shape_like_desc.empty()) {
Tensor* shape_like_tensor = ws()->GetTensor(shape_like_desc);
auto* sl = ws()->GetTensor(shape_like_desc);
for (int i = 0; i < 2; i++)
dims[spatial_axis + i] = shape_like_tensor->dim(spatial_axis + i);
dims[spatial_axis + i] = sl->dim(spatial_axis + i);
} else {
CHECK(fy != -1.f && fx != -1.f)
<< "\nThe fx and fy should be set.";
......
......@@ -95,7 +95,8 @@ void ConvOpBase<Context>::Wx(
for (int g = 0; g < group; g++) {
if (data_format == "NCHW") {
math::Gemm(
CblasNoTrans, CblasNoTrans,
CblasNoTrans,
CblasNoTrans,
conv_out_channels / group,
conv_out_spatial_dim,
kernel_dim,
......@@ -104,10 +105,13 @@ void ConvOpBase<Context>::Wx(
0.f, y + output_offset * g, ctx());
} else if (data_format == "NHWC") {
math::Gemm(
CblasNoTrans, CblasTrans,
conv_out_spatial_dim, conv_out_channels,
CblasNoTrans,
CblasTrans,
conv_out_spatial_dim,
conv_out_channels,
kernel_dim,
1.f, col_buffer, weights, 0.f, y, ctx());
1.f, col_buffer, weights,
0.f, y, ctx());
}
}
}
......@@ -115,7 +119,7 @@ void ConvOpBase<Context>::Wx(
template <class Context> template <typename T>
void ConvOpBase<Context>::Pb(const T* bias, T* y) {
DECLARE_MULTIPLIER(multiplier, out_spatial_dim);
kernel::BiasAdd(Output(0)->count(),
kernel::BiasAdd(
Input(0).dim(0), num_output, out_spatial_dim,
data_format, bias, multiplier, y, ctx());
}
......@@ -127,18 +131,23 @@ void ConvOpBase<Context>::Dx(const T* dy, const T* weights, T* dx) {
for (int g = 0; g < group; g++) {
if (data_format == "NCHW") {
math::Gemm(
CblasTrans, CblasNoTrans,
kernel_dim, conv_out_spatial_dim,
CblasTrans,
CblasNoTrans,
kernel_dim,
conv_out_spatial_dim,
conv_out_channels / group,
1.f, weights + weight_offset * g,
dy + output_offset * g,
0.f, col_buffer + col_offset * g, ctx());
} else if (data_format == "NHWC") {
math::Gemm(
CblasNoTrans, CblasNoTrans,
conv_out_spatial_dim, kernel_dim,
CblasNoTrans,
CblasNoTrans,
conv_out_spatial_dim,
kernel_dim,
conv_out_channels,
1.f, dy, weights, 0.f, col_buffer, ctx());
1.f, dy, weights,
0.f, col_buffer, ctx());
}
}
if (!is_1x1) Col2Im(col_buffer, dx);
......@@ -158,7 +167,8 @@ void ConvOpBase<Context>::Dw(const T* dy, const T* x, T *dw) {
for (int g = 0; g < group; g++) {
if (data_format == "NCHW") {
math::Gemm(
CblasNoTrans, CblasTrans,
CblasNoTrans,
CblasTrans,
conv_out_channels / group,
kernel_dim,
conv_out_spatial_dim,
......@@ -167,10 +177,13 @@ void ConvOpBase<Context>::Dw(const T* dy, const T* x, T *dw) {
0.f, dw + weight_offset * g, ctx());
} else if (data_format == "NHWC") {
math::Gemm(
CblasTrans, CblasNoTrans,
conv_out_channels, kernel_dim,
CblasTrans,
CblasNoTrans,
conv_out_channels,
kernel_dim,
conv_out_spatial_dim,
1.f, dy, col_buffer, 0.f, dw, ctx());
1.f, dy, col_buffer,
0.f, dw, ctx());
}
}
}
......@@ -180,12 +193,16 @@ void ConvOpBase<Context>::Db(const T* dy, T* db) {
DECLARE_MULTIPLIER(multiplier, out_spatial_dim);
if (data_format == "NCHW") {
math::Gemv(
CblasNoTrans, num_output, out_spatial_dim,
CblasNoTrans,
num_output,
out_spatial_dim,
1.f, dy, multiplier,
0.f, db, ctx());
} else if (data_format == "NHWC") {
math::Gemv(
CblasTrans, out_spatial_dim, num_output,
CblasTrans,
out_spatial_dim,
num_output,
1.f, dy, multiplier,
0.f, db, ctx());
}
......
......@@ -28,7 +28,8 @@ void CuDNNBiasAddOp<Context>::RunWithType() {
// Copy X to Y firstly if necessary
Output(0)->template CopyFrom<Context>(Input(0), ctx());
CUDNN_CHECK(cudnnAddTensor(ctx()->cudnn_handle(),
CUDNN_CHECK(cudnnAddTensor(
ctx()->cudnn_handle(),
CUDNNType<T>::one, bias_desc, Bdata,
CUDNNType<T>::one, output_desc, Ydata));
}
......@@ -70,7 +71,8 @@ void CuDNNBiasAddGradientOp<Context>::RunWithType() {
auto* dYdata = Input(-1).template data<T, Context>();
T* dBdata = Output(1)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnConvolutionBackwardBias(ctx()->cudnn_handle(),
CUDNN_CHECK(cudnnConvolutionBackwardBias(
ctx()->cudnn_handle(),
CUDNNType<T>::one, input_desc, dYdata,
CUDNNType<T>::zero, bias_desc, dBdata));
......
......@@ -13,31 +13,37 @@ template <class Context>
void CuDNNConv2dOp<Context>::SetConvDescFromInputs() {
if (XIsType(Input(0), float)) {
#if CUDNN_VERSION_MIN(6, 0, 0)
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc,
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
conv_desc,
pad_l[0], pad_l[1],
stride[0], stride[1],
dilation[0], dilation[1],
CUDNN_CROSS_CORRELATION,
CUDNN_DATA_FLOAT));
#else
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc,
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
conv_desc,
pad_l[0], pad_l[1],
stride[0], stride[1], 1, 1,
stride[0], stride[1],
1, 1,
CUDNN_CROSS_CORRELATION));
#endif
} else if (XIsType(Input(0), float16)) {
#if CUDNN_VERSION_MIN(6, 0, 0)
compute_type = CUDNN_DATA_FLOAT;
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc,
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
conv_desc,
pad_l[0], pad_l[1],
stride[0], stride[1],
dilation[0], dilation[1],
CUDNN_CROSS_CORRELATION,
compute_type));
#else
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc,
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
conv_desc,
pad_l[0], pad_l[1],
stride[0], stride[1], 1, 1,
stride[0], stride[1],
1, 1,
CUDNN_CROSS_CORRELATION));
#endif
}
......@@ -58,12 +64,15 @@ void CuDNNConv2dOp<Context>::ResetDesc() {
// Determine the input & output shape
input_dims = Input(0).dims();
cudnnSetTensor4dDescWithGroup<T>(
&input_desc, data_format, Input(0).dims(), cudnn_group);
&input_desc, data_format,
Input(0).dims(), cudnn_group);
cudnnSetTensor4dDescWithGroup<T>(
&output_desc, data_format, Output(0)->dims(), cudnn_group);
&output_desc, data_format,
Output(0)->dims(), cudnn_group);
if (HasBias()) {
cudnnSetTensor4dDesc<T>(
&output2b_desc, data_format, Output(0)->dims());
&output2b_desc, data_format,
Output(0)->dims());
}
// Determine the misc
if (data_format == "NCHW") {
......@@ -104,14 +113,17 @@ void CuDNNConv2dOp<Context>::ResetDesc() {
// Now, Select the appropriate algorithm
CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
ctx()->cudnn_handle(), input_desc,
filter_desc, conv_desc, output_desc,
ctx()->cudnn_handle(),
input_desc, filter_desc,
conv_desc, output_desc,
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
WORKSPACE_LIMIT_BYTES, &fwd_algo));
WORKSPACE_LIMIT_BYTES,
&fwd_algo));
CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
ctx()->cudnn_handle(), input_desc,
filter_desc, conv_desc, output_desc,
ctx()->cudnn_handle(),
input_desc, filter_desc,
conv_desc, output_desc,
fwd_algo, &fwd_data_size));
}
}
......@@ -169,31 +181,37 @@ template <class Context>
void CuDNNConv2dGradientOp<Context>::SetConvDescFromInputs() {
if (XIsType(Input(0), float)) {
#if CUDNN_VERSION_MIN(6, 0, 0)
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc,
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
conv_desc,
pad_l[0], pad_l[1],
stride[0], stride[1],
dilation[0], dilation[1],
CUDNN_CROSS_CORRELATION,
CUDNN_DATA_FLOAT));
#else
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc,
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
conv_desc,
pad_l[0], pad_l[1],
stride[0], stride[1], 1, 1,
stride[0], stride[1],
1, 1,
CUDNN_CROSS_CORRELATION));
#endif
} else if (XIsType(Input(0), float16)) {
#if CUDNN_VERSION_MIN(6, 0, 0)
compute_type = CUDNN_DATA_FLOAT;
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc,
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
conv_desc,
pad_l[0], pad_l[1],
stride[0], stride[1],
dilation[0], dilation[1],
CUDNN_CROSS_CORRELATION,
compute_type));
#else
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc,
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
conv_desc,
pad_l[0], pad_l[1],
stride[0], stride[1], 1, 1,
stride[0], stride[1],
1, 1,
CUDNN_CROSS_CORRELATION));
#endif
}
......@@ -214,12 +232,15 @@ void CuDNNConv2dGradientOp<Context>::ResetDesc() {
// Determine the input & output shape
input_dims = Input(0).dims();
cudnnSetTensor4dDescWithGroup<T>(
&input_desc, data_format, Input(-1).dims(), cudnn_group);
&input_desc, data_format,
Input(-1).dims(), cudnn_group);
cudnnSetTensor4dDescWithGroup<T>(
&output_desc, data_format, Input(0).dims(), cudnn_group);
&output_desc, data_format,
Input(0).dims(), cudnn_group);
if (HasBias()) {
cudnnSetTensor4dDesc<T>(
&input2b_desc, data_format, Input(-1).dims());
&input2b_desc, data_format,
Input(-1).dims());
}
// Determine the misc
if (data_format == "NCHW") {
......@@ -260,25 +281,31 @@ void CuDNNConv2dGradientOp<Context>::ResetDesc() {
// Now, Select the appropriate algorithm
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
ctx()->cudnn_handle(), output_desc,
input_desc, conv_desc, filter_desc,
ctx()->cudnn_handle(),
output_desc, input_desc,
conv_desc, filter_desc,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
WORKSPACE_LIMIT_BYTES, &bwd_filter_algo));
WORKSPACE_LIMIT_BYTES,
&bwd_filter_algo));
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
ctx()->cudnn_handle(), output_desc,
input_desc, conv_desc, filter_desc,
ctx()->cudnn_handle(),
output_desc, input_desc,
conv_desc, filter_desc,
bwd_filter_algo, &bwd_filter_size));
CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
ctx()->cudnn_handle(), filter_desc,
input_desc, conv_desc, output_desc,
ctx()->cudnn_handle(),
filter_desc, input_desc,
conv_desc, output_desc,
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
WORKSPACE_LIMIT_BYTES, &bwd_data_algo));
WORKSPACE_LIMIT_BYTES,
&bwd_data_algo));
CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
ctx()->cudnn_handle(), filter_desc,
input_desc, conv_desc, output_desc,
ctx()->cudnn_handle(),
filter_desc, input_desc,
conv_desc, output_desc,
bwd_data_algo, &bwd_data_size));
}
}
......@@ -296,7 +323,8 @@ void CuDNNConv2dGradientOp<Context>::RunWithType() {
if (Output(2)->name() != "NULL") {
T* dBdata = Output(2)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnConvolutionBackwardBias(cudnn_handle,
CUDNN_CHECK(cudnnConvolutionBackwardBias(
cudnn_handle,
CUDNNType<T>::one, input2b_desc, dYdata,
CUDNNType<T>::zero, bias_desc, dBdata));
}
......@@ -305,7 +333,8 @@ void CuDNNConv2dGradientOp<Context>::RunWithType() {
if (Output(1)->name() != "NULL") {
auto* Xdata = Input(0).template data<T, Context>();
auto* dWdata = Output(1)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnConvolutionBackwardFilter(cudnn_handle,
CUDNN_CHECK(cudnnConvolutionBackwardFilter(
cudnn_handle,
CUDNNType<T>::one, output_desc, Xdata + x_offset * g,
input_desc, dYdata + y_offset * g,
conv_desc, bwd_filter_algo, WSdata, bwd_filter_size,
......@@ -314,7 +343,8 @@ void CuDNNConv2dGradientOp<Context>::RunWithType() {
if (Output(0)->name() != "NULL") {
auto* Wdata = Input(1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnConvolutionBackwardData(cudnn_handle,
CUDNN_CHECK(cudnnConvolutionBackwardData(
cudnn_handle,
CUDNNType<T>::one, filter_desc, Wdata + weight_offset * g,
input_desc, dYdata + y_offset * g,
conv_desc, bwd_data_algo, WSdata, bwd_data_size,
......
......@@ -13,14 +13,16 @@ template <class Context>
void CuDNNConvTranspose2dOp<Context>::SetConvDescFromInputs() {
if (XIsType(Input(0), float)) {
#if CUDNN_VERSION_MIN(6, 0, 0)
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc,
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
conv_desc,
pad_l[0], pad_l[1],
stride[0], stride[1],
dilation[0], dilation[1],
CUDNN_CROSS_CORRELATION,
CUDNN_DATA_FLOAT));
#else
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc,
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
conv_desc,
pad_l[0], pad_l[1],
stride[0], stride[1], 1, 1,
CUDNN_CROSS_CORRELATION));
......@@ -28,16 +30,19 @@ void CuDNNConvTranspose2dOp<Context>::SetConvDescFromInputs() {
} else if (XIsType(Input(0), float16)) {
#if CUDNN_VERSION_MIN(6, 0, 0)
compute_type = CUDNN_DATA_FLOAT;
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc,
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
conv_desc,
pad_l[0], pad_l[1],
stride[0], stride[1],
dilation[0], dilation[1],
CUDNN_CROSS_CORRELATION,
compute_type));
#else
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc,
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
conv_desc,
pad_l[0], pad_l[1],
stride[0], stride[1], 1, 1,
stride[0], stride[1],
1, 1,
CUDNN_CROSS_CORRELATION));
#endif
}
......@@ -58,12 +63,15 @@ void CuDNNConvTranspose2dOp<Context>::ResetDesc() {
// Determine the input & output shape
output_dims = Output(0)->dims();
cudnnSetTensor4dDescWithGroup<T>(
&input_desc, data_format, Input(0).dims(), cudnn_group);
&input_desc, data_format,
Input(0).dims(), cudnn_group);
cudnnSetTensor4dDescWithGroup<T>(
&output_desc, data_format, Output(0)->dims(), cudnn_group);
&output_desc, data_format,
Output(0)->dims(), cudnn_group);
if (HasBias()) {
cudnnSetTensor4dDesc<T>(
&output2b_desc, data_format, Output(0)->dims());
&output2b_desc, data_format,
Output(0)->dims());
}
// Determine the misc
if (data_format == "NCHW") {
......@@ -102,14 +110,17 @@ void CuDNNConvTranspose2dOp<Context>::ResetDesc() {
// Now, Select the appropriate algorithm
CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
ctx()->cudnn_handle(), filter_desc,
input_desc, conv_desc, output_desc,
ctx()->cudnn_handle(),
filter_desc, input_desc,
conv_desc, output_desc,
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
WORKSPACE_LIMIT_BYTES, &fwd_algo));
WORKSPACE_LIMIT_BYTES,
&fwd_algo));
CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
ctx()->cudnn_handle(), filter_desc,
input_desc, conv_desc, output_desc,
ctx()->cudnn_handle(),
filter_desc, input_desc,
conv_desc, output_desc,
fwd_algo, &fwd_data_size));
}
}
......@@ -130,7 +141,8 @@ void CuDNNConvTranspose2dOp<Context>::RunWithType() {
auto cudnn_handle = ctx()->cudnn_handle();
for (int g = 0; g < cudnn_group; g++) {
CUDNN_CHECK(cudnnConvolutionBackwardData(cudnn_handle,
CUDNN_CHECK(cudnnConvolutionBackwardData(
cudnn_handle,
CUDNNType<T>::one, filter_desc, Wdata + weight_offset * g,
input_desc, Xdata + x_offset * g,
conv_desc, fwd_algo, WSdata, fwd_data_size,
......@@ -139,7 +151,8 @@ void CuDNNConvTranspose2dOp<Context>::RunWithType() {
if (HasBias()) {
auto* Bdata = Input(2).template data<T, Context>();
CUDNN_CHECK(cudnnAddTensor(cudnn_handle,
CUDNN_CHECK(cudnnAddTensor(
cudnn_handle,
CUDNNType<T>::one, bias_desc, Bdata,
CUDNNType<T>::one, output2b_desc, Ydata));
}
......@@ -167,31 +180,37 @@ template <class Context>
void CuDNNConvTranspose2dGradientOp<Context>::SetConvDescFromInputs() {
if (XIsType(Input(0), float)) {
#if CUDNN_VERSION_MIN(6, 0, 0)
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc,
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
conv_desc,
pad_l[0], pad_l[1],
stride[0], stride[1],
dilation[0], dilation[1],
CUDNN_CROSS_CORRELATION,
CUDNN_DATA_FLOAT));
#else
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc,
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
conv_desc,
pad_l[0], pad_l[1],
stride[0], stride[1], 1, 1,
stride[0], stride[1],
1, 1,
CUDNN_CROSS_CORRELATION));
#endif
} else if (XIsType(Input(0), float16)) {
#if CUDNN_VERSION_MIN(6, 0, 0)
compute_type = CUDNN_DATA_FLOAT;
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc,
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
conv_desc,
pad_l[0], pad_l[1],
stride[0], stride[1],
dilation[0], dilation[1],
CUDNN_CROSS_CORRELATION,
compute_type));
#else
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc,
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
conv_desc,
pad_l[0], pad_l[1],
stride[0], stride[1], 1, 1,
stride[0], stride[1],
1, 1,
CUDNN_CROSS_CORRELATION));
#endif
}
......@@ -212,12 +231,15 @@ void CuDNNConvTranspose2dGradientOp<Context>::ResetDesc() {
// Determine the input & output shape
output_dims = Input(-1).dims();
cudnnSetTensor4dDescWithGroup<T>(
&input_desc, data_format, Input(-1).dims(), cudnn_group);
&input_desc, data_format,
Input(-1).dims(), cudnn_group);
cudnnSetTensor4dDescWithGroup<T>(
&output_desc, data_format, Input(0).dims(), cudnn_group);
&output_desc, data_format,
Input(0).dims(), cudnn_group);
if (HasBias()) {
cudnnSetTensor4dDesc<T>(
&input2b_desc, data_format, Input(-1).dims());
&input2b_desc, data_format,
Input(-1).dims());
}
// Determine the misc
if (data_format == "NCHW") {
......@@ -256,25 +278,31 @@ void CuDNNConvTranspose2dGradientOp<Context>::ResetDesc() {
// Now, Select the appropriate algorithm
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
ctx()->cudnn_handle(), input_desc,
output_desc, conv_desc, filter_desc,
ctx()->cudnn_handle(),
input_desc, output_desc,
conv_desc, filter_desc,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
WORKSPACE_LIMIT_BYTES, &bwd_filter_algo));
WORKSPACE_LIMIT_BYTES,
&bwd_filter_algo));
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
ctx()->cudnn_handle(), input_desc,
output_desc, conv_desc, filter_desc,
ctx()->cudnn_handle(),
input_desc, output_desc,
conv_desc, filter_desc,
bwd_filter_algo, &bwd_filter_size));
CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
ctx()->cudnn_handle(), input_desc,
filter_desc, conv_desc, output_desc,
ctx()->cudnn_handle(),
input_desc, filter_desc,
conv_desc, output_desc,
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
WORKSPACE_LIMIT_BYTES, &bwd_data_algo));
WORKSPACE_LIMIT_BYTES,
&bwd_data_algo));
CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
ctx()->cudnn_handle(), input_desc,
filter_desc, conv_desc, output_desc,
ctx()->cudnn_handle(),
input_desc, filter_desc,
conv_desc, output_desc,
bwd_data_algo, &bwd_data_size));
}
}
......@@ -292,7 +320,8 @@ void CuDNNConvTranspose2dGradientOp<Context>::RunWithType() {
if (Output(2)->name() != "NULL") {
T* dBdata = Output(2)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnConvolutionBackwardBias(cudnn_handle,
CUDNN_CHECK(cudnnConvolutionBackwardBias(
cudnn_handle,
CUDNNType<T>::one, input2b_desc, dYdata,
CUDNNType<T>::zero, bias_desc, dBdata));
}
......@@ -301,7 +330,8 @@ void CuDNNConvTranspose2dGradientOp<Context>::RunWithType() {
if (Output(1)->name() != "NULL") {
auto* Xdata = Input(0).template data<T, Context>();
auto* dWdata = Output(1)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnConvolutionBackwardFilter(cudnn_handle,
CUDNN_CHECK(cudnnConvolutionBackwardFilter(
cudnn_handle,
CUDNNType<T>::one, input_desc, dYdata + y_offset * g,
output_desc, Xdata + x_offset * g,
conv_desc, bwd_filter_algo, WSdata, bwd_filter_size,
......@@ -310,7 +340,8 @@ void CuDNNConvTranspose2dGradientOp<Context>::RunWithType() {
if (Output(0)->name() != "NULL") {
auto* Wdata = Input(1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnConvolutionForward(cudnn_handle,
CUDNN_CHECK(cudnnConvolutionForward(
cudnn_handle,
CUDNNType<T>::one, input_desc, dYdata + y_offset * g,
filter_desc, Wdata + weight_offset * g,
conv_desc, bwd_data_algo, WSdata, bwd_data_size,
......
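Note: the hunks in this file only re-wrap argument lists, but the pattern they touch is easy to lose in the diff: choose the right cudnnSetConvolution2dDescriptor overload by cuDNN version, then ask cuDNN for a backward-data algorithm under a fixed workspace limit and query its scratch size. Below is a minimal standalone sketch of that pattern against the public cuDNN 5-7 API (the Get*Algorithm queries were removed in cuDNN 8); the CHECK macro, tensor shapes, and the 64 MB limit are illustrative stand-ins, not Dragon's CUDNN_CHECK / WORKSPACE_LIMIT_BYTES.

    // Standalone sketch: version-guarded convolution descriptor setup plus a
    // workspace-limited backward-data algorithm query (cuDNN 5-7 API).
    #include <cudnn.h>
    #include <cstdio>
    #include <cstdlib>

    #define CHECK(expr)                                              \
      do {                                                           \
        cudnnStatus_t _s = (expr);                                   \
        if (_s != CUDNN_STATUS_SUCCESS) {                            \
          std::fprintf(stderr, "%s\n", cudnnGetErrorString(_s));     \
          std::exit(1);                                              \
        }                                                            \
      } while (0)

    int main() {
      cudnnHandle_t handle;
      CHECK(cudnnCreate(&handle));

      // Illustrative shapes: dX is 1x16x8x8, W is 32x16x3x3, dY is 1x32x8x8.
      cudnnTensorDescriptor_t dy_desc, dx_desc;
      cudnnFilterDescriptor_t w_desc;
      cudnnConvolutionDescriptor_t conv_desc;
      CHECK(cudnnCreateTensorDescriptor(&dy_desc));
      CHECK(cudnnCreateTensorDescriptor(&dx_desc));
      CHECK(cudnnCreateFilterDescriptor(&w_desc));
      CHECK(cudnnCreateConvolutionDescriptor(&conv_desc));
      CHECK(cudnnSetTensor4dDescriptor(dy_desc,
          CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 32, 8, 8));
      CHECK(cudnnSetTensor4dDescriptor(dx_desc,
          CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 16, 8, 8));
      CHECK(cudnnSetFilter4dDescriptor(w_desc,
          CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 32, 16, 3, 3));

    #if CUDNN_VERSION >= 6000
      // cuDNN >= 6: dilation factors plus an explicit compute type.
      CHECK(cudnnSetConvolution2dDescriptor(conv_desc,
          /* pad    */ 1, 1,
          /* stride */ 1, 1,
          /* dilation */ 1, 1,
          CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));
    #else
      // cuDNN < 6: the trailing pair are "upscale" factors, no compute type.
      CHECK(cudnnSetConvolution2dDescriptor(conv_desc,
          1, 1, 1, 1, 1, 1, CUDNN_CROSS_CORRELATION));
    #endif

      // Fastest backward-data algorithm that fits the limit, then its
      // actual scratch requirement.
      const size_t workspace_limit = 64 << 20;  // 64 MB, illustrative
      cudnnConvolutionBwdDataAlgo_t algo;
      size_t workspace_size = 0;
      CHECK(cudnnGetConvolutionBackwardDataAlgorithm(handle,
          w_desc, dy_desc, conv_desc, dx_desc,
          CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
          workspace_limit, &algo));
      CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(handle,
          w_desc, dy_desc, conv_desc, dx_desc, algo, &workspace_size));
      std::printf("algo=%d workspace=%zu bytes\n", (int)algo, workspace_size);

      CHECK(cudnnDestroyConvolutionDescriptor(conv_desc));
      CHECK(cudnnDestroyFilterDescriptor(w_desc));
      CHECK(cudnnDestroyTensorDescriptor(dx_desc));
      CHECK(cudnnDestroyTensorDescriptor(dy_desc));
      CHECK(cudnnDestroy(handle));
      return 0;
    }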
......@@ -28,14 +28,18 @@ void CuDNNDepthwiseConv2dOp<Context>::RunWithType() {
auto* Wdata = Input(1).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::DepthwiseConv2d(Input(0).dim(0), channels,
input_shape[0], input_shape[1], output_shape[0], output_shape[1],
kernel_shape[0], kernel_shape[1], stride[0], pad_l[0], pad_l[1],
kernel::DepthwiseConv2d(
Input(0).dim(0), channels,
input_shape[0], input_shape[1],
output_shape[0], output_shape[1],
kernel_shape[0], kernel_shape[1],
stride[0], pad_l[0], pad_l[1],
data_format, Xdata, Wdata, Ydata, ctx());
if (HasBias()) {
auto* Bdata = Input(2).template data<T, Context>();
CUDNN_CHECK(cudnnAddTensor(ctx()->cudnn_handle(),
CUDNN_CHECK(cudnnAddTensor(
ctx()->cudnn_handle(),
CUDNNType<T>::one, bias_desc, Bdata,
CUDNNType<T>::one, output_desc, Ydata));
}
......@@ -83,17 +87,23 @@ void CuDNNDepthwiseConv2dGradientOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* dWdata = Output(1)->template mutable_data<T, Context>();
math::Set(Output(1)->count(), cast::to<T>(0.f), dWdata, ctx());
kernel::DepthwiseConv2dWGrad(Input(0).dim(0), channels,
input_shape[0], input_shape[1], output_shape[0], output_shape[1],
kernel_shape[0], kernel_shape[1], stride[0], pad_l[0], pad_l[1],
kernel::DepthwiseConv2dWGrad(
Input(0).dim(0), channels,
input_shape[0], input_shape[1],
output_shape[0], output_shape[1],
kernel_shape[0], kernel_shape[1],
stride[0], pad_l[0], pad_l[1],
data_format, dYdata, Xdata, dWdata, ctx());
}
if (Output(0)->name() != "NULL") {
auto* Wdata = Input(1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
kernel::DepthwiseConv2dGrad(Input(0).dim(0), channels,
input_shape[0], input_shape[1], output_shape[0], output_shape[1],
kernel_shape[0], kernel_shape[1], stride[0], pad_l[0], pad_l[1],
kernel::DepthwiseConv2dGrad(
Input(0).dim(0), channels,
input_shape[0], input_shape[1],
output_shape[0], output_shape[1],
kernel_shape[0], kernel_shape[1],
stride[0], pad_l[0], pad_l[1],
data_format, dYdata, Wdata, dXdata, ctx());
}
}
......
......@@ -10,13 +10,17 @@ void CuDNNPool2dOp<Context>::RunWithType() {
cudnnSetTensor4dDesc<T>(&output_desc, this->data_format, Output(0));
#if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnSetPooling2dDescriptor(
pool_desc, pool_mode, CUDNN_PROPAGATE_NAN,
pool_desc,
pool_mode,
CUDNN_PROPAGATE_NAN,
this->kernel_shape[0], this->kernel_shape[1],
this->pad_l[0], this->pad_l[1],
this->stride[0], this->stride[1]));
#else
CUDNN_CHECK(cudnnSetPooling2dDescriptor_v4(
pool_desc, pool_mode, CUDNN_PROPAGATE_NAN,
pool_desc,
pool_mode,
CUDNN_PROPAGATE_NAN,
this->kernel_shape[0], this->kernel_shape[1],
this->pad_l[0], this->pad_l[1],
this->stride[0], this->stride[1]));
......@@ -47,13 +51,17 @@ void CuDNNPool2dGradientOp<Context>::RunWithType() {
cudnnSetTensor4dDesc<T>(&output_desc, this->data_format, Output(0));
#if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnSetPooling2dDescriptor(
pool_desc, pool_mode, CUDNN_PROPAGATE_NAN,
pool_desc,
pool_mode,
CUDNN_PROPAGATE_NAN,
this->kernel_shape[0], this->kernel_shape[1],
this->pad_l[0], this->pad_l[1],
this->stride[0], this->stride[1]));
#else
CUDNN_CHECK(cudnnSetPooling2dDescriptor_v4(
pool_desc, pool_mode, CUDNN_PROPAGATE_NAN,
pool_desc,
pool_mode,
CUDNN_PROPAGATE_NAN,
this->kernel_shape[0], this->kernel_shape[1],
this->pad_l[0], this->pad_l[1],
this->stride[0], this->stride[1]));
......
......@@ -14,9 +14,12 @@ void DepthwiseConv2dOp<Context>::RunWithType() {
auto* Wdata = Input(1).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::DepthwiseConv2d(Input(0).dim(0), channels,
input_shape[0], input_shape[1], output_shape[0], output_shape[1],
kernel_shape[0], kernel_shape[1], stride[0], pad_l[0], pad_l[1],
kernel::DepthwiseConv2d(
Input(0).dim(0), channels,
input_shape[0], input_shape[1],
output_shape[0], output_shape[1],
kernel_shape[0], kernel_shape[1],
stride[0], pad_l[0], pad_l[1],
data_format, Xdata, Wdata, Ydata, ctx());
if (HasBias()) {
......@@ -57,18 +60,29 @@ void DepthwiseConv2dGradientOp<Context>::RunWithType() {
if (Output(1)->name() != "NULL") {
auto* Xdata = Input(0).template data<T, Context>();
auto* dWdata = Output(1)->template mutable_data<T, Context>();
math::Set(Output(1)->count(), cast::to<T>(0.f), dWdata, ctx());
kernel::DepthwiseConv2dWGrad(Input(0).dim(0), channels,
input_shape[0], input_shape[1], output_shape[0], output_shape[1],
kernel_shape[0], kernel_shape[1], stride[0], pad_l[0], pad_l[1],
math::Set(
Output(1)->count(),
cast::to<T>(0.f),
dWdata,
ctx()
); // Zero the gradient of W
kernel::DepthwiseConv2dWGrad(
Input(0).dim(0), channels,
input_shape[0], input_shape[1],
output_shape[0], output_shape[1],
kernel_shape[0], kernel_shape[1],
stride[0], pad_l[0], pad_l[1],
data_format, dYdata, Xdata, dWdata, ctx());
}
if (Output(0)->name() != "NULL") {
auto* Wdata = Input(1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
kernel::DepthwiseConv2dGrad(Input(0).dim(0), channels,
input_shape[0], input_shape[1], output_shape[0], output_shape[1],
kernel_shape[0], kernel_shape[1], stride[0], pad_l[0], pad_l[1],
kernel::DepthwiseConv2dGrad(
Input(0).dim(0), channels,
input_shape[0], input_shape[1],
output_shape[0], output_shape[1],
kernel_shape[0], kernel_shape[1],
stride[0], pad_l[0], pad_l[1],
data_format, dYdata, Wdata, dXdata, ctx());
}
}
......
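Note: for context on the arguments being re-wrapped above (batch, channels, input/output spatial sizes, kernel shape, a single stride, and the left padding), a naive NCHW depthwise convolution can be written as a plain CPU loop. The following is a simplified reference sketch, not Dragon's kernel::DepthwiseConv2d; output sizes are taken as given, mirroring how the kernel call above receives them, and the helper name is hypothetical.

    // Naive CPU reference for NCHW depthwise convolution (one filter per
    // channel). Sketch only.
    #include <cstdio>
    #include <vector>

    void DepthwiseConv2dNCHW(
        int n, int c, int in_h, int in_w, int out_h, int out_w,
        int k_h, int k_w, int stride, int pad_h, int pad_w,
        const std::vector<float>& x,   // n * c * in_h * in_w
        const std::vector<float>& w,   // c * k_h * k_w
        std::vector<float>* y) {       // n * c * out_h * out_w
      y->assign((size_t)n * c * out_h * out_w, 0.f);
      for (int i = 0; i < n; ++i)
      for (int ch = 0; ch < c; ++ch)
      for (int oh = 0; oh < out_h; ++oh)
      for (int ow = 0; ow < out_w; ++ow) {
        float acc = 0.f;
        for (int kh = 0; kh < k_h; ++kh)
        for (int kw = 0; kw < k_w; ++kw) {
          int ih = oh * stride - pad_h + kh;
          int iw = ow * stride - pad_w + kw;
          if (ih < 0 || ih >= in_h || iw < 0 || iw >= in_w) continue;
          acc += x[((size_t)(i * c + ch) * in_h + ih) * in_w + iw] *
                 w[((size_t)ch * k_h + kh) * k_w + kw];
        }
        (*y)[((size_t)(i * c + ch) * out_h + oh) * out_w + ow] = acc;
      }
    }

    int main() {
      // 1 image, 2 channels, 4x4 input, 3x3 kernel, stride 1, pad 1 -> 4x4 out.
      std::vector<float> x(1 * 2 * 4 * 4, 1.f), w(2 * 3 * 3, 1.f), y;
      DepthwiseConv2dNCHW(1, 2, 4, 4, 4, 4, 3, 3, 1, 1, 1, x, w, &y);
      std::printf("y[0]=%.1f y[5]=%.1f\n", y[0], y[5]);  // 4.0 and 9.0
      return 0;
    }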
......@@ -30,9 +30,9 @@ void NNResizeOp<Context>::RunOnDevice() {
for (int i = 0; i < 2; i++)
dims[spatial_axis + i] = dsize(i);
} else if (!shape_like_desc.empty()) {
Tensor* shape_like_tensor = ws()->GetTensor(shape_like_desc);
auto* sl = ws()->GetTensor(shape_like_desc);
for (int i = 0; i < 2; i++)
dims[spatial_axis + i] = shape_like_tensor->dim(spatial_axis + i);
dims[spatial_axis + i] = sl->dim(spatial_axis + i);
} else {
CHECK(fy != -1.f && fx != -1.f)
<< "\nThe fx and fy should be set.";
......
......@@ -71,7 +71,8 @@ void Pool2dOp<Context>::MAXRunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
auto* Mdata = mask->template mutable_data<int, Context>();
kernel::MAXPool2d(n, c, h, w, pool_h, pool_w,
kernel::MAXPool2d(
n, c, h, w, pool_h, pool_w,
kernel_shape[0], kernel_shape[1],
stride[0], stride[1], pad_l[0], pad_l[1],
data_format, Xdata, Mdata, Ydata, ctx());
......@@ -82,7 +83,8 @@ void Pool2dOp<Context>::AVGRunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::AVGPool2d(n, c, h, w, pool_h, pool_w,
kernel::AVGPool2d(
n, c, h, w, pool_h, pool_w,
kernel_shape[0], kernel_shape[1],
stride[0], stride[1], pad_l[0], pad_l[1],
data_format, Xdata, Ydata, ctx());
......@@ -123,7 +125,8 @@ void Pool2dGradientOp<Context>::MAXRunWithType() {
auto* dXdata = Output(0)->template mutable_data<T, Context>();
auto* Mdata = mask->template data<int, Context>();
kernel::MAXPool2dGrad(n, c, h, w, pool_h, pool_w,
kernel::MAXPool2dGrad(
n, c, h, w, pool_h, pool_w,
kernel_shape[0], kernel_shape[1],
stride[0], stride[1], pad_l[0], pad_l[1],
data_format, dYdata, Mdata, dXdata, ctx());
......@@ -134,7 +137,8 @@ void Pool2dGradientOp<Context>::AVGRunWithType() {
auto* dYdata = Input(-1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
kernel::AVGPool2dGrad(n, c, h, w, pool_h, pool_w,
kernel::AVGPool2dGrad(
n, c, h, w, pool_h, pool_w,
kernel_shape[0], kernel_shape[1],
stride[0], stride[1], pad_l[0], pad_l[1],
data_format, dYdata, dXdata, ctx());
......
......@@ -12,10 +12,15 @@ void ROIAlignOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::ROIAlign(
Input(0).dim(1), Input(0).dim(2), Input(0).dim(3),
pool_h, pool_w, Input(1).dim(0),
spatial_scale, sampling_ratio,
Xdata, Rdata, Ydata, ctx());
Input(0).dim(1),
Input(0).dim(2),
Input(0).dim(3),
pool_h, pool_w,
Input(1).dim(0),
spatial_scale,
sampling_ratio,
Xdata, Rdata,
Ydata, ctx());
}
template <class Context>
......@@ -47,10 +52,15 @@ void ROIAlignGradientOp<Context>::RunWithType() {
math::Set(Output(0)->count(), cast::to<T>(0.f), dXdata, ctx());
kernel::ROIAlignGrad(
Output(0)->dim(1), Output(0)->dim(2), Output(0)->dim(3),
pool_h, pool_w, Input(1).dim(0),
spatial_scale, sampling_ratio,
dYdata, Rdata, dXdata, ctx());
Output(0)->dim(1),
Output(0)->dim(2),
Output(0)->dim(3),
pool_h, pool_w,
Input(1).dim(0),
spatial_scale,
sampling_ratio,
dYdata, Rdata,
dXdata, ctx());
}
template <class Context>
......@@ -66,10 +76,15 @@ void ROIAlignGradientOp<Context>::RunWithFloat16() {
kernel::TypeA2B(Input(-1).count(), dYdata, WSdata[0], ctx());
kernel::ROIAlignGrad(
Output(0)->dim(1), Output(0)->dim(2), Output(0)->dim(3),
pool_h, pool_w, Input(1).dim(0),
spatial_scale, sampling_ratio,
WSdata[0], Rdata, WSdata[1], ctx());
Output(0)->dim(1),
Output(0)->dim(2),
Output(0)->dim(3),
pool_h, pool_w,
Input(1).dim(0),
spatial_scale,
sampling_ratio,
WSdata[0], Rdata,
WSdata[1], ctx());
kernel::TypeA2B(Output(0)->count(), WSdata[1], dXdata, ctx());
}
......
......@@ -16,9 +16,14 @@ void ROIPoolOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::ROIPool(
Input(0).dim(1), Input(0).dim(2), Input(0).dim(3),
pool_h, pool_w, Input(1).dim(0), spatial_scale,
Xdata, Rdata, Mdata, Ydata, ctx());
Input(0).dim(1),
Input(0).dim(2),
Input(0).dim(3),
pool_h, pool_w,
Input(1).dim(0),
spatial_scale,
Xdata, Rdata, Mdata,
Ydata, ctx());
}
template <class Context>
......@@ -51,10 +56,15 @@ void ROIPoolGradientOp<Context>::RunWithType() {
auto* dXdata = Output(0)->template mutable_data<T, Context>();
kernel::ROIPoolGrad(
Output(0)->dim(0), Output(0)->dim(1),
Output(0)->dim(2), Output(0)->dim(3),
pool_h, pool_w, Input(1).dim(0), spatial_scale,
dYdata, Rdata, Mdata, dXdata, ctx());
Output(0)->dim(0),
Output(0)->dim(1),
Output(0)->dim(2),
Output(0)->dim(3),
pool_h, pool_w,
Input(1).dim(0),
spatial_scale,
dYdata, Rdata, Mdata,
dXdata, ctx());
}
template <class Context>
......@@ -73,10 +83,15 @@ void ROIPoolGradientOp<Context>::RunWithFloat16() {
kernel::TypeA2B(Input(-1).count(), dYdata, WSdata[0], ctx());
kernel::ROIPoolGrad(
Output(0)->dim(0), Output(0)->dim(1),
Output(0)->dim(2), Output(0)->dim(3),
pool_h, pool_w, Input(1).dim(0), spatial_scale,
WSdata[0], Rdata, Mdata, WSdata[1], ctx());
Output(0)->dim(0),
Output(0)->dim(1),
Output(0)->dim(2),
Output(0)->dim(3),
pool_h, pool_w,
Input(1).dim(0),
spatial_scale,
WSdata[0], Rdata, Mdata,
WSdata[1], ctx());
kernel::TypeA2B(Output(0)->count(), WSdata[1], dXdata, ctx());
}
......
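Note: the ROI hunks above again only re-wrap arguments, but the call shape (feature dims, pool_h/pool_w, number of ROIs, spatial_scale) maps directly onto classic Fast R-CNN ROI max pooling. Below is a simplified CPU reference for an NCHW feature map; it is a sketch of the general algorithm, not Dragon's kernel::ROIPool, it omits the argmax mask the real kernel records for the backward pass, and it assumes ROIs laid out as (batch_idx, x1, y1, x2, y2) in input-image coordinates.

    // Simplified CPU reference for ROI max pooling over an NCHW feature map.
    // Sketch only: no argmax mask output.
    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    void ROIPoolNCHW(
        int c, int h, int w, int pool_h, int pool_w, float spatial_scale,
        const std::vector<float>& feat,   // n * c * h * w
        const std::vector<float>& rois,   // num_rois * 5
        std::vector<float>* y) {          // num_rois * c * pool_h * pool_w
      int num_rois = (int)(rois.size() / 5);
      y->assign((size_t)num_rois * c * pool_h * pool_w, 0.f);
      for (int r = 0; r < num_rois; ++r) {
        int batch = (int)rois[r * 5 + 0];
        // Scale the ROI from image coordinates to feature-map coordinates.
        int x1 = (int)std::round(rois[r * 5 + 1] * spatial_scale);
        int y1 = (int)std::round(rois[r * 5 + 2] * spatial_scale);
        int x2 = (int)std::round(rois[r * 5 + 3] * spatial_scale);
        int y2 = (int)std::round(rois[r * 5 + 4] * spatial_scale);
        float bin_h = (float)std::max(y2 - y1 + 1, 1) / pool_h;
        float bin_w = (float)std::max(x2 - x1 + 1, 1) / pool_w;
        for (int ch = 0; ch < c; ++ch)
        for (int ph = 0; ph < pool_h; ++ph)
        for (int pw = 0; pw < pool_w; ++pw) {
          // Each output bin takes the max over its slice of the ROI.
          int hs = std::min(std::max(y1 + (int)std::floor(ph * bin_h), 0), h);
          int he = std::min(std::max(y1 + (int)std::ceil((ph + 1) * bin_h), 0), h);
          int ws = std::min(std::max(x1 + (int)std::floor(pw * bin_w), 0), w);
          int we = std::min(std::max(x1 + (int)std::ceil((pw + 1) * bin_w), 0), w);
          float best = (hs >= he || ws >= we) ? 0.f : -1e30f;
          for (int ih = hs; ih < he; ++ih)
          for (int iw = ws; iw < we; ++iw)
            best = std::max(best,
                feat[((size_t)(batch * c + ch) * h + ih) * w + iw]);
          (*y)[(((size_t)r * c + ch) * pool_h + ph) * pool_w + pw] = best;
        }
      }
    }

    int main() {
      // One 1x1x4x4 feature map with values 0..15, one ROI covering it all,
      // pooled to 2x2 with spatial_scale = 1.
      std::vector<float> feat(16), rois = {0, 0, 0, 3, 3}, y;
      for (int i = 0; i < 16; ++i) feat[i] = (float)i;
      ROIPoolNCHW(1, 4, 4, 2, 2, 1.f, feat, rois, &y);
      std::printf("%.0f %.0f %.0f %.0f\n", y[0], y[1], y[2], y[3]);  // 5 7 13 15
      return 0;
    }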
......@@ -145,18 +145,6 @@ message GradientProto {
optional string external = 3;
}
// Record the updater information
message UpdaterProto {
// The operator name to use.
optional string name = 1;
// The operator type.
optional string type = 2;
// The tensor to update.
repeated string tensor = 3;
// The arguments.
repeated Argument arg = 4;
}
// Graph Definition
message GraphDef {
// The graph name.
......@@ -181,6 +169,4 @@ message GraphDef {
// The gradients information.
repeated GradientProto gradient = 9;
// The updaters information.
repeated UpdaterProto updater = 10;
}
\ No newline at end of file