Change the representation of NULL tensor

Ting PAN
Commit b35f9320 authored Mar 20, 2019 by Ting PAN
Showing with 816 additions and 714 deletions
Docs/api/python/contents/core/cuda.rst
Docs/api/python/contents/core/tensor.rst
Docs/api/python/contents/operators.rst
Docs/api/python/contents/operators/custom/data_process.rst
Docs/api/python/contents/operators/custom/minibatch.rst
Docs/api/python/contents/operators/custom/vec_mult.rst
Docs/api/python/contents/operators/data.rst
Dragon/CMakeLists.txt
Dragon/include/core/context_cuda.h
Dragon/include/core/graph.h
Dragon/include/core/operator.h
Dragon/include/core/operator_gradient.h
Dragon/include/core/operator_schema.h
Dragon/include/operators/misc/gradient_op.h
Dragon/include/operators/vision/conv_op.h
Dragon/include/operators/vision/conv_transpose_op.h
Dragon/include/operators/vision/depthwise_conv_op.h
Dragon/include/utils/cuda_device.h
Dragon/include/utils/proto_utils.h
Dragon/modules/cxx/dragon.cc
--- a/Docs/api/python/contents/core/cuda.rst
+++ b/Docs/api/python/contents/core/cuda.rst
@@ -12,6 +12,7 @@ Quick Reference
 List                              Brief
 ==============================    =============================================================================
 `IsCUDADriverSufficient`_         Is cuda driver sufficient?
+`EnableCUDNN`_                    Enable the CuDNN engine.
 `GetDevice`_                      Get the current active cuda device.
 `SynchronizeStream`_              Synchronize the specified cuda stream.
 ==============================    =============================================================================
@@ -20,5 +21,6 @@ List                              Brief
    :members:

 .. _IsCUDADriverSufficient: #dragon.core.cuda.IsCUDADriverSufficient
+.. _EnableCUDNN: #dragon.core.cuda.EnableCUDNN
 .. _GetDevice: #dragon.core.cuda.GetDevice
 .. _SynchronizeStream: #dragon.core.cuda.SynchronizeStream
\ No newline at end of file
--- a/Docs/api/python/contents/core/tensor.rst
+++ b/Docs/api/python/contents/core/tensor.rst
@@ -93,6 +93,7 @@ API Reference
    .. automethod:: __eq__
    .. automethod:: __repr__
    .. automethod:: __getitem__
+    .. automethod:: __setitem__
    .. automethod:: __call__

 .. _Tensor.Variable: #dragon.core.tensor.Tensor.Variable

--- a/Docs/api/python/contents/operators.rst
+++ b/Docs/api/python/contents/operators.rst
@@ -59,16 +59,12 @@ Custom
   :hidden:

   operators/custom/minibatch
-   operators/custom/data_process
-   operators/custom/vec_mult


 =========================================       =====================================================================
 List                                            Brief
 =========================================       =====================================================================
-`dragon.operators.custom.minibatch`_            How to form a minibatch based on `dragon.io`_ package.
-`dragon.operators.custom.data_process`_         How to custom a RunOp for data processing.
-`dragon.operators.custom.vec_mult`_             How to custom a TemplateOp for Vector Multiplication.
+`dragon.operators.custom.minibatch`_            Form a mini-batch based on `dragon.utils.vision`_ package.
 =========================================       =====================================================================


@@ -99,9 +95,9 @@ List                                            Brief
 .. _dragon.operators.recurrent: operators/recurrent.html
 .. _dragon.operators.loss: operators/loss.html
 .. _dragon.operators.norm: operators/norm.html
-.. _dragon.io: io.html
 .. _dragon.operators.custom.minibatch: operators/custom/minibatch.html
 .. _dragon.operators.custom.data_process: operators/custom/data_process.html
 .. _dragon.operators.custom.vec_mult: operators/custom/vec_mult.html
 .. _dragon.operators.contrib.rcnn: operators/contrib/rcnn.html
+.. _dragon.utils.vision: utils.html#vision

--- a/Docs/api/python/contents/operators/custom/data_process.rst
+++ b/Docs/api/python/contents/operators/custom/data_process.rst
-==================
-:mod:`DataProcess`
-==================
-
-.. toctree::
-   :hidden:
-
-.. currentmodule:: dragon.operators.custom.data_process
-
-.. autoclass:: DataProcessOp
-    :members:
\ No newline at end of file
--- a/Docs/api/python/contents/operators/custom/minibatch.rst
+++ b/Docs/api/python/contents/operators/custom/minibatch.rst
@@ -10,4 +10,4 @@
 .. autoclass:: MiniBatchOp
    :members:

-.. _dragon.io: ../../io.html
\ No newline at end of file
+.. _dragon.utils.vision: ../../utils.html#vision
\ No newline at end of file
--- a/Docs/api/python/contents/operators/custom/vec_mult.rst
+++ b/Docs/api/python/contents/operators/custom/vec_mult.rst
-==============
-:mod:`VecMult`
-==============
-
-.. toctree::
-   :hidden:
-
-.. currentmodule:: dragon.operators.custom.vec_mult
-
-.. autoclass:: VecMultOp
-    :members:
\ No newline at end of file
--- a/Docs/api/python/contents/operators/data.rst
+++ b/Docs/api/python/contents/operators/data.rst
@@ -9,7 +9,7 @@
    :members:

 .. _LMDB: http://lmdb.readthedocs.io/en/release
-.. _DataBatch: ../io/data_batch.html#dragon.io.data_batch
-.. _DataReader: ../io/data_reader.html#dragon.io.data_reader
-.. _DataTransformer: ../io/data_transformer.html#dragon.io.data_transformer
-.. _BlobFetcher: ../io/blob_fetcher.html#dragon.io.blob_fetcher
\ No newline at end of file
+.. _DataBatch: ../utils/vision/data_batch.html
+.. _DataReader: ../utils/vision/data_reader.html
+.. _DataTransformer: ../utils/vision/data_transformer.html
+.. _BlobFetcher: ../utils/vision/blob_fetcher.html
\ No newline at end of file
--- a/Dragon/CMakeLists.txt
+++ b/Dragon/CMakeLists.txt
@@ -23,7 +23,7 @@ if (NOT THIRD_PARTY_DIR)
    set(THIRD_PARTY_DIR  ${PROJECT_SOURCE_DIR}/../ThirdParty)
 endif()

-# Set your protobuf compiler(protc) if necessary
+# Set your protobuf compiler(protoc) if necessary
 # if not, a default "protoc" in the environment path will be used
 if (NOT PROTOC_EXECUTABLE)
   set(PROTOC_EXECUTABLE protoc)

--- a/Dragon/include/core/context_cuda.h
+++ b/Dragon/include/core/context_cuda.h
@@ -128,6 +128,8 @@ class CUDAObject {
 #ifdef WITH_CUDNN
    vector<cudnnHandle_t> cudnn_handles[CUDA_MAX_DEVICES];
 #endif
+
+    bool cudnn_enabled = true;
 };

 class CUDAContext {

--- a/Dragon/include/core/graph.h
+++ b/Dragon/include/core/graph.h
@@ -84,7 +84,7 @@ class Graph : public GraphBase {

 /*! \brief Create a graph from the raw def */
 GraphBase* NewGraph(
-    const GraphDef&             meta_graph,
+    const GraphDef&             def,
    Workspace*                  ws);

 DECLARE_REGISTRY(

--- a/Dragon/include/core/operator.h
+++ b/Dragon/include/core/operator.h
@@ -142,7 +142,7 @@ class Operator : public OperatorBase {
        allow_run_ = true;
        allow_run_ &= MPICheck();
        allow_run_ &= (!(OutputSize() == 1 &&
-            Output(0)->name() == "ignore"));
+            Output(0)->name() == "NULL"));
    }

    /*! \brief Run this operator on the specified stream */
@@ -168,10 +168,10 @@ class Operator : public OperatorBase {
    /*! \brief Coordinate the context of inputs and outputs */
    virtual void MemorySwitch() {
        for (auto* e : inputs_)
-            if(e->name() != "ignore")
+            if(e->name() != "NULL")
                e->SwitchToDevice(ctx()->device_id());
        for (auto* e : outputs_)
-            if(e->name() != "ignore")
+            if(e->name() != "NULL")
                e->SwitchToDevice(ctx()->device_id());
    }


--- a/Dragon/include/core/operator_gradient.h
+++ b/Dragon/include/core/operator_gradient.h
@@ -76,23 +76,23 @@ class GradientMakerBase {

    const string I(const int i) const {
        return i < def.input_size() ?
-            def.input(i) : "ignore";
+            def.input(i) : "NULL";
    }

    const string O(const int i) const {
        return i < def.output_size() ?
-            def.output(i) : "ignore";
+            def.output(i) : "NULL";
    }

    string GI(const int i) {
-        if (i >= g_inputs_.size()) return "ignore";
+        if (i >= g_inputs_.size()) return "NULL";
        g_inputs_[i] = def.input(i) + "_grad";
        return g_inputs_[i];
    }

    const string GO(const int i) const {
        return i < g_outputs_.size() ?
-            g_outputs_[i] : "ignore";
+            g_outputs_[i] : "NULL";
    }

 protected:

--- a/Dragon/include/core/operator_schema.h
+++ b/Dragon/include/core/operator_schema.h
@@ -12,9 +12,9 @@

 #ifndef DRAGON_CORE_OPERATOR_SCHEMA_H_
 #define DRAGON_CORE_OPERATOR_SCHEMA_H_
- 
-#include <functional>
+
 #include <limits>
+#include <functional>

 #include "common.h"

@@ -92,7 +92,7 @@ class OpSchemaRegistry {
    }

 private:
-     static Map<string, OpSchema>& schema_map() {
+    static Map<string, OpSchema>& schema_map() {
        static Map<string, OpSchema> schema_map_;
        return schema_map_;
    }

--- a/Dragon/include/operators/misc/gradient_op.h
+++ b/Dragon/include/operators/misc/gradient_op.h
@@ -41,7 +41,7 @@ class GradientGatherOp final : public Operator<Context> {
    GradientGatherOp(const OperatorDef& def, Workspace* ws)
        : Operator<Context>(def, ws) {
        for (int i = 0; i < InputSize(); i++)
-            if (Input(i).name() != "ignore") indices.push_back(i);
+            if (Input(i).name() != "NULL") indices.push_back(i);
    }
    USE_OPERATOR_FUNCTIONS;


--- a/Dragon/include/operators/vision/conv_op.h
+++ b/Dragon/include/operators/vision/conv_op.h
@@ -43,7 +43,7 @@ class Conv2dGradientOp : public Conv2dOp<Context> {
    USE_OPERATOR_FUNCTIONS;
    USE_CONVOLUTION_FUNCTIONS;

-    bool HasBias() override { return Output(2)->name() != "ignore"; }
+    bool HasBias() override { return Output(2)->name() != "NULL"; }

    void RunOnDevice() override;
    template <typename T> void RunWithType();

--- a/Dragon/include/operators/vision/conv_transpose_op.h
+++ b/Dragon/include/operators/vision/conv_transpose_op.h
@@ -47,7 +47,7 @@ class ConvTranspose2dGradientOp : public ConvTranspose2dOp<Context> {
    USE_OPERATOR_FUNCTIONS;
    USE_CONVOLUTION_FUNCTIONS;

-    bool HasBias() override { return Output(2)->name() != "ignore"; }
+    bool HasBias() override { return Output(2)->name() != "NULL"; }

    void RunOnDevice() override;
    template <typename T> void RunWithType();

--- a/Dragon/include/operators/vision/depthwise_conv_op.h
+++ b/Dragon/include/operators/vision/depthwise_conv_op.h
@@ -50,7 +50,7 @@ class DepthwiseConv2dGradientOp
    USE_OPERATOR_FUNCTIONS;
    USE_CONVOLUTION_FUNCTIONS;

-    bool HasBias() override { return Output(2)->name() != "ignore"; }
+    bool HasBias() override { return Output(2)->name() != "NULL"; }

    void RunOnDevice() override;
    template <typename T> void RunWithType();

--- a/Dragon/include/utils/cuda_device.h
+++ b/Dragon/include/utils/cuda_device.h
@@ -131,7 +131,7 @@ struct CUDADeviceProps {
    CUDADeviceProps() : props(CUDA_NUM_DEVICES()) {
        for (int i = 0; i < CUDA_NUM_DEVICES(); ++i)
            CUDA_CHECK(cudaGetDeviceProperties(&props[i], i));
-}
+    }
    vector<cudaDeviceProp> props;
 };


--- a/Dragon/include/utils/proto_utils.h
+++ b/Dragon/include/utils/proto_utils.h
@@ -28,8 +28,7 @@ inline OperatorDef MakeOperatorDef(
    const IterableInputs&               inputs,
    const IterableOutputs&              outputs,
    const IterableArgs&                 args,
-    const DeviceOption&                 device_option,
-    const string&                       engine) {
+    const DeviceOption&                 device_option) {
    OperatorDef def;
    def.set_type(type);
    def.set_name(name);
@@ -51,8 +50,8 @@ inline OperatorDef MakeOperatorDef(
    const IterableOutputs&              outputs,
    const IterableArgs&                 args) {
    return MakeOperatorDef(
-        type, name, inputs, outputs, args,
-            DeviceOption(), "");
+        type, name, inputs, outputs,
+            args, DeviceOption());
 }

 template <class IterableInputs,
@@ -64,7 +63,7 @@ inline OperatorDef MakeOperatorDef(
    const IterableOutputs&              outputs) {
    return MakeOperatorDef(
        type, name, inputs, outputs,
-            vector<Argument>(), DeviceOption(), "");
+            vector<Argument>(), DeviceOption());
 }

 bool ParseProtoFromText(

--- a/Dragon/modules/cxx/dragon.cc
+++ b/Dragon/modules/cxx/dragon.cc
@@ -88,9 +88,8 @@ std::string CreateGraph(
    auto graph_def_copy(*graph_def);
    // Overwritten device options
    DeviceOption* device_option = graph_def_copy.mutable_device_option();
-    device_option->set_device_type((DeviceType)device.device_type());
+    device_option->set_device_type((DeviceTypeProto)device.device_type());
    device_option->set_device_id(device.device_id());
-    device_option->set_engine("CUDNN");
    auto* graph = ws->CreateGraph(graph_def_copy);
    if (!graph) LOG(FATAL) << "Can not create the graph.";
    return graph->name();

--- a/Dragon/modules/python/py_autograd.h
+++ b/Dragon/modules/python/py_autograd.h
@@ -53,6 +53,7 @@ void AddGradientMethods(pybind11::module& m) {
        if (is_sharing) maker.Share(backward_ops);
        pybind11::gil_scoped_release g;
        for (auto& op : backward_ops.op()) {
+            if (op.type().empty()) continue;
            if (verbose) std::cout << op.DebugString() << std::endl;
            if (op.has_uid()) ws()->RunOperator(op);
            else ws()->RunOperatorOnce(op);

--- a/Dragon/modules/python/py_cuda.h
+++ b/Dragon/modules/python/py_cuda.h
@@ -31,6 +31,13 @@ void AddCUDAMethods(pybind11::module& m) {
 #endif
    });

+    m.def("EnableCUDNN", [](bool enabled) {
+#ifdef WITH_CUDA
+        CUDAContext::cuda_object()
+            ->cudnn_enabled = enabled;
+#endif
+    });
+
    m.def("cudaGetDevice", []() {
        return CUDAContext::active_device_id();
    });

--- a/Dragon/python/dragon/config.py
+++ b/Dragon/python/dragon/config.py
@@ -27,9 +27,6 @@ option['device'] = 'cpu'
 # The device index
 option['device_id'] = 0

-# Whether to use cuDNN if possible
-option['use_cudnn'] = False
-
 # The global random seed
 option['random_seed'] = 3

@@ -77,15 +74,13 @@ def EnableCPU():
    option['device'] = 'cpu'


-def EnableCUDA(gpu_id=0, use_cudnn=True):
+def EnableCUDA(gpu_id=0):
    """Enable NVIDIA's CUDA mode globally.

    Parameters
    ----------
    gpu_id : int
        The index of GPU to use.
-    use_cudnn : boolean
-        Whether to use cuDNN if available.

    Returns
    -------
@@ -95,7 +90,6 @@ def EnableCUDA(gpu_id=0, use_cudnn=True):
    global option
    option['device'] = 'cuda'
    option['device_id'] = gpu_id
-    option['use_cudnn'] = use_cudnn


 def EnableCNML(mlu_id=0):

--- a/Dragon/python/dragon/core/cuda.py
+++ b/Dragon/python/dragon/core/cuda.py
@@ -24,12 +24,28 @@ def IsCUDADriverSufficient():
    Returns
    -------
    boolean
-        ``True`` if your device(s) support CUDA otherwise ``False``.
+        *True* if your device(s) support CUDA otherwise *False*.

    """
    return _C.IsCUDADriverSufficient()


+def EnableCUDNN(enabled=True):
+    """Enable the CuDNN engine.
+
+    Parameters
+    ----------
+    enabled : boolean
+        *True* to enable.
+
+    Returns
+    -------
+    None
+
+    """
+    return _C.EnableCUDNN(enabled)
+
+
 def GetDevice():
    """Get the current active cuda device.


--- a/Dragon/python/dragon/core/gradient_maker.py
+++ b/Dragon/python/dragon/core/gradient_maker.py
@@ -164,7 +164,7 @@ class GraphGradientMaker(object):
            is_skip, gen_grads = \
                cls.CheckGrad(forward_op, inputs_to_grads, blacklist, targets)
            # Missing grads are represented as ``None``
-            g_outputs = list(inputs_to_grads.get(name, 'ignore') for name in forward_op.output)
+            g_outputs = list(inputs_to_grads.get(name, 'NULL') for name in forward_op.output)
            g_ops, g_inputs, defaults = cls.CreateGrad(forward_op, g_outputs)

            # Append ops

--- a/Dragon/python/dragon/core/proto_utils.py
+++ b/Dragon/python/dragon/core/proto_utils.py
@@ -72,7 +72,7 @@ else:
 def MakeOperatorDef(
    op_type, inputs=(), outputs=(),
        name='', uid=None, device_option=None,
-            arg=None, engine=None, **kwargs):
+            arg=None, **kwargs):
    operator = pb.OperatorDef()
    operator.type = op_type
    operator.name = name
@@ -80,14 +80,12 @@ def MakeOperatorDef(
    operator.output.extend([str(tensor) for tensor in outputs])
    if device_option is not None:
        operator.device_option.CopyFrom(device_option)
-        if engine is not None:
-            operator.device_option.engine = engine
    if 'random_seed' in kwargs:
        operator.device_option.random_seed = kwargs['random_seed']
        del kwargs['random_seed']
    if uid is not None: operator.uid = uid
    if arg is not None: operator.arg.extend(arg)
-    for k,v in kwargs.items():
+    for k, v in kwargs.items():
        if v is None: continue
        operator.arg.add().CopyFrom(MakeArgument(k,v))
    return operator
@@ -96,46 +94,36 @@ def MakeOperatorDef(
 def MakeCXXOperatorDef(
    op_type, inputs=(), outputs=(),
        name='', uid=None, device_option=None,
-            arg=None, engine=None, **kwargs):
+            arg=None, **kwargs):
    c_def = _C.OperatorDef()
    py_def = MakeOperatorDef(
        op_type, inputs, outputs, name, uid,
-            device_option, arg, engine, **kwargs)
+            device_option, arg, **kwargs)
    c_def.ParseFrom(py_def.SerializeToString())
    return c_def


-def MakeDeviceOption(
-    device_type, device_id,
-        engine=None, rng_seed=None):
+def MakeDeviceOption(device_type, device_id, rng_seed=None):
    option = pb.DeviceOption()
    option.device_type = device_type
    option.device_id = device_id
-    if engine is not None: option.engine = engine
    if rng_seed is not None: option.random_seed = rng_seed
    return option


 _PREDEFINED_DEVICE_LIMITS = 16
-_PREDEFINED_DEVICE_ENGINES = ['', 'CUDNN']
 _PREDEFINED_DEVICE_DICT = {'cpu': 0, 'cuda': 1, 'cnml': 2}
 _PREDEFINED_DEVICE_OPTION_DICT = {}


 for i in range(_PREDEFINED_DEVICE_LIMITS):
    for device, identify in _PREDEFINED_DEVICE_DICT.items():
-        for engine in _PREDEFINED_DEVICE_ENGINES:
-            _PREDEFINED_DEVICE_OPTION_DICT[(device, i, engine)] = \
-                MakeDeviceOption(identify, i, engine)
-        if device == 'cuda':
-            _PREDEFINED_DEVICE_OPTION_DICT[('cuda', i)] = \
-                MakeDeviceOption(identify, i, 'CUDNN')
-
-
-def GetDeviceOption(
-    device_type, device_id=0,
-        engine=None, rng_seed=None):
-    ctx = (device_type, device_id, engine if engine else '')
+        _PREDEFINED_DEVICE_OPTION_DICT[(device, i)] = \
+            MakeDeviceOption(identify, i)
+
+
+def GetDeviceOption(device_type, device_id=0, rng_seed=None):
+    ctx = (device_type, device_id)
    option = _PREDEFINED_DEVICE_OPTION_DICT[ctx]
    if rng_seed is not None:
        option_copy = copy.deepcopy(option)
@@ -149,16 +137,15 @@ def GetDefaultDeviceOption():
    if device_info is not None:
        return GetDeviceOption(
            device_info['device_type'],
-            device_info['device_id'],
-            device_info['device_engine'])
+                device_info['device_id'])
    return None


 def GetGlobalDeviceOption():
    option = cfg.GetGlobalOptions()
    return GetDeviceOption(
-        option['device'], option['device_id'],
-            'CUDNN' if option['use_cudnn'] else '')
+        option['device'],
+            option['device_id'])


 # Fix the python stdout

--- a/Dragon/python/dragon/core/scope.py
+++ b/Dragon/python/dragon/core/scope.py
@@ -128,7 +128,7 @@ def name_scope(name):
    return _GLOBAL_TENSOR_STACK.get_controller(default)


-def device_scope(device_type, device_id=0, engine='AUTO'):
+def device_scope(device_type, device_id=0):
    """Nest the the specific device info.

    Parameters
@@ -137,20 +137,15 @@ def device_scope(device_type, device_id=0, engine='AUTO'):
        The type of device.
    device_id : int, optional
        The index of the device.
-    engine : {'AUTO', 'CUDNN'}, optional
-        The auxiliary accelerating library to use.

    """
-    device_type, device_id, device_engine = \
-        device_type.upper(), device_id, engine.upper()
+    device_type = device_type.lower()
    assert device_type in ['cpu', 'gpu', 'cuda', 'cnml']
    # Default names
    if device_type == 'gpu': device_type = 'cuda'
-    if device_engine == 'AUTO': device_engine = 'CUDNN'
    return _GLOBAL_DEVICE_STACK.get_controller({
        'device_type': device_type,
-        'device_id': device_id,
-        'device_engine': device_engine})
+            'device_id': device_id})


 def phase_scope(phase):
@@ -209,7 +204,7 @@ def get_default_device():

    The device dict contains the following keys:

-    (``device_type``, ``device_id``, ``device_engine``).
+    (``device_type``, ``device_id``).

    Returns
    -------

--- a/Dragon/python/dragon/operators/contrib/rcnn/ops.py
+++ b/Dragon/python/dragon/operators/contrib/rcnn/ops.py
@@ -32,29 +32,29 @@ def Proposal(inputs, strides, ratios, scales,

    Parameters
    ----------
-    inputs : list of Tensor
+    inputs : sequence of Tensor
        The inputs.
-    strides : list of int
+    strides : sequence of int
        The strides of anchors.
-    ratios : list of float
+    ratios : sequence of float
        The ratios of anchors.
-    scales : list of float
+    scales : sequence of float
        The scales of anchors.
-    pre_nms_top_n : int
+    pre_nms_top_n : int, optional, default=6000
        The number of anchors before nms.
-    post_nms_top_n : int
+    post_nms_top_n : int, optional, default=300
        The number of anchors after nms.
-    nms_thresh : float
+    nms_thresh : float, optional, default=0.7
        The threshold of nms.
-    min_size : int
+    min_size : int, optional, default=16
        The min size of anchors.
-    min_level : int
+    min_level : int, optional, default=2
        Finest level of the FPN pyramid.
-    max_level : int
+    max_level : int, optional, default=5
        Coarsest level of the FPN pyramid.
-    canonical_scale : int
+    canonical_scale : int, optional, default=224
        The baseline scale of mapping policy.
-    canonical_level : int
+    canonical_level : int, optional, default=4
        Heuristic level of the canonical scale.

    Returns

--- a/Dragon/python/dragon/operators/custom/data_process.py
+++ b/Dragon/python/dragon/operators/custom/data_process.py
-# ------------------------------------------------------------
-# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
-#
-# Licensed under the BSD 2-Clause License.
-# You should have received a copy of the BSD 2-Clause License
-# along with the software. If not, See,
-#
-#      <https://opensource.org/licenses/BSD-2-Clause>
-#
-# ------------------------------------------------------------
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import dragon as dg
-from multiprocessing import Process, Queue
-
-
-class Fetcher(Process):
-    def __init__(self, queue):
-        super(Fetcher, self).__init__()
-        self._queue = queue
-        self.daemon = True
-
-        def cleanup():
-            print('Terminating Fetcher......')
-            self.terminate()
-            self.join()
-
-        import atexit
-        atexit.register(cleanup)
-
-    def run(self):
-        while True:
-            self._queue.put(np.ones((5, 10)))
-
-
-class DataProcessOp(object):
-    """How to custom a RunOp for data processing.
-
-    Examples
-    --------
-    >>> import dragon as dg
-    >>> y = dg.ops.Run([], module=__name__, op='DataProcessOp', num_outputs=1)
-    >>> foo = dg.function(outputs=y)
-    >>> foo()
-    >>> print(y.get_value())
-    >>> [[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]
-         [ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]
-         [ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]
-         [ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]
-         [ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]]
-
-    """
-    def setup(self, inputs, outputs):
-        """Setup for params or options.
-
-        Parameters
-        ----------
-        inputs : list of str
-            Indicating the name of input tensors.
-        outputs : list of str
-            Indicating the name of output tensors.
-
-        Returns
-        -------
-        None
-
-        """
-        self._queue = Queue(100)
-        self._fetcher = Fetcher(self._queue)
-        self._fetcher.start()
-
-
-    def run(self, inputs, outputs):
-        """Run method, i.e., forward pass.
-
-        Parameters
-        ----------
-        inputs : list of str
-            Indicating the name of input tensors.
-        outputs : list of str
-            Indicating the name of output tensors.
-
-        Returns
-        -------
-        None
-
-        """
-        dg.workspace.FeedTensor(outputs[0], self._queue.get())
-
-
-if __name__ == '__main__':
-    # Def
-    y = dg.ops.Run([], module=__name__, op='DataProcessOp', num_outputs=1)
-    foo = dg.function(outputs=y)
-
-    # Run
-    foo()
-
-    # Fetch
-    print(y.get_value())
\ No newline at end of file
--- a/Dragon/python/dragon/operators/custom/minibatch.py
+++ b/Dragon/python/dragon/operators/custom/minibatch.py
@@ -9,23 +9,26 @@
 #
 # ------------------------------------------------------------

-import dragon.core.workspace as ws
-from dragon.utils.vision import DataBatch
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import dragon
+import dragon.utils.vision


 class MiniBatchOp(object):
-    """How to form a minibatch based on `dragon.io`_ package.
+    """Form a mini-batch based on `dragon.utils.vision`_ package."""

-    """
    def setup(self, inputs, outputs):
        """Setup for params or options.

        Parameters
        ----------
-        inputs : list of str
-            Indicating the name of input tensors.
-        outputs : list of str
-            Indicating the name of output tensors.
+        inputs : sequence of str
+            The name of inputs.
+        outputs : sequence of str
+            The name of outputs.

        Returns
        -------
@@ -33,18 +36,17 @@ class MiniBatchOp(object):

        """
        kwargs = eval(self.param_str)
-        self._data_batch = DataBatch(**kwargs)
-
+        self._data_batch = dragon.utils.vision.DataBatch(**kwargs)

    def run(self, inputs, outputs):
        """Run method, i.e., forward pass.

        Parameters
        ----------
-        inputs : list of str
-            Indicating the name of input tensors.
-        outputs : list of str
-            Indicating the name of output tensors.
+        inputs : sequence of str
+            The name of inputs.
+        outputs : sequence of str
+            The name of outputs.

        Returns
        -------
@@ -53,4 +55,4 @@ class MiniBatchOp(object):
        """
        blobs = self._data_batch.get()
        for idx, blob in enumerate(blobs):
-            ws.FeedTensor(outputs[idx], blob)
\ No newline at end of file
+            dragon.workspace.FeedTensor(outputs[idx], blob)
\ No newline at end of file
--- a/Dragon/python/dragon/operators/custom/vec_mult.py
+++ b/Dragon/python/dragon/operators/custom/vec_mult.py
-# ------------------------------------------------------------
-# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
-#
-# Licensed under the BSD 2-Clause License.
-# You should have received a copy of the BSD 2-Clause License
-# along with the software. If not, See,
-#
-#      <https://opensource.org/licenses/BSD-2-Clause>
-#
-# ------------------------------------------------------------
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import dragon as dg
-
-
-class VecMultOp(object):
-    """How to custom a TemplateOp for Vector Multiplication.
-
-    Examples
-    --------
-    >>> import dragon as dg
-    >>> x1 = dg.Tensor('x1').Variable()
-    >>> x2 = dg.Tensor('x2').Variable()
-    >>> y = dg.ops.Template([x1, x2], module=__name__, op='VecMultOp', num_outputs=1)
-    >>> dx1 = dg.grad(y, x1)
-    >>> dx2 = dg.grad(y, x2)
-    >>> foo = dg.function(outputs=y)
-    >>> dg.workspace.FeedTensor(x1, np.ones((5, 3), dtype=np.float32))
-    >>> dg.workspace.FeedTensor(x2, np.ones((5, 3), dtype=np.float32) * 5.0)
-    >>> foo()
-
-    >>> print(y.get_value())
-    >>> [[ 5.  5.  5.]
-         [ 5.  5.  5.]
-         [ 5.  5.  5.]
-         [ 5.  5.  5.]
-         [ 5.  5.  5.]]
-
-    >>> print(dx1.get_value())
-    >>> [[ 5.  5.  5.]
-         [ 5.  5.  5.]
-         [ 5.  5.  5.]
-         [ 5.  5.  5.]
-         [ 5.  5.  5.]]
-
-    >>> print(dx2.get_value())
-    >>>  [[ 1.  1.  1.]
-          [ 1.  1.  1.]
-          [ 1.  1.  1.]
-          [ 1.  1.  1.]
-          [ 1.  1.  1.]]
-
-    """
-    def setup(self, inputs, outputs):
-        """Setup for params or options.
-
-        Parameters
-        ----------
-        inputs : list of str
-            Indicating the name of input tensors.
-        outputs : list of str
-            Indicating the name of output tensors.
-
-        Returns
-        -------
-        None
-
-        """
-        pass
-
-
-    def run(self, inputs, outputs):
-        """Run method, i.e., forward pass.
-
-        Parameters
-        ----------
-        inputs : list of str
-            Indicating the name of input tensors.
-        outputs : list of str
-            Indicating the name of output tensors.
-
-        Returns
-        -------
-        None
-
-        """
-        x1 = dg.workspace.FetchTensor(inputs[0])
-        x2 = dg.workspace.FetchTensor(inputs[1])
-        dg.workspace.FeedTensor(outputs[0], x1 * x2) # call numpy mult
-
-    def grad(self, inputs, outputs):
-        """Gradient method, i.e., backward pass.
-
-        Parameters
-        ----------
-        inputs : list of str
-            Indicating the name of input tensors.
-        outputs : list of str
-            Indicating the name of output tensors.
-
-        Returns
-        -------
-        None
-
-        """
-        x1 = dg.workspace.FetchTensor(inputs[0])
-        x2 = dg.workspace.FetchTensor(inputs[1])
-        dy = dg.workspace.FetchTensor(inputs[-1])
-        dx1 = dy * x2
-        dx2 = dy * x1
-        dg.workspace.FeedTensor(outputs[0], dx1)
-        dg.workspace.FeedTensor(outputs[1], dx2)
-
-
-if __name__ == '__main__':
-    # Def
-    x1 = dg.Tensor('x1').Variable()
-    x2 = dg.Tensor('x2').Variable()
-    y = dg.ops.Template([x1, x2], module=__name__, op='VecMultOp', num_outputs=1)
-    dx1 = dg.grad(y, x1)
-    dx2 = dg.grad(y, x2)
-    foo = dg.function(outputs=y)
-
-    # Feed
-    dg.workspace.FeedTensor(x1, np.ones((5, 3), dtype=np.float32))
-    dg.workspace.FeedTensor(x2, np.ones((5, 3), dtype=np.float32) * 5.0)
-
-    # Run
-    foo()
-
-    # Fetch
-    print('y \n-------------- \n', y.get_value(), '\n')
-    print('dx1 \n-------------- \n', dx1.get_value(), '\n')
-    print('dx2 \n-------------- \n', dx2.get_value(), '\n')
\ No newline at end of file
--- a/Dragon/python/dragon/operators/data.py
+++ b/Dragon/python/dragon/operators/data.py
@@ -25,40 +25,40 @@ def LMDBData(**kwargs):
    ----------
    source : str
        The path of database.
-    shuffle : bool
+    shuffle : bool, optional, default=False
        Whether to shuffle the data.
    node_step: bool
        Whether to split data for multiple parallel nodes.
-    num_chunks : int
-        The number of chunks to split. Default is ``2048``.
-    chunk_size : int
-        The size(MB) of each chunk. Default is -1 (Refer ``num_chunks``).
-    mean_values : list
+    num_chunks : int, optional, default=2048
+        The number of chunks to split.
+    chunk_size : int, optional, default=-1
+        The size(MB) of each chunk.
+    mean_values : list, optional
        The mean value of each image channel.
-    scale : float
-        The scale performed after mean subtraction. Default is ``1.0``.
-    padding : int
-        The zero-padding size. Default is ``0``.
-    crop_size : int
-        The crop size. Default is ``0`` (Disabled).
-    mirror : bool
-        Whether to mirror(flip horizontally) images. Default is ``False``.
-    color_augmentation : bool
-        Whether to use color distortion. Default is ``False``.
-    min_random_scale : float
-        The min scale of the input images. Default is ``1.0``.
-    max_random_scale : float
-        The max scale of the input images. Default is ``1.0``.
-    force_gray : bool
-        Set not to duplicate channel for gray. Default is ``False``.
-    phase : str
-        The phase of this operator, ``TRAIN`` or ``TEST``.
-    batch_size : int
+    scale : float, optional, default=1.
+        The scale performed after mean subtraction.
+    padding : int, optional, default=0
+        The zero-padding size.
+    crop_size : int, optional, default=0
+        The cropping size.
+    mirror : bool, optional, default=False
+        Whether to mirror(flip horizontally) images.
+    color_augmentation : bool, optional, default=False
+        Whether to use color distortion.
+    min_random_scale : float, optional, default=1.
+        The min scale of the input images.
+    max_random_scale : float, optional, default=1.
+        The max scale of the input images.
+    force_gray : bool, optional, default=False
+        Set not to duplicate channel for gray.
+    phase : {'TRAIN', 'TEST'}, optional
+        The phase of this operator.
+    batch_size : int, optional, default=128
        The size of a mini-batch.
-    partition : bool
-        Whether to partition batch for parallelism. Default is ``False``.
-    prefetch : int
-        The prefetch count. Default is ``5``.
+    partition : bool, optional, default=False
+        Whether to partition batch for parallelism.
+    prefetch : int, optional, default=5
+        The prefetch count.

    Returns
    -------
@@ -85,8 +85,7 @@ def LMDBData(**kwargs):
 @OpSchema.Inputs(1)
 def ImageData(
    inputs, mean_values=None, std_values=None,
-        dtype='float32', data_format='NCHW', **kwargs
-):
+        dtype='float32', data_format='NCHW', **kwargs):
    """Process the images from 4D raw data.

    Note that we assume the data format of raw data is **NHWC**.
@@ -99,10 +98,10 @@ def ImageData(
        The optional mean values to subtract.
    std_values : sequence of float, optional
        The optional std values to divide.
-    dtype : str
-        The type of output. ``float32`` or ``float16``.
-    data_format : str
-        The data format of output. ``NCHW`` or ``NHWC``.
+    dtype : {'float16', 'float32'}, optional
+        The data type of output.
+    data_format : {'NCHW', 'NHWC'}, optional
+        The data format of output.

    Returns
    -------

--- a/Dragon/python/dragon/operators/vision.py
+++ b/Dragon/python/dragon/operators/vision.py
@@ -52,7 +52,7 @@ def Conv2d(
        The inputs, represent [input, weights] + [bias].
    num_output : int
        The output channels of convolution.
-    kernel_shape : sequence of int.
+    kernel_shape : sequence of int
        The shape of convolution kernel.
    strides : sequence of int, optional, default=1
        The stride(s) of convolution.

--- a/Dragon/python/dragon/proto/dragon.proto
+++ b/Dragon/python/dragon/proto/dragon.proto
 // Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
 // Licensed under the BSD 2-Clause License.

+// Codes are based on:
+// https://github.com/pytorch/pytorch/blob/master/caffe2/proto/caffe2.proto
+
 syntax = "proto2";

 package dragon;

+// Store the serialized Tensor objects.
 message TensorProto {
    repeated int32 dims = 1;
    enum DataType {
        UNDEFINED = 0;
+        // Basic types.
        FLOAT = 1;
        INT32 = 2;
        BYTE = 3;
        STRING = 4;
+
+        // Less-commonly used data types.
+        BOOL = 5;
+        UINT8 = 6;
+        INT8 = 7;
+        UINT16 = 8;
+        INT16 = 9;
+        INT64 = 10;
        FLOAT16 = 12;
+        DOUBLE = 13;
    }
    optional DataType data_type = 2 [default = FLOAT];
+    // For float.
    repeated float float_data = 3 [packed = true];
+    // For int32, uint8, int8, uint16, int16, bool, and float16.
+    // Note about float16: in storage, float16 values are converted byte-wise
+    // to unsigned short and then stored in the int32_data field.
    repeated int32 int32_data = 4 [packed = true];
+    // For bytes.
    optional bytes byte_data = 5;
+    // For strings.
    repeated bytes string_data = 6;
+    // For double.
+    repeated double double_data = 9 [packed = true];
+    // For int64.
+    repeated int64 int64_data = 10 [packed = true];
+    // Store the raw data, contents are serialized as little-endian.
+    optional bytes raw_data = 13;
+
+    // Optionally, a name for the tensor.
    optional string name = 7;
 }

+// Record the filler of Tensor.
+// This structure is kept for backward compatibility
+// with caffe1, which relies on an implicit initializer.
 message TensorFillerProto {
    optional string tensor = 1;
    optional string type = 2 [default = 'constant'];
@@ -36,67 +67,120 @@ message TensorFillerProto {
    optional VarianceNorm variance_norm = 9 [default = FAN_IN];
 }

+// Store multiple TensorProto objects in one single proto.
 message TensorProtos {
  repeated TensorProto protos = 1;
 }

-enum DeviceType { 
-	PROTO_CPU = 0;
-	PROTO_CUDA = 1;
-	PROTO_CNML = 2;
+// DeviceType that Dragon currently supports.
+enum DeviceTypeProto {
+    // The default device.
+    PROTO_CPU = 0;
+    // NVIDIA's CUDA Environment.
+    PROTO_CUDA = 1;
+    // CAMBRICON's CNML Environment.
+    PROTO_CNML = 2;
 }

+// Device-specific options.
 message DeviceOption {
-    optional DeviceType device_type = 1 [default = PROTO_CPU];
+    // The type of device to dispatch executions.
+    optional DeviceTypeProto device_type = 1 [default = PROTO_CPU];
+    // The index of this device.
    optional int32 device_id = 2 [default = 0];
+    // The random seed to start the random generator.
    optional uint32 random_seed = 3 [default = 3];
-    optional string engine = 4;
 }

+// A named argument containing either singular float, integer and string
+// values, or repeated float, int and string arrays.
 message Argument {
+    // The name of this argument.
    optional string name = 1;
+
+    // Store the float32 value.
    optional float f = 2;
+    // Store the bool, int32, int64 value.
    optional int64 i = 3;
+    // Store the string value.
    optional bytes s = 4;

+    // Store the float32 values.
    repeated float floats = 7;
+    // Store the bool, int32, int64 values.
    repeated int64 ints = 8;
+    // Store the string values.
    repeated bytes strings = 9;
 }

+// Operator Definition
 message OperatorDef {
+    // The unique id of this operator.
+    // Set it to persist operators in the dynamic graph.
    optional string uid = 1;
+
+    // The name of inputs.
    repeated string input = 2;
+    // The name of outputs.
    repeated string output = 3;
+
+    // The optional name of this operator.
    optional string name = 4;
+    // The operator type.
    optional string type = 5;
+    // The arguments.
    repeated Argument arg = 6;
+
+    // The device option that the operator should run under.
    optional DeviceOption device_option = 7;
 }

+// Record the gradient information
 message GradientProto {
+    // The derivative target.
    optional string cost = 1;
+    // The target to differentiate with respect to.
    optional string wrt = 2;
+    // The external gradient
    optional string external = 3;
 }

+// Record the updater information
 message UpdaterProto {
+    // The operator name to use.
    optional string name = 1;
+    // The operator type.
    optional string type = 2;
+    // The tensor to update.
    repeated string tensor = 3;
+    // The arguments.
    repeated Argument arg = 4;
 }

+// Graph Definition
 message GraphDef {
+    // The graph name.
    optional string name = 1;
+
+    // The operators to execute.
    repeated OperatorDef op = 2;
+
+    // The type of graph.
    optional string graph_type = 3;
+
+    // The device option for this graph.
    optional DeviceOption device_option = 5;
+
+    // The arguments.
    repeated Argument arg = 6;

+    // The name of inputs.
    repeated string input = 7;
+    // The name of outputs.
    repeated string output = 8;

+    // The gradients information.
    repeated GradientProto gradient = 9;
+    // The updaters information.
    repeated UpdaterProto updater = 10;
 }
\ No newline at end of file
--- a/Dragon/python/dragon/utils/vision/blob_fetcher.py
+++ b/Dragon/python/dragon/utils/vision/blob_fetcher.py
@@ -28,12 +28,12 @@ class BlobFetcher(Process):

        Parameters
        ----------
-        batch_size : int
-            The size of a training batch.
-        partition : boolean
-            Whether to partition batch. Default is ``False``.
-        prefetch : int
-            The prefetch count. Default is ``5``.
+        batch_size : int, optional, default=128
+            The size of a mini-batch.
+        partition : bool, optional, default=False
+            Whether to partition batch for parallelism.
+        prefetch : int, optional, default=5
+            The prefetch count.

        """
        super(BlobFetcher, self).__init__()

--- a/Dragon/python/dragon/utils/vision/data_batch.py
+++ b/Dragon/python/dragon/utils/vision/data_batch.py
@@ -40,38 +40,38 @@ class DataBatch(object):
        ----------
        source : str
            The path of database.
-        multiple_nodes: boolean
-            Whether to split data for multiple parallel nodes. Default is ``False``.
-        shuffle : boolean
-            Whether to shuffle the data. Default is ``False``.
-        num_chunks : int
-            The number of chunks to split. Default is ``2048``.
-        chunk_size : int
-            The size(MB) of each chunk. Default is -1 (Refer ``num_chunks``).
-        padding : int
-            The zero-padding size. Default is ``0`` (Disabled).
-        fill_value : int
-            The value to fill when padding is valid. Default is ``127``.
-        crop_size : int
-            The crop size. Default is ``0`` (Disabled).
-        mirror : boolean
-            Whether to flip(horizontally) images. Default is ``False``.
-        color_augmentation : boolean
-            Whether to distort colors. Default is ``False``.
-        min_random_scale : float
-            The min scale of the input images. Default is ``1.0``.
-        max_random_scale : float
-            The max scale of the input images. Default is ``1.0``.
-        force_color : boolean
-            Set to duplicate channels for gray. Default is ``False``.
-        phase : str
-            The phase of this operator, ``TRAIN`` or ``TEST``. Default is ``TRAIN``.
-        batch_size : int
-            The size of a training batch.
-        partition : boolean
-            Whether to partition batch. Default is ``False``.
-        prefetch : int
-            The prefetch count. Default is ``5``.
+        multiple_nodes : bool, optional, default=False
+            Whether to split data for multiple parallel nodes.
+        shuffle : bool, optional, default=False
+            Whether to shuffle the data.
+        num_chunks : int, optional, default=2048
+            The number of chunks to split.
+        chunk_size : int, optional, default=-1
+            The size(MB) of each chunk.
+        padding : int, optional, default=0
+            The zero-padding size.
+        fill_value : int, optional, default=127
+            The value to fill when padding is valid.
+        crop_size : int, optional, default=0
+            The cropping size.
+        mirror : bool, optional, default=False
+            Whether to mirror(flip horizontally) images.
+        color_augmentation : bool, optional, default=False
+            Whether to use color distortion.
+        min_random_scale : float, optional, default=1.
+            The min scale of the input images.
+        max_random_scale : float, optional, default=1.
+            The max scale of the input images.
+        force_gray : bool, optional, default=False
+            Set not to duplicate channel for gray.
+        phase : {'TRAIN', 'TEST'}, optional
+            The optional running phase.
+        batch_size : int, optional, default=128
+            The size of a mini-batch.
+        partition : bool, optional, default=False
+            Whether to partition batch for parallelism.
+        prefetch : int, optional, default=5
+            The prefetch count.

        """
        super(DataBatch, self).__init__()
@@ -109,7 +109,7 @@ class DataBatch(object):
                    self._num_transformers += 1
        self._num_transformers = min(self._num_transformers, self._max_transformers)

-        self._batch_size = kwargs.get('batch_size', 100)
+        self._batch_size = kwargs.get('batch_size', 128)
        self._partition = kwargs.get('partition', False)
        if self._partition:
            self._batch_size = int(self._batch_size / kwargs['group_size'])

--- a/Dragon/python/dragon/utils/vision/data_reader.py
+++ b/Dragon/python/dragon/utils/vision/data_reader.py
@@ -35,14 +35,14 @@ class DataReader(Process):
        ----------
        source : str
            The path of database.
-        multiple_nodes: boolean
-            Whether to split data for multiple parallel nodes. Default is ``False``.
-        shuffle : boolean
-            Whether to shuffle the data. Default is ``False``.
-        num_chunks : int
-            The number of chunks to split. Default is ``2048``.
-        chunk_size : int
-            The size(MB) of each chunk. Default is -1 (Refer ``num_chunks``).
+        multiple_nodes : bool, optional, default=False
+            Whether to split data for multiple parallel nodes.
+        shuffle : bool, optional, default=False
+            Whether to shuffle the data.
+        num_chunks : int, optional, default=2048
+            The number of chunks to split.
+        chunk_size : int, optional, default=-1
+            The size(MB) of each chunk.

        """
        super(DataReader, self).__init__()

--- a/Dragon/python/dragon/utils/vision/data_transformer.py
+++ b/Dragon/python/dragon/utils/vision/data_transformer.py
@@ -42,24 +42,24 @@ class DataTransformer(Process):

        Parameters
        ----------
-        padding : int
-            The padding size. Default is ``0`` (Disabled).
-        fill_value : int
-            The value to fill when padding is valid. Default is ``127``.
-        crop_size : int
-            The crop size. Default is ``0`` (Disabled).
-        mirror : boolean
-            Whether to flip(horizontally) images. Default is ``False``.
-        color_augmentation : boolean
-            Whether to distort colors. Default is ``False``.
-        min_random_scale : float
-            The min scale of the input images. Default is ``1.0``.
-        max_random_scale : float
-            The max scale of the input images. Default is ``1.0``.
-        force_color : boolean
-            Set to duplicate channels for gray. Default is ``False``.
-        phase : str
-            The phase of this operator, ``TRAIN`` or ``TEST``. Default is ``TRAIN``.
+        padding : int, optional, default=0
+            The zero-padding size.
+        fill_value : int, optional, default=127
+            The value to fill when padding is valid.
+        crop_size : int, optional, default=0
+            The cropping size.
+        mirror : bool, optional, default=False
+            Whether to mirror(flip horizontally) images.
+        color_augmentation : bool, optional, default=False
+            Whether to use color distortion.
+        min_random_scale : float, optional, default=1.
+            The min scale of the input images.
+        max_random_scale : float, optional, default=1.
+            The max scale of the input images.
+        force_gray : bool, optional, default=False
+            Set not to duplicate channel for gray.
+        phase : {'TRAIN', 'TEST'}, optional
+            The optional running phase.

        """
        super(DataTransformer, self).__init__()

--- a/Dragon/python/dragon/vm/theano/compile/function.py
+++ b/Dragon/python/dragon/vm/theano/compile/function.py
@@ -183,8 +183,6 @@ def GraphDef_Device(graph_def):
        device_option.device_type = supports[option['device']]
        device_option.device_id = option['device_id']
        device_option.random_seed = option['random_seed']
-        if option['device'] == 'cuda':
-            if option['use_cudnn']: device_option.engine = 'CUDNN'
        graph_def.device_option.CopyFrom(device_option)



--- a/Dragon/python/dragon/vm/torch/execution.py
+++ b/Dragon/python/dragon/vm/torch/execution.py
@@ -93,14 +93,14 @@ def RunOperator(
            op_name = recorder.append(op)
            op.name = op_name
            for ix in range(len(outputs)):
-                outputs[ix]._requires_grad = True
+                outputs[ix].requires_grad = True
                outputs[ix].__jit_recorder__ = recorder
                if len(ignored_grads) > 0:
                    outputs[ix]._ignored_grads = ignored_grads
        else:
            # Reset status
            for ix in range(len(outputs)):
-                outputs[ix]._requires_grad = False
+                outputs[ix].requires_grad = False

    # Callback on Run
    if callback_on_run: callback_on_run(op_name)

--- a/Dragon/python/dragon/vm/torch/module.py
+++ b/Dragon/python/dragon/vm/torch/module.py
@@ -315,9 +315,8 @@ class Module(object):
                op_type=self.op_meta['op_type'],
                device_option=proto_utils.
                    GetDeviceOption(
-                    self._device.type,
-                        self._device.index,
-                            engine='CUDNN'),
+                        self._device.type,
+                            self._device.index),
                **self.op_meta['arguments']
            )


--- a/Dragon/python/dragon/vm/torch/nn/modules/rnn.py
+++ b/Dragon/python/dragon/vm/torch/nn/modules/rnn.py
@@ -413,12 +413,15 @@ class RNNCellBase(Module):
        for weight in self.parameters():
            weight.data.uniform_(-stdv, stdv)

+from .activation import Tanh, Sigmoid

 class LSTMCell(RNNCellBase):
    def __init__(self, input_size, hidden_size, bias=True):
        super(LSTMCell, self).__init__(
            input_size, hidden_size, bias, num_chunks=4)
        self.register_op()
+        self.tanh = Tanh()
+        self.sigmoid = Sigmoid()

    def register_op(self):
        self.op_meta = {'op_type': 'LSTMCell', 'arguments': {}}

--- a/Dragon/python/dragon/vm/torch/ops/modules/array.py
+++ b/Dragon/python/dragon/vm/torch/ops/modules/array.py
@@ -349,7 +349,7 @@ class OneHot(BaseModule):
    def forward(self, x):
        inputs = [x]; self.unify_devices(inputs)
        outputs = [self.register_output()]
-        return self.run(inputs, outputs)
+        with no_grad(): return self.run(inputs, outputs)


 class Cast(BaseModule):
@@ -376,8 +376,7 @@ class Cast(BaseModule):
            y = self.run(inputs, outputs)
        else:
            self.unify_devices([x])
-            with no_grad():
-                y = self.run([], [x])
+            with no_grad(): y = self.run([], [x])
        return y


@@ -400,4 +399,4 @@ class Multinomial(BaseModule):
    def forward(self, x, y):
        inputs = [x]; self.unify_devices(inputs)
        outputs = [y] if y else [self.register_output()]
-        return self.run(inputs, outputs)
\ No newline at end of file
+        with no_grad(): return self.run(inputs, outputs)
\ No newline at end of file
--- a/Dragon/src/contrib/rcnn/bbox_utils.h
+++ b/Dragon/src/contrib/rcnn/bbox_utils.h
@@ -60,6 +60,37 @@ inline int BBoxTransform(
    return (bbox_w >= min_box_w) * (bbox_h >= min_box_h);
 }

+template <typename T>
+inline void BBoxTransform(
+    const T                         dx,
+    const T                         dy,
+    const T                         d_log_w,
+    const T                         d_log_h,
+    const T                         im_w,
+    const T                         im_h,
+    const T                         im_scale,
+    T*                              bbox) {
+    const T w = bbox[2] - bbox[0] + 1;
+    const T h = bbox[3] - bbox[1] + 1;
+    const T ctr_x = bbox[0] + (T)0.5 * w;
+    const T ctr_y = bbox[1] + (T)0.5 * h;
+
+    const T pred_ctr_x = dx * w + ctr_x;
+    const T pred_ctr_y = dy * h + ctr_y;
+    const T pred_w = exp(d_log_w) * w;
+    const T pred_h = exp(d_log_h) * h;
+
+    bbox[0] = pred_ctr_x - (T)0.5 * pred_w;
+    bbox[1] = pred_ctr_y - (T)0.5 * pred_h;
+    bbox[2] = pred_ctr_x + (T)0.5 * pred_w;
+    bbox[3] = pred_ctr_y + (T)0.5 * pred_h;
+
+    bbox[0] = std::max((T)0, std::min(bbox[0], im_w - 1)) / im_scale;
+    bbox[1] = std::max((T)0, std::min(bbox[1], im_h - 1)) / im_scale;
+    bbox[2] = std::max((T)0, std::min(bbox[2], im_w - 1)) / im_scale;
+    bbox[3] = std::max((T)0, std::min(bbox[3], im_h - 1)) / im_scale;
+}
+
 /******************** Anchor ********************/

 template <typename T>
@@ -117,6 +148,38 @@ inline void GenerateGridAnchors(
    }
 }

+template <typename T>
+inline void GenerateGridAnchors(
+    const int                       num_proposals,
+    const int                       num_classes,
+    const int                       num_anchors,
+    const int                       feat_h,
+    const int                       feat_w,
+    const int                       stride,
+    const int                       base_offset,
+    const T*                        anchors,
+    const int64_t*                  indices,
+    T*                              proposals) {
+    T x, y;
+    int idx_4d, a, h, w;
+    int lr = num_classes * base_offset;
+    int rr = num_classes * (num_anchors * feat_h * feat_w);
+    for (int i = 0; i < num_proposals; ++i) {
+        idx_4d = (int)indices[i] - lr;
+        if (idx_4d >= 0 && idx_4d < rr) {
+            idx_4d /= num_classes;
+            w = idx_4d % feat_w;
+            h = (idx_4d / feat_w) % feat_h;
+            a = idx_4d / feat_w / feat_h;
+            x = (T)w * stride, y = (T)h * stride;
+            auto* A = anchors + a * 4;
+            auto* P = proposals + i * 7 + 1;
+            P[0] = x + A[0], P[1] = y + A[1];
+            P[2] = x + A[2], P[3] = y + A[3];
+        }
+    }
+}
+
 /******************** Proposal ********************/

 template <typename T>
@@ -164,14 +227,16 @@ void GenerateMSProposals(
    const int64_t*                  indices,
    T*                              proposals) {
    int64_t index;
+    int64_t num_candidates_2x = 2 * num_candidates;
+    int64_t num_candidates_3x = 3 * num_candidates;
    float* proposal = proposals;
    float dx, dy, d_log_w, d_log_h;
    for (int i = 0; i < num_proposals; ++i) {
        index = indices[i];
        dx = deltas[index];
        dy = deltas[num_candidates + index];
-        d_log_w = deltas[2 * num_candidates + index];
-        d_log_h = deltas[3 * num_candidates + index];
+        d_log_w = deltas[num_candidates_2x + index];
+        d_log_h = deltas[num_candidates_3x + index];
        proposal[4] = BBoxTransform<float>(
            dx, dy, d_log_w, d_log_h,
                im_w, im_h, min_box_w, min_box_h,
@@ -181,6 +246,41 @@ void GenerateMSProposals(
 }

 template <typename T>
+void GenerateMCProposals(
+    const int                       num_proposals,
+    const int                       num_boxes,
+    const int                       num_classes,
+    const int                       im_idx,
+    const float                     im_h,
+    const float                     im_w,
+    const float                     im_scale,
+    const T*                        scores,
+    const T*                        deltas,
+    const int64_t*                  indices,
+    T*                              proposals) {
+    int64_t index, cls;
+    int64_t num_boxes_2x = 2 * num_boxes;
+    int64_t num_boxes_3x = 3 * num_boxes;
+    float* proposal = proposals;
+    float dx, dy, d_log_w, d_log_h;
+    for (int i = 0; i < num_proposals; ++i) {
+        cls = indices[i] % num_classes;
+        index = indices[i] / num_classes;
+        dx = deltas[index];
+        dy = deltas[num_boxes + index];
+        d_log_w = deltas[num_boxes_2x + index];
+        d_log_h = deltas[num_boxes_3x + index];
+        proposal[0] = im_idx;
+        BBoxTransform<float>(
+            dx, dy, d_log_w, d_log_h,
+                im_w, im_h, im_scale, proposal + 1);
+        proposal[5] = scores[indices[i]];
+        proposal[6] = cls + 1;
+        proposal += 7;
+    }
+}
+
+template <typename T>
 inline void SortProposals(
    const int                       start,
    const int                       end,

--- a/Dragon/src/contrib/rcnn/proposal_op.cc
+++ b/Dragon/src/contrib/rcnn/proposal_op.cc
@@ -7,7 +7,7 @@
 namespace dragon {

 template <class Context> template <typename T>
-void ProposalOp<Context>::RunWithType() {
+void ProposalOp<Context>::RunWithRCNN() {
    using BT = float;  // DType of BBox
    using BC = CPUContext;  // Context of BBox

@@ -15,7 +15,6 @@ void ProposalOp<Context>::RunWithType() {
    int total_rois = 0, num_rois;
    int num_candidates, num_proposals;

-    auto* RIdata = roi_indices.data();
    auto* batch_scores = Input(-3).template data<T, BC>();
    auto* batch_deltas = Input(-2).template data<T, BC>();
    auto* im_info = Input(-1).template data<BT, BC>();
@@ -32,11 +31,10 @@ void ProposalOp<Context>::RunWithType() {
        if (strides.size() == 1) {
            // Case 1: single stride
            feat_h = Input(0).dim(2), feat_w = Input(0).dim(3);
-            K = feat_h * feat_w, A = int(ratios.size() * scales.size());
+            A = int(ratios.size() * scales.size()), K = feat_h * feat_w;
            // Select the Top-K candidates as proposals
-            num_candidates = K * A;
-            num_proposals = std::min(
-                num_candidates, (int)pre_nms_top_n);
+            num_candidates = A * K;
+            num_proposals = std::min(num_candidates, (int)pre_nms_top_n);
            utils::math::ArgPartition(
                num_candidates, num_proposals,
                    true, scores, indices);
@@ -50,16 +48,16 @@ void ProposalOp<Context>::RunWithType() {
                    &ratios[0], &scales[0], Adata);
            rcnn::GenerateGridAnchors(
                num_proposals, A, feat_h, feat_w,
-                    strides[0], 0, Adata, indices.data(), Pdata);
+                    strides[0], 0, Adata, &indices[0], Pdata);
            rcnn::GenerateSSProposals(K, num_proposals,
                im_h, im_w, min_box_h, min_box_w,
-                    scores, deltas, indices.data(), Pdata);
+                    scores, deltas, &indices[0], Pdata);
            // Sort, NMS and Retrieve
            rcnn::SortProposals(0, num_proposals - 1, num_proposals, Pdata);
            rcnn::ApplyNMS(num_proposals, post_nms_top_n, nms_thresh,
                proposals_.template mutable_data<BT, Context>(),
-                    RIdata, num_rois, ctx());
-            rcnn::RetrieveRoIs(num_rois, n, Pdata, RIdata, Ydata);
+                    &roi_indices[0], num_rois, ctx());
+            rcnn::RetrieveRoIs(num_rois, n, Pdata, &roi_indices[0], Ydata);
        } else if (strides.size() > 1) {
            // Case 2: multiple stridess
            CHECK_EQ(strides.size(), InputSize() - 3)
@@ -70,8 +68,7 @@ void ProposalOp<Context>::RunWithType() {
                << scales.size() << " scales";
            // Select the Top-K candidates as proposals
            num_candidates = Input(-3).dim(1);
-            num_proposals = std::min(
-                num_candidates, (int)pre_nms_top_n);
+            num_proposals = std::min(num_candidates, (int)pre_nms_top_n);
            utils::math::ArgPartition(
                num_candidates, num_proposals,
                    true, scores, indices);
@@ -90,19 +87,19 @@ void ProposalOp<Context>::RunWithType() {
                rcnn::GenerateGridAnchors(
                    num_proposals, A, feat_h, feat_w,
                        strides[i], base_offset,
-                            Adata, indices.data(), Pdata);
-                base_offset += K * A;
+                            Adata, &indices[0], Pdata);
+                base_offset += (A * K);
            }
            rcnn::GenerateMSProposals(
                num_candidates, num_proposals,
                    im_h, im_w, min_box_h, min_box_w,
-                        scores, deltas, indices.data(), Pdata);
+                        scores, deltas, &indices[0], Pdata);
            // Sort, NMS and Retrieve
            rcnn::SortProposals(0, num_proposals - 1, num_proposals, Pdata);
            rcnn::ApplyNMS(num_proposals, post_nms_top_n, nms_thresh,
                proposals_.template mutable_data<BT, Context>(),
-                    RIdata, num_rois, ctx());
-            rcnn::RetrieveRoIs(num_rois, n, Pdata, RIdata, Ydata);
+                    &roi_indices[0], num_rois, ctx());
+            rcnn::RetrieveRoIs(num_rois, n, Pdata, &roi_indices[0], Ydata);
        } else {
            LOG(FATAL) << "Excepted at least one stride for proposals.";
        }
@@ -126,7 +123,7 @@ void ProposalOp<Context>::RunWithType() {
        ctx()->template Copy<BT, BC, BC>(Y.count(),
            rois, Output(0)->template data<BT, BC>());

-        rcnn::CollectRoIs<BT>(total_rois, min_level, max_level,
+        rcnn::CollectRoIs(total_rois, min_level, max_level,
            canonical_level, canonical_scale, rois, bins);

        for (int i = 0; i < OutputSize(); i++) {
@@ -138,17 +135,92 @@ void ProposalOp<Context>::RunWithType() {
    }
 }

+/*! Decode RetinaNet outputs on CPU: for every image, keep the candidates
+ *  whose score exceeds score_thresh, select at most pre_nms_top_n of them
+ *  and emit one 7-value row per selected candidate into Output(0).
+ *  Input(-3) is read as scores, Input(-2) as deltas, Input(-1) as im_info. */
+template <class Context> template <typename T>
+void ProposalOp<Context>::RunWithRetinaNet() {
+    using BT = float;  // DType of BBox
+    using BC = CPUContext;  // Context of BBox
+    int feat_h, feat_w, C = Input(-3).dim(2), A, K;
+    int total_proposals = 0;
+    int num_candidates, num_boxes, num_proposals;
+    auto* batch_scores = Input(-3).template data<T, BC>();
+    auto* batch_deltas = Input(-2).template data<T, BC>();
+    auto* im_info = Input(-1).template data<BT, BC>();
+    auto* Ydata = Output(0)->template mutable_data<BT, BC>();
+
+    for (int n = 0; n < num_images; ++n) {
+        const BT im_h = im_info[0], im_w = im_info[1], im_scale = im_info[2];
+        auto* scores = batch_scores + n * Input(-3).stride(0);
+        auto* deltas = batch_deltas + n * Input(-2).stride(0);
+        CHECK_EQ(strides.size(), InputSize() - 3)
+            << "\nGiven " << strides.size() << " strides and "
+            << InputSize() - 3 << " feature inputs";
+        // Record the flat position of every score above the threshold
+        num_boxes = Input(-3).dim(1);
+        num_candidates = Input(-3).count(1);
+        roi_indices.resize(num_candidates); num_candidates = 0;
+        for (int i = 0; i < (int)roi_indices.size(); ++i)
+            if (scores[i] > score_thresh)
+                roi_indices[num_candidates++] = i;
+        // Compact the surviving scores; the selection below runs on this copy
+        scores_ex.resize(num_candidates);
+        for (int i = 0; i < num_candidates; ++i)
+            scores_ex[i] = scores[roi_indices[i]];
+        num_proposals = std::min(num_candidates, (int)pre_nms_top_n);
+        // Use data() rather than &v[0]: scores_ex is empty when nothing passes
+        // score_thresh, and operator[] on an empty vector is undefined
+        utils::math::ArgPartition(
+            num_candidates, num_proposals, true, scores_ex.data(), indices);
+        // indices[i] is a position in the compacted list; map it into scores
+        for (int i = 0; i < num_proposals; ++i)
+            indices[i] = roi_indices[indices[i]];
+        // Decode the candidates
+        int base_offset = 0;
+        for (int i = 0; i < (int)strides.size(); i++) {
+            feat_h = Input(i).dim(2), feat_w = Input(i).dim(3);
+            A = int(ratios.size() * scales.size()), K = feat_h * feat_w;
+            anchors_.Reshape({ A, 4 });
+            auto* Adata = anchors_.template mutable_data<BT, BC>();
+            rcnn::GenerateAnchors(strides[i], (int)ratios.size(),
+                (int)scales.size(), ratios.data(), scales.data(), Adata);
+            rcnn::GenerateGridAnchors(
+                num_proposals, C, A, feat_h, feat_w,
+                    strides[i], base_offset, Adata, indices.data(), Ydata);
+            base_offset += (A * K);
+        }
+        rcnn::GenerateMCProposals(
+            num_proposals, num_boxes, C, n, im_h, im_w, im_scale,
+                scores, deltas, indices.data(), Ydata);
+        total_proposals += num_proposals;
+        Ydata += (num_proposals * 7);
+        im_info += Input(-1).dim(1);
+    }
+
+    Output(0)->Reshape({ total_proposals, 7 });
+}
+
 template <class Context>
 void ProposalOp<Context>::RunOnDevice() {
    num_images = Input(0).dim(0);
    CHECK_EQ(Input(-1).dim(0), num_images)
        << "\nExcepted " << num_images << " groups image info, "
        << "but got " << Input(-1).dim(0) << ".";
-    roi_indices.resize(post_nms_top_n);
-    Output(0)->Reshape({ num_images * post_nms_top_n, 5 });

-    if (XIsType(Input(-3), float)) RunWithType<float>();
-    else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
+    if (det_type == "RCNN") {
+        roi_indices.resize(post_nms_top_n);
+        Output(0)->Reshape({ num_images * post_nms_top_n, 5 });
+        if (XIsType(Input(-3), float)) { RunWithRCNN<float>(); }
+        else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
+    } else if (det_type == "RETINANET") {
+        Output(0)->Reshape({ num_images * pre_nms_top_n, 7 });
+        if (XIsType(Input(-3), float)) { RunWithRetinaNet<float>(); }
+        else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
+    } else {
+        LOG(FATAL) << "Unsupported detector: " << det_type;
+    }
 }

 DEPLOY_CPU(Proposal);

--- a/Dragon/src/contrib/rcnn/proposal_op.h
+++ b/Dragon/src/contrib/rcnn/proposal_op.h
@@ -22,12 +22,14 @@ class ProposalOp final : public Operator<Context> {
 public:
    ProposalOp(const OperatorDef& def, Workspace* ws)
        : Operator<Context>(def, ws),
+          det_type(OperatorBase::Arg<string>("det_type", "RCNN")),
          strides(OperatorBase::Args<int64_t>("strides")),
          ratios(OperatorBase::Args<float>("ratios")),
          scales(OperatorBase::Args<float>("scales")),
          pre_nms_top_n(OperatorBase::Arg<int64_t>("pre_nms_top_n", 6000)),
          post_nms_top_n(OperatorBase::Arg<int64_t>("post_nms_top_n", 300)),
-          nms_thresh(OperatorBase::Arg<float>("nms_thresh", (float)0.7)),
+          nms_thresh(OperatorBase::Arg<float>("nms_thresh", 0.7f)),
+          score_thresh(OperatorBase::Arg<float>("score_thresh", 0.05f)),
          min_size(OperatorBase::Arg<int64_t>("min_size", 16)),
          min_level(OperatorBase::Arg<int64_t>("min_level", 2)),
          max_level(OperatorBase::Arg<int64_t>("max_level", 5)),
@@ -37,14 +39,16 @@ class ProposalOp final : public Operator<Context> {

    void RunOnDevice() override;

-    template <typename T> void RunWithType();
+    template <typename T> void RunWithRCNN();
+    template <typename T> void RunWithRetinaNet();

 protected:
+    string det_type;
+    float nms_thresh, score_thresh;
    vector<int64_t> strides, indices, roi_indices;
-    vector<float> ratios, scales;
+    vector<float> ratios, scales, scores_ex;
    int64_t pre_nms_top_n, post_nms_top_n, min_size, num_images;
    int64_t min_level, max_level, canonical_level, canonical_scale;
-    float nms_thresh;
    Tensor anchors_, proposals_, nms_mask_;
 };


--- a/Dragon/src/core/graph_gradient.cc
+++ b/Dragon/src/core/graph_gradient.cc
@@ -22,7 +22,7 @@ bool GraphGradientMaker::CheckGrad(
            if (external_grads_.count(g_output))
                inputs_to_grads_[output] = g_output;
            // Consider generate virtual grad
-            else if (targets.count(output) && g_output != "ignore") {
+            else if (targets.count(output) && g_output != "NULL") {
                gen_grads.push_back({ output, idx });
                inputs_to_grads_[output] = g_output;
            }
@@ -88,7 +88,7 @@ void GraphGradientMaker::Make(
            string g_output = "";
            if (inputs_to_grads_.count(output) > 0)
                g_output = inputs_to_grads_[output];
-            if (g_output.empty()) g_output = "ignore";
+            if (g_output.empty()) g_output = "NULL";
            g_outputs.emplace_back(g_output);
        }
        Gradient grad = MakeGradientForOp(op, g_outputs);
@@ -194,10 +194,10 @@ void GraphGradientMaker::Make(

 #define SHARE_OUTPUTS_BODY \
   {string output = op->output(ix); \
-    if (output == "ignore") continue; \
+    if (output == "NULL") continue; \
    if (ref_count.count(output) == 0) { \
        if (ignore_grads_.count(output) > 0) \
-            *op->mutable_output(ix) = "ignore"; \
+            *op->mutable_output(ix) = "NULL"; \
        continue; \
    } \
    if (op->type() == "TemplateGradient" || \
@@ -212,13 +212,22 @@ void GraphGradientMaker::Make(
    *op->mutable_output(ix) = temp_grad;}

 void GraphGradientMaker::Share(GraphDef& graph) {
+    Set<int> invalid_ops;
    Map<string, int> ref_count;
    // Count the refs for detecting leaf nodes
-    for (auto& op : graph.op()) {
+    for (int i = 0; i < graph.op_size(); ++i) {
+        const OperatorDef& op = graph.op(i);
        // Ignore the non-gradient ops
        if (op.type().find("Gradient") == string::npos) continue;
+        if (op.type() == "GradientGather" &&
+            ignore_grads_.count(op.output(0))) {
+            for (auto& input : op.input())
+                ignore_grads_.insert(input);
+            invalid_ops.insert(i); continue;
+        }
        for (auto& input : op.input())
-            if (input.find("grad") != string::npos) ref_count[input] += 1;
+            if (input.find("grad") != string::npos)
+                    ref_count[input] += 1;
    }

    // Prepare the Gradients Pool
@@ -247,6 +256,8 @@ void GraphGradientMaker::Share(GraphDef& graph) {
        OperatorDef* op = graph.mutable_op(i);
        // Ignore the non-gradient ops
        if (op->type().find("Gradient") == string::npos) continue;
+        // Ignore the invalid ops
+        if (invalid_ops.count(i)) { op->mutable_type()->clear(); continue; }
        // GC to store the grads that have finished lifecycle
        vector<string> GC;
        // Inplace-aware

--- a/Dragon/src/core/graph_optimizer.cc
+++ b/Dragon/src/core/graph_optimizer.cc
@@ -19,7 +19,7 @@ GraphDef GraphOptimizer::PruneNodes(const GraphDef& input_def) {
            if (!op.input_size()) sp_u.resize(op.output_size(), "");
            else sp_u.assign(op.input().begin(), op.input().end());
            for (const auto& u : sp_u) {
-                if (u == "ignore") continue;
+                if (u == "NULL") continue;
                dag_[v].parents.push_back(u);
                dag_[u].childs.push_back(v);
                dag_[v].op_idx = i;
@@ -66,32 +66,32 @@ GraphDef GraphOptimizer::PruneNodes(const GraphDef& input_def) {
        for (int i = 0; i < input_def.op(it).input_size(); ++i) {
            string input = input_def.op(it).input(i);
            if (!colored_[input] || !outputs.count(input))
-                *op_def.mutable_input(i) = "ignore";
+                *op_def.mutable_input(i) = "NULL";
        }
        // Rewritten for outputs
        for (int i = 0; i < input_def.op(it).output_size(); ++i) {
            string output = input_def.op(it).output(i);
-            if (!colored_[output]) *op_def.mutable_output(i) = "ignore";
+            if (!colored_[output]) *op_def.mutable_output(i) = "NULL";
            else outputs.insert(op_def.output(i));
        }
        // Rewritten for some hand-craft cases
        if (op_def.type() == "AffineGradient") {
            // Trigger in-place if not solving dAlpha
-            if (op_def.output(1) == "ignore")
-                *op_def.mutable_input(0) = "ignore";
+            if (op_def.output(1) == "NULL")
+                *op_def.mutable_input(0) = "NULL";
        } else if (op_def.type() == "MulGradient" ||
                   op_def.type() == "RMulGradient") {
-            if (op_def.output(0) == "ignore")
-                *op_def.mutable_input(1) = "ignore";
-            if (op_def.output(1) == "ignore")
-                *op_def.mutable_input(0) = "ignore";
+            if (op_def.output(0) == "NULL")
+                *op_def.mutable_input(1) = "NULL";
+            if (op_def.output(1) == "NULL")
+                *op_def.mutable_input(0) = "NULL";
        } else if (op_def.type() == "DivGradient" ||
                   op_def.type() == "RDivGradient") {
            // dX2 requires both X1 and X2
-            if (op_def.output(1) == "ignore") {
-                *op_def.mutable_input(0) = "ignore";
-                if (op_def.output(0) == "ignore")
-                    *op_def.mutable_input(1) = "ignore";
+            if (op_def.output(1) == "NULL") {
+                *op_def.mutable_input(0) = "NULL";
+                if (op_def.output(0) == "NULL")
+                    *op_def.mutable_input(1) = "NULL";
            }
        }
        // Push into the final sequence
@@ -117,7 +117,7 @@ GraphDef GraphOptimizer::AddInplace(const GraphDef& input_def) {
            if (!op.input_size()) sp_u.resize(op.output_size(), "");
            else sp_u.assign(op.input().begin(), op.input().end());
            for (const auto& u : sp_u) {
-                if (u == "ignore") continue;
+                if (u == "NULL") continue;
                dag_[v].parents.push_back(u);
                dag_[u].childs.push_back(v);
                dag_[v].op_idx = i;

--- a/Dragon/src/core/operator.cc
+++ b/Dragon/src/core/operator.cc
@@ -100,10 +100,11 @@ OperatorBase* TryCreateOperator(
        case PROTO_CPU:
            return CPUOperatorRegistry()->Create(key, def, ws);
        case PROTO_CUDA:
-            if (def.device_option().has_engine() &&
-                def.device_option().engine() == "CUDNN" &&
-                CUDNNOperatorRegistry()->Has(key))
+#ifdef WITH_CUDNN
+            if (CUDNNOperatorRegistry()->Has(key) &&
+                    CUDAContext::cuda_object()->cudnn_enabled)
                return CUDNNOperatorRegistry()->Create(key, def, ws);
+#endif
            return CUDAOperatorRegistry()->Create(key, def, ws);
        case PROTO_CNML:
            return CNMLOperatorRegistry()->Create(key, def, ws);
@@ -155,7 +156,7 @@ Gradient MakeGradientForOp(
            );
        }
    }
-    // Copy device option, engine, and arguments
+    // Copy device option and arguments
    if (maker->CopyDeviceOption() && def.has_device_option())
        for (auto& grad_def : grad.ops)
            grad_def.mutable_device_option()->CopyFrom(

--- a/Dragon/src/core/operator_schema.cc
+++ b/Dragon/src/core/operator_schema.cc
@@ -16,9 +16,9 @@ bool OpSchema::Verify(const OperatorDef& def) const {
                   << ", max=" << max_output_ << "]";
    }
    for (int in = 0; in < def.input_size(); in++) {
-        if (def.input(in) == "ignore") continue;
+        if (def.input(in) == "NULL") continue;
        for (int out = 0; out < def.output_size(); out++) {
-            if (def.output(out) == "ignore") continue;
+            if (def.output(out) == "NULL") continue;
            if (def.input(in) == def.output(out) && (!CheckInplace(in, out)))
                LOG(FATAL) << indicator << "Input("  << in << ") and "
                           << "Output(" << out << ") can not be set to inplace.";

--- a/Dragon/src/core/workspace.cc
+++ b/Dragon/src/core/workspace.cc
@@ -7,7 +7,7 @@ namespace dragon {
 /*! Create some internal tensors */

 void Workspace::InitWorkspace() {
-    CreateTensor("ignore");
+    CreateTensor("NULL");
    Tensor* recomputing_flag = CreateTensor(
        "/opt/recomputing_flag")->Reshape({ 1 });
    recomputing_flag->mutable_data

--- a/Dragon/src/onnx/onnx_importer.cc
+++ b/Dragon/src/onnx/onnx_importer.cc
@@ -351,7 +351,7 @@ ONNXImporterReturns ONNXBackend::ArgReduceNodeImporter(
    auto* operation = attributes.AddRewrittenAttribute("operation");
    if (onnx_node->node.op_type() == "ArgMax") operation->set_s("ARGMAX");
    else if (onnx_node->node.op_type() == "ArgMin") operation->set_s("ARGMIN");
-    node.add_output("ignore");  // A dummy output("Value") is required
+    node.add_output("NULL");  // A dummy output("Value") is required

    return CommonONNXNodeImporter(&onnx_node_v2, ctx);
 }

--- a/Dragon/src/operators/activation/prelu_op.cc
+++ b/Dragon/src/operators/activation/prelu_op.cc
@@ -46,7 +46,7 @@ void PReluGradientOp<Context>::RunWithType() {
    auto* Xdata = Input(0).template data<T, Context>();
    auto* dYdata = Input(-1).template data<T, Context>();

-    if (Output(1)->name() != "ignore") {
+    if (Output(1)->name() != "NULL") {
        DECLARE_MULTIPLIER(multiplier, channels * dim);
        auto* dWdata = Output(1)->template mutable_data<T, Context>();
        auto* dWBdata = ws()->template caches<T, Context>({ channels * dim })[0];
@@ -55,7 +55,7 @@ void PReluGradientOp<Context>::RunWithType() {
                dYdata, Xdata, multiplier, dWBdata, dWdata, ctx());
    }

-    if (Output(0)->name() != "ignore") {
+    if (Output(0)->name() != "NULL") {
        auto* Wdata = Input(1).template data<T, Context>();
        auto* dXdata = Output(0)->template mutable_data<T, Context>();
        kernel::PReluGrad(Output(0)->count(), channels, dim,

--- a/Dragon/src/operators/arithmetic/add_op.cc
+++ b/Dragon/src/operators/arithmetic/add_op.cc
@@ -60,13 +60,13 @@ template <class Context> template <typename T>
 void AddGradientOp<Context>::EltwiseRunWithType() {
    auto* dy = Input(-1).template data<T, Context>();

-    if (Output(1)->name() != "ignore") {
+    if (Output(1)->name() != "NULL") {
        auto* dx2 = Output(1)->template mutable_data<T, Context>();
        ctx()->template Copy<T, Context, Context>(
            Output(1)->count(), dx2, dy);
    }

-    if (Output(0)->name() != "ignore") {
+    if (Output(0)->name() != "NULL") {
        auto* dx1 = Output(0)->template mutable_data<T, Context>();
        ctx()->template Copy<T, Context, Context>(
            Output(0)->count(), dx1, dy);
@@ -78,14 +78,14 @@ void AddGradientOp<Context>::BroadcastRunWithType(int type) {
    DEFINE_FUNDAMENTAL_OP_X1X2;
    auto* dy = Input(-1).template data<T, Context>();

-    if (Output(1)->name() != "ignore") {
+    if (Output(1)->name() != "NULL") {
        auto* dx2 = Output(1)->template mutable_data<T, Context>();
        vector<int> dims = { rows, cols }, axes = { type };
        kernel::ReduceSum(2, dims.data(),
            1, axes.data(), 1.f, dy, dx2, ctx());
    }

-    if (Output(0)->name() != "ignore") {
+    if (Output(0)->name() != "NULL") {
        auto* dx1 = Output(0)->template mutable_data<T, Context>();
        ctx()->template Copy<T, Context, Context>(
            X1->count(), dx1, dy);

--- a/Dragon/src/operators/arithmetic/affine_op.cc
+++ b/Dragon/src/operators/arithmetic/affine_op.cc
@@ -60,7 +60,7 @@ void AffineGradientOp<Context>::RunWithType() {
    auto* dXdata = Output(0)->template mutable_data<T, Context>();

    // dA = X * dY
-    if (Output(1)->name() != "ignore") {
+    if (Output(1)->name() != "NULL") {
        Output(1)->ReshapeLike(Input(1));
        auto* Xdata = Input(0).template data<T, Context>();
        auto* dAdata = Output(1)->template mutable_data<T, Context>();
@@ -74,7 +74,7 @@ void AffineGradientOp<Context>::RunWithType() {
    }

    // dB = dY
-    if (Output(2)->name() != "ignore") {
+    if (Output(2)->name() != "NULL") {
        Output(2)->ReshapeLike(Input(1));
        auto* dBdata = Output(2)->template mutable_data<T, Context>();
        // Eltwise
@@ -87,7 +87,7 @@ void AffineGradientOp<Context>::RunWithType() {
    }

    // dX = alpha * dY
-    if (Output(0)->name() != "ignore") {
+    if (Output(0)->name() != "NULL") {
        kernel::AffineGrad(outer_dim, inner_dim, scale_dim,
            dYdata, Adata, dXdata, ctx());
    }

--- a/Dragon/src/operators/arithmetic/cudnn_affine_op.cc
+++ b/Dragon/src/operators/arithmetic/cudnn_affine_op.cc
@@ -101,7 +101,7 @@ void CuDNNAffineGradientOp<Context>::RunWithType() {
            CUDNNType<CT>::type, CUDNN_PROPAGATE_NAN));

    // dA = X * dY
-    if (Output(1)->name() != "ignore") {
+    if (Output(1)->name() != "NULL") {
        Output(1)->ReshapeLike(Input(1));
        auto* Xdata = Input(0).template data<DT, Context>();
        auto* dAdata = Output(1)->template mutable_data<DT, Context>();
@@ -119,7 +119,7 @@ void CuDNNAffineGradientOp<Context>::RunWithType() {
    }

    // dB = dY
-    if (Output(2)->name() != "ignore") {
+    if (Output(2)->name() != "NULL") {
        Output(2)->ReshapeLike(Input(1));
        auto* dBdata = Output(2)->template mutable_data<DT, Context>();
        // Eltwise
@@ -136,7 +136,7 @@ void CuDNNAffineGradientOp<Context>::RunWithType() {
    }

    // dX = alpha * dY
-    if (Output(0)->name() != "ignore") {
+    if (Output(0)->name() != "NULL") {
        CUDNN_CHECK(cudnnOpTensor(
            ctx()->cudnn_handle(), mul_desc,
                CUDNNType<DT>::one, input_desc, dYdata,

--- a/Dragon/src/operators/arithmetic/div_op.cc
+++ b/Dragon/src/operators/arithmetic/div_op.cc
@@ -61,7 +61,7 @@ void DivGradientOp<Context>::EltwiseRunWithType() {
    DEFINE_FUNDAMENTAL_OP_X1X2;
    auto* dy = Input(-1).template data<T, Context>();

-    if (Output(1)->name() != "ignore") {
+    if (Output(1)->name() != "NULL") {
        auto* x1 = Input(0).template data<T, Context>();
        auto* x2 = Input(1).template data<T, Context>();
        auto* dx2 = Output(1)->template mutable_data<T, Context>();
@@ -73,7 +73,7 @@ void DivGradientOp<Context>::EltwiseRunWithType() {
        math::Scale(X2->count(), -1.f, dx2, dx2, ctx());
    }

-    if (Output(0)->name() != "ignore") {
+    if (Output(0)->name() != "NULL") {
        auto* x2 = Input(1).template data<T, Context>();
        auto* dx1 = Output(0)->template mutable_data<T, Context>();
        math::Div(X1->count(), dy, x2, dx1, ctx());
@@ -85,7 +85,7 @@ void DivGradientOp<Context>::BroadcastRunWithType(int type) {
    DEFINE_FUNDAMENTAL_OP_X1X2;
    auto* dy = Input(-1).template data<T, Context>();

-    if (Output(1)->name() != "ignore") {
+    if (Output(1)->name() != "NULL") {
        auto* x1 = Input(0).template data<T, Context>();
        auto* x2 = Input(1).template data<T, Context>();
        auto* dx2 = Output(1)->template mutable_data<T, Context>();
@@ -100,7 +100,7 @@ void DivGradientOp<Context>::BroadcastRunWithType(int type) {
            1, axes.data(), -1.f, cs[0], dx2, ctx());
    }

-    if (Output(0)->name() != "ignore") {
+    if (Output(0)->name() != "NULL") {
        auto* x2 = Input(1).template data<T, Context>();
        auto* dx1 = Output(0)->template mutable_data<T, Context>();
        math::BroadcastDiv(rows, cols, type, dy, x2, dx1, ctx());

--- a/Dragon/src/operators/arithmetic/dot_op.cc
+++ b/Dragon/src/operators/arithmetic/dot_op.cc
@@ -116,13 +116,13 @@ void DotGradientOp<Context>::DotRunWithType() {
    auto* Bdata = Input(1).template data<T, Context>();
    auto* dYdata = Input(-1).template data<T, CPUContext>();

-    if (Output(0)->name() != "ignore") {
+    if (Output(0)->name() != "NULL") {
        auto* dAdata = Output(0)->template mutable_data<T, Context>();
        math::Scale(Output(0)->count(), cast::to<float>(
            dYdata[0]), Bdata, dAdata, ctx());
    }

-    if (Output(1)->name() != "ignore") {
+    if (Output(1)->name() != "NULL") {
        auto* dBdata = Output(1)->template mutable_data<T, Context>();
        math::Scale(Output(0)->count(), cast::to<float>(
            dYdata[0]), Adata, dBdata, ctx());
@@ -145,7 +145,7 @@ void DotGradientOp<Context>::GemmRunWithType() {
    auto* X2data = Input(1).template data<T, Context>();
    auto* dYdata = Input(2).template data<T, Context>();

-    if (Output(0)->name() != "ignore") {
+    if (Output(0)->name() != "NULL") {
        auto* dX1data = Output(0)->template mutable_data<T, Context>();
        if (transA) {
            math::Gemm(
@@ -162,7 +162,7 @@ void DotGradientOp<Context>::GemmRunWithType() {
        }
    }

-    if (Output(1)->name() != "ignore") {
+    if (Output(1)->name() != "NULL") {
        auto* dX2data = Output(1)->template mutable_data<T, Context>();
        if (transB) {
           math::Gemm(

--- a/Dragon/src/operators/arithmetic/eltwise_op.cc
+++ b/Dragon/src/operators/arithmetic/eltwise_op.cc
@@ -73,7 +73,7 @@ void EltwiseGradientOp<Context>::SumRunWithType() {
    auto* dYdata = Input(-1).template data<T, Context>();

    for (int i = 0; i < OutputSize(); i++) {
-        if (Output(i)->name() == "ignore") continue;
+        if (Output(i)->name() == "NULL") continue;
        auto* dXdata = Output(i)->template mutable_data<T, Context>();
        // Copy the dY to dX and Apply the coeffients
        math::Scale(nelements, coeffs[i], dYdata, dXdata, ctx());
@@ -86,7 +86,7 @@ void EltwiseGradientOp<Context>::ProdRunWithType() {
    auto* dYdata = Input(-1).template data<T, Context>();

    for (int i = 0; i < OutputSize(); i++) {
-        if (Output(i)->name() == "ignore") continue;
+        if (Output(i)->name() == "NULL") continue;
        auto* dXdata = Output(i)->template mutable_data<T, Context>();
        // Compute the first term of dX
        bool initialized = false;

--- a/Dragon/src/operators/arithmetic/fully_connected_op.cc
+++ b/Dragon/src/operators/arithmetic/fully_connected_op.cc
@@ -122,7 +122,7 @@ void FullyConnectedGradientOp<Context>::RunWithType() {
    auto* Wdata = Input(1).template data<T, Context>();
    auto* dYdata = Input(2).template data<T, Context>();

-    if (Output(1)->name() != "ignore") {
+    if (Output(1)->name() != "NULL") {
        Output(1)->ReshapeLike(Input(1));
        auto* dWdata = Output(1)->template mutable_data<T, Context>();
        if (transW) {
@@ -140,7 +140,7 @@ void FullyConnectedGradientOp<Context>::RunWithType() {
        }
    }

-    if (Output(2)->name() != "ignore") {
+    if (Output(2)->name() != "NULL") {
        DECLARE_MULTIPLIER(multiplier, M);
        Output(2)->Reshape({ N });
        auto* dBdata = Output(2)->template mutable_data<T, Context>();
@@ -150,7 +150,7 @@ void FullyConnectedGradientOp<Context>::RunWithType() {
                    0.f, dBdata, ctx());
    }

-    if (Output(0)->name() != "ignore") {
+    if (Output(0)->name() != "NULL") {
        Output(0)->ReshapeLike(Input(0));
        auto* dXdata = Output(0)->template mutable_data<T, Context>();
        if (transW) {

--- a/Dragon/src/operators/arithmetic/matmul_op.cc
+++ b/Dragon/src/operators/arithmetic/matmul_op.cc
@@ -70,14 +70,14 @@ void MatmulGradientOp<Context>::RunWithType() {

    T* dAdata = nullptr, *dBdata = nullptr;

-    if (Output(0)->name() != "ignore") {
+    if (Output(0)->name() != "NULL") {
        dAdata = Output(0)->template mutable_data<T, Context>();
-    } if (Output(1)->name() != "ignore") {
+    } if (Output(1)->name() != "NULL") {
        dBdata = Output(1)->template mutable_data<T, Context>();
    }

    for (int i = 0; i < batch_size; ++i) {
-        if (Output(0)->name() != "ignore") {
+        if (Output(0)->name() != "NULL") {
            if (transA) {
                math::Gemm(
                    transB ? CblasTrans : CblasNoTrans,
@@ -94,7 +94,7 @@ void MatmulGradientOp<Context>::RunWithType() {
                    0.f, dAdata + i * A_stride, ctx());
            }
        }
-        if (Output(1)->name() != "ignore") {
+        if (Output(1)->name() != "NULL") {
            if (transB) {
                math::Gemm(
                    CblasTrans,

--- a/Dragon/src/operators/arithmetic/maximum_op.cc
+++ b/Dragon/src/operators/arithmetic/maximum_op.cc
@@ -83,11 +83,11 @@ template <class Context> template <typename T>
 void MaximumGradientOp<Context>::BroadcastRunWithType() {
    auto* dYdata = Input(-1).template data<T, Context>();
    if (Input(0).count() == 1) {
-        if (Output(0)->name() != "ignore") {
+        if (Output(0)->name() != "NULL") {
            auto* dAdata = Output(0)->template mutable_data<T, Context>();
            math::Set(1, cast::to<T>(0.f), dAdata, ctx());
        }
-        if (Output(1)->name() != "ignore") {
+        if (Output(1)->name() != "NULL") {
            auto* Adata = Input(0).template data<T, CPUContext>();
            auto* Bdata = Input(1).template data<T, Context>();
            auto* dBdata = Output(1)->template mutable_data<T, Context>();
@@ -95,14 +95,14 @@ void MaximumGradientOp<Context>::BroadcastRunWithType() {
                Bdata, Adata[0], dYdata, dBdata, (T*)nullptr, ctx());
        }
    } else if (Input(1).count() == 1) {
-        if (Output(0)->name() != "ignore") {
+        if (Output(0)->name() != "NULL") {
            auto* Adata = Input(0).template data<T, Context>();
            auto* Bdata = Input(1).template data<T, CPUContext>();
            auto* dAdata = Output(0)->template mutable_data<T, Context>();
            kernel::BroadcastMaximumGrad(Output(0)->count(),
                Adata, Bdata[0], dYdata, dAdata, (T*)nullptr, ctx());
        }
-        if (Output(1)->name() != "ignore") {
+        if (Output(1)->name() != "NULL") {
            auto* dBdata = Output(1)->template mutable_data<T, Context>();
            math::Set(1, cast::to<T>(0.f), dBdata, ctx());
        }

--- a/Dragon/src/operators/arithmetic/minimum_op.cc
+++ b/Dragon/src/operators/arithmetic/minimum_op.cc
@@ -83,11 +83,11 @@ template <class Context> template <typename T>
 void MinimumGradientOp<Context>::BroadcastRunWithType() {
    auto* dYdata = Input(-1).template data<T, Context>();
    if (Input(0).count() == 1) {
-        if (Output(0)->name() != "ignore") {
+        if (Output(0)->name() != "NULL") {
            auto* dAdata = Output(0)->template mutable_data<T, Context>();
            math::Set<T, Context>(1, cast::to<T>(0.f), dAdata, ctx());
        }
-        if (Output(1)->name() != "ignore") {
+        if (Output(1)->name() != "NULL") {
            auto* Adata = Input(0).template data<T, CPUContext>();
            auto* Bdata = Input(1).template data<T, Context>();
            auto* dBdata = Output(1)->template mutable_data<T, Context>();
@@ -95,14 +95,14 @@ void MinimumGradientOp<Context>::BroadcastRunWithType() {
                Bdata, Adata[0], dYdata, dBdata, (T*)nullptr, ctx());
        }
    } else if (Input(1).count() == 1) {
-        if (Output(0)->name() != "ignore") {
+        if (Output(0)->name() != "NULL") {
            auto* Adata = Input(0).template data<T, Context>();
            auto* Bdata = Input(1).template data<T, CPUContext>();
            auto* dAdata = Output(0)->template mutable_data<T, Context>();
            kernel::BroadcastMinimumGrad(Output(0)->count(),
                Adata, Bdata[0], dYdata, dAdata, (T*)nullptr, ctx());
        }
-        if (Output(1)->name() != "ignore") {
+        if (Output(1)->name() != "NULL") {
            auto* dBdata = Output(1)->template mutable_data<T, Context>();
            math::Set<T, Context>(1, cast::to<T>(0.f), dBdata, ctx());
        }

--- a/Dragon/src/operators/arithmetic/mul_op.cc
+++ b/Dragon/src/operators/arithmetic/mul_op.cc
@@ -59,13 +59,13 @@ template <class Context> template <typename T>
 void MulGradientOp<Context>::EltwiseRunWithType() {
    auto* dy = Input(-1).template data<T, Context>();

-    if (Output(1)->name() != "ignore") {
+    if (Output(1)->name() != "NULL") {
        auto* x1 = Input(0).template data<T, Context>();
        auto* dx2 = Output(1)->template mutable_data<T, Context>();
        math::Mul(Output(1)->count(), dy, x1, dx2, ctx());
    }

-    if (Output(0)->name() != "ignore") {
+    if (Output(0)->name() != "NULL") {
        auto* x2 = Input(1).template data<T, Context>();
        auto* dx1 = Output(0)->template mutable_data<T, Context>();
        math::Mul(Output(0)->count(), dy, x2, dx1, ctx());
@@ -77,7 +77,7 @@ void MulGradientOp<Context>::BroadcastRunWithType(int type) {
    DEFINE_FUNDAMENTAL_OP_X1X2;
    auto* dy = Input(-1).template data<T, Context>();

-    if (Output(1)->name() != "ignore") {
+    if (Output(1)->name() != "NULL") {
        auto* x1 = Input(0).template data<T, Context>();
        auto* dx2 = Output(1)->template mutable_data<T, Context>();
        auto* c = ws()->template caches<T, Context>({ X1->count() })[0];
@@ -87,7 +87,7 @@ void MulGradientOp<Context>::BroadcastRunWithType(int type) {
            1, axes.data(), 1.f, c, dx2, ctx());
    }

-    if (Output(0)->name() != "ignore") {
+    if (Output(0)->name() != "NULL") {
        auto* x2 = Input(1).template data<T, Context>();
        auto* dx1 = Output(0)->template mutable_data<T, Context>();
        math::BroadcastMul(rows, cols, type, dy, x2, dx1, ctx());

--- a/Dragon/src/operators/arithmetic/radd_op.cc
+++ b/Dragon/src/operators/arithmetic/radd_op.cc
@@ -60,13 +60,13 @@ template <class Context> template <typename T>
 void RAddGradientOp<Context>::EltwiseRunWithType() {
    auto* dy = Input(-1).template data<T, Context>();

-    if (Output(1)->name() != "ignore") {
+    if (Output(1)->name() != "NULL") {
        auto* dx2 = Output(1)->template mutable_data<T, Context>();
        ctx()->template Copy<T, Context, Context>(
            Output(1)->count(), dx2, dy);
    }

-    if (Output(0)->name() != "ignore") {
+    if (Output(0)->name() != "NULL") {
        auto* dx1 = Output(0)->template mutable_data<T, Context>();
        ctx()->template Copy<T, Context, Context>(
            Output(0)->count(), dx1, dy);
@@ -78,14 +78,14 @@ void RAddGradientOp<Context>::BroadcastRunWithType(int type) {
    DEFINE_FUNDAMENTAL_OP_X1X2;
    auto* dy = Input(-1).template data<T, Context>();

-    if (Output(0)->name() != "ignore") {
+    if (Output(0)->name() != "NULL") {
        auto* dx1 = Output(0)->template mutable_data<T, Context>();
        vector<int> dims = { rows, cols }, axes = { type - 2 };
        kernel::ReduceSum(2, dims.data(),
            1, axes.data(), 1.f, dy, dx1, ctx());
    }

-    if (Output(1)->name() != "ignore") {
+    if (Output(1)->name() != "NULL") {
        auto* dx2 = Output(1)->template mutable_data<T, Context>();
        ctx()->template Copy<T, Context, Context>(
            X2->count(), dx2, dy);
@@ -99,23 +99,23 @@ void RAddGradientOp<Context>::RunOnDevice() {
    Output(1)->ReshapeLike(*X2);

    if (XIsType(Input(-1), int8_t)) {
-        DEFINE_FUNDAMENTAL_TYPED_CALLER(int8_t);
+        DEFINE_FUNDAMENTAL_TYPED_RCALLER(int8_t);
    } else if (XIsType(Input(-1), uint8_t)) {
-        DEFINE_FUNDAMENTAL_TYPED_CALLER(uint8_t);
+        DEFINE_FUNDAMENTAL_TYPED_RCALLER(uint8_t);
    } else if (XIsType(Input(-1), int)) {
-        DEFINE_FUNDAMENTAL_TYPED_CALLER(int);
+        DEFINE_FUNDAMENTAL_TYPED_RCALLER(int);
    } else if (XIsType(Input(-1), int64_t)) {
-        DEFINE_FUNDAMENTAL_TYPED_CALLER(int64_t);
+        DEFINE_FUNDAMENTAL_TYPED_RCALLER(int64_t);
    } else if (XIsType(Input(-1), float16)) {
-        DEFINE_FUNDAMENTAL_TYPED_CALLER(float16);
+        DEFINE_FUNDAMENTAL_TYPED_RCALLER(float16);
    } else if (XIsType(Input(-1), float)) {
-        DEFINE_FUNDAMENTAL_TYPED_CALLER(float);
+        DEFINE_FUNDAMENTAL_TYPED_RCALLER(float);
    } else if (XIsType(Input(-1), double)) {
-        DEFINE_FUNDAMENTAL_TYPED_CALLER(double);
+        DEFINE_FUNDAMENTAL_TYPED_RCALLER(double);
    } else {
        LOG(FATAL) << DTypeHelper(Input(0), {
            "int8", "uint8", "int32", "int64",
-                  "float16", "float32", "float64",
+                "float16", "float32", "float64",
        });
    }
 }

--- a/Dragon/src/operators/arithmetic/rdiv_op.cc
+++ b/Dragon/src/operators/arithmetic/rdiv_op.cc
@@ -61,7 +61,7 @@ void RDivGradientOp<Context>::EltwiseRunWithType() {
    DEFINE_FUNDAMENTAL_OP_X1X2;
    auto* dy = Input(-1).template data<T, Context>();

-    if (Output(1)->name() != "ignore") {
+    if (Output(1)->name() != "NULL") {
        auto* x1 = Input(0).template data<T, Context>();
        auto* x2 = Input(1).template data<T, Context>();
        auto* dx2 = Output(1)->template mutable_data<T, Context>();
@@ -73,7 +73,7 @@ void RDivGradientOp<Context>::EltwiseRunWithType() {
        math::Scale(X2->count(), -1.f, dx2, dx2, ctx());
    }

-    if (Output(0)->name() != "ignore") {
+    if (Output(0)->name() != "NULL") {
        auto* x2 = Input(1).template data<T, Context>();
        auto* dx1 = Output(0)->template mutable_data<T, Context>();
        math::Div(X1->count(), dy, x2, dx1, ctx());
@@ -85,7 +85,7 @@ void RDivGradientOp<Context>::BroadcastRunWithType(int type) {
    DEFINE_FUNDAMENTAL_OP_X1X2;
    auto* dy = Input(-1).template data<T, Context>();

-    if (Output(0)->name() != "ignore") {
+    if (Output(0)->name() != "NULL") {
        auto* x2 = Input(1).template data<T, Context>();
        auto* dx1 = Output(0)->template mutable_data<T, Context>();
        auto* c = ws()->template caches<T, Context>({ X2->count() })[0];
@@ -95,7 +95,7 @@ void RDivGradientOp<Context>::BroadcastRunWithType(int type) {
            1, axes.data(), 1.f, c, dx1, ctx());
    }

-    if (Output(1)->name() != "ignore") {
+    if (Output(1)->name() != "NULL") {
        auto* x1 = Input(0).template data<T, Context>();
        auto* x2 = Input(1).template data<T, Context>();
        auto* dx2 = Output(1)->template mutable_data<T, Context>();
@@ -114,19 +114,19 @@ void RDivGradientOp<Context>::RunOnDevice() {
    Output(1)->ReshapeLike(*X2);

    if (XIsType(Input(-1), int8_t)) {
-        DEFINE_FUNDAMENTAL_TYPED_CALLER(int8_t);
+        DEFINE_FUNDAMENTAL_TYPED_RCALLER(int8_t);
    } else if (XIsType(Input(-1), uint8_t)) {
-        DEFINE_FUNDAMENTAL_TYPED_CALLER(uint8_t);
+        DEFINE_FUNDAMENTAL_TYPED_RCALLER(uint8_t);
    } else if (XIsType(Input(-1), int)) {
-        DEFINE_FUNDAMENTAL_TYPED_CALLER(int);
+        DEFINE_FUNDAMENTAL_TYPED_RCALLER(int);
    } else if (XIsType(Input(-1), int64_t)) {
-        DEFINE_FUNDAMENTAL_TYPED_CALLER(int64_t);
+        DEFINE_FUNDAMENTAL_TYPED_RCALLER(int64_t);
    } else if (XIsType(Input(-1), float16)) {
-        DEFINE_FUNDAMENTAL_TYPED_CALLER(float16);
+        DEFINE_FUNDAMENTAL_TYPED_RCALLER(float16);
    } else if (XIsType(Input(-1), float)) {
-        DEFINE_FUNDAMENTAL_TYPED_CALLER(float);
+        DEFINE_FUNDAMENTAL_TYPED_RCALLER(float);
    } else if (XIsType(Input(-1), double)) {
-        DEFINE_FUNDAMENTAL_TYPED_CALLER(double);
+        DEFINE_FUNDAMENTAL_TYPED_RCALLER(double);
    } else {
        LOG(FATAL) << DTypeHelper(Input(0), {
            "int8", "uint8", "int32", "int64",

--- a/Dragon/src/operators/arithmetic/rmul_op.cc
+++ b/Dragon/src/operators/arithmetic/rmul_op.cc
@@ -60,13 +60,13 @@ template <class Context> template <typename T>
 void RMulGradientOp<Context>::EltwiseRunWithType() {
    auto* dy = Input(-1).template data<T, Context>();

-    if (Output(1)->name() != "ignore") {
+    if (Output(1)->name() != "NULL") {
        auto* x1 = Input(0).template data<T, Context>();
        auto* dx2 = Output(1)->template mutable_data<T, Context>();
        math::Mul(Output(1)->count(), dy, x1, dx2, ctx());
    }

-    if (Output(0)->name() != "ignore") {
+    if (Output(0)->name() != "NULL") {
        auto* x2 = Input(1).template data<T, Context>();
        auto* dx1 = Output(0)->template mutable_data<T, Context>();
        math::Mul(Output(0)->count(), dy, x2, dx1, ctx());
@@ -78,7 +78,7 @@ void RMulGradientOp<Context>::BroadcastRunWithType(int type) {
    DEFINE_FUNDAMENTAL_OP_X1X2;
    auto* dy = Input(-1).template data<T, Context>();

-    if (Output(0)->name() != "ignore") {
+    if (Output(0)->name() != "NULL") {
        auto* x2 = Input(1).template data<T, Context>();
        auto* dx1 = Output(0)->template mutable_data<T, Context>();
        auto* c = ws()->template caches<T, Context>({ X2->count() })[0];
@@ -88,7 +88,7 @@ void RMulGradientOp<Context>::BroadcastRunWithType(int type) {
            1, axes.data(), 1.f, c, dx1, ctx());
    }

-    if (Output(1)->name() != "ignore") {
+    if (Output(1)->name() != "NULL") {
        auto* x1 = Input(0).template data<T, Context>();
        auto* dx2 = Output(1)->template mutable_data<T, Context>();
        math::BroadcastMul(rows, cols, type - 2, dy, x1, dx2, ctx());
@@ -102,19 +102,19 @@ void RMulGradientOp<Context>::RunOnDevice() {
    Output(1)->ReshapeLike(*X2);

    if (XIsType(Input(-1), int8_t)) {
-        DEFINE_FUNDAMENTAL_TYPED_CALLER(int8_t);
+        DEFINE_FUNDAMENTAL_TYPED_RCALLER(int8_t);
    } else if (XIsType(Input(-1), uint8_t)) {
-        DEFINE_FUNDAMENTAL_TYPED_CALLER(uint8_t);
+        DEFINE_FUNDAMENTAL_TYPED_RCALLER(uint8_t);
    } else if (XIsType(Input(-1), int)) {
-        DEFINE_FUNDAMENTAL_TYPED_CALLER(int);
+        DEFINE_FUNDAMENTAL_TYPED_RCALLER(int);
    } else if (XIsType(Input(-1), int64_t)) {
-        DEFINE_FUNDAMENTAL_TYPED_CALLER(int64_t);
+        DEFINE_FUNDAMENTAL_TYPED_RCALLER(int64_t);
    } else if (XIsType(Input(-1), float16)) {
-        DEFINE_FUNDAMENTAL_TYPED_CALLER(float16);
+        DEFINE_FUNDAMENTAL_TYPED_RCALLER(float16);
    } else if (XIsType(Input(-1), float)) {
-        DEFINE_FUNDAMENTAL_TYPED_CALLER(float);
+        DEFINE_FUNDAMENTAL_TYPED_RCALLER(float);
    } else if (XIsType(Input(-1), double)) {
-        DEFINE_FUNDAMENTAL_TYPED_CALLER(double);
+        DEFINE_FUNDAMENTAL_TYPED_RCALLER(double);
    } else {
        LOG(FATAL) << DTypeHelper(Input(0), {
            "int8", "uint8", "int32", "int64",

--- a/Dragon/src/operators/arithmetic/rsub_op.cc
+++ b/Dragon/src/operators/arithmetic/rsub_op.cc
@@ -60,13 +60,13 @@ template <class Context> template <typename T>
 void RSubGradientOp<Context>::EltwiseRunWithType() {
    auto* dy = Input(-1).template data<T, Context>();

-    if (Output(1)->name() != "ignore") {
+    if (Output(1)->name() != "NULL") {
        auto* dx2 = Output(1)->template mutable_data<T, Context>();
        math::Scale<T, Context>(
            Output(1)->count(), -1, dy, dx2, ctx());
    }

-    if (Output(0)->name() != "ignore") {
+    if (Output(0)->name() != "NULL") {
        auto* dx1 = Output(0)->template mutable_data<T, Context>();
        ctx()->template Copy<T, Context, Context>(
            Output(0)->count(), dx1, dy);
@@ -78,14 +78,14 @@ void RSubGradientOp<Context>::BroadcastRunWithType(int type) {
    DEFINE_FUNDAMENTAL_OP_X1X2;
    auto* dy = Input(-1).template data<T, Context>();

-    if (Output(0)->name() != "ignore") {
+    if (Output(0)->name() != "NULL") {
        auto* dx1 = Output(0)->template mutable_data<T, Context>();
        vector<int> dims = { rows, cols }, axes = { type - 2 };
        kernel::ReduceSum(2, dims.data(),
            1, axes.data(), 1.f, dy, dx1, ctx());
    }

-    if (Output(1)->name() != "ignore") {
+    if (Output(1)->name() != "NULL") {
        auto* dx2 = Output(1)->template mutable_data<T, Context>();
        math::Scale(X2->count(), -1.f, dy, dx2, ctx());
    }
@@ -98,19 +98,19 @@ void RSubGradientOp<Context>::RunOnDevice() {
    Output(1)->ReshapeLike(*X2);

    if (XIsType(Input(-1), int8_t)) {
-        DEFINE_FUNDAMENTAL_TYPED_CALLER(int8_t);
+        DEFINE_FUNDAMENTAL_TYPED_RCALLER(int8_t);
    } else if (XIsType(Input(-1), uint8_t)) {
-        DEFINE_FUNDAMENTAL_TYPED_CALLER(uint8_t);
+        DEFINE_FUNDAMENTAL_TYPED_RCALLER(uint8_t);
    } else if (XIsType(Input(-1), int)) {
-        DEFINE_FUNDAMENTAL_TYPED_CALLER(int);
+        DEFINE_FUNDAMENTAL_TYPED_RCALLER(int);
    } else if (XIsType(Input(-1), int64_t)) {
-        DEFINE_FUNDAMENTAL_TYPED_CALLER(int64_t);
+        DEFINE_FUNDAMENTAL_TYPED_RCALLER(int64_t);
    } else if (XIsType(Input(-1), float16)) {
-        DEFINE_FUNDAMENTAL_TYPED_CALLER(float16);
+        DEFINE_FUNDAMENTAL_TYPED_RCALLER(float16);
    } else if (XIsType(Input(-1), float)) {
-        DEFINE_FUNDAMENTAL_TYPED_CALLER(float);
+        DEFINE_FUNDAMENTAL_TYPED_RCALLER(float);
    } else if (XIsType(Input(-1), double)) {
-        DEFINE_FUNDAMENTAL_TYPED_CALLER(double);
+        DEFINE_FUNDAMENTAL_TYPED_RCALLER(double);
    } else {
        LOG(FATAL) << DTypeHelper(Input(0), {
            "int8", "uint8", "int32", "int64",

--- a/Dragon/src/operators/arithmetic/sub_op.cc
+++ b/Dragon/src/operators/arithmetic/sub_op.cc
@@ -60,13 +60,13 @@ template <class Context> template <typename T>
 void SubGradientOp<Context>::EltwiseRunWithType() {
    auto* dy = Input(-1).template data<T, Context>();

-    if (Output(1)->name() != "ignore") {
+    if (Output(1)->name() != "NULL") {
        auto* dx2 = Output(1)->template mutable_data<T, Context>();
        math::Scale<T, Context>(Output(1)->count(),
            -1.f, dy, dx2, ctx());
    }

-    if (Output(0)->name() != "ignore") {
+    if (Output(0)->name() != "NULL") {
        auto* dx1 = Output(0)->template mutable_data<T, Context>();
        ctx()->template Copy<T, Context, Context>(
            Output(0)->count(), dx1, dy);
@@ -78,14 +78,14 @@ void SubGradientOp<Context>::BroadcastRunWithType(int type) {
    DEFINE_FUNDAMENTAL_OP_X1X2;
    auto* dy = Input(-1).template data<T, Context>();

-    if (Output(1)->name() != "ignore") {
+    if (Output(1)->name() != "NULL") {
        auto* dx2 = Output(1)->template mutable_data<T, Context>();
        vector<int> dims = { rows, cols }, axes = { type };
        kernel::ReduceSum(2, dims.data(),
            1, axes.data(), -1.f, dy, dx2, ctx());
    }

-    if (Output(0)->name() != "ignore") {
+    if (Output(0)->name() != "NULL") {
        auto* dx1 = Output(0)->template mutable_data<T, Context>();
        ctx()->template Copy<T, Context, Context>(
            X1->count(), dx1, dy);

--- a/Dragon/src/operators/array/argreduce_op.cc
+++ b/Dragon/src/operators/array/argreduce_op.cc
@@ -19,7 +19,7 @@ void ArgReduceOp<Context>::RunWithType() {
        // It's difficult to implement device code when top_k > 1
        auto* Xdata = Input(0).template data<T, CPUContext>();
        auto* Idata = Output(0)->template mutable_data<int64_t, CPUContext>();
-        auto* Vdata = Output(1)->name() != "ignore" ? Output(1)
+        auto* Vdata = Output(1)->name() != "NULL" ? Output(1)
            ->template mutable_data<T, CPUContext>() : nullptr;
        static CPUContext cctx;
        if (operation == "ARGMAX") {
@@ -34,7 +34,7 @@ void ArgReduceOp<Context>::RunWithType() {
    } else {
        auto* Xdata = Input(0).template data<T, Context>();
        auto* Idata = Output(0)->template mutable_data<int64_t, Context>();
-        auto* Vdata = Output(1)->name() != "ignore" ? Output(1)
+        auto* Vdata = Output(1)->name() != "NULL" ? Output(1)
            ->template mutable_data<T, Context>() : nullptr;
        if (operation == "ARGMAX") {
            kernel::ArgMax(outer_dim, inner_dim, axis_dim,

--- a/Dragon/src/operators/array/concat_op.cc
+++ b/Dragon/src/operators/array/concat_op.cc
@@ -82,7 +82,7 @@ void ConcatGradientOp<Context>::RunWithType() {

    for (int i = 0; i < OutputSize(); i++) {
        x_concat_dim = Input(i).dim(axis);
-        if (Output(i)->name() != "ignore") {
+        if (Output(i)->name() != "NULL") {
            auto* dXdata = Output(i)->template mutable_data<T, Context>();
            kernel::Slice(
                outer_dim, inner_dim,
@@ -95,7 +95,7 @@ void ConcatGradientOp<Context>::RunWithType() {

 template <class Context>
 void ConcatGradientOp<Context>::RunOnDevice() {
-    if (Input(-1).name() == "ignore") return;
+    if (Input(-1).name() == "NULL") return;

    DETERMINE_RUNTIME_ARGUMENTS(Input(0));


--- a/Dragon/src/operators/array/slice_op.cc
+++ b/Dragon/src/operators/array/slice_op.cc
@@ -89,7 +89,7 @@ void SliceGradientOp<Context>::RunWithType() {
            << "\nIllegal slice points: " << Tensor::DimString(slice_points)
            << " for dimension " << Input(0).dim(axis) << ".";

-        const T* dYdata = Input(i + 1).name() != "ignore" ?
+        const T* dYdata = Input(i + 1).name() != "NULL" ?
            Input(i + 1).template data<T, Context>() : nullptr;

        kernel::SliceGrad(

--- a/Dragon/src/operators/array/stack_op.cc
+++ b/Dragon/src/operators/array/stack_op.cc
@@ -69,7 +69,7 @@ void StackGradientOp<Context>::RunWithType() {
    auto* dYdata = Input(-1).template data<T, Context>();

    for (int i = 0; i < OutputSize(); i++) {
-        if (Output(i)->name() != "ignore") {
+        if (Output(i)->name() != "NULL") {
            auto* dXdata = Output(i)->template mutable_data<T, Context>();
            kernel::Slice(
                outer_dim, inner_dim,
@@ -81,7 +81,7 @@ void StackGradientOp<Context>::RunWithType() {

 template <class Context>
 void StackGradientOp<Context>::RunOnDevice() {
-    if (Input(-1).name() == "ignore") return;
+    if (Input(-1).name() == "NULL") return;

    DETERMINE_RUNTIME_ARGUMENTS(Input(-1));


--- a/Dragon/src/operators/control_flow/scan_op.cc
+++ b/Dragon/src/operators/control_flow/scan_op.cc
@@ -131,7 +131,7 @@ void ScanOp<Context>::UnrollTemplate() {
            func_def.output(i) +
                "@" + std::to_string(nsteps));
        // Concat all steps if necessary
-        if (Output(i)->name() == "ignore") continue;
+        if (Output(i)->name() == "NULL") continue;
        OperatorDef* op = new_def.add_op();
        op->set_name(name() + "(BodyOp." + std::to_string(
            nseqs + nrepeats + i) + ")");
@@ -186,7 +186,7 @@ void ScanGradientOp<Context>::MakeOps(
    maker.SetOperatorPrefix(name() + "(BodyOp.");
    maker.SetOperatorSuffix(")");
    for (int i = 0; i < forward_outputs.size(); i++) {
-        if (Input(i + (int)OutputSize()).name() != "ignore")
+        if (Input(i + (int)OutputSize()).name() != "NULL")
            maker.AddExternalGrad(Input(i + (int)OutputSize()).name());
    }

@@ -197,8 +197,8 @@ void ScanGradientOp<Context>::MakeOps(
    new_def.set_name(name() + "(ScanLen." + std::to_string(nsteps) + ")");
    for (const auto& target : forward_def.output()) {
        for (int i = 0; i < OutputSize(); i++) {
-            if (Output(i)->name() == "ignore") continue;
-            if (Input(i).name() == "ignore") continue;
+            if (Output(i)->name() == "NULL") continue;
+            if (Input(i).name() == "NULL") continue;
            auto* gradient = new_def.add_gradient();
            gradient->set_cost(target);
            gradient->set_wrt(Input(i).name());

--- a/Dragon/src/operators/loss/l1_loss_op.cc
+++ b/Dragon/src/operators/loss/l1_loss_op.cc
@@ -83,13 +83,13 @@ void L1LossGradientOp<Context>::RunWithType() {
    } else { dYHost *= scale; }

    for (int i = 0; i < 2; i++) {
-        if (Output(i)->name() == "ignore") continue;
+        if (Output(i)->name() == "NULL") continue;
        Output(i)->ReshapeLike(Input(i));
        auto* dXdata = Output(i)->template mutable_data<T, Context>();
        math::Scale(Output(i)->count(),
            dYHost * (i == 0 ? 1.f : -1.f),
                Ddata, dXdata, ctx());
-        if (Input(2).name() != "ignore") {
+        if (Input(2).name() != "NULL") {
            auto* mask = Input(2).template data<T, Context>();
            math::Mul(Output(i)->count(), mask, dXdata, dXdata, ctx());
        }

--- a/Dragon/src/operators/loss/l2_loss_op.cc
+++ b/Dragon/src/operators/loss/l2_loss_op.cc
@@ -88,13 +88,13 @@ void L2LossGradientOp<Context>::RunWithType() {
    } else { dYHost *= scale; }

    for (int i = 0; i < 2; i++) {
-        if (Output(i)->name() == "ignore") continue;
+        if (Output(i)->name() == "NULL") continue;
        Output(i)->ReshapeLike(Input(i));
        auto* dXdata = Output(i)->template mutable_data<T, Context>();
        math::Scale(Output(i)->count(),
            dYHost * (i == 0 ? 1.f : -1.f),
                Ddata, dXdata, ctx());
-        if (Input(2).name() != "ignore") {
+        if (Input(2).name() != "NULL") {
            auto* mask = Input(2).template data<T, Context>();
            math::Mul(Output(i)->count(), mask, dXdata, dXdata, ctx());
        }

--- a/Dragon/src/operators/loss/smooth_l1_loss_op.cc
+++ b/Dragon/src/operators/loss/smooth_l1_loss_op.cc
@@ -79,7 +79,7 @@ void SmoothL1LossGradientOp<Context>::RunWithType() {
    }

    for (int i = 0; i < 2; i++) {
-        if (Output(i)->name() == "ignore") continue;
+        if (Output(i)->name() == "NULL") continue;
        Output(i)->ReshapeLike(Input(i));
        auto* dXdata = Output(i)->template mutable_data<T, Context>();
        math::Scale(Output(i)->count(),

--- a/Dragon/src/operators/misc/gradient_op.cc
+++ b/Dragon/src/operators/misc/gradient_op.cc
@@ -8,7 +8,7 @@ namespace dragon {
 template <class Context> template <typename T>
 void GradientGenerateOp<Context>::RunWithType() {
    for (int i = 0; i < OutputSize(); i++) {
-        if (Output(i)->name() == "ignore") continue;
+        if (Output(i)->name() == "NULL") continue;
        Output(i)->ReshapeLike(Input(i));
        auto* dXdata = Output(0)->template mutable_data<T, Context>();
        math::Set(Output(0)->count(),

--- a/Dragon/src/operators/norm/batch_norm.cc
+++ b/Dragon/src/operators/norm/batch_norm.cc
@@ -174,8 +174,8 @@ void BatchNormGradientOp<Context>::InferenceRunWithType() {
    Tp* dgamma = nullptr, *dbeta = nullptr;

    // Gradient w.r.t. gamma or beta if necessary
-    if (Output(1)->name() != "ignore" ||
-            Output(2)->name() != "ignore") {
+    if (Output(1)->name() != "NULL" ||
+            Output(2)->name() != "NULL") {
        dgamma = Output(1)->template mutable_data<Tp, Context>();
        dbeta = Output(2)->template mutable_data<Tp, Context>();
    }

--- a/Dragon/src/operators/norm/cudnn_batch_norm_op.cc
+++ b/Dragon/src/operators/norm/cudnn_batch_norm_op.cc
@@ -242,8 +242,8 @@ void CuDNNBatchNormGradientOp<Context>::InferenceRunWithType() {
    Tp* dgamma = nullptr, *dbeta = nullptr;

    // Gradient w.r.t. gamma or beta if necessary
-    if (Output(1)->name() != "ignore" ||
-            Output(2)->name() != "ignore") {
+    if (Output(1)->name() != "NULL" ||
+            Output(2)->name() != "NULL") {
        dgamma = Output(1)->template mutable_data<Tp, Context>();
        dbeta = Output(2)->template mutable_data<Tp, Context>();
    }

--- a/Dragon/src/operators/recurrent/cudnn_recurrent_op.cc
+++ b/Dragon/src/operators/recurrent/cudnn_recurrent_op.cc
@@ -116,7 +116,7 @@ void CuDNNRecurrentOp<Context>::RunWithType() {
    };
    auto YsData = [this](int i) {
        if (i >= OutputSize()) return (T*)NULL;
-        if (Output(i)->name() == "ignore") return (T*)NULL;
+        if (Output(i)->name() == "NULL") return (T*)NULL;
        return Output(i)->template mutable_data<T, Context>();
    };

@@ -171,12 +171,12 @@ void CuDNNRecurrentGradientOp<Context>::RunWithType() {

    auto XsData = [this](int i) {
        if (i >= InputSize()) return (const T*)NULL;
-        if (Input(i).name() == "ignore") return (const T*)NULL;
+        if (Input(i).name() == "NULL") return (const T*)NULL;
        return Input(i).template data<T, Context>();
    };
    auto YsData = [this](int i) {
        if (i >= OutputSize()) return (T*)NULL;
-        if (Output(i)->name() == "ignore" && i > 0) return (T*)NULL;
+        if (Output(i)->name() == "NULL" && i > 0) return (T*)NULL;
        return Output(i)->template mutable_data<T, Context>();
    };

@@ -194,10 +194,10 @@ void CuDNNRecurrentGradientOp<Context>::RunWithType() {

    auto handle = ctx()->cudnn_handle();

-    if (Output(0)->name() != "ignore" ||
-            Output(1)->name() != "ignore" ||
-                Output(2)->name() != "ignore" ||
-                    Output(3)->name() != "ignore") {
+    if (Output(0)->name() != "NULL" ||
+            Output(1)->name() != "NULL" ||
+                Output(2)->name() != "NULL" ||
+                    Output(3)->name() != "NULL") {
        CUDNN_CHECK(cudnnRNNBackwardData(handle, rnn_desc,
                                               seq_length,
                              ys_desc->descs(), XsData(4), //   Y
@@ -214,7 +214,7 @@ void CuDNNRecurrentGradientOp<Context>::RunWithType() {
                                   RSdata, reserve_size));
    }

-    if (Output(1)->name() != "ignore") {
+    if (Output(1)->name() != "NULL") {
        CUDNN_CHECK(cudnnRNNBackwardWeights(handle, rnn_desc,
                                                  seq_length,
                                 xs_desc->descs(), XsData(0), //   X

--- a/Dragon/src/operators/recurrent/lstm_cell_op.cc
+++ b/Dragon/src/operators/recurrent/lstm_cell_op.cc
@@ -7,14 +7,14 @@ namespace dragon {

 template <class Context> template <typename T>
 void LSTMCellOp<Context>::RunWithType() {
-    auto* XAdata = Input(0).template mutable_data<T, Context>();
-    auto* CXdata = Input(1).template data<T, Context>();
+    auto* Xdata = Input(0).template mutable_data<T, Context>();
+    auto* HXdata = Input(1).template data<T, Context>();
    auto* Hdata = Output(0)->template mutable_data<T, Context>();
    auto* Cdata = Output(1)->template mutable_data<T, Context>();

    kernel::LSTMCell(Input(1).count(), Input(1).dim(0),
        Input(1).ndim() == 2 ? Input(1).dim(1) : Input(1).dim(2),
-            CXdata, XAdata, Cdata, Hdata, ctx());
+            HXdata, Xdata, Cdata, Hdata, ctx());
 }

 template <class Context>
@@ -34,17 +34,23 @@ OPERATOR_SCHEMA(LSTMCell).NumInputs(2, 3).NumOutputs(2);

 template <class Context> template <typename T>
 void LSTMCellGradientOp<Context>::RunWithType() {
-    auto* XAdata = Input(0).template data<T, Context>();
-    auto* CXdata = Input(1).template data<T, Context>();
+    auto* Xdata = Input(0).template data<T, Context>();
+    auto* HXdata = Input(1).template data<T, Context>();
    auto* Cdata = Input(2).template data<T, Context>();
-    auto* dHdata = Input(3).template data<T, Context>();
-    auto* dCdata = Input(4).template data<T, Context>();
+    auto* dHdata = Input(-2).template data<T, Context>();
+    auto* dCdata = Input(4).template mutable_data<T, Context>();
    auto* dXdata = Output(0)->template mutable_data<T, Context>();
-    auto* dCXdata = Output(1)->template mutable_data<T, Context>();
+    auto* dHXdata = Output(1)->template mutable_data<T, Context>();
+
+    if (Input(-1).name() == "NULL") {
+        math::Set(Input(-1).count(),
+            cast::to<T>(0.f), dCdata, ctx());
+    }

    kernel::LSTMCellGrad(Input(1).count(), Input(1).dim(0),
        Input(1).ndim() == 2 ? Input(1).dim(1) : Input(1).dim(2),
-            CXdata, XAdata, Cdata, dCdata, dHdata, dCXdata, dXdata, ctx());
+            HXdata, Xdata, Cdata, dCdata, dHdata,
+                dHXdata, dXdata, ctx());
 }

 template <class Context>
@@ -52,6 +58,12 @@ void LSTMCellGradientOp<Context>::RunOnDevice() {
    Output(0)->ReshapeLike(Input(0));
    Output(1)->ReshapeLike(Input(1));

+    if (Input(-1).name() == "NULL") {
+        // dC will be ignored if C is not solved
+        // We should Zero-Reset the dC
+        Input(-1).ReshapeLike(Input(-2));
+    }
+
    if (Input(0).template IsType<float>()) RunWithType<float>();
    else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
 }
@@ -72,8 +84,8 @@ class GetLSTMCellGradient final : public GradientMakerBase {
            vector<string>({ I(0), I(1), O(0), GO(0), GO(1) }),
            vector<string>({ GI(0), GI(1) }));
    }
-    //  fill zero for dc_{T+1}
-    vector<float> DefaultValues() override{ return { 0.f, 1.f }; }
+    // Fill zero for dCNext
+    vector<float> DefaultValues() override{ return { 1.f, 0.f }; }
 };

 REGISTER_GRADIENT(LSTMCell, GetLSTMCellGradient);

--- a/Dragon/src/operators/recurrent/recurrent_op.cc
+++ b/Dragon/src/operators/recurrent/recurrent_op.cc
@@ -24,14 +24,14 @@ class GetRecurrentGradient final : public GradientMakerBase {
    GRADIENT_MAKER_CTOR(GetRecurrentGradient);
    vector<OperatorDef> MakeDefs() override {
        vector<string> inputs({ I(0), I(1),
-            def.input_size() > 2 ? I(2) : "ignore",
-            def.input_size() > 3 ? I(3) : "ignore",
+            def.input_size() > 2 ? I(2) : "NULL",
+            def.input_size() > 3 ? I(3) : "NULL",
            O(0), GO(0),
-            def.output_size() > 1 ? GO(1) : "ignore",
-            def.output_size() > 2 ? GO(2) : "ignore"});
+            def.output_size() > 1 ? GO(1) : "NULL",
+            def.output_size() > 2 ? GO(2) : "NULL"});
        vector<string> outputs({ GI(0), GI(1),
-            def.input_size() > 2 ? GI(2) : "ignore",
-            def.input_size() > 3 ? GI(3) : "ignore"});
+            def.input_size() > 2 ? GI(2) : "NULL",
+            def.input_size() > 3 ? GI(3) : "NULL"});
        return SingleDef(def.type() + "Gradient", "", inputs, outputs);
    }
 };

--- a/Dragon/src/operators/vision/bias_add_op.cc
+++ b/Dragon/src/operators/vision/bias_add_op.cc
@@ -49,7 +49,7 @@ OPERATOR_SCHEMA(BiasAdd)

 template <class Context> template <typename T>
 void BiasAddGradientOp<Context>::RunWithType() {
-    if (Output(1)->name() != "ignore") {
+    if (Output(1)->name() != "NULL") {
        DECLARE_MULTIPLIER(multiplier, inner_dim);
        auto* dYdata = Input(-1).template mutable_data<T, Context>();
        auto* dBias = Output(1)->template mutable_data<T, Context>();
@@ -75,7 +75,7 @@ void BiasAddGradientOp<Context>::RunWithType() {
        }
    }

-    if (Output(0)->name() != "ignore" &&
+    if (Output(0)->name() != "NULL" &&
        Output(0)->name() != Input(-1).name()) {
        Output(0)->ReshapeLike(Input(-1));
        Output(0)->template CopyFrom<Context>(Input(-1), ctx());

--- a/Dragon/src/operators/vision/conv2d_op.cc
+++ b/Dragon/src/operators/vision/conv2d_op.cc
@@ -51,12 +51,12 @@ void Conv2dGradientOp<Context>::RunWithType() {
    }

    for (int n = 0; n < Input(2).dim(0); n++) {
-        if (Output(1)->name() != "ignore") {
+        if (Output(1)->name() != "NULL") {
            auto* Xdata = Input(0).template data<T, Context>();
            auto* dWdata = Output(1)->template mutable_data<T, Context>();
            Dw(dYdata + n * y_offset, Xdata + n * x_offset, dWdata);
        }
-        if (Output(0)->name() != "ignore") {
+        if (Output(0)->name() != "NULL") {
            auto* Wdata = Input(1).template data<T, Context>();
            auto* dXdata = Output(0)->template mutable_data<T, Context>();
            Dx(dYdata + n * y_offset, Wdata, dXdata + n * x_offset);

--- a/Dragon/src/operators/vision/conv2d_transpose_op.cc
+++ b/Dragon/src/operators/vision/conv2d_transpose_op.cc
@@ -48,22 +48,22 @@ template <class Context> template <typename T>
 void ConvTranspose2dGradientOp<Context>::RunWithType() {
    auto* dYdata = Input(-1).template data<T, Context>();

-    if (Output(2)->name() != "ignore") {
+    if (Output(2)->name() != "NULL") {
        auto* dBdata = Output(2)->template mutable_data<T, Context>();
        for (int n = 0; n < Input(2).dim(0); n++)
            Db(dYdata + n * y_offset, dBdata);
    }

    for (int n = 0; n < Input(2).dim(0); n++) {
-        if (Output(1)->name() != "ignore") {
+        if (Output(1)->name() != "NULL") {
            auto* Xdata = Input(0).template data<T, Context>();
            auto* dWdata = Output(1)->template mutable_data<T, Context>();
            Dw(Xdata + n * x_offset, dYdata + n * y_offset, dWdata);
        }
-        if (Output(0)->name() != "ignore") {
+        if (Output(0)->name() != "NULL") {
            auto* Wdata = Input(1).template data<T, Context>();
            auto* dXdata = Output(0)->template mutable_data<T, Context>();
-            bool skip = Output(1)->name() != "ignore";
+            bool skip = Output(1)->name() != "NULL";
            Wx(dYdata + n * y_offset, Wdata, dXdata + n * x_offset, skip);
        }
    }

--- a/Dragon/src/operators/vision/cudnn_bias_add_op.cc
+++ b/Dragon/src/operators/vision/cudnn_bias_add_op.cc
@@ -74,7 +74,7 @@ void CuDNNBiasAddGradientOp<Context>::RunWithType() {
        CUDNNType<T>::one, input_desc, dYdata,
            CUDNNType<T>::zero, bias_desc, dBdata));

-    if (Output(0)->name() != "ignore" &&
+    if (Output(0)->name() != "NULL" &&
        Output(0)->name() != Input(-1).name()) {
        Output(0)->ReshapeLike(Input(-1));
        Output(0)->template CopyFrom<Context>(Input(-1), ctx());

--- a/Dragon/src/operators/vision/cudnn_conv2d_op.cc
+++ b/Dragon/src/operators/vision/cudnn_conv2d_op.cc
@@ -294,7 +294,7 @@ void CuDNNConv2dGradientOp<Context>::RunWithType() {

    auto cudnn_handle = ctx()->cudnn_handle();

-    if (Output(2)->name() != "ignore") {
+    if (Output(2)->name() != "NULL") {
        T* dBdata = Output(2)->template mutable_data<T, Context>();
        CUDNN_CHECK(cudnnConvolutionBackwardBias(cudnn_handle,
            CUDNNType<T>::one, input2b_desc, dYdata,
@@ -302,7 +302,7 @@ void CuDNNConv2dGradientOp<Context>::RunWithType() {
    }

    for (int g = 0; g < cudnn_group; g++) {
-        if (Output(1)->name() != "ignore") {
+        if (Output(1)->name() != "NULL") {
            auto* Xdata = Input(0).template data<T, Context>();
            auto* dWdata = Output(1)->template mutable_data<T, Context>();
            CUDNN_CHECK(cudnnConvolutionBackwardFilter(cudnn_handle,
@@ -311,7 +311,7 @@ void CuDNNConv2dGradientOp<Context>::RunWithType() {
                        conv_desc, bwd_filter_algo, WSdata, bwd_filter_size,
                CUDNNType<T>::zero, filter_desc, dWdata + weight_offset * g));
        }
-        if (Output(0)->name() != "ignore") {
+        if (Output(0)->name() != "NULL") {
            auto* Wdata = Input(1).template data<T, Context>();
            auto* dXdata = Output(0)->template mutable_data<T, Context>();
            CUDNN_CHECK(cudnnConvolutionBackwardData(cudnn_handle,

--- a/Dragon/src/operators/vision/cudnn_conv2d_transpose_op.cc
+++ b/Dragon/src/operators/vision/cudnn_conv2d_transpose_op.cc
@@ -290,7 +290,7 @@ void CuDNNConvTranspose2dGradientOp<Context>::RunWithType() {

    auto cudnn_handle = ctx()->cudnn_handle();

-    if (Output(2)->name() != "ignore") {
+    if (Output(2)->name() != "NULL") {
        T* dBdata = Output(2)->template mutable_data<T, Context>();
        CUDNN_CHECK(cudnnConvolutionBackwardBias(cudnn_handle,
            CUDNNType<T>::one, input2b_desc, dYdata,
@@ -298,7 +298,7 @@ void CuDNNConvTranspose2dGradientOp<Context>::RunWithType() {
    }

    for (int g = 0; g < cudnn_group; g++) {
-        if (Output(1)->name() != "ignore") {
+        if (Output(1)->name() != "NULL") {
            auto* Xdata = Input(0).template data<T, Context>();
            auto* dWdata = Output(1)->template mutable_data<T, Context>();
            CUDNN_CHECK(cudnnConvolutionBackwardFilter(cudnn_handle,
@@ -307,7 +307,7 @@ void CuDNNConvTranspose2dGradientOp<Context>::RunWithType() {
                        conv_desc, bwd_filter_algo, WSdata, bwd_filter_size,
                CUDNNType<T>::zero, filter_desc, dWdata + weight_offset * g));
        }
-        if (Output(0)->name() != "ignore") {
+        if (Output(0)->name() != "NULL") {
            auto* Wdata = Input(1).template data<T, Context>();
            auto* dXdata = Output(0)->template mutable_data<T, Context>();
            CUDNN_CHECK(cudnnConvolutionForward(cudnn_handle,

--- a/Dragon/src/operators/vision/cudnn_depthwise_conv2d_op.cc
+++ b/Dragon/src/operators/vision/cudnn_depthwise_conv2d_op.cc
@@ -79,7 +79,7 @@ void CuDNNDepthwiseConv2dGradientOp<Context>::RunWithType() {
    }

    for (int n = 0; n < Input(2).dim(0); n++) {
-        if (Output(1)->name() != "ignore") {
+        if (Output(1)->name() != "NULL") {
            auto* Xdata = Input(0).template data<T, Context>();
            auto* dWdata = Output(1)->template mutable_data<T, Context>();
            math::Set(Output(1)->count(), cast::to<T>(0.f), dWdata, ctx());
@@ -88,7 +88,7 @@ void CuDNNDepthwiseConv2dGradientOp<Context>::RunWithType() {
                    kernel_shape[0], kernel_shape[1], stride[0], pad_l[0], pad_l[1],
                        data_format, dYdata, Xdata, dWdata, ctx());
        }
-        if (Output(0)->name() != "ignore") {
+        if (Output(0)->name() != "NULL") {
            auto* Wdata = Input(1).template data<T, Context>();
            auto* dXdata = Output(0)->template mutable_data<T, Context>();
            kernel::DepthwiseConv2dGrad(Input(0).dim(0), channels,

--- a/Dragon/src/operators/vision/depthwise_conv2d_op.cc
+++ b/Dragon/src/operators/vision/depthwise_conv2d_op.cc
@@ -54,7 +54,7 @@ void DepthwiseConv2dGradientOp<Context>::RunWithType() {
    }

    for (int n = 0; n < Input(2).dim(0); n++) {
-        if (Output(1)->name() != "ignore") {
+        if (Output(1)->name() != "NULL") {
            auto* Xdata = Input(0).template data<T, Context>();
            auto* dWdata = Output(1)->template mutable_data<T, Context>();
            math::Set(Output(1)->count(), cast::to<T>(0.f), dWdata, ctx());
@@ -63,7 +63,7 @@ void DepthwiseConv2dGradientOp<Context>::RunWithType() {
                    kernel_shape[0], kernel_shape[1], stride[0], pad_l[0], pad_l[1],
                        data_format, dYdata, Xdata, dWdata, ctx());
        }
-        if (Output(0)->name() != "ignore") {
+        if (Output(0)->name() != "NULL") {
            auto* Wdata = Input(1).template data<T, Context>();
            auto* dXdata = Output(0)->template mutable_data<T, Context>();
            kernel::DepthwiseConv2dGrad(Input(0).dim(0), channels,

--- a/Dragon/src/proto/dragon.proto
+++ b/Dragon/src/proto/dragon.proto
 // Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
 // Licensed under the BSD 2-Clause License.

+// Codes are based on:
+// https://github.com/pytorch/pytorch/blob/master/caffe2/proto/caffe2.proto
+
 syntax = "proto2";

 package dragon;

+// Store the serialized Tensor objects.
 message TensorProto {
    repeated int32 dims = 1;
    enum DataType {
        UNDEFINED = 0;
+        // Basic types.
        FLOAT = 1;
        INT32 = 2;
        BYTE = 3;
        STRING = 4;
+
+        // Less-commonly used data types.
+        BOOL = 5;
+        UINT8 = 6;
+        INT8 = 7;
+        UINT16 = 8;
+        INT16 = 9;
+        INT64 = 10;
        FLOAT16 = 12;
+        DOUBLE = 13;
    }
    optional DataType data_type = 2 [default = FLOAT];
+    // For float.
    repeated float float_data = 3 [packed = true];
+    // For int32, uint8, int8, uint16, int16, bool, and float16
+    // Note about float16: in storage we will basically convert float16 byte-wise
+    // to unsigned short and then store them in the int32_data field.
    repeated int32 int32_data = 4 [packed = true];
+    // For bytes.
    optional bytes byte_data = 5;
+    // For strings.
    repeated bytes string_data = 6;
+    // For double.
+    repeated double double_data = 9 [packed = true];
+    // For int64.
+    repeated int64 int64_data = 10 [packed = true];
+    // Store the raw data, contents are serialized as little-endian.
+    optional bytes raw_data = 13;
+
+    // Optionally, a name for the tensor.
    optional string name = 7;
 }

+// Record the filler of Tensor.
+// This structure is kept for backward compatibility
+// with caffe1, which relies on implicit initializers.
 message TensorFillerProto {
    optional string tensor = 1;
    optional string type = 2 [default = 'constant'];
@@ -36,67 +67,120 @@ message TensorFillerProto {
    optional VarianceNorm variance_norm = 9 [default = FAN_IN];
 }

+// Store multiple TensorProto objects in one single proto.
 message TensorProtos {
  repeated TensorProto protos = 1;
 }

-enum DeviceType { 
-	PROTO_CPU = 0;
-	PROTO_CUDA = 1;
-	PROTO_CNML = 2;
+// The device types that Dragon currently supports.
+enum DeviceTypeProto {
+    // The default device.
+    PROTO_CPU = 0;
+    // NVIDIA's CUDA Environment.
+    PROTO_CUDA = 1;
+    // CAMBRICON's CNML Environment.
+    PROTO_CNML = 2;
 }

+// Device-specific options.
 message DeviceOption {
-    optional DeviceType device_type = 1 [default = PROTO_CPU];
+    // The type of device to dispatch executions.
+    optional DeviceTypeProto device_type = 1 [default = PROTO_CPU];
+    // The index of this device.
    optional int32 device_id = 2 [default = 0];
+    // The random seed to start the random generator.
    optional uint32 random_seed = 3 [default = 3];
-    optional string engine = 4;
 }

+// A named argument containing either singular float, integer and string
+// values, or repeated float, int and string arrays.
 message Argument {
+    // The name of this argument.
    optional string name = 1;
+
+    // Store the float32 value.
    optional float f = 2;
+    // Store the bool, int32, int64 value.
    optional int64 i = 3;
+    // Store the string value.
    optional bytes s = 4;

+    // Store the float32 values.
    repeated float floats = 7;
+    // Store the bool, int32, int64 values.
    repeated int64 ints = 8;
+    // Store the string values.
    repeated bytes strings = 9;
 }

+// Operator Definition
 message OperatorDef {
+    // The unique id of this operator.
+    // Set it to persist operators in the dynamic graph.
    optional string uid = 1;
+
+    // The name of inputs.
    repeated string input = 2;
+    // The name of outputs.
    repeated string output = 3;
+
+    // The optional name of this operator.
    optional string name = 4;
+    // The operator type.
    optional string type = 5;
+    // The arguments.
    repeated Argument arg = 6;
+
+    // The device option that the operator should run under.
    optional DeviceOption device_option = 7;
 }

+// Record the gradient information
 message GradientProto {
+    // The derivative target.
    optional string cost = 1;
+    // The target that the derivative is taken with respect to.
    optional string wrt = 2;
+    // The external gradient
    optional string external = 3;
 }

+// Record the updater information
 message UpdaterProto {
+    // The operator name to use.
    optional string name = 1;
+    // The operator type.
    optional string type = 2;
+    // The tensor to update.
    repeated string tensor = 3;
+    // The arguments.
    repeated Argument arg = 4;
 }

+// Graph Definition
 message GraphDef {
+    // The graph name.
    optional string name = 1;
+
+    // The operators to execute.
    repeated OperatorDef op = 2;
+
+    // The type of graph.
    optional string graph_type = 3;
+
+    // The device option for this graph.
    optional DeviceOption device_option = 5;
+
+    // The arguments.
    repeated Argument arg = 6;

+    // The name of inputs.
    repeated string input = 7;
+    // The name of outputs.
    repeated string output = 8;

+    // The gradients information.
    repeated GradientProto gradient = 9;
+    // The updaters information.
    repeated UpdaterProto updater = 10;
 }
\ No newline at end of file