Commit 02ad90d5 by Ting PAN

Remove the duplicate workspace singletons

Summary:
This commit moves the workspace API onto the current workspace instance.
Accordingly, the namespace ``dragon.workspace`` is removed for simplicity.
1 parent adb6fa64
Showing with 3854 additions and 4629 deletions
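To make the change concrete, a hedged before/after sketch of the workspace usage implied by this commit (feeding by the plain string name ``'data'`` is an illustrative assumption; ``get_workspace()``, ``feed_tensor()`` and ``fetch_tensor()`` appear in the updated sources below):

```python
import numpy
from dragon.core.framework import workspace

# Before: module-level singleton functions, e.g.
#   workspace.feed_tensor(tensor, value)
# After: the same operations are methods of the current workspace instance.
current_ws = workspace.get_workspace()
current_ws.feed_tensor('data', numpy.ones((2, 3), 'float32'))  # feed a value
print(current_ws.fetch_tensor('data'))                         # fetch it back
```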
<p align="center"> <p align="center">
<img width="40%" src="http://dragon.seetatech.com/static/images/styles-dragon.png"/> <img width="40%" src="https://dragon.seetatech.com/static/images/styles-dragon.png"/>
</p> </p>
[Dragon](http://dragon.seetatech.com) is a **C**(Computation)**G**(Graph)**V**(Virtual)**M**(Machine) based distributed deep learning framework. [Dragon](https://dragon.seetatech.com) is a **C**(Computation)**G**(Graph)**V**(Virtual)**M**(Machine) based distributed deep learning framework.
It fuses several modern frameworks and integrations together, powered by a unified engine. It fuses several modern frameworks and integrations together, powered by a unified engine.
The computation between different programming styles is deterministic and reproducible. The computation between different programming styles is deterministic and reproducible.
...@@ -11,7 +11,7 @@ promoting internal interfaces. We will always learn from the AI community to evo ...@@ -11,7 +11,7 @@ promoting internal interfaces. We will always learn from the AI community to evo
## Installation ## Installation
See the [install guide](http://dragon.seetatech.com/install) for the pip package See the [install guide](https://dragon.seetatech.com/install) for the pip package
or how to build from source. or how to build from source.
## License ## License
......
...@@ -15,9 +15,13 @@ from __future__ import absolute_import ...@@ -15,9 +15,13 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
import numpy
from dragon.core.autograph.tensor import TensorRef from dragon.core.autograph.tensor import TensorRef
from dragon.core.eager import context as eager_context from dragon.core.eager import context as eager_context
from dragon.core.framework import context from dragon.core.framework import context
from dragon.core.util import logging
from dragon.vm.caffe.proto import caffe_pb2
class Layer(object): class Layer(object):
...@@ -34,24 +38,26 @@ class Layer(object): ...@@ -34,24 +38,26 @@ class Layer(object):
""" """
self._proto = layer_param self._proto = layer_param
self._name = layer_param.name self._name = layer_param.name
self._arguments, self.arguments = {'name': self._name}, {} self._arguments, self.arguments = {'name': 'output'}, {}
# Store the inputs, outputs and trainable parameters. # Store the inputs, outputs and trainable parameters.
self._bottom, self._top, self._blobs = [], [], [] self._bottom, self._top, self._blobs = [], [], []
for blob in layer_param.bottom: for blob in layer_param.bottom:
self._bottom.append(blob) self._bottom.append(blob)
for blob in layer_param.top: for blob in layer_param.top:
self._top.append(blob) self._top.append(blob)
# Store the loss weight to apply gradients. # Store the loss weight to apply gradients.
self._loss_weight = layer_param.loss_weight \ self._loss_weight = layer_param.loss_weight \
if len(layer_param.loss_weight) > 0 else None if len(layer_param.loss_weight) > 0 else None
# Optional mirror stage argument for memory optimization. # Optional mirror stage argument for memory optimization.
if layer_param.HasField('mirror_stage'): if layer_param.HasField('mirror_stage'):
self._arguments['mirror_stage'] = layer_param.mirror_stage self._arguments['mirror_stage'] = layer_param.mirror_stage
@property @property
def blobs(self):
"""Return the blobs."""
return self._blobs
@property
def bottom(self): def bottom(self):
"""Return the bottom names.""" """Return the bottom names."""
return self._bottom return self._bottom
...@@ -62,49 +68,91 @@ class Layer(object): ...@@ -62,49 +68,91 @@ class Layer(object):
return self._loss_weight return self._loss_weight
@property @property
def name(self):
"""Return the layer name."""
return self._name
@property
def top(self): def top(self):
"""Return the top names.""" """Return the top names."""
return self._top return self._top
def add_blob(self, value=None, filler=None, no_grad=False): def add_blob(self, value=None, filler=None, no_grad=False):
"""Add a weight blob into this layer.""" """Add a blob into this layer."""
# Use a fixed name in the current workspace. # Set the name for reference explicitly.
# Note that a non-empty tensor scope will make it data_name = context.get_name_scope() + 'param:{}'.format(len(self._blobs))
# impossible to load/save models. You should use data, diff = TensorRef(data_name), TensorRef(data_name + '_grad')
# a new workspace instead of the terrible name scope.
scoped_name = context.get_name_scope() + self._name
param_name = scoped_name + '/param:{}'.format(len(self._blobs))
# Set the name explicitly.
variable = TensorRef(param_name)
variable_grad = TensorRef(param_name + '_grad')
if filler is not None: if filler is not None:
variable._register_as(**filler) data._register_as(**filler)
else: else:
# Register a constant filler by default. # Register a constant filler by default.
value = value if value else 0 value = value if value else 0
variable.constant(value=value) data.constant(value=value)
# Append to the blobs.
self._blobs.append({'data': data, 'diff': None if no_grad else diff})
# Determine whether to disable the gradients explicitly. def from_proto(self, proto):
if no_grad is True: """Deserialize from the proto.
variable_grad = None
# Append to the blobs. Parameters
self._blobs.append({'data': variable, 'diff': variable_grad}) ----------
proto : LayerParameter
The ``LayerParameter`` protocol buffer.
"""
for i in range(len(self._blobs)):
if i < len(proto.blobs):
blob_proto = proto.blobs[i]
if len(blob_proto.data) > 0:
value = numpy.array(blob_proto.data, dtype='float32')
elif len(blob_proto.double_data) > 0:
value = numpy.array(blob_proto.double_data, dtype='float64')
else:
raise ValueError('Neither <data> nor <double_data> in blob proto.')
if len(blob_proto.shape.dim) > 0:
value = value.reshape([dim for dim in blob_proto.shape.dim])
self._blobs[i]['data'].set_value(value)
logging.info('Blob({}/param:{}) loaded, shape: {}, size: {}'
.format(self._name, i, value.shape, value.size))
def setup(self, bottom): def setup(self, bottom):
# Merge the arguments, then setup up the specific layer. """Setup the layer."""
self.arguments = dict(self.arguments, **self._arguments) self.arguments = dict(self.arguments, **self._arguments)
bottom = bottom[0] if len(bottom) == 1 else bottom bottom = bottom[0] if len(bottom) == 1 else bottom
with eager_context.graph_mode(): with eager_context.graph_mode():
return self.__call__(bottom) return self.__call__(bottom)
@classmethod def to_proto(self):
def get_filler(cls, layer_param, filler_name): """Serialize to the proto.
"""Construct a filler from the parameter."""
if layer_param.HasField(filler_name): Returns
filler = getattr(layer_param, filler_name) -------
LayerParameter
The ``LayerParameter`` protocol buffer.
"""
proto = caffe_pb2.LayerParameter()
proto.CopyFrom(self._proto)
for blob in self._blobs:
value = blob['data'].get_value()
if str(value.dtype) == 'float32':
blob_proto = caffe_pb2.BlobProto(
data=value.flatten(),
shape=caffe_pb2.BlobShape(dim=value.shape))
elif str(value.dtype) == 'float64':
blob_proto = caffe_pb2.BlobProto(
double_data=value.flatten(),
shape=caffe_pb2.BlobShape(dim=value.shape))
else:
raise ValueError('Either float32 or float64 blob is required.')
proto.blobs.extend([blob_proto])
return proto
@staticmethod
def get_filler(proto, filler_name):
"""Return the filler from proto."""
if proto.HasField(filler_name):
filler = getattr(proto, filler_name)
return { return {
'type': filler.type.lower(), 'type': filler.type.lower(),
'value': filler.value, 'value': filler.value,
......
...@@ -16,14 +16,10 @@ from __future__ import print_function ...@@ -16,14 +16,10 @@ from __future__ import print_function
from dragon.vm.caffe.layers.common import Accuracy from dragon.vm.caffe.layers.common import Accuracy
from dragon.vm.caffe.layers.common import ArgMax from dragon.vm.caffe.layers.common import ArgMax
from dragon.vm.caffe.layers.common import BatchNorm from dragon.vm.caffe.layers.common import BatchNorm
from dragon.vm.caffe.layers.common import Cast
from dragon.vm.caffe.layers.common import Concat from dragon.vm.caffe.layers.common import Concat
from dragon.vm.caffe.layers.common import Crop from dragon.vm.caffe.layers.common import Crop
from dragon.vm.caffe.layers.common import Eltwise from dragon.vm.caffe.layers.common import Eltwise
from dragon.vm.caffe.layers.common import Flatten from dragon.vm.caffe.layers.common import Flatten
from dragon.vm.caffe.layers.common import FusedBatchNorm
from dragon.vm.caffe.layers.common import FusedGroupNorm
from dragon.vm.caffe.layers.common import GroupNorm
from dragon.vm.caffe.layers.common import InnerProduct from dragon.vm.caffe.layers.common import InnerProduct
from dragon.vm.caffe.layers.common import Input from dragon.vm.caffe.layers.common import Input
from dragon.vm.caffe.layers.common import Normalize from dragon.vm.caffe.layers.common import Normalize
...@@ -46,12 +42,10 @@ from dragon.vm.caffe.layers.neuron import ELU ...@@ -46,12 +42,10 @@ from dragon.vm.caffe.layers.neuron import ELU
from dragon.vm.caffe.layers.neuron import Power from dragon.vm.caffe.layers.neuron import Power
from dragon.vm.caffe.layers.neuron import PReLU from dragon.vm.caffe.layers.neuron import PReLU
from dragon.vm.caffe.layers.neuron import ReLU from dragon.vm.caffe.layers.neuron import ReLU
from dragon.vm.caffe.layers.neuron import SELU
from dragon.vm.caffe.layers.neuron import Sigmoid from dragon.vm.caffe.layers.neuron import Sigmoid
from dragon.vm.caffe.layers.neuron import TanH from dragon.vm.caffe.layers.neuron import TanH
from dragon.vm.caffe.layers.vision import Convolution from dragon.vm.caffe.layers.vision import Convolution
from dragon.vm.caffe.layers.vision import Deconvolution from dragon.vm.caffe.layers.vision import Deconvolution
from dragon.vm.caffe.layers.vision import DepthwiseConv2d
from dragon.vm.caffe.layers.vision import LRN from dragon.vm.caffe.layers.vision import LRN
from dragon.vm.caffe.layers.vision import Pooling from dragon.vm.caffe.layers.vision import Pooling
from dragon.vm.caffe.layers.vision import ROIAlign from dragon.vm.caffe.layers.vision import ROIAlign
......
...@@ -15,7 +15,9 @@ from __future__ import absolute_import ...@@ -15,7 +15,9 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
from dragon.core.autograph.tensor import Tensor from dragon.core.autograph.tensor import TensorRef
from dragon.core.framework import context
from dragon.core.framework import workspace
from dragon.core.ops import activation_ops from dragon.core.ops import activation_ops
from dragon.core.ops import array_ops from dragon.core.ops import array_ops
from dragon.core.ops import framework_ops from dragon.core.ops import framework_ops
...@@ -32,15 +34,15 @@ class Accuracy(Layer): ...@@ -32,15 +34,15 @@ class Accuracy(Layer):
```python ```python
layer { layer {
type: "Accuracy" type: "Accuracy"
bottom: "ip2" bottom: "ip2"
bottom: "label" bottom: "label"
top: "acc" top: "acc"
accuracy_param { accuracy_param {
axis: 1 axis: 1
top_k: 1 top_k: 1
ignore_label: -1 ignore_label: -1
} }
} }
``` ```
...@@ -67,13 +69,13 @@ class ArgMax(Layer): ...@@ -67,13 +69,13 @@ class ArgMax(Layer):
```python ```python
layer { layer {
type: "ArgMax" type: "ArgMax"
bottom: "ip2" bottom: "ip2"
top: "cls" top: "cls"
argmax_param { argmax_param {
top_k: 1 top_k: 1
axis: 1 axis: 1
} }
} }
``` ```
...@@ -100,14 +102,14 @@ class BatchNorm(Layer): ...@@ -100,14 +102,14 @@ class BatchNorm(Layer):
```python ```python
layer { layer {
type: "BatchNorm" type: "BatchNorm"
bottom: "conv1" bottom: "conv1"
top: "conv1/bn" top: "conv1/bn"
batch_norm_param { batch_norm_param {
use_global_stats: False use_global_stats: False
moving_average_fraction: 0.9 moving_average_fraction: 0.9
eps: 1e-5 eps: 1e-5
} }
} }
``` ```
...@@ -123,43 +125,27 @@ class BatchNorm(Layer): ...@@ -123,43 +125,27 @@ class BatchNorm(Layer):
'eps': param.eps, 'eps': param.eps,
'axis': 1, 'axis': 1,
} }
self.add_blob(value=1, no_grad=True) # gamma
self.add_blob(value=0, no_grad=True) # beta
self.add_blob(value=0, no_grad=True) # running_mean self.add_blob(value=0, no_grad=True) # running_mean
self.add_blob(value=1, no_grad=True) # running_var self.add_blob(value=1, no_grad=True) # running_var
self.add_blob(value=1, no_grad=True) # running_num_batches
self.add_blob(value=1, no_grad=True) # fixed_gamma
self.add_blob(value=0, no_grad=True) # fixed_beta
self._blobs[2]['data'].set_value([1.])
self._weight, self._bias = [blob['data'] for blob in self._blobs[3:5]]
del self._blobs[3:] # Avoid to save the fixed blobs
def fuse_with_scale_layer(self, scale_layer):
self._weight = scale_layer._blobs[0]['data']
if len(scale_layer._blobs) == 2:
self._bias = scale_layer._blobs[1]['data']
scale_layer.__call__ = lambda *args, **kwargs: None
def __call__(self, bottom): def __call__(self, bottom):
inputs = [bottom] + [blob['data'] for blob in self._blobs] inputs = [bottom, self._weight, self._bias] + \
[blob['data'] for blob in self._blobs[:2]]
return normalization_ops.batch_norm(inputs, **self.arguments) return normalization_ops.batch_norm(inputs, **self.arguments)
class Cast(Layer):
r"""Cast the data type of input.
Examples:
```python
layer {
type: "Cast"
bottom: "ip2/fp16"
top: "ip2/fp32"
cast_param {
dtype: "float32"
}
}
```
"""
def __init__(self, layer_param):
super(Cast, self).__init__(layer_param)
param = layer_param.cast_param
self.arguments = {'dtype': param.dtype.lower()}
def __call__(self, bottom):
return array_ops.cast(bottom, **self.arguments)
class Concat(Layer): class Concat(Layer):
r"""Concatenate the inputs along the given axis. r"""Concatenate the inputs along the given axis.
...@@ -167,13 +153,13 @@ class Concat(Layer): ...@@ -167,13 +153,13 @@ class Concat(Layer):
```python ```python
layer { layer {
type: "Concat" type: "Concat"
bottom: "conv2" bottom: "conv2"
bottom: "conv1" bottom: "conv1"
top: "conv2/fuse" top: "conv2/fuse"
concat_param { concat_param {
axis: 1 axis: 1
} }
} }
``` ```
...@@ -194,15 +180,15 @@ class Crop(Layer): ...@@ -194,15 +180,15 @@ class Crop(Layer):
```python ```python
layer { layer {
type: "Crop" type: "Crop"
bottom: "score" bottom: "score"
bottom: "score/ref" bottom: "score/ref"
top: "score/crop" top: "score/crop"
crop_param { crop_param {
axis: 2 axis: 2
offset: 5 offset: 5
offset: 10 offset: 10
} }
} }
``` ```
...@@ -232,15 +218,15 @@ class Eltwise(Layer): ...@@ -232,15 +218,15 @@ class Eltwise(Layer):
```python ```python
layer { layer {
type: "Eltwise" type: "Eltwise"
bottom: "conv2" bottom: "conv2"
bottom: "conv1" bottom: "conv1"
top: "conv2/fuse" top: "conv2/fuse"
eltwise_param { eltwise_param {
operation: SUM operation: SUM
coeff: 1. coeff: 1.
coeff: 1. coeff: 1.
} }
} }
``` ```
...@@ -250,9 +236,9 @@ class Eltwise(Layer): ...@@ -250,9 +236,9 @@ class Eltwise(Layer):
super(Eltwise, self).__init__(layer_param) super(Eltwise, self).__init__(layer_param)
param = layer_param.eltwise_param param = layer_param.eltwise_param
self.eltwise_op = { self.eltwise_op = {
0: math_ops.mul, # MUL 0: math_ops.mul,
1: math_ops.add, # SUM 1: math_ops.add,
2: math_ops.maximum, # MAX 2: math_ops.maximum,
}[param.operation] }[param.operation]
self.factors = [element for element in param.coeff] self.factors = [element for element in param.coeff]
...@@ -273,13 +259,13 @@ class Flatten(Layer): ...@@ -273,13 +259,13 @@ class Flatten(Layer):
```python ```python
layer { layer {
type: "Flatten" type: "Flatten"
bottom: "conv5" bottom: "conv5"
top: "conv5/flatten" top: "conv5/flatten"
flatten_param { flatten_param {
axis: 1 axis: 1
end_axis: -1 end_axis: -1
} }
} }
``` ```
...@@ -296,141 +282,6 @@ class Flatten(Layer): ...@@ -296,141 +282,6 @@ class Flatten(Layer):
return array_ops.flatten(bottom, **self.arguments) return array_ops.flatten(bottom, **self.arguments)
class FusedBatchNorm(Layer):
r"""Apply the fused batch normalization.
`[Ioffe & Szegedy, 2015] <https://arxiv.org/abs/1502.03167>`_.
Examples:
```python
layer {
type: "FusedBatchNorm"
bottom: "conv1"
top: "conv1/bn"
batch_norm_param {
use_global_stats: False
moving_average_fraction: 0.9
eps: 1e-5
}
scale_param {
filler: {
type: "constant"
value: 1
}
bias_filler {
type: "constant"
value: 0
}
}
}
```
"""
def __init__(self, layer_param):
super(FusedBatchNorm, self).__init__(layer_param)
bn_param = layer_param.batch_norm_param
scale_param = layer_param.scale_param
self.arguments = {
'axis': 1,
'momentum': bn_param.moving_average_fraction,
'eps': bn_param.eps,
'use_stats': int(bn_param.use_global_stats)
if bn_param.HasField('use_global_stats') else -1,
}
self.add_blob(filler=self.get_filler(scale_param, 'filler'), value=1) # gamma
self.add_blob(filler=self.get_filler(scale_param, 'bias_filler')) # beta
self.add_blob(value=0, no_grad=True) # running_mean
self.add_blob(value=1, no_grad=True) # running_var
def __call__(self, bottom):
inputs = [bottom] + [blob['data'] for blob in self._blobs]
return normalization_ops.batch_norm(inputs, **self.arguments)
class FusedGroupNorm(Layer):
r"""Apply the fused group normalization.
`[Wu & He, 2018] <https://arxiv.org/abs/1803.08494>`_.
Examples:
```python
layer {
type: "FusedGroupNorm"
bottom: "conv1"
top: "conv1/gn"
group_norm_param {
group: 32
eps: 1e-5
}
scale_param {
filler: {
type: "constant"
value: 1
}
bias_filler {
type: "constant"
value: 0
}
}
}
```
"""
def __init__(self, layer_param):
super(FusedGroupNorm, self).__init__(layer_param)
gn_param = layer_param.group_norm_param
scale_param = layer_param.scale_param
self.arguments = {
'axis': 1,
'group': gn_param.group,
'eps': gn_param.eps,
}
self.add_blob(filler=self.get_filler(scale_param, 'filler'), value=1) # gamma
self.add_blob(filler=self.get_filler(scale_param, 'bias_filler')) # beta
def __call__(self, bottom):
inputs = [bottom] + [blob['data'] for blob in self._blobs]
return normalization_ops.group_norm(inputs, **self.arguments)
class GroupNorm(Layer):
r"""Apply the group normalization.
`[Wu & He, 2018] <https://arxiv.org/abs/1803.08494>`_.
Examples:
```python
layer {
type: "GroupNorm"
bottom: "conv1"
top: "conv1/gn"
group_norm_param {
group: 32
eps: 1e-5
}
}
```
"""
def __init__(self, layer_param):
super(GroupNorm, self).__init__(layer_param)
param = layer_param.group_norm_param
self.arguments = {
'axis': 1,
'group': param.group,
'eps': param.eps,
}
self.add_blob(value=1, no_grad=True)
self.add_blob(value=0, no_grad=True)
def __call__(self, bottom):
inputs = [bottom] + [blob['data'] for blob in self._blobs]
return normalization_ops.group_norm(inputs, **self.arguments)
class InnerProduct(Layer): class InnerProduct(Layer):
r"""Compute the dense matrix multiplication along the given axes. r"""Compute the dense matrix multiplication along the given axes.
...@@ -438,13 +289,13 @@ class InnerProduct(Layer): ...@@ -438,13 +289,13 @@ class InnerProduct(Layer):
```python ```python
layer { layer {
type: "InnerProduct" type: "InnerProduct"
bottom: "conv5" bottom: "conv5"
top: "ip1" top: "ip1"
inner_product_param { inner_product_param {
axis: 1 axis: 1
num_output: 1024 num_output: 1024
} }
} }
``` ```
...@@ -458,7 +309,6 @@ class InnerProduct(Layer): ...@@ -458,7 +309,6 @@ class InnerProduct(Layer):
'out_channels': param.num_output, 'out_channels': param.num_output,
'transpose_w': not param.transpose, 'transpose_w': not param.transpose,
} }
# Add weights and biases
self.add_blob(filler=self.get_filler(param, 'weight_filler')) self.add_blob(filler=self.get_filler(param, 'weight_filler'))
if param.bias_term: if param.bias_term:
self.add_blob(filler=self.get_filler(param, 'bias_filler')) self.add_blob(filler=self.get_filler(param, 'bias_filler'))
...@@ -475,15 +325,13 @@ class Input(Layer): ...@@ -475,15 +325,13 @@ class Input(Layer):
```python ```python
layer { layer {
type: "Input" type: "Input"
top: "a" top: "data1"
top: "b" top: "data2"
input_param { input_param {
shape: { dim: 2 dim: 3 } shape: { dim: 2 dim: 3 }
shape: { dim: 2 dim: 3 dim: 3 } shape: { dim: 2 dim: 3 dim: 3 }
dtype: "float32" }
dtype: "float64"
}
} }
``` ```
...@@ -492,20 +340,24 @@ class Input(Layer): ...@@ -492,20 +340,24 @@ class Input(Layer):
def __init__(self, layer_param): def __init__(self, layer_param):
super(Input, self).__init__(layer_param) super(Input, self).__init__(layer_param)
param = layer_param.input_param param = layer_param.input_param
self.shapes, self.dtypes = [], [] self.blob_shapes = []
for i in range(len(self.top)): for i in range(len(self.top)):
if i < len(param.shape): if i < len(param.shape):
self.shapes.append([e for e in param.shape[i].dim]) self.blob_shapes.append([e for e in param.shape[i].dim])
else:
self.shapes.append(None)
if i < len(param.dtype):
self.dtypes.append(param.dtype[i])
else: else:
self.dtypes.append('float32') self.blob_shapes.append(None)
def __call__(self, bottom): def __call__(self, bottom):
return [Tensor(shape=self.shapes[i], dtype=self.dtypes[i]) name_scope = context.get_name_scope()
for i in range(len(self.shapes))] current_ws = workspace.get_workspace()
return [TensorRef(
name=current_ws.unique_name(
name_scope + 'output',
suffix=':{}'.format(i),
namespace='Tensor'),
shape=self.blob_shapes[i],
dtype='float32',
).placeholder() for i in range(len(self.blob_shapes))]
class Normalize(Layer): class Normalize(Layer):
...@@ -516,18 +368,18 @@ class Normalize(Layer): ...@@ -516,18 +368,18 @@ class Normalize(Layer):
```python ```python
layer { layer {
type: "Normalize" type: "Normalize"
bottom: "conv4" bottom: "conv4"
top: "conv4/norm" top: "conv4/norm"
normalize_param { normalize_param {
across_spatial: false across_spatial: false
channel_shared: false channel_shared: false
eps: 1e-12 eps: 1e-12
scale_filler: { scale_filler: {
type: "constant" type: "constant"
value: 1 value: 1
} }
} }
} }
``` ```
...@@ -560,15 +412,15 @@ class Permute(Layer): ...@@ -560,15 +412,15 @@ class Permute(Layer):
```python ```python
layer { layer {
type: "Permute" type: "Permute"
bottom: "cls_score" bottom: "cls_score"
top: "cls_score/perm" top: "cls_score/perm"
permute_param { permute_param {
order: 0 order: 0
order: 2 order: 2
order: 3 order: 3
order: 1 order: 1
} }
} }
``` ```
...@@ -590,16 +442,16 @@ class Python(Layer): ...@@ -590,16 +442,16 @@ class Python(Layer):
```python ```python
layer { layer {
type: "Python" type: "Python"
bottom: "cls_prob" bottom: "cls_prob"
bottom: "bbox_pred" bottom: "bbox_pred"
bottom: "ims_info" bottom: "ims_info"
top: "rois" top: "rois"
python_param { python_param {
module: 'rpn.proposal_layer' module: 'rpn.proposal_layer'
layer: 'ProposalLayer' layer: 'ProposalLayer'
param_str: "'feat_stride': 16" param_str: "'feat_stride': 16"
} }
} }
``` ```
...@@ -626,13 +478,13 @@ class Reduction(Layer): ...@@ -626,13 +478,13 @@ class Reduction(Layer):
```python ```python
layer { layer {
type: "Reduction" type: "Reduction"
bottom: "entropy" bottom: "entropy"
top: "loss" top: "loss"
reduction_param { reduction_param {
operation: SUM operation: SUM
axis: 1 axis: 1
} }
} }
``` ```
...@@ -646,10 +498,7 @@ class Reduction(Layer): ...@@ -646,10 +498,7 @@ class Reduction(Layer):
raise ValueError('The negative axis can only be -1.') raise ValueError('The negative axis can only be -1.')
self.scale = param.coeff self.scale = param.coeff
self.arguments = {'axis': [param.axis]} self.arguments = {'axis': [param.axis]}
self.reduction = { self.reduction = {1: array_ops.sum, 4: array_ops.mean}[param.operation]
1: array_ops.sum,
4: array_ops.mean,
}[param.operation]
def __call__(self, bottom): def __call__(self, bottom):
top = self.reduction(bottom, **self.arguments) top = self.reduction(bottom, **self.arguments)
...@@ -665,16 +514,16 @@ class Reshape(Layer): ...@@ -665,16 +514,16 @@ class Reshape(Layer):
```python ```python
layer { layer {
type: "Reshape" type: "Reshape"
bottom: "bbox_pred/perm" bottom: "bbox_pred/perm"
top: "bbox_pred/reshape" top: "bbox_pred/reshape"
reshape_param { reshape_param {
shape { shape {
dim: 0 dim: 0
dim: -1 dim: -1
dim: 4 dim: 4
} }
} }
} }
``` ```
...@@ -696,22 +545,22 @@ class Scale(Layer): ...@@ -696,22 +545,22 @@ class Scale(Layer):
```python ```python
layer { layer {
type: "Scale" type: "Scale"
bottom: "conv1/bn" bottom: "conv1/bn"
top: "conv1/scale" top: "conv1/scale"
scale_param { scale_param {
axis: 1 axis: 1
num_axes: 1 num_axes: 1
bias_term: true bias_term: true
filler: { filler: {
type: "constant" type: "constant"
value: 1 value: 1
} }
bias_filler { bias_filler {
type: "constant" type: "constant"
value: 0 value: 0
} }
} }
} }
``` ```
...@@ -721,7 +570,6 @@ class Scale(Layer): ...@@ -721,7 +570,6 @@ class Scale(Layer):
super(Scale, self).__init__(layer_param) super(Scale, self).__init__(layer_param)
param = layer_param.scale_param param = layer_param.scale_param
self.arguments = {'axis': param.axis, 'num_axes': param.num_axes} self.arguments = {'axis': param.axis, 'num_axes': param.num_axes}
# Add weights and biases
self.add_blob(filler=self.get_filler(param, 'filler'), value=1) self.add_blob(filler=self.get_filler(param, 'filler'), value=1)
if param.bias_term: if param.bias_term:
self.add_blob(filler=self.get_filler(param, 'bias_filler')) self.add_blob(filler=self.get_filler(param, 'bias_filler'))
...@@ -738,16 +586,16 @@ class Slice(Layer): ...@@ -738,16 +586,16 @@ class Slice(Layer):
```python ```python
layer { layer {
type: "Slice" type: "Slice"
bottom: "image" bottom: "image"
top: "image/b" top: "image/b"
top: "image/g" top: "image/g"
top: "image/r" top: "image/r"
slice_param { slice_param {
axis: 1 axis: 1
slice_point: 1 slice_point: 1
slice_point: 2 slice_point: 2
} }
} }
``` ```
...@@ -773,12 +621,12 @@ class Softmax(Layer): ...@@ -773,12 +621,12 @@ class Softmax(Layer):
```python ```python
layer { layer {
type: "Softmax" type: "Softmax"
bottom: "cls_score" bottom: "cls_score"
top: "cls_prob" top: "cls_prob"
softmax_param { softmax_param {
axis: 1 axis: 1
} }
} }
``` ```
...@@ -799,9 +647,9 @@ class StopGradient(Layer): ...@@ -799,9 +647,9 @@ class StopGradient(Layer):
```python ```python
layer { layer {
type: "StopGradient" type: "StopGradient"
bottom: "res2c" bottom: "res2c"
top: "res2c/frozen" top: "res2c/frozen"
} }
``` ```
...@@ -815,22 +663,18 @@ class StopGradient(Layer): ...@@ -815,22 +663,18 @@ class StopGradient(Layer):
class Tile(Layer): class Tile(Layer):
r"""Tile the input according to the given multiples. r"""Repeat the input according to the given axis.
Examples: Examples:
```python ```python
layer { layer {
type: "Slice" type: "Tile"
bottom: "conv2" bottom: "data"
top: "conv2/dup" top: "output"
tile_param { tile_param {
multiples: { axis: 1
dim: 1 tiles: 2
dim: 2
dim: 1
dim: 1
}
} }
} }
``` ```
...@@ -840,7 +684,9 @@ class Tile(Layer): ...@@ -840,7 +684,9 @@ class Tile(Layer):
def __init__(self, layer_param): def __init__(self, layer_param):
super(Tile, self).__init__(layer_param) super(Tile, self).__init__(layer_param)
param = layer_param.tile_param param = layer_param.tile_param
self.arguments = {'multiples': [e for e in param.multiples.dim]} repeats = [1] * (param.axis + 1)
repeats[param.axis] = param.tiles
self.arguments = {'repeats': repeats}
def __call__(self, bottom): def __call__(self, bottom):
return array_ops.tile(bottom, **self.arguments) return array_ops.tile(bottom, **self.arguments)
...@@ -33,8 +33,9 @@ class _DataPlugin(object): ...@@ -33,8 +33,9 @@ class _DataPlugin(object):
def forward(self, inputs, outputs): def forward(self, inputs, outputs):
blobs = self.iterator.next() blobs = self.iterator.next()
current_ws = workspace.get_workspace()
for i, blob in enumerate(blobs): for i, blob in enumerate(blobs):
workspace.feed_tensor(outputs[i], blob) current_ws.feed_tensor(outputs[i], blob)
class Data(Layer): class Data(Layer):
...@@ -44,42 +45,46 @@ class Data(Layer): ...@@ -44,42 +45,46 @@ class Data(Layer):
```python ```python
layer { layer {
type: "Data" type: "Data"
top: "data" top: "data"
top: "label" top: "label"
include { phase: TRAIN } include {
data_param { phase: TRAIN
source: "/data/imagenet/train" }
batch_size: 128 data_param {
shuffle: true source: "/data/train"
num_chunks: 0 batch_size: 128
prefetch: 5 shuffle: true
} num_chunks: 0
transform_param { prefetch: 5
mirror: true }
random_crop_size: 224 transform_param {
augment_color: true mirror: true
mean_value: 104.00698793 random_crop_size: 224
mean_value: 116.66876762 augment_color: true
mean_value: 122.67891434 mean_value: 104.00698793
} mean_value: 116.66876762
mean_value: 122.67891434
}
} }
layer { layer {
type: "Data" type: "Data"
top: "data" top: "data"
top: "label" top: "label"
include { phase: TEST } include {
data_param { phase: TEST
source: "/data/imagenet/val" }
batch_size: 100 data_param {
} source: "/data/val"
transform_param { batch_size: 64
resize: 256 }
crop_size: 224 transform_param {
mean_value: 104.00698793 resize: 256
mean_value: 116.66876762 crop_size: 224
mean_value: 122.67891434 mean_value: 104.00698793
} mean_value: 116.66876762
mean_value: 122.67891434
}
} }
``` ```
......
...@@ -30,13 +30,13 @@ class EuclideanLoss(Layer): ...@@ -30,13 +30,13 @@ class EuclideanLoss(Layer):
```python ```python
layer { layer {
type: "EuclideanLoss" type: "EuclideanLoss"
bottom: "bbox_pred" bottom: "bbox_pred"
bottom: "bbox_target" bottom: "bbox_target"
top: "bbox_loss" top: "bbox_loss"
loss_param { loss_param {
normalization: BATCH_SIZE normalization: BATCH_SIZE
} }
} }
``` ```
...@@ -67,13 +67,13 @@ class SigmoidCrossEntropyLoss(Layer): ...@@ -67,13 +67,13 @@ class SigmoidCrossEntropyLoss(Layer):
```python ```python
layer { layer {
type: "SigmoidCrossEntropyLoss" type: "SigmoidCrossEntropyLoss"
bottom: "rpn_cls_score" bottom: "rpn_cls_score"
bottom: "rpn_labels" bottom: "rpn_labels"
top: "rpn_loss" top: "rpn_loss"
loss_param { loss_param {
normalization: VALID normalization: VALID
} }
} }
``` ```
...@@ -106,15 +106,15 @@ class SmoothL1Loss(Layer): ...@@ -106,15 +106,15 @@ class SmoothL1Loss(Layer):
```python ```python
layer { layer {
type: "SmoothL1Loss" type: "SmoothL1Loss"
bottom: "bbox_pred" bottom: "bbox_pred"
bottom: "bbox_targets" bottom: "bbox_targets"
bottom: "bbox_inside_weights" bottom: "bbox_inside_weights"
bottom: "bbox_outside_weights" bottom: "bbox_outside_weights"
top: "bbox_loss" top: "bbox_loss"
loss_param { loss_param {
normalization: BATCH_SIZE normalization: BATCH_SIZE
} }
} }
``` ```
...@@ -155,15 +155,17 @@ class SoftmaxWithLoss(Layer): ...@@ -155,15 +155,17 @@ class SoftmaxWithLoss(Layer):
```python ```python
layer { layer {
type: "SoftmaxWithLoss" type: "SoftmaxWithLoss"
bottom: "cls_score" bottom: "cls_score"
bottom: "labels" bottom: "labels"
top: "cls_loss" top: "cls_loss"
softmax_param { axis: 1 } softmax_param {
loss_param { axis: 1
ignore_label: -1 }
normalization: VALID loss_param {
} ignore_label: -1
normalization: VALID
}
} }
``` ```
......
...@@ -32,12 +32,12 @@ class Dropout(Layer): ...@@ -32,12 +32,12 @@ class Dropout(Layer):
```python ```python
layer { layer {
type: "Dropout" type: "Dropout"
bottom: "fc6" bottom: "fc6"
top: "fc6" top: "fc6"
dropout_param { dropout_param {
dropout_ratio: 0.5 dropout_ratio: 0.5
} }
} }
``` ```
...@@ -73,12 +73,12 @@ class ELU(Layer): ...@@ -73,12 +73,12 @@ class ELU(Layer):
```python ```python
layer { layer {
type: "ELU" type: "ELU"
bottom: "conv2" bottom: "conv2"
top: "conv2" top: "conv2"
elu_param { elu_param {
alpha: 1. alpha: 1.
} }
} }
``` ```
...@@ -101,14 +101,14 @@ class Power(Layer): ...@@ -101,14 +101,14 @@ class Power(Layer):
```python ```python
layer { layer {
type: "Power" type: "Power"
bottom: "x" bottom: "x"
top: "y" top: "y"
power_param { power_param {
scale: 1. scale: 1.
shift: 0. shift: 0.
power: 2. power: 2.
} }
} }
``` ```
...@@ -148,16 +148,16 @@ class PReLU(Layer): ...@@ -148,16 +148,16 @@ class PReLU(Layer):
```python ```python
layer { layer {
type: "PReLU" type: "PReLU"
bottom: "conv2" bottom: "conv2"
top: "conv2/relu" top: "conv2/relu"
prelu_param { prelu_param {
channel_shared: false channel_shared: false
filler { filler {
type: "constant" type: "constant"
value: 0.25 value: 0.25
}
} }
}
} }
``` ```
...@@ -194,12 +194,12 @@ class ReLU(Layer): ...@@ -194,12 +194,12 @@ class ReLU(Layer):
```python ```python
layer { layer {
type: "ReLU" type: "ReLU"
bottom: "conv2" bottom: "conv2"
top: "conv2/relu" top: "conv2/relu"
relu_param { relu_param {
negative_slope: 0. negative_slope: 0.
} }
} }
``` ```
...@@ -215,38 +215,6 @@ class ReLU(Layer): ...@@ -215,38 +215,6 @@ class ReLU(Layer):
return activation_ops.relu(bottom, **self.arguments) return activation_ops.relu(bottom, **self.arguments)
class SELU(Layer):
r"""Apply the scaled exponential linear unit.
`[Klambauer et.al, 2017] <https://arxiv.org/abs/1706.02515>`_.
The **SELU** function is defined as:
.. math::
\text{SELU}(x) = 1.0507 *
\begin{cases}
x, & \text{ if } x \geq 0 \\
1.6733 * (e^{x} - 1), & \text{ otherwise }
\end{cases}
Examples:
```python
layer {
type: "SELU"
bottom: "conv2"
top: "conv2/relu"
}
```
"""
def __init__(self, layer_param):
super(SELU, self).__init__(layer_param)
def __call__(self, bottom):
return activation_ops.selu(bottom, **self.arguments)
class Sigmoid(Layer): class Sigmoid(Layer):
r"""Apply the sigmoid function. r"""Apply the sigmoid function.
...@@ -258,9 +226,9 @@ class Sigmoid(Layer): ...@@ -258,9 +226,9 @@ class Sigmoid(Layer):
```python ```python
layer { layer {
type: "Sigmoid" type: "Sigmoid"
bottom: "rpn_cls_score" bottom: "rpn_cls_score"
top: "rpn_cls_prob" top: "rpn_cls_prob"
} }
``` ```
...@@ -284,9 +252,9 @@ class TanH(Layer): ...@@ -284,9 +252,9 @@ class TanH(Layer):
```python ```python
layer { layer {
type: "TanH" type: "TanH"
bottom: "g/conv5" bottom: "g/conv5"
top: "g/image" top: "g/image"
} }
``` ```
......
...@@ -23,39 +23,29 @@ from dragon.vm.caffe.layer import Layer ...@@ -23,39 +23,29 @@ from dragon.vm.caffe.layer import Layer
class Convolution(Layer): class Convolution(Layer):
r"""Apply the n-dimension convolution. r"""Apply the n-dimension convolution.
The spatial output dimension is computed as:
.. math::
\begin{cases}
\text{DK}_{size} = dilation *
(\text{K}_{size} - 1) + 1 \\
\text{Dim}_{out} = (\text{Dim}_{in} +
2 * pad - \text{DK}_{size}) / stride + 1
\end{cases}
Examples: Examples:
```python ```python
layer { layer {
type: "Convolution" type: "Convolution"
bottom: "input" bottom: "input"
top: "conv1" top: "conv1"
convolution_param { convolution_param {
num_output: 32 num_output: 32
bias_term: true bias_term: true
kernel_size: 3 kernel_size: 3
pad: 1 pad: 1
stride: 1 stride: 1
dilation: 1 dilation: 1
group: 1 group: 1
weight_filler { weight_filler {
type: "xavier" type: "xavier"
}
bias_filler {
type: "constant"
value: 0
}
} }
bias_filler {
type: "constant"
value: 0
}
}
} }
``` ```
...@@ -83,7 +73,6 @@ class Convolution(Layer): ...@@ -83,7 +73,6 @@ class Convolution(Layer):
if param.HasField('pad_h'): if param.HasField('pad_h'):
assert param.HasField('pad_w') assert param.HasField('pad_w')
self.arguments['pads'] = [param.pad_h, param.pad_w] self.arguments['pads'] = [param.pad_h, param.pad_w]
self.add_blob(filler=self.get_filler(param, 'weight_filler')) self.add_blob(filler=self.get_filler(param, 'weight_filler'))
if param.bias_term: if param.bias_term:
self.add_blob(filler=self.get_filler(param, 'bias_filler')) self.add_blob(filler=self.get_filler(param, 'bias_filler'))
...@@ -96,39 +85,29 @@ class Convolution(Layer): ...@@ -96,39 +85,29 @@ class Convolution(Layer):
class Deconvolution(Convolution): class Deconvolution(Convolution):
r"""Apply the 2d deconvolution. r"""Apply the 2d deconvolution.
The spatial output dimension is computed as:
.. math::
\begin{cases}
\text{DK}_{size} = dilation *
(\text{K}_{size} - 1) + 1 \\
\text{Dim}_{out} = (\text{Dim}_{in} - 1) *
stride + \text{DK}_{size} - 2 * pad
\end{cases}
Examples: Examples:
```python ```python
layer { layer {
type: "Deconvolution" type: "Deconvolution"
bottom: "conv5" bottom: "conv5"
top: "conv5/upscale" top: "conv5/upscale"
convolution_param { convolution_param {
num_output: 256 num_output: 256
bias_term: true bias_term: true
kernel_size: 2 kernel_size: 2
pad: 0 pad: 0
stride: 2 stride: 2
dilation: 1 dilation: 1
group: 1 group: 1
weight_filler { weight_filler {
type: "xavier" type: "xavier"
} }
bias_filler { bias_filler {
type: "constant" type: "constant"
value: 0 value: 0
}
} }
}
} }
``` ```
...@@ -142,77 +121,6 @@ class Deconvolution(Convolution): ...@@ -142,77 +121,6 @@ class Deconvolution(Convolution):
return vision_ops.conv2d_transpose(inputs, **self.arguments) return vision_ops.conv2d_transpose(inputs, **self.arguments)
class DepthwiseConv2d(Layer):
r"""Apply the 2d depthwise convolution.
`[Chollet, 2016] <https://arxiv.org/abs/1610.02357>`_.
The spatial output dimension is computed as:
.. math::
\begin{cases}
\text{DK}_{size} = dilation *
(\text{K}_{size} - 1) + 1 \\
\text{Dim}_{out} = (\text{Dim}_{in} +
2 * pad - \text{DK}_{size}) / stride + 1
\end{cases}
Examples:
```python
layer {
type: "DepthwiseConv2d"
bottom: "input"
top: "conv1"
convolution_param {
num_output: 32
bias_term: true
kernel_size: 3
pad: 1
stride: 1
dilation: 1
weight_filler {
type: "xavier"
variance_norm: FAN_OUT
}
bias_filler {
type: "constant"
value: 0
}
}
}
```
"""
def __init__(self, layer_param):
super(DepthwiseConv2d, self).__init__(layer_param)
param = layer_param.convolution_param
self.arguments = {
'out_channels': param.num_output,
'kernel_shape': [int(e) for e in param.kernel_size],
'strides': [int(e) for e in param.stride] if len(param.stride) > 0 else [1],
'pads': [int(e) for e in param.pad] if len(param.pad) > 0 else [0],
'padding': 'VALID',
'data_format': 'NCHW',
}
if param.HasField('kernel_h'):
assert param.HasField('kernel_w')
self.arguments['kernel_shape'] = [param.kernel_h, param.kernel_w]
if param.HasField('stride_h'):
assert param.HasField('stride_w')
self.arguments['strides'] = [param.stride_h, param.stride_w]
if param.HasField('pad_h'):
assert param.HasField('pad_w')
self.arguments['pads'] = [param.pad_h, param.pad_w]
self.add_blob(filler=self.get_filler(param, 'weight_filler'))
if param.bias_term:
self.add_blob(filler=self.get_filler(param, 'bias_filler'))
def __call__(self, bottom):
inputs = [bottom] + [blob['data'] for blob in self._blobs]
return vision_ops.depthwise_conv2d(inputs, **self.arguments)
class LRN(Layer): class LRN(Layer):
r"""Apply the local response normalization. r"""Apply the local response normalization.
`[Krizhevsky et al., 2012] <http://www.cs.toronto.edu/~hinton/absps/imagenet.pdf>`_. `[Krizhevsky et al., 2012] <http://www.cs.toronto.edu/~hinton/absps/imagenet.pdf>`_.
...@@ -221,15 +129,15 @@ class LRN(Layer): ...@@ -221,15 +129,15 @@ class LRN(Layer):
```python ```python
layer { layer {
type: "LRN" type: "LRN"
bottom: "conv2" bottom: "conv2"
top: "conv2/norm" top: "conv2/norm"
lrn_param { lrn_param {
local_size: 5 local_size: 5
alpha: 1. alpha: 1.
beta: 0.75 beta: 0.75
k: 1. k: 1.
} }
} }
``` ```
...@@ -255,24 +163,18 @@ class LRN(Layer): ...@@ -255,24 +163,18 @@ class LRN(Layer):
class Pooling(Layer): class Pooling(Layer):
r"""Apply the n-dimension pooling. r"""Apply the n-dimension pooling.
The spatial output dimension is computed as:
.. math::
\text{Dim}_{out} = (\text{Dim}_{in} +
2 * pad - \text{K}_{size}) / stride + 1
Examples: Examples:
```python ```python
layer { layer {
type: "Pooling" type: "Pooling"
bottom: "conv2" bottom: "conv2"
top: "pool2" top: "pool2"
pooling_param { pooling_param {
kernel_size: 3 kernel_size: 3
stride: 2 stride: 2
pool: AVG pool: AVG
} }
} }
``` ```
...@@ -311,14 +213,14 @@ class ROIAlign(Layer): ...@@ -311,14 +213,14 @@ class ROIAlign(Layer):
```python ```python
layer { layer {
type: "ROIAlign" type: "ROIAlign"
bottom: "conv5_3" bottom: "conv5_3"
top: "roi_pool4" top: "roi_pool4"
roi_pooling_param { roi_pooling_param {
pooled_w: 7 pooled_w: 7
pooled_h: 7 pooled_h: 7
spatial_scale: 0.0625 spatial_scale: 0.0625
} }
} }
``` ```
...@@ -345,14 +247,14 @@ class ROIPooling(Layer): ...@@ -345,14 +247,14 @@ class ROIPooling(Layer):
```python ```python
layer { layer {
type: "ROIPooling" type: "ROIPooling"
bottom: "conv5_3" bottom: "conv5_3"
top: "roi_pool4" top: "roi_pool4"
roi_pooling_param { roi_pooling_param {
pooled_w: 7 pooled_w: 7
pooled_h: 7 pooled_h: 7
spatial_scale: 0.0625 spatial_scale: 0.0625
} }
} }
``` ```
......
...@@ -20,10 +20,11 @@ from google.protobuf import text_format ...@@ -20,10 +20,11 @@ from google.protobuf import text_format
from dragon.core.autograph import def_function from dragon.core.autograph import def_function
from dragon.core.autograph import grad_impl from dragon.core.autograph import grad_impl
from dragon.core.autograph.tensor import Tensor
from dragon.core.autograph.tensor import TensorRef from dragon.core.autograph.tensor import TensorRef
from dragon.core.framework import context
from dragon.core.framework import workspace from dragon.core.framework import workspace
from dragon.core.util import nest from dragon.core.util import nest
from dragon.core.util import serialization
from dragon.vm.caffe import layers as layer_factory from dragon.vm.caffe import layers as layer_factory
from dragon.vm.caffe.proto import caffe_pb2 from dragon.vm.caffe.proto import caffe_pb2
...@@ -37,236 +38,83 @@ class Blob(object): ...@@ -37,236 +38,83 @@ class Blob(object):
class Net(object): class Net(object):
"""The abstraction ``caffe.Net``. """The abstraction ``caffe.Net``.
This class accepts a proto-text file, and an optional This class accepts a network file, and an optional parameter file.
serialized model weights. You can also specify a phase In addition, a phase tag indicates whether gradients are computed:
flag to indicate whether to compute the gradients:
```python ```python
train_net = Net('train.prototxt', 'TRAIN') net1 = caffe.Net('train.prototxt', 'TRAIN')
test_net = Net('test.prototxt', 'my.caffemodel', 'TEST') net2 = caffe.Net('test.prototxt', 'test.caffemodel', 'TEST')
``` ```
""" """
def __init__(self, *args): def __init__(self, *args):
"""Create a Net. """Create a ``Net``.
Parameters Parameters
---------- ----------
network_file : str net_file : str
The path of ``net.prototxt`` file. The path of text proto file to load network.
weights : str, optional param_file : str, optional
The path of the weights file. The path of binary proto file to load parameters.
phase : {'TRAIN', 'TEST'}, optional phase : {'TRAIN', 'TEST'}, optional
The optional phase. The optional phase tag.
""" """
if len(args) == 2: if len(args) == 2:
(net_file, self._phase), weights = args, None (net_file, self._phase), param_file = args, None
elif len(args) == 3: elif len(args) == 3:
net_file, weights, self._phase = args net_file, param_file, self._phase = args
else: else:
raise ValueError('Expected 2 or 3 args.') raise ValueError('Expected 2 or 3 args.')
self._net_proto = caffe_pb2.NetParameter()
self._blobs = {} self._blobs = {}
self._layers = [] self._layers = []
self._layer_blobs = [] self._layer_blobs = []
self._losses = [] self._losses = []
self._variables = [] self._params = []
self._blob_dict = None self._blob_dict = None
self._param_dict = None self._param_dict = None
self._input_list = None self._input_list = None
self._output_list = None self._output_list = None
# Parse the network file
with open(net_file, 'r') as f: with open(net_file, 'r') as f:
text_format.Parse(f.read(), self._net_proto) self._proto = text_format.Parse(f.read(), caffe_pb2.NetParameter())
# Construct the layer class from proto
if len(self._net_proto.input) > 0: for layer_param in self._proto.layer:
shapes = self._net_proto.input_shape if not self._filter_layer(layer_param):
for i, input_name in enumerate(self._net_proto.input):
shape = [e for e in shapes[i].dim] if i < len(shapes) else None
if input not in self._blobs:
data = Tensor(input_name, shape, 'float32').placeholder()
self._blobs[input_name] = {
'data': data,
'diff': TensorRef(data.id + '_grad', shape, data.dtype),
}
for layer in self._net_proto.layer:
if not self._filter_layer(layer):
continue continue
cls = getattr(layer_factory, layer.type) cls = getattr(layer_factory, layer_param.type)
self._layers.append(cls(layer)) with context.name_scope(layer_param.name):
self._layers.append(cls(layer_param))
# Prepare for the legacy net inputs
if len(self._proto.input) > 0:
layer_param = caffe_pb2.LayerParameter(
name='data',
type='Input',
top=self._proto.input,
input_param=caffe_pb2.InputParameter(
shape=self._proto.input_shape))
cls = getattr(layer_factory, layer_param.type)
with context.name_scope(layer_param.name):
self._layers.insert(0, cls(layer_param))
# Call layers sequentially to get outputs
self._setup() self._setup()
# Collect losses and parameters
for layer in self._net_proto.layer: for layer in self._proto.layer:
if not self._filter_layer(layer): if not self._filter_layer(layer):
continue continue
self._collect_losses_and_variables(layer) self._collect_losses_and_params(layer)
# Load the pre-trained weights if necessary
if weights is not None: if param_file is not None:
workspace.load(weights, format='caffe') self.copy_from(param_file)
def _filter_layer(self, layer_param):
"""Indicate whether the given layer should be included."""
phase_dict = {'TRAIN': 0, 'TEST': 1}
if layer_param.HasField('phase') and \
layer_param.phase != phase_dict[self._phase]:
return False
for include in layer_param.include:
if include.HasField('phase') and \
include.phase != phase_dict[self._phase]:
return False
layer_param.phase = phase_dict[self._phase]
return True
def _setup(self):
"""Connect the layers sequentially."""
self._net_outputs = set()
# Collect bottom and top blobs.
for layer in self._layers:
bottom = []
for blob in layer._bottom:
if blob not in self._blobs:
raise RuntimeError('bottom({}) is unknown.'.format(blob))
bottom.append(self._blobs[blob])
if blob in self._net_outputs:
self._net_outputs.remove(blob)
outputs = layer.setup([blob['data'] for blob in bottom])
outputs = nest.flatten(outputs)
for i, blob in enumerate(layer._top):
self._blobs[blob] = {
'data': outputs[i],
'diff': TensorRef(outputs[i].id + '_grad'),
}
self._net_outputs.add(blob)
# Collect layer param blobs.
for blobs in self.params.values():
self._layer_blobs.extend(blobs)
def _collect_losses_and_variables(self, layer_param):
"""Collect losses and variables."""
if layer_param.type.find('Loss') != -1:
if len(layer_param.loss_weight) == 0:
layer_param.loss_weight.extend([1.])
for i, loss_weight in enumerate(layer_param.loss_weight):
if loss_weight <= 0:
continue
self._losses.append(self.blobs[layer_param.top[i]].data)
else:
if len(layer_param.loss_weight) != 0:
for i, loss_weight in enumerate(layer_param.loss_weight):
if loss_weight <= 0:
continue
self._losses.append(self.blobs[layer_param.top[i]].data)
if self._phase != 'TRAIN':
return
if len(layer_param.param) > 0:
for i, p in enumerate(layer_param.param):
blob = self.params[layer_param.name][i]
blob.lr_multiplier = p.lr_mult if p.HasField('lr_mult') else 1.
blob.decay_multiplier = p.decay_mult if p.HasField('decay_mult') else 1.
if blob.diff is not None and blob.lr_multiplier > 0:
self._variables.append(blob.data)
else:
for blob in self.params[layer_param.name]:
if blob.diff is not None and blob.lr_multiplier > 0:
self._variables.append(blob.data)
@classmethod
def copy_from(cls, weights):
"""Copy the weights from the binary proto file.
Parameters
----------
weights : str
The path of the weights file.
"""
workspace.load(weights, format='caffe')
@def_function.function
def forward_backward(self, **kwargs):
"""Forward pass following by backward pass.
This function will be compiled to a computation graph
once executed, with implicit feeding of inputs.
"""
grad_impl.gradients(self._losses, self._variables)
return [self.blobs[key].data for key in self.outputs]
def forward(self, **inputs):
"""Forward pass.
Parameters
----------
inputs : dict, optional
The blobs to feed.
Returns
-------
callable
The callable to return outputs.
"""
for name, blob in inputs.items():
workspace.feed_tensor(self._blobs[name]['data'], blob)
self.forward_backward(return_outputs=False, stage='forward')
return lambda: dict(
(output, self.blobs[output].data.get_value())
for output in self.outputs
)
def backward(self, **diffs):
"""Backward pass.
Parameters
----------
diffs : dict, optional
The diffs to feed.
"""
for name, blob in diffs.items():
workspace.feed_tensor(self.blobs[name].diff, blob)
self.forward_backward(return_outputs=False, stage='backward')
def save(self, filename):
"""Save the parameters into a binary file.
Parameters
----------
filename : str
The path of model file.
"""
workspace.save(
tensors=[blob.data for blob in self._layer_blobs],
filename=filename, suffix='', format='caffe',
)
@property @property
def blobs(self): def blobs(self):
"""Return the blob dict. """Return the blob dict.
Blobs stored in the dict will be:
```python
for blob_name, blob in net.blobs():
print(blob.data) # DataTensor
print(blob.diff) # GradTensor
```
Returns Returns
------- -------
Dict dict
The blob dict. The blob dict.
""" """
...@@ -280,19 +128,9 @@ class Net(object): ...@@ -280,19 +128,9 @@ class Net(object):
def params(self): def params(self):
"""Return the parameter dict. """Return the parameter dict.
Parameters stored in the dict will be:
```python
for layer_name, blobs in net.params():
print(layer_name)
for blob in blobs:
print(' *', blob.data) # DataTensor
print(' *', blob.diff) # GradTensor
```
Returns Returns
------- -------
Dict dict
The parameter dict. The parameter dict.
""" """
...@@ -327,7 +165,7 @@ class Net(object): ...@@ -327,7 +165,7 @@ class Net(object):
""" """
if self._input_list is None: if self._input_list is None:
self._input_list = [input for input in self._net_proto.input] self._input_list = [input for input in self._proto.input]
return self._input_list return self._input_list
@property @property
...@@ -343,3 +181,194 @@ class Net(object): ...@@ -343,3 +181,194 @@ class Net(object):
if self._output_list is None: if self._output_list is None:
self._output_list = list(self._net_outputs) self._output_list = list(self._net_outputs)
return self._output_list return self._output_list
def backward(self, **diffs):
"""The backward pass.
Parameters
----------
diffs : dict, optional
The data to feed to the diffs.
"""
current_ws = workspace.get_workspace()
for name, blob in diffs.items():
current_ws.feed_tensor(self.blobs[name].diff, blob)
self._forward_backward_impl(return_outputs=False, stage='backward')
def copy_from(self, other):
"""Copy layers from the other.
Parameters
----------
other : Union[str, NetParameter]
The path of binary proto file or ``NetParameter``.
"""
if hasattr(other, 'ParseFromString') and \
callable(other.ParseFromString):
self.from_proto(other)
else:
self.from_proto(serialization.deserialize_proto(
serialization.load_bytes(other), caffe_pb2.NetParameter()))
def forward(self, **inputs):
"""The forward pass.
Parameters
----------
inputs : dict, optional
The data to feed to the inputs.
Returns
-------
callable
The callable to fetch outputs.
"""
current_ws = workspace.get_workspace()
for name, blob in inputs.items():
current_ws.feed_tensor(self._blobs[name]['data'], blob)
self._forward_backward_impl(return_outputs=False, stage='forward')
return lambda: dict(
(output, current_ws.fetch_tensor(self.blobs[output].data))
for output in self.outputs)
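A hedged usage sketch for the updated ``forward``: the prototxt name and the ``'data'`` blob are illustrative assumptions, while ``caffe.Net(...)`` and the returned fetch callable follow the docstrings above.

```python
import numpy
from dragon.vm import caffe

# Build a test-phase net, feed the input blob, then fetch all net outputs.
net = caffe.Net('deploy.prototxt', 'TEST')
fetch = net.forward(data=numpy.zeros((1, 3, 224, 224), 'float32'))
outputs = fetch()  # dict mapping each output blob name to its fetched value
```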
def forward_backward(self, **inputs):
"""The forward and backward pass.
Parameters
----------
inputs : dict, optional
The data to feed to the inputs.
Returns
-------
callable
The callable to fetch outputs.
"""
current_ws = workspace.get_workspace()
for name, blob in inputs.items():
current_ws.feed_tensor(self._blobs[name]['data'], blob)
self._forward_backward_impl(return_outputs=False)
return lambda: dict(
(output, current_ws.fetch_tensor(self.blobs[output].data))
for output in self.outputs)
def from_proto(self, proto):
"""Deserialize from the proto.
Parameters
----------
proto : NetParameter
The ``NetParameter`` protocol buffer.
"""
layer_dict = dict((layer.name, layer) for layer in proto.layer)
for layer in self._layers:
if layer.name in layer_dict:
layer.from_proto(layer_dict[layer.name])
def save(self, filepath):
"""Save proto into a binary file.
Parameters
----------
filepath : str
The path of binary proto file.
"""
serialization.save_bytes(
serialization.serialize_proto(
self.to_proto()), filepath)
def to_proto(self):
"""Serialize to the proto.
Returns
-------
NetParameter
The ``NetParameter`` protocol buffer.
"""
return caffe_pb2.NetParameter(
name=self._proto.name,
layer=[layer.to_proto() for layer in self._layers])
def _collect_losses_and_params(self, layer_param):
"""Collect losses and parameters."""
if layer_param.type.find('Loss') != -1:
if len(layer_param.loss_weight) == 0:
layer_param.loss_weight.extend([1.])
for i, loss_weight in enumerate(layer_param.loss_weight):
if loss_weight <= 0:
continue
self._losses.append(self.blobs[layer_param.top[i]].data)
else:
if len(layer_param.loss_weight) != 0:
for i, loss_weight in enumerate(layer_param.loss_weight):
if loss_weight <= 0:
continue
self._losses.append(self.blobs[layer_param.top[i]].data)
if self._phase != 'TRAIN':
return
if len(layer_param.param) > 0:
for i, p in enumerate(layer_param.param):
blob = self.params[layer_param.name][i]
blob.lr_multiplier = p.lr_mult if p.HasField('lr_mult') else 1.
blob.decay_multiplier = p.decay_mult if p.HasField('decay_mult') else 1.
if blob.diff is not None and blob.lr_multiplier > 0:
self._params.append(blob.data)
else:
for blob in self.params[layer_param.name]:
if blob.diff is not None and blob.lr_multiplier > 0:
self._params.append(blob.data)
def _filter_layer(self, layer_param):
"""Check if layer should be included."""
phase_dict = {'TRAIN': 0, 'TEST': 1}
if layer_param.HasField('phase') and \
layer_param.phase != phase_dict[self._phase]:
return False
for include in layer_param.include:
if include.HasField('phase') and \
include.phase != phase_dict[self._phase]:
return False
layer_param.phase = phase_dict[self._phase]
return True
@def_function.function
def _forward_backward_impl(self, **kwargs):
"""Implementation for ``self.forward_backward(...)``."""
grad_impl.gradients(self._losses, self._params)
return [self.blobs[key].data for key in self.outputs]
def _setup(self):
"""Connect the layers sequentially."""
self._net_outputs = set()
# Collect bottom and top blobs.
for layer_idx, layer in enumerate(self._layers):
bottom = []
for blob in layer._bottom:
if blob not in self._blobs:
raise RuntimeError('bottom({}) is unknown.'.format(blob))
bottom.append(self._blobs[blob])
if blob in self._net_outputs:
self._net_outputs.remove(blob)
if isinstance(layer, layer_factory.BatchNorm):
next_layer = self._layers[layer_idx + 1]
if isinstance(next_layer, layer_factory.Scale):
layer.fuse_with_scale_layer(next_layer)
with context.name_scope(layer._name):
outputs = layer.setup([blob['data'] for blob in bottom])
if outputs is not None:
outputs = nest.flatten(outputs)
for blob_idx, blob in enumerate(layer._top):
self._blobs[blob] = {
'data': outputs[blob_idx],
'diff': TensorRef(outputs[blob_idx].id + '_grad')}
self._net_outputs.add(blob)
# Collect layer param blobs.
for blobs in self.params.values():
self._layer_blobs.extend(blobs)
...@@ -3,25 +3,29 @@ syntax = "proto2"; ...@@ -3,25 +3,29 @@ syntax = "proto2";
package caffe; package caffe;
// Specifies the shape (dimensions) of a Blob. // Specifies the shape (dimensions) of a Blob.
message BlobShape { repeated int64 dim = 1 [ packed = true ]; } message BlobShape {
repeated int64 dim = 1 [packed = true];
}
message BlobProto { message BlobProto {
optional BlobShape shape = 7; optional BlobShape shape = 7;
repeated float data = 5 [ packed = true ]; repeated float data = 5 [packed = true];
repeated float diff = 6 [ packed = true ]; repeated float diff = 6 [packed = true];
repeated double double_data = 8 [ packed = true ]; repeated double double_data = 8 [packed = true];
repeated double double_diff = 9 [ packed = true ]; repeated double double_diff = 9 [packed = true];
// 4D dimensions -- deprecated. Use "shape" instead. // 4D dimensions -- deprecated. Use "shape" instead.
optional int32 num = 1 [ default = 0 ]; optional int32 num = 1 [default = 0];
optional int32 channels = 2 [ default = 0 ]; optional int32 channels = 2 [default = 0];
optional int32 height = 3 [ default = 0 ]; optional int32 height = 3 [default = 0];
optional int32 width = 4 [ default = 0 ]; optional int32 width = 4 [default = 0];
} }
// The BlobProtoVector is simply a way to pass multiple blobproto instances // The BlobProtoVector is simply a way to pass multiple blobproto instances
// around. // around.
message BlobProtoVector { repeated BlobProto blobs = 1; } message BlobProtoVector {
repeated BlobProto blobs = 1;
}
message Datum { message Datum {
optional int32 channels = 1; optional int32 channels = 1;
...@@ -33,21 +37,21 @@ message Datum { ...@@ -33,21 +37,21 @@ message Datum {
// Optionally, the datum could also hold float data. // Optionally, the datum could also hold float data.
repeated float float_data = 6; repeated float float_data = 6;
// If true, data contains an encoded image that needs to be decoded // If true, data contains an encoded image that needs to be decoded
optional bool encoded = 7 [ default = false ]; optional bool encoded = 7 [default = false];
repeated int32 labels = 8; repeated int32 labels = 8;
} }
message FillerParameter { message FillerParameter {
// The filler type. // The filler type.
optional string type = 1 [ default = 'constant' ]; optional string type = 1 [default = 'constant'];
optional float value = 2 [ default = 0 ]; // the value in constant filler optional float value = 2 [default = 0]; // the value in constant filler
optional float min = 3 [ default = 0 ]; // the min value in uniform filler optional float min = 3 [default = 0]; // the min value in uniform filler
optional float max = 4 [ default = 1 ]; // the max value in uniform filler optional float max = 4 [default = 1]; // the max value in uniform filler
optional float mean = 5 [ default = 0 ]; // the mean value in Gaussian filler optional float mean = 5 [default = 0]; // the mean value in Gaussian filler
optional float std = 6 [ default = 1 ]; // the std value in Gaussian filler optional float std = 6 [default = 1]; // the std value in Gaussian filler
// The expected number of non-zero output weights for a given input in // The expected number of non-zero output weights for a given input in
// Gaussian filler -- the default -1 means don't perform sparsification. // Gaussian filler -- the default -1 means don't perform sparsification.
optional int32 sparse = 7 [ default = -1 ]; optional int32 sparse = 7 [default = -1];
// Normalize the filler variance by fan_in, fan_out, or their average. // Normalize the filler variance by fan_in, fan_out, or their average.
// Applies to 'xavier' and 'msra' fillers. // Applies to 'xavier' and 'msra' fillers.
enum VarianceNorm { enum VarianceNorm {
...@@ -55,11 +59,11 @@ message FillerParameter { ...@@ -55,11 +59,11 @@ message FillerParameter {
FAN_OUT = 1; FAN_OUT = 1;
AVERAGE = 2; AVERAGE = 2;
} }
optional VarianceNorm variance_norm = 8 [ default = FAN_IN ]; optional VarianceNorm variance_norm = 8 [default = FAN_IN];
} }
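// Example for FillerParameter above (illustrative): typical weight/bias
// fillers inside a layer definition --
//   weight_filler { type: "xavier" }
//   bias_filler { type: "constant" value: 0 }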
message NetParameter { message NetParameter {
optional string name = 1; // consider giving the network a name optional string name = 1; // consider giving the network a name
// DEPRECATED. See InputParameter. The input blobs to the network. // DEPRECATED. See InputParameter. The input blobs to the network.
repeated string input = 3; repeated string input = 3;
// DEPRECATED. See InputParameter. The shape of the input blobs. // DEPRECATED. See InputParameter. The shape of the input blobs.
...@@ -74,7 +78,7 @@ message NetParameter { ...@@ -74,7 +78,7 @@ message NetParameter {
// Whether the network will force every layer to carry out the backward operation. // Whether the network will force every layer to carry out the backward operation.
// If set False, then whether to carry out backward is determined // If set False, then whether to carry out backward is determined
// automatically according to the net structure and learning rates. // automatically according to the net structure and learning rates.
optional bool force_backward = 5 [ default = false ]; optional bool force_backward = 5 [default = false];
// The current "state" of the network, including the phase, level, and stage. // The current "state" of the network, including the phase, level, and stage.
// Some layers may be included/excluded depending on this state and the states // Some layers may be included/excluded depending on this state and the states
// specified in the layers' include and exclude fields. // specified in the layers' include and exclude fields.
...@@ -82,11 +86,11 @@ message NetParameter { ...@@ -82,11 +86,11 @@ message NetParameter {
// Print debugging information about results while running Net::Forward, // Print debugging information about results while running Net::Forward,
// Net::Backward, and Net::Update. // Net::Backward, and Net::Update.
optional bool debug_info = 7 [ default = false ]; optional bool debug_info = 7 [default = false];
// The layers that make up the net. Each of their configurations, including // The layers that make up the net. Each of their configurations, including
// connectivity and behavior, is specified as a LayerParameter. // connectivity and behavior, is specified as a LayerParameter.
repeated LayerParameter layer = 100; // ID 100 so layers are printed last. repeated LayerParameter layer = 100; // ID 100 so layers are printed last.
// DEPRECATED: use 'layer' instead. // DEPRECATED: use 'layer' instead.
repeated V1LayerParameter layers = 2; repeated V1LayerParameter layers = 2;
...@@ -117,10 +121,10 @@ message SolverParameter { ...@@ -117,10 +121,10 @@ message SolverParameter {
// Inline train net param, possibly combined with one or more test nets. // Inline train net param, possibly combined with one or more test nets.
optional NetParameter net_param = 25; optional NetParameter net_param = 25;
optional string train_net = 1; // Proto filename for the train net. optional string train_net = 1; // Proto filename for the train net.
repeated string test_net = 2; // Proto filenames for the test nets. repeated string test_net = 2; // Proto filenames for the test nets.
optional NetParameter train_net_param = 21; // Inline train net params. optional NetParameter train_net_param = 21; // Inline train net params.
repeated NetParameter test_net_param = 22; // Inline test net params. repeated NetParameter test_net_param = 22; // Inline test net params.
// The states for the train/test nets. Must be unspecified or // The states for the train/test nets. Must be unspecified or
// specified once per net. // specified once per net.
...@@ -136,22 +140,22 @@ message SolverParameter { ...@@ -136,22 +140,22 @@ message SolverParameter {
repeated int32 test_iter = 3; repeated int32 test_iter = 3;
// The number of iterations between two testing phases. // The number of iterations between two testing phases.
optional int32 test_interval = 4 [ default = 0 ]; optional int32 test_interval = 4 [default = 0];
optional bool test_compute_loss = 19 [ default = false ]; optional bool test_compute_loss = 19 [default = false];
// If true, run an initial test pass before the first iteration, // If true, run an initial test pass before the first iteration,
// ensuring memory availability and printing the starting value of the loss. // ensuring memory availability and printing the starting value of the loss.
optional bool test_initialization = 32 [ default = true ]; optional bool test_initialization = 32 [default = true];
optional float base_lr = 5; // The base learning rate optional float base_lr = 5; // The base learning rate
repeated float stage_lr = 50; repeated float stage_lr = 50;
repeated int32 stage_iter = 51; repeated int32 stage_iter = 51;
// the number of iterations between displaying info. If display = 0, no info // the number of iterations between displaying info. If display = 0, no info
// will be displayed. // will be displayed.
optional int32 display = 6; optional int32 display = 6;
// Display the loss averaged over the last average_loss iterations // Display the loss averaged over the last average_loss iterations
optional int32 average_loss = 33 [ default = 1 ]; optional int32 average_loss = 33 [default = 1];
optional int32 max_iter = 7; // the maximum number of iterations optional int32 max_iter = 7; // the maximum number of iterations
// accumulate gradients over `iter_size` x `batch_size` instances // accumulate gradients over `iter_size` x `batch_size` instances
optional int32 iter_size = 36 [ default = 1 ]; optional int32 iter_size = 36 [default = 1];
// The learning rate decay policy. The currently implemented learning rate // The learning rate decay policy. The currently implemented learning rate
// policies are as follows: // policies are as follows:
...@@ -169,13 +173,13 @@ message SolverParameter { ...@@ -169,13 +173,13 @@ message SolverParameter {
// where base_lr, max_iter, gamma, step, stepvalue and power are defined // where base_lr, max_iter, gamma, step, stepvalue and power are defined
// in the solver parameter protocol buffer, and iter is the current iteration. // in the solver parameter protocol buffer, and iter is the current iteration.
optional string lr_policy = 8; optional string lr_policy = 8;
optional float gamma = 9; // The parameter to compute the learning rate. optional float gamma = 9; // The parameter to compute the learning rate.
optional float power = 10; // The parameter to compute the learning rate. optional float power = 10; // The parameter to compute the learning rate.
optional float momentum = 11; // The momentum value. optional float momentum = 11; // The momentum value.
optional float weight_decay = 12; // The weight decay. optional float weight_decay = 12; // The weight decay.
// regularization types supported: L1 and L2 // regularization types supported: L1 and L2
// controlled by weight_decay // controlled by weight_decay
optional string regularization_type = 29 [ default = "L2" ]; optional string regularization_type = 29 [default = "L2"];
// the stepsize for learning rate policy "step" // the stepsize for learning rate policy "step"
optional int32 stepsize = 13; optional int32 stepsize = 13;
// the stepsize for learning rate policy "multistep" // the stepsize for learning rate policy "multistep"
...@@ -183,49 +187,49 @@ message SolverParameter { ...@@ -183,49 +187,49 @@ message SolverParameter {
// Set clip_gradients to >= 0 to clip parameter gradients to that L2 norm, // Set clip_gradients to >= 0 to clip parameter gradients to that L2 norm,
// whenever their actual L2 norm is larger. // whenever their actual L2 norm is larger.
optional float clip_gradients = 35 [ default = -1 ]; optional float clip_gradients = 35 [default = -1];
optional int32 snapshot = 14 [ default = 0 ]; // The snapshot interval optional int32 snapshot = 14 [default = 0]; // The snapshot interval
optional string snapshot_prefix = 15; // The prefix for the snapshot. optional string snapshot_prefix = 15; // The prefix for the snapshot.
// whether to snapshot diff in the results or not. Snapshotting diff will help // whether to snapshot diff in the results or not. Snapshotting diff will help
// debugging but the final protocol buffer size will be much larger. // debugging but the final protocol buffer size will be much larger.
optional bool snapshot_diff = 16 [ default = false ]; optional bool snapshot_diff = 16 [default = false];
enum SnapshotFormat { enum SnapshotFormat {
HDF5 = 0; HDF5 = 0;
BINARYPROTO = 1; BINARYPROTO = 1;
} }
optional SnapshotFormat snapshot_format = 37 [ default = BINARYPROTO ]; optional SnapshotFormat snapshot_format = 37 [default = BINARYPROTO];
// the mode the solver will use: 0 for CPU and 1 for GPU. Uses GPU by default. // the mode the solver will use: 0 for CPU and 1 for GPU. Uses GPU by default.
enum SolverMode { enum SolverMode {
CPU = 0; CPU = 0;
GPU = 1; GPU = 1;
} }
optional SolverMode solver_mode = 17 [ default = GPU ]; optional SolverMode solver_mode = 17 [default = GPU];
// the device_id that will be used in GPU mode. Uses device_id = 0 by default. // the device_id that will be used in GPU mode. Uses device_id = 0 by default.
optional int32 device_id = 18 [ default = 0 ]; optional int32 device_id = 18 [default = 0];
// If non-negative, the seed with which the Solver will initialize the Caffe // If non-negative, the seed with which the Solver will initialize the Caffe
// random number generator -- useful for reproducible results. Otherwise, // random number generator -- useful for reproducible results. Otherwise,
// (and by default) initialize using a seed derived from the system clock. // (and by default) initialize using a seed derived from the system clock.
optional int64 random_seed = 20 [ default = -1 ]; optional int64 random_seed = 20 [default = -1];
// type of the solver // type of the solver
optional string type = 40 [ default = "SGD" ]; optional string type = 40 [default = "SGD"];
// numerical stability for RMSProp, AdaGrad and AdaDelta and Adam // numerical stability for RMSProp, AdaGrad and AdaDelta and Adam
optional float delta = 31 [ default = 1e-8 ]; optional float delta = 31 [default = 1e-8];
// parameters for the Adam solver // parameters for the Adam solver
optional float momentum2 = 39 [ default = 0.999 ]; optional float momentum2 = 39 [default = 0.999];
// RMSProp decay value // RMSProp decay value
// MeanSquare(t) = rms_decay*MeanSquare(t-1) + (1-rms_decay)*SquareGradient(t) // MeanSquare(t) = rms_decay*MeanSquare(t-1) + (1-rms_decay)*SquareGradient(t)
optional float rms_decay = 38 [ default = 0.99 ]; optional float rms_decay = 38 [default = 0.99];
// If true, print information about the state of the net that may help with // If true, print information about the state of the net that may help with
// debugging learning problems. // debugging learning problems.
optional bool debug_info = 23 [ default = false ]; optional bool debug_info = 23 [default = false];
// If false, don't save a snapshot after training finishes. // If false, don't save a snapshot after training finishes.
optional bool snapshot_after_train = 28 [ default = true ]; optional bool snapshot_after_train = 28 [default = true];
// DEPRECATED: old solver enum types, use string instead // DEPRECATED: old solver enum types, use string instead
enum SolverType { enum SolverType {
...@@ -237,16 +241,16 @@ message SolverParameter { ...@@ -237,16 +241,16 @@ message SolverParameter {
ADAM = 5; ADAM = 5;
} }
// DEPRECATED: use type instead of solver_type // DEPRECATED: use type instead of solver_type
optional SolverType solver_type = 30 [ default = SGD ]; optional SolverType solver_type = 30 [default = SGD];
} }
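// Example for SolverParameter above (illustrative): a minimal SGD solver --
//   train_net: "train.prototxt"
//   base_lr: 0.01
//   lr_policy: "step"
//   gamma: 0.1
//   stepsize: 10000
//   momentum: 0.9
//   weight_decay: 0.0005
//   max_iter: 45000
//   snapshot: 5000
//   snapshot_prefix: "snapshots/example"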
// A message that stores the solver snapshots // A message that stores the solver snapshots
message SolverState { message SolverState {
optional int32 iter = 1; // The current iteration optional int32 iter = 1; // The current iteration
optional string learned_net = 2; // The file that stores the learned net. optional string learned_net = 2; // The file that stores the learned net.
repeated BlobProto history = 3; // The history for sgd solvers repeated BlobProto history = 3; // The history for sgd solvers
optional int32 current_step = 4 optional int32 current_step = 4
[ default = 0 ]; // The current step for learning rate [default = 0]; // The current step for learning rate
} }
enum Phase { enum Phase {
...@@ -255,8 +259,8 @@ enum Phase { ...@@ -255,8 +259,8 @@ enum Phase {
} }
message NetState { message NetState {
optional Phase phase = 1 [ default = TEST ]; optional Phase phase = 1 [default = TEST];
optional int32 level = 2 [ default = 0 ]; optional int32 level = 2 [default = 0];
repeated string stage = 3; repeated string stage = 3;
} }
...@@ -297,10 +301,10 @@ message ParamSpec { ...@@ -297,10 +301,10 @@ message ParamSpec {
} }
// The multiplier on the global learning rate for this parameter. // The multiplier on the global learning rate for this parameter.
optional float lr_mult = 3 [ default = 1.0 ]; optional float lr_mult = 3 [default = 1.0];
// The multiplier on the global weight decay for this parameter. // The multiplier on the global weight decay for this parameter.
optional float decay_mult = 4 [ default = 1.0 ]; optional float decay_mult = 4 [default = 1.0];
} }
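// Example for ParamSpec above (illustrative): per-parameter multipliers for a
// layer with learned weight and bias blobs; lr_mult: 0 effectively freezes a
// blob (see also the parameter collection in the Python code above) --
//   param { lr_mult: 1 decay_mult: 1 }
//   param { lr_mult: 2 decay_mult: 0 }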
// NOTE // NOTE
...@@ -309,13 +313,13 @@ message ParamSpec { ...@@ -309,13 +313,13 @@ message ParamSpec {
// LayerParameter next available layer-specific ID: 146 (last added: // LayerParameter next available layer-specific ID: 146 (last added:
// parameter_param) // parameter_param)
message LayerParameter { message LayerParameter {
optional string name = 1; // the layer name optional string name = 1; // the layer name
optional string type = 2; // the layer type optional string type = 2; // the layer type
repeated string bottom = 3; // the name of each bottom blob repeated string bottom = 3; // the name of each bottom blob
repeated string top = 4; // the name of each top blob repeated string top = 4; // the name of each top blob
// The mirror stage optimization // The mirror stage optimization
optional bool mirror_stage = 150 [ default = false ]; optional bool mirror_stage = 150 [default = false];
// The train / test phase for computation. // The train / test phase for computation.
optional Phase phase = 10; optional Phase phase = 10;
...@@ -411,8 +415,6 @@ message LayerParameter { ...@@ -411,8 +415,6 @@ message LayerParameter {
optional SmoothL1LossParameter smooth_l1_loss_param = 152; optional SmoothL1LossParameter smooth_l1_loss_param = 152;
optional PermuteParameter permute_param = 153; optional PermuteParameter permute_param = 153;
optional NormalizeParameter normalize_param = 154; optional NormalizeParameter normalize_param = 154;
optional GroupNormParameter group_norm_param = 155;
optional CastParameter cast_param = 156;
} }
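// Example for LayerParameter above (illustrative, hypothetical names): a layer
// instantiated only for the TRAIN phase --
//   layer {
//     name: "conv1"
//     type: "Convolution"
//     bottom: "data"
//     top: "conv1"
//     mirror_stage: true
//     include { phase: TRAIN }
//   }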
// Message that stores parameters used to apply transformation // Message that stores parameters used to apply transformation
...@@ -421,11 +423,11 @@ message TransformationParameter { ...@@ -421,11 +423,11 @@ message TransformationParameter {
// For data pre-processing, we can do simple scaling and subtracting the // For data pre-processing, we can do simple scaling and subtracting the
// data mean, if provided. Note that the mean subtraction is always carried // data mean, if provided. Note that the mean subtraction is always carried
// out before scaling. // out before scaling.
optional float scale = 1 [ default = 1 ]; optional float scale = 1 [default = 1];
// Specify if we want to randomly mirror data. // Specify if we want to randomly mirror data.
optional bool mirror = 2 [ default = false ]; optional bool mirror = 2 [default = false];
// Specify if we would like to randomly crop an image. // Specify if we would like to randomly crop an image.
optional uint32 crop_size = 3 [ default = 0 ]; optional uint32 crop_size = 3 [default = 0];
// mean_file and mean_value cannot be specified at the same time // mean_file and mean_value cannot be specified at the same time
optional string mean_file = 4; optional string mean_file = 4;
// if specified, can be repeated once (would subtract it from all the // if specified, can be repeated once (would subtract it from all the
...@@ -433,17 +435,17 @@ message TransformationParameter { ...@@ -433,17 +435,17 @@ message TransformationParameter {
// subtract them from the corresponding channel) // subtract them from the corresponding channel)
repeated float mean_value = 5; repeated float mean_value = 5;
// Force the decoded image to have 3 color channels. // Force the decoded image to have 3 color channels.
optional bool force_color = 6 [ default = false ]; optional bool force_color = 6 [default = false];
// Force the decoded image to have 1 color channels. // Force the decoded image to have 1 color channels.
optional bool force_gray = 7 [ default = false ]; optional bool force_gray = 7 [default = false];
// Distort the color? // Distort the color?
optional bool augment_color = 9 [ default = false ]; optional bool augment_color = 9 [default = false];
// Target size. // Target size.
optional uint32 resize = 10 [ default = 0 ]; optional uint32 resize = 10 [default = 0];
// Padding size. // Padding size.
optional uint32 padding = 11 [ default = 0 ]; optional uint32 padding = 11 [default = 0];
// Crop size during scale jittering // Crop size during scale jittering
optional uint32 random_crop_size = 12 [ default = 0 ]; optional uint32 random_crop_size = 12 [default = 0];
} }
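// Example for TransformationParameter above (illustrative): scale pixels to
// [0, 1), mirror randomly, take 224x224 crops, and subtract a per-channel
// mean --
//   scale: 0.00390625
//   mirror: true
//   crop_size: 224
//   mean_value: 104
//   mean_value: 117
//   mean_value: 123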
// Message that stores parameters shared by loss layers // Message that stores parameters shared by loss layers
...@@ -467,7 +469,7 @@ message LossParameter { ...@@ -467,7 +469,7 @@ message LossParameter {
// Do not normalize the loss. // Do not normalize the loss.
NONE = 3; NONE = 3;
} }
optional NormalizationMode normalization = 3 [ default = VALID ]; optional NormalizationMode normalization = 3 [default = VALID];
// Deprecated. Ignored if normalization is specified. If normalization // Deprecated. Ignored if normalization is specified. If normalization
// is not specified, then setting this to false will be equivalent to // is not specified, then setting this to false will be equivalent to
// normalization = BATCH_SIZE to be consistent with previous behavior. // normalization = BATCH_SIZE to be consistent with previous behavior.
...@@ -481,14 +483,14 @@ message AccuracyParameter { ...@@ -481,14 +483,14 @@ message AccuracyParameter {
// When computing accuracy, count as correct by comparing the true label to // When computing accuracy, count as correct by comparing the true label to
// the top k scoring classes. By default, only compare to the top scoring // the top k scoring classes. By default, only compare to the top scoring
// class (i.e. argmax). // class (i.e. argmax).
optional uint32 top_k = 1 [ default = 1 ]; optional uint32 top_k = 1 [default = 1];
// The "label" axis of the prediction blob, whose argmax corresponds to the // The "label" axis of the prediction blob, whose argmax corresponds to the
// predicted label -- may be negative to index from the end (e.g., -1 for the // predicted label -- may be negative to index from the end (e.g., -1 for the
// last axis). For example, if axis == 1 and the predictions are // last axis). For example, if axis == 1 and the predictions are
// (N x C x H x W), the label blob is expected to contain N*H*W ground truth // (N x C x H x W), the label blob is expected to contain N*H*W ground truth
// labels with integer values in {0, 1, ..., C-1}. // labels with integer values in {0, 1, ..., C-1}.
optional int32 axis = 2 [ default = 1 ]; optional int32 axis = 2 [default = 1];
// If specified, ignore instances with the given label. // If specified, ignore instances with the given label.
optional int32 ignore_label = 3; optional int32 ignore_label = 3;
...@@ -496,8 +498,8 @@ message AccuracyParameter { ...@@ -496,8 +498,8 @@ message AccuracyParameter {
message ArgMaxParameter { message ArgMaxParameter {
// If true produce pairs (argmax, maxval) // If true produce pairs (argmax, maxval)
optional bool out_max_val = 1 [ default = false ]; optional bool out_max_val = 1 [default = false];
optional uint32 top_k = 2 [ default = 1 ]; optional uint32 top_k = 2 [default = 1];
// The axis along which to maximise -- may be negative to index from the // The axis along which to maximise -- may be negative to index from the
// end (e.g., -1 for the last axis). // end (e.g., -1 for the last axis).
// By default ArgMaxLayer maximizes over the flattened trailing dimensions // By default ArgMaxLayer maximizes over the flattened trailing dimensions
...@@ -510,10 +512,10 @@ message ConcatParameter { ...@@ -510,10 +512,10 @@ message ConcatParameter {
// end (e.g., -1 for the last axis). Other axes must have the // end (e.g., -1 for the last axis). Other axes must have the
// same dimension for all the bottom blobs. // same dimension for all the bottom blobs.
// By default, ConcatLayer concatenates blobs along the "channels" axis (1). // By default, ConcatLayer concatenates blobs along the "channels" axis (1).
optional int32 axis = 2 [ default = 1 ]; optional int32 axis = 2 [default = 1];
// DEPRECATED: alias for "axis" -- does not support negative indexing. // DEPRECATED: alias for "axis" -- does not support negative indexing.
optional uint32 concat_dim = 1 [ default = 1 ]; optional uint32 concat_dim = 1 [default = 1];
} }
message BatchNormParameter { message BatchNormParameter {
...@@ -522,10 +524,10 @@ message BatchNormParameter { ...@@ -522,10 +524,10 @@ message BatchNormParameter {
// across the batch. // across the batch.
optional bool use_global_stats = 1; optional bool use_global_stats = 1;
// How much does the moving average decay each iteration? // How much does the moving average decay each iteration?
optional float moving_average_fraction = 2 [ default = 0.9 ]; optional float moving_average_fraction = 2 [default = 0.9];
// Small value to add to the variance estimate so that we don't divide by // Small value to add to the variance estimate so that we don't divide by
// zero. // zero.
optional float eps = 3 [ default = 1e-5 ]; optional float eps = 3 [default = 1e-5];
} }
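// Example for BatchNormParameter above (illustrative): inference-style
// normalization with frozen statistics --
//   use_global_stats: true
//   moving_average_fraction: 0.9
//   eps: 1e-5
// Note that a BatchNorm layer immediately followed by a Scale layer is fused
// during net setup (see ``fuse_with_scale_layer`` in the Python code above).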
message BiasParameter { message BiasParameter {
...@@ -542,7 +544,7 @@ message BiasParameter { ...@@ -542,7 +544,7 @@ message BiasParameter {
// (axis == 3 == -1) 60 // (axis == 3 == -1) 60
// Furthermore, bottom[1] may have the empty shape (regardless of the value of // Furthermore, bottom[1] may have the empty shape (regardless of the value of
// "axis") -- a scalar bias. // "axis") -- a scalar bias.
optional int32 axis = 1 [ default = 1 ]; optional int32 axis = 1 [default = 1];
// (num_axes is ignored unless just one bottom is given and the bias is // (num_axes is ignored unless just one bottom is given and the bias is
// a learned parameter of the layer. Otherwise, num_axes is determined by the // a learned parameter of the layer. Otherwise, num_axes is determined by the
...@@ -550,7 +552,7 @@ message BiasParameter { ...@@ -550,7 +552,7 @@ message BiasParameter {
// The number of axes of the input (bottom[0]) covered by the bias // The number of axes of the input (bottom[0]) covered by the bias
// parameter, or -1 to cover all axes of bottom[0] starting from `axis`. // parameter, or -1 to cover all axes of bottom[0] starting from `axis`.
// Set num_axes := 0, to add a zero-axis Blob: a scalar. // Set num_axes := 0, to add a zero-axis Blob: a scalar.
optional int32 num_axes = 2 [ default = 1 ]; optional int32 num_axes = 2 [default = 1];
// (filler is ignored unless just one bottom is given and the bias is // (filler is ignored unless just one bottom is given and the bias is
// a learned parameter of the layer.) // a learned parameter of the layer.)
...@@ -562,49 +564,49 @@ message BiasParameter { ...@@ -562,49 +564,49 @@ message BiasParameter {
message ContrastiveLossParameter { message ContrastiveLossParameter {
// margin for dissimilar pair // margin for dissimilar pair
optional float margin = 1 [ default = 1.0 ]; optional float margin = 1 [default = 1.0];
// The first implementation of this cost did not exactly match the cost of // The first implementation of this cost did not exactly match the cost of
// Hadsell et al 2006 -- using (margin - d^2) instead of (margin - d)^2. // Hadsell et al 2006 -- using (margin - d^2) instead of (margin - d)^2.
// legacy_version = false (the default) uses (margin - d)^2 as proposed in the // legacy_version = false (the default) uses (margin - d)^2 as proposed in the
// Hadsell paper. New models should probably use this version. // Hadsell paper. New models should probably use this version.
// legacy_version = true uses (margin - d^2). This is kept to support / // legacy_version = true uses (margin - d^2). This is kept to support /
// reproduce existing models and results // reproduce existing models and results
optional bool legacy_version = 2 [ default = false ]; optional bool legacy_version = 2 [default = false];
} }
message ConvolutionParameter { message ConvolutionParameter {
optional uint32 num_output = 1; // The number of outputs for the layer optional uint32 num_output = 1; // The number of outputs for the layer
optional bool bias_term = 2 [ default = true ]; // whether to have bias terms optional bool bias_term = 2 [default = true]; // whether to have bias terms
// Pad, kernel size, and stride are all given as a single value for equal // Pad, kernel size, and stride are all given as a single value for equal
// dimensions in all spatial dimensions, or once per spatial dimension. // dimensions in all spatial dimensions, or once per spatial dimension.
repeated uint32 pad = 3; // The padding size; defaults to 0 repeated uint32 pad = 3; // The padding size; defaults to 0
repeated uint32 kernel_size = 4; // The kernel size repeated uint32 kernel_size = 4; // The kernel size
repeated uint32 stride = 6; // The stride; defaults to 1 repeated uint32 stride = 6; // The stride; defaults to 1
// Factor used to dilate the kernel, (implicitly) zero-filling the resulting // Factor used to dilate the kernel, (implicitly) zero-filling the resulting
// holes. (Kernel dilation is sometimes referred to by its use in the // holes. (Kernel dilation is sometimes referred to by its use in the
// algorithme à trous from Holschneider et al. 1987.) // algorithme à trous from Holschneider et al. 1987.)
repeated uint32 dilation = 18; // The dilation; defaults to 1 repeated uint32 dilation = 18; // The dilation; defaults to 1
// For 2D convolution only, the *_h and *_w versions may also be used to // For 2D convolution only, the *_h and *_w versions may also be used to
// specify both spatial dimensions. // specify both spatial dimensions.
optional uint32 pad_h = 9 [ default = 0 ]; // The padding height (2D only) optional uint32 pad_h = 9 [default = 0]; // The padding height (2D only)
optional uint32 pad_w = 10 [ default = 0 ]; // The padding width (2D only) optional uint32 pad_w = 10 [default = 0]; // The padding width (2D only)
optional uint32 kernel_h = 11; // The kernel height (2D only) optional uint32 kernel_h = 11; // The kernel height (2D only)
optional uint32 kernel_w = 12; // The kernel width (2D only) optional uint32 kernel_w = 12; // The kernel width (2D only)
optional uint32 stride_h = 13; // The stride height (2D only) optional uint32 stride_h = 13; // The stride height (2D only)
optional uint32 stride_w = 14; // The stride width (2D only) optional uint32 stride_w = 14; // The stride width (2D only)
optional uint32 group = 5 [ default = 1 ]; // The group size for group conv optional uint32 group = 5 [default = 1]; // The group size for group conv
optional FillerParameter weight_filler = 7; // The filler for the weight optional FillerParameter weight_filler = 7; // The filler for the weight
optional FillerParameter bias_filler = 8; // The filler for the bias optional FillerParameter bias_filler = 8; // The filler for the bias
enum Engine { enum Engine {
DEFAULT = 0; DEFAULT = 0;
CAFFE = 1; CAFFE = 1;
CUDNN = 2; CUDNN = 2;
} }
optional Engine engine = 15 [ default = DEFAULT ]; optional Engine engine = 15 [default = DEFAULT];
// The axis to interpret as "channels" when performing convolution. // The axis to interpret as "channels" when performing convolution.
// Preceding dimensions are treated as independent inputs; // Preceding dimensions are treated as independent inputs;
...@@ -615,14 +617,14 @@ message ConvolutionParameter { ...@@ -615,14 +617,14 @@ message ConvolutionParameter {
// With (N, C, D, H, W) inputs, and axis == 1, we perform // With (N, C, D, H, W) inputs, and axis == 1, we perform
// N independent 3D convolutions, sliding (C/g)-channels // N independent 3D convolutions, sliding (C/g)-channels
// filters across the spatial axes (D, H, W) of the input. // filters across the spatial axes (D, H, W) of the input.
optional int32 axis = 16 [ default = 1 ]; optional int32 axis = 16 [default = 1];
// Whether to force use of the general ND convolution, even if a specific // Whether to force use of the general ND convolution, even if a specific
// implementation for blobs of the appropriate number of spatial dimensions // implementation for blobs of the appropriate number of spatial dimensions
// is available. (Currently, there is only a 2D-specific convolution // is available. (Currently, there is only a 2D-specific convolution
// implementation; for input blobs with num_axes != 2, this option is // implementation; for input blobs with num_axes != 2, this option is
// ignored and the ND implementation will be used.) // ignored and the ND implementation will be used.)
optional bool force_nd_im2col = 17 [ default = false ]; optional bool force_nd_im2col = 17 [default = false];
} }
message CropParameter { message CropParameter {
...@@ -639,7 +641,7 @@ message CropParameter { ...@@ -639,7 +641,7 @@ message CropParameter {
// Note: standard dimensions are N,C,H,W so the default is a spatial crop, // Note: standard dimensions are N,C,H,W so the default is a spatial crop,
// and `axis` may be negative to index from the end (e.g., -1 for the last // and `axis` may be negative to index from the end (e.g., -1 for the last
// axis). // axis).
optional int32 axis = 1 [ default = 2 ]; optional int32 axis = 1 [default = 2];
repeated uint32 offset = 2; repeated uint32 offset = 2;
} }
...@@ -657,33 +659,33 @@ message DataParameter { ...@@ -657,33 +659,33 @@ message DataParameter {
// point would be set as rand_skip * rand(0,1). Note that rand_skip should not // point would be set as rand_skip * rand(0,1). Note that rand_skip should not
// be larger than the number of keys in the database. // be larger than the number of keys in the database.
// DEPRECATED. Each solver accesses a different subset of the database. // DEPRECATED. Each solver accesses a different subset of the database.
optional uint32 rand_skip = 7 [ default = 0 ]; optional uint32 rand_skip = 7 [default = 0];
optional DB backend = 8 [ default = LEVELDB ]; optional DB backend = 8 [default = LEVELDB];
// DEPRECATED. See TransformationParameter. For data pre-processing, we can do // DEPRECATED. See TransformationParameter. For data pre-processing, we can do
// simple scaling and subtracting the data mean, if provided. Note that the // simple scaling and subtracting the data mean, if provided. Note that the
// mean subtraction is always carried out before scaling. // mean subtraction is always carried out before scaling.
optional float scale = 2 [ default = 1 ]; optional float scale = 2 [default = 1];
optional string mean_file = 3; optional string mean_file = 3;
// DEPRECATED. See TransformationParameter. Specify if we would like to // DEPRECATED. See TransformationParameter. Specify if we would like to
// randomly crop an image. // randomly crop an image.
optional uint32 crop_size = 5 [ default = 0 ]; optional uint32 crop_size = 5 [default = 0];
// DEPRECATED. See TransformationParameter. Specify if we want to randomly // DEPRECATED. See TransformationParameter. Specify if we want to randomly
// mirror data. // mirror data.
optional bool mirror = 6 [ default = false ]; optional bool mirror = 6 [default = false];
// Force the encoded image to have 3 color channels // Force the encoded image to have 3 color channels
optional bool force_encoded_color = 9 [ default = false ]; optional bool force_encoded_color = 9 [default = false];
// Prefetch queue (Number of batches to prefetch to host memory, increase if // Prefetch queue (Number of batches to prefetch to host memory, increase if
// data access bandwidth varies). // data access bandwidth varies).
optional uint32 prefetch = 10 [ default = 5 ]; optional uint32 prefetch = 10 [default = 5];
// Whether to shuffle the data. // Whether to shuffle the data.
optional bool shuffle = 11 [ default = false ]; optional bool shuffle = 11 [default = false];
// The number of chunks to shuffle. // The number of chunks to shuffle.
optional int32 num_chunks = 12 [ default = 2048 ]; optional int32 num_chunks = 12 [default = 2048];
} }
message DropoutParameter { message DropoutParameter {
optional float dropout_ratio = 1 [ default = 0.5 ]; // dropout ratio optional float dropout_ratio = 1 [default = 0.5]; // dropout ratio
optional bool scale_train = 2 [ default = true ]; // scale train or test phase optional bool scale_train = 2 [default = true]; // scale train or test phase
} }
// DummyDataLayer fills any number of arbitrarily shaped blobs with random // DummyDataLayer fills any number of arbitrarily shaped blobs with random
...@@ -711,12 +713,12 @@ message EltwiseParameter { ...@@ -711,12 +713,12 @@ message EltwiseParameter {
SUM = 1; SUM = 1;
MAX = 2; MAX = 2;
} }
optional EltwiseOp operation = 1 [ default = SUM ]; // element-wise operation optional EltwiseOp operation = 1 [default = SUM]; // element-wise operation
repeated float coeff = 2; // blob-wise coefficient for SUM operation repeated float coeff = 2; // blob-wise coefficient for SUM operation
// Whether to use an asymptotically slower (for >2 inputs) but stabler method // Whether to use an asymptotically slower (for >2 inputs) but stabler method
// of computing the gradient for the PROD operation. (No effect for SUM op.) // of computing the gradient for the PROD operation. (No effect for SUM op.)
optional bool stable_prod_grad = 3 [ default = true ]; optional bool stable_prod_grad = 3 [default = true];
} }
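// Example for EltwiseParameter above (illustrative): an element-wise
// difference of two bottoms, expressed as a weighted SUM --
//   operation: SUM
//   coeff: 1
//   coeff: -1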
// Message that stores parameters used by ELULayer // Message that stores parameters used by ELULayer
...@@ -724,20 +726,20 @@ message ELUParameter { ...@@ -724,20 +726,20 @@ message ELUParameter {
// Described in: // Described in:
// Clevert, D.-A., Unterthiner, T., & Hochreiter, S. (2015). Fast and Accurate // Clevert, D.-A., Unterthiner, T., & Hochreiter, S. (2015). Fast and Accurate
// Deep Network Learning by Exponential Linear Units (ELUs). arXiv // Deep Network Learning by Exponential Linear Units (ELUs). arXiv
optional float alpha = 1 [ default = 1 ]; optional float alpha = 1 [default = 1];
} }
// Message that stores parameters used by EmbedLayer // Message that stores parameters used by EmbedLayer
message EmbedParameter { message EmbedParameter {
optional uint32 num_output = 1; // The number of outputs for the layer optional uint32 num_output = 1; // The number of outputs for the layer
// The input is given as integers to be interpreted as one-hot // The input is given as integers to be interpreted as one-hot
// vector indices with dimension num_input. Hence num_input should be // vector indices with dimension num_input. Hence num_input should be
// 1 greater than the maximum possible input value. // 1 greater than the maximum possible input value.
optional uint32 input_dim = 2; optional uint32 input_dim = 2;
optional bool bias_term = 3 [ default = true ]; // Whether to use a bias term optional bool bias_term = 3 [default = true]; // Whether to use a bias term
optional FillerParameter weight_filler = 4; // The filler for the weight optional FillerParameter weight_filler = 4; // The filler for the weight
optional FillerParameter bias_filler = 5; // The filler for the bias optional FillerParameter bias_filler = 5; // The filler for the bias
} }
// Message that stores parameters used by ExpLayer // Message that stores parameters used by ExpLayer
...@@ -745,21 +747,21 @@ message ExpParameter { ...@@ -745,21 +747,21 @@ message ExpParameter {
// ExpLayer computes outputs y = base ^ (shift + scale * x), for base > 0. // ExpLayer computes outputs y = base ^ (shift + scale * x), for base > 0.
// Or if base is set to the default (-1), base is set to e, // Or if base is set to the default (-1), base is set to e,
// so y = exp(shift + scale * x). // so y = exp(shift + scale * x).
optional float base = 1 [ default = -1.0 ]; optional float base = 1 [default = -1.0];
optional float scale = 2 [ default = 1.0 ]; optional float scale = 2 [default = 1.0];
optional float shift = 3 [ default = 0.0 ]; optional float shift = 3 [default = 0.0];
} }
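// Example for ExpParameter above (illustrative): with the defaults
// (base = -1, scale = 1, shift = 0) the layer computes y = exp(x);
// setting base: 2 gives y = 2 ^ x, and scale: -1 gives y = exp(-x).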
/// Message that stores parameters used by FlattenLayer /// Message that stores parameters used by FlattenLayer
message FlattenParameter { message FlattenParameter {
// The first axis to flatten: all preceding axes are retained in the output. // The first axis to flatten: all preceding axes are retained in the output.
// May be negative to index from the end (e.g., -1 for the last axis). // May be negative to index from the end (e.g., -1 for the last axis).
optional int32 axis = 1 [ default = 1 ]; optional int32 axis = 1 [default = 1];
// The last axis to flatten: all following axes are retained in the output. // The last axis to flatten: all following axes are retained in the output.
// May be negative to index from the end (e.g., the default -1 for the last // May be negative to index from the end (e.g., the default -1 for the last
// axis). // axis).
optional int32 end_axis = 2 [ default = -1 ]; optional int32 end_axis = 2 [default = -1];
} }
// Message that stores parameters used by HDF5DataLayer // Message that stores parameters used by HDF5DataLayer
...@@ -774,10 +776,12 @@ message HDF5DataParameter { ...@@ -774,10 +776,12 @@ message HDF5DataParameter {
// and the ordering of data within any given HDF5 file is shuffled, // and the ordering of data within any given HDF5 file is shuffled,
// but data between different files are not interleaved; all of a file's // but data between different files are not interleaved; all of a file's
// data are output (in a random order) before moving onto another file. // data are output (in a random order) before moving onto another file.
optional bool shuffle = 3 [ default = false ]; optional bool shuffle = 3 [default = false];
} }
message HDF5OutputParameter { optional string file_name = 1; } message HDF5OutputParameter {
optional string file_name = 1;
}
message HingeLossParameter { message HingeLossParameter {
enum Norm { enum Norm {
...@@ -785,38 +789,38 @@ message HingeLossParameter { ...@@ -785,38 +789,38 @@ message HingeLossParameter {
L2 = 2; L2 = 2;
} }
// Specify the Norm to use L1 or L2 // Specify the Norm to use L1 or L2
optional Norm norm = 1 [ default = L1 ]; optional Norm norm = 1 [default = L1];
} }
message ImageDataParameter { message ImageDataParameter {
// Specify the data source. // Specify the data source.
optional string source = 1; optional string source = 1;
// Specify the batch size. // Specify the batch size.
optional uint32 batch_size = 4 [ default = 1 ]; optional uint32 batch_size = 4 [default = 1];
// The rand_skip variable is for the data layer to skip a few data points // The rand_skip variable is for the data layer to skip a few data points
// to keep all asynchronous sgd clients from starting at the same point. The skip // to keep all asynchronous sgd clients from starting at the same point. The skip
// point would be set as rand_skip * rand(0,1). Note that rand_skip should not // point would be set as rand_skip * rand(0,1). Note that rand_skip should not
// be larger than the number of keys in the database. // be larger than the number of keys in the database.
optional uint32 rand_skip = 7 [ default = 0 ]; optional uint32 rand_skip = 7 [default = 0];
// Whether or not ImageLayer should shuffle the list of files at every epoch. // Whether or not ImageLayer should shuffle the list of files at every epoch.
optional bool shuffle = 8 [ default = false ]; optional bool shuffle = 8 [default = false];
// It will also resize images if new_height or new_width are not zero. // It will also resize images if new_height or new_width are not zero.
optional uint32 new_height = 9 [ default = 0 ]; optional uint32 new_height = 9 [default = 0];
optional uint32 new_width = 10 [ default = 0 ]; optional uint32 new_width = 10 [default = 0];
// Specify if the images are color or gray // Specify if the images are color or gray
optional bool is_color = 11 [ default = true ]; optional bool is_color = 11 [default = true];
// DEPRECATED. See TransformationParameter. For data pre-processing, we can do // DEPRECATED. See TransformationParameter. For data pre-processing, we can do
// simple scaling and subtracting the data mean, if provided. Note that the // simple scaling and subtracting the data mean, if provided. Note that the
// mean subtraction is always carried out before scaling. // mean subtraction is always carried out before scaling.
optional float scale = 2 [ default = 1 ]; optional float scale = 2 [default = 1];
optional string mean_file = 3; optional string mean_file = 3;
// DEPRECATED. See TransformationParameter. Specify if we would like to // DEPRECATED. See TransformationParameter. Specify if we would like to
// randomly crop an image. // randomly crop an image.
optional uint32 crop_size = 5 [ default = 0 ]; optional uint32 crop_size = 5 [default = 0];
// DEPRECATED. See TransformationParameter. Specify if we want to randomly // DEPRECATED. See TransformationParameter. Specify if we want to randomly
// mirror data. // mirror data.
optional bool mirror = 6 [ default = false ]; optional bool mirror = 6 [default = false];
optional string root_folder = 12 [ default = "" ]; optional string root_folder = 12 [default = ""];
} }
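// Example for ImageDataParameter above (illustrative): read an image/label
// list, resize to 256x256, and reshuffle every epoch --
//   source: "train_list.txt"
//   batch_size: 32
//   shuffle: true
//   new_height: 256
//   new_width: 256
//   root_folder: "/data/images/"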
message InfogainLossParameter { message InfogainLossParameter {
...@@ -825,21 +829,21 @@ message InfogainLossParameter { ...@@ -825,21 +829,21 @@ message InfogainLossParameter {
} }
message InnerProductParameter { message InnerProductParameter {
optional uint32 num_output = 1; // The number of outputs for the layer optional uint32 num_output = 1; // The number of outputs for the layer
optional bool bias_term = 2 [ default = true ]; // whether to have bias terms optional bool bias_term = 2 [default = true]; // whether to have bias terms
optional FillerParameter weight_filler = 3; // The filler for the weight optional FillerParameter weight_filler = 3; // The filler for the weight
optional FillerParameter bias_filler = 4; // The filler for the bias optional FillerParameter bias_filler = 4; // The filler for the bias
// The first axis to be lumped into a single inner product computation; // The first axis to be lumped into a single inner product computation;
// all preceding axes are retained in the output. // all preceding axes are retained in the output.
// May be negative to index from the end (e.g., -1 for the last axis). // May be negative to index from the end (e.g., -1 for the last axis).
optional int32 axis = 5 [ default = 1 ]; optional int32 axis = 5 [default = 1];
// Specify whether to transpose the weight matrix or not. // Specify whether to transpose the weight matrix or not.
// If transpose == true, any operations will be performed on the transpose // If transpose == true, any operations will be performed on the transpose
// of the weight matrix. The weight matrix itself is not going to be // of the weight matrix. The weight matrix itself is not going to be
// transposed but rather the transfer flag of operations will be toggled // transposed but rather the transfer flag of operations will be toggled
// accordingly. // accordingly.
optional bool transpose = 6 [ default = false ]; optional bool transpose = 6 [default = false];
} }
message InputParameter { message InputParameter {
...@@ -848,7 +852,6 @@ message InputParameter { ...@@ -848,7 +852,6 @@ message InputParameter {
// Define 1 shape to set the same shape for every top. // Define 1 shape to set the same shape for every top.
// Define no shape to defer to reshaping manually. // Define no shape to defer to reshaping manually.
repeated BlobShape shape = 1; repeated BlobShape shape = 1;
repeated string dtype = 2;
} }
// Message that stores parameters used by LogLayer // Message that stores parameters used by LogLayer
...@@ -856,28 +859,28 @@ message LogParameter { ...@@ -856,28 +859,28 @@ message LogParameter {
// LogLayer computes outputs y = log_base(shift + scale * x), for base > 0. // LogLayer computes outputs y = log_base(shift + scale * x), for base > 0.
// Or if base is set to the default (-1), base is set to e, // Or if base is set to the default (-1), base is set to e,
// so y = ln(shift + scale * x) = log_e(shift + scale * x) // so y = ln(shift + scale * x) = log_e(shift + scale * x)
optional float base = 1 [ default = -1.0 ]; optional float base = 1 [default = -1.0];
optional float scale = 2 [ default = 1.0 ]; optional float scale = 2 [default = 1.0];
optional float shift = 3 [ default = 0.0 ]; optional float shift = 3 [default = 0.0];
} }
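// Example for LogParameter above (illustrative): the defaults give
// y = ln(x); setting base: 10 gives y = log10(x).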
// Message that stores parameters used by LRNLayer // Message that stores parameters used by LRNLayer
message LRNParameter { message LRNParameter {
optional uint32 local_size = 1 [ default = 5 ]; optional uint32 local_size = 1 [default = 5];
optional float alpha = 2 [ default = 1. ]; optional float alpha = 2 [default = 1.];
optional float beta = 3 [ default = 0.75 ]; optional float beta = 3 [default = 0.75];
enum NormRegion { enum NormRegion {
ACROSS_CHANNELS = 0; ACROSS_CHANNELS = 0;
WITHIN_CHANNEL = 1; WITHIN_CHANNEL = 1;
} }
optional NormRegion norm_region = 4 [ default = ACROSS_CHANNELS ]; optional NormRegion norm_region = 4 [default = ACROSS_CHANNELS];
optional float k = 5 [ default = 1. ]; optional float k = 5 [default = 1.];
enum Engine { enum Engine {
DEFAULT = 0; DEFAULT = 0;
CAFFE = 1; CAFFE = 1;
CUDNN = 2; CUDNN = 2;
} }
optional Engine engine = 6 [ default = DEFAULT ]; optional Engine engine = 6 [default = DEFAULT];
} }
message MemoryDataParameter { message MemoryDataParameter {
...@@ -889,16 +892,18 @@ message MemoryDataParameter { ...@@ -889,16 +892,18 @@ message MemoryDataParameter {
message MVNParameter { message MVNParameter {
// This parameter can be set to false to normalize mean only // This parameter can be set to false to normalize mean only
optional bool normalize_variance = 1 [ default = true ]; optional bool normalize_variance = 1 [default = true];
// This parameter can be set to true to perform DNN-like MVN // This parameter can be set to true to perform DNN-like MVN
optional bool across_channels = 2 [ default = false ]; optional bool across_channels = 2 [default = false];
// Epsilon for not dividing by zero while normalizing variance // Epsilon for not dividing by zero while normalizing variance
optional float eps = 3 [ default = 1e-9 ]; optional float eps = 3 [default = 1e-9];
} }
message ParameterParameter { optional BlobShape shape = 1; } message ParameterParameter {
optional BlobShape shape = 1;
}
message PoolingParameter { message PoolingParameter {
enum PoolMethod { enum PoolMethod {
...@@ -906,45 +911,45 @@ message PoolingParameter { ...@@ -906,45 +911,45 @@ message PoolingParameter {
AVE = 1; AVE = 1;
STOCHASTIC = 2; STOCHASTIC = 2;
} }
optional PoolMethod pool = 1 [ default = MAX ]; // The pooling method optional PoolMethod pool = 1 [default = MAX]; // The pooling method
// Pad, kernel size, and stride are all given as a single value for equal // Pad, kernel size, and stride are all given as a single value for equal
// dimensions in height and width or as Y, X pairs. // dimensions in height and width or as Y, X pairs.
optional uint32 pad = 4 [ default = 0 ]; // The padding size (equal in Y, X) optional uint32 pad = 4 [default = 0]; // The padding size (equal in Y, X)
optional uint32 pad_h = 9 [ default = 0 ]; // The padding height optional uint32 pad_h = 9 [default = 0]; // The padding height
optional uint32 pad_w = 10 [ default = 0 ]; // The padding width optional uint32 pad_w = 10 [default = 0]; // The padding width
optional uint32 kernel_size = 2; // The kernel size (square) optional uint32 kernel_size = 2; // The kernel size (square)
optional uint32 kernel_h = 5; // The kernel height optional uint32 kernel_h = 5; // The kernel height
optional uint32 kernel_w = 6; // The kernel width optional uint32 kernel_w = 6; // The kernel width
optional uint32 stride = 3 [ default = 1 ]; // The stride (equal in Y, X) optional uint32 stride = 3 [default = 1]; // The stride (equal in Y, X)
optional uint32 stride_h = 7; // The stride height optional uint32 stride_h = 7; // The stride height
optional uint32 stride_w = 8; // The stride width optional uint32 stride_w = 8; // The stride width
enum Engine { enum Engine {
DEFAULT = 0; DEFAULT = 0;
CAFFE = 1; CAFFE = 1;
CUDNN = 2; CUDNN = 2;
} }
optional Engine engine = 11 [ default = DEFAULT ]; optional Engine engine = 11 [default = DEFAULT];
// If global_pooling then it will pool over the size of the bottom by doing // If global_pooling then it will pool over the size of the bottom by doing
// kernel_h = bottom->height and kernel_w = bottom->width // kernel_h = bottom->height and kernel_w = bottom->width
optional bool global_pooling = 12 [ default = false ]; optional bool global_pooling = 12 [default = false];
} }
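// Example for PoolingParameter above (illustrative): 2x2 max pooling with
// stride 2 --
//   pool: MAX
//   kernel_size: 2
//   stride: 2
// or global average pooling over the full spatial extent --
//   pool: AVE
//   global_pooling: true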
// Message that stores parameters used by ROIPoolingLayer // Message that stores parameters used by ROIPoolingLayer
message ROIPoolingParameter { message ROIPoolingParameter {
// Pad, kernel size, and stride are all given as a single value for equal // Pad, kernel size, and stride are all given as a single value for equal
// dimensions in height and width or as Y, X pairs. // dimensions in height and width or as Y, X pairs.
optional uint32 pooled_h = 1 [ default = 0 ]; // The pooled output height optional uint32 pooled_h = 1 [default = 0]; // The pooled output height
optional uint32 pooled_w = 2 [ default = 0 ]; // The pooled output width optional uint32 pooled_w = 2 [default = 0]; // The pooled output width
// Multiplicative spatial scale factor to translate ROI coords from their // Multiplicative spatial scale factor to translate ROI coords from their
// input scale to the scale used when pooling // input scale to the scale used when pooling
optional float spatial_scale = 3 [ default = 1 ]; optional float spatial_scale = 3 [default = 1];
} }
message PowerParameter { message PowerParameter {
// PowerLayer computes outputs y = (shift + scale * x) ^ power. // PowerLayer computes outputs y = (shift + scale * x) ^ power.
optional float power = 1 [ default = 1.0 ]; optional float power = 1 [default = 1.0];
optional float scale = 2 [ default = 1.0 ]; optional float scale = 2 [default = 1.0];
optional float shift = 3 [ default = 0.0 ]; optional float shift = 3 [default = 0.0];
} }
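// Example for PowerParameter above (illustrative): y = x ^ 2 is obtained with
//   power: 2
// while the defaults (power = 1, scale = 1, shift = 0) leave the input
// unchanged.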
message PythonParameter { message PythonParameter {
...@@ -954,11 +959,11 @@ message PythonParameter { ...@@ -954,11 +959,11 @@ message PythonParameter {
// in Python before calling the `setup()` method. This could be a number, // in Python before calling the `setup()` method. This could be a number,
// string, dictionary in Python dict format, JSON, etc. You may parse this // string, dictionary in Python dict format, JSON, etc. You may parse this
// string in `setup` method and use it in `forward` and `backward`. // string in `setup` method and use it in `forward` and `backward`.
optional string param_str = 3 [ default = '']; optional string param_str = 3 [default = ''];
// Whether this PythonLayer is shared among worker solvers during data // Whether this PythonLayer is shared among worker solvers during data
// parallelism. If true, each worker solver sequentially runs forward from this // parallelism. If true, each worker solver sequentially runs forward from this
// layer. This value should be set true if you are using it as a data layer. // layer. This value should be set true if you are using it as a data layer.
optional bool share_in_parallel = 4 [ default = false ]; optional bool share_in_parallel = 4 [default = false];
} }
// Message that stores parameters used by ReductionLayer // Message that stores parameters used by ReductionLayer
...@@ -970,7 +975,7 @@ message ReductionParameter { ...@@ -970,7 +975,7 @@ message ReductionParameter {
MEAN = 4; MEAN = 4;
} }
optional ReductionOp operation = 1 [ default = SUM ]; // reduction operation optional ReductionOp operation = 1 [default = SUM]; // reduction operation
// The first axis to reduce to a scalar -- may be negative to index from the // The first axis to reduce to a scalar -- may be negative to index from the
// end (e.g., -1 for the last axis). // end (e.g., -1 for the last axis).
...@@ -985,9 +990,9 @@ message ReductionParameter { ...@@ -985,9 +990,9 @@ message ReductionParameter {
// If axis == 0 (the default), the output Blob always has the empty shape // If axis == 0 (the default), the output Blob always has the empty shape
// (count 1), performing reduction across the entire input -- // (count 1), performing reduction across the entire input --
// often useful for creating new loss functions. // often useful for creating new loss functions.
optional int32 axis = 2 [ default = 0 ]; optional int32 axis = 2 [default = 0];
optional float coeff = 3 [ default = 1.0 ]; // coefficient for output optional float coeff = 3 [default = 1.0]; // coefficient for output
} }
// Message that stores parameters used by ReLULayer // Message that stores parameters used by ReLULayer
...@@ -997,13 +1002,13 @@ message ReLUParameter { ...@@ -997,13 +1002,13 @@ message ReLUParameter {
// Maas, A. L., Hannun, A. Y., & Ng, A. Y. (2013). Rectifier nonlinearities // Maas, A. L., Hannun, A. Y., & Ng, A. Y. (2013). Rectifier nonlinearities
// improve neural network acoustic models. In ICML Workshop on Deep Learning // improve neural network acoustic models. In ICML Workshop on Deep Learning
// for Audio, Speech, and Language Processing. // for Audio, Speech, and Language Processing.
optional float negative_slope = 1 [ default = 0 ]; optional float negative_slope = 1 [default = 0];
enum Engine { enum Engine {
DEFAULT = 0; DEFAULT = 0;
CAFFE = 1; CAFFE = 1;
CUDNN = 2; CUDNN = 2;
} }
optional Engine engine = 2 [ default = DEFAULT ]; optional Engine engine = 2 [default = DEFAULT];
} }
message ReshapeParameter { message ReshapeParameter {
...@@ -1066,8 +1071,8 @@ message ReshapeParameter { ...@@ -1066,8 +1071,8 @@ message ReshapeParameter {
// reshape_param { shape { dim: 2 dim: 1 dim: 8 } } // reshape_param { shape { dim: 2 dim: 1 dim: 8 } }
// reshape_param { shape { dim: 1 } axis: 1 num_axes: 0 } // reshape_param { shape { dim: 1 } axis: 1 num_axes: 0 }
// //
optional int32 axis = 2 [ default = 0 ]; optional int32 axis = 2 [default = 0];
optional int32 num_axes = 3 [ default = -1 ]; optional int32 num_axes = 3 [default = -1];
} }
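The two `reshape_param` snippets quoted in the comment describe the same transformation of a 2x8 input into a 2x1x8 output; a quick NumPy check of that equivalence (shapes taken from the comment):

```python
import numpy as np

x = np.zeros((2, 8))
# reshape_param { shape { dim: 2 dim: 1 dim: 8 } }  -- give the full target shape
y1 = x.reshape(2, 1, 8)
# reshape_param { shape { dim: 1 } axis: 1 num_axes: 0 }  -- insert one axis at
# position 1 without consuming any existing axes
y2 = np.expand_dims(x, axis=1)
assert y1.shape == y2.shape == (2, 1, 8)
```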
message ScaleParameter { message ScaleParameter {
...@@ -1084,7 +1089,7 @@ message ScaleParameter { ...@@ -1084,7 +1089,7 @@ message ScaleParameter {
// (axis == 3 == -1) 60 // (axis == 3 == -1) 60
// Furthermore, bottom[1] may have the empty shape (regardless of the value of // Furthermore, bottom[1] may have the empty shape (regardless of the value of
// "axis") -- a scalar multiplier. // "axis") -- a scalar multiplier.
optional int32 axis = 1 [ default = 1 ]; optional int32 axis = 1 [default = 1];
// (num_axes is ignored unless just one bottom is given and the scale is // (num_axes is ignored unless just one bottom is given and the scale is
// a learned parameter of the layer. Otherwise, num_axes is determined by the // a learned parameter of the layer. Otherwise, num_axes is determined by the
...@@ -1092,7 +1097,7 @@ message ScaleParameter { ...@@ -1092,7 +1097,7 @@ message ScaleParameter {
// The number of axes of the input (bottom[0]) covered by the scale // The number of axes of the input (bottom[0]) covered by the scale
// parameter, or -1 to cover all axes of bottom[0] starting from `axis`. // parameter, or -1 to cover all axes of bottom[0] starting from `axis`.
// Set num_axes := 0, to multiply with a zero-axis Blob: a scalar. // Set num_axes := 0, to multiply with a zero-axis Blob: a scalar.
optional int32 num_axes = 2 [ default = 1 ]; optional int32 num_axes = 2 [default = 1];
// (filler is ignored unless just one bottom is given and the scale is // (filler is ignored unless just one bottom is given and the scale is
// a learned parameter of the layer.) // a learned parameter of the layer.)
...@@ -1103,7 +1108,7 @@ message ScaleParameter { ...@@ -1103,7 +1108,7 @@ message ScaleParameter {
// Whether to also learn a bias (equivalent to a ScaleLayer+BiasLayer, but // Whether to also learn a bias (equivalent to a ScaleLayer+BiasLayer, but
// may be more efficient). Initialized with bias_filler (defaults to 0). // may be more efficient). Initialized with bias_filler (defaults to 0).
optional bool bias_term = 4 [ default = false ]; optional bool bias_term = 4 [default = false];
optional FillerParameter bias_filler = 5; optional FillerParameter bias_filler = 5;
} }
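A hedged NumPy sketch of the broadcasting rule spelled out above: the scale (and optional bias) lines up with the axes of bottom[0] starting at `axis`, and trailing singleton axes are appended so that ordinary broadcasting applies. The 100x3x40x60 shape comes from the comment; everything else is illustrative:

```python
import numpy as np

def scale_layer(x, scale, axis=1, bias=None):
    """Broadcast-multiply `scale` against the axes of `x` starting at `axis`."""
    axis = axis if axis >= 0 else axis + x.ndim
    # Pad with singleton axes so scale aligns with x[axis : axis + scale.ndim].
    shape = (1,) * axis + scale.shape + (1,) * (x.ndim - axis - scale.ndim)
    y = x * scale.reshape(shape)
    if bias is not None:              # corresponds to bias_term == true
        y = y + bias.reshape(shape)
    return y

x = np.random.rand(100, 3, 40, 60)
print(scale_layer(x, np.random.rand(3, 40), axis=1).shape)  # (100, 3, 40, 60)
print(scale_layer(x, np.array(2.0)).shape)                  # scalar multiplier
```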
...@@ -1113,18 +1118,18 @@ message SigmoidParameter { ...@@ -1113,18 +1118,18 @@ message SigmoidParameter {
CAFFE = 1; CAFFE = 1;
CUDNN = 2; CUDNN = 2;
} }
optional Engine engine = 1 [ default = DEFAULT ]; optional Engine engine = 1 [default = DEFAULT];
} }
message SliceParameter { message SliceParameter {
// The axis along which to slice -- may be negative to index from the end // The axis along which to slice -- may be negative to index from the end
// (e.g., -1 for the last axis). // (e.g., -1 for the last axis).
// By default, SliceLayer concatenates blobs along the "channels" axis (1). // By default, SliceLayer concatenates blobs along the "channels" axis (1).
optional int32 axis = 3 [ default = 1 ]; optional int32 axis = 3 [default = 1];
repeated uint32 slice_point = 2; repeated uint32 slice_point = 2;
// DEPRECATED: alias for "axis" -- does not support negative indexing. // DEPRECATED: alias for "axis" -- does not support negative indexing.
optional uint32 slice_dim = 1 [ default = 1 ]; optional uint32 slice_dim = 1 [default = 1];
} }
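For orientation, a minimal sketch of the slicing behaviour these fields configure, assuming `slice_point` lists the split indices along `axis` (the shapes below are made up):

```python
import numpy as np

def slice_layer(x, slice_points, axis=1):
    """Split `x` at the given indices along `axis`, one output per segment."""
    return np.split(x, slice_points, axis=axis)

x = np.zeros((4, 10))
tops = slice_layer(x, slice_points=[2, 5], axis=1)
print([t.shape for t in tops])  # [(4, 2), (4, 3), (4, 5)]
```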
// Message that stores parameters used by SoftmaxLayer, SoftmaxWithLossLayer // Message that stores parameters used by SoftmaxLayer, SoftmaxWithLossLayer
...@@ -1134,12 +1139,12 @@ message SoftmaxParameter { ...@@ -1134,12 +1139,12 @@ message SoftmaxParameter {
CAFFE = 1; CAFFE = 1;
CUDNN = 2; CUDNN = 2;
} }
optional Engine engine = 1 [ default = DEFAULT ]; optional Engine engine = 1 [default = DEFAULT];
// The axis along which to perform the softmax -- may be negative to index // The axis along which to perform the softmax -- may be negative to index
// from the end (e.g., -1 for the last axis). // from the end (e.g., -1 for the last axis).
// Any other axes will be evaluated as independent softmaxes. // Any other axes will be evaluated as independent softmaxes.
optional int32 axis = 2 [ default = 1 ]; optional int32 axis = 2 [default = 1];
} }
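The axis semantics described above ("any other axes will be evaluated as independent softmaxes") amount to the usual normalized exponential over a single axis; a minimal NumPy sketch:

```python
import numpy as np

def softmax(x, axis=1):
    """Softmax over `axis`; every other axis holds an independent distribution."""
    z = x - x.max(axis=axis, keepdims=True)  # subtract the max for stability
    e = np.exp(z)
    return e / e.sum(axis=axis, keepdims=True)

x = np.random.rand(2, 5, 3)
print(np.allclose(softmax(x, axis=1).sum(axis=1), 1.0))  # True
```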
message TanHParameter { message TanHParameter {
...@@ -1148,23 +1153,21 @@ message TanHParameter { ...@@ -1148,23 +1153,21 @@ message TanHParameter {
CAFFE = 1; CAFFE = 1;
CUDNN = 2; CUDNN = 2;
} }
optional Engine engine = 1 [ default = DEFAULT ]; optional Engine engine = 1 [default = DEFAULT];
} }
// Message that stores parameters used by TileLayer // Message that stores parameters used by TileLayer
message TileParameter { message TileParameter {
// The index of the axis to tile. // The index of the axis to tile.
optional int32 axis = 1 [ default = 1 ]; optional int32 axis = 1 [default = 1];
// The number of copies (tiles) of the blob to output. // The number of copies (tiles) of the blob to output.
optional int32 tiles = 2; optional int32 tiles = 2;
optional BlobShape multiples = 3;
} }
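A hedged sketch of what `axis` and `tiles` express: the blob is copied `tiles` times and the copies are stacked along `axis` (the sample shape is illustrative):

```python
import numpy as np

def tile_layer(x, axis=1, tiles=2):
    """Stack `tiles` copies of `x` along `axis`."""
    return np.concatenate([x] * tiles, axis=axis)

x = np.arange(6).reshape(2, 3)
print(tile_layer(x, axis=1, tiles=2).shape)  # (2, 6)
```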
// Message that stores parameters used by ThresholdLayer // Message that stores parameters used by ThresholdLayer
message ThresholdParameter { message ThresholdParameter {
optional float threshold = 1 [ default = 0 ]; // Strictly positive values optional float threshold = 1 [default = 0]; // Strictly positive values
} }
message WindowDataParameter { message WindowDataParameter {
...@@ -1173,31 +1176,31 @@ message WindowDataParameter { ...@@ -1173,31 +1176,31 @@ message WindowDataParameter {
// For data pre-processing, we can do simple scaling and subtracting the // For data pre-processing, we can do simple scaling and subtracting the
// data mean, if provided. Note that the mean subtraction is always carried // data mean, if provided. Note that the mean subtraction is always carried
// out before scaling. // out before scaling.
optional float scale = 2 [ default = 1 ]; optional float scale = 2 [default = 1];
optional string mean_file = 3; optional string mean_file = 3;
// Specify the batch size. // Specify the batch size.
optional uint32 batch_size = 4; optional uint32 batch_size = 4;
// Specify if we would like to randomly crop an image. // Specify if we would like to randomly crop an image.
optional uint32 crop_size = 5 [ default = 0 ]; optional uint32 crop_size = 5 [default = 0];
// Specify if we want to randomly mirror data. // Specify if we want to randomly mirror data.
optional bool mirror = 6 [ default = false ]; optional bool mirror = 6 [default = false];
// Foreground (object) overlap threshold // Foreground (object) overlap threshold
optional float fg_threshold = 7 [ default = 0.5 ]; optional float fg_threshold = 7 [default = 0.5];
// Background (non-object) overlap threshold // Background (non-object) overlap threshold
optional float bg_threshold = 8 [ default = 0.5 ]; optional float bg_threshold = 8 [default = 0.5];
// Fraction of batch that should be foreground objects // Fraction of batch that should be foreground objects
optional float fg_fraction = 9 [ default = 0.25 ]; optional float fg_fraction = 9 [default = 0.25];
// Amount of contextual padding to add around a window // Amount of contextual padding to add around a window
// (used only by the window_data_layer) // (used only by the window_data_layer)
optional uint32 context_pad = 10 [ default = 0 ]; optional uint32 context_pad = 10 [default = 0];
// Mode for cropping out a detection window // Mode for cropping out a detection window
// warp: cropped window is warped to a fixed size and aspect ratio // warp: cropped window is warped to a fixed size and aspect ratio
// square: the tightest square around the window is cropped // square: the tightest square around the window is cropped
optional string crop_mode = 11 [ default = "warp" ]; optional string crop_mode = 11 [default = "warp"];
// cache_images: will load all images in memory for faster access // cache_images: will load all images in memory for faster access
optional bool cache_images = 12 [ default = false ]; optional bool cache_images = 12 [default = false];
// append root_folder to locate images // append root_folder to locate images
optional string root_folder = 13 [ default = "" ]; optional string root_folder = 13 [default = ""];
} }
message SPPParameter { message SPPParameter {
...@@ -1207,13 +1210,13 @@ message SPPParameter { ...@@ -1207,13 +1210,13 @@ message SPPParameter {
STOCHASTIC = 2; STOCHASTIC = 2;
} }
optional uint32 pyramid_height = 1; optional uint32 pyramid_height = 1;
optional PoolMethod pool = 2 [ default = MAX ]; // The pooling method optional PoolMethod pool = 2 [default = MAX]; // The pooling method
enum Engine { enum Engine {
DEFAULT = 0; DEFAULT = 0;
CAFFE = 1; CAFFE = 1;
CUDNN = 2; CUDNN = 2;
} }
optional Engine engine = 6 [ default = DEFAULT ]; optional Engine engine = 6 [default = DEFAULT];
} }
// DEPRECATED: use LayerParameter. // DEPRECATED: use LayerParameter.
...@@ -1312,45 +1315,45 @@ message V1LayerParameter { ...@@ -1312,45 +1315,45 @@ message V1LayerParameter {
// DEPRECATED: V0LayerParameter is the old way of specifying layer parameters // DEPRECATED: V0LayerParameter is the old way of specifying layer parameters
// in Caffe. We keep this message type around for legacy support. // in Caffe. We keep this message type around for legacy support.
message V0LayerParameter { message V0LayerParameter {
optional string name = 1; // the layer name optional string name = 1; // the layer name
optional string type = 2; // the string to specify the layer type optional string type = 2; // the string to specify the layer type
// Parameters to specify layers with inner products. // Parameters to specify layers with inner products.
optional uint32 num_output = 3; // The number of outputs for the layer optional uint32 num_output = 3; // The number of outputs for the layer
optional bool biasterm = 4 [ default = true ]; // whether to have bias terms optional bool biasterm = 4 [default = true]; // whether to have bias terms
optional FillerParameter weight_filler = 5; // The filler for the weight optional FillerParameter weight_filler = 5; // The filler for the weight
optional FillerParameter bias_filler = 6; // The filler for the bias optional FillerParameter bias_filler = 6; // The filler for the bias
optional uint32 pad = 7 [ default = 0 ]; // The padding size optional uint32 pad = 7 [default = 0]; // The padding size
optional uint32 kernelsize = 8; // The kernel size optional uint32 kernelsize = 8; // The kernel size
optional uint32 group = 9 [ default = 1 ]; // The group size for group conv optional uint32 group = 9 [default = 1]; // The group size for group conv
optional uint32 stride = 10 [ default = 1 ]; // The stride optional uint32 stride = 10 [default = 1]; // The stride
enum PoolMethod { enum PoolMethod {
MAX = 0; MAX = 0;
AVE = 1; AVE = 1;
STOCHASTIC = 2; STOCHASTIC = 2;
} }
optional PoolMethod pool = 11 [ default = MAX ]; // The pooling method optional PoolMethod pool = 11 [default = MAX]; // The pooling method
optional float dropout_ratio = 12 [ default = 0.5 ]; // dropout ratio optional float dropout_ratio = 12 [default = 0.5]; // dropout ratio
optional uint32 local_size = 13 [ default = 5 ]; // for local response norm optional uint32 local_size = 13 [default = 5]; // for local response norm
optional float alpha = 14 [ default = 1. ]; // for local response norm optional float alpha = 14 [default = 1.]; // for local response norm
optional float beta = 15 [ default = 0.75 ]; // for local response norm optional float beta = 15 [default = 0.75]; // for local response norm
optional float k = 22 [ default = 1. ]; optional float k = 22 [default = 1.];
// For data layers, specify the data source // For data layers, specify the data source
optional string source = 16; optional string source = 16;
// For data pre-processing, we can do simple scaling and subtracting the // For data pre-processing, we can do simple scaling and subtracting the
// data mean, if provided. Note that the mean subtraction is always carried // data mean, if provided. Note that the mean subtraction is always carried
// out before scaling. // out before scaling.
optional float scale = 17 [ default = 1 ]; optional float scale = 17 [default = 1];
optional string meanfile = 18; optional string meanfile = 18;
// For data layers, specify the batch size. // For data layers, specify the batch size.
optional uint32 batchsize = 19; optional uint32 batchsize = 19;
// For data layers, specify if we would like to randomly crop an image. // For data layers, specify if we would like to randomly crop an image.
optional uint32 cropsize = 20 [ default = 0 ]; optional uint32 cropsize = 20 [default = 0];
// For data layers, specify if we want to randomly mirror data. // For data layers, specify if we want to randomly mirror data.
optional bool mirror = 21 [ default = false ]; optional bool mirror = 21 [default = false];
// The blobs containing the numeric parameters of the layer // The blobs containing the numeric parameters of the layer
repeated BlobProto blobs = 50; repeated BlobProto blobs = 50;
...@@ -1364,41 +1367,41 @@ message V0LayerParameter { ...@@ -1364,41 +1367,41 @@ message V0LayerParameter {
// to prevent all asynchronous sgd clients from starting at the same point. The skip // to prevent all asynchronous sgd clients from starting at the same point. The skip
// point would be set as rand_skip * rand(0,1). Note that rand_skip should not // point would be set as rand_skip * rand(0,1). Note that rand_skip should not
// be larger than the number of keys in the database. // be larger than the number of keys in the database.
optional uint32 rand_skip = 53 [ default = 0 ]; optional uint32 rand_skip = 53 [default = 0];
// Fields related to detection (det_*) // Fields related to detection (det_*)
// foreground (object) overlap threshold // foreground (object) overlap threshold
optional float det_fg_threshold = 54 [ default = 0.5 ]; optional float det_fg_threshold = 54 [default = 0.5];
// background (non-object) overlap threshold // background (non-object) overlap threshold
optional float det_bg_threshold = 55 [ default = 0.5 ]; optional float det_bg_threshold = 55 [default = 0.5];
// Fraction of batch that should be foreground objects // Fraction of batch that should be foreground objects
optional float det_fg_fraction = 56 [ default = 0.25 ]; optional float det_fg_fraction = 56 [default = 0.25];
// optional bool OBSOLETE_can_clobber = 57 [default = true]; // optional bool OBSOLETE_can_clobber = 57 [default = true];
// Amount of contextual padding to add around a window // Amount of contextual padding to add around a window
// (used only by the window_data_layer) // (used only by the window_data_layer)
optional uint32 det_context_pad = 58 [ default = 0 ]; optional uint32 det_context_pad = 58 [default = 0];
// Mode for cropping out a detection window // Mode for cropping out a detection window
// warp: cropped window is warped to a fixed size and aspect ratio // warp: cropped window is warped to a fixed size and aspect ratio
// square: the tightest square around the window is cropped // square: the tightest square around the window is cropped
optional string det_crop_mode = 59 [ default = "warp" ]; optional string det_crop_mode = 59 [default = "warp"];
// For ReshapeLayer, one needs to specify the new dimensions. // For ReshapeLayer, one needs to specify the new dimensions.
optional int32 new_num = 60 [ default = 0 ]; optional int32 new_num = 60 [default = 0];
optional int32 new_channels = 61 [ default = 0 ]; optional int32 new_channels = 61 [default = 0];
optional int32 new_height = 62 [ default = 0 ]; optional int32 new_height = 62 [default = 0];
optional int32 new_width = 63 [ default = 0 ]; optional int32 new_width = 63 [default = 0];
// Whether or not ImageLayer should shuffle the list of files at every epoch. // Whether or not ImageLayer should shuffle the list of files at every epoch.
// It will also resize images if new_height or new_width are not zero. // It will also resize images if new_height or new_width are not zero.
optional bool shuffle_images = 64 [ default = false ]; optional bool shuffle_images = 64 [default = false];
// For ConcatLayer, one needs to specify the dimension for concatenation, and // For ConcatLayer, one needs to specify the dimension for concatenation, and
// the other dimensions must be the same for all the bottom blobs. // the other dimensions must be the same for all the bottom blobs.
// By default it will concatenate blobs along the channels dimension. // By default it will concatenate blobs along the channels dimension.
optional uint32 concat_dim = 65 [ default = 1 ]; optional uint32 concat_dim = 65 [default = 1];
optional HDF5OutputParameter hdf5_output_param = 1001; optional HDF5OutputParameter hdf5_output_param = 1001;
} }
...@@ -1410,14 +1413,14 @@ message PReLUParameter { ...@@ -1410,14 +1413,14 @@ message PReLUParameter {
// Initial value of a_i. Default is a_i=0.25 for all i. // Initial value of a_i. Default is a_i=0.25 for all i.
optional FillerParameter filler = 1; optional FillerParameter filler = 1;
// Whether or not slope parameters are shared across channels. // Whether or not slope parameters are shared across channels.
optional bool channel_shared = 2 [ default = false ]; optional bool channel_shared = 2 [default = false];
} }
message SmoothL1LossParameter { message SmoothL1LossParameter {
// SmoothL1Loss(x) = // SmoothL1Loss(x) =
// 0.5 * (sigma * x) ** 2 -- if |x| < 1.0 / sigma / sigma // 0.5 * (sigma * x) ** 2 -- if |x| < 1.0 / sigma / sigma
// |x| - 0.5 / sigma / sigma -- otherwise // |x| - 0.5 / sigma / sigma -- otherwise
optional float sigma = 1 [ default = 1 ]; optional float sigma = 1 [default = 1];
} }
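Written out with NumPy, the piecewise definition above (the quadratic branch applies while |x| is below 1 / sigma^2) looks like this; the sample inputs are illustrative:

```python
import numpy as np

def smooth_l1(x, sigma=1.0):
    """Element-wise smooth L1: quadratic near zero, linear in the tails."""
    beta = 1.0 / (sigma * sigma)
    return np.where(np.abs(x) < beta,
                    0.5 * (sigma * x) ** 2,
                    np.abs(x) - 0.5 * beta)

print(smooth_l1(np.array([-2.0, 0.1, 0.5, 3.0])))  # [1.5   0.005 0.125 2.5  ]
```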
message PermuteParameter { message PermuteParameter {
...@@ -1428,18 +1431,11 @@ message PermuteParameter { ...@@ -1428,18 +1431,11 @@ message PermuteParameter {
} }
message NormalizeParameter { message NormalizeParameter {
optional bool across_spatial = 1 [ default = true ]; optional bool across_spatial = 1 [default = true];
// Initial value of scale. Default is 1.0 for all // Initial value of scale. Default is 1.0 for all
optional FillerParameter scale_filler = 2; optional FillerParameter scale_filler = 2;
// Whether or not scale parameters are shared across channels. // Whether or not scale parameters are shared across channels.
optional bool channel_shared = 3 [ default = true ]; optional bool channel_shared = 3 [default = true];
// Epsilon for not dividing by zero while normalizing variance // Epsilon for not dividing by zero while normalizing variance
optional float eps = 4 [ default = 1e-12 ]; optional float eps = 4 [default = 1e-12];
}
message GroupNormParameter {
optional float eps = 1 [ default = 1e-5 ];
optional int32 group = 2 [ default = 32 ];
} }
message CastParameter { optional string dtype = 1; }
...@@ -16,15 +16,14 @@ from __future__ import division ...@@ -16,15 +16,14 @@ from __future__ import division
from __future__ import print_function from __future__ import print_function
import time import time
from google.protobuf import text_format from google.protobuf import text_format
from dragon.core.autograph import def_function from dragon.core.autograph import def_function
from dragon.core.framework import workspace
from dragon.core.training.adam import Adam from dragon.core.training.adam import Adam
from dragon.core.training.rmsprop import RMSprop from dragon.core.training.rmsprop import RMSprop
from dragon.core.training.sgd import SGD from dragon.core.training.sgd import SGD
from dragon.core.training.sgd import Nesterov from dragon.core.training.sgd import Nesterov
from dragon.core.util import logging
from dragon.vm.caffe.net import Net from dragon.vm.caffe.net import Net
from dragon.vm.caffe.proto import caffe_pb2 from dragon.vm.caffe.proto import caffe_pb2
...@@ -99,8 +98,9 @@ class Solver(object): ...@@ -99,8 +98,9 @@ class Solver(object):
if self._current_step < len(self._param.stepvalue) \ if self._current_step < len(self._param.stepvalue) \
and self.iter >= self._param.stepvalue[self._current_step]: and self.iter >= self._param.stepvalue[self._current_step]:
self._current_step = self._current_step + 1 self._current_step = self._current_step + 1
print('MultiStep Status: Iteration {}, step = {}' logging.info(
.format(self.iter, self._current_step)) 'MultiStep Status: Iteration {}, step = {}'
.format(self.iter, self._current_step))
new_lr = self._param.base_lr * \ new_lr = self._param.base_lr * \
pow(self._param.gamma, self._current_step) pow(self._param.gamma, self._current_step)
self.base_lr = new_lr self.base_lr = new_lr
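For reference, a standalone sketch of the 'multistep' policy computed above: the learning rate is multiplied by `gamma` each time the iteration passes the next `stepvalue`. The base_lr, gamma, and stepvalue numbers are made up, not taken from any solver prototxt:

```python
base_lr, gamma = 0.1, 0.1
stepvalue = [60000, 80000]

def multistep_lr(iteration):
    # current_step counts how many step boundaries the iteration has passed.
    current_step = sum(iteration >= v for v in stepvalue)
    return base_lr * gamma ** current_step

print(multistep_lr(0))      # 0.1
print(multistep_lr(60000))  # ~0.01
print(multistep_lr(90000))  # ~0.001
```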
...@@ -112,8 +112,9 @@ class Solver(object): ...@@ -112,8 +112,9 @@ class Solver(object):
else: else:
if self._current_step + 1 < len(stage_iters): if self._current_step + 1 < len(stage_iters):
self._current_step = self._current_step + 1 self._current_step = self._current_step + 1
print('MultiFixed Status: Iteration {}, stage = {}' logging.info(
.format(self.iter, self._current_step)) 'MultiFixed Status: Iteration {}, stage = {}'
.format(self.iter, self._current_step))
self.base_lr = stage_lrs[self._current_step] self.base_lr = stage_lrs[self._current_step]
elif policy == 'inv': elif policy == 'inv':
power = self._param.power power = self._param.power
...@@ -130,8 +131,7 @@ class Solver(object): ...@@ -130,8 +131,7 @@ class Solver(object):
def _apply_update(self): def _apply_update(self):
"""Apply the weights update.""" """Apply the weights update."""
for blob in self.net._layer_blobs: for blob in self.net._layer_blobs:
if blob.lr_multiplier > 0 and \ if blob.lr_multiplier > 0 and blob.diff is not None:
blob.diff is not None:
self._optimizer.apply_gradients( self._optimizer.apply_gradients(
values_and_grads=[(blob.data, blob.diff)], values_and_grads=[(blob.data, blob.diff)],
lr_mult=blob.lr_multiplier, lr_mult=blob.lr_multiplier,
...@@ -211,80 +211,18 @@ class Solver(object): ...@@ -211,80 +211,18 @@ class Solver(object):
""" """
return self._test_nets return self._test_nets
def one_step(self):
"""One step run the train net.
Returns
-------
dict
The stats.
"""
if self._param.test_interval and \
self.iter % self._param.test_interval == 0:
if (self.iter == 0 and
self._param.test_initialization) or self.iter != 0:
for test_idx in range(len(self._test_nets)):
self.test(test_idx)
# Forward, backward and compute loss.
run_time, stats = 0., {'loss': {'total': 0.}, 'iter': self.iter}
for i in range(self._param.iter_size):
tic = time.time()
self._net.forward_backward(return_outputs=False)
run_time += (time.time() - tic)
# Total loss.
for e in self.net.losses:
values = e.get_value().flatten()
if values.size == 1:
stats['loss']['total'] += values[0]
# Partial loss.
for key in self.net.outputs:
values = self.net.blobs[key].data
values = values.get_value().flatten()
if values.size != 1:
continue
if key not in stats['loss']:
stats['loss'][key] = 0.
stats['loss'][key] += values[0]
# Apply Update.
self._get_learning_rate()
tic = time.time()
self._apply_update()
run_time += (time.time() - tic)
self.iter = self.iter + 1
# Snapshot.
if self._param.snapshot:
if self.iter % self._param.snapshot == 0:
self.snapshot()
# Average loss by the iter size.
for k in stats['loss'].keys():
stats['loss'][k] /= self._param.iter_size
# Misc stats.
stats['lr'] = self.base_lr
stats['time'] = run_time
return stats
def snapshot(self): def snapshot(self):
"""Snapshot the parameters of train net.""" """Snapshot the parameters of train net."""
workspace.save( self._net.save(
tensors=[blob.data for blob in self.net._layer_blobs], '%s_iter_%d.caffemodel'
filename='_iter_%d' % self.iter, % (self._param.snapshot_prefix, self._iter))
prefix=self._param.snapshot_prefix,
suffix='.caffemodel', def step(self, num_iterations=1):
format='caffe',
)
def step(self, num_iterations):
"""Step the train net. """Step the train net.
Parameters Parameters
---------- ----------
num_iterations : int num_iterations : int, optional, default=1
The number of iterations to step. The number of iterations to step.
""" """
...@@ -293,19 +231,18 @@ class Solver(object): ...@@ -293,19 +231,18 @@ class Solver(object):
loss_vec, smoothed_loss = [], 0. loss_vec, smoothed_loss = [], 0.
tic = time.time() tic = time.time()
while self.iter < stop_step: while self.iter < stop_step:
# Test if necessary. # Test if necessary.
if self._param.test_interval and \ if self._is_root and self._param.test_interval > 0 and \
self.iter % self._param.test_interval == 0: self.iter % self._param.test_interval == 0:
if (self.iter == 0 and if (self.iter == 0 and self._param.test_initialization) or \
self._param.test_initialization) or self.iter != 0: self.iter != 0:
for test_idx in range(len(self._test_nets)): for test_idx in range(len(self._test_nets)):
self.test(test_idx) self.test(test_idx)
# Forward, backward and compute loss. # Forward, backward and compute loss.
loss = 0. loss = 0.
for i in range(self._param.iter_size): for i in range(self._param.iter_size):
self._net.forward_backward(return_outputs=False) self._net.forward_backward()
if self._is_root: if self._is_root:
for e in self.net.losses: for e in self.net.losses:
values = e.get_value().flatten() values = e.get_value().flatten()
...@@ -322,24 +259,23 @@ class Solver(object): ...@@ -322,24 +259,23 @@ class Solver(object):
idx = (self.iter - start_step) % self._param.average_loss idx = (self.iter - start_step) % self._param.average_loss
smoothed_loss += ((loss - loss_vec[idx]) / self._param.average_loss) smoothed_loss += ((loss - loss_vec[idx]) / self._param.average_loss)
loss_vec[idx] = loss loss_vec[idx] = loss
# Apply Update. # Apply Update.
self._get_learning_rate() self._get_learning_rate()
self._apply_update() self._apply_update()
# Display iteration info.
# Display.
if self._is_root and self._param.display: if self._is_root and self._param.display:
if self.iter % self._param.display == 0: if self.iter % self._param.display == 0:
print('Iteration %d, lr = %s, loss = %f, time = %.2fs' % ( logging.info(
self.iter, str(self.base_lr), smoothed_loss, time.time() - tic)) 'Iteration %d, lr = %s, loss = %f, time = %.2fs'
% (self.iter, str(self.base_lr), smoothed_loss, time.time() - tic))
tic = time.time() tic = time.time()
for idx, net_output in enumerate(self.net.outputs): for idx, net_output in enumerate(self.net.outputs):
values = self.net.blobs[net_output].data.get_value().flatten() values = self.net.blobs[net_output].data.get_value().flatten()
for v in values: for v in values:
print(' ' * 10 + 'Train net output #{}({}): {}' logging.info(
.format(idx, net_output, v)) ' ' * 10 + 'Train net output #{}({}): {}'
.format(idx, net_output, v))
self.iter = self.iter + 1 self.iter = self.iter + 1
# Snapshot if necessary. # Snapshot if necessary.
if self._param.snapshot: if self._param.snapshot:
if self.iter % self._param.snapshot == 0: if self.iter % self._param.snapshot == 0:
...@@ -359,7 +295,7 @@ class Solver(object): ...@@ -359,7 +295,7 @@ class Solver(object):
test_iter = self._param.test_iter[test_idx] test_iter = self._param.test_iter[test_idx]
for iter in range(test_iter): for iter in range(test_iter):
net.forward_backward(return_outputs=False) net.forward()
if not self._is_root: if not self._is_root:
continue continue
if iter == 0: if iter == 0:
...@@ -376,27 +312,25 @@ class Solver(object): ...@@ -376,27 +312,25 @@ class Solver(object):
test_score[i] += value test_score[i] += value
i += 1 i += 1
if not self._is_root: logging.info('Iteration {}, Test net #{}'.format(self.iter, test_idx))
return
print('Iteration {}, Test net #{}'.format(self.iter, test_idx))
for i, score in enumerate(test_score): for i, score in enumerate(test_score):
print(' ' * 10 + 'Test net output #%d(%s): %.4f' logging.info(
% (i, output_id[i], score / test_iter)) ' ' * 10 + 'Test net output #%d(%s): %.4f'
% (i, output_id[i], score / test_iter))
class AdamSolver(Solver): class AdamSolver(Solver):
r"""The Adam solver. r"""The Adam solver.
`[Kingma & Ba, 2014] <https://arxiv.org/abs/1412.6980>`_. `[Kingma & Ba, 2014] <https://arxiv.org/abs/1412.6980>`_.
Following hyper parameters will be taken: Examples:
```python ```python
caffe_pb2.SolverParameter( solver {
base_lr=0., base_lr: 0.001
momentum=0., momentum: 0.9
momentum2=0.999, momentum2: 0.999
delta=1e-8, delta: 1e-8
) }
``` ```
...@@ -425,13 +359,13 @@ class NesterovSolver(Solver): ...@@ -425,13 +359,13 @@ class NesterovSolver(Solver):
r"""The Nesterov-SGD solver. r"""The Nesterov-SGD solver.
`[Sutskever et.al, 2013] <http://www.cs.toronto.edu/~hinton/absps/momentum.pdf>`_. `[Sutskever et.al, 2013] <http://www.cs.toronto.edu/~hinton/absps/momentum.pdf>`_.
Following hyper parameters will be taken: Examples:
```python ```python
caffe_pb2.SolverParameter( solver {
base_lr=0., base_lr: 0.01
momentum=0., momentum: 0.9
) }
``` ```
""" """
...@@ -457,13 +391,13 @@ class RMSPropSolver(Solver): ...@@ -457,13 +391,13 @@ class RMSPropSolver(Solver):
r"""The RMSProp solver. r"""The RMSProp solver.
`[Hinton et.al, 2013] <http://www.cs.utoronto.ca/~bonner/courses/2016s/csc321/lectures/lec6.pdf>`_. `[Hinton et.al, 2013] <http://www.cs.utoronto.ca/~bonner/courses/2016s/csc321/lectures/lec6.pdf>`_.
Following hyper parameters will be taken: Examples:
```python ```python
caffe_pb2.SolverParameter( solver {
base_lr=0., base_lr: 0.01
rms_decay=0.99, rms_decay: 0.99
delta=1e-8, delta: 1e-8
) }
``` ```
...@@ -491,12 +425,12 @@ class SGDSolver(Solver): ...@@ -491,12 +425,12 @@ class SGDSolver(Solver):
r"""The Momentum-SGD solver. r"""The Momentum-SGD solver.
`[Polyak, 1964] <https://doi.org/10.1016/0041-5553(64)90137-5>`_. `[Polyak, 1964] <https://doi.org/10.1016/0041-5553(64)90137-5>`_.
Following hyper parameters will be taken: Examples:
```python ```python
caffe_pb2.SolverParameter( solver {
base_lr=0., base_lr: 0.01
momentum=0., momentum: 0.9
) }
``` ```
......
...@@ -3,9 +3,9 @@ Building Dragon Documentation ...@@ -3,9 +3,9 @@ Building Dragon Documentation
This page will help you build the following documentation: This page will help you build the following documentation:
Dragon C++ API: http://dragon.seetatech.com/api/cc Dragon C++ API: https://dragon.seetatech.com/api/cc
Dragon Python API: http://dragon.seetatech.com/api/python Dragon Python API: https://dragon.seetatech.com/api/python
Build Documentation of C++ API Build Documentation of C++ API
------------------------------ ------------------------------
......
...@@ -34,10 +34,6 @@ vm.caffe.layers ...@@ -34,10 +34,6 @@ vm.caffe.layers
`class Deconvolution <layers/Deconvolution.html>`_ `class Deconvolution <layers/Deconvolution.html>`_
: Apply the n-dimension deconvolution. : Apply the n-dimension deconvolution.
`class DepthwiseConv2d <layers/DepthwiseConv2d.html>`_
: Apply the 2d depthwise convolution.
`[Chollet, 2016] <https://arxiv.org/abs/1610.02357>`_.
`class Dropout <layers/Dropout.html>`_ `class Dropout <layers/Dropout.html>`_
: Set the elements of the input to zero randomly. : Set the elements of the input to zero randomly.
`[Srivastava et.al, 2014] <http://jmlr.org/papers/v15/srivastava14a.html>`_. `[Srivastava et.al, 2014] <http://jmlr.org/papers/v15/srivastava14a.html>`_.
...@@ -58,18 +54,6 @@ vm.caffe.layers ...@@ -58,18 +54,6 @@ vm.caffe.layers
`class Flatten <layers/Flatten.html>`_ `class Flatten <layers/Flatten.html>`_
: Flatten the input along the given axes. : Flatten the input along the given axes.
`class FusedBatchNorm <layers/FusedBatchNorm.html>`_
: Apply the fused batch normalization.
`[Ioffe & Szegedy, 2015] <https://arxiv.org/abs/1502.03167>`_.
`class FusedGroupNorm <layers/FusedBatchNorm.html>`_
: Apply the fused group normalization.
`[Wu & He, 2018] <https://arxiv.org/abs/1803.08494>`_.
`class GroupNorm <layers/FusedBatchNorm.html>`_
: Apply the group normalization.
`[Wu & He, 2018] <https://arxiv.org/abs/1803.08494>`_.
`class InnerProduct <layers/InnerProduct.html>`_ `class InnerProduct <layers/InnerProduct.html>`_
: Compute the dense matrix multiplication along the given axes. : Compute the dense matrix multiplication along the given axes.
...@@ -121,10 +105,6 @@ vm.caffe.layers ...@@ -121,10 +105,6 @@ vm.caffe.layers
`class Scale <layers/Scale.html>`_ `class Scale <layers/Scale.html>`_
: Compute the affine transformation along the given axes. : Compute the affine transformation along the given axes.
`class SELU <layers/SELU.html>`_
: Apply the scaled exponential linear unit.
`[Klambauer et.al, 2017] <https://arxiv.org/abs/1706.02515>`_.
`class Sigmoid <layers/Sigmoid.html>`_ `class Sigmoid <layers/Sigmoid.html>`_
: Apply the sigmoid function. : Apply the sigmoid function.
...@@ -145,7 +125,7 @@ vm.caffe.layers ...@@ -145,7 +125,7 @@ vm.caffe.layers
: Apply the tanh function. : Apply the tanh function.
`class Tile <layers/Tile.html>`_ `class Tile <layers/Tile.html>`_
: Tile the input according to the given multiples. : Repeat the input along the given axis.
.. toctree:: .. toctree::
:hidden: :hidden:
...@@ -153,21 +133,16 @@ vm.caffe.layers ...@@ -153,21 +133,16 @@ vm.caffe.layers
layers/Accuracy layers/Accuracy
layers/ArgMax layers/ArgMax
layers/BatchNorm layers/BatchNorm
layers/Cast
layers/Concat layers/Concat
layers/Convolution layers/Convolution
layers/Crop layers/Crop
layers/Data layers/Data
layers/Deconvolution layers/Deconvolution
layers/DepthwiseConv2d
layers/Dropout layers/Dropout
layers/Eltwise layers/Eltwise
layers/ELU layers/ELU
layers/EuclideanLoss layers/EuclideanLoss
layers/Flatten layers/Flatten
layers/FusedBatchNorm
layers/FusedGroupNorm
layers/GroupNorm
layers/InnerProduct layers/InnerProduct
layers/Input layers/Input
layers/LRN layers/LRN
...@@ -183,7 +158,6 @@ vm.caffe.layers ...@@ -183,7 +158,6 @@ vm.caffe.layers
layers/ROIAlign layers/ROIAlign
layers/ROIPooling layers/ROIPooling
layers/Scale layers/Scale
layers/SELU
layers/Sigmoid layers/Sigmoid
layers/SigmoidCrossEntropyLoss layers/SigmoidCrossEntropyLoss
layers/SmoothL1Loss layers/SmoothL1Loss
......
DepthwiseConv2d
===============
.. autoclass:: dragon.vm.caffe.layers.DepthwiseConv2d
.. raw:: html
<style>
h1:before {
content: "caffe.layers.";
color: #103d3e;
}
</style>
FusedBatchNorm
==============
.. autoclass:: dragon.vm.caffe.layers.FusedBatchNorm
.. raw:: html
<style>
h1:before {
content: "caffe.layers.";
color: #103d3e;
}
</style>
FusedGroupNorm
==============
.. autoclass:: dragon.vm.caffe.layers.FusedGroupNorm
.. raw:: html
<style>
h1:before {
content: "caffe.layers.";
color: #103d3e;
}
</style>
GroupNorm
=========
.. autoclass:: dragon.vm.caffe.layers.GroupNorm
.. raw:: html
<style>
h1:before {
content: "caffe.layers.";
color: #103d3e;
}
</style>
SELU
====
.. autoclass:: dragon.vm.caffe.layers.SELU
.. raw:: html
<style>
h1:before {
content: "caffe.layers.";
color: #103d3e;
}
</style>
...@@ -18,8 +18,8 @@ dragon ...@@ -18,8 +18,8 @@ dragon
`class TensorSpec <dragon/TensorSpec.html>`_ `class TensorSpec <dragon/TensorSpec.html>`_
: Spec to describe properties of a tensor. : Spec to describe properties of a tensor.
`class Workspace <dragon/Workspace_.html>`_ `class Workspace <dragon/Workspace.html>`_
: Space to isolate computations that share resources. : Sandbox to isolate the resources and computations.
Functions Functions
--------- ---------
...@@ -151,7 +151,7 @@ dragon ...@@ -151,7 +151,7 @@ dragon
: Return the identity of input with truncated gradient-flow. : Return the identity of input with truncated gradient-flow.
`tile(...) <dragon/tile.html>`_ `tile(...) <dragon/tile.html>`_
: Tile the input according to the given multiples. : Tile the input according to the given repeats.
`transpose(...) <dragon/transpose.html>`_ `transpose(...) <dragon/transpose.html>`_
: Permute the dimensions of input. : Permute the dimensions of input.
...@@ -217,7 +217,7 @@ dragon ...@@ -217,7 +217,7 @@ dragon
dragon/tile dragon/tile
dragon/transpose dragon/transpose
dragon/where dragon/where
dragon/Workspace_ dragon/Workspace
dragon/zeros dragon/zeros
dragon/zeros_like dragon/zeros_like
......
...@@ -14,10 +14,6 @@ gradient ...@@ -14,10 +14,6 @@ gradient
######## ########
.. automethod:: dragon.GradientTape.gradient .. automethod:: dragon.GradientTape.gradient
replay
######
.. automethod:: dragon.GradientTape.replay
reset reset
##### #####
.. automethod:: dragon.GradientTape.reset .. automethod:: dragon.GradientTape.reset
......
...@@ -30,6 +30,10 @@ shape ...@@ -30,6 +30,10 @@ shape
##### #####
.. autoattribute:: dragon.Tensor.shape .. autoattribute:: dragon.Tensor.shape
size
#####
.. autoattribute:: dragon.Tensor.size
Methods Methods
------- -------
......
Workspace
=========
.. autoclass:: dragon.Workspace
__init__
--------
.. automethod:: dragon.Workspace.__init__
Methods
-------
as_default
##########
.. automethod:: dragon.Workspace.as_default
clear
#####
.. automethod:: dragon.Workspace.clear
merge_from
##########
.. automethod:: dragon.Workspace.merge_from
.. raw:: html
<style>
h1:before {
content: "dragon.";
color: #103d3e;
}
</style>
dragon.workspace Workspace
================ =========
.. only:: html .. autoclass:: dragon.Workspace
Functions __init__
--------- --------
.. automethod:: dragon.Workspace.__init__
`feed_tensor(...) <workspace/feed_tensor.html>`_ Methods
: Copy the value to tensor. -------
`fetch_tensor(...) <workspace/fetch_tensor.html>`_ as_default
: Return the value of tensor. ##########
.. automethod:: dragon.Workspace.as_default
`has_tensor(...) <workspace/has_tensor.html>`_ feed_tensor
: Return a bool indicating if tensor is in current workspace. ###########
.. automethod:: dragon.Workspace.feed_tensor
`load(...) <workspace/load.html>`_ fetch_tensor
: Load tensors from a binary file. ############
.. automethod:: dragon.Workspace.fetch_tensor
`reset_tensor(...) <workspace/reset_tensor.html>`_ has_tensor
: Reset the memory of tensor. ##########
.. automethod:: dragon.Workspace.has_tensor
`run_operator(...) <workspace/run_operator.html>`_ merge_from
: Run the operators in current workspace. ##########
.. automethod:: dragon.Workspace.merge_from
`save(...) <workspace/save.html>`_ reset_tensor
: Serialize tensors into a binary file. ############
.. automethod:: dragon.Workspace.reset_tensor
.. toctree::
:hidden:
workspace/feed_tensor
workspace/fetch_tensor
workspace/has_tensor
workspace/load
workspace/reset_tensor
workspace/run_operator
workspace/save
.. raw:: html .. raw:: html
<style> <style>
h1:before { h1:before {
content: "Module: "; content: "dragon.";
color: #103d3e; color: #103d3e;
} }
</style> </style>
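A hedged usage sketch combining the Workspace methods documented on these two pages; the exact call signatures (string name vs. tensor argument, and `as_default()` behaving as a context manager) are assumptions, not taken from this documentation:

```python
import numpy as np
import dragon

ws = dragon.Workspace()
with ws.as_default():                      # assumed: routes the following calls into `ws`
    ws.feed_tensor('x', np.ones((2, 3)))   # assumed signature: (name, value)
    if ws.has_tensor('x'):
        print(ws.fetch_tensor('x').sum())  # read the value back as a NumPy array
ws.clear()                                 # release the resources held by `ws`
```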
feed_tensor
===========
.. autofunction:: dragon.workspace.feed_tensor
.. raw:: html
<style>
h1:before {
content: "dragon.workspace.";
color: #103d3e;
}
</style>
fetch_tensor
============
.. autofunction:: dragon.workspace.fetch_tensor
.. raw:: html
<style>
h1:before {
content: "dragon.workspace.";
color: #103d3e;
}
</style>
has_tensor
==========
.. autofunction:: dragon.workspace.has_tensor
.. raw:: html
<style>
h1:before {
content: "dragon.workspace.";
color: #103d3e;
}
</style>
load
====
.. autofunction:: dragon.workspace.load
.. raw:: html
<style>
h1:before {
content: "dragon.workspace.";
color: #103d3e;
}
</style>
reset_tensor
============
.. autofunction:: dragon.workspace.reset_tensor
.. raw:: html
<style>
h1:before {
content: "dragon.workspace.";
color: #103d3e;
}
</style>
run_operator
============
.. autofunction:: dragon.workspace.run_operator
.. raw:: html
<style>
h1:before {
content: "dragon.workspace.";
color: #103d3e;
}
</style>
save
====
.. autofunction:: dragon.workspace.save
.. raw:: html
<style>
h1:before {
content: "dragon.workspace.";
color: #103d3e;
}
</style>
...@@ -40,7 +40,6 @@ Dragon ...@@ -40,7 +40,6 @@ Dragon
* `dragon.nn <dragon/nn.html>`_ * `dragon.nn <dragon/nn.html>`_
* `dragon.optimizers <dragon/optimizers.html>`_ * `dragon.optimizers <dragon/optimizers.html>`_
* `dragon.random <dragon/random.html>`_ * `dragon.random <dragon/random.html>`_
* `dragon.workspace <dragon/workspace.html>`_
* `dragon.vision <dragon/vision.html>`_ * `dragon.vision <dragon/vision.html>`_
Caffe Caffe
...@@ -112,6 +111,7 @@ PyTorch ...@@ -112,6 +111,7 @@ PyTorch
This style involves the following components: This style involves the following components:
* `torch <torch.html>`_ * `torch <torch.html>`_
* `torch.autograd <torch/autograd.html>`_
* `torch.distributed <torch/distributed.html>`_ * `torch.distributed <torch/distributed.html>`_
* `torch.jit <torch/jit.html>`_ * `torch.jit <torch/jit.html>`_
* `torch.nn <torch/nn.html>`_ * `torch.nn <torch/nn.html>`_
...@@ -206,15 +206,9 @@ Modules ...@@ -206,15 +206,9 @@ Modules
`Module random <dragon/random.html>`_ `Module random <dragon/random.html>`_
: Native API for ``dragon.random`` namespace. : Native API for ``dragon.random`` namespace.
`Module workspace <dragon/workspace.html>`_
: Native API for ``dragon.workspace`` namespace.
`Module vision <dragon/vision.html>`_ `Module vision <dragon/vision.html>`_
: Native API for ``dragon.vision`` namespace. : Native API for ``dragon.vision`` namespace.
`Module workspace <dragon/workspace.html>`_
: Native API for ``dragon.workspace`` namespace.
`Module vm.caffe <caffe.html>`_ `Module vm.caffe <caffe.html>`_
: Virtual API for ``caffe`` namespace. : Virtual API for ``caffe`` namespace.
...@@ -278,6 +272,9 @@ Modules ...@@ -278,6 +272,9 @@ Modules
`Module vm.torch <torch.html>`_ `Module vm.torch <torch.html>`_
: Virtual API for ``torch`` namespace. : Virtual API for ``torch`` namespace.
`Module vm.torch.autograd <torch/autograd.html>`_
: Virtual API for ``torch.autograd`` namespace.
`Module vm.torch.distributed <torch/distributed.html>`_ `Module vm.torch.distributed <torch/distributed.html>`_
: Virtual API for ``torch.distributed`` namespace. : Virtual API for ``torch.distributed`` namespace.
...@@ -319,7 +316,6 @@ Modules ...@@ -319,7 +316,6 @@ Modules
dragon/nn dragon/nn
dragon/optimizers dragon/optimizers
dragon/random dragon/random
dragon/workspace
dragon/vision dragon/vision
caffe caffe
caffe/layers caffe/layers
...@@ -343,6 +339,7 @@ Modules ...@@ -343,6 +339,7 @@ Modules
tensorrt tensorrt
tensorrt/backend tensorrt/backend
torch torch
torch/autograd
torch/distributed torch/distributed
torch/jit torch/jit
torch/nn torch/nn
......
...@@ -15,11 +15,6 @@ gradient ...@@ -15,11 +15,6 @@ gradient
.. automethod:: dragon.GradientTape.gradient .. automethod:: dragon.GradientTape.gradient
:noindex: :noindex:
replay
######
.. automethod:: dragon.GradientTape.replay
:noindex:
reset reset
##### #####
.. automethod:: dragon.GradientTape.reset .. automethod:: dragon.GradientTape.reset
......
vm.torch.autograd
==================
.. only:: html
Functions
---------
`backward(...) <autograd/backward.html>`_
: Compute the derivatives of tensors w.r.t. graph leaves.
.. toctree::
:hidden:
autograd/backward
.. raw:: html
<style>
h1:before {
content: "Module: dragon.";
color: #103d3e;
}
</style>
Cast backward
==== ========
.. autoclass:: dragon.vm.caffe.layers.Cast .. autofunction:: dragon.vm.torch.autograd.backward
.. raw:: html .. raw:: html
<style> <style>
h1:before { h1:before {
content: "caffe.layers."; content: "torch.autograd.";
color: #103d3e; color: #103d3e;
} }
</style> </style>
...@@ -7,7 +7,7 @@ all_reduce ...@@ -7,7 +7,7 @@ all_reduce
<style> <style>
h1:before { h1:before {
content: "torch.nn.distributed."; content: "torch.distributed.";
color: #103d3e; color: #103d3e;
} }
</style> </style>
...@@ -7,7 +7,7 @@ broadcast ...@@ -7,7 +7,7 @@ broadcast
<style> <style>
h1:before { h1:before {
content: "torch.nn.distributed."; content: "torch.distributed.";
color: #103d3e; color: #103d3e;
} }
</style> </style>
...@@ -7,7 +7,7 @@ trace ...@@ -7,7 +7,7 @@ trace
<style> <style>
h1:before { h1:before {
content: "torch.nn.jit."; content: "torch.jit.";
color: #103d3e; color: #103d3e;
} }
</style> </style>
...@@ -10,25 +10,25 @@ __init__ ...@@ -10,25 +10,25 @@ __init__
Methods Methods
------- -------
accumulate_grad accumulate
############### ##########
.. automethod:: dragon.vm.torch.optim.Optimizer.accumulate_grad .. automethod:: dragon.vm.torch.optim.Optimizer.accumulate
:noindex: :noindex:
add_param_group add_param_group
############### ###############
.. automethod:: dragon.vm.torch.optim.Optimizer.add_param_group .. automethod:: dragon.vm.torch.optim.Optimizer.add_param_group
:noindex: :noindex:
step step
#### ####
.. automethod:: dragon.vm.torch.optim.Optimizer.step .. automethod:: dragon.vm.torch.optim.Optimizer.step
:noindex: :noindex:
zero_grad zero_grad
######### #########
.. automethod:: dragon.vm.torch.optim.Optimizer.zero_grad .. automethod:: dragon.vm.torch.optim.Optimizer.zero_grad
:noindex: :noindex:
.. raw:: html .. raw:: html
......
...@@ -10,9 +10,9 @@ __init__ ...@@ -10,9 +10,9 @@ __init__
Methods Methods
------- -------
accumulate_grad accumulate
############### ##########
.. automethod:: dragon.vm.torch.optim.Optimizer.accumulate_grad .. automethod:: dragon.vm.torch.optim.Optimizer.accumulate
add_param_group add_param_group
############### ###############
......
...@@ -10,25 +10,25 @@ __init__ ...@@ -10,25 +10,25 @@ __init__
Methods Methods
------- -------
accumulate_grad accumulate
############### ##########
.. automethod:: dragon.vm.torch.optim.Optimizer.accumulate_grad .. automethod:: dragon.vm.torch.optim.Optimizer.accumulate
:noindex: :noindex:
add_param_group add_param_group
############### ###############
.. automethod:: dragon.vm.torch.optim.Optimizer.add_param_group .. automethod:: dragon.vm.torch.optim.Optimizer.add_param_group
:noindex: :noindex:
step step
#### ####
.. automethod:: dragon.vm.torch.optim.Optimizer.step .. automethod:: dragon.vm.torch.optim.Optimizer.step
:noindex: :noindex:
zero_grad zero_grad
######### #########
.. automethod:: dragon.vm.torch.optim.Optimizer.zero_grad .. automethod:: dragon.vm.torch.optim.Optimizer.zero_grad
:noindex: :noindex:
.. raw:: html .. raw:: html
......
...@@ -10,25 +10,25 @@ __init__ ...@@ -10,25 +10,25 @@ __init__
Methods Methods
------- -------
accumulate_grad accumulate
############### ##########
.. automethod:: dragon.vm.torch.optim.Optimizer.accumulate_grad .. automethod:: dragon.vm.torch.optim.Optimizer.accumulate
:noindex: :noindex:
add_param_group add_param_group
############### ###############
.. automethod:: dragon.vm.torch.optim.Optimizer.add_param_group .. automethod:: dragon.vm.torch.optim.Optimizer.add_param_group
:noindex: :noindex:
step step
#### ####
.. automethod:: dragon.vm.torch.optim.Optimizer.step .. automethod:: dragon.vm.torch.optim.Optimizer.step
:noindex: :noindex:
zero_grad zero_grad
######### #########
.. automethod:: dragon.vm.torch.optim.Optimizer.zero_grad .. automethod:: dragon.vm.torch.optim.Optimizer.zero_grad
:noindex: :noindex:
.. raw:: html .. raw:: html
......
...@@ -7,7 +7,7 @@ namespace dragon { ...@@ -7,7 +7,7 @@ namespace dragon {
GraphBase::GraphBase(const GraphDef& def, Workspace* ws) GraphBase::GraphBase(const GraphDef& def, Workspace* ws)
: def_(def), ws_(ws), name_(def.name()), phase_("TEST") { : def_(def), ws_(ws), name_(def.name()), phase_("TEST") {
// Scan the defined arguments // Collect arguments
for (auto& arg : def_.arg()) { for (auto& arg : def_.arg()) {
CHECK_GT(arg.name().size(), 0); CHECK_GT(arg.name().size(), 0);
CHECK_EQ(args_.count(arg.name()), 0); CHECK_EQ(args_.count(arg.name()), 0);
...@@ -18,32 +18,31 @@ GraphBase::GraphBase(const GraphDef& def, Workspace* ws) ...@@ -18,32 +18,31 @@ GraphBase::GraphBase(const GraphDef& def, Workspace* ws)
// Collect outputs // Collect outputs
Set<string> outputs; Set<string> outputs;
for (const auto& op : def.op()) { for (const auto& op : def.op()) {
for (const auto& in : op.input()) for (const auto& input : op.input())
CHECK(outputs.count(in) || ws_->HasTensor(in)) CHECK(outputs.count(input) || ws_->HasTensor(input))
<< "\nInput: " << in << " for op: " << op.name() << " is unknown."; << "\nThe input <" << input << "> is not in graph.";
for (const auto& out : op.output()) for (const auto& output : op.output()) {
outputs.insert(out); outputs.insert(output);
}
} }
// Check targets // Check targets
Set<string> targets; Set<string> targets;
for (const auto& target : def.output()) { for (const auto& target : def.output()) {
CHECK(outputs.count(target) || ws_->HasTensor(target)) CHECK(outputs.count(target) || ws_->HasTensor(target))
<< "\nTarget: " << target << " does not exist in the graph."; << "\nThe output <" << target << "> is not in graph.";
targets.insert(target); targets.insert(target);
} }
// Check gradients // Check gradients
for (const auto& gradient : def.gradient()) { for (const auto& grad_info : def.grad_info()) {
const auto& cost = gradient.cost(); const auto& y = grad_info.y();
const auto& wrt = gradient.wrt(); CHECK_GT(targets.count(y), 0)
CHECK(outputs.count(cost) || ws_->HasTensor(cost)) << "\nThe derivative target <" << y << "> is not in outputs.";
<< "\nTarget: " << cost << "does not exist in the graph."; for (const auto& x : grad_info.xs()) {
CHECK(outputs.count(wrt) || ws_->HasTensor(wrt)) CHECK(outputs.count(x) || ws_->HasTensor(x))
<< "\nTarget: " << wrt << "does not exist in the graph."; << "\nThe differentiated input <" << x << "> is not in graph.";
CHECK_GT(targets.count(cost), 0) }
<< "\nTo solve d(" << cost << ")/d(" << wrt << "),\n"
<< cost << " should be set as a target.";
} }
} }
...@@ -54,21 +53,18 @@ bool Graph::Create(const GraphDef& def, Workspace* ws) { ...@@ -54,21 +53,18 @@ bool Graph::Create(const GraphDef& def, Workspace* ws) {
auto op_def(def.op(i)); auto op_def(def.op(i));
LOG(DEBUG) << "Create Operator " << op_def.name() << ": " << op_def.type(); LOG(DEBUG) << "Create Operator " << op_def.name() << ": " << op_def.type();
// Inherit device option if necessary // Inherit device option if necessary
if (!op_def.has_device_option() && has_device_option) if (!op_def.has_device_option() && has_device_option) {
op_def.mutable_device_option()->CopyFrom(def.device_option()); op_def.mutable_device_option()->CopyFrom(def.device_option());
}
Argument arg; Argument arg;
arg.set_name("allow_recomp");
arg.set_i(1);
op_def.add_arg()->CopyFrom(arg);
// For the last operator, enforce the synchronization // For the last operator, enforce the synchronization
if (i == def.op_size() - 1) { if (i == def.op_size() - 1) {
arg.set_name("do_sync"); arg.set_name("do_sync");
arg.set_i(1); arg.set_i(1);
op_def.add_arg()->CopyFrom(arg); op_def.add_arg()->CopyFrom(arg);
} }
ops_.push_back(NewOperator(op_def, ws)); cached_ops_.push_back(NewOperator(op_def, ws));
// Attatch the output aliases info cached_ops_.back()->set_output_aliases(output_aliases_);
ops_.back()->set_output_aliases(output_aliases_);
} }
return true; return true;
} }
...@@ -80,7 +76,7 @@ Graph::Graph(const GraphDef& def, Workspace* ws) : GraphBase(def, ws) { ...@@ -80,7 +76,7 @@ Graph::Graph(const GraphDef& def, Workspace* ws) : GraphBase(def, ws) {
GraphGradientMaker gradient_maker; GraphGradientMaker gradient_maker;
Map<string, vec32_t> subgraph_indices; Map<string, vec32_t> subgraph_indices;
int opt = 3; // defaults: O3 int opt = 3; // defaults: O3
if (args().count("optimization_level")) opt = arg("optimization_level").i(); if (args().count("optimization")) opt = arg("optimization").i();
if (opt >= 1) opt_def = graph_optim.PruneNodes(def); if (opt >= 1) opt_def = graph_optim.PruneNodes(def);
if (opt >= 2) graph_optim.AddInplace(opt_def, output_aliases_); if (opt >= 2) graph_optim.AddInplace(opt_def, output_aliases_);
if (opt >= 3) { if (opt >= 3) {
...@@ -101,22 +97,23 @@ Graph::Graph(const GraphDef& def, Workspace* ws) : GraphBase(def, ws) { ...@@ -101,22 +97,23 @@ Graph::Graph(const GraphDef& def, Workspace* ws) : GraphBase(def, ws) {
for (const auto& it : subgraph_indices) { for (const auto& it : subgraph_indices) {
subgraph[it.first] = vector<OperatorBase*>(); subgraph[it.first] = vector<OperatorBase*>();
for (const auto& idx : subgraph_indices[it.first]) for (const auto& idx : subgraph_indices[it.first])
subgraph[it.first].push_back(ops_[idx]); subgraph[it.first].push_back(cached_ops_[idx]);
} }
for (const auto& op : ops_) for (auto* op : cached_ops_) {
op->set_subgraph(subgraph); op->set_subgraph(subgraph);
}
} }
} }
bool Graph::Run(const string& incl, const string& excl, int stream_id) { bool Graph::Run(const string& include, const string& exclude, int stream) {
LOG(DEBUG) << "Run Graph: " << name(); LOG(DEBUG) << "Run Graph: " << name();
for (auto op : ops_) { for (auto* op : cached_ops_) {
if (!incl.empty() && !str::find(op->type(), incl)) continue; if (!include.empty() && !str::find(op->type(), include)) continue;
if (!excl.empty() && str::find(op->type(), excl)) continue; if (!exclude.empty() && str::find(op->type(), exclude)) continue;
op->SwitchToPhase(phase()); op->SwitchToPhase(phase());
LOG(DEBUG) << "$ Before Operator: " << op->name(); LOG(DEBUG) << "Run Op: " << op->name();
op->Run(stream_id); op->Run(stream);
LOG(DEBUG) << "$ After Operator: " << op->name(); LOG(DEBUG) << "Finish Op: " << op->name();
} }
return true; return true;
} }
......
...@@ -88,8 +88,8 @@ class Graph : public GraphBase { ...@@ -88,8 +88,8 @@ class Graph : public GraphBase {
/*! \brief Default Destructor */ /*! \brief Default Destructor */
virtual ~Graph() { virtual ~Graph() {
for (auto* op : ops_) { for (auto* cached_op : cached_ops_) {
delete op; delete cached_op;
} }
} }
...@@ -100,8 +100,8 @@ class Graph : public GraphBase { ...@@ -100,8 +100,8 @@ class Graph : public GraphBase {
bool Run(const string&, const string&, int = 0) override; bool Run(const string&, const string&, int = 0) override;
protected: protected:
/*! \brief Store the internal operators */ /*! \brief The cached operators */
vector<OperatorBase*> ops_; vector<OperatorBase*> cached_ops_;
/*! \brief Store the candidate output aliases */ /*! \brief Store the candidate output aliases */
Map<string, Set<string>> output_aliases_; Map<string, Set<string>> output_aliases_;
......
...@@ -4,23 +4,24 @@ ...@@ -4,23 +4,24 @@
namespace dragon { namespace dragon {
bool GraphGradientMaker::CheckGrad( bool GraphGradientMaker::CheckGrad(
const OperatorDef& forward_op, const OperatorDef& op_def,
const Set<string>& targets, const Set<string>& targets,
vector<pair<string, int>>& gen_grads) { vector<pair<string, int>>& gen_grads) {
if (NoGradientRegistry()->Has(forward_op.type())) { if (NoGradientRegistry()->Has(op_def.type())) {
for (auto& input : forward_op.input()) for (auto& input : op_def.input()) {
blacklist_set_.insert(input); blacklist_set_.insert(input);
}
return true; return true;
} }
for (int i = 0; i < forward_op.output_size(); ++i) { for (int i = 0; i < op_def.output_size(); ++i) {
const auto& output = forward_op.output(i); const auto& output = op_def.output(i);
if (!inputs_to_grads_.count(output)) { if (!inputs_to_grads_.count(output)) {
if (blacklist_set_.count(output)) return true; if (blacklist_set_.count(output)) return true;
if (targets.count(output)) { if (targets.count(output)) {
// Consider to generate virtual gradient for targets // Consider to generate virtual gradient for targets
gen_grads.push_back({output, i}); gen_grads.push_back({output, i});
inputs_to_grads_[output] = output + "_grad"; inputs_to_grads_[output] = output + "_grad";
} else if (forward_op.output_size() == 1) { } else if (op_def.output_size() == 1) {
return true; // We can skip this op, obviously return true; // We can skip this op, obviously
} }
} }
...@@ -30,7 +31,7 @@ bool GraphGradientMaker::CheckGrad( ...@@ -30,7 +31,7 @@ bool GraphGradientMaker::CheckGrad(
} }
void GraphGradientMaker::Make( void GraphGradientMaker::Make(
const vector<OperatorDef*>& forward_ops, const vector<OperatorDef*>& op_defs,
const vector<string>& targets, const vector<string>& targets,
const vector<string>& input_grads, const vector<string>& input_grads,
GraphDef& backward_def) { GraphDef& backward_def) {
...@@ -39,11 +40,11 @@ void GraphGradientMaker::Make( ...@@ -39,11 +40,11 @@ void GraphGradientMaker::Make(
Map<string, string> targets_to_grads; Map<string, string> targets_to_grads;
// PLAY for the forward // PLAY for the forward
for (auto* op : forward_ops) { for (auto* op_def : op_defs) {
if (NoGradientRegistry()->Has(op->type())) continue; if (NoGradientRegistry()->Has(op_def->type())) continue;
for (const auto& input : op->input()) { for (const auto& input : op_def->input()) {
bool input_in_outputs = false; bool input_in_outputs = false;
for (auto& output : op->output()) for (auto& output : op_def->output())
if (output == input) { if (output == input) {
input_in_outputs = true; input_in_outputs = true;
break; break;
...@@ -62,9 +63,9 @@ void GraphGradientMaker::Make( ...@@ -62,9 +63,9 @@ void GraphGradientMaker::Make(
targets_set.insert(targets[i]); targets_set.insert(targets[i]);
} }
for (int op_idx = (int)forward_ops.size() - 1; op_idx >= 0; --op_idx) { for (int op_idx = (int)op_defs.size() - 1; op_idx >= 0; --op_idx) {
// Collect inputs and outputs, generate raw gradient ops // Collect inputs and outputs, generate raw gradient ops
const OperatorDef& op = *forward_ops[op_idx]; const OperatorDef& op = *op_defs[op_idx];
vector<pair<string, int>> gen_grads; vector<pair<string, int>> gen_grads;
bool is_skip = CheckGrad(op, targets_set, gen_grads); bool is_skip = CheckGrad(op, targets_set, gen_grads);
vector<string> g_outputs; vector<string> g_outputs;
...@@ -183,9 +184,9 @@ GraphDef GraphGradientMaker::Share(const GraphDef& input_def) { ...@@ -183,9 +184,9 @@ GraphDef GraphGradientMaker::Share(const GraphDef& input_def) {
// Flag the gathering gradients // Flag the gathering gradients
if (op.type() == "GradientGather") { if (op.type() == "GradientGather") {
invalid_ops.insert(op_idx); invalid_ops.insert(op_idx);
if (ignored_grads_.count(op.output(0))) { if (empty_grads_.count(op.output(0))) {
for (const auto& input : op.input()) { for (const auto& input : op.input()) {
ignored_grads_.insert(input); empty_grads_.insert(input);
} }
continue; continue;
} else { } else {
...@@ -200,7 +201,7 @@ GraphDef GraphGradientMaker::Share(const GraphDef& input_def) { ...@@ -200,7 +201,7 @@ GraphDef GraphGradientMaker::Share(const GraphDef& input_def) {
} }
// Count the references to detect leaves // Count the references to detect leaves
for (const auto& input : op.input()) { for (const auto& input : op.input()) {
if (str::find(input, "grad")) { if (str::endswith(input, "_grad")) {
ref_count[input] += 1; ref_count[input] += 1;
} }
} }
...@@ -293,21 +294,17 @@ GraphDef GraphGradientMaker::Share(const GraphDef& input_def) { ...@@ -293,21 +294,17 @@ GraphDef GraphGradientMaker::Share(const GraphDef& input_def) {
// Rewrite output gradients // Rewrite output gradients
for (int i = 0; i < op->output_size(); ++i) { for (int i = 0; i < op->output_size(); ++i) {
if (str::startswith(op->type(), "Python")) continue;
const string& output = op->output(i); const string& output = op->output(i);
if (output.empty() || str::startswith(output, "/share/")) continue; if (output.empty() || str::startswith(output, "/share/buffer")) continue;
if (ignored_grads_.count(output) > 0) { if (empty_grads_.count(output) > 0) {
// Prune for non-trainable leafs
*op->mutable_output(i) = ""; *op->mutable_output(i) = "";
continue; continue;
} }
if (hooked_grads_.empty()) { // Protection for leaves
// Protection for leaves if (ref_count.count(output) == 0) continue;
if (ref_count.count(output) == 0) continue; // Protection for sources and leaves
} else { if (retained_grads_.count(output) > 0) continue;
// Protection for sources
if (hooked_grads_.count(output) > 0) continue;
}
if (op->type() == "PythonPluginGradient") continue;
string new_output = output; string new_output = output;
if (inplace_flags[i] >= 0) { if (inplace_flags[i] >= 0) {
new_output = op->input(inplace_flags[i]); new_output = op->input(inplace_flags[i]);
......
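Note that the reference counting in Share() now keys on the exact "_grad" suffix rather than any substring hit. A standalone sketch of that counting step, using only the standard library (the helper ends_with stands in for str::endswith, and the tensor names are illustrative):

#include <iostream>
#include <map>
#include <string>
#include <vector>

// Helper standing in for str::endswith.
static bool ends_with(const std::string& s, const std::string& suffix) {
  return s.size() >= suffix.size() &&
         s.compare(s.size() - suffix.size(), suffix.size(), suffix) == 0;
}

int main() {
  // Inputs of two hypothetical gradient ops.
  std::vector<std::vector<std::string>> op_inputs = {
      {"conv1/W", "conv2_grad"}, {"conv2_grad", "relu1_grad"}};
  std::map<std::string, int> ref_count;
  for (const auto& inputs : op_inputs) {
    for (const auto& input : inputs) {
      // Only exact "_grad" suffixes are counted; "conv1/W" is ignored.
      if (ends_with(input, "_grad")) ref_count[input] += 1;
    }
  }
  for (const auto& it : ref_count) {
    std::cout << it.first << " -> " << it.second << "\n";  // conv2_grad -> 2
  }
  return 0;
}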
...@@ -21,22 +21,22 @@ class DRAGON_API GraphGradientMaker { ...@@ -21,22 +21,22 @@ class DRAGON_API GraphGradientMaker {
public: public:
/*! \brief Generate a backward graph from the forward ops */ /*! \brief Generate a backward graph from the forward ops */
void Make( void Make(
const vector<OperatorDef*>& forward_ops, const vector<OperatorDef*>& op_defs,
const vector<string>& targets, const vector<string>& targets,
const vector<string>& input_grads, const vector<string>& input_grads,
GraphDef& backward_def); GraphDef& graph_def);
/*! \brief Rewrite a graph to share the intermediate grads */ /*! \brief Rewrite a graph to share the intermediate grads */
GraphDef Share(const GraphDef& input_def); GraphDef Share(const GraphDef& input_def);
/*! \brief Add a hooked gradient */ /*! \brief Add an empty gradient */
void add_hooked_grad(const string& name) { void add_empty_grad(const string& name) {
hooked_grads_.insert(name); empty_grads_.insert(name);
} }
/*! \brief Add an ignored gradient */ /*! \brief Add a retained gradient */
void add_ignored_grad(const string& name) { void add_retained_grad(const string& name) {
ignored_grads_.insert(name); retained_grads_.insert(name);
} }
/*! \brief Set the prefix of backward op name */ /*! \brief Set the prefix of backward op name */
...@@ -47,32 +47,32 @@ class DRAGON_API GraphGradientMaker { ...@@ -47,32 +47,32 @@ class DRAGON_API GraphGradientMaker {
private: private:
/*! \brief Check the missing grads of the backward procedure */ /*! \brief Check the missing grads of the backward procedure */
bool CheckGrad( bool CheckGrad(
const OperatorDef& forward_op, const OperatorDef& op_def,
const Set<string>& targets, const Set<string>& targets,
vector<pair<string, int>>& gen_grads); vector<pair<string, int>>& gen_grads);
/*! \brief Return a dummy operator name */ /*! \brief Return a dummy operator name */
string GetOperatorName() { string GetOperatorName() {
if (op_prefix_.empty()) return "Generic"; if (op_prefix_.empty()) return "GradientOp";
return op_prefix_ + str::to(op_index_++); return op_prefix_ + str::to(op_index_++);
} }
/*! \brief Store the mapping of intermediate grads */ /*! \brief The mapping from input to grad */
Map<string, string> inputs_to_grads_; Map<string, string> inputs_to_grads_;
/*! \brief Store the non-gradient outputs */ /*! \brief The non-gradient outputs */
Set<string> blacklist_set_; Set<string> blacklist_set_;
/*! \brief Store the non-shared gradients */ /*! \brief The gradients should be retained */
Set<string> hooked_grads_; Set<string> retained_grads_;
/*! \brief Store the gradients that are not required */ /*! \brief The gradients should be set to empty */
Set<string> ignored_grads_; Set<string> empty_grads_;
/*! \brief Store the prefix of dummy operator name */ /*! \brief The prefix of op name */
string op_prefix_; string op_prefix_;
/*! \brief Store the counter of dummy operator name */ /*! \brief The counter of op name */
int64_t op_index_ = 0; int64_t op_index_ = 0;
}; };
......
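Taken together, the renamed hooks mirror the RunBackward binding later in this commit: gradients of the sources are kept alive with add_retained_grad, unwanted ones are declared with add_empty_grad, and Share() rewrites the remainder onto shared buffers. A hedged sketch of such a driver, assuming the dragon headers diffed in this commit are available (the function name and the empty input_grads list are illustrative):

#include <string>
#include <vector>

// Sketch only: assumes the dragon core headers whose diffs appear above.
void RunBackwardSketch(
    dragon::Workspace* ws,
    const std::vector<dragon::OperatorDef*>& forward_defs,
    const std::vector<std::string>& targets,
    const std::vector<std::string>& sources) {
  dragon::GraphDef backward_def;
  dragon::GraphGradientMaker maker;
  // Gradients of the sources must survive the buffer-sharing pass.
  for (const auto& name : sources) {
    maker.add_retained_grad(name + "_grad");
  }
  maker.Make(forward_defs, targets, /* input_grads = */ {}, backward_def);
  backward_def = maker.Share(backward_def);
  for (const auto& op_def : backward_def.op()) {
    ws->RunOperator(op_def);
  }
}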
...@@ -39,14 +39,12 @@ GraphDef GraphOptimizer::PruneNodes(const GraphDef& input_def) { ...@@ -39,14 +39,12 @@ GraphDef GraphOptimizer::PruneNodes(const GraphDef& input_def) {
BackwardPrunePass(target); BackwardPrunePass(target);
} }
// Forward pass from gradients for (const auto& grad_info : input_def.grad_info()) {
for (const auto& gradient : input_def.gradient()) { const auto u = grad_info.y() + "_grad";
auto u = gradient.cost() + "_grad"; for (const auto& x : grad_info.xs()) {
auto v = gradient.wrt() + "_grad"; visited_.clear();
if (ws_->HasTensor(u)) u = ws_->GetTensor(u)->name(); ForwardPrunePass(u, x + "_grad", std::deque<string>({u}));
if (ws_->HasTensor(v)) v = ws_->GetTensor(v)->name(); }
visited_.clear();
ForwardPrunePass(u, v, vector<string>({u}));
} }
// Select all colored operators // Select all colored operators
...@@ -64,7 +62,6 @@ GraphDef GraphOptimizer::PruneNodes(const GraphDef& input_def) { ...@@ -64,7 +62,6 @@ GraphDef GraphOptimizer::PruneNodes(const GraphDef& input_def) {
// Generate the final op sequence // Generate the final op sequence
map<int, OperatorDef> final_sequence; map<int, OperatorDef> final_sequence;
for (auto op_idx : selected_op_indices) { for (auto op_idx : selected_op_indices) {
const auto& op = input_def.op(op_idx); const auto& op = input_def.op(op_idx);
auto new_op(input_def.op(op_idx)); auto new_op(input_def.op(op_idx));
...@@ -308,11 +305,13 @@ GraphDef GraphOptimizer::SimulateGC(const GraphDef& input_def) { ...@@ -308,11 +305,13 @@ GraphDef GraphOptimizer::SimulateGC(const GraphDef& input_def) {
void GraphOptimizer::ForwardPrunePass( void GraphOptimizer::ForwardPrunePass(
const string& u, const string& u,
const string& leaf, const string& leaf,
const vector<string>& path) { const std::deque<string>& path) {
if (visited_.count(u)) { if (visited_.count(u)) {
if (visited_[u]) if (visited_[u]) {
for (const auto& node : path) for (const auto& node : path) {
visited_[node] = colored_[node] = true; visited_[node] = colored_[node] = true;
}
}
return; return;
} }
visited_[u] = false; visited_[u] = false;
...@@ -321,8 +320,9 @@ void GraphOptimizer::ForwardPrunePass( ...@@ -321,8 +320,9 @@ void GraphOptimizer::ForwardPrunePass(
auto new_path(path); auto new_path(path);
new_path.push_back(v); new_path.push_back(v);
if (v == leaf) { if (v == leaf) {
for (const auto& node : new_path) for (const auto& node : new_path) {
visited_[node] = colored_[node] = true; visited_[node] = colored_[node] = true;
}
return; return;
} }
ForwardPrunePass(v, leaf, new_path); ForwardPrunePass(v, leaf, new_path);
......
...@@ -56,7 +56,7 @@ class GraphOptimizer { ...@@ -56,7 +56,7 @@ class GraphOptimizer {
void ForwardPrunePass( void ForwardPrunePass(
const string& u, const string& u,
const string& leaf, const string& leaf,
const vector<string>& path); const std::deque<string>& path);
/*! \brief Pass from targets to remove unused nodes */ /*! \brief Pass from targets to remove unused nodes */
void BackwardPrunePass(const string& v); void BackwardPrunePass(const string& v);
......
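The forward prune pass colours every node that lies on a path from a gradient source to a requested leaf. A standalone toy illustration of that path-colouring idea (plain std::map graph with made-up tensor names; the visited_ memoisation of the real pass is omitted for brevity):

#include <deque>
#include <iostream>
#include <map>
#include <string>
#include <vector>

using ToyGraph = std::map<std::string, std::vector<std::string>>;

// Colour every node that lies on some path from `u` to `leaf`.
void ForwardPrune(
    const ToyGraph& graph,
    const std::string& u,
    const std::string& leaf,
    std::deque<std::string> path,
    std::map<std::string, bool>& colored) {
  if (u == leaf) {
    for (const auto& node : path) colored[node] = true;
    return;
  }
  const auto it = graph.find(u);
  if (it == graph.end()) return;
  for (const auto& v : it->second) {
    auto new_path(path);
    new_path.push_back(v);
    ForwardPrune(graph, v, leaf, new_path, colored);
  }
}

int main() {
  ToyGraph g = {{"loss_grad", {"fc_grad", "aux_grad"}},
                {"fc_grad", {"conv_grad"}}};
  std::map<std::string, bool> colored;
  ForwardPrune(g, "loss_grad", "conv_grad", {"loss_grad"}, colored);
  for (const auto& it : colored) std::cout << it.first << "\n";
  // Prints conv_grad, fc_grad, loss_grad; aux_grad stays uncoloured.
  return 0;
}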
...@@ -41,14 +41,11 @@ OperatorBase::OperatorBase(const OperatorDef& def, Workspace* ws) ...@@ -41,14 +41,11 @@ OperatorBase::OperatorBase(const OperatorDef& def, Workspace* ws)
} }
} }
template <class Context> // template <class Context>
Operator<Context>::Operator(const OperatorDef& def, Workspace* ws) // Operator<Context>::Operator(const OperatorDef& def, Workspace* ws)
: OperatorBase(def, ws), // : OperatorBase(def, ws),
ctx_(def.device_option()), // ctx_(def.device_option()),
do_sync_(OpArg<bool>("do_sync", false)), // do_sync_(OpArg<bool>("do_sync", false)) {}
allow_recomp_(OpArg<bool>("allow_recomp", false)) {
allow_run_ = (!(OutputSize() == 1 && !Output(0)->has_name()));
}
Tensor& OperatorBase::Input(int i) { Tensor& OperatorBase::Input(int i) {
CHECK_LT(i, (int)inputs_.size()); CHECK_LT(i, (int)inputs_.size());
...@@ -112,32 +109,32 @@ OperatorBase* OperatorBase::UpdateFrom(const OperatorDef& def) { ...@@ -112,32 +109,32 @@ OperatorBase* OperatorBase::UpdateFrom(const OperatorDef& def) {
handle_ = def.name(); handle_ = def.name();
inputs_.resize(def.input_size()); inputs_.resize(def.input_size());
outputs_.resize(def.output_size()); outputs_.resize(def.output_size());
for (int i = 0; i < inputs_.size(); i++) for (int i = 0; i < inputs_.size(); i++) {
inputs_[i] = ws()->GetTensor(def.input(i)); inputs_[i] = ws()->GetTensor(def.input(i));
for (int i = 0; i < outputs_.size(); i++) }
for (int i = 0; i < outputs_.size(); i++) {
outputs_[i] = ws()->CreateTensor(def.output(i)); outputs_[i] = ws()->CreateTensor(def.output(i));
}
return this; return this;
} }
template <class Context> template <class Context>
void Operator<Context>::Prepare() { void Operator<Context>::Prepare() {
string tensor_name;
size_t ver_pos;
int version;
for (int i = 0; i < InputSize(); i++) { for (int i = 0; i < InputSize(); i++) {
if (Input(i).version() >= 0) { if (Input(i).version() >= 0) {
tensor_name = def().input(i); const auto& name = def().input(i);
ver_pos = tensor_name.find("/ver:"); auto ver_pos = name.find("/ver:");
version = std::atoi(tensor_name.substr(ver_pos + 5).c_str()); auto version = std::atoi(name.substr(ver_pos + 5).c_str());
if (version == Input(i).version()) continue; if (version == Input(i).version()) continue;
LOG(DEBUG) << "Excepted version of Tensor(" + Input(i).name() + ") " LOG(DEBUG) << "Excepted version of Tensor(" + Input(i).name() + ") "
<< "is " << version << ", got " << Input(i).version() << "is " << version << ", got " << Input(i).version()
<< ". Recompute."; << ". Recompute.";
Tensor* flag = ws()->GetTensor("/share/flag/recomputing"); Tensor* flag = ws()->GetTensor("/share/flag/recomputing");
flag->mutable_data<bool, CPUContext>()[0] = true; flag->mutable_data<bool, CPUContext>()[0] = true;
vector<OperatorBase*>& chain = subgraph()[tensor_name]; vector<OperatorBase*>& chain = subgraph()[name];
for (auto* op : chain) for (auto* op : chain) {
op->Run(ctx()->stream_id()); op->Run(ctx()->stream_id());
}
flag->mutable_data<bool, CPUContext>()[0] = false; flag->mutable_data<bool, CPUContext>()[0] = false;
} }
} }
...@@ -145,14 +142,11 @@ void Operator<Context>::Prepare() { ...@@ -145,14 +142,11 @@ void Operator<Context>::Prepare() {
template <class Context> template <class Context>
void Operator<Context>::Release() { void Operator<Context>::Release() {
string tensor_name;
size_t ver_pos;
int version;
for (int i = 0; i < OutputSize(); i++) { for (int i = 0; i < OutputSize(); i++) {
if (Output(i)->version() >= 0) { if (Output(i)->version() >= 0) {
tensor_name = def().output(i); const auto& name = def().output(i);
ver_pos = tensor_name.find("/ver:"); auto ver_pos = name.find("/ver:");
version = std::atoi(tensor_name.substr(ver_pos + 5).c_str()); auto version = std::atoi(name.substr(ver_pos + 5).c_str());
Output(i)->set_version(version); Output(i)->set_version(version);
} }
} }
...@@ -195,8 +189,7 @@ TryCreateOperator(const string& key, const OperatorDef& def, Workspace* ws) { ...@@ -195,8 +189,7 @@ TryCreateOperator(const string& key, const OperatorDef& def, Workspace* ws) {
OperatorBase* NewOperator(const OperatorDef& def, Workspace* ws) { OperatorBase* NewOperator(const OperatorDef& def, Workspace* ws) {
auto* schema = OpSchemaRegistry::Schema(def.type()); auto* schema = OpSchemaRegistry::Schema(def.type());
if (schema) { if (schema != nullptr) {
// Check the Inputs and Outputs if necessary
CHECK(schema->Verify(def)) CHECK(schema->Verify(def))
<< "\nOperator failed to pass the schema checking."; << "\nOperator failed to pass the schema checking.";
} }
...@@ -219,7 +212,7 @@ Gradient MakeGradientForOp( ...@@ -219,7 +212,7 @@ Gradient MakeGradientForOp(
<< "not implemented."; << "not implemented.";
Gradient grad = maker->Make(); Gradient grad = maker->Make();
OperatorDef reference_def(def); OperatorDef reference_def(def);
// Map the cache key // Set the cache key
if (reference_def.has_cache_key()) { if (reference_def.has_cache_key()) {
for (int i = 0; i < grad.ops.size(); ++i) { for (int i = 0; i < grad.ops.size(); ++i) {
grad.ops[i].set_cache_key( grad.ops[i].set_cache_key(
......
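Prepare() and Release() both recover the version number embedded after the "/ver:" marker in a tensor name. A standalone sketch of that parsing step (tensor names are illustrative):

#include <cstdlib>
#include <iostream>
#include <string>

// Return the integer after the "/ver:" marker, e.g. "conv1/W/ver:3" -> 3,
// or -1 when the marker is absent.
int ParseVersion(const std::string& name) {
  const auto ver_pos = name.find("/ver:");
  if (ver_pos == std::string::npos) return -1;
  return std::atoi(name.substr(ver_pos + 5).c_str());
}

int main() {
  std::cout << ParseVersion("conv1/W/ver:3") << "\n";  // 3
  std::cout << ParseVersion("conv1/W") << "\n";        // -1
  return 0;
}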
...@@ -40,7 +40,7 @@ class DRAGON_API OperatorBase { ...@@ -40,7 +40,7 @@ class DRAGON_API OperatorBase {
} }
/*! \brief Run operator on the specified stream */ /*! \brief Run operator on the specified stream */
virtual void Run(int stream_id = 0) { virtual void Run(int stream = 0) {
NOT_IMPLEMENTED; NOT_IMPLEMENTED;
} }
...@@ -154,12 +154,12 @@ class DRAGON_API OperatorBase { ...@@ -154,12 +154,12 @@ class DRAGON_API OperatorBase {
} }
/*! \brief Set the output aliases for in-place */ /*! \brief Set the output aliases for in-place */
void set_output_aliases(const Map<string, Set<string>>& aliases_map) { void set_output_aliases(const Map<string, Set<string>>& alias_map) {
output_aliases_.resize(outputs_.size()); output_aliases_.resize(outputs_.size());
for (int i = 0; i < outputs_.size(); ++i) { for (int i = 0; i < outputs_.size(); ++i) {
auto aliases_iter = aliases_map.find(outputs_[i]->name()); const auto& it = alias_map.find(outputs_[i]->name());
if (aliases_iter != aliases_map.end()) { if (it != alias_map.end()) {
output_aliases_[i] = aliases_iter->second; output_aliases_[i] = it->second;
} else { } else {
output_aliases_[i].clear(); output_aliases_[i].clear();
} }
...@@ -196,7 +196,10 @@ template <class Context> ...@@ -196,7 +196,10 @@ template <class Context>
class DRAGON_API Operator : public OperatorBase { class DRAGON_API Operator : public OperatorBase {
public: public:
/*! \brief Default constructor */ /*! \brief Default constructor */
Operator(const OperatorDef& def, Workspace* ws); Operator(const OperatorDef& def, Workspace* ws)
: OperatorBase(def, ws),
ctx_(def.device_option()),
do_sync_(OperatorBase::Arg<bool>("do_sync", false)) {}
/*! \brief Prepare the content of inputs */ /*! \brief Prepare the content of inputs */
virtual void Prepare(); virtual void Prepare();
...@@ -207,36 +210,32 @@ class DRAGON_API Operator : public OperatorBase { ...@@ -207,36 +210,32 @@ class DRAGON_API Operator : public OperatorBase {
/*! \brief Coordinate the context of inputs and outputs */ /*! \brief Coordinate the context of inputs and outputs */
virtual void SwitchToDevice(); virtual void SwitchToDevice();
/*! \brief Implement the detailed execution */ /*! \brief The detailed execution on device */
virtual void RunOnDevice() = 0; virtual void RunOnDevice() = 0;
/*! \brief Run this operator on the specified stream */ /*! \brief Run this operator */
void Run(int stream_id = 0) final { void Run(int stream = 0) final {
if (!allow_run_) return; Prepare();
if (allow_recomp_) Prepare(); ctx()->SwitchToDevice(stream);
ctx()->SwitchToDevice(stream_id);
SwitchToDevice(); SwitchToDevice();
RunOnDevice(); RunOnDevice();
if (do_sync_ || stream_id > 0) { if (do_sync_ || stream > 0) {
ctx()->FinishDeviceComputation(); ctx()->FinishDeviceComputation();
} }
if (allow_recomp_) Release(); Release();
} }
/*! \brief Return a bool indicating the run is available */ /*! \brief Return the context */
bool allow_run() const {
return allow_run_;
}
/*! \brief Return the internal context */
Context* ctx() { Context* ctx() {
return &ctx_; return &ctx_;
} }
protected: protected:
/*! \brief Store the internal context */ /*! \brief The context */
Context ctx_; Context ctx_;
bool do_sync_, allow_run_, allow_recomp_;
/*! \brief The executing flags */
bool do_sync_;
}; };
/*! \brief Create a new operator from the raw def */ /*! \brief Create a new operator from the raw def */
...@@ -266,9 +265,8 @@ OperatorBase* NewOperator(const OperatorDef&, Workspace*); ...@@ -266,9 +265,8 @@ OperatorBase* NewOperator(const OperatorDef&, Workspace*);
using OperatorBase::def; \ using OperatorBase::def; \
using OperatorBase::ws using OperatorBase::ws
#define USE_OPERATOR_FUNCTIONS \ #define USE_OPERATOR_FUNCTIONS \
USE_OPERATOR_BASE_FUNCTIONS; \ USE_OPERATOR_BASE_FUNCTIONS; \
using Operator<Context>::allow_run; \
using Operator<Context>::ctx using Operator<Context>::ctx
#define STORE_INPUT_SPEC(i) \ #define STORE_INPUT_SPEC(i) \
...@@ -342,46 +340,46 @@ DEFINE_TENSOR_TYPES_DISPATCHER(DoRunWithType); ...@@ -342,46 +340,46 @@ DEFINE_TENSOR_TYPES_DISPATCHER(DoRunWithType);
/* Fillers */ /* Fillers */
#define TENSOR_FILL_WITH_TYPE(tensor, shape, type) \ #define TENSOR_FILL_WITH_TYPE(tensor, shape, type) \
if (tensor.count() == 0) { \ if (tensor.count() == 0) { \
CHECK(ws()->GetFiller(tensor.name())) \ auto* filler_info = ws()->GetFillerInfo(tensor.name()); \
<< "\nTensor(" << tensor.name() << ") is empty. \n" \ CHECK(filler_info) << "\nTensor(" << tensor.name() << ") is empty.\n" \
<< "may be specify a filler for it?"; \ << "May be specify a filler for it?"; \
tensor.Reshape(shape); \ tensor.Reshape(shape); \
unique_ptr<Filler<type, Context>> filler( \ unique_ptr<Filler<type, Context>> filler( \
CreateFiller<type, Context>(*ws()->GetFiller(tensor.name()))); \ CreateFiller<type, Context>(*filler_info)); \
filler->Fill(&tensor, ctx()); \ filler->Fill(&tensor, ctx()); \
} else { \ } else { \
int64_t count = 1; \ int64_t count = 1; \
for (int i = 0; i < shape.size(); i++) \ for (int i = 0; i < shape.size(); i++) \
count *= shape[i]; \ count *= shape[i]; \
CHECK_EQ(count, tensor.count()) \ CHECK_EQ(count, tensor.count()) \
<< "\nExcepted Tensor(" << tensor.name() << ")'s " \ << "\nExcepted Tensor(" << tensor.name() << ")'s " \
<< "size is " << count << ", \n" \ << "size is " << count << ", \n" \
<< "but now is " << tensor.count() << ", " \ << "but now is " << tensor.count() << ", " \
<< "did you feed the incorrect data before?"; \ << "did you feed the incorrect data before?"; \
tensor.Reshape(shape); \ tensor.Reshape(shape); \
} }
#define TENSOR_FILL(tensor, shape) \ #define TENSOR_FILL(tensor, shape) \
if (tensor.count() == 0) { \ if (tensor.count() == 0) { \
CHECK(ws()->GetFiller(tensor.name())) \ auto* filler_info = ws()->GetFillerInfo(tensor.name()); \
<< "\nTensor(" << tensor.name() << ") is empty. \n" \ CHECK(filler_info) << "\nTensor(" << tensor.name() << ") is empty.\n" \
<< "Maybe specify a filler for it?"; \ << "May be specify a filler for it?"; \
tensor.Reshape(shape); \ tensor.Reshape(shape); \
unique_ptr<Filler<T, Context>> filler( \ unique_ptr<Filler<T, Context>> filler( \
CreateFiller<T, Context>(*ws()->GetFiller(tensor.name()))); \ CreateFiller<T, Context>(*filler_info)); \
filler->Fill(&tensor, ctx()); \ filler->Fill(&tensor, ctx()); \
} else { \ } else { \
int64_t count = 1; \ int64_t count = 1; \
for (int i = 0; i < shape.size(); i++) \ for (int i = 0; i < shape.size(); i++) \
count *= shape[i]; \ count *= shape[i]; \
CHECK_EQ(count, tensor.count()) \ CHECK_EQ(count, tensor.count()) \
<< "\nExcepted Tensor(" << tensor.name() << ")'s " \ << "\nExcepted Tensor(" << tensor.name() << ")'s " \
<< "size is " << count << ", \n" \ << "size is " << count << ", \n" \
<< "but now is " << tensor.count() << ", " \ << "but now is " << tensor.count() << ", " \
<< "did you feed the incorrect data before?"; \ << "did you feed the incorrect data before?"; \
tensor.Reshape(shape); \ tensor.Reshape(shape); \
} }
/* Arguments */ /* Arguments */
......
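With allow_run_ and allow_recomp_ gone, every operator now follows the same Run() sequence: Prepare, SwitchToDevice, RunOnDevice, an optional device sync, then Release. A hedged sketch of a concrete operator under that flow (the class name and its flatten-the-shape behaviour are illustrative, and the registration macros are omitted because they do not appear in this diff):

namespace dragon {

// Sketch only: a trivial operator under the simplified Run() flow.
template <class Context>
class FlattenSketchOp final : public Operator<Context> {
 public:
  FlattenSketchOp(const OperatorDef& def, Workspace* ws)
      : Operator<Context>(def, ws) {}
  USE_OPERATOR_FUNCTIONS;

  void RunOnDevice() override {
    // Only Tensor calls visible in this diff are used here:
    // reshape output 0 to a single dimension holding Input(0)'s count.
    auto& X = this->Input(0);
    this->Output(0)->Reshape({X.count()});
  }
};

}  // namespace dragon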
...@@ -4,176 +4,150 @@ ...@@ -4,176 +4,150 @@
namespace dragon { namespace dragon {
vector<string> Workspace::tensors() const { Workspace::Workspace(const string& name) : name_(name) {
vector<string> locals;
// Search the local workspace
for (const auto& it : tensor_map_)
locals.push_back(it.first);
// Search the remote workspaces
for (const auto& it : external_tensor_map_) {
locals.push_back(it.first);
}
return locals;
}
vector<string> Workspace::graphs() const {
vector<string> names;
for (const auto& it : graph_map_) {
names.push_back(it.first);
}
return names;
}
void Workspace::Initialize() {
CreateTensor(""); // Empty placeholder CreateTensor(""); // Empty placeholder
CreateTensor("/share/flag/recomputing") CreateTensor("/share/flag/recomputing")
->Reshape({1}) ->Reshape({})
->mutable_data<bool, CPUContext>()[0] = false; ->mutable_data<bool, CPUContext>()[0] = false;
} }
void Workspace::Clear() { void Workspace::MergeFrom(Workspace* other) {
// Remove and Initialize again if (other != nullptr) {
tensor_map_.clear(); // Add the external tensors
Initialize(); for (const auto& it : other->tensor_map_) {
} if (!it.first.empty() && !str::startswith(it.first, "/")) {
external_tensor_map_[it.first] = it.second.get();
void Workspace::MergeFrom(Workspace* ws) { }
CHECK(ws) << "\nThe given Workspace is invalid."; }
for (const auto& it : ws->tensor_map_) { // Recount the unique index to avoid duplicate names
if (!it.first.empty() && !str::startswith(it.first, "/")) { for (const auto& i : other->unique_index_map_) {
external_tensor_map_[it.first] = it.second.get(); auto& index_map = unique_index_map_[i.first];
for (const auto& j : i.second) {
index_map[j.first] = std::max(index_map[j.first], j.second);
}
} }
} }
} }
string Workspace::GetTensorName(const string& name) const { Tensor* Workspace::TryGetTensor(const string& name, bool external) const {
const auto& it = alias_active_map_.find(name); // Check the alias first
if (it != alias_active_map_.end()) return it->second; const auto& alias_it = alias_map_.find(name);
return name; auto name_v2 = alias_it != alias_map_.end() ? alias_it->second : name;
} // Search this workspace
const auto& it = tensor_map_.find(name_v2);
Tensor* Workspace::TryGetTensor(const string& name, bool use_remote) const {
// Check the proxy of this tensor firstly
string query = GetTensorName(name);
// Search the local workspace
const auto& it = tensor_map_.find(query);
if (it != tensor_map_.end()) return it->second.get(); if (it != tensor_map_.end()) return it->second.get();
if (external) {
if (use_remote) { // Search external workspaces
// Search the remote workspaces const auto& it = external_tensor_map_.find(name_v2);
const auto& it = external_tensor_map_.find(query);
if (it != external_tensor_map_.end()) return it->second; if (it != external_tensor_map_.end()) return it->second;
} }
return nullptr; return nullptr;
} }
Tensor* Workspace::CreateTensor(const string& name) { Tensor* Workspace::CreateTensor(const string& name, FillerInfo* filler) {
Tensor* tensor = TryGetTensor(name); auto* tensor = TryGetTensor(name);
if (!tensor) { // Create only if name not existed
tensor_map_[name] = unique_ptr<Tensor>(new Tensor(name)); if (tensor == nullptr) {
return tensor_map_[name].get(); tensor = new Tensor(name);
tensor_map_[name] = unique_ptr<Tensor>(tensor);
}
// Maybe bind it with a filler
if (filler != nullptr) {
filler_map_[tensor->name()] = std::move(FillerInfo(*filler));
} }
return tensor; return tensor;
} }
Tensor* Workspace::GetTensor(const string& name, bool use_remote) const { Tensor* Workspace::GetTensor(const string& name, bool external) const {
Tensor* tensor = TryGetTensor(name, use_remote); auto* tensor = TryGetTensor(name, external);
CHECK(tensor) << "\nTensor(" << name << ") does not " CHECK(tensor) << "\nTensor(" << name << ") is not in current workspace.";
<< "exist in current workspace.";
return tensor; return tensor;
} }
void Workspace::ResetTensor(const string& name) { void Workspace::ResetTensor(const string& name) {
Tensor* tensor = TryGetTensor(name, false); auto* tensor = TryGetTensor(name, false);
CHECK(tensor) << "\nTensor(" << name << ") does not " CHECK(tensor) << "\nTensor(" << name << ") is not in current workspace.";
<< "belong to current workspace.";
tensor->Reset(); tensor->Reset();
} }
bool Workspace::HasFiller(const string& name) const { FillerInfo* Workspace::GetFillerInfo(const string& name) {
return tensor_filler_map_.count(name) > 0; const auto& it = filler_map_.find(name);
} if (it != filler_map_.end()) return &it->second;
void Workspace::CreateFiller(const TensorFillerProto& filler) {
CHECK_GT(filler.tensor().size(), 0)
<< "\nTensor with an empty name can not be filled.";
if (HasFiller(filler.tensor())) return;
tensor_filler_map_[filler.tensor()] = filler;
}
TensorFillerProto* Workspace::GetFiller(const string& name) {
const auto& it = tensor_filler_map_.find(name);
if (it != tensor_filler_map_.end()) return &it->second;
return nullptr; return nullptr;
} }
OperatorBase* Workspace::CreateOperator(const OperatorDef& def) {
const auto& it = operator_map_.find(def.cache_key());
if (it == operator_map_.end()) {
auto* new_op = NewOperator(def, this);
operator_map_[def.cache_key()] = unique_ptr<OperatorBase>(new_op);
return new_op;
}
return it->second.get();
}
void Workspace::RunOperator(const OperatorDef& def) { void Workspace::RunOperator(const OperatorDef& def) {
if (def.has_cache_key()) { if (def.has_cache_key()) {
CreateOperator(def)->UpdateFrom(def)->Run(0); OperatorBase* cached_op = nullptr;
const auto& it = operator_map_.find(def.cache_key());
if (it == operator_map_.end()) {
cached_op = NewOperator(def, this);
operator_map_[def.cache_key()] = unique_ptr<OperatorBase>(cached_op);
} else {
cached_op = it->second.get();
}
cached_op->UpdateFrom(def)->Run();
} else { } else {
unique_ptr<OperatorBase> op(NewOperator(def, this)); OperatorBase* temporal_op = NewOperator(def, this);
op->Run(0); temporal_op->Run();
delete temporal_op;
} }
} }
GraphBase* Workspace::CreateGraph(const GraphDef& def) { GraphBase* Workspace::CreateGraph(const GraphDef& def) {
CHECK(def.has_name()) << "\nGraph name is missing."; CHECK(def.has_name()) << "\nExpected a non-empty graph name.";
auto name = GetDummyName(def.name(), "", "Graph", false); GraphDef def_v2(def); // Copy to set a unique name
LOG(DEBUG) << "Create Graph: " << name << "(" << def.name() << ")"; def_v2.set_name(UniqueName(def.name(), "", "Graph", false));
GraphDef _def(def); LOG(DEBUG) << "Create Graph: " << def_v2.name() << "(" << def.name() << ")";
_def.set_name(name); auto* cached_graph = NewGraph(def_v2, this);
graph_map_[name] = unique_ptr<GraphBase>(NewGraph(_def, this)); graph_map_[def_v2.name()] = unique_ptr<GraphBase>(cached_graph);
return graph_map_[name].get(); return cached_graph;
} }
void Workspace::RunGraph( void Workspace::RunGraph(
const string& graph_name, const string& name,
const string& incl, const string& include,
const string& excl, const string& exclude,
int stream_id) { const int stream) {
if (!graph_map_.count(graph_name)) { CHECK(graph_map_.count(name))
LOG(FATAL) << "Graph(" << graph_name << ") does not exist."; << "\nGraph(" << name << ") is not in current workspace.";
} graph_map_[name]->Run(include, exclude, stream);
graph_map_[graph_name]->Run(incl, excl, stream_id);
} }
bool Workspace::ActivateAlias(const string& name, const string& alias) { void Workspace::RegisterAlias(const string& target, const string& alias) {
bool status = alias_active_map_.count(alias) > 0; alias_map_[alias] = target;
alias_active_map_[alias] = name;
return status; // True if activated otherwise false
} }
string Workspace::GetDummyName( string Workspace::UniqueName(
const string& base_name, const string& name,
const string& suffix, const string& suffix,
const string& domain, const string& scope,
bool zero_based) { bool zero_based) {
string accepted_name; auto& index_map = unique_index_map_[scope];
int64_t index; auto required_name = name + suffix;
const auto required_name = base_name + suffix; auto index = index_map[required_name]++;
auto& dmap = dummy_name_map_[domain]; if (index > 0) return name + "_" + str::to(index) + suffix;
while (1) { if (zero_based) return required_name;
index = dmap[required_name]++; return name + "_" + str::to(index_map[required_name]++) + suffix;
accepted_name = index ? base_name + "_" + str::to(index) + suffix }
: zero_based
? required_name vector<string> Workspace::tensors() const {
: base_name + "_" + str::to(dmap[required_name]++) + suffix; vector<string> names;
if (external_tensor_map_.empty()) break; for (const auto& it : tensor_map_) {
if (!HasTensor(accepted_name)) break; names.push_back(it.first);
} }
return accepted_name; for (const auto& it : external_tensor_map_) {
names.push_back(it.first);
}
return names;
}
vector<string> Workspace::graphs() const {
vector<string> names;
for (const auto& it : graph_map_) {
names.push_back(it.first);
}
return names;
} }
} // namespace dragon } // namespace dragon
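The zero_based flag of UniqueName() only affects the very first request for a given name within a scope; later requests always receive an increasing index. A standalone re-implementation of the logic above makes the produced sequence explicit (std::map and std::to_string stand in for the dragon Map and str::to helpers; the names are illustrative):

#include <cstdint>
#include <iostream>
#include <map>
#include <string>

// Standalone copy of Workspace::UniqueName() above.
std::string UniqueName(
    std::map<std::string, std::int64_t>& index_map,
    const std::string& name,
    const std::string& suffix,
    bool zero_based) {
  auto required_name = name + suffix;
  auto index = index_map[required_name]++;
  if (index > 0) return name + "_" + std::to_string(index) + suffix;
  if (zero_based) return required_name;
  return name + "_" + std::to_string(index_map[required_name]++) + suffix;
}

int main() {
  std::map<std::string, std::int64_t> scope_a, scope_b;
  std::cout << UniqueName(scope_a, "Graph", "", true) << "\n";  // Graph
  std::cout << UniqueName(scope_a, "Graph", "", true) << "\n";  // Graph_1
  std::cout << UniqueName(scope_b, "data", "", false) << "\n";  // data_1
  std::cout << UniqueName(scope_b, "data", "", false) << "\n";  // data_2
  return 0;
}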
...@@ -20,83 +20,63 @@ namespace dragon { ...@@ -20,83 +20,63 @@ namespace dragon {
class Workspace { class Workspace {
public: public:
/*! \brief Constructor */ /*! \brief Constructor */
explicit Workspace(const string& name) : name_(name) { DRAGON_API explicit Workspace(const string& name);
Initialize();
}
/*! \brief Create some internal tensors */
DRAGON_API void Initialize();
/*! \brief Merge tensors from a external workspace */ /*! \brief Merge resources from another workspace */
DRAGON_API void MergeFrom(Workspace*); DRAGON_API void MergeFrom(Workspace*);
/*! \brief Destory all the tensors */ /* \brief Return a unique name */
DRAGON_API void Clear(); DRAGON_API string UniqueName(
const string& name,
/* \brief Return a unique dummy name within this workspace */
DRAGON_API string GetDummyName(
const string& base_name,
const string& suffix, const string& suffix,
const string& domain = "", const string& scope = "",
bool zero_based = true); const bool zero_based = false);
/*! \brief Whether the specified tensor is in this workspace */ /* \brief Register an alias for the target */
DRAGON_API bool HasTensor(const string& name, bool use_remote = true) const { DRAGON_API void RegisterAlias(const string& target, const string& alias);
return TryGetTensor(name, use_remote) ? true : false;
}
/*! \brief Query the real name of specified tensor */ /*! \brief Return whether the tensor exists */
DRAGON_API string GetTensorName(const string&) const; DRAGON_API bool HasTensor(const string& name, bool external = true) const {
return TryGetTensor(name, external) == nullptr ? false : true;
/* \brief Activate an alias for the target */ }
DRAGON_API bool ActivateAlias(const string& name, const string& alias);
/*! \brief Create a tensor in this workspace */ /*! \brief Create the tensor */
DRAGON_API Tensor* CreateTensor(const string&); DRAGON_API Tensor* CreateTensor(const string&, FillerInfo* = nullptr);
/*! \brief Try to search the specified tensor in this workspace */ /*! \brief Try to return the tensor */
DRAGON_API Tensor* TryGetTensor(const string&, bool = true) const; DRAGON_API Tensor* TryGetTensor(const string&, bool = true) const;
/*! \brief Return the specified tensor */ /*! \brief Return the tensor */
DRAGON_API Tensor* GetTensor(const string&, bool = true) const; DRAGON_API Tensor* GetTensor(const string&, bool = true) const;
/*! \brief Reset the specified tensor */ /*! \brief Reset the tensor */
DRAGON_API void ResetTensor(const string&); DRAGON_API void ResetTensor(const string&);
/* \brief Whether the specified filler is existing */ /*! \brief Return the filler info */
DRAGON_API bool HasFiller(const string&) const; DRAGON_API FillerInfo* GetFillerInfo(const string&);
/*! \brief Create a filler in this workspace */
DRAGON_API void CreateFiller(const TensorFillerProto&);
/*! \brief Return the specified filler */ /*! \brief Run the operator */
DRAGON_API TensorFillerProto* GetFiller(const string&);
/*! \brief Create an operator in this workspace */
DRAGON_API OperatorBase* CreateOperator(const OperatorDef&);
/*! \brief Run an operator in this workspace */
DRAGON_API void RunOperator(const OperatorDef&); DRAGON_API void RunOperator(const OperatorDef&);
/*! \brief Create a graph in this workspace */ /*! \brief Create the graph */
DRAGON_API GraphBase* CreateGraph(const GraphDef&); DRAGON_API GraphBase* CreateGraph(const GraphDef&);
/*! \brief Run the specifed graph by name and rules */ /*! \brief Run the graph */
DRAGON_API void RunGraph( DRAGON_API void RunGraph(
const string& graph_name, const string& graph_name,
const string& incl = "", const string& include = "",
const string& excl = "", const string& exclude = "",
int stream_id = 0); const int stream = 0);
/*! \brief Return the name of this workspace */ /*! \brief Return the workspace name */
const string& name() { const string& name() {
return name_; return name_;
} }
/*! \brief Return the name of stored tensors */ /*! \brief Return the name of cached tensors */
DRAGON_API vector<string> tensors() const; DRAGON_API vector<string> tensors() const;
/*! \brief Return the name of stored graphs */ /*! \brief Return the name of cached graphs */
DRAGON_API vector<string> graphs() const; DRAGON_API vector<string> graphs() const;
/*! \brief Provide a group of the shared byte data */ /*! \brief Provide a group of the shared byte data */
...@@ -127,28 +107,28 @@ class Workspace { ...@@ -127,28 +107,28 @@ class Workspace {
} }
private: private:
/*! \brief The unique workspace name */ /*! \brief The workspace name */
string name_; string name_;
/*! \brief The dummy name indices */ /*! \brief The external tensors */
Map<string, Map<string, int64_t>> dummy_name_map_; Map<string, Tensor*> external_tensor_map_;
/*! \brief Store the created tensors */ /*! \brief The unique indices */
Map<string, unique_ptr<Tensor>> tensor_map_; Map<string, Map<string, int64_t>> unique_index_map_;
/*! \brief Store the external tensors */ /*! \brief The registered fillers */
Map<string, Tensor*> external_tensor_map_; Map<string, FillerInfo> filler_map_;
/*! \brief Store the registered tensor fillers */ /*! \brief The registered aliases */
Map<string, TensorFillerProto> tensor_filler_map_; Map<string, string> alias_map_;
/*! \brief Store the active aliases */ /*! \brief The cached tensors */
Map<string, string> alias_active_map_; Map<string, unique_ptr<Tensor>> tensor_map_;
/*! \brief Store the registered operators for dynamic graph */ /*! \brief The cached operators */
Map<string, unique_ptr<OperatorBase>> operator_map_; Map<string, unique_ptr<OperatorBase>> operator_map_;
/*! \brief Store the registered graphs for static graph */ /*! \brief The cached graphs */
Map<string, unique_ptr<GraphBase>> graph_map_; Map<string, unique_ptr<GraphBase>> graph_map_;
}; };
......
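A minimal sketch of the trimmed-down Workspace interface, assuming the header diffed above; the workspace and tensor names are illustrative:

#include <iostream>

#include "dragon/core/workspace.h"

int main() {
  dragon::Workspace ws("sketch");
  auto* x = ws.CreateTensor("x");  // created lazily, owned by the workspace
  ws.RegisterAlias("x", "x0");     // "x0" now resolves to "x"
  std::cout << (ws.GetTensor("x0") == x) << "\n";  // 1
  std::cout << ws.HasTensor("y") << "\n";          // 0
  return 0;
}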
...@@ -425,7 +425,7 @@ void PReluWGrad<float16, CUDAContext>( ...@@ -425,7 +425,7 @@ void PReluWGrad<float16, CUDAContext>(
CUDA_THREADS, CUDA_THREADS,
0, 0,
ctx->cuda_stream()>>>( ctx->cuda_stream()>>>(
N * C * S, N * S,
C, C,
S, S,
reinterpret_cast<const half*>(dy), reinterpret_cast<const half*>(dy),
...@@ -437,7 +437,7 @@ void PReluWGrad<float16, CUDAContext>( ...@@ -437,7 +437,7 @@ void PReluWGrad<float16, CUDAContext>(
CUDA_THREADS, CUDA_THREADS,
0, 0,
ctx->cuda_stream()>>>( ctx->cuda_stream()>>>(
N * C * S, N * S,
C, C,
reinterpret_cast<const half*>(dy), reinterpret_cast<const half*>(dy),
reinterpret_cast<const half*>(x), reinterpret_cast<const half*>(x),
...@@ -536,13 +536,13 @@ void PReluWGrad<float16, CUDAContext>( ...@@ -536,13 +536,13 @@ void PReluWGrad<float16, CUDAContext>(
CUDA_2D_BLOCKS(C), \ CUDA_2D_BLOCKS(C), \
CUDA_THREADS, \ CUDA_THREADS, \
0, \ 0, \
ctx->cuda_stream()>>>(N * C * S, C, S, dy, x, dw); \ ctx->cuda_stream()>>>(N * S, C, S, dy, x, dw); \
} else if (data_format == "NHWC") { \ } else if (data_format == "NHWC") { \
_PReluWGradNHWC<<< \ _PReluWGradNHWC<<< \
CUDA_2D_BLOCKS(C), \ CUDA_2D_BLOCKS(C), \
CUDA_THREADS, \ CUDA_THREADS, \
0, \ 0, \
ctx->cuda_stream()>>>(N * C * S, C, dy, x, dw); \ ctx->cuda_stream()>>>(N * S, C, dy, x, dw); \
} else { \ } else { \
LOG(FATAL) << "Unknown data format: " << data_format; \ LOG(FATAL) << "Unknown data format: " << data_format; \
} \ } \
......
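The launch change above passes the per-channel element count (N * S) instead of the full tensor size (N * C * S), since each of the C kernel rows reduces over a single channel. A CPU reference for the NCHW weight gradient, assuming the standard PReLU definition y = max(0, x) + w * min(0, x), shows that reduction:

#include <vector>

// dw[c] = sum over n, s of dy * x at the positions where x < 0 (NCHW),
// i.e. each channel reduces over exactly N * S elements.
void PReluWGradNCHWReference(
    int N, int C, int S,
    const std::vector<float>& dy,
    const std::vector<float>& x,
    std::vector<float>& dw) {
  dw.assign(C, 0.f);
  for (int c = 0; c < C; ++c) {
    for (int n = 0; n < N; ++n) {
      for (int s = 0; s < S; ++s) {
        const int i = (n * C + c) * S + s;
        if (x[i] < 0.f) dw[c] += dy[i] * x[i];
      }
    }
  }
}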
...@@ -25,7 +25,6 @@ ...@@ -25,7 +25,6 @@
#include "dragon/core/workspace.h" #include "dragon/core/workspace.h"
#include "dragon/modules/python/types.h" #include "dragon/modules/python/types.h"
#include "dragon/onnx/onnx_backend.h" #include "dragon/onnx/onnx_backend.h"
#include "dragon/utils/caffemodel.h"
#include <pybind11/pybind11.h> #include <pybind11/pybind11.h>
#include <pybind11/stl.h> #include <pybind11/stl.h>
......
...@@ -74,7 +74,7 @@ class DLPackWrapper { ...@@ -74,7 +74,7 @@ class DLPackWrapper {
} }
Tensor* From(py::object obj) { Tensor* From(py::object obj) {
CHECK(PyCapsule_CheckExact(obj.ptr())) << "\nExpected DLPack capsule"; CHECK(PyCapsule_CheckExact(obj.ptr())) << "\nExpected DLPack capsule.";
auto* managed_tensor = auto* managed_tensor =
(DLManagedTensor*)PyCapsule_GetPointer(obj.ptr(), "dltensor"); (DLManagedTensor*)PyCapsule_GetPointer(obj.ptr(), "dltensor");
CHECK(managed_tensor) << "\nInvalid DLPack capsule"; CHECK(managed_tensor) << "\nInvalid DLPack capsule";
......
...@@ -44,48 +44,38 @@ PYBIND11_MODULE(libdragon_python, m) { ...@@ -44,48 +44,38 @@ PYBIND11_MODULE(libdragon_python, m) {
/*! \brief Return the name of stored graphs */ /*! \brief Return the name of stored graphs */
.def_property_readonly("graphs", &Workspace::graphs) .def_property_readonly("graphs", &Workspace::graphs)
/*! \brief Destory all the tensors */ /*! \brief Merge resources from another workspace */
.def("Clear", &Workspace::Clear)
/*! \brief Merge a external workspace into self */
.def("MergeFrom", &Workspace::MergeFrom) .def("MergeFrom", &Workspace::MergeFrom)
/*! \brief Return a unique dummy name */ /*! \brief Return a unique name */
.def("GetDummyName", &Workspace::GetDummyName) .def("UniqueName", &Workspace::UniqueName)
/*! \brief Return the unique name of given tensor */
.def("GetTensorName", &Workspace::GetTensorName)
/*! \brief Reset a tensor with the given name */ /*! \brief Reset the tensor */
.def("ResetTensor", &Workspace::ResetTensor) .def("ResetTensor", &Workspace::ResetTensor)
/*! \brief Indicate whether the given tensor is existing */ /*! \brief Return whether the tensor exists */
.def( .def(
"HasTensor", "HasTensor",
[](Workspace* self, const string& name) { [](Workspace* self, const string& name) {
return self->HasTensor(name); return self->HasTensor(name);
}) })
/*! \brief Create a tensor with the given name */ /*! \brief Create the tensor */
.def( .def(
"CreateTensor", "CreateTensor",
[](Workspace* self, const string& name) { [](Workspace* self, const string& name, const string& filler_str) {
if (!filler_str.empty()) {
FillerInfo filler_info;
if (!filler_info.ParseFromString(filler_str)) {
LOG(FATAL) << "Failed to parse the FillerInfo.";
}
return self->CreateTensor(name, &filler_info);
}
return self->CreateTensor(name); return self->CreateTensor(name);
}, },
py::return_value_policy::reference_internal) py::return_value_policy::reference_internal)
/*! \brief Create a tensor from the specified filler */ /*! \brief Return the tensor */
.def(
"CreateFiller",
[](Workspace* self, const string& serialized) {
TensorFillerProto filler_proto;
if (!filler_proto.ParseFromString(serialized))
LOG(FATAL) << "Failed to parse the TensorFiller.";
self->CreateFiller(filler_proto);
self->CreateTensor(filler_proto.tensor());
})
/*! \brief Return the CXX Tensor reference */
.def( .def(
"GetTensor", "GetTensor",
[](Workspace* self, const string& name) { [](Workspace* self, const string& name) {
...@@ -93,11 +83,11 @@ PYBIND11_MODULE(libdragon_python, m) { ...@@ -93,11 +83,11 @@ PYBIND11_MODULE(libdragon_python, m) {
}, },
py::return_value_policy::reference_internal) py::return_value_policy::reference_internal)
/* \brief Set an alias for the tensor */ /* \brief Register an alias for the name */
.def( .def(
"SetTensorAlias", "RegisterAlias",
[](Workspace* self, const string& name, const string& alias) { [](Workspace* self, const string& name, const string& alias) {
return self->ActivateAlias(name, alias); return self->RegisterAlias(name, alias);
}) })
/*! \brief Copy the array data to tensor */ /*! \brief Copy the array data to tensor */
...@@ -118,7 +108,7 @@ PYBIND11_MODULE(libdragon_python, m) { ...@@ -118,7 +108,7 @@ PYBIND11_MODULE(libdragon_python, m) {
dev, reinterpret_cast<PyArrayObject*>(value.ptr()), tensor); dev, reinterpret_cast<PyArrayObject*>(value.ptr()), tensor);
}) })
/*! \brief Copy the tensor data to the array */ /*! \brief Copy the tensor data to array */
.def( .def(
"FetchTensor", "FetchTensor",
[](Workspace* self, const string& name) { [](Workspace* self, const string& name) {
...@@ -142,7 +132,7 @@ PYBIND11_MODULE(libdragon_python, m) { ...@@ -142,7 +132,7 @@ PYBIND11_MODULE(libdragon_python, m) {
} }
}) })
/*! \brief Run a operator from the def reference */ /*! \brief Run the operator */
.def( .def(
"RunOperator", "RunOperator",
[](Workspace* self, OperatorDef* def, const bool verbose) { [](Workspace* self, OperatorDef* def, const bool verbose) {
...@@ -156,7 +146,7 @@ PYBIND11_MODULE(libdragon_python, m) { ...@@ -156,7 +146,7 @@ PYBIND11_MODULE(libdragon_python, m) {
self->RunOperator(*def); self->RunOperator(*def);
}) })
/*! \brief Run operators from the def reference */ /*! \brief Run the operators */
.def( .def(
"RunOperator", "RunOperator",
[](Workspace* self, vector<OperatorDef*>& defs, const bool verbose) { [](Workspace* self, vector<OperatorDef*>& defs, const bool verbose) {
...@@ -172,7 +162,7 @@ PYBIND11_MODULE(libdragon_python, m) { ...@@ -172,7 +162,7 @@ PYBIND11_MODULE(libdragon_python, m) {
} }
}) })
/*! \brief Run a operator from the serialized def */ /*! \brief Run the operator from the serialized def */
.def( .def(
"RunOperator", "RunOperator",
[](Workspace* self, const string& serialized, const bool verbose) { [](Workspace* self, const string& serialized, const bool verbose) {
...@@ -188,7 +178,7 @@ PYBIND11_MODULE(libdragon_python, m) { ...@@ -188,7 +178,7 @@ PYBIND11_MODULE(libdragon_python, m) {
self->RunOperator(def); self->RunOperator(def);
}) })
/*! \brief Create a graph from the serialized def */ /*! \brief Create the graph */
.def( .def(
"CreateGraph", "CreateGraph",
[](Workspace* self, const string& serialized, const bool verbose) { [](Workspace* self, const string& serialized, const bool verbose) {
...@@ -213,89 +203,49 @@ PYBIND11_MODULE(libdragon_python, m) { ...@@ -213,89 +203,49 @@ PYBIND11_MODULE(libdragon_python, m) {
return graph->name(); return graph->name();
}) })
/*! \brief Run an existing graph */ /*! \brief Run the graph */
.def( .def(
"RunGraph", "RunGraph",
[](Workspace* self, [](Workspace* self,
const string& name, const string& name,
const string& incl, const string& include,
const string& excl) { const string& exclude) {
py::gil_scoped_release g; py::gil_scoped_release g;
self->RunGraph(name, incl, excl); self->RunGraph(name, include, exclude);
}) })
/*! \brief Run the backward */
.def( .def(
"RunBackward", "RunBackward",
[](Workspace* self, [](Workspace* self,
const vector<OperatorDef*>& forward_ops, const vector<OperatorDef*>& op_defs,
const vector<string>& targets, const vector<string>& targets,
const vector<string>& sources, const vector<string>& sources,
const vector<string>& input_grads, const vector<string>& input_grads,
const vector<string>& ignored_grads, const vector<string>& empty_grads,
const bool is_sharing, const bool retain_grads,
const bool verbose) { const bool verbose) {
GraphDef backward_ops; GraphDef graph_def;
GraphGradientMaker maker; GraphGradientMaker maker;
for (const auto& name : ignored_grads) { for (const auto& name : empty_grads) {
maker.add_ignored_grad(name); maker.add_empty_grad(name);
} }
for (const auto& name : sources) { for (const auto& name : sources) {
maker.add_hooked_grad(name + "_grad"); maker.add_retained_grad(name + "_grad");
} }
maker.Make(forward_ops, targets, input_grads, backward_ops); maker.Make(op_defs, targets, input_grads, graph_def);
py::gil_scoped_release g; py::gil_scoped_release g;
if (is_sharing) { if (!retain_grads) {
backward_ops = maker.Share(backward_ops); graph_def = maker.Share(graph_def);
} }
for (const auto& def : backward_ops.op()) { for (const auto& op_def : graph_def.op()) {
if (verbose) { if (verbose) {
auto msg = string("\n") + def.DebugString(); auto msg = string("\n") + op_def.DebugString();
msg.pop_back(); msg.pop_back();
PRINT(INFO) PRINT(INFO)
<< "op {" << str::replace_all(msg, "\n", "\n ") << "\n}\n"; << "op {" << str::replace_all(msg, "\n", "\n ") << "\n}\n";
} }
self->RunOperator(def); self->RunOperator(op_def);
}
})
/*! \brief Serialize tensors into a binary file */
.def(
"Save",
[](Workspace* self,
const string& filename,
const vector<string>& tensors,
const int format) {
vector<Tensor*> refs;
switch (format) {
case 0: // Pickle
LOG(FATAL) << "Format depends on Pickle. "
<< "Can't be used in C++.";
break;
case 1: // CaffeModel
for (const auto& name : tensors) {
refs.emplace_back(self->GetTensor(name));
}
SavaCaffeModel(filename, refs);
break;
default:
LOG(FATAL) << "Unknown format, code: " << format;
}
})
/*! \brief Load tensors from a binary file */
.def(
"Load",
[](Workspace* self, const string& filename, const int format) {
switch (format) {
case 0: // Pickle
LOG(FATAL) << "Format depends on Pickle. "
<< "Can't be used in C++.";
break;
case 1: // CaffeModel
LoadCaffeModel(filename, self);
break;
default:
LOG(FATAL) << "Unknown format, code: " << format;
} }
}) })
......
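All of the bindings above share the same pybind11 shape: a lambda whose first argument is the captured Workspace pointer. A minimal standalone module in that style (the module name and the toy stand-in type are illustrative, not part of dragon):

#include <pybind11/pybind11.h>

#include <string>

namespace py = pybind11;

// Toy stand-in type, just to show the binding shape used above.
struct WorkspaceSketch {
  std::string name;
};

PYBIND11_MODULE(workspace_sketch, m) {
  py::class_<WorkspaceSketch>(m, "Workspace")
      .def(py::init<>())
      .def(
          "HasTensor",
          [](WorkspaceSketch* self, const std::string& name) {
            return self->name == name;  // placeholder lookup
          });
}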
...@@ -20,7 +20,6 @@ PythonPluginInferOp<Context>::PythonPluginInferOp( ...@@ -20,7 +20,6 @@ PythonPluginInferOp<Context>::PythonPluginInferOp(
class_name_(OpArg<string>("class_name", "")), class_name_(OpArg<string>("class_name", "")),
kwargs_str_((OpArg<string>("kwargs_str", ""))) { kwargs_str_((OpArg<string>("kwargs_str", ""))) {
// Optimization for all python ops // Optimization for all python ops
if (!allow_run()) return;
this->do_sync_ = false; this->do_sync_ = false;
// Initialize interpreter and load module // Initialize interpreter and load module
......
...@@ -24,6 +24,9 @@ namespace tensor { ...@@ -24,6 +24,9 @@ namespace tensor {
void RegisterModule(py::module& m) { void RegisterModule(py::module& m) {
/*! \brief Export the Tensor class */ /*! \brief Export the Tensor class */
py::class_<Tensor>(m, "Tensor") py::class_<Tensor>(m, "Tensor")
/*! \brief Return the tensor name */
.def_property_readonly("name", &Tensor::name)
/*! \brief Return the number of dimensions */ /*! \brief Return the number of dimensions */
.def_property_readonly("ndim", &Tensor::ndim) .def_property_readonly("ndim", &Tensor::ndim)
...@@ -46,9 +49,9 @@ void RegisterModule(py::module& m) { ...@@ -46,9 +49,9 @@ void RegisterModule(py::module& m) {
"device", "device",
[](Tensor* self) { [](Tensor* self) {
if (self->has_memory()) { if (self->has_memory()) {
auto mem_info = self->memory()->info(); auto info = self->memory()->info();
return std::tuple<string, int>( return std::tuple<string, int>(
mem_info["device_type"], atoi(mem_info["device_id"].c_str())); info["device_type"], atoi(info["device_id"].c_str()));
} else { } else {
return std::tuple<string, int>("Unknown", 0); return std::tuple<string, int>("Unknown", 0);
} }
......
...@@ -119,8 +119,6 @@ DRAGON_API void DestroyGraphDef(GraphDef_t graph_def); ...@@ -119,8 +119,6 @@ DRAGON_API void DestroyGraphDef(GraphDef_t graph_def);
* Model API * Model API
*/ */
DRAGON_API void LoadCaffeModel(const std::string& model_file, Workspace_t ws);
DRAGON_API void LoadONNXModel( DRAGON_API void LoadONNXModel(
const std::string& model_file, const std::string& model_file,
GraphDef_t init_graph, GraphDef_t init_graph,
......
#include "dragon/core/common.h" #include "dragon/core/common.h"
#include "dragon/modules/runtime/dragon_runtime.h" #include "dragon/modules/runtime/dragon_runtime.h"
#include "dragon/onnx/onnx_backend.h" #include "dragon/onnx/onnx_backend.h"
#include "dragon/utils/caffemodel.h"
#include "dragon/utils/proto_utils.h" #include "dragon/utils/proto_utils.h"
namespace dragon { namespace dragon {
...@@ -161,46 +160,6 @@ DRAGON_API void DestroyGraphDef(GraphDef_t graph_def) { ...@@ -161,46 +160,6 @@ DRAGON_API void DestroyGraphDef(GraphDef_t graph_def) {
if (graph_def) delete graph_def; if (graph_def) delete graph_def;
} }
void LoadCaffeModel(const string& model_file, Workspace_t ws) {
NetParameter net_param;
ReadProtoFromBinaryFile(model_file.c_str(), &net_param);
std::string scope = "";
LOG(INFO) << "Load Model: " << model_file << "......";
LOG(INFO) << "Format: Caffe";
for (int i = 0; i < net_param.layer_size(); i++) {
const LayerParameter& layer = net_param.layer(i);
const string& layer_name = layer.name();
string prefix = scope + layer_name + "/param:";
for (int j = 0; j < layer.blobs_size(); j++) {
string tensor_name = prefix + std::to_string(j);
if (!ws->HasTensor(tensor_name)) ws->CreateTensor(tensor_name);
BlobProto blob = layer.blobs(j);
vector<int64_t> dims;
for (auto dim : blob.shape().dim())
dims.push_back(dim);
Tensor* tensor = ws->GetTensor(tensor_name);
std::stringstream DimString;
if (dims.size() > 0) {
tensor->Reshape(dims);
CHECK_EQ(tensor->count(), blob.data_size())
<< "Tensor(" << tensor_name << ") "
<< "failed to load, except size: " << tensor->count()
<< ", loaded " << blob.data_size();
DimString << tensor->DimString();
} else {
tensor->Reshape(vector<int64_t>(1, blob.data_size()));
DimString << "(missing)";
}
float* Xdata = tensor->mutable_data<float, CPUContext>();
for (int idx = 0; idx < blob.data_size(); idx++)
Xdata[idx] = blob.data(idx);
LOG(INFO) << "Tensor(" << tensor_name << ") "
<< "loaded, shape: " << DimString.str()
<< ", size: " << blob.data_size();
}
}
}
void LoadONNXModel( void LoadONNXModel(
const string& model_file, const string& model_file,
GraphDef_t init_graph, GraphDef_t init_graph,
......
...@@ -19,7 +19,6 @@ ONNXImporterReturns ONNXBackend::ArgReduceImporter( ...@@ -19,7 +19,6 @@ ONNXImporterReturns ONNXBackend::ArgReduceImporter(
auto node = NodeProto(onnx_node->node); auto node = NodeProto(onnx_node->node);
auto onnx_node_v2 = ONNXNode(node); auto onnx_node_v2 = ONNXNode(node);
auto& attributes = onnx_node_v2.attributes; auto& attributes = onnx_node_v2.attributes;
// Determine the operation // Determine the operation
auto* operation = attributes.AddRewrittenAttribute("operation"); auto* operation = attributes.AddRewrittenAttribute("operation");
if (onnx_node->node.op_type() == "ArgMax") { if (onnx_node->node.op_type() == "ArgMax") {
...@@ -27,7 +26,6 @@ ONNXImporterReturns ONNXBackend::ArgReduceImporter( ...@@ -27,7 +26,6 @@ ONNXImporterReturns ONNXBackend::ArgReduceImporter(
} else if (onnx_node->node.op_type() == "ArgMin") { } else if (onnx_node->node.op_type() == "ArgMin") {
operation->set_s("MIN"); operation->set_s("MIN");
} }
return GenericImporter(&onnx_node_v2, ctx); return GenericImporter(&onnx_node_v2, ctx);
} }
...@@ -37,17 +35,13 @@ ONNXImporterReturns ONNXBackend::ATenImporter( ...@@ -37,17 +35,13 @@ ONNXImporterReturns ONNXBackend::ATenImporter(
auto node = NodeProto(onnx_node->node); auto node = NodeProto(onnx_node->node);
auto onnx_node_v2 = ONNXNode(node); auto onnx_node_v2 = ONNXNode(node);
auto& attributes = onnx_node_v2.attributes; auto& attributes = onnx_node_v2.attributes;
auto op_type = attributes.get<string>("op_type", ""); auto op_type = attributes.get<string>("op_type", "");
if (op_type.empty()) { if (op_type.empty()) {
LOG(FATAL) << "op_type is required to evolve " LOG(FATAL) << "op_type is required to evolve "
<< "to the specific operator."; << "to the specific operator.";
} }
node.set_op_type(op_type); node.set_op_type(op_type);
attributes.remove("op_type"); attributes.remove("op_type");
return GenericImporter(&onnx_node_v2, ctx); return GenericImporter(&onnx_node_v2, ctx);
} }
...@@ -56,17 +50,13 @@ ONNXImporterReturns ONNXBackend::BatchNormImporter( ...@@ -56,17 +50,13 @@ ONNXImporterReturns ONNXBackend::BatchNormImporter(
const ConversionContext& ctx) { const ConversionContext& ctx) {
auto node = NodeProto(onnx_node->node); auto node = NodeProto(onnx_node->node);
auto onnx_node_v2 = ONNXNode(node); auto onnx_node_v2 = ONNXNode(node);
auto& attributes = onnx_node_v2.attributes; auto& attributes = onnx_node_v2.attributes;
// Enforce to NCHW format // Enforce to NCHW format
attributes.AddRewrittenAttribute("axis")->set_i(1); attributes.AddRewrittenAttribute("axis")->set_i(1);
// Remove dummy attributes // Remove dummy attributes
attributes.remove("consumed_inputs"); attributes.remove("consumed_inputs");
attributes.remove("is_test"); attributes.remove("is_test");
attributes.remove("spatial"); attributes.remove("spatial");
return GenericImporter(&onnx_node_v2, ctx); return GenericImporter(&onnx_node_v2, ctx);
} }
...@@ -74,12 +64,10 @@ ONNXImporterReturns ONNXBackend::CastImporter( ...@@ -74,12 +64,10 @@ ONNXImporterReturns ONNXBackend::CastImporter(
ONNXNode* onnx_node, ONNXNode* onnx_node,
const ConversionContext& ctx) { const ConversionContext& ctx) {
auto& attributes = onnx_node->attributes; auto& attributes = onnx_node->attributes;
// Determine the dtype // Determine the dtype
auto* dtype = attributes.AddRewrittenAttribute("dtype"); auto* dtype = attributes.AddRewrittenAttribute("dtype");
auto onnx_dtype = attributes.get<int64_t>("to", TensorProto::UNDEFINED); auto onnx_dtype = attributes.get<int64_t>("to", TensorProto::UNDEFINED);
auto supported_dtype = true; auto supported_dtype = true;
switch (onnx_dtype) { switch (onnx_dtype) {
case ONNX_NAMESPACE::TensorProto::BOOL: case ONNX_NAMESPACE::TensorProto::BOOL:
dtype->set_s("bool"); dtype->set_s("bool");
...@@ -138,11 +126,9 @@ ONNXImporterReturns ONNXBackend::CastImporter( ...@@ -138,11 +126,9 @@ ONNXImporterReturns ONNXBackend::CastImporter(
supported_dtype = false; supported_dtype = false;
break; break;
}; };
CHECK(supported_dtype) << "\nCasting to " << dtype->s() CHECK(supported_dtype) << "\nCasting to " << dtype->s()
<< " is not supported."; << " is not supported.";
attributes.remove("to"); attributes.remove("to");
return GenericImporter(onnx_node, ctx); return GenericImporter(onnx_node, ctx);
} }
...@@ -151,17 +137,16 @@ ONNXImporterReturns ONNXBackend::ConvPoolImporter( ...@@ -151,17 +137,16 @@ ONNXImporterReturns ONNXBackend::ConvPoolImporter(
const ConversionContext& ctx) { const ConversionContext& ctx) {
auto& attributes = onnx_node->attributes; auto& attributes = onnx_node->attributes;
const auto onnx_op_type = onnx_node->node.op_type(); const auto onnx_op_type = onnx_node->node.op_type();
// Determine the padding // Determine the padding
auto mode = attributes.get<string>("auto_pad"); auto mode = attributes.get<string>("auto_pad");
auto* padding = attributes.AddRewrittenAttribute("padding"); auto* padding = attributes.AddRewrittenAttribute("padding");
// SAME, SAME_LOWER, or SAME_UPPER // SAME, SAME_LOWER, or SAME_UPPER
if (str::find(mode, "SAME")) if (str::find(mode, "SAME")) {
padding->set_s(mode); padding->set_s(mode);
else } else {
padding->set_s("VALID"); // Use explicit pads padding->set_s("VALID"); // Use explicit pads
}
attributes.remove("auto_pad"); attributes.remove("auto_pad");
// Determine the pooling mode // Determine the pooling mode
if (onnx_op_type == "MaxPool") { if (onnx_op_type == "MaxPool") {
attributes.AddRewrittenAttribute("mode")->set_s("MAX"); attributes.AddRewrittenAttribute("mode")->set_s("MAX");
...@@ -174,14 +159,11 @@ ONNXImporterReturns ONNXBackend::ConvPoolImporter( ...@@ -174,14 +159,11 @@ ONNXImporterReturns ONNXBackend::ConvPoolImporter(
attributes.AddRewrittenAttribute("mode")->set_s("AVG"); attributes.AddRewrittenAttribute("mode")->set_s("AVG");
attributes.AddRewrittenAttribute("global_pooling")->set_i(1); attributes.AddRewrittenAttribute("global_pooling")->set_i(1);
} }
auto returns = GenericImporter(onnx_node, ctx); auto returns = GenericImporter(onnx_node, ctx);
// Determine the op type // Determine the op type
OperatorDef* op_def = returns.GetOp(0); OperatorDef* op_def = returns.GetOp(0);
auto ks = attributes.get<ONNX_INTS>("kernel_shape"); auto ks = attributes.get<ONNX_INTS>("kernel_shape");
*(op_def->mutable_type()) += (str::to(ks.size() > 0 ? ks.size() : 2) + "d"); *(op_def->mutable_type()) += (str::to(ks.size() > 0 ? ks.size() : 2) + "d");
return returns; return returns;
} }
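For reference, a minimal Python sketch of the decision above: the auto_pad string maps straight to the backend padding, and the length of kernel_shape picks the "Nd" suffix appended to the op type. The helper name and return shape are illustrative, not part of this diff.

def resolve_conv_pool(auto_pad, kernel_shape):
    # SAME, SAME_LOWER or SAME_UPPER pass through; anything else means explicit pads.
    padding = auto_pad if 'SAME' in (auto_pad or '') else 'VALID'
    ndim = len(kernel_shape) if kernel_shape else 2
    return padding, '%dd' % ndim

assert resolve_conv_pool('SAME_UPPER', [3, 3]) == ('SAME_UPPER', '2d')
assert resolve_conv_pool('', []) == ('VALID', '2d')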
...@@ -194,11 +176,9 @@ ONNXImporterReturns ONNXBackend::GenericImporter( ...@@ -194,11 +176,9 @@ ONNXImporterReturns ONNXBackend::GenericImporter(
op_def->mutable_input()->MergeFrom(node.input()); op_def->mutable_input()->MergeFrom(node.input());
op_def->mutable_output()->MergeFrom(node.output()); op_def->mutable_output()->MergeFrom(node.output());
op_def->set_name(node.name()); op_def->set_name(node.name());
const auto onnx_op_type = node.op_type(); const auto onnx_op_type = node.op_type();
op_def->set_type( op_def->set_type(
get_default(get_renamed_nodes(), onnx_op_type, onnx_op_type)); get_default(get_renamed_nodes(), onnx_op_type, onnx_op_type));
auto mapper = [&, this](const std::string& k) { auto mapper = [&, this](const std::string& k) {
const auto it = get_node_renamed_attrs().find(onnx_op_type); const auto it = get_node_renamed_attrs().find(onnx_op_type);
if (it != get_node_renamed_attrs().end()) { if (it != get_node_renamed_attrs().end()) {
...@@ -224,18 +204,16 @@ ONNXImporterReturns ONNXBackend::GemmImporter( ...@@ -224,18 +204,16 @@ ONNXImporterReturns ONNXBackend::GemmImporter(
auto alpha = attributes.get<float>("alpha", 1.f); auto alpha = attributes.get<float>("alpha", 1.f);
auto beta = attributes.get<float>("beta", 1.f); auto beta = attributes.get<float>("beta", 1.f);
auto trans_a = attributes.get<int64_t>("transA", 0L); auto trans_a = attributes.get<int64_t>("transA", 0L);
// Remove the unsupported attributes
if (alpha != 1.f || beta != 1.f) { if (alpha != 1.f || beta != 1.f) {
LOG(FATAL) << "alpha/beta can not be set currently."; LOG(FATAL) << "alpha/beta can not be set currently.";
} }
if (trans_a) { if (trans_a) {
LOG(FATAL) << "Tranposed A is not supported currently."; LOG(FATAL) << "Tranposed A is not supported currently.";
} }
attributes.remove("alpha"); attributes.remove("alpha");
attributes.remove("beta"); attributes.remove("beta");
attributes.remove("transA"); attributes.remove("transA");
return GenericImporter(onnx_node, ctx); return GenericImporter(onnx_node, ctx);
} }
...@@ -244,11 +222,9 @@ ONNXImporterReturns ONNXBackend::MaxRoiPoolImporter( ...@@ -244,11 +222,9 @@ ONNXImporterReturns ONNXBackend::MaxRoiPoolImporter(
const ConversionContext& ctx) { const ConversionContext& ctx) {
auto& attributes = onnx_node->attributes; auto& attributes = onnx_node->attributes;
auto pooled_shape = attributes.get<ONNX_INTS>("pooled_shape"); auto pooled_shape = attributes.get<ONNX_INTS>("pooled_shape");
attributes.AddRewrittenAttribute("pool_h")->set_i(pooled_shape.Get(0)); attributes.AddRewrittenAttribute("pool_h")->set_i(pooled_shape.Get(0));
attributes.AddRewrittenAttribute("pool_w")->set_i(pooled_shape.Get(1)); attributes.AddRewrittenAttribute("pool_w")->set_i(pooled_shape.Get(1));
attributes.remove("pooled_shape"); attributes.remove("pooled_shape");
return GenericImporter(onnx_node, ctx); return GenericImporter(onnx_node, ctx);
} }
...@@ -258,18 +234,16 @@ ONNXImporterReturns ONNXBackend::ReshapeImporter( ...@@ -258,18 +234,16 @@ ONNXImporterReturns ONNXBackend::ReshapeImporter(
auto node = NodeProto(onnx_node->node); auto node = NodeProto(onnx_node->node);
auto onnx_node_v2 = ONNXNode(node); auto onnx_node_v2 = ONNXNode(node);
auto& attributes = onnx_node_v2.attributes; auto& attributes = onnx_node_v2.attributes;
attributes.remove("consumed_inputs"); attributes.remove("consumed_inputs");
// Determine the dims // Determine the dims
auto* dims = attributes.AddRewrittenAttribute("dims"); auto* dims = attributes.AddRewrittenAttribute("dims");
if (ctx.opset_version() < 5) { if (ctx.opset_version() < 5) {
const auto& shape = attributes.get<ONNX_INTS>("shape"); const auto& shape = attributes.get<ONNX_INTS>("shape");
CHECK_GT(shape.size(), 0) << "\nExpected the shape value"; CHECK_GT(shape.size(), 0) << "\nExpected the shape value";
attributes.remove("shape"); attributes.remove("shape");
for (auto d : shape) for (auto d : shape) {
dims->add_ints(d); dims->add_ints(d);
}
} else { } else {
CHECK_EQ(node.input_size(), 2) CHECK_EQ(node.input_size(), 2)
<< "\nExpectd 2 input in upsample after onnx version 5"; << "\nExpectd 2 input in upsample after onnx version 5";
...@@ -280,10 +254,10 @@ ONNXImporterReturns ONNXBackend::ReshapeImporter( ...@@ -280,10 +254,10 @@ ONNXImporterReturns ONNXBackend::ReshapeImporter(
Argument shape_dtype, shape_values; Argument shape_dtype, shape_values;
ONNXTensorToArgument(*shape_tensor, &shape_dtype, &shape_values); ONNXTensorToArgument(*shape_tensor, &shape_dtype, &shape_values);
CHECK_GT(shape_values.ints_size(), 0) << "\nExpected the shape value"; CHECK_GT(shape_values.ints_size(), 0) << "\nExpected the shape value";
for (auto d : shape_values.ints()) for (auto d : shape_values.ints()) {
dims->add_ints(d); dims->add_ints(d);
}
} }
return GenericImporter(&onnx_node_v2, ctx); return GenericImporter(&onnx_node_v2, ctx);
} }
...@@ -293,9 +267,7 @@ ONNXImporterReturns ONNXBackend::ResizeImporter( ...@@ -293,9 +267,7 @@ ONNXImporterReturns ONNXBackend::ResizeImporter(
auto node = NodeProto(onnx_node->node); auto node = NodeProto(onnx_node->node);
auto onnx_node_v2 = ONNXNode(node); auto onnx_node_v2 = ONNXNode(node);
auto& attributes = onnx_node_v2.attributes; auto& attributes = onnx_node_v2.attributes;
attributes.remove("coordinate_transformation_mode"); attributes.remove("coordinate_transformation_mode");
if (ctx.opset_version() >= 9) { if (ctx.opset_version() >= 9) {
node.mutable_input()->Clear(); node.mutable_input()->Clear();
node.add_input(onnx_node->node.input(0)); node.add_input(onnx_node->node.input(0));
...@@ -307,21 +279,22 @@ ONNXImporterReturns ONNXBackend::ResizeImporter( ...@@ -307,21 +279,22 @@ ONNXImporterReturns ONNXBackend::ResizeImporter(
const auto* scales_tensor = ctx.initializer().at(scales_name); const auto* scales_tensor = ctx.initializer().at(scales_name);
ONNXTensorToArgument(*scales_tensor, &scales_dtype, &scale_values); ONNXTensorToArgument(*scales_tensor, &scales_dtype, &scale_values);
auto* scales = attributes.AddRewrittenAttribute("scales"); auto* scales = attributes.AddRewrittenAttribute("scales");
for (auto d : scale_values.floats()) for (auto d : scale_values.floats()) {
scales->add_floats(d); scales->add_floats(d);
}
if (sizes_idx > 0) { if (sizes_idx > 0) {
Argument sizes_dtype, sizes_values; Argument sizes_dtype, sizes_values;
const auto& sizes_name = onnx_node->node.input(sizes_idx); const auto& sizes_name = onnx_node->node.input(sizes_idx);
const auto* sizes_tensor = ctx.initializer().at(sizes_name); const auto* sizes_tensor = ctx.initializer().at(sizes_name);
ONNXTensorToArgument(*sizes_tensor, &sizes_dtype, &sizes_values); ONNXTensorToArgument(*sizes_tensor, &sizes_dtype, &sizes_values);
auto* sizes = attributes.AddRewrittenAttribute("sizes"); auto* sizes = attributes.AddRewrittenAttribute("sizes");
for (auto d : sizes_values.floats()) for (auto d : sizes_values.floats()) {
sizes->add_ints(d); sizes->add_ints(d);
}
} }
} else { } else {
LOG(FATAL) << "Required opset >= 7"; LOG(FATAL) << "Required opset >= 7";
} }
return GenericImporter(&onnx_node_v2, ctx); return GenericImporter(&onnx_node_v2, ctx);
} }
...@@ -330,12 +303,10 @@ ONNXImporterReturns ONNXBackend::RoiAlignImporter( ...@@ -330,12 +303,10 @@ ONNXImporterReturns ONNXBackend::RoiAlignImporter(
const ConversionContext& ctx) { const ConversionContext& ctx) {
auto node = NodeProto(onnx_node->node); auto node = NodeProto(onnx_node->node);
auto onnx_node_v2 = ONNXNode(node); auto onnx_node_v2 = ONNXNode(node);
// Remove the batch indices // Remove the batch indices
node.mutable_input()->Clear(); node.mutable_input()->Clear();
node.add_input(onnx_node->node.input(0)); node.add_input(onnx_node->node.input(0));
node.add_input(onnx_node->node.input(1)); node.add_input(onnx_node->node.input(1));
return GenericImporter(&onnx_node_v2, ctx); return GenericImporter(&onnx_node_v2, ctx);
} }
...@@ -345,19 +316,22 @@ ONNXImporterReturns ONNXBackend::TileImporter( ...@@ -345,19 +316,22 @@ ONNXImporterReturns ONNXBackend::TileImporter(
auto node = NodeProto(onnx_node->node); auto node = NodeProto(onnx_node->node);
auto onnx_node_v2 = ONNXNode(node); auto onnx_node_v2 = ONNXNode(node);
auto& attributes = onnx_node_v2.attributes; auto& attributes = onnx_node_v2.attributes;
if (ctx.opset_version() >= 6) {
// Determine the multiples from repeats // Determine the repeats from the initializer
auto* multiples = attributes.AddRewrittenAttribute("multiples"); auto* repeats = attributes.AddRewrittenAttribute("repeats");
node.mutable_input()->Clear(); node.mutable_input()->Clear();
node.add_input(onnx_node->node.input(0)); node.add_input(onnx_node->node.input(0));
const auto& repeats_name = onnx_node->node.input(1); const auto& repeats_name = onnx_node->node.input(1);
const auto* repeats_tensor = ctx.initializer().at(repeats_name); const auto* repeats_tensor = ctx.initializer().at(repeats_name);
Argument multiples_dtype, multiples_values; Argument repeats_dtype, repeats_values;
ONNXTensorToArgument(*repeats_tensor, &multiples_dtype, &multiples_values); ONNXTensorToArgument(*repeats_tensor, &repeats_dtype, &repeats_values);
CHECK_GT(multiples_values.ints_size(), 0) << "\nExpected the repeats value"; CHECK_GT(repeats_values.ints_size(), 0) << "\nExpected the repeats value";
for (auto d : multiples_values.ints()) for (auto repeat : repeats_values.ints()) {
multiples->add_ints(d); repeats->add_ints(repeat);
}
} else {
LOG(FATAL) << "Required opset >= 6";
}
return GenericImporter(&onnx_node_v2, ctx); return GenericImporter(&onnx_node_v2, ctx);
} }
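As a point of reference, the renamed <repeats> argument matches the reps of numpy.tile: entry i multiplies dimension i. A minimal numpy check (not part of this diff):

import numpy as np

x = np.arange(6).reshape(2, 3)
repeats = (2, 3)                  # what the importer reads from the ONNX initializer
y = np.tile(x, repeats)
assert y.shape == (4, 9)          # each dim multiplied by its repeat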
......
...@@ -4,13 +4,13 @@ ...@@ -4,13 +4,13 @@
namespace dragon { namespace dragon {
#define DEFINE_FILLER_OP_IMPL(name) \ #define DEFINE_FILLER_OP_IMPL(name) \
template <class Context> \ template <class Context> \
template <typename T> \ template <typename T> \
void name##Op<Context>::DoRunWithType() { \ void name##Op<Context>::DoRunWithType() { \
unique_ptr<Filler<T, Context>> f; \ unique_ptr<Filler<T, Context>> f; \
f.reset(CreateFiller<T, Context>(this->proto_)); \ f.reset(CreateFiller<T, Context>(this->filler_info_)); \
f->Fill(Output(0), ctx()); \ f->Fill(Output(0), ctx()); \
} }
#define DISPATCH_WITH_TYPES(name, ...) \ #define DISPATCH_WITH_TYPES(name, ...) \
......
...@@ -30,7 +30,7 @@ class InitializeOp : public Operator<Context> { ...@@ -30,7 +30,7 @@ class InitializeOp : public Operator<Context> {
void RunOnDevice() override; void RunOnDevice() override;
protected: protected:
TensorFillerProto proto_; FillerInfo filler_info_;
DECLARE_ARGS_WITH_DESC(int64_t, dims); DECLARE_ARGS_WITH_DESC(int64_t, dims);
}; };
...@@ -142,9 +142,9 @@ class RandomNormalOp final : public InitializeOp<Context> { ...@@ -142,9 +142,9 @@ class RandomNormalOp final : public InitializeOp<Context> {
: InitializeOp<Context>(def, ws) { : InitializeOp<Context>(def, ws) {
auto mu = OpArg<float>("mean", 0.f); auto mu = OpArg<float>("mean", 0.f);
auto sigma = OpArg<float>("std", 1.f); auto sigma = OpArg<float>("std", 1.f);
this->proto_.set_mean(mu); this->filler_info_.set_mean(mu);
this->proto_.set_std(sigma); this->filler_info_.set_std(sigma);
this->proto_.set_type("normal"); this->filler_info_.set_type("normal");
} }
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
...@@ -161,9 +161,9 @@ class RandomUniformOp final : public InitializeOp<Context> { ...@@ -161,9 +161,9 @@ class RandomUniformOp final : public InitializeOp<Context> {
: InitializeOp<Context>(def, ws) { : InitializeOp<Context>(def, ws) {
auto low = OpArg<float>("low", -1.f); auto low = OpArg<float>("low", -1.f);
auto high = OpArg<float>("high", 1.f); auto high = OpArg<float>("high", 1.f);
this->proto_.set_low(low); this->filler_info_.set_low(low);
this->proto_.set_high(high); this->filler_info_.set_high(high);
this->proto_.set_type("uniform"); this->filler_info_.set_type("uniform");
} }
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
...@@ -180,11 +180,11 @@ class TruncatedNormalOp final : public InitializeOp<Context> { ...@@ -180,11 +180,11 @@ class TruncatedNormalOp final : public InitializeOp<Context> {
: InitializeOp<Context>(def, ws) { : InitializeOp<Context>(def, ws) {
auto mu = OpArg<float>("mean", 0.f); auto mu = OpArg<float>("mean", 0.f);
auto sigma = OpArg<float>("std", 1.f); auto sigma = OpArg<float>("std", 1.f);
this->proto_.set_mean(mu); this->filler_info_.set_mean(mu);
this->proto_.set_std(sigma); this->filler_info_.set_std(sigma);
this->proto_.set_low(mu - 2 * sigma); this->filler_info_.set_low(mu - 2 * sigma);
this->proto_.set_high(mu + 2 * sigma); this->filler_info_.set_high(mu + 2 * sigma);
this->proto_.set_type("truncated_normal"); this->filler_info_.set_type("truncated_normal");
} }
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
...@@ -201,15 +201,15 @@ class GlorotNormalOp final : public InitializeOp<Context> { ...@@ -201,15 +201,15 @@ class GlorotNormalOp final : public InitializeOp<Context> {
: InitializeOp<Context>(def, ws) { : InitializeOp<Context>(def, ws) {
auto scale = OpArg<float>("scale", 2.f); auto scale = OpArg<float>("scale", 2.f);
auto mode = OpArg<string>("mode", "fan_in"); auto mode = OpArg<string>("mode", "fan_in");
this->proto_.set_type("msra"); this->filler_info_.set_type("glorot_normal");
if (mode == "fan_avg") { if (mode == "fan_avg") {
this->proto_.set_variance_norm(TensorFillerProto_VarianceNorm_FAN_AVG); this->filler_info_.set_variance_norm(FillerInfo_VarianceNorm_FAN_AVG);
} else if (mode == "fan_out") { } else if (mode == "fan_out") {
this->proto_.set_variance_norm(TensorFillerProto_VarianceNorm_FAN_OUT); this->filler_info_.set_variance_norm(FillerInfo_VarianceNorm_FAN_OUT);
} else { } else {
this->proto_.set_variance_norm(TensorFillerProto_VarianceNorm_FAN_IN); this->filler_info_.set_variance_norm(FillerInfo_VarianceNorm_FAN_IN);
} }
this->proto_.set_scale(scale); this->filler_info_.set_scale(scale);
} }
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
...@@ -226,15 +226,15 @@ class GlorotUniformOp final : public InitializeOp<Context> { ...@@ -226,15 +226,15 @@ class GlorotUniformOp final : public InitializeOp<Context> {
: InitializeOp<Context>(def, ws) { : InitializeOp<Context>(def, ws) {
auto scale = OpArg<float>("scale", 3.f); auto scale = OpArg<float>("scale", 3.f);
auto mode = OpArg<string>("mode", "fan_in"); auto mode = OpArg<string>("mode", "fan_in");
this->proto_.set_type("xavier"); this->filler_info_.set_type("glorot_uniform");
if (mode == "fan_avg") { if (mode == "fan_avg") {
this->proto_.set_variance_norm(TensorFillerProto_VarianceNorm_FAN_AVG); this->filler_info_.set_variance_norm(FillerInfo_VarianceNorm_FAN_AVG);
} else if (mode == "fan_out") { } else if (mode == "fan_out") {
this->proto_.set_variance_norm(TensorFillerProto_VarianceNorm_FAN_OUT); this->filler_info_.set_variance_norm(FillerInfo_VarianceNorm_FAN_OUT);
} else { } else {
this->proto_.set_variance_norm(TensorFillerProto_VarianceNorm_FAN_IN); this->filler_info_.set_variance_norm(FillerInfo_VarianceNorm_FAN_IN);
} }
this->proto_.set_scale(scale); this->filler_info_.set_scale(scale);
} }
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
......
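For intuition, the variance_norm modes above select which fan scales the filler. A hedged numpy sketch of one common glorot_uniform convention (limit = sqrt(scale / fan)); the exact formula lives in the C++ fillers and may differ:

import numpy as np

def glorot_uniform(shape, scale=3.0, mode='fan_in'):
    fan_in, fan_out = shape[1], shape[0]      # assuming a 2D weight laid out as (out, in)
    fan = {'fan_in': fan_in,
           'fan_out': fan_out,
           'fan_avg': (fan_in + fan_out) / 2.0}[mode]
    limit = np.sqrt(scale / fan)
    return np.random.uniform(-limit, limit, size=shape)

w = glorot_uniform((256, 128), mode='fan_avg')
assert w.shape == (256, 128)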
...@@ -9,9 +9,12 @@ template <typename T> ...@@ -9,9 +9,12 @@ template <typename T>
void TileOp<Context>::DoRunWithType() { void TileOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0); auto &X = Input(0), *Y = Output(0);
int num_repeats;
repeats(0, &num_repeats);
auto Y_dims = X.dims(); auto Y_dims = X.dims();
for (int i = 0; i < Y_dims.size(); ++i) for (int i = 0; i < num_repeats; ++i) {
Y_dims[i] *= multiples(i); Y_dims[i] *= repeats(i);
}
if (X.dims() == Y_dims) { if (X.dims() == Y_dims) {
Y->Reshape(Y_dims)->CopyFrom(X, ctx()); Y->Reshape(Y_dims)->CopyFrom(X, ctx());
...@@ -49,7 +52,7 @@ void TileGradientOp<Context>::DoRunWithType() { ...@@ -49,7 +52,7 @@ void TileGradientOp<Context>::DoRunWithType() {
dx = dest_->template mutable_data<T, Context>(); dx = dest_->template mutable_data<T, Context>();
} }
kernel::TileGrad( kernel::TileGrad(
dest_->count(0, axis_), dest_->count(axis_), multiple_, dy, dx, ctx()); dest_->count(0, axis_), dest_->count(axis_), repeat_, dy, dx, ctx());
} }
template <class Context> template <class Context>
...@@ -57,10 +60,14 @@ void TileGradientOp<Context>::RunOnDevice() { ...@@ -57,10 +60,14 @@ void TileGradientOp<Context>::RunOnDevice() {
auto &dY = Input(0), *dX = Output(0); auto &dY = Input(0), *dX = Output(0);
// Add the axes // Add the axes
int num_repeats;
repeats(0, &num_repeats);
vector<pair<int, int>> dispatch_axes; vector<pair<int, int>> dispatch_axes;
for (int i = 0; i < dY.ndim(); i++) { for (int i = 0; i < dY.ndim() && i < num_repeats; i++) {
auto m = multiples(i); auto repeat = repeats(i);
if (m > 1) dispatch_axes.push_back({m, i}); if (repeat > 1) {
dispatch_axes.push_back({repeat, i});
}
} }
std::sort(dispatch_axes.begin(), dispatch_axes.end()); std::sort(dispatch_axes.begin(), dispatch_axes.end());
std::reverse(dispatch_axes.begin(), dispatch_axes.end()); std::reverse(dispatch_axes.begin(), dispatch_axes.end());
...@@ -76,10 +83,10 @@ void TileGradientOp<Context>::RunOnDevice() { ...@@ -76,10 +83,10 @@ void TileGradientOp<Context>::RunOnDevice() {
// Reduce N times along each tiled axis // Reduce N times along each tiled axis
for (const auto& task : dispatch_axes) { for (const auto& task : dispatch_axes) {
axis_ = task.second, multiple_ = task.first; axis_ = task.second, repeat_ = task.first;
vec64_t X_dims(src_->dims()); vec64_t X_dims(src_->dims());
X_dims[axis_] /= multiple_; X_dims[axis_] /= repeat_;
dest_->Reshape(X_dims); dest_->Reshape(X_dims);
DispatchHelper<FloatingTensorTypes>::Call(this, dY); DispatchHelper<FloatingTensorTypes>::Call(this, dY);
......
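The backward pass above shrinks one tiled axis at a time by summing its repeated blocks. A minimal numpy sketch of that single-axis reduction, where rows and cols stand for the element counts before and from the reduced axis (illustration only):

import numpy as np

def tile_grad_one_axis(dy_flat, rows, cols, repeat):
    # The tiled layout holds `repeat` contiguous copies of each `cols`-sized block.
    return dy_flat.reshape(rows, repeat, cols).sum(axis=1)

dy = np.ones(2 * 3 * 4, dtype='float32')      # grad of a (2, 4) input tiled 3x on axis 1
dx = tile_grad_one_axis(dy, rows=2, cols=4, repeat=3)
assert dx.shape == (2, 4) and float(dx[0, 0]) == 3.0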
...@@ -21,7 +21,7 @@ template <class Context> ...@@ -21,7 +21,7 @@ template <class Context>
class TileOp final : public Operator<Context> { class TileOp final : public Operator<Context> {
public: public:
TileOp(const OperatorDef& def, Workspace* ws) : Operator<Context>(def, ws) { TileOp(const OperatorDef& def, Workspace* ws) : Operator<Context>(def, ws) {
GET_ARGS_WITH_DESC(int64_t, multiples); GET_ARGS_WITH_DESC(int64_t, repeats);
} }
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
...@@ -31,7 +31,7 @@ class TileOp final : public Operator<Context> { ...@@ -31,7 +31,7 @@ class TileOp final : public Operator<Context> {
void DoRunWithType(); void DoRunWithType();
protected: protected:
DECLARE_ARGS_WITH_DESC(int64_t, multiples); DECLARE_ARGS_WITH_DESC(int64_t, repeats);
}; };
template <class Context> template <class Context>
...@@ -39,7 +39,7 @@ class TileGradientOp final : public Operator<Context> { ...@@ -39,7 +39,7 @@ class TileGradientOp final : public Operator<Context> {
public: public:
TileGradientOp(const OperatorDef& def, Workspace* ws) TileGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws) { : Operator<Context>(def, ws) {
GET_ARGS_WITH_DESC(int64_t, multiples); GET_ARGS_WITH_DESC(int64_t, repeats);
} }
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
...@@ -50,12 +50,12 @@ class TileGradientOp final : public Operator<Context> { ...@@ -50,12 +50,12 @@ class TileGradientOp final : public Operator<Context> {
protected: protected:
Tensor *dest_, *src_, nav_; Tensor *dest_, *src_, nav_;
int64_t axis_, multiple_; int64_t axis_, repeat_;
DECLARE_ARGS_WITH_DESC(int64_t, multiples); DECLARE_ARGS_WITH_DESC(int64_t, repeats);
}; };
DEFINE_ARGS_WITH_DESC(int64_t, TileOp, multiples); DEFINE_ARGS_WITH_DESC(int64_t, TileOp, repeats);
DEFINE_ARGS_WITH_DESC(int64_t, TileGradientOp, multiples); DEFINE_ARGS_WITH_DESC(int64_t, TileGradientOp, repeats);
} // namespace dragon } // namespace dragon
......
...@@ -9,7 +9,6 @@ void AdamUpdateOp<Context>::ComputeUpdate(Tensor* dX) { ...@@ -9,7 +9,6 @@ void AdamUpdateOp<Context>::ComputeUpdate(Tensor* dX) {
t_++; t_++;
auto beta1 = Parameter("beta1"), beta2 = Parameter("beta2"); auto beta1 = Parameter("beta1"), beta2 = Parameter("beta2");
auto coef = sqrt(1.f - pow(beta2, t_)) / (1.f - pow(beta1, t_)); auto coef = sqrt(1.f - pow(beta2, t_)) / (1.f - pow(beta1, t_));
kernel::AdamUpdate( kernel::AdamUpdate(
dX->count(), dX->count(),
Parameter("base_lr") * coef * this->lr_mult_, Parameter("base_lr") * coef * this->lr_mult_,
......
...@@ -10,7 +10,6 @@ void SGDUpdateOp<Context>::ComputeUpdate(Tensor* dX) { ...@@ -10,7 +10,6 @@ void SGDUpdateOp<Context>::ComputeUpdate(Tensor* dX) {
auto lr = Parameter("base_lr") * this->lr_mult_; auto lr = Parameter("base_lr") * this->lr_mult_;
if (last_lr_ > 0) correction_ = lr / last_lr_; if (last_lr_ > 0) correction_ = lr / last_lr_;
last_lr_ = lr; // Record the last value last_lr_ = lr; // Record the last value
kernel::SGDUpdate( kernel::SGDUpdate(
dX->count(), dX->count(),
lr, lr,
......
...@@ -20,9 +20,7 @@ void BiasAddOp<Context>::DoRunWithType() { ...@@ -20,9 +20,7 @@ void BiasAddOp<Context>::DoRunWithType() {
LOG(FATAL) << "Unknown DataFormat: " << data_format(); LOG(FATAL) << "Unknown DataFormat: " << data_format();
} }
// Maybe fill the bias at the first time
TENSOR_FILL(B, vec64_t({C})); TENSOR_FILL(B, vec64_t({C}));
kernel::BiasAdd( kernel::BiasAdd(
N, N,
C, C,
......
...@@ -4,73 +4,69 @@ ...@@ -4,73 +4,69 @@
namespace dragon { namespace dragon {
#define SAME_PADDING(A, B) \ #define DETERMINE_SAME_PADDING(l, r) \
A[i] = padding_needed / 2; \ if (padding_ != "SAME_UPPER") { \
B[i] = padding_needed - A[i] l[i] = pad_size / 2; \
r[i] = pad_size - l[i]; \
} else { \
r[i] = pad_size / 2; \
l[i] = pad_size - r[i]; \
}
template <class Context> template <class Context>
void ConvOpBase<Context>::ComputeOutShape() { void ConvOpBase<Context>::ComputeOutShape() {
auto X_dims = Input(0).dims();
out_shape_.clear(); out_shape_.clear();
for (int i = 0; i < num_axes_; i++) { vec64_t X_dims = Input(0).dims();
if (!Transposed()) { int64_t in_size, out_size, k_size, pad_size;
auto idm = X_dims[axis_ + i]; if (!Transposed()) {
auto dk = dilation_[i] * (kshape_[i] - 1) + 1; for (int i = 0; i < num_axes_; i++) {
if (!str::find(padding_, "SAME")) { in_size = X_dims[axis_ + i];
// Explicit pads k_size = dilation_[i] * (kshape_[i] - 1) + 1;
auto odm = (idm + pad_l_[i] + pad_r_[i] - dk) / stride_[i] + 1; if (!str::find(padding_, "SAME")) { // Explicit pads
out_shape_.push_back(odm); pad_size = pad_l_[i] + pad_r_[i];
} else { out_size = (in_size + pad_size - k_size) / stride_[i] + 1;
// Auto pads } else { // Auto pads
int64_t odm = (idm + stride_[i] - 1) / (float)stride_[i]; out_size = (in_size + stride_[i] - 1) / stride_[i];
auto padding_needed = pad_size = (out_size - 1) * stride_[i] + k_size - in_size;
std::max(int64_t(0), (odm - 1) * stride_[i] + dk - idm); pad_size = std::max(pad_size, int64_t(0));
out_shape_.push_back(odm); DETERMINE_SAME_PADDING(pad_l_, pad_r_);
if (padding_ == "SAME_UPPER") { }
SAME_PADDING(pad_l_, pad_r_); out_shape_.push_back(out_size);
} else { }
SAME_PADDING(pad_r_, pad_l_); } else {
} // SAME_LOWER or SAME int num_output_padding;
output_padding(0, &num_output_padding);
CHECK(num_output_padding == 0 || num_output_padding == num_axes_)
<< "\nExcepted 0 or " << num_axes_ << " ints for <output_padding>.";
if (!str::find(padding_, "SAME")) { // Explicit pads
for (int i = 0; i < num_axes_; i++) {
in_size = X_dims[axis_ + i];
k_size = dilation_[i] * (kshape_[i] - 1) + 1;
pad_size = pad_l_[i] + pad_r_[i];
out_size = stride_[i] * (in_size - 1) + k_size - pad_size;
if (num_output_padding > 0) out_size += output_padding(i);
out_shape_.push_back(out_size);
} }
} else { } else {
auto idm = X_dims[axis_ + i]; // Auto pads
auto dk = dilation_[i] * (kshape_[i] - 1) + 1; int num_output_shape;
if (!str::find(padding_, "SAME")) { output_shape(0, &num_output_shape);
// Explicit pads CHECK(num_output_shape == num_axes_)
auto odm = stride_[i] * (idm - 1) + dk - pad_l_[i] - pad_r_[i]; << "\nExpected " << num_axes_ << " ints for <output_shape>.";
out_shape_.push_back(odm); for (int i = 0; i < num_axes_; i++) {
} else { in_size = X_dims[axis_ + i];
// Auto pads k_size = dilation_[i] * (kshape_[i] - 1) + 1;
int output_shape_size; out_size = output_shape(i);
int output_padding_size; pad_size = stride_[i] * (in_size - 1) + k_size;
output_shape(0, &output_shape_size); if (num_output_padding > 0) pad_size += output_padding(i);
output_padding(0, &output_padding_size); CHECK_GE(pad_size, out_size)
CHECK(output_shape_size == 0 || output_shape_size == num_axes_) << "\nThe output shape is incorrect."
<< "Excepted 0 or " << num_axes_ << " ints for output shape."; << "\nDimension of spatial axis " << i << " should be at most "
CHECK(output_padding_size == 0 || output_padding_size == num_axes_) << pad_size << ".";
<< "Excepted 0 or " << num_axes_ << " ints for output padding."; pad_size = stride_[i] * (in_size - 1) + k_size - out_size;
int64_t padding_needed, odm; pad_size = std::max(pad_size, int64_t(0));
if (output_padding_size) { DETERMINE_SAME_PADDING(pad_l_, pad_r_);
padding_needed = output_padding(i); out_shape_.push_back(out_size);
odm = stride_[i] * (idm - 1) + dk + padding_needed;
} else if (output_shape_size) {
odm = output_shape(i);
padding_needed = odm - (stride_[i] * (idm - 1) + dk);
CHECK_GE(padding_needed, 0)
<< "\nThe output shape is incorrect."
<< "\nWith the given stride and kernel, "
<< "dimension of spatial axis " << i << " should be at least "
<< odm - padding_needed << ".";
} else {
LOG(FATAL) << "Excepted the output padding or output shape "
<< "for \"SAME\" padding algorithm.";
}
out_shape_.push_back(odm);
if (padding_ == "SAME_UPPER") {
SAME_PADDING(pad_l_, pad_r_);
} else {
SAME_PADDING(pad_r_, pad_l_);
} // SAME_LOWER or SAME
} }
} }
} }
...@@ -373,7 +369,7 @@ INSTANTIATE_API(CUDAContext, float); ...@@ -373,7 +369,7 @@ INSTANTIATE_API(CUDAContext, float);
INSTANTIATE_API(CUDAContext, double); INSTANTIATE_API(CUDAContext, double);
#endif #endif
#undef SAME_PADDING
#undef INSTANTIATE_API #undef INSTANTIATE_API
#undef DETERMINE_SAME_PADDING
} // namespace dragon } // namespace dragon
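In plain Python, the arithmetic that the rewritten forward branch performs per spatial axis under SAME padding (illustration only; the C++ above is authoritative):

def same_out_and_pads(in_size, kernel, stride, dilation, padding='SAME_LOWER'):
    k = dilation * (kernel - 1) + 1
    out = (in_size + stride - 1) // stride            # ceil(in / stride)
    pad = max((out - 1) * stride + k - in_size, 0)
    if padding != 'SAME_UPPER':                       # SAME or SAME_LOWER branch of the macro
        pad_l = pad // 2
        pad_r = pad - pad_l
    else:                                             # SAME_UPPER branch of the macro
        pad_r = pad // 2
        pad_l = pad - pad_r
    return out, pad_l, pad_r

assert same_out_and_pads(7, kernel=3, stride=2, dilation=1) == (4, 1, 1)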
...@@ -5,9 +5,14 @@ ...@@ -5,9 +5,14 @@
namespace dragon { namespace dragon {
#define SAME_PADDING(A, B) \ #define DETERMINE_SAME_PADDING(l, r) \
A[i] = padding_needed / 2; \ if (padding_ != "SAME_UPPER") { \
B[i] = padding_needed - A[i] l[i] = pad_size / 2; \
r[i] = pad_size - l[i]; \
} else { \
r[i] = pad_size / 2; \
l[i] = pad_size - r[i]; \
}
template <class Context> template <class Context>
void PoolOpBase<Context>::Setup(int num_axes) { void PoolOpBase<Context>::Setup(int num_axes) {
...@@ -52,41 +57,27 @@ void PoolOpBase<Context>::ComputeOutShape() { ...@@ -52,41 +57,27 @@ void PoolOpBase<Context>::ComputeOutShape() {
kshape_[i] = in_dims_[i + 2]; kshape_[i] = in_dims_[i + 2];
} }
// Adjust the pads for SAME padding algorithm
if (str::find(padding_, "SAME")) {
for (int i = 0; i < num_axes_; i++) {
auto idm = in_dims_[i + 2];
int64_t odm = (idm + stride_[i] - 1) / (float)stride_[i];
auto padding_needed =
std::max((int64_t)0, (odm - 1) * stride_[i] + kshape_[i] - idm);
if (padding_ == "SAME_UPPER") {
SAME_PADDING(pad_l_, pad_r_);
} else {
SAME_PADDING(pad_r_, pad_l_);
} /*! SAME_LOWER or SAME */
}
}
// Compute the output dimensions // Compute the output dimensions
auto floor_or_ceil = ceil_mode_ > 0 auto floor_or_ceil = ceil_mode_ > 0
? static_cast<float (*)(float)>(&std::ceil) ? static_cast<float (*)(float)>(&std::ceil)
: static_cast<float (*)(float)>(&std::floor); : static_cast<float (*)(float)>(&std::floor);
out_dims_ = in_dims_; out_dims_ = in_dims_;
out_shape_ = Input(0).dims(); out_shape_ = Input(0).dims();
int64_t in_size, k_size, pad_size;
for (int i = 0; i < num_axes_; i++) { for (int i = 0; i < num_axes_; i++) {
auto in_dim = in_dims_[i + 2]; float out_size;
if (!str::find(padding_, "SAME")) { in_size = in_dims_[i + 2], k_size = kshape_[i];
// Explicit pads if (!str::find(padding_, "SAME")) { // Explicit pads
in_dim += pad_l_[i] + pad_r_[i]; pad_size = pad_l_[i] + pad_r_[i];
out_shape_[i + axis_] = out_dims_[i + 2] = out_size = float(in_size + pad_size - k_size) / float(stride_[i]) + 1.f;
floor_or_ceil((in_dim - kshape_[i]) / (float)stride_[i]) + 1; out_size = floor_or_ceil(out_size);
} else { } else { // Auto pads
// Auto pads out_size = std::ceil(float(in_size) / float(stride_[i]));
out_shape_[i + axis_] = out_dims_[i + 2] = pad_size = ((int64_t)out_size - 1) * stride_[i] + k_size - in_size;
floor_or_ceil(in_dim / (float)stride_[i]); pad_size = std::max(pad_size, int64_t(0));
DETERMINE_SAME_PADDING(pad_l_, pad_r_);
} }
out_shape_[i + axis_] = out_dims_[i + 2] = out_size;
} }
} }
...@@ -95,6 +86,6 @@ template class PoolOpBase<CPUContext>; ...@@ -95,6 +86,6 @@ template class PoolOpBase<CPUContext>;
template class PoolOpBase<CUDAContext>; template class PoolOpBase<CUDAContext>;
#endif #endif
#undef SAME_PADDING #undef DETERMINE_SAME_PADDING
} // namespace dragon } // namespace dragon
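Similarly for pooling: with explicit pads the output size uses floor or ceil depending on ceil_mode, while SAME padding gives ceil(in / stride) and splits the pad as above. A short sketch (illustration only):

import math

def pool_out_size(in_size, kernel, stride, pad_l=0, pad_r=0, ceil_mode=False, same=False):
    if same:
        return -(-in_size // stride)                  # ceil(in / stride); pads derived as above
    round_fn = math.ceil if ceil_mode else math.floor
    return int(round_fn((in_size + pad_l + pad_r - kernel) / float(stride) + 1.0))

assert pool_out_size(7, kernel=2, stride=2) == 3
assert pool_out_size(7, kernel=2, stride=2, ceil_mode=True) == 4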
syntax = "proto2";
package dragon;
message BlobShape {
repeated int64 dim = 1 [packed = true];
}
message BlobProto {
optional BlobShape shape = 7;
repeated float data = 5 [packed = true];
optional int32 num = 1 [default = 0];
optional int32 channels = 2 [default = 0];
optional int32 height = 3 [default = 0];
optional int32 width = 4 [default = 0];
}
message NetParameter {
optional string name = 1;
repeated LayerParameter layer = 100;
}
message LayerParameter {
optional string name = 1;
repeated BlobProto blobs = 7;
}
...@@ -51,26 +51,6 @@ message TensorProto { ...@@ -51,26 +51,6 @@ message TensorProto {
optional string name = 7; optional string name = 7;
} }
// Record the filler of Tensor.
// This structure is kept for backward compatibility
// with caffe1, which relies on the implicit initializer.
message TensorFillerProto {
optional string tensor = 1;
optional string type = 2 [default = 'constant'];
optional float value = 3 [default = 0];
optional float low = 4 [default = 0];
optional float high = 5 [default = 1];
optional float mean = 6 [default = 0];
optional float std = 7 [default = 1];
optional float scale = 8 [default = 3];
enum VarianceNorm {
FAN_IN = 0;
FAN_OUT = 1;
FAN_AVG = 2;
}
optional VarianceNorm variance_norm = 9 [default = FAN_IN];
}
// Store multiple TensorProto objects in one single proto. // Store multiple TensorProto objects in one single proto.
message TensorProtos { message TensorProtos {
repeated TensorProto protos = 1; repeated TensorProto protos = 1;
...@@ -139,16 +119,6 @@ message OperatorDef { ...@@ -139,16 +119,6 @@ message OperatorDef {
optional string cache_key = 7; optional string cache_key = 7;
} }
// Record the gradient information
message GradientProto {
// The derivative target.
optional string cost = 1;
// The target with respect to?
optional string wrt = 2;
// The external gradient
optional string external = 3;
}
// Graph Definition // Graph Definition
message GraphDef { message GraphDef {
// The graph name. // The graph name.
...@@ -171,6 +141,33 @@ message GraphDef { ...@@ -171,6 +141,33 @@ message GraphDef {
// The name of outputs. // The name of outputs.
repeated string output = 8; repeated string output = 8;
// The gradients information. // The info of gradients.
repeated GradientProto gradient = 9; repeated GradientInfo grad_info = 9;
}
// Record the filler information.
// This structure is kept for backward compatibility
// with caffe, which relies on the implicit initializer.
message FillerInfo {
enum VarianceNorm {
FAN_IN = 0;
FAN_OUT = 1;
FAN_AVG = 2;
}
optional string type = 1 [default = 'constant'];
optional float value = 2 [default = 0];
optional float low = 3 [default = 0];
optional float high = 4 [default = 1];
optional float mean = 5 [default = 0];
optional float std = 6 [default = 1];
optional float scale = 7 [default = 3];
optional VarianceNorm variance_norm = 8 [default = FAN_IN];
}
// Record the gradient information.
message GradientInfo {
// The derivative target.
optional string y = 1;
// The differentiated inputs.
repeated string xs = 2;
} }
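For reference, both new messages are usable from Python through the generated dragon_pb2 module (a sketch with illustrative tensor names; the field names follow the definitions above):

from dragon.core.proto import dragon_pb2

filler = dragon_pb2.FillerInfo(type='glorot_uniform', scale=3.0)
filler.variance_norm = dragon_pb2.FillerInfo.FAN_AVG

grad = dragon_pb2.GradientInfo(y='loss:0', xs=['conv1/w:0', 'conv1/b:0'])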
...@@ -30,7 +30,6 @@ from dragon._api import metrics ...@@ -30,7 +30,6 @@ from dragon._api import metrics
from dragon._api import nn from dragon._api import nn
from dragon._api import optimizers from dragon._api import optimizers
from dragon._api import random from dragon._api import random
from dragon._api import workspace
from dragon._api import vision from dragon._api import vision
# Virtual API # Virtual API
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import as _absolute_import
from __future__ import division as _division
from __future__ import print_function as _print_function
from dragon.core.training.adam import Adam
from dragon.core.training.rmsprop import RMSProp
from dragon.core.training.sgd import Nesterov
from dragon.core.training.sgd import SGD
from dragon.core.training.updater import Updater
__all__ = [_s for _s in dir() if not _s.startswith('_')]
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import as _absolute_import
from __future__ import division as _division
from __future__ import print_function as _print_function
from dragon.core.framework.workspace import feed_tensor
from dragon.core.framework.workspace import fetch_tensor
from dragon.core.framework.workspace import has_tensor
from dragon.core.framework.workspace import load
from dragon.core.framework.workspace import reset_tensor
from dragon.core.framework.workspace import run_operator
from dragon.core.framework.workspace import save
__all__ = [_s for _s in dir() if not _s.startswith('_')]
...@@ -28,6 +28,7 @@ from dragon.core.autograph.tensor import Tensor ...@@ -28,6 +28,7 @@ from dragon.core.autograph.tensor import Tensor
from dragon.core.eager import context as eager_context from dragon.core.eager import context as eager_context
from dragon.core.eager.tensor import EagerTensor from dragon.core.eager.tensor import EagerTensor
from dragon.core.framework import context from dragon.core.framework import context
from dragon.core.framework import device_spec
from dragon.core.framework import workspace from dragon.core.framework import workspace
from dragon.core.training import optimizer from dragon.core.training import optimizer
from dragon.core.util import decorator from dragon.core.util import decorator
...@@ -276,13 +277,16 @@ class FunctionGuard(object): ...@@ -276,13 +277,16 @@ class FunctionGuard(object):
executables = self.executables executables = self.executables
inputs, kwargs = self.canonicalize_inputs(*args, **kwargs) inputs, kwargs = self.canonicalize_inputs(*args, **kwargs)
executables[0](*inputs, return_outputs=False, **kwargs) executables[0](*inputs, return_outputs=False, **kwargs)
_ = [func(return_outputs=False) for func in executables[1:]] [func(return_outputs=False) for func in executables[1:]]
outputs = [] outputs = []
for obj in self.outputs: current_ws = workspace.get_workspace()
if isinstance(obj, Tensor): for output in self.outputs:
outputs.append(EagerTensor(id=obj.id, own_storage=False)) if isinstance(output, Tensor):
impl = current_ws.GetTensor(output.id)
device = device_spec.DeviceSpec(*impl.device)
outputs.append(EagerTensor(impl=impl, device=device))
else: else:
outputs.append(obj) outputs.append(output)
return outputs return outputs
def __get__(self, instance, owner): def __get__(self, instance, owner):
......
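The new output path pulls each symbolic output back from the current workspace as an eager tensor. A hedged sketch of the same pattern in isolation (the tensor id is illustrative; the calls mirror the lines above):

from dragon.core.eager.tensor import EagerTensor
from dragon.core.framework import device_spec
from dragon.core.framework import workspace

current_ws = workspace.get_workspace()
impl = current_ws.GetTensor('output:0')        # backend tensor implementation (illustrative id)
value = EagerTensor(impl=impl, device=device_spec.DeviceSpec(*impl.device))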
...@@ -23,7 +23,6 @@ from dragon.core.autograph.op_def import OpDef ...@@ -23,7 +23,6 @@ from dragon.core.autograph.op_def import OpDef
from dragon.core.autograph.op_def import OpInfo from dragon.core.autograph.op_def import OpInfo
from dragon.core.autograph.tensor import Tensor from dragon.core.autograph.tensor import Tensor
from dragon.core.framework import config from dragon.core.framework import config
from dragon.core.framework import context
from dragon.core.framework import proto_util from dragon.core.framework import proto_util
from dragon.core.framework import workspace from dragon.core.framework import workspace
from dragon.core.proto import dragon_pb2 from dragon.core.proto import dragon_pb2
...@@ -32,7 +31,7 @@ from dragon.core.util import nest ...@@ -32,7 +31,7 @@ from dragon.core.util import nest
def add_device_option(graph_def): def add_device_option(graph_def):
"""Add the device option for graph.""" """Add the device option."""
cfg = config.config() cfg = config.config()
str2idx = {'cpu': 0, 'cuda': 1, 'cnml': 2} str2idx = {'cpu': 0, 'cuda': 1, 'cnml': 2}
dev_opt = dragon_pb2.DeviceOption() dev_opt = dragon_pb2.DeviceOption()
...@@ -42,69 +41,66 @@ def add_device_option(graph_def): ...@@ -42,69 +41,66 @@ def add_device_option(graph_def):
graph_def.device_option.CopyFrom(dev_opt) graph_def.device_option.CopyFrom(dev_opt)
def add_gradient_info(graph_def, targets): def add_grad_info(graph_def, targets):
"""Add the gradient info for graph.""" """Add the gradient info."""
gradients = set()
for target in targets: for target in targets:
if target._grad is not None: info = target._grad
gradients.update(target._grad.make_pairs()) if info is not None:
for (cost, wrt) in gradients: graph_def.grad_info.extend([
gradient = dragon_pb2.GradientProto() dragon_pb2.GradientInfo(
gradient.cost, gradient.wrt = str(cost), str(wrt) y=info.y.id,
graph_def.gradient.extend([gradient]) xs=[x.id for x in info.xs])])
def add_optimization(graph_def, level=None): def add_optimization(graph_def, level=None):
"""Add the optimization attribute for graph.""" """Add the optimization argument."""
cfg = config.config() cfg = config.config()
if level is None: if level is None:
level = cfg.graph_optimization level = cfg.graph_optimization
graph_def.arg.add().CopyFrom( graph_def.arg.add().CopyFrom(
proto_util.make_argument( proto_util.make_argument('optimization', level))
'optimization_level', level))
graph_def.graph_type = cfg.graph_type graph_def.graph_type = cfg.graph_type
def add_phase(graph_def, targets): def add_phase(graph_def, targets):
"""Add the phase attribute for graph.""" """Add the phase argument."""
phase = context.get_graph_phase() phase = 'TEST'
if phase is None: for target in targets:
phase = 'TEST' try:
for target in targets: if target._grad and target._grad.required():
if target._grad is not None and \
target._grad.required():
phase = 'TRAIN' phase = 'TRAIN'
break break
except AttributeError:
pass
graph_def.arg.extend([proto_util.make_argument('phase', phase)]) graph_def.arg.extend([proto_util.make_argument('phase', phase)])
def add_update_ops(graph_def, optimizer): def add_update_defs(graph_def, optimizer):
"""Add the update operators for graph.""" """Add the update defs."""
if optimizer is None: if optimizer is None:
return return
grads, update_ops = [], [] grads, update_defs = [], []
extra_arguments = optimizer._extra_kwargs extra_arguments = optimizer._extra_kwargs
extra_arguments['handle'] = optimizer._op_handle extra_arguments['handle'] = optimizer._op_handle
# Generate update operators according to the updater. # Generate op defs according to the collected updates
for e in optimizer._param_group: current_ws = workspace.get_workspace()
(param, grad), arguments = e for (param, grad), arguments in optimizer._param_group:
if workspace.has_tensor(grad): if current_ws.has_tensor(grad):
grads.append(grad) grads.append(grad)
arguments = dict(arguments, **extra_arguments) arguments = dict(arguments, **extra_arguments)
update_ops.append( update_defs.append(
proto_util.make_operator_def( proto_util.make_operator_def(
op_type=optimizer._op_type, op_type=optimizer._op_type,
inputs=[grad], inputs=[grad],
outputs=[param], outputs=[param],
name=OpDef.get_name(), name=OpDef.get_name(),
**arguments **arguments))
))
else: else:
logging.info('Skip updating Tensor({}).'.format(param)) logging.info('Skip updating Tensor({}).'.format(param))
# Insert a reduce op if the process group is found. # Insert a reduce def if the process group is found.
process_group = optimizer._process_group process_group = optimizer._process_group
if process_group is not None: if process_group is not None:
update_ops.insert( update_defs.insert(
0, proto_util.make_operator_def( 0, proto_util.make_operator_def(
op_type='Collective', op_type='Collective',
inputs=grads, inputs=grads,
...@@ -115,7 +111,7 @@ def add_update_ops(graph_def, optimizer): ...@@ -115,7 +111,7 @@ def add_update_ops(graph_def, optimizer):
**process_group.arguments **process_group.arguments
) )
) )
graph_def.op.extend(update_ops) graph_def.op.extend(update_defs)
class Function(object): class Function(object):
...@@ -128,16 +124,15 @@ class Function(object): ...@@ -128,16 +124,15 @@ class Function(object):
self.graph_name = None # Determined after creating self.graph_name = None # Determined after creating
self.inputs, self.outputs = None, None self.inputs, self.outputs = None, None
def create(self, inputs=None, outputs=None, givens=None, updater=None): def create(self, inputs=None, outputs=None, givens=None, optimizer=None):
self.inputs = inputs = [] if inputs is None else nest.flatten(inputs) self.inputs = inputs = [] if inputs is None else nest.flatten(inputs)
self.outputs = outputs = [] if outputs is None else nest.flatten(outputs) self.outputs = outputs = [] if outputs is None else nest.flatten(outputs)
if len(outputs) > 0 and updater is not None: if len(outputs) > 0 and optimizer is not None:
raise ValueError('Specify either <outputs> or <updater>, not both.') raise ValueError('Specify either <outputs> or <optimizer>, not both.')
# Collect the forward defs.
op_info = OpInfo() op_info = OpInfo()
# Collect the forward operators.
requires_grad = False requires_grad = False
for i, output in enumerate(outputs): for i, output in enumerate(outputs):
op_info.merge_from(output) op_info.merge_from(output)
...@@ -149,7 +144,7 @@ class Function(object): ...@@ -149,7 +144,7 @@ class Function(object):
except AttributeError: except AttributeError:
raise ValueError('Output[%d] is not a symbolic tensor.' % i) raise ValueError('Output[%d] is not a symbolic tensor.' % i)
# Handle givens. # Handle the replacements.
if givens is not None: if givens is not None:
name_dict = {} name_dict = {}
for k, v in givens.items(): for k, v in givens.items():
...@@ -161,62 +156,61 @@ class Function(object): ...@@ -161,62 +156,61 @@ class Function(object):
'Expected a Tensor, ' 'Expected a Tensor, '
'got {}.'.format(type(v).__name__) 'got {}.'.format(type(v).__name__)
) )
# Update original operators. # Update the original defs.
op_info = copy.deepcopy(op_info) op_info = copy.deepcopy(op_info)
for k in op_info._defs.keys(): for k in op_info._defs.keys():
op_def = op_info._defs[k] op_def = op_info._defs[k]
op_def.input.extend([ op_def.input.extend([
name_dict[input] name_dict[input]
if input in name_dict else input if input in name_dict else input
for input in op_def.input for input in op_def.input])
])
del op_def.input[:len(op_def.input) // 2] del op_def.input[:len(op_def.input) // 2]
# Sort out the states. # Sort out the forward defs.
op_defs = sorted(op_info._defs.items(), key=lambda d: d[0]) op_defs = sorted(op_info._defs.items(), key=lambda d: d[0])
forward_ops = copy.deepcopy([v for k, v in op_defs]) forward_defs = copy.deepcopy([v for k, v in op_defs])
# Generate the backward operators. # Generate the backward defs.
if requires_grad: if requires_grad:
input_grads, grad_targets = {}, [] input_grads, grad_targets = {}, []
for output in outputs: for output in outputs:
grad_info = output._grad info = output._grad
if grad_info is not None: if info is not None:
if grad_info.input is not None: if info.grad_y is not None:
input_grads[output.id] = output._grad.input.id input_grads[output.id] = info.grad_y.id
grad_targets.append(output.id) grad_targets.append(output.id)
forward_ops, gradient_ops, _ = \ backward_defs = grad_maker.GradientMaker.make(
grad_maker.GradientMaker.make( op_defs=forward_defs,
forward_ops=forward_ops, targets=grad_targets,
targets=grad_targets, input_grads=input_grads,
input_grads=input_grads, )
)
else: else:
gradient_ops = [] backward_defs = []
# Fill with all known graph elements. # Fill graph elements.
self.graph_def.op.extend(forward_ops + gradient_ops) self.graph_def.op.extend(forward_defs + backward_defs)
self.graph_def.input.extend([input.name for input in inputs]) self.graph_def.input.extend([input.name for input in inputs])
self.graph_def.output.extend(list(op_info._targets)) self.graph_def.output.extend(list(op_info._targets))
if len(outputs) > 0: if len(outputs) > 0:
add_device_option(self.graph_def) add_device_option(self.graph_def)
add_optimization(self.graph_def) add_optimization(self.graph_def)
add_gradient_info(self.graph_def, outputs) add_grad_info(self.graph_def, outputs)
add_phase(self.graph_def, outputs) add_phase(self.graph_def, outputs)
elif updater is not None: elif optimizer is not None:
add_device_option(self.graph_def) add_device_option(self.graph_def)
add_optimization(self.graph_def, level=0) add_optimization(self.graph_def, level=0)
add_update_ops(self.graph_def, updater) add_update_defs(self.graph_def, optimizer)
# Notify the backend to create and optimize. # Notify the backend to create and optimize.
self.graph_name = workspace.create_graph(self.graph_def) current_ws = workspace.get_workspace()
self.graph_name = current_ws.create_graph(self.graph_def)
# Bind a callback to run this graph. # Bind a callback to run this graph.
self.callback = lambda *args, **kwargs: \ self.callback = lambda *args, **kwargs: \
workspace.run_graph( current_ws.run_graph(
graph=self.graph_name, name=self.graph_name,
inputs=(inputs, args), inputs_and_values=(inputs, args),
outputs=outputs, outputs=outputs,
**kwargs **kwargs
) )
...@@ -273,15 +267,15 @@ class Function(object): ...@@ -273,15 +267,15 @@ class Function(object):
add_phase(graph_def, self.outputs) add_phase(graph_def, self.outputs)
# Notify the backend to create and optimize. # Notify the backend to create and optimize.
current_ws = workspace.get_workspace()
self.graph_def = graph_def self.graph_def = graph_def
self.graph_name = workspace.create_graph(graph_def) self.graph_name = current_ws.create_graph(graph_def)
# Bind a callback to run this graph. # Bind a callback to run this graph.
callback_inputs = self.inputs if explicit_inputs else []
self.callback = lambda *args, **kwargs: \ self.callback = lambda *args, **kwargs: \
workspace.run_graph( current_ws.run_graph(
graph=self.graph_name, name=self.graph_name,
inputs=(callback_inputs, args), inputs_and_values=(self.inputs if explicit_inputs else [], args),
outputs=self.outputs, outputs=self.outputs,
**kwargs **kwargs
) )
......
...@@ -21,37 +21,26 @@ from dragon.core.util import nest ...@@ -21,37 +21,26 @@ from dragon.core.util import nest
class GradientInfo(object): class GradientInfo(object):
"""A class to store the known gradient relations.""" """A class to store the known gradient relations."""
def __init__(self, parent): def __init__(self, y, grad_y=None):
self._parent = parent self._y, self._grad_y, self._xs = y, grad_y, []
self._cost, self._wrt = [], []
self._input = None
@property @property
def cost(self): def grad_y(self):
return self._cost return self._grad_y
@property @property
def input(self): def xs(self):
return self._input return self._xs
@property @property
def wrt(self): def y(self):
return self._wrt return self._y
def add_cost(self, cost): def add_x(self, x):
self._cost.append(cost) self._xs.append(x)
def add_wrt(self, wrt):
self._wrt.append(wrt)
def make_pairs(self):
return [(self._parent.id, wrt) for wrt in self._wrt]
def required(self): def required(self):
return len(self._wrt) > 0 return len(self._xs) > 0
def set_input(self, input):
self._input = input
def gradients(ys, xs, grad_ys=None): def gradients(ys, xs, grad_ys=None):
...@@ -112,18 +101,14 @@ def gradients(ys, xs, grad_ys=None): ...@@ -112,18 +101,14 @@ def gradients(ys, xs, grad_ys=None):
if grad_ys is not None: if grad_ys is not None:
grad_ys = nest.flatten(grad_ys) grad_ys = nest.flatten(grad_ys)
# Record the gradient info (cost, wrt, input), # Record the gradient info (y, grad_y, xs),
# then, generate the gradient references once. # then, generate the gradient references once.
for i, y in enumerate(ys): for i, y in enumerate(ys):
if y._grad is None: if y._grad is None:
y._grad = GradientInfo(y) grad_y = grad_ys[i] if grad_ys is not None else None
if grad_ys is not None: y._grad = GradientInfo(y, grad_y)
y._grad.set_input(grad_ys[i])
for x in xs: for x in xs:
if not hasattr(x, '_grad') or x._grad is None: y._grad.add_x(x)
x._grad = GradientInfo(x)
y._grad.add_wrt(x.id)
x._grad.add_cost(y)
if i == 0: if i == 0:
dxs.append(TensorRef(x.id + '_grad', x.shape, x.dtype)) dxs.append(TensorRef(x.id + '_grad', x.shape, x.dtype))
......
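To make the new bookkeeping concrete, a standalone mirror of the class above with a stub tensor (illustration only; the real classes live in dragon.core):

class GradientInfo(object):
    """Standalone mirror of the class above."""
    def __init__(self, y, grad_y=None):
        self._y, self._grad_y, self._xs = y, grad_y, []
    def add_x(self, x):
        self._xs.append(x)
    def required(self):
        return len(self._xs) > 0

class _T(object):
    """Stub standing in for dragon's Tensor."""
    def __init__(self, id):
        self.id, self._grad = id, None

y, x = _T('y'), _T('x')
y._grad = GradientInfo(y)          # one info per differentiated target, as in gradients()
y._grad.add_x(x)
assert y._grad.required()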
...@@ -13,16 +13,7 @@ ...@@ -13,16 +13,7 @@
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Gradient maker implemented in python. """Python-implemented gradient maker."""
The basic idea of ``GradientMaker`` comes from ``caffe2``,
Jia provided a simple way to bridge the Generator(Python) with OpScheme(C++).
For the efficient C++ implementation, see,
<https://github.com/seetaresearch/Dragon/blob/master/Dragon/src/core/graph_gradient.cc>
"""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
...@@ -40,25 +31,25 @@ class GradientMaker(object): ...@@ -40,25 +31,25 @@ class GradientMaker(object):
"""Make def for the gradient based on rules.""" """Make def for the gradient based on rules."""
@classmethod @classmethod
def gen_def(cls, forward_op, g_outputs): def gen_def(cls, op_def, g_outputs):
"""Generate the OperatorDef from forward op.""" """Generate the OperatorDef from forward op."""
g_ops, g_inputs, defaults = backend.CreateGradientDefs( grad_defs, g_inputs, defaults = backend.CreateGradientDefs(
forward_op.SerializeToString(), g_outputs) op_def.SerializeToString(), g_outputs)
for idx, g_op in enumerate(g_ops): for i, grad_def in enumerate(grad_defs):
new_def = dragon_pb2.OperatorDef() new_def = dragon_pb2.OperatorDef()
new_def.ParseFromString(g_op) new_def.ParseFromString(grad_def)
g_ops[idx] = new_def grad_defs[i] = new_def
return g_ops, g_inputs, defaults return grad_defs, g_inputs, defaults
@classmethod @classmethod
def check(cls, forward_op, inputs_to_grads, blacklist, targets): def check(cls, op_def, inputs_to_grads, blacklist, targets):
"""Check if missing gradients. If missing, skip.""" """Check if missing gradients. If missing, skip."""
if forward_op.type in backend.NO_GRADIENT_OPERATORS: if op_def.type in backend.NO_GRADIENT_OPERATORS:
for input in forward_op.input: for input in op_def.input:
blacklist.add(input) blacklist.add(input)
return True, None return True, None
gen_grads = [] gen_grads = []
for idx, output in enumerate(forward_op.output): for idx, output in enumerate(op_def.output):
if output not in inputs_to_grads: if output not in inputs_to_grads:
if output in blacklist: if output in blacklist:
return True, gen_grads return True, gen_grads
...@@ -66,50 +57,43 @@ class GradientMaker(object): ...@@ -66,50 +57,43 @@ class GradientMaker(object):
# Consider to generate virtual gradient for targets. # Consider to generate virtual gradient for targets.
gen_grads.append((output, idx)) gen_grads.append((output, idx))
inputs_to_grads[output] = output + '_grad' inputs_to_grads[output] = output + '_grad'
elif len(forward_op.output) == 1: elif len(op_def.output) == 1:
# We can skip this op, obviously. # We can skip this op, obviously.
return True, gen_grads return True, gen_grads
# Pass, even if missing some grads. # Pass, even if missing some grads.
return False, gen_grads return False, gen_grads
@classmethod @classmethod
def make(cls, forward_ops, targets, input_grads=None): def make(cls, op_defs, targets, input_grads=None):
"""The making procedure.""" """Make the backward op defs."""
inputs_to_grads = {} if input_grads is None else input_grads inputs_to_grads = {} if input_grads is None else input_grads
inputs_count, grads_count = defaultdict(int), defaultdict(int) inputs_count, grads_count = defaultdict(int), defaultdict(int)
all_split_grads, blacklist = set(), set() all_split_grads, blacklist = set(), set()
backward_ops = []
# A DAG may not have any in-place operators.
is_dag = True
# PLAY for the forward. # PLAY for the forward.
for forward_op in forward_ops: for op_def in op_defs:
if forward_op.type in backend.NO_GRADIENT_OPERATORS: if op_def.type in backend.NO_GRADIENT_OPERATORS:
continue continue
outputs = [o for o in forward_op.output] outputs = [output for output in op_def.output]
for input in forward_op.input: for input in op_def.input:
if input not in outputs: if input not in outputs:
# Avoid counting the duplicate input, # Avoid counting the duplicate input,
# (i.e. the in-place output). # (i.e. the in-place output).
inputs_count[input] += 1 inputs_count[input] += 1
else:
is_dag = False
# PLAY for the backward. # PLAY for the backward.
for forward_op in forward_ops[::-1]: backward_defs = []
for op_def in op_defs[::-1]:
# Collect inputs and outputs. # Collect inputs and outputs.
is_skip, gen_grads = cls.check( is_skip, gen_grads = cls.check(
forward_op=forward_op, op_def=op_def,
inputs_to_grads=inputs_to_grads, inputs_to_grads=inputs_to_grads,
blacklist=blacklist, blacklist=blacklist,
targets=targets, targets=targets,
) )
# Missing grads are represented as ``None``. # Missing grads are represented as ``None``.
g_outputs = [inputs_to_grads.get(name, '') g_outputs = [inputs_to_grads.get(name, '') for name in op_def.output]
for name in forward_op.output] grad_defs, g_inputs, defaults = cls.gen_def(op_def, g_outputs)
g_ops, g_inputs, defaults = cls.gen_def(forward_op, g_outputs)
# Append operators. # Append operators.
if not is_skip: if not is_skip:
...@@ -127,17 +111,17 @@ class GradientMaker(object): ...@@ -127,17 +111,17 @@ class GradientMaker(object):
outputs=op_outputs, outputs=op_outputs,
defaults=values, defaults=values,
) )
if forward_op.HasField('device_option'): if op_def.HasField('device_option'):
gen_op.device_option.CopyFrom(forward_op.device_option) gen_op.device_option.CopyFrom(op_def.device_option)
backward_ops.append(gen_op) backward_defs.append(gen_op)
# GradientOp # GradientOp
for g_op in g_ops: for grad_def in grad_defs:
g_op.name = OpDef.get_name() grad_def.name = OpDef.get_name()
backward_ops.append(g_op) backward_defs.append(grad_def)
# Split and gather grads for multi-used input. # Split and gather grads for multi-used input.
for g_op in g_ops: for grad_def in grad_defs:
for g_output_idx, g_output in enumerate(g_op.output): for g_output_idx, g_output in enumerate(grad_def.output):
original_idx = -1 original_idx = -1
for g_input_idx, g_input in enumerate(g_inputs): for g_input_idx, g_input in enumerate(g_inputs):
if g_output == g_input: if g_output == g_input:
...@@ -145,10 +129,10 @@ class GradientMaker(object): ...@@ -145,10 +129,10 @@ class GradientMaker(object):
# Ignore un-used && in-placed GI(?). # Ignore un-used && in-placed GI(?).
if original_idx == -1: if original_idx == -1:
continue continue
if g_output in g_op.input: if g_output in grad_def.input:
continue continue
# Found a split branch. # Found a split branch.
original_name = forward_op.input[original_idx] original_name = op_def.input[original_idx]
if inputs_count[original_name] > 1: if inputs_count[original_name] > 1:
# Split. # Split.
split_name = g_output + '_autosplit_%d' % grads_count[g_output] split_name = g_output + '_autosplit_%d' % grads_count[g_output]
...@@ -161,21 +145,21 @@ class GradientMaker(object): ...@@ -161,21 +145,21 @@ class GradientMaker(object):
for idx in range(grads_count[g_output]): for idx in range(grads_count[g_output]):
if '%s_autosplit_%d' % (g_output, idx) in all_split_grads: if '%s_autosplit_%d' % (g_output, idx) in all_split_grads:
split_inputs.append('%s_autosplit_%d' % (g_output, idx)) split_inputs.append('%s_autosplit_%d' % (g_output, idx))
gather_op = proto_util.make_operator_def( gather_def = proto_util.make_operator_def(
name=OpDef.get_name(), name=OpDef.get_name(),
op_type='GradientGather', op_type='GradientGather',
inputs=split_inputs, inputs=split_inputs,
outputs=[g_output], outputs=[g_output],
) )
if g_op.HasField('device_option'): if grad_def.HasField('device_option'):
gather_op.device_option.CopyFrom(g_op.device_option) gather_def.device_option.CopyFrom(grad_def.device_option)
backward_ops.append(gather_op) backward_defs.append(gather_def)
g_op.output[g_output_idx] = split_name grad_def.output[g_output_idx] = split_name
# Done. # Done.
if not is_skip: if not is_skip:
for name, grad in zip(forward_op.input, g_inputs): for name, grad in zip(op_def.input, g_inputs):
if grad != '': if grad != '':
inputs_to_grads[name] = grad inputs_to_grads[name] = grad
return forward_ops, backward_ops, is_dag return backward_defs
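For readers following the split/gather bookkeeping above: when one forward input is consumed by several operators, each consumer contributes a partial gradient under an ``_autosplit_<i>`` alias, and a ``GradientGather`` op sums the pieces back into the real gradient. A minimal standalone sketch of that accumulation (plain numpy, not the dragon backend; names are illustrative):

```python
import numpy as np

def gather_autosplit_grads(pieces):
    """Sum the '<grad>_autosplit_<i>' pieces into one gradient."""
    total = np.zeros_like(pieces[0])
    for piece in pieces:
        total += piece
    return total

# x feeds two ops, so two partial gradients are produced and then gathered.
g0 = np.array([1.0, 2.0, 3.0])  # hypothetical 'x_grad_autosplit_0'
g1 = np.array([0.5, 0.5, 0.5])  # hypothetical 'x_grad_autosplit_1'
print(gather_autosplit_grads([g0, g1]))  # [1.5 2.5 3.5]
```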
...@@ -30,9 +30,9 @@ class OpInfo(object): ...@@ -30,9 +30,9 @@ class OpInfo(object):
self._defs = dict() self._defs = dict()
self._targets = set() self._targets = set()
def add_def(self, idx, op_def): def add_def(self, index, op_def):
"""Add a operator definition.""" """Add a operator definition."""
self._defs[idx] = op_def self._defs[index] = op_def
def add_target(self, target): def add_target(self, target):
"""Add an extra target relied by inputs.""" """Add an extra target relied by inputs."""
...@@ -74,13 +74,14 @@ class OpDef(object): ...@@ -74,13 +74,14 @@ class OpDef(object):
# Create outputs. # Create outputs.
if outputs is None: if outputs is None:
outputs = [] outputs = []
current_ws = workspace.get_workspace()
name_scope = context.get_name_scope() name_scope = context.get_name_scope()
for i in range(num_outputs): for i in range(num_outputs):
outputs.append(TensorRef( outputs.append(TensorRef(
workspace.get_dummy_name( current_ws.unique_name(
name_scope + (name if name else op_type), name_scope + (name if name else op_type),
suffix=':{}'.format(i), suffix=':{}'.format(i),
domain='Tensor'))) namespace='Tensor')))
else: else:
outputs = nest.flatten(outputs) outputs = nest.flatten(outputs)
num_outputs = len(outputs) num_outputs = len(outputs)
...@@ -124,13 +125,13 @@ class OpDef(object): ...@@ -124,13 +125,13 @@ class OpDef(object):
return spec_func(arguments, inputs, outputs) return spec_func(arguments, inputs, outputs)
@staticmethod @staticmethod
def get_index_and_name(prefix='Op'): def get_index_and_name():
"""Return an unique op name and index.""" """Return an unique op name and index."""
name = workspace.get_dummy_name( name = workspace.get_workspace().unique_name(
prefix, domain='Operator', zero_based=False) 'Op', namespace='Op', zero_based=False)
return int(name.split('_')[-1]), name return int(name.split('_')[-1]), name
@staticmethod @staticmethod
def get_name(prefix='Op'): def get_name():
"""Return an unique op name.""" """Return an unique op name."""
return OpDef.get_index_and_name(prefix)[1] return OpDef.get_index_and_name()[1]
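The index/name pair relies on the workspace counter appending ``_<counter>`` to the requested name; the sketch below mirrors only the parsing step (the counter itself lives in the backend, and the example names are illustrative):

```python
def parse_op_index(name):
    """Recover the numeric index from an auto-generated op name."""
    return int(name.split('_')[-1])

assert parse_op_index('Op_3') == 3
assert parse_op_index('Conv2d_12') == 12
```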
...@@ -190,24 +190,28 @@ def conv_spec(args, inputs, outputs): ...@@ -190,24 +190,28 @@ def conv_spec(args, inputs, outputs):
out_shape = None out_shape = None
try: try:
out_shape = inputs[0].shape[:] out_shape = inputs[0].shape[:]
num_axes = len(out_shape) - 2
channel_axis = 1 if args['data_format'] == 'NCHW' else -1 channel_axis = 1 if args['data_format'] == 'NCHW' else -1
spatial_axis = 2 if args['data_format'] == 'NCHW' else 1 spatial_axis = 2 if args['data_format'] == 'NCHW' else 1
if 'out_channels' in args: if 'out_channels' in args:
out_shape[channel_axis] = args['out_channels'] out_shape[channel_axis] = args['out_channels']
else: else:
out_shape[channel_axis] = inputs[1].shape[0] out_shape[channel_axis] = inputs[1].shape[0]
for i in range(len(out_shape) - 2): for i in range(num_axes):
input_size = out_shape[i + spatial_axis] try:
k = args['kernel_shape'][i] k = args['kernel_shape'][i]
s = args['strides'][i] s = args['strides'][i]
pl, pr = args['pads'][i], args['pads'][i + 2] d = args['dilations'][i]
dk, dp = (k - 1) + 1, pl + pr in_size = out_shape[i + spatial_axis]
if 'SAME' not in args['padding']: k_size = d * (k - 1) + 1
out_shape[i + spatial_axis] = \ if 'SAME' not in args['padding']:
int(float(input_size + dp - dk) / s) + 1 pad_size = args['pads'][i] + args['pads'][i + num_axes]
else: out_size = (in_size + pad_size - k_size) // s + 1
out_shape[i + spatial_axis] = \ else:
int(float(input_size + s - 1) / s) out_size = (in_size + s - 1) // s
except IndexError:
out_size = None
out_shape[i + spatial_axis] = out_size
except (TypeError, IndexError): except (TypeError, IndexError):
pass pass
outputs[0].shape = out_shape outputs[0].shape = out_shape
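The refactored loop above is the usual convolution output-size arithmetic; here is a plain-Python restatement for the explicit-padding branch (the helper name is illustrative, not a dragon API):

```python
def conv_out_size(in_size, kernel, stride, dilation, pad_l, pad_r):
    """Output size of a convolution with explicit padding."""
    k_size = dilation * (kernel - 1) + 1
    return (in_size + pad_l + pad_r - k_size) // stride + 1

# 224 input, 7x7 kernel, stride 2, pad 3 on both sides -> 112
print(conv_out_size(224, kernel=7, stride=2, dilation=1, pad_l=3, pad_r=3))
```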
...@@ -220,30 +224,33 @@ def conv_transpose_spec(args, inputs, outputs): ...@@ -220,30 +224,33 @@ def conv_transpose_spec(args, inputs, outputs):
out_shape = None out_shape = None
try: try:
out_shape = inputs[0].shape[:] out_shape = inputs[0].shape[:]
num_axes = len(out_shape) - 2
channel_axis = 1 if args['data_format'] == 'NCHW' else -1 channel_axis = 1 if args['data_format'] == 'NCHW' else -1
spatial_axis = 2 if args['data_format'] == 'NCHW' else 1 spatial_axis = 2 if args['data_format'] == 'NCHW' else 1
if 'out_channels' in args: if 'out_channels' in args:
out_shape[channel_axis] = args['out_channels'] out_shape[channel_axis] = args['out_channels']
else: else:
out_shape[channel_axis] = inputs[1].shape[1] out_shape[channel_axis] = inputs[1].shape[1]
for i in range(len(out_shape) - 2): for i in range(num_axes):
k = args['kernel_shape'][i] try:
s = args['strides'][i] k = args['kernel_shape'][i]
d = args['dilations'][i] s = args['strides'][i]
pl, pr = args['pads'][i], args['pads'][i + 2] d = args['dilations'][i]
dk, dp = d * (k - 1) + 1, pl + pr in_size = out_shape[i + spatial_axis]
input_size = out_shape[i + spatial_axis] k_size = d * (k - 1) + 1
if 'SAME' not in args['padding']: if 'SAME' not in args['padding']:
out_shape[i + spatial_axis] = s * \ pad_size = args['pads'][i] + args['pads'][i + num_axes]
(input_size - 1) + dk - dp out_size = s * (in_size - 1) + k_size - pad_size
else: if 'output_padding' in args and args['output_padding']:
out_shape[i + spatial_axis] = None out_size += args['output_padding'][i]
if args['output_padding'] is not None: else:
out_shape[i + spatial_axis] = \ if 'output_shape' in args and args['output_shape']:
s * (input_size - 1) + dk + \ out_size = args['output_shape'][i]
args['output_padding'][i] else:
elif args['output_shape'] is not None: out_size = None
out_shape[i + spatial_axis] = args['output_shape'][i] except IndexError:
out_size = None
out_shape[i + spatial_axis] = out_size
except (TypeError, IndexError): except (TypeError, IndexError):
pass pass
outputs[0].shape = out_shape outputs[0].shape = out_shape
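And the transposed counterpart, mirroring the branch that honors ``output_padding`` (again only a sketch of the arithmetic, not dragon's implementation):

```python
def conv_transpose_out_size(in_size, kernel, stride, dilation,
                            pad_l, pad_r, output_padding=0):
    """Output size of a transposed convolution with explicit padding."""
    k_size = dilation * (kernel - 1) + 1
    return stride * (in_size - 1) + k_size - (pad_l + pad_r) + output_padding

# Inverts the example above: 112 -> 224 with the matching configuration.
print(conv_transpose_out_size(112, kernel=7, stride=2, dilation=1,
                              pad_l=3, pad_r=3, output_padding=1))
```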
...@@ -606,21 +613,24 @@ def pool_spec(args, inputs, outputs): ...@@ -606,21 +613,24 @@ def pool_spec(args, inputs, outputs):
out_shape = None out_shape = None
try: try:
out_shape = inputs[0].shape[:] out_shape = inputs[0].shape[:]
num_axes = len(out_shape) - 2
spatial_axis = 2 if args['data_format'] == 'NCHW' else 1 spatial_axis = 2 if args['data_format'] == 'NCHW' else 1
for i in range(len(out_shape) - 2): for i in range(num_axes):
k = args['kernel_shape'][i]
s = args['strides'][i]
pl, pr = args['pads'][i], args['pads'][i + 2]
if not args['global_pooling']: if not args['global_pooling']:
floor_or_ceil = math.ceil if args['ceil_mode'] else math.floor try:
if 'SAME' not in args['padding']: k = args['kernel_shape'][i]
in_size = out_shape[i + spatial_axis] + pl + pr s = args['strides'][i]
out_size = int(floor_or_ceil(float(in_size - k) / s) + 1)
out_shape[i + spatial_axis] = out_size
else:
in_size = out_shape[i + spatial_axis] in_size = out_shape[i + spatial_axis]
out_size = int(floor_or_ceil(float(in_size) / s)) if 'SAME' not in args['padding']:
out_shape[i + spatial_axis] = out_size floor_or_ceil = math.ceil if args['ceil_mode'] else math.floor
pad_size = args['pads'][i] + args['pads'][i + num_axes]
out_size = float(in_size + pad_size - k) / float(s) + 1
out_size = floor_or_ceil(out_size)
else:
out_size = math.ceil(float(in_size) / float(s))
except IndexError:
out_size = None
out_shape[i + spatial_axis] = out_size
else: else:
out_shape[i + spatial_axis] = 1 out_shape[i + spatial_axis] = 1
except (TypeError, IndexError): except (TypeError, IndexError):
...@@ -959,14 +969,14 @@ def stack_spec(args, inputs, outputs): ...@@ -959,14 +969,14 @@ def stack_spec(args, inputs, outputs):
@register('Tile') @register('Tile')
def tile_spec(args, inputs, outputs): def tile_spec(args, inputs, outputs):
outputs[0].dtype = inputs[0].dtype outputs[0].dtype = inputs[0].dtype
multiples = args['multiples'] repeats = args['repeats']
if multiples is not None: if repeats is not None:
try: try:
out_shape = inputs[0].shape[:] out_shape = inputs[0].shape[:]
for i, multiple in enumerate(multiples): for i, size in enumerate(repeats):
if i < len(out_shape): if i < len(out_shape):
try: try:
out_shape[i] *= multiple out_shape[i] *= size
except TypeError: except TypeError:
out_shape[i] = None out_shape[i] = None
outputs[0].shape = out_shape outputs[0].shape = out_shape
......
...@@ -21,6 +21,7 @@ from dragon.core.framework import context ...@@ -21,6 +21,7 @@ from dragon.core.framework import context
from dragon.core.framework import types from dragon.core.framework import types
from dragon.core.framework import workspace from dragon.core.framework import workspace
from dragon.core.proto import dragon_pb2 from dragon.core.proto import dragon_pb2
from dragon.core.util import math_util
from dragon.core.util import nest from dragon.core.util import nest
...@@ -45,11 +46,9 @@ class Tensor(types.TensorMetaclass): ...@@ -45,11 +46,9 @@ class Tensor(types.TensorMetaclass):
The optional data type. The optional data type.
""" """
self._op = None self._op, self._grad = None, None
self._grad = None self._name, self._shape, self._dtype = None, None, None
self.name = name self.name, self.shape, self.dtype = name, shape, dtype
self.shape = shape
self.dtype = dtype
@property @property
def dtype(self): def dtype(self):
...@@ -112,8 +111,8 @@ class Tensor(types.TensorMetaclass): ...@@ -112,8 +111,8 @@ class Tensor(types.TensorMetaclass):
if value != '': if value != '':
value = value if value else 'Tensor' value = value if value else 'Tensor'
name_scope = context.get_name_scope() name_scope = context.get_name_scope()
self._name = workspace.get_dummy_name( self._name = workspace.get_workspace().unique_name(
name_scope + value, domain='Tensor') name_scope + value, namespace='Tensor')
else: else:
# Set it manually for some cases # Set it manually for some cases
self._name = value self._name = value
...@@ -142,8 +141,6 @@ class Tensor(types.TensorMetaclass): ...@@ -142,8 +141,6 @@ class Tensor(types.TensorMetaclass):
The shape. The shape.
""" """
if not hasattr(self, '_shape'):
self._shape = None
return self._shape return self._shape
@shape.setter @shape.setter
...@@ -166,6 +163,22 @@ class Tensor(types.TensorMetaclass): ...@@ -166,6 +163,22 @@ class Tensor(types.TensorMetaclass):
else: else:
self._shape = value self._shape = value
@property
def size(self):
"""Return the total number of elements in this tensor.
Returns
-------
int
The total count of elements.
"""
if self._shape is None:
return 0
if None in self._shape:
return numpy.inf
return math_util.prod(self._shape)
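The new ``size`` property adopts the convention that a fully unknown shape counts as 0 elements and a partially known one as infinity; a standalone illustration of that rule:

```python
import math

def symbolic_size(shape):
    """Element count under the convention used by ``Tensor.size``."""
    if shape is None:
        return 0
    if None in shape:
        return math.inf
    count = 1
    for dim in shape:
        count *= dim
    return count

print(symbolic_size(None), symbolic_size([2, None]), symbolic_size([2, 3, 4]))
# 0 inf 24
```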
def astype(self, dtype, inplace=False): def astype(self, dtype, inplace=False):
"""Cast the data type to a specific one. """Cast the data type to a specific one.
...@@ -186,7 +199,6 @@ class Tensor(types.TensorMetaclass): ...@@ -186,7 +199,6 @@ class Tensor(types.TensorMetaclass):
`dragon.cast(...)`_ : Cast the data type of input. `dragon.cast(...)`_ : Cast the data type of input.
""" """
pass
def constant(self, value=0): def constant(self, value=0):
r"""Register as a variable with constant initializer. r"""Register as a variable with constant initializer.
...@@ -219,7 +231,6 @@ class Tensor(types.TensorMetaclass): ...@@ -219,7 +231,6 @@ class Tensor(types.TensorMetaclass):
`dragon.copy(...)`_ : Copy the value to ref. `dragon.copy(...)`_ : Copy the value to ref.
""" """
pass
def get_value(self): def get_value(self):
"""Copy the data from storage. """Copy the data from storage.
...@@ -229,12 +240,7 @@ class Tensor(types.TensorMetaclass): ...@@ -229,12 +240,7 @@ class Tensor(types.TensorMetaclass):
numpy.ndarray numpy.ndarray
The deep copied value. The deep copied value.
See Also
--------
`dragon.workspace.fetch_tensor(...)`_ : Fetch the value of given tensor.
""" """
pass
def glorot_normal(self, scale=2.): def glorot_normal(self, scale=2.):
r"""Register as a variable with glorot normal initializer. r"""Register as a variable with glorot normal initializer.
...@@ -326,7 +332,6 @@ class Tensor(types.TensorMetaclass): ...@@ -326,7 +332,6 @@ class Tensor(types.TensorMetaclass):
`dragon.reshape(...)`_ : Change the dimensions of input. `dragon.reshape(...)`_ : Change the dimensions of input.
""" """
pass
def set_value(self, value): def set_value(self, value):
"""Feed the const value to the storage. """Feed the const value to the storage.
...@@ -341,12 +346,7 @@ class Tensor(types.TensorMetaclass): ...@@ -341,12 +346,7 @@ class Tensor(types.TensorMetaclass):
dragon.Tensor dragon.Tensor
The self. The self.
See Also
--------
`dragon.workspace.feed_tensor(...)`_ : Feed the value to the given tensor.
""" """
pass
def truncated_normal(self, mean=0, std=1): def truncated_normal(self, mean=0, std=1):
r"""Register as a variable with truncated normal initializer. r"""Register as a variable with truncated normal initializer.
...@@ -407,7 +407,7 @@ class Tensor(types.TensorMetaclass): ...@@ -407,7 +407,7 @@ class Tensor(types.TensorMetaclass):
Parameters Parameters
---------- ----------
value : Union[number, Sequence, numpy.ndarray] value : array_like
The value to convert. The value to convert.
dtype: str, optional dtype: str, optional
The optional data type. The optional data type.
...@@ -420,16 +420,22 @@ class Tensor(types.TensorMetaclass): ...@@ -420,16 +420,22 @@ class Tensor(types.TensorMetaclass):
The constant contains the value. The constant contains the value.
""" """
return Tensor('', dtype=dtype)._from_constant(value, name) if not isinstance(value, numpy.ndarray):
value = numpy.array(value, dtype if dtype else 'float32')
return TensorRef(
name=workspace.get_workspace().unique_name(
name=context.get_name_scope() + (name if name else 'Const'),
suffix=':0',
namespace='Tensor'),
shape=list(value.shape),
dtype=str(value.dtype),
).set_value(value)
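For reference, this is roughly what the conversion path does to a Python value before registering it: normalize to a numpy array (defaulting to float32), then derive the shape and dtype for the symbolic tensor. A standalone sketch that stops short of touching the workspace:

```python
import numpy as np

def normalize_constant(value, dtype=None):
    """Mirror the numpy normalization performed by ``convert_to``."""
    if not isinstance(value, np.ndarray):
        value = np.array(value, dtype if dtype else 'float32')
    return value, list(value.shape), str(value.dtype)

array, shape, dtype = normalize_constant([[1, 2], [3, 4]])
print(shape, dtype)  # [2, 2] float32
```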
def _register_as(self, type, **kwargs): def _register_as(self, type, **kwargs):
"""Fill self with the specific type of filler.""" """Fill self with the specific type of filler."""
filler = dragon_pb2.TensorFillerProto() filler = dragon_pb2.FillerInfo()
filler.tensor = self.name
filler.type = type.lower() filler.type = type.lower()
if filler.type in ['placeholder', 'variable']: if filler.type == 'constant':
pass
elif filler.type == 'constant':
filler.value = kwargs['value'] if 'value' in kwargs else 0 filler.value = kwargs['value'] if 'value' in kwargs else 0
elif filler.type in ['normal', 'gaussian']: elif filler.type in ['normal', 'gaussian']:
filler.mean = kwargs['mean'] if 'mean' in kwargs else 0 filler.mean = kwargs['mean'] if 'mean' in kwargs else 0
...@@ -438,46 +444,59 @@ class Tensor(types.TensorMetaclass): ...@@ -438,46 +444,59 @@ class Tensor(types.TensorMetaclass):
elif filler.type == 'uniform': elif filler.type == 'uniform':
filler.low = kwargs['low'] if 'low' in kwargs else 0 filler.low = kwargs['low'] if 'low' in kwargs else 0
filler.high = kwargs['high'] if 'high' in kwargs else 1 filler.high = kwargs['high'] if 'high' in kwargs else 1
filler.type = 'uniform' elif filler.type == 'truncated_normal':
elif filler.type in ['truncated_normal', 'truncatednormal']:
filler.mean = kwargs['mean'] if 'mean' in kwargs else 0 filler.mean = kwargs['mean'] if 'mean' in kwargs else 0
filler.std = kwargs['std'] if 'std' in kwargs else 1 filler.std = kwargs['std'] if 'std' in kwargs else 1
filler.low = filler.mean - 2.0 * filler.std filler.low = filler.mean - 2.0 * filler.std
filler.high = filler.mean + 2.0 * filler.std filler.high = filler.mean + 2.0 * filler.std
filler.type = 'truncated_normal'
elif filler.type == 'parameterized_truncated_normal':
filler.mean = kwargs['mean'] if 'mean' in kwargs else 0
filler.std = kwargs['std'] if 'std' in kwargs else 1
filler.low = kwargs['low'] if 'low' in kwargs else -2.0
filler.high = kwargs['high'] if 'high' in kwargs else 2.0
elif filler.type in ['glorot_uniform', 'xavier']: elif filler.type in ['glorot_uniform', 'xavier']:
filler.scale = kwargs['scale'] if 'scale' in kwargs else 3.0 filler.scale = kwargs['scale'] if 'scale' in kwargs else 3
elif filler.type in ['glorot_normal', 'msra']: elif filler.type in ['glorot_normal', 'msra']:
filler.scale = kwargs['scale'] if 'scale' in kwargs else 2.0 filler.scale = kwargs['scale'] if 'scale' in kwargs else 2
else: workspace.get_workspace().create_tensor(self.name, filler)
raise ValueError('Unknown filler type: {}'.format(filler.type))
workspace.create_filler(filler)
return self return self
def _from_constant(self, value, name=None):
"""Convert the value to a tensor."""
if not isinstance(value, numpy.ndarray):
value = numpy.array(value, self.dtype if self.dtype else 'float32')
return TensorRef(
name=workspace.get_dummy_name(
basename=context.get_name_scope() +
(name if name else 'Const'),
suffix=':0',
domain='Tensor'),
shape=list(value.shape),
dtype=str(value.dtype),
).set_value(value)
def __add__(self, other): def __add__(self, other):
pass r"""Compute the element-wise addition.
.. math:: \text{out} = \text{self} + \text{other}
Parameters
----------
other : Union[dragon.Tensor, number]
The value to add.
Returns
-------
dragon.Tensor
The output tensor.
See Also
--------
`dragon.math.add(...)`_ : Compute the element-wise addition.
"""
def __div__(self, other): def __div__(self, other):
pass r"""Compute the element-wise division.
.. math:: \text{out} = \text{self} \div \text{other}
Parameters
----------
other : Union[dragon.Tensor, number]
The value to divide.
Returns
-------
dragon.Tensor
The output tensor.
See Also
--------
`dragon.math.div(...)`_ : Compute the element-wise division.
"""
def __float__(self): def __float__(self):
"""Return a float python scalar. """Return a float python scalar.
...@@ -491,13 +510,69 @@ class Tensor(types.TensorMetaclass): ...@@ -491,13 +510,69 @@ class Tensor(types.TensorMetaclass):
return float(self.get_value()) return float(self.get_value())
def __ge__(self, other): def __ge__(self, other):
pass r"""Compute element-wise greater-equal comparison.
.. math:: \text{out} = (\text{self} \geq \text{other})
Parameters
----------
other : Union[dragon.Tensor, number]
The value to compare.
Returns
-------
dragon.Tensor
The output tensor.
See Also
--------
`dragon.math.greater_equal(...)`_ : Compute element-wise greater-equal comparison.
"""
def __getitem__(self, item): def __getitem__(self, item):
pass """Select the elements at the specific indices.
Parameters
----------
item : Union[int, slice, dragon.Tensor]
The indices.
Returns
-------
dragon.Tensor
The output tensor.
See Also
--------
`dragon.slice(...)`_ : Select the elements according to the given sections.
See Also
--------
`dragon.masked_select(...)`_ : Select the elements where the given mask is 1.
"""
def __gt__(self, other): def __gt__(self, other):
pass r"""Compute element-wise greater comparison.
.. math:: \text{out} = (\text{self} > \text{other})
Parameters
----------
other : Union[dragon.Tensor, number]
The value to compare.
Returns
-------
dragon.Tensor
The output tensor.
See Also
--------
`dragon.math.greater(...)`_ : Compute element-wise greater comparison.
"""
def __hash__(self): def __hash__(self):
return id(self) return id(self)
...@@ -513,20 +588,105 @@ class Tensor(types.TensorMetaclass): ...@@ -513,20 +588,105 @@ class Tensor(types.TensorMetaclass):
""" """
return int(self.get_value()) return int(self.get_value())
def __le__(self, other):
r"""Compute element-wise less-equal comparison.
.. math:: \text{out} = (\text{self} \leq \text{other})
Parameters
----------
other : Union[dragon.Tensor, number]
The value to compare.
Returns
-------
dragon.Tensor
The output tensor.
See Also
--------
`dragon.math.less_equal(...)`_ : Compute element-wise less-equal comparison.
"""
def __lt__(self, other): def __lt__(self, other):
pass r"""Compute element-wise less comparison.
def __le__(self, other): .. math:: \text{out} = (\text{self} < \text{other})
pass
Parameters
----------
other : Union[dragon.Tensor, number]
The value to compare.
Returns
-------
dragon.Tensor
The output tensor.
See Also
--------
`dragon.math.less(...)`_ : Compute element-wise less comparison.
"""
def __mul__(self, other): def __mul__(self, other):
pass r"""Compute the element-wise multiplication.
.. math:: \text{out} = \text{self} \times \text{other}
Parameters
----------
other : Union[dragon.Tensor, number]
The value to multiply.
Returns
-------
dragon.Tensor
The output tensor.
See Also
--------
`dragon.math.mul(...)`_ : Compute the element-wise multiplication.
"""
def __neg__(self): def __neg__(self):
pass r"""Compute the element-wise negative.
.. math:: y = -x
Returns
-------
dragon.Tensor
The output tensor.
See Also
--------
`dragon.math.negative(...)`_ : Compute the element-wise negative.
"""
def __radd__(self, other): def __radd__(self, other):
pass r"""Compute the element-wise addition.
.. math:: \text{out} = \text{other} + \text{self}
Parameters
----------
other : Union[dragon.Tensor, number]
The value to add.
Returns
-------
dragon.Tensor
The output tensor.
See Also
--------
`dragon.math.add(...)`_ : Compute the element-wise addition.
"""
def __repr__(self): def __repr__(self):
shape_str = ('(' + ', '.join( shape_str = ('(' + ', '.join(
...@@ -538,25 +698,108 @@ class Tensor(types.TensorMetaclass): ...@@ -538,25 +698,108 @@ class Tensor(types.TensorMetaclass):
.format(self.name, shape_str, self.dtype) .format(self.name, shape_str, self.dtype)
def __rdiv__(self, other): def __rdiv__(self, other):
pass r"""Compute the element-wise division.
.. math:: \text{out} = \text{other} \div \text{self}
Parameters
----------
other : Union[dragon.Tensor, number]
The value to be divided.
Returns
-------
dragon.Tensor
The output tensor.
See Also
--------
`dragon.math.div(...)`_ : Compute the element-wise division.
"""
def __rmul__(self, other): def __rmul__(self, other):
pass r"""Compute the element-wise multiplication.
.. math:: \text{out} = \text{other} \times \text{self}
Parameters
----------
other : Union[dragon.Tensor, number]
The value to multiply.
Returns
-------
dragon.Tensor
The output tensor.
See Also
--------
`dragon.math.mul(...)`_ : Compute the element-wise multiplication.
"""
def __rsub__(self, other): def __rsub__(self, other):
pass r"""Compute the element-wise subtraction.
.. math:: \text{out} = \text{other} - \text{self}
def __rtruediv__(self, other): Parameters
return self.__div__(other) ----------
other : Union[dragon.Tensor, number]
The value to be subtracted.
Returns
-------
dragon.Tensor
The output tensor.
See Also
--------
`dragon.math.sub(...)`_ : Compute the element-wise subtraction.
"""
def __setitem__(self, key, value): def __setitem__(self, key, value):
pass """Set the value at the specific indices.
Parameters
----------
key : Union[int, slice, dragon.Tensor]
The indices.
value : number or dragon.Tensor
The value.
See Also
--------
`dragon.assign(...)`_ : Assign the value to ref.
See Also
--------
`dragon.masked_assign(...)`_ : Assign the value to ref where mask is 1.
"""
def __sub__(self, other): def __sub__(self, other):
pass r"""Compute the element-wise subtraction.
.. math:: \text{out} = \text{self} - \text{other}
Parameters
----------
other : Union[dragon.Tensor, number]
The value to subtract.
Returns
-------
dragon.Tensor
The output tensor.
def __truediv__(self, other): See Also
return self.__div__(other) --------
`dragon.math.sub(...)`_ : Compute the element-wise subtraction.
"""
class TensorRef(object): class TensorRef(object):
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Do back-propagation from the eager expressions.""" """Do back-propagation from the executed operations."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
...@@ -35,13 +35,23 @@ class Tape(object): ...@@ -35,13 +35,23 @@ class Tape(object):
self._defs = [] self._defs = []
self._parent = parent self._parent = parent
self._watched = set() self._watched = set()
self._empty_grads = set()
self._gc = workspace.get_workspace().collectors self._gc = workspace.get_workspace().collectors
self._retain_graph = False self._retain_graph = False
@property
def empty_grads(self):
"""Return the recorded empty grads."""
return list(self._empty_grads)
def add_def(self, op_def): def add_def(self, op_def):
"""Add a new def.""" """Add a new def."""
self._defs.append(op_def) self._defs.append(op_def)
def add_empty_grad(self, tensor_id):
"""Add an empty grad for optimization."""
self._empty_grads.add(tensor_id)
def is_watched(self, tensor): def is_watched(self, tensor):
"""Return true if tensor is watched.""" """Return true if tensor is watched."""
return tensor.id in self._watched return tensor.id in self._watched
...@@ -53,7 +63,7 @@ class Tape(object): ...@@ -53,7 +63,7 @@ class Tape(object):
def __del__(self): def __del__(self):
"""Release the resources.""" """Release the resources."""
for op_def in self._defs: for op_def in self._defs:
self._gc.OPERATOR.collect(op_def.name) self._gc.OP.collect(op_def.name)
for y in op_def.output: for y in op_def.output:
if y not in op_def.input: if y not in op_def.input:
self._gc.TENSOR.collect(y) self._gc.TENSOR.collect(y)
...@@ -113,36 +123,23 @@ class GradientTape(object): ...@@ -113,36 +123,23 @@ class GradientTape(object):
self._pop_tape() self._pop_tape()
# Collect gradient info. # Collect gradient info.
inputs, outputs = [], [] xs, ys, grad_ys = nest.flatten(sources), nest.flatten(target), []
targets, ignores = [], []
target = nest.flatten(target)
sources = nest.flatten(sources)
sources_is_watched = []
if output_gradients is not None: if output_gradients is not None:
output_gradients = nest.flatten(output_gradients) for tensor, grad_tensor in zip(ys, nest.flatten(output_gradients)):
for value, grad in zip(target, output_gradients): if grad_tensor.shape != tensor.shape:
if grad.shape != value.shape:
raise ValueError( raise ValueError(
'Except the dimensions of <output_gradient> is {}, ' 'Expected the dimensions of output gradient to be {}, '
'got {}.'.format(value.shape, grad.shape) 'got {}.'.format(tensor.shape, grad_tensor.shape))
) grad_ys.append(grad_tensor.id)
inputs.append(grad.id)
for t in target:
targets.append(t.id)
for s in sources:
sources_is_watched.append(self._tape.is_watched(s))
if not s.requires_grad and not sources_is_watched[-1]:
ignores.append(s.id + '_grad')
else:
outputs.append(s.id)
# Run the gradient ops sequentially. # Run the gradient ops sequentially.
workspace.run_backward( current_ws = workspace.get_workspace()
forward_ops=self._tape._defs, current_ws.run_backward(
targets=targets, op_defs=self._tape._defs,
sources=outputs, targets=[y.id for y in ys],
input_grads=inputs, sources=[x.id for x in xs],
ignored_grads=ignores, input_grads=grad_ys,
empty_grads=self._tape.empty_grads,
) )
# Remove the tape to release resources. # Remove the tape to release resources.
...@@ -150,12 +147,7 @@ class GradientTape(object): ...@@ -150,12 +147,7 @@ class GradientTape(object):
self._tape = None self._tape = None
# Pack the gradients. # Pack the gradients.
return [_steal_grad_ref(s, w) for s, w return [_steal_grad(current_ws, x) for x in xs]
in zip(sources, sources_is_watched)]
def replay(self):
"""Run the operators stored in the tape."""
workspace.run_operator(self._tape._defs)
def reset(self): def reset(self):
"""Destroy the tape and push a new one.""" """Destroy the tape and push a new one."""
...@@ -187,8 +179,7 @@ class GradientTape(object): ...@@ -187,8 +179,7 @@ class GradientTape(object):
if self._tape is None: if self._tape is None:
raise RuntimeError( raise RuntimeError(
'GradientTape.gradient can only be called ' 'GradientTape.gradient can only be called '
'once on non-persistent tapes.' 'once on non-persistent tapes.')
)
for t in nest.flatten(tensor): for t in nest.flatten(tensor):
self._tape.watch(t) self._tape.watch(t)
...@@ -232,17 +223,13 @@ def pop_tape(): ...@@ -232,17 +223,13 @@ def pop_tape():
_GLOBAL_TAPE_STACK.pop() _GLOBAL_TAPE_STACK.pop()
def _steal_grad_ref(source, is_watched=False): def _steal_grad(ws, source):
if not source.requires_grad and not is_watched: """Steal the grad from backend."""
return None impl = ws.GetTensor(source.id + '_grad')
grad_id = source.id + '_grad' if impl is None:
grad_impl = workspace.get_workspace().GetTensor(grad_id)
if grad_impl is None:
return None return None
device = device_spec.DeviceSpec(*grad_impl.device) device = device_spec.DeviceSpec(*impl.device)
grad_ref = EagerTensor(own_storage=False, device=device) return EagerTensor(impl=impl, device=device)
grad_ref._id, grad_ref._impl = grad_id, grad_impl
return grad_ref
# Define a global stack to store the tapes of current thread. # Define a global stack to store the tapes of current thread.
......
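A minimal usage sketch of the rewritten backward entry point, assuming the tape and eager tensor are re-exported as ``dragon.GradientTape`` and ``dragon.EagerTensor`` and that the tape follows the usual context-manager protocol (these public spellings are assumptions, not confirmed by this diff):

```python
import dragon  # assumed public package alias

# Watch a leaf, record one op, then pull the gradient back from the backend.
x = dragon.EagerTensor([1., 2., 3.], dtype='float32')
with dragon.GradientTape() as tape:
    tape.watch(x)            # mark the non-trainable leaf as differentiable
    y = x * x                # the multiply def is appended to the tape
dx = tape.gradient(y, x)     # runs the recorded defs backward, steals 'x_grad'
print(dx)                    # expected to equal 2 * x
```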
...@@ -32,13 +32,13 @@ def from_dlpack(dlpack): ...@@ -32,13 +32,13 @@ def from_dlpack(dlpack):
The tensor with the dlpack data. The tensor with the dlpack data.
""" """
ws = workspace.get_workspace() current_ws = workspace.get_workspace()
ref = EagerTensor(device=None) # Hack the constructor. tensor = EagerTensor(device=None)
ref.__gc__ = ws.collectors.TENSOR tensor._gc = current_ws.collectors.TENSOR
ref._id = ref.__gc__.alloc('${DLPACK}') tensor._impl = current_ws.create_tensor(
ref._impl = ws.CreateTensor(ref._id).FromDLPack(dlpack) tensor._gc.alloc('${DLPACK}')).FromDLPack(dlpack)
ref._device = device_spec.DeviceSpec(*ref._impl.device) tensor._device = device_spec.DeviceSpec(*tensor._impl.device)
return ref return tensor
def to_dlpack(tensor, readonly=True): def to_dlpack(tensor, readonly=True):
......
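A hedged round-trip sketch of the rewritten constructor path; the import location of the public wrappers (guessed here as ``dragon.dlpack``) and the eager tensor entry point are assumptions:

```python
import dragon                    # assumed public package
from dragon import dlpack        # assumed re-export of the module above

x = dragon.EagerTensor([1., 2., 3.], dtype='float32')
capsule = dlpack.to_dlpack(x)    # export the backing storage as a capsule
y = dlpack.from_dlpack(capsule)  # wrap the same storage in a new tensor
print(y.dtype, y.shape)          # metadata is taken from the imported impl
```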
...@@ -18,7 +18,6 @@ from __future__ import print_function ...@@ -18,7 +18,6 @@ from __future__ import print_function
from dragon.core.eager import backprop from dragon.core.eager import backprop
from dragon.core.eager.tensor import EagerTensor from dragon.core.eager.tensor import EagerTensor
from dragon.core.framework import device_spec from dragon.core.framework import device_spec
from dragon.core.framework import config
from dragon.core.framework import context from dragon.core.framework import context
from dragon.core.framework import workspace from dragon.core.framework import workspace
from dragon.core.util import six from dragon.core.util import six
...@@ -33,21 +32,22 @@ def run_operator( ...@@ -33,21 +32,22 @@ def run_operator(
): ):
requires_grad = False requires_grad = False
input_names, output_names = [], [] input_names, output_names = [], []
tape = backprop.get_default_tape() default_tape = backprop.get_default_tape()
for x in inputs: for input in inputs:
input_names.append(x.id) input_names.append(input.id)
if tape is not None: if default_tape is not None:
if x.requires_grad: if input.requires_grad:
requires_grad = True requires_grad = True
elif tape.is_watched(x): elif default_tape.is_watched(input):
requires_grad = True requires_grad = True
else:
default_tape.add_empty_grad(input.id + '_grad')
if tape and tape._retain_graph: if default_tape and default_tape._retain_graph:
requires_grad = True requires_grad = True
# Allocate outputs. # Allocate outputs.
cfg = config.config()
ws = workspace.get_workspace() ws = workspace.get_workspace()
output_scope = context.get_eager_scope(requires_grad) output_scope = context.get_eager_scope(requires_grad)
gc = ws.collectors # Garbage collectors gc = ws.collectors # Garbage collectors
...@@ -57,31 +57,28 @@ def run_operator( ...@@ -57,31 +57,28 @@ def run_operator(
output_names.append(spec) output_names.append(spec)
else: else:
if isinstance(spec, device_spec.DeviceSpec): if isinstance(spec, device_spec.DeviceSpec):
output_id = gc.TENSOR.alloc(output_scope) impl = ws.create_tensor(gc.TENSOR.alloc(output_scope))
ref = EagerTensor(device=spec) outputs[i] = EagerTensor(device=spec, gc=gc.TENSOR, impl=impl)
ref.__gc__, ref._id = gc.TENSOR, output_id
ref._impl = ws.CreateTensor(output_id)
outputs[i] = ref
output_names.append(outputs[i].id) output_names.append(outputs[i].id)
# Generate the OpDef. # Generate OpDef.
op_def = op_def.DeriveTo(input_names, output_names) op_def = op_def.DeriveTo(input_names, output_names)
# Maybe record this operation for future developments. # Record the operation for computing gradients later.
if len(inputs) > 0 and no_grad is False: if len(inputs) > 0 and no_grad is False:
if requires_grad: if requires_grad:
for output in outputs: for output in outputs:
output.requires_grad = True output._requires_grad = True
op_def.name = gc.OPERATOR.alloc(op_def.type) op_def.name = gc.OP.alloc(op_def.type)
tape.add_def(op_def) default_tape.add_def(op_def)
else: else:
for output in outputs: for output in outputs:
output.requires_grad = False output._requires_grad = False
# Dispatch the computation. # Dispatch the computation.
if pre_callback is not None: if pre_callback is not None:
pre_callback(ws, op_def.name) pre_callback(ws, op_def.name)
ws.RunOperator(op_def, cfg.graph_verbosity > 0) ws.run_operator(op_def)
# Return the outputs. # Return the outputs.
return outputs if len(outputs) > 1 else outputs[0] return outputs if len(outputs) > 1 else outputs[0]
...@@ -20,7 +20,6 @@ import numpy ...@@ -20,7 +20,6 @@ import numpy
from dragon.core.autograph.tensor import Tensor from dragon.core.autograph.tensor import Tensor
from dragon.core.framework import context from dragon.core.framework import context
from dragon.core.framework import workspace from dragon.core.framework import workspace
from dragon.core.util import math_util
class EagerTensor(Tensor): class EagerTensor(Tensor):
...@@ -48,30 +47,18 @@ class EagerTensor(Tensor): ...@@ -48,30 +47,18 @@ class EagerTensor(Tensor):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
"""Create an ``EagerTensor``.""" """Create an ``EagerTensor``."""
super(Tensor, self).__init__() super(Tensor, self).__init__()
# Internal properties self._gc = kwargs.get('gc', None)
self._id = kwargs.get('id', None) self._impl = kwargs.get('impl', None)
self._name = kwargs.get('name', self._id) self._name = kwargs.get('name', None)
self._own_storage = kwargs.get('own_storage', True) self._device = kwargs.get('device', context.get_device_spec())
self._requires_grad = kwargs.get('requires_grad', False) self._requires_grad = kwargs.get('requires_grad', False)
self._requires_grad = kwargs.get('trainable', self._requires_grad) self._requires_grad = kwargs.get('trainable', self._requires_grad)
self._device = kwargs.get('device', context.get_device_spec()) self._is_leaf = False
self._const_size = None # Attribute to represent a leaf variable
# Constructor
if len(args) == 0: if len(args) == 0:
# >>> dragon.EagerTensor(shape=?, dtype=?)
shape = kwargs.get('shape', None) shape = kwargs.get('shape', None)
if shape is not None: if shape is not None:
self._from_shape(shape, kwargs.get('dtype', 'float32')) self._from_shape(shape, kwargs.get('dtype', 'float32'))
else:
if self._id is not None:
ws = workspace.get_workspace()
self.__gc__ = ws.collectors.TENSOR
self._impl = ws.CreateTensor(self._id)
else:
self.__gc__ = None
elif len(args) == 1: elif len(args) == 1:
# >>> dragon.EagerTensor(constant)
self._from_numpy( self._from_numpy(
args[0] if isinstance(args[0], numpy.ndarray) args[0] if isinstance(args[0], numpy.ndarray)
else numpy.array(args[0], kwargs.get('dtype', 'float32')), else numpy.array(args[0], kwargs.get('dtype', 'float32')),
...@@ -106,10 +93,7 @@ class EagerTensor(Tensor): ...@@ -106,10 +93,7 @@ class EagerTensor(Tensor):
@dtype.setter @dtype.setter
def dtype(self, value): def dtype(self, value):
raise RuntimeError( raise RuntimeError('Call ``astype(...)`` to change the data type.')
'<dtype> is a readonly property.\n'
'Call ``astype(...)`` to change the data type.'
)
@property @property
def id(self): def id(self):
...@@ -121,7 +105,7 @@ class EagerTensor(Tensor): ...@@ -121,7 +105,7 @@ class EagerTensor(Tensor):
The tensor identity. The tensor identity.
""" """
return self._id return self._impl.name
@property @property
def name(self): def name(self):
...@@ -133,7 +117,7 @@ class EagerTensor(Tensor): ...@@ -133,7 +117,7 @@ class EagerTensor(Tensor):
The tensor name. The tensor name.
""" """
return self._name return self._name or self._impl.id
@name.setter @name.setter
def name(self, value): def name(self, value):
...@@ -174,10 +158,7 @@ class EagerTensor(Tensor): ...@@ -174,10 +158,7 @@ class EagerTensor(Tensor):
@shape.setter @shape.setter
def shape(self, value): def shape(self, value):
raise RuntimeError( raise RuntimeError('Call ``reshape(...)`` to change the dimensions.')
'<shape> is a readonly property.\n'
'Call ``reshape(...)`` to change the dimensions.'
)
@property @property
def size(self): def size(self):
...@@ -211,7 +192,6 @@ class EagerTensor(Tensor): ...@@ -211,7 +192,6 @@ class EagerTensor(Tensor):
`dragon.cast(...)`_ : Cast the data type of input. `dragon.cast(...)`_ : Cast the data type of input.
""" """
pass
def constant(self, value=0): def constant(self, value=0):
r"""Fill self with a constant value. r"""Fill self with a constant value.
...@@ -229,7 +209,6 @@ class EagerTensor(Tensor): ...@@ -229,7 +209,6 @@ class EagerTensor(Tensor):
The self. The self.
""" """
pass
def copy(self): def copy(self):
"""Return a tensor with containing data copied. """Return a tensor with containing data copied.
...@@ -244,7 +223,6 @@ class EagerTensor(Tensor): ...@@ -244,7 +223,6 @@ class EagerTensor(Tensor):
`dragon.copy(...)`_ : Copy the value to ref. `dragon.copy(...)`_ : Copy the value to ref.
""" """
pass
def get_value(self): def get_value(self):
"""Return the value from storage. """Return the value from storage.
...@@ -275,7 +253,6 @@ class EagerTensor(Tensor): ...@@ -275,7 +253,6 @@ class EagerTensor(Tensor):
The self. The self.
""" """
pass
def glorot_uniform(self, mode='FAN_IN', scale=3.): def glorot_uniform(self, mode='FAN_IN', scale=3.):
r"""Fill self from a glorot uniform distribution. r"""Fill self from a glorot uniform distribution.
...@@ -298,7 +275,6 @@ class EagerTensor(Tensor): ...@@ -298,7 +275,6 @@ class EagerTensor(Tensor):
The self. The self.
""" """
pass
def numpy(self, readonly=True): def numpy(self, readonly=True):
"""Create a numpy array sharing the data. """Create a numpy array sharing the data.
...@@ -334,7 +310,6 @@ class EagerTensor(Tensor): ...@@ -334,7 +310,6 @@ class EagerTensor(Tensor):
The self. The self.
""" """
pass
def reshape(self, shape): def reshape(self, shape):
"""Return a tensor containing the same data with new shape. """Return a tensor containing the same data with new shape.
...@@ -354,7 +329,6 @@ class EagerTensor(Tensor): ...@@ -354,7 +329,6 @@ class EagerTensor(Tensor):
`dragon.reshape(...)`_ : Change the dimensions of input. `dragon.reshape(...)`_ : Change the dimensions of input.
""" """
pass
def set_value(self, value): def set_value(self, value):
"""Map the value to storage. """Map the value to storage.
...@@ -393,10 +367,9 @@ class EagerTensor(Tensor): ...@@ -393,10 +367,9 @@ class EagerTensor(Tensor):
The self. The self.
""" """
pass
def uniform(self, low=0, high=1): def uniform(self, low=0, high=1):
r"""Fill self from a uniform distribution. self.self__ = r"""Fill self from a uniform distribution.
.. math:: \text{self} \leftarrow U(\alpha, \beta) .. math:: \text{self} \leftarrow U(\alpha, \beta)
...@@ -413,38 +386,70 @@ class EagerTensor(Tensor): ...@@ -413,38 +386,70 @@ class EagerTensor(Tensor):
The self. The self.
""" """
pass
def _from_numpy(self, array, copy): def _from_numpy(self, array, copy):
"""Create impl from the numpy array.""" """Create impl from the numpy array."""
ws = workspace.get_workspace() ws = workspace.get_workspace()
array = array.copy() if copy else array array = array.copy() if copy else array
self._const_size = array.size self._const_size = array.size
self.__gc__ = ws.collectors.TENSOR self._gc, self._is_leaf = ws.collectors.TENSOR, True
self._id = self.__gc__.alloc(context.get_eager_scope()) self._impl = ws.create_tensor(self._gc.alloc(
self._impl = ws.CreateTensor(self._id).FromNumpy(array) context.get_eager_scope())).FromNumpy(array)
def _from_shape(self, shape, dtype): def _from_shape(self, shape, dtype):
"""Create impl from the shape and data type.""" """Create impl from the shape and data type."""
ws = workspace.get_workspace() ws = workspace.get_workspace()
self._const_size = math_util.prod(shape) self._gc, self._is_leaf = ws.collectors.TENSOR, True
self.__gc__ = ws.collectors.TENSOR self._impl = ws.create_tensor(self._gc.alloc(
self._id = self.__gc__.alloc(context.get_eager_scope()) context.get_eager_scope())).FromShape(shape, dtype)
self._impl = ws.CreateTensor(self._id).FromShape(shape, dtype)
def __add__(self, other): def __add__(self, other):
pass r"""Compute the element-wise addition.
.. math:: \text{out} = \text{self} + \text{other}
Parameters
----------
other : Union[dragon.EagerTensor, number]
The value to add.
Returns
-------
dragon.EagerTensor
The output tensor.
See Also
--------
`dragon.math.add(...)`_ : Compute the element-wise addition.
"""
def __del__(self): def __del__(self):
if not self._requires_grad or self._const_size: if (self._is_leaf or not self._requires_grad) and self._gc:
if self._own_storage and self._id: # Always reuse the leaf tensors.
# Always reuse the leaf variables or tensors # PyGC will detect them automatically.
# that do not require grad. self._gc.collect(self.id)
# PyGC will detect them automatically.
self.__gc__.collect(self._id)
def __div__(self, other): def __div__(self, other):
pass r"""Compute the element-wise division.
.. math:: \text{out} = \text{self} \div \text{other}
Parameters
----------
other : Union[dragon.EagerTensor, number]
The value to divide.
Returns
-------
dragon.EagerTensor
The output tensor.
See Also
--------
`dragon.math.div(...)`_ : Compute the element-wise division.
"""
def __float__(self): def __float__(self):
"""Return a float python scalar. """Return a float python scalar.
...@@ -455,30 +460,138 @@ class EagerTensor(Tensor): ...@@ -455,30 +460,138 @@ class EagerTensor(Tensor):
The float value. The float value.
""" """
if self.size == 1: return float(self.numpy())
return float(self.numpy())
raise TypeError('Only size-1 array can be converted to python scalar.')
def __ge__(self, other): def __ge__(self, other):
pass r"""Compute element-wise greater-equal comparison.
.. math:: \text{out} = (\text{self} \geq \text{other})
Parameters
----------
other : Union[dragon.EagerTensor, number]
The value to compare.
Returns
-------
dragon.EagerTensor
The output tensor.
See Also
--------
`dragon.math.greater_equal(...)`_ : Compute element-wise greater-equal comparison.
"""
def __getitem__(self, item): def __getitem__(self, item):
pass """Select the elements at the specific indices.
Parameters
----------
item : Union[int, slice, dragon.EagerTensor]
The indices.
Returns
-------
dragon.EagerTensor
The output tensor.
See Also
--------
`dragon.slice(...)`_ : Select the elements according to the given sections.
See Also
--------
`dragon.masked_select(...)`_ : Select the elements where the given mask is 1.
"""
def __gt__(self, other): def __gt__(self, other):
pass r"""Compute element-wise greater comparison.
.. math:: \text{out} = (\text{self} > \text{other})
Parameters
----------
other : Union[dragon.EagerTensor, number]
The value to compare.
Returns
-------
dragon.EagerTensor
The output tensor.
See Also
--------
`dragon.math.greater(...)`_ : Compute element-wise greater comparison.
"""
def __hash__(self): def __hash__(self):
return id(self) return id(self)
def __iadd__(self, other): def __iadd__(self, other):
pass r"""Compute the element-wise addition.
.. math:: \text{self} \mathrel{+}= \text{other}
Parameters
----------
other : Union[dragon.EagerTensor, number]
The value to add.
Returns
-------
dragon.EagerTensor
The self.
See Also
--------
`dragon.math.add(...)`_ : Compute the element-wise addition.
"""
def __idiv__(self, other): def __idiv__(self, other):
pass r"""Compute the element-wise division.
.. math:: \text{self} \mathrel{\div}= \text{other}
Parameters
----------
other : Union[dragon.EagerTensor, number]
The value to divide.
Returns
-------
dragon.EagerTensor
The self.
See Also
--------
`dragon.math.div(...)`_ : Compute the element-wise division.
"""
def __imul__(self, other): def __imul__(self, other):
pass r"""Compute the element-wise multiplication.
.. math:: \text{self} \mathrel{\times}= \text{other}
Parameters
----------
other : Union[dragon.EagerTensor, number]
The value to multiply.
Returns
-------
dragon.EagerTensor
The self.
See Also
--------
`dragon.math.mul(...)`_ : Compute the element-wise multiplication.
"""
def __int__(self): def __int__(self):
"""Return a int python scalar. """Return a int python scalar.
...@@ -492,22 +605,125 @@ class EagerTensor(Tensor): ...@@ -492,22 +605,125 @@ class EagerTensor(Tensor):
return int(self.__float__()) return int(self.__float__())
def __isub__(self, other): def __isub__(self, other):
pass r"""Compute the element-wise division.
def __lt__(self, other): .. math:: \text{self} \mathrel{-}= \text{other}
pass
Parameters
----------
other : Union[dragon.EagerTensor, number]
The value to subtract.
Returns
-------
dragon.EagerTensor
The self.
See Also
--------
`dragon.math.sub(...)`_ : Compute the element-wise subtraction.
"""
def __le__(self, other): def __le__(self, other):
pass r"""Compute element-wise less-equal comparison.
.. math:: \text{out} = (\text{self} \leq \text{other})
Parameters
----------
other : Union[dragon.EagerTensor, number]
The value to compare.
Returns
-------
dragon.EagerTensor
The output tensor.
See Also
--------
`dragon.math.less_equal(...)`_ : Compute element-wise less-equal comparison.
"""
def __lt__(self, other):
r"""Compute element-wise less comparison.
.. math:: \text{out} = (\text{self} < \text{other})
Parameters
----------
other : Union[dragon.EagerTensor, number]
The value to compare.
Returns
-------
dragon.EagerTensor
The output tensor.
See Also
--------
`dragon.math.less(...)`_ : Compute element-wise less comparison.
"""
def __mul__(self, other): def __mul__(self, other):
pass r"""Compute the element-wise multiplication.
.. math:: \text{out} = \text{self} \times \text{other}
Parameters
----------
other : Union[dragon.EagerTensor, number]
The value to multiply.
Returns
-------
dragon.EagerTensor
The output tensor.
See Also
--------
`dragon.math.mul(...)`_ : Compute the element-wise multiplication.
"""
def __neg__(self): def __neg__(self):
pass r"""Compute the element-wise negative.
.. math:: y = -x
Returns
-------
dragon.EagerTensor
The output tensor.
See Also
--------
`dragon.math.negative(...)`_ : Compute the element-wise negative.
"""
def __radd__(self, other): def __radd__(self, other):
pass r"""Compute the element-wise addition.
.. math:: \text{out} = \text{other} + \text{self}
Parameters
----------
other : Union[dragon.EagerTensor, number]
The value to add.
Returns
-------
dragon.EagerTensor
The output tensor.
See Also
--------
`dragon.math.add(...)`_ : Compute the element-wise addition.
"""
def __repr__(self): def __repr__(self):
array = self.numpy() array = self.numpy()
...@@ -523,22 +739,105 @@ class EagerTensor(Tensor): ...@@ -523,22 +739,105 @@ class EagerTensor(Tensor):
return content_str + meta_str return content_str + meta_str
def __rdiv__(self, other): def __rdiv__(self, other):
pass r"""Compute the element-wise division.
.. math:: \text{out} = \text{other} \div \text{self}
Parameters
----------
other : Union[dragon.EagerTensor, number]
The value to be divided.
Returns
-------
dragon.EagerTensor
The output tensor.
See Also
--------
`dragon.math.div(...)`_ : Compute the element-wise division.
"""
def __rmul__(self, other): def __rmul__(self, other):
pass r"""Compute the element-wise multiplication.
.. math:: \text{out} = \text{other} \times \text{self}
Parameters
----------
other : Union[dragon.EagerTensor, number]
The value to multiply.
Returns
-------
dragon.EagerTensor
The output tensor.
See Also
--------
`dragon.math.mul(...)`_ : Compute the element-wise multiplication.
"""
def __rsub__(self, other): def __rsub__(self, other):
pass r"""Compute the element-wise subtraction.
.. math:: \text{out} = \text{other} - \text{self}
Parameters
----------
other : Union[dragon.EagerTensor, number]
The value to be subtracted.
Returns
-------
dragon.EagerTensor
The output tensor.
def __rtruediv__(self, other): See Also
return self.__div__(other) --------
`dragon.math.sub(...)`_ : Compute the element-wise subtraction.
"""
def __setitem__(self, key, value): def __setitem__(self, key, value):
pass """Set the value at the specific indices.
Parameters
----------
key : Union[int, slice, dragon.EagerTensor]
The indices.
value : number or dragon.EagerTensor
The value.
See Also
--------
`dragon.assign(...)`_ : Assign the value to ref.
See Also
--------
`dragon.masked_assign(...)`_ : Assign the value to ref where mask is 1.
"""
def __sub__(self, other): def __sub__(self, other):
pass r"""Compute the element-wise subtraction.
.. math:: \text{out} = \text{self} - \text{other}
Parameters
----------
other : Union[dragon.EagerTensor, number]
The value to subtract.
Returns
-------
dragon.EagerTensor
The output tensor.
def __truediv__(self, other): See Also
return self.__div__(other) --------
`dragon.math.sub(...)`_ : Compute the element-wise subtraction.
"""
...@@ -15,10 +15,11 @@ from __future__ import print_function ...@@ -15,10 +15,11 @@ from __future__ import print_function
from dragon.core.framework import config from dragon.core.framework import config
from dragon.core.framework import device_spec from dragon.core.framework import device_spec
from dragon.core.framework import mapping
from dragon.core.util import tls from dragon.core.util import tls
def device(device_type, device_id=0): def device(device_type, device_index=0):
"""Context-manager to nest the the device spec. """Context-manager to nest the the device spec.
Examples: Examples:
...@@ -32,7 +33,7 @@ def device(device_type, device_id=0): ...@@ -32,7 +33,7 @@ def device(device_type, device_id=0):
---------- ----------
device_type : {'cpu', 'gpu', 'cuda', 'cnml'}, required device_type : {'cpu', 'gpu', 'cuda', 'cnml'}, required
The type of device. The type of device.
device_id : int, optional, default=0 device_index : int, optional, default=0
The index of the device. The index of the device.
Returns Returns
...@@ -41,13 +42,12 @@ def device(device_type, device_id=0): ...@@ -41,13 +42,12 @@ def device(device_type, device_id=0):
The current default device spec. The current default device spec.
""" """
device_type, device_id, device_type.lower(), device_id device_type = device_type.lower()
assert device_type in ('cpu', 'gpu', 'cuda', 'cnml') if device_type not in mapping.DEVICE_STRING_TO_DEVICE_TYPE:
if device_type == 'gpu': raise ValueError('Unsupported device type:', device_type)
device_type = 'cuda'
return _GLOBAL_DEVICE_STACK.get_controller({ return _GLOBAL_DEVICE_STACK.get_controller({
'device_type': device_type, 'device_type': mapping.DEVICE_STRING_TO_DEVICE_TYPE[device_type],
'device_index': device_id, 'device_index': device_index,
}) })
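Usage sketch of the renamed argument; note that 'gpu' is normalized to 'cuda' through the new mapping (the ``dragon.device`` spelling of this context manager is assumed):

```python
import dragon  # assumed public alias of this context manager

with dragon.device('gpu', device_index=0):
    # Ops and tensors created here default to cuda:0.
    x = dragon.EagerTensor([1., 2., 3.], dtype='float32')

with dragon.device('cpu'):
    y = dragon.EagerTensor([4., 5., 6.], dtype='float32')
```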
...@@ -96,21 +96,6 @@ def name_scope(name): ...@@ -96,21 +96,6 @@ def name_scope(name):
return _GLOBAL_NAME_STACK.get_controller(default) return _GLOBAL_NAME_STACK.get_controller(default)
def graph_phase(phase):
"""Context-manager to nest the the executing phase for graph.
Parameters
----------
phase : {'TRAIN', 'TEST'}, required
The executing phase.
"""
phase = phase.upper()
assert phase in ('TRAIN', 'TEST'), \
"Specified an unknown phase: " + phase
return _GLOBAL_PHASE_STACK.get_controller(phase)
def get_device_info(): def get_device_info():
"""Return the device info in current nesting.""" """Return the device info in current nesting."""
return _GLOBAL_DEVICE_STACK.get_default() return _GLOBAL_DEVICE_STACK.get_default()
...@@ -144,13 +129,7 @@ def get_name_scope(): ...@@ -144,13 +129,7 @@ def get_name_scope():
return ret if ret is not None else '' return ret if ret is not None else ''
def get_graph_phase():
"""Return the graph phase in current nesting."""
return _GLOBAL_PHASE_STACK.get_default()
# Thread-local stack for nesting scope. # Thread-local stack for nesting scope.
_GLOBAL_DEVICE_STACK = tls.Stack() _GLOBAL_DEVICE_STACK = tls.Stack()
_GLOBAL_EAGER_STACK = tls.Stack([('${GRAPH}', '${DATA}')]) _GLOBAL_EAGER_STACK = tls.Stack([('${GRAPH}', '${DATA}')])
_GLOBAL_NAME_STACK = tls.Stack() _GLOBAL_NAME_STACK = tls.Stack()
_GLOBAL_PHASE_STACK = tls.Stack()
...@@ -17,6 +17,15 @@ from __future__ import print_function ...@@ -17,6 +17,15 @@ from __future__ import print_function
import numpy import numpy
# Mapping to store the supported device types
DEVICE_STRING_TO_DEVICE_TYPE = {
'cpu': 'cpu',
'gpu': 'cuda',
'cuda': 'cuda',
'cnml': 'cnml',
}
# Mapping to convert to the numpy type
TENSOR_TYPE_TO_NP_TYPE = { TENSOR_TYPE_TO_NP_TYPE = {
'bool': numpy.bool, 'bool': numpy.bool,
'int8': numpy.int8, 'int8': numpy.int8,
...@@ -28,6 +37,7 @@ TENSOR_TYPE_TO_NP_TYPE = { ...@@ -28,6 +37,7 @@ TENSOR_TYPE_TO_NP_TYPE = {
'float64': numpy.float64, 'float64': numpy.float64,
} }
# Mapping to convert to the torch tensor class name
TENSOR_TYPE_TO_TORCH_TENSOR = { TENSOR_TYPE_TO_TORCH_TENSOR = {
'bool': 'BoolTensor', 'bool': 'BoolTensor',
'int8': 'CharTensor', 'int8': 'CharTensor',
......
...@@ -30,10 +30,10 @@ from dragon.core.framework import workspace ...@@ -30,10 +30,10 @@ from dragon.core.framework import workspace
class Operator(object): class Operator(object):
"""Wrapper to unify the symbolic and eager operator abstraction.""" """Wrapper to unify the symbolic and eager operator abstraction."""
def __init__(self, key, dev, **kwargs): def __init__(self, cache_key, device, **kwargs):
self._def = None self._def = None
self._cache_key = key self._cache_key = cache_key
self._device = dev self._device = device
self._arg_device = proto_util.get_device_option('cpu') self._arg_device = proto_util.get_device_option('cpu')
self._arg_device = self._arg_device.SerializeToString() self._arg_device = self._arg_device.SerializeToString()
self._seed = kwargs.get('seed', config.config().random_seed) self._seed = kwargs.get('seed', config.config().random_seed)
...@@ -104,7 +104,7 @@ class Operator(object): ...@@ -104,7 +104,7 @@ class Operator(object):
"""Generate the OpDef from attributes.""" """Generate the OpDef from attributes."""
attributes = self.attributes() attributes = self.attributes()
self._def = proto_util.make_operator_cdef( self._def = proto_util.make_operator_cdef(
name=attributes.get('name', 'GenericOp'), name=attributes.get('name', 'Op'),
cache_key=self._cache_key, cache_key=self._cache_key,
op_type=attributes['op_type'], op_type=attributes['op_type'],
device_option=proto_util.get_device_option( device_option=proto_util.get_device_option(
...@@ -128,17 +128,9 @@ def new_leaf(shape, dtype, device, trainable=False): ...@@ -128,17 +128,9 @@ def new_leaf(shape, dtype, device, trainable=False):
def remove_binary_scalar(inputs): def remove_binary_scalar(inputs):
"""Remove the scalar for binary ops.""" """Remove the scalar for binary ops."""
if types.is_tensor(inputs[0]): if types.is_tensor(inputs[0]):
# (Tensor, Number) inputs[1] = scalar_to_tensor(inputs[1], inputs[0].dtype)
inputs[1] = scalar_to_tensor(
inputs[1],
inputs[0].dtype,
)
else: else:
# (Number, Tensor) inputs[0] = scalar_to_tensor(inputs[0], inputs[1].dtype)
inputs[0] = scalar_to_tensor(
inputs[0],
inputs[1].dtype,
)
return inputs return inputs
...@@ -153,15 +145,11 @@ def scalar_to_tensor(input, dtype): ...@@ -153,15 +145,11 @@ def scalar_to_tensor(input, dtype):
'<input> should be a python number, got {}.' '<input> should be a python number, got {}.'
.format(type(input).__name__) .format(type(input).__name__)
) )
tid = '/share/scalar/{}/{}'.format(dtype, str(input)) name = '/share/scalar/{}/{}'.format(dtype, str(input))
if not workspace.has_tensor(tid): ws = workspace.get_workspace()
workspace.feed_tensor(tid, numpy.array(input, dtype)) if not ws.has_tensor(name):
return EagerTensor( ws.feed_tensor(name, numpy.array(input, dtype))
id=tid, return EagerTensor(impl=ws.GetTensor(name), trainable=False)
dtype=dtype,
own_storage=False,
requires_grad=False,
)
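To make the caching above concrete, a Dragon-free sketch of the same idea: a scalar is materialized once under the shared name ``/share/scalar/<dtype>/<value>`` and reused on later calls. The dict-backed ``FakeWorkspace`` is purely illustrative.

```python
import numpy

class FakeWorkspace(object):
    """Illustrative stand-in for the workspace tensor store."""
    def __init__(self):
        self._tensors = {}
    def has_tensor(self, name):
        return name in self._tensors
    def feed_tensor(self, name, value):
        self._tensors[name] = value
    def get_tensor(self, name):
        return self._tensors[name]

def scalar_to_array(ws, value, dtype):
    # Same naming scheme as scalar_to_tensor: one cached entry per (dtype, value).
    name = '/share/scalar/{}/{}'.format(dtype, str(value))
    if not ws.has_tensor(name):
        ws.feed_tensor(name, numpy.array(value, dtype))
    return ws.get_tensor(name)

ws = FakeWorkspace()
a = scalar_to_array(ws, 2, 'float32')
b = scalar_to_array(ws, 2, 'float32')
assert a is b  # The second call hits the cache.
```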
# Define a global dict to cache the operators. # Define a global dict to cache the operators.
......
...@@ -17,8 +17,6 @@ from __future__ import print_function ...@@ -17,8 +17,6 @@ from __future__ import print_function
import collections import collections
import contextlib import contextlib
import os
import numpy import numpy
from dragon import backend from dragon import backend
...@@ -27,23 +25,15 @@ from dragon.core.framework import mapping ...@@ -27,23 +25,15 @@ from dragon.core.framework import mapping
from dragon.core.framework import proto_util from dragon.core.framework import proto_util
from dragon.core.framework import types from dragon.core.framework import types
from dragon.core.proto import dragon_pb2 from dragon.core.proto import dragon_pb2
from dragon.core.util import logging from dragon.core.util import serialization
from dragon.core.util import tls from dragon.core.util import tls
from dragon.core.util import six
class OperatorCollector(object):
"""A FIFO free list to manage the resource handle of operators.
Operator who takes gradient will hold a handle,
and it will be collected after the backward pass.
Handles are collected according to the type, class OpCollector(object):
as the size of resources varies greatly. """A FIFO free list to manage the resource handle of operators."""
"""
def __init__(self): def __init__(self, parent):
self._parent = parent
self._type2keys = collections.defaultdict(collections.deque) self._type2keys = collections.defaultdict(collections.deque)
def alloc(self, op_type): def alloc(self, op_type):
...@@ -52,32 +42,23 @@ class OperatorCollector(object): ...@@ -52,32 +42,23 @@ class OperatorCollector(object):
return self._type2keys[op_type].popleft() return self._type2keys[op_type].popleft()
except IndexError: except IndexError:
self._type2keys[op_type].append( self._type2keys[op_type].append(
get_dummy_name( self._parent.unique_name(
basename=op_type, name=op_type,
domain='Operator', namespace='Op',
zero_based=False, zero_based=False))
))
return self._type2keys[op_type].popleft() return self._type2keys[op_type].popleft()
def collect(self, handle): def collect(self, handle):
"""Collect a unique handle.""" """Collect an unique handle."""
op_type, _ = handle.split('_') op_type, _ = handle.split('_')
self._type2keys[op_type].append(handle) self._type2keys[op_type].append(handle)
class TensorCollector(object): class TensorCollector(object):
"""A FIFO free list to manage the reused tensors. """A FIFO free list to manage the reused tensors."""
Tensors with the same scope are reused by turns,
and thus, memory fragments will be reduced.
Note that the fragments are inevitable due to the
naive FIFO policy. Reset the workspace if the number
of fragments is going to increase linearly.
"""
def __init__(self): def __init__(self, parent):
self._parent = parent
self._scope2keys = collections.defaultdict(collections.deque) self._scope2keys = collections.defaultdict(collections.deque)
def alloc(self, scope='${DATA}'): def alloc(self, scope='${DATA}'):
...@@ -86,33 +67,28 @@ class TensorCollector(object): ...@@ -86,33 +67,28 @@ class TensorCollector(object):
return self._scope2keys[scope].popleft() return self._scope2keys[scope].popleft()
except IndexError: except IndexError:
self._scope2keys[scope].append( self._scope2keys[scope].append(
get_dummy_name( self._parent.unique_name(
basename='%s/Tensor' % scope, name='%s/Tensor' % scope,
domain='Tensor', namespace='Tensor',
zero_based=False, zero_based=False))
))
return self._scope2keys[scope].popleft() return self._scope2keys[scope].popleft()
def collect(self, name): def collect(self, name):
"""Collect a unique name.""" """Collect an unique name."""
if name.startswith('${'): scope, _ = name.split('/')
scope, _ = name.split('/') self._scope2keys[scope].append(name)
self._scope2keys[scope].append(name)
return True
else:
return False
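For readers unfamiliar with the free-list pattern, a minimal standalone sketch of the FIFO scheme used by ``OpCollector`` and ``TensorCollector``; the counter-based naming stands in for the workspace's ``unique_name`` and is illustrative only.

```python
import collections
import itertools

class FreeList(object):
    """FIFO free list: reuse returned handles before minting new ones."""
    def __init__(self):
        self._counter = itertools.count()
        self._queues = collections.defaultdict(collections.deque)
    def _unique_name(self, key):
        return '{}_{}'.format(key, next(self._counter))
    def alloc(self, key):
        try:
            return self._queues[key].popleft()
        except IndexError:
            return self._unique_name(key)
    def collect(self, handle):
        key, _ = handle.rsplit('_', 1)
        self._queues[key].append(handle)

pool = FreeList()
h1 = pool.alloc('Conv')          # e.g. 'Conv_0'
pool.collect(h1)
assert pool.alloc('Conv') == h1  # The released handle is reused first.
```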
class Workspace(backend.Workspace): class Workspace(backend.Workspace):
"""Space to isolate computations that share resources.""" """Sandbox to isolate the resources and computations."""
class Collectors(object): class Collectors(object):
def __init__(self): def __init__(self, workspace):
self.TENSOR = TensorCollector() self.OP = OpCollector(workspace)
self.OPERATOR = OperatorCollector() self.TENSOR = TensorCollector(workspace)
def __init__(self, name=''): def __init__(self, name=''):
"""Create a Workspace. """Create a ``Workspace``.
Parameters Parameters
---------- ----------
...@@ -121,24 +97,20 @@ class Workspace(backend.Workspace): ...@@ -121,24 +97,20 @@ class Workspace(backend.Workspace):
""" """
super(Workspace, self).__init__(name) super(Workspace, self).__init__(name)
self._ref_objects = [] self._references = []
self._collectors = self.Collectors() self._collectors = self.Collectors(self)
@property @property
def collectors(self): def collectors(self):
"""Return the resource collectors.""" """Return the resource collectors."""
return self._collectors return self._collectors
def merge_from(self, other): def as_default(self):
"""Merge a external workspace into ``self``. """Switch ``self`` as the default workspace.
The ``other`` will not be reset until ``self`` is reset. Call this method with the **with** keyword.
Carefulness should be taken to associate with the workspaces.
Parameters Once **with** is exited, the previous default will be set.
----------
other : dragon.Workspace
The given external workspace.
Returns Returns
------- -------
...@@ -146,216 +118,295 @@ class Workspace(backend.Workspace): ...@@ -146,216 +118,295 @@ class Workspace(backend.Workspace):
The ``self``. The ``self``.
""" """
self.MergeFrom(other) return _GLOBAL_DEFAULT_WORKSPACE_STACK.get_controller(self)
self._ref_objects.append(other)
return self
def as_default(self):
"""Switch ``self`` as the default workspace.
Call this method with the **with** keyword. def create_graph(self, graph_def):
"""Create the graph.
Once **with** is exited, the previous default will be set. Parameters
----------
graph_def : GraphDef
The ``GraphDef`` protocol buffer.
Returns Returns
------- -------
dragon.Workspace str
The ``self``. The graph name.
""" """
return _GLOBAL_DEFAULT_WORKSPACE_STACK.get_controller(self) cfg = config.config()
if cfg.graph_verbosity == 2:
print(graph_def)
return self.CreateGraph(
serialization.serialize_proto(graph_def),
cfg.graph_verbosity == 1)
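A hedged usage sketch of the workspace switching shown above; it assumes a working Dragon install and that ``dragon.Workspace`` and ``dragon.get_workspace`` are exported as the docstrings in this file suggest.

```python
import dragon

ws = dragon.Workspace(name='scratch')
with ws.as_default():
    # Inside the block, resources are created in `ws`.
    assert dragon.get_workspace() is ws
# On exit, the previous default workspace is restored.
```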
def clear(self): def create_tensor(self, name, filler_info=None):
"""Remove all the tensors. """Create the tensor.
Optionally call this method to clean the memories. Parameters
----------
name : str
The tensor name.
filler_info : FillerInfo
The ``FillerInfo`` protocol buffer.
Returns
-------
TensorImpl
The tensor implementation.
""" """
self.Clear() return self.CreateTensor(
name, serialization.serialize_proto(filler_info))
def feed_tensor(self, tensor, value, dtype=None, enforce_cpu=False):
"""Copy the value to tensor.
def create_filler(filler_def): Examples:
"""Create a tensor filler in current workspace.
Parameters ```python
---------- # Define a named tensor to feed
filler_def : TensorFiller x = dragon.Tensor('x')
The def of filler. dragon.get_workspace().feed_tensor(x, 0)
""" # Feed by specifying a tensor name
filler_def = filler_def if isinstance(filler_def, str) \ # Note that the implementation will be created if it does not exist
else filler_def.SerializePartialToString() dragon.get_workspace().feed_tensor('y', 1)
get_workspace().CreateFiller(filler_def) print(dragon.get_workspace().has_tensor('y')) # True
```
Parameters
----------
tensor : Union[dragon.Tensor, str]
The tensor to feed.
value : array_like
The value to copy.
dtype : str, optional
The optional data type.
enforce_cpu : bool, optional, default=False
**True** to copy using cpu context.
def create_graph(graph_def): """
"""Create the graph in current workspace. if types.is_tensor(value):
# Steal the data if value is a tensor
value = getattr(value, 'get_value')()
# Determine the data type from argument or value
if not isinstance(value, numpy.ndarray):
dtype = 'float32' if dtype is None else dtype
else:
dtype = value.dtype if dtype is None else dtype
if hasattr(tensor, 'dtype') and tensor.dtype is not None:
if tensor.dtype not in mapping.TENSOR_TYPE_TO_NP_TYPE:
raise TypeError('Unsupported data type:', tensor.dtype)
dtype = mapping.TENSOR_TYPE_TO_NP_TYPE[tensor.dtype]
# Determine the copying device option
if enforce_cpu is True:
device_option = proto_util.get_device_option('cpu')
else:
device_option = proto_util.get_default_device_option()
if device_option is None:
device_option = proto_util.get_global_device_option()
# Copy data to the backend
self.FeedTensor(
_stringify_object(tensor),
numpy.array(value, dtype=dtype, copy=False),
serialization.serialize_proto(device_option),
)
def fetch_tensor(self, tensor):
"""Return the value of tensor.
Parameters Parameters
---------- ----------
graph_def : GraphDef tensor : Union[dragon.Tensor, str]
The definition of meta graph. The tensor to fetch.
Returns Returns
------- -------
str numpy.ndarray
The graph name to run. The array copied from backend.
""" """
cfg = config.config() return self.FetchTensor(_stringify_object(tensor))
if cfg.graph_verbosity == 2:
log_dir = cfg.log_dir
if log_dir is not None:
if not os.path.exists(log_dir):
try:
os.makedirs(log_dir)
except Exception:
raise ValueError('The given prefix is invalid.')
path = os.path.join(
log_dir,
graph_def.name + '.txt',
)
with open(path, 'w') as f:
f.write(str(graph_def))
logging.info('Export meta graph to: %s' % path)
else:
print(graph_def)
return get_workspace().CreateGraph(
_stringify_proto(graph_def), cfg.graph_verbosity == 1)
def has_tensor(self, tensor):
"""Return whether the tensor is in this workspace.
def create_tensor(tensor): Parameters
"""Create the tensor in current workspace. ----------
tensor : Union[dragon.Tensor, str]
The tensor.
Parameters Returns
---------- -------
tensor : Union[dragon.Tensor, str] bool
The tensor to create. **True** if the tensor exists otherwise **False**.
""" """
tensor = _stringify_tensor(tensor) return self.HasTensor(_stringify_object(tensor))
get_workspace().CreateTensor(tensor)
def feed_tensor(tensor, value, dtype=None, enforce_cpu=False):
"""Copy the value to tensor.
Examples:
```python
# Define a variable, feed then fetch the value
x = dragon.Tensor().variable()
dragon.workspace.feed_tensor(x, 1)
print(dragon.workspace.fetch_tensor(x))
# Feed by specifying a optional data type
# Fetch through ``Tensor.get_value(...)``
dragon.workspace.feed_tensor(a, [[1, 2, 3]], dtype='float16')
print(x.get_value())
```
Parameters
----------
tensor : Union[dragon.Tensor, str]
The tensor to feed.
value : array_like
The value to copy.
dtype : str, optional
The optional data type.
enforce_cpu : bool, optional, default=False
**True** to copy using cpu context.
""" def merge_from(self, other):
name = tensor.name if hasattr(tensor, 'name') else str(tensor) """Merge resources from another workspace.
if enforce_cpu is True:
dev = proto_util.get_device_option('cpu')
else:
dev = proto_util.get_default_device_option()
if dev is None:
dev = proto_util.get_global_device_option()
# Steal the value from tensor storage if necessary. The ``other`` will not be reset until ``self`` is reset.
if types.is_tensor(value): Take care when associating the workspaces.
value = getattr(value, 'get_value')()
if not isinstance(value, numpy.ndarray): Parameters
dtype = 'float32' if dtype is None else dtype ----------
else: other : dragon.Workspace
dtype = value.dtype if dtype is None else dtype The workspace to merge.
Returns
-------
dragon.Workspace
The ``self``.
if hasattr(tensor, 'dtype') and tensor.dtype is not None: """
if tensor.dtype not in mapping.TENSOR_TYPE_TO_NP_TYPE: self.MergeFrom(other)
raise TypeError('Unsupported data type: %s' % tensor.dtype) self._references.append(other)
dtype = mapping.TENSOR_TYPE_TO_NP_TYPE[tensor.dtype] return self
dev = _stringify_proto(dev) def register_alias(self, target, alias):
value = numpy.array(value, dtype=dtype, copy=False) """Register an alias for the target.
get_workspace().FeedTensor(name, value, dev)
Parameters
----------
target : Union[str, dragon.Tensor]
The string or named object.
alias : str
The alias.
def fetch_tensor(tensor): """
"""Return the value of tensor. self.RegisterAlias(_stringify_object(target), alias)
Parameters def reset_tensor(self, tensor):
---------- """Reset the tensor.
tensor : Union[dragon.Tensor, str]
The tensor to fetch.
Returns Parameters
------- ----------
numpy.ndarray tensor : Union[dragon.Tensor, str]
The array copied from backend. The tensor to reset.
""" """
tensor = _stringify_tensor(tensor) return self.ResetTensor(_stringify_object(tensor))
return get_workspace().FetchTensor(tensor)
def run_backward(
self,
op_defs,
targets,
sources=None,
input_grads=None,
empty_grads=None,
):
"""Compute the gradients of input operators.
def get_dummy_name(basename, suffix='', domain='', zero_based=True): Parameters
"""Return an unique dummy name in current workspace. ----------
op_defs : Sequence[OperatorDef]
The executed op defs.
targets : Sequence[str]
The derivative targets.
sources : Sequence[str], optional
The differentiated inputs.
input_grads : Sequence[str], optional
The input grad for targets.
empty_grads : Sequence[str], optional
The grads to set to empty.
The dummy name will be formatted as: """
<basename> + <unique_index> + <suffix>. cfg = config.config()
self.RunBackward(
op_defs,
targets,
sources if sources else [],
input_grads if input_grads else [],
empty_grads if empty_grads else [],
cfg.graph_optimization <= 2,
cfg.graph_verbosity > 0,
)
def run_graph(
self,
name,
inputs_and_values=None,
outputs=None,
executing_stage=None,
return_outputs=True,
):
"""Run the graph.
Names in the different ``domain`` could be same. Parameters
----------
name : str
The graph name.
inputs_and_values : Tuple[Sequence, Sequence], optional
The input tensors and feeding values.
outputs : Sequence[dragon.Tensor], optional
The output tensors.
executing_stage : str, optional
The optional executing stage.
return_outputs : bool, optional, default=True
Whether to return the output values.
Parameters """
---------- # The explicit feeding for inputs.
basename : str if inputs_and_values is not None:
The basename. inputs, values = inputs_and_values
suffix : str, optional if len(inputs) != len(values):
The optional suffix adding to basename. raise ValueError(
domain : str, optional 'Specified %d values for %d inputs.'
The optional domain name. % (len(values), len(inputs)))
zero_based : bool, optional, default=True for tensor, value in zip(inputs, values):
Whether number the index from 0. self.feed_tensor(tensor, value)
# Run the graph according to the specified include/exclude rule.
stage_str = executing_stage if executing_stage else 'default'
exec_stage = _PREDEFINED_GRAPH_EXECUTING_STAGES[stage_str]
self.RunGraph(name, exec_stage['include'], exec_stage['exclude'])
# Maybe return the output values.
if return_outputs and outputs is not None:
if len(outputs) == 1:
return outputs[0].get_value()
else:
return [outputs[i].get_value() for i in range(len(outputs))]
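The include/exclude lookup used above is easy to mirror without Dragon; a standalone sketch, with the ``_PREDEFINED_GRAPH_EXECUTING_STAGES`` table replicated from the bottom of this file:

```python
# Replicated from this module for illustration.
_PREDEFINED_GRAPH_EXECUTING_STAGES = {
    'default': {'include': '', 'exclude': ''},
    'forward': {'include': '', 'exclude': 'Gradient'},
    'backward': {'include': 'Gradient', 'exclude': 'Generate'},
}

def resolve_stage(executing_stage=None):
    """Map an optional stage name to the include/exclude operator rules."""
    stage_str = executing_stage if executing_stage else 'default'
    return _PREDEFINED_GRAPH_EXECUTING_STAGES[stage_str]

print(resolve_stage('forward'))  # {'include': '', 'exclude': 'Gradient'}
```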
def run_operator(self, op_def):
"""Run the operator.
Returns Parameters
------- ----------
str op_def : Union[OperatorDef, Sequence[OperatorDef]]
The unique dummy name. The ``OperatorDef`` protocol buffer.
""" """
return get_workspace().GetDummyName( cfg = config.config()
basename, suffix, domain, zero_based) if isinstance(op_def, dragon_pb2.OperatorDef):
op_def = op_def.SerializePartialToString()
self.RunOperator(op_def, cfg.graph_verbosity > 0)
def unique_name(self, name, suffix='', namespace='', zero_based=True):
"""Return an unique name.
def get_tensor_name(tensor): Names in different ``namespace`` scopes can be the same.
"""Return the name of tensor in current workspace.
Parameters Parameters
---------- ----------
tensor : Union[dragon.Tensor, str] name : str
The tensor to query. The name to make unique.
suffix : str, optional
The optional suffix added to the name.
namespace : str, optional
The optional scope to make unique within.
zero_based : bool, optional, default=True
**True** to number the index from 0, otherwise from 1.
Returns Returns
------- -------
str str
The tensor name. The unique name.
""" """
tensor = _stringify_tensor(tensor) return self.UniqueName(name, suffix, namespace, zero_based)
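A hedged example of the naming helper above, matching how ``OpCollector`` calls it; it assumes a working Dragon install and the exact returned names are indicative only.

```python
import dragon

ws = dragon.get_workspace()
# zero_based=False numbers handles from 1, as the collectors do.
name1 = ws.unique_name('Conv', namespace='Op', zero_based=False)
name2 = ws.unique_name('Conv', namespace='Op', zero_based=False)
print(name1, name2)  # Two distinct names within the 'Op' namespace, e.g. Conv_1 Conv_2
```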
return get_workspace().GetTensorName(tensor)
def get_workspace(): def get_workspace():
...@@ -370,69 +421,6 @@ def get_workspace(): ...@@ -370,69 +421,6 @@ def get_workspace():
return _GLOBAL_DEFAULT_WORKSPACE_STACK.get_default() return _GLOBAL_DEFAULT_WORKSPACE_STACK.get_default()
def has_tensor(tensor):
"""Return a bool indicating if tensor is in current workspace.
Parameters
----------
tensor : Union[dragon.Tensor, str]
The tensor to query.
Returns
-------
bool
**True** if specified tensor is existing otherwise **False**.
"""
tensor = _stringify_tensor(tensor)
return get_workspace().HasTensor(tensor)
def load(file_path, format='pkl'):
"""Load tensors from a binary file.
Parameters
----------
file_path : str
The path of binary file.
format : {'pkl', 'caffe'}, optional
The serializing format.
"""
assert os.path.exists(file_path), \
'File(%s) does not exist.' % file_path
if format == 'pkl':
try:
with open(file_path, 'rb') as f:
state_dict = six.moves.pickle.load(f)
except UnicodeDecodeError:
with open(file_path, 'rb') as f:
state_dict = six.moves.pickle.load(f, encoding='iso-8859-1')
logging.info('Load From Model@: ' + file_path)
logging.info('Model Format: Pickle')
for k, v in state_dict.items():
if has_tensor(k):
feed_tensor(k, v)
logging.info('Tensor({}) is loaded.'.format(k))
elif format == 'caffe':
get_workspace().Load(file_path, 1)
else:
raise TypeError('Unknown binary format: ' + format)
def reset_tensor(tensor):
"""Reset the memory of tensor.
Parameters
----------
tensor : Union[dragon.Tensor, str]
The tensor to reset.
"""
tensor = _stringify_tensor(tensor)
return get_workspace().ResetTensor(tensor)
def reset_workspace(): def reset_workspace():
"""Reset the current default workspace.""" """Reset the current default workspace."""
if not _GLOBAL_DEFAULT_WORKSPACE_STACK.is_cleared(): if not _GLOBAL_DEFAULT_WORKSPACE_STACK.is_cleared():
...@@ -443,185 +431,9 @@ def reset_workspace(): ...@@ -443,185 +431,9 @@ def reset_workspace():
_GLOBAL_DEFAULT_WORKSPACE_STACK.reset() _GLOBAL_DEFAULT_WORKSPACE_STACK.reset()
def run_backward( def _stringify_object(obj):
forward_ops, """Try to stringify an object."""
targets, return obj.id if hasattr(obj, 'id') else obj
sources=None,
input_grads=None,
ignored_grads=None,
):
"""Compute the gradients of input operators.
Parameters
----------
forward_ops : Sequence[OperatorDef]
The referring operators to generate gradients.
targets : Sequence[str]
The solving targets.
sources : Sequence[str], optional
The optional sources to hook the intermediate grads.
input_grads : Sequence[str], optional
The external input grads.
ignored_grads : Sequence[str], optional
The grads that are explicitly ignored.
"""
cfg = config.config()
get_workspace().RunBackward(
forward_ops,
targets,
sources if sources else [],
input_grads if input_grads else [],
ignored_grads if ignored_grads else [],
cfg.graph_optimization > 2,
cfg.graph_verbosity > 0,
)
def run_graph(
graph,
inputs=(),
outputs=(),
stage=None,
return_outputs=True,
):
"""Run the graph in current workspace.
Parameters
----------
graph : str
The name of graph.
inputs : tuple
The **inputs** and **values**.
outputs : Sequence[dragon.Tensor]
The outputs of the graph.
stage : str, optional
The preset custom stages.
return_outputs : bool, optional, default=False
Whether to return the outputs.
Returns
-------
Sequence[numpy.ndarray]
The outputs which are copied to numpy array.
"""
# The explicit feeding.
if len(inputs) > 0 and len(inputs[0]) > 0:
if len(inputs[0]) != len(inputs[1]):
raise RuntimeError(
'Defined {} args, but {} are given.'
.format(len(inputs[0]), len(inputs[1]))
)
for idx in range(len(inputs[0])):
feed_tensor(inputs[0][idx], inputs[1][idx])
# Run the graph according to the specified include/exclude rule.
runtime_stage = stage if stage else 'default'
rule = _PREDEFINED_GRAPH_RUNTIME_STAGES[runtime_stage]
get_workspace().RunGraph(
graph, rule['include'], rule['exclude'])
# Try to return the outputs.
# Force to return may lead to asserts if outputs are not computed.
if return_outputs:
if len(outputs) == 0:
return None
elif len(outputs) == 1:
return outputs[0].get_value()
else:
return [outputs[i].get_value() for i in range(len(outputs))]
def run_operator(op_def):
"""Run the operator(s) in current workspace.
Parameters
----------
op_def : Union[OperatorDef, Sequence[OperatorDef]]
The definition of operator(s).
"""
cfg = config.config()
if isinstance(op_def, dragon_pb2.OperatorDef):
op_def = op_def.SerializeToString()
get_workspace().RunOperator(op_def, cfg.graph_verbosity > 0)
def save(
tensors,
filename,
prefix='',
suffix='.pkl',
format='pkl',
):
"""Serialize tensors into a binary file.
The file path is formatted as:
<prefix> + <filename> + <suffix>
Parameters
----------
tensors : Sequence[dragon.Tensor]
The tensors to be wrote.
filename : str
The filename.
prefix : str, optional, default=''
The prefix.
suffix : str, optional, default='.pkl'
The suffix.
format : {'pkl', 'caffe'}, optional
The serializing format.
"""
file_path = prefix + filename + suffix
dir = os.path.split(file_path)[0]
if len(dir) > 0 and not os.path.exists(dir):
os.makedirs(dir)
if format == 'pkl':
state_dict = {}
for tensor in tensors:
state_dict[tensor.name] = fetch_tensor(tensor)
with open(file_path, 'wb') as f:
six.moves.pickle.dump(
state_dict, f,
six.moves.pickle.HIGHEST_PROTOCOL,
)
logging.info('Save model to: ' + file_path)
logging.info('Model Format: Pickle')
elif format == 'caffe':
names = [tensor.name for tensor in tensors]
get_workspace().Save(file_path, names, 1)
else:
raise TypeError('Unknown binary format: ' + format)
def set_tensor_alias(tensor, alias):
"""Bind an alias to an existing tensor.
Parameters
----------
tensor : Union[dragon.Tensor, str]
The tensor to bind the alias.
alias : str
The alias.
"""
tensor = _stringify_tensor(tensor)
get_workspace().SetTensorAlias(tensor, alias)
def _stringify_proto(obj):
"""Try to stringify a proto-buffer structure."""
return obj.SerializeToString()
def _stringify_tensor(obj):
"""Try to stringify a tensor."""
if hasattr(obj, 'id'):
return str(obj.id)
else:
return str(obj)
class _DefaultWorkspaceStack(tls.Stack): class _DefaultWorkspaceStack(tls.Stack):
...@@ -654,11 +466,11 @@ class _DefaultWorkspaceStack(tls.Stack): ...@@ -654,11 +466,11 @@ class _DefaultWorkspaceStack(tls.Stack):
yield g yield g
# Define a global stack to store the workspaces of current thread. # Global stack to store the workspaces of current thread.
_GLOBAL_DEFAULT_WORKSPACE_STACK = _DefaultWorkspaceStack() _GLOBAL_DEFAULT_WORKSPACE_STACK = _DefaultWorkspaceStack()
# Define some useful runtime stages. # Predefined graph executing stages.
_PREDEFINED_GRAPH_RUNTIME_STAGES = { _PREDEFINED_GRAPH_EXECUTING_STAGES = {
'default': {'include': '', 'exclude': ''}, 'default': {'include': '', 'exclude': ''},
'forward': {'include': '', 'exclude': 'Gradient'}, 'forward': {'include': '', 'exclude': 'Gradient'},
'backward': {'include': 'Gradient', 'exclude': 'Generate'}, 'backward': {'include': 'Gradient', 'exclude': 'Generate'},
......
...@@ -1425,16 +1425,16 @@ def sum(inputs, axis=None, keep_dims=False, **kwargs): ...@@ -1425,16 +1425,16 @@ def sum(inputs, axis=None, keep_dims=False, **kwargs):
@OpSchema.num_inputs(1) @OpSchema.num_inputs(1)
@ArgHelper.repeated_desc(name='multiples') @ArgHelper.repeated_desc(name='repeats')
def tile(inputs, multiples, **kwargs): def tile(inputs, repeats, **kwargs):
r"""Tile the input according to the given multiples. r"""Tile the input according to the given repeats.
Parameters Parameters
---------- ----------
inputs : dragon.Tensor inputs : dragon.Tensor
The input tensor. The input tensor.
multiples : Sequence[Union[int, dragon.Tensor]] repeats : Sequence[Union[int, dragon.Tensor]]
The multiple for each axis. The number of repetitions for each axis.
Returns Returns
------- -------
...@@ -1446,8 +1446,8 @@ def tile(inputs, multiples, **kwargs): ...@@ -1446,8 +1446,8 @@ def tile(inputs, multiples, **kwargs):
op_lib = array_ops_lib.Tile op_lib = array_ops_lib.Tile
if context.executing_eagerly(): if context.executing_eagerly():
return op_lib \ return op_lib \
.instantiate(ndim=len(args['multiples'])) \ .instantiate(ndim=len(args['repeats'])) \
.apply([inputs], args['multiples']) .apply([inputs], args['repeats'])
else: else:
return op_lib.blend(**args) return op_lib.blend(**args)
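A hedged call sketch for the renamed argument; it assumes the functional is exported as ``dragon.tile`` in the public API and that ``dragon.Tensor('x')`` follows the docstring example earlier in this commit.

```python
import dragon

x = dragon.Tensor('x')              # Symbolic input
y = dragon.tile(x, repeats=[1, 2])  # Each axis repeated by the matching entry in `repeats`
```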
......
...@@ -18,18 +18,15 @@ class Arange(Operator): ...@@ -18,18 +18,15 @@ class Arange(Operator):
'dtype': self.dtype, 'dtype': self.dtype,
'slice_descs': [ 'slice_descs': [
'${{HANDLE}}/slice[{}]' '${{HANDLE}}/slice[{}]'
.format(n) for n in range(self.num_args) .format(n) for n in range(self.num_args)],
],
} }
} }
def feed(self, ws, handle, slice_args): def feed(self, ws, handle, slice_args):
for i in range(len(slice_args)): for i in range(len(slice_args)):
self.feed_arg( self.feed_arg(
ws, ws, '{}/slice[{}]'.format(handle, i),
'{}/slice[{}]'.format(handle, i), slice_args[i], 'float32')
slice_args[i], 'float32'
)
def forward(self, slice_args, trainable=False): def forward(self, slice_args, trainable=False):
output = self.dispatch( output = self.dispatch(
...@@ -72,9 +69,7 @@ class Cast(Operator): ...@@ -72,9 +69,7 @@ class Cast(Operator):
def attributes(self): def attributes(self):
return { return {
'op_type': 'Cast', 'op_type': 'Cast',
'arguments': { 'arguments': {'dtype': self.dtype},
'dtype': self.dtype,
}
} }
def forward(self, inputs, inplace=False): def forward(self, inputs, inplace=False):
...@@ -104,18 +99,15 @@ class ChannelNormalize(Operator): ...@@ -104,18 +99,15 @@ class ChannelNormalize(Operator):
'dtype': self.dtype, 'dtype': self.dtype,
'perm_descs': [ 'perm_descs': [
'${{HANDLE}}/perm[{}]' '${{HANDLE}}/perm[{}]'
.format(n) for n in range(self.ndim) .format(n) for n in range(self.ndim)],
],
} }
} }
def feed(self, ws, handle, perm): def feed(self, ws, handle, perm):
for i in range(self.ndim): for i in range(self.ndim):
self.feed_arg( self.feed_arg(
ws, ws, '{}/perm[{}]'.format(handle, i),
'{}/perm[{}]'.format(handle, i), perm[i], 'int64')
perm[i], 'int64'
)
def forward(self, inputs, perm): def forward(self, inputs, perm):
return self.dispatch( return self.dispatch(
...@@ -152,9 +144,7 @@ class Concat(Operator): ...@@ -152,9 +144,7 @@ class Concat(Operator):
def attributes(self): def attributes(self):
return { return {
'op_type': 'Concat', 'op_type': 'Concat',
'arguments': { 'arguments': {'axis': self.axis},
'axis': self.axis,
}
} }
def forward(self, inputs): def forward(self, inputs):
...@@ -194,24 +184,21 @@ class Expand(Operator): ...@@ -194,24 +184,21 @@ class Expand(Operator):
'arguments': { 'arguments': {
'dims_descs': [ 'dims_descs': [
'${{HANDLE}}/dims[{}]' '${{HANDLE}}/dims[{}]'
.format(n) for n in range(self.ndim) .format(n) for n in range(self.ndim)],
],
} }
} }
def feed(self, ws, handle, dims): def feed(self, ws, handle, dims):
for i, d in enumerate(dims): for i, dim in enumerate(dims):
self.feed_arg( self.feed_arg(
ws, ws, '{}/dims[{}]'.format(handle, i),
'{}/dims[{}]'.format(handle, i), dim, 'int64')
d, 'int64'
)
def forward(self, inputs, dims): def forward(self, inputs, dims):
return self.dispatch( return self.dispatch(
inputs, [self.alloc()], inputs, [self.alloc()],
callback=lambda ws, handle: callback=lambda ws, handle:
self.feed(ws, handle, dims) self.feed(ws, handle, dims),
) )
...@@ -372,24 +359,21 @@ class Pad(Operator): ...@@ -372,24 +359,21 @@ class Pad(Operator):
'value': self.value, 'value': self.value,
'pads_descs': [ 'pads_descs': [
'${{HANDLE}}/pads[{}]' '${{HANDLE}}/pads[{}]'
.format(n) for n in range(self.ndim * 2) .format(n) for n in range(self.ndim * 2)],
],
} }
} }
def feed(self, ws, handle, pads): def feed(self, ws, handle, pads):
for i, e in enumerate(pads): for i, e in enumerate(pads):
self.feed_arg( self.feed_arg(
ws, ws, '{}/pads[{}]'.format(handle, i),
'{}/pads[{}]'.format(handle, i), e, 'int64')
e, 'int64'
)
def forward(self, inputs, pads): def forward(self, inputs, pads):
return self.dispatch( return self.dispatch(
inputs, [self.alloc()], inputs, [self.alloc()],
callback=lambda ws, handle: callback=lambda ws, handle:
self.feed(ws, handle, pads) self.feed(ws, handle, pads),
) )
...@@ -443,18 +427,15 @@ class Reshape(Operator): ...@@ -443,18 +427,15 @@ class Reshape(Operator):
'arguments': { 'arguments': {
'dims_descs': [ 'dims_descs': [
'${{HANDLE}}/dims[{}]' '${{HANDLE}}/dims[{}]'
.format(n) for n in range(self.ndim) .format(n) for n in range(self.ndim)],
],
} }
} }
def feed(self, ws, handle, shape): def feed(self, ws, handle, shape):
for i, e in enumerate(shape): for i, e in enumerate(shape):
self.feed_arg( self.feed_arg(
ws, ws, '{}/dims[{}]'.format(handle, i),
'{}/dims[{}]'.format(handle, i), e, 'int64')
e, 'int64'
)
def forward(self, inputs, shape, inplace=False): def forward(self, inputs, shape, inplace=False):
outputs = [inputs[0] if inplace else self.alloc()] outputs = [inputs[0] if inplace else self.alloc()]
...@@ -476,33 +457,27 @@ class Slice(Operator): ...@@ -476,33 +457,27 @@ class Slice(Operator):
'arguments': { 'arguments': {
'starts_descs': [ 'starts_descs': [
'${{HANDLE}}/starts[{}]' '${{HANDLE}}/starts[{}]'
.format(n) for n in range(self.ndim) .format(n) for n in range(self.ndim)],
],
'sizes_descs': [ 'sizes_descs': [
'${{HANDLE}}/sizes[{}]' '${{HANDLE}}/sizes[{}]'
.format(n) for n in range(self.ndim) .format(n) for n in range(self.ndim)],
],
} }
} }
def feed(self, ws, handle, starts, sizes): def feed(self, ws, handle, starts, sizes):
for i, e in enumerate(starts): for i in range(len(starts)):
self.feed_arg( self.feed_arg(
ws, ws, '{}/starts[{}]'.format(handle, i),
'{}/starts[{}]'.format(handle, i), starts[i], 'int64')
e, 'int64'
)
self.feed_arg( self.feed_arg(
ws, ws, '{}/sizes[{}]'.format(handle, i),
'{}/sizes[{}]'.format(handle, i), sizes[i], 'int64')
sizes[i], 'int64'
)
def forward(self, inputs, starts, sizes): def forward(self, inputs, starts, sizes):
return self.dispatch( return self.dispatch(
inputs, [self.alloc()], inputs, [self.alloc()],
callback=lambda ws, handle: callback=lambda ws, handle:
self.feed(ws, handle, starts, sizes) self.feed(ws, handle, starts, sizes),
) )
...@@ -547,9 +522,7 @@ class Squeeze(Operator): ...@@ -547,9 +522,7 @@ class Squeeze(Operator):
def attributes(self): def attributes(self):
return { return {
'op_type': 'Squeeze', 'op_type': 'Squeeze',
'arguments': { 'arguments': {'axes': self.axes},
'axes': self.axes,
},
} }
def forward(self, inputs, inplace=False): def forward(self, inputs, inplace=False):
...@@ -565,9 +538,7 @@ class Stack(Operator): ...@@ -565,9 +538,7 @@ class Stack(Operator):
def attributes(self): def attributes(self):
return { return {
'op_type': 'Stack', 'op_type': 'Stack',
'arguments': { 'arguments': {'axis': self.axis},
'axis': self.axis,
}
} }
def forward(self, inputs): def forward(self, inputs):
...@@ -583,26 +554,23 @@ class Tile(Operator): ...@@ -583,26 +554,23 @@ class Tile(Operator):
return { return {
'op_type': 'Tile', 'op_type': 'Tile',
'arguments': { 'arguments': {
'multiples_descs': [ 'repeats_descs': [
'${{HANDLE}}/multiples[{}]' '${{HANDLE}}/repeats[{}]'
.format(n) for n in range(self.ndim) .format(n) for n in range(self.ndim)],
],
} }
} }
def feed(self, ws, handle, multiples): def feed(self, ws, handle, repeats):
for i, d in enumerate(multiples): for i, size in enumerate(repeats):
self.feed_arg( self.feed_arg(
ws, ws, '{}/repeats[{}]'.format(handle, i),
'{}/multiples[{}]'.format(handle, i), size, 'int64')
d, 'int64'
)
def forward(self, inputs, multiples): def forward(self, inputs, repeats):
return self.dispatch( return self.dispatch(
inputs, [self.alloc()], inputs, [self.alloc()],
callback=lambda ws, handle: callback=lambda ws, handle:
self.feed(ws, handle, multiples) self.feed(ws, handle, repeats),
) )
...@@ -617,24 +585,21 @@ class Transpose(Operator): ...@@ -617,24 +585,21 @@ class Transpose(Operator):
'arguments': { 'arguments': {
'perm_descs': [ 'perm_descs': [
'${{HANDLE}}/perm[{}]' '${{HANDLE}}/perm[{}]'
.format(n) for n in range(self.ndim) .format(n) for n in range(self.ndim)],
],
} }
} }
def feed(self, ws, handle, perm): def feed(self, ws, handle, perm):
for i in range(self.ndim): for i in range(self.ndim):
self.feed_arg( self.feed_arg(
ws, ws, '{}/perm[{}]'.format(handle, i),
'{}/perm[{}]'.format(handle, i), perm[i], 'int64')
perm[i], 'int64'
)
def forward(self, inputs, perm): def forward(self, inputs, perm):
return self.dispatch( return self.dispatch(
inputs, [self.alloc()], inputs, [self.alloc()],
callback=lambda ws, handle: callback=lambda ws, handle:
self.feed(ws, handle, perm) self.feed(ws, handle, perm),
) )
......
...@@ -37,15 +37,11 @@ class Assign(Operator): ...@@ -37,15 +37,11 @@ class Assign(Operator):
def feed(self, ws, handle, starts, sizes): def feed(self, ws, handle, starts, sizes):
for i in range(self.ndim): for i in range(self.ndim):
self.feed_arg( self.feed_arg(
ws, ws, '{}/starts[{}]'.format(handle, i),
'{}/starts[{}]'.format(handle, i), starts[i], 'int64')
starts[i], 'int64',
)
self.feed_arg( self.feed_arg(
ws, ws, '{}/sizes[{}]'.format(handle, i),
'{}/sizes[{}]'.format(handle, i), sizes[i], 'int64')
sizes[i], 'int64',
)
def forward(self, inputs, starts, sizes): def forward(self, inputs, starts, sizes):
return self.dispatch( return self.dispatch(
......
...@@ -24,12 +24,10 @@ class Initializer(Operator): ...@@ -24,12 +24,10 @@ class Initializer(Operator):
self.dtype = kwargs.get('dtype', 'float32') self.dtype = kwargs.get('dtype', 'float32')
def feed(self, ws, handle, shape): def feed(self, ws, handle, shape):
for i, e in enumerate(shape): for i, dim in enumerate(shape):
self.feed_arg( self.feed_arg(
ws, ws, '{}/dims[{}]'.format(handle, i),
'{}/dims[{}]'.format(handle, i), dim, 'int64')
e, 'int64'
)
def forward( def forward(
self, self,
...@@ -39,18 +37,16 @@ class Initializer(Operator): ...@@ -39,18 +37,16 @@ class Initializer(Operator):
trainable=False, trainable=False,
): ):
inputs = [] if shape_like is None else [shape_like] inputs = [] if shape_like is None else [shape_like]
outputs = [ outputs = [ops.new_leaf(
ops.new_leaf( shape=shape,
shape=shape, dtype=self.dtype,
dtype=self.dtype, device=self.alloc(),
device=self.alloc(), trainable=trainable,
trainable=trainable, ) if out is None else out]
) if out is None else out
]
return self.dispatch( return self.dispatch(
inputs, outputs, inputs, outputs,
callback=lambda ws, handle: callback=lambda ws, handle:
self.feed(ws, handle, shape) self.feed(ws, handle, shape),
) )
...@@ -67,8 +63,7 @@ class Eye(Initializer): ...@@ -67,8 +63,7 @@ class Eye(Initializer):
'dtype': self.dtype, 'dtype': self.dtype,
'dims_descs': [ 'dims_descs': [
'${{HANDLE}}/dims[{}]'.format(n) '${{HANDLE}}/dims[{}]'.format(n)
for n in range(self.ndim) for n in range(self.ndim)],
],
}, },
} }
...@@ -86,8 +81,7 @@ class Fill(Initializer): ...@@ -86,8 +81,7 @@ class Fill(Initializer):
'value': float(self.value), 'value': float(self.value),
'dims_descs': [ 'dims_descs': [
'${{HANDLE}}/dims[{}]'.format(n) '${{HANDLE}}/dims[{}]'.format(n)
for n in range(self.ndim) for n in range(self.ndim)],
],
}, },
} }
...@@ -107,8 +101,7 @@ class GlorotNormal(Initializer): ...@@ -107,8 +101,7 @@ class GlorotNormal(Initializer):
'mode': self.mode.lower(), 'mode': self.mode.lower(),
'dims_descs': [ 'dims_descs': [
'${{HANDLE}}/dims[{}]'.format(n) '${{HANDLE}}/dims[{}]'.format(n)
for n in range(self.ndim) for n in range(self.ndim)],
],
}, },
} }
...@@ -128,8 +121,7 @@ class GlorotUniform(Initializer): ...@@ -128,8 +121,7 @@ class GlorotUniform(Initializer):
'mode': self.mode.lower(), 'mode': self.mode.lower(),
'dims_descs': [ 'dims_descs': [
'${{HANDLE}}/dims[{}]'.format(n) '${{HANDLE}}/dims[{}]'.format(n)
for n in range(self.ndim) for n in range(self.ndim)],
],
}, },
} }
...@@ -149,8 +141,7 @@ class RandomNormal(Initializer): ...@@ -149,8 +141,7 @@ class RandomNormal(Initializer):
'std': float(self.std), 'std': float(self.std),
'dims_descs': [ 'dims_descs': [
'${{HANDLE}}/dims[{}]'.format(n) '${{HANDLE}}/dims[{}]'.format(n)
for n in range(self.ndim) for n in range(self.ndim)],
],
}, },
} }
...@@ -170,8 +161,7 @@ class RandomUniform(Initializer): ...@@ -170,8 +161,7 @@ class RandomUniform(Initializer):
'high': float(self.high), 'high': float(self.high),
'dims_descs': [ 'dims_descs': [
'${{HANDLE}}/dims[{}]'.format(n) '${{HANDLE}}/dims[{}]'.format(n)
for n in range(self.ndim) for n in range(self.ndim)],
],
}, },
} }
...@@ -191,7 +181,6 @@ class TruncatedNormal(Initializer): ...@@ -191,7 +181,6 @@ class TruncatedNormal(Initializer):
'std': float(self.std), 'std': float(self.std),
'dims_descs': [ 'dims_descs': [
'${{HANDLE}}/dims[{}]'.format(n) '${{HANDLE}}/dims[{}]'.format(n)
for n in range(self.ndim) for n in range(self.ndim)],
],
}, },
} }
...@@ -83,7 +83,7 @@ class LpNormalize(Operator): ...@@ -83,7 +83,7 @@ class LpNormalize(Operator):
} }
} }
def forward(self,inputs): def forward(self, inputs):
return self.dispatch(inputs, [self.alloc()]) return self.dispatch(inputs, [self.alloc()])
......
...@@ -25,14 +25,14 @@ from dragon.core.ops import init_ops_lib ...@@ -25,14 +25,14 @@ from dragon.core.ops import init_ops_lib
from dragon.core.ops import math_ops_lib from dragon.core.ops import math_ops_lib
def add(self, value): def add(self, other):
r"""Compute the element-wise addition. r"""Compute the element-wise addition.
.. math:: \text{out} = \text{self} + \text{value} .. math:: \text{out} = \text{self} + \text{other}
Parameters Parameters
---------- ----------
value : Union[dragon.EagerTensor, number] other : Union[dragon.EagerTensor, number]
The value to add. The value to add.
Returns Returns
...@@ -45,7 +45,7 @@ def add(self, value): ...@@ -45,7 +45,7 @@ def add(self, value):
`dragon.math.add(...)`_ : Compute the element-wise addition. `dragon.math.add(...)`_ : Compute the element-wise addition.
""" """
return _binary_op(self, value, 'Add') return _binary_op(self, other, 'Add')
def astype(self, dtype, inplace=False): def astype(self, dtype, inplace=False):
...@@ -114,14 +114,14 @@ def copy(self): ...@@ -114,14 +114,14 @@ def copy(self):
.instantiate().apply([self], None) .instantiate().apply([self], None)
def div(self, value): def div(self, other):
r"""Compute the element-wise division. r"""Compute the element-wise division.
.. math:: \text{out} = \text{self} \div \text{value} .. math:: \text{out} = \text{self} \div \text{other}
Parameters Parameters
---------- ----------
value : Union[dragon.EagerTensor, number] other : Union[dragon.EagerTensor, number]
The value to divide. The value to divide.
Returns Returns
...@@ -134,7 +134,7 @@ def div(self, value): ...@@ -134,7 +134,7 @@ def div(self, value):
`dragon.math.div(...)`_ : Compute the element-wise division. `dragon.math.div(...)`_ : Compute the element-wise division.
""" """
return _binary_op(self, value, 'Div') return _binary_op(self, other, 'Div')
def ge(self, other): def ge(self, other):
...@@ -271,14 +271,14 @@ def gt(self, other): ...@@ -271,14 +271,14 @@ def gt(self, other):
return _binary_op(self, other, 'Greater') return _binary_op(self, other, 'Greater')
def iadd(self, value): def iadd(self, other):
r"""Compute the element-wise addition. r"""Compute the element-wise addition.
.. math:: \text{self} \mathrel{+}= \text{value} .. math:: \text{self} \mathrel{+}= \text{other}
Parameters Parameters
---------- ----------
value : Union[dragon.EagerTensor, number] other : Union[dragon.EagerTensor, number]
The value to add. The value to add.
Returns Returns
...@@ -291,17 +291,17 @@ def iadd(self, value): ...@@ -291,17 +291,17 @@ def iadd(self, value):
`dragon.math.add(...)`_ : Compute the element-wise addition. `dragon.math.add(...)`_ : Compute the element-wise addition.
""" """
return _binary_op(self, value, 'Add', [self]) return _binary_op(self, other, 'Add', [self])
def idiv(self, value): def idiv(self, other):
r"""Compute the element-wise division. r"""Compute the element-wise division.
.. math:: \text{self} \mathrel{\div}= \text{value} .. math:: \text{self} \mathrel{\div}= \text{other}
Parameters Parameters
---------- ----------
value : Union[dragon.EagerTensor, number] other : Union[dragon.EagerTensor, number]
The value to divide. The value to divide.
Returns Returns
...@@ -314,17 +314,17 @@ def idiv(self, value): ...@@ -314,17 +314,17 @@ def idiv(self, value):
`dragon.math.div(...)`_ : Compute the element-wise division. `dragon.math.div(...)`_ : Compute the element-wise division.
""" """
return _binary_op(self, value, 'Div', [self]) return _binary_op(self, other, 'Div', [self])
def imul(self, value): def imul(self, other):
r"""Compute the element-wise multiplication. r"""Compute the element-wise multiplication.
.. math:: \text{self} \mathrel{\times}= \text{value} .. math:: \text{self} \mathrel{\times}= \text{other}
Parameters Parameters
---------- ----------
value : Union[dragon.EagerTensor, number] other : Union[dragon.EagerTensor, number]
The value to multiply. The value to multiply.
Returns Returns
...@@ -337,17 +337,17 @@ def imul(self, value): ...@@ -337,17 +337,17 @@ def imul(self, value):
`dragon.math.mul(...)`_ : Compute the element-wise multiplication. `dragon.math.mul(...)`_ : Compute the element-wise multiplication.
""" """
return _binary_op(self, value, 'Mul', [self]) return _binary_op(self, other, 'Mul', [self])
def isub(self, value): def isub(self, other):
r"""Compute the element-wise division. r"""Compute the element-wise division.
.. math:: \text{self} \mathrel{-}= \text{value} .. math:: \text{self} \mathrel{-}= \text{other}
Parameters Parameters
---------- ----------
value : Union[dragon.EagerTensor, number] other : Union[dragon.EagerTensor, number]
The value to subtract. The value to subtract.
Returns Returns
...@@ -360,7 +360,7 @@ def isub(self, value): ...@@ -360,7 +360,7 @@ def isub(self, value):
`dragon.math.sub(...)`_ : Compute the element-wise subtraction. `dragon.math.sub(...)`_ : Compute the element-wise subtraction.
""" """
return _binary_op(self, value, 'Sub', [self]) return _binary_op(self, other, 'Sub', [self])
def le(self, other): def le(self, other):
...@@ -409,14 +409,14 @@ def lt(self, other): ...@@ -409,14 +409,14 @@ def lt(self, other):
return _binary_op(self, other, 'Less') return _binary_op(self, other, 'Less')
def mul(self, value): def mul(self, other):
r"""Compute the element-wise multiplication. r"""Compute the element-wise multiplication.
.. math:: \text{out} = \text{self} \times \text{value} .. math:: \text{out} = \text{self} \times \text{other}
Parameters Parameters
---------- ----------
value : Union[dragon.EagerTensor, number] other : Union[dragon.EagerTensor, number]
The value to multiply. The value to multiply.
Returns Returns
...@@ -429,7 +429,7 @@ def mul(self, value): ...@@ -429,7 +429,7 @@ def mul(self, value):
`dragon.math.mul(...)`_ : Compute the element-wise multiplication. `dragon.math.mul(...)`_ : Compute the element-wise multiplication.
""" """
return _binary_op(self, value, 'Mul') return _binary_op(self, other, 'Mul')
def neg(self): def neg(self):
...@@ -478,14 +478,14 @@ def normal(self, mean=0, std=1): ...@@ -478,14 +478,14 @@ def normal(self, mean=0, std=1):
).apply(shape, out=self) ).apply(shape, out=self)
def radd(self, value): def radd(self, other):
r"""Compute the element-wise addition. r"""Compute the element-wise addition.
.. math:: \text{out} = \text{value} + \text{self} .. math:: \text{out} = \text{other} + \text{self}
Parameters Parameters
---------- ----------
value : Union[dragon.EagerTensor, number] other : Union[dragon.EagerTensor, number]
The value to add. The value to add.
Returns Returns
...@@ -498,17 +498,17 @@ def radd(self, value): ...@@ -498,17 +498,17 @@ def radd(self, value):
`dragon.math.add(...)`_ : Compute the element-wise addition. `dragon.math.add(...)`_ : Compute the element-wise addition.
""" """
return _binary_op(value, self, 'Add') return _binary_op(other, self, 'Add')
def rdiv(self, value): def rdiv(self, other):
r"""Compute the element-wise division. r"""Compute the element-wise division.
.. math:: \text{out} = \text{value} \div \text{self} .. math:: \text{out} = \text{other} \div \text{self}
Parameters Parameters
---------- ----------
value : Union[dragon.EagerTensor, number] other : Union[dragon.EagerTensor, number]
The value to be divided. The value to be divided.
Returns Returns
...@@ -521,7 +521,7 @@ def rdiv(self, value): ...@@ -521,7 +521,7 @@ def rdiv(self, value):
`dragon.math.div(...)`_ : Compute the element-wise division. `dragon.math.div(...)`_ : Compute the element-wise division.
""" """
return _binary_op(value, self, 'Div') return _binary_op(other, self, 'Div')
def reshape(self, shape): def reshape(self, shape):
...@@ -546,14 +546,14 @@ def reshape(self, shape): ...@@ -546,14 +546,14 @@ def reshape(self, shape):
return array_ops.reshape(self, shape=shape) return array_ops.reshape(self, shape=shape)
def rmul(self, value): def rmul(self, other):
r"""Compute the element-wise multiplication. r"""Compute the element-wise multiplication.
.. math:: \text{out} = \text{value} \times \text{self} .. math:: \text{out} = \text{other} \times \text{self}
Parameters Parameters
---------- ----------
value : Union[dragon.EagerTensor, number] other : Union[dragon.EagerTensor, number]
The value to multiply. The value to multiply.
Returns Returns
...@@ -566,17 +566,17 @@ def rmul(self, value): ...@@ -566,17 +566,17 @@ def rmul(self, value):
`dragon.math.mul(...)`_ : Compute the element-wise multiplication. `dragon.math.mul(...)`_ : Compute the element-wise multiplication.
""" """
return _binary_op(value, self, 'Mul') return _binary_op(other, self, 'Mul')
def rsub(self, value): def rsub(self, other):
r"""Compute the element-wise subtraction. r"""Compute the element-wise subtraction.
.. math:: \text{out} = \text{value} - \text{self} .. math:: \text{out} = \text{other} - \text{self}
Parameters Parameters
---------- ----------
value : Union[dragon.EagerTensor, number] other : Union[dragon.EagerTensor, number]
The value to be subtracted. The value to be subtracted.
Returns Returns
...@@ -589,7 +589,7 @@ def rsub(self, value): ...@@ -589,7 +589,7 @@ def rsub(self, value):
`dragon.math.sub(...)`_ : Compute the element-wise subtraction. `dragon.math.sub(...)`_ : Compute the element-wise subtraction.
""" """
return _binary_op(value, self, 'Sub') return _binary_op(other, self, 'Sub')
def setitem(self, key, value): def setitem(self, key, value):
...@@ -618,14 +618,14 @@ def setitem(self, key, value): ...@@ -618,14 +618,14 @@ def setitem(self, key, value):
_section_assign(self, value, starts, sizes) _section_assign(self, value, starts, sizes)
def sub(self, value): def sub(self, other):
r"""Compute the element-wise subtraction. r"""Compute the element-wise subtraction.
.. math:: \text{out} = \text{self} - \text{value} .. math:: \text{out} = \text{self} - \text{other}
Parameters Parameters
---------- ----------
value : Union[dragon.EagerTensor, number] other : Union[dragon.EagerTensor, number]
The value to subtract. The value to subtract.
Returns Returns
...@@ -638,7 +638,7 @@ def sub(self, value): ...@@ -638,7 +638,7 @@ def sub(self, value):
`dragon.math.sub(...)`_ : Compute the element-wise subtraction. `dragon.math.sub(...)`_ : Compute the element-wise subtraction.
""" """
return _binary_op(self, value, 'Sub') return _binary_op(self, other, 'Sub')
def truncated_normal(self, mean=0, std=1): def truncated_normal(self, mean=0, std=1):
...@@ -809,3 +809,4 @@ EagerTensor.__rsub__ = rsub ...@@ -809,3 +809,4 @@ EagerTensor.__rsub__ = rsub
EagerTensor.__rtruediv__ = rdiv EagerTensor.__rtruediv__ = rdiv
EagerTensor.__setitem__ = setitem EagerTensor.__setitem__ = setitem
EagerTensor.__sub__ = sub EagerTensor.__sub__ = sub
EagerTensor.__truediv__ = div
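The module-level functions above are attached to ``EagerTensor`` after the class is defined; a minimal standalone sketch of that binding pattern, where ``Vec`` is purely illustrative:

```python
class Vec(object):
    """Tiny illustrative container; stands in for EagerTensor."""
    def __init__(self, data):
        self.data = list(data)

def add(self, other):
    """Module-level implementation, later bound as the dunder method."""
    rhs = other.data if isinstance(other, Vec) else [other] * len(self.data)
    return Vec(a + b for a, b in zip(self.data, rhs))

def radd(self, other):
    return add(self, other)

# Bind the functional implementations to the class, mirroring the assignments above.
Vec.__add__ = add
Vec.__radd__ = radd

print((Vec([1, 2]) + Vec([3, 4])).data)  # [4, 6]
print((1 + Vec([1, 2])).data)            # [2, 3]
```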
...@@ -23,14 +23,14 @@ from dragon.core.framework import workspace ...@@ -23,14 +23,14 @@ from dragon.core.framework import workspace
from dragon.core.ops import array_ops from dragon.core.ops import array_ops
def add(self, value): def add(self, other):
r"""Compute the element-wise addition. r"""Compute the element-wise addition.
.. math:: \text{out} = \text{self} + \text{value} .. math:: \text{out} = \text{self} + \text{other}
Parameters Parameters
---------- ----------
value : Union[dragon.Tensor, number] other : Union[dragon.Tensor, number]
The value to add. The value to add.
Returns Returns
...@@ -43,7 +43,7 @@ def add(self, value): ...@@ -43,7 +43,7 @@ def add(self, value):
`dragon.math.add(...)`_ : Compute the element-wise addition. `dragon.math.add(...)`_ : Compute the element-wise addition.
""" """
return _binary_op(self, value, 'Add') return _binary_op(self, other, 'Add')
def astype(self, dtype, inplace=False): def astype(self, dtype, inplace=False):
...@@ -89,14 +89,14 @@ def copy(self): ...@@ -89,14 +89,14 @@ def copy(self):
return OpDef.apply('Copy', [self], [outputs]) return OpDef.apply('Copy', [self], [outputs])
def div(self, value): def div(self, other):
r"""Compute the element-wise division. r"""Compute the element-wise division.
.. math:: \text{out} = \text{self} \div \text{value} .. math:: \text{out} = \text{self} \div \text{other}
Parameters Parameters
---------- ----------
value : Union[dragon.Tensor, number] other : Union[dragon.Tensor, number]
The value to divide. The value to divide.
Returns Returns
...@@ -109,7 +109,7 @@ def div(self, value): ...@@ -109,7 +109,7 @@ def div(self, value):
`dragon.math.div(...)`_ : Compute the element-wise division. `dragon.math.div(...)`_ : Compute the element-wise division.
""" """
return _binary_op(self, value, 'Div') return _binary_op(self, other, 'Div')
def ge(self, other): def ge(self, other):
...@@ -172,12 +172,8 @@ def get_value(self): ...@@ -172,12 +172,8 @@ def get_value(self):
numpy.ndarray numpy.ndarray
The deep copied value. The deep copied value.
See Also
--------
`dragon.workspace.fetch_tensor(...)`_ : Fetch the value of given tensor.
""" """
return workspace.fetch_tensor(self) return workspace.get_workspace().fetch_tensor(self)
def gt(self, other): def gt(self, other):
...@@ -249,14 +245,14 @@ def lt(self, other): ...@@ -249,14 +245,14 @@ def lt(self, other):
return _binary_op(self, other, 'Less') return _binary_op(self, other, 'Less')
def mul(self, value): def mul(self, other):
r"""Compute the element-wise multiplication. r"""Compute the element-wise multiplication.
.. math:: \text{out} = \text{self} \times \text{value} .. math:: \text{out} = \text{self} \times \text{other}
Parameters Parameters
---------- ----------
value : Union[dragon.Tensor, number] other : Union[dragon.Tensor, number]
The value to multiply. The value to multiply.
Returns Returns
...@@ -269,7 +265,7 @@ def mul(self, value): ...@@ -269,7 +265,7 @@ def mul(self, value):
`dragon.math.mul(...)`_ : Compute the element-wise multiplication. `dragon.math.mul(...)`_ : Compute the element-wise multiplication.
""" """
return _binary_op(self, value, 'Mul') return _binary_op(self, other, 'Mul')
def neg(self): def neg(self):
...@@ -290,14 +286,14 @@ def neg(self): ...@@ -290,14 +286,14 @@ def neg(self):
return _unary_op(self, 'Neg') return _unary_op(self, 'Neg')
def radd(self, value): def radd(self, other):
r"""Compute the element-wise addition. r"""Compute the element-wise addition.
.. math:: \text{out} = \text{value} + \text{self} .. math:: \text{out} = \text{other} + \text{self}
Parameters Parameters
---------- ----------
value : Union[dragon.Tensor, number] other : Union[dragon.Tensor, number]
The value to add. The value to add.
Returns Returns
...@@ -310,17 +306,17 @@ def radd(self, value): ...@@ -310,17 +306,17 @@ def radd(self, value):
`dragon.math.add(...)`_ : Compute the element-wise addition. `dragon.math.add(...)`_ : Compute the element-wise addition.
""" """
return _binary_op(value, self, 'Add') return _binary_op(other, self, 'Add')
def rdiv(self, value): def rdiv(self, other):
r"""Compute the element-wise division. r"""Compute the element-wise division.
.. math:: \text{out} = \text{value} \div \text{self} .. math:: \text{out} = \text{other} \div \text{self}
Parameters Parameters
---------- ----------
value : Union[dragon.Tensor, number] other : Union[dragon.Tensor, number]
The value to be divided. The value to be divided.
Returns Returns
...@@ -333,7 +329,7 @@ def rdiv(self, value): ...@@ -333,7 +329,7 @@ def rdiv(self, value):
`dragon.math.div(...)`_ : Compute the element-wise division. `dragon.math.div(...)`_ : Compute the element-wise division.
""" """
return _binary_op(value, self, 'Div') return _binary_op(other, self, 'Div')
def reshape(self, shape): def reshape(self, shape):
...@@ -358,14 +354,14 @@ def reshape(self, shape): ...@@ -358,14 +354,14 @@ def reshape(self, shape):
return array_ops.reshape(self, shape=shape) return array_ops.reshape(self, shape=shape)
def rmul(self, value): def rmul(self, other):
r"""Compute the element-wise multiplication. r"""Compute the element-wise multiplication.
.. math:: \text{out} = \text{value} \times \text{self} .. math:: \text{out} = \text{other} \times \text{self}
Parameters Parameters
---------- ----------
value : Union[dragon.Tensor, number] other : Union[dragon.Tensor, number]
The value to multiply. The value to multiply.
Returns Returns
...@@ -378,17 +374,17 @@ def rmul(self, value): ...@@ -378,17 +374,17 @@ def rmul(self, value):
`dragon.math.mul(...)`_ : Compute the element-wise multiplication. `dragon.math.mul(...)`_ : Compute the element-wise multiplication.
""" """
return _binary_op(value, self, 'Mul') return _binary_op(other, self, 'Mul')
def rsub(self, value): def rsub(self, other):
r"""Compute the element-wise subtraction. r"""Compute the element-wise subtraction.
.. math:: \text{out} = \text{value} - \text{self} .. math:: \text{out} = \text{other} - \text{self}
Parameters Parameters
---------- ----------
value : Union[dragon.Tensor, number] other : Union[dragon.Tensor, number]
The value to be subtracted. The value to be subtracted.
Returns Returns
...@@ -401,7 +397,7 @@ def rsub(self, value): ...@@ -401,7 +397,7 @@ def rsub(self, value):
`dragon.math.sub(...)`_ : Compute the element-wise subtraction. `dragon.math.sub(...)`_ : Compute the element-wise subtraction.
""" """
return _binary_op(value, self, 'Sub') return _binary_op(other, self, 'Sub')
def setitem(self, key, value): def setitem(self, key, value):
...@@ -443,23 +439,19 @@ def set_value(self, value): ...@@ -443,23 +439,19 @@ def set_value(self, value):
dragon.Tensor dragon.Tensor
The self. The self.
See Also
--------
`dragon.workspace.feed_tensor(...)`_ : Feed the value to the given tensor.
""" """
workspace.feed_tensor(self, value) workspace.get_workspace().feed_tensor(self, value)
return self return self
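A hedged round-trip sketch for the two accessors above; it assumes a working Dragon install, and ``dragon.Tensor('x')`` follows the docstring example earlier in this commit.

```python
import dragon

x = dragon.Tensor('x')
x.set_value([[1, 2, 3]])  # Copies into the default workspace via feed_tensor
print(x.get_value())      # Copies back as numpy.ndarray, e.g. [[1. 2. 3.]]
```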
def sub(self, value): def sub(self, other):
r"""Compute the element-wise subtraction. r"""Compute the element-wise subtraction.
.. math:: \text{out} = \text{self} - \text{value} .. math:: \text{out} = \text{self} - \text{other}
Parameters Parameters
---------- ----------
value : Union[dragon.Tensor, number] other : Union[dragon.Tensor, number]
The value to subtract. The value to subtract.
Returns Returns
...@@ -472,7 +464,7 @@ def sub(self, value): ...@@ -472,7 +464,7 @@ def sub(self, value):
`dragon.math.sub(...)`_ : Compute the element-wise subtraction. `dragon.math.sub(...)`_ : Compute the element-wise subtraction.
""" """
return _binary_op(self, value, 'Sub') return _binary_op(self, other, 'Sub')
def _binary_op(a, b, op_type): def _binary_op(a, b, op_type):
...@@ -570,3 +562,4 @@ Tensor.__rtruediv__ = rdiv ...@@ -570,3 +562,4 @@ Tensor.__rtruediv__ = rdiv
Tensor.__rsub__ = rsub Tensor.__rsub__ = rsub
Tensor.__setitem__ = setitem Tensor.__setitem__ = setitem
Tensor.__sub__ = sub Tensor.__sub__ = sub
Tensor.__truediv__ = div
...@@ -157,7 +157,7 @@ def conv2d_transpose( ...@@ -157,7 +157,7 @@ def conv2d_transpose(
group : int, optional, default=1 group : int, optional, default=1
The group size of convolution. The group size of convolution.
output_padding : Sequence[Union[int, dragon.Tensor]], optional output_padding : Sequence[Union[int, dragon.Tensor]], optional
The value padded to the right side. The extra size added to the output.
output_shape : Sequence[Union[int, dragon.Tensor]], optional output_shape : Sequence[Union[int, dragon.Tensor]], optional
The output shape for **SAME** padding. The output shape for **SAME** padding.
padding : {'VALID', 'SAME', 'SAME_UPPER', 'SAME_LOWER'}, optional padding : {'VALID', 'SAME', 'SAME_UPPER', 'SAME_LOWER'}, optional
...@@ -176,7 +176,7 @@ def conv2d_transpose( ...@@ -176,7 +176,7 @@ def conv2d_transpose(
raise ValueError('Unsupported padding algorithm: %s' % padding) raise ValueError('Unsupported padding algorithm: %s' % padding)
if data_format not in ('NCHW', 'NHWC'): if data_format not in ('NCHW', 'NHWC'):
raise ValueError('Unsupported data format: %s' % data_format) raise ValueError('Unsupported data format: %s' % data_format)
if output_padding is not None or output_shape is not None: if output_shape is not None and 'SAME' not in padding:
args['padding'] = 'SAME' args['padding'] = 'SAME'
for key in ('kernel_shape', 'strides', 'pads', 'dilations'): for key in ('kernel_shape', 'strides', 'pads', 'dilations'):
if key == 'pads': if key == 'pads':
...@@ -327,7 +327,7 @@ def pool2d( ...@@ -327,7 +327,7 @@ def pool2d(
pads=0, pads=0,
padding='VALID', padding='VALID',
ceil_mode=False, ceil_mode=False,
mode='max', mode='MAX',
data_format='NCHW', data_format='NCHW',
global_pooling=False, global_pooling=False,
**kwargs **kwargs
...@@ -366,7 +366,8 @@ def pool2d( ...@@ -366,7 +366,8 @@ def pool2d(
""" """
args = parse_args(locals()) args = parse_args(locals())
if mode not in ('MAX', 'AVG'): args['mode'] = mode.upper()
if args['mode'] not in ('MAX', 'AVG'):
raise ValueError('Unsupported pooling mode: %s' % mode) raise ValueError('Unsupported pooling mode: %s' % mode)
if padding not in ('VALID', 'SAME', 'SAME_UPPER', 'SAME_LOWER'): if padding not in ('VALID', 'SAME', 'SAME_UPPER', 'SAME_LOWER'):
raise ValueError('Unsupported padding algorithm: %s' % padding) raise ValueError('Unsupported padding algorithm: %s' % padding)
...@@ -386,7 +387,7 @@ def pool2d( ...@@ -386,7 +387,7 @@ def pool2d(
pads=args['pads'], pads=args['pads'],
padding=padding, padding=padding,
ceil_mode=ceil_mode, ceil_mode=ceil_mode,
mode=mode, mode=args['mode'],
data_format=data_format, data_format=data_format,
global_pooling=global_pooling, global_pooling=global_pooling,
).apply([inputs]) ).apply([inputs])
......
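Since `mode` is upper-cased before validation, lower-case pooling modes are now accepted as well. A short call sketch, assuming the operator is exposed as `dragon.nn.pool2d` and that scalar arguments broadcast over both spatial axes (neither detail is shown in this hunk):

```python
import numpy as np
import dragon

x = dragon.EagerTensor(np.random.rand(1, 3, 8, 8).astype('float32'), copy=True)

# 'max' and 'MAX' are now equivalent after the mode.upper() normalization.
y = dragon.nn.pool2d(x, kernel_shape=2, strides=2, mode='max', data_format='NCHW')
```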
...@@ -104,9 +104,6 @@ class ConvTranspose2d(_ConvNd): ...@@ -104,9 +104,6 @@ class ConvTranspose2d(_ConvNd):
super(ConvTranspose2d, self).__init__(key, dev, **kwargs) super(ConvTranspose2d, self).__init__(key, dev, **kwargs)
self.output_padding = kwargs.get('output_padding', None) self.output_padding = kwargs.get('output_padding', None)
self.output_shape = kwargs.get('output_shape', None) self.output_shape = kwargs.get('output_shape', None)
if self.output_padding is not None or \
self.output_shape is not None:
self.padding = 'SAME'
def attributes(self): def attributes(self):
return { return {
...@@ -169,13 +166,11 @@ class Resize(Operator): ...@@ -169,13 +166,11 @@ class Resize(Operator):
'mode': self.mode, 'mode': self.mode,
'align_corners': self.align_corners, 'align_corners': self.align_corners,
'sizes_descs': [ 'sizes_descs': [
'${{HANDLE}}/sizes[{}]'.format(n) '${{HANDLE}}/sizes[{}]'
for n in range(self.num_sizes) .format(n) for n in range(self.num_sizes)],
],
'scales_descs': [ 'scales_descs': [
'${{HANDLE}}/scales[{}]'.format(n) '${{HANDLE}}/scales[{}]'
for n in range(self.num_scales) .format(n) for n in range(self.num_scales)],
],
'data_format': self.data_format, 'data_format': self.data_format,
} }
} }
...@@ -183,22 +178,18 @@ class Resize(Operator): ...@@ -183,22 +178,18 @@ class Resize(Operator):
def feed(self, ws, handle, sizes, scales): def feed(self, ws, handle, sizes, scales):
for i in range(self.num_sizes): for i in range(self.num_sizes):
self.feed_arg( self.feed_arg(
ws, ws, '{}/sizes[{}]'.format(handle, i),
'{}/sizes[{}]'.format(handle, i), sizes[i], 'int64')
sizes[i], 'int64',
)
for i in range(self.num_scales): for i in range(self.num_scales):
self.feed_arg( self.feed_arg(
ws, ws, '{}/scales[{}]'.format(handle, i),
'{}/scales[{}]'.format(handle, i), scales[i], 'float32')
scales[i], 'float32',
)
def forward(self, inputs, sizes=None, scales=None): def forward(self, inputs, sizes=None, scales=None):
return self.dispatch( return self.dispatch(
inputs, [self.alloc()], inputs, [self.alloc()],
callback=lambda ws, handle: callback=lambda ws, handle:
self.feed(ws, handle, sizes, scales) self.feed(ws, handle, sizes, scales),
) )
......
...@@ -10,61 +10,45 @@ package dragon; ...@@ -10,61 +10,45 @@ package dragon;
// Store the serialized Tensor objects. // Store the serialized Tensor objects.
message TensorProto { message TensorProto {
repeated int32 dims = 1; repeated int32 dims = 1;
enum DataType { enum DataType {
UNDEFINED = 0; UNDEFINED = 0;
// Basic types. // Basic types.
FLOAT = 1; FLOAT = 1;
INT32 = 2; INT32 = 2;
BYTE = 3; BYTE = 3;
STRING = 4; STRING = 4;
// Less-commonly used data types. // Less-commonly used data types.
BOOL = 5; BOOL = 5;
UINT8 = 6; UINT8 = 6;
INT8 = 7; INT8 = 7;
UINT16 = 8; UINT16 = 8;
INT16 = 9; INT16 = 9;
INT64 = 10; INT64 = 10;
FLOAT16 = 12; FLOAT16 = 12;
DOUBLE = 13; DOUBLE = 13;
} }
optional DataType data_type = 2 [default = FLOAT]; optional DataType data_type = 2 [default = FLOAT];
// For float. // For float.
repeated float float_data = 3 [packed = true]; repeated float float_data = 3 [packed = true];
// For int32, uint8, int8, uint16, int16, bool, and float16 // For int32, uint8, int8, uint16, int16, bool, and float16
// Note about float16: in storage we will basically convert float16 byte-wise // Note about float16: in storage we will basically convert float16 byte-wise
// to unsigned short and then store them in the int32_data field. // to unsigned short and then store them in the int32_data field.
repeated int32 int32_data = 4 [packed = true]; repeated int32 int32_data = 4 [packed = true];
// For bytes. // For bytes.
optional bytes byte_data = 5; optional bytes byte_data = 5;
// For strings. // For strings.
repeated bytes string_data = 6; repeated bytes string_data = 6;
// For double. // For double.
repeated double double_data = 9 [packed = true]; repeated double double_data = 9 [packed = true];
// For int64. // For int64.
repeated int64 int64_data = 10 [packed = true]; repeated int64 int64_data = 10 [packed = true];
// Store the raw data, contents are serialized as little-endian. // Store the raw data, contents are serialized as little-endian.
optional bytes raw_data = 13; optional bytes raw_data = 13;
// Optionally, a name for the tensor. // Optionally, a name for the tensor.
optional string name = 7; optional string name = 7;
}
// Record the filler of Tensor.
// This structure is kept for backward compatibility
// with caffe1, which relies implicit initializer.
message TensorFillerProto {
optional string tensor = 1;
optional string type = 2 [default = 'constant'];
optional float value = 3 [default = 0];
optional float low = 4 [default = 0];
optional float high = 5 [default = 1];
optional float mean = 6 [default = 0];
optional float std = 7 [default = 1];
optional float scale = 8 [default = 3];
enum VarianceNorm { FAN_IN = 0; FAN_OUT = 1; FAN_AVG=2; }
optional VarianceNorm variance_norm = 9 [default = FAN_IN];
} }
// Store multiple TensorProto objects in one single proto. // Store multiple TensorProto objects in one single proto.
...@@ -74,99 +58,116 @@ message TensorProtos { ...@@ -74,99 +58,116 @@ message TensorProtos {
// DeviceType that Dragon currently supports. // DeviceType that Dragon currently supports.
enum DeviceTypeProto { enum DeviceTypeProto {
// The default device. // The default device.
PROTO_CPU = 0; PROTO_CPU = 0;
// NVIDIA's CUDA Environment. // NVIDIA's CUDA Environment.
PROTO_CUDA = 1; PROTO_CUDA = 1;
// CAMBRICON's CNML Environment. // CAMBRICON's CNML Environment.
PROTO_CNML = 2; PROTO_CNML = 2;
} }
// Device-specific options. // Device-specific options.
message DeviceOption { message DeviceOption {
// The type of device to dispatch executions. // The type of device to dispatch executions.
optional DeviceTypeProto device_type = 1 [default = PROTO_CPU]; optional DeviceTypeProto device_type = 1 [default = PROTO_CPU];
// The index of this device. // The index of this device.
optional int32 device_id = 2 [default = 0]; optional int32 device_id = 2 [default = 0];
// The random seed to start the random generator. // The random seed to start the random generator.
optional uint32 random_seed = 3 [default = 3]; optional uint32 random_seed = 3 [default = 3];
} }
// A named argument containing either singular float, integer and string // A named argument containing either singular float, integer and string
// values, or repeated float, int and string arrays. // values, or repeated float, int and string arrays.
message Argument { message Argument {
// The name of this argument. // The name of this argument.
optional string name = 1; optional string name = 1;
// Store the float32 value. // Store the float32 value.
optional float f = 2; optional float f = 2;
// Store the bool, int32, int64 value. // Store the bool, int32, int64 value.
optional int64 i = 3; optional int64 i = 3;
// Store the string value. // Store the string value.
optional bytes s = 4; optional bytes s = 4;
// Store the float32 values. // Store the float32 values.
repeated float floats = 7; repeated float floats = 7;
// Store the bool, int32, int64 values. // Store the bool, int32, int64 values.
repeated int64 ints = 8; repeated int64 ints = 8;
// Store the string values. // Store the string values.
repeated bytes strings = 9; repeated bytes strings = 9;
} }
// Operator Definition // Operator Definition
message OperatorDef { message OperatorDef {
// The name of inputs. // The name of inputs.
repeated string input = 1; repeated string input = 1;
// The name of outputs. // The name of outputs.
repeated string output = 2; repeated string output = 2;
// The optional name of this operator. // The optional name of this operator.
optional string name = 3; optional string name = 3;
// The operator type. // The operator type.
optional string type = 4; optional string type = 4;
// The arguments. // The arguments.
repeated Argument arg = 5; repeated Argument arg = 5;
// The device option that the operator should run under. // The device option that the operator should run under.
optional DeviceOption device_option = 6; optional DeviceOption device_option = 6;
// The optional unique key for this operator. // The optional unique key for this operator.
// Set it to persist operators in the eager mode. // Set it to persist operators in the eager mode.
optional string cache_key = 7; optional string cache_key = 7;
}
// Record the gradient information
message GradientProto {
// The derivative target.
optional string cost = 1;
// The target with respect to?
optional string wrt = 2;
// The external gradient
optional string external = 3;
} }
// Graph Definition // Graph Definition
message GraphDef { message GraphDef {
// The graph name. // The graph name.
optional string name = 1; optional string name = 1;
// The operators to execute. // The operators to execute.
repeated OperatorDef op = 2; repeated OperatorDef op = 2;
// The type of graph. // The type of graph.
optional string graph_type = 3; optional string graph_type = 3;
// The device option for this graph. // The device option for this graph.
optional DeviceOption device_option = 5; optional DeviceOption device_option = 5;
// The arguments. // The arguments.
repeated Argument arg = 6; repeated Argument arg = 6;
// The name of inputs. // The name of inputs.
repeated string input = 7; repeated string input = 7;
// The name of outputs. // The name of outputs.
repeated string output = 8; repeated string output = 8;
// The gradients information. // The info of gradients.
repeated GradientProto gradient = 9; repeated GradientInfo grad_info = 9;
} }
\ No newline at end of file
// Record the filler information.
// This structure is kept for backward compatibility
// with caffe, which relies on the implicit initializer.
message FillerInfo {
enum VarianceNorm {
FAN_IN = 0;
FAN_OUT = 1;
FAN_AVG = 2;
}
optional string type = 1 [default = 'constant'];
optional float value = 2 [default = 0];
optional float low = 3 [default = 0];
optional float high = 4 [default = 1];
optional float mean = 5 [default = 0];
optional float std = 6 [default = 1];
optional float scale = 7 [default = 3];
optional VarianceNorm variance_norm = 8 [default = FAN_IN];
}
// Record the gradient information.
message GradientInfo {
// The derivative target.
optional string y = 1;
// The differentiated inputs.
repeated string xs = 2;
}
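These descriptors are compiled into the `dragon_pb2` module imported elsewhere in this commit, so the renamed messages can be built directly from Python. A brief sketch using only the fields defined above (the field values are arbitrary):

```python
from dragon.core.proto import dragon_pb2

# A small float tensor serialized through TensorProto.
tensor = dragon_pb2.TensorProto(
    name='weight', dims=[2, 3],
    data_type=dragon_pb2.TensorProto.FLOAT,
    float_data=[0.1] * 6)

# The caffe-style filler record, now named FillerInfo.
filler = dragon_pb2.FillerInfo(
    type='glorot_uniform', scale=3.,
    variance_norm=dragon_pb2.FillerInfo.FAN_AVG)

data = tensor.SerializeToString()  # round-trips via ParseFromString
```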
...@@ -112,7 +112,7 @@ class Optimizer(object): ...@@ -112,7 +112,7 @@ class Optimizer(object):
if extra is not None: if extra is not None:
self._defaults = dict(self._defaults, **extra) self._defaults = dict(self._defaults, **extra)
for k, v in self._defaults.items(): for k, v in self._defaults.items():
workspace.feed_tensor( workspace.get_workspace().feed_tensor(
'/share/hyper/%s/%s' % (self._op_handle, k), v, '/share/hyper/%s/%s' % (self._op_handle, k), v,
dtype='float32', enforce_cpu=True, dtype='float32', enforce_cpu=True,
) )
...@@ -140,14 +140,14 @@ class Optimizer(object): ...@@ -140,14 +140,14 @@ class Optimizer(object):
def __getattr__(self, item): def __getattr__(self, item):
defaults = self.__dict__.get('_defaults') defaults = self.__dict__.get('_defaults')
if item in defaults: if item in defaults:
return workspace.fetch_tensor( return workspace.get_workspace().fetch_tensor(
'/share/hyper/%s/%s' % (self._op_handle, item)) '/share/hyper/%s/%s' % (self._op_handle, item))
return self.__dict__[item] return self.__dict__[item]
def __setattr__(self, key, value): def __setattr__(self, key, value):
defaults = self.__dict__.get('_defaults') defaults = self.__dict__.get('_defaults')
if defaults is not None and key in defaults: if defaults is not None and key in defaults:
workspace.feed_tensor( workspace.get_workspace().feed_tensor(
'/share/hyper/%s/%s' % (self._op_handle, key), value, '/share/hyper/%s/%s' % (self._op_handle, key), value,
dtype='float32', enforce_cpu=True) dtype='float32', enforce_cpu=True)
else: else:
......
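Hyper-parameters are therefore plain workspace tensors keyed as `/share/hyper/<handle>/<name>`, fed and fetched through the current workspace instead of the removed module-level API. A rough illustration of that storage pattern (the handle and name below are made up for the example):

```python
from dragon.core.framework import workspace

current_ws = workspace.get_workspace()

# Write a hyper-parameter exactly as Optimizer.__setattr__ does.
current_ws.feed_tensor('/share/hyper/Optimizer_1/base_lr', 0.01,
                       dtype='float32', enforce_cpu=True)

# Read it back as Optimizer.__getattr__ does.
base_lr = current_ws.fetch_tensor('/share/hyper/Optimizer_1/base_lr')
```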
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Define the base updater class.
We dub it ``Updater`` because ``Optimizer``
has already been taken by many deep learning frameworks.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.core import distributed
from dragon.core.eager import context
from dragon.core.framework import workspace
from dragon.core.ops import distributed_ops_lib
from dragon.core.ops import training_ops_lib
class Updater(object):
"""The base class of updaters."""
# Store the global unique slot index.
_DEFAULT_UNIQUE_SLOT_ID = 0
def __init__(
self,
scale_gradient=1.,
clip_gradient=-1.,
l2_decay=-1.,
name=None,
):
"""Create an ``Updater``.
Parameters
----------
scale_gradient : float, optional, default=1.
The factor to scale gradients.
clip_gradient : float, optional, default=-1.
The norm threshold to clip gradients.
l2_decay : float, optional, default=-1.
The l2 decay factor.
name : str, optional
The optional name for buffers.
"""
self._defaults = {
'scale_gradient': scale_gradient,
'clip_gradient': clip_gradient,
'l2_decay': l2_decay,
}
self._param_group = []
if name:
self._slot = name
else:
Updater._DEFAULT_UNIQUE_SLOT_ID += 1
self._slot = 'Updater/Slot:{}'.format(
Updater._DEFAULT_UNIQUE_SLOT_ID)
self._op_type = None
self._process_group = distributed.get_group()
self._extra_kwargs = {}
def apply_gradients(
self,
values_and_grads,
lr_mult=None,
decay_mult=None,
):
"""Apply the gradients on values.
Parameters
----------
values_and_grads : Sequence[Sequence[dragon.Tensor]]
The values and grads.
lr_mult : number, optional
The multiplier on learning rate.
decay_mult : number, optional
The multiplier on weight decay.
"""
if context.executing_eagerly():
# Filter out the values whose grad is missing.
values, grads = [], []
for v, g in values_and_grads:
if g is not None:
values.append(v)
grads.append(g)
# Accumulate grads from the current process group.
if self._process_group is not None:
distributed_ops_lib.Collective \
.instantiate(
operation='MEAN',
communication='ALLREDUCE',
group=self._process_group,
).apply(grads)
# Apply the updates.
for v, g in zip(values, grads):
self._run_update(v, g, lr_mult, decay_mult)
else:
# Store for the lazy compilation.
for v, g in values_and_grads:
self._add_update(v, g, lr_mult, decay_mult)
return self
def _init_set_defaults(self, extra=None):
"""Initialize the defaults into current workspace."""
if extra is not None:
self._defaults = dict(self._defaults, **extra)
self._op_type = self.__class__.__name__ + 'Update'
for k, v in self._defaults.items():
workspace.feed_tensor(
self._slot + "/" + k, v,
dtype='float32', enforce_cpu=True,
)
def _add_update(self, param, grad, lr_mult=None, decay_mult=None):
"""Add a symbolic operator for updating."""
# Use ids for tensors and store the pair as a tuple (not a generator).
pair = tuple(v.id if hasattr(v, 'id') else v for v in (param, grad))
self._param_group.append(
(pair, {
'lr_mult': float(lr_mult) if lr_mult is not None else 1.,
'decay_mult': float(decay_mult) if decay_mult is not None else 1.,
})
)
def _run_update(self, param, grad, lr_mult=None, decay_mult=None):
"""Run an eager operation for updating."""
return training_ops_lib.ParamUpdate \
.instantiate(
slot=self._slot,
op_type=self._op_type,
lr_mult=float(lr_mult) if lr_mult is not None else 1.,
decay_mult=float(decay_mult) if decay_mult is not None else 1.,
).apply(grad, param)
def __getattr__(self, item):
defaults = self.__dict__.get('_defaults')
if item in defaults:
return workspace.fetch_tensor(
self._slot + '/' + item)
return self.__dict__[item]
def __setattr__(self, key, value):
defaults = self.__dict__.get('_defaults')
if defaults is not None and key in defaults:
workspace.feed_tensor(
self._slot + '/' + key, value,
dtype='float32', enforce_cpu=True,
)
else:
object.__setattr__(self, key, value)
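A concrete updater only has to provide its defaults; the slot naming, the eager/symbolic split in `apply_gradients`, and the hyper-parameter feeding all come from this base class. The subclass below is hypothetical and only illustrates the wiring; the real optimizers in the tree register their own update operator types:

```python
class PlainSGD(Updater):
    """A made-up SGD-like updater built on the base class above."""

    def __init__(self, base_lr=0.01, **kwargs):
        super(PlainSGD, self).__init__(**kwargs)
        # Feeds the defaults into the workspace and derives
        # self._op_type = 'PlainSGDUpdate' from the class name.
        self._init_set_defaults(extra={'base_lr': base_lr})
```

`updater.apply_gradients([(param, grad), ...])` would then either run eager updates or record them for the lazy graph compilation, as shown above.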
...@@ -7,10 +7,6 @@ ...@@ -7,10 +7,6 @@
# #
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# Codes are based on:
#
# <https://github.com/onnx/onnx/blob/master/onnx/__init__.py>
#
# ------------------------------------------------------------ # ------------------------------------------------------------
from __future__ import absolute_import from __future__ import absolute_import
...@@ -22,29 +18,19 @@ from typing import IO ...@@ -22,29 +18,19 @@ from typing import IO
from typing import Optional from typing import Optional
from typing import Text from typing import Text
from google.protobuf import message
try:
from onnx import ModelProto
except ImportError:
from dragon.core.util import deprecation
ModelProto = deprecation.not_installed('onnx')
def save_bytes(str, f):
# str should be bytes, """Save bytes to the file."""
# f should be either writable or a file path. if hasattr(f, 'write') and callable(cast(IO[bytes], f).write):
def _save_bytes(str, f):
if hasattr(f, 'write') and \
callable(cast(IO[bytes], f).write):
cast(IO[bytes], f).write(str) cast(IO[bytes], f).write(str)
else: else:
with open(cast(Text, f), 'wb') as writable: with open(cast(Text, f), 'wb') as writable:
writable.write(str) writable.write(str)
# f should be either readable or a file path. def load_bytes(f):
def _load_bytes(f): """Load bytes from the file."""
if hasattr(f, 'read') and \ if hasattr(f, 'read') and callable(cast(IO[bytes], f).read):
callable(cast(IO[bytes], f).read):
s = cast(IO[bytes], f).read() s = cast(IO[bytes], f).read()
else: else:
with open(cast(Text, f), 'rb') as readable: with open(cast(Text, f), 'rb') as readable:
...@@ -52,8 +38,11 @@ def _load_bytes(f): ...@@ -52,8 +38,11 @@ def _load_bytes(f):
return s return s
def _serialize(proto): def serialize_proto(proto):
if isinstance(proto, bytes): """Serialize the protocol buffer object."""
if proto is None:
return b''
elif isinstance(proto, bytes):
return proto return proto
elif hasattr(proto, 'SerializeToString') and \ elif hasattr(proto, 'SerializeToString') and \
callable(proto.SerializeToString): callable(proto.SerializeToString):
...@@ -61,52 +50,23 @@ def _serialize(proto): ...@@ -61,52 +50,23 @@ def _serialize(proto):
return result return result
else: else:
raise ValueError( raise ValueError(
'No SerializeToString method is detected. ' 'No <SerializeToString> method. Type is {}'
'neither proto is a str.\ntype is {}' .format(type(proto)))
.format(type(proto))
)
def _deserialize(s, proto): def deserialize_proto(s, proto):
"""Deserialize the protocol buffer object."""
if not isinstance(s, bytes): if not isinstance(s, bytes):
raise ValueError( raise ValueError(
'Parameter s must be bytes, ' 'Expected serialized bytes, got type: {}'.format(type(s)))
'but got type: {}'
.format(type(s))
)
if not (hasattr(proto, 'ParseFromString') and if not (hasattr(proto, 'ParseFromString') and
callable(proto.ParseFromString)): callable(proto.ParseFromString)):
raise ValueError( raise ValueError(
'No ParseFromString method is detected. ' 'No <ParseFromString> method. Type is {}'
'\ntype is {}'.format(type(proto)) .format(type(proto)))
)
decoded = cast(Optional[int], proto.ParseFromString(s)) decoded = cast(Optional[int], proto.ParseFromString(s))
if decoded is not None and decoded != len(s): if decoded is not None and decoded != len(s):
raise message.DecodeError( raise RuntimeError(
"Protobuf decoding consumed too few bytes: {} out of {}" 'Protobuf decoding consumed too few bytes: {} out of {}'
.format(decoded, len(s)) .format(decoded, len(s)))
)
return proto return proto
def save_model(proto, f):
s = _serialize(proto)
_save_bytes(s, f)
def load_model_from_string(s):
if ModelProto is None:
raise ImportError('ONNX is not installed.')
return _deserialize(s, ModelProto())
def load_model(f):
s = _load_bytes(f)
return load_model_from_string(s)
load = load_model
load_from_string = load_model_from_string
save = save_model
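The old ONNX-specific helpers are replaced by the generic `save_bytes` / `load_bytes` / `serialize_proto` / `deserialize_proto` quartet; the import added later in this commit suggests they live in `dragon.core.util.serialization`. A quick round-trip sketch under that assumption:

```python
from dragon.core.proto import dragon_pb2
from dragon.core.util import serialization

proto = dragon_pb2.TensorProto(name='bias', dims=[4])

# proto -> bytes -> file -> bytes -> proto
data = serialization.serialize_proto(proto)
serialization.save_bytes(data, '/tmp/bias.pb')
restored = serialization.deserialize_proto(
    serialization.load_bytes('/tmp/bias.pb'), dragon_pb2.TensorProto())
```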
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import unittest
import argparse
import dragon
from dragon.vm import torch as torch_vm
parser = argparse.ArgumentParser(add_help=False)
TEST_CUDA = dragon.cuda.is_available()
def new_tensor(data, constructor='EagerTensor', execution=None):
if execution is not None:
if execution == 'GRAPH_MODE':
return dragon.Tensor(
shape=data.shape,
dtype=str(data.dtype),
).set_value(data)
else:
return dragon.EagerTensor(data, copy=True)
if constructor == 'EagerTensor':
return dragon.EagerTensor(data, copy=True)
elif constructor == 'Tensor':
return dragon.Tensor(
shape=data.shape,
dtype=str(data.dtype),
).set_value(data)
elif constructor == 'torch.Tensor':
return torch_vm.tensor(data)
else:
raise ValueError('Unknown constructor: ' + constructor)
def run_tests(argv=None):
"""Run tests under the current ``__main__``."""
if argv is None:
args, remaining = parser.parse_known_args()
argv = [sys.argv[0]] + remaining
unittest.main(argv=argv)
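`new_tensor` builds the same data under three constructors so the operator tests can be parameterized over eager, symbolic, and torch tensors. A trivial usage sketch of the helper above:

```python
import numpy as np

data = np.arange(6, dtype='float32').reshape(2, 3)

eager = new_tensor(data)                           # dragon.EagerTensor
symbol = new_tensor(data, constructor='Tensor')    # dragon.Tensor via set_value
torch_t = new_tensor(data, constructor='torch.Tensor')
```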
...@@ -92,7 +92,7 @@ class DataTransformer(multiprocessing.Process): ...@@ -92,7 +92,7 @@ class DataTransformer(multiprocessing.Process):
Parameters Parameters
---------- ----------
example : Dict example : dict
The input example. The input example.
Returns Returns
......
...@@ -54,7 +54,7 @@ class DragonFrontend(object): ...@@ -54,7 +54,7 @@ class DragonFrontend(object):
constants=None, constants=None,
value_info=None, value_info=None,
opset_version=None, opset_version=None,
workspace=None, ws=None,
verbose=True, verbose=True,
): ):
input_names = [] if input_names is None else input_names input_names = [] if input_names is None else input_names
...@@ -79,12 +79,12 @@ class DragonFrontend(object): ...@@ -79,12 +79,12 @@ class DragonFrontend(object):
blob_aliases = {} blob_aliases = {}
for i, alias in enumerate(output_names): for i, alias in enumerate(output_names):
blob_aliases[graph_def.output[i]] = alias blob_aliases[graph_def.output[i]] = alias
workspace.SetTensorAlias(graph_def.output[i], alias) ws.RegisterAlias(graph_def.output[i], alias)
if graph_def.output[i] in value_info: if graph_def.output[i] in value_info:
value_info[alias] = value_info[graph_def.output[i]] value_info[alias] = value_info[graph_def.output[i]]
for i, alias in enumerate(input_names): for i, alias in enumerate(input_names):
blob_aliases[graph_def.input[i]] = alias blob_aliases[graph_def.input[i]] = alias
workspace.SetTensorAlias(graph_def.input[i], alias) ws.RegisterAlias(graph_def.input[i], alias)
if graph_def.input[i] in value_info: if graph_def.input[i] in value_info:
value_info[alias] = value_info[graph_def.input[i]] value_info[alias] = value_info[graph_def.input[i]]
...@@ -116,15 +116,14 @@ class DragonFrontend(object): ...@@ -116,15 +116,14 @@ class DragonFrontend(object):
for op in graph_def.op: for op in graph_def.op:
# Get the shape of inputs and outputs. # Get the shape of inputs and outputs.
for name in itertools.chain(op.input, op.output): for name in itertools.chain(op.input, op.output):
tensor_impl = workspace.GetTensor(name) impl = ws.GetTensor(name)
if tensor_impl is not None: if impl is not None:
shapes[name] = tensor_impl.dims shapes[name] = impl.dims
else: else:
shapes[name] = value_info[name][1] shapes[name] = value_info[name][1]
# Translate definition. # Translate definition.
nodes, const_tensors = cls._translate( nodes, const_tensors = cls._translate(op, opset_version, shapes, ws)
op, opset_version, shapes, workspace)
# Rewritten for names. # Rewritten for names.
for node in nodes: for node in nodes:
...@@ -135,8 +134,7 @@ class DragonFrontend(object): ...@@ -135,8 +134,7 @@ class DragonFrontend(object):
# Directly convert outputs as const tensors if necessary. # Directly convert outputs as const tensors if necessary.
if None in nodes: if None in nodes:
const_tensors = [helper.from_tensor(name, workspace) const_tensors = [helper.from_tensor(name, ws) for name in op.output]
for name in op.output]
else: else:
onnx_graph.node.extend(nodes) onnx_graph.node.extend(nodes)
......
...@@ -472,15 +472,14 @@ def squeeze_exporter(op_def, shape_dict, ws): ...@@ -472,15 +472,14 @@ def squeeze_exporter(op_def, shape_dict, ws):
@exporter.register('Tile') @exporter.register('Tile')
def tile_exporter(op_def, shape_dict, ws): def tile_exporter(op_def, shape_dict, ws):
node, const_tensors = exporter.translate(**locals()) node, const_tensors = exporter.translate(**locals())
repeats = [] repeats = []
for arg in op_def.arg: for arg in op_def.arg:
if arg.name == 'multiples': if arg.name == 'repeats':
repeats = [e for e in arg.ints] repeats = [e for e in arg.ints]
elif arg.name == 'multiples_desc': elif arg.name == 'repeats_desc':
repeats = helper.fetch_argument(op_def, arg, ws) repeats = helper.fetch_argument(op_def, arg, ws)
elif arg.name == 'multiples_descs': elif arg.name == 'repeats_descs':
repeats = helper.fetch_arguments(op_def, arg, ws) repeats = helper.fetch_arguments(op_def, arg, ws)
repeats = helper.from_array( repeats = helper.from_array(
......
...@@ -21,9 +21,9 @@ import numpy ...@@ -21,9 +21,9 @@ import numpy
from dragon.core.autograph import function_lib from dragon.core.autograph import function_lib
from dragon.core.framework import workspace from dragon.core.framework import workspace
from dragon.core.proto import dragon_pb2 from dragon.core.proto import dragon_pb2
from dragon.core.util import serialization
from dragon.vm.onnx.frontend import graph_def_to_onnx_model from dragon.vm.onnx.frontend import graph_def_to_onnx_model
from dragon.vm.onnx.helper import mapping from dragon.vm.onnx.helper import mapping
from dragon.vm.onnx.serialization import save_model
def export_from_graph( def export_from_graph(
...@@ -40,7 +40,7 @@ def export_from_graph( ...@@ -40,7 +40,7 @@ def export_from_graph(
enable_onnx_checker=True, enable_onnx_checker=True,
): ):
"""Export an onnx model from the graph.""" """Export an onnx model from the graph."""
save_model(graph_def_to_onnx_model( model = graph_def_to_onnx_model(
graph_def=graph_def, graph_def=graph_def,
input_names=input_names, input_names=input_names,
output_names=output_names, output_names=output_names,
...@@ -50,7 +50,8 @@ def export_from_graph( ...@@ -50,7 +50,8 @@ def export_from_graph(
opset_version=opset_version, opset_version=opset_version,
workspace=workspace, workspace=workspace,
verbose=verbose, verbose=verbose,
enable_onnx_checker=enable_onnx_checker), f) enable_onnx_checker=enable_onnx_checker)
serialization.save_bytes(serialization.serialize_proto(model), f)
def import_to_function(model_path, explicit_inputs=False): def import_to_function(model_path, explicit_inputs=False):
......
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef DRAGON_UTILS_CAFFEMODEL_H_
#define DRAGON_UTILS_CAFFEMODEL_H_
#include "dragon/core/workspace.h"
#ifdef BUILD_RUNTIME
#include "dragon/proto_lite/caffemodel.pb.h"
#else
#include "dragon/proto/caffemodel.pb.h"
#endif
namespace dragon {
inline void LoadCaffeModel(const string& file, Workspace* ws) {
NetParameter net_param;
ReadProtoFromBinaryFile(file.c_str(), &net_param);
LOG(INFO) << "Restore From Model @: " << file << "......";
LOG(INFO) << "Model Format: CaffeModel";
for (int i = 0; i < net_param.layer_size(); i++) {
const auto& layer = net_param.layer(i);
const auto& layer_name = layer.name();
auto prefix = layer_name + "/param:";
for (int j = 0; j < layer.blobs_size(); j++) {
auto tensor_name = prefix + str::to(j);
if (!ws->HasTensor(tensor_name)) {
LOG(WARNING) << "Tensor(" << tensor_name << ") "
<< "does not exist in any Graphs, skip.";
} else {
auto blob = layer.blobs(j);
vec64_t tensor_shape;
for (auto dim : blob.shape().dim())
tensor_shape.push_back(dim);
auto* tensor = ws->GetTensor(tensor_name);
std::stringstream dim_str;
if (tensor_shape.size() > 0) {
tensor->Reshape(tensor_shape);
CHECK_EQ(tensor->count(), blob.data_size())
<< "\nTensor(" << tensor_name << ") "
<< "failed to load, except size: " << tensor->count()
<< ", loaded: " << blob.data_size();
dim_str << tensor->DimString();
} else {
tensor->Reshape({blob.data_size()});
dim_str << "(missing)";
}
auto* x = tensor->mutable_data<float, CPUContext>();
for (int xi = 0; xi < blob.data_size(); ++xi) {
x[xi] = blob.data(xi);
}
LOG(INFO) << "Tensor(" << tensor_name << ") "
<< "loaded, shape: " << dim_str.str()
<< ", size: " << blob.data_size();
}
}
}
}
inline void SavaCaffeModel(const string& file, const vector<Tensor*>& tensors) {
int j = -1;
NetParameter net;
Map<string, int> layer_hash;
for (int i = 0; i < tensors.size(); i++) {
if (tensors[i]->count() <= 0) continue;
auto splits = str::split(tensors[i]->name(), "/param:");
if (layer_hash.count(splits[0]) == 0) {
layer_hash[splits[0]] = ++j;
auto* layer = net.add_layer();
layer->set_name(splits[0]);
}
auto* blob = net.mutable_layer(j)->add_blobs();
for (auto dim : tensors[i]->dims())
blob->mutable_shape()->add_dim(dim);
if (XIsType((*tensors[i]), float)) {
auto* x = tensors[i]->data<float, CPUContext>();
for (int xi = 0; xi < tensors[i]->count(); ++xi)
blob->mutable_data()->Add(x[xi]);
} else if (XIsType((*tensors[i]), float16)) {
auto* x = tensors[i]->data<float16, CPUContext>();
for (int xi = 0; xi < tensors[i]->count(); ++xi)
blob->mutable_data()->Add(cast::to<float>(x[xi]));
}
}
WriteProtoToBinaryFile(net, file.c_str());
LOG(INFO) << "Save the model @: " << file << "......";
LOG(INFO) << "Model format: Caffe";
}
} // namespace dragon
#endif // DRAGON_UTILS_CAFFEMODEL_H_
...@@ -21,112 +21,108 @@ namespace dragon { ...@@ -21,112 +21,108 @@ namespace dragon {
template <typename T, class Context> template <typename T, class Context>
class Filler { class Filler {
public: public:
explicit Filler(const TensorFillerProto& proto) : proto_(proto) {} explicit Filler(const FillerInfo& info) : info_(info) {}
virtual ~Filler() {} virtual ~Filler() {}
virtual void Fill(Tensor* X, Context* ctx) = 0; virtual void Fill(Tensor* X, Context* ctx) = 0;
const TensorFillerProto& proto() { const FillerInfo& info() {
return proto_; return info_;
} }
protected: protected:
TensorFillerProto proto_; FillerInfo info_;
}; };
template <typename T, class Context> template <typename T, class Context>
class ConstantFiller final : public Filler<T, Context> { class ConstantFiller final : public Filler<T, Context> {
public: public:
explicit ConstantFiller(const TensorFillerProto& proto) explicit ConstantFiller(const FillerInfo& info) : Filler<T, Context>(info) {}
: Filler<T, Context>(proto) {}
void Fill(Tensor* X, Context* ctx) override { void Fill(Tensor* X, Context* ctx) override {
math::Set( math::Set(
X->count(), X->count(),
cast::to<T>(proto().value()), cast::to<T>(info().value()),
X->mutable_data<T, Context>(), X->mutable_data<T, Context>(),
ctx); ctx);
} }
protected: protected:
using Filler<T, Context>::proto; using Filler<T, Context>::info;
}; };
template <typename T, class Context> template <typename T, class Context>
class NormalFiller final : public Filler<T, Context> { class NormalFiller final : public Filler<T, Context> {
public: public:
explicit NormalFiller(const TensorFillerProto& proto) explicit NormalFiller(const FillerInfo& info) : Filler<T, Context>(info) {}
: Filler<T, Context>(proto) {}
void Fill(Tensor* X, Context* ctx) override { void Fill(Tensor* X, Context* ctx) override {
math::RandomNormal( math::RandomNormal(
X->count(), X->count(),
proto().mean(), info().mean(),
proto().std(), info().std(),
X->mutable_data<T, Context>(), X->mutable_data<T, Context>(),
ctx); ctx);
} }
protected: protected:
using Filler<T, Context>::proto; using Filler<T, Context>::info;
}; };
template <typename T, class Context> template <typename T, class Context>
class TruncatedNormalFiller final : public Filler<T, Context> { class TruncatedNormalFiller final : public Filler<T, Context> {
public: public:
explicit TruncatedNormalFiller(const TensorFillerProto& proto) explicit TruncatedNormalFiller(const FillerInfo& info)
: Filler<T, Context>(proto) {} : Filler<T, Context>(info) {}
void Fill(Tensor* X, Context* /* ctx */) override { void Fill(Tensor* X, Context* /* ctx */) override {
CPUContext ctx; // Enforce the cpu implementation CPUContext ctx; // Enforce the cpu implementation
math::TruncatedNormal( math::TruncatedNormal(
X->count(), X->count(),
proto().mean(), info().mean(),
proto().std(), info().std(),
proto().low(), info().low(),
proto().high(), info().high(),
X->mutable_data<T, CPUContext>(), X->mutable_data<T, CPUContext>(),
&ctx); &ctx);
} }
protected: protected:
using Filler<T, Context>::proto; using Filler<T, Context>::info;
}; };
template <typename T, class Context> template <typename T, class Context>
class UniformFiller final : public Filler<T, Context> { class UniformFiller final : public Filler<T, Context> {
public: public:
explicit UniformFiller(const TensorFillerProto& proto) explicit UniformFiller(const FillerInfo& info) : Filler<T, Context>(info) {}
: Filler<T, Context>(proto) {}
void Fill(Tensor* X, Context* ctx) override { void Fill(Tensor* X, Context* ctx) override {
math::RandomUniform( math::RandomUniform(
X->count(), X->count(),
proto().low(), info().low(),
proto().high(), info().high(),
X->mutable_data<T, Context>(), X->mutable_data<T, Context>(),
ctx); ctx);
} }
protected: protected:
using Filler<T, Context>::proto; using Filler<T, Context>::info;
}; };
template <typename T, class Context> template <typename T, class Context>
class XavierFiller final : public Filler<T, Context> { class GlorotUniformFiller final : public Filler<T, Context> {
public: public:
explicit XavierFiller(const TensorFillerProto& proto) explicit GlorotUniformFiller(const FillerInfo& info)
: Filler<T, Context>(proto) {} : Filler<T, Context>(info) {}
void Fill(Tensor* X, Context* ctx) override { void Fill(Tensor* X, Context* ctx) override {
auto fan_in = X->count() / X->dim(0); auto fan_in = X->count() / X->dim(0);
auto fan_out = X->count() / X->dim(1); auto fan_out = X->count() / X->dim(1);
float n = (float)fan_in, scale = 3.f; float n = (float)fan_in, scale = 3.f;
if (proto().has_scale()) scale = proto().scale(); if (info().has_scale()) scale = info().scale();
if (proto().variance_norm() == TensorFillerProto_VarianceNorm_FAN_AVG) { if (info().variance_norm() == FillerInfo_VarianceNorm_FAN_AVG) {
n = (fan_in + fan_out) / 2.f; n = (fan_in + fan_out) / 2.f;
} else if ( } else if (info().variance_norm() == FillerInfo_VarianceNorm_FAN_OUT) {
proto().variance_norm() == TensorFillerProto_VarianceNorm_FAN_OUT) {
n = (float)fan_out; n = (float)fan_out;
} }
float limit = std::sqrt(scale / n); float limit = std::sqrt(scale / n);
...@@ -135,24 +131,23 @@ class XavierFiller final : public Filler<T, Context> { ...@@ -135,24 +131,23 @@ class XavierFiller final : public Filler<T, Context> {
} }
protected: protected:
using Filler<T, Context>::proto; using Filler<T, Context>::info;
}; };
template <typename T, class Context> template <typename T, class Context>
class MSRAFiller final : public Filler<T, Context> { class GlorotNormalFiller final : public Filler<T, Context> {
public: public:
explicit MSRAFiller(const TensorFillerProto& proto) explicit GlorotNormalFiller(const FillerInfo& info)
: Filler<T, Context>(proto) {} : Filler<T, Context>(info) {}
void Fill(Tensor* X, Context* ctx) override { void Fill(Tensor* X, Context* ctx) override {
auto fan_in = X->count() / X->dim(0); auto fan_in = X->count() / X->dim(0);
auto fan_out = X->count() / X->dim(1); auto fan_out = X->count() / X->dim(1);
float n = (float)fan_in, scale = 2.f; float n = (float)fan_in, scale = 2.f;
if (proto().has_scale()) scale = proto().scale(); if (info().has_scale()) scale = info().scale();
if (proto().variance_norm() == TensorFillerProto_VarianceNorm_FAN_AVG) { if (info().variance_norm() == FillerInfo_VarianceNorm_FAN_AVG) {
n = (fan_in + fan_out) / 2.f; n = (fan_in + fan_out) / 2.f;
} else if ( } else if (info().variance_norm() == FillerInfo_VarianceNorm_FAN_OUT) {
proto().variance_norm() == TensorFillerProto_VarianceNorm_FAN_OUT) {
n = (float)fan_out; n = (float)fan_out;
} }
float std = std::sqrt(scale / n); float std = std::sqrt(scale / n);
...@@ -161,26 +156,26 @@ class MSRAFiller final : public Filler<T, Context> { ...@@ -161,26 +156,26 @@ class MSRAFiller final : public Filler<T, Context> {
} }
protected: protected:
using Filler<T, Context>::proto; using Filler<T, Context>::info;
}; };
template <typename T, class Context> template <typename T, class Context>
Filler<T, Context>* CreateFiller(const TensorFillerProto& proto) { Filler<T, Context>* CreateFiller(const FillerInfo& info) {
const string& type = proto.type(); const string& type = info.type();
if (type == "constant") { if (type == "constant") {
return new ConstantFiller<T, Context>(proto); return new ConstantFiller<T, Context>(info);
} else if (type == "uniform") { } else if (type == "uniform") {
return new UniformFiller<T, Context>(proto); return new UniformFiller<T, Context>(info);
} else if (type == "normal") { } else if (type == "normal") {
return new NormalFiller<T, Context>(proto); return new NormalFiller<T, Context>(info);
} else if (type == "truncated_normal") { } else if (type == "truncated_normal") {
return new TruncatedNormalFiller<T, Context>(proto); return new TruncatedNormalFiller<T, Context>(info);
} else if (type == "xavier" || type == "glorot_uniform") { } else if (type == "glorot_uniform" || type == "xavier") {
return new XavierFiller<T, Context>(proto); return new GlorotUniformFiller<T, Context>(info);
} else if (type == "msra" || type == "glorot_normal") { } else if (type == "glorot_normal" || type == "msra") {
return new MSRAFiller<T, Context>(proto); return new GlorotNormalFiller<T, Context>(info);
} }
return new ConstantFiller<T, Context>(proto); return new ConstantFiller<T, Context>(info);
} }
} // namespace dragon } // namespace dragon
......
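The Glorot fillers keep the caffe-style `scale` convention (3 for uniform, 2 for normal) and choose the fan term from `variance_norm`. A small numpy sketch of the same bound computation, mirroring `GlorotUniformFiller::Fill` for a weight of shape `(64, 32, 3, 3)`:

```python
import numpy as np

shape = (64, 32, 3, 3)
count = int(np.prod(shape))
fan_in, fan_out = count // shape[0], count // shape[1]   # X.count()/X.dim(0), /X.dim(1)

scale, variance_norm = 3., 'FAN_IN'   # FillerInfo defaults
n = {'FAN_IN': fan_in,
     'FAN_OUT': fan_out,
     'FAN_AVG': (fan_in + fan_out) / 2.}[variance_norm]

limit = np.sqrt(scale / n)
weight = np.random.uniform(-limit, limit, size=shape).astype('float32')
```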
...@@ -81,7 +81,8 @@ def constant(value, dtype=None, shape=None, name='Const'): ...@@ -81,7 +81,8 @@ def constant(value, dtype=None, shape=None, name='Const'):
return EagerTensor(value, name=name + ':0') return EagerTensor(value, name=name + ':0')
else: else:
return TensorRef( return TensorRef(
name=workspace.get_dummy_name(name, ':0', 'Tensor'), name=workspace.get_workspace().unique_name(
name, ':0', 'dragon.Tensor'),
shape=list(value.shape), shape=list(value.shape),
dtype=str(value.dtype), dtype=str(value.dtype),
).set_value(value) ).set_value(value)
...@@ -18,7 +18,6 @@ from __future__ import division ...@@ -18,7 +18,6 @@ from __future__ import division
from __future__ import print_function from __future__ import print_function
from dragon.core.util import six from dragon.core.util import six
from dragon.core.framework import types
from dragon.core.framework import workspace from dragon.core.framework import workspace
from dragon.vm.tensorflow.core.framework import tensor_shape from dragon.vm.tensorflow.core.framework import tensor_shape
from dragon.vm.tensorflow.core.ops import array_ops from dragon.vm.tensorflow.core.ops import array_ops
...@@ -43,7 +42,7 @@ def Input( ...@@ -43,7 +42,7 @@ def Input(
# Create a placeholder with determined ``batch_size`` # Create a placeholder with determined ``batch_size``
x = tf.keras.Input(shape=(8,), batch_size=8, dtype='float32') x = tf.keras.Input(shape=(8,), batch_size=8, dtype='float32')
# Create a placeholder aliasing an existing symbolic tensor # Create a placeholder aliasing an existing tensor
x = dragon.Tensor('x', shape=(8,), dtype='float32').variable() x = dragon.Tensor('x', shape=(8,), dtype='float32').variable()
xx = tf.keras.Input(tensor=x) xx = tf.keras.Input(tensor=x)
``` ```
...@@ -59,7 +58,7 @@ def Input( ...@@ -59,7 +58,7 @@ def Input(
dtype : str, optional dtype : str, optional
The optional data type. The optional data type.
tensor : dragon.Tensor, optional tensor : dragon.Tensor, optional
The existing symbolic tensor aliased to the placeholder. The existing tensor aliased to the input.
Returns Returns
------- -------
...@@ -99,16 +98,9 @@ def Input( ...@@ -99,16 +98,9 @@ def Input(
elif isinstance(shape, six.integer_types): elif isinstance(shape, six.integer_types):
shape = (shape,) shape = (shape,)
placeholder = \ placeholder = array_ops.placeholder(
array_ops.placeholder( dtype=dtype, shape=shape, name=name if name else 'input')
dtype=dtype,
shape=shape,
name=name if name else 'input',
)
if tensor is not None: if tensor is not None:
if not types.is_symbolic_tensor(tensor): workspace.get_workspace().register_alias(tensor, placeholder.id)
raise ValueError('Accepted a dragon.Tensor only.')
workspace.set_tensor_alias(tensor, placeholder.name)
return placeholder return placeholder
...@@ -127,6 +127,7 @@ class Optimizer(optimizer_v1.Optimizer): ...@@ -127,6 +127,7 @@ class Optimizer(optimizer_v1.Optimizer):
def _create_hypers(self): def _create_hypers(self):
if self._hypers_created: if self._hypers_created:
return return
current_ws = workspace.get_workspace()
for name, value in sorted(self._hyper.items()): for name, value in sorted(self._hyper.items()):
if types.is_tensor(value) or callable(value): if types.is_tensor(value) or callable(value):
pass pass
...@@ -141,7 +142,7 @@ class Optimizer(optimizer_v1.Optimizer): ...@@ -141,7 +142,7 @@ class Optimizer(optimizer_v1.Optimizer):
hyper = self._hyper[name] hyper = self._hyper[name]
alias = self._alias.get(name, None) alias = self._alias.get(name, None)
if alias is not None: if alias is not None:
workspace.set_tensor_alias(hyper, alias) current_ws.register_alias(hyper, alias)
self._hypers_created = True self._hypers_created = True
@staticmethod @staticmethod
...@@ -173,10 +174,10 @@ class Optimizer(optimizer_v1.Optimizer): ...@@ -173,10 +174,10 @@ class Optimizer(optimizer_v1.Optimizer):
def _init_set_name(self, name, zero_based=True): def _init_set_name(self, name, zero_based=True):
"""Set a name for sharing weights.""" """Set a name for sharing weights."""
if not name: if not name:
self._name = workspace.get_dummy_name( self._name = workspace.get_workspace().unique_name(
basename=generic_utils.to_snake_case( name=generic_utils.to_snake_case(
self.__class__.__name__), self.__class__.__name__),
domain='Object', namespace='Object',
zero_based=zero_based, zero_based=zero_based,
) )
else: else:
...@@ -188,7 +189,7 @@ class Optimizer(optimizer_v1.Optimizer): ...@@ -188,7 +189,7 @@ class Optimizer(optimizer_v1.Optimizer):
self._hyper[name] = value self._hyper[name] = value
else: else:
if types.is_tensor(self._hyper[name]): if types.is_tensor(self._hyper[name]):
workspace.feed_tensor( workspace.get_workspace().feed_tensor(
self._hyper[name].id, self._hyper[name].id,
value, value,
dtype='float32', dtype='float32',
......
...@@ -147,9 +147,9 @@ class Module(object): ...@@ -147,9 +147,9 @@ class Module(object):
def _init_set_name(self, name=None, zero_based=True): def _init_set_name(self, name=None, zero_based=True):
if name is None: if name is None:
self._name = workspace.get_dummy_name( self._name = workspace.get_workspace().unique_name(
basename=camel_to_snake(self.__class__.__name__), name=camel_to_snake(self.__class__.__name__),
domain='Object', namespace='Object',
zero_based=zero_based, zero_based=zero_based,
) )
else: else:
......
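`get_dummy_name(basename=..., domain=...)` becomes `unique_name(name=..., namespace=...)` on the workspace instance, which is what the module, layer, and optimizer name scopes above now call. A minimal sketch of the renamed call against the default workspace:

```python
from dragon.core.framework import workspace

current_ws = workspace.get_workspace()

# Auto-numbered object names, as used by Module/Layer/Optimizer above.
name1 = current_ws.unique_name(name='dense', namespace='Object', zero_based=True)
name2 = current_ws.unique_name(name='dense', namespace='Object', zero_based=True)
assert name1 != name2   # e.g. 'dense' then 'dense_1'; the exact scheme is not shown here
```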
...@@ -478,11 +478,9 @@ def placeholder(dtype=None, shape=None, name=None): ...@@ -478,11 +478,9 @@ def placeholder(dtype=None, shape=None, name=None):
""" """
# Construct a tensor from the explicit name # Construct a tensor from the explicit name
return TensorRef( return TensorRef(
workspace.get_dummy_name( workspace.get_workspace().unique_name(
context.get_name_scope() + name context.get_name_scope() + name if name else 'Placeholder',
if name else 'Placeholder', suffix=':0', namespace='Tensor'),
suffix=':0', domain='Tensor',
),
dtype=str(dtype) if dtype else dtype, dtype=str(dtype) if dtype else dtype,
shape=shape, shape=shape,
).placeholder() ).placeholder()
...@@ -528,8 +526,8 @@ def shape(input, name=None): ...@@ -528,8 +526,8 @@ def shape(input, name=None):
```python ```python
x = tf.ones((2, 3)) x = tf.ones((2, 3))
print(x.shape) # Return a sequence print(x.shape) # Return a sequence
print(tf.shape(x)) # Return a tensor print(tf.shape(x)) # Return a tensor
``` ```
Parameters Parameters
...@@ -686,11 +684,11 @@ def squeeze(input, axis=None, name=None): ...@@ -686,11 +684,11 @@ def squeeze(input, axis=None, name=None):
# Remove all matched dimensions if ``axis`` is None # Remove all matched dimensions if ``axis`` is None
# Otherwise, only the specified axes will be removed # Otherwise, only the specified axes will be removed
print(tf.squeeze(x).shape) # (1, 2, 2, 1) -> (2, 2) print(tf.squeeze(x).shape) # (1, 2, 2, 1) -> (2, 2)
print(tf.squeeze(x, axis=0).shape) # (1, 2, 2, 1) -> (2, 2, 1) print(tf.squeeze(x, axis=0).shape) # (1, 2, 2, 1) -> (2, 2, 1)
# A negative axis is the last-k axis # A negative axis is the last-k axis
print(tf.squeeze(x, axis=3).shape) # (1, 2, 2, 1) -> (1, 2, 2) print(tf.squeeze(x, axis=3).shape) # (1, 2, 2, 1) -> (1, 2, 2)
print(tf.squeeze(x, axis=-1).shape) # Equivalent print(tf.squeeze(x, axis=-1).shape) # Equivalent
# Also, ``axis`` could be a sequence of integers # Also, ``axis`` could be a sequence of integers
...@@ -716,7 +714,7 @@ def squeeze(input, axis=None, name=None): ...@@ -716,7 +714,7 @@ def squeeze(input, axis=None, name=None):
def tile(input, multiples, name=None): def tile(input, multiples, name=None):
return array_ops.tile(input, multiples=multiples, name=name) return array_ops.tile(input, repeats=multiples, name=name)
def transpose(a, perm=None, name=None): def transpose(a, perm=None, name=None):
......
...@@ -111,10 +111,10 @@ class LayerList(module.Module): ...@@ -111,10 +111,10 @@ class LayerList(module.Module):
return len(self._layers) return len(self._layers)
def __repr__(self): def __repr__(self):
tmpstr = 'LayerList' + '(\n' tmp_str = 'LayerList' + '(\n'
for idx, layer in enumerate(self._layers): for idx, layer in enumerate(self._layers):
modstr = layer.__repr__() mod_str = layer.__repr__()
modstr = self._addindent(modstr, 2) mod_str = self._add_indent(mod_str, 2)
tmpstr = tmpstr + ' (' + str(idx) + '): ' + modstr + '\n' tmp_str = tmp_str + ' (' + str(idx) + '): ' + mod_str + '\n'
tmpstr = tmpstr + ')' tmp_str = tmp_str + ')'
return tmpstr return tmp_str
...@@ -245,12 +245,12 @@ class Module(object): ...@@ -245,12 +245,12 @@ class Module(object):
) )
@staticmethod @staticmethod
def _addindent(s_, numSpaces): def _add_indent(s_, num_spaces):
s = s_.split('\n') s = s_.split('\n')
if len(s) == 1: if len(s) == 1:
return s_ return s_
first = s.pop(0) first = s.pop(0)
s = [(numSpaces * ' ') + line for line in s] s = [(num_spaces * ' ') + line for line in s]
s = '\n'.join(s) s = '\n'.join(s)
s = first + '\n' + s s = first + '\n' + s
return s return s
...@@ -297,9 +297,9 @@ class Module(object): ...@@ -297,9 +297,9 @@ class Module(object):
def _set_name(self, name=None, zero_based=True): def _set_name(self, name=None, zero_based=True):
"""Set the module name.""" """Set the module name."""
if name is None: if name is None:
self._name = workspace.get_dummy_name( self._name = workspace.get_workspace().unique_name(
basename=self.__class__.__name__.lower(), name=self.__class__.__name__.lower(),
domain='Object', namespace='Object',
zero_based=zero_based, zero_based=zero_based,
) )
else: else:
......
...@@ -288,7 +288,6 @@ def _load_weights_from_hdf5_group(f, modules, skip=False): ...@@ -288,7 +288,6 @@ def _load_weights_from_hdf5_group(f, modules, skip=False):
matched_info = [] matched_info = []
module_dict = {m.name: m for m in modules} module_dict = {m.name: m for m in modules}
module_names = [n.decode('utf8') for n in f.attrs["layer_names"]] module_names = [n.decode('utf8') for n in f.attrs["layer_names"]]
for idx, name in enumerate(module_names): for idx, name in enumerate(module_names):
if name not in module_dict: if name not in module_dict:
if not skip: if not skip:
...@@ -300,7 +299,6 @@ def _load_weights_from_hdf5_group(f, modules, skip=False): ...@@ -300,7 +299,6 @@ def _load_weights_from_hdf5_group(f, modules, skip=False):
value_names = [n.decode('utf8') for n in g.attrs['weight_names']] value_names = [n.decode('utf8') for n in g.attrs['weight_names']]
value_dict = dict((name, g[name]) for name in value_names) value_dict = dict((name, g[name]) for name in value_names)
matched_info += _assign_weights_from_dict(weight_dict, value_dict, skip=True) matched_info += _assign_weights_from_dict(weight_dict, value_dict, skip=True)
return matched_info return matched_info
...@@ -327,6 +325,7 @@ def _save_weights_to_hdf5_group(f, modules): ...@@ -327,6 +325,7 @@ def _save_weights_to_hdf5_group(f, modules):
def _set_value(input, value): def _set_value(input, value):
"""Set the copied value to input.""" """Set the copied value to input."""
if hasattr(input, 'id'): if hasattr(input, 'id'):
workspace.feed_tensor(input.id, value, enforce_cpu=True) workspace.get_workspace().feed_tensor(
input.id, value, enforce_cpu=True)
else: else:
raise ValueError('Input is not a legal tensor.') raise ValueError('Input is not a legal tensor.')
...@@ -84,9 +84,9 @@ class LayerMetaclass(object): ...@@ -84,9 +84,9 @@ class LayerMetaclass(object):
def _init_set_name(self, name=None, zero_based=True): def _init_set_name(self, name=None, zero_based=True):
"""Set the model name when necessary.""" """Set the model name when necessary."""
if name is None: if name is None:
self._name = workspace.get_dummy_name( self._name = workspace.get_workspace().unique_name(
basename=self.__class__.__name__.lower(), name=self.__class__.__name__.lower(),
domain='Object', namespace='Object',
zero_based=zero_based, zero_based=zero_based,
) )
else: else:
...@@ -378,21 +378,21 @@ class LayerList(Layer): ...@@ -378,21 +378,21 @@ class LayerList(Layer):
return len(self._all_layers) return len(self._all_layers)
def __repr__(self): def __repr__(self):
tmpstr = 'LayerList' + '(\n' tmp_str = 'LayerList' + '(\n'
for idx, layer in enumerate(self._all_layers): for idx, layer in enumerate(self._all_layers):
modstr = layer.__repr__() mod_str = layer.__repr__()
modstr = _addindent(modstr, 2) mod_str = _add_indent(mod_str, 2)
tmpstr = tmpstr + ' (' + str(idx) + '): ' + modstr + '\n' tmp_str = tmp_str + ' (' + str(idx) + '): ' + mod_str + '\n'
tmpstr = tmpstr + ')' tmp_str = tmp_str + ')'
return tmpstr return tmp_str
def _addindent(s_, numSpaces): def _add_indent(s_, num_spaces):
s = s_.split('\n') s = s_.split('\n')
if len(s) == 1: if len(s) == 1:
return s_ return s_
first = s.pop(0) first = s.pop(0)
s = [(numSpaces * ' ') + line for line in s] s = [(num_spaces * ' ') + line for line in s]
s = '\n'.join(s) s = '\n'.join(s)
s = first + '\n' + s s = first + '\n' + s
return s return s
...@@ -123,17 +123,14 @@ class Binding(object): ...@@ -123,17 +123,14 @@ class Binding(object):
if self._device_tensor is None: if self._device_tensor is None:
spec = device_spec.DeviceSpec('cuda', self.device_id) spec = device_spec.DeviceSpec('cuda', self.device_id)
self._device_opt = spec.to_proto(serialized=True) self._device_opt = spec.to_proto(serialized=True)
ws = workspace.get_workspace() current_ws = workspace.get_workspace()
ref = EagerTensor(device=spec) # Hack the constructor. tensor = EagerTensor(device=spec) # Hack the constructor.
ref.__gc__ = ws.collectors.TENSOR tensor._gc = current_ws.collectors.TENSOR
ref._id = ref.__gc__.alloc('${DLPACK}') tensor._impl = current_ws.create_tensor(
ref._impl = ws.CreateTensor(ref._id).FromPointer( tensor._gc.alloc('${DLPACK}')).FromPointer(
self._shape, self._shape, self._dtype,
self._dtype, self._device_opt, self.device_buffer.ptr)
self._device_opt, self._device_tensor = tensor
self.device_buffer.ptr,
)
self._device_tensor = ref
return self._device_tensor._impl.ToDLPack(self._device_opt, True) return self._device_tensor._impl.ToDLPack(self._device_opt, True)
@property @property
...@@ -187,17 +184,14 @@ class Binding(object): ...@@ -187,17 +184,14 @@ class Binding(object):
if self._host_tensor is None: if self._host_tensor is None:
spec = device_spec.DeviceSpec('cpu') spec = device_spec.DeviceSpec('cpu')
self._host_opt = spec.to_proto(serialized=True) self._host_opt = spec.to_proto(serialized=True)
ws = workspace.get_workspace() current_ws = workspace.get_workspace()
ref = EagerTensor(device=spec) # Hack the constructor. tensor = EagerTensor(device=spec) # Hack the constructor.
ref.__gc__ = ws.collectors.TENSOR tensor._gc = current_ws.collectors.TENSOR
ref._id = ref.__gc__.alloc('${DLPACK}') tensor._impl = current_ws.create_tensor(
ref._impl = ws.CreateTensor(ref._id).FromPointer( tensor._gc.alloc('${DLPACK}')).FromPointer(
self._shape, self._shape, self._dtype,
self._dtype, self._host_opt, self.host_buffer.ctypes.data)
self._host_opt, self._host_tensor = tensor
self.host_buffer.ctypes.data,
)
self._host_tensor = ref
return self._host_tensor._impl.ToDLPack(self._host_opt, True) return self._host_tensor._impl.ToDLPack(self._host_opt, True)
@property @property
......
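Both bindings ultimately hand out a DLPack capsule built from an `EagerTensor` whose storage points at an externally owned buffer. The public converter pair exercised in the tests below gives the simplest picture of the exchange:

```python
import numpy as np
import dragon

data = np.array([0., 1., 2.], 'float32')
with dragon.device('cpu'), dragon.eager_scope():
    x = dragon.EagerTensor(data, copy=True)
    capsule = dragon.dlpack.to_dlpack(x)      # export without copying
    y = dragon.dlpack.from_dlpack(capsule)    # re-import as an EagerTensor
    assert y.shape == list(data.shape) and y.dtype == str(data.dtype)
```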
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import unittest
import dragon
import numpy as np
from dragon.core.eager.context import context as execution_context
from dragon.core.testing.unittest.common_utils import run_tests
from dragon.core.testing.unittest.common_utils import TEST_CUDA
class TestTensor(unittest.TestCase):
"""Test the tensor class."""
def test_properties(self):
a, b = dragon.Tensor(), dragon.EagerTensor(0)
self.assertEqual(dragon.Tensor().ndim, 0)
self.assertEqual(dragon.Tensor(shape=(2,)).ndim, 1)
self.assertEqual(dragon.Tensor().shape, None)
self.assertEqual(dragon.Tensor(shape=(2,)).shape, [2])
self.assertEqual(dragon.Tensor().size, 0)
self.assertEqual(dragon.Tensor(shape=(2, None)).size, math.inf)
self.assertEqual(dragon.Tensor(shape=(2,)).size, 2)
self.assertEqual(dragon.Tensor().dtype, None)
self.assertEqual(dragon.Tensor(dtype='float32').dtype, 'float32')
self.assertEqual(dragon.EagerTensor(shape=(2,)).ndim, 1)
self.assertEqual(dragon.EagerTensor(shape=(2,)).shape, [2])
self.assertEqual(dragon.EagerTensor(shape=(2,)).size, 2)
self.assertEqual(dragon.EagerTensor(shape=(2,), dtype='float32').dtype, 'float32')
self.assertEqual(dragon.EagerTensor().device, dragon.EagerTensor().device)
self.assertNotEqual(a.__hash__(), b.__hash__())
self.assertNotEqual(a.__repr__(), b.__repr__())
self.assertNotEqual(b.__repr__(), dragon.EagerTensor([2]).__repr__())
self.assertEqual(int(a.variable().placeholder().set_value(1)), 1)
self.assertEqual(float(dragon.Tensor.convert_to(1)), 1.)
self.assertEqual(int(b.set_value(1)), 1)
self.assertEqual(float(b), 1.)
self.assertEqual(int(b.get_value()), 1)
try:
a.shape = 1
except TypeError:
pass
try:
b.shape = (2, 3)
except RuntimeError:
pass
try:
b.dtype = 'float64'
except RuntimeError:
pass
try:
b = dragon.EagerTensor(0, 0)
except ValueError:
pass
with dragon.name_scope('a'):
a.name = 'a'
self.assertEqual(a.name, 'a/a')
with dragon.name_scope(''):
b.name = 'b'
self.assertEqual(b.name, 'b')
def test_dlpack_converter(self):
data = np.array([0., 1., 2.], 'float32')
with dragon.device('cpu'), dragon.eager_scope():
x = dragon.EagerTensor(data, copy=True)
x_to_dlpack = dragon.dlpack.to_dlpack(x)
x_from_dlpack = dragon.dlpack.from_dlpack(x_to_dlpack)
self.assertEqual(x_from_dlpack.shape, list(data.shape))
self.assertEqual(x_from_dlpack.dtype, str(data.dtype))
self.assertLessEqual(np.abs(x_from_dlpack.numpy() - data).max(), 1e-5)
@unittest.skipIf(not TEST_CUDA, 'CUDA unavailable')
def test_dlpack_converter_cuda(self):
data = np.array([0., 1., 2.], 'float32')
with dragon.device('cuda', 0), execution_context().mode('EAGER_MODE'):
x = dragon.EagerTensor(data, copy=True) + 0
x_to_dlpack = dragon.dlpack.to_dlpack(x)
x_from_dlpack = dragon.dlpack.from_dlpack(x_to_dlpack)
self.assertEqual(x_from_dlpack.device.type, 'cuda')
self.assertEqual(x_from_dlpack.device.index, 0)
self.assertEqual(x_from_dlpack.shape, list(data.shape))
self.assertEqual(x_from_dlpack.dtype, str(data.dtype))
self.assertLessEqual(np.abs(x_from_dlpack.numpy() - data).max(), 1e-5)
class TestWorkspace(unittest.TestCase):
"""Test the workspace class."""
def test_merge_form(self):
w1, w2 = dragon.Workspace(), dragon.Workspace()
with w1.as_default():
x = dragon.Tensor(str(id(w1))).set_value(0)
w2.merge_from(w1)
with w2.as_default():
self.assertEqual(int(x), 0)
if __name__ == '__main__':
run_tests()
...@@ -247,7 +247,7 @@ class TestActivationOps(OpTestCase): ...@@ -247,7 +247,7 @@ class TestActivationOps(OpTestCase):
result = np.maximum(data1, 0.) + np.minimum(data1, 0.) * data2 result = np.maximum(data1, 0.) + np.minimum(data1, 0.) * data2
grad1 = data1 * ((data1 > 0.) + (data1 < 0.) * data2) grad1 = data1 * ((data1 > 0.) + (data1 < 0.) * data2)
grad2 = reduce_like(data1 * ((data1 < 0.) * data1), data2) grad2 = reduce_like(data1 * ((data1 < 0.) * data1), data2)
self.assertEqual([y, dx, dw], [result, grad1, grad2.reshape((-1,))]) self.assertEqual([y, dx, dw], [result, grad1, grad2.flatten()])
@unittest.skipIf(not TEST_CUDA, 'CUDA unavailable') @unittest.skipIf(not TEST_CUDA, 'CUDA unavailable')
def test_prelu_cuda(self): def test_prelu_cuda(self):
...@@ -831,19 +831,20 @@ class TestArrayOps(OpTestCase): ...@@ -831,19 +831,20 @@ class TestArrayOps(OpTestCase):
self.test_stack() self.test_stack()
def test_tile(self): def test_tile(self):
entries = [(1, 1), (1, 2), (2, 1), (2, 2)] entries = [(2,), (1, 1), (1, 2), (2, 1), (2, 2)]
for execution in ('EAGER_MODE', 'GRAPH_MODE'): for execution in ('EAGER_MODE', 'GRAPH_MODE'):
with execution_context().mode(execution): with execution_context().mode(execution):
for multiples in entries: for repeats in entries:
data = arange((2, 2)) data = arange((2, 2))
grad = np.tile(data, multiples)
x = new_tensor(data) x = new_tensor(data)
dy = new_tensor(grad)
with dragon.GradientTape() as tape: with dragon.GradientTape() as tape:
tape.watch(x) tape.watch(x)
y = dragon.tile(x, multiples) y = dragon.tile(x, repeats)
repeats = repeats + (1,) * (len(data.shape) - len(repeats))
grad = np.tile(data, repeats)
dy = new_tensor(grad)
dx = tape.gradient(y, [x], output_gradients=[dy])[0] dx = tape.gradient(y, [x], output_gradients=[dy])[0]
self.assertEqual([y, dx], [grad, data * np.prod(multiples)]) self.assertEqual([y, dx], [grad, data * np.prod(repeats)])
@unittest.skipIf(not TEST_CUDA, 'CUDA unavailable') @unittest.skipIf(not TEST_CUDA, 'CUDA unavailable')
def test_tile_cuda(self): def test_tile_cuda(self):
...@@ -2784,7 +2785,8 @@ class TestTrainingOps(OpTestCase): ...@@ -2784,7 +2785,8 @@ class TestTrainingOps(OpTestCase):
self.adam = dragon.optimizers.Adam() self.adam = dragon.optimizers.Adam()
self.nesterov = dragon.optimizers.Nesterov() self.nesterov = dragon.optimizers.Nesterov()
self.rmsprop = dragon.optimizers.RMSprop() self.rmsprop = dragon.optimizers.RMSprop()
self.sgd = dragon.optimizers.SGD() self.sgd = dragon.optimizers.SGD(name='MyOptimizer')
self.sgd.base_lr = 0.01
def test_adam_update(self): def test_adam_update(self):
with execution_context().mode('EAGER_MODE'): with execution_context().mode('EAGER_MODE'):
...@@ -2798,7 +2800,7 @@ class TestTrainingOps(OpTestCase): ...@@ -2798,7 +2800,7 @@ class TestTrainingOps(OpTestCase):
coef = math.sqrt(1 - math.pow(beta2, t)) / (1 - math.pow(beta1, t)) coef = math.sqrt(1 - math.pow(beta2, t)) / (1 - math.pow(beta1, t))
data4 = uniform((2, 3)) data4 = uniform((2, 3))
grad = new_tensor(data4) grad = new_tensor(data4)
self.adam._run_update(param, grad) self.adam.apply_gradients([[param, grad]])
data2 = beta1 * data2 + (1 - beta1) * data4 data2 = beta1 * data2 + (1 - beta1) * data4
data3 = beta2 * data3 + (1 - beta2) * np.square(data4) data3 = beta2 * data3 + (1 - beta2) * np.square(data4)
data1 -= (lr * coef * data2 / (np.sqrt(data3) + eps)) data1 -= (lr * coef * data2 / (np.sqrt(data3) + eps))
...@@ -2817,7 +2819,7 @@ class TestTrainingOps(OpTestCase): ...@@ -2817,7 +2819,7 @@ class TestTrainingOps(OpTestCase):
for i in range(2): for i in range(2):
data3 = uniform((2, 3)) data3 = uniform((2, 3))
grad = new_tensor(data3) grad = new_tensor(data3)
self.nesterov._run_update(param, grad) self.nesterov.apply_gradients([[param, grad]])
data2_new = momentum * data2 + lr * data3 data2_new = momentum * data2 + lr * data3
data1 -= (1 + momentum) * data2_new - momentum * data2 data1 -= (1 + momentum) * data2_new - momentum * data2
data2 = data2_new data2 = data2_new
...@@ -2838,7 +2840,7 @@ class TestTrainingOps(OpTestCase): ...@@ -2838,7 +2840,7 @@ class TestTrainingOps(OpTestCase):
for i in range(2): for i in range(2):
data4 = uniform((2, 3)) data4 = uniform((2, 3))
grad = new_tensor(data4) grad = new_tensor(data4)
self.rmsprop._run_update(param, grad) self.rmsprop.apply_gradients([[param, grad]])
data3 = decay * data3 + (1 - decay) * np.square(data4) data3 = decay * data3 + (1 - decay) * np.square(data4)
data2 = momentum * data2 + (lr * data4 / (np.sqrt(data3) + eps)) data2 = momentum * data2 + (lr * data4 / (np.sqrt(data3) + eps))
data1 -= data2 data1 -= data2
...@@ -2857,7 +2859,7 @@ class TestTrainingOps(OpTestCase): ...@@ -2857,7 +2859,7 @@ class TestTrainingOps(OpTestCase):
for i in range(2): for i in range(2):
data3 = uniform((2, 3)) data3 = uniform((2, 3))
grad = new_tensor(data3) grad = new_tensor(data3)
self.sgd._run_update(param, grad) self.sgd.apply_gradients([[param, grad]])
data2 = momentum * data2 + lr * data3 data2 = momentum * data2 + lr * data3
data1 -= data2 data1 -= data2
self.assertEqual(param, data1) self.assertEqual(param, data1)
...@@ -3494,7 +3496,7 @@ def reduce_like(data, other, reduction='sum'): ...@@ -3494,7 +3496,7 @@ def reduce_like(data, other, reduction='sum'):
def uniform(shape, dtype='float32'): def uniform(shape, dtype='float32'):
"""Return the uniform data with given shape.""" """Return the uniform data with given shape."""
return np.random.uniform(size=shape).astype(dtype) return np.random.uniform(-1., 1., size=shape).astype(dtype)
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -19,7 +19,8 @@ import subprocess ...@@ -19,7 +19,8 @@ import subprocess
import argparse import argparse
TESTS_AND_SOURCES = [ TESTS_AND_SOURCES = [
('dragon/core/test_ops', 'dragon.core.ops'), ('dragon/core/test_framework', 'dragon.core'),
('dragon/core/test_ops', 'dragon.core'),
] ]
TESTS = [t[0] for t in TESTS_AND_SOURCES] TESTS = [t[0] for t in TESTS_AND_SOURCES]
......
...@@ -14,6 +14,7 @@ from __future__ import division as _division ...@@ -14,6 +14,7 @@ from __future__ import division as _division
from __future__ import print_function as _print_function from __future__ import print_function as _print_function
# Modules # Modules
from dragon.vm.torch import autograd
from dragon.vm.torch import jit from dragon.vm.torch import jit
from dragon.vm.torch import nn from dragon.vm.torch import nn
from dragon.vm.torch import onnx from dragon.vm.torch import onnx
......
...@@ -13,6 +13,7 @@ from __future__ import absolute_import ...@@ -13,6 +13,7 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
from dragon.vm.torch.autograd.backprop import backward
from dragon.vm.torch.autograd.grad_mode import enable_grad from dragon.vm.torch.autograd.grad_mode import enable_grad
from dragon.vm.torch.autograd.grad_mode import no_grad from dragon.vm.torch.autograd.grad_mode import no_grad
from dragon.vm.torch.autograd.grad_mode import set_grad_enabled from dragon.vm.torch.autograd.grad_mode import set_grad_enabled
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Do back-propagation from the executed functions."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.torch import executor
def backward(tensors, grad_tensors=None, retain_graph=False):
"""Compute the derivatives of tensors w.r.t. graph leaves.
Parameters
----------
tensors : Sequence[dragon.vm.torch.Tensor]
The derivative targets.
grad_tensors : Sequence[dragon.vm.torch.Tensor], optional
The optional gradient of ``tensors``.
retain_graph : bool, optional, default=False
**False** to free the graph used to compute grad.
"""
return executor.run_backward(
tensors=tensors,
grad_tensors=grad_tensors,
retain_graph=retain_graph,
)
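A short usage sketch of the functional entry point added here (re-exported as `dragon.vm.torch.autograd.backward` by the package `__init__` shown above); the shapes and values are illustrative, and per the tensor docstring an implicit one-filled gradient is used when `grad_tensors` is omitted:

```python
import dragon.vm.torch as torch

x = torch.ones(2, 3, requires_grad=True)
y = x + 1
torch.autograd.backward([y])  # same effect as y.backward()
print(x.grad)                 # ones with shape (2, 3)
```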
...@@ -98,7 +98,7 @@ class Function(object): ...@@ -98,7 +98,7 @@ class Function(object):
"""Generate the OpDef from attributes.""" """Generate the OpDef from attributes."""
attributes = self.attributes() attributes = self.attributes()
self._def = proto_util.make_operator_cdef( self._def = proto_util.make_operator_cdef(
name=attributes.get('name', 'GenericOp'), name=attributes.get('name', 'Op'),
cache_key=self._cache_key, cache_key=self._cache_key,
op_type=attributes['op_type'], op_type=attributes['op_type'],
device_option=proto_util.get_device_option( device_option=proto_util.get_device_option(
......
...@@ -15,69 +15,15 @@ from __future__ import print_function ...@@ -15,69 +15,15 @@ from __future__ import print_function
import warnings import warnings
from dragon.core.framework import workspace
from dragon.vm.torch.tensor import Tensor
class Variable(object):
def Variable(tensor, requires_grad=False, volatile=False): """The variable class."""
if volatile:
warnings.warn("volatile was removed and now has no effect. " def __new__(cls, tensor, requires_grad=False, volatile=False):
"Use `with torch.no_grad():` instead.", stacklevel=2) if volatile:
if requires_grad and volatile: warnings.warn("volatile was removed and now has no effect. "
raise RuntimeError("Variable can't be volatile and require_grad at the same time!") "Use `with torch.no_grad():` instead.", stacklevel=2)
tensor.requires_grad = requires_grad if requires_grad and volatile:
return tensor raise RuntimeError("Variable can't be volatile and require_grad at the same time.")
tensor.requires_grad = requires_grad
return tensor
@property
def volatile(self):
warnings.warn("volatile was removed (Variable.volatile is always False)", stacklevel=2)
return False
def backward(self, gradient=None):
if not self._requires_grad:
raise RuntimeError(
'This variable does not require grads.'
'\nCan not backward from this variable.'
)
# Collect and sort out the operation from tapes.
operations = [v for k, v in sorted(self.__tape__.operations.items())]
# Prepare resources to optimize the backward pass.
input_grads = []
if gradient is not None:
if not isinstance(gradient, Tensor):
raise TypeError(
'<gradient> can be either Tensor, Variable or None, '
'got {}'.format(type(gradient).__name__)
)
if gradient.shape != self.shape:
raise ValueError(
'Except the dimensions of <gradient> is {}, '
'got {}.'.format(self.shape, gradient.shape))
input_grads.append(gradient.id)
# Dispatch the backward execution.
workspace.run_backward(
operations,
targets=[self.id],
sources=None,
input_grads=input_grads,
ignored_grads=list(self._ignored_grads),
)
# Release the holt resources.
gc = workspace.get_workspace().collectors
for op_def in operations:
gc.OPERATOR.collect(op_def.name)
for output in op_def.output:
if output not in op_def.input:
gc.TENSOR.collect(output)
# The monkey-patching.
Tensor.backward = backward
Tensor.volatile = volatile
...@@ -15,12 +15,11 @@ from __future__ import absolute_import ...@@ -15,12 +15,11 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
from dragon.core.framework import config
from dragon.core.framework import context from dragon.core.framework import context
from dragon.core.framework import workspace from dragon.core.framework import workspace
from dragon.core.util import six from dragon.core.util import six
from dragon.vm.torch.autograd import grad_mode from dragon.vm.torch.autograd import grad_mode
from dragon.vm.torch.cpp import device as Device from dragon.vm.torch.cpp import device as device_cls
from dragon.vm.torch.jit import tape from dragon.vm.torch.jit import tape
from dragon.vm.torch.tensor import Tensor from dragon.vm.torch.tensor import Tensor
...@@ -32,6 +31,7 @@ def run_operator( ...@@ -32,6 +31,7 @@ def run_operator(
no_grad=False, no_grad=False,
pre_callback=None, pre_callback=None,
): ):
"""Compute the outputs."""
requires_grad = False requires_grad = False
input_names, output_names = [], [] input_names, output_names = [], []
...@@ -43,7 +43,6 @@ def run_operator( ...@@ -43,7 +43,6 @@ def run_operator(
requires_grad = requires_grad and grad_mode.is_grad_enabled() requires_grad = requires_grad and grad_mode.is_grad_enabled()
# Allocate outputs. # Allocate outputs.
cfg = config.config()
ws = workspace.get_workspace() ws = workspace.get_workspace()
output_scope = context.get_eager_scope(requires_grad) output_scope = context.get_eager_scope(requires_grad)
gc = ws.collectors # Garbage collectors gc = ws.collectors # Garbage collectors
...@@ -52,46 +51,90 @@ def run_operator( ...@@ -52,46 +51,90 @@ def run_operator(
if isinstance(spec, six.string_types): if isinstance(spec, six.string_types):
output_names.append(spec) output_names.append(spec)
else: else:
if isinstance(spec, Device): if isinstance(spec, device_cls):
output_id = gc.TENSOR.alloc(output_scope) impl = ws.create_tensor(gc.TENSOR.alloc(output_scope))
ref = Tensor(device=spec) outputs[i] = Tensor(device=spec, gc=gc.TENSOR, impl=impl)
ref.__gc__, ref._id = gc.TENSOR, output_id
ref._impl = ws.CreateTensor(output_id)
outputs[i] = ref
output_names.append(outputs[i].id) output_names.append(outputs[i].id)
# Generate the OpDef. # Generate the OpDef.
default_tape = tape.get_default_tape() default_tape = tape.get_default_tape()
op_def = op_def.DeriveTo(input_names, output_names) op_def = op_def.DeriveTo(input_names, output_names)
# Maybe record this operation for future developments. # Record this operation for future developments.
if default_tape is not None: if default_tape is not None:
default_tape.add_def(op_def) default_tape.add_def(op_def)
requires_grad = requires_grad or default_tape.retain_graph requires_grad = requires_grad or default_tape.retain_graph
if len(inputs) > 0 and no_grad is False: if len(inputs) > 0 and no_grad is False:
if requires_grad: if requires_grad:
ignores = set()
instance_tape = tape.Tape() instance_tape = tape.Tape()
for input in inputs: for input in inputs:
instance_tape.merge_from(input.__tape__) instance_tape.merge_from(input._tape)
ignores = ignores.union(input._ignored_grads) if not input._requires_grad:
op_def.name = gc.OPERATOR.alloc(op_def.type) instance_tape.add_empty_grad(input.id + '_grad')
op_def.name = gc.OP.alloc(op_def.type)
instance_tape.add_operation(op_def) instance_tape.add_operation(op_def)
for output in outputs: for output in outputs:
output.requires_grad = True output._tape = instance_tape
output._ignored_grads = ignores output._requires_grad = True
output.__tape__ = instance_tape
else: else:
if default_tape is not None and default_tape.retain_ops: if default_tape is not None and default_tape.retain_ops:
op_def.name = gc.OPERATOR.alloc(op_def.type) op_def.name = gc.OP.alloc(op_def.type)
for output in outputs: for output in outputs:
output.requires_grad = False output._requires_grad = False
# Dispatch the computation. # Dispatch the computation.
if pre_callback is not None: if pre_callback is not None:
pre_callback(ws, op_def.name) pre_callback(ws, op_def.name)
ws.RunOperator(op_def, cfg.graph_verbosity > 0) ws.run_operator(op_def)
# Return the outputs. # Return the outputs.
return outputs if len(outputs) > 1 else outputs[0] return outputs if len(outputs) > 1 else outputs[0]
def run_backward(tensors, grad_tensors=None, retain_graph=False):
"""Compute the gradients."""
# Collect the tapes from the target tensors
default_tape = tape.Tape()
for i, tensor in enumerate(tensors):
if not tensor._requires_grad:
raise RuntimeError('Element %d of tensors does not require grad.' % i)
default_tape.merge_from(tensor._tape)
# Collect the grad from tensors
input_grads = []
if grad_tensors is not None:
if len(grad_tensors) != len(tensors):
raise ValueError('Number of tensors and grad tensors should be the same.')
for i, grad_tensor in enumerate(grad_tensors):
if not isinstance(grad_tensor, Tensor):
raise TypeError(
'Element {} of grad tensors should be a tensor, got {}.'
.format(i, type(grad_tensor).__name__))
if grad_tensor.shape != tensors[i].shape:
raise ValueError(
'Size of element {} of grad tensors should be {}, got {}.'
.format(i, tensors[i].shape, grad_tensor.shape))
input_grads.append(grad_tensor.id)
# Prepare resources to optimize the backward pass.
op_defs = [v for k, v in sorted(default_tape.operations.items())]
# Dispatch the backward execution.
current_ws = workspace.get_workspace()
current_ws.run_backward(
op_defs=op_defs,
targets=[tensor.id for tensor in tensors],
sources=default_tape.sources,
input_grads=input_grads,
empty_grads=default_tape.empty_grads,
)
# Free the retained resources
if not retain_graph:
gc = current_ws.collectors
for op_def in op_defs:
gc.OP.collect(op_def.name)
for output in op_def.output:
if output not in op_def.input:
gc.TENSOR.collect(output)
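Because the executor collects the recorded operator names and intermediate outputs as soon as the pass finishes, running a second backward over the same graph needs `retain_graph=True` on the first call. A hedged usage sketch (tensor values are illustrative):

```python
import dragon.vm.torch as torch

x = torch.ones(2, 3, requires_grad=True)
y = x + 1
y.backward(retain_graph=True)  # keep the recorded defs for another pass
y.backward()                   # the second pass may now release them
print(x.grad)
```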
...@@ -34,32 +34,54 @@ class Tape(object): ...@@ -34,32 +34,54 @@ class Tape(object):
def __init__(self, retain_ops=False, retain_graph=False): def __init__(self, retain_ops=False, retain_graph=False):
self._defs = [] self._defs = []
self._operations = dict() self._operations = dict()
self._sources = set()
self._empty_grads = set()
self.retain_ops = retain_ops self.retain_ops = retain_ops
self.retain_graph = retain_graph self.retain_graph = retain_graph
@property @property
def defs(self): def defs(self):
"""Return the recording defs.""" """Return the recorded defs."""
return self._defs return self._defs
@property @property
def empty_grads(self):
"""Return the recorded empty grads."""
return list(self._empty_grads)
@property
def operations(self): def operations(self):
"""Return the recording operations.""" """Return the recorded operations."""
return self._operations return self._operations
@property
def sources(self):
"""Return the recorded empty grads."""
return list(self._sources)
def add_def(self, op_def): def add_def(self, op_def):
"""Add a new def.""" """Add a new def."""
self._defs.append(op_def) self._defs.append(op_def)
def add_empty_grad(self, tensor_id):
"""Add an empty grad for optimization."""
self._empty_grads.add(tensor_id)
def add_operation(self, op_def): def add_operation(self, op_def):
"""Add a new operation.""" """Add a new operation."""
uid = next(self.UID_GENERATOR) uid = next(self.UID_GENERATOR)
self._operations[uid] = op_def self._operations[uid] = op_def
def add_source(self, tensor_id):
"""Add a source for optimization."""
self._sources.add(tensor_id)
def merge_from(self, other): def merge_from(self, other):
"""Merge operations from another.""" """Merge operations from another."""
if other is not None: if other is not None:
self._operations = {**self._operations, **other._operations} self._operations = {**self._operations, **other._operations}
self._sources = self._sources.union(other._sources)
self._empty_grads = self._empty_grads.union(other._empty_grads)
def __enter__(self): def __enter__(self):
"""Enter the tape into the stack.""" """Enter the tape into the stack."""
......
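A small sketch of the bookkeeping the tape now carries: `sources` are tensor ids whose grads should be kept, `empty_grads` are ids known to need no grad, and `merge_from` unions both along with the recorded operations. The ids below are illustrative:

```python
from dragon.vm.torch.jit import tape

t1, t2 = tape.Tape(), tape.Tape()
t1.add_source('x')            # ask to retain the grad of 'x'
t2.add_empty_grad('b_grad')   # 'b' is known to need no grad
t1.merge_from(t2)             # union operations, sources and empty grads
print(sorted(t1.sources), sorted(t1.empty_grads))  # ['x'] ['b_grad']
```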
...@@ -83,9 +83,10 @@ class FunctionGuard(object): ...@@ -83,9 +83,10 @@ class FunctionGuard(object):
symbols = self.inputs symbols = self.inputs
inputs, extra_args = self._function_spec \ inputs, extra_args = self._function_spec \
.canonicalize_inputs(*args, **kwargs) .canonicalize_inputs(*args, **kwargs)
current_ws = workspace.get_workspace()
for sym, data in zip(symbols, inputs): for sym, data in zip(symbols, inputs):
if hasattr(data, 'id'): if hasattr(data, 'id'):
workspace.set_tensor_alias(data.id, sym.id) current_ws.register_alias(data.id, sym.id)
return symbols, extra_args return symbols, extra_args
def __call__(self, *args, **kwargs): def __call__(self, *args, **kwargs):
...@@ -125,7 +126,7 @@ class FunctionGuard(object): ...@@ -125,7 +126,7 @@ class FunctionGuard(object):
# In this case, we have the recorded IR. # In this case, we have the recorded IR.
# Notify the backend to run directly. # Notify the backend to run directly.
self.canonicalize_inputs(*args, **kwargs) self.canonicalize_inputs(*args, **kwargs)
workspace.run_operator(self.defs) workspace.get_workspace().run_operator(self.defs)
return self.outputs return self.outputs
def __get__(self, instance, owner): def __get__(self, instance, owner):
......
...@@ -40,7 +40,6 @@ class _ConvNd(function.Function): ...@@ -40,7 +40,6 @@ class _ConvNd(function.Function):
self.dilations = kwargs.get('dilations', 1) self.dilations = kwargs.get('dilations', 1)
self.group = kwargs.get('group', None) self.group = kwargs.get('group', None)
self.output_padding = kwargs.get('output_padding', None) self.output_padding = kwargs.get('output_padding', None)
self.padding = None if self.output_padding is None else 'SAME'
def attributes(self): def attributes(self):
return { return {
...@@ -50,7 +49,6 @@ class _ConvNd(function.Function): ...@@ -50,7 +49,6 @@ class _ConvNd(function.Function):
'strides': self.strides, 'strides': self.strides,
'pads': self.pads, 'pads': self.pads,
'dilations': self.dilations, 'dilations': self.dilations,
'padding': self.padding,
'output_padding': self.output_padding, 'output_padding': self.output_padding,
'group': self.group, 'group': self.group,
'data_format': 'NCHW', 'data_format': 'NCHW',
...@@ -511,35 +509,29 @@ class Resize(function.Function): ...@@ -511,35 +509,29 @@ class Resize(function.Function):
'align_corners': self.align_corners, 'align_corners': self.align_corners,
'data_format': 'NCHW', 'data_format': 'NCHW',
'sizes_descs': [ 'sizes_descs': [
'${{HANDLE}}/sizes[{}]'.format(n) '${{HANDLE}}/sizes[{}]'
for n in range(self.num_sizes) .format(n) for n in range(self.num_sizes)],
],
'scales_descs': [ 'scales_descs': [
'${{HANDLE}}/scales[{}]'.format(n) '${{HANDLE}}/scales[{}]'
for n in range(self.num_scales) .format(n) for n in range(self.num_scales)],
],
} }
} }
def feed(self, ws, handle, sizes, scales): def feed(self, ws, handle, sizes, scales):
for i in range(self.num_sizes): for i in range(self.num_sizes):
self.feed_arg( self.feed_arg(
ws, ws, '{}/sizes[{}]'.format(handle, i),
'{}/sizes[{}]'.format(handle, i), sizes[i], 'int64')
sizes[i], 'int64',
)
for i in range(self.num_scales): for i in range(self.num_scales):
self.feed_arg( self.feed_arg(
ws, ws, '{}/scales[{}]'.format(handle, i),
'{}/scales[{}]'.format(handle, i), scales[i], 'float32')
scales[i], 'float32',
)
def forward(self, input, sizes=None, scales=None): def forward(self, input, sizes=None, scales=None):
return self.dispatch( return self.dispatch(
[input], [self.alloc()], [input], [self.alloc()],
callback=lambda ws, handle: callback=lambda ws, handle:
self.feed(ws, handle, sizes, scales) self.feed(ws, handle, sizes, scales),
) )
......
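The `feed` blocks reformatted above (and the similar ones in the array and init function files below) all follow one convention: `attributes()` declares per-element descriptors of the form `${HANDLE}/name[i]`, and `feed()` writes the concrete values into those workspace slots right before dispatch. A structural sketch of that pattern with a hypothetical `Pad` op; the base-class import path is assumed, and `feed_arg`, `alloc` and `dispatch` are the framework hooks used by the surrounding code:

```python
# The import path is assumed from the repo layout; the surrounding
# *_functions.py modules import the same Function base class.
from dragon.vm.torch.autograd import function


class Pad(function.Function):
    """Hypothetical op showing the desc/feed convention."""

    def __init__(self, key, dev, **kwargs):
        super(Pad, self).__init__(key, dev, **kwargs)
        self.ndim = kwargs.get('ndim', 0)

    def attributes(self):
        return {
            'op_type': 'Pad',
            'arguments': {
                'pads_descs': [
                    '${{HANDLE}}/pads[{}]'
                    .format(n) for n in range(self.ndim)],
            },
        }

    def feed(self, ws, handle, pads):
        # Fill each '${HANDLE}/pads[i]' slot with its concrete value.
        for i in range(self.ndim):
            self.feed_arg(
                ws, '{}/pads[{}]'.format(handle, i),
                pads[i], 'int64')

    def forward(self, input, pads):
        return self.dispatch(
            [input], [self.alloc()],
            callback=lambda ws, handle:
                self.feed(ws, handle, pads))
```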
...@@ -237,7 +237,7 @@ class Module(object): ...@@ -237,7 +237,7 @@ class Module(object):
Parameters Parameters
---------- ----------
state_dict : Dict state_dict : dict
The state dict. The state dict.
strict : bool, optional, default=True strict : bool, optional, default=True
**True** to verify the names strictly. **True** to verify the names strictly.
...@@ -474,7 +474,7 @@ class Module(object): ...@@ -474,7 +474,7 @@ class Module(object):
Parameters Parameters
---------- ----------
destination : Dict, optional destination : dict, optional
The optional output dict. The optional output dict.
prefix : str, optional, default='' prefix : str, optional, default=''
The prefix added to the name of states. The prefix added to the name of states.
...@@ -556,7 +556,7 @@ class Module(object): ...@@ -556,7 +556,7 @@ class Module(object):
child_lines = [] child_lines = []
for key, module in self._modules.items(): for key, module in self._modules.items():
mod_str = repr(module) mod_str = repr(module)
mod_str = _addindent(mod_str, 2) mod_str = _add_indent(mod_str, 2)
child_lines.append('(' + key + '): ' + mod_str) child_lines.append('(' + key + '): ' + mod_str)
lines = extra_lines + child_lines lines = extra_lines + child_lines
main_str = self._get_name() + '(' main_str = self._get_name() + '('
...@@ -599,7 +599,7 @@ class Module(object): ...@@ -599,7 +599,7 @@ class Module(object):
object.__setattr__(self, key, value) object.__setattr__(self, key, value)
def _addindent(s_, num_spaces): def _add_indent(s_, num_spaces):
s = s_.split('\n') s = s_.split('\n')
if len(s) == 1: if len(s) == 1:
return s_ return s_
......
...@@ -61,15 +61,11 @@ class Assign(function.Function): ...@@ -61,15 +61,11 @@ class Assign(function.Function):
def feed(self, ws, handle, starts, sizes): def feed(self, ws, handle, starts, sizes):
for i in range(self.ndim): for i in range(self.ndim):
self.feed_arg( self.feed_arg(
ws, ws, '{}/starts[{}]'.format(handle, i),
'{}/starts[{}]'.format(handle, i), starts[i], 'int64')
starts[i], 'int64',
)
self.feed_arg( self.feed_arg(
ws, ws, '{}/sizes[{}]'.format(handle, i),
'{}/sizes[{}]'.format(handle, i), sizes[i], 'int64')
sizes[i], 'int64',
)
def forward(self, out, starts, sizes, input): def forward(self, out, starts, sizes, input):
self._check_device([input, out]) self._check_device([input, out])
...@@ -90,9 +86,7 @@ class Cast(function.Function): ...@@ -90,9 +86,7 @@ class Cast(function.Function):
def attributes(self): def attributes(self):
return { return {
'op_type': 'Cast', 'op_type': 'Cast',
'arguments': { 'arguments': {'dtype': self.dtype},
'dtype': self.dtype,
},
} }
def forward(self, input, inplace=False): def forward(self, input, inplace=False):
...@@ -122,18 +116,15 @@ class ChannelNormalize(function.Function): ...@@ -122,18 +116,15 @@ class ChannelNormalize(function.Function):
'dtype': self.dtype, 'dtype': self.dtype,
'perm_descs': [ 'perm_descs': [
'${{HANDLE}}/perm[{}]' '${{HANDLE}}/perm[{}]'
.format(n) for n in range(self.ndim) .format(n) for n in range(self.ndim)],
],
} }
} }
def feed(self, ws, handle, perm): def feed(self, ws, handle, perm):
for i in range(self.ndim): for i in range(self.ndim):
self.feed_arg( self.feed_arg(
ws, ws, '{}/perm[{}]'.format(handle, i),
'{}/perm[{}]'.format(handle, i), perm[i], 'int64')
perm[i], 'int64',
)
def forward(self, input, perm): def forward(self, input, perm):
return self.dispatch( return self.dispatch(
...@@ -171,9 +162,7 @@ class Concat(function.Function): ...@@ -171,9 +162,7 @@ class Concat(function.Function):
def attributes(self): def attributes(self):
return { return {
'op_type': 'Concat', 'op_type': 'Concat',
'arguments': { 'arguments': {'axis': self.axis},
'axis': self.axis,
},
} }
def forward(self, seq, out=None): def forward(self, seq, out=None):
...@@ -215,18 +204,15 @@ class Expand(function.Function): ...@@ -215,18 +204,15 @@ class Expand(function.Function):
'arguments': { 'arguments': {
'dims_descs': [ 'dims_descs': [
'${{HANDLE}}/dims[{}]' '${{HANDLE}}/dims[{}]'
.format(n) for n in range(self.ndim) .format(n) for n in range(self.ndim)],
],
}, },
} }
def feed(self, ws, handle, times): def feed(self, ws, handle, times):
for i in range(self.ndim): for i in range(self.ndim):
self.feed_arg( self.feed_arg(
ws, ws, '{}/dims[{}]'.format(handle, i),
'{}/dims[{}]'.format(handle, i), times[i], 'int64')
times[i], 'int64',
)
def forward(self, input, dims): def forward(self, input, dims):
return self.dispatch( return self.dispatch(
...@@ -361,18 +347,15 @@ class Reshape(function.Function): ...@@ -361,18 +347,15 @@ class Reshape(function.Function):
'arguments': { 'arguments': {
'dims_descs': [ 'dims_descs': [
'${{HANDLE}}/dims[{}]' '${{HANDLE}}/dims[{}]'
.format(n) for n in range(self.ndim) .format(n) for n in range(self.ndim)],
],
}, },
} }
def feed(self, ws, handle, shape): def feed(self, ws, handle, shape):
for i in range(self.ndim): for i in range(self.ndim):
self.feed_arg( self.feed_arg(
ws, ws, '{}/dims[{}]'.format(handle, i),
'{}/dims[{}]'.format(handle, i), shape[i], 'int64')
shape[i], 'int64',
)
def forward(self, input, shape, out=None): def forward(self, input, shape, out=None):
out = out if out else self.alloc() out = out if out else self.alloc()
...@@ -394,27 +377,21 @@ class Slice(function.Function): ...@@ -394,27 +377,21 @@ class Slice(function.Function):
'arguments': { 'arguments': {
'starts_descs': [ 'starts_descs': [
'${{HANDLE}}/starts[{}]' '${{HANDLE}}/starts[{}]'
.format(n) for n in range(self.ndim) .format(n) for n in range(self.ndim)],
],
'sizes_descs': [ 'sizes_descs': [
'${{HANDLE}}/sizes[{}]' '${{HANDLE}}/sizes[{}]'
.format(n) for n in range(self.ndim) .format(n) for n in range(self.ndim)],
],
}, },
} }
def feed(self, ws, handle, starts, sizes): def feed(self, ws, handle, starts, sizes):
for i in range(self.ndim): for i in range(self.ndim):
self.feed_arg( self.feed_arg(
ws, ws, '{}/starts[{}]'.format(handle, i),
'{}/starts[{}]'.format(handle, i), starts[i], 'int64')
starts[i], 'int64',
)
self.feed_arg( self.feed_arg(
ws, ws, '{}/sizes[{}]'.format(handle, i),
'{}/sizes[{}]'.format(handle, i), sizes[i], 'int64')
sizes[i], 'int64',
)
def forward(self, input, starts, sizes): def forward(self, input, starts, sizes):
return self.dispatch( return self.dispatch(
...@@ -489,19 +466,18 @@ class Tile(function.Function): ...@@ -489,19 +466,18 @@ class Tile(function.Function):
return { return {
'op_type': 'Tile', 'op_type': 'Tile',
'arguments': { 'arguments': {
'multiples_descs': [ 'repeats_descs': [
'${{HANDLE}}/multiples[{}]' '${{HANDLE}}/repeats[{}]'
.format(n) for n in range(self.ndim) .format(n) for n in range(self.ndim)],
],
}, },
} }
def feed(self, ws, handle, times): def feed(self, ws, handle, repeats):
for i in range(self.ndim): for i in range(self.ndim):
self.feed_arg( self.feed_arg(
ws, ws,
'{}/multiples[{}]'.format(handle, i), '{}/repeats[{}]'.format(handle, i),
times[i], 'int64', repeats[i], 'int64',
) )
def forward(self, input, times): def forward(self, input, times):
...@@ -523,18 +499,15 @@ class Transpose(function.Function): ...@@ -523,18 +499,15 @@ class Transpose(function.Function):
'arguments': { 'arguments': {
'perm_descs': [ 'perm_descs': [
'${{HANDLE}}/perm[{}]' '${{HANDLE}}/perm[{}]'
.format(n) for n in range(self.ndim) .format(n) for n in range(self.ndim)],
],
}, },
} }
def feed(self, ws, handle, perm): def feed(self, ws, handle, perm):
for i in range(self.ndim): for i in range(self.ndim):
self.feed_arg( self.feed_arg(
ws, ws, '{}/perm[{}]'.format(handle, i),
'{}/perm[{}]'.format(handle, i), perm[i], 'int64')
perm[i], 'int64',
)
def forward(self, input, perm): def forward(self, input, perm):
return self.dispatch( return self.dispatch(
......
...@@ -25,10 +25,8 @@ class _Initializer(function.Function): ...@@ -25,10 +25,8 @@ class _Initializer(function.Function):
def feed(self, ws, handle, shape): def feed(self, ws, handle, shape):
for i in range(self.ndim): for i in range(self.ndim):
self.feed_arg( self.feed_arg(
ws, ws, '{}/dims[{}]'.format(handle, i),
'{}/dims[{}]'.format(handle, i), shape[i], 'int64')
shape[i], 'int64',
)
def forward(self, out, shape, shape_like=None): def forward(self, out, shape, shape_like=None):
return self.dispatch( return self.dispatch(
...@@ -51,22 +49,19 @@ class Arange(function.Function): ...@@ -51,22 +49,19 @@ class Arange(function.Function):
'dtype': self.dtype, 'dtype': self.dtype,
'slice_descs': [ 'slice_descs': [
'${{HANDLE}}/slice[{}]' '${{HANDLE}}/slice[{}]'
.format(n) for n in range(self.num_args) .format(n) for n in range(self.num_args)],
],
} }
} }
def feed(self, ws, handle, slice_args): def feed(self, ws, handle, slice_args):
for i in range(len(slice_args)): for i in range(len(slice_args)):
self.feed_arg( self.feed_arg(
ws, ws, '{}/slice[{}]'.format(handle, i),
'{}/slice[{}]'.format(handle, i), slice_args[i], 'float32')
slice_args[i], 'float32'
)
def forward(self, slice_args): def forward(self, slice_args, out=None):
return self.dispatch( return self.dispatch(
[], [self.alloc()], [], [out if out else self.alloc()],
callback=lambda ws, handle: callback=lambda ws, handle:
self.feed(ws, handle, slice_args) self.feed(ws, handle, slice_args)
) )
...@@ -85,8 +80,7 @@ class Eye(_Initializer): ...@@ -85,8 +80,7 @@ class Eye(_Initializer):
'dtype': self.dtype, 'dtype': self.dtype,
'dims_descs': [ 'dims_descs': [
'${{HANDLE}}/dims[{}]'.format(n) '${{HANDLE}}/dims[{}]'.format(n)
for n in range(self.ndim) for n in range(self.ndim)],
],
}, },
} }
...@@ -104,8 +98,7 @@ class Fill(_Initializer): ...@@ -104,8 +98,7 @@ class Fill(_Initializer):
'value': float(self.value), 'value': float(self.value),
'dims_descs': [ 'dims_descs': [
'${{HANDLE}}/dims[{}]'.format(n) '${{HANDLE}}/dims[{}]'.format(n)
for n in range(self.ndim) for n in range(self.ndim)],
],
}, },
} }
...@@ -125,8 +118,7 @@ class RandomNormal(_Initializer): ...@@ -125,8 +118,7 @@ class RandomNormal(_Initializer):
'std': float(self.std), 'std': float(self.std),
'dims_descs': [ 'dims_descs': [
'${{HANDLE}}/dims[{}]'.format(n) '${{HANDLE}}/dims[{}]'.format(n)
for n in range(self.ndim) for n in range(self.ndim)],
],
}, },
} }
...@@ -146,7 +138,6 @@ class RandomUniform(_Initializer): ...@@ -146,7 +138,6 @@ class RandomUniform(_Initializer):
'high': float(self.high), 'high': float(self.high),
'dims_descs': [ 'dims_descs': [
'${{HANDLE}}/dims[{}]'.format(n) '${{HANDLE}}/dims[{}]'.format(n)
for n in range(self.ndim) for n in range(self.ndim)],
],
}, },
} }
...@@ -97,7 +97,7 @@ def eye( ...@@ -97,7 +97,7 @@ def eye(
The rows and cols of matrix are determined by ``n`` and ``m``: The rows and cols of matrix are determined by ``n`` and ``m``:
```python ```python
print(torch.eye(2)) # [[1., 0.], [0., 1.]] print(torch.eye(2)) # [[1., 0.], [0., 1.]]
print(torch.eye(2, 3)) # [[1., 0., 0.], [0., 1., 0.]] print(torch.eye(2, 3)) # [[1., 0., 0.], [0., 1., 0.]]
``` ```
...@@ -125,11 +125,8 @@ def eye( ...@@ -125,11 +125,8 @@ def eye(
m = n if m is None else m m = n if m is None else m
out = utils.new_leaf([n, m], locals()) if out is None else out out = utils.new_leaf([n, m], locals()) if out is None else out
return _functions.Eye \ return _functions.Eye \
.instantiate( .instantiate(out.device, ndim=2, dtype=out.dtype) \
out.device, .apply(out, [n, m])
ndim=2,
dtype=out.dtype,
).apply(out, [n, m])
def fill(out, shape, value): def fill(out, shape, value):
...@@ -144,11 +141,8 @@ def fill(out, shape, value): ...@@ -144,11 +141,8 @@ def fill(out, shape, value):
def fill_like(out, shape_like, value): def fill_like(out, shape_like, value):
return _functions.Fill \ return _functions.Fill \
.instantiate( .instantiate(out.device, value=float(value), dtype=out.dtype) \
out.device, .apply(out, [], shape_like)
value=float(value),
dtype=out.dtype,
).apply(out, [], shape_like)
def normal(*size, **kwargs): def normal(*size, **kwargs):
......
...@@ -18,6 +18,7 @@ from __future__ import print_function ...@@ -18,6 +18,7 @@ from __future__ import print_function
from dragon.vm.torch.ops.array import functional as array_funcs from dragon.vm.torch.ops.array import functional as array_funcs
from dragon.vm.torch.ops.math import functional as math_funcs from dragon.vm.torch.ops.math import functional as math_funcs
from dragon.vm.torch.ops.init import functional as init_funcs from dragon.vm.torch.ops.init import functional as init_funcs
from dragon.vm.torch import executor
from dragon.vm.torch.tensor import Tensor from dragon.vm.torch.tensor import Tensor
...@@ -85,6 +86,24 @@ def add_(self, value): ...@@ -85,6 +86,24 @@ def add_(self, value):
return math_funcs.add(self, value, self) return math_funcs.add(self, value, self)
def backward(self, gradient=None, retain_graph=False):
"""Compute the derivatives of this tensor w.r.t. graph leaves.
Parameters
----------
gradient : dragon.vm.torch.Tensor, optional
The optional gradient of this tensor.
retain_graph : bool, optional, default=False
**False** to free the graph used to compute grad.
"""
return executor.run_backward(
tensors=[self],
grad_tensors=None if gradient is None else [gradient],
retain_graph=retain_graph,
)
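A usage sketch of the method form with an explicit upstream gradient (shapes and values are illustrative; when `gradient` is omitted an implicit one-filled gradient is used, as the tensor docstring notes):

```python
import dragon.vm.torch as torch

x = torch.ones(2, 3, requires_grad=True)
y = x + 1
grad = torch.ones(2, 3) + 1   # an upstream gradient of twos
y.backward(gradient=grad)
print(x.grad)                 # filled with 2
```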
def bitwise_not(self): def bitwise_not(self):
r"""Compute the element-wise NOT bitwise operation. r"""Compute the element-wise NOT bitwise operation.
...@@ -1638,6 +1657,7 @@ def _process_indices(item): ...@@ -1638,6 +1657,7 @@ def _process_indices(item):
Tensor.abs = abs Tensor.abs = abs
Tensor.add = add Tensor.add = add
Tensor.add_ = add_ Tensor.add_ = add_
Tensor.backward = backward
Tensor.bitwise_not = bitwise_not Tensor.bitwise_not = bitwise_not
Tensor.bitwise_not_ = bitwise_not_ Tensor.bitwise_not_ = bitwise_not_
Tensor.bitwise_xor = bitwise_xor Tensor.bitwise_xor = bitwise_xor
......
...@@ -46,13 +46,14 @@ class ParamUpdate(function.Function): ...@@ -46,13 +46,14 @@ class ParamUpdate(function.Function):
class GradAccumulate(function.Function): class GradAccumulate(function.Function):
def __init__(self, key, dev, **kwargs): def __init__(self, key, dev, **kwargs):
super(GradAccumulate, self).__init__(key, dev, **kwargs) super(GradAccumulate, self).__init__(key, dev, **kwargs)
self.momentum = kwargs.get('momentum', 1)
def attributes(self): def attributes(self):
return { return {
'op_type': 'Axpby', 'op_type': 'Axpby',
'arguments': {'alpha': 1., 'beta': 1.}, 'arguments': {'alpha': 1., 'beta': float(self.momentum)},
} }
def forward(self, grads): def forward(self, grads):
outputs = [grad.id + '[acc]' for grad in grads] outputs = [grad.id + '[accum]' for grad in grads]
return self.dispatch(grads, outputs, no_grad=True) return self.dispatch(grads, outputs, no_grad=True)
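Assuming the standard axpby semantics `y = alpha * x + beta * y`, the call above computes `grad[accum] = grad + momentum * grad[accum]`: `momentum=0` overwrites the buffer and `momentum=1` sums into it. A tiny NumPy check of that arithmetic:

```python
import numpy as np

def axpby(x, y, alpha=1., beta=1.):
    """y = alpha * x + beta * y, the accumulation primitive."""
    return alpha * x + beta * y

grad = np.ones(3, 'float32')
accum = np.zeros(3, 'float32')
accum = axpby(grad, accum, beta=0.)  # first step: overwrite
accum = axpby(grad, accum, beta=1.)  # later steps: sum
print(accum)  # [2. 2. 2.]
```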
...@@ -17,16 +17,17 @@ from dragon.core.util import nest ...@@ -17,16 +17,17 @@ from dragon.core.util import nest
from dragon.vm.torch.ops.training import _functions from dragon.vm.torch.ops.training import _functions
def grad_accumulate(grads): def accumulate_grad(grads, momentum=1):
"""Accumulate the gradients.""" """Accumulate the gradients."""
grads = nest.flatten(grads) grads = nest.flatten(grads)
if len(grads) == 0: if len(grads) == 0:
return return
return _functions.GradAccumulate \ return _functions.GradAccumulate \
.instantiate(grads[0].device).apply(grads) .instantiate(grads[0].device, momentum=momentum) \
.apply(grads)
def param_update( def update_param(
param, param,
grad, grad,
op_type, op_type,
...@@ -34,7 +35,7 @@ def param_update( ...@@ -34,7 +35,7 @@ def param_update(
lr_mult=1, lr_mult=1,
decay_mult=1, decay_mult=1,
): ):
"""Apply the param update.""" """Apply the parameter update."""
return _functions.ParamUpdate \ return _functions.ParamUpdate \
.instantiate( .instantiate(
param.device, param.device,
......
...@@ -34,23 +34,9 @@ def new_leaf(sizes, kwargs): ...@@ -34,23 +34,9 @@ def new_leaf(sizes, kwargs):
def remove_binary_scalar(input, value): def remove_binary_scalar(input, value):
"""Remove the python scalar for binary ops.""" """Remove the python scalar for binary ops."""
if isinstance(input, Tensor): if isinstance(input, Tensor):
# (Tensor, Number) return input, scalar_to_tensor(value, input.dtype, input.device)
return \
input, \
scalar_to_tensor(
value,
input.dtype,
input.device,
)
else: else:
# (Number, Tensor) return scalar_to_tensor(input, value.dtype, value.device), value
return \
scalar_to_tensor(
input,
value.dtype,
value.device,
), \
value
def scalar_to_tensor(input, dtype, device): def scalar_to_tensor(input, dtype, device):
...@@ -64,12 +50,11 @@ def scalar_to_tensor(input, dtype, device): ...@@ -64,12 +50,11 @@ def scalar_to_tensor(input, dtype, device):
'<input> should be a python number, got {}.' '<input> should be a python number, got {}.'
.format(type(input).__name__) .format(type(input).__name__)
) )
tid = '/share/scalar/{}/{}'.format(dtype, str(input)) name = '/share/scalar/{}/{}'.format(dtype, str(input))
if not workspace.has_tensor(tid): current_ws = workspace.get_workspace()
workspace.feed_tensor(tid, numpy.array(input, dtype=dtype)) if not current_ws.has_tensor(name):
t = Tensor(id=tid, dtype=dtype, device=device, own_storage=False) current_ws.feed_tensor(name, numpy.array(input, dtype=dtype))
t.requires_grad = False return Tensor(device=device, impl=current_ws.GetTensor(name), requires_grad=False)
return t
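The cache keys each python number by dtype and value under `/share/scalar/...`, so repeated binary ops with the same constant reuse one workspace tensor. A minimal sketch of that behavior (the value is illustrative):

```python
import numpy
from dragon.core.framework import workspace

current_ws = workspace.get_workspace()
name = '/share/scalar/float32/2.0'    # '/share/scalar/{dtype}/{value}'
if not current_ws.has_tensor(name):
    current_ws.feed_tensor(name, numpy.array(2.0, dtype='float32'))
assert current_ws.has_tensor(name)    # later lookups hit the cache
```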
def unify_devices(tensors, key='Inputs'): def unify_devices(tensors, key='Inputs'):
...@@ -78,13 +63,11 @@ def unify_devices(tensors, key='Inputs'): ...@@ -78,13 +63,11 @@ def unify_devices(tensors, key='Inputs'):
if len(set(types)) != 1: if len(set(types)) != 1:
raise ValueError( raise ValueError(
'{} from different device type: [{}].' '{} from different device type: [{}].'
.format(key, ', '.join(types)) .format(key, ', '.join(types)))
)
if types[0] == 'cuda': if types[0] == 'cuda':
indices = [t._device.index for t in tensors] indices = [t._device.index for t in tensors]
if len(set(indices)) != 1: if len(set(indices)) != 1:
raise ValueError( raise ValueError(
'{} from different cuda device: [{}].' '{} from different cuda device: [{}].'
.format(key, ', '.join([str(d) for d in indices])) .format(key, ', '.join([str(d) for d in indices])))
)
return cpp.device(types[0], indices[0]) return cpp.device(types[0], indices[0])
...@@ -53,7 +53,7 @@ class Optimizer(object): ...@@ -53,7 +53,7 @@ class Optimizer(object):
---------- ----------
params : Sequence[dragon.vm.torch.nn.Parameter] params : Sequence[dragon.vm.torch.nn.Parameter]
The parameters to optimize. The parameters to optimize.
defaults : Dict defaults : dict
The pre-defined default hyper-parameters. The pre-defined default hyper-parameters.
""" """
...@@ -73,29 +73,39 @@ class Optimizer(object): ...@@ -73,29 +73,39 @@ class Optimizer(object):
self._process_group = distributed.get_group() self._process_group = distributed.get_group()
self._shared_args = {} self._shared_args = {}
def accumulate_grad(self): def accumulate(self, momentum):
"""Accumulate all gradients. """Accumulate the gradient of params.
Call this method after a ``backward`` pass: Call this method after each ``backward`` pass:
```python ```python
x = torch.ones(1, 3, requires_grad=True) x = torch.ones(1, requires_grad=True)
for i in range(10): optimizer = torch.optim.SGD([x], lr=0.1)
y = x + 1 for epoch in range(2):
y.backward() for step in range(3):
optimizer.accumulate_grad() y = x + 1
optimizer.step() y.backward()
# Note to zero the accumulation at the first step
optimizer.accumulate(momentum=1 if step > 0 else 0)
optimizer.step()
print(x) # 0.4
``` ```
Parameters
----------
momentum : float, required
The momentum applied to the accumulated value.
""" """
grads = [] grads = []
current_ws = workspace.get_workspace()
for group in self.param_groups: for group in self.param_groups:
for p in group['params']: group['_internal/grad_accum'] = True
g = self._steal_grad(p) for param in group['params']:
if g is not None: grad = self._steal_grad(current_ws, param)
grads.append(g) if grad is not None:
p.__accumulating__ = True grads.append(grad)
training_funcs.grad_accumulate(grads) training_funcs.accumulate_grad(grads, momentum)
def add_param_group(self, param_group): def add_param_group(self, param_group):
"""Add a new param group into the optimizer. """Add a new param group into the optimizer.
...@@ -120,7 +130,7 @@ class Optimizer(object): ...@@ -120,7 +130,7 @@ class Optimizer(object):
Parameters Parameters
---------- ----------
param_group : Dict param_group : dict
The param group to add. The param group to add.
""" """
...@@ -137,10 +147,7 @@ class Optimizer(object): ...@@ -137,10 +147,7 @@ class Optimizer(object):
for param in param_group['params']: for param in param_group['params']:
if not param.requires_grad: if not param.requires_grad:
raise ValueError( raise ValueError("Optimize a parameter that doesn't require grad.")
"Optimizing a parameter that "
"doesn't require gradients."
)
for name, default in self.defaults.items(): for name, default in self.defaults.items():
if default is required and name not in param_group: if default is required and name not in param_group:
...@@ -156,6 +163,9 @@ class Optimizer(object): ...@@ -156,6 +163,9 @@ class Optimizer(object):
param_group['name'] = 'Optimizer_{}'.format( param_group['name'] = 'Optimizer_{}'.format(
Optimizer._DEFAULT_UNIQUE_HANDLE_INDEX) Optimizer._DEFAULT_UNIQUE_HANDLE_INDEX)
if '_internal/grad_accum' not in param_group:
param_group['_internal/grad_accum'] = False
param_set = set() param_set = set()
for group in self.param_groups: for group in self.param_groups:
param_set.update(set(group['params'])) param_set.update(set(group['params']))
...@@ -179,11 +189,13 @@ class Optimizer(object): ...@@ -179,11 +189,13 @@ class Optimizer(object):
``` ```
""" """
current_ws = workspace.get_workspace()
for group in self.param_groups: for group in self.param_groups:
self._run_updates(group) self._run_updates(current_ws, group)
group['_internal/grad_accum'] = False
def zero_grad(self, reset=False): def zero_grad(self, reset=False):
"""Set all gradients to zeros. """Set the gradient of params to zero.
This method is not necessary usually, as we will overwrite This method is not necessary usually, as we will overwrite
the gradients in the next computation. the gradients in the next computation.
...@@ -201,6 +213,7 @@ class Optimizer(object): ...@@ -201,6 +213,7 @@ class Optimizer(object):
x += m2(x) x += m2(x)
optimizer.zero_grad(reset=True) optimizer.zero_grad(reset=True)
x.backward() x.backward()
optimizer.step()
``` ```
Parameters Parameters
...@@ -209,37 +222,26 @@ class Optimizer(object): ...@@ -209,37 +222,26 @@ class Optimizer(object):
**True** to reset the memory instead of zeroing. **True** to reset the memory instead of zeroing.
""" """
current_ws = workspace.get_workspace()
for group in self.param_groups: for group in self.param_groups:
for p in group['params']: for param in group['params']:
g = self._steal_grad(p, p.__accumulating__) grad = self._steal_grad(current_ws, param)
p.__accumulating__ = False if grad is not None:
if g is not None: current_ws.reset_tensor(grad) if reset else grad.zero_()
if reset:
workspace.reset_tensor(g)
else:
g.zero_()
def _init_set_defaults(self, group):
"""Initialize the defaults into current workspace."""
template = '/share/hyper/%s/{}' % group['name']
for k, v in group.items():
if k in self._shared_args:
workspace.feed_tensor(
template.format(self._shared_args[k]),
v, dtype='float32', enforce_cpu=True)
def _run_updates(self, group): def _run_updates(self, ws, group):
"""Run updates for the parameter group.""" """Run updates for the parameter group."""
# Collect params and grads. # Collect params and grads.
params, grads = [], [] params, grads = [], []
grad_accum = group['_internal/grad_accum']
for p in group['params']: for p in group['params']:
g = self._steal_grad(p, p.__accumulating__) g = self._steal_grad(ws, p, grad_accum)
if g is not None: if g is not None:
params.append(p) params.append(p)
grads.append(g) grads.append(g)
# Reset the shared defaults. # Reset the shared defaults.
self._init_set_defaults(group) self._reset_defaults(ws, group)
# Accumulate grads from the current process group. # Accumulate grads from the current process group.
if self._process_group is not None: if self._process_group is not None:
...@@ -251,7 +253,7 @@ class Optimizer(object): ...@@ -251,7 +253,7 @@ class Optimizer(object):
# Apply the specific update. # Apply the specific update.
for p, g in zip(params, grads): for p, g in zip(params, grads):
training_funcs.param_update( training_funcs.update_param(
p, g, p, g,
op_type=self._op_type, op_type=self._op_type,
op_handle=group['name'], op_handle=group['name'],
...@@ -259,16 +261,24 @@ class Optimizer(object): ...@@ -259,16 +261,24 @@ class Optimizer(object):
decay_mult=group.get('decay_mult', 1), decay_mult=group.get('decay_mult', 1),
) )
def _reset_defaults(self, ws, group):
"""Reset the defaults to backend."""
template = '/share/hyper/%s/{}' % group['name']
for name, value in group.items():
if name in self._shared_args:
ws.feed_tensor(
tensor=template.format(self._shared_args[name]),
value=value,
dtype='float32',
enforce_cpu=True,
)
@staticmethod @staticmethod
def _steal_grad(param, accumulating=False): def _steal_grad(ws, param, grad_accum=False):
"""Steal the grad tensor if existing.""" """Steal the grad from backend."""
grad_id = param.id + ('_grad[acc]' if accumulating else '_grad') impl = ws.GetTensor(param.id + ('_grad[accum]' if grad_accum else '_grad'))
if workspace.has_tensor(grad_id): if impl is not None:
return Tensor( return Tensor(device=param.device, impl=impl)
id=grad_id,
own_storage=False,
device=param.device,
)
return None return None
def __repr__(self): def __repr__(self):
......
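`_reset_defaults` above writes every shared hyper-parameter into a fixed workspace slot named after the param group, which the backend update op is expected to read. A minimal sketch of the same call; the group name `Optimizer_1` and the arg name `base_lr` (mirroring the tests earlier in this diff) are illustrative:

```python
from dragon.core.framework import workspace

current_ws = workspace.get_workspace()
current_ws.feed_tensor(
    '/share/hyper/Optimizer_1/base_lr', 0.01,
    dtype='float32', enforce_cpu=True)
```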
...@@ -88,8 +88,7 @@ def _save(obj, f, pickle_module, pickle_protocol): ...@@ -88,8 +88,7 @@ def _save(obj, f, pickle_module, pickle_protocol):
def save(obj, f, pickle_module=PICKLE_MODULE, pickle_protocol=DEFAULT_PROTOCOL): def save(obj, f, pickle_module=PICKLE_MODULE, pickle_protocol=DEFAULT_PROTOCOL):
return _with_file_like( return _with_file_like(
f, "wb", lambda f: _save(obj, f, pickle_module, pickle_protocol) f, "wb", lambda f: _save(obj, f, pickle_module, pickle_protocol))
)
def _load(f, map_location=None, pickle_module=six.moves.pickle, file=None): def _load(f, map_location=None, pickle_module=six.moves.pickle, file=None):
......
...@@ -14,13 +14,13 @@ from __future__ import division ...@@ -14,13 +14,13 @@ from __future__ import division
from __future__ import print_function from __future__ import print_function
import numpy import numpy
import warnings
from dragon.core.framework import config from dragon.core.framework import config
from dragon.core.framework import context from dragon.core.framework import context
from dragon.core.framework import mapping from dragon.core.framework import mapping
from dragon.core.framework import proto_util from dragon.core.framework import proto_util
from dragon.core.framework import workspace from dragon.core.framework import workspace
from dragon.core.util import math_util
from dragon.core.util import six from dragon.core.util import six
from dragon.vm.torch import cpp from dragon.vm.torch import cpp
...@@ -67,40 +67,24 @@ class Tensor(object): ...@@ -67,40 +67,24 @@ class Tensor(object):
""" """
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
# Internal properties self._tape = None
self._id = kwargs.get('id', None) self._gc = kwargs.get('gc', None)
self._impl = kwargs.get('impl', None)
self._device = kwargs.get('device', cpp.device()) self._device = kwargs.get('device', cpp.device())
self._requires_grad = kwargs.get('requires_grad', False) self._requires_grad = kwargs.get('requires_grad', False)
self._own_storage = kwargs.get('own_storage', True)
self._const_size = None # Attribute to represent a leaf variable
self._ignored_grads = set() # Blacklist of the non-gradient variables
self.__tape__ = None # Instance tape to record operations
self.__accumulating__ = False # Flag for gradient accumulating
# Constructor
if len(args) == 0: if len(args) == 0:
# >>> Empty tensor self._is_leaf = False
if self._id is not None:
ws = workspace.get_workspace()
self.__gc__ = ws.collectors.TENSOR
self._impl = ws.CreateTensor(self._id)
else:
self.__gc__ = None
elif len(args) == 1: elif len(args) == 1:
if isinstance(args[0], (list, tuple)): if isinstance(args[0], (list, tuple)):
# >>> torch.Tensor(sequence)
dtype = kwargs.get('dtype', 'float32') dtype = kwargs.get('dtype', 'float32')
self._from_numpy(numpy.array(args[0], dtype=dtype), copy=False) self._from_numpy(numpy.array(args[0], dtype=dtype), copy=False)
elif isinstance(args[0], numpy.ndarray): elif isinstance(args[0], numpy.ndarray):
# >>> torch.Tensor(array)
self._from_numpy(args[0], copy=kwargs.get('copy', True)) self._from_numpy(args[0], copy=kwargs.get('copy', True))
else: else:
# >>> torch.Tensor(size)
if not isinstance(args[0], six.integer_types): if not isinstance(args[0], six.integer_types):
raise ValueError('Expected an integer as size.') raise ValueError('Expected an integer as size.')
self._from_shape([args[0]], kwargs.get('dtype', 'float32')) self._from_shape([args[0]], kwargs.get('dtype', 'float32'))
else: else:
# >>> torch.Tensor(*sizes)
if not all(isinstance(arg, six.integer_types) for arg in args): if not all(isinstance(arg, six.integer_types) for arg in args):
raise ValueError('Expected integer(s) as sizes.') raise ValueError('Expected integer(s) as sizes.')
self._from_shape(args, kwargs.get('dtype', 'float32')) self._from_shape(args, kwargs.get('dtype', 'float32'))
...@@ -115,7 +99,7 @@ class Tensor(object): ...@@ -115,7 +99,7 @@ class Tensor(object):
The data tensor. The data tensor.
""" """
return Tensor(device=self.device, id=self._id, own_storage=False) return Tensor(device=self.device, impl=self._impl)
@property @property
def dtype(self): def dtype(self):
...@@ -143,7 +127,7 @@ class Tensor(object): ...@@ -143,7 +127,7 @@ class Tensor(object):
@property @property
def grad(self): def grad(self):
"""Return a grad reference if gradient had be computed. """Return the grad of this tensor if computed.
Returns Returns
------- -------
...@@ -151,14 +135,11 @@ class Tensor(object): ...@@ -151,14 +135,11 @@ class Tensor(object):
The grad tensor. The grad tensor.
""" """
grad_id = self._id + '_grad' if self._requires_grad and self._gc:
grad_impl = workspace.get_workspace().GetTensor(grad_id) impl = self._gc._workspace.GetTensor(self.id + '_grad')
if grad_impl is None: if impl is not None:
return None return Tensor(device=self.device, impl=impl)
grad_ref = Tensor(own_storage=False) return None
grad_ref._device = cpp.device(*self._impl.device)
grad_ref._id, grad_ref._impl = grad_id, grad_impl
return grad_ref
@property @property
def grad_fn(self): def grad_fn(self):
...@@ -174,7 +155,19 @@ class Tensor(object): ...@@ -174,7 +155,19 @@ class Tensor(object):
The identity. The identity.
""" """
return self._id return self._impl.name
@property
def is_leaf(self):
"""Return whether tensor is a leaf.
Returns
-------
bool
**True** if this is a leaf tensor otherwise **False**.
"""
return self._is_leaf or not self._requires_grad
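Under this definition a directly constructed tensor is a leaf, an op output that requires grad is not, and tensors that do not require grad always count as leaves. A quick illustration (shapes are illustrative):

```python
import dragon.vm.torch as torch

x = torch.ones(1, requires_grad=True)   # constructed directly -> leaf
y = x + 1                               # op output requiring grad -> non-leaf
z = torch.ones(1)                       # no grad required -> leaf
print(x.is_leaf, y.is_leaf, z.is_leaf)  # True False True
```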
@property @property
def requires_grad(self): def requires_grad(self):
...@@ -191,9 +184,6 @@ class Tensor(object): ...@@ -191,9 +184,6 @@ class Tensor(object):
@requires_grad.setter @requires_grad.setter
def requires_grad(self, value): def requires_grad(self, value):
self._requires_grad = value self._requires_grad = value
if self._const_size is not None:
self._ignored_grads = set() if value \
else {self._id + '_grad'}
@property @property
def shape(self): def shape(self):
...@@ -207,6 +197,11 @@ class Tensor(object): ...@@ -207,6 +197,11 @@ class Tensor(object):
""" """
return self.size() return self.size()
@property
def volatile(self):
warnings.warn('Attribute ``volatile`` was removed (always False).', stacklevel=2)
return False
def abs(self): def abs(self):
r"""Return a tensor with the absolute value. r"""Return a tensor with the absolute value.
...@@ -268,18 +263,17 @@ class Tensor(object): ...@@ -268,18 +263,17 @@ class Tensor(object):
""" """
pass pass
def backward(self, gradient=None): def backward(self, gradient=None, retain_graph=False):
"""Compute the gradients starting from this tensor. """Compute the derivatives of this tensor w.r.t. graph leaves.
If ``gradient`` is not provided, **ones** will be used instead.
Parameters Parameters
--------- ----------
gradient : dragon.vm.torch.Tensor, optional gradient : dragon.vm.torch.Tensor, optional
The optional input gradient. The optional gradient of this tensor.
retain_graph : bool, optional, default=False
**False** to free the graph used to compute grad.
""" """
pass
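A minimal end-to-end sketch of the new signature, assuming eager execution and that a gradient seed matching the output's shape is required for non-scalar outputs, as in PyTorch:

```python
import numpy
from dragon.vm import torch  # assumed import path

x = torch.Tensor(numpy.ones((2, 2), 'float32'))
x.requires_grad = True
y = x.add(x)                                        # a simple differentiable op
seed = torch.Tensor(numpy.ones((2, 2), 'float32'))  # gradient seed for y
y.backward(gradient=seed)                           # pass retain_graph=True to backprop again
print(x.grad)                                       # d(y)/d(x), seeded with ones
```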
def bitwise_not(self): def bitwise_not(self):
r"""Compute the element-wise NOT bitwise operation. r"""Compute the element-wise NOT bitwise operation.
...@@ -546,9 +540,6 @@ class Tensor(object): ...@@ -546,9 +540,6 @@ class Tensor(object):
src._device.index src._device.index
), ),
) )
# Transfer the const size if necessary
self._const_size = src.size() \
if self._const_size else None
return self return self
def cos(self): def cos(self):
...@@ -1506,6 +1497,11 @@ class Tensor(object): ...@@ -1506,6 +1497,11 @@ class Tensor(object):
""" """
pass pass
def retain_grad(self):
"""Retain grad for the non-leaf tensor."""
if self._tape:
self._tape.add_source(self.id)
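An illustrative use, assuming non-leaf gradients are discarded by default and that ``retain_grad`` asks the recording tape to keep them:

```python
import numpy
from dragon.vm import torch  # assumed import path

x = torch.Tensor(numpy.ones((3,), 'float32'))
x.requires_grad = True
y = x.add(x)           # non-leaf intermediate
y.retain_grad()        # ask the tape to also keep y's gradient
z = y.abs()
z.backward(gradient=torch.Tensor(numpy.ones((3,), 'float32')))
print(x.grad, y.grad)  # both expected to be populated after the backward pass
```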
def round(self): def round(self):
r"""Return a tensor taken the round of elements. r"""Return a tensor taken the round of elements.
...@@ -1934,9 +1930,6 @@ class Tensor(object): ...@@ -1934,9 +1930,6 @@ class Tensor(object):
""" """
pass pass
def volatile(self):
pass
def zero_(self): def zero_(self):
r"""Fill self with constant 0. r"""Fill self with constant 0.
...@@ -1954,20 +1947,16 @@ class Tensor(object): ...@@ -1954,20 +1947,16 @@ class Tensor(object):
"""Create impl from the numpy array.""" """Create impl from the numpy array."""
ws = workspace.get_workspace() ws = workspace.get_workspace()
array = array.copy() if copy else array array = array.copy() if copy else array
self._const_size = array.size self._gc, self._is_leaf = ws.collectors.TENSOR, True
self.__gc__ = ws.collectors.TENSOR self._impl = ws.create_tensor(self._gc.alloc(
self._id = self.__gc__.alloc(context.get_eager_scope()) context.get_eager_scope())).FromNumpy(array)
self._impl = ws.CreateTensor(self._id).FromNumpy(array)
self.requires_grad = self._requires_grad
def _from_shape(self, shape, dtype): def _from_shape(self, shape, dtype):
"""Create impl from the shape and data type.""" """Create impl from the shape and data type."""
ws = workspace.get_workspace() ws = workspace.get_workspace()
self._const_size = math_util.prod(shape) self._gc, self._is_leaf = ws.collectors.TENSOR, True
self.__gc__ = ws.collectors.TENSOR self._impl = ws.create_tensor(self._gc.alloc(
self._id = self.__gc__.alloc(context.get_eager_scope()) context.get_eager_scope())).FromShape(shape, dtype)
self._impl = ws.CreateTensor(self._id).FromShape(shape, dtype)
self.requires_grad = self._requires_grad
def _type2str(self): def _type2str(self):
"""Return the tensor type string.""" """Return the tensor type string."""
...@@ -1977,12 +1966,10 @@ class Tensor(object): ...@@ -1977,12 +1966,10 @@ class Tensor(object):
return self.add(other) return self.add(other)
def __del__(self): def __del__(self):
if not self._requires_grad or self._const_size: if self.is_leaf and self._gc:
if self._own_storage and self._id: # Always reuse the leaf tensors.
# Always reuse the leaf variables or tensors # PyGC will detect them automatically.
# that do not require grad. self._gc.collect(self.id)
# PyGC will detect them automatically.
self.__gc__.collect(self._id)
def __div__(self, other): def __div__(self, other):
return self.div(other) return self.div(other)
......
...@@ -32,13 +32,13 @@ def from_dlpack(dlpack): ...@@ -32,13 +32,13 @@ def from_dlpack(dlpack):
The tensor with the dlpack data. The tensor with the dlpack data.
""" """
ws = workspace.get_workspace() current_ws = workspace.get_workspace()
ref = Tensor(device=None) # Hack the constructor. tensor = Tensor(device=None)
ref.__gc__ = ws.collectors.TENSOR tensor._gc = current_ws.collectors.TENSOR
ref._id = ref.__gc__.alloc('${DLPACK}') tensor._impl = current_ws.create_tensor(
ref._impl = ws.CreateTensor(ref._id).FromDLPack(dlpack) tensor._gc.alloc('${DLPACK}')).FromDLPack(dlpack)
ref._device = cpp.device(*ref._impl.device) tensor._device = cpp.device(*tensor._impl.device)
return ref return tensor
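A round-trip sketch pairing this helper with ``to_dlpack`` defined just below; both names come from this module, while the import path and tensor ops are assumptions:

```python
import numpy
from dragon.vm import torch  # assumed import path

x = torch.Tensor(numpy.arange(6, dtype='float32'))
capsule = to_dlpack(x, readonly=True)  # export the tensor as a DLPack capsule
y = from_dlpack(capsule)               # wrap the capsule in a new Tensor
print(y.device, y.size())              # same data; device is taken from the capsule
```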
def to_dlpack(tensor, readonly=True): def to_dlpack(tensor, readonly=True):
......