Commit 52402169 by Ting PAN

Clean the torch operators

1 parent d0fa332c
Showing with 2548 additions and 2233 deletions
......@@ -55,7 +55,12 @@ List Brief
`Tensor.__div__`_ x.__div__(y) <=> x / y
`Tensor.__rdiv__`_ x.__rdiv__(y) <=> y / x
`Tensor.__neg__`_ x.__neg__() <=> -x
`Tensor.__str__`_              Return the information (name/shape).
`Tensor.__gt__`_               x.__gt__(y) <=> x > y
`Tensor.__ge__`_               x.__ge__(y) <=> x >= y
`Tensor.__lt__`_               x.__lt__(y) <=> x < y
`Tensor.__le__`_               x.__le__(y) <=> x <= y
`Tensor.__eq__`_               x.__eq__(y) <=> x == y
`Tensor.__repr__`_             Return the information (name/shape).
`Tensor.__getitem__`_ Return a Tensor with specific indices.
`Tensor.__call__`_ Return the expressions for displaying.
============================== =============================================================================
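For reference, a minimal usage sketch of the comparison overloads listed above (an illustration only, assuming the usual dragon.Tensor workflow; per the docstrings in this commit, each expression builds a 'Compare' operator and returns a new bool tensor):

    import dragon

    a = dragon.Tensor('a', shape=[4], dtype='float32').Variable()
    b = dragon.Tensor('b', shape=[4], dtype='float32').Variable()

    # Each of these emits a 'Compare' op (operation='GT'/'GE'/'EQ')
    # and yields a symbolic bool tensor rather than evaluating eagerly.
    gt, ge, eq = a > b, a >= b, a == b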
......@@ -70,6 +75,24 @@ API Reference
:members:
.. automethod:: __init__
.. automethod:: __add__
.. automethod:: __radd__
.. automethod:: __sub__
.. automethod:: __rsub__
.. automethod:: __mul__
.. automethod:: __rmul__
.. automethod:: __div__
.. automethod:: __rdiv__
.. automethod:: __neg__
.. automethod:: __gt__
.. automethod:: __ge__
.. automethod:: __lt__
.. automethod:: __le__
.. automethod:: __eq__
.. automethod:: __repr__
.. automethod:: __getitem__
.. automethod:: __call__
.. _Tensor.Variable: #dragon.core.tensor.Tensor.Variable
.. _Tensor.Placeholder: #dragon.core.tensor.Tensor.Placeholder
......@@ -90,8 +113,12 @@ API Reference
.. _Tensor.__div__: #dragon.core.tensor.Tensor.__div__
.. _Tensor.__rdiv__: #dragon.core.tensor.Tensor.__rdiv__
.. _Tensor.__neg__: #dragon.core.tensor.Tensor.__neg__
.. _Tensor.__str__: #dragon.core.tensor.Tensor.__str__
.. _Tensor.__getattr__: #dragon.core.tensor.Tensor.__getattr__
.. _Tensor.__gt__: #dragon.core.tensor.Tensor.__gt__
.. _Tensor.__ge__: #dragon.core.tensor.Tensor.__ge__
.. _Tensor.__lt__: #dragon.core.tensor.Tensor.__lt__
.. _Tensor.__le__: #dragon.core.tensor.Tensor.__le__
.. _Tensor.__eq__: #dragon.core.tensor.Tensor.__eq__
.. _Tensor.__repr__: #dragon.core.tensor.Tensor.__repr__
.. _Tensor.__getitem__: #dragon.core.tensor.Tensor.__getitem__
.. _Tensor.__call__: #dragon.core.tensor.Tensor.__call__
......
......@@ -11,7 +11,7 @@ Common
operators/data
operators/initializer
operators/arithmetic
operators/ndarray
operators/array
operators/control_flow
operators/misc
operators/mpi
......
============
:mod:`Array`
============
.. toctree::
:hidden:
.. automodule:: dragon.operators.array
:members:
.. _ops.Reduce(*args, **kwargs): #dragon.operators.array.Reduce
\ No newline at end of file
==============
:mod:`NDArray`
==============
.. toctree::
:hidden:
.. automodule:: dragon.operators.ndarray
:members:
.. _ops.Reduce(*args, **kwargs): #dragon.operators.ndarray.Reduce
\ No newline at end of file
......@@ -129,8 +129,8 @@ List Brief
`L2Norm`_ L2 Normalization. `[Liu et.al, 2015] <https://arxiv.org/abs/1506.04579>`_.
================== ======================================================================
NDArray
-------
Array
-----
=============== ======================================================================
List Brief
=============== ======================================================================
......@@ -157,6 +157,7 @@ List Brief
`ExpandDims`_ Expand the new dimension with size 1 to specific axis.
`Shape`_ Get the dynamic shape of a Tensor.
`Arange`_ Return evenly spaced values within a given interval.
`Multinomial`_ Return indices sampled from the multinomial distribution.
=============== ======================================================================
Control Flow
......@@ -167,7 +168,9 @@ List Brief
`Copy`_ Copy A to B.
`Equal`_        *Equal* comparison between A and B.
`Less`_         *Less* comparison between A and B.
`LessEqual`_    *LessEqual* comparison between A and B.
`Greater`_      *Greater* comparison between A and B.
`GreaterEqual`_ *GreaterEqual* comparison between A and B.
=============== ======================================================================
Misc
......@@ -277,34 +280,37 @@ List Brief
.. _InstanceNorm: operators/norm.html#dragon.operators.norm.InstanceNorm
.. _L2Norm: operators/norm.html#dragon.operators.norm.L2Norm
.. _Gather: operators/ndarray.html#dragon.operators.ndarray.Gather
.. _Crop: operators/ndarray.html#dragon.operators.ndarray.Crop
.. _Reduce: operators/ndarray.html#dragon.operators.ndarray.Reduce
.. _Sum: operators/ndarray.html#dragon.operators.ndarray.Sum
.. _Mean: operators/ndarray.html#dragon.operators.ndarray.Mean
.. _Max: operators/ndarray.html#dragon.operators.ndarray.Max
.. _ArgMax: operators/ndarray.html#dragon.operators.ndarray.ArgMax
.. _Min: operators/ndarray.html#dragon.operators.ndarray.Min
.. _ArgMin: operators/ndarray.html#dragon.operators.ndarray.ArgMin
.. _Slice: operators/ndarray.html#dragon.operators.ndarray.Slice
.. _Stack: operators/ndarray.html#dragon.operators.ndarray.Stack
.. _Concat: operators/ndarray.html#dragon.operators.ndarray.Concat
.. _Transpose: operators/ndarray.html#dragon.operators.ndarray.Transpose
.. _Repeat: operators/ndarray.html#dragon.operators.ndarray.Repeat
.. _Tile: operators/ndarray.html#dragon.operators.ndarray.Tile
.. _Pad: operators/ndarray.html#dragon.operators.ndarray.Pad
.. _OneHot: operators/ndarray.html#dragon.operators.ndarray.OneHot
.. _Flatten: operators/ndarray.html#dragon.operators.ndarray.Flatten
.. _Reshape: operators/ndarray.html#dragon.operators.ndarray.Reshape
.. _Squeeze: operators/ndarray.html#dragon.operators.ndarray.Squeeze
.. _ExpandDims: operators/ndarray.html#dragon.operators.ndarray.ExpandDims
.. _Shape: operators/ndarray.html#dragon.operators.ndarray.Shape
.. _Arange: operators/ndarray.html#dragon.operators.ndarray.Arange
.. _Gather: operators/array.html#dragon.operators.array.Gather
.. _Crop: operators/array.html#dragon.operators.array.Crop
.. _Reduce: operators/array.html#dragon.operators.array.Reduce
.. _Sum: operators/array.html#dragon.operators.array.Sum
.. _Mean: operators/array.html#dragon.operators.array.Mean
.. _Max: operators/array.html#dragon.operators.array.Max
.. _ArgMax: operators/array.html#dragon.operators.array.ArgMax
.. _Min: operators/array.html#dragon.operators.array.Min
.. _ArgMin: operators/array.html#dragon.operators.array.ArgMin
.. _Slice: operators/array.html#dragon.operators.array.Slice
.. _Stack: operators/array.html#dragon.operators.array.Stack
.. _Concat: operators/array.html#dragon.operators.array.Concat
.. _Transpose: operators/array.html#dragon.operators.array.Transpose
.. _Repeat: operators/array.html#dragon.operators.array.Repeat
.. _Tile: operators/array.html#dragon.operators.array.Tile
.. _Pad: operators/array.html#dragon.operators.array.Pad
.. _OneHot: operators/array.html#dragon.operators.array.OneHot
.. _Flatten: operators/array.html#dragon.operators.array.Flatten
.. _Reshape: operators/array.html#dragon.operators.array.Reshape
.. _Squeeze: operators/array.html#dragon.operators.array.Squeeze
.. _ExpandDims: operators/array.html#dragon.operators.array.ExpandDims
.. _Shape: operators/array.html#dragon.operators.array.Shape
.. _Arange: operators/array.html#dragon.operators.array.Arange
.. _Multinomial: operators/array.html#dragon.operators.array.Multinomial
.. _Copy: operators/control_flow.html#dragon.operators.control_flow.Copy
.. _Equal: operators/control_flow.html#dragon.operators.control_flow.Equal
.. _Less: operators/control_flow.html#dragon.operators.control_flow.Less
.. _LessEqual: operators/control_flow.html#dragon.operators.control_flow.LessEqual
.. _Greater: operators/control_flow.html#dragon.operators.control_flow.Greater
.. _GreaterEqual: operators/control_flow.html#dragon.operators.control_flow.GreaterEqual
.. _Cast: operators/misc.html#dragon.operators.misc.Cast
.. _Run: operators/misc.html#dragon.operators.misc.Run
......
......@@ -10,8 +10,8 @@
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_NDARRAY_ARGMAX_OP_H_
#define DRAGON_OPERATORS_NDARRAY_ARGMAX_OP_H_
#ifndef DRAGON_OPERATORS_ARRAY_ARGMAX_OP_H_
#define DRAGON_OPERATORS_ARRAY_ARGMAX_OP_H_
#include "core/operator.h"
......@@ -46,4 +46,4 @@ DEFINE_ARGUMENT_WITH_DESC(int64_t, ArangeOp, step);
} // namespace dragon
#endif // DRAGON_OPERATORS_NDARRAY_ARANGE_OP_H_
\ No newline at end of file
#endif // DRAGON_OPERATORS_ARRAY_ARANGE_OP_H_
\ No newline at end of file
......@@ -10,8 +10,8 @@
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_NDARRAY_ARGREDUCE_OP_H_
#define DRAGON_OPERATORS_NDARRAY_ARGREDUCE_OP_H_
#ifndef DRAGON_OPERATORS_ARRAY_ARGREDUCE_OP_H_
#define DRAGON_OPERATORS_ARRAY_ARGREDUCE_OP_H_
#include "core/operator.h"
......@@ -39,4 +39,4 @@ class ArgReduceOp final : public Operator<Context> {
} // namespace dragon
#endif // DRAGON_OPERATORS_NDARRAY_ARGREDUCE_OP_H_
\ No newline at end of file
#endif // DRAGON_OPERATORS_ARRAY_ARGREDUCE_OP_H_
\ No newline at end of file
......@@ -10,8 +10,8 @@
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_NDARRAY_CONCAT_OP_H_
#define DRAGON_OPERATORS_NDARRAY_CONCAT_OP_H_
#ifndef DRAGON_OPERATORS_ARRAY_CONCAT_OP_H_
#define DRAGON_OPERATORS_ARRAY_CONCAT_OP_H_
#include "core/operator.h"
......@@ -55,4 +55,4 @@ class ConcatGradientOp : public Operator<Context> {
} // namespace dragon
#endif // DRAGON_OPERATORS_NDARRAY_CONCAT_OP_H_
\ No newline at end of file
#endif // DRAGON_OPERATORS_ARRAY_CONCAT_OP_H_
\ No newline at end of file
......@@ -10,8 +10,8 @@
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_NDARRAY_CROP_OP_H_
#define DRAGON_OPERATORS_NDARRAY_CROP_OP_H_
#ifndef DRAGON_OPERATORS_ARRAY_CROP_OP_H_
#define DRAGON_OPERATORS_ARRAY_CROP_OP_H_
#include "core/operator.h"
......@@ -63,4 +63,4 @@ class CropGradientOp final : public Operator<Context> {
} // namespace dragon
#endif // DRAGON_OPERATORS_NDARRAY_CROP_OP_H_
\ No newline at end of file
#endif // DRAGON_OPERATORS_ARRAY_CROP_OP_H_
\ No newline at end of file
......@@ -10,8 +10,8 @@
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_NDARRAY_DIMENSION_OP_H_
#define DRAGON_OPERATORS_NDARRAY_DIMENSION_OP_H_
#ifndef DRAGON_OPERATORS_ARRAY_DIMENSION_OP_H_
#define DRAGON_OPERATORS_ARRAY_DIMENSION_OP_H_
#include "core/operator.h"
......@@ -151,4 +151,4 @@ DEFINE_DIMENSION_GRADIENT_OP(Squeeze);
} // namespace dragon
#endif // DRAGON_OPERATORS_NDARRAY_RESHAPE_OP_H_
\ No newline at end of file
#endif // DRAGON_OPERATORS_ARRAY_RESHAPE_OP_H_
\ No newline at end of file
......@@ -10,8 +10,8 @@
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_NDARRAY_GATHER_OP_H_
#define DRAGON_OPERATORS_NDARRAY_GATHER_OP_H_
#ifndef DRAGON_OPERATORS_ARRAY_GATHER_OP_H_
#define DRAGON_OPERATORS_ARRAY_GATHER_OP_H_
#include "core/operator.h"
......@@ -52,4 +52,4 @@ class GatherGradientOp final : public Operator<Context> {
} // namespace dragon
#endif // DRAGON_OPERATORS_NDARRAY_GATHER_OP_H_
\ No newline at end of file
#endif // DRAGON_OPERATORS_ARRAY_GATHER_OP_H_
\ No newline at end of file
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_ARRAY_MULTINOMIAL_OP_H_
#define DRAGON_OPERATORS_ARRAY_MULTINOMIAL_OP_H_
#include "core/operator.h"
namespace dragon {
template <class Context>
class MultinomialOp final : public Operator<Context> {
public:
MultinomialOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
normalize(OperatorBase::Arg<int64_t>("normalize", 0)),
num_samples(OperatorBase::Arg<int64_t>("num_samples", 1)) {}
USE_OPERATOR_FUNCTIONS;
void SoftmaxRun();
void RunOnDevice() override;
template <typename T> void RunWithType();
protected:
Tensor* prob;
int64_t normalize, num_samples, outer_dim, axis;
unique_ptr<OperatorBase> softmax_op;
};
} // namespace dragon
#endif // DRAGON_OPERATORS_ARRAY_MULTINOMIAL_OP_H_
\ No newline at end of file
......@@ -10,8 +10,8 @@
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_NDARRAY_ONE_HOT_OP_H_
#define DRAGON_OPERATORS_NDARRAY_ONE_HOT_OP_H_
#ifndef DRAGON_OPERATORS_ARRAY_ONE_HOT_OP_H_
#define DRAGON_OPERATORS_ARRAY_ONE_HOT_OP_H_
#include "core/operator.h"
......@@ -36,4 +36,4 @@ class OneHotOp final : public Operator < Context > {
} // namespace dragon
#endif // DRAGON_OPERATORS_NDARRAY_ONE_HOT_OP_H_
\ No newline at end of file
#endif // DRAGON_OPERATORS_ARRAY_ONE_HOT_OP_H_
\ No newline at end of file
......@@ -10,8 +10,8 @@
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_NDARRAY_PAD_OP_H_
#define DRAGON_OPERATORS_NDARRAY_PAD_OP_H_
#ifndef DRAGON_OPERATORS_ARRAY_PAD_OP_H_
#define DRAGON_OPERATORS_ARRAY_PAD_OP_H_
#include "core/operator.h"
......@@ -73,4 +73,4 @@ class PadGradientOp final : public Operator<Context> {
} // namespace dragon
#endif // DRAGON_OPERATORS_NDARRAY_PAD_OP_H_
\ No newline at end of file
#endif // DRAGON_OPERATORS_ARRAY_PAD_OP_H_
\ No newline at end of file
......@@ -10,8 +10,8 @@
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_NDARRAY_REDUCE_OP_H_
#define DRAGON_OPERATORS_NDARRAY_REDUCE_OP_H_
#ifndef DRAGON_OPERATORS_ARRAY_REDUCE_OP_H_
#define DRAGON_OPERATORS_ARRAY_REDUCE_OP_H_
#include "core/operator.h"
......@@ -59,4 +59,4 @@ class ReduceGradientOp final : public Operator<Context> {
} // namespace dragon
#endif // DRAGON_OPERATORS_NDARRAY_REDUCE_OP_H_
\ No newline at end of file
#endif // DRAGON_OPERATORS_ARRAY_REDUCE_OP_H_
\ No newline at end of file
......@@ -10,8 +10,8 @@
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_NDARRAY_REPEAT_OP_H_
#define DRAGON_OPERATORS_NDARRAY_REPEAT_OP_H_
#ifndef DRAGON_OPERATORS_ARRAY_REPEAT_OP_H_
#define DRAGON_OPERATORS_ARRAY_REPEAT_OP_H_
#include "core/operator.h"
......@@ -58,4 +58,4 @@ DEFINE_ARGUMENT_WITH_DESC(int64_t, RepeatGradientOp, repeats);
} // namespace dragon
#endif // DRAGON_OPERATORS_NDARRAY_REPEAT_OP_H_
\ No newline at end of file
#endif // DRAGON_OPERATORS_ARRAY_REPEAT_OP_H_
\ No newline at end of file
......@@ -10,8 +10,8 @@
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_NDARRAY_SHAPE_OP_H_
#define DRAGON_OPERATORS_NDARRAY_SHAPE_OP_H_
#ifndef DRAGON_OPERATORS_ARRAY_SHAPE_OP_H_
#define DRAGON_OPERATORS_ARRAY_SHAPE_OP_H_
#include "core/operator.h"
......@@ -27,4 +27,4 @@ class ShapeOp final : public Operator<Context> {
} // namespace dragon
#endif //DRAGON_OPERATORS_NDARRAY_SHAPE_OP_H_
\ No newline at end of file
#endif //DRAGON_OPERATORS_ARRAY_SHAPE_OP_H_
\ No newline at end of file
......@@ -10,8 +10,8 @@
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_NDARRAY_SLICE_OP_H_
#define DRAGON_OPERATORS_NDARRAY_SLICE_OP_H_
#ifndef DRAGON_OPERATORS_ARRAY_SLICE_OP_H_
#define DRAGON_OPERATORS_ARRAY_SLICE_OP_H_
#include "core/operator.h"
......@@ -55,4 +55,4 @@ class SliceGradientOp final : public Operator<Context> {
} // namespace dragon
#endif // DRAGON_OPERATORS_NDARRAY_SLICE_OP_H_
\ No newline at end of file
#endif // DRAGON_OPERATORS_ARRAY_SLICE_OP_H_
\ No newline at end of file
......@@ -10,8 +10,8 @@
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_NDARRAY_STACK_OP_H_
#define DRAGON_OPERATORS_NDARRAY_STACK_OP_H_
#ifndef DRAGON_OPERATORS_ARRAY_STACK_OP_H_
#define DRAGON_OPERATORS_ARRAY_STACK_OP_H_
#include "core/operator.h"
......@@ -50,4 +50,4 @@ class StackGradientOp final : public Operator<Context> {
} // namespace dragon
#endif // DRAGON_OPERATORS_NDARRAY_STACK_OP_H_
\ No newline at end of file
#endif // DRAGON_OPERATORS_ARRAY_STACK_OP_H_
\ No newline at end of file
......@@ -10,8 +10,8 @@
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_NDARRAY_TILE_OP_H_
#define DRAGON_OPERATORS_NDARRAY_TILE_OP_H_
#ifndef DRAGON_OPERATORS_ARRAY_TILE_OP_H_
#define DRAGON_OPERATORS_ARRAY_TILE_OP_H_
#include "core/operator.h"
......@@ -60,4 +60,4 @@ DEFINE_ARGUMENTS_WITH_DESC(int64_t, TileGradientOp, multiples);
} // namespace dragon
#endif // DRAGON_OPERATORS_NDARRAY_TILE_OP_H_
\ No newline at end of file
#endif // DRAGON_OPERATORS_ARRAY_TILE_OP_H_
\ No newline at end of file
......@@ -10,8 +10,8 @@
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_NDARRAY_TRANSPOSE_OP_H_
#define DRAGON_OPERATORS_NDARRAY_TRANSPOSE_OP_H_
#ifndef DRAGON_OPERATORS_ARRAY_TRANSPOSE_OP_H_
#define DRAGON_OPERATORS_ARRAY_TRANSPOSE_OP_H_
#include "core/operator.h"
......@@ -56,4 +56,4 @@ DEFINE_ARGUMENTS_WITH_DESC(int64_t, TransposeGradientOp, perm);
} // namespace dragon
#endif // DRAGON_OPERATORS_NDARRAY_TRANSPOSE_OP_H_
\ No newline at end of file
#endif // DRAGON_OPERATORS_ARRAY_TRANSPOSE_OP_H_
\ No newline at end of file
......@@ -30,7 +30,9 @@ class CompareOp final : public Operator<Context> {
void RunOnDevice() override;
template <typename T> void EqualRunWithType();
template <typename T> void LessRunWithType();
template <typename T> void LessEqualRunWithType();
template <typename T> void GreaterRunWithType();
template <typename T> void GreaterEqualRunWithType();
protected:
string operation;
......
......@@ -192,6 +192,6 @@ DEFINE_ARGUMENTS_WITH_DESC(int64_t, InitializeOp, dims);
DEFINE_ARGUMENTS_WITH_DESC(int64_t, FillOp, dims);
DEFINE_ARGUMENTS_WITH_DESC(int64_t, GivenTensorFillOp, dims);
} // namespace
} // namespace dragon
#endif // DRAGON_OPERATORS_MISC_INITIALIZE_OP_H_
\ No newline at end of file
......@@ -351,6 +351,14 @@ void Less(
Context* ctx);
template <typename T, class Context>
void LessEqual(
const int count,
const T* a,
const T* b,
bool* y,
Context* ctx);
template <typename T, class Context>
void Greater(
const int count,
const T* a,
......@@ -358,6 +366,14 @@ void Greater(
bool* y,
Context* ctx);
template <typename T, class Context>
void GreaterEqual(
const int count,
const T* a,
const T* b,
bool* y,
Context* ctx);
/*! loss.l1_loss */
template <typename T, class Context>
......@@ -574,7 +590,7 @@ void ImageData(
Ty* y,
Context* ctx);
/*! ndarray.arange */
/*! array.arange */
template <typename T, class Context>
void Arange(
......@@ -584,7 +600,7 @@ void Arange(
T* y,
Context* ctx);
/*! ndarray.argreduce */
/*! array.argreduce */
template <typename T, class Context>
void ArgMax(
......@@ -608,7 +624,7 @@ void ArgMin(
T* values,
Context* ctx);
/*! ndarray.gather */
/*! array.gather */
template <typename T, class Context>
void Gather(
......@@ -632,7 +648,7 @@ void GatherGrad(
T* dx,
Context* ctx);
/*! ndarray.concat */
/*! array.concat */
template <typename T, class Context>
void Concat(
......@@ -645,7 +661,7 @@ void Concat(
T* y,
Context* ctx);
/*! ndarray.crop */
/*! array.crop */
template <typename T, class Context>
void Crop(
......@@ -669,7 +685,7 @@ void CropGrad(
T* dx,
Context* ctx);
/*! ndarray.pad */
/*! array.pad */
template <typename T, class Context>
void ConstPad(
......@@ -708,7 +724,7 @@ void EdgePad(
T* y,
Context* ctx);
/*! ndarray.one_hot */
/*! array.one_hot */
template <typename T, class Context>
void OneHot(
......@@ -719,7 +735,7 @@ void OneHot(
T* y,
Context* ctx);
/*! ndarray.reduce */
/*! array.reduce */
template <typename T, class Context>
void ReduceSum(
......@@ -744,7 +760,7 @@ void ReduceSumGrad(
T* dx,
Context* ctx);
/*! ndarray.repeat */
/*! array.repeat */
template <typename T, class Context>
void Repeat(
......@@ -766,7 +782,7 @@ void RepeatGrad(
T* dx,
Context* ctx);
/*! ndarray.slice */
/*! array.slice */
template <typename T, class Context>
void Slice(
......@@ -790,7 +806,7 @@ void SliceGrad(
T* x,
Context* ctx);
/*! ndarray.tile */
/*! array.tile */
template <typename T, class Context>
void Tile(
......@@ -812,7 +828,7 @@ void TileGrad(
T* dx,
Context* ctx);
/*! ndarray.transpose */
/*! array.transpose */
template <typename T, class Context>
void Transpose(
......
......@@ -70,12 +70,6 @@ void OnImportModule() {
PYBIND11_MODULE(libdragon, m) {
/* ------------------------------------ *
* *
* Workspace *
* *
* ------------------------------------ */
/*! \brief Switch to the specific workspace */
m.def("SwitchWorkspace", &SwitchWorkspace);
......@@ -133,6 +127,7 @@ PYBIND11_MODULE(libdragon, m) {
g_workspaces[target_workspace]->Clear();
});
/*! \brief Copy the array data to the tensor */
m.def("FeedTensor", [](
const string& name,
pybind11::object value,
......@@ -150,6 +145,7 @@ PYBIND11_MODULE(libdragon, m) {
PyArrayObject*>(value.ptr()), tensor);
});
/*! \brief Copy the tensor data to the array */
m.def("FetchTensor", [](const string& name) {
if (!g_workspace->HasTensor(name))
LOG(FATAL) << "Tensor(" + name + ") "
......@@ -169,7 +165,7 @@ PYBIND11_MODULE(libdragon, m) {
}
});
/*! Misc */
/*! \brief Return a unique dummy name */
m.def("GetDummyName", [](
const string& basename,
const string& suffix,
......
......@@ -63,12 +63,6 @@ void AddProtoMethods(pybind11::module& m) {
[](OperatorDef* self, const vector<string>& output) {
*(self->mutable_output()) = { output.begin(), output.end() };
});
m.def("TestOperatorDefs", [](vector<OperatorDef*> defs) {
for (auto* def : defs) {
std::cout << def->DebugString() << std::endl;
}
});
}
} // namespace python
......
......@@ -27,7 +27,7 @@ void AddTensorMethods(pybind11::module& m) {
.def_property_readonly("size", &Tensor::size)
.def_property_readonly("dtype", [](Tensor* self) {
return TypeMetaToString(self->meta());
}).def_property_readonly("ctx", [](Tensor* self) {
}).def_property_readonly("device", [](Tensor* self) {
if (self->has_memory()) {
Map<string, string> mem_info = self->memory()->info();
return std::tuple<string, int>(
......
......@@ -41,7 +41,7 @@ from dragon.vm.theano.tensor import grad as grad
from dragon.core.scope import name_scope, get_default_name_scope
from dragon.core.scope import phase_scope, get_default_phase
from dragon.core.scope import device_scope, get_default_device
from dragon.core.scope import WorkspaceScope as workspace_scope
from dragon.core.scope import WorkspaceScope as ws_scope
# Version
from dragon.version import version
......
......@@ -20,8 +20,9 @@ import dragon.core.logging as logging
option = {}
# The current device, 'CPU', 'CUDA' or 'CNML'
option['device'] = 'CPU'
# The current device
# enumeration in ('cpu', 'cuda', 'cnml')
option['device'] = 'cpu'
# The device index
option['device_id'] = 0
......@@ -73,7 +74,7 @@ def EnableCPU():
"""
global option
option['device'] = 'CPU'
option['device'] = 'cpu'
def EnableCUDA(gpu_id=0, use_cudnn=True):
......@@ -92,7 +93,7 @@ def EnableCUDA(gpu_id=0, use_cudnn=True):
"""
global option
option['device'] = 'CUDA'
option['device'] = 'cuda'
option['device_id'] = gpu_id
option['use_cudnn'] = use_cudnn
......@@ -111,7 +112,7 @@ def EnableCNML(mlu_id=0):
"""
global option
option['device'] = 'CNML'
option['device'] = 'cnml'
option['device_id'] = mlu_id
......
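A short usage sketch for the lowercase device identifiers introduced above (assuming the public config and scope helpers shown elsewhere in this commit):

    import dragon

    dragon.config.EnableCUDA(0)             # now stores option['device'] = 'cuda'
    with dragon.device_scope('cuda', 0):    # 'cpu', 'gpu', 'cuda' and 'cnml' are accepted
        pass                                # ops created here inherit the cuda:0 device option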
......@@ -15,7 +15,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon.import_c_api as C
import dragon.import_c_api as _C
def IsCUDADriverSufficient():
......@@ -27,7 +27,7 @@ def IsCUDADriverSufficient():
``True`` if your device(s) support CUDA otherwise ``False``.
"""
return C.IsCUDADriverSufficient()
return _C.IsCUDADriverSufficient()
def GetDevice():
......@@ -39,7 +39,7 @@ def GetDevice():
The device index.
"""
return C.cudaGetDevice()
return _C.cudaGetDevice()
def SynchronizeStream(device_id=None, stream_id=0):
......@@ -55,5 +55,5 @@ def SynchronizeStream(device_id=None, stream_id=0):
The stream index.
"""
return C.cudaStreamSynchronize(
return _C.cudaStreamSynchronize(
device_id if device_id else -1, stream_id)
\ No newline at end of file
......@@ -93,7 +93,7 @@ class GraphGradientMaker(object):
"""
if forward_op.type in C.NO_GRADIENT_OPERATORS:
for input in forward_op.input: blacklist.add(input)
return (True, None)
return True, None
# Generate virtual grads for targets if necessary
gen_grads = []
......@@ -107,11 +107,11 @@ class GraphGradientMaker(object):
for output in forward_op.output:
if inputs_to_grads.get(output, None) is None:
# check failed: skip backward
if output in blacklist: return (True, gen_grads)
if len(forward_op.output) == 1: return (True, gen_grads)
if output in blacklist: return True, gen_grads
if len(forward_op.output) == 1: return True, gen_grads
# Pass, even if missing some grads
return (False, gen_grads)
return False, gen_grads
@classmethod
def Make(cls, forward_ops, targets, input_grads=None, auto_names=True):
......
......@@ -16,8 +16,8 @@ from __future__ import division
from __future__ import print_function
import math
import numpy as np
import dragon as dg
import numpy
import dragon
class OperatorHelper(object):
......@@ -39,11 +39,11 @@ class OperatorHelper(object):
@classmethod
def get_index_and_name(cls, prefix='Op'):
name = dg.workspace.GetDummyName(prefix, domain='Operator')
name = dragon.workspace.GetDummyName(prefix, domain='Operator')
try:
_, op_idx = name.split('_')
except:
name = dg.workspace.GetDummyName(prefix, domain='Operator')
name = dragon.workspace.GetDummyName(prefix, domain='Operator')
_, op_idx = name.split('_')
return int(op_idx), name
......@@ -216,7 +216,7 @@ class OperatorHelper(object):
for i in range(3):
try:
if i == 0:
outputs[0].shape[i] = np.prod(inputs[0].shape[:axis])
outputs[0].shape[i] = numpy.prod(inputs[0].shape[:axis])
if i >= 1:
outputs[0].shape[i] = inputs[0].shape[axis]
except: pass
......@@ -581,7 +581,7 @@ class OperatorHelper(object):
if axis is None:
try:
fake_shape = inputs[0].shape[:]
total_count = np.prod(fake_shape)
total_count = numpy.prod(fake_shape)
outputs[0].shape = [total_count * repeats]
except:
outputs[0].shape = [None]
......@@ -643,7 +643,7 @@ class OperatorHelper(object):
outputs[0].shape = [None] * len(shape)
n_elements, n_elements_known = None, None
try:
n_elements = int(np.prod(inputs[0].shape))
n_elements = int(numpy.prod(inputs[0].shape))
except:
pass
for i, s in enumerate(shape):
......@@ -654,7 +654,7 @@ class OperatorHelper(object):
except:
pass
try:
n_elements_known = int(np.prod(outputs[0].shape))
n_elements_known = int(numpy.prod(outputs[0].shape))
except:
pass
for i, s in enumerate(shape):
......@@ -738,6 +738,16 @@ class OperatorHelper(object):
outputs[0].shape = [count]
return outputs
@classmethod
def _apply_Multinomial(cls, arguments, inputs, outputs):
outputs[0].dtype = 'int64'
try:
outputs[0].shape = inputs[0].shape[:]
outputs[0].shape[-1] = arguments['num_samples']
except:
pass
return outputs
###############################################
# #
# Vision #
......
......@@ -15,18 +15,18 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import numpy
TENSOR_TYPE_TO_NP_TYPE = {
'bool': np.bool,
'int8': np.int8,
'uint8': np.uint8,
'int32': np.int32,
'int64': np.int64,
'float16': np.float16,
'float32': np.float32,
'float64': np.float64,
'bool': numpy.bool,
'int8': numpy.int8,
'uint8': numpy.uint8,
'int32': numpy.int32,
'int64': numpy.int64,
'float16': numpy.float16,
'float32': numpy.float32,
'float64': numpy.float64,
}
......
......@@ -15,7 +15,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon.import_c_api as C
import dragon.import_c_api as _C
_GLOBAL_MPI_IS_INIT = False
......@@ -40,7 +40,7 @@ def Init():
This function can only be called once.
"""
C.MPIInit()
_C.MPIInit()
global _GLOBAL_MPI_IS_INIT
global _GLOBAL_MPI_SNAPSHOT_RANKS
_GLOBAL_MPI_IS_INIT = True
......@@ -68,7 +68,7 @@ def Rank():
"""
_check_init()
return C.MPIRank()
return _C.MPIRank()
def Size():
......@@ -81,7 +81,7 @@ def Size():
"""
_check_init()
return C.MPISize()
return _C.MPISize()
def CreateGroup(root=0, incl=[], excl=[]):
......@@ -103,7 +103,7 @@ def CreateGroup(root=0, incl=[], excl=[]):
"""
_check_init()
return C.MPICreateGroup(root, incl, excl)
return _C.MPICreateGroup(root, incl, excl)
def Snapshot(incl):
......@@ -226,4 +226,4 @@ def Finalize():
"""
_check_init()
C.MPIFinalize()
\ No newline at end of file
_C.MPIFinalize()
\ No newline at end of file
......@@ -21,7 +21,7 @@ import numpy as np
from google.protobuf.message import Message
import dragon.config as cfg
import dragon.import_c_api as C
import dragon.import_c_api as _C
from dragon.proto import dragon_pb2 as pb
from dragon.core.scope import get_default_device
......@@ -97,7 +97,7 @@ def MakeCXXOperatorDef(
op_type, inputs=(), outputs=(),
name='', uid=None, device_option=None,
arg=None, engine=None, **kwargs):
c_def = C.OperatorDef()
c_def = _C.OperatorDef()
py_def = MakeOperatorDef(
op_type, inputs, outputs, name, uid,
device_option, arg, engine, **kwargs)
......@@ -118,7 +118,7 @@ def MakeDeviceOption(
_PREDEFINED_DEVICE_LIMITS = 16
_PREDEFINED_DEVICE_ENGINES = ['', 'CUDNN']
_PREDEFINED_DEVICE_DICT = {'CPU': 0, 'CUDA': 1, 'CNML': 2}
_PREDEFINED_DEVICE_DICT = {'cpu': 0, 'cuda': 1, 'cnml': 2}
_PREDEFINED_DEVICE_OPTION_DICT = {}
......@@ -127,8 +127,8 @@ for i in range(_PREDEFINED_DEVICE_LIMITS):
for engine in _PREDEFINED_DEVICE_ENGINES:
_PREDEFINED_DEVICE_OPTION_DICT[(device, i, engine)] = \
MakeDeviceOption(identify, i, engine)
if device == 'CUDA':
_PREDEFINED_DEVICE_OPTION_DICT[('CUDA', i)] = \
if device == 'cuda':
_PREDEFINED_DEVICE_OPTION_DICT[('cuda', i)] = \
MakeDeviceOption(identify, i, 'CUDNN')
......
......@@ -14,7 +14,7 @@ from __future__ import division
from __future__ import print_function
import threading
import dragon.import_c_api as C
import dragon.import_c_api as _C
from contextlib import contextmanager
......@@ -76,7 +76,7 @@ class WorkspaceScope(object):
--------
>>> import dragon as dg
>>> with WorkspaceScope('session1'): pass
>>> with dg.workspace_scope('session2'): pass
>>> with dg.ws_scope('session2'): pass
"""
def __init__(self, ws_name):
......@@ -88,11 +88,11 @@ class WorkspaceScope(object):
self.prev = 'default'
def __enter__(self):
self.prev = C.CurrentWorkspace()
C.SwitchWorkspace(self.ws, True)
self.prev = _C.CurrentWorkspace()
_C.SwitchWorkspace(self.ws, True)
def __exit__(self, type, value, traceback):
C.SwitchWorkspace(self.prev, True)
_C.SwitchWorkspace(self.prev, True)
_GLOBAL_TENSOR_STACK = _ThreadLocalStack()
......@@ -133,7 +133,7 @@ def device_scope(device_type, device_id=0, engine='AUTO'):
Parameters
----------
device_type : {'CPU', 'GPU', 'CUDA', 'CNML'}, required
device_type : {'cpu', 'gpu', 'cuda', 'cnml'}, required
The type of device.
device_id : int, optional
The index of the device.
......@@ -143,9 +143,9 @@ def device_scope(device_type, device_id=0, engine='AUTO'):
"""
device_type, device_id, device_engine = \
device_type.upper(), device_id, engine.upper()
assert device_type in ['CPU', 'GPU', 'CUDA', 'CNML']
assert device_type in ['cpu', 'gpu', 'cuda', 'cnml']
# Default names
if device_type == 'GPU': device_type = 'CUDA'
if device_type == 'gpu': device_type = 'cuda'
if device_engine == 'AUTO': device_engine = 'CUDNN'
return _GLOBAL_DEVICE_STACK.get_controller({
'device_type': device_type,
......
......@@ -45,11 +45,11 @@ class Tensor(object):
Parameters
----------
name : None or str
name : str, optional
The name of Tensor.
shape : None or list
shape : list, optional
The shape of Tensor.
dtype : None or str
dtype : str, optional
The type of Tensor.
Returns
......@@ -94,7 +94,7 @@ class Tensor(object):
Parameters
----------
value : number
value : number, optional, default=0
The constant value.
Returns
......@@ -105,14 +105,14 @@ class Tensor(object):
"""
return self.Fill('constant', value=value)
def Uniform(self, low=-1, high=1):
def Uniform(self, low=0, high=1):
"""Register as a variable with uniform initializer.
Parameters
----------
low : number
low : number, optional, default=0
The lower bound of uniform distribution.
high : number
high : number, optional, default=1
The higher bound of uniform distribution.
Returns
......@@ -128,9 +128,9 @@ class Tensor(object):
Parameters
----------
mu : number
mu : number, optional, default=0
The mu of normal distribution.
sigma : number
sigma : number, optional, default=1
The sigma of normal distribution.
Returns
......@@ -146,9 +146,9 @@ class Tensor(object):
Parameters
----------
mu : number
mu : number, optional, default=0
The mu of normal distribution.
sigma : number
sigma : number, optional, default=1
The sigma of normal distribution.
Returns
......@@ -164,9 +164,9 @@ class Tensor(object):
Parameters
----------
mean : number
mean : number, optional, default=0
The mean(mu) of normal distribution.
std : number
std : number, optional, default=1
The std(sigma) of normal distribution.
Returns
......@@ -177,12 +177,12 @@ class Tensor(object):
"""
return self.Normal(mu=mean, sigma=std)
def GlorotUniform(self, scale=3.0):
def GlorotUniform(self, scale=3.):
"""Register as a variable with glorot uniform initializer.
Parameters
----------
scale : number
scale : number, optional, default=3.
The scale factor.
Returns
......@@ -193,12 +193,12 @@ class Tensor(object):
"""
return self.Fill('glorot_uniform', scale=scale)
def GlorotNormal(self, scale=2.0):
def GlorotNormal(self, scale=2.):
"""Register as a variable with glorot normal initializer.
Parameters
----------
scale : number
scale : number, optional, default=2.
The scale factor.
Returns
......@@ -244,7 +244,7 @@ class Tensor(object):
Parameters
----------
value : None or str
value : str
The name to set.
Returns
......@@ -270,7 +270,7 @@ class Tensor(object):
Parameters
----------
str
name : str
The name.
Returns
......@@ -284,6 +284,11 @@ class Tensor(object):
def shape(self):
"""Return or Set the shape.
Parameters
----------
value : sequence of int
The shape to set.
Returns
-------
sequence of int
......@@ -344,7 +349,7 @@ class Tensor(object):
----------
dtype : str
The specific dtype.
inplace : boolean
inplace : boolean, optional, default=False
Whether to modify the inputs.
Returns
......@@ -651,6 +656,99 @@ class Tensor(object):
"""
return self.__mul__(-1.0)
def __gt__(self, other):
"""Compute *self* > *other* element-wise.
Parameters
----------
other : Tensor or number
The other tensor.
Returns
-------
Tensor
The output tensor.
"""
if not isinstance(other, Tensor):
other = self._from_constants(other)
return self.CreateOperator('Compare', [self, other], operation='GT')
def __ge__(self, other):
"""Compute *self* > *other* element-wise.
Parameters
----------
other : Tensor or number
The other tensor.
Returns
-------
Tensor
The output tensor.
"""
if not isinstance(other, Tensor):
other = self._from_constants(other)
return self.CreateOperator('Compare', [self, other], operation='GE')
def __lt__(self, other):
"""Compute *self* < *other* element-wise.
Parameters
----------
other : Tensor or number
The other tensor.
Returns
-------
Tensor
The output tensor.
"""
if not isinstance(other, Tensor):
other = self._from_constants(other)
return self.CreateOperator('Compare', [self, other], operation='LT')
def __le__(self, other):
"""Compute *self* <= *other* element-wise.
Parameters
----------
other : Tensor or number
The other tensor.
Returns
-------
Tensor
The output tensor.
"""
if not isinstance(other, Tensor):
other = self._from_constants(other)
return self.CreateOperator('Compare', [self, other], operation='LE')
def __eq__(self, other):
"""Compute *self* == *other* element-wise.
Parameters
----------
other : Tensor or number
The other tensor.
Returns
-------
Tensor
The output tensor.
"""
if not isinstance(other, Tensor):
other = self._from_constants(other)
return self.CreateOperator('Compare', [self, other], operation='EQ')
def __hash__(self):
return id(self)
def __call__(self, *args, **kwargs):
"""Print the expressions.
......@@ -984,7 +1082,7 @@ class Tensor(object):
----------
value : number or Tensor
The value to convert.
dtype : str, optional
dtype : str, optional, default='float32'
The data type of the tensor.
Returns
......
......@@ -15,11 +15,8 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import dragon as dg
from google.protobuf.message import Message
import dragon.import_c_api as C
import numpy
import dragon
from dragon.core.tensor import Tensor
from dragon.core.proto_utils import GetDeviceOption
......@@ -50,7 +47,7 @@ def FromShape(shape, dtype='float32', name=None):
tensor.shape = list(shape)
if not isinstance(shape, (tuple, list)):
raise TypeError('The shape should be a tuple or list.')
C.TensorFromShape(
dragon.C.TensorFromShape(
_stringify_tensor(tensor),
list(shape), dtype)
return tensor
......@@ -73,7 +70,7 @@ def SetShape(tensor, shape, dtype='float32'):
None
"""
C.TensorFromShape(_stringify_tensor(tensor), shape, dtype)
dragon.C.TensorFromShape(_stringify_tensor(tensor), shape, dtype)
def FromTensor(src, src_ctx=None, name=None, ctx=None):
......@@ -100,9 +97,9 @@ def FromTensor(src, src_ctx=None, name=None, ctx=None):
"""
tensor = _try_get_tensor(name)
if src_ctx is None: src_ctx = GetDeviceOption('CPU')
if ctx is None: ctx = GetDeviceOption('CPU')
C.TensorFromTensor(
if src_ctx is None: src_ctx = GetDeviceOption('cpu')
if ctx is None: ctx = GetDeviceOption('cpu')
dragon.C.TensorFromTensor(
_stringify_tensor(tensor), _stringify_tensor(src),
_stringify_proto(ctx), _stringify_proto(src_ctx))
return tensor
......@@ -130,9 +127,9 @@ def FromPyArray(array, name=None):
"""
tensor = _try_get_tensor(name)
if not isinstance(array, np.ndarray):
if not isinstance(array, numpy.ndarray):
raise TypeError('The given nd-array should be numpy.ndarray.')
C.TensorFromPyArray(_stringify_tensor(tensor), array)
dragon.C.TensorFromPyArray(_stringify_tensor(tensor), array)
return tensor
......@@ -157,7 +154,7 @@ def SetPyArray(tensor, array):
The wrapper of ``TensorFromPyArrayCC``.
"""
C.TensorFromPyArray(_stringify_tensor(tensor), array)
dragon.C.TensorFromPyArray(_stringify_tensor(tensor), array)
def ToPyArray(tensor, readonly=False):
......@@ -178,7 +175,7 @@ def ToPyArray(tensor, readonly=False):
The array sharing the memory with original tensor.
"""
return C.TensorToPyArray(_stringify_tensor(tensor), readonly)
return dragon.C.TensorToPyArray(_stringify_tensor(tensor), readonly)
def GetStorage(tensor):
......@@ -196,8 +193,8 @@ def GetStorage(tensor):
"""
tensor = _stringify_tensor(tensor)
if not dg.workspace.HasTensor(tensor): return None
return C.GetTensor(tensor)
if not dragon.workspace.HasTensor(tensor): return None
return dragon.C.GetTensor(tensor)
def _stringify_proto(obj):
......@@ -213,9 +210,5 @@ def _stringify_tensor(obj):
def _try_get_tensor(name=None):
"""Try to create or get a tensor"""
if name is None or name == '':
return Tensor()
else:
tensor = Tensor('')
tensor.set_name(name)
return tensor
\ No newline at end of file
if name is None or name == '': return Tensor()
else: return Tensor.Ref(name)
\ No newline at end of file
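A hypothetical example of the name-resolution path above (it is assumed here that Tensor.Ref binds the wrapper to the named backend tensor; the name and shape are illustrative only):

    import numpy
    from dragon.core import tensor_utils

    # FromPyArray with an explicit name now resolves through Tensor.Ref(name),
    # i.e. it returns a wrapper bound to that backend tensor name.
    t = tensor_utils.FromPyArray(numpy.ones((2, 3), 'float32'), name='my/blob')
    print(tensor_utils.ToPyArray(t).shape)   # (2, 3)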
......@@ -25,20 +25,16 @@ from __future__ import division
from __future__ import print_function
import os
import numpy as np
import numpy
import threading
import six.moves.cPickle as pickle
from google.protobuf.message import Message
import dragon.import_c_api as C
import dragon.import_c_api as _C
import dragon.core.logging as logging
import dragon.proto.dragon_pb2 as pb
from dragon.config import GetGlobalOptions
import dragon.core.mpi as mpi
import dragon.proto.dragon_pb2 as pb
import dragon.core.proto_utils as pb_utils
import dragon.core.mapping as mapping
from dragon.core import mpi, mapping, proto_utils
def CurrentWorkspace():
......@@ -50,7 +46,7 @@ def CurrentWorkspace():
The workspace name.
"""
return C.CurrentWorkspace()
return _C.CurrentWorkspace()
def SwitchWorkspace(workspace_name, create_if_missing=True):
......@@ -70,7 +66,7 @@ def SwitchWorkspace(workspace_name, create_if_missing=True):
"""
if workspace_name == '':
raise ValueError('The workspace name should not be empty.')
C.SwitchWorkspace(workspace_name, create_if_missing)
_C.SwitchWorkspace(workspace_name, create_if_missing)
def MoveWorkspace(target_ws, source_ws):
......@@ -90,7 +86,7 @@ def MoveWorkspace(target_ws, source_ws):
"""
if target_ws == '' or source_ws == '':
raise ValueError('The target or source name can not be empty.')
C.MoveWorkspace(target_ws, source_ws)
_C.MoveWorkspace(target_ws, source_ws)
def ResetWorkspace(workspace_name=''):
......@@ -110,7 +106,7 @@ def ResetWorkspace(workspace_name=''):
None
"""
C.ResetWorkspace(workspace_name)
_C.ResetWorkspace(workspace_name)
def ClearWorkspace(workspace_name=''):
......@@ -130,7 +126,7 @@ def ClearWorkspace(workspace_name=''):
None
"""
C.ClearWorkspace(workspace_name)
_C.ClearWorkspace(workspace_name)
def CreateGraph(graph_def):
......@@ -150,7 +146,7 @@ def CreateGraph(graph_def):
option = GetGlobalOptions()
LogMetaGraph(graph_def)
ExportMetaGraph(graph_def)
return C.CreateGraph(
return _C.CreateGraph(
_stringify_proto(graph_def),
option['log_optimized_graph'],
)
......@@ -173,7 +169,7 @@ def RunOperator(op_def, verbose=False):
"""
if isinstance(op_def, pb.OperatorDef):
op_def = op_def.SerializeToString()
C.RunOperator(op_def, verbose)
_C.RunOperator(op_def, verbose)
def HasTensor(tensor):
......@@ -190,7 +186,7 @@ def HasTensor(tensor):
The query result.
"""
return C.HasTensor(_stringify_tensor(tensor))
return _C.HasTensor(_stringify_tensor(tensor))
def CreateTensor(tensor):
......@@ -206,7 +202,7 @@ def CreateTensor(tensor):
None
"""
return C.CreateTensor(_stringify_tensor(tensor))
return _C.CreateTensor(_stringify_tensor(tensor))
def CreateFiller(filler_def):
......@@ -229,7 +225,7 @@ def CreateFiller(filler_def):
"""
filler_def = filler_def if isinstance(filler_def, str) \
else filler_def.SerializePartialToString()
C.CreateFiller(filler_def)
_C.CreateFiller(filler_def)
def GetFillerType(tensor):
......@@ -250,7 +246,7 @@ def GetFillerType(tensor):
The filler type.
"""
return C.GetFillerType(_stringify_tensor(tensor))
return _C.GetFillerType(_stringify_tensor(tensor))
def GetTensorName(tensor):
......@@ -271,7 +267,7 @@ def GetTensorName(tensor):
The query result may be different from the one used in the frontend.
"""
return C.GetTensorName(_stringify_tensor(tensor))
return _C.GetTensorName(_stringify_tensor(tensor))
def SetTensorAlias(tensor, alias):
......@@ -289,7 +285,7 @@ def SetTensorAlias(tensor, alias):
None
"""
return C.SetTensorAlias(_stringify_tensor(tensor), alias)
return _C.SetTensorAlias(_stringify_tensor(tensor), alias)
def FetchTensor(tensor):
......@@ -306,7 +302,7 @@ def FetchTensor(tensor):
The values copied from the backend.
"""
return C.FetchTensor(_stringify_tensor(tensor))
return _C.FetchTensor(_stringify_tensor(tensor))
def FeedTensor(tensor, array, force_cpu=False, dtype=None):
......@@ -329,14 +325,14 @@ def FeedTensor(tensor, array, force_cpu=False, dtype=None):
Examples
--------
>>> import dragon as dg
>>> a = dg.Tensor().Variable()
>>> dg.workspace.FeedTensor(a, 1)
>>> a_value = dg.workspace.FetchTensor(a)
>>> import dragon
>>> a = dragon.Tensor().Variable()
>>> dragon.workspace.FeedTensor(a, 1)
>>> a_value = dragon.workspace.FetchTensor(a)
>>> a_value, a_value.dtype
>>> [ 1.], "float32"
>>> dg.workspace.FeedTensor(a, [[1, 2, 3]], dtype='float16')
>>> dragon.workspace.FeedTensor(a, [[1, 2, 3]], dtype='float16')
>>> a_value = a.get_value()
>>> a_value, a_value.dtype
>>> [[ 1. 2. 3.]], "float16"
......@@ -344,13 +340,13 @@ def FeedTensor(tensor, array, force_cpu=False, dtype=None):
"""
name = tensor.name if hasattr(tensor, 'name') else str(tensor)
if force_cpu is True:
dev = pb_utils.GetDeviceOption('CPU')
dev = proto_utils.GetDeviceOption('cpu')
else:
dev = pb_utils.GetDefaultDeviceOption()
if dev is None: dev = pb_utils.GetGlobalDeviceOption()
dev = proto_utils.GetDefaultDeviceOption()
if dev is None: dev = proto_utils.GetGlobalDeviceOption()
if not isinstance(array, np.ndarray):
auto_data_type = np.float32 if dtype is None else dtype
if not isinstance(array, numpy.ndarray):
auto_data_type = numpy.float32 if dtype is None else dtype
else:
auto_data_type = array.dtype if dtype is None else dtype
......@@ -365,8 +361,8 @@ def FeedTensor(tensor, array, force_cpu=False, dtype=None):
format(preset_data_type, dtype))
auto_data_type = preset_data_type
nd_array = np.array(array, dtype=auto_data_type, copy=False)
C.FeedTensor(name, nd_array, _stringify_proto(dev))
nd_array = numpy.array(array, dtype=auto_data_type, copy=False)
_C.FeedTensor(name, nd_array, _stringify_proto(dev))
def ResetTensor(tensor):
......@@ -384,7 +380,7 @@ def ResetTensor(tensor):
None
"""
return C.ResetTensor(_stringify_tensor(tensor))
return _C.ResetTensor(_stringify_tensor(tensor))
def RunGraph(
......@@ -427,7 +423,7 @@ def RunGraph(
# Run the graph according to the specified include/exclude rule
runtime_stage = stage if stage else 'default'
rule = _PREDEFINED_GRAPH_RUNTIME_STAGES[runtime_stage]
C.RunGraph(str(graph_name), str(rule['include']), str(rule['exclude']))
_C.RunGraph(str(graph_name), str(rule['include']), str(rule['exclude']))
# Try to return the outputs
# Force to return may lead to asserts if outputs are not computed
......@@ -462,7 +458,7 @@ def FlowGradients(inputs, targets, input_grads=None, ignored_grads=None):
if (option['log_optimized_graph'] or
option['log_meta_graph']) else False
C.FlowGradients(
_C.FlowGradients(
inputs, targets,
input_grads if input_grads else [],
ignored_grads if ignored_grads else [],
......@@ -520,8 +516,7 @@ def ExportMetaGraph(graph_def):
def Snapshot(
tensors, filename,
prefix='', suffix='.bin',
format='default',
):
format='default'):
"""Snapshot tensors into a binary file.
Parameters
......@@ -566,7 +561,7 @@ def Snapshot(
logging.info('Model Format: Pickle')
elif format is 'caffe':
names = [tensor.name for tensor in tensors]
C.Snapshot(file_path, names, 1)
_C.Snapshot(file_path, names, 1)
else: raise TypeError('Unknown binary format: {}'.format(format))
......@@ -606,7 +601,7 @@ def Restore(binary_file, format='default'):
elif format == 'caffe':
# Caffe models can't save the tensor name
# We simply use "layer_name/param:X"
C.Restore(binary_file, 1)
_C.Restore(binary_file, 1)
else:
raise TypeError('Unknown binary format: {}'.format(format))
......@@ -636,7 +631,7 @@ def GetDummyName(basename, suffix='', domain='', zero_based=True):
The unique dummy name.
"""
return C.GetDummyName(basename, suffix, domain, zero_based)
return _C.GetDummyName(basename, suffix, domain, zero_based)
def _stringify_proto(obj):
......
......@@ -69,9 +69,15 @@ class OpSchema(object):
def Impl(*args, **kwargs):
inputs = args[0]
if isinstance(inputs, (list, tuple)):
dtype = None
for idx, input in enumerate(inputs):
if isinstance(input, Tensor) and \
input.dtype is not None:
dtype = input.dtype
break
for idx, input in enumerate(inputs):
if not isinstance(input, Tensor):
inputs[idx] = Tensor.Convert(input, dtype=None)
inputs[idx] = Tensor.Convert(input, dtype=dtype)
return op_func(inputs + list(args[1:]), **kwargs)
else:
if not isinstance(inputs, Tensor):
......
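The hunk above changes how constant (non-Tensor) inputs are converted: they now inherit the dtype of the first Tensor input instead of being converted with dtype=None. A small illustrative sketch, assuming an op that appears to use this conversion path, such as the dragon.ops.Equal alias:

    import dragon

    x = dragon.Tensor('x', shape=[4], dtype='int64').Variable()
    # The scalar 1 is converted with dtype='int64' (taken from x),
    # so both inputs of the Compare op share the same data type.
    y = dragon.ops.Equal([x, 1])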
......@@ -752,8 +752,8 @@ def Arange(start, stop=None, step=1, dtype='float32', **kwargs):
Parameters
----------
start : int or Tensor
The start of the range.
inputs : Tensor
The input tensor.
stop : int or Tensor, optional
The stop of range.
step : int or Tensor, optional
......@@ -770,3 +770,33 @@ def Arange(start, stop=None, step=1, dtype='float32', **kwargs):
arguments = ParseArgs(locals())
arguments['dtype'] = arguments['dtype'].lower()
return Tensor.CreateOperator('Arange', [], **arguments)
@OpSchema.Inputs(1)
def Multinomial(inputs, num_samples=1, normalize=False, **kwargs):
"""Return a tensor where each row contains ``num_samples``,
sampled from the multinomial distribution.
If ``normalize`` is *True*, negative inputs is accepted,
and will be normalized by a softmax function. (*TensorFlow* Style).
Otherwise, inputs should be non-negative. (*Torch* Style).
**Type Constraints**: (*int8*, *uint8*, *int32*, *int64*, *float32*, *float64*)
Parameters
----------
inputs : Tensor
The input tensor.
num_samples : int, optional, default=1
The number of samples.
normalize : boolean, optional, default=False
Whether to normalize the inputs.
Returns
-------
Tensor
A *int64* tensor contains the indices.
"""
return Tensor.CreateOperator('Multinomial', **ParseArgs(locals()))
\ No newline at end of file
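A minimal usage sketch for the new operator (assuming the dragon.ops.Multinomial alias exported later in this commit; names are illustrative):

    import dragon

    probs = dragon.Tensor('probs', shape=[2, 5], dtype='float32').Variable()
    # Draw 3 samples per row; the output is int64 and, per the shape
    # inference helper added in this commit, has shape [2, 3].
    samples = dragon.ops.Multinomial(probs, num_samples=3, normalize=False)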
......@@ -39,6 +39,7 @@ def Copy(inputs, **kwargs):
return Tensor.CreateOperator('Copy', **arguments)
@OpSchema.ConvertConstantInputs()
@OpSchema.Inputs(2)
def Equal(inputs, to_uint8=False, **kwargs):
"""``Equal`` comparing between A and B.
......@@ -61,9 +62,10 @@ def Equal(inputs, to_uint8=False, **kwargs):
"""
arguments = ParseArgs(locals())
return Tensor.CreateOperator('Compare', operation='EQUAL', **arguments)
return Tensor.CreateOperator('Compare', operation='EQ', **arguments)
@OpSchema.ConvertConstantInputs()
@OpSchema.Inputs(2)
def Less(inputs, to_uint8=False, **kwargs):
"""``Less`` comparing between A and B.
......@@ -86,12 +88,65 @@ def Less(inputs, to_uint8=False, **kwargs):
"""
arguments = ParseArgs(locals())
return Tensor.CreateOperator('Compare', operation='LESS', **arguments)
return Tensor.CreateOperator('Compare', operation='LT', **arguments)
@OpSchema.ConvertConstantInputs()
@OpSchema.Inputs(2)
def LessEqual(inputs, to_uint8=False, **kwargs):
"""``LessEqual`` comparing between A and B.
Set ``to_uint8`` if you expect the ``uint8`` results instead of ``bool``.
**Type Constraints**: (*bool*, *int8*, *uint8*, *int32*, *int64*, *float16*, *float32*, *float64*)
Parameters
----------
inputs : sequence of Tensor
The inputs, represent A and B respectively.
to_uint8 : bool
``True`` to convert to ``uint8`` results.
Returns
-------
Tensor
The comparing results.
"""
arguments = ParseArgs(locals())
return Tensor.CreateOperator('Compare', operation='LE', **arguments)
@OpSchema.ConvertConstantInputs()
@OpSchema.Inputs(2)
def Greater(inputs, to_uint8=False, **kwargs):
"""``Less`` comparing between A and B.
"""``Greater`` comparing between A and B.
Set ``to_uint8`` if you expect the ``uint8`` results instead of ``bool``.
**Type Constraints**: (*bool*, *int8*, *uint8*, *int32*, *int64*, *float16*, *float32*, *float64*)
Parameters
----------
inputs : sequence of Tensor
The inputs, represent A and B respectively.
to_uint8 : bool
``True`` to convert to ``uint8`` results.
Returns
-------
Tensor
The comparing results.
"""
arguments = ParseArgs(locals())
return Tensor.CreateOperator('Compare', operation='GT', **arguments)
@OpSchema.ConvertConstantInputs()
@OpSchema.Inputs(2)
def GreaterEqual(inputs, to_uint8=False, **kwargs):
"""``GreaterEqual`` comparing between A and B.
Set ``to_uint8`` if you expect the ``uint8`` results instead of ``bool``.
......@@ -111,4 +166,4 @@ def Greater(inputs, to_uint8=False, **kwargs):
"""
arguments = ParseArgs(locals())
return Tensor.CreateOperator('Compare', operation='GREATER', **arguments)
\ No newline at end of file
return Tensor.CreateOperator('Compare', operation='GE', **arguments)
\ No newline at end of file
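A brief sketch of the two new comparison ops defined above (assuming the dragon.ops aliases exported by this commit):

    import dragon

    a = dragon.Tensor('a', shape=[3], dtype='float32').Variable()
    b = dragon.Tensor('b', shape=[3], dtype='float32').Variable()

    le = dragon.ops.LessEqual([a, b])                      # bool results
    ge = dragon.ops.GreaterEqual([a, b], to_uint8=True)    # uint8 results instead of bool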
......@@ -20,9 +20,8 @@ from .. import *
def RNNParamSet(
inputs, layer_id, param_id, param_type,
rnn_mode, input_size, hidden_size,
num_layers=1, num_directions=1, **kwargs
):
num_layers=1, num_directions=1, **kwargs):
arguments = ParseArgs(locals())
arguments['inputs'] = inputs[1]
arguments['existing_outputs'] = inputs[0]
return Tensor.CreateOperator(op_type='RNNParamSet', **arguments)
\ No newline at end of file
return Tensor.CreateOperator('RNNParamSet', **arguments)
\ No newline at end of file
......@@ -13,24 +13,21 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy
import dragon
import warnings
import dragon as dg
import numpy as np
from dragon.core.tensor import Tensor
from dragon.core.tensor_utils import FromShape
from .rnn_param import RNNParamSet
from dragon.operators.rnn.rnn_param import RNNParamSet
class RNNBase(object):
"""A simple class wrapping general RNN ops.
"""A simple class wrapping general RNN ops."""
"""
def __init__(self,
mode, input_size, hidden_size, num_layers=1,
bidirectional=False, dropout=0, name=None
):
bidirectional=False, dropout=0, name=None):
eligible_rnn_modes = ('rnn_tanh', 'rnn_relu', 'lstm', 'gru')
if mode.lower() not in eligible_rnn_modes:
raise ValueError('Unknown rnn mode: {}.'
......@@ -54,26 +51,23 @@ class RNNBase(object):
elif self.mode == 'gru': gate_size = 3 * self.hidden_size
else: gate_size = self.hidden_size
# 1. Plan weights
self._matrix_weights = []; self._bias_weights = []
self._matrix_shape, self._bias_shape = [], []
for layer in range(self.num_layers):
for direction in range(self.num_directions):
layer_input_size = self.input_size if layer == 0 \
else self.hidden_size * self.num_directions
w_names = ['layer_{}/{}/{}'.format(layer, p, 'L' if direction == 0 else 'R')
for p in ('matrix_ih', 'matrix_hh', 'bias_ih', 'bias_hh')]
w_ih = Tensor(name=w_names[0], shape=[gate_size, layer_input_size])
w_hh = Tensor(name=w_names[1], shape=[gate_size, self.hidden_size])
b_ih = Tensor(name=w_names[2], shape=[gate_size,])
b_hh = Tensor(name=w_names[3], shape=[gate_size,])
w_ih_shape = [gate_size, layer_input_size]
w_hh_shape = [gate_size, self.hidden_size]
b_ih_shape, b_hh_shape = [gate_size], [gate_size]
# W (0 ~ 3), R (4 ~ 7)
self._matrix_weights.extend([w_ih, w_hh])
self._matrix_shape.extend([w_ih_shape, w_hh_shape])
# Bw (0 ~ 3), Br (4 ~ 7)
self._bias_weights.extend([b_ih, b_hh])
self._bias_shape.extend([b_ih_shape, b_hh_shape])
# 2. Compute total number of parameters
self._weights_count = 0
for w in self._matrix_weights + self._bias_weights:
self._weights_count += np.prod(w.shape)
for shape in self._matrix_shape + self._bias_shape:
self._weights_count += numpy.prod(shape)
# 3. Register the packed weights
self.weights = FromShape(shape=[self._weights_count],
......@@ -101,8 +95,8 @@ class RNNBase(object):
##############################################
def _uniform_init(self, shape, dtype='float32'):
stdv = 1.0 / np.sqrt(self.hidden_size)
return np.random.uniform(-stdv, stdv, shape).astype(dtype)
stdv = 1.0 / numpy.sqrt(self.hidden_size)
return numpy.random.uniform(-stdv, stdv, shape).astype(dtype)
def _orthogonal_init(self, shape, gain=1, dtype='float32'):
num_rows = 1
......@@ -110,16 +104,16 @@ class RNNBase(object):
num_cols = shape[-1]
flat_shape = (num_cols, num_rows) if num_rows < num_cols \
else (num_rows, num_cols)
W = np.random.randn(*flat_shape)
q, r = np.linalg.qr(W)
W = numpy.random.randn(*flat_shape)
q, r = numpy.linalg.qr(W)
# Make Q uniform
d = np.diag(r)
q *= np.sign(d)
d = numpy.diag(r)
q *= numpy.sign(d)
if num_rows < num_cols: q = q.T
return gain * q.reshape(shape).astype(dtype)
def _zero_init(self, shape, dtype='float32'):
return np.zeros(shape, dtype=dtype)
return numpy.zeros(shape, dtype=dtype)
##############################################
# #
......@@ -137,20 +131,19 @@ class RNNBase(object):
raise ValueError('Unknown param type: ' + type)
def _set_param(self, layer_id, param_id, param_type, param):
if not isinstance(param, Tensor):
if isinstance(param, np.ndarray):
paramT = Tensor('/tmp/rnn_param').Variable()
paramT.set_value(param)
param = paramT
else: raise ValueError('Excepted a tensor or numpy array.')
if isinstance(param, numpy.ndarray):
param_temp = dragon.Tensor.Ref('/tmp/rnn_param')
param_temp.set_value(param)
param = param_temp
else: raise ValueError('Expected a numpy array.')
self.weights.expressions = dict() # Clear cached expressions
outputs = RNNParamSet([self.weights, param], layer_id, param_id, param_type,
rnn_mode=self.mode, input_size=self.input_size, hidden_size=self.hidden_size,
num_layers=self.num_layers, num_directions=self.num_directions)
for k, v in outputs.expressions.items(): dg.workspace.RunOperator(v)
for k, v in outputs.expressions.items(): dragon.workspace.RunOperator(v)
def _reset_params(self):
np.random.seed(dg.config.GetRandomSeed())
numpy.random.seed(dragon.config.GetRandomSeed())
if self.mode == 'lstm': num_gates = 4
elif self.mode == 'gru': num_gates = 3
else: num_gates = 1
......@@ -166,8 +159,8 @@ class RNNBase(object):
bias_init = getattr(self, '_{}_init'.format(bias_init))
pseudo_layer_id = layer * self.num_directions + direction
packed_id = pseudo_layer_id * 2 + int(param_id / num_gates)
matrix_shape = self._matrix_weights[packed_id].shape[:]
bias_shape = self._bias_weights[packed_id].shape[:]
matrix_shape = self._matrix_shape[packed_id][:]
bias_shape = self._bias_shape[packed_id][:]
matrix_shape[0] = bias_shape[0] = int(matrix_shape[0] / num_gates)
self._set_param(layer_id=pseudo_layer_id, param_id=param_id,
param_type='matrix', param=matrix_init(matrix_shape))
......@@ -202,6 +195,7 @@ class RNNBase(object):
if not self._init_params: self._reset_params()
arguments = {
'op_type': 'Recurrent',
'inputs': [x, self.weights] +
([hx] if hx else []) +
([cx] if cx else []),
......@@ -213,11 +207,11 @@ class RNNBase(object):
'dropout_ratio': self.dropout,
}
if required_cell: n_out = 3
elif required_hidden: n_out = 2
else: n_out = 1
if required_cell: num_outputs = 3
elif required_hidden: num_outputs = 2
else: num_outputs = 1
return Tensor.CreateOperator(num_outputs=n_out, op_type='Recurrent', **arguments)
return Tensor.CreateOperator(num_outputs=num_outputs, **arguments)
def __call__(self, *args, **kwargs):
return self.create(*args, **kwargs)
\ No newline at end of file
......@@ -24,7 +24,7 @@ from .operators import arithmetic as math_ops
from .operators import control_flow as control_flow_ops
from .operators import misc as misc_ops
from .operators import mpi as mpi_ops
from .operators import ndarray as array_ops
from .operators import array as array_ops
from .operators import norm as norm_ops
from .operators import recurrent as recurrent_ops
from .operators import contrib as contrib_ops
......@@ -137,12 +137,15 @@ ExpandDims = array_ops.ExpandDims
Squeeze = array_ops.Squeeze
Shape = array_ops.Shape
Arange = array_ops.Arange
Multinomial = array_ops.Multinomial
# Control Flow
Copy = control_flow_ops.Copy
Equal = control_flow_ops.Equal
Less = control_flow_ops.Less
Grater = control_flow_ops.Greater
LessEqual = control_flow_ops.LessEqual
Greater = control_flow_ops.Greater
GreaterEqual = control_flow_ops.GreaterEqual
# Misc
Cast = AsType = misc_ops.Cast
......
......@@ -22,7 +22,7 @@ from __future__ import print_function
import pprint
import dragon.core.workspace as ws
from dragon.core import workspace
from dragon.core.tensor import Tensor
......@@ -93,7 +93,7 @@ class BaseUpdater(object):
defaults = self.__dict__.get('_defaults')
if item in defaults:
if self._registered:
return ws.FetchTensor(self._slot + '/' + item)
return workspace.FetchTensor(self._slot + '/' + item)
else: return defaults[item]
return self.__dict__[item]
......@@ -101,7 +101,7 @@ class BaseUpdater(object):
defaults = self.__dict__.get('_defaults')
if defaults is not None and key in defaults:
if self._registered:
ws.FeedTensor(self._slot + '/' + key, value,
workspace.FeedTensor(self._slot + '/' + key, value,
dtype='float32', force_cpu=True)
else:
self._defaults[key] = value
......@@ -111,7 +111,7 @@ class BaseUpdater(object):
def register_in_workspace(self):
if not self._registered:
for k, v in self._defaults.items():
ws.FeedTensor(self._slot + "/" + k, v,
workspace.FeedTensor(self._slot + "/" + k, v,
dtype='float32', force_cpu=True)
self._registered = True
if self._verbose:
......
......@@ -13,7 +13,6 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import numpy as np
import numpy.random as npr
from multiprocessing import Process
......@@ -105,18 +104,8 @@ class DataTransformer(Process):
self._max_random_scale - self._min_random_scale) \
+ self._min_random_scale
if random_scale != 1.0:
if sys.version_info >= (3, 0):
im = cv2.resize(im, None, interpolation=cv2.INTER_LINEAR,
fx=random_scale, fy=random_scale)
else:
# Workaround for opencv-python2, which has a bug
# that leads to duplicate CUDA handles created at gpu:0
new_shape = (
int(np.ceil(im.shape[1] * random_scale)),
int(np.ceil(im.shape[0] * random_scale)))
im = PIL.Image.fromarray(im)
im = im.resize(new_shape, PIL.Image.BILINEAR)
im = np.array(im)
im = cv2.resize(im, None, fx=random_scale,
fy=random_scale, interpolation=cv2.INTER_LINEAR)
# Padding
if self._padding > 0:
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# Codes are based on:
#
# <https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/utils/timer.py>
#
# ------------------------------------------------------------
import time
class Timer(object):
"""A simple timer."""
def __init__(self):
self.total_time = 0.
self.calls = 0
self.start_time = 0.
self.diff = 0.
self.average_time = 0.
def tic(self):
self.start_time = time.time()
def toc(self, average=True):
self.diff = time.time() - self.start_time
self.total_time += self.diff
self.calls += 1
self.average_time = self.total_time / self.calls
if average:
return self.average_time
else:
return self.diff
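# A minimal usage sketch for the Timer above; ``time.sleep`` is only a
# placeholder workload and the printed numbers are illustrative.
if __name__ == '__main__':
    timer = Timer()
    for _ in range(3):
        timer.tic()
        time.sleep(0.01)           # stand-in for the timed work
        average = timer.toc()      # returns the running average by default
    print('average: {:.4f}s over {} calls'.format(average, timer.calls))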
......@@ -89,7 +89,7 @@ def native_run_graph(graph_def, inputs, initializer, init_func=None):
# Create an anonymous workspace
ws = Workspace()
with dg.workspace_scope(ws.name):
with dg.ws_scope(ws.name):
# Register all the initializer before feeding them
for name in initializer:
dg.Tensor(name=name).Variable()
......
......@@ -27,7 +27,7 @@ class Workspace(object):
def __getattr__(self, attr):
def f(*args, **kwargs):
with dg.workspace_scope(self.name, ):
with dg.ws_scope(self.name, ):
return getattr(dg.workspace, attr)(*args, **kwargs)
return f
......
......@@ -290,7 +290,7 @@ class _DefaultGraphStack(_DefaultStack):
@tf_contextlib.contextmanager
def get_controller(self, default):
with super(_DefaultGraphStack, self).get_controller(default) as g:
with dragon.workspace_scope(g._workspace):
with dragon.ws_scope(g._workspace):
yield g
......
......@@ -121,7 +121,6 @@ class _CosineDecayRestarts(_DecayBase):
def run(self, inputs, outputs):
gs = self.get(inputs[0])
global_step = min(gs - self.last_steps, self.decay_steps)
print(global_step, self.decay_steps)
cosine_decay = 0.5 * (1 + math.cos(math.pi * global_step / self.decay_steps))
decayed = (1. - self.alpha) * cosine_decay + self.alpha
new_lr = self.learning_rate * decayed
......
......@@ -178,12 +178,12 @@ def GraphDef_Device(graph_def):
"""
from dragon.config import option
if option['device'] != 'None':
supports = {'CPU': 0, 'CUDA': 1, 'CNML': 2}
supports = {'cpu': 0, 'cuda': 1, 'cnml': 2}
device_option = pb.DeviceOption()
device_option.device_type = supports[option['device']]
device_option.device_id = option['device_id']
device_option.random_seed = option['random_seed']
if option['device'] == 'CUDA':
if option['device'] == 'cuda':
if option['use_cudnn']: device_option.engine = 'CUDNN'
graph_def.device_option.CopyFrom(device_option)
......
......@@ -14,17 +14,16 @@ from __future__ import division
from __future__ import print_function
# Import Dynamic Methods
import dragon.vm.torch.ops.builtin
import dragon.vm.torch.ops.tensor
# Import Core Methods
from dragon.vm.torch.tensor import *
from dragon.vm.torch.tensor_uitls import from_numpy
from dragon.vm.torch.c_api import Size
from dragon.vm.torch.c_api import Size, from_numpy
from dragon.vm.torch.serialization import save, load
# Import Subpackages
import dragon.vm.torch.cuda
from dragon.vm.torch.ops import *
from dragon.vm.torch.ops.builtin import *
from dragon.vm.torch.autograd import *
import dragon.vm.torch.nn
import dragon.vm.torch.optim
......
......@@ -14,6 +14,10 @@ from __future__ import division
from __future__ import print_function
import copy
import numpy
import importlib
from dragon.core import mapping, tensor_utils
class Size(tuple):
......@@ -27,30 +31,68 @@ class Size(tuple):
return 'torch.Size([{}])'.format(', '.join([str(s) for s in self]))
class Context(object):
def __init__(self, device_type='CPU', device_id=0):
self._device_type = device_type
self._device_id = device_id
class device(object):
def __init__(self, type='cpu', index=0):
self.type, self.index = type, index
@property
def device_type(self):
return self._device_type
def copy(self):
return copy.deepcopy(self)
@device_type.setter
def device_type(self, value):
self._device_type = value
def __eq__(self, other):
return self.type == other.type and \
self.index == other.index
@property
def device_id(self):
return self._device_id
def __str__(self):
return '{}:{}'.format(self.type, self.index)
@device_id.setter
def device_id(self, value):
self._device_id = value
def __repr__(self):
return 'device(type={}, index={})'.format(self.type, self.index)
def copy(self):
return copy.deepcopy(self)
def __str__(self):
return '{}:{}'.format(
self._device_type, self._device_id)
\ No newline at end of file
def from_numpy(data):
"""Create a tensor from the given numpy array.
Parameters
----------
data : numpy.ndarray
The array with various data type.
Returns
-------
dragon.vm.torch.Tensor
The torch tensor.
"""
if not isinstance(data, numpy.ndarray):
raise TypeError('The data should be a numpy.ndarray.')
if str(data.dtype) not in mapping.TENSOR_TYPE_TO_TORCH_TENSOR:
raise ValueError('Unsupported type({}) to torch tensor.'.format(data.dtype))
module = importlib.import_module('dragon.vm.torch.tensor')
return getattr(module, mapping.TENSOR_TYPE_TO_TORCH_TENSOR[str(data.dtype)])(data)
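# Hedged usage sketch for ``from_numpy``; it assumes 'float32' is a key of
# ``mapping.TENSOR_TYPE_TO_TORCH_TENSOR``, as the docstring above implies.
if __name__ == '__main__':
    example = numpy.zeros((2, 3), dtype='float32')
    tensor = from_numpy(example)       # -> a torch tensor sharing the values
    print(type(tensor).__name__)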
def from_dragon(tensor, own_storage=False):
"""Create a torch tensor from a existing dragon tensor.
Set ``own_storage`` as ``True`` for automatically releasing the storage.
Parameters
----------
tensor : Tensor or str
The dragon tensor.
own_storage : boolean
Whether to release the storage during destruction.
Returns
-------
dragon.vm.torch.Tensor
The torch tensor.
"""
storage = tensor_utils.GetStorage(tensor)
if storage is None: return None
module = importlib.import_module('dragon.vm.torch.tensor')
T = getattr(module, mapping.TENSOR_TYPE_TO_TORCH_TENSOR[storage.dtype])()
T._storage, T._own_storage, T._tensor = storage, own_storage, tensor
T._device = device(*storage.device)
return T
\ No newline at end of file
......@@ -32,10 +32,10 @@ import dragon as dg
import dragon.import_c_api as C
from dragon.config import option
from .c_api import Context
from .c_api import device as _Device
from .jit import JITRecorder, is_jit_enforced
from .autograd.grad_mode import is_grad_enabled
from .tensor import RuntimeTensor
from .tensor import _RuntimeTensor
from .pool import TensorPool
......@@ -66,9 +66,9 @@ def RunOperator(
outputs_name.append(output)
else:
# Legacy mode, a torch tensor is expected
if isinstance(output, Context):
if isinstance(output, _Device):
name = TensorPool.get('${JOIN}' if requires_grad else '${DETACH}')
outputs[ix] = RuntimeTensor(name, ctx=output)
outputs[ix] = _RuntimeTensor(name, device=output)
outputs_name.append(outputs[ix].name)
# Key + Inputs + Outputs => Op
......
......@@ -19,16 +19,15 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy
import dragon
import warnings
from collections import OrderedDict
import numpy as np
import dragon as dg
import dragon.core.proto_utils as pb_utils
import dragon.core.logging as logging
from dragon.core import proto_utils, logging
from dragon.core.scope import get_default_name_scope
from dragon.vm.torch.c_api import Context
from dragon.vm.torch.c_api import device as Device
from dragon.vm.torch.tensor import Tensor, Parameter
from dragon.vm.torch.execution import RunOperator
from dragon.vm.torch.environ import add_submodule, get_module_name
......@@ -39,8 +38,9 @@ class Module(object):
self._modules = OrderedDict()
self._parameters = OrderedDict()
self._buffers = OrderedDict()
self._module_key = self._def = None
self._ctx = Context()
self._device = Device()
self._module_key = None
self._module_def = None
self.training = True
def __getattr__(self, item):
......@@ -106,20 +106,8 @@ class Module(object):
module.state_dict(destination, prefix + name + '.', to_numpy=to_numpy)
return destination
def _load_state_dict_key_mismatch(self, full_name, name, is_missing): pass
def load_state_dict(self, state_dict, strict=True, verbose=True):
if verbose: logging.info('Load the state dict.')
def submodule_key_mismatch(full_name, is_missing):
module = self
names = full_name.split(".")
for module_name in names[:-1]:
if module_name in module._modules:
module = module._modules[module_name]
else:
return
module._load_state_dict_key_mismatch(full_name, names[-1], is_missing)
unexpected = []
own_state = self.state_dict()
for name, param in state_dict.items():
......@@ -133,28 +121,24 @@ class Module(object):
', '.join([str(d) for d in param_shape])))
if isinstance(param, Tensor):
own_state[name].copy_(param)
elif isinstance(param, np.ndarray):
dg.tensor_utils.SetPyArray(own_state[name], param)
elif isinstance(param, numpy.ndarray):
dragon.tensor_utils.SetPyArray(own_state[name], param)
else:
raise ValueError('Expected the type of source state to be either '
'dragon.vm.torch.Tensor or numpy.ndarray, got {}.'.format(type(param)))
if verbose:
logging.info('Tensor({}) loaded, Size: ({})'.format(name,
', '.join([str(d) for d in param_shape])))
else:
unexpected.append(name)
if strict:
missing = set(own_state.keys()) - set(state_dict.keys())
# pass the mismatch info to submodules so that they have a chance to
# raise a custom class-specific error
for name in unexpected:
submodule_key_mismatch(name, False)
for name in missing:
submodule_key_mismatch(name, True)
error_msg = ''
if len(unexpected) > 0:
error_msg += 'Unexpected key(s) in state_dict: {}. '.format(
error_msg += 'Unexpected key(s) in state_dict: {}.\n'.format(
', '.join('"{}"'.format(k) for k in unexpected))
if len(missing) > 0:
error_msg += 'Missing key(s) in state_dict: {}. '.format(
error_msg += 'Missing key(s) in state_dict: {}.'.format(
', '.join('"{}"'.format(k) for k in missing))
if len(error_msg) > 0:
raise KeyError(error_msg)
......@@ -201,7 +185,7 @@ class Module(object):
add_submodule(module, name_v2 if name_v2 else name)
def __call__(self, *args, **kwargs):
with dg.name_scope(get_module_name(self)):
with dragon.name_scope(get_module_name(self)):
return self.forward(*args, **kwargs)
def forward(self, *inputs, **kwargs):
......@@ -209,7 +193,10 @@ class Module(object):
def name_scope(self, remove_separator=True):
scope = get_default_name_scope()
if remove_separator and scope[-1] == '/': scope = scope[:-1]
if remove_separator and \
len(scope) > 0 and \
scope[-1] == '/':
scope = scope[:-1]
return scope
def children(self):
......@@ -281,17 +268,17 @@ class Module(object):
return self
def cpu(self):
self._ctx = Context()
# Remove key and op to re-create a one with new ctx
self._module_key = self._def = None
self._device = Device()
# Remove key and op to re-create a one with new device
self._module_key = self._module_def = None
return self._apply(lambda t: t.cpu(),
lambda m: m.cpu())
def cuda(self, device=None):
if device is None: device = dg.config.GetGPU()
self._ctx = Context('CUDA', device)
# Remove key and op to re-create a one with new ctx
self._module_key = self._def = None
if device is None: device = dragon.config.GetGPU()
self._device = Device('cuda', device)
# Remove key and op to re-create a one with new device
self._module_key = self._module_def = None
return self._apply(lambda t: t.cuda(device),
lambda m: m.cuda(device))
......@@ -312,7 +299,7 @@ class Module(object):
def _gen_module_key(self):
self._module_key = '{}{}'.format(
self.name_scope(False), self._ctx)
self.name_scope(False), self._device)
@property
def module_key(self):
......@@ -320,37 +307,37 @@ class Module(object):
self._gen_module_key()
return self._module_key
def _gen_def(self):
self._def = pb_utils.MakeCXXOperatorDef(
def _gen_module_def(self):
self._module_def = \
proto_utils.MakeCXXOperatorDef(
name='runtime',
uid=self.module_key,
op_type=self.op_meta['op_type'],
device_option=pb_utils.GetDeviceOption(
self._ctx.device_type,
self._ctx.device_id,
device_option=proto_utils.
GetDeviceOption(
self._device.type,
self._device.index,
engine='CUDNN'),
**self.op_meta['arguments']
)
def register_op(self): pass
def register_op(self):
pass
def register_output(self):
return self._ctx.copy()
return self._device.copy()
def unify_devices(self, inputs):
for ix, t in enumerate(inputs):
if t._ctx.device_type != self._ctx.device_type or \
t._ctx.device_id != self._ctx.device_id:
print(self._ctx, self.module_key)
raise ValueError('Module({}) is defined at {}:{}, '
'\nFound Input({}) is at {}:{}.'.format(
if t._device != self._device:
raise ValueError('Module({}) is defined at {}, '
'\nFound Input({}) is at {}.'.format(
self.name_scope(True),
self._ctx.device_type, self._ctx.device_id,
ix, t._ctx.device_type, t._ctx.device_id))
self._device, ix, t._device))
def run(self, inputs, outputs, auto_grad=True, callback=None):
if self._def is None: self._gen_def()
meta = (self.module_key, self._def)
if self._module_def is None: self._gen_module_def()
meta = (self.module_key, self._module_def)
return RunOperator(
inputs, outputs, meta,
auto_grad=auto_grad,
......@@ -366,7 +353,7 @@ class Module(object):
return self.train(False)
def zero_grad(self):
raise NotImplementedError('Deprecated. '
warnings.warn('Module.zero_grad() is deprecated. '
'Use ``torch.optim.Optimizer.zero_grad()`` instead.')
def extra_repr(self):
......
......@@ -21,14 +21,13 @@ from dragon.vm.torch.tensor import Parameter
from .modules.conv import Conv2d, ConvTranspose2d
from .modules.depthwise_conv import DepthwiseConv2d
from .modules.pooling import MaxPool2d, AvgPool2d
from .modules.linear import Linear
from .modules.activation import (
ReLU, LeakyReLU, ELU, SELU,
Tanh, Sigmoid, Softmax,
)
from .modules.linear import Linear
from .modules.loss import (
BCEWithLogitsLoss,
NLLLoss, CrossEntropyLoss,
......@@ -36,11 +35,16 @@ from .modules.loss import (
SigmoidFocalLoss, SoftmaxFocalLoss,
)
from .modules.rnn import (
RNNBase, RNNCellBase,
RNN, LSTM, GRU,
LSTMCell,
)
from .modules.container import Container, Sequential, ModuleList
from .modules.batchnorm import BatchNorm1d, BatchNorm2d, BatchNorm3d
from .modules.groupnorm import GroupNorm1d, GroupNorm2d, GroupNorm3d
from .modules.affine import Affine
from .modules.dropout import Dropout, Dropout2d, Dropout3d
from .modules.dropblock import DropBlock2d
from .modules.rnn import RNNBase, RNN, LSTM, GRU
from . import init
\ No newline at end of file
......@@ -14,7 +14,7 @@ from __future__ import division
from __future__ import print_function
from dragon.vm.torch.nn import Module, Parameter
from dragon.vm.torch.ops.creation import zeros, ones
from dragon.vm.torch.ops.builtin import zeros, ones
class Affine(Module):
......
......@@ -15,7 +15,7 @@ from __future__ import print_function
from dragon.vm.torch.tensor import Tensor
from dragon.vm.torch.nn import Module, Parameter
from dragon.vm.torch.ops.creation import zeros, ones
from dragon.vm.torch.ops.builtin import zeros, ones
from dragon.vm.torch.module import RunOperator
......@@ -62,10 +62,10 @@ class _BatchNorm(Module):
'track_running_stats={track_running_stats}'.format(**self.__dict__)
def make_meta_from_phase(self, phase):
"""Make the custom meta by referring the phase and ctx.
"""Make the custom meta by referring the phase and device.
We extend this method as the original module can only
detect the mutation of ctx(i.e. cpu -> cuda),
detect the mutation of the device (i.e. cpu -> cuda),
but not that of the phase (train -> test).
"""
......@@ -75,8 +75,8 @@ class _BatchNorm(Module):
self._module_key += '/{}'.format(phase)
self.op_meta['arguments']['use_stats'] = 0 \
if phase == 'TRAIN' else 1
self._gen_def()
self.op_metas[phase] = (self._module_key, self._def)
self._gen_module_def()
self.op_metas[phase] = (self._module_key, self._module_def)
if self._module_key is None:
# Init or Context has changed
......
......@@ -15,7 +15,7 @@ from __future__ import print_function
from dragon.vm.torch.tensor import Tensor
from dragon.vm.torch.nn import Module, Parameter
from dragon.vm.torch.ops.creation import zeros, ones
from dragon.vm.torch.ops.builtin import zeros, ones
class _GroupNorm(Module):
......
......@@ -17,16 +17,17 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import warnings
import numbers
import numpy as np
import dragon as dg
import numpy
import dragon
from dragon.vm.torch.tensor import Tensor
from dragon.vm.torch.nn import Module, Parameter
from dragon.operators.rnn.rnn_param import RNNParamSet
from dragon.vm.torch.module import RunOperator
from dragon.vm.torch.autograd.grad_mode import is_grad_enabled
from dragon.vm.torch.ops.builtin import zeros as Zeros, xw_plus_b
class RNNBase(Module):
......@@ -49,8 +50,8 @@ class RNNBase(Module):
if not bias:
raise NotImplementedError('Bias is required.')
if not isinstance(dropout, numbers.Number) or not 0 <= dropout <= 1 or \
isinstance(dropout, bool):
if not isinstance(dropout, numbers.Number) or \
not 0 <= dropout <= 1 or isinstance(dropout, bool):
raise ValueError("dropout should be a number in range [0, 1] "
"representing the probability of an element being "
"zeroed")
......@@ -83,8 +84,8 @@ class RNNBase(Module):
_ = self.module_key
self._module_key += '/{}'.format(phase)
self.op_meta['arguments']['phase'] = phase
self._gen_def()
self.op_metas[phase] = (self._module_key, self._def)
self._gen_module_def()
self.op_metas[phase] = (self._module_key, self._module_def)
if self._module_key is None:
# Init or Context has changed
......@@ -106,45 +107,37 @@ class RNNBase(Module):
self.unify_devices(inputs)
outputs = [self.register_output() for _ in range(2)]
requires_grad = False
for input in inputs:
if input.requires_grad: requires_grad = True
requires_grad = requires_grad and is_grad_enabled()
meta = self.make_meta_from_phase(
'TRAIN' if requires_grad else 'TEST')
'TRAIN' if self.training else 'TEST')
return RunOperator(inputs, outputs, meta)
def _plan_params(self):
if self.mode == 'lstm': gate_size = 4 * self.hidden_size
elif self.mode == 'gru': gate_size = 3 * self.hidden_size
else: gate_size = self.hidden_size
# 1. plan weights
self._matrix_weights = []; self._bias_weights = []
# 1. Plan weights
self._matrix_shape, self._bias_shape = [], []
for layer in range(self.num_layers):
for direction in range(self.num_directions):
layer_input_size = self.input_size if layer == 0 \
else self.hidden_size * self.num_directions
w_names = ['layer_{}/{}/{}'.format(layer, p, 'L' if direction == 0 else 'R')
for p in ('matrix_ih', 'matrix_hh', 'bias_ih', 'bias_hh')]
w_ih = dg.Tensor(name=w_names[0], shape=[gate_size, layer_input_size])
w_hh = dg.Tensor(name=w_names[1], shape=[gate_size, self.hidden_size])
b_ih = dg.Tensor(name=w_names[2], shape=[gate_size,])
b_hh = dg.Tensor(name=w_names[3], shape=[gate_size,])
w_ih_shape = [gate_size, layer_input_size]
w_hh_shape = [gate_size, self.hidden_size]
b_ih_shape, b_hh_shape = [gate_size], [gate_size]
# W (0 ~ 3), R (4 ~ 7)
self._matrix_weights.extend([w_ih, w_hh])
self._matrix_shape.extend([w_ih_shape, w_hh_shape])
# Bw (0 ~ 3), Br (4 ~ 7)
self._bias_weights.extend([b_ih, b_hh])
self._bias_shape.extend([b_ih_shape, b_hh_shape])
# 2. compute total number of parameters
# 2. Compute total number of parameters
self._weights_count = 0
for w in self._matrix_weights + self._bias_weights:
self._weights_count += np.prod(w.shape)
for shape in self._matrix_shape + self._bias_shape:
self._weights_count += numpy.prod(shape)
# 3. register the packed weights
# 3. Register the packed weights
self.weights = Parameter(Tensor(int(self._weights_count)))
# 4. create the initialization grids
# 4. Create the initialization grids
if self.mode == 'lstm': num_params_per_layer = 8
elif self.mode == 'gru': num_params_per_layer = 6
else: num_params_per_layer = 2
......@@ -159,7 +152,7 @@ class RNNBase(Module):
for _ in range(self.num_layers)
]
# 5. set the init flag
# 5. Set the init flag
self._init_params = False
##############################################
......@@ -169,8 +162,8 @@ class RNNBase(Module):
##############################################
def _uniform_init(self, shape, dtype='float32'):
stdv = 1.0 / np.sqrt(self.hidden_size)
return np.random.uniform(-stdv, stdv, shape).astype(dtype)
stdv = 1.0 / numpy.sqrt(self.hidden_size)
return numpy.random.uniform(-stdv, stdv, shape).astype(dtype)
def _orthogonal_init(self, shape, gain=1, dtype='float32'):
num_rows = 1
......@@ -178,16 +171,16 @@ class RNNBase(Module):
num_cols = shape[-1]
flat_shape = (num_cols, num_rows) if num_rows < num_cols \
else (num_rows, num_cols)
W = np.random.randn(*flat_shape)
q, r = np.linalg.qr(W)
W = numpy.random.randn(*flat_shape)
q, r = numpy.linalg.qr(W)
# Make Q uniform
d = np.diag(r)
q *= np.sign(d)
d = numpy.diag(r)
q *= numpy.sign(d)
if num_rows < num_cols: q = q.T
return gain * q.reshape(shape).astype(dtype)
def _zero_init(self, shape, dtype='float32'):
return np.zeros(shape, dtype=dtype)
return numpy.zeros(shape, dtype=dtype)
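# Standalone sanity-check sketch for the property _orthogonal_init relies on:
# the Q factor of a QR decomposition is orthogonal, so Q @ Q.T ~= I.
if __name__ == '__main__':
    q = numpy.linalg.qr(numpy.random.randn(8, 8))[0]
    assert numpy.allclose(q.dot(q.T), numpy.eye(8), atol=1e-6)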
##############################################
# #
......@@ -205,20 +198,19 @@ class RNNBase(Module):
raise ValueError('Unknown param type: ' + type)
def _set_param(self, layer_id, param_id, param_type, param):
if not isinstance(param, Tensor):
if isinstance(param, np.ndarray):
paramT = dg.Tensor('/tmp/rnn_param').Variable()
paramT.set_value(param)
param = paramT
else: raise ValueError('Expected a tensor or numpy array.')
if isinstance(param, numpy.ndarray):
param_temp = dragon.Tensor.Ref('/tmp/rnn_param')
param_temp.set_value(param)
param = param_temp
else: raise ValueError('Expected a numpy array.')
W = self.weights.dragon()
outputs = RNNParamSet([W, param], layer_id, param_id, param_type,
rnn_mode=self.mode, input_size=self.input_size, hidden_size=self.hidden_size,
num_layers=self.num_layers, num_directions=self.num_directions)
for k, v in outputs.expressions.items(): dg.workspace.RunOperator(v)
for k, v in outputs.expressions.items(): dragon.workspace.RunOperator(v)
def _reset_params(self):
np.random.seed(dg.config.GetRandomSeed())
numpy.random.seed(dragon.config.GetRandomSeed())
if self.mode == 'lstm': num_gates = 4
elif self.mode == 'gru': num_gates = 3
else: num_gates = 1
......@@ -233,8 +225,8 @@ class RNNBase(Module):
bias_init = getattr(self, '_{}_init'.format(bias_init))
pseudo_layer_id = layer * self.num_directions + direction
packed_id = pseudo_layer_id * 2 + int(param_id / num_gates)
matrix_shape = self._matrix_weights[packed_id].shape[:]
bias_shape = self._bias_weights[packed_id].shape[:]
matrix_shape = self._matrix_shape[packed_id][:]
bias_shape = self._bias_shape[packed_id][:]
matrix_shape[0] = bias_shape[0] = int(matrix_shape[0] / num_gates)
self._set_param(layer_id=pseudo_layer_id, param_id=param_id,
param_type='matrix', param=matrix_init(matrix_shape))
......@@ -376,3 +368,56 @@ class GRU(RNNBase):
"""
super(GRU, self).__init__('gru', input_size, hidden_size,
num_layers, bias, batch_first, dropout, bidirectional)
class RNNCellBase(Module):
def __init__(self, input_size, hidden_size, bias, num_chunks):
super(RNNCellBase, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.bias = bias
self.weight_ih = Parameter(Tensor(num_chunks * hidden_size, input_size))
self.weight_hh = Parameter(Tensor(num_chunks * hidden_size, hidden_size))
if bias:
self.bias_ih = Parameter(Tensor(num_chunks * hidden_size))
self.bias_hh = Parameter(Tensor(num_chunks * hidden_size))
else:
self.register_parameter('bias_ih', None)
self.register_parameter('bias_hh', None)
self.reset_parameters()
def extra_repr(self):
s = '{input_size}, {hidden_size}'
if 'bias' in self.__dict__ and self.bias is not True:
s += ', bias={bias}'
if 'nonlinearity' in self.__dict__ and self.nonlinearity != "tanh":
s += ', nonlinearity={nonlinearity}'
return s.format(**self.__dict__)
def reset_parameters(self):
stdv = 1.0 / math.sqrt(self.hidden_size)
for weight in self.parameters():
weight.data.uniform_(-stdv, stdv)
class LSTMCell(RNNCellBase):
def __init__(self, input_size, hidden_size, bias=True):
super(LSTMCell, self).__init__(
input_size, hidden_size, bias, num_chunks=4)
self.register_op()
def register_op(self):
self.op_meta = {'op_type': 'LSTMCell', 'arguments': {}}
def forward(self, input, hx=None):
if hx is None:
zeros = Zeros(
input.size(0), self.hidden_size,
dtype=input.dtype, device=input.device)
hx = (zeros, zeros)
wx = xw_plus_b(input, self.weight_ih, self.bias_ih)
wh = xw_plus_b(hx[0], self.weight_hh, self.bias_hh)
inputs = [wx + wh, hx[1]]
self.unify_devices(inputs)
outputs = [self.register_output() for _ in range(2)]
return self.run(inputs, outputs)
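# Hedged usage sketch for the LSTMCell above: one step over a dummy batch.
# The Zeros(...) call signature, the shapes, and the (h, c) return pair are
# assumptions inferred from the forward logic above, not a documented contract.
if __name__ == '__main__':
    cell = LSTMCell(input_size=4, hidden_size=8)
    x = Zeros(2, 4, dtype='float32')   # dummy input of shape (batch=2, input_size=4)
    h, c = cell(x)                     # hx defaults to zero states inside forward()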
\ No newline at end of file
......@@ -8,30 +8,3 @@
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
\ No newline at end of file
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from .creation import (
zeros, zeros_like,
ones, ones_like,
rand, randn,
)
from .arithmetic import (
add, sub, mul, div,
log, exp, sqrt,
maximum, minimum, clamp,
)
from .array import (
squeeze, unsqueeze,
sum, mean, argmin, argmax, max, min, topk,
cat, gather, narrow, one_hot,
)
from .vision import (
nn_resize, bilinear_resize,
roi_pool, roi_align,
)
\ No newline at end of file
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.torch.tensor import Tensor
from dragon.vm.torch.ops.primitive import MakeContext, WrapScalar
from dragon.vm.torch.ops.factory import get_module
from dragon.vm.torch.ops.modules.arithmetic import (
Fundamental, Log, Exp, Sqrt,
Maximum, Minimum, Clamp,
)
def _fundamental(input, value, op='Add', out=None):
if not isinstance(value, Tensor):
value = WrapScalar(value, input.dtype, input._ctx)
ctx = MakeContext(inputs=[input, value])
key = '{}/{}'.format(op, ctx)
module = get_module(Fundamental, key, ctx, op_type=op)
return module.forward(input, value, out)
def _rfundamental(input, value, op='RAdd', out=None):
if not isinstance(value, Tensor):
value = WrapScalar(value, input.dtype, input._ctx)
ctx = MakeContext(inputs=[input, value])
key = '{}/{}'.format(op, ctx)
module = get_module(Fundamental, key, ctx, op_type=op)
return module.forward(value, input, out)
def _maximum(input, other, out=None):
if not isinstance(input, Tensor):
input = WrapScalar(input, other.dtype, other._ctx)
elif not isinstance(other, Tensor):
other = WrapScalar(other, input.dtype, input._ctx)
ctx = MakeContext(inputs=[input])
key = 'Maximum/{}'.format(ctx)
module = get_module(Maximum, key, ctx)
return module.forward(input, other, out)
def _minimum(input, other, out=None):
if not isinstance(input, Tensor):
input = WrapScalar(input, other.dtype, other._ctx)
elif not isinstance(other, Tensor):
other = WrapScalar(other, input.dtype, input._ctx)
ctx = MakeContext(inputs=[input])
key = 'Minimum/{}'.format(ctx)
module = get_module(Minimum, key, ctx)
return module.forward(input, other, out)
def _clamp(input, min=None, max=None, out=None):
ctx = MakeContext(inputs=[input])
key = 'Clamp/{}/min:{}/max:{}'.format(ctx, min, max)
module = get_module(Clamp, key, ctx, min=min, max=max)
return module.forward(input, out)
def _exp(input, out=None):
ctx = MakeContext(inputs=[input])
key = 'Exp/{}'.format(ctx)
module = get_module(Exp, key, ctx)
return module.forward(input, out)
def _log(input, out=None):
ctx = MakeContext(inputs=[input])
key = 'Log/{}'.format(ctx)
module = get_module(Log, key, ctx)
return module.forward(input, out)
def _sqrt(input, out=None):
ctx = MakeContext(inputs=[input])
key = 'Sqrt/{}'.format(ctx)
module = get_module(Sqrt, key, ctx)
return module.forward(input, out)
def add(input, value, out=None):
"""Add the ``input`` and ``value`` into the output tensor.
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
value : dragon.vm.torch.Tensor, number
The value tensor.
out : dragon.vm.torch.Tensor, optional
The output tensor.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
"""
return _fundamental(input, value, out=out, op='Add')
def sub(input, value, out=None):
"""Subtract the ``input`` and ``value`` into the output tensor.
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
value : dragon.vm.torch.Tensor or number
The value tensor.
out : dragon.vm.torch.Tensor, optional
The output tensor.
Returns
-------
torch.Tensor
The output tensor.
"""
return _fundamental(input, value, out=out, op='Sub')
def mul(input, value, out=None):
"""Multiply the ``input`` and ``value`` into the output tensor.
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
value : dragon.vm.torch.Tensor or number
The value tensor.
out : dragon.vm.torch.Tensor, optional
The output tensor.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
"""
return _fundamental(input, value, out=out, op='Mul')
def div(input, value, out=None):
"""Divide the ``input`` and ``value`` into the output tensor.
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
value : dragon.vm.torch.Tensor or number
The value tensor.
out : dragon.vm.torch.Tensor, optional
The output tensor.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
"""
return _fundamental(input, value, out=out, op='Div')
def maximum(input, other, out=None):
"""Return the max value of given two tensors.
Parameters
----------
input : dragon.vm.torch.Tensor or number
The input tensor.
other : dragon.vm.torch.Tensor or number
The input tensor.
out : dragon.vm.torch.Tensor, optional
The output tensor.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
"""
return _maximum(input, other, out)
def minimum(input, other, out=None):
"""Return the min value of given two tensors.
Parameters
----------
input : dragon.vm.torch.Tensor or number
The input tensor.
other : dragon.vm.torch.Tensor or number
The input tensor.
out : dragon.vm.torch.Tensor, optional
The output tensor.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
"""
return _minimum(input, other, out)
def clamp(input, min=None, max=None, out=None):
"""Clamp all elements into the range [min, max].
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
min : number, optional
The min value.
max : number, optional
The max value.
out : dragon.vm.torch.Tensor, optional
The output tensor.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
"""
return _clamp(input, min, max, out)
def log(input, out=None):
"""Compute the natural logarithm of input.
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
out : dragon.vm.torch.Tensor, optional
The output tensor.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
"""
return _log(input, out)
def exp(input, out=None):
"""Compute the exponential of input.
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
out : dragon.vm.torch.Tensor, optional
The output tensor.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
"""
return _exp(input, out)
def sqrt(input, out=None):
"""Compute the square-root of input.
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
out : dragon.vm.torch.Tensor, optional
The output tensor.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
"""
return _sqrt(input, out)
\ No newline at end of file
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.torch.ops.primitive import MakeContext
from dragon.vm.torch.ops.factory import get_module
from dragon.vm.torch.ops.modules.init import (
Fill, RandomUniform, RandomNormal,
)
from dragon.vm.torch.ops.modules.array import (
Reshape, Squeeze, UnSqueeze, Permute,
Indexing, Repeat, Concat, Gather,
Reduce, ArgReduce, OneHot,
)
def reshape(input, shape, shape_like=None):
if shape_like is not None: shape = shape_like.shape
ctx = MakeContext(inputs=[input]); n_dim = len(shape)
key = 'Reshape/{}/n_dim:{}'.format(ctx, n_dim)
module = get_module(Reshape, key, ctx, n_dim=n_dim)
return module.forward(input, shape)
def squeeze(input, dim=None, out=None):
ctx = MakeContext(inputs=[input])
key = 'Squeeze/{}/dim:{}'.format(ctx, dim if dim else 'None')
module = get_module(Squeeze, key, ctx, dim=dim)
return module.forward(input, out=out)
def unsqueeze(input, dim, out=None):
ctx = MakeContext(inputs=[input])
key = 'Unsqueeze/{}/dim:{}'.format(ctx, dim if dim else 'None')
module = get_module(UnSqueeze, key, ctx, dim=dim)
return module.forward(input, out=out)
def _permute(input, perm=None):
ctx = MakeContext(inputs=[input]); n_perm = len(perm) if perm else 0
key = 'Permute/{}/n_perm:{}'.format(ctx, n_perm)
module = get_module(Permute, key, ctx, n_perm=n_perm)
return module.forward(input, perm)
def _repeat(input, times):
ctx = MakeContext(inputs=[input]); n_times = len(times)
key = 'Repeat/{}/n_times:{}'.format(ctx, n_times)
module = get_module(Repeat, key, ctx, n_times=n_times)
return module.forward(input, times)
def _fill(input, shape, value):
ctx = MakeContext(inputs=[input]); n_dim = len(shape)
key = 'Fill/{}/dtype:{}/n_dim:{}/value:{}'.format(
ctx, input.dtype, n_dim, value)
module = get_module(Fill, key, ctx, n_dim=n_dim,
value=value, dtype=input.dtype)
return module.forward(input, shape)
def _uniform(input, shape, low, high):
ctx = MakeContext(inputs=[input]); n_dim = len(shape)
key = 'Uniform/{}/dtype:{}/n_dim:{}/low:{}/high:{}'.format(
ctx, input.dtype, n_dim, float(low), float(high))
module = get_module(RandomUniform, key, ctx, n_dim=n_dim,
low=low, high=high, dtype=input.dtype)
return module.forward(input, shape)
def _normal(input, shape, mean, std):
ctx = MakeContext(inputs=[input]); n_dim = len(shape)
key = 'Normal/{}/dtype:{}/n_dim:{}/mean:{}/std:{}'.format(
ctx, input.dtype, n_dim, float(mean), float(std))
module = get_module(RandomNormal, key, ctx, n_dim=n_dim,
mean=mean, std=std, dtype=input.dtype)
return module.forward(input, shape)
def _reduce(input, operation, dim=None, keepdim=False, out=None):
ctx = MakeContext(inputs=[input])
if dim is None: keepdim = False
key = '{}/{}/dim:{}/keepdim:{}'.format(
operation, ctx, dim, int(keepdim))
module = get_module(Reduce, key, ctx,
operation=operation, dim=dim, keepdim=keepdim)
return module.forward(input, out)
def _arg_reduce(input, operation, dim=None, keepdim=False, top_k=1, out=None):
ctx = MakeContext(inputs=[input])
if dim is None: keepdim = False
key = '{}/{}/dim:{}/keepdim:{}/top_k:{}'.format(
operation, ctx, dim, int(keepdim), top_k)
module = get_module(ArgReduce, key, ctx, operation=operation,
axis=dim, keepdim=keepdim, top_k=top_k)
return module.forward(input, out)
def mean(input, dim=None, keepdim=False, out=None):
"""Return the mean of all elements or elements along the given dim.
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
dim : int, optional
The axis of tensor to compute mean value.
keepdim : bool, optional
Whether the output tensor has dim retained or not.
out : dragon.vm.torch.Tensor, optional
The optional output tensor.
Returns
-------
dragon.vm.torch.Tensor
The mean-reduced tensor.
"""
return _reduce(input, 'MEAN', dim, keepdim, out)
def sum(input, dim=None, keepdim=False, out=None):
"""Return the sum of all elements or elements along the given dim.
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
dim : int, optional
The axis of tensor to compute sum value.
keepdim : bool, optional
Whether the output tensor has dim retained or not.
out : dragon.vm.torch.Tensor, optional
The optional output tensor.
Returns
-------
torch.Tensor
The sum-reduced tensor.
"""
return _reduce(input, 'SUM', dim, keepdim, out)
def argmax(input, dim=None, keepdim=False, out=None):
"""Return the indices of maximum elements along the given axis.
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
dim : int, optional
The axis of tensor to compute the max indices along.
keepdim : bool, optional
Whether the output tensor has dim retained or not.
out : dragon.vm.torch.Tensor, optional
The optional output tensor.
Returns
-------
torch.Tensor
The maximum indices.
"""
return _arg_reduce(input, 'ARGMAX', dim, keepdim, 1, out)
def max(input, dim=None, keepdim=False, out=None):
"""Return the values and indices of maximum elements along the given axis.
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
dim : int, optional
The axis of tensor to compute the max along.
keepdim : bool, optional
Whether the output tensor has dim retained or not.
out : dragon.torch.Tensor, optional
The optional output tensor.
Returns
-------
tuple
The maximum values and indices.
"""
return _arg_reduce(input, 'MAX', dim, keepdim, 1, out)
def argmin(input, dim=None, keepdim=False, out=None):
"""Return the indices of minimum elements along the given axis.
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
dim : int, optional
The axis of tensor to compute the min indices along.
keepdim : bool, optional
Whether the output tensor has dim retained or not.
out : dragon.vm.torch.Tensor, optional
The optional output tensor.
Returns
-------
torch.Tensor
The minimum indices.
"""
return _arg_reduce(input, 'ARGMIN', dim, keepdim, 1, out)
def min(input, dim=None, keepdim=False, out=None):
"""Return the values and indices of maximum elements along the given axis.
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
dim : int, optional
The axis of tensor to compute the min along.
keepdim : bool, optional
Whether the output tensor has dim retained or not.
out : dragon.torch.Tensor, optional
The optional output tensor.
Returns
-------
sequence
The minimum values and indices.
"""
return _arg_reduce(input, 'MIN', dim, keepdim, 1, out)
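# Hedged usage sketch for the reduce wrappers above. Building the input via
# dragon.vm.torch.Tensor(...) and uniform_(...) is an assumption for
# illustration; only the mean/sum/argmax calls demonstrate the documented API.
if __name__ == '__main__':
    import dragon.vm.torch as torch
    x = torch.Tensor(4, 5)
    x.uniform_(0, 1)
    print(mean(x))                      # scalar mean over all elements
    print(sum(x, dim=1, keepdim=True))  # per-row sums, shape (4, 1)
    print(argmax(x, dim=1))             # row-wise indices of the maxima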
def topk(input, k, dim=None, largest=True, sorted=True, out=None):
"""Return the k largest/smallest values and indices along the given axis.
If ``dim`` is not given, the last dimension of the input is chosen.
If ``largest`` is False then the k smallest elements are returned.
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
k : int
The top k.
dim : int, optional
The axis of tensor to select the top-k along.
largest : bool, optional
Whether to return largest or smallest elements.
sorted : bool, optional
Whether to return in the sorted order.
out : dragon.vm.torch.Tensor, optional
The optional output tensor.
Returns
-------
sequence
The values and indices.
"""
operation = 'MAX' if largest else 'MIN'
if dim is None: dim = input.ndimension() - 1
return _arg_reduce(input, operation, dim, True, k, out)
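# Hedged sketch for topk: the 3 largest entries of each row and their indices,
# assuming the same Tensor construction as the sketch above.
if __name__ == '__main__':
    import dragon.vm.torch as torch
    x = torch.Tensor(2, 10)
    x.uniform_(0, 1)
    values, indices = topk(x, 3, dim=1, largest=True, sorted=True)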
def cat(seq, dim=0, out=None):
"""Concatenate the inputs along the given axis.
Parameters
----------
seq : sequence of dragon.vm.torch.Tensor
The sequence.
dim : int, optional
The dim to concatenate.
out : dragon.vm.torch.Tensor, optional
The optional output tensor.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
"""
ctx = MakeContext(inputs=seq, outputs=[out] if out else [])
key = 'Concat/{}/dim:{}'.format(ctx, dim)
module = get_module(Concat, key, ctx, axis=dim)
return module.forward(seq, out)
def gather(input, dim, index, out=None):
"""Gather the input values along the given axis.
Note that this is a TensorFlow-style gather, which takes a vector of indices;
values along the other dimensions are copied automatically.
Parameters
----------
input : dragon.vm.torch.Tensor
The values.
dim : int
The dim to gather.
index : dragon.vm.torch.Tensor
The indices.
out : dragon.vm.torch.Tensor, optional
The optional output tensor.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
"""
ctx = MakeContext(inputs=[input, index], outputs=[out] if out else [])
key = 'Gather/{}/dim:{}'.format(ctx, dim)
module = get_module(Gather, key, ctx, axis=dim)
return module.forward(input, index, out)
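# Hedged sketch for gather: select rows 0 and 2 along dim 0. Building the index
# via from_numpy with an int64 array is an assumption about the dtype mapping.
if __name__ == '__main__':
    import numpy
    import dragon.vm.torch as torch
    x = torch.Tensor(4, 5)
    x.uniform_(0, 1)
    index = torch.from_numpy(numpy.array([0, 2], dtype='int64'))
    rows = gather(x, 0, index)          # -> shape (2, 5)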
def _indexing(input, starts, sizes):
n_starts, n_sizes = len(starts), len(sizes)
ctx = MakeContext(inputs=[input])
key = 'Index/{}/n_starts:{}/n_sizes:{}'.format(ctx, n_starts, n_sizes)
module = get_module(Indexing, key, ctx, n_starts=n_starts, n_sizes=n_sizes)
return module.forward(input, starts, sizes)
def narrow(input, dimension, start, length):
"""Return a new tensor that is a narrowed version of input tensor.
Parameters
----------
input : torch.Tensor
The input tensor.
dimension : int
The dimension to narrow.
start : int
The starting position.
length : int
The distance to the ending position.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
"""
sizes = list(input.shape[:]); starts = [0] * len(sizes)
starts[dimension], sizes[dimension] = start, length
return _indexing(input, starts, sizes)
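# Hedged sketch for narrow: keep length 2 of dim 1 starting at position 1,
# i.e. the numpy-style slice x[:, 1:3]. The Tensor construction is illustrative.
if __name__ == '__main__':
    import dragon.vm.torch as torch
    x = torch.Tensor(4, 5)
    y = narrow(x, 1, 1, 2)              # dimension=1, start=1, length=2 -> (4, 2)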
def one_hot(input, depth):
"""Return a ont hot tensor according to given input.
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
depth : int
The depth of channels.
Returns
-------
dragon.vm.torch.FloatTensor
The output tensor.
"""
ctx = MakeContext(inputs=[input])
key = 'OneHot/{}/depth:{}'.format(ctx, depth)
module = get_module(OneHot, key, ctx, depth=depth)
return module.forward(input)
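# Hedged sketch for one_hot: expand integer class ids into an (N, depth) matrix.
# The int64 construction via from_numpy is an assumption for illustration.
if __name__ == '__main__':
    import numpy
    import dragon.vm.torch as torch
    ids = torch.from_numpy(numpy.array([0, 2, 1], dtype='int64'))
    codes = one_hot(ids, depth=3)       # -> (3, 3), ones at the given ids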
\ No newline at end of file
......@@ -13,143 +13,231 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.core.proto_utils import GetDeviceOption
from dragon.core.tensor_utils import FromTensor
from dragon.vm.torch.tensor import Tensor, Size
from dragon.core import mpi
from dragon.vm.torch.tensor import Tensor, _LeafTensor, _Device
from dragon.vm.torch.ops.primitive import MakeDevice, WrapScalar
from dragon.vm.torch.ops.factory import get_module
from dragon.vm.torch.ops.primitive import MakeContext
from dragon.vm.torch.ops.modules.array import Cast
from dragon.vm.torch.ops.modules.control_flow import Compare
from dragon.vm.torch.ops.modules.arithmetic import (
Fundamental, Log, Exp, Sqrt,
MM, FullyConnected,
Maximum, Minimum, Clamp,
)
from dragon.vm.torch.ops.modules.init import (
Fill, RandomUniform, RandomNormal,
)
from dragon.vm.torch.ops.arithmetic import (
_fundamental, _rfundamental,
_log, _exp, _sqrt, _clamp,
from dragon.vm.torch.ops.modules.array import (
Reshape, Squeeze, UnSqueeze, Permute,
Indexing, Repeat, Concat, Gather,
Reduce, ArgReduce, OneHot, Multinomial,
)
from dragon.vm.torch.ops.array import (
reshape, squeeze, unsqueeze,
_permute, _repeat, _indexing, narrow,
_fill, _uniform, _normal,
_reduce, _arg_reduce,
from dragon.vm.torch.ops.modules.update import (
Accumulate, Collective, Update,
)
from dragon.vm.torch.ops.modules.vision import (
Resize2d, RoIPool, RoIAlign,
)
__all__ = [
'add', 'sub', 'mul', 'div',
'maximum', 'minimum', 'clamp',
'log', 'exp', 'sqrt',
'mm', 'xw_plus_b',
'squeeze', 'unsqueeze',
'mean', 'sum', 'min', 'max', 'topk',
'argmin', 'argmax',
'gt', 'lt', 'eq', 'ge', 'le',
'cat', 'gather', 'narrow',
'one_hot', 'multinomial', 'rand', 'randn',
'zeros', 'zeros_like', 'ones', 'ones_like',
'nn_resize', 'bilinear_resize', 'roi_pool', 'roi_align',
]
##############################################
# #
# BASE #
# Arithmetic #
# #
##############################################
def copy_(self, src, non_blocking=False):
"""Copy the elements from ``src`` into this tensor and return ``self``.
def _fundamental(input, value, op='Add', out=None):
if not isinstance(value, Tensor):
value = WrapScalar(value, input.dtype, input.device)
dev = MakeDevice(inputs=[input, value])
key = '{}/{}'.format(op, dev)
module = get_module(Fundamental, key, dev, op_type=op)
return module.forward(input, value, out)
def _rfundamental(input, value, op='RAdd', out=None):
if not isinstance(value, Tensor):
value = WrapScalar(value, input.dtype, input.device)
dev = MakeDevice(inputs=[input, value])
key = '{}/{}'.format(op, dev)
module = get_module(Fundamental, key, dev, op_type=op)
return module.forward(value, input, out)
def add(input, value, out=None):
"""Add the ``input`` and ``value`` into the output tensor.
Parameters
----------
src : dragon.vm.torch.Tensor
The source tensor.
non_blocking : boolean
Whether to copy asynchronously between CPU and GPU.
input : dragon.vm.torch.Tensor
The input tensor.
value : dragon.vm.torch.Tensor, number
The value tensor.
out : dragon.vm.torch.Tensor, optional
The output tensor.
Returns
-------
dragon.vm.torch.Tensor
The ``self`` tensor.
The output tensor.
"""
# Copy memory
FromTensor(
src, GetDeviceOption(src._ctx.device_type, src._ctx.device_id),
self.name, GetDeviceOption(self._ctx.device_type, self._ctx.device_id))
# Transfer the static shape if necessary
self._static_shape = src.size() \
if self._static_shape else None
return self
return _fundamental(input, value, out=out, op='Add')
Tensor.copy_ = copy_
def sub(input, value, out=None):
"""Subtract the ``input`` and ``value`` into the output tensor.
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
value : dragon.vm.torch.Tensor or number
The value tensor.
out : dragon.vm.torch.Tensor, optional
The output tensor.
##############################################
# #
# INITIALIZER #
# #
##############################################
Returns
-------
torch.Tensor
The output tensor.
"""
return _fundamental(input, value, out=out, op='Sub')
def fill_(self, value):
"""Fill self tensor with the specified value.
def mul(input, value, out=None):
"""Multiply the ``input`` and ``value`` into the output tensor.
Parameters
----------
value : numerical type
input : dragon.vm.torch.Tensor
The input tensor.
value : dragon.vm.torch.Tensor or number
The value tensor.
out : dragon.vm.torch.Tensor, optional
The output tensor.
Returns
-------
dragon.vm.torch.Tensor
The self.
The output tensor.
"""
return _fill(self, self.shape, value)
return _fundamental(input, value, out=out, op='Mul')
def uniform_(self, low=0, high=1):
"""Fill self tensor with the specified uniform distribution.
def div(input, value, out=None):
"""Divide the ``input`` and ``value`` into the output tensor.
Parameters
----------
low : numerical type
The lower bound.
high : numerical type
The higher bound.
input : dragon.vm.torch.Tensor
The input tensor.
value : dragon.vm.torch.Tensor or number
The value tensor.
out : dragon.vm.torch.Tensor, optional
The output tensor.
Returns
-------
dragon.vm.torch.Tensor
The self.
The output tensor.
"""
return _uniform(self, self.shape, low, high)
return _fundamental(input, value, out=out, op='Div')
def normal_(self, mean=0, std=1):
"""Fill self tensor with the specified normal distribution.
def maximum(input, other, out=None):
"""Return the max value of given two tensors.
Parameters
----------
mean : numerical type
The mean(mu) of normal distribution.
std : numerical type
The std(sigma) of normal distribution.
input : dragon.vm.torch.Tensor or number
The input tensor.
other : dragon.vm.torch.Tensor or number
The input tensor.
out : dragon.vm.torch.Tensor, optional
The output tensor.
Returns
-------
dragon.vm.torch.Tensor
The self.
The output tensor.
"""
return _normal(self, self.shape, mean, std)
if not isinstance(input, Tensor):
input = WrapScalar(input, other.dtype, other.device)
elif not isinstance(other, Tensor):
other = WrapScalar(other, input.dtype, input.device)
dev = MakeDevice(inputs=[input])
key = 'Maximum/{}'.format(dev)
module = get_module(Maximum, key, dev)
return module.forward(input, other, out)
Tensor.fill_ = fill_
Tensor.uniform_ = uniform_
Tensor.normal_ = normal_
def minimum(input, other, out=None):
"""Return the min value of given two tensors.
Parameters
----------
input : dragon.vm.torch.Tensor or number
The input tensor.
other : dragon.vm.torch.Tensor or number
The input tensor.
out : dragon.vm.torch.Tensor, optional
The output tensor.
##############################################
# #
# ARITHMETIC #
# #
##############################################
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
"""
if not isinstance(input, Tensor):
input = WrapScalar(input, other.dtype, other.device)
elif not isinstance(other, Tensor):
other = WrapScalar(other, input.dtype, input.device)
dev = MakeDevice(inputs=[input])
key = 'Minimum/{}'.format(dev)
module = get_module(Minimum, key, dev)
return module.forward(input, other, out)
def add(self, value):
"""See ``torch.add()``
def clamp(input, min=None, max=None, out=None):
"""Clamp all elements into the range [min, max].
Parameters
----------
value : dragon.vm.torch.Tensor, int or float
The value tensor.
input : dragon.vm.torch.Tensor
The input tensor.
min : number, optional
The min value.
max : number, optional
The max value.
out : dragon.vm.torch.Tensor, optional
The output tensor.
Returns
-------
......@@ -157,37 +245,43 @@ def add(self, value):
The output tensor.
"""
return _fundamental(self, value, op='Add')
dev = MakeDevice(inputs=[input])
key = 'Clamp/{}/min:{}/max:{}'.format(dev, min, max)
module = get_module(Clamp, key, dev, min=min, max=max)
return module.forward(input, out)
def add_(self, value):
"""Inplace of ``torch.add()``
def log(input, out=None):
"""Compute the natural logarithm of input.
Parameters
----------
value : dragon.vm.torch.Tensor, int or float
The value tensor.
input : dragon.vm.torch.Tensor
The input tensor.
out : dragon.vm.torch.Tensor, optional
The output tensor.
Returns
-------
dragon.vm.torch.Tensor
The self.
The output tensor.
"""
return _fundamental(self, value, out=self, op='Add')
def radd(self, value):
return _rfundamental(self, value, op='RAdd')
dev = MakeDevice(inputs=[input])
key = 'Log/{}'.format(dev)
module = get_module(Log, key, dev)
return module.forward(input, out)
def sub(self, value):
"""Subtract the ``self`` and ``value`` into the output tensor.
def exp(input, out=None):
"""Compute the exponential of input.
Parameters
----------
value : dragon.vm.torch.Tensor, int or float
The value tensor.
input : dragon.vm.torch.Tensor
The input tensor.
out : dragon.vm.torch.Tensor, optional
The output tensor.
Returns
-------
......@@ -195,399 +289,865 @@ def sub(self, value):
The output tensor.
"""
return _fundamental(self, value, op='Sub')
dev = MakeDevice(inputs=[input])
key = 'Exp/{}'.format(dev)
module = get_module(Exp, key, dev)
return module.forward(input, out)
def sub_(self, value):
"""Inplace of ``Tensor.sub()``
def sqrt(input, out=None):
"""Compute the square-root of input.
Parameters
----------
value : torch.Tensor, int or float
The value tensor.
input : dragon.vm.torch.Tensor
The input tensor.
out : dragon.vm.torch.Tensor, optional
The output tensor.
Returns
-------
torch.Tensor
The self.
dragon.vm.torch.Tensor
The output tensor.
"""
return _fundamental(self, value, out=self, op='Sub')
dev = MakeDevice(inputs=[input])
key = 'Sqrt/{}'.format(dev)
module = get_module(Sqrt, key, dev)
return module.forward(input, out)
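# Hedged usage sketch chaining the arithmetic wrappers above; the Tensor(2, 3)
# construction and the uniform_ fill are assumptions for illustration only.
if __name__ == '__main__':
    a = Tensor(2, 3)
    a.uniform_(0, 1)
    b = clamp(add(a, 1.0), min=0.0, max=1.5)   # element-wise a + 1, then clip
    c = log(exp(sqrt(b)))                      # chaining the unary wrappers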
def rsub(self, value):
return _rfundamental(self, value, op='RSub')
def mul(self, value):
"""Multiply the ``self`` and ``value`` into the output tensor.
def mm(mat1, mat2, transA=False, transB=False, out=None):
"""Performs a matrix multiplication of the matrices ``mat1`` and ``mat2.``
Parameters
----------
value : torch.Tensor, int or float
The value tensor.
mat1 : dragon.vm.torch.Tensor
The matrix A.
mat2 : dragon.vm.torch.Tensor
The matrix B.
transA : boolean
Whether to transpose the ``mat1``.
transB : boolean
Whether to transpose the ``mat2``.
Returns
-------
torch.Tensor
dragon.vm.torch.Tensor
The output tensor.
"""
return _fundamental(self, value, op='Mul')
dev = MakeDevice(inputs=[mat1, mat2])
key = 'Matmul/{}/transA:{}/transB:{}'.format(dev, transA, transB)
module = get_module(MM, key, dev, transA=transA, transB=transB)
return module.forward(mat1, mat2, out)
def mul_(self, value):
"""Inplace of ``Tensor.mul()``
def xw_plus_b(x, w, bias=None, transW=True, out=None):
"""Compute *matmul(x, w) + bias.*``
Parameters
----------
value : torch.Tensor, int or float
The value tensor.
x : dragon.vm.torch.Tensor
The input tensor.
w : dragon.vm.torch.Tensor
The weight tensor.
bias : dragon.vm.torch.Tensor, optional
The bias tensor.
transW : boolean
Whether to transpose the ``w``.
Returns
-------
torch.Tensor
The self.
dragon.vm.torch.Tensor
The output tensor.
"""
return _fundamental(self, value, out=self, op='Mul')
dev = MakeDevice(inputs=[x, w] + ([bias] if bias else []))
key = 'FullyConnected/{}/transW:{}'.format(dev, transW)
module = get_module(FullyConnected, key, dev, transW=transW)
return module.forward(x, w, bias, out)
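# Hedged sketch: xw_plus_b as a linear-layer style forward. With transW=True the
# weight is laid out as (out_features, in_features); the shapes are assumptions.
if __name__ == '__main__':
    x = Tensor(8, 16)                 # (batch, in_features)
    w = Tensor(32, 16)                # (out_features, in_features)
    b = Tensor(32)
    y = xw_plus_b(x, w, b)            # -> (8, 32)
    z = mm(x, w, transB=True)         # the same product without the bias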
##############################################
# #
# Array #
# #
##############################################
def rmul(self, value):
return _rfundamental(self, value, op='RMul')
def _reshape(input, shape, shape_like=None):
if shape_like is not None: shape = shape_like.shape
dev = MakeDevice(inputs=[input]); n_dim = len(shape)
key = 'Reshape/{}/n_dim:{}'.format(dev, n_dim)
module = get_module(Reshape, key, dev, n_dim=n_dim)
return module.forward(input, shape)
def _permute(input, perm):
dev = MakeDevice(inputs=[input]); n_perm = len(perm)
key = 'Permute/{}/n_perm:{}'.format(dev, n_perm)
module = get_module(Permute, key, dev, n_perm=n_perm)
return module.forward(input, perm)
def _repeat(input, times):
dev = MakeDevice(inputs=[input]); n_times = len(times)
key = 'Repeat/{}/n_times:{}'.format(dev, n_times)
module = get_module(Repeat, key, dev, n_times=n_times)
return module.forward(input, times)
def _fill(input, shape, value):
dev = MakeDevice(inputs=[input]); n_dim = len(shape)
key = 'Fill/{}/dtype:{}/n_dim:{}/value:{}'.format(
dev, input.dtype, n_dim, value)
module = get_module(Fill, key, dev, n_dim=n_dim,
value=value, dtype=input.dtype)
return module.forward(input, shape)
def _uniform(input, shape, low, high):
dev = MakeDevice(inputs=[input]); n_dim = len(shape)
key = 'Uniform/{}/dtype:{}/n_dim:{}/low:{}/high:{}'.format(
dev, input.dtype, n_dim, float(low), float(high))
module = get_module(
RandomUniform, key, dev, n_dim=n_dim,
low=low, high=high, dtype=input.dtype)
return module.forward(input, shape)
def _normal(input, shape, mean, std):
dev = MakeDevice(inputs=[input]); n_dim = len(shape)
key = 'Normal/{}/dtype:{}/n_dim:{}/mean:{}/std:{}'.format(
dev, input.dtype, n_dim, float(mean), float(std))
module = get_module(
RandomNormal, key, dev, n_dim=n_dim,
mean=mean, std=std, dtype=input.dtype)
return module.forward(input, shape)
def _reduce(input, operation, dim=None, keepdim=False, out=None):
if dim is None: keepdim = False
dev = MakeDevice(inputs=[input])
key = '{}/{}/dim:{}/keepdim:{}'.format(
operation, dev, dim, int(keepdim))
module = get_module(
Reduce, key, dev, operation=operation,
dim=dim, keepdim=keepdim)
return module.forward(input, out)
def _arg_reduce(input, operation, dim=None, keepdim=False, top_k=1, out=None):
if dim is None: keepdim = False
dev = MakeDevice(inputs=[input])
key = '{}/{}/dim:{}/keepdim:{}/top_k:{}'.format(
operation, dev, dim, int(keepdim), top_k)
module = get_module(
ArgReduce, key, dev,
operation=operation, axis=dim,
keepdim=keepdim, top_k=top_k)
return module.forward(input, out)
def _indexing(input, starts, sizes):
n_starts, n_sizes = len(starts), len(sizes)
dev = MakeDevice(inputs=[input])
key = 'Index/{}/n_starts:{}/n_sizes:{}'.format(dev, n_starts, n_sizes)
module = get_module(Indexing, key, dev, n_starts=n_starts, n_sizes=n_sizes)
return module.forward(input, starts, sizes)
def _compare(input, other, operation, out=None):
if not isinstance(other, Tensor):
other = WrapScalar(other, input.dtype, input.device)
dev = MakeDevice(inputs=[input, other])
key = 'Compare/{}/{}'.format(operation, dev)
module = get_module(Compare, key, dev, operation=operation)
return module.forward(input, other, out)
def div(self, value):
"""Divide the ``self`` and ``value`` into the output tensor.
def squeeze(input, dim=None, out=None):
"""Return a tensor with all the dimensions of input of size 1 removed.
Parameters
----------
value : torch.Tensor, int or float
The value tensor.
dim : int
The optional dim to remove.
out : dragon.vm.torch.Tensor, optional
The output tensor.
Returns
-------
torch.Tensor
The output tensor.
dragon.vm.torch.Tensor
The new tensor.
"""
return _fundamental(self, value, op='Div')
dev = MakeDevice(inputs=[input])
key = 'Squeeze/{}/dim:{}'.format(dev, dim if dim else 'None')
module = get_module(Squeeze, key, dev, dim=dim)
return module.forward(input, out=out)
def div_(self, value):
"""Inplace of ``Tensor.div()``
def unsqueeze(input, dim, out=None):
"""Returns a tensor with a dimension of size 1 inserted at the specified position.
Parameters
----------
value : torch.Tensor, int or float
The value tensor.
dim : int
The dim to insert.
out : dragon.vm.torch.Tensor, optional
The output tensor.
Returns
-------
torch.Tensor
The self.
dragon.vm.torch.Tensor
The new tensor.
"""
return _fundamental(self, value, out=self, op='Div')
dev = MakeDevice(inputs=[input])
key = 'Unsqueeze/{}/dim:{}'.format(dev, dim if dim else 'None')
module = get_module(UnSqueeze, key, dev, dim=dim)
return module.forward(input, out=out)
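# A sketch of squeeze/unsqueeze: only the dimensions change, not the data.
# Package-level exposure via ``dragon.vm.torch`` is an assumption.
import dragon.vm.torch as torch

x = torch.ones(1, 3, 1, 2)
y = torch.squeeze(x)           # all size-1 dims removed -> (3, 2)
z = torch.unsqueeze(y, dim=0)  # a new leading axis      -> (1, 3, 2)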
def mean(input, dim=None, keepdim=False, out=None):
"""Return the mean of all elements or elements along the given dim.
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
dim : int, optional
The axis of tensor to compute mean value.
keepdim : bool, optional
Whether the output tensor has dim retained or not.
out : dragon.vm.torch.Tensor, optional
The optional output tensor.
Returns
-------
dragon.vm.torch.Tensor
The mean-reduced tensor.
def rdiv(self, value):
return _rfundamental(self, value, op='RDiv')
"""
return _reduce(input, 'MEAN', dim, keepdim, out)
def clamp(self, min=None, max=None):
"""Return a tensor that all elements are clamped into the range [min, max].
def sum(input, dim=None, keepdim=False, out=None):
"""Return the sum of all elements or elements along the given dim.
Parameters
----------
min : number, optional
The min value.
max : number, optional
The max value.
input : dragon.vm.torch.Tensor
The input tensor.
dim : int, optional
The axis of tensor to compute sum value.
keepdim : bool, optional
Whether the output tensor has dim retained or not.
out : dragon.vm.torch.Tensor, optional
The optional output tensor.
Returns
-------
torch.Tensor
The output tensor.
The sum-reduced tensor.
"""
return _clamp(self, min, max)
return _reduce(input, 'SUM', dim, keepdim, out)
def clamp_(self, min=None, max=None):
"""Clamp all elements are clamped into the range [min, max].
def argmax(input, dim=None, keepdim=False, out=None):
"""Return the indices of maximum elements along the given axis.
Parameters
----------
min : number, optional
The min value.
max : number, optional
The max value.
input : dragon.vm.torch.Tensor
The input tensor.
dim : int, optional
The axis of tensor to compute the maximum indices along.
keepdim : bool, optional
Whether the output tensor has dim retained or not.
out : dragon.vm.torch.Tensor, optional
The optional output tensor.
Returns
-------
torch.Tensor
The output tensor.
The maximum indices.
"""
return _clamp(self, min, max, self)
return _arg_reduce(input, 'ARGMAX', dim, keepdim, 1, out)
def log(self):
"""Compute the natural logarithm of this tensor.
def max(input, dim=None, keepdim=False, out=None):
"""Return the values and indices of maximum elements along the given axis.
Parameters
----------
None
input : dragon.vm.torch.Tensor
The input tensor.
dim : int, optional
The axis of tensor to compute the maximum values along.
keepdim : bool, optional
Whether the output tensor has dim retained or not.
out : dragon.vm.torch.Tensor, optional
The optional output tensor.
Returns
-------
torch.Tensor
The log tensor.
tuple
The maximum values and indices.
"""
return _log(self)
return _arg_reduce(input, 'MAX', dim, keepdim, 1, out)
def exp(self):
"""Compute the exponential of this tensor.
def argmin(input, dim=None, keepdim=False, out=None):
"""Return the indices of minimum elements along the given axis.
Parameters
----------
None
input : dragon.vm.torch.Tensor
The input tensor.
dim : int, optional
The axis of tensor to compute the minimum indices along.
keepdim : bool, optional
Whether the output tensor has dim retained or not.
out : dragon.vm.torch.Tensor, optional
The optional output tensor.
Returns
-------
torch.Tensor
The exp tensor.
The minimum indices.
"""
return _exp(self)
return _arg_reduce(input, 'ARGMIN', dim, keepdim, 1, out)
def sqrt(self):
"""Compute the square-root of this tensor.
def min(input, dim=None, keepdim=False, out=None):
"""Return the values and indices of maximum elements along the given axis.
Parameters
----------
None
input : dragon.vm.torch.Tensor
The input tensor.
dim : int, optional
The axis of tensor to compute the minimum values along.
keepdim : bool, optional
Whether the output tensor has dim retained or not.
out : dragon.vm.torch.Tensor, optional
The optional output tensor.
Returns
-------
torch.Tensor
The sqrt tensor.
sequence
The minimum values and indices.
"""
return _sqrt(self)
return _arg_reduce(input, 'MIN', dim, keepdim, 1, out)
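# A sketch of the reduce ops above; per their docstrings ``max``/``min``
# return a (values, indices) pair, while ``argmax``/``argmin`` return only
# indices. Package-level exposure of these names is an assumption.
import dragon.vm.torch as torch

x = torch.rand(4, 5)
s = torch.sum(x, dim=1)                  # shape: (4,)
m = torch.mean(x, dim=1, keepdim=True)   # shape: (4, 1)
idx = torch.argmax(x, dim=1)             # indices along dim 1
values, indices = torch.max(x, dim=1)    # values and indices along dim 1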
Tensor.add = add
Tensor.add_ = add_
Tensor.__radd__ = radd
Tensor.sub = sub
Tensor.sub_ = sub_
Tensor.__rsub__ = rsub
Tensor.mul = mul
Tensor.mul_ = mul_
Tensor.__rmul__ = rmul
Tensor.div = div
Tensor.div_ = div_
Tensor.__rdiv__ = rdiv
Tensor.__rtruediv__ = rdiv
Tensor.clamp = clamp
Tensor.clamp_ = clamp_
Tensor.log = log
Tensor.exp = exp
Tensor.sqrt = sqrt
def topk(input, k, dim=None, largest=True, sorted=True, out=None):
"""Return the k largest/smallest values and indices along the given axis.
If ``dim`` is not given, the last dimension of the input is chosen.
##############################################
# #
# ARRAY #
# #
##############################################
If ``largest`` is False then the k smallest elements are returned.
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
k : int
The top k.
dim : int, optional
The axis of tensor to retrieve the top-k elements along.
largest : bool, optional
Whether to return largest or smallest elements.
sorted : bool, optional
Whether to return in the sorted order.
out : dragon.vm.torch.Tensor, optional
The optional output tensor.
Returns
-------
sequence
The values and indices.
"""
operation = 'MAX' if largest else 'MIN'
if dim is None: dim = input.ndimension() - 1
return _arg_reduce(input, operation, dim, True, k, out)
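# A sketch of ``topk``; per the docstring it returns the values and indices
# of the k largest (or, with largest=False, smallest) entries along ``dim``.
import dragon.vm.torch as torch

x = torch.rand(3, 10)
values, indices = torch.topk(x, k=3, dim=1)                  # 3 largest per row
small_v, small_i = torch.topk(x, k=3, dim=1, largest=False)  # 3 smallest per row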
def _squeeze(self, dim=None):
"""Returns a tensor with all the dimensions of input of size 1 removed.
def gt(input, other, out=None):
"""Compute *input* > *other* element-wise.
Parameters
----------
dim : int
The optional dim to remove.
input : dragon.vm.torch.Tensor
The input tensor.
other : dragon.vm.torch.Tensor, number
The other tensor.
out : dragon.vm.torch.Tensor, optional
The optional output tensor.
Returns
-------
dragon.vm.torch.Tensor
The output byte tensor.
"""
return _compare(input, other, 'GT', out)
def ge(input, other, out=None):
"""Compute *input* >= *other* element-wise.
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
other : dragon.vm.torch.Tensor, number
The other tensor.
out : dragon.vm.torch.Tensor, optional
The optional output tensor.
Returns
-------
torch.Tensor
The new tensor.
dragon.vm.torch.Tensor
The output byte tensor.
"""
return squeeze(self, dim=dim)
return _compare(input, other, 'GE', out)
def _squeeze_(self, dim=None):
"""Inplace of ``Tensor.squeeze()``
def lt(input, other, out=None):
"""Compute *input* < *other* element-wise.
Parameters
----------
dim : int
The optional dim to remove.
input : dragon.vm.torch.Tensor
The input tensor.
other : dragon.vm.torch.Tensor, number
The other tensor.
out : dragon.vm.torch.Tensor, optional
The optional output tensor.
Returns
-------
torch.Tensor
The self.
dragon.vm.torch.Tensor
The output byte tensor.
"""
return squeeze(self, dim=dim, out=self)
return _compare(input, other, 'LT', out)
def _unsqueeze(self, dim):
"""Returns a tensor with a dimension of size 1 inserted at the specified position.
def le(input, other, out=None):
"""Compute *input* <= *other* element-wise.
Parameters
----------
dim : int
The dim to insert.
input : dragon.vm.torch.Tensor
The input tensor.
other : dragon.vm.torch.Tensor, number
The other tensor.
out : dragon.vm.torch.Tensor, optional
The optional output tensor.
Returns
-------
torch.Tensor
The new tensor.
dragon.vm.torch.Tensor
The output byte tensor.
"""
return unsqueeze(self, dim=dim)
return _compare(input, other, 'LE', out)
def _unsqueeze_(self, dim=None):
"""Inplace of ``Tensor.unsqueeze()``
def eq(input, other, out=None):
"""Compute *input* == *other* element-wise.
Parameters
----------
dim : int
The optional dim to insert.
input : dragon.vm.torch.Tensor
The input tensor.
other : dragon.vm.torch.Tensor, number
The other tensor.
out : dragon.vm.torch.Tensor, optional
The optional output tensor.
Returns
-------
torch.Tensor
The self.
dragon.vm.torch.Tensor
The output byte tensor.
"""
return unsqueeze(self, dim=dim, out=self)
return _compare(input, other, 'EQ', out)
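# A sketch of the comparison ops: each returns a uint8 (byte) mask, and a
# scalar operand is wrapped into a shared tensor via WrapScalar.
import dragon.vm.torch as torch

x = torch.rand(2, 3)
mask = torch.gt(x, 0.5)    # same result as the overloaded x > 0.5
same = torch.eq(x, x)      # a mask of ones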
def view(self, *args):
return reshape(self, shape=args)
def cat(seq, dim=0, out=None):
"""Concatenate the inputs along the given axis.
Parameters
----------
seq : sequence of dragon.vm.torch.Tensor
The sequence.
dim : int, optional
The dim to concatenate.
out : dragon.vm.torch.Tensor, optional
The optional output tensor.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
"""
dev = MakeDevice(inputs=seq, outputs=[out] if out else [])
key = 'Concat/{}/dim:{}'.format(dev, dim)
module = get_module(Concat, key, dev, axis=dim)
return module.forward(seq, out)
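# A sketch of ``cat``: the inputs must agree on every axis except ``dim``.
import dragon.vm.torch as torch

a = torch.ones(2, 3)
b = torch.zeros(2, 3)
c = torch.cat([a, b], dim=0)   # shape: (4, 3)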
def view_as(self, other):
if not isinstance(other, Tensor):
raise ValueError('The other should be a torch tensor.')
return reshape(self, shape=None, shape_like=other)
def gather(input, dim, index, out=None):
"""Gather the input values along the given axis.
Note that it is a TensorFlow-style gather, which takes a vector of indices;
values along the other dimensions are copied automatically.
Parameters
----------
input : dragon.vm.torch.Tensor
The values.
dim : int
The dim to gather.
index : dragon.vm.torch.Tensor
The indices.
out : dragon.vm.torch.Tensor, optional
The optional output tensor.
def permute(self, dims=None):
return _permute(self, dims)
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
"""
dev = MakeDevice(
inputs=[input, index],
outputs=[out] if out else [])
key = 'Gather/{}/dim:{}'.format(dev, dim)
module = get_module(Gather, key, dev, axis=dim)
return module.forward(input, index, out)
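# A sketch of ``gather`` in its TensorFlow-style form: ``index`` is a vector
# taken along ``dim`` and the remaining dimensions are copied through, so the
# output shape is input.shape[:dim] + index.shape + input.shape[dim + 1:].
# The integer dtype chosen for the indices is an assumption.
import dragon.vm.torch as torch

x = torch.rand(5, 4)
index = torch.Tensor([0, 2], dtype='int64')
rows = torch.gather(x, 0, index)   # shape: (2, 4)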
def repeat(self, *sizes):
if len(sizes) == 1 and \
isinstance(sizes[0], Size):
sizes = sizes[0]
return _repeat(self, sizes)
def narrow(input, dimension, start, length):
"""Return a new tensor that is a narrowed version of input tensor.
def indexing(self, starts, ends):
return _indexing(self, starts, ends)
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
dimension : int
The dimension to narrow.
start : int
The starting position.
length : int
The distance to the ending position.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
def _narrow(self, dimension, start, length):
return narrow(self, dimension, start, length)
"""
sizes = list(input.shape[:]); starts = [0] * len(sizes)
starts[dimension], sizes[dimension] = start, length
return _indexing(input, starts, sizes)
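# A sketch of ``narrow``, which slices ``length`` entries starting at
# ``start`` along ``dimension`` via the indexing primitive above.
import dragon.vm.torch as torch

x = torch.rand(6, 4)
y = torch.narrow(x, 0, 1, 3)   # rows 1..3 -> shape: (3, 4)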
def mean(self, dim=None, keepdim=False):
return _reduce(self, 'MEAN', dim, keepdim)
def one_hot(input, depth):
"""Return a ont hot tensor according to given input.
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
depth : int
The depth of channels.
def sum(self, dim=None, keepdim=False):
return _reduce(self, 'SUM', dim, keepdim)
Returns
-------
dragon.vm.torch.FloatTensor
The output tensor.
"""
dev = MakeDevice(inputs=[input])
key = 'OneHot/{}/depth:{}'.format(dev, depth)
module = get_module(OneHot, key, dev, depth=depth)
return module.forward(input)
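# A sketch of ``one_hot``: class indices are expanded into a float one-hot
# encoding of width ``depth``. The integer label dtype and the package-level
# exposure of ``one_hot`` are assumptions.
import dragon.vm.torch as torch

labels = torch.Tensor([0, 2, 1], dtype='int64')
codes = torch.one_hot(labels, depth=3)   # shape: (3, 3)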
def max(self, dim=None, keepdim=False):
return _arg_reduce(self, 'MAX', dim, keepdim)
def multinomial(input, num_samples, normalize=False, out=None):
"""Return a tensor where each row contains ``num_samples``,
sampled from the multinomial distribution.
def min(self, dim=None, keepdim=False):
return _arg_reduce(self, 'MIN', dim, keepdim)
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
num_samples : int
The number of samples.
normalize : boolean, optional, default=False
Whether to normalize the inputs.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
Tensor.squeeze = _squeeze
Tensor.squeeze_ = _squeeze_
Tensor.unsqueeze = _unsqueeze
Tensor.unsqueeze_ = _unsqueeze_
Tensor.view = view
Tensor.view_as = view_as
Tensor.permute = permute
Tensor.repeat = repeat
Tensor.mean = mean
Tensor.sum = sum
Tensor.max = max
Tensor.min = min
Tensor.narrow = _narrow
Tensor._indexing = indexing
"""
dev = MakeDevice(inputs=[input])
key = 'Multinomial/{}/num_samples:{}/normalize:{}'.format(
dev, num_samples, normalize)
module = get_module(
Multinomial, key, dev,
num_samples=num_samples,
normalize=normalize)
return module.forward(input, out)
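# A sketch of ``multinomial``: every row of the output holds ``num_samples``
# sampled indices; normalize=True asks the op to normalize the inputs first.
import dragon.vm.torch as torch

probs = torch.rand(2, 5)
samples = torch.multinomial(probs, num_samples=3, normalize=True)   # (2, 3)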
##############################################
# #
# TYPE #
# Creation #
# #
##############################################
def _type_to(input, dtype='float32', inplace=False):
if dtype == input.dtype: return input
ctx = MakeContext(inputs=[input])
key = 'Cast/{}/dtype:{}/inplace:{}'.format(
ctx, dtype, 'true' if inplace else 'false')
module = get_module(Cast, key, ctx, dtype=dtype, inplace=inplace)
return module.forward(input)
def _get_leaf_tensor(sizes, kwargs):
return _LeafTensor(sizes,
requires_grad=kwargs['requires_grad'] \
if 'requires_grad' in kwargs else False,
dtype=kwargs.get('dtype', 'float32'),
device=kwargs.get('device', _Device()))
def _type(self, dtype=None):
"""Return the data type of this tensor.
def zeros(*sizes, **kwargs):
"""Return a float tensor with values of ``0``.
Parameters
----------
sizes : tuple, list or int
The sizes indicating the shape of the output tensor.
out : dragon.vm.torch.Tensor
The optional output tensor.
If ``dtype`` is not ``None``, cast ``self`` to the new tensor.
Returns
-------
vm.torch.FloatTensor
The output tensor.
"""
out = kwargs['out'] if 'out' in kwargs else None
if out is None: out = _get_leaf_tensor(sizes, kwargs)
return _fill(out, shape=sizes, value=0)
def zeros_like(input, out=None, **kwargs):
"""Return a float tensor with values of ``0``, shape as the input.
Parameters
----------
input : dragon.vm.torch.Tensor
The tensor for indicating shape.
out : dragon.vm.torch.Tensor
The optional output tensor.
Returns
-------
vm.torch.FloatTensor
The output tensor.
"""
if not hasattr(input, 'shape'):
raise ValueError('Input does not have the shape attribute.')
if out is None: out = _get_leaf_tensor(input.shape, kwargs)
return _fill(out, shape=input.shape, value=0)
def ones(*sizes, **kwargs):
"""Return a float tensor with values of ``1``.
Parameters
----------
dtype : str
The specified type.
sizes : tuple, list or int
The sizes indicating the shape of the output tensor.
out : dragon.vm.torch.Tensor
The optional output tensor.
Returns
-------
str or torch.Tensor
The data type or the new tensor.
vm.torch.FloatTensor
The output tensor.
"""
if dtype is None:
return 'torch.' + self._type2str()
out = kwargs['out'] if 'out' in kwargs else None
if out is None: out = _get_leaf_tensor(sizes, kwargs)
return _fill(out, shape=sizes, value=1)
def ones_like(input, out=None, **kwargs):
"""Return a float tensor with values of ``1``, shape as the input.
Parameters
----------
input : dragon.vm.torch.Tensor
The tensor for indicating shape.
out : dragon.vm.torch.Tensor
The optional output tensor.
Returns
-------
vm.torch.FloatTensor
The output tensor.
"""
if not hasattr(input, 'shape'):
raise ValueError('Input does not have the shape attribute.')
if out is None: out = _get_leaf_tensor(input.shape, kwargs)
return _fill(out, shape=input.shape, value=1)
def rand(*sizes, **kwargs):
"""Return a float tensor with a uniform distribution of U(0, 1).
Parameters
----------
sizes : tuple, list or int
The sizes indicating the shape of the output tensor.
out : dragon.vm.torch.Tensor
The optional output tensor.
Returns
-------
vm.torch.FloatTensor
The output tensor.
"""
out = kwargs['out'] if 'out' in kwargs else None
if out is None: out = _get_leaf_tensor(sizes, kwargs)
return _uniform(out, sizes, low=0, high=1)
def randn(*sizes, **kwargs):
"""Return a float tensor with a normal distribution of N(0, 1).
Parameters
----------
sizes : tuple, list or int
The sizes indicating the shape of the output tensor.
out : dragon.vm.torch.Tensor
The optional output tensor.
Returns
-------
vm.torch.FloatTensor
The output tensor.
"""
out = kwargs['out'] if 'out' in kwargs else None
if out is None: out = _get_leaf_tensor(sizes, kwargs)
return _normal(out, sizes, mean=0, std=1)
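# A sketch of the creation ops above; extra keyword arguments such as
# dtype/device/requires_grad are forwarded through _get_leaf_tensor.
import dragon.vm.torch as torch

a = torch.zeros(2, 3)
b = torch.ones(2, 3, dtype='float32')
u = torch.rand(4)                        # U(0, 1)
n = torch.randn(4, requires_grad=True)   # N(0, 1)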
##############################################
# #
# Update #
# #
##############################################
def _accumulate(grads):
if len(grads) == 0: return
if not isinstance(grads, (list, tuple)): grads = [grads]
dev = MakeDevice(inputs=grads)
key = 'Accumulate/{}/alpha:1./beta:1.'.format(dev)
module = get_module(Accumulate, key, dev)
return module.forward(grads)
def _allreduce(grads):
if not isinstance(grads, (list, tuple)): grads = [grads]
dev = MakeDevice(inputs=grads)
mode = mpi.GetParallelMode() + '_ALLREDUCE'
key = 'Collective/{}/{}'.format(dev, mode.lower())
module = get_module(Collective, key, dev, mode=mode)
return module.forward(grads)
def _update(param, grad, op_type, slot,
lr_mult=1.0, decay_mult=1.0):
dev = MakeDevice(inputs=[param])
key = '{}/{}/{}/{}'.format(op_type, dev, slot, param.name)
module = get_module(Update, key, dev, op_type=op_type,
lr_mult=lr_mult, decay_mult=decay_mult, slot=slot)
return module.forward(param, grad)
##############################################
# #
# Vision #
# #
##############################################
def _resize_2d(input, op_type, dsize, fx, fy):
if dsize is None:
if fx < 0 or fy < 0:
raise ValueError('Set fx and fy if dsize is None.')
else:
return _type_to(self, dtype=dtype)
Tensor.type = _type
Tensor.half = lambda self: _type_to(self, dtype='float16', inplace=False)
Tensor.half_ = lambda self: _type_to(self, dtype='float16', inplace=True)
Tensor.float = lambda self: _type_to(self, dtype='float32', inplace=False)
Tensor.float_ = lambda self: _type_to(self, dtype='float32', inplace=True)
Tensor.double = lambda self: _type_to(self, dtype='float64', inplace=False)
Tensor.double_ = lambda self: _type_to(self, dtype='float64', inplace=True)
Tensor.byte = lambda self: _type_to(self, dtype='uint8', inplace=False)
Tensor.byte_ = lambda self: _type_to(self, dtype='uint8', inplace=True)
Tensor.char = lambda self: _type_to(self, dtype='int8', inplace=False)
Tensor.char_ = lambda self: _type_to(self, dtype='int8', inplace=True)
Tensor.int = lambda self: _type_to(self, dtype='int32', inplace=False)
Tensor.int_ = lambda self: _type_to(self, dtype='int32', inplace=True)
Tensor.long = lambda self: _type_to(self, dtype='int64', inplace=False)
Tensor.long_ = lambda self: _type_to(self, dtype='int64', inplace=True)
\ No newline at end of file
if len(dsize) != 2:
raise ValueError('The dsize should be a list with 2 elements.')
if dsize is None and (fy == -1.0 or fx == -1.0):
raise RuntimeError('Either dsize or fx/fy should be specified.')
dev = MakeDevice(inputs=[input])
key = '{}/{}/dsize:{}/fx:{}/fy:{}'.format(
op_type, dev, '2' if dsize else 'none', fx, fy)
module = get_module(Resize2d, key, dev,
op_type=op_type, dsize=dsize, fx=fx, fy=fy)
return module.forward(input, dsize)
def nn_resize(input, dsize, fx=-1.0, fy=-1.0):
return _resize_2d(input, 'NNResize', dsize, fx, fy)
def bilinear_resize(input, dsize, fx=-1.0, fy=-1.0):
return _resize_2d(input, 'BilinearResize', dsize, fx, fy)
def roi_pool(feature, rois, pooled_h, pooled_w, spatial_scale):
dev = MakeDevice(inputs=[feature])
key = 'RoIPool/{}/pool_h:{}/pool_w:{}/spatial_scale:{}'.format(
dev, pooled_h, pooled_w, spatial_scale)
module = get_module(
RoIPool, key, dev,
pooled_h=pooled_h, pooled_w=pooled_w,
spatial_scale=spatial_scale)
return module.forward(feature, rois)
def roi_align(feature, rois, pooled_h, pooled_w,
spatial_scale, sampling_ratio=2):
dev = MakeDevice(inputs=[feature])
key = 'RoIAlign/{}/pool_h:{}/pool_w:{}/' \
'spatial_scale:{}/sampling_ratio:{}'.format(
dev, pooled_h, pooled_w, spatial_scale, sampling_ratio)
module = get_module(
RoIAlign, key, dev,
pooled_h=pooled_h, pooled_w=pooled_w,
spatial_scale=spatial_scale,
sampling_ratio=sampling_ratio)
return module.forward(feature, rois)
\ No newline at end of file
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.torch.tensor import LeafTensor
from dragon.vm.torch.ops.array import (
_fill, _uniform, _normal,
)
def zeros(*sizes, **kwargs):
"""Return a float tensor with values of ``0``.
Parameters
----------
sizes : tuple, list or int
The sizes indicating the shape of the output tensor.
out : dragon.vm.torch.Tensor
The optional output tensor.
Returns
-------
vm.torch.FloatTensor
The output tensor.
"""
out = kwargs['out'] if 'out' in kwargs else None
if out is None:
out = LeafTensor(sizes, requires_grad=kwargs['requires_grad'] \
if 'requires_grad' in kwargs else False)
return _fill(out, shape=sizes, value=0)
def zeros_like(input, out=None, **kwargs):
"""Return a float tensor with values of ``0``, shape as the input.
Parameters
----------
input : dragon.vm.torch.Tensor
The tensor for indicating shape.
out : dragon.vm.torch.Tensor
The optional output tensor.
Returns
-------
vm.torch.FloatTensor
The output tensor.
"""
if not hasattr(input, 'shape'):
raise ValueError('Input does not have the shape attribute.')
if out is None:
out = LeafTensor(input.shape, requires_grad=kwargs['requires_grad'] \
if 'requires_grad' in kwargs else False)
return _fill(out, shape=input.shape, value=0)
def ones(*sizes, **kwargs):
"""Return a float tensor with values of ``1``.
Parameters
----------
sizes : tuple, list or int
The sizes indicating the shape of the output tensor.
out : dragon.vm.torch.Tensor
The optional output tensor.
Returns
-------
vm.torch.FloatTensor
The output tensor.
"""
out = kwargs['out'] if 'out' in kwargs else None
if out is None:
out = LeafTensor(sizes, requires_grad=kwargs['requires_grad'] \
if 'requires_grad' in kwargs else False)
return _fill(out, shape=sizes, value=1)
def ones_like(input, out=None, **kwargs):
"""Return a float tensor with values of ``1``, shape as the input.
Parameters
----------
input : dragon.vm.torch.Tensor
The tensor for indicating shape.
out : dragon.vm.torch.Tensor
The optional output tensor.
Returns
-------
vm.torch.FloatTensor
The output tensor.
"""
if not hasattr(input, 'shape'):
raise ValueError('Input does not have the shape attribute.')
if out is None:
out = LeafTensor(input.shape, requires_grad=kwargs['requires_grad'] \
if 'requires_grad' in kwargs else False)
return _fill(out, shape=input.shape, value=1)
def rand(*sizes, **kwargs):
"""Return a float tensor with a uniform distribution of U(0, 1).
Parameters
----------
sizes : tuple, list or int
The sizes indicating the shape of the output tensor.
out : dragon.vm.torch.Tensor
The optional output tensor.
Returns
-------
vm.torch.FloatTensor
The output tensor.
"""
out = kwargs['out'] if 'out' in kwargs else None
if out is None:
out = LeafTensor(sizes, requires_grad=kwargs['requires_grad'] \
if 'requires_grad' in kwargs else False)
return _uniform(out, sizes, low=0, high=1)
def randn(*sizes, **kwargs):
"""Return a float tensor with a normal distribution of N(0, 1).
Parameters
----------
sizes : tuple, list or int
The sizes indicating the shape of the output tensor.
out : dragon.vm.torch.Tensor
The optional output tensor.
Returns
-------
vm.torch.FloatTensor
The output tensor.
"""
out = kwargs['out'] if 'out' in kwargs else None
if out is None:
out = LeafTensor(sizes, requires_grad=kwargs['requires_grad'] \
if 'requires_grad' in kwargs else False)
return _normal(out, sizes, mean=0, std=1)
\ No newline at end of file
......@@ -21,13 +21,13 @@ def has_module(key):
return key in _GLOBAL_TORCH_BUILTIN_MODULES
def register_module(cls, key, ctx, **kwargs):
def register_module(cls, key, dev, **kwargs):
global _GLOBAL_TORCH_BUILTIN_MODULES
module = cls(key, ctx, **kwargs)
module = cls(key, dev, **kwargs)
_GLOBAL_TORCH_BUILTIN_MODULES[key] = module
return module
def get_module(cls, key, ctx, **kwargs):
def get_module(cls, key, dev, **kwargs):
if has_module(key): return _GLOBAL_TORCH_BUILTIN_MODULES[key]
return register_module(cls, key, ctx, **kwargs)
\ No newline at end of file
return register_module(cls, key, dev, **kwargs)
\ No newline at end of file
......@@ -17,8 +17,8 @@ from dragon.vm.torch.ops.modules.base import BaseModule
class Fundamental(BaseModule):
def __init__(self, key, ctx, **kwargs):
super(Fundamental, self).__init__(key, ctx, **kwargs)
def __init__(self, key, dev, **kwargs):
super(Fundamental, self).__init__(key, dev, **kwargs)
self.op_type = kwargs.get('op_type', 'Add')
self.register_op()
......@@ -32,8 +32,8 @@ class Fundamental(BaseModule):
class Maximum(BaseModule):
def __init__(self, key, ctx, **kwargs):
super(Maximum, self).__init__(key, ctx, **kwargs)
def __init__(self, key, dev, **kwargs):
super(Maximum, self).__init__(key, dev, **kwargs)
self.register_op()
def register_op(self):
......@@ -46,8 +46,8 @@ class Maximum(BaseModule):
class Minimum(BaseModule):
def __init__(self, key, ctx, **kwargs):
super(Minimum, self).__init__(key, ctx, **kwargs)
def __init__(self, key, dev, **kwargs):
super(Minimum, self).__init__(key, dev, **kwargs)
self.register_op()
def register_op(self):
......@@ -60,8 +60,8 @@ class Minimum(BaseModule):
class Clamp(BaseModule):
def __init__(self, key, ctx, **kwargs):
super(Clamp, self).__init__(key, ctx, **kwargs)
def __init__(self, key, dev, **kwargs):
super(Clamp, self).__init__(key, dev, **kwargs)
self.min = kwargs.get('min', None)
self.max = kwargs.get('max', None)
if self.min is not None: self.min = float(self.min)
......@@ -84,8 +84,8 @@ class Clamp(BaseModule):
class Log(BaseModule):
def __init__(self, key, ctx, **kwargs):
super(Log, self).__init__(key, ctx, **kwargs)
def __init__(self, key, dev, **kwargs):
super(Log, self).__init__(key, dev, **kwargs)
self.register_op()
def register_op(self):
......@@ -98,8 +98,8 @@ class Log(BaseModule):
class Exp(BaseModule):
def __init__(self, key, ctx, **kwargs):
super(Exp, self).__init__(key, ctx, **kwargs)
def __init__(self, key, dev, **kwargs):
super(Exp, self).__init__(key, dev, **kwargs)
self.register_op()
def register_op(self):
......@@ -112,8 +112,8 @@ class Exp(BaseModule):
class Sqrt(BaseModule):
def __init__(self, key, ctx, **kwargs):
super(Sqrt, self).__init__(key, ctx, **kwargs)
def __init__(self, key, dev, **kwargs):
super(Sqrt, self).__init__(key, dev, **kwargs)
self.register_op()
def register_op(self):
......@@ -123,3 +123,43 @@ class Sqrt(BaseModule):
inputs = [x]; self.unify_devices(inputs)
outputs = [y] if y else [self.register_output()]
return self.run(inputs, outputs)
class MM(BaseModule):
def __init__(self, key, dev, **kwargs):
super(MM, self).__init__(key, dev, **kwargs)
self.transA = kwargs.get('transA', False)
self.transB = kwargs.get('transB', False)
self.register_op()
def register_op(self):
self.op_meta = {
'op_type': 'Matmul',
'arguments': {
'transA': self.transA,
'transB': self.transB,
}}
def forward(self, x1, x2, y):
inputs = [x1, x2]; self.unify_devices(inputs)
outputs = [y] if y else [self.register_output()]
return self.run(inputs, outputs)
class FullyConnected(BaseModule):
def __init__(self, key, dev, **kwargs):
super(FullyConnected, self).__init__(key, dev, **kwargs)
self.transW = kwargs.get('transW', True)
self.register_op()
def register_op(self):
self.op_meta = {
'op_type': 'FullyConnected',
'arguments': {'transW': self.transW},
}
def forward(self, x, w, b=None, y=None):
inputs = [x, w] + ([b] if b else [])
self.unify_devices(inputs)
outputs = [y] if y else [self.register_output()]
return self.run(inputs, outputs)
\ No newline at end of file
......@@ -14,7 +14,7 @@ from __future__ import division
from __future__ import print_function
from dragon.vm.torch.autograd import no_grad
from dragon.vm.torch.tensor import ReferenceTensor
from dragon.vm.torch.tensor import _ReferenceTensor
from dragon.vm.torch.ops.modules.base import BaseModule
......@@ -25,8 +25,8 @@ class Indexing(BaseModule):
and the resulting memory is deep copied.
"""
def __init__(self, key, ctx, **kwargs):
super(Indexing, self).__init__(key, ctx, **kwargs)
def __init__(self, key, dev, **kwargs):
super(Indexing, self).__init__(key, dev, **kwargs)
self.n_starts = kwargs.get('n_starts', 0)
self.n_sizes = kwargs.get('n_sizes', 0)
self.register_op()
......@@ -62,8 +62,8 @@ class Concat(BaseModule):
Concatenate the inputs along the given axis.
"""
def __init__(self, key, ctx, **kwargs):
super(Concat, self).__init__(key, ctx, **kwargs)
def __init__(self, key, dev, **kwargs):
super(Concat, self).__init__(key, dev, **kwargs)
self.axis = kwargs.get('axis', 0)
self.register_op()
......@@ -90,8 +90,8 @@ class Gather(BaseModule):
input.shape[:axis] + indices.shape + input.shape[axis + 1:]
"""
def __init__(self, key, ctx, **kwargs):
super(Gather, self).__init__(key, ctx, **kwargs)
def __init__(self, key, dev, **kwargs):
super(Gather, self).__init__(key, dev, **kwargs)
self.axis = kwargs.get('axis', 0)
self.register_op()
......@@ -111,8 +111,8 @@ class Gather(BaseModule):
class Reduce(BaseModule):
def __init__(self, key, ctx, **kwargs):
super(Reduce, self).__init__(key, ctx, **kwargs)
def __init__(self, key, dev, **kwargs):
super(Reduce, self).__init__(key, dev, **kwargs)
self.operation = kwargs.get('operation', 'SUM')
self.dim = kwargs.get('dim', None)
self.keepdim = kwargs.get('keepdim', True)
......@@ -135,8 +135,8 @@ class Reduce(BaseModule):
class ArgReduce(BaseModule):
def __init__(self, key, ctx, **kwargs):
super(ArgReduce, self).__init__(key, ctx, **kwargs)
def __init__(self, key, dev, **kwargs):
super(ArgReduce, self).__init__(key, dev, **kwargs)
self.operation = kwargs.get('operation', 'ARGMAX')
self.axis = kwargs.get('axis', None)
self.keepdim = kwargs.get('keepdim', True)
......@@ -179,8 +179,8 @@ class ArgReduce(BaseModule):
class Reshape(BaseModule):
def __init__(self, key, ctx, **kwargs):
super(Reshape, self).__init__(key, ctx, **kwargs)
def __init__(self, key, dev, **kwargs):
super(Reshape, self).__init__(key, dev, **kwargs)
self.n_dim = kwargs.get('n_dim', 0)
self.register_op()
......@@ -201,14 +201,14 @@ class Reshape(BaseModule):
def forward(self, x, shape):
inputs = [x]; self.unify_devices(inputs)
outputs = [ReferenceTensor(x)]
outputs = [_ReferenceTensor(x)]
callback = lambda A: self.update_arguments(A, shape)
return self.run(inputs, outputs, callback=callback)
class Squeeze(BaseModule):
def __init__(self, key, ctx, **kwargs):
super(Squeeze, self).__init__(key, ctx, **kwargs)
def __init__(self, key, dev, **kwargs):
super(Squeeze, self).__init__(key, dev, **kwargs)
self.dim = kwargs.get('dim', None)
self.register_op()
......@@ -220,13 +220,13 @@ class Squeeze(BaseModule):
def forward(self, x, out=None):
inputs = [x]; self.unify_devices(inputs)
outputs = [out] if out else [ReferenceTensor(x)]
outputs = [out] if out else [_ReferenceTensor(x)]
return self.run(inputs, outputs)
class UnSqueeze(BaseModule):
def __init__(self, key, ctx, **kwargs):
super(UnSqueeze, self).__init__(key, ctx, **kwargs)
def __init__(self, key, dev, **kwargs):
super(UnSqueeze, self).__init__(key, dev, **kwargs)
self.dim = kwargs.get('dim', None)
self.register_op()
......@@ -238,13 +238,13 @@ class UnSqueeze(BaseModule):
def forward(self, x, out=None):
inputs = [x]; self.unify_devices(inputs)
outputs = [out] if out else [ReferenceTensor(x)]
outputs = [out] if out else [_ReferenceTensor(x)]
return self.run(inputs, outputs)
class Permute(BaseModule):
def __init__(self, key, ctx, **kwargs):
super(Permute, self).__init__(key, ctx, **kwargs)
def __init__(self, key, dev, **kwargs):
super(Permute, self).__init__(key, dev, **kwargs)
self.n_perm = kwargs.get('n_perm', 0)
self.register_op()
......@@ -270,8 +270,8 @@ class Permute(BaseModule):
class Repeat(BaseModule):
def __init__(self, key, ctx, **kwargs):
super(Repeat, self).__init__(key, ctx, **kwargs)
def __init__(self, key, dev, **kwargs):
super(Repeat, self).__init__(key, dev, **kwargs)
self.n_times = kwargs.get('n_times', 0)
self.register_op()
......@@ -298,8 +298,8 @@ class Repeat(BaseModule):
class OneHot(BaseModule):
def __init__(self, key, ctx, **kwargs):
super(OneHot, self).__init__(key, ctx, **kwargs)
def __init__(self, key, dev, **kwargs):
super(OneHot, self).__init__(key, dev, **kwargs)
self.depth = kwargs.get('depth', 1)
self.register_op()
......@@ -318,8 +318,8 @@ class OneHot(BaseModule):
class Cast(BaseModule):
def __init__(self, key, ctx, **kwargs):
super(Cast, self).__init__(key, ctx, **kwargs)
def __init__(self, key, dev, **kwargs):
super(Cast, self).__init__(key, dev, **kwargs)
self.dtype = kwargs.get('dtype', 'float32')
self.inplace = kwargs.get('inplace', False)
self.register_op()
......@@ -344,3 +344,25 @@ class Cast(BaseModule):
with no_grad():
y = self.run([], [x])
return y
class Multinomial(BaseModule):
def __init__(self, key, dev, **kwargs):
super(Multinomial, self).__init__(key, dev, **kwargs)
self.num_samples = kwargs.get('num_samples', 1)
self.normalize = kwargs.get('normalize', False)
self.register_op()
def register_op(self):
self.op_meta = {
'op_type': 'Multinomial',
'arguments': {
'num_samples': self.num_samples,
'normalize': self.normalize,
},
}
def forward(self, x, y):
inputs = [x]; self.unify_devices(inputs)
outputs = [y] if y else [self.register_output()]
return self.run(inputs, outputs)
\ No newline at end of file
......@@ -16,17 +16,17 @@ from __future__ import print_function
import numpy as np
import dragon as dg
from dragon.core import proto_utils as pb_utils
from dragon.core import proto_utils
from dragon.vm.torch.module import Module
class BaseModule(Module):
def __init__(self, key, ctx, **kwargs):
def __init__(self, key, dev, **kwargs):
super(BaseModule, self).__init__()
self._module_key = key
self._ctx = ctx
self._args_dev = pb_utils.GetDeviceOption(
'CPU').SerializeToString()
self._device = dev
self._args_dev = proto_utils.\
GetDeviceOption('cpu').SerializeToString()
def set_argument_i64(self, name, value):
dg.C.FeedTensor(name, np.array(
......
......@@ -17,8 +17,8 @@ from dragon.vm.torch.ops.modules.base import BaseModule
class Copy(BaseModule):
def __init__(self, key, ctx, **kwargs):
super(Copy, self).__init__(key, ctx, **kwargs)
def __init__(self, key, dev, **kwargs):
super(Copy, self).__init__(key, dev, **kwargs)
self.register_op()
def register_op(self):
......@@ -27,3 +27,23 @@ class Copy(BaseModule):
def forward(self, dst, src):
outputs = [dst]; self.unify_devices(outputs)
return self.run([src], outputs)
class Compare(BaseModule):
def __init__(self, key, dev, **kwargs):
super(Compare, self).__init__(key, dev, **kwargs)
self.operation = kwargs.get('operation', 'NONE')
self.register_op()
def register_op(self):
self.op_meta = {
'op_type': 'Compare',
'arguments': {
'operation': self.operation,
'to_uint8': True,
}}
def forward(self, x1, x2, y):
inputs = [x1, x2]; self.unify_devices(inputs)
outputs = [y] if y else [self.register_output()]
return self.run(inputs, outputs)
\ No newline at end of file
......@@ -17,8 +17,8 @@ from dragon.vm.torch.ops.modules.base import BaseModule
class _InitModule(BaseModule):
def __init__(self, key, ctx, **kwargs):
super(_InitModule, self).__init__(key, ctx, **kwargs)
def __init__(self, key, dev, **kwargs):
super(_InitModule, self).__init__(key, dev, **kwargs)
self.n_dim = kwargs.get('n_dim', 0)
self.dtype = kwargs.get('dtype', 'float32')
......@@ -33,8 +33,8 @@ class _InitModule(BaseModule):
class Fill(_InitModule):
def __init__(self, key, ctx, **kwargs):
super(Fill, self).__init__(key, ctx, **kwargs)
def __init__(self, key, dev, **kwargs):
super(Fill, self).__init__(key, dev, **kwargs)
self.value = kwargs.get('value', 0.0)
self.register_op()
......@@ -53,8 +53,8 @@ class Fill(_InitModule):
class RandomNormal(_InitModule):
def __init__(self, key, ctx, **kwargs):
super(RandomNormal, self).__init__(key, ctx, **kwargs)
def __init__(self, key, dev, **kwargs):
super(RandomNormal, self).__init__(key, dev, **kwargs)
self.mean = kwargs.get('mean', 0.0)
self.std = kwargs.get('std', 1.0)
self.register_op()
......@@ -75,8 +75,8 @@ class RandomNormal(_InitModule):
class RandomUniform(_InitModule):
def __init__(self, key, ctx, **kwargs):
super(RandomUniform, self).__init__(key, ctx, **kwargs)
def __init__(self, key, dev, **kwargs):
super(RandomUniform, self).__init__(key, dev, **kwargs)
self.low = kwargs.get('low', 0.0)
self.high = kwargs.get('high', 1.0)
self.register_op()
......
......@@ -18,8 +18,8 @@ from dragon.vm.torch.ops.modules.base import BaseModule
class Update(BaseModule):
def __init__(self, key, ctx, **kwargs):
super(Update, self).__init__(key, ctx, **kwargs)
def __init__(self, key, dev, **kwargs):
super(Update, self).__init__(key, dev, **kwargs)
self.op_type = kwargs.get('op_type', 'Update')
self.lr_mult = kwargs.get('lr_mult', 1.0)
self.decay_mult = kwargs.get('decay_mult', 1.0)
......@@ -42,8 +42,8 @@ class Update(BaseModule):
class Collective(BaseModule):
def __init__(self, key, ctx, **kwargs):
super(Collective, self).__init__(key, ctx, **kwargs)
def __init__(self, key, dev, **kwargs):
super(Collective, self).__init__(key, dev, **kwargs)
self.mode = kwargs.get('mode', None)
if self.mode is None:
raise ValueError('Got invalid collective mode: {}'.format(self.mode))
......@@ -71,8 +71,8 @@ class Collective(BaseModule):
class Accumulate(BaseModule):
def __init__(self, key, ctx, **kwargs):
super(Accumulate, self).__init__(key, ctx, **kwargs)
def __init__(self, key, dev, **kwargs):
super(Accumulate, self).__init__(key, dev, **kwargs)
self.register_op()
def register_op(self):
......
......@@ -17,8 +17,8 @@ from dragon.vm.torch.ops.modules.base import BaseModule
class Resize2d(BaseModule):
def __init__(self, key, ctx, **kwargs):
super(Resize2d, self).__init__(key, ctx, **kwargs)
def __init__(self, key, dev, **kwargs):
super(Resize2d, self).__init__(key, dev, **kwargs)
self.op_type = kwargs.get('op_type', 'NNResize')
self.dsize = kwargs.get('dsize', None)
self.fx = kwargs.get('fx', None)
......@@ -51,8 +51,8 @@ class Resize2d(BaseModule):
class RoIPool(BaseModule):
def __init__(self, key, ctx, **kwargs):
super(RoIPool, self).__init__(key, ctx, **kwargs)
def __init__(self, key, dev, **kwargs):
super(RoIPool, self).__init__(key, dev, **kwargs)
self.pool_h = kwargs.get('pooled_h', 0)
self.pool_w = kwargs.get('pooled_w', 0)
self.spatial_scale = kwargs.get('spatial_scale', 1.0)
......@@ -74,8 +74,8 @@ class RoIPool(BaseModule):
class RoIAlign(BaseModule):
def __init__(self, key, ctx, **kwargs):
super(RoIAlign, self).__init__(key, ctx, **kwargs)
def __init__(self, key, dev, **kwargs):
super(RoIAlign, self).__init__(key, dev, **kwargs)
self.pool_h = kwargs.get('pooled_h', 0)
self.pool_w = kwargs.get('pooled_w', 0)
self.spatial_scale = kwargs.get('spatial_scale', 1.0)
......
......@@ -17,34 +17,33 @@ import numpy as np
import dragon as dg
from dragon.vm.torch.tensor import *
from dragon.vm.torch.c_api import Context
from dragon.vm.torch.c_api import device as _Device
def UnifyDevices(tensors, key='Inputs'):
device_types = [t._ctx.device_type for t in tensors]
device_ids = [0]
if len(set(device_types)) != 1:
types, indices = [t.device.type for t in tensors], [0]
if len(set(types)) != 1:
raise ValueError('{} from different device type: [{}].'
.format(key, ', '.join(device_types)))
if device_types[0] == 'CUDA':
device_ids = [t._ctx.device_id for t in tensors]
if len(set(device_ids)) != 1:
.format(key, ', '.join(types)))
if types[0] == 'cuda':
indices = [t.device.index for t in tensors]
if len(set(indices)) != 1:
raise ValueError('{} from different cuda device: [{}].'
.format(key, ', '.join([str(d) for d in device_ids])))
return Context(device_types[0], device_ids[0])
.format(key, ', '.join([str(d) for d in indices])))
return _Device(types[0], indices[0])
def MakeContext(inputs=(), outputs=()):
def MakeDevice(inputs=(), outputs=()):
# Case #1: [], [] -> CPU
# Case #2: [...], [] -> Refer Inputs
# Case #3: [], [...] -> Refer Outputs
# Case #4: [...], [...] -> Refer Outputs
if len(outputs) > 0: return UnifyDevices(outputs, 'Outputs')
if len(inputs) > 0: return UnifyDevices(inputs, 'Inputs')
return Context()
return _Device()
def WrapScalar(scalar, dtype, ctx):
def WrapScalar(scalar, dtype, device):
# We use (DType + Value) to hash different scalars
# Setting a Tensor with same DType and shape will not deconstruct it
if 'float' in dtype: scalar = float(scalar)
......@@ -52,6 +51,6 @@ def WrapScalar(scalar, dtype, ctx):
name = '/share/scalar/{}/{}'.format(dtype, str(scalar))
if not dg.workspace.HasTensor(name):
dg.workspace.FeedTensor(name, np.array(scalar, dtype=dtype))
t = Tensor(name=name, dtype=dtype, ctx=ctx, own_storage=False)
t = Tensor(name=name, dtype=dtype, device=device, own_storage=False)
t.requires_grad = False
return t
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.torch.tensor import Tensor
from dragon.vm.torch.ops.factory import get_module
from dragon.vm.torch.ops.primitive import MakeDevice
from dragon.vm.torch.ops.modules.array import Cast
from dragon.vm.torch.ops.builtin import (
_fill, _uniform, _normal, multinomial,
_fundamental, _rfundamental,
log, exp, sqrt, clamp,
_reshape, squeeze, unsqueeze,
_permute, _repeat, _indexing, narrow,
mean, sum, max, min,
gt, lt, eq, ge, le,
)
def _type_to(input, dtype='float32', inplace=False):
if dtype == input.dtype: return input
dev = MakeDevice(inputs=[input])
key = 'Cast/{}/dtype:{}/inplace:{}'.format(
dev, dtype, 'true' if inplace else 'false')
module = get_module(Cast, key, dev, dtype=dtype, inplace=inplace)
return module.forward(input)
Tensor.fill_ = lambda self, value: _fill(self, self.shape, value)
Tensor.uniform_ = lambda self, low=0, high=1: _uniform(self, self.shape, low, high)
Tensor.normal_ = lambda self, mean=0, std=1: _normal(self, self.shape, mean, std)
Tensor.multinomial = lambda *args, **kwargs: multinomial(*args, **kwargs)
Tensor.add = lambda self, value: _fundamental(self, value, 'Add')
Tensor.add_ = lambda self, value: _fundamental(self, value, 'Add', self)
Tensor.__radd__ = lambda self, value: _rfundamental(self, value, 'RAdd')
Tensor.sub = lambda self, value: _fundamental(self, value, 'Sub')
Tensor.sub_ = lambda self, value: _fundamental(self, value, 'Sub', self)
Tensor.__rsub__ = lambda self, value: _rfundamental(self, value, 'RSub')
Tensor.mul = lambda self, value: _fundamental(self, value, 'Mul')
Tensor.mul_ = lambda self, value: _fundamental(self, value, 'Mul', self)
Tensor.__rmul__ = lambda self, value: _rfundamental(self, value, 'RMul')
Tensor.div = lambda self, value: _fundamental(self, value, 'Div')
Tensor.div_ = lambda self, value: _fundamental(self, value, 'Div', self)
Tensor.__rdiv__ = lambda self, value: _rfundamental(self, value, 'RDiv')
Tensor.__rtruediv__ = lambda self, value: _rfundamental(self, value, 'RDiv')
Tensor.clamp = lambda *args, **kwargs: clamp(*args, **kwargs)
Tensor.clamp_ = lambda self, min=None, max=None: clamp(self, min, max, self)
Tensor.log = lambda *args, **kwargs: log(*args, **kwargs)
Tensor.exp = lambda *args, **kwargs: exp(*args, **kwargs)
Tensor.sqrt = lambda *args, **kwargs: sqrt(*args, **kwargs)
Tensor.squeeze = lambda *args, **kwargs: squeeze(*args, **kwargs)
Tensor.squeeze_ = lambda self, dim: squeeze(self, dim, self)
Tensor.unsqueeze = lambda *args, **kwargs: unsqueeze(*args, **kwargs)
Tensor.unsqueeze_ = lambda self, dim: unsqueeze(self, dim, self)
Tensor.view = lambda self, *shape: _reshape(self, shape)
Tensor.view_as = lambda *args, **kwargs: _reshape(*args, **kwargs)
Tensor.permute = lambda self, *dims: _permute(self, dims)
Tensor.repeat = lambda self, *args: _repeat(self, args)
Tensor.mean = lambda *args, **kwargs: mean(*args, **kwargs)
Tensor.sum = lambda *args, **kwargs: sum(*args, **kwargs)
Tensor.max = lambda *args, **kwargs: max(*args, **kwargs)
Tensor.min = lambda *args, **kwargs: min(*args, **kwargs)
Tensor.gt = lambda *args, **kwargs: gt(*args, **kwargs)
Tensor.ge = lambda *args, **kwargs: ge(*args, **kwargs)
Tensor.lt = lambda *args, **kwargs: lt(*args, **kwargs)
Tensor.le = lambda *args, **kwargs: le(*args, **kwargs)
Tensor.eq = lambda *args, **kwargs: eq(*args, **kwargs)
Tensor.narrow = lambda *args, **kwargs: narrow(*args, **kwargs)
Tensor._indexing = lambda *args, **kwargs: _indexing(*args, **kwargs)
Tensor.half = lambda self: _type_to(self, dtype='float16', inplace=False)
Tensor.half_ = lambda self: _type_to(self, dtype='float16', inplace=True)
Tensor.float = lambda self: _type_to(self, dtype='float32', inplace=False)
Tensor.float_ = lambda self: _type_to(self, dtype='float32', inplace=True)
Tensor.double = lambda self: _type_to(self, dtype='float64', inplace=False)
Tensor.double_ = lambda self: _type_to(self, dtype='float64', inplace=True)
Tensor.byte = lambda self: _type_to(self, dtype='uint8', inplace=False)
Tensor.byte_ = lambda self: _type_to(self, dtype='uint8', inplace=True)
Tensor.char = lambda self: _type_to(self, dtype='int8', inplace=False)
Tensor.char_ = lambda self: _type_to(self, dtype='int8', inplace=True)
Tensor.int = lambda self: _type_to(self, dtype='int32', inplace=False)
Tensor.int_ = lambda self: _type_to(self, dtype='int32', inplace=True)
Tensor.long = lambda self: _type_to(self, dtype='int64', inplace=False)
Tensor.long_ = lambda self: _type_to(self, dtype='int64', inplace=True)
Tensor.type = lambda self, dtype=None: _type_to(self, dtype=dtype) \
if dtype is not None else 'torch.' + self._type2str()
\ No newline at end of file
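# A sketch of the casting helpers bound above: the plain variants return a
# cast copy, the trailing-underscore variants cast in place, and ``type()``
# without an argument reports the type string (the exact string is assumed).
import dragon.vm.torch as torch

x = torch.ones(2, 3)
y = x.int()          # a new int32 tensor
x.half_()            # x itself is cast to float16
print(x.type())      # e.g. 'torch.HalfTensor'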
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon.core.mpi as mpi
from dragon.vm.torch.ops.primitive import MakeContext
from dragon.vm.torch.ops.factory import get_module
from dragon.vm.torch.ops.modules.update import Accumulate
from dragon.vm.torch.ops.modules.update import Collective
from dragon.vm.torch.ops.modules.update import Update
def _accumulate(grads):
if len(grads) == 0: return
if not isinstance(grads, (list, tuple)): grads = [grads]
ctx = MakeContext(inputs=grads)
key = 'Accumulate/{}/alpha:1./beta:1.'.format(ctx)
module = get_module(Accumulate, key, ctx)
return module.forward(grads)
def _allreduce(grads):
if not mpi.Is_Init(): return
if not isinstance(grads, (list, tuple)): grads = [grads]
ctx = MakeContext(inputs=grads)
mode = mpi.GetParallelMode() + '_ALLREDUCE'
key = 'Collective/{}/{}'.format(ctx, mode.lower())
module = get_module(Collective, key, ctx, mode=mode)
return module.forward(grads)
def _update(param, grad, op_type, slot,
lr_mult=1.0, decay_mult=1.0):
ctx = MakeContext(inputs=[param])
key = '{}/{}/{}/{}'.format(op_type, ctx, slot, param.name)
module = get_module(Update, key, ctx, op_type=op_type,
lr_mult=lr_mult, decay_mult=decay_mult, slot=slot)
return module.forward(param, grad)
\ No newline at end of file
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.torch.ops.primitive import MakeContext
from dragon.vm.torch.ops.factory import get_module
from dragon.vm.torch.ops.modules.vision import Resize2d
from dragon.vm.torch.ops.modules.vision import RoIPool, RoIAlign
def _resize_2d(input, op_type, dsize, fx, fy):
if dsize is None:
if fx < 0 or fy < 0:
raise ValueError('Set fx and fy if dsize is None.')
else:
if len(dsize) != 2:
raise ValueError('The dsize should be a list with 2 elements.')
if dsize is None and (fy == -1.0 or fx == -1.0):
raise RuntimeError('Either dsize or fx/fy should be specified.')
ctx = MakeContext(inputs=[input])
key = '{}/{}/dsize:{}/fx:{}/fy:{}'.format(
op_type, ctx, '2' if dsize else 'none', fx, fy)
module = get_module(Resize2d, key, ctx,
op_type=op_type, dsize=dsize, fx=fx, fy=fy)
return module.forward(input, dsize)
def nn_resize(input, dsize, fx=-1.0, fy=-1.0):
return _resize_2d(input, 'NNResize', dsize, fx, fy)
def bilinear_resize(input, dsize, fx=-1.0, fy=-1.0):
return _resize_2d(input, 'BilinearResize', dsize, fx, fy)
def roi_pool(feature, rois, pooled_h, pooled_w, spatial_scale):
ctx = MakeContext(inputs=[feature])
key = 'RoIPool/{}/pool_h:{}/pool_w:{}/spatial_scale:{}'.format(
ctx, pooled_h, pooled_w, spatial_scale)
module = get_module(
RoIPool, key, ctx,
pooled_h=pooled_h,
pooled_w=pooled_w,
spatial_scale=spatial_scale,
)
return module.forward(feature, rois)
def roi_align(feature, rois, pooled_h, pooled_w,
spatial_scale, sampling_ratio=2):
ctx = MakeContext(inputs=[feature])
key = 'RoIAlign/{}/pool_h:{}/pool_w:{}/' \
'spatial_scale:{}/sampling_ratio:{}'.format(
ctx, pooled_h, pooled_w, spatial_scale, sampling_ratio)
module = get_module(
RoIAlign, key, ctx,
pooled_h=pooled_h,
pooled_w=pooled_w,
spatial_scale=spatial_scale,
sampling_ratio=sampling_ratio,
)
return module.forward(feature, rois)
\ No newline at end of file
......@@ -22,7 +22,7 @@ from collections import defaultdict
from dragon.vm.torch.tensor import Tensor
from dragon.vm.torch.ops.update import (
from dragon.vm.torch.ops.builtin import (
_accumulate, _allreduce, _update,
)
......@@ -51,6 +51,10 @@ class Optimizer(object):
for param_group in param_groups:
self.add_param_group(param_group)
self._update_type = None
self._allow_parallel = False
if dragon.mpi.Is_Init():
local_rank, _ = dragon.mpi.AllowParallel()
if local_rank != -1: self._allow_parallel = True
self._mutable_parameters = {}
def __repr__(self):
......@@ -80,7 +84,7 @@ class Optimizer(object):
return Tensor(
name=grad_name,
own_storage=False,
ctx=param._ctx)
device=param.device)
return None
def _run_update_ops(self, group):
......@@ -109,7 +113,7 @@ class Optimizer(object):
self.feed_parameters(group)
# Run a all-reduce op to accumulate grads if necessary
_allreduce(grads)
if self._allow_parallel: _allreduce(grads)
# Run regular update ops
for p, g in zip(params, grads):
......
......@@ -14,35 +14,19 @@ from __future__ import division
from __future__ import print_function
import six
import numpy as np
import dragon as dg
import numpy
import dragon
import dragon.core.mapping as mapping
import dragon.core.tensor_utils as tensor_utils
from dragon.vm.torch.c_api import Size, Context
from dragon.vm.torch.tensor_uitls import from_dragon
from dragon.core import mapping, tensor_utils, proto_utils
from dragon.vm.torch.pool import TensorPool
__all__ = [
'Tensor', 'Parameter',
'CharTensor', 'ByteTensor', 'IntTensor', 'LongTensor',
'HalfTensor', 'FloatTensor', 'DoubleTensor',
]
##############################################
# #
# Tensor-Core #
# #
##############################################
from dragon.vm.torch.c_api import Size, from_dragon
from dragon.vm.torch.c_api import device as _Device
class Tensor(object):
def __init__(self, *args, **kwargs):
# Internal properties
self._ctx = kwargs.get('ctx', Context())
self._device = kwargs.get('device', _Device())
self._requires_grad = kwargs.get('requires_grad', False)
self._tensor = kwargs.get('name', None)
self._own_storage = kwargs.get('own_storage', True)
......@@ -60,13 +44,13 @@ class Tensor(object):
if len(args) == 0:
# + empty tensor, not leaf
if self._tensor is not None:
dg.C.CreateTensor(self._tensor)
dragon.C.CreateTensor(self._tensor)
elif len(args) == 1:
if isinstance(args[0], (list, tuple)):
# + torch.Tensor(sequence)
self._init_from_numpy(np.array(
self._init_from_numpy(numpy.array(
args[0], dtype=kwargs.get('dtype', 'float32')))
elif isinstance(args[0], np.ndarray):
elif isinstance(args[0], numpy.ndarray):
# + torch.Tensor(array)
self._init_from_numpy(args[0])
else:
......@@ -81,7 +65,7 @@ class Tensor(object):
self._init_from_shape(args, kwargs.get('dtype', 'float32'))
# Store the reference of backend
self._storage = dg.C.GetTensor(self.name) \
self._storage = dragon.C.GetTensor(self.name) \
if self.name is not None else None
def _init_from_numpy(self, array):
......@@ -114,6 +98,18 @@ class Tensor(object):
return self._tensor.name if hasattr(
self._tensor, 'name') else self._tensor
@property
def device(self):
"""Return the device of this tensor.
Returns
-------
dragon.vm.torch.device
The device.
"""
return self._device
def cpu(self):
"""Switch the internal storage on cpu memory.
......@@ -123,22 +119,27 @@ class Tensor(object):
The self.
"""
self._ctx.device_type = 'CPU'
self._device.type = 'cpu'
self._storage.ToCPU()
return self
def cuda(self, device=None):
"""Switch the internal storage on cuda memory.
Parameters
----------
device : int, optional
The device index.
Returns
-------
dragon.vm.torch.Tensor
The self.
"""
if device is None: device = dg.config.GetGPU()
if device is None: device = dragon.config.GetGPU()
self._storage.ToCUDA(device)
self._ctx.device_type, self._ctx.device_id = ('CUDA', device)
self._device.type, self._device.index = 'cuda', device
return self
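# A minimal sketch of switching storage with the two methods above
# (importing ``dragon.vm.torch`` as ``torch`` is an assumption; values
# are illustrative only):
#
#   import dragon.vm.torch as torch
#   x = torch.Tensor([1., 2., 3.])   # created with the default device
#   x = x.cuda(0)                    # storage moves to GPU 0, device.type == 'cuda'
#   x = x.cpu()                      # storage moves back, device.type == 'cpu'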
def numpy(self, readonly=False):
......@@ -146,7 +147,7 @@ class Tensor(object):
Parameters
----------
readonly : boolean
readonly : boolean, optional, default=False
Whether to sync the contents with device.
Returns
......@@ -167,7 +168,7 @@ class Tensor(object):
"""
if isinstance(self._tensor, str):
return dg.Tensor.Ref(self._tensor,
return dragon.Tensor.Ref(self._tensor,
shape=self.shape, dtype=self.dtype)
else: return self._tensor
......@@ -342,6 +343,86 @@ class Tensor(object):
"""
return self.mul(-1.0)
def __gt__(self, other):
"""Compute *self* > *other* element-wise.
Parameters
----------
other : dragon.vm.torch.Tensor, number
The other tensor.
Returns
-------
dragon.vm.torch.Tensor
The output byte tensor.
"""
return self.gt(other)
def __ge__(self, other):
"""Compute *self* >= *other* element-wise.
Parameters
----------
other : dragon.vm.torch.Tensor, number
The other tensor.
Returns
-------
dragon.vm.torch.Tensor
The output byte tensor.
"""
return self.ge(other)
def __lt__(self, other):
"""Compute *self* < *other* element-wise.
Parameters
----------
other : dragon.vm.torch.Tensor, number
The other tensor.
Returns
-------
dragon.vm.torch.Tensor
The output byte tensor.
"""
return self.lt(other)
def __le__(self, other):
"""Compute *self* <= *other* element-wise.
Parameters
----------
other : dragon.vm.torch.Tensor, number
The other tensor.
Returns
-------
dragon.vm.torch.Tensor
The output byte tensor.
"""
return self.le(other)
def __eq__(self, other):
"""Compute *self* == *other* element-wise.
Parameters
----------
other : dragon.vm.torch.Tensor, number
The other tensor.
Returns
-------
dragon.vm.torch.Tensor
The output byte tensor.
"""
return self.eq(other)
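# A minimal sketch of the comparison operators defined above; each dunder
# delegates to the corresponding method (gt/ge/lt/le/eq) and returns a
# byte tensor (values are illustrative only):
#
#   a = torch.Tensor([1., 2., 3.])
#   b = torch.Tensor([3., 2., 1.])
#   a > b     # a.gt(b) -> [0, 0, 1]
#   a == b    # a.eq(b) -> [0, 1, 0]
#   a <= b    # a.le(b) -> [1, 1, 0]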
def __repr__(self):
"""Return a format str representing the internal storage.
......@@ -354,11 +435,12 @@ class Tensor(object):
np_data = self.numpy(readonly=True)
if len(np_data.shape) == 0: return str(np_data)
format_str = str(np_data)
format_shape = 'x'.join([str(dim) for dim in np_data.shape])
meta_info = '\n[torch.{} of size {}]'.\
format(self._type2str(), 'x'.join([str(dim) for dim in np_data.shape]))
if self._ctx.device_type == 'CUDA':
meta_info = '\n[torch.cuda.{} of size {} (GPU {})]'. \
format(self._type2str(), 'x'.join([str(dim) for dim in np_data.shape]), self._ctx.device_id)
format(self._type2str(), format_shape)
if self.device.type == 'cuda':
meta_info = '\n[torch.cuda.{} of size {} (GPU {})]'.format(
self._type2str(), format_shape, self.device.index)
del np_data # DECREF
return format_str + meta_info
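# For reference, the meta info built above renders like:
#   [torch.FloatTensor of size 2x3]                 # CPU storage
#   [torch.cuda.FloatTensor of size 2x3 (GPU 0)]    # CUDA storage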
......@@ -432,7 +514,7 @@ class Tensor(object):
'The cropping starts and ends of axis {} '
'can not be equal, got {}:{}.'
.format(ix, starts[-1], it.stop))
# handle step
# Handle step
if it.step is not None:
raise NotImplementedError('Indexing with step has not been implemented yet. ')
elif isinstance(it, int):
......@@ -442,8 +524,8 @@ class Tensor(object):
raise TypeError('Unsupported type of indices: {}'.format(type(it)))
return self._indexing(starts, sizes)
def device(self):
return self._ctx.device_id
def __hash__(self):
return id(self)
##############################################
# #
......@@ -532,7 +614,7 @@ class Tensor(object):
Parameters
----------
dtype : str
dtype : str, optional
The specified type.
Returns
......@@ -541,7 +623,7 @@ class Tensor(object):
The data type or the new tensor.
"""
raise NotImplementedError('Refer torch.ops.builtin.type')
raise NotImplementedError('Refer torch.ops.tensor.type')
def is_floating_point(self):
"""Whether the data type is floating.
......@@ -563,11 +645,11 @@ class Tensor(object):
##############################################
def squeeze(self, dim=None):
"""Returns a tensor with all the dimensions of input of size 1 removed.
"""Return a tensor with all the dimensions of input of size 1 removed.
Parameters
----------
dim : int
dim : int, optional
The optional dim to remove.
......@@ -577,14 +659,14 @@ class Tensor(object):
The new tensor.
"""
raise NotImplementedError('Refer torch.ops.builtin._squeeze')
raise NotImplementedError('Refer torch.ops.tensor.squeeze')
def squeeze_(self, dim=None):
"""Inplace of ``Tensor.squeeze()``
Parameters
----------
dim : int
dim : int, optional
The optional dim to remove.
Returns
......@@ -593,7 +675,7 @@ class Tensor(object):
The self.
"""
raise NotImplementedError('Refer torch.ops.builtin._squeeze_')
raise NotImplementedError('Refer torch.ops.tensor.squeeze_')
def unsqueeze(self, dim):
"""Returns a tensor with a dimension of size 1 inserted at the specified position.
......@@ -609,7 +691,7 @@ class Tensor(object):
The new tensor.
"""
raise NotImplementedError('Refer torch.ops.builtin._unsqueeze')
raise NotImplementedError('Refer torch.ops.tensor.unsqueeze')
def unsqueeze_(self, dim):
"""Inplace of ``Tensor.unsqueeze()``
......@@ -625,14 +707,14 @@ class Tensor(object):
The self.
"""
raise NotImplementedError('Refer torch.ops.builtin._unsqueeze_')
raise NotImplementedError('Refer torch.ops.tensor.unsqueeze_')
def view(self, *args):
def view(self, *shape):
"""Return a new tensor with the same data but a different size.
Parameters
----------
args : tuple or int
shape : int...
The new size.
Returns
......@@ -641,16 +723,16 @@ class Tensor(object):
The output tensor.
"""
raise NotImplementedError('Refer torch.ops.builtin.view')
raise NotImplementedError('Refer torch.ops.tensor.view')
def reshape(self, *args):
def reshape(self, *shape):
"""Return a new tensor with the same data but a different size.
See also: *torch.view(*args)*
See also: *torch.view(*shape)*
Parameters
----------
args : tuple or int
shape : int...
The new size.
Returns
......@@ -659,7 +741,7 @@ class Tensor(object):
The output tensor.
"""
return self.view(*args)
return self.view(*shape)
def view_as(self, other):
"""Return a new tensor with the same data but a different size as the given tensor.
......@@ -675,14 +757,14 @@ class Tensor(object):
The output tensor.
"""
raise NotImplementedError('Refer torch.ops.builtin.view_as')
raise NotImplementedError('Refer torch.ops.tensor.view_as')
def permute(self, dims=None):
def permute(self, *dims):
"""Return a new tensor with the specific order of dimensions.
Parameters
----------
dims : sequence of int
dims : int...
The new order of dimensions.
Returns
......@@ -691,7 +773,7 @@ class Tensor(object):
The output tensor.
"""
raise NotImplementedError('Refer torch.ops.builtin.permute')
raise NotImplementedError('Refer torch.ops.tensor.permute')
def narrow(self, dimension, start, length):
"""Return a new tensor that is a narrowed version of input tensor.
......@@ -703,7 +785,7 @@ class Tensor(object):
start : int
The starting position.
length : int
The distance to the ending postion.
The distance to the ending position.
Returns
-------
......@@ -711,14 +793,14 @@ class Tensor(object):
The output tensor.
"""
raise NotImplementedError('Refer torch.ops.builtin.narrow')
raise NotImplementedError('Refer torch.ops.tensor.narrow')
def repeat(self, *sizes):
"""Repeat this tensor along the specified dimensions.
Parameters
----------
sizes : vm.torch.Size or int...
sizes : int...
The number of times to repeat.
Returns
......@@ -727,7 +809,7 @@ class Tensor(object):
The output tensor.
"""
raise NotImplementedError('Refer torch.ops.builtin.repeat')
raise NotImplementedError('Refer torch.ops.tensor.repeat')
def copy_(self, src, non_blocking=False):
"""Copy the elements from ``src`` into this tensor and return ``self``.
......@@ -736,7 +818,7 @@ class Tensor(object):
----------
src : dragon.vm.torch.Tensor
The source tensor.
non_blocking : boolean
non_blocking : boolean, optional, default=False
Whether to copy asynchronously between CPU and GPU.
Returns
......@@ -745,14 +827,24 @@ class Tensor(object):
The ``self`` tensor.
"""
raise NotImplementedError('Refer torch.ops.builtin.copy_')
# Copy memory
tensor_utils.FromTensor(
src, proto_utils.GetDeviceOption(
src.device.type, src.device.index),
self.name, proto_utils.GetDeviceOption(
self.device.type, self.device.index))
# Transfer the static shape if necessary
self._static_shape = src.size() \
if self._static_shape else None
return self
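# A minimal sketch of ``copy_`` (hypothetical tensors; the device options
# resolved above let the copy cross CPU/GPU boundaries):
#
#   y = torch.Tensor(2, 3)            # destination on the default device
#   x = torch.Tensor(2, 3).cuda(0)    # source on GPU 0
#   y.copy_(x)                        # y now holds the contents of x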
def fill_(self, value):
"""Fills self tensor with the specified value.
Parameters
----------
value : numerical type
value : number
The value to fill.
Returns
-------
......@@ -760,15 +852,11 @@ class Tensor(object):
The self.
"""
raise NotImplementedError('Refer torch.ops.builtin.fill_')
raise NotImplementedError('Refer torch.ops.tensor.fill_')
def zero_(self):
"""Fills self tensor with zeros.
Parameters
----------
value : numerical type
Returns
-------
dragon.vm.torch.Tensor
......@@ -780,10 +868,6 @@ class Tensor(object):
def one_(self):
"""Fills self tensor with ones.
Parameters
----------
value : numerical type
Returns
-------
dragon.vm.torch.Tensor
......@@ -797,9 +881,9 @@ class Tensor(object):
Parameters
----------
low : numerical type
low : number, optional, default=0
The lower bound.
high : numerical type
high : number, optional, default=1
The higher bound.
Returns
......@@ -808,16 +892,16 @@ class Tensor(object):
The self.
"""
raise NotImplementedError('Refer torch.ops.builtin.uniform_')
raise NotImplementedError('Refer torch.ops.tensor.uniform_')
def normal_(self, mean=0, std=1):
"""Fill self tensor with the specified normal distribution.
Parameters
----------
mean : numerical type
mean : number, optional, default=0
The mean (mu) of the normal distribution.
std : numerical type
std : number, optional, default=1
The std (sigma) of the normal distribution.
Returns
......@@ -826,7 +910,26 @@ class Tensor(object):
The self.
"""
raise NotImplementedError('Refer torch.ops.builtin.normal_')
raise NotImplementedError('Refer torch.ops.tensor.normal_')
def multinomial(self, num_samples, normalize=False):
"""Return a tensor where each row contains ``num_samples``,
sampled from the multinomial distribution.
Parameters
----------
num_samples : int
The number of samples.
normalize : boolean, optional, default=False
Whether to normalize the inputs.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
"""
raise NotImplementedError('Refer torch.ops.tensor.multinomial')
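# A minimal sketch of ``multinomial`` (shapes follow the description above;
# the probabilities are illustrative only):
#
#   probs = torch.Tensor([[0.1, 0.7, 0.2]])
#   idx = probs.multinomial(5)    # int64 indices with shape (1, 5)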
def add(self, value):
"""See ``torch.add()``
......@@ -842,7 +945,7 @@ class Tensor(object):
The output tensor.
"""
raise NotImplementedError('Refer torch.ops.builtin.add')
raise NotImplementedError('Refer torch.ops.tensor.add')
def add_(self, value):
"""Inplace of ``torch.add()``
......@@ -858,7 +961,7 @@ class Tensor(object):
The self.
"""
raise NotImplementedError('Refer torch.ops.builtin.add_')
raise NotImplementedError('Refer torch.ops.tensor.add_')
def sub(self, value):
"""Subtract the ``self`` and ``value`` into the output tensor.
......@@ -874,7 +977,7 @@ class Tensor(object):
The output tensor.
"""
raise NotImplementedError('Refer torch.ops.builtin.sub')
raise NotImplementedError('Refer torch.ops.tensor.sub')
def sub_(self, value):
"""Inplace of ``Tensor.sub()``
......@@ -890,7 +993,7 @@ class Tensor(object):
The self.
"""
raise NotImplementedError('Refer torch.ops.builtin.sub_')
raise NotImplementedError('Refer torch.ops.tensor.sub_')
def mul(self, value):
"""Multiply the ``self`` and ``value`` into the output tensor.
......@@ -906,7 +1009,7 @@ class Tensor(object):
The output tensor.
"""
raise NotImplementedError('Refer torch.ops.builtin.mul')
raise NotImplementedError('Refer torch.ops.tensor.mul')
def mul_(self, value):
"""Inplace of ``Tensor.mul()``
......@@ -922,7 +1025,7 @@ class Tensor(object):
The self.
"""
raise NotImplementedError('Refer torch.ops.builtin.mul_')
raise NotImplementedError('Refer torch.ops.tensor.mul_')
def div(self, value):
"""Divide the ``self`` and ``value`` into the output tensor.
......@@ -938,7 +1041,7 @@ class Tensor(object):
The output tensor.
"""
raise NotImplementedError('Refer torch.ops.builtin.div')
raise NotImplementedError('Refer torch.ops.tensor.div')
def div_(self, value):
"""Inplace of ``Tensor.div()``
......@@ -954,7 +1057,7 @@ class Tensor(object):
The self.
"""
raise NotImplementedError('Refer torch.ops.builtin.div_')
raise NotImplementedError('Refer torch.ops.tensor.div_')
def clamp(self, min=None, max=None):
"""Return a tensor that all elements are clamped into the range [min, max].
......@@ -972,7 +1075,7 @@ class Tensor(object):
The output tensor.
"""
raise NotImplementedError('Refer torch.ops.builtin.clamp')
raise NotImplementedError('Refer torch.ops.tensor.clamp')
def clamp_(self, min=None, max=None):
"""Clamp all elements are clamped into the range [min, max].
......@@ -990,52 +1093,40 @@ class Tensor(object):
The output tensor.
"""
raise NotImplementedError('Refer torch.ops.builtin.clamp_')
raise NotImplementedError('Refer torch.ops.tensor.clamp_')
def log(self):
"""Compute the natural logarithm of this tensor.
Parameters
----------
None
Returns
-------
dragon.vm.torch.Tensor
The log tensor.
"""
raise NotImplementedError('Refer torch.ops.builtin.log')
raise NotImplementedError('Refer torch.ops.tensor.log')
def exp(self):
"""Compute the exponential of this tensor.
Parameters
----------
None
Returns
-------
dragon.vm.torch.Tensor
The exp tensor.
"""
raise NotImplementedError('Refer torch.ops.builtin.exp')
raise NotImplementedError('Refer torch.ops.tensor.exp')
def sqrt(self):
"""Compute the square-root of this tensor.
Parameters
----------
None
Returns
-------
torch.Tensor
The sqrt tensor.
"""
raise NotImplementedError('Refer torch.ops.builtin.sqrt')
raise NotImplementedError('Refer torch.ops.tensor.sqrt')
def mean(self, dim=None, keepdim=False):
"""Returns the mean of all elements or elements along the given dim.
......@@ -1044,7 +1135,7 @@ class Tensor(object):
----------
dim : int, optional
The axis of tensor to compute mean value.
keepdim : bool, optional
keepdim : bool, optional, default=False
Whether the output tensor has dim retained or not.
Returns
......@@ -1053,7 +1144,7 @@ class Tensor(object):
The mean-reduced tensor.
"""
raise NotImplementedError('Refer torch.ops.builtin.mean')
raise NotImplementedError('Refer torch.ops.tensor.mean')
def sum(self, dim=None, keepdim=False):
"""Returns the sum of all elements or elements along the given dim.
......@@ -1062,7 +1153,7 @@ class Tensor(object):
----------
dim : int, optional
The axis of tensor to compute sum value.
keepdim : bool, optional
keepdim : bool, optional, default=False
Whether the output tensor has dim retained or not.
Returns
......@@ -1071,7 +1162,7 @@ class Tensor(object):
The sum-reduced tensor.
"""
raise NotImplementedError('Refer torch.ops.builtin.sum')
raise NotImplementedError('Refer torch.ops.tensor.sum')
def max(self, dim=None, keepdim=False):
"""Return the values and indices of maximum elements along the given axis.
......@@ -1080,7 +1171,7 @@ class Tensor(object):
----------
dim : int, optional
The axis of tensor to compute the maximum values.
keepdim : bool, optional
keepdim : bool, optional, default=False
Whether the output tensor has dim retained or not.
Returns
......@@ -1089,16 +1180,16 @@ class Tensor(object):
The maximum values and indices.
"""
raise NotImplementedError('Refer torch.ops.builtin.max')
raise NotImplementedError('Refer torch.ops.tensor.max')
def min(input, dim=None, keepdim=False):
def min(self, dim=None, keepdim=False):
"""Return the values and indices of minimum elements along the given axis.
Parameters
----------
dim : int, optional
The axis of tensor to compute the minimum values.
keepdim : bool, optional
keepdim : bool, optional, default=False
Whether the output tensor has dim retained or not.
Returns
......@@ -1107,7 +1198,87 @@ class Tensor(object):
The minimum values and indices.
"""
raise NotImplementedError('Refer torch.ops.builtin.min')
raise NotImplementedError('Refer torch.ops.tensor.min')
def gt(self, other):
"""Compute *self* > *other* element-wise.
Parameters
----------
other : dragon.vm.torch.Tensor, number
The other tensor.
Returns
-------
dragon.vm.torch.Tensor
The output byte tensor.
"""
raise NotImplementedError('Refer torch.ops.tensor.gt')
def ge(self, other):
"""Compute *self* >= *other* element-wise.
Parameters
----------
other : dragon.vm.torch.Tensor, number
The other tensor.
Returns
-------
dragon.vm.torch.Tensor
The output byte tensor.
"""
raise NotImplementedError('Refer torch.ops.tensor.ge')
def lt(self, other):
"""Compute *self* < *other* element-wise.
Parameters
----------
other : dragon.vm.torch.Tensor, number
The other tensor.
Returns
-------
dragon.vm.torch.Tensor
The output byte tensor.
"""
raise NotImplementedError('Refer torch.ops.tensor.lt')
def le(self, other):
"""Compute *self* <= *other* element-wise.
Parameters
----------
other : dragon.vm.torch.Tensor, number
The other tensor.
Returns
-------
dragon.vm.torch.Tensor
The output byte tensor.
"""
raise NotImplementedError('Refer torch.ops.tensor.le')
def eq(self, other):
"""Compute *self* == *other* element-wise.
Parameters
----------
other : dragon.vm.torch.Tensor, number
The other tensor.
Returns
-------
dragon.vm.torch.Tensor
The output byte tensor.
"""
raise NotImplementedError('Refer torch.ops.tensor.eq')
def half(self):
"""Return a ``float16`` tensor with elements of ``self``.
......@@ -1118,7 +1289,7 @@ class Tensor(object):
The half tensor.
"""
raise NotImplementedError('Refer torch.ops.builtin.half')
raise NotImplementedError('Refer torch.ops.tensor.half')
def half_(self):
"""Inplace of ``Tensor.half()``.
......@@ -1129,7 +1300,7 @@ class Tensor(object):
The half tensor.
"""
raise NotImplementedError('Refer torch.ops.builtin.half_')
raise NotImplementedError('Refer torch.ops.tensor.half_')
def float(self):
"""Return a ``float32`` tensor with elements of ``self``.
......@@ -1140,7 +1311,7 @@ class Tensor(object):
The float tensor.
"""
raise NotImplementedError('Refer torch.ops.builtin.float')
raise NotImplementedError('Refer torch.ops.tensor.float')
def float_(self):
"""Inplace of ``Tensor.float()``.
......@@ -1151,7 +1322,7 @@ class Tensor(object):
The float tensor.
"""
raise NotImplementedError('Refer torch.ops.builtin.float_')
raise NotImplementedError('Refer torch.ops.tensor.float_')
def double(self):
"""Return a ``float64`` tensor with elements of ``self``.
......@@ -1162,7 +1333,7 @@ class Tensor(object):
The double tensor.
"""
raise NotImplementedError('Refer torch.ops.builtin.double')
raise NotImplementedError('Refer torch.ops.tensor.double')
def double_(self):
"""Inplace of ``Tensor.double()``.
......@@ -1173,7 +1344,7 @@ class Tensor(object):
The double tensor.
"""
raise NotImplementedError('Refer torch.ops.builtin.double_')
raise NotImplementedError('Refer torch.ops.tensor.double_')
def int(self):
"""Return a ``int32`` tensor with elements of ``self``.
......@@ -1184,7 +1355,7 @@ class Tensor(object):
The int tensor.
"""
raise NotImplementedError('Refer torch.ops.builtin.int')
raise NotImplementedError('Refer torch.ops.tensor.int')
def int_(self):
"""Inplace of ``Tensor.int()``.
......@@ -1195,7 +1366,7 @@ class Tensor(object):
The int tensor.
"""
raise NotImplementedError('Refer torch.ops.builtin.int_')
raise NotImplementedError('Refer torch.ops.tensor.int_')
def long(self):
"""Return a ``int64`` tensor with elements of ``self``.
......@@ -1206,7 +1377,7 @@ class Tensor(object):
The long tensor.
"""
raise NotImplementedError('Refer torch.ops.builtin.long')
raise NotImplementedError('Refer torch.ops.tensor.long')
def long_(self):
"""Inplace of ``Tensor.long()``.
......@@ -1217,7 +1388,7 @@ class Tensor(object):
The long tensor.
"""
raise NotImplementedError('Refer torch.ops.builtin.long_')
raise NotImplementedError('Refer torch.ops.tensor.long_')
def byte(self):
"""Return a ``uint8`` tensor with elements of ``self``.
......@@ -1228,7 +1399,7 @@ class Tensor(object):
The byte tensor.
"""
raise NotImplementedError('Refer torch.ops.builtin.byte')
raise NotImplementedError('Refer torch.ops.tensor.byte')
def byte_(self):
"""Inplace of ``Tensor.byte()``.
......@@ -1239,7 +1410,7 @@ class Tensor(object):
The byte tensor.
"""
raise NotImplementedError('Refer torch.ops.builtin.byte_')
raise NotImplementedError('Refer torch.ops.tensor.byte_')
def char(self):
"""Return a ``int8`` tensor with elements of ``self``.
......@@ -1250,7 +1421,7 @@ class Tensor(object):
The byte tensor.
"""
raise NotImplementedError('Refer torch.ops.builtin.char')
raise NotImplementedError('Refer torch.ops.tensor.char')
def char_(self):
"""Inplace of ``Tensor.char()``.
......@@ -1261,7 +1432,7 @@ class Tensor(object):
The byte tensor.
"""
raise NotImplementedError('Refer torch.ops.builtin.char_')
raise NotImplementedError('Refer torch.ops.tensor.char_')
##############################################
# #
......@@ -1283,7 +1454,7 @@ class Tensor(object):
@property
def data(self):
return Tensor(ctx=self._ctx, name=self.name, own_storage=False)
return Tensor(device=self.device, name=self.name, own_storage=False)
def detach(self):
return self.data
......@@ -1356,28 +1527,28 @@ def DoubleTensor(*args, **kwargs):
return Tensor(*args, **kwargs)
def LeafTensor(shape, dtype='float32', ctx=Context(), requires_grad=False):
"""Create a torch tensor according to shape, dtype and ctx.
def _LeafTensor(shape, dtype='float32', device=_Device(), requires_grad=False):
"""Create a torch tensor according to shape, dtype and device.
Commonly used to create leaf variables, i.e., the parameters or placeholders.
"""
constructor = globals()[mapping.TENSOR_TYPE_TO_TORCH_TENSOR[dtype]]
return constructor(*shape, ctx=ctx, requires_grad=requires_grad)
return constructor(*shape, device=device, requires_grad=requires_grad)
def RuntimeTensor(name, dtype='float32', ctx=Context()):
"""Create a torch tensor according to dtype and ctx.
def _RuntimeTensor(name, dtype='float32', device=_Device()):
"""Create a torch tensor according to dtype and device.
Commonly used to represent outputs whose shape is hard to infer,
i.e., the shape is computed by the backend automatically.
"""
constructor = globals()[mapping.TENSOR_TYPE_TO_TORCH_TENSOR[dtype]]
return constructor(name=name, ctx=ctx)
return constructor(name=name, device=device)
def ReferenceTensor(src):
def _ReferenceTensor(src):
"""Create a reference from source tensor.
Commonly used to hold the same storage but take different sizes,
......@@ -1385,7 +1556,7 @@ def ReferenceTensor(src):
"""
constructor = globals()[mapping.TENSOR_TYPE_TO_TORCH_TENSOR[src.dtype]]
T = constructor(name=TensorPool.get('${REFERENCE}'), ctx=src._ctx)
T = constructor(name=TensorPool.get('${REFERENCE}'), device=src.device)
T._ref_objects.append(src)
return T
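# A rough sketch of how the three factories above are typically used
# (the shapes and names here are assumptions for illustration):
#
#   w = _LeafTensor((64, 3, 7, 7), requires_grad=True)   # a parameter/leaf
#   y = _RuntimeTensor('y', dtype='float32')             # shape decided by the backend
#   r = _ReferenceTensor(y)                              # shares the storage of y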
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import importlib
import numpy as np
import dragon.core.mapping as mapping
from dragon.core.tensor_utils import GetStorage
from dragon.vm.torch.c_api import Context
def from_numpy(data):
"""Create a tensor from the given numpy array.
Parameters
----------
data : ndarray
The array with various data type.
Returns
-------
dragon.vm.torch.Tensor
The torch tensor.
"""
if not isinstance(data, np.ndarray):
raise TypeError('The data should be a numpy.ndarray.')
if str(data.dtype) not in mapping.TENSOR_TYPE_TO_TORCH_TENSOR:
raise ValueError('Unsupported type({}) to torch tensor.'.format(data.dtype))
module = importlib.import_module('dragon.vm.torch.tensor')
return getattr(module, mapping.TENSOR_TYPE_TO_TORCH_TENSOR[str(data.dtype)])(data)
def to_numpy(tensor):
"""Create a numpy nd-array from the given tensor.
Parameters
----------
tensor : dragon.vm.torch.Tensor
The tensor with various data type.
Returns
-------
numpy.ndarray
The numpy array.
"""
return tensor.numpy()
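# A minimal round-trip sketch for the two helpers above:
#
#   import numpy as np
#   arr = np.zeros((2, 3), dtype='float32')
#   t = from_numpy(arr)     # -> a FloatTensor of size 2x3
#   back = to_numpy(t)      # -> a numpy.ndarray again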
def from_dragon(tensor, own_storage=False):
"""Create a torch tensor from a existing dragon tensor.
Set ``own_storage`` as ``True`` for automatically releasing the storage.
Parameters
----------
tensor : Tensor or str
The dragon tensor.
own_storage : boolean, optional, default=False
Whether to release the storage when this tensor is destructed.
Returns
-------
dragon.vm.torch.Tensor
The torch tensor.
"""
storage = GetStorage(tensor)
if storage is None: return None
module = importlib.import_module('dragon.vm.torch.tensor')
T = getattr(module, mapping.TENSOR_TYPE_TO_TORCH_TENSOR[storage.dtype])()
T._storage, T._own_storage, T._tensor = storage, own_storage, tensor
T._ctx = Context(*storage.ctx)
return T
def to_str(tensor):
"""Return a format str representing the storage of a tensor.
Parameters
----------
tensor : dragon.vm.torch.Tensor
The tensor with various data type.
Returns
-------
str
The format str.
"""
return str(tensor)
\ No newline at end of file
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
import time
class Timer(object):
def __init__(self):
self.total_time = 0.
self.calls = 0
self.start_time = 0.
self.diff = 0.
def tic(self):
self.start_time = time.time()
def toc(self, average=False, every_n=-1, name=''):
self.diff = time.time() - self.start_time
self.total_time += self.diff
self.calls += 1
self.average_time = self.total_time / self.calls
if every_n > 0 and self.calls % every_n == 0:
print('[{}]: total = {:.5f}s, average = {:.5f}s'.format(
name, self.total_time, self.total_time / self.calls * every_n))
if average:
return self.average_time
else:
return self.diff
\ No newline at end of file
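# A minimal usage sketch of the Timer above (``do_work`` is a hypothetical
# workload):
#
#   timer = Timer()
#   for _ in range(100):
#       timer.tic()
#       do_work()
#       timer.toc(every_n=10, name='step')   # prints total/average every 10 calls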
......@@ -154,7 +154,7 @@ void ProposalOp<Context>::RunWithType() {
template <class Context>
void ProposalOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // Enforce SyncStream
ctx()->set_stream_id(0); // Enforce DefaultStream
num_images = Input(0).dim(0);
CHECK_EQ(Input(-1).dim(0), num_images)
......
......@@ -150,17 +150,17 @@ void MixedMemory::SwitchToCUDADevice(int device_id) {
const Map<string, string> MixedMemory::info() const {
static map<State, string> STATE_TO_STRING {
{ UNINITIALIZED, "UNINITIALIZED" },
{ STATE_AT_CPU, "CPU" },
{ STATE_AT_CUDA, "CUDA" },
{ STATE_AT_CNML, "CNML" },
{ SYNCED, "DEVICE" },
{ UNINITIALIZED, "uninitialized" },
{ STATE_AT_CPU, "cpu" },
{ STATE_AT_CUDA, "cuda" },
{ STATE_AT_CNML, "cnml" },
{ SYNCED, "device" },
};
Map<string, string> s2s;
string _state_ = STATE_TO_STRING[state_];
if (_state_ == "DEVICE") {
if (cuda_ptr_) _state_ = "CUDA";
else if (cnml_ptr_) _state_ = "CNML";
if (_state_ == "device") {
if (cuda_ptr_) _state_ = "cuda";
else if (cnml_ptr_) _state_ = "cnml";
else LOG(FATAL) << "Device activated, "
<< "but got invalid mem pointer.";
}
......
......@@ -126,7 +126,7 @@ OperatorBase* NewOperator(
<< "\nOperator failed to pass the schema checking.";
}
OperatorDef mutable_def(def);
// Heuristically makes each random seed slightly differnet
// Heuristically make each random seed slightly different
static unsigned int op_seed_uuid = 0;
mutable_def.mutable_device_option()->set_random_seed(
op_seed_uuid + def.device_option().random_seed());
......
......@@ -51,6 +51,22 @@ void _Less(
}
}
/*! LessEqual <T = ?, Device = CPU> */
template <typename T>
void _LessEqual(
const int count,
const T* a,
const T* b,
bool* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
y[i] = a[i] <= b[i] ? true : false;
}
}
/*! Greater <T = ?, Device = CPU> */
template <typename T>
......@@ -67,6 +83,22 @@ void _Greater(
}
}
/*! GreaterEqual <T = ?, Device = CPU> */
template <typename T>
void _GreaterEqual(
const int count,
const T* a,
const T* b,
bool* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
y[i] = a[i] >= b[i] ? true : false;
}
}
#define DEFINE_COMPARE_WARPPER(T, OP, IMPL) \
template <> void OP<T, CPUContext>( \
const int count, \
......@@ -93,6 +125,14 @@ DEFINE_COMPARE_WARPPER(int64_t, Less, _Less);
DEFINE_COMPARE_WARPPER(float, Less, _Less);
DEFINE_COMPARE_WARPPER(double, Less, _Less);
DEFINE_COMPARE_WARPPER(bool, LessEqual, _LessEqual);
DEFINE_COMPARE_WARPPER(int8_t, LessEqual, _LessEqual);
DEFINE_COMPARE_WARPPER(uint8_t, LessEqual, _LessEqual);
DEFINE_COMPARE_WARPPER(int, LessEqual, _LessEqual);
DEFINE_COMPARE_WARPPER(int64_t, LessEqual, _LessEqual);
DEFINE_COMPARE_WARPPER(float, LessEqual, _LessEqual);
DEFINE_COMPARE_WARPPER(double, LessEqual, _LessEqual);
DEFINE_COMPARE_WARPPER(bool, Greater, _Greater);
DEFINE_COMPARE_WARPPER(int8_t, Greater, _Greater);
DEFINE_COMPARE_WARPPER(uint8_t, Greater, _Greater);
......@@ -101,6 +141,14 @@ DEFINE_COMPARE_WARPPER(int64_t, Greater, _Greater);
DEFINE_COMPARE_WARPPER(float, Greater, _Greater);
DEFINE_COMPARE_WARPPER(double, Greater, _Greater);
DEFINE_COMPARE_WARPPER(bool, GreaterEqual, _GreaterEqual);
DEFINE_COMPARE_WARPPER(int8_t, GreaterEqual, _GreaterEqual);
DEFINE_COMPARE_WARPPER(uint8_t, GreaterEqual, _GreaterEqual);
DEFINE_COMPARE_WARPPER(int, GreaterEqual, _GreaterEqual);
DEFINE_COMPARE_WARPPER(int64_t, GreaterEqual, _GreaterEqual);
DEFINE_COMPARE_WARPPER(float, GreaterEqual, _GreaterEqual);
DEFINE_COMPARE_WARPPER(double, GreaterEqual, _GreaterEqual);
template <> void Equal<float16, CPUContext>(
const int count,
const float16* a,
......@@ -119,6 +167,15 @@ template <> void Less<float16, CPUContext>(
CPU_FP16_NOT_SUPPORTED;
}
template <> void LessEqual<float16, CPUContext>(
const int count,
const float16* a,
const float16* b,
bool* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
template <> void Greater<float16, CPUContext>(
const int count,
const float16* a,
......@@ -128,6 +185,15 @@ template <> void Greater<float16, CPUContext>(
CPU_FP16_NOT_SUPPORTED;
}
template <> void GreaterEqual<float16, CPUContext>(
const int count,
const float16* a,
const float16* b,
bool* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
#undef DEFINE_COMPARE_WARPPER
} // namespace kernel
......
......@@ -68,6 +68,31 @@ __global__ void _LessHalf(
}
}
/*! LessEqual <T = ?, Device = CUDA> */
template <typename T>
__global__ void _LessEqual(
const int count,
const T* a,
const T* b,
bool* y) {
CUDA_1D_KERNEL_LOOP(idx, count) {
y[idx] = a[idx] <= b[idx] ? true : false;
}
}
__global__ void _LessEqualHalf(
const int count,
const half* a,
const half* b,
bool* y) {
CUDA_1D_KERNEL_LOOP(idx, count) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hle(a[idx], b[idx]) ? true : false;
#endif
}
}
/*! Greater <T = ?, Device = CUDA> */
template <typename T>
......@@ -93,6 +118,31 @@ __global__ void _GreaterHalf(
}
}
/*! GreaterEqual <T = ?, Device = CUDA> */
template <typename T>
__global__ void _GreaterEqual(
const int count,
const T* a,
const T* b,
bool* y) {
CUDA_1D_KERNEL_LOOP(idx, count) {
y[idx] = a[idx] >= b[idx] ? true : false;
}
}
__global__ void _GreaterEqualHalf(
const int count,
const half* a,
const half* b,
bool* y) {
CUDA_1D_KERNEL_LOOP(idx, count) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hge(a[idx], b[idx]) ? true : false;
#endif
}
}
#define DEFINE_COMPARE_WARPPER(T, OP, IMPL) \
template <> void OP<T, CUDAContext>( \
const int count, \
......@@ -138,6 +188,15 @@ DEFINE_COMPARE_WARPPER(float, Less, _Less);
DEFINE_COMPARE_WARPPER(double, Less, _Less);
DEFINE_COMPARE_FP16_WARPPER(Less);
DEFINE_COMPARE_WARPPER(bool, LessEqual, _LessEqual);
DEFINE_COMPARE_WARPPER(int8_t, LessEqual, _LessEqual);
DEFINE_COMPARE_WARPPER(uint8_t, LessEqual, _LessEqual);
DEFINE_COMPARE_WARPPER(int, LessEqual, _LessEqual);
DEFINE_COMPARE_WARPPER(int64_t, LessEqual, _LessEqual);
DEFINE_COMPARE_WARPPER(float, LessEqual, _LessEqual);
DEFINE_COMPARE_WARPPER(double, LessEqual, _LessEqual);
DEFINE_COMPARE_FP16_WARPPER(LessEqual);
DEFINE_COMPARE_WARPPER(bool, Greater, _Greater);
DEFINE_COMPARE_WARPPER(int8_t, Greater, _Greater);
DEFINE_COMPARE_WARPPER(uint8_t, Greater, _Greater);
......@@ -147,6 +206,15 @@ DEFINE_COMPARE_WARPPER(float, Greater, _Greater);
DEFINE_COMPARE_WARPPER(double, Greater, _Greater);
DEFINE_COMPARE_FP16_WARPPER(Greater);
DEFINE_COMPARE_WARPPER(bool, GreaterEqual, _GreaterEqual);
DEFINE_COMPARE_WARPPER(int8_t, GreaterEqual, _GreaterEqual);
DEFINE_COMPARE_WARPPER(uint8_t, GreaterEqual, _GreaterEqual);
DEFINE_COMPARE_WARPPER(int, GreaterEqual, _GreaterEqual);
DEFINE_COMPARE_WARPPER(int64_t, GreaterEqual, _GreaterEqual);
DEFINE_COMPARE_WARPPER(float, GreaterEqual, _GreaterEqual);
DEFINE_COMPARE_WARPPER(double, GreaterEqual, _GreaterEqual);
DEFINE_COMPARE_FP16_WARPPER(GreaterEqual);
#undef DEFINE_COMPARE_WARPPER
#undef DEFINE_COMPARE_FP16_WARPPER
......
......@@ -129,7 +129,7 @@ __global__ void _SoftmaxFocalLossGrad(
} else {
const int t = (oix * axis_dim + label) * inner_dim + iix;
Tx onemp = 1 - prob[t];
// unstable if gamma is 0
// Unstable if gamma is 0
Tx grad = -gamma * pow(onemp, gamma - 1)
* log(max(prob[t], FLT_MIN))
* prob[t] + pow(onemp, gamma);
......
#include "core/workspace.h"
#include "utils/op_kernel.h"
#include "operators/ndarray/arange_op.h"
#include "operators/array/arange_op.h"
namespace dragon {
......
#include "utils/op_kernel.h"
#include "utils/math_functions.h"
#include "operators/ndarray/argreduce_op.h"
#include "operators/array/argreduce_op.h"
namespace dragon {
......
#include "core/workspace.h"
#include "utils/op_kernel.h"
#include "operators/ndarray/concat_op.h"
#include "operators/array/concat_op.h"
namespace dragon {
......
#include "core/workspace.h"
#include "utils/op_kernel.h"
#include "utils/math_functions.h"
#include "operators/ndarray/crop_op.h"
#include "operators/array/crop_op.h"
namespace dragon {
......
#include "core/workspace.h"
#include "operators/ndarray/dimension_op.h"
#include "operators/array/dimension_op.h"
namespace dragon {
......
#include "core/workspace.h"
#include "operators/ndarray/dimension_op.h"
#include "operators/array/dimension_op.h"
namespace dragon {
......
#include "core/workspace.h"
#include "utils/op_kernel.h"
#include "utils/math_functions.h"
#include "operators/ndarray/gather_op.h"
#include "operators/array/gather_op.h"
namespace dragon {
......
#include "core/workspace.h"
#include "utils/cast.h"
#include "utils/op_kernel.h"
#include "operators/array/multinomial_op.h"
namespace dragon {
template <class Context>
void MultinomialOp<Context>::SoftmaxRun() {
auto softmax_def = MakeOperatorDef("Softmax", "",
vector<string>({ Input(0).name() }),
vector<string>({ mount_name("softmax/prob") }));
Argument arg; arg.set_name("axis"); arg.set_i(axis);
softmax_def.add_arg()->CopyFrom(arg);
if (def().has_device_option())
softmax_def.mutable_device_option()->CopyFrom(
def().device_option());
if (softmax_op) { softmax_op->UpdateFrom(softmax_def); }
else { softmax_op.reset(NewOperator(softmax_def, ws())); }
softmax_op->Run(ctx()->stream_id());
prob = ws()->GetTensor(mount_name("softmax/prob"));
}
template <class Context> template <typename T>
void MultinomialOp<Context>::RunWithType() {
auto* Xdata = normalize ?
prob->template data<T, CPUContext>() :
Input(0).template data<T, CPUContext>();
vector<double> cumsum(Input(0).dim(axis));
auto* Sdata = static_cast<double*>(cumsum.data());
auto* Ydata = Output(0)->template mutable_data<int64_t, CPUContext>();
double running_total, r;
int idx = 0, num_classes = Input(0).dim(axis);
auto* rng = ctx()->rand_generator();
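// For each outer slice: build the (unnormalized) CDF over the classes,
// then draw ``num_samples`` uniform values in [0, total) and locate them
// with std::upper_bound to get the sampled indices.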
for (int i = 0; i < outer_dim; ++i) {
running_total = 0.;
for (int j = 0; j < num_classes; ++j) {
running_total += (double)Xdata[j];
Sdata[j] = running_total;
}
std::uniform_real_distribution<double> dist(
0.f, running_total);
for (int j = 0; j < (int)num_samples; ++j) {
r = dist(*rng);
auto found_iter = std::upper_bound(
Sdata, Sdata + num_classes, r);
Ydata[idx++] = std::distance(Sdata, found_iter);
}
Xdata += num_classes;
}
Output(0)->template data<int64_t, Context>();
}
template <class Context>
void MultinomialOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // Enforce DefaultStream
axis = Input(0).ndim() - 1;
auto output_dims = Input(0).dims();
output_dims[axis] = num_samples;
outer_dim = Input(0).count(0, axis);
Output(0)->Reshape(output_dims);
// Normalize the logits if necessary
if (normalize) SoftmaxRun();
if (XIsType(Input(0), int8_t)) RunWithType<int8_t>();
else if (XIsType(Input(0), uint8_t)) RunWithType<uint8_t>();
else if (XIsType(Input(0), int)) RunWithType<int>();
else if (XIsType(Input(0), int64_t)) RunWithType<int64_t>();
else if (XIsType(Input(0), float)) RunWithType<float>();
else if (XIsType(Input(0), double)) RunWithType<double>();
else LOG(FATAL) << DTypeHelper(Input(0), {
"bool", "int8", "uint8", "int32", "int64",
"float32", "float64",
});
}
DEPLOY_CPU(Multinomial);
#ifdef WITH_CUDA
DEPLOY_CUDA(Multinomial);
#endif
OPERATOR_SCHEMA(Multinomial).NumInputs(1).NumOutputs(1);
NO_GRADIENT(Multinomial);
} // namespace dragon
\ No newline at end of file
#include "utils/op_kernel.h"
#include "utils/math_functions.h"
#include "operators/ndarray/one_hot_op.h"
#include "operators/array/one_hot_op.h"
namespace dragon {
......
#include "core/workspace.h"
#include "utils/op_kernel.h"
#include "utils/math_functions.h"
#include "operators/ndarray/pad_op.h"
#include "operators/array/pad_op.h"
namespace dragon {
......
#include "core/workspace.h"
#include "utils/op_kernel.h"
#include "utils/math_functions.h"
#include "operators/ndarray/reduce_op.h"
#include "operators/array/reduce_op.h"
namespace dragon {
......
#include "core/workspace.h"
#include "utils/op_kernel.h"
#include "operators/ndarray/repeat_op.h"
#include "operators/array/repeat_op.h"
namespace dragon {
......
#include "core/workspace.h"
#include "operators/ndarray/dimension_op.h"
#include "operators/array/dimension_op.h"
namespace dragon {
......
#include "operators/ndarray/shape_op.h"
#include "operators/array/shape_op.h"
namespace dragon {
......
#include "utils/op_kernel.h"
#include "utils/math_functions.h"
#include "operators/ndarray/slice_op.h"
#include "operators/array/slice_op.h"
namespace dragon {
......
#include "core/workspace.h"
#include "operators/ndarray/dimension_op.h"
#include "operators/array/dimension_op.h"
namespace dragon {
......
#include "core/workspace.h"
#include "utils/op_kernel.h"
#include "operators/ndarray/stack_op.h"
#include "operators/array/stack_op.h"
namespace dragon {
......
#include "core/workspace.h"
#include "utils/op_kernel.h"
#include "operators/ndarray/tile_op.h"
#include "operators/array/tile_op.h"
namespace dragon {
......
#include "core/workspace.h"
#include "utils/op_kernel.h"
#include "operators/ndarray/transpose_op.h"
#include "operators/array/transpose_op.h"
namespace dragon {
......
#include "core/workspace.h"
#include "utils/op_kernel.h"
#include "utils/math_functions.h"
#include "operators/control_flow/compare_op.h"
namespace dragon {
......@@ -26,31 +28,44 @@ using kernel::Greater;
template <class Context> template <typename T> \
void CompareOp<Context>::Operation##RunWithType() { \
auto* Adata = Input(0).template data<T, Context>(); \
auto* Bdata = Input(1).template data<T, Context>(); \
const T* Bdata = nullptr; \
auto* Ydata = Output(0)->template mutable_data<bool, Context>(); \
if (Input(1).count() == 1) { \
auto* WSdata = ws()->template caches<T, Context> \
({ Input(0).count() })[0]; \
auto* BCdata = Input(1).template data<T, CPUContext>(); \
math::Set(Input(0).count(), BCdata[0], WSdata, ctx()); \
Bdata = WSdata; \
} else { Bdata = Input(1).template data<T, Context>(); } \
kernel::Operation(Output(0)->count(), Adata, Bdata, Ydata, ctx()); \
}
template <class Context>
void CompareOp<Context>::RunOnDevice() {
CHECK_EQ(Input(0).count(), Input(1).count())
if (Input(0).count() != Input(1).count()) {
CHECK_EQ(Input(1).count(), 1)
<< "\nBoth A and B should have the same number of elements."
<< "\nOr the B should be a Scalar."
<< "\nDimensions of A and B are " << Input(0).DimString()
<< " and " << Input(1).DimString();
}
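// When B is a scalar, the typed callers defined above broadcast it into
// a workspace buffer of A's size before comparing element-wise.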
Output(0)->ReshapeLike(Input(0));
if (operation == "EQUAL") { DEFINE_TYPED_CALLER(Equal); }
else if (operation == "LESS") { DEFINE_TYPED_CALLER(Less); }
else if (operation == "GREATER") { DEFINE_TYPED_CALLER(Greater); }
if (operation == "EQ") { DEFINE_TYPED_CALLER(Equal); }
else if (operation == "LT") { DEFINE_TYPED_CALLER(Less); }
else if (operation == "GT") { DEFINE_TYPED_CALLER(Greater); }
else if (operation == "LE") { DEFINE_TYPED_CALLER(LessEqual); }
else if (operation == "GE") { DEFINE_TYPED_CALLER(GreaterEqual); }
else { LOG(FATAL) << "Unsupported operation: " << operation << "."; }
if (to_uint8) Output(0)->SetMeta(TypeMeta::Make<uint8_t>());
}
DEFINE_OP_CALLER(Equal);
DEFINE_OP_CALLER(Less);
DEFINE_OP_CALLER(LessEqual);
DEFINE_OP_CALLER(Greater);
DEFINE_OP_CALLER(GreaterEqual);
DEPLOY_CPU(Compare);
#ifdef WITH_CUDA
......
......@@ -4,8 +4,8 @@
#include "utils/string.h"
#include "utils/math_functions.h"
#include "utils/proto_utils.h"
#include "operators/array/slice_op.h"
#include "operators/control_flow/scan_op.h"
#include "operators/ndarray/slice_op.h"
namespace dragon {
......
......@@ -41,7 +41,7 @@ void L1LossOp<Context>::RunWithType() {
template <class Context>
void L1LossOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // Enforce SyncStream
ctx()->set_stream_id(0); // Enforce DefaultStream
for (int i = 1; i < InputSize(); i++) {
CHECK_EQ(Input(0).count(), Input(i).count())
......
......@@ -25,6 +25,7 @@ void SoftmaxCrossEntropyOp<Context>::SoftmaxRun() {
if (softmax_op) { softmax_op->UpdateFrom(softmax_def); }
else { softmax_op.reset(NewOperator(softmax_def, ws())); }
softmax_op->Run(ctx()->stream_id());
prob = ws()->GetTensor(mount_name("softmax/prob"));
}
template <class Context> template <typename T>
......@@ -69,7 +70,6 @@ void SoftmaxCrossEntropyOp<Context>::RunOnDevice() {
CHECK_EQ(Input(0).count(), Input(1).count())
<< "\nNumber of predictions must match the number of labels.";
losses.ReshapeLike(Input(0));
prob = ws()->CreateTensor(mount_name("softmax/prob"));
SoftmaxRun();
......
......@@ -60,8 +60,6 @@ void SoftmaxFocalLossOp<Context>::RunOnDevice() {
flags.Reshape({ outer_dim * inner_dim });
losses.Reshape({ outer_dim * inner_dim });
this->prob = ws()->CreateTensor(
mount_name("softmax/prob"));
this->SoftmaxRun();
if (XIsType(Input(0), float)) {
......
......@@ -25,6 +25,7 @@ void SparseSoftmaxCrossEntropyOp<Context>::SoftmaxRun() {
if (softmax_op) { softmax_op->UpdateFrom(softmax_def); }
else { softmax_op.reset(NewOperator(softmax_def, ws())); }
softmax_op->Run(ctx()->stream_id());
prob = ws()->GetTensor(mount_name("softmax/prob"));
}
template <class Context> template <typename Tx, typename Ty>
......@@ -75,8 +76,6 @@ void SparseSoftmaxCrossEntropyOp<Context>::RunOnDevice() {
losses.Reshape({ outer_dim * inner_dim });
flags.Reshape({ outer_dim * inner_dim });
prob = ws()->CreateTensor(
mount_name("softmax/prob"));
SoftmaxRun();
if (XIsType(Input(0), float)) {
......
......@@ -17,7 +17,7 @@ void CuDNNRecurrentOpBase<Context>::ResetDesc() {
const auto num_directions = bidirectional ? 2 : 1;
const auto output_dim = hidden_size * num_directions;
// setup dropout
// Setup Dropout
if (dropout_ratio < 1.f) {
#if CUDNN_VERSION_MIN(7, 0, 0)
if (!states_initialized) {
......@@ -45,7 +45,7 @@ void CuDNNRecurrentOpBase<Context>::ResetDesc() {
#endif
}
// setup rnn
// Setup RNN
#if CUDNN_VERSION_MIN(7, 0, 0)
CUDNN_CHECK(cudnnSetRNNDescriptor(
ctx()->cudnn_handle(), rnn_desc,
......@@ -63,7 +63,7 @@ void CuDNNRecurrentOpBase<Context>::ResetDesc() {
CUDNNType<T>::type));
#endif
// setup Xs & Ys & Y
// Setup Xs & Ys & Y
xs_desc.reset(new cudnnTensorDescriptors(seq_length));
xs_desc->Set<T>({ batch_size, input_dim, 1 }, { input_dim, 1, 1 });
ys_desc.reset(new cudnnTensorDescriptors(seq_length));
......@@ -72,14 +72,14 @@ void CuDNNRecurrentOpBase<Context>::ResetDesc() {
rnn_desc, seq_length, xs_desc->descs(), &workspace_size));
output_dims = { seq_length, batch_size, output_dim };
// setup Hx & Cx & Hy & Cy
// Setup Hx & Cx & Hy & Cy
hidden_dims = { num_layers * num_directions, batch_size, hidden_size };
cudnnSetTensorDesc<T>(&hx_desc, hidden_dims);
cudnnSetTensorDesc<T>(&cx_desc, hidden_dims);
cudnnSetTensorDesc<T>(&hy_desc, hidden_dims);
cudnnSetTensorDesc<T>(&cy_desc, hidden_dims);
// setup packed weights
// Setup packed weights
size_t weights_size; int64_t weights_count;
CUDNN_CHECK(cudnnGetRNNParamsSize(
ctx()->cudnn_handle(), rnn_desc, xs_desc->descs()[0],
......@@ -94,7 +94,7 @@ void CuDNNRecurrentOpBase<Context>::ResetDesc() {
CUDNNType<T>::type, CUDNN_TENSOR_NCHW, 3,
vector<int>({ (int)weights_count, 1, 1 }).data()));
// setup rnn workspace
// Determine the RNN workspace
CUDNN_CHECK(cudnnGetRNNWorkspaceSize(
ctx()->cudnn_handle(), rnn_desc, seq_length,
xs_desc->descs(), &workspace_size));
......@@ -181,7 +181,7 @@ void CuDNNRecurrentGradientOp<Context>::RunWithType() {
};
auto* WSdata = ws()->template caches<Context>({ workspace_size })[0];
// check the reserve space
// Check the ReserveSpace
CUDNN_CHECK(cudnnGetRNNTrainingReserveSize(ctx()->cudnn_handle(),
rnn_desc, seq_length, xs_desc->descs(), &reserve_size));
auto* reserveT = ws()->GetTensor(mount_name("rnn/reserve"));
......