Commit 3b990761 by Ting PAN

Merge into the DimensionOp

1 parent abae2712
Showing with 907 additions and 432 deletions
@@ -18,7 +18,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     python3-tk \
     && rm -rf /var/lib/apt/lists/*
-RUN pip3 install --no-cache-dir --upgrade setuptools wheel && \
+RUN pip3 install --no-cache-dir --upgrade setuptools wheel -i https://pypi.tuna.tsinghua.edu.cn/simple && \
     pip3 install --no-cache-dir -i https://pypi.tuna.tsinghua.edu.cn/simple \
     numpy \
     protobuf \
@@ -27,6 +27,7 @@ RUN pip3 install --no-cache-dir --upgrade setuptools wheel && \
     six \
     Pillow \
     matplotlib \
+    scikit-image \
     pyyaml \
     cython
......
@@ -21,7 +21,7 @@ RUN rm /etc/apt/sources.list.d/cuda.list && rm /etc/apt/sources.list.d/nvidia-ml
     python3-tk \
     && rm -rf /var/lib/apt/lists/*
-RUN pip3 install --no-cache-dir --upgrade setuptools wheel && \
+RUN pip3 install --no-cache-dir --upgrade setuptools wheel -i https://pypi.tuna.tsinghua.edu.cn/simple && \
     pip3 install --no-cache-dir -i https://pypi.tuna.tsinghua.edu.cn/simple \
     numpy \
     protobuf \
@@ -30,6 +30,7 @@ RUN pip3 install --no-cache-dir --upgrade setuptools wheel && \
     six \
     Pillow \
     matplotlib \
+    scikit-image \
     pyyaml \
     cython
......
@@ -52,9 +52,9 @@ using Set = std::unordered_set<Value>;
 /*
  * Define the Kernel version.
  *
- * | Major(2) | Minor(2) | Patch(09) |
+ * | Major(2) | Minor(2) | Patch(10) |
  */
-#define DRAGON_VERSION 2209
+#define DRAGON_VERSION 2210

 /*
  * Define the default random seed.
......
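
For reference, the version word packs major/minor/patch as decimal fields. A quick check of the packing (the major*1000 + minor*100 + patch layout is inferred from the 2.2.10 -> 2210 bump and the comment above, not stated elsewhere):

    DRAGON_VERSION = 2210                 # as defined above

    major = DRAGON_VERSION // 1000        # -> 2
    minor = DRAGON_VERSION // 100 % 10    # -> 2
    patch = DRAGON_VERSION % 100          # -> 10
    assert (major, minor, patch) == (2, 2, 10)
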
@@ -114,7 +114,7 @@ class Operator : public OperatorBase {
     virtual void MakeResource();
     virtual void CleanResource();

-    void MemorySwitch() {
+    virtual void MemorySwitch() {
         for (auto* I : inputs_)
             if(I->name() != "ignore") I->SwitchToDevice();
         for (auto* O : outputs_)
......
@@ -40,7 +40,8 @@ class Tensor {
             capacity_ = 0;
         }
     } else {
-        if (ex_memory_ && TIndex(ex_memory_->nbytes()) <
+        if (ex_memory_ && !is_shared_ &&
+            TIndex(ex_memory_->nbytes()) <
                 TIndex(new_size * meta_.itemsize())) {
             delete ex_memory_;
             ex_memory_ = nullptr;
@@ -232,18 +233,18 @@ class Tensor {
         return static_cast<const T*>(raw_data<Context>());
     }

-    template <class DstCTX, class SrcCTX>
-    inline void Copy(const Tensor& other) {
+    template <class Context>
+    inline void CopyFrom(const Tensor& other) {
         CHECK_EQ(size_, other.size_);
-        auto* src = other.template raw_data<SrcCTX>();
-        auto* dst = raw_mutable_data<DstCTX>(other.meta_);
+        auto* src = other.template raw_data<Context>();
+        auto* dst = raw_mutable_data<Context>(other.meta_);
         if (dst == src) return;
-        if (TypeMeta::Id<DstCTX>() ==
+        if (TypeMeta::Id<Context>() ==
                 TypeMeta::Id<CPUContext>()) {
-            CPUContext::Memcpy<DstCTX, SrcCTX>(nbytes(), dst, src);
+            CPUContext::Memcpy<Context, Context>(nbytes(), dst, src);
-        } else if (TypeMeta::Id<DstCTX>() ==
+        } else if (TypeMeta::Id<Context>() ==
                 TypeMeta::Id<CUDAContext>()) {
-            CUDAContext::Memcpy<DstCTX, SrcCTX>(nbytes(), dst, src);
+            CUDAContext::Memcpy<Context, Context>(nbytes(), dst, src);
         }
     }
@@ -253,6 +254,8 @@ class Tensor {
         own_mem_ = false;
     }

+    inline void Share(MixedMemory* mem) { Move(mem); is_shared_ = true; }
+
     inline void Reset() {
         size_ = capacity_ = 0;
         meta_ = TypeMeta();
@@ -271,7 +274,8 @@ class Tensor {
     string name_;
     shared_ptr<MixedMemory> memory_;
     MixedMemory* ex_memory_ = nullptr;
-    bool is_corrupted_ = false, own_mem_ = true;
+    bool is_corrupted_ = false, is_shared_ = false;
+    bool own_mem_ = true;
 };

 } // namespace dragon
......
@@ -49,7 +49,8 @@ inline const TypeMeta& TypeStringToMeta(
     { "int64", TypeMeta::Make<int64_t>() },
     { "float64", TypeMeta::Make<double>() },
     { "float16", TypeMeta::Make<float16>() },
-    { "uint8", TypeMeta::Make<uint8_t>() }
+    { "uint8", TypeMeta::Make<uint8_t>() },
+    { "int8", TypeMeta::Make<char>() },
 };
 static TypeMeta unknown_type;
 return s2m_type_map.count(str_type) ?
@@ -65,7 +66,8 @@ inline const std::string TypeMetaToString(
     { TypeMeta::Id<int64_t>(), "int64" },
     { TypeMeta::Id<double>(), "float64" },
     { TypeMeta::Id<float16>(), "float16" },
-    { TypeMeta::Id<uint8_t>(), "uint8" }
+    { TypeMeta::Id<uint8_t>(), "uint8" },
+    { TypeMeta::Id<char>(), "int8" }
 };
 return m2s_type_map.count(meta.id()) ?
     m2s_type_map[meta.id()] : "unknown";
......
+// ------------------------------------------------------------
+// Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
+//
+// Licensed under the BSD 2-Clause License.
+// You should have received a copy of the BSD 2-Clause License
+// along with the software. If not, See,
+//
+//    <https://opensource.org/licenses/BSD-2-Clause>
+//
+// -------------------------------------------------------------
+
+#ifndef DRAGON_OPERATORS_NDARRAY_DIMENSION_OP_H_
+#define DRAGON_OPERATORS_NDARRAY_DIMENSION_OP_H_
+
+#include "core/operator.h"
+
+namespace dragon {
+
+/*********************************************
+*                                            *
+*                    Base                    *
+*                                            *
+**********************************************/
+
+template <class Context>
+class DimOpBase : public Operator<Context> {
+ public:
+    USE_SIMPLE_CTOR_DTOR(DimOpBase);
+
+    void MemorySwitch() override {
+        /* Disable the Memory Activation */
+    }
+};
+
+template <class Context>
+class DimGradientOpBase : public Operator<Context> {
+ public:
+    USE_SIMPLE_CTOR_DTOR(DimGradientOpBase);
+    USE_OPERATOR_FUNCTIONS;
+
+    void RunOnDevice() override {
+        // simply copy the dY to dX
+        Output(0)->ReshapeLike(Input(0));
+        if (Output(0)->name() != Input(-1).name())
+            Output(0)->template CopyFrom<Context>(Input(-1));
+    }
+};
+
+#define DEFINE_DIMENSION_GRADIENT_OP(name) \
+    template <class Context> \
+    class name##GradientOp final : public DimGradientOpBase<Context> { \
+     public: \
+        name##GradientOp(const OperatorDef& def, Workspace* ws) \
+            : DimGradientOpBase<Context>(def, ws) {} \
+    };
+
+/*********************************************
+*                                            *
+*                   Reshape                  *
+*                                            *
+**********************************************/
+
+template <class Context>
+class ReshapeOp final : public DimOpBase<Context> {
+ public:
+    ReshapeOp(const OperatorDef& def, Workspace* ws)
+        : DimOpBase<Context>(def, ws),
+          shape_like_desc(OperatorBase::Arg<string>("shape_like", "")) {
+        GET_ARGUMENTS_WITH_DESC(int, shape);
+    }
+    USE_OPERATOR_FUNCTIONS;
+
+    void RunOnDevice() override;
+
+ protected:
+    DECLARE_ARGUMENTS_WITH_DESC(int, shape);
+    string shape_like_desc;
+    vector<TIndex> require_shape, new_shape;
+};
+
+DEFINE_ARGUMENTS_WITH_DESC(int, ReshapeOp, shape);
+DEFINE_DIMENSION_GRADIENT_OP(Reshape);
+
+/*********************************************
+*                                            *
+*                   Flatten                  *
+*                                            *
+**********************************************/
+
+template <class Context>
+class FlattenOp final : public DimOpBase<Context> {
+ public:
+    FlattenOp(const OperatorDef& def, Workspace* ws)
+        : DimOpBase<Context>(def, ws),
+          axis(OperatorBase::Arg<int>("axis", 0)),
+          num_axes(OperatorBase::Arg<int>("num_axes", -1)),
+          keep_axes(OperatorBase::Arg<int>("keep_axes", INT_MAX)) {}
+    USE_OPERATOR_FUNCTIONS;
+
+    void RunOnDevice() override;
+
+ protected:
+    TIndex axis, num_axes, keep_axes;
+};
+
+DEFINE_DIMENSION_GRADIENT_OP(Flatten);
+
+/*********************************************
+*                                            *
+*                 Expand Dims                *
+*                                            *
+**********************************************/
+
+template <class Context>
+class ExpandDimsOp final : public DimOpBase<Context> {
+ public:
+    ExpandDimsOp(const OperatorDef& def, Workspace* ws)
+        : DimOpBase<Context>(def, ws),
+          axis(OperatorBase::Arg<int>("axis", INT_MAX)) {
+        if (axis == INT_MAX)
+            LOG(FATAL) << "Expected an axis to insert the new dim.";
+    }
+    USE_OPERATOR_FUNCTIONS;
+
+    void RunOnDevice() override;
+
+ protected:
+    TIndex axis;
+};
+
+DEFINE_DIMENSION_GRADIENT_OP(ExpandDims);
+
+/*********************************************
+*                                            *
+*                   Squeeze                  *
+*                                            *
+**********************************************/
+
+template <class Context>
+class SqueezeOp final : public DimOpBase<Context> {
+ public:
+    SqueezeOp(const OperatorDef& def, Workspace* ws)
+        : DimOpBase<Context>(def, ws),
+          axis(OperatorBase::Arg<int>("axis", INT_MAX)) {}
+    USE_OPERATOR_FUNCTIONS;
+
+    void RunOnDevice() override;
+
+ protected:
+    TIndex axis;
+};
+
+DEFINE_DIMENSION_GRADIENT_OP(Squeeze);
+
+} // namespace dragon
+
+#endif // DRAGON_OPERATORS_NDARRAY_DIMENSION_OP_H_
\ No newline at end of file
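
All four ops above only rewrite dimension metadata, which is why one DimGradientOpBase suffices: dX is simply dY reshaped like X. A minimal shape-only sketch of the merged ops from the Python side, assuming the usual dragon.ops facade (tensor name and shapes here are illustrative):

    import dragon.ops as ops
    from dragon.core.tensor import Tensor

    a = Tensor(shape=[1, 2, 3, 4]).Variable()

    b = ops.Reshape(a, shape=[2, 12])    # data viewed as [2, 12]
    c = ops.Flatten(a, axis=1)           # [1, 2, 3, 4] -> [1, 24]
    d = ops.ExpandDims(a, axis=0)        # -> [1, 1, 2, 3, 4]
    e = ops.Squeeze(a)                   # -> [2, 3, 4]
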
-// ------------------------------------------------------------
-// Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
-//
-// Licensed under the BSD 2-Clause License.
-// You should have received a copy of the BSD 2-Clause License
-// along with the software. If not, See,
-//
-//    <https://opensource.org/licenses/BSD-2-Clause>
-//
-// -------------------------------------------------------------
-
-#ifndef DRAGON_OPERATORS_NDARRAY_EXPAND_DIMS_OP_H_
-#define DRAGON_OPERATORS_NDARRAY_EXPAND_DIMS_OP_H_
-
-#include "core/operator.h"
-
-namespace dragon {
-
-template <class Context>
-class ExpandDimsOp final : public Operator<Context> {
- public:
-    ExpandDimsOp(const OperatorDef& def, Workspace* ws)
-        : Operator<Context>(def, ws),
-          axis(OperatorBase::Arg<int>("axis", -1)) {}
-    USE_OPERATOR_FUNCTIONS;
-
-    void RunOnDevice() override;
-
- protected:
-    TIndex axis;
-};
-
-template <class Context>
-class ExpandDimsGradientOp final : public Operator<Context> {
- public:
-    USE_SIMPLE_CTOR_DTOR(ExpandDimsGradientOp);
-    USE_OPERATOR_FUNCTIONS;
-    void RunOnDevice() override;
-};
-
-} // namespace dragon
-
-#endif // DRAGON_OPERATORS_NDARRAY_EXPAND_DIMS_OP_H_
\ No newline at end of file
-// ------------------------------------------------------------
-// Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
-//
-// Licensed under the BSD 2-Clause License.
-// You should have received a copy of the BSD 2-Clause License
-// along with the software. If not, See,
-//
-//    <https://opensource.org/licenses/BSD-2-Clause>
-//
-// -------------------------------------------------------------
-
-#ifndef DRAGON_OPERATORS_NDARRAY_FLATTEN_OP_H_
-#define DRAGON_OPERATORS_NDARRAY_FLATTEN_OP_H_
-
-#include "core/operator.h"
-
-namespace dragon {
-
-template <class Context>
-class FlattenOp final : public Operator<Context> {
- public:
-    FlattenOp(const OperatorDef& def, Workspace* ws)
-        : Operator<Context>(def, ws),
-          axis(OperatorBase::Arg<int>("axis", 0)),
-          num_axes(OperatorBase::Arg<int>("num_axes", -1)),
-          keep_axes(OperatorBase::Arg<int>("keep_axes", INT_MAX)) {}
-    USE_OPERATOR_FUNCTIONS;
-
-    void RunOnDevice() override;
-    void SqueezeRun();
-    void KeepRun();
-
- protected:
-    TIndex axis, num_axes, keep_axes;
-};
-
-template <class Context>
-class FlattenGradientOp final : public Operator<Context> {
- public:
-    USE_SIMPLE_CTOR_DTOR(FlattenGradientOp);
-    USE_OPERATOR_FUNCTIONS;
-    void RunOnDevice() override;
-};
-
-} // namespace dragon
-
-#endif // DRAGON_OPERATORS_NDARRAY_FLATTEN_OP_H_
\ No newline at end of file
-// ------------------------------------------------------------
-// Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
-//
-// Licensed under the BSD 2-Clause License.
-// You should have received a copy of the BSD 2-Clause License
-// along with the software. If not, See,
-//
-//    <https://opensource.org/licenses/BSD-2-Clause>
-//
-// -------------------------------------------------------------
-
-#ifndef DRAGON_OPERATORS_NDARRAY_RESHAPE_OP_H_
-#define DRAGON_OPERATORS_NDARRAY_RESHAPE_OP_H_
-
-#include "core/operator.h"
-
-namespace dragon {
-
-template <class Context>
-class ReshapeOp final : public Operator<Context> {
- public:
-    ReshapeOp(const OperatorDef& def, Workspace* ws)
-        : Operator<Context>(def, ws),
-          shape_like_desc(OperatorBase::Arg<string>("shape_like", "")) {
-        GET_ARGUMENTS_WITH_DESC(int, shape);
-    }
-    USE_OPERATOR_FUNCTIONS;
-
-    void RunOnDevice() override;
-
- protected:
-    DECLARE_ARGUMENTS_WITH_DESC(int, shape);
-    string shape_like_desc;
-    vector<TIndex> require_shape, new_shape;
-};
-
-template <class Context>
-class ReshapeGradientOp final : public Operator<Context> {
- public:
-    USE_SIMPLE_CTOR_DTOR(ReshapeGradientOp);
-    USE_OPERATOR_FUNCTIONS;
-    void RunOnDevice() override;
-};
-
-DEFINE_ARGUMENTS_WITH_DESC(int, ReshapeOp, shape);
-
-} // namespace dragon
-
-#endif // DRAGON_OPERATORS_NDARRAY_RESHAPE_OP_H_
\ No newline at end of file
@@ -55,7 +55,7 @@ class ConvOpBase : public Operator<Context> {
     void GradientReshape();
     virtual void ComputeOutputShape();
     virtual bool ReverseDimensions() = 0;
-    virtual bool HasBias() = 0;
+    virtual bool HasBias() { NOT_IMPLEMENTED; return true; }

     template <typename T> void Wx(const T* x,
         const T* weights, T* y, bool skip_im2col = false);
......
@@ -16,12 +16,12 @@ int type_from_string(std::string type) {
 }

 Device::Device()
-    : device_type_(CPU), device_id_(0) {}
+    : device_type_(0), device_id_(0) {}

 Device::Device(std::string device_type, int device_id)
-    : device_type_((DeviceType)type_from_string(device_type)), device_id_(device_id) {}
+    : device_type_(type_from_string(device_type)), device_id_(device_id) {}

 Device::Device(std::string device_type)
-    : device_type_((DeviceType)type_from_string(device_type)), device_id_(0) {}
+    : device_type_(type_from_string(device_type)), device_id_(0) {}

 } // namespace dragon
\ No newline at end of file
@@ -6,7 +6,6 @@
 #include <google/protobuf/io/zero_copy_stream_impl.h>

 #include "dragon.h"
-#include "protos/dragon.pb.h"
 #include "core/common.h"
 #include "core/workspace.h"
 #include "utils/caffemodel.h"
@@ -35,7 +34,8 @@ Workspace* ResetWorkspace(const std::string& name) {
     g_workspaces[name].reset(new Workspace(name));
     for (auto& sub_workspace : sub_workspaces[name]) {
         if (g_workspaces.count(sub_workspace) > 0)
-            g_workspaces[name]->MoveWorkspace(g_workspaces[sub_workspace].get());
+            g_workspaces[name]->MoveWorkspace(
+                g_workspaces[sub_workspace].get());
     }
     return g_workspaces[name].get();
 }
@@ -49,7 +49,9 @@ void ReleaseWorkspace(const std::string& name) {
     g_workspaces.erase(name);
 }

-void MoveWorkspace(Workspace* target_ws, Workspace* source_ws) {
+void MoveWorkspace(
+    Workspace* target_ws,
+    Workspace* source_ws) {
     std::unique_lock<std::mutex> lock(g_mutex);
     CHECK(source_ws) << "\nThe given source workspace is invalid.";
     CHECK(target_ws) << "\nThe given target workspace is invalid.";
@@ -59,7 +61,9 @@ void MoveWorkspace(Workspace* target_ws, Workspace* source_ws) {
         << "into the Workspace(" << target_ws->name() << ").";
 }

-std::string CreateGraph(const std::string& graph_file, Workspace* ws) {
+std::string CreateGraph(
+    const std::string& graph_file,
+    Workspace* ws) {
     GraphDef meta_graph;
     int fd = open(graph_file.c_str(), O_RDONLY);
     CHECK_NE(fd, -1) << "\nFile not found: " << graph_file;
@@ -75,7 +79,10 @@ std::string CreateGraph(const std::string& graph_file, Workspace* ws) {
     return meta_graph.name();
 }

-std::string CreateGraph(const std::string& graph_file, const Device& device, Workspace* ws) {
+std::string CreateGraph(
+    const std::string& graph_file,
+    const Device& device,
+    Workspace* ws) {
     GraphDef meta_graph;
     int fd = open(graph_file.c_str(), O_RDONLY);
     CHECK_NE(fd, -1) << "\nFile not found: " << graph_file;
@@ -95,26 +102,29 @@ std::string CreateGraph(const std::string& graph_file, const Device& device, Wor
     return meta_graph.name();
 }

-void CreateTensor(const std::string& name, Workspace* ws) {
+void CreateTensor(
+    const std::string& name,
+    Workspace* ws) {
     ws->CreateTensor(name);
 }

 template <typename T>
-void FeedTensor(const std::string& name,
+void FeedTensor(
+    const std::string& name,
     const vector<TIndex>& shape,
     const T* data,
     const Device& device,
     Workspace* ws) {
     Tensor* tensor = ws->CreateTensor(name);
     tensor->Reshape(shape);
-    if (device.device_type() == CUDA) {
+    if (device.device_type() == 1) {
         CUDAContext context(device.device_id());
         context.SwitchToDevice();
         tensor->mutable_data<T, CUDAContext>();
         context.Memcpy<CUDAContext, CPUContext>(tensor->nbytes(),
             tensor->raw_mutable_data<CUDAContext>(),
             static_cast<const void*>(data));
-    } else if (device.device_type() == CPU) {
+    } else if (device.device_type() == 0) {
         CPUContext context;
         tensor->mutable_data<T, CPUContext>();
         context.Memcpy<CPUContext, CPUContext>(tensor->nbytes(),
@@ -125,7 +135,9 @@ void FeedTensor(const std::string& name,
     }
 }

-void TransplantCaffeModel(const std::string& input_model, const std::string& output_model) {
+void TransplantCaffeModel(
+    const std::string& input_model,
+    const std::string& output_model) {
     TensorProtos protos;
     NetParameter net_param;
     ReadProtoFromBinaryFile(input_model.c_str(), &net_param);
@@ -151,13 +163,16 @@ void TransplantCaffeModel(const std::string& input_model, const std::string& out
                 << ", size: " << blob.data_size();
         }
     }
-    std::fstream output(output_model, std::ios::out | std::ios::trunc | std::ios::binary);
+    std::fstream output(output_model,
+        std::ios::out | std::ios::trunc | std::ios::binary);
     CHECK(protos.SerializeToOstream(&output));
     LOG(INFO) << "save the model @: " << output_model << "......";
     LOG(INFO) << "model format: DragonModel";
 }
-void LoadDragonmodel(const std::string& model_file, Workspace* ws){
+void LoadDragonmodel(
+    const std::string& model_file,
+    Workspace* ws){
     TensorProtos tensors;
     ReadProtoFromBinaryFile(model_file.c_str(), &tensors);
     LOG(INFO) << "Restore From Model @: " << model_file << "......";
@@ -190,7 +205,9 @@ void LoadDragonmodel(const std::string& model_file, Workspace* ws){
     }
 }

-void LoadCaffemodel(const std::string& model_file, Workspace* ws){
+void LoadCaffemodel(
+    const std::string& model_file,
+    Workspace* ws){
     NetParameter net_param;
     ReadProtoFromBinaryFile(model_file.c_str(), &net_param);
     std::string scope = "";
@@ -231,12 +248,15 @@ void LoadCaffemodel(const std::string& model_file, Workspace* ws){
     }
 }

-void RunGraph(const std::string& graph_name, Workspace* ws) {
+void RunGraph(
+    const std::string& graph_name,
+    Workspace* ws) {
     ws->RunGraph(graph_name, "", "");
 }

 template <typename T>
-T* FetchTensor(const std::string& name,
+T* FetchTensor(
+    const std::string& name,
     vector<TIndex>& shape,
     Workspace* ws){
     if (!ws->HasTensor(name)){
@@ -251,13 +271,11 @@ T* FetchTensor(const std::string& name,
     shape = tensor->dims();
     void* data = malloc(tensor->nbytes());
     if (tensor->memory_state() == MixedMemory::STATE_AT_CUDA) {
-        CUDAContext::Memcpy<CPUContext, CUDAContext>(tensor->nbytes(),
-            data,
-            tensor->raw_data<CUDAContext>());
+        CUDAContext::Memcpy<CPUContext, CUDAContext>(
+            tensor->nbytes(), data, tensor->raw_data<CUDAContext>());
     } else {
-        CPUContext::Memcpy<CPUContext, CPUContext>(tensor->nbytes(),
-            data,
-            tensor->raw_data<CPUContext>());
+        CPUContext::Memcpy<CPUContext, CPUContext>(
+            tensor->nbytes(), data, tensor->raw_data<CPUContext>());
     }
     return static_cast<T*>(data);
 }
@@ -266,4 +284,30 @@ void SetLogLevel(const std::string& level) {
     SetLogDestination(StrToLogSeverity(level));
 }

+template float* FetchTensor<float>(
+    const std::string&,
+    std::vector<TIndex>&,
+    Workspace*);
+
+template void FeedTensor<float>(
+    const std::string&,
+    const std::vector<TIndex>&,
+    const float*,
+    const Device&,
+    Workspace*);
+
+template void FeedTensor<int>(
+    const std::string&,
+    const std::vector<TIndex>&,
+    const int*,
+    const Device&,
+    Workspace*);
+
+template void FeedTensor<uint8_t>(
+    const std::string&,
+    const std::vector<TIndex>&,
+    const uint8_t*,
+    const Device&,
+    Workspace*);
+
 } // namespace dragon
\ No newline at end of file
@@ -29,18 +29,16 @@ typedef int64_t TIndex;
 class Workspace;

 class Device {
-    enum DeviceType { CPU, CUDA };
 public:
     EXPORT Device();
     EXPORT explicit Device(std::string device_type);
     EXPORT Device(std::string device_type, int device_id);

-    EXPORT const DeviceType& device_type() const { return device_type_; }
+    EXPORT const int& device_type() const { return device_type_; }
     EXPORT const int device_id() const { return device_id_; }

 private:
-    DeviceType device_type_;
+    int device_type_;
     int device_id_;
 };
@@ -52,53 +50,48 @@ EXPORT void ReleaseWorkspace(const std::string& name);
 EXPORT void MoveWorkspace(Workspace* main, Workspace* sub);

-EXPORT std::string CreateGraph(const std::string& graph_file, Workspace* ws);
+EXPORT std::string CreateGraph(
+    const std::string& graph_file,
+    Workspace* ws);

-EXPORT std::string CreateGraph(const std::string& graph_file, const Device& device, Workspace* ws);
+EXPORT std::string CreateGraph(
+    const std::string& graph_file,
+    const Device& device,
+    Workspace* ws);

-EXPORT void RunGraph(const std::string& graph_name, Workspace* ws);
+EXPORT void RunGraph(
+    const std::string& graph_name,
+    Workspace* ws);

-EXPORT void CreateTensor(const std::string& name, Workspace* ws);
+EXPORT void CreateTensor(
+    const std::string& name,
+    Workspace* ws);

 template <typename T>
-void FeedTensor(const std::string& name,
+EXPORT void FeedTensor(
+    const std::string& name,
     const std::vector<TIndex>& shape,
     const T* data,
     const Device& device,
     Workspace* ws);

 template <typename T>
-T* FetchTensor(const std::string& name,
+EXPORT T* FetchTensor(
+    const std::string& name,
     std::vector<TIndex>& shape,
     Workspace* ws);

-template EXPORT float* FetchTensor(const std::string&,
-    std::vector<TIndex>&,
-    Workspace*);
-
-template EXPORT void FeedTensor(const std::string&,
-    const std::vector<TIndex>&,
-    const float*,
-    const Device&,
-    Workspace*);
-
-template EXPORT void FeedTensor(const std::string&,
-    const std::vector<TIndex>&,
-    const int*,
-    const Device&,
-    Workspace*);
-
-template EXPORT void FeedTensor(const std::string&,
-    const std::vector<TIndex>&,
-    const uint8_t*,
-    const Device&,
-    Workspace*);
-
-EXPORT void LoadCaffemodel(const std::string& model_file, Workspace* ws);
+EXPORT void LoadCaffemodel(
+    const std::string& model_file,
+    Workspace* ws);

-EXPORT void TransplantCaffeModel(const std::string& input_model, const std::string& output_model);
+EXPORT void TransplantCaffeModel(
+    const std::string& input_model,
+    const std::string& output_model);

-EXPORT void LoadDragonmodel(const std::string& model_file, Workspace* ws);
+EXPORT void LoadDragonmodel(
+    const std::string& model_file,
+    Workspace* ws);

 EXPORT void SetLogLevel(const std::string& level);
......
@@ -231,6 +231,7 @@ PyMethodDef* GetAllMethods() {
     PYFUNC(RenameTensorCC),
     PYFUNC(TensorFromShapeCC),
     PYFUNC(TensorFromPyArrayCC),
+    PYFUNC(TensorFromTensorCC),
     PYFUNC(GetTensorNameCC),
     PYFUNC(GetTensorInfoCC),
     PYFUNC(FeedTensorCC),
......
@@ -152,6 +152,55 @@ PyObject* TensorFromPyArrayCC(PyObject* self, PyObject* args) {
     Py_RETURN_TRUE;
 }
+PyObject* TensorFromTensorCC(PyObject* self, PyObject* args) {
+    char* dst_name, *src_name;
+    PyObject* py_dst_ctx = nullptr, *py_src_ctx = nullptr;
+    if (!PyArg_ParseTuple(args, "ssOO",
+            &dst_name, &src_name, &py_dst_ctx, &py_src_ctx)) {
+        PyErr_SetString(PyExc_ValueError,
+            "Failed to create tensor from tensor.\n"
+            "Expected the (dest, src) name and context.");
+        return nullptr;
+    }
+    DeviceOption dst_ctx, src_ctx;
+    dst_ctx.ParseFromString(PyBytes_AsStringEx(py_dst_ctx));
+    src_ctx.ParseFromString(PyBytes_AsStringEx(py_src_ctx));
+    Tensor* srcT = ws()->GetTensor(src_name);
+    Tensor* dstT = ws()->CreateTensor(dst_name);
+    dstT->ReshapeLike(*srcT);
+    dstT->SetMeta(srcT->meta());
+    if (dst_ctx.device_type() == DeviceType::CUDA) {
+        if (src_ctx.device_type() == DeviceType::CUDA) {
+            // CUDA <- CUDA
+            CUDAContext::Memcpy<CUDAContext, CUDAContext>(
+                srcT->nbytes(),
+                dstT->raw_mutable_data<CUDAContext>(),
+                srcT->raw_data<CUDAContext>());
+        } else {
+            // CUDA <- CPU
+            CUDAContext::Memcpy<CUDAContext, CPUContext>(
+                srcT->nbytes(),
+                dstT->raw_mutable_data<CUDAContext>(),
+                srcT->raw_data<CPUContext>());
+        }
+    } else {
+        if (src_ctx.device_type() == DeviceType::CUDA) {
+            // CPU <- CUDA
+            CUDAContext::Memcpy<CPUContext, CUDAContext>(
+                srcT->nbytes(),
+                dstT->raw_mutable_data<CPUContext>(),
+                srcT->raw_data<CUDAContext>());
+        } else {
+            // CPU <- CPU
+            CPUContext::Memcpy<CPUContext, CPUContext>(
+                srcT->nbytes(),
+                dstT->raw_mutable_data<CPUContext>(),
+                srcT->raw_data<CPUContext>());
+        }
+    }
+    Py_RETURN_TRUE;
+}
 inline PyObject* TensorToPyArrayCC(PyObject* self, PyObject* args) {
     Tensor* tensor = ws()->GetTensor(ParseName(self, args));
     CHECK_GT(tensor->count(), 0);
@@ -183,7 +232,8 @@ inline PyObject* TensorToPyArrayExCC(PyObject* self, PyObject* args) {
         return nullptr;
     }
     auto* data = const_cast<void*>(tensor->raw_data<CPUContext>());
-    PyObject* array = PyArray_SimpleNewFromData(tensor->ndim(), dims.data(), npy_type, data);
+    PyObject* array = PyArray_SimpleNewFromData(
+        tensor->ndim(), dims.data(), npy_type, data);
     Py_XINCREF(array);
     return array;
 }
@@ -202,7 +252,8 @@ inline PyObject* ToCUDATensorCC(PyObject* self, PyObject* args) {
     char* cname;
     int device_id;
     if (!PyArg_ParseTuple(args, "si", &cname, &device_id)) {
-        PyErr_SetString(PyExc_ValueError, "Excepted the tensor name and device id.");
+        PyErr_SetString(PyExc_ValueError,
+            "Expected the tensor name and device id.");
         return nullptr;
     }
     Tensor* t = ws()->GetTensor(cname);
......
@@ -23,7 +23,8 @@ inline const int TypeMetaToNPY(const TypeMeta& meta) {
     { TypeMeta::Id<int64_t>(), NPY_INT64 },
     { TypeMeta::Id<double>(), NPY_FLOAT64 },
     { TypeMeta::Id<float16>(), NPY_FLOAT16 },
-    { TypeMeta::Id<uint8_t>(), NPY_UINT8 }
+    { TypeMeta::Id<uint8_t>(), NPY_UINT8 },
+    { TypeMeta::Id<char>(), NPY_INT8 }
 };
 return m2npy_type_map.count(meta.id()) ? m2npy_type_map[meta.id()] : -1;
 }
@@ -35,7 +36,8 @@ inline const TypeMeta& TypeNPYToMeta(int npy_type) {
     { NPY_INT64, TypeMeta::Make<int64_t>() },
     { NPY_FLOAT64, TypeMeta::Make<double>() },
     { NPY_FLOAT16, TypeMeta::Make<float16>() },
-    { NPY_UINT8, TypeMeta::Make<uint8_t>() }
+    { NPY_UINT8, TypeMeta::Make<uint8_t>() },
+    { NPY_INT8, TypeMeta::Make<char>() },
 };
 static TypeMeta unknown_type;
 return npy2m_type_map.count(npy_type) ? npy2m_type_map[npy_type] : unknown_type;
......
@@ -24,6 +24,7 @@ from dragon.core.utils import MakeDeviceOption
 __all__ = [
     'FromShape',
     'SetShape',
+    'FromTensor',
     'FromPyArray',
     'SetPyArray',
     'ToPyArray',
@@ -113,6 +114,40 @@ def SetShape(tensor, shape, dtype='float32'):
     TensorFromShapeCC(_stringify_tensor(tensor), shape, dtype)

+
+def FromTensor(src, src_ctx=None, name=None, ctx=None):
+    """Create a Tensor from an existing tensor.
+
+    Parameters
+    ----------
+    src : Tensor or str
+        The source tensor.
+    src_ctx : dragon_pb2.DeviceOption
+        The context of source tensor.
+    name : str
+        The optional tensor name for destination tensor.
+    ctx : dragon_pb2.DeviceOption
+        The context for destination tensor.
+
+    Returns
+    -------
+    Tensor
+        The tensor with the same data as source.
+
+    References
+    ----------
+    The wrapper of ``TensorFromTensorCC``.
+
+    """
+    if name is None: tensor = Tensor(name=name)
+    else: tensor = Tensor(_name=name)
+    if src_ctx is None: src_ctx = MakeDeviceOption(0, 0)  # CPUContext
+    if ctx is None: ctx = MakeDeviceOption(0, 0)  # CPUContext
+    TensorFromTensorCC(
+        _stringify_tensor(tensor), _stringify_tensor(src),
+        _stringify_proto(ctx), _stringify_proto(src_ctx))
+    return tensor
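
A hedged usage sketch for the new wrapper (tensor names are illustrative; both contexts default to CPU, as in the code above):

    import numpy as np
    import dragon.core.tensor_utils as tensor_utils

    x = tensor_utils.FromPyArray(
        np.ones((2, 3), dtype='float32'), name='x')
    y = tensor_utils.FromTensor(x, name='x_copy')  # same shape/dtype/data as x
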
 def FromPyArray(array, name=None):
     """Create a Tensor from an existing Array.

@@ -120,7 +155,7 @@ def FromPyArray(array, name=None):
     Parameters
     ----------
-    array : np.ndarray
+    array : ndarray
         The array for creating the tensor.
     name : str
         The optional tensor name.

@@ -152,7 +187,7 @@ def SetPyArray(tensor, array):
     ----------
     tensor : Tensor, str or None
         The specific tensor to use.
-    array : numpy.ndarray
+    array : ndarray
         The array for creating the tensor.

     Returns
@@ -179,7 +214,7 @@ def ToPyArray(tensor):
     Returns
     -------
-    numpy.ndarray
+    ndarray
         The array sharing the memory with original tensor.

     References
@@ -202,7 +237,7 @@ def ToPyArrayEx(tensor):
     Returns
     -------
-    numpy.ndarray
+    ndarray
         The array sharing the memory with original tensor.

     References
......
@@ -149,7 +149,8 @@ List Brief
 `OneHot`_         Generate the one-hot representation of inputs.
 `Flatten`_        Flatten the input along the given axes.
 `Reshape`_        Reshape the dimensions of input.
-`ExpandDims`_     ExpandDims interface of NDArray.
+`Squeeze`_        Remove the dimensions with size 1.
+`ExpandDims`_     Insert a new dimension of size 1 at the given axis.
 `Shape`_          Get the dynamic shape of a Tensor.
 `Arange`_         Return a vector of elements by arange.
 ===============   ======================================================================
@@ -285,6 +286,7 @@ List Brief
 .. _OneHot: operators/ndarray.html#dragon.operators.ndarray.OneHot
 .. _Flatten: operators/ndarray.html#dragon.operators.ndarray.Flatten
 .. _Reshape: operators/ndarray.html#dragon.operators.ndarray.Reshape
+.. _Squeeze: operators/ndarray.html#dragon.operators.ndarray.Squeeze
 .. _ExpandDims: operators/ndarray.html#dragon.operators.ndarray.ExpandDims
 .. _Shape: operators/ndarray.html#dragon.operators.ndarray.Shape
 .. _Arange: operators/ndarray.html#dragon.operators.ndarray.Arange
......
@@ -97,7 +97,7 @@ class DataReader(Process):
         self._db.close()
         self._db.open(self._source)
         self._cur_idx = target_idx
-        self._db.set(str(self._cur_idx).zfill(self._db_zfill))
+        self._db.set(str(self._cur_idx).zfill(self._zfill))

     def reset(self):
         """Reset the cursor and environment.
@@ -112,12 +112,12 @@ class DataReader(Process):
             self._cur_chunk_idx = 0
             self._start_idx = int(self._part_idx * self._num_shuffle_parts + self._perm[self._cur_chunk_idx])
             self._start_idx = int(self._start_idx * self._chunk_size)
-            if self._start_idx >= self._db_size: self.next_chunk()
+            if self._start_idx >= self._num_entries: self.next_chunk()
             self._end_idx = self._start_idx + self._chunk_size
-            self._end_idx = min(self._db_size, self._end_idx)
+            self._end_idx = min(self._num_entries, self._end_idx)
         else:
             self._start_idx = 0
-            self._end_idx = self._db_size
+            self._end_idx = self._num_entries

         self.redirect(self._start_idx)
@@ -145,10 +145,10 @@ class DataReader(Process):
         else:
             self._start_idx = self._part_idx * self._num_shuffle_parts + self._perm[self._cur_chunk_idx]
             self._start_idx = self._start_idx * self._chunk_size
-            if self._start_idx >= self._db_size: self.next_chunk()
+            if self._start_idx >= self._num_entries: self.next_chunk()
         else:
             self._end_idx = self._start_idx + self._chunk_size
-            self._end_idx = min(self._db_size, self._end_idx)
+            self._end_idx = min(self._num_entries, self._end_idx)
         self.redirect(self._start_idx)

     def run(self):
@@ -165,14 +165,14 @@ class DataReader(Process):
         # init db
         self._db = LMDB()
         self._db.open(self._source)
-        self._db_zfill = self._db.zfill()
-        self._db_size = self._db.num_entries()
-        self._epoch_size = int(self._db_size / self._num_parts + 1)
+        self._zfill = self._db.zfill()
+        self._num_entries = self._db.num_entries()
+        self._epoch_size = int(self._num_entries / self._num_parts + 1)

         if self._use_shuffle:
             if self._chunk_size == 1:
                 # each chunk has at most 1 record [For Fully Shuffle]
-                self._num_shuffle_parts = int(self._db_size / self._chunk_size / self._num_parts) + 1
+                self._num_shuffle_parts = int(self._num_entries / self._chunk_size / self._num_parts) + 1
             else:
                 if self._use_shuffle and self._chunk_size == -1:
                     # search an optimal chunk size by chunks [For Chunk Shuffle]
@@ -182,12 +182,12 @@ class DataReader(Process):
                     self._chunk_size = min_chunk_size
                     self._num_shuffle_parts = int(math.ceil(self._db._total_size * 1.1 /
                         (self._num_parts * self._chunk_size << 20)))
-                    self._chunk_size = int(self._db_size / self._num_shuffle_parts / self._num_parts + 1)
+                    self._chunk_size = int(self._num_entries / self._num_shuffle_parts / self._num_parts + 1)
                 else:
                     # each chunk has at most K records [For Multiple Nodes]
                     # note that if ``shuffle`` and ``multiple_nodes`` are both ``False``,
                     # ``chunk_size`` and ``num_shuffle_parts`` are meaningless
-                    self._chunk_size = int(self._db_size / self._num_parts) + 1
+                    self._chunk_size = int(self._num_entries / self._num_parts) + 1
                     self._num_shuffle_parts = 1

         self._perm = np.arange(self._num_shuffle_parts)
......
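
To make the chunk-shuffle arithmetic above concrete, a worked example with purely illustrative numbers (a ~2 GiB database, one node, an assumed 8 MiB min_chunk_size; none of these values come from the commit):

    import math

    total_size = 2 << 30                 # bytes on disk (db._total_size)
    num_parts, chunk_size = 1, 8         # chunk_size in MiB before the search
    num_entries = 100000

    num_shuffle_parts = int(math.ceil(
        total_size * 1.1 / (num_parts * chunk_size << 20)))            # -> 282
    chunk_size = int(num_entries / num_shuffle_parts / num_parts + 1)  # -> 355 records
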
@@ -727,11 +727,11 @@ def Reshape(inputs, shape, shape_like=None, **kwargs):
     Examples
     --------
     >>> a = Tensor(shape=[1, 2, 3, 4]).Variable()
-    >>> print Reshape(a, shape=[6, 4])
+    >>> print(Reshape(a, shape=[6, 4]))
     >>> [6, 4]

     >>> b = Reshape(a, shape=[-1, 4]) # shape will be [6, 4] in the backend
-    >>> print b.shape
+    >>> print(b.shape)
     >>> [1, 4] # fake dimension at axis 0

     """
@@ -766,15 +766,58 @@ def Reshape(inputs, shape, shape_like=None, **kwargs):
     return output


-def ExpandDims(inputs, axis=-1, **kwargs):
-    """ExpandDims interface of NDArray.
+def Squeeze(inputs, axis=None, **kwargs):
+    """Remove the dimensions with size 1.
+
+    Set ``axis`` to remove only that specific position.
+
+    Parameters
+    ----------
+    inputs : Tensor
+        The input tensor.
+    axis : int or None
+        The specific axis to remove.
+
+    Returns
+    -------
+    Tensor
+        The output tensor.
+
+    Examples
+    --------
+    >>> a = Tensor(shape=[2, 1, 3, 4]).Variable()
+    >>> print(Squeeze(a).shape)
+    >>> print(Squeeze(a, axis=0).shape)
+
+    """
+    CheckInputs(inputs, 1)
+    arguments = ParseArguments(locals())
+    output = Tensor.CreateOperator(nout=1, op_type='Squeeze', **arguments)
+
+    if inputs.shape is not None:
+        output_shape = []
+        if axis is not None and axis < 0: axis += len(inputs.shape)
+        for idx, dim in enumerate(inputs.shape[:]):
+            if dim != 1 or \
+                (axis is not None and dim == 1 and idx != axis):
+                output_shape.append(dim)
+        output.shape = output_shape
+
+    return output
+
+
+def ExpandDims(inputs, axis, **kwargs):
+    """Insert a new dimension of size 1 at the given axis.
+
+    Negative ``axis`` is equal to ``axis = axis + num_axes + 1``.

     Parameters
     ----------
     inputs : Tensor
         The input tensor.
     axis : int
-        The insert position of new dimension. Default is ``-1`` (Push Back).
+        The axis at which to insert the new dimension.

     Returns
     -------
@@ -784,9 +827,8 @@ def ExpandDims(inputs, axis=-1, **kwargs):
     Examples
     --------
     >>> a = Tensor(shape=[1, 2, 3, 4]).Variable()
-    >>> print ExpandDims(a).shape
-    >>> print ExpandDims(a, axis=2).shape
+    >>> print(ExpandDims(a).shape)
+    >>> print(ExpandDims(a, axis=2).shape)

     """
     CheckInputs(inputs, 1)
@@ -796,7 +838,8 @@ def ExpandDims(inputs, axis=-1, **kwargs):
     if inputs.shape is not None:
         output.shape = inputs.shape[:]
-        if axis == -1 or axis >= len(inputs.shape):
+        axis += (0 if axis >= 0 else len(inputs.shape) + 1)
+        if axis < 0 or axis >= len(inputs.shape):
             output.shape.append(np.long(1))
         else: output.shape.insert(axis, np.long(1))
......
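
The negative-axis rule for the reworked ExpandDims, traced by hand (a shape-only sketch of the inference code above):

    shape = [1, 2, 3, 4]
    axis = -1
    axis += (0 if axis >= 0 else len(shape) + 1)   # -1 -> 4
    out = shape[:]
    if axis < 0 or axis >= len(shape):
        out.append(1)          # past the end: push back
    else:
        out.insert(axis, 1)
    print(out)                 # [1, 2, 3, 4, 1]
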
@@ -129,6 +129,7 @@ OneHot = ndarray.OneHot
 Flatten = ndarray.Flatten
 Reshape = ndarray.Reshape
 ExpandDims = ndarray.ExpandDims
+Squeeze = ndarray.Squeeze
 Shape = ndarray.Shape
 Arange = ndarray.Arange
......
syntax = "proto2"; syntax = "proto2";
package dragon;
message TensorProto { message TensorProto {
repeated int32 dims = 1; repeated int32 dims = 1;
enum DataType { enum DataType {
......
@@ -14,7 +14,7 @@ from __future__ import division
 from __future__ import print_function

 version = '0.2.2'
-full_version = '0.2.2.9'
+full_version = '0.2.2.10'
 release = False

 if not release:
......
@@ -115,8 +115,8 @@ class Module(object):
     def _load_state_dict_key_mismatch(self, full_name, name, is_missing):
         pass

-    def load_state_dict(self, state_dict, strict=True):
-        logger.info('Load the state dict from numpy arrays.')
+    def load_state_dict(self, state_dict, strict=True, verbose=True):
+        if verbose: logger.info('Load the state dict.')
         def submodule_key_mismatch(full_name, is_missing):
             module = self
             names = full_name.split(".")
@@ -131,9 +131,6 @@ class Module(object):
         own_state = self.state_dict()
         for name, param in state_dict.items():
             if name in own_state:
-                if not isinstance(param, np.ndarray):
-                    raise ValueError('PyTorch@Dragon can only load params '
-                        'that saved as numpy array.')
                 state_shape = own_state[name].shape
                 param_shape = param.shape
                 if state_shape != param_shape:
@@ -145,7 +142,14 @@ class Module(object):
                     raise ValueError('DType of state({}) is {}, \n'
                         'While load from a PyArray of {}.'.format(name,
                             own_state[name].dtype, str(param.dtype)))
-                dg.workspace.FeedTensor(own_state[name].name, param)
+                if isinstance(param, Tensor):
+                    own_state[name].copy_(param)
+                elif isinstance(param, np.ndarray):
+                    dg.tensor_utils.SetPyArray(own_state[name], param)
+                else:
+                    raise ValueError('Expected the type of source state is either '
+                        'torch.Tensor or numpy.ndarray, got {}.'.format(type(param)))
+                if verbose:
                     logger.info('* Tensor({}) loaded, Size: ({})'.format(name,
                         ', '.join([str(d) for d in param_shape])))
             if strict:
......
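
With the branch above, a state dict may now mix torch-style tensors (routed through copy_) and numpy arrays (routed through SetPyArray). A hedged sketch, assuming state_dict() yields shaped values and that Linear is exposed as in the imports of the next file (keys and shapes are illustrative):

    import numpy as np
    import dragon.vm.torch as torch

    m = torch.nn.Linear(4, 2)
    numpy_state = {k: np.zeros(v.shape, dtype='float32')
                   for k, v in m.state_dict().items()}
    m.load_state_dict(numpy_state, verbose=False)   # ndarray branch
    m.load_state_dict(m.state_dict())               # Tensor values take the copy_ branch
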
@@ -18,7 +18,7 @@ from dragon.vm.torch.module import Module
 from dragon.vm.torch.tensor import Parameter
 from .modules.conv import Conv2d, ConvTranspose2d
 from .modules.pooling import MaxPool2d, AvgPool2d
-from .modules.activation import ReLU, Sigmoid, Softmax
+from .modules.activation import ReLU, LeakyReLU, Sigmoid, Softmax
 from .modules.linear import Linear
 from .modules.loss import CrossEntropyLoss
 from .modules.container import Container, Sequential, ModuleList
......
@@ -35,6 +35,26 @@ class ReLU(Module):
         return self.run(inputs, outputs)


+class LeakyReLU(Module):
+    def __init__(self, negative_slope=0.01, inplace=False):
+        super(LeakyReLU, self).__init__()
+        self._negative_slope = negative_slope
+        self._inplace = inplace
+        self.register_op()
+
+    def register_op(self):
+        self.op_meta = {
+            'op_type': 'Relu',
+            'n_inputs': 1, 'n_outputs': 1,
+            'arguments': {'slope': self._negative_slope}
+        }
+
+    def forward(self, x):
+        inputs = [x]; self.unify_devices(inputs)
+        outputs = [x if self._inplace else self.register_output(x.dtype)]
+        return self.run(inputs, outputs)
+
+
 class Sigmoid(Module):
     def __init__(self, inplace=False):
         super(Sigmoid, self).__init__()
......
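
The new module rides the existing 'Relu' kernel and only passes the slope as an extra argument (slope=0 would be a plain ReLU). A usage sketch mirroring the ReLU pattern above:

    from dragon.vm.torch import nn

    act = nn.LeakyReLU(negative_slope=0.2, inplace=False)

    def apply_leaky(x):
        """x: a dragon.vm.torch Tensor; returns x where x > 0, else 0.2 * x."""
        return act(x)
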
@@ -19,7 +19,9 @@ from .arithmetic import (
 )

 from .ndarray import (
-    sum, mean, argmin, argmax, max, topk, cat, gather
+    squeeze, unsqueeze,
+    sum, mean, argmin, argmax, max, topk,
+    cat, gather,
 )

 from .vision import (
......
@@ -13,6 +13,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

+from dragon.vm.torch.constants import CTX_TO_DEVICE_OPTION
+from dragon.core.tensor_utils import FromTensor
 from dragon.vm.torch.tensor import Tensor, Size
 from dragon.vm.torch.execute_engine import RunOperator
@@ -20,9 +22,11 @@ from dragon.vm.torch.ops.factory import get_module
 from dragon.vm.torch.autograd.grad_mode import no_grad
 from dragon.vm.torch.ops.primitive import MakeContext
 from dragon.vm.torch.ops.arithmetic import _fundamental, _rfundamental
-from dragon.vm.torch.ops.control_flow import _copy
-from dragon.vm.torch.ops.ndarray import \
-    (reshape, _permute, _repeat, _fill, _reduce, _arg_reduce, _crop)
+from dragon.vm.torch.ops.ndarray import (
+    reshape, squeeze, unsqueeze,
+    _permute, _repeat, _crop,
+    _fill, _reduce, _arg_reduce,
+)
 from dragon.vm.torch.ops.modules.dtype import AsType
@@ -33,13 +37,15 @@ from dragon.vm.torch.ops.modules.dtype import AsType
 ##############################################

-def copy_(self, src):
+def copy_(self, src, non_blocking=False):
     """Copy the elements from ``src`` into this tensor and return ``self``.

     Parameters
     ----------
     src : vm.torch.Tensor
         The source tensor.
+    non_blocking : boolean
+        Whether to copy asynchronously between CPU and GPU.

     Returns
     -------
@@ -47,7 +53,10 @@ def copy_(self, src):
         The ``self`` tensor.

     """
-    return _copy(self, src)
+    FromTensor(
+        src, CTX_TO_DEVICE_OPTION[tuple(src._ctx)],
+        self.name, CTX_TO_DEVICE_OPTION[tuple(self._ctx)])
+    return self

 Tensor.copy_ = copy_
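
The rewritten copy_ is thus a thin shim over tensor_utils.FromTensor, keyed by each tensor's (device, id) context; note that non_blocking is accepted for API compatibility but not consulted here. A sketch of the intended call pattern (tensors are illustrative):

    def sync_params(dst, src):
        """dst, src: dragon.vm.torch Tensors with matching shapes/dtypes."""
        # dispatches through FromTensor with the DeviceOption mapped
        # from each tensor's context, so CPU<->GPU copies are handled
        return dst.copy_(src)
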
@@ -308,6 +317,75 @@ Tensor.__rtruediv__ = rdiv
 ##############################################

+def _squeeze(self, dim=None):
+    """Returns a tensor with all the dimensions of input of size 1 removed.
+
+    Parameters
+    ----------
+    dim : int
+        The optional dim to remove.
+
+    Returns
+    -------
+    vm.torch.Tensor
+        The new tensor.
+
+    """
+    return squeeze(self, dim=dim)
+
+
+def _squeeze_(self, dim=None):
+    """In-place version of ``Tensor.squeeze()``.
+
+    Parameters
+    ----------
+    dim : int
+        The optional dim to remove.
+
+    Returns
+    -------
+    vm.torch.Tensor
+        The self.
+
+    """
+    return squeeze(self, dim=dim, out=self)
+
+
+def _unsqueeze(self, dim):
+    """Returns a tensor with a dimension of size 1 inserted at the specified position.
+
+    Parameters
+    ----------
+    dim : int
+        The dim to insert.
+
+    Returns
+    -------
+    vm.torch.Tensor
+        The new tensor.
+
+    """
+    return unsqueeze(self, dim=dim)
+
+
+def _unsqueeze_(self, dim):
+    """In-place version of ``Tensor.unsqueeze()``.
+
+    Parameters
+    ----------
+    dim : int
+        The dim to insert.
+
+    Returns
+    -------
+    vm.torch.Tensor
+        The self.
+
+    """
+    return unsqueeze(self, dim=dim, out=self)
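
Usage sketch of the four new methods (the trailing underscore marks the in-place variant, as elsewhere in this front end; the input shape is illustrative):

    def reshape_rank(x):
        """x: a dragon.vm.torch Tensor of shape (2, 1, 3)."""
        y = x.squeeze()          # new tensor, shape (2, 3)
        z = y.unsqueeze(dim=0)   # new tensor, shape (1, 2, 3)
        x.squeeze_(dim=1)        # x itself becomes (2, 3)
        return z
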
 def view(self, *args):
     if self._static_shape:
         raise RuntimeError('Can not view a leaf variable, it owns the static sizes.')
@@ -353,6 +431,10 @@ def min(self, dim=None, keepdim=False):
     return _arg_reduce(self, 'MIN', dim, keepdim)

+Tensor.squeeze = _squeeze
+Tensor.squeeze_ = _squeeze_
+Tensor.unsqueeze = _unsqueeze
+Tensor.unsqueeze_ = _unsqueeze_
 Tensor.view = view
 Tensor.view_as = view_as
 Tensor.permute = permute
@@ -412,6 +494,8 @@ Tensor.double = lambda self: _type_to(self, dtype='float64', inplace=False)
 Tensor.double_ = lambda self: _type_to(self, dtype='float64', inplace=True)
 Tensor.byte = lambda self: _type_to(self, dtype='uint8', inplace=False)
 Tensor.byte_ = lambda self: _type_to(self, dtype='uint8', inplace=True)
+Tensor.char = lambda self: _type_to(self, dtype='int8', inplace=False)
+Tensor.char_ = lambda self: _type_to(self, dtype='int8', inplace=True)
 Tensor.int = lambda self: _type_to(self, dtype='int32', inplace=False)
 Tensor.int_ = lambda self: _type_to(self, dtype='int32', inplace=True)
 Tensor.long = lambda self: _type_to(self, dtype='int64', inplace=False)
......
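
int8 support threads through the whole stack in this commit: the 'int8' <-> char TypeMeta entries, the NPY_INT8 mappings, and these casts. A small sketch:

    def to_int8(x):
        """x: a dragon.vm.torch Tensor; mirrors byte()/byte_() for uint8."""
        y = x.char()   # new tensor with dtype 'int8'
        x.char_()      # in-place cast of x
        return y
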
@@ -11,12 +11,3 @@
 from dragon.vm.torch.ops.primitive import MakeContext
 from dragon.vm.torch.ops.factory import get_module
-from dragon.vm.torch.ops.modules.control_flow import Copy
-
-def _copy(dst, src):
-    if id(dst) == id(src): return dst
-    ctx = MakeContext(inputs=[dst])
-    key = 'torch/ops/copy/{}:{}'.format(ctx[0].lower(), ctx[1])
-    module = get_module(Copy, key, ctx)
-    return module.forward(dst, src)
\ No newline at end of file
@@ -14,6 +14,7 @@ from __future__ import division
 from __future__ import print_function

 from dragon.vm.torch.ops.modules.base import BaseModule
+from dragon.vm.torch.tensor import ReferneceTensor


 class Fill(BaseModule):
@@ -69,13 +70,61 @@ class Reshape(BaseModule):
     def forward(self, x, shape):
         inputs = [x]; self.unify_devices(inputs)
-        outputs = [self.register_output(x.dtype)]
+        outputs = [ReferneceTensor(x)]
         if shape is not None:
             for ix, d in enumerate(shape):
                 self.set_argument_i(self.shape[ix], d)
         return self.run(inputs, outputs)
class Squeeze(BaseModule):
def __init__(self, key, ctx, **kwargs):
super(Squeeze, self).__init__(key, ctx, **kwargs)
self.dim = kwargs.get('dim', None)
self.register_arguments()
self.register_op()
def register_arguments(self):
"""No Arguments for squeeze op."""
pass
def register_op(self):
self.op_meta = {
'op_type': 'Squeeze',
'n_inputs': 1, 'n_outputs': 1,
'arguments': {'axis': self.dim}
}
def forward(self, x, out=None):
inputs = [x]; self.unify_devices(inputs)
outputs = [out] if out else [ReferneceTensor(x)]
return self.run(inputs, outputs)
class UnSqueeze(BaseModule):
def __init__(self, key, ctx, **kwargs):
super(UnSqueeze, self).__init__(key, ctx, **kwargs)
self.dim = kwargs.get('dim', None)
self.register_arguments()
self.register_op()
def register_arguments(self):
"""No Arguments for squeeze op."""
pass
def register_op(self):
self.op_meta = {
'op_type': 'ExpandDims',
'n_inputs': 1, 'n_outputs': 1,
'arguments': {'axis': self.dim}
}
def forward(self, x, out=None):
inputs = [x]; self.unify_devices(inputs)
outputs = [out] if out else [ReferneceTensor(x)]
return self.run(inputs, outputs)
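Both modules write into a ``ReferneceTensor`` when no explicit ``out`` is passed, so the result shares storage with the input rather than copying it. A rough analogy with numpy views (numpy is a stand-in here, not the actual implementation):

    import numpy as np

    x = np.ones((1, 3, 1, 2), dtype='float32')
    y = x.reshape(3, 2)            # a view: same storage, different sizes
    y[0, 0] = 5.0
    assert x[0, 0, 0, 0] == 5.0    # writes through the view reach the source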
class Permute(BaseModule): class Permute(BaseModule):
def __init__(self, key, ctx, **kwargs): def __init__(self, key, ctx, **kwargs):
super(Permute, self).__init__(key, ctx, **kwargs) super(Permute, self).__init__(key, ctx, **kwargs)
......
...@@ -15,7 +15,8 @@ from __future__ import print_function ...@@ -15,7 +15,8 @@ from __future__ import print_function
from dragon.vm.torch.ops.primitive import MakeContext, CanonicalAxis from dragon.vm.torch.ops.primitive import MakeContext, CanonicalAxis
from dragon.vm.torch.ops.factory import get_module from dragon.vm.torch.ops.factory import get_module
from dragon.vm.torch.ops.modules.shape import Reshape, Fill, Permute, Repeat from dragon.vm.torch.ops.modules.shape import \
Reshape, Squeeze, UnSqueeze, Fill, Permute, Repeat
from dragon.vm.torch.ops.modules.reduce import Reduce, ArgReduce from dragon.vm.torch.ops.modules.reduce import Reduce, ArgReduce
from dragon.vm.torch.ops.modules.crop import Crop from dragon.vm.torch.ops.modules.crop import Crop
from dragon.vm.torch.ops.modules.axis import Concat, Gather from dragon.vm.torch.ops.modules.axis import Concat, Gather
...@@ -29,6 +30,22 @@ def reshape(input, shape, shape_like=None): ...@@ -29,6 +30,22 @@ def reshape(input, shape, shape_like=None):
return module.forward(input, shape) return module.forward(input, shape)
def squeeze(input, dim=None, out=None):
ctx = MakeContext(inputs=[input])
key = 'torch/ops/squeeze/{}:{}/dim:{}'.format(
ctx[0].lower(), ctx[1], dim if dim is not None else 'None')
module = get_module(Squeeze, key, ctx, dim=dim)
return module.forward(input, out=out)
def unsqueeze(input, dim, out=None):
ctx = MakeContext(inputs=[input])
key = 'torch/ops/unsqueeze/{}:{}/dim:{}'.format(
ctx[0].lower(), ctx[1], dim if dim is not None else 'None')
module = get_module(UnSqueeze, key, ctx, dim=dim)
return module.forward(input, out=out)
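One module instance is cached per device/dim combination, so repeated calls with the same arguments reuse the same op. A sketch of the keys this produces, assuming ``MakeContext`` returns a ``(device_type, device_id)`` pair as its use above suggests:

    # Hypothetical illustration of the cache keys built above.
    ctx = ('CPU', 0)   # assumed (device_type, device_id) pair
    for dim in (None, 0, 1):
        key = 'torch/ops/squeeze/{}:{}/dim:{}'.format(
            ctx[0].lower(), ctx[1], dim if dim is not None else 'None')
        print(key)
    # torch/ops/squeeze/cpu:0/dim:None
    # torch/ops/squeeze/cpu:0/dim:0
    # torch/ops/squeeze/cpu:0/dim:1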
def _permute(input, perms=None): def _permute(input, perms=None):
ctx = MakeContext(inputs=[input]); len_perms = len(perms) if perms else 0 ctx = MakeContext(inputs=[input]); len_perms = len(perms) if perms else 0
key = 'torch/ops/permute/{}:{}/n_dims:#{}'.format(ctx[0].lower(), ctx[1], len_perms) key = 'torch/ops/permute/{}:{}/n_dims:#{}'.format(ctx[0].lower(), ctx[1], len_perms)
......
...@@ -51,7 +51,8 @@ def _with_file_like(f, mode, body): ...@@ -51,7 +51,8 @@ def _with_file_like(f, mode, body):
(sys.version_info[0] == 3 and isinstance(f, pathlib.Path)): (sys.version_info[0] == 3 and isinstance(f, pathlib.Path)):
new_fd = True new_fd = True
dir = os.path.dirname(f) dir = os.path.dirname(f)
if not os.path.exists(dir): os.makedirs(dir) # Bug fix: dirname may be empty, i.e., for a file under the working directory
if dir != '' and not os.path.exists(dir): os.makedirs(dir)
f = open(f, mode) f = open(f, mode)
try: try:
return body(f) return body(f)
......
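The guard matters when ``f`` is a bare filename: ``os.path.dirname`` then returns an empty string, and ``os.makedirs('')`` raises. A minimal standalone reproduction of the fixed behavior (plain Python; the helper name is made up for illustration):

    import os

    def ensure_parent_dir(path):
        # dirname is '' for a bare filename in the working directory
        d = os.path.dirname(path)
        if d != '' and not os.path.exists(d):
            os.makedirs(d)

    ensure_parent_dir('model.pkl')        # no-op: cwd already exists
    ensure_parent_dir('ckpts/model.pkl')  # creates ./ckpts if missing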
...@@ -13,6 +13,8 @@ from __future__ import absolute_import ...@@ -13,6 +13,8 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
import sys
import copy
import numpy as np import numpy as np
import dragon as dg import dragon as dg
import dragon.core.tensor_utils as tensor_utils import dragon.core.tensor_utils as tensor_utils
...@@ -23,9 +25,11 @@ from dragon.vm.torch.constants import CTX_TO_DEVICE_OPTION ...@@ -23,9 +25,11 @@ from dragon.vm.torch.constants import CTX_TO_DEVICE_OPTION
from .c_apis import * from .c_apis import *
__all__ = ['Tensor', 'Parameter', __all__ = [
'Tensor', 'Parameter',
'FloatTensor', 'DoubleTensor', 'FloatTensor', 'DoubleTensor',
'IntTensor', 'LongTensor', 'ByteTensor', 'IntTensor', 'LongTensor',
'ByteTensor', 'CharTensor',
] ]
...@@ -48,6 +52,9 @@ class Tensor(object): ...@@ -48,6 +52,9 @@ class Tensor(object):
self._requires_grad = kwargs.get('requires_grad', False) self._requires_grad = kwargs.get('requires_grad', False)
self._dg_tensor = kwargs.get('dg_tensor', None) self._dg_tensor = kwargs.get('dg_tensor', None)
self._own_storage = kwargs.get('own_storage', True) self._own_storage = kwargs.get('own_storage', True)
# Hold them to lock shared objects (i.e., tensors with the same storage)
self._ref_objects = []
# Owned by the leaf variables(i.e. Can not be Reshaped) # Owned by the leaf variables(i.e. Can not be Reshaped)
self._static_shape = None self._static_shape = None
# Owned by the grad required variables # Owned by the grad required variables
...@@ -541,6 +548,71 @@ class Tensor(object): ...@@ -541,6 +548,71 @@ class Tensor(object):
# # # #
############################################## ##############################################
def squeeze(self, dim=None):
"""Returns a tensor with all the dimensions of input of size 1 removed.
Parameters
----------
dim : int
The optional dim to remove.
Returns
-------
vm.torch.Tensor
The new tensor.
"""
raise NotImplementedError('Refer torch.ops.builtin._squeeze')
def squeeze_(self, dim=None):
"""Inplace of ``Tensor.squeeze()``
Parameters
----------
dim : int
The optional dim to remove.
Returns
-------
vm.torch.Tensor
The self.
"""
raise NotImplementedError('Refer torch.ops.builtin._squeeze_')
def unsqueeze(self, dim):
"""Returns a tensor with a dimension of size 1 inserted at the specified position.
Parameters
----------
dim : int
The dim to insert.
Returns
-------
vm.torch.Tensor
The new tensor.
"""
raise NotImplementedError('Refer torch.ops.builtin._unsqueeze')
def unsqueeze_(self, dim):
"""Inplace of ``Tensor.unsqueeze()``
Parameters
----------
dim : int
The dim to insert.
Returns
-------
vm.torch.Tensor
The self.
"""
raise NotImplementedError('Refer torch.ops.builtin._unsqueeze_')
def view(self, *args): def view(self, *args):
"""Return a new tensor with the same data but a different size. """Return a new tensor with the same data but a different size.
...@@ -605,13 +677,15 @@ class Tensor(object): ...@@ -605,13 +677,15 @@ class Tensor(object):
""" """
raise NotImplementedError('Refer torch.ops.builtin.repeat') raise NotImplementedError('Refer torch.ops.builtin.repeat')
def copy_(self, src): def copy_(self, src, non_blocking=False):
"""Copy the elements from ``src`` into this tensor and return ``self``. """Copy the elements from ``src`` into this tensor and return ``self``.
Parameters Parameters
---------- ----------
src : vm.torch.Tensor src : vm.torch.Tensor
The source tensor. The source tensor.
non_blocking : boolean
Whether to copy asynchronously between CPU and GPU.
Returns Returns
------- -------
...@@ -1034,6 +1108,28 @@ class Tensor(object): ...@@ -1034,6 +1108,28 @@ class Tensor(object):
""" """
raise NotImplementedError('Refer torch.ops.builtin.byte_') raise NotImplementedError('Refer torch.ops.builtin.byte_')
def char(self):
"""Return a ``int8`` tensor with elements of ``self``.
Returns
-------
vm.torch.Tensor
The byte tensor.
"""
raise NotImplementedError('Refer torch.ops.builtin.char')
def char_(self):
"""Inplace of ``Tensor.char()``.
Returns
-------
vm.torch.Tensor
The char tensor.
"""
raise NotImplementedError('Refer torch.ops.builtin.char_')
############################################## ##############################################
# # # #
# AUTO-GRAD # # AUTO-GRAD #
...@@ -1126,6 +1222,11 @@ def ByteTensor(*args, **kwargs): ...@@ -1126,6 +1222,11 @@ def ByteTensor(*args, **kwargs):
return Tensor(*args, **kwargs) return Tensor(*args, **kwargs)
def CharTensor(*args, **kwargs):
kwargs['dtype'] = 'int8'
return Tensor(*args, **kwargs)
_DTYPE_TO_TENSOR = { _DTYPE_TO_TENSOR = {
'float16': HalfTensor, 'float16': HalfTensor,
'float32': FloatTensor, 'float32': FloatTensor,
...@@ -1133,6 +1234,7 @@ _DTYPE_TO_TENSOR = { ...@@ -1133,6 +1234,7 @@ _DTYPE_TO_TENSOR = {
'int32': IntTensor, 'int32': IntTensor,
'int64': LongTensor, 'int64': LongTensor,
'uint8': ByteTensor, 'uint8': ByteTensor,
'int8': CharTensor,
} }
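With the ``int8`` entries registered above, char tensors participate in the usual dtype round-trips. A usage sketch, assuming the constructors accept sizes as in PyTorch:

    import dragon.vm.torch as torch

    x = torch.FloatTensor(2, 3)   # float32, assumed size-style construction
    y = x.char()                  # a new int8 tensor
    x.char_()                     # casts x to int8 in place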
...@@ -1158,6 +1260,23 @@ def RuntimeTensor(name, dtype='float32', ctx=None): ...@@ -1158,6 +1260,23 @@ def RuntimeTensor(name, dtype='float32', ctx=None):
return constructor(dg_tensor=name, ctx=ctx) return constructor(dg_tensor=name, ctx=ctx)
def ReferneceTensor(src):
"""Create a reference from source tensor.
Commonly used to hold the same storage but takes different sizes,
i.e., view, squeeze, and unsqueeze.
"""
constructor = _DTYPE_TO_TENSOR[src._dtype]
ref = constructor(dg_tensor=src.name, ctx=src._ctx)
name = '{}/id:{}'.format(
src.name.replace('[TPool]', '[Ref]'), id(ref))
dg.workspace.CreateTensor(name)
ref._dg_tensor, ref._own_storage = name, False
ref._ref_objects.append(src)
return ref
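Appending ``src`` to ``_ref_objects`` pins the source tensor for the lifetime of the reference, so the shared storage cannot be reclaimed while a view is alive, even if the caller drops its own handle. The mechanism is plain reference counting, as this minimal sketch shows:

    # A view must keep its source alive; holding the source in _ref_objects
    # does that via ordinary Python reference counting.
    class RefHolder(object):
        def __init__(self, src):
            self._ref_objects = [src]

    src = bytearray(16)    # stands in for the source tensor's storage
    ref = RefHolder(src)
    del src                # storage survives: ref._ref_objects still pins it
    assert len(ref._ref_objects[0]) == 16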
############################################## ##############################################
# # # #
# Tensor-Extension # # Tensor-Extension #
......
...@@ -23,7 +23,7 @@ def from_numpy(data): ...@@ -23,7 +23,7 @@ def from_numpy(data):
Parameters Parameters
---------- ----------
data : numpy.ndarray data : ndarray
The nd-array with various data type. The nd-array with various data type.
Return Return
...@@ -113,4 +113,5 @@ __NUMPY_TYPE_TO_TORCH = { ...@@ -113,4 +113,5 @@ __NUMPY_TYPE_TO_TORCH = {
'int32': 'IntTensor', 'int32': 'IntTensor',
'int64': 'LongTensor', 'int64': 'LongTensor',
'uint8': 'ByteTensor', 'uint8': 'ByteTensor',
'int8': 'CharTensor',
} }
\ No newline at end of file
...@@ -97,7 +97,7 @@ class DataReader(Process): ...@@ -97,7 +97,7 @@ class DataReader(Process):
self._db.close() self._db.close()
self._db.open(self._source) self._db.open(self._source)
self._cur_idx = target_idx self._cur_idx = target_idx
self._db.set(str(self._cur_idx).zfill(self._db_zfill)) self._db.set(str(self._cur_idx).zfill(self._zfill))
def reset(self): def reset(self):
"""Reset the cursor and environment. """Reset the cursor and environment.
...@@ -112,12 +112,12 @@ class DataReader(Process): ...@@ -112,12 +112,12 @@ class DataReader(Process):
self._cur_chunk_idx = 0 self._cur_chunk_idx = 0
self._start_idx = int(self._part_idx * self._num_shuffle_parts + self._perm[self._cur_chunk_idx]) self._start_idx = int(self._part_idx * self._num_shuffle_parts + self._perm[self._cur_chunk_idx])
self._start_idx = int(self._start_idx * self._chunk_size) self._start_idx = int(self._start_idx * self._chunk_size)
if self._start_idx >= self._db_size: self.next_chunk() if self._start_idx >= self._num_entries: self.next_chunk()
self._end_idx = self._start_idx + self._chunk_size self._end_idx = self._start_idx + self._chunk_size
self._end_idx = min(self._db_size, self._end_idx) self._end_idx = min(self._num_entries, self._end_idx)
else: else:
self._start_idx = 0 self._start_idx = 0
self._end_idx = self._db_size self._end_idx = self._num_entries
self.redirect(self._start_idx) self.redirect(self._start_idx)
...@@ -145,10 +145,10 @@ class DataReader(Process): ...@@ -145,10 +145,10 @@ class DataReader(Process):
else: else:
self._start_idx = self._part_idx * self._num_shuffle_parts + self._perm[self._cur_chunk_idx] self._start_idx = self._part_idx * self._num_shuffle_parts + self._perm[self._cur_chunk_idx]
self._start_idx = self._start_idx * self._chunk_size self._start_idx = self._start_idx * self._chunk_size
if self._start_idx >= self._db_size: self.next_chunk() if self._start_idx >= self._num_entries: self.next_chunk()
else: else:
self._end_idx = self._start_idx + self._chunk_size self._end_idx = self._start_idx + self._chunk_size
self._end_idx = min(self._db_size, self._end_idx) self._end_idx = min(self._num_entries, self._end_idx)
self.redirect(self._start_idx) self.redirect(self._start_idx)
def run(self): def run(self):
...@@ -165,14 +165,14 @@ class DataReader(Process): ...@@ -165,14 +165,14 @@ class DataReader(Process):
# init db # init db
self._db = LMDB() self._db = LMDB()
self._db.open(self._source) self._db.open(self._source)
self._db_size = int(self._db.get('size')) self._zfill = self._db.zfill()
self._db_zfill = int(self._db.get('zfill')) self._num_entries = self._db.num_entries()
self._epoch_size = int(self._db_size / self._num_parts + 1) self._epoch_size = int(self._num_entries / self._num_parts + 1)
if self._use_shuffle: if self._use_shuffle:
if self._chunk_size == 1: if self._chunk_size == 1:
# each chunk has at most 1 record [For Fully Shuffle] # each chunk has at most 1 record [For Fully Shuffle]
self._num_shuffle_parts = int(self._db_size / self._chunk_size / self._num_parts) + 1 self._num_shuffle_parts = int(self._num_entries / self._chunk_size / self._num_parts) + 1
else: else:
if self._use_shuffle and self._chunk_size == -1: if self._use_shuffle and self._chunk_size == -1:
# search an optimal chunk size by chunks [For Chunk Shuffle] # search an optimal chunk size by chunks [For Chunk Shuffle]
...@@ -182,12 +182,12 @@ class DataReader(Process): ...@@ -182,12 +182,12 @@ class DataReader(Process):
self._chunk_size = min_chunk_size self._chunk_size = min_chunk_size
self._num_shuffle_parts = int(math.ceil(self._db._total_size * 1.1 / self._num_shuffle_parts = int(math.ceil(self._db._total_size * 1.1 /
(self._num_parts * self._chunk_size << 20))) (self._num_parts * self._chunk_size << 20)))
self._chunk_size = int(self._db_size / self._num_shuffle_parts / self._num_parts + 1) self._chunk_size = int(self._num_entries / self._num_shuffle_parts / self._num_parts + 1)
else: else:
# each chunk has at most K records [For Multiple Nodes] # each chunk has at most K records [For Multiple Nodes]
# note that if ``shuffle`` and ``multiple_nodes`` are both ``False``, # note that if ``shuffle`` and ``multiple_nodes`` are both ``False``,
# ``chunk_size`` and ``num_shuffle_parts`` are meaningless # ``chunk_size`` and ``num_shuffle_parts`` are meaningless
self._chunk_size = int(self._db_size / self._num_parts) + 1 self._chunk_size = int(self._num_entries / self._num_parts) + 1
self._num_shuffle_parts = 1 self._num_shuffle_parts = 1
self._perm = np.arange(self._num_shuffle_parts) self._perm = np.arange(self._num_shuffle_parts)
......
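A worked example of the fully-shuffle branch above with illustrative numbers (100000 entries, 2 parts, chunk size 1); the expressions mirror the source:

    # Illustrative numbers only; mirrors the fully-shuffle branch above.
    num_entries, num_parts, chunk_size = 100000, 2, 1
    epoch_size = int(num_entries / num_parts + 1)                      # 50001
    num_shuffle_parts = int(num_entries / chunk_size / num_parts) + 1  # 50001
    print(epoch_size, num_shuffle_parts)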
...@@ -42,7 +42,7 @@ find_modules() ...@@ -42,7 +42,7 @@ find_modules()
setup(name = 'dragon', setup(name = 'dragon',
version='0.2.2.9', version='0.2.2.10',
description = 'Dragon: A Computation Graph Virtual Machine Based Deep Learning Framework', description = 'Dragon: A Computation Graph Virtual Machine Based Deep Learning Framework',
url='https://github.com/seetaresearch/Dragon', url='https://github.com/seetaresearch/Dragon',
author='Ting Pan', author='Ting Pan',
......
...@@ -114,7 +114,7 @@ inline void GenerateGridAnchors( ...@@ -114,7 +114,7 @@ inline void GenerateGridAnchors(
/******************** Proposal ********************/ /******************** Proposal ********************/
template <typename T, class Context> template <typename T, class Context>
inline void GenerateProposals( void GenerateProposals(
const int A, const int A,
const int feat_h, const int feat_h,
const int feat_w, const int feat_w,
...@@ -129,7 +129,7 @@ inline void GenerateProposals( ...@@ -129,7 +129,7 @@ inline void GenerateProposals(
T* proposals); T* proposals);
template <typename T, class Context> template <typename T, class Context>
inline void GenerateProposals_v2( void GenerateProposals_v2(
const int total_anchors, const int total_anchors,
const float im_h, const float im_h,
const float im_w, const float im_w,
......
...@@ -34,7 +34,7 @@ void ProposalOp<Context>::RunWithType() { ...@@ -34,7 +34,7 @@ void ProposalOp<Context>::RunWithType() {
rcnn::GenerateProposals<T, Context>( rcnn::GenerateProposals<T, Context>(
A, feat_height, feat_width, strides[0], A, feat_height, feat_width, strides[0],
im_height, im_width, min_box_h, min_box_w, im_height, im_width, min_box_h, min_box_w,
Input(0).template data<T, Context>() + num_proposals, Input(0).template data<T, Context>(),
Input(1).template data<T, Context>(), Input(1).template data<T, Context>(),
anchors_.template mutable_data<T, Context>(), anchors_.template mutable_data<T, Context>(),
proposals_.template mutable_data<T, Context>()); proposals_.template mutable_data<T, Context>());
...@@ -59,9 +59,9 @@ void ProposalOp<Context>::RunWithType() { ...@@ -59,9 +59,9 @@ void ProposalOp<Context>::RunWithType() {
CHECK_EQ(strides.size(), scales.size()) CHECK_EQ(strides.size(), scales.size())
<< "\nGiven " << strides.size() << " strides and " << "\nGiven " << strides.size() << " strides and "
<< scales.size() << " scales"; << scales.size() << " scales";
// cls_probs: [1, 2, total_proposals] // cls_probs: [1, total_proposals]
// bbox_deltas: [1, 4, total_proposals] // bbox_deltas: [1, 4, total_proposals]
TIndex total_proposals = Input(-3).dim(2), acc_proposals = 0; TIndex total_proposals = Input(-3).dim(1), acc_proposals = 0;
const TIndex pre_nms_topn = std::min(total_proposals, pre_nms_top_n); const TIndex pre_nms_topn = std::min(total_proposals, pre_nms_top_n);
proposals_.Reshape({ total_proposals, 5 }); proposals_.Reshape({ total_proposals, 5 });
auto* proposals = proposals_.template mutable_data<T, CPUContext>(); auto* proposals = proposals_.template mutable_data<T, CPUContext>();
...@@ -93,7 +93,7 @@ void ProposalOp<Context>::RunWithType() { ...@@ -93,7 +93,7 @@ void ProposalOp<Context>::RunWithType() {
rcnn::GenerateProposals_v2<T, Context>(total_proposals, rcnn::GenerateProposals_v2<T, Context>(total_proposals,
im_height, im_width, min_box_h, min_box_w, im_height, im_width, min_box_h, min_box_w,
Input(-3).template data<T, Context>() + total_proposals, Input(-3).template data<T, Context>(),
Input(-2).template data<T, Context>(), Input(-2).template data<T, Context>(),
proposals_.template mutable_data<T, Context>()); proposals_.template mutable_data<T, Context>());
...@@ -113,7 +113,7 @@ void ProposalOp<Context>::RunWithType() { ...@@ -113,7 +113,7 @@ void ProposalOp<Context>::RunWithType() {
} }
total_rois += num_rois; total_rois += num_rois;
Ydata += (num_rois * 5); Ydata += (num_rois * 5);
im_info += 3; im_info += Input(-1).dim(1);
} }
Output(0)->Reshape(vector<TIndex>({ total_rois, 5 })); Output(0)->Reshape(vector<TIndex>({ total_rois, 5 }));
...@@ -148,9 +148,9 @@ void ProposalOp<Context>::RunWithType() { ...@@ -148,9 +148,9 @@ void ProposalOp<Context>::RunWithType() {
template <class Context> template <class Context>
void ProposalOp<Context>::RunOnDevice() { void ProposalOp<Context>::RunOnDevice() {
num_images = Input(0).dim(0); num_images = Input(0).dim(0);
CHECK_EQ(Input(-1).count(), num_images * 3) CHECK_EQ(Input(-1).dim(0), num_images)
<< "\nExcepted " << num_images * 3 << " groups image info, " << "\nExcepted " << num_images << " groups image info, "
<< "but got " << Input(-1).count() / 3 << "."; << "but got " << Input(-1).dim(0) << ".";
roi_indices_.Reshape({ post_nms_top_n }); roi_indices_.Reshape({ post_nms_top_n });
Output(0)->Reshape({ num_images * post_nms_top_n, 5 }); Output(0)->Reshape({ num_images * post_nms_top_n, 5 });
......
...@@ -231,17 +231,24 @@ GraphDef Graph::Share(const GraphDef& optimized_graph) { ...@@ -231,17 +231,24 @@ GraphDef Graph::Share(const GraphDef& optimized_graph) {
GraphDef g; g.CopyFrom(optimized_graph); GraphDef g; g.CopyFrom(optimized_graph);
// build a whitelist: graph targets must keep their own storage
Set<string> whitelist;
for (auto& target : optimized_graph.target())
whitelist.insert(target);
// rename to create in-place // rename to create in-place
for (int i = 0; i < optimized_graph.op_size(); i++) { for (int i = 0; i < optimized_graph.op_size(); i++) {
const OperatorDef& op = optimized_graph.op(i); const OperatorDef& op = optimized_graph.op(i);
for (int j = 0; j < op.input_size(); j++) { for (int j = 0; j < op.input_size(); j++) {
if (renamed_.count(op.input(j)) && if (whitelist.count(op.input(j)) == 0 &&
renamed_.count(op.input(j)) &&
ws()->SetProxy(op.input(j), renamed_[op.input(j)])) ws()->SetProxy(op.input(j), renamed_[op.input(j)]))
*g.mutable_op(i)->mutable_input(j) *g.mutable_op(i)->mutable_input(j)
= renamed_[op.input(j)]; = renamed_[op.input(j)];
} }
for (int j = 0; j < op.output_size(); j++) { for (int j = 0; j < op.output_size(); j++) {
if (renamed_.count(op.output(j)) && if (whitelist.count(op.output(j)) == 0 &&
renamed_.count(op.output(j)) &&
ws()->SetProxy(op.output(j), renamed_[op.output(j)])) ws()->SetProxy(op.output(j), renamed_[op.output(j)]))
*g.mutable_op(i)->mutable_output(j) *g.mutable_op(i)->mutable_output(j)
= renamed_[op.output(j)]; = renamed_[op.output(j)];
......
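The new whitelist keeps graph targets, i.e. the tensors a caller fetches, out of the in-place renaming, since proxying them to another buffer would corrupt the fetched results. A simplified Python sketch of the filtering (the real code also consults ``ws()->SetProxy`` before committing a rename):

    def share_buffers(ops, targets, renamed):
        # ops: [{'inputs': [...], 'outputs': [...]}]; renamed: original -> proxy name
        whitelist = set(targets)   # fetched tensors must keep their own storage
        for op in ops:
            for key in ('inputs', 'outputs'):
                op[key] = [renamed.get(name, name)
                           if name not in whitelist else name
                           for name in op[key]]
        return ops

    ops = [{'inputs': ['x'], 'outputs': ['y']},
           {'inputs': ['y'], 'outputs': ['z']}]
    print(share_buffers(ops, targets=['z'], renamed={'y': 'x', 'z': 'x'}))
    # 'y' is proxied to 'x', but the target 'z' is left untouched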
...@@ -17,7 +17,7 @@ void SigmoidCrossEntropyOp<Context>::RunWithType() { ...@@ -17,7 +17,7 @@ void SigmoidCrossEntropyOp<Context>::RunWithType() {
if (normalization == "UNIT") { if (normalization == "UNIT") {
Output(0)->ReshapeLike(losses); Output(0)->ReshapeLike(losses);
Output(0)->template Copy<Context, Context>(losses); Output(0)->template CopyFrom<Context>(losses);
return; return;
} }
......
...@@ -19,7 +19,7 @@ void SigmoidFocalLossOp<Context>::RunWithType() { ...@@ -19,7 +19,7 @@ void SigmoidFocalLossOp<Context>::RunWithType() {
if (normalization == "UNIT") { if (normalization == "UNIT") {
Output(0)->ReshapeLike(losses); Output(0)->ReshapeLike(losses);
Output(0)->template Copy<Context, Context>(losses); Output(0)->template CopyFrom<Context>(losses);
return; return;
} }
......
...@@ -24,7 +24,7 @@ void SoftmaxFocalLossOp<Context>::RunWithType() { ...@@ -24,7 +24,7 @@ void SoftmaxFocalLossOp<Context>::RunWithType() {
if (normalization == "UNIT") { if (normalization == "UNIT") {
Output(0)->ReshapeLike(losses); Output(0)->ReshapeLike(losses);
Output(0)->template Copy<Context, Context>(losses); Output(0)->template CopyFrom<Context>(losses);
return; return;
} }
......
...@@ -59,7 +59,7 @@ void SparseSoftmaxCrossEntropyOp<Context>::RunWithType() { ...@@ -59,7 +59,7 @@ void SparseSoftmaxCrossEntropyOp<Context>::RunWithType() {
if (normalization == "UNIT") { if (normalization == "UNIT") {
Output(0)->ReshapeLike(losses); Output(0)->ReshapeLike(losses);
Output(0)->template Copy<Context, Context>(losses); Output(0)->template CopyFrom<Context>(losses);
return; return;
} }
...@@ -167,7 +167,7 @@ void SparseSoftmaxCrossEntropyGradientOp<Context>::RunOnDevice() { ...@@ -167,7 +167,7 @@ void SparseSoftmaxCrossEntropyGradientOp<Context>::RunOnDevice() {
auto* dXdataF32 = Output(0)->template data<float, Context>(); auto* dXdataF32 = Output(0)->template data<float, Context>();
auto* dXdataF16 = prob->template mutable_data<float16, Context>(); auto* dXdataF16 = prob->template mutable_data<float16, Context>();
kernel::TypeA2B<float, float16, Context>(Output(0)->count(), dXdataF32, dXdataF16); kernel::TypeA2B<float, float16, Context>(Output(0)->count(), dXdataF32, dXdataF16);
Output(0)->template Copy<Context, Context>(*prob); Output(0)->template CopyFrom<Context>(*prob);
} }
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" }); } else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
} }
......
...@@ -68,7 +68,7 @@ template <class Context> ...@@ -68,7 +68,7 @@ template <class Context>
void StopGradientOp<Context>::RunOnDevice() { void StopGradientOp<Context>::RunOnDevice() {
if (Output(0)->name() != Input(0).name()) { if (Output(0)->name() != Input(0).name()) {
Output(0)->ReshapeLike(Input(0)); Output(0)->ReshapeLike(Input(0));
Output(0)->template Copy<Context, Context>(Input(0)); Output(0)->template CopyFrom<Context>(Input(0));
} }
} }
......
...@@ -14,7 +14,7 @@ void MPIBroadcastOp<Context>::RunWithType() { ...@@ -14,7 +14,7 @@ void MPIBroadcastOp<Context>::RunWithType() {
auto* Xdata = Input(0).template mutable_data<T, CPUContext>(); auto* Xdata = Input(0).template mutable_data<T, CPUContext>();
#endif #endif
MPI_Bcast(Xdata, Input(0).count(), mpi_dtype(), comm_root, comm); MPI_Bcast(Xdata, Input(0).count(), mpi_dtype(), comm_root, comm);
Output(0)->template Copy<Context, Context>(Input(0)); Output(0)->template CopyFrom<Context>(Input(0));
} else { } else {
#ifdef WITH_MPI_CUDA #ifdef WITH_MPI_CUDA
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
......
...@@ -8,7 +8,7 @@ namespace dragon { ...@@ -8,7 +8,7 @@ namespace dragon {
template <class Context> template <typename T> template <class Context> template <typename T>
void MPIGatherOp<Context>::RunWithType() { void MPIGatherOp<Context>::RunWithType() {
if (comm_rank == comm_root) { if (comm_rank == comm_root) {
Output(comm_rank)->template Copy<Context, Context>(Input(0)); Output(comm_rank)->template CopyFrom<Context>(Input(0));
for (int i = 0; i < comm_size; i++) { for (int i = 0; i < comm_size; i++) {
if (i == comm_root) continue; if (i == comm_root) continue;
#ifdef WITH_MPI_CUDA #ifdef WITH_MPI_CUDA
...@@ -76,7 +76,7 @@ OPERATOR_SCHEMA(MPIGather).NumInputs(1).NumOutputs(1, INT_MAX); ...@@ -76,7 +76,7 @@ OPERATOR_SCHEMA(MPIGather).NumInputs(1).NumOutputs(1, INT_MAX);
template <class Context> template <typename T> template <class Context> template <typename T>
void MPIGatherGradientOp<Context>::RunWithType() { void MPIGatherGradientOp<Context>::RunWithType() {
if (comm_rank == comm_root) { if (comm_rank == comm_root) {
Output(0)->template Copy<Context, Context>(Input(this->comm_rank + 1)); Output(0)->template CopyFrom<Context>(Input(this->comm_rank + 1));
for (int i = 0; i < comm_size; i++) { for (int i = 0; i < comm_size; i++) {
if (i == comm_root) continue; if (i == comm_root) continue;
#ifdef WITH_MPI_CUDA #ifdef WITH_MPI_CUDA
......
...@@ -125,7 +125,7 @@ void CropOp<Context>::RunOnDevice() { ...@@ -125,7 +125,7 @@ void CropOp<Context>::RunOnDevice() {
// do nothing // do nothing
if (process_axes.size() == 0) { if (process_axes.size() == 0) {
Output(0)->ReshapeLike(Input(0)); Output(0)->ReshapeLike(Input(0));
Output(0)->template Copy<Context, Context>(Input(0)); Output(0)->template CopyFrom<Context>(Input(0));
// squeeze dimensions // squeeze dimensions
vector<TIndex> squeeze_shape; vector<TIndex> squeeze_shape;
for (int i = 0; i < keep_dims.size(); i++) for (int i = 0; i < keep_dims.size(); i++)
...@@ -229,7 +229,7 @@ void CropGradientOp<Context>::RunOnDevice() { ...@@ -229,7 +229,7 @@ void CropGradientOp<Context>::RunOnDevice() {
// do nothing // do nothing
if (process_axes.size() == 0) { if (process_axes.size() == 0) {
Output(0)->ReshapeLike(Input(-1)); Output(0)->ReshapeLike(Input(-1));
Output(0)->template Copy<Context, Context>(Input(-1)); Output(0)->template CopyFrom<Context>(Input(-1));
return; return;
} }
......
#include "core/workspace.h" #include "core/workspace.h"
#include "operators/ndarray/expand_dims_op.h" #include "operators/ndarray/dimension_op.h"
namespace dragon { namespace dragon {
template <class Context> template <class Context>
void ExpandDimsOp<Context>::RunOnDevice() { void ExpandDimsOp<Context>::RunOnDevice() {
TIndex _axis_ = axis >= 0 ? axis :
axis + (TIndex)Input(0).ndim() + 1;
vector<TIndex> dims = Input(0).dims(); vector<TIndex> dims = Input(0).dims();
if (axis == -1 || axis >= (int)dims.size()) dims.push_back(1); if (_axis_ < 0 ||
else dims.insert(dims.begin() + axis, 1); _axis_ >= (TIndex)dims.size())
// save Xshape dims.push_back(1);
Tensor* sv = ws()->CreateTensor( else dims.insert(dims.begin() + _axis_, 1);
"/mnt/" + anchor() + "/expand_dims/x_shape");
sv->Reshape({ (TIndex)Input(0).ndim() });
auto* Sdata = sv->template mutable_data<TIndex, CPUContext>();
for (int i = 0; i < Input(0).ndim(); i++) Sdata[i] = Input(0).dim(i);
Output(0)->Reshape(dims); Output(0)->Reshape(dims);
if (Output(0)->name() != Input(0).name()) Output(0)->SetMeta(Input(0).meta());
Output(0)->template Copy<Context, Context>(Input(0)); Output(0)->Share(Input(0).memory());
} }
DEPLOY_CPU(ExpandDims); DEPLOY_CPU(ExpandDims);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(ExpandDims); DEPLOY_CUDA(ExpandDims);
#endif #endif
OPERATOR_SCHEMA(ExpandDims) OPERATOR_SCHEMA(ExpandDims).NumInputs(1).NumOutputs(1);
.NumInputs(1).NumOutputs(1)
.Inplace({ { 0, 0 } });
template <class Context>
void ExpandDimsGradientOp<Context>::RunOnDevice() {
Tensor* sv = ws()->GetTensor(
"/mnt/" + anchor() + "/expand_dims/x_shape");
auto* Sdata = sv->template mutable_data<TIndex, CPUContext>();
vector<TIndex> x_shape(sv->count());
for (int i = 0; i < sv->count(); i++) x_shape[i] = Sdata[i];
Output(0)->Reshape(x_shape);
if (Output(0)->name() != Input(-1).name())
Output(0)->template Copy<Context, Context>(Input(-1));
}
DEPLOY_CPU(ExpandDimsGradient); DEPLOY_CPU(ExpandDimsGradient);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(ExpandDimsGradient); DEPLOY_CUDA(ExpandDimsGradient);
#endif #endif
OPERATOR_SCHEMA(ExpandDimsGradient) OPERATOR_SCHEMA(ExpandDimsGradient)
.NumInputs(1).NumOutputs(1) .NumInputs(2).NumOutputs(1).Inplace({ { 1, 0 } });
.Inplace({ { 0, 0 } });
class GetExpandDimsGradient final : public GradientMakerBase { class GetExpandDimsGradient final : public GradientMakerBase {
public: public:
GRADIENT_MAKER_CTOR(GetExpandDimsGradient); GRADIENT_MAKER_CTOR(GetExpandDimsGradient);
vector<OperatorDef> MakeDefs() override { vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "", return SingleDef(def.type() + "Gradient", "",
vector<string> {GO(0)}, vector<string> {I(0), GO(0)},
vector<string> {GI(0)}); vector<string> {GI(0)});
} }
}; };
......
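After this rewrite, ExpandDims no longer copies: the output adopts the input's memory and only the shape metadata changes, and the gradient op aliases GO(0) into GI(0) through the new ``Inplace({ { 1, 0 } })`` schema. The observable effect, with numpy as a stand-in for the shared-storage behavior:

    import numpy as np

    x = np.zeros((2, 3), dtype='float32')
    y = np.expand_dims(x, 1)       # shape (2, 1, 3); a view, not a copy
    assert np.shares_memory(x, y)  # same storage, only shape metadata differs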
#include "core/workspace.h" #include "core/workspace.h"
#include "operators/ndarray/flatten_op.h" #include "operators/ndarray/dimension_op.h"
namespace dragon { namespace dragon {
template <class Context> template <class Context>
void FlattenOp<Context>::SqueezeRun() { void FlattenOp<Context>::RunOnDevice() {
vector<TIndex> output_dims; vector<TIndex> output_dims;
if (keep_axes != INT_MAX) {
CHECK_LE(keep_axes, (int)Input(0).ndim())
<< "\nThe total number of axes is " + Input(0).ndim()
<< ", can not keep " + keep_axes << " .";
int i = 0;
for (; i < keep_axes - 1; i++)
output_dims.push_back(Input(0).dim(i));
if (Input(0).count(i) != 1)
output_dims.push_back(Input(0).count(i));
} else {
for (int i = 0; i < axis; i++) for (int i = 0; i < axis; i++)
output_dims.push_back(Input(0).dim(i)); output_dims.push_back(Input(0).dim(i));
if (num_axes < 1) { if (num_axes < 1) {
...@@ -16,76 +26,32 @@ void FlattenOp<Context>::SqueezeRun() { ...@@ -16,76 +26,32 @@ void FlattenOp<Context>::SqueezeRun() {
for (int i = axis + num_axes; i < Input(0).ndim(); i++) for (int i = axis + num_axes; i < Input(0).ndim(); i++)
output_dims.push_back(Input(0).dim(i)); output_dims.push_back(Input(0).dim(i));
} }
}
Output(0)->Reshape(output_dims); Output(0)->Reshape(output_dims);
if (Output(0)->name() != Input(0).name()) Output(0)->SetMeta(Input(0).meta());
Output(0)->template Copy<Context, Context>(Input(0)); Output(0)->Share(Input(0).memory());
}
template <class Context>
void FlattenOp<Context>::KeepRun() {
CHECK_LE(keep_axes, (int)Input(0).ndim())
<< "\nThe total number of axes is " + Input(0).ndim()
<< ", can not keep " + keep_axes << " .";
vector<TIndex> output_dims;
int i = 0;
for (; i < keep_axes - 1; i++)
output_dims.push_back(Input(0).dim(i));
if (Input(0).count(i) != 1)
output_dims.push_back(Input(0).count(i));
if (Output(0)->name() != Input(0).name())
Output(0)->template Copy<Context, Context>(Input(0));
}
template <class Context>
void FlattenOp<Context>::RunOnDevice() {
// save Xshape
Tensor* sv = ws()->CreateTensor(
"/mnt/" + anchor() + "/flatten/x_shape");
sv->Reshape({ (TIndex)Input(0).ndim() });
auto* Sdata = sv->template mutable_data<TIndex, CPUContext>();
for (int i = 0; i < Input(0).ndim(); i++)
Sdata[i] = Input(0).dim(i);
if (keep_axes != INT_MAX) KeepRun();
else SqueezeRun();
} }
DEPLOY_CPU(Flatten); DEPLOY_CPU(Flatten);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(Flatten); DEPLOY_CUDA(Flatten);
#endif #endif
OPERATOR_SCHEMA(Flatten) OPERATOR_SCHEMA(Flatten).NumInputs(1).NumOutputs(1);
.NumInputs(1).NumOutputs(1)
.Inplace({ { 0, 0 } });
template <class Context>
void FlattenGradientOp<Context>::RunOnDevice() {
Tensor* sv = ws()->GetTensor(
"/mnt/" + anchor() + "/flatten/x_shape");
auto* Sdata = sv->template mutable_data<TIndex, CPUContext>();
vector<TIndex> x_shape(sv->count());
for (int i = 0; i < sv->count(); i++) x_shape[i] = Sdata[i];
Output(0)->Reshape(x_shape);
if (Output(0)->name() != Input(-1).name())
Output(0)->template Copy<Context, Context>(Input(-1));
}
DEPLOY_CPU(FlattenGradient); DEPLOY_CPU(FlattenGradient);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(FlattenGradient); DEPLOY_CUDA(FlattenGradient);
#endif #endif
OPERATOR_SCHEMA(FlattenGradient) OPERATOR_SCHEMA(FlattenGradient)
.NumInputs(1).NumOutputs(1) .NumInputs(2).NumOutputs(1).Inplace({ { 1, 0 } });
.Inplace({ { 0, 0 } });
class GetFlattenGradient final : public GradientMakerBase { class GetFlattenGradient final : public GradientMakerBase {
public: public:
GRADIENT_MAKER_CTOR(GetFlattenGradient); GRADIENT_MAKER_CTOR(GetFlattenGradient);
vector<OperatorDef> MakeDefs() override { vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "", return SingleDef(def.type() + "Gradient", "",
vector<string> {GO(0)}, vector<string> {I(0), GO(0)},
vector<string> {GI(0)}); vector<string> {GI(0)});
} }
}; };
......
...@@ -61,7 +61,7 @@ void PadOp<Context>::RunOnDevice() { ...@@ -61,7 +61,7 @@ void PadOp<Context>::RunOnDevice() {
// do nothing // do nothing
if (process_axes.size() == 0) { if (process_axes.size() == 0) {
Output(0)->ReshapeLike(Input(0)); Output(0)->ReshapeLike(Input(0));
Output(0)->template Copy<Context, Context>(Input(0)); Output(0)->template CopyFrom<Context>(Input(0));
return; return;
} }
...@@ -175,7 +175,7 @@ void PadGradientOp<Context>::RunOnDevice() { ...@@ -175,7 +175,7 @@ void PadGradientOp<Context>::RunOnDevice() {
// do nothing // do nothing
if (process_axes.size() == 0) { if (process_axes.size() == 0) {
Output(0)->ReshapeLike(Input(-1)); Output(0)->ReshapeLike(Input(-1));
Output(0)->template Copy<Context, Context>(Input(-1)); Output(0)->template CopyFrom<Context>(Input(-1));
return; return;
} }
......
...@@ -39,7 +39,7 @@ void RandomPickOp<Context>::RunOnDevice() { ...@@ -39,7 +39,7 @@ void RandomPickOp<Context>::RunOnDevice() {
if (Output(1)->name() != "ignore") { if (Output(1)->name() != "ignore") {
Output(1)->ReshapeLike(*pick_indices); Output(1)->ReshapeLike(*pick_indices);
Output(1)->template Copy<Context, Context>(*pick_indices); Output(1)->template CopyFrom<Context>(*pick_indices);
} }
} }
......
#include "core/workspace.h" #include "core/workspace.h"
#include "operators/ndarray/reshape_op.h" #include "operators/ndarray/dimension_op.h"
namespace dragon { namespace dragon {
...@@ -67,50 +67,31 @@ void ReshapeOp<Context>::RunOnDevice() { ...@@ -67,50 +67,31 @@ void ReshapeOp<Context>::RunOnDevice() {
<< "\nCan not change the total size." << "\nCan not change the total size."
<< Input(0).DimString() << Input(0).DimString()
<< " -> " << DimString(new_shape); << " -> " << DimString(new_shape);
// save Xshape
Tensor* sv = ws()->CreateTensor(
"/mnt/" + anchor() + "/reshape/x_shape");
sv->Reshape({ (TIndex)Input(0).ndim() });
auto* Sdata = sv->template mutable_data<TIndex, CPUContext>();
for (int i = 0; i < Input(0).ndim(); i++) Sdata[i] = Input(0).dim(i);
Output(0)->Reshape(new_shape); Output(0)->Reshape(new_shape);
if (Output(0)->name() != Input(0).name()) Output(0)->SetMeta(Input(0).meta());
Output(0)->template Copy<Context, Context>(Input(0)); Output(0)->Share(Input(0).memory());
} }
DEPLOY_CPU(Reshape); DEPLOY_CPU(Reshape);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(Reshape); DEPLOY_CUDA(Reshape);
#endif #endif
OPERATOR_SCHEMA(Reshape) OPERATOR_SCHEMA(Reshape).NumInputs(1).NumOutputs(1);
.NumInputs(1).NumOutputs(1)
.Inplace({ { 0, 0 } });
template <class Context>
void ReshapeGradientOp<Context>::RunOnDevice() {
Tensor* sv = ws()->GetTensor(
"/mnt/" + anchor() + "/reshape/x_shape");
auto* Sdata = sv->template mutable_data<TIndex, CPUContext>();
vector<TIndex> x_shape(sv->count());
for (int i = 0; i < sv->count(); i++) x_shape[i] = Sdata[i];
Output(0)->Reshape(x_shape);
if (Output(0)->name() != Input(-1).name())
Output(0)->template Copy<Context, Context>(Input(-1));
}
DEPLOY_CPU(ReshapeGradient); DEPLOY_CPU(ReshapeGradient);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(ReshapeGradient); DEPLOY_CUDA(ReshapeGradient);
#endif #endif
OPERATOR_SCHEMA(ReshapeGradient).NumInputs(1).NumOutputs(1).Inplace({ { 0, 0 } }); OPERATOR_SCHEMA(ReshapeGradient)
.NumInputs(2).NumOutputs(1).Inplace({ { 1, 0 } });
class GetReshapeGradient final : public GradientMakerBase { class GetReshapeGradient final : public GradientMakerBase {
public: public:
GRADIENT_MAKER_CTOR(GetReshapeGradient); GRADIENT_MAKER_CTOR(GetReshapeGradient);
vector<OperatorDef> MakeDefs() override { vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "", return SingleDef(def.type() + "Gradient", "",
vector<string> {GO(0)}, vector<string> {I(0), GO(0)},
vector<string> {GI(0)}); vector<string> {GI(0)});
} }
}; };
......
#include "core/workspace.h"
#include "operators/ndarray/dimension_op.h"
namespace dragon {
template <class Context>
void SqueezeOp<Context>::RunOnDevice() {
TIndex _axis_ = axis >= 0 ? axis :
axis + (TIndex)Input(0).ndim();
vector<TIndex> dims;
for (int i = 0; i < Input(0).ndim(); i++)
if ((Input(0).dim(i) != 1) ||
(_axis_ != INT_MAX &&
Input(0).dim(i) == 1 &&
i != _axis_))
dims.push_back(Input(0).dim(i));
Output(0)->Reshape(dims);
Output(0)->SetMeta(Input(0).meta());
Output(0)->Share(Input(0).memory());
}
DEPLOY_CPU(Squeeze);
#ifdef WITH_CUDA
DEPLOY_CUDA(Squeeze);
#endif
OPERATOR_SCHEMA(Squeeze).NumInputs(1).NumOutputs(1);
DEPLOY_CPU(SqueezeGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(SqueezeGradient);
#endif
OPERATOR_SCHEMA(SqueezeGradient)
.NumInputs(2).NumOutputs(1).Inplace({ { 1, 0 } });
class GetSqueezeGradient final : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GetSqueezeGradient);
vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "",
vector<string> {I(0), GO(0)},
vector<string> {GI(0)});
}
};
REGISTER_GRADIENT(Squeeze, GetSqueezeGradient);
} // namespace dragon
\ No newline at end of file
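GetSqueezeGradient follows the same pattern as the other dimension ops: the gradient def takes {I(0), GO(0)} so the backward pass can recover the original shape from I(0) and reshape GO(0) in place into GI(0), instead of restoring a saved ``x_shape`` tensor. Conceptually (a numpy stand-in, not the C++ op):

    import numpy as np

    x = np.ones((1, 3, 1), dtype='float32')   # forward input, stands in for I(0)
    dy = np.ones((3,), dtype='float32')       # incoming gradient, stands in for GO(0)
    dx = dy.reshape(x.shape)                  # GI(0): restore I(0)'s shape, storage shared
    assert np.shares_memory(dy, dx)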
...@@ -35,7 +35,7 @@ void TileOp<Context>::RunOnDevice() { ...@@ -35,7 +35,7 @@ void TileOp<Context>::RunOnDevice() {
// do nothing // do nothing
if (process_axes.size() == 0) { if (process_axes.size() == 0) {
Output(0)->ReshapeLike(Input(0)); Output(0)->ReshapeLike(Input(0));
Output(0)->template Copy<Context, Context>(Input(0)); Output(0)->template CopyFrom<Context>(Input(0));
return; return;
} }
...@@ -96,7 +96,7 @@ void TileGradientOp<Context>::RunOnDevice() { ...@@ -96,7 +96,7 @@ void TileGradientOp<Context>::RunOnDevice() {
// do nothing // do nothing
if (process_axes.size() == 0) { if (process_axes.size() == 0) {
Output(0)->ReshapeLike(Input(-1)); Output(0)->ReshapeLike(Input(-1));
Output(0)->template Copy<Context, Context>(Input(-1)); Output(0)->template CopyFrom<Context>(Input(-1));
return; return;
} }
......
...@@ -17,11 +17,11 @@ template <class Context> template <typename T> ...@@ -17,11 +17,11 @@ template <class Context> template <typename T>
void LRNOp<Context>::SplitRunWithType() { void LRNOp<Context>::SplitRunWithType() {
sqr_in = ws()->CreateTensor("/mnt/" + anchor() + "/sqr/in"); sqr_in = ws()->CreateTensor("/mnt/" + anchor() + "/sqr/in");
sqr_in->ReshapeLike(Input(0)); sqr_in->ReshapeLike(Input(0));
sqr_in->template Copy<Context, Context>(Input(0)); sqr_in->template CopyFrom<Context>(Input(0));
prod_in = ws()->CreateTensor("/mnt/" + anchor() + "/prod/in"); prod_in = ws()->CreateTensor("/mnt/" + anchor() + "/prod/in");
prod_in->ReshapeLike(Input(0)); prod_in->ReshapeLike(Input(0));
prod_in->template Copy<Context, Context>(Input(0)); prod_in->template CopyFrom<Context>(Input(0));
} }
template <class Context> template <typename T> template <class Context> template <typename T>
......
syntax = "proto2"; syntax = "proto2";
package dragon;
message BlobShape { message BlobShape {
repeated int64 dim = 1 [packed = true]; repeated int64 dim = 1 [packed = true];
} }
......
syntax = "proto2"; syntax = "proto2";
package dragon;
message TensorProto { message TensorProto {
repeated int32 dims = 1; repeated int32 dims = 1;
enum DataType { enum DataType {
......