Commit a7a7e4fc by Ting PAN

Fix the skipped algorithm finding in cached CUDNN convolution

Summary:
This commit enforces the algorithm finding even if the backward pass of the filter or data
will not be executed. Otherwise, an empty algorithm will be encountered between
two cached operations with the same arguments and input shape.
1 parent 218796ed
Showing with 478 additions and 498 deletions
......@@ -18,6 +18,9 @@ dragon/core
`class Operator <core/Operator.html>`_
: The base operator class with context.
`class OpSchema <core/OpSchema.html>`_
: Class to record the schema of operator.
`class Tensor <core/Tensor.html>`_
: The base tensor class, which may or may not manage its memory.
......@@ -37,6 +40,7 @@ dragon/core
core/CUDAContext
core/Graph
core/Operator
core/OpSchema
core/Tensor
core/TypeMeta
core/UnifiedMemory
......
OpSchema
========
.. doxygenclass:: dragon::OpSchema
Constructors
------------
.. doxygenfunction:: dragon::OpSchema::OpSchema()
.. doxygenfunction:: dragon::OpSchema::OpSchema(const string &op_type, const string &file, const int line)
Public Functions
----------------
AllowInplace
############
.. doxygenfunction:: dragon::OpSchema::AllowInplace(set<pair<int, int>> inplace)
AllowInplace
############
.. doxygenfunction:: dragon::OpSchema::AllowInplace(std::function<bool(int, int)> inplace)
NumInputs
#########
.. doxygenfunction:: dragon::OpSchema::NumInputs(int n)
NumInputs
#########
.. doxygenfunction:: dragon::OpSchema::NumInputs(int min_num, int max_num)
NumOutputs
##########
.. doxygenfunction:: dragon::OpSchema::NumOutputs(int n)
NumOutputs
##########
.. doxygenfunction:: dragon::OpSchema::NumOutputs(int min_num, int max_num)
Verify
######
.. doxygenfunction:: dragon::OpSchema::Verify
.. raw:: html
<style>
h1:before {
content: "dragon::";
color: #103d3e;
}
</style>
......@@ -470,7 +470,6 @@ zero\_
.. _torch.div(...): div.html
.. _torch.eq(...): eq.html
.. _torch.exp(...): exp.html
.. _torch.expand(...): expand.html
.. _torch.floor(...): floor.html
.. _torch.ge(...): ge.html
.. _torch.gt(...): gt.html
......
......@@ -79,12 +79,12 @@ Graph::Graph(const GraphDef& def, Workspace* ws) : GraphBase(def, ws) {
Map<string, vec32_t> subgraph_indices;
int opt = 3; // default: O3
if (args().count("optimization")) opt = arg("optimization").i();
if (opt >= 1) def_v2 = graph_optimizer.PruneNodes(def);
if (opt >= 2) graph_optimizer.AddInplace(def_v2, output_aliases_);
if (opt >= 1) def_v2 = graph_optimizer.EliminateUnused(def);
if (opt >= 2) graph_optimizer.PlanInplace(def_v2, output_aliases_);
if (opt >= 3) {
if (phase() == "TRAIN") {
def_v2 = graph_optimizer.MirrorStage(def_v2, subgraph_indices);
def_v2 = gradient_maker.Share(def_v2);
def_v2 = graph_optimizer.PlanCheckpoint(def_v2, subgraph_indices);
def_v2 = gradient_maker.Optimize(def_v2);
} else {
def_v2 = graph_optimizer.SimulateGC(def_v2);
}
......@@ -98,8 +98,8 @@ Graph::Graph(const GraphDef& def, Workspace* ws) : GraphBase(def, ws) {
Map<string, vector<OperatorBase*>> subgraph;
for (const auto& it : subgraph_indices) {
subgraph[it.first] = vector<OperatorBase*>();
for (const auto& idx : subgraph_indices[it.first])
subgraph[it.first].push_back(cached_ops_[idx]);
for (auto op_idx : subgraph_indices[it.first])
subgraph[it.first].push_back(cached_ops_[op_idx]);
}
for (auto* op : cached_ops_) {
op->set_subgraph(subgraph);
......
......@@ -4,40 +4,40 @@
namespace dragon {
bool GraphGradientMaker::CheckGrad(
const OperatorDef& op_def,
const OperatorDef& op,
const Set<string>& targets,
vector<pair<string, int>>& gen_grads) {
if (NoGradientRegistry()->Has(op_def.type())) {
if (NoGradientRegistry()->Has(op.type())) {
return true;
}
bool maybe_skip = false;
for (int i = 0; i < op_def.output_size(); ++i) {
const auto& output = op_def.output(i);
if (!inputs_to_grads_.count(output)) {
for (int i = 0; i < op.output_size(); ++i) {
const auto& out = op.output(i);
if (!inputs_to_grads_.count(out)) {
maybe_skip = true;
if (targets.count(output)) {
gen_grads.push_back({output, i});
inputs_to_grads_[output] = output + "_grad";
if (targets.count(out)) {
gen_grads.push_back({out, i});
inputs_to_grads_[out] = out + "_grad";
}
}
}
return maybe_skip && gen_grads.empty() && op_def.output_size() == 1;
return maybe_skip && gen_grads.empty() && op.output_size() == 1;
}
void GraphGradientMaker::Make(
const vector<OperatorDef*>& op_defs,
const vector<OperatorDef*>& ops,
const vector<string>& targets,
const vector<string>& input_grads,
GraphDef& graph_def) {
GraphDef& graph) {
Set<string> split_grads, targets_v2;
Map<string, int> inputs_count, grads_count;
// PLAY for the forward
for (auto* op_def : op_defs) {
if (NoGradientRegistry()->Has(op_def->type())) continue;
for (const auto& input : op_def->input()) {
for (auto* op : ops) {
if (NoGradientRegistry()->Has(op->type())) continue;
for (const auto& input : op->input()) {
bool input_in_outputs = false;
for (auto& output : op_def->output())
for (auto& output : op->output())
if (output == input) {
input_in_outputs = true;
break;
......@@ -56,21 +56,21 @@ void GraphGradientMaker::Make(
}
// PLAY for the backward
for (int op_idx = (int)op_defs.size() - 1; op_idx >= 0; --op_idx) {
const OperatorDef& op_def = *op_defs[op_idx];
for (int op_idx = (int)ops.size() - 1; op_idx >= 0; --op_idx) {
const auto& op = *ops[op_idx];
// Generate def by registered gradient maker
vector<pair<string, int>> gen_grads;
vector<string> grad_outputs;
bool is_skip = CheckGrad(op_def, targets_v2, gen_grads);
for (const auto& output : op_def.output()) {
string grad_output = "";
const auto& it = inputs_to_grads_.find(output);
if (it != inputs_to_grads_.end()) grad_output = it->second;
grad_outputs.push_back(grad_output);
}
auto pack = MakeGradientForOp(op_def, grad_outputs);
bool is_skip = CheckGrad(op, targets_v2, gen_grads);
for (const auto& out : op.output()) {
string grad_out = "";
const auto& it = inputs_to_grads_.find(out);
if (it != inputs_to_grads_.end()) grad_out = it->second;
grad_outputs.push_back(grad_out);
}
auto pack = MakeGradientForOp(op, grad_outputs);
// Split and gather gradient for multi-used inputs
vector<OperatorDef> gather_defs;
vector<OperatorDef> gather_ops;
for (auto& grad_def : pack.grad_defs) {
if (!grad_def.has_name()) {
grad_def.set_name(GetOperatorName());
......@@ -93,38 +93,38 @@ void GraphGradientMaker::Make(
}
if (output_in_inputs) continue;
// Detect a split branch
const auto& original_name = op_def.input(original_index);
const auto& original_name = op.input(original_index);
if (inputs_count[original_name] > 1) {
auto grad_name_v2 =
grad_name + "_autosplit_" + str::to(grads_count[grad_name]++);
if (!is_skip) split_grads.insert(grad_name_v2);
if (grads_count[grad_name] == inputs_count[original_name]) {
auto gather_def = MakeOperatorDef(
auto gather_op = MakeOperatorDef(
"GradientGather",
GetOperatorName(),
vector<string>({}),
vector<string>({grad_name}));
if (grad_def.has_device_option()) {
gather_def.mutable_device_option()->CopyFrom(
gather_op.mutable_device_option()->CopyFrom(
grad_def.device_option());
}
for (int j = 0; j < grads_count[grad_name]; j++) {
auto name = grad_name + "_autosplit_" + str::to(j);
if (split_grads.count(name)) gather_def.add_input(name);
if (split_grads.count(name)) gather_op.add_input(name);
}
gather_defs.push_back(gather_def);
gather_ops.push_back(gather_op);
}
*grad_def.mutable_output(i) = grad_name_v2;
}
}
}
// Add defs
// Add gradient ops
if (!is_skip) {
for (int i = 0; i < op_def.input_size(); ++i) {
inputs_to_grads_[op_def.input(i)] = pack.grad_inputs[i];
for (int i = 0; i < op.input_size(); ++i) {
inputs_to_grads_[op.input(i)] = pack.grad_inputs[i];
}
// Add def for ``GradientGenerateOp``
// Add ``GradientGenerateOp``
if (gen_grads.size() > 0) {
vector<string> inputs, outputs;
Argument arg_defaults;
......@@ -134,37 +134,36 @@ void GraphGradientMaker::Make(
outputs.emplace_back(gen_grad.first + "_grad");
arg_defaults.add_floats(pack.defaults[gen_grad.second]);
}
auto generate_def = MakeOperatorDef(
auto gen_op = MakeOperatorDef(
"GradientGenerate",
GetOperatorName(),
inputs,
outputs,
vector<Argument>({arg_defaults}));
if (op_def.has_device_option()) {
generate_def.mutable_device_option()->CopyFrom(
op_def.device_option());
if (op.has_device_option()) {
gen_op.mutable_device_option()->CopyFrom(op.device_option());
}
graph_def.add_op()->CopyFrom(generate_def);
graph.add_op()->CopyFrom(gen_op);
}
// Add def for ``GenerateOp``
// Add ``GradientOp``
for (const auto& grad_def : pack.grad_defs) {
graph_def.add_op()->CopyFrom(grad_def);
graph.add_op()->CopyFrom(grad_def);
}
}
// Add def for ``GradientGatherOp``
for (const auto& gather_def : gather_defs) {
graph_def.add_op()->CopyFrom(gather_def);
// Add ``GradientGatherOp``
for (const auto& gather_op : gather_ops) {
graph.add_op()->CopyFrom(gather_op);
}
}
}
GraphDef GraphGradientMaker::Share(const GraphDef& input_def) {
GraphDef GraphGradientMaker::Optimize(const GraphDef& graph) {
Set<int> invalid_ops;
Map<string, int> ref_count;
Map<string, pair<int, string>> gather_map;
for (int op_idx = 0; op_idx < input_def.op_size(); ++op_idx) {
const auto& op = input_def.op(op_idx);
for (int op_idx = 0; op_idx < graph.op_size(); ++op_idx) {
const auto& op = graph.op(op_idx);
if (!str::find(op.type(), "Gradient")) continue;
// Flag the gathering gradients
if (op.type() == "GradientGather") {
......@@ -195,17 +194,17 @@ GraphDef GraphGradientMaker::Share(const GraphDef& input_def) {
// Decompose the <GradientGather> into <GradientAdd>
// This trick accumulates the split to target right after computing,
// which helps to reduce the total number of buffers.
GraphDef output_def(input_def);
output_def.clear_op();
for (int op_idx = 0; op_idx < input_def.op_size(); ++op_idx) {
auto graph_v2(graph);
graph_v2.clear_op();
for (int op_idx = 0; op_idx < graph.op_size(); ++op_idx) {
if (invalid_ops.count(op_idx)) continue;
const auto& op = input_def.op(op_idx);
output_def.add_op()->CopyFrom(op);
const auto& op = graph.op(op_idx);
graph_v2.add_op()->CopyFrom(op);
if (!str::find(op.type(), "Gradient")) continue;
for (const auto& output : op.output()) {
const auto& find_iter = gather_map.find(output);
if (find_iter != gather_map.end()) {
const auto& gather_op = input_def.op(find_iter->second.first);
const auto& gather_op = graph.op(find_iter->second.first);
auto add_op(gather_op);
add_op.clear_input();
if (output != find_iter->second.second) {
......@@ -216,7 +215,7 @@ GraphDef GraphGradientMaker::Share(const GraphDef& input_def) {
if (ref_iter != ref_count.end()) ref_iter->second++;
}
add_op.add_input(output);
output_def.add_op()->CopyFrom(add_op);
graph_v2.add_op()->CopyFrom(add_op);
}
}
}
......@@ -242,8 +241,8 @@ GraphDef GraphGradientMaker::Share(const GraphDef& input_def) {
}
};
for (int op_idx = 0; op_idx < output_def.op_size(); ++op_idx) {
auto* op = output_def.mutable_op(op_idx);
for (int op_idx = 0; op_idx < graph_v2.op_size(); ++op_idx) {
auto* op = graph_v2.mutable_op(op_idx);
// Ignore the non-gradient ops
if (!str::find(op->type(), "Gradient")) continue;
// Check if output is an alias of input
......@@ -262,45 +261,44 @@ GraphDef GraphGradientMaker::Share(const GraphDef& input_def) {
vector<string> dead_buffers;
// Rewrite input gradients
for (int i = 0; i < op->input_size(); ++i) {
const string& input = op->input(i);
if (ref_count.count(input) > 0) {
ref_count[input] -= 1; // Decref
if (grad_to_buffer.count(input) == 0) continue;
string new_input = grad_to_buffer[input];
if (ref_count[input] == 0) {
dead_buffers.emplace_back(new_input);
const string& in = op->input(i);
if (ref_count.count(in) > 0) {
ref_count[in] -= 1; // Decref
if (grad_to_buffer.count(in) == 0) continue;
string in_v2 = grad_to_buffer[in];
if (ref_count[in] == 0) {
dead_buffers.emplace_back(in_v2);
}
*op->mutable_input(i) = new_input;
*op->mutable_input(i) = in_v2;
}
}
// Rewrite output gradients
for (int i = 0; i < op->output_size(); ++i) {
if (str::startswith(op->type(), "Python")) continue;
const string& output = op->output(i);
if (output.empty() || str::startswith(output, "/share/buffer")) continue;
if (empty_grads_.count(output) > 0) {
const string& out = op->output(i);
if (out.empty() || str::startswith(out, "/share/buffer")) continue;
if (empty_grads_.count(out) > 0) {
*op->mutable_output(i) = "";
continue;
}
// Protection for leafs
if (ref_count.count(output) == 0) continue;
if (ref_count.count(out) == 0) continue;
// Protection for sources and leafs
if (retained_grads_.count(output) > 0) continue;
string new_output = output;
if (retained_grads_.count(out) > 0) continue;
string out_v2 = out;
if (inplace_flags[i] >= 0) {
new_output = op->input(inplace_flags[i]);
out_v2 = op->input(inplace_flags[i]);
} else {
grad_to_buffer[output] = new_output = get_buffer();
grad_to_buffer[out] = out_v2 = get_buffer();
}
*op->mutable_output(i) = new_output;
*op->mutable_output(i) = out_v2;
}
// Update the pool
for (auto& buffer : dead_buffers) {
pool.emplace_back(buffer);
}
}
return output_def;
return graph_v2;
}
} // namespace dragon
......@@ -19,15 +19,15 @@ namespace dragon {
class DRAGON_API GraphGradientMaker {
public:
/*! \brief Generate graph def from the op defs */
/*! \brief Generate graph from the executed ops */
void Make(
const vector<OperatorDef*>& op_defs,
const vector<OperatorDef*>& ops,
const vector<string>& targets,
const vector<string>& input_grads,
GraphDef& graph_def);
GraphDef& graph);
/*! \brief Rewrite graph def to share the intermediate grads */
GraphDef Share(const GraphDef& input_def);
/*! \brief Eliminate the unused and make sharing of outputs */
GraphDef Optimize(const GraphDef& graph);
/*! \brief Add an empty gradient */
void add_empty_grad(const string& name) {
......@@ -47,14 +47,14 @@ class DRAGON_API GraphGradientMaker {
private:
/*! \brief Check the missing grads */
bool CheckGrad(
const OperatorDef& op_def,
const OperatorDef& op,
const Set<string>& targets,
vector<pair<string, int>>& gen_grads);
/*! \brief Return a dummy operator name */
string GetOperatorName() {
if (op_prefix_.empty()) return "GradientOp";
return op_prefix_ + str::to(op_index_++);
return op_prefix_ + str::to(op_idx_++);
}
/*! \brief The mapping from input to grad */
......@@ -70,7 +70,7 @@ class DRAGON_API GraphGradientMaker {
string op_prefix_;
/*! \brief The counter of op name */
int64_t op_index_ = 0;
int64_t op_idx_ = 0;
};
} // namespace dragon
......
......@@ -7,140 +7,141 @@
namespace dragon {
void GraphOptimizer::BuildDAG(const GraphDef& input_def) {
dag_.clear();
colored_.clear();
void GraphOptimizer::BuildDAG(const GraphDef& graph) {
nodes_.clear();
reference_count_.clear();
for (int i = 0; i < input_def.op_size(); ++i) {
const auto& op = input_def.op(i);
for (const auto& u : op.input()) {
reference_count_[u] += 1;
for (int i = 0; i < graph.op_size(); ++i) {
const auto& op = graph.op(i);
for (const auto& in : op.input()) {
reference_count_[in] += 1;
}
for (const auto& out : op.output()) {
if (op.input().empty()) {
nodes_[""].childs.push_back(out);
nodes_[out].parents.push_back("");
} else {
for (const auto& in : op.input()) {
nodes_[in].childs.push_back(out);
nodes_[out].parents.push_back(in);
}
for (const auto& v : op.output()) {
vector<string> u_set(op.input().begin(), op.input().end());
if (u_set.empty()) u_set.resize(op.output_size());
for (const auto& u : u_set) {
dag_[v].parents.push_back(u);
dag_[u].childs.push_back(v);
dag_[v].op_idx = i;
}
dag_[v].op_def = op;
nodes_[out].op_idx = i;
nodes_[out].op_def = op;
}
}
}
GraphDef GraphOptimizer::PruneNodes(const GraphDef& input_def) {
GraphDef GraphOptimizer::EliminateUnused(const GraphDef& graph) {
// Initialization
BuildDAG(input_def);
BuildDAG(graph);
used_.clear();
// Backward pass from targets
for (const auto& target : input_def.output()) {
if (colored_[target]) continue;
BackwardPrunePass(target);
// Eliminate the unused nodes
for (const auto& out : graph.output()) {
EliminateUnusedNode(out);
}
for (const auto& grad_info : input_def.grad_info()) {
const auto u = grad_info.y() + "_grad";
for (const auto& grad_info : graph.grad_info()) {
const auto grad_y = grad_info.y() + "_grad";
for (const auto& x : grad_info.xs()) {
visited_.clear();
ForwardPrunePass(u, x + "_grad", std::deque<string>({u}));
EliminateUnusedNode(grad_y, x + "_grad");
}
}
// Select all colored operators
// Select the used operators
set<int> selected_op_indices;
for (auto it : colored_) {
if (dag_[it.first].op_idx == -1) continue;
selected_op_indices.insert(dag_[it.first].op_idx);
for (auto it : used_) {
if (nodes_[it.first].op_idx == -1) continue;
selected_op_indices.insert(nodes_[it.first].op_idx);
}
// Remove the tensors that can not be produced
// Prepare the registered placeholders
Set<string> outputs;
for (const auto& name : ws_->tensors()) {
outputs.insert(name);
}
// Generate the final op sequence
map<int, OperatorDef> final_sequence;
// Rewrite graph
GraphDef graph_v2(graph);
graph_v2.clear_op();
for (auto op_idx : selected_op_indices) {
const auto& op = input_def.op(op_idx);
auto new_op(input_def.op(op_idx));
const auto& op = graph.op(op_idx);
auto* op_v2 = graph_v2.add_op();
op_v2->CopyFrom(op);
// Rewrite inputs
for (int i = 0; i < op.input_size(); ++i) {
const auto& input = op.input(i);
if (!colored_[input] || outputs.count(input) == 0)
*new_op.mutable_input(i) = "";
const auto& in = op.input(i);
if (!used_[in] || outputs.count(in) == 0) {
*op_v2->mutable_input(i) = "";
}
}
// Rewrite outputs
for (int i = 0; i < op.output_size(); ++i) {
const auto& output = op.output(i);
if (!colored_[output]) {
*new_op.mutable_output(i) = "";
const auto& out = op.output(i);
if (!used_[out]) {
*op_v2->mutable_output(i) = "";
} else {
outputs.insert(output);
outputs.insert(out);
}
}
// Rewrite hand-craft cases
if (op.type() == "AffineGradient") {
if (new_op.output(1).empty()) *new_op.mutable_input(0) = "";
if (op_v2->output(1).empty()) *op_v2->mutable_input(0) = "";
} else if (op.type() == "MulGradient") {
if (new_op.output(0).empty()) *new_op.mutable_input(1) = "";
if (new_op.output(1).empty()) *new_op.mutable_input(0) = "";
if (op_v2->output(0).empty()) *op_v2->mutable_input(1) = "";
if (op_v2->output(1).empty()) *op_v2->mutable_input(0) = "";
} else if (op.type() == "DivGradient") {
if (new_op.output(1).empty()) {
*new_op.mutable_input(0) = "";
if (new_op.output(0).empty()) *new_op.mutable_input(1) = "";
if (op_v2->output(1).empty()) {
*op_v2->mutable_input(0) = "";
if (op_v2->output(0).empty()) *op_v2->mutable_input(1) = "";
}
}
// Push into the final sequence
final_sequence[op_idx].CopyFrom(new_op);
}
// Done!
GraphDef output_def(input_def);
output_def.clear_op();
for (auto it : final_sequence)
output_def.add_op()->CopyFrom(it.second);
return output_def;
return graph_v2;
}
void GraphOptimizer::AddInplace(
const GraphDef& input_def,
void GraphOptimizer::PlanInplace(
const GraphDef& graph,
Map<string, Set<string>>& output_aliases) {
// Initialization
BuildDAG(input_def);
// Generate runtime aliases map
for (auto& u_iter : reference_count_) {
if (u_iter.second == 1 && !u_iter.first.empty() &&
dag_[u_iter.first].childs.size() > 0) {
const auto& u = u_iter.first;
const auto& v0 = dag_[u].childs[0];
const auto& op_def = dag_[v0].op_def;
const auto* op_schema = OpSchemaRegistry::Schema(op_def.type());
for (int i = 0; i < op_def.input_size(); ++i)
for (int j = 0; j < op_def.output_size(); ++j)
if (op_schema->CheckInplace != nullptr && op_def.input(i) == u &&
op_schema->CheckInplace(i, j))
output_aliases[op_def.output(j)].insert(u);
BuildDAG(graph);
// Generate aliases map to apply in-place
for (const auto& iter : reference_count_) {
const auto& in = iter.first;
if (iter.second == 1 && !in.empty() && nodes_[in].childs.size() > 0) {
const auto& op = nodes_[nodes_[in].childs[0]].op_def;
const auto* schema = OpSchemaRegistry::Schema(op.type());
for (int i = 0; i < op.input_size(); ++i) {
if (op.input(i) == in) {
for (int j = 0; j < op.output_size(); ++j) {
if (schema->CheckInplace(i, j)) {
output_aliases[op.output(j)].insert(in);
}
}
}
}
}
}
}
GraphDef GraphOptimizer::MirrorStage(
const GraphDef& input_def,
Map<string, vec32_t>& op_indices) {
GraphDef output_def(input_def);
Map<string, set<int>> fake_op_indices;
GraphDef GraphOptimizer::PlanCheckpoint(
const GraphDef& graph,
Map<string, vec32_t>& subgraph_indices) {
GraphDef graph_v2(graph);
Map<string, set<int>> op_indices;
Map<string, string> rename_map;
Map<string, int> versions;
// Check mirror stage
for (const auto& op : input_def.op()) {
// Check the mirror stage setting
for (const auto& op : graph.op()) {
if (str::find(op.type(), "Gradient")) continue;
bool mirror_stage = false;
for (auto& arg : op.arg())
if (arg.name() == "mirror_stage") mirror_stage |= (bool)arg.i();
for (auto& arg : op.arg()) {
if (arg.name() == "mirror_stage") {
mirror_stage |= (bool)arg.i();
}
}
if (mirror_stage) {
// We only assume X(0) can be recomputed
rename_map[op.input(0)] = "placeholder";
......@@ -149,24 +150,25 @@ GraphDef GraphOptimizer::MirrorStage(
// Allocate the temporal buffers
string v2_name, version_name;
for (int op_idx = 0; op_idx < input_def.op_size(); ++op_idx) {
const auto& op = input_def.op(op_idx);
auto* new_op = output_def.mutable_op(op_idx);
for (int op_idx = 0; op_idx < graph.op_size(); ++op_idx) {
const auto& op = graph.op(op_idx);
auto* op_v2 = graph_v2.mutable_op(op_idx);
vector<string> used_buffers;
for (int i = 0; i < op.input_size(); ++i) {
const auto& it = rename_map.find(op.input(i));
if (it != rename_map.end() && it->second != "placeholder") {
*new_op->mutable_input(i) = it->second;
*op_v2->mutable_input(i) = it->second;
used_buffers.emplace_back(it->second);
}
}
for (int i = 0; i < op.output_size(); ++i) {
bool inplace_flag = false;
for (const auto& u : op.input())
if (u == op.output(i)) inplace_flag = true;
for (const auto& in : op.input()) {
if (in == op.output(i)) inplace_flag = true;
}
if (rename_map.count(op.output(i))) {
if (inplace_flag && rename_map[op.output(i)] != "placeholder") {
*new_op->mutable_output(i) = rename_map[op.output(i)];
*op_v2->mutable_output(i) = rename_map[op.output(i)];
continue;
}
for (int j = 0; j < GRAPH_TEMPORAL_OUTPUT_MAX_SIZE; ++j) {
......@@ -183,45 +185,42 @@ GraphDef GraphOptimizer::MirrorStage(
CHECK(!v2_name.empty()) << "\nNo enough buffers for outputs.";
ws_->CreateTensor(v2_name)->set_version(0);
version_name = "/ver:" + str::to(versions[v2_name]++);
*new_op->mutable_output(i) = rename_map[op.output(i)] =
*op_v2->mutable_output(i) = rename_map[op.output(i)] =
v2_name + version_name;
}
}
}
// Plan the minimum recomputing ops for temporal buffers
for (int i = 0; i < input_def.op_size(); ++i) {
const auto& input_op = input_def.op(i);
const auto& output_op = output_def.op(i);
/*
* DP(v) = {DP(u) if input(u) != output(u) else {}} + {i}
*/
set<int> minimum_ops = {i};
for (int j = 0; j < input_op.input_size(); ++j) {
if (input_op.input(j) != output_op.input(j)) {
for (auto idx : fake_op_indices[input_op.input(j)])
minimum_ops.insert(idx);
// Determine the recomputing ops for temporal buffers
for (int i = 0; i < graph.op_size(); ++i) {
const auto &op = graph.op(i), &op_v2 = graph_v2.op(i);
set<int> recomputing_ops = {i};
for (int j = 0; j < op.input_size(); ++j) {
if (op.input(j) != op_v2.input(j)) {
for (auto op_idx : op_indices[op.input(j)]) {
recomputing_ops.insert(op_idx);
}
}
}
for (const auto& output : input_op.output()) {
for (auto idx : minimum_ops)
fake_op_indices[output].insert(idx);
for (const auto& out : op.output()) {
for (auto op_idx : recomputing_ops) {
op_indices[out].insert(op_idx);
}
}
}
// Bind to the renamed tensors
for (const auto& it : rename_map) {
for (auto op_idx : fake_op_indices[it.first])
op_indices[it.second].push_back(op_idx);
for (auto op_idx : op_indices[it.first]) {
subgraph_indices[it.second].push_back(op_idx);
}
}
// Done!
return output_def;
// Done
return graph_v2;
}
GraphDef GraphOptimizer::SimulateGC(const GraphDef& input_def) {
GraphDef GraphOptimizer::SimulateGC(const GraphDef& graph) {
Set<string> blacklist = {""};
Map<string, int> ref_count;
Map<string, string> rename_map;
......@@ -241,42 +240,39 @@ GraphDef GraphOptimizer::SimulateGC(const GraphDef& input_def) {
};
// Count the references
for (const auto& op : input_def.op()) {
for (const auto& input : op.input())
ref_count[input] += 1;
for (const auto& op : graph.op()) {
for (const auto& in : op.input()) {
ref_count[in] += 1;
}
// We should preserve the targets
for (auto& e : input_def.output()) {
blacklist.insert(e);
}
// Rewritten the inputs and outputs
auto output_def(input_def);
for (int op_idx = 0; op_idx < input_def.op_size(); ++op_idx) {
const auto& op = input_def.op(op_idx);
auto* new_op = output_def.mutable_op(op_idx);
// Preserve the graph outputs
for (auto& out : graph.output()) {
blacklist.insert(out);
}
// Rewrite the inputs and outputs
auto graph_v2(graph);
for (int op_idx = 0; op_idx < graph.op_size(); ++op_idx) {
const auto& op = graph.op(op_idx);
auto* op_v2 = graph_v2.mutable_op(op_idx);
// Ignore the init ops
if (op.input_size() == 0) continue;
// We need to collect the dead buffers
// Reuse them when current operator is done
// We need to collect the dead buffers.
// Reuse them when current operator is done.
vector<string> dead_buffers;
// Rewrite inputs
for (int i = 0; i < op.input_size(); ++i) {
const auto& name = op.input(i);
if (rename_map.count(name)) {
*new_op->mutable_input(i) = rename_map[name];
*op_v2->mutable_input(i) = rename_map[name];
}
ref_count[name]--;
if (ref_count[name] == 0 &&
str::startswith(new_op->input(i), "/share/buffer/output:")) {
dead_buffers.push_back(new_op->input(i));
str::startswith(op_v2->input(i), "/share/buffer/output:")) {
dead_buffers.push_back(op_v2->input(i));
}
}
// Rewrite outputs
if (!star_ops.count(op.type())) {
for (int i = 0; i < op.output_size(); ++i) {
......@@ -286,55 +282,49 @@ GraphDef GraphOptimizer::SimulateGC(const GraphDef& input_def) {
for (const auto& input : op.input())
if (name == input) inplace_flag = true;
if (inplace_flag) {
*new_op->mutable_output(i) = new_op->input(i);
*op_v2->mutable_output(i) = op_v2->input(i);
} else {
rename_map[name] = *new_op->mutable_output(i) = get_buffer();
rename_map[name] = *op_v2->mutable_output(i) = get_buffer();
}
}
}
// Update the pool
for (auto& buffer : dead_buffers) {
pool.emplace_back(buffer);
}
}
return output_def;
return graph_v2;
}
void GraphOptimizer::ForwardPrunePass(
const string& u,
const string& leaf,
const std::deque<string>& path) {
if (visited_.count(u)) {
if (visited_[u]) {
for (const auto& node : path) {
visited_[node] = colored_[node] = true;
}
}
void GraphOptimizer::EliminateUnusedNode(
const string& source,
const string& sink) {
if (visited_.count(source)) return;
visited_[source] = false;
for (const auto& next : nodes_[source].childs) {
if (next == sink) {
visited_[next] = used_[next] = true;
visited_[source] = used_[source] = true;
return;
}
visited_[u] = false;
for (int i = 0; i < dag_[u].childs.size(); ++i) {
auto v = dag_[u].childs[i];
auto new_path(path);
new_path.push_back(v);
if (v == leaf) {
for (const auto& node : new_path) {
visited_[node] = colored_[node] = true;
EliminateUnusedNode(next, sink);
if (visited_[next]) {
visited_[source] = used_[source] = true;
}
return;
}
ForwardPrunePass(v, leaf, new_path);
}
}
void GraphOptimizer::BackwardPrunePass(const string& v) {
colored_[v] = true;
for (int i = 0; i < dag_[v].parents.size(); ++i) {
auto u = dag_[v].parents[i];
if (colored_.count(u)) continue;
BackwardPrunePass(u);
void GraphOptimizer::EliminateUnusedNode(const string& sink) {
std::queue<const string*> q;
q.push(&sink);
while (!q.empty()) {
const auto& source = *q.front();
q.pop();
used_[source] = true;
for (const auto& last : nodes_[source].parents) {
if (used_.count(last)) continue;
q.push(&last);
}
}
}
......
......@@ -32,45 +32,42 @@ class GraphOptimizer {
/*! \brief Default constructor */
GraphOptimizer(Workspace* ws) : ws_(ws) {}
/*! \brief Build the DAG resources for given def */
void BuildDAG(const GraphDef& input_def);
/*! \brief Build the DAG */
void BuildDAG(const GraphDef& graph);
/*! \brief Prune the redundant nodes (-O1) */
GraphDef PruneNodes(const GraphDef& input_def);
/*! \brief Eliminate the unused outputs and operators */
GraphDef EliminateUnused(const GraphDef& graph);
/*! \brief Add the inplace for outputs (-O2) */
void AddInplace(
const GraphDef& input_def,
/*! \brief Plan the inplace for inputs */
void PlanInplace(
const GraphDef& graph,
Map<string, Set<string>>& output_aliases);
/*! \brief Plan the recomputing for inputs (-O3) */
GraphDef MirrorStage(
const GraphDef& input_def,
Map<string, vec32_t>& op_indices);
/*! \brief Plan the checkpoint for inputs */
GraphDef PlanCheckpoint(
const GraphDef& graph,
Map<string, vec32_t>& subgraph_indices);
/*! \brief Allocate the buffer for outputs (-O3) */
GraphDef SimulateGC(const GraphDef& input_def);
/*! \brief Allocate the shared buffer for outputs */
GraphDef SimulateGC(const GraphDef& graph);
protected:
/*! \brief Pass from gradients to remove unused nodes */
void ForwardPrunePass(
const string& u,
const string& leaf,
const std::deque<string>& path);
/*! \brief Remove the unused nodes from a sink to all sources */
void EliminateUnusedNode(const string& sink);
/*! \brief Pass from targets to remove unused nodes */
void BackwardPrunePass(const string& v);
/*! \brief Remove the unused nodes from a source to a sink */
void EliminateUnusedNode(const string& source, const string& sink);
/* \brief Store the workspace of parent graph */
/* \brief The graph workspace */
Workspace* ws_;
/* \brief Store the DAG */
Map<string, Node> dag_;
/* \brief The graph nodes */
Map<string, Node> nodes_;
/* \brief Store the traversal flags */
Map<string, bool> visited_, colored_;
/* \brief The traversal flags */
Map<string, bool> visited_, used_;
/* \brief Store the count of references */
/* \brief The reference count */
Map<string, int> reference_count_;
private:
......
......@@ -173,10 +173,7 @@ TryCreateOperator(const string& key, const OperatorDef& def, Workspace* ws) {
OperatorBase* NewOperator(const OperatorDef& def, Workspace* ws) {
auto* schema = OpSchemaRegistry::Schema(def.type());
if (schema != nullptr) {
CHECK(schema->Verify(def))
<< "\nOperator failed to pass the schema checking.";
}
if (schema != nullptr) CHECK(schema->Verify(def));
OperatorDef mutable_def(def);
// Heuristically make each random seed slightly different
static unsigned int seed_offset = 0;
......
......@@ -14,7 +14,6 @@ bool OpSchema::Verify(const OperatorDef& def) const {
<< " is not in range [min=" << min_output_
<< ", max=" << max_output_ << "]";
}
if (CheckInplace != nullptr) {
for (int i = 0; i < def.input_size(); ++i) {
if (def.input(i).empty()) continue;
for (int j = 0; j < def.output_size(); ++j) {
......@@ -25,7 +24,6 @@ bool OpSchema::Verify(const OperatorDef& def) const {
}
}
}
}
return true;
}
......@@ -49,7 +47,12 @@ OpSchema& OpSchema::NumOutputs(int min_num, int max_num) {
return *this;
}
OpSchema& OpSchema::Inplace(set<pair<int, int>> inplace) {
OpSchema& OpSchema::AllowInplace(std::function<bool(int, int)> inplace) {
CheckInplace = inplace;
return *this;
}
OpSchema& OpSchema::AllowInplace(set<pair<int, int>> inplace) {
CheckInplace = [inplace](int in, int out) -> bool {
return (inplace.count(std::make_pair(in, out)) > 0);
};
......
......@@ -20,6 +20,9 @@
namespace dragon {
/*!
* \brief Class to record the schema of operator.
*/
class DRAGON_API OpSchema {
public:
/*! \brief Default constructor */
......@@ -27,15 +30,12 @@ class DRAGON_API OpSchema {
Init();
}
/*! \brief Constructor with defined spec */
/*! \brief Constructor with the defined spec */
OpSchema(const string& op_type, const string& file, const int line)
: op_type_(op_type), file_(file), line_(line) {
Init();
}
/*! \brief Check if the in-place setting is matched */
std::function<bool(int, int)> CheckInplace = nullptr;
/*! \brief Set a fixed number of inputs */
OpSchema& NumInputs(int n);
......@@ -48,12 +48,18 @@ class DRAGON_API OpSchema {
/*! \brief Set the min and max number of outputs */
OpSchema& NumOutputs(int min_num, int max_num);
/*! \brief Set the in-place setting */
OpSchema& Inplace(set<pair<int, int>> inplace);
/*! \brief Set the rule to allow inplace with a group of indices */
OpSchema& AllowInplace(set<pair<int, int>> inplace);
/*! \brief Verify if the def matches the schema */
/*! \brief Set the rule to allow inplace with a function */
OpSchema& AllowInplace(std::function<bool(int, int)> inplace);
/*! \brief Check if the given def matches this schema */
bool Verify(const OperatorDef& def) const;
/*! \brief Check if the inplace is allowed */
std::function<bool(int, int)> CheckInplace = [](int, int) { return false; };
private:
/*! \brief Initialize the default settings */
void Init() {
......
......@@ -242,7 +242,7 @@ PYBIND11_MODULE(libdragon_python, m) {
maker.Make(op_defs, targets, input_grads, graph_def);
py::gil_scoped_release g;
if (!retain_grads) {
graph_def = maker.Share(graph_def);
graph_def = maker.Optimize(graph_def);
}
for (const auto& op_def : graph_def.op()) {
if (verbose) {
......
......@@ -129,7 +129,7 @@ OPERATOR_SCHEMA(DropBlock2d)
/* Y */
.NumOutputs(1)
/* X => Y */
.Inplace({{0, 0}});
.AllowInplace({{0, 0}});
DEPLOY_CPU(DropBlock2dGradient);
#ifdef USE_CUDA
......@@ -142,7 +142,7 @@ OPERATOR_SCHEMA(DropBlock2dGradient)
/* dX */
.NumOutputs(1)
/* dY => dX */
.Inplace({{0, 0}});
.AllowInplace({{0, 0}});
REGISTER_GRADIENT(DropBlock2d, SimpleGradientMaker);
......
......@@ -95,7 +95,7 @@ OPERATOR_SCHEMA(DropPath)
/* Y */
.NumOutputs(1)
/* X => Y */
.Inplace({{0, 0}});
.AllowInplace({{0, 0}});
OPERATOR_SCHEMA(DropPathGradient)
/* dY */
......@@ -103,7 +103,7 @@ OPERATOR_SCHEMA(DropPathGradient)
/* dX */
.NumOutputs(1)
/* dY => dX */
.Inplace({{0, 0}});
.AllowInplace({{0, 0}});
REGISTER_GRADIENT(DropPath, SimpleGradientMaker);
......
......@@ -84,7 +84,7 @@ OPERATOR_SCHEMA(Dropout)
/* Y */
.NumOutputs(1)
/* X => Y */
.Inplace({{0, 0}});
.AllowInplace({{0, 0}});
OPERATOR_SCHEMA(DropoutGradient)
/* dY */
......@@ -92,7 +92,7 @@ OPERATOR_SCHEMA(DropoutGradient)
/* dX */
.NumOutputs(1)
/* dY => dX */
.Inplace({{0, 0}});
.AllowInplace({{0, 0}});
REGISTER_GRADIENT(Dropout, SimpleGradientMaker);
......
......@@ -54,7 +54,7 @@ OPERATOR_SCHEMA(Elu)
/* Y */
.NumOutputs(1)
/* X => Y */
.Inplace({{0, 0}});
.AllowInplace({{0, 0}});
OPERATOR_SCHEMA(EluGradient)
/* Y, dY */
......@@ -62,7 +62,7 @@ OPERATOR_SCHEMA(EluGradient)
/* dX */
.NumOutputs(1)
/* dY => dX */
.Inplace({{1, 0}});
.AllowInplace({{1, 0}});
REGISTER_GRADIENT(Elu, InplaceGradientMaker);
......
......@@ -73,7 +73,7 @@ OPERATOR_SCHEMA(Relu)
/* Y */
.NumOutputs(1)
/* X => Y */
.Inplace({{0, 0}});
.AllowInplace({{0, 0}});
OPERATOR_SCHEMA(ReluGradient)
/* Y, dY */
......@@ -81,7 +81,7 @@ OPERATOR_SCHEMA(ReluGradient)
/* dX */
.NumOutputs(1)
/* dY => dX */
.Inplace({{1, 0}});
.AllowInplace({{1, 0}});
REGISTER_GRADIENT(Relu, InplaceGradientMaker);
......
......@@ -56,7 +56,7 @@ OPERATOR_SCHEMA(Selu)
/* Y */
.NumOutputs(1)
/* X => Y */
.Inplace({{0, 0}});
.AllowInplace({{0, 0}});
OPERATOR_SCHEMA(SeluGradient)
/* Y, dY */
......@@ -64,7 +64,7 @@ OPERATOR_SCHEMA(SeluGradient)
/* dX */
.NumOutputs(1)
/* dY => dX */
.Inplace({{1, 0}});
.AllowInplace({{1, 0}});
REGISTER_GRADIENT(Selu, InplaceGradientMaker);
......
......@@ -52,7 +52,7 @@ OPERATOR_SCHEMA(Sigmoid)
/* Y */
.NumOutputs(1)
/* X => Y */
.Inplace({{0, 0}});
.AllowInplace({{0, 0}});
OPERATOR_SCHEMA(SigmoidGradient)
/* Y, dY */
......@@ -60,7 +60,7 @@ OPERATOR_SCHEMA(SigmoidGradient)
/* dX */
.NumOutputs(1)
/* dY => dX */
.Inplace({{1, 0}});
.AllowInplace({{1, 0}});
REGISTER_GRADIENT(Sigmoid, InplaceGradientMaker);
......
......@@ -60,7 +60,7 @@ OPERATOR_SCHEMA(Softmax)
/* Y */
.NumOutputs(1)
/* X => Y */
.Inplace({{0, 0}});
.AllowInplace({{0, 0}});
OPERATOR_SCHEMA(SoftmaxGradient)
/* Y, dY */
......@@ -68,7 +68,7 @@ OPERATOR_SCHEMA(SoftmaxGradient)
/* dX */
.NumOutputs(1)
/* dY => dX */
.Inplace({{1, 0}});
.AllowInplace({{1, 0}});
REGISTER_GRADIENT(Softmax, InplaceGradientMaker);
......
......@@ -52,7 +52,7 @@ OPERATOR_SCHEMA(Tanh)
/* Y */
.NumOutputs(1)
/* X => Y */
.Inplace({{0, 0}});
.AllowInplace({{0, 0}});
OPERATOR_SCHEMA(TanhGradient)
/* Y, dY */
......@@ -60,7 +60,7 @@ OPERATOR_SCHEMA(TanhGradient)
/* dX */
.NumOutputs(1)
/* dY => dX */
.Inplace({{1, 0}});
.AllowInplace({{1, 0}});
REGISTER_GRADIENT(Tanh, InplaceGradientMaker);
......
......@@ -45,7 +45,7 @@ OPERATOR_SCHEMA(ExpandDims)
/* Y */
.NumOutputs(1)
/* X => Y */
.Inplace({{0, 0}});
.AllowInplace({{0, 0}});
OPERATOR_SCHEMA(ExpandDimsGradient)
/* dY */
......@@ -53,7 +53,7 @@ OPERATOR_SCHEMA(ExpandDimsGradient)
/* dX */
.NumOutputs(1)
/* dY => dX */
.Inplace({{0, 0}});
.AllowInplace({{0, 0}});
REGISTER_GRADIENT(ExpandDims, SimpleGradientMaker);
......
......@@ -56,7 +56,7 @@ OPERATOR_SCHEMA(Flatten)
/* Y */
.NumOutputs(1)
/* X => Y */
.Inplace({{0, 0}});
.AllowInplace({{0, 0}});
OPERATOR_SCHEMA(FlattenGradient)
/* dY */
......@@ -64,7 +64,7 @@ OPERATOR_SCHEMA(FlattenGradient)
/* dX */
.NumOutputs(1)
/* dY => dX */
.Inplace({{0, 0}});
.AllowInplace({{0, 0}});
REGISTER_GRADIENT(Flatten, SimpleGradientMaker);
......
......@@ -69,7 +69,7 @@ OPERATOR_SCHEMA(Reshape)
/* Y */
.NumOutputs(1)
/* X => Y */
.Inplace({{0, 0}});
.AllowInplace({{0, 0}});
OPERATOR_SCHEMA(ReshapeGradient)
/* dY */
......@@ -77,7 +77,7 @@ OPERATOR_SCHEMA(ReshapeGradient)
/* dX */
.NumOutputs(1)
/* dY => dX */
.Inplace({{0, 0}});
.AllowInplace({{0, 0}});
REGISTER_GRADIENT(Reshape, SimpleGradientMaker);
......
......@@ -45,7 +45,7 @@ OPERATOR_SCHEMA(Squeeze)
/* Y */
.NumOutputs(1)
/* X => Y */
.Inplace({{0, 0}});
.AllowInplace({{0, 0}});
OPERATOR_SCHEMA(SqueezeGradient)
/* dY */
......@@ -53,7 +53,7 @@ OPERATOR_SCHEMA(SqueezeGradient)
/* dX */
.NumOutputs(1)
/* dY => dX */
.Inplace({{0, 0}});
.AllowInplace({{0, 0}});
REGISTER_GRADIENT(Squeeze, SimpleGradientMaker);
......
......@@ -193,7 +193,7 @@ DEPLOY_CPU(Collective);
DEPLOY_CUDA(Collective);
#endif
OPERATOR_SCHEMA(Collective);
OPERATOR_SCHEMA(Collective).AllowInplace([](int, int) -> bool { return true; });
} // namespace dragon
......
......@@ -122,7 +122,7 @@ OPERATOR_SCHEMA(GradientAdd)
/* Y */
.NumOutputs(1)
/* X1 => Y */
.Inplace({{0, 0}});
.AllowInplace({{0, 0}});
OPERATOR_SCHEMA(StopGradient)
/* X */
......@@ -130,7 +130,7 @@ OPERATOR_SCHEMA(StopGradient)
/* Y */
.NumOutputs(1)
/* X => Y */
.Inplace({{0, 0}});
.AllowInplace({{0, 0}});
NO_GRADIENT(StopGradient);
......
......@@ -107,16 +107,16 @@ OPERATOR_SCHEMA(Add)
.NumInputs(2)
/* Y */
.NumOutputs(1)
/* A => Y */
.Inplace({{0, 0}, {1, 0}});
/* A => Y, B => Y */
.AllowInplace({{0, 0}, {1, 0}});
OPERATOR_SCHEMA(AddGradient)
/* dY */
.NumInputs(1)
/* dA, dB */
.NumOutputs(2)
/* dY => dA */
.Inplace({{0, 0}, {0, 1}});
/* dY => dA, dY => dB */
.AllowInplace({{0, 0}, {0, 1}});
REGISTER_GRADIENT(Add, SimpleGradientMaker);
......
......@@ -151,7 +151,7 @@ OPERATOR_SCHEMA(Affine)
/* Y */
.NumOutputs(1)
/* X => Y */
.Inplace({{0, 0}});
.AllowInplace({{0, 0}});
OPERATOR_SCHEMA(AffineGradient)
/* X, W, dY */
......@@ -159,7 +159,7 @@ OPERATOR_SCHEMA(AffineGradient)
/* dX, dW, dB */
.NumOutputs(3)
/* dY => dX */
.Inplace({{2, 0}});
.AllowInplace({{2, 0}});
namespace {
......
......@@ -6,49 +6,28 @@ namespace dragon {
template <class Context>
template <typename T>
void AxpbyOp<Context>::DoRunWithType(Tensor* X, Tensor* Y) {
CHECK_EQ(X->count(), Y->count());
auto* x = X->template data<T, Context>();
auto* y = Y->template mutable_data<T, Context>();
void AxpbyOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0);
auto* x = X.template data<T, Context>();
auto* y = Y->ReshapeLike(X)->template mutable_data<T, Context>();
if (beta_ == 1.f) {
if (alpha_ == 1.f) {
math::Add(X->count(), x, y, y, ctx());
math::Add(X.count(), x, y, y, ctx());
} else {
math::Axpy(X->count(), alpha_, x, y, ctx());
math::Axpy(X.count(), alpha_, x, y, ctx());
}
} else {
if (alpha_ == 0.f) {
math::Scale(X->count(), beta_, y, y, ctx());
math::Scale(X.count(), beta_, y, y, ctx());
} else {
math::Axpby(X->count(), alpha_, x, beta_, y, ctx());
math::Axpby(X.count(), alpha_, x, beta_, y, ctx());
}
}
}
template <class Context>
void AxpbyOp<Context>::RunOnDevice() {
for (int i = 0; i < InputSize(); i++) {
auto &X = Input(i), *Y = Output(i);
Y->ReshapeLike(X);
if (XIsType(X, int8_t)) {
DoRunWithType<int8_t>(&X, Y);
} else if (XIsType(X, uint8_t)) {
DoRunWithType<uint8_t>(&X, Y);
} else if (XIsType(X, int)) {
DoRunWithType<int>(&X, Y);
} else if (XIsType(X, int64_t)) {
DoRunWithType<int64_t>(&X, Y);
} else if (XIsType(X, float16)) {
DoRunWithType<float16>(&X, Y);
} else if (XIsType(X, float)) {
DoRunWithType<float>(&X, Y);
} else if (XIsType(X, double)) {
DoRunWithType<double>(&X, Y);
} else
LOG(FATAL) << MessageForUnsupported(
types::to_string(X.meta()),
{"int8", "uint8", "int32", "int64", "float16", "float32", "float64"});
}
DispatchHelper<MathTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(Axpby);
......@@ -57,10 +36,12 @@ DEPLOY_CUDA(Axpby);
#endif
OPERATOR_SCHEMA(Axpby)
/* X1, ... */
.NumInputs(1, INT_MAX)
/* Y1, ... */
.NumOutputs(1, INT_MAX);
/* X */
.NumInputs(1)
/* Y */
.NumOutputs(1)
/* X => Y */
.AllowInplace({{0, 0}});
NO_GRADIENT(Axpby);
......
......@@ -207,16 +207,16 @@ OPERATOR_SCHEMA(Div)
.NumInputs(2)
/* Y */
.NumOutputs(1)
/* A => Y */
.Inplace({{0, 0}, {1, 0}});
/* A => Y, B => Y */
.AllowInplace({{0, 0}, {1, 0}});
OPERATOR_SCHEMA(DivGradient)
/* A, B, dY */
.NumInputs(3)
/* dA, dB */
.NumOutputs(2)
/* dY => dA */
.Inplace({{2, 0}, {2, 1}});
/* dY => dA, dY => dB */
.AllowInplace({{2, 0}, {2, 1}});
REGISTER_GRADIENT(Div, GenericGradientMaker);
......
......@@ -172,15 +172,15 @@ DEPLOY_CUDA(Greater);
DEPLOY_CUDA(GreaterEqual);
#endif
OPERATOR_SCHEMA(Ceil).NumInputs(1).NumOutputs(1).Inplace({{0, 0}});
OPERATOR_SCHEMA(Floor).NumInputs(1).NumOutputs(1).Inplace({{0, 0}});
OPERATOR_SCHEMA(Round).NumInputs(1).NumOutputs(1).Inplace({{0, 0}});
OPERATOR_SCHEMA(Sign).NumInputs(1).NumOutputs(1).Inplace({{0, 0}});
OPERATOR_SCHEMA(Sqrt).NumInputs(1).NumOutputs(1).Inplace({{0, 0}});
OPERATOR_SCHEMA(Rsqrt).NumInputs(1).NumOutputs(1).Inplace({{0, 0}});
OPERATOR_SCHEMA(Exp).NumInputs(1).NumOutputs(1).Inplace({{0, 0}});
OPERATOR_SCHEMA(Log).NumInputs(1).NumOutputs(1).Inplace({{0, 0}});
OPERATOR_SCHEMA(Invert).NumInputs(1).NumOutputs(1).Inplace({{0, 0}});
OPERATOR_SCHEMA(Ceil).NumInputs(1).NumOutputs(1).AllowInplace({{0, 0}});
OPERATOR_SCHEMA(Floor).NumInputs(1).NumOutputs(1).AllowInplace({{0, 0}});
OPERATOR_SCHEMA(Round).NumInputs(1).NumOutputs(1).AllowInplace({{0, 0}});
OPERATOR_SCHEMA(Sign).NumInputs(1).NumOutputs(1).AllowInplace({{0, 0}});
OPERATOR_SCHEMA(Sqrt).NumInputs(1).NumOutputs(1).AllowInplace({{0, 0}});
OPERATOR_SCHEMA(Rsqrt).NumInputs(1).NumOutputs(1).AllowInplace({{0, 0}});
OPERATOR_SCHEMA(Exp).NumInputs(1).NumOutputs(1).AllowInplace({{0, 0}});
OPERATOR_SCHEMA(Log).NumInputs(1).NumOutputs(1).AllowInplace({{0, 0}});
OPERATOR_SCHEMA(Invert).NumInputs(1).NumOutputs(1).AllowInplace({{0, 0}});
OPERATOR_SCHEMA(Sin).NumInputs(1).NumOutputs(1);
OPERATOR_SCHEMA(Cos).NumInputs(1).NumOutputs(1);
OPERATOR_SCHEMA(Square).NumInputs(1).NumOutputs(1);
......
......@@ -43,7 +43,7 @@ class AxpbyOp final : public Operator<Context> {
void RunOnDevice() override;
template <typename T>
void DoRunWithType(Tensor* X, Tensor* Y);
void DoRunWithType();
protected:
float alpha_, beta_;
......
......@@ -31,7 +31,7 @@ OPERATOR_SCHEMA(ExpGradient)
/* dX */
.NumOutputs(1)
/* dY => dX */
.Inplace({{1, 0}});
.AllowInplace({{1, 0}});
REGISTER_GRADIENT(Exp, InplaceGradientMaker);
......
......@@ -189,16 +189,16 @@ OPERATOR_SCHEMA(Mul)
.NumInputs(2)
/* Y */
.NumOutputs(1)
/* A => Y */
.Inplace({{0, 0}, {1, 0}});
/* A => Y, B => Y */
.AllowInplace({{0, 0}, {1, 0}});
OPERATOR_SCHEMA(MulGradient)
/* A, B, dY */
.NumInputs(3)
/* dA, dB */
.NumOutputs(2)
/* dY => dA */
.Inplace({{2, 0}, {2, 1}});
/* dY => dA, dY => dB */
.AllowInplace({{2, 0}, {2, 1}});
REGISTER_GRADIENT(Mul, GenericGradientMaker);
......
......@@ -54,7 +54,7 @@ OPERATOR_SCHEMA(Neg)
/* Y */
.NumOutputs(1)
/* X => Y */
.Inplace({{0, 0}});
.AllowInplace({{0, 0}});
OPERATOR_SCHEMA(NegGradient)
/* dY */
......@@ -62,7 +62,7 @@ OPERATOR_SCHEMA(NegGradient)
/* dX */
.NumOutputs(1)
/* dY => dX */
.Inplace({{0, 0}});
.AllowInplace({{0, 0}});
REGISTER_GRADIENT(Neg, SimpleGradientMaker);
......
......@@ -53,7 +53,7 @@ OPERATOR_SCHEMA(Reciprocal)
/* Y */
.NumOutputs(1)
/* X => Y */
.Inplace({{0, 0}});
.AllowInplace({{0, 0}});
OPERATOR_SCHEMA(ReciprocalGradient)
/* Y, dY */
......@@ -61,7 +61,7 @@ OPERATOR_SCHEMA(ReciprocalGradient)
/* dX */
.NumOutputs(1)
/* dY => dX */
.Inplace({{1, 0}});
.AllowInplace({{1, 0}});
REGISTER_GRADIENT(Reciprocal, InplaceGradientMaker);
......
......@@ -32,7 +32,7 @@ OPERATOR_SCHEMA(RsqrtGradient)
/* dX */
.NumOutputs(1)
/* dY => dX */
.Inplace({{1, 0}});
.AllowInplace({{1, 0}});
REGISTER_GRADIENT(Rsqrt, InplaceGradientMaker);
......
......@@ -30,7 +30,7 @@ OPERATOR_SCHEMA(SignGradient)
/* dX */
.NumOutputs(1)
/* dY => dX */
.Inplace({{0, 0}});
.AllowInplace({{0, 0}});
REGISTER_GRADIENT(Sign, SimpleGradientMaker);
......
......@@ -37,7 +37,7 @@ OPERATOR_SCHEMA(SqrtGradient)
/* dX */
.NumOutputs(1)
/* dY => dX */
.Inplace({{1, 0}});
.AllowInplace({{1, 0}});
REGISTER_GRADIENT(Sqrt, InplaceGradientMaker);
......
......@@ -112,16 +112,16 @@ OPERATOR_SCHEMA(Sub)
.NumInputs(2)
/* Y */
.NumOutputs(1)
/* A => Y */
.Inplace({{0, 0}, {1, 0}});
/* A => Y, B => Y */
.AllowInplace({{0, 0}, {1, 0}});
OPERATOR_SCHEMA(SubGradient)
/* dY */
.NumInputs(1)
/* dA, dB */
.NumOutputs(2)
/* dY => dA */
.Inplace({{0, 0}, {0, 1}});
/* dY => dA, dY => dB */
.AllowInplace({{0, 0}, {0, 1}});
REGISTER_GRADIENT(Sub, SimpleGradientMaker);
......
......@@ -89,7 +89,7 @@ OPERATOR_SCHEMA(BiasAdd)
/* Y */
.NumOutputs(1)
/* X => Y */
.Inplace({{0, 0}});
.AllowInplace({{0, 0}});
OPERATOR_SCHEMA(BiasAddGradient)
/* dY */
......@@ -97,7 +97,7 @@ OPERATOR_SCHEMA(BiasAddGradient)
/* dX, dB */
.NumOutputs(2)
/* dY => dX */
.Inplace({{0, 0}});
.AllowInplace({{0, 0}});
REGISTER_GRADIENT(BiasAdd, SimpleGradientMaker);
......
......@@ -289,7 +289,7 @@ template <class Context>
template <typename T>
void CuDNNConv2dGradientOp<Context>::ResetDesc() {
auto &X = Input(0), &W = Input(1), &dY = Input(-1);
auto *dX = Output(0), *dW = Output(1);
// auto *dX = Output(0), *dW = Output(1);
bool input_changed = (X.dims() != input_dims_);
bool filter_changed = (W.dims() != filter_dims_);
if (input_changed || filter_changed) {
......@@ -328,8 +328,8 @@ void CuDNNConv2dGradientOp<Context>::ResetDesc() {
exhaustive_search_data_ = true;
exhaustive_search_filter_ = true;
} else {
if (dW->has_name()) {
#if CUDNN_VERSION_MIN(7, 0, 0)
{
int num_valid_algos;
constexpr int num_algos = CUDNN_CONV_NUM_BWD_FILTER_ALGOS;
cudnnConvolutionBwdFilterAlgoPerf_t stats[num_algos];
......@@ -353,20 +353,8 @@ void CuDNNConv2dGradientOp<Context>::ResetDesc() {
CHECK(algo_is_found)
<< "\nNo algorithms available for <cudnnConvolutionBackwardFilter> "
<< "under the current desc and workspace limit.";
#else
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
ctx()->cudnn_handle(),
output_desc_,
input_desc_,
conv_desc_,
filter_desc_,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
CUDNN_CONV_WORKSPACE_LIMIT_BYTES,
&bwd_filter_algo_));
#endif // CUDNN_VERSION_MIN(7, 0, 0)
}
if (dX->has_name()) {
#if CUDNN_VERSION_MIN(7, 0, 0)
{
int num_valid_algos;
constexpr int num_algos = CUDNN_CONV_NUM_BWD_DATA_ALGOS;
cudnnConvolutionBwdDataAlgoPerf_t stats[num_algos];
......@@ -390,7 +378,17 @@ void CuDNNConv2dGradientOp<Context>::ResetDesc() {
CHECK(algo_is_found)
<< "\nNo algorithms available for <cudnnConvolutionBackwardData> "
<< "under the current desc and workspace limit.";
}
#else
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
ctx()->cudnn_handle(),
output_desc_,
input_desc_,
conv_desc_,
filter_desc_,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
CUDNN_CONV_WORKSPACE_LIMIT_BYTES,
&bwd_filter_algo_));
CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
ctx()->cudnn_handle(),
filter_desc_,
......@@ -402,7 +400,6 @@ void CuDNNConv2dGradientOp<Context>::ResetDesc() {
&bwd_data_algo_));
#endif // CUDNN_VERSION_MIN(7, 0, 0)
}
}
cudnn_ws_nbytes_ = SIZE_MAX; // Request a new size
}
}
......
......@@ -287,7 +287,6 @@ template <class Context>
template <typename T>
void CuDNNConvTranspose2dGradientOp<Context>::ResetDesc() {
auto &X = Input(0), &W = Input(1), &dY = Input(-1);
auto *dX = Output(0), *dW = Output(1);
bool input_changed = (X.dims() != input_dims_);
bool filter_changed = (W.dims() != filter_dims_);
if (input_changed || filter_changed) {
......@@ -324,8 +323,8 @@ void CuDNNConvTranspose2dGradientOp<Context>::ResetDesc() {
exhaustive_search_data_ = true;
exhaustive_search_filter_ = true;
} else {
if (dW->has_name()) {
#if CUDNN_VERSION_MIN(7, 0, 0)
{
int num_valid_algos;
constexpr int num_algos = CUDNN_CONV_NUM_BWD_FILTER_ALGOS;
cudnnConvolutionBwdFilterAlgoPerf_t stats[num_algos];
......@@ -349,20 +348,8 @@ void CuDNNConvTranspose2dGradientOp<Context>::ResetDesc() {
CHECK(algo_is_found)
<< "\nNo algorithms available for <cudnnConvolutionBackwardFilter> "
<< "under the current desc and workspace limit.";
#else
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
ctx()->cudnn_handle(),
input_desc_,
output_desc_,
conv_desc_,
filter_desc_,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
CUDNN_CONV_WORKSPACE_LIMIT_BYTES,
&bwd_filter_algo_));
#endif // CUDNN_VERSION_MIN(7, 0, 0)
}
if (dX->has_name()) {
#if CUDNN_VERSION_MIN(7, 0, 0)
{
int num_valid_algos;
constexpr int num_algos = CUDNN_CONV_NUM_FWD_ALGOS;
cudnnConvolutionFwdAlgoPerf_t stats[num_algos];
......@@ -386,7 +373,17 @@ void CuDNNConvTranspose2dGradientOp<Context>::ResetDesc() {
CHECK(algo_is_found)
<< "\nNo algorithms available for <cudnnConvolutionForward> "
<< "under the current desc and workspace limit.";
}
#else
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
ctx()->cudnn_handle(),
input_desc_,
output_desc_,
conv_desc_,
filter_desc_,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
CUDNN_CONV_WORKSPACE_LIMIT_BYTES,
&bwd_filter_algo_));
CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
ctx()->cudnn_handle(),
input_desc_,
......@@ -398,7 +395,6 @@ void CuDNNConvTranspose2dGradientOp<Context>::ResetDesc() {
&bwd_data_algo_));
#endif // CUDNN_VERSION_MIN(7, 0, 0)
}
}
cudnn_ws_nbytes_ = SIZE_MAX; // Request a new size
}
}
......
......@@ -50,11 +50,11 @@ def set_optimization(level=1):
* level = ``0``: Do nothing.
* level = ``1``: Prune the redundant nodes.
* level = ``1``: Eliminate the unused outputs and operators.
* level = ``2``: Add the inplace to outputs.
* level = ``2``: Apply inplace to the inputs if available.
* level = ``3``: Allocate the buffer for outputs.
* level = ``3``: Allocate shared buffer for the outputs.
Parameters
----------
......
......@@ -78,7 +78,7 @@ class GradientMaker(object):
if not is_skip:
for input, grad_input in zip(op_def.input, grad_inputs):
inputs_to_grads[input] = grad_input
# Add def for ``GradientGenerateOp``
# Add ``GradientGenerateOp``
if len(gen_grads) > 0:
inputs, outputs, values = [], [], []
for name, i in gen_grads:
......@@ -94,7 +94,7 @@ class GradientMaker(object):
device_option=op_def.device_option
if op_def.HasField('device_option') else None)
backward_defs.append(gen_op)
# Add def for ``GradientOp``
# Add ``GradientOp``
for grad_def in grad_defs:
grad_def.name = OpDef.get_name()
backward_defs.append(grad_def)
......
......@@ -130,7 +130,7 @@ def affine(inputs, axis=1, num_axes=1, **kwargs):
return op_lib.blend(**args)
@OpSchema.num_inputs(1, 2147483647)
@OpSchema.num_inputs(1)
def axpby(inputs, outputs=None, alpha=1., beta=1., **kwargs):
r"""Compute the element-wise addition from input to output.
......@@ -140,10 +140,10 @@ def axpby(inputs, outputs=None, alpha=1., beta=1., **kwargs):
Parameters
----------
inputs : Union[dragon.Tensor, Sequence[dragon.Tensor]]
The input tensor(s).
outputs : Union[dragon.Tensor, Sequence[dragon.Tensor]], optional
The output tensor(s).
inputs : dragon.Tensor
The input tensor.
outputs : dragon.Tensor, optional
The output tensor.
alpha : number, optional, default=1.
The value to :math:`\alpha`.
beta : number, optional, default=1.
......@@ -151,23 +151,17 @@ def axpby(inputs, outputs=None, alpha=1., beta=1., **kwargs):
Returns
-------
Union[dragon.Tensor, Sequence[dragon.Tensor]]
The output tensor(s).
dragon.Tensor
The output tensor.
"""
args = parse_args(locals())
args['alpha'], args['beta'] = float(alpha), float(beta)
if types.is_tensor(inputs):
inputs = [inputs]
if outputs is not None and types.is_tensor(outputs):
args['outputs'] = [outputs]
op_lib = math_ops_lib.Axpby
if context.executing_eagerly():
return op_lib \
.instantiate(
alpha=args['alpha'],
beta=args['beta'],
).apply(inputs, args['outputs'])
.instantiate(alpha=args['alpha'], beta=args['beta']) \
.apply([inputs], [outputs])
else:
return op_lib.blend(**args)
......
......@@ -65,8 +65,7 @@ class _BatchNorm(Module):
.format(**self.__dict__)
def forward(self, input):
training = self.training or \
not self.track_running_stats
training = self.training or not self.track_running_stats
return F.batch_norm(
input, *self.inputs,
training=training,
......
......@@ -146,10 +146,8 @@ def cat(seq, dim=0, out=None):
"""
return _functions.Concat \
.instantiate(
seq[0].device,
axis=dim,
).apply(seq, out)
.instantiate(seq[0].device, axis=dim) \
.apply(seq, out)
def channel_normalize(
......@@ -618,10 +616,7 @@ def nonzero(input, out=None):
The output tensor.
"""
return _functions.NonZero \
.instantiate(
input.device,
).apply(input, out)
return _functions.NonZero.instantiate(input.device).apply(input, out)
def one_hot(input, depth):
......@@ -647,8 +642,7 @@ def one_hot(input, depth):
The output tensor.
"""
return _functions.OneHot \
.instantiate(input.device, depth=depth).apply(input)
return _functions.OneHot.instantiate(input.device, depth=depth).apply(input)
def permute(input, dims):
......@@ -715,18 +709,14 @@ def reshape(input, shape, out=None):
"""
shape = nest.flatten(shape)
return _functions.Reshape \
.instantiate(
input.device,
ndim=len(shape),
).apply(input, shape, out)
.instantiate(input.device, ndim=len(shape)) \
.apply(input, shape, out)
def slice(input, starts, sizes):
return _functions.Slice \
.instantiate(
input.device,
ndim=len(starts),
).apply(input, starts, sizes)
.instantiate(input.device, ndim=len(starts)) \
.apply(input, starts, sizes)
def split(tensor, split_size_or_sections, dim=0):
......@@ -1015,9 +1005,8 @@ def where(condition, x, y):
"""
return _functions.Where \
.instantiate(
utils.unify_devices([condition, x, y]),
).apply(condition, x, y)
.instantiate(utils.unify_devices([condition, x, y])) \
.apply(condition, x, y)
def _arg_reduce(input, op_type, dim=None, keepdim=False, out=None):
......
......@@ -567,10 +567,6 @@ def expand(self, *sizes):
dragon.vm.torch.Tensor
The output tensor.
See Also
--------
`torch.expand(...)`_
"""
return array_funcs.expand(self, sizes)
......
......@@ -51,6 +51,5 @@ class GradAccumulate(function.Function):
'arguments': {'alpha': 1., 'beta': float(self.momentum)},
}
def forward(self, grads):
outputs = [grad.id + '[accum]' for grad in grads]
return self.dispatch(grads, outputs, no_grad=True)
def forward(self, grad):
return self.dispatch([grad], [grad.id + '[accum]'], no_grad=True)
......@@ -14,18 +14,13 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.core.util import nest
from dragon.vm.torch.core.ops.training import _functions
def accumulate_grad(grads, momentum=1):
"""Accumulate the gradients."""
grads = nest.flatten(grads)
if len(grads) == 0:
return
def accumulate_grad(grad, momentum=1):
"""Accumulate the gradient."""
return _functions.GradAccumulate \
.instantiate(grads[0].device, momentum=momentum) \
.apply(grads)
.instantiate(grad.device, momentum=momentum).apply(grad)
def update_param(
......
......@@ -97,15 +97,13 @@ class Optimizer(object):
The momentum to the accumulated value.
"""
grads = []
current_ws = workspace.get_workspace()
for group in self.param_groups:
group['_internal/grad_accum'] = True
for param in group['params']:
grad = self._steal_grad(current_ws, param)
if grad is not None:
grads.append(grad)
training_funcs.accumulate_grad(grads, momentum)
training_funcs.accumulate_grad(grad)
def add_param_group(self, param_group):
"""Add a new param group into the optimizer.
......
......@@ -776,10 +776,6 @@ class Tensor(object):
dragon.vm.torch.Tensor
The output tensor.
See Also
--------
`torch.expand(...)`_
"""
def expand_as(self, other):
......@@ -795,10 +791,6 @@ class Tensor(object):
dragon.vm.torch.Tensor
The output tensor.
See Also
--------
`torch.expand(...)`_
"""
return self.expand(*other.size())
......
Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!