Commit 0ab14f30 by Ting PAN

Normalize the math notations in docstrings

Summary:
This commit normalizes the inconsistent math notations in docstrings.
1 parent 2598f4dc
Showing with 886 additions and 899 deletions
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The base layer class.""" """The base layer class."""
from __future__ import absolute_import from __future__ import absolute_import
......
...@@ -8,8 +8,7 @@ ...@@ -8,8 +8,7 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The common layers."""
"""The implementation of the common layers."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
...@@ -628,6 +627,10 @@ class Slice(Layer): ...@@ -628,6 +627,10 @@ class Slice(Layer):
class Softmax(Layer): class Softmax(Layer):
r"""Apply the softmax function. r"""Apply the softmax function.
The **Softmax** function is defined as:
.. math:: \text{Softmax}(x_{i}) = \frac{\exp(x_{i})}{\sum_{j} \exp(x_{j})}
Examples: Examples:
```python ```python
......
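The normalized Softmax notation above can be checked numerically; a minimal numpy sketch for illustration (not the layer's implementation):

```python
import numpy as np

def softmax(x, axis=-1):
    """Softmax(x_i) = exp(x_i) / sum_j exp(x_j) along an axis."""
    # Subtracting the max keeps exp() stable; it cancels in the ratio.
    z = np.exp(x - x.max(axis=axis, keepdims=True))
    return z / z.sum(axis=axis, keepdims=True)

print(softmax(np.array([1., 2., 3.])))  # ~[0.090, 0.245, 0.665]
```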
...@@ -8,8 +8,7 @@ ...@@ -8,8 +8,7 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The data layers."""
"""The implementation of the data layers."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
......
...@@ -8,8 +8,7 @@ ...@@ -8,8 +8,7 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The loss layers."""
"""The implementation of loss layers."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
......
...@@ -8,8 +8,7 @@ ...@@ -8,8 +8,7 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The neuron layers."""
"""The implementation of the neuron layers."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
...@@ -64,10 +63,10 @@ class ELU(Layer): ...@@ -64,10 +63,10 @@ class ELU(Layer):
.. math:: .. math::
\text{ELU}(x) = \text{ELU}(x) =
\begin{cases} \begin{cases}
x, & \text{ if } x \geq 0 \\ x, & \text{ if } x \geq 0 \\
\alpha * (e^{x} - 1), & \text{ otherwise } \alpha * (\exp(x) - 1), & \text{ otherwise }
\end{cases} \end{cases}
Examples: Examples:
...@@ -185,10 +184,10 @@ class ReLU(Layer): ...@@ -185,10 +184,10 @@ class ReLU(Layer):
.. math:: .. math::
\text{ReLU}(x) = \text{ReLU}(x) =
\begin{cases} \begin{cases}
x, & \text{ if } x \geq 0 \\ x, & \text{ if } x \geq 0 \\
0, & \text{ otherwise } 0, & \text{ otherwise }
\end{cases} \end{cases}
Examples: Examples:
...@@ -220,7 +219,7 @@ class Sigmoid(Layer): ...@@ -220,7 +219,7 @@ class Sigmoid(Layer):
The **Sigmoid** function is defined as: The **Sigmoid** function is defined as:
.. math:: \text{Sigmoid}(x) = \frac{1}{1 + e^{-x}} .. math:: \text{Sigmoid}(x) = \frac{1}{1 + \exp(-x)}
Examples: Examples:
...@@ -246,7 +245,7 @@ class TanH(Layer): ...@@ -246,7 +245,7 @@ class TanH(Layer):
The **Tanh** function is defined as: The **Tanh** function is defined as:
.. math:: \text{Tanh}(x) = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}} .. math:: \text{Tanh}(x) = \frac{\exp(x) - \exp(-x)}{\exp(x) + \exp(-x)}
Examples: Examples:
......
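For reference, the renotated exp-based forms of ELU, ReLU, Sigmoid and Tanh above can be written out directly in numpy; a sketch of the formulas only, not the layer implementations:

```python
import numpy as np

def elu(x, alpha=1.0):
    # ELU(x) = x if x >= 0 else alpha * (exp(x) - 1)
    return np.where(x >= 0, x, alpha * (np.exp(x) - 1))

def relu(x):
    # ReLU(x) = x if x >= 0 else 0
    return np.maximum(x, 0)

def sigmoid(x):
    # Sigmoid(x) = 1 / (1 + exp(-x))
    return 1 / (1 + np.exp(-x))

def tanh(x):
    # Tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
    return (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x))
```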
...@@ -8,8 +8,7 @@ ...@@ -8,8 +8,7 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The vision layers."""
"""The implementation of the vision layers."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
...@@ -184,8 +183,9 @@ class Pooling(Layer): ...@@ -184,8 +183,9 @@ class Pooling(Layer):
super(Pooling, self).__init__(layer_param) super(Pooling, self).__init__(layer_param)
param = layer_param.pooling_param param = layer_param.pooling_param
self.arguments = { self.arguments = {
'data_format': 'NCHW', 'ceil_mode': True,
'mode': {0: 'MAX', 1: 'AVG'}[param.pool], 'mode': {0: 'MAX', 1: 'AVG'}[param.pool],
'data_format': 'NCHW',
'global_pooling': param.global_pooling, 'global_pooling': param.global_pooling,
} }
if not param.HasField('kernel_h'): if not param.HasField('kernel_h'):
......
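The added ``'ceil_mode': True`` argument follows the Caffe pooling convention of rounding the output extent up rather than down; a hedged sketch of the standard arithmetic (not Dragon's internal code):

```python
import math

def pooled_size(dim, kernel, stride, pad, ceil_mode=True):
    """Output extent of a pooling window along one spatial axis."""
    value = (dim + 2 * pad - kernel) / stride
    return int(math.ceil(value) if ceil_mode else math.floor(value)) + 1

print(pooled_size(6, kernel=3, stride=2, pad=0, ceil_mode=True))   # 3
print(pooled_size(6, kernel=3, stride=2, pad=0, ceil_mode=False))  # 2
```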
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The base net class.""" """The base net class."""
from __future__ import absolute_import from __future__ import absolute_import
......
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The solver to update parameters.""" """The solver to update parameters."""
from __future__ import absolute_import from __future__ import absolute_import
......
...@@ -3,6 +3,12 @@ dragon.cuda ...@@ -3,6 +3,12 @@ dragon.cuda
.. only:: html .. only:: html
Classes
-------
`class Stream <cuda/Stream.html>`_
: The CUDA stream wrapper.
Functions Functions
--------- ---------
...@@ -10,10 +16,7 @@ dragon.cuda ...@@ -10,10 +16,7 @@ dragon.cuda
: Return the index of current selected device. : Return the index of current selected device.
`enable_cudnn(...) <cuda/enable_cudnn.html>`_ `enable_cudnn(...) <cuda/enable_cudnn.html>`_
: Activate the CuDNN engine. : Enable the CuDNN library.
`enable_cudnn_benchmark(...) <cuda/enable_cudnn_benckmark.html>`_
: Activate the CuDNN benchmark.
`get_device_capability(...) <cuda/get_device_capability.html>`_ `get_device_capability(...) <cuda/get_device_capability.html>`_
: Return the capability of specified device. : Return the capability of specified device.
...@@ -28,18 +31,18 @@ dragon.cuda ...@@ -28,18 +31,18 @@ dragon.cuda
: Set the current device. : Set the current device.
`synchronize(...) <cuda/synchronize.html>`_ `synchronize(...) <cuda/synchronize.html>`_
: Synchronize the specified cuda stream. : Synchronize a specified CUDA stream.
.. toctree:: .. toctree::
:hidden: :hidden:
cuda/current_device cuda/current_device
cuda/enable_cudnn cuda/enable_cudnn
cuda/enable_cudnn_benchmark
cuda/get_device_capability cuda/get_device_capability
cuda/is_available cuda/is_available
cuda/set_default_device cuda/set_default_device
cuda/set_device cuda/set_device
cuda/Stream
cuda/synchronize cuda/synchronize
.. raw:: html .. raw:: html
......
Stream
======
.. autoclass:: dragon.cuda.Stream
__init__
--------
.. automethod:: dragon.cuda.Stream.__init__
Properties
----------
ptr
###
.. autoattribute:: dragon.cuda.Stream.ptr
Methods
-------
synchronize
###########
.. automethod:: dragon.cuda.Stream.synchronize
.. raw:: html
<style>
h1:before {
content: "dragon.cuda.";
color: #103d3e;
}
</style>
enable_cudnn_benchmark
======================
.. autofunction:: dragon.cuda.enable_cudnn_benchmark
.. raw:: html
<style>
h1:before {
content: "dragon.cuda.";
color: #103d3e;
}
</style>
...@@ -139,7 +139,7 @@ dragon.math ...@@ -139,7 +139,7 @@ dragon.math
: Compute the sum value of elements along the given axis. : Compute the sum value of elements along the given axis.
`tanh(...) <math/tanh.html>`_ `tanh(...) <math/tanh.html>`_
: Compute the tanh result of input. : Compute the tanh of input.
.. toctree:: .. toctree::
:hidden: :hidden:
......
...@@ -100,7 +100,7 @@ vm.tensorflow.math ...@@ -100,7 +100,7 @@ vm.tensorflow.math
: Compute the reciprocal square root of input. : Compute the reciprocal square root of input.
`sigmoid(...) <math/sigmoid.html>`_ `sigmoid(...) <math/sigmoid.html>`_
: Apply the sigmoid function. : Compute the sigmoid function.
`sign(...) <math/sign.html>`_ `sign(...) <math/sign.html>`_
: Compute the sign indication of input. : Compute the sign indication of input.
...@@ -118,7 +118,7 @@ vm.tensorflow.math ...@@ -118,7 +118,7 @@ vm.tensorflow.math
: Compute the element-wise subtraction. : Compute the element-wise subtraction.
`tanh(...) <math/tanh.html>`_ `tanh(...) <math/tanh.html>`_
: Apply the tanh function. : Compute the tanh of input.
.. toctree:: .. toctree::
:hidden: :hidden:
......
...@@ -8,36 +8,29 @@ bool GraphGradientMaker::CheckGrad( ...@@ -8,36 +8,29 @@ bool GraphGradientMaker::CheckGrad(
const Set<string>& targets, const Set<string>& targets,
vector<pair<string, int>>& gen_grads) { vector<pair<string, int>>& gen_grads) {
if (NoGradientRegistry()->Has(op_def.type())) { if (NoGradientRegistry()->Has(op_def.type())) {
for (auto& input : op_def.input()) {
blacklist_set_.insert(input);
}
return true; return true;
} }
bool maybe_skip = false;
for (int i = 0; i < op_def.output_size(); ++i) { for (int i = 0; i < op_def.output_size(); ++i) {
const auto& output = op_def.output(i); const auto& output = op_def.output(i);
if (!inputs_to_grads_.count(output)) { if (!inputs_to_grads_.count(output)) {
if (blacklist_set_.count(output)) return true; maybe_skip = true;
if (targets.count(output)) { if (targets.count(output)) {
// Consider to generate virtual gradient for targets
gen_grads.push_back({output, i}); gen_grads.push_back({output, i});
inputs_to_grads_[output] = output + "_grad"; inputs_to_grads_[output] = output + "_grad";
} else if (op_def.output_size() == 1) {
return true; // We can skip this op, obviously
} }
} }
} }
// Pass check, even if missing some grads return maybe_skip && gen_grads.empty();
return false;
} }
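The rewritten return value changes the skip rule: an op is skipped only when some output still has no incoming gradient and no virtual gradient was seeded for a target. A simplified Python rendering of that predicate, using plain dict/set stand-ins rather than the C++ types:

```python
def check_grad(op_outputs, inputs_to_grads, targets):
    """Mirror the new rule: skip only if a grad is missing and none was generated."""
    maybe_skip, gen_grads = False, []
    for i, output in enumerate(op_outputs):
        if output not in inputs_to_grads:
            maybe_skip = True
            if output in targets:
                gen_grads.append((output, i))            # seed a virtual gradient
                inputs_to_grads[output] = output + '_grad'
    return maybe_skip and not gen_grads, gen_grads

print(check_grad(['y'], {'y': 'y_grad'}, set()))  # (False, [])  all grads present
print(check_grad(['y'], {}, {'y'}))               # (False, [('y', 0)])  target, seeded
print(check_grad(['y'], {}, set()))               # (True, [])  missing and not a target
```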
void GraphGradientMaker::Make( void GraphGradientMaker::Make(
const vector<OperatorDef*>& op_defs, const vector<OperatorDef*>& op_defs,
const vector<string>& targets, const vector<string>& targets,
const vector<string>& input_grads, const vector<string>& input_grads,
GraphDef& backward_def) { GraphDef& graph_def) {
Set<string> split_grads, targets_v2;
Map<string, int> inputs_count, grads_count; Map<string, int> inputs_count, grads_count;
Set<string> all_split_grads, targets_set;
Map<string, string> targets_to_grads;
// PLAY for the forward // PLAY for the forward
for (auto* op_def : op_defs) { for (auto* op_def : op_defs) {
...@@ -49,126 +42,118 @@ void GraphGradientMaker::Make( ...@@ -49,126 +42,118 @@ void GraphGradientMaker::Make(
input_in_outputs = true; input_in_outputs = true;
break; break;
} }
// Avoid to count the duplicate input(i.e. the in-place output) // Avoid to count the duplicate input (i.e. the in-place output)
if (!input_in_outputs) inputs_count[input]++; if (!input_in_outputs) inputs_count[input]++;
} }
} }
// PLAY for the backward // Set the gradient of targets
for (int i = 0; i < targets.size(); ++i) { for (int i = 0; i < targets.size(); ++i) {
// Set the gradient of targets
if (i < input_grads.size()) { if (i < input_grads.size()) {
inputs_to_grads_[targets[i]] = input_grads[i]; inputs_to_grads_[targets[i]] = input_grads[i];
} }
targets_set.insert(targets[i]); targets_v2.insert(targets[i]);
} }
// PLAY for the backward
for (int op_idx = (int)op_defs.size() - 1; op_idx >= 0; --op_idx) { for (int op_idx = (int)op_defs.size() - 1; op_idx >= 0; --op_idx) {
// Collect inputs and outputs, generate raw gradient ops const OperatorDef& op_def = *op_defs[op_idx];
const OperatorDef& op = *op_defs[op_idx]; // Generate def by registered gradient maker
vector<pair<string, int>> gen_grads; vector<pair<string, int>> gen_grads;
bool is_skip = CheckGrad(op, targets_set, gen_grads); vector<string> grad_outputs;
vector<string> g_outputs; bool is_skip = CheckGrad(op_def, targets_v2, gen_grads);
for (const auto& output : op.output()) { for (const auto& output : op_def.output()) {
string g_output = ""; string grad_output = "";
if (inputs_to_grads_.count(output) > 0) { const auto& it = inputs_to_grads_.find(output);
g_output = inputs_to_grads_[output]; if (it != inputs_to_grads_.end()) grad_output = it->second;
} grad_outputs.push_back(grad_output);
g_outputs.push_back(g_output);
} }
auto grad = MakeGradientForOp(op, g_outputs); auto pack = MakeGradientForOp(op_def, grad_outputs);
// Split and gather gradient for multi-used inputs
// Process the raw gradient ops vector<OperatorDef> gather_defs;
vector<OperatorDef> gather_ops; for (auto& grad_def : pack.grad_defs) {
for (auto& grad_op : grad.ops) { if (!grad_def.has_name()) {
// Set op name grad_def.set_name(GetOperatorName());
if (!grad_op.has_name()) grad_op.set_name(GetOperatorName()); }
// Split and gather gradients for multi-used input for (int i = 0; i < grad_def.output_size(); ++i) {
for (int i = 0; i < grad_op.output_size(); ++i) { const auto& grad_name = grad_def.output(i);
auto* output = grad_op.mutable_output(i); int original_index = -1;
int original_idx = -1; for (int j = 0; j < pack.grad_inputs.size(); ++j) {
for (int j = 0; j < grad.g_inputs.size(); ++j) { if (grad_name == pack.grad_inputs[j]) {
if (grad_op.output(i) == grad.g_inputs[j]) { original_index = j;
original_idx = j;
} }
} }
// Ignore unused && in-place GI if (original_index == -1) continue;
if (original_idx == -1) continue;
bool output_in_inputs = false; bool output_in_inputs = false;
for (const auto& input : grad_op.input()) { for (const auto& name : grad_def.input()) {
if (grad_op.output(i) == input) { if (grad_name == name) {
output_in_inputs = true; output_in_inputs = true;
break;
} }
} }
if (output_in_inputs) continue; if (output_in_inputs) continue;
// Find a split branch // Detect a split branch
const auto& original_name = op.input(original_idx); const auto& original_name = op_def.input(original_index);
if (inputs_count[original_name] > 1) { if (inputs_count[original_name] > 1) {
// Split auto grad_name_v2 =
auto split_name = grad_name + "_autosplit_" + str::to(grads_count[grad_name]++);
*output + "_autosplit_" + str::to(grads_count[*output]++); if (!is_skip) split_grads.insert(grad_name_v2);
if (!is_skip) all_split_grads.insert(split_name); if (grads_count[grad_name] == inputs_count[original_name]) {
// Gather auto gather_def = MakeOperatorDef(
if (grads_count[*output] == inputs_count[original_name]) { "GradientGather",
OperatorDef gather_op; GetOperatorName(),
gather_op.set_name(GetOperatorName()); vector<string>({}),
gather_op.set_type("GradientGather"); vector<string>({grad_name}));
gather_op.add_output(*output); if (grad_def.has_device_option()) {
if (grad_op.has_device_option()) { gather_def.mutable_device_option()->CopyFrom(
gather_op.mutable_device_option()->CopyFrom( grad_def.device_option());
grad_op.device_option());
} }
for (int j = 0; j < grads_count[*output]; j++) { for (int j = 0; j < grads_count[grad_name]; j++) {
auto key = *output + "_autosplit_" + str::to(j); auto name = grad_name + "_autosplit_" + str::to(j);
if (all_split_grads.count(key)) gather_op.add_input(key); if (split_grads.count(name)) gather_def.add_input(name);
} }
gather_ops.push_back(gather_op); gather_defs.push_back(gather_def);
} }
*output = split_name; *grad_def.mutable_output(i) = grad_name_v2;
} }
} }
} }
// Now, append the required ops // Add defs
if (!is_skip) { if (!is_skip) {
// GradientGenerateOp for (int i = 0; i < op_def.input_size(); ++i) {
inputs_to_grads_[op_def.input(i)] = pack.grad_inputs[i];
}
// Add def for ``GradientGenerateOp``
if (gen_grads.size() > 0) { if (gen_grads.size() > 0) {
vector<string> op_inputs, op_outputs; vector<string> inputs, outputs;
Argument arg_defaults; Argument arg_defaults;
arg_defaults.set_name("defaults"); arg_defaults.set_name("defaults");
for (auto& gen_grad : gen_grads) { for (auto& gen_grad : gen_grads) {
op_inputs.push_back(gen_grad.first); inputs.push_back(gen_grad.first);
op_outputs.emplace_back(gen_grad.first + "_grad"); outputs.emplace_back(gen_grad.first + "_grad");
arg_defaults.add_floats(grad.defaults[gen_grad.second]); arg_defaults.add_floats(pack.defaults[gen_grad.second]);
} }
auto generate_op = MakeOperatorDef( auto generate_def = MakeOperatorDef(
"GradientGenerate", "GradientGenerate",
GetOperatorName(), GetOperatorName(),
op_inputs, inputs,
op_outputs, outputs,
vector<Argument>({arg_defaults})); vector<Argument>({arg_defaults}));
if (op.has_device_option()) { if (op_def.has_device_option()) {
generate_op.mutable_device_option()->CopyFrom(op.device_option()); generate_def.mutable_device_option()->CopyFrom(
op_def.device_option());
} }
backward_def.add_op()->CopyFrom(generate_op); graph_def.add_op()->CopyFrom(generate_def);
} }
// GradientOp // Add def for ``GradientOp``
for (const auto& grad_op : grad.ops) { for (const auto& grad_def : pack.grad_defs) {
backward_def.add_op()->CopyFrom(grad_op); graph_def.add_op()->CopyFrom(grad_def);
} }
} }
// Add def for ``GradientGatherOp``
// GradientGatherOp for (const auto& gather_def : gather_defs) {
for (const auto& gather_op : gather_ops) { graph_def.add_op()->CopyFrom(gather_def);
backward_def.add_op()->CopyFrom(gather_op);
}
// Done
if (!is_skip) {
for (int i = 0; i < op.input_size(); ++i) {
if (!grad.g_inputs[i].empty())
inputs_to_grads_[op.input(i)] = grad.g_inputs[i];
}
} }
} }
} }
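When a forward blob is consumed by several ops, each of its gradient contributions is redirected to a ``<grad>_autosplit_<k>`` buffer, and a ``GradientGather`` op sums the splits back once the last contribution arrives. A minimal sketch of that bookkeeping with hypothetical names and dict stand-ins for the defs:

```python
from collections import defaultdict

inputs_count = {'x': 2}            # 'x' is consumed by two forward ops
grads_count = defaultdict(int)
split_grads, gather_defs = set(), []

def route_grad(grad_name, original_name):
    """Rename a produced gradient to an autosplit buffer; gather when complete."""
    name_v2 = '%s_autosplit_%d' % (grad_name, grads_count[grad_name])
    split_grads.add(name_v2)
    grads_count[grad_name] += 1
    if grads_count[grad_name] == inputs_count[original_name]:
        gather_defs.append({'type': 'GradientGather',
                            'inputs': sorted(split_grads),
                            'outputs': [grad_name]})
    return name_v2

print(route_grad('x_grad', 'x'))   # x_grad_autosplit_0
print(route_grad('x_grad', 'x'))   # x_grad_autosplit_1
print(gather_defs[0]['outputs'])   # ['x_grad']
```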
...@@ -261,7 +246,6 @@ GraphDef GraphGradientMaker::Share(const GraphDef& input_def) { ...@@ -261,7 +246,6 @@ GraphDef GraphGradientMaker::Share(const GraphDef& input_def) {
auto* op = output_def.mutable_op(op_idx); auto* op = output_def.mutable_op(op_idx);
// Ignore the non-gradient ops // Ignore the non-gradient ops
if (!str::find(op->type(), "Gradient")) continue; if (!str::find(op->type(), "Gradient")) continue;
// Check if output is an alias of input // Check if output is an alias of input
vec32_t inplace_flags; vec32_t inplace_flags;
for (int i = 0; i < op->output_size(); ++i) { for (int i = 0; i < op->output_size(); ++i) {
...@@ -273,11 +257,9 @@ GraphDef GraphGradientMaker::Share(const GraphDef& input_def) { ...@@ -273,11 +257,9 @@ GraphDef GraphGradientMaker::Share(const GraphDef& input_def) {
} }
inplace_flags.emplace_back(flag); inplace_flags.emplace_back(flag);
} }
// Besides, we need to collect the dead buffers // Besides, we need to collect the dead buffers
// Reuse them when current operator is done // Reuse them when current operator is done
vector<string> dead_buffers; vector<string> dead_buffers;
// Rewrite input gradients // Rewrite input gradients
for (int i = 0; i < op->input_size(); ++i) { for (int i = 0; i < op->input_size(); ++i) {
const string& input = op->input(i); const string& input = op->input(i);
...@@ -291,7 +273,6 @@ GraphDef GraphGradientMaker::Share(const GraphDef& input_def) { ...@@ -291,7 +273,6 @@ GraphDef GraphGradientMaker::Share(const GraphDef& input_def) {
*op->mutable_input(i) = new_input; *op->mutable_input(i) = new_input;
} }
} }
// Rewrite output gradients // Rewrite output gradients
for (int i = 0; i < op->output_size(); ++i) { for (int i = 0; i < op->output_size(); ++i) {
if (str::startswith(op->type(), "Python")) continue; if (str::startswith(op->type(), "Python")) continue;
...@@ -313,7 +294,6 @@ GraphDef GraphGradientMaker::Share(const GraphDef& input_def) { ...@@ -313,7 +294,6 @@ GraphDef GraphGradientMaker::Share(const GraphDef& input_def) {
} }
*op->mutable_output(i) = new_output; *op->mutable_output(i) = new_output;
} }
// Update the pool // Update the pool
for (auto& buffer : dead_buffers) { for (auto& buffer : dead_buffers) {
pool.emplace_back(buffer); pool.emplace_back(buffer);
......
...@@ -19,14 +19,14 @@ namespace dragon { ...@@ -19,14 +19,14 @@ namespace dragon {
class DRAGON_API GraphGradientMaker { class DRAGON_API GraphGradientMaker {
public: public:
/*! \brief Generate a backward graph from the forward ops */ /*! \brief Generate graph def from the op defs */
void Make( void Make(
const vector<OperatorDef*>& op_defs, const vector<OperatorDef*>& op_defs,
const vector<string>& targets, const vector<string>& targets,
const vector<string>& input_grads, const vector<string>& input_grads,
GraphDef& graph_def); GraphDef& graph_def);
/*! \brief Rewrite a graph to share the intermediate grads */ /*! \brief Rewrite graph def to share the intermediate grads */
GraphDef Share(const GraphDef& input_def); GraphDef Share(const GraphDef& input_def);
/*! \brief Add an empty gradient */ /*! \brief Add an empty gradient */
...@@ -45,7 +45,7 @@ class DRAGON_API GraphGradientMaker { ...@@ -45,7 +45,7 @@ class DRAGON_API GraphGradientMaker {
} }
private: private:
/*! \brief Check the missing grads of backward procedure */ /*! \brief Check the missing grads */
bool CheckGrad( bool CheckGrad(
const OperatorDef& op_def, const OperatorDef& op_def,
const Set<string>& targets, const Set<string>& targets,
...@@ -60,9 +60,6 @@ class DRAGON_API GraphGradientMaker { ...@@ -60,9 +60,6 @@ class DRAGON_API GraphGradientMaker {
/*! \brief The mapping from input to grad */ /*! \brief The mapping from input to grad */
Map<string, string> inputs_to_grads_; Map<string, string> inputs_to_grads_;
/*! \brief The non-gradient outputs */
Set<string> blacklist_set_;
/*! \brief The gradients should be retained */ /*! \brief The gradients should be retained */
Set<string> retained_grads_; Set<string> retained_grads_;
......
...@@ -202,33 +202,36 @@ OperatorBase* NewOperator(const OperatorDef& def, Workspace* ws) { ...@@ -202,33 +202,36 @@ OperatorBase* NewOperator(const OperatorDef& def, Workspace* ws) {
return TryCreateOperator(def.type(), mutable_def, ws); return TryCreateOperator(def.type(), mutable_def, ws);
} }
Gradient MakeGradientForOp( GradientPack MakeGradientForOp(
const OperatorDef& def, const OperatorDef& def,
const vector<string>& g_outputs) { const vector<string>& grad_outputs) {
unique_ptr<GradientMakerBase> maker( CHECK(GradientRegistry()->Has(def.type()))
GradientRegistry()->Create(def.type(), def, g_outputs)); << "\nNo GradientMaker registered for " << def.type() << "Op.";
if (maker.get() == nullptr)
LOG(FATAL) << "Gradient maker for operator " << def.type()
<< "not implemented.";
Gradient grad = maker->Make();
OperatorDef reference_def(def); OperatorDef reference_def(def);
// Set the cache key unique_ptr<GradientMakerBase> maker(
GradientRegistry()->Create(def.type(), def, grad_outputs));
GradientPack pack = maker->Make();
// Copy cache key
if (reference_def.has_cache_key()) { if (reference_def.has_cache_key()) {
for (int i = 0; i < grad.ops.size(); ++i) { for (int i = 0; i < pack.grad_defs.size(); ++i) {
grad.ops[i].set_cache_key( pack.grad_defs[i].set_cache_key(
reference_def.cache_key() + "/grad" + reference_def.cache_key() + "/grad" +
(i > 0 ? (":" + str::to(i)) : "")); (i > 0 ? (":" + str::to(i)) : ""));
} }
} }
// Copy device option and arguments // Copy device option and arguments
if (maker->CopyDeviceOption() && def.has_device_option()) if (maker->CopyDeviceOption() && def.has_device_option()) {
for (auto& grad_def : grad.ops) for (auto& grad_def : pack.grad_defs) {
grad_def.mutable_device_option()->CopyFrom(def.device_option()); grad_def.mutable_device_option()->CopyFrom(def.device_option());
}
}
// Copy arguments // Copy arguments
if (maker->CopyArguments() && def.arg_size()) if (maker->CopyArguments() && def.arg_size()) {
for (auto& grad_def : grad.ops) for (auto& grad_def : pack.grad_defs) {
grad_def.mutable_arg()->MergeFrom(reference_def.arg()); grad_def.mutable_arg()->MergeFrom(reference_def.arg());
return grad; }
}
return pack;
} }
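When the forward def carries a cache key, every grad def now inherits it with a ``/grad`` suffix plus an index for each def beyond the first; a small sketch of the resulting keys (the cache key here is hypothetical, the suffix rule follows the code above):

```python
def grad_cache_keys(cache_key, num_grad_defs):
    """'<key>/grad', '<key>/grad:1', '<key>/grad:2', ..."""
    return [cache_key + '/grad' + (':%d' % i if i > 0 else '')
            for i in range(num_grad_defs)]

print(grad_cache_keys('Conv2d/1', 3))
# ['Conv2d/1/grad', 'Conv2d/1/grad:1', 'Conv2d/1/grad:2']
```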
/* Operator Registry */ /* Operator Registry */
......
...@@ -20,22 +20,22 @@ ...@@ -20,22 +20,22 @@
namespace dragon { namespace dragon {
struct Gradient { struct GradientPack {
Gradient( GradientPack(
const vector<OperatorDef>& ops, const vector<OperatorDef>& grad_defs,
const vector<string>& g_inputs, const vector<string>& grad_inputs,
const vector<float>& defaults) const vector<float>& defaults)
: ops(ops), g_inputs(g_inputs), defaults(defaults) {} : grad_defs(grad_defs), grad_inputs(grad_inputs), defaults(defaults) {}
vector<OperatorDef> ops; vector<OperatorDef> grad_defs;
vector<string> g_inputs; vector<string> grad_inputs;
vector<float> defaults; vector<float> defaults;
}; };
class GradientMakerBase { class GradientMakerBase {
public: public:
GradientMakerBase(const OperatorDef& def, const vector<string>& g_outputs) GradientMakerBase(const OperatorDef& def, const vector<string>& grad_outputs)
: def(def), g_inputs_(def.input_size()), g_outputs_(g_outputs) {} : def(def), grad_inputs_(def.input_size()), grad_outputs_(grad_outputs) {}
virtual ~GradientMakerBase() {} virtual ~GradientMakerBase() {}
...@@ -49,21 +49,23 @@ class GradientMakerBase { ...@@ -49,21 +49,23 @@ class GradientMakerBase {
return true; return true;
} }
virtual Gradient Make() { virtual GradientPack Make() {
auto new_defs = MakeDef(); auto new_defs = MakeDef();
if (def.has_cache_key()) { if (def.has_cache_key()) {
// Attach the handle to name if having cache key // Attach the handle to name if having cache key
for (size_t i = 0; i < new_defs.size(); i++) for (size_t i = 0; i < new_defs.size(); i++) {
new_defs[i].set_name(def.name()); new_defs[i].set_name(def.name());
}
} else { } else {
// Otherwise, just put it into the arguments // Otherwise, just put it into the arguments
Argument arg; Argument arg;
arg.set_name("handle"); arg.set_name("handle");
arg.set_s(def.name()); arg.set_s(def.name());
for (size_t i = 0; i < new_defs.size(); i++) for (size_t i = 0; i < new_defs.size(); i++) {
new_defs[i].add_arg()->CopyFrom(arg); new_defs[i].add_arg()->CopyFrom(arg);
}
} }
return Gradient(new_defs, g_inputs_, defaults()); return GradientPack(new_defs, grad_inputs_, defaults());
}; };
virtual vector<OperatorDef> MakeDef() { virtual vector<OperatorDef> MakeDef() {
...@@ -84,26 +86,26 @@ class GradientMakerBase { ...@@ -84,26 +86,26 @@ class GradientMakerBase {
} }
string GI(const int i) { string GI(const int i) {
if (i >= int(g_inputs_.size())) return ""; if (i >= int(grad_inputs_.size())) return "";
g_inputs_[i] = def.input(i) + "_grad"; grad_inputs_[i] = def.input(i) + "_grad";
return g_inputs_[i]; return grad_inputs_[i];
} }
const string GO(const int i) const { const string GO(const int i) const {
return i < int(g_outputs_.size()) ? g_outputs_[i] : ""; return i < int(grad_outputs_.size()) ? grad_outputs_[i] : "";
} }
virtual vector<float> defaults() { virtual vector<float> defaults() {
return vector<float>(g_outputs_.size(), 1.f); return vector<float>(grad_outputs_.size(), 1.f);
} }
protected: protected:
const OperatorDef& def; const OperatorDef& def;
vector<string> g_inputs_; vector<string> grad_inputs_;
const vector<string>& g_outputs_; const vector<string>& grad_outputs_;
}; };
DRAGON_API Gradient DRAGON_API GradientPack
MakeGradientForOp(const OperatorDef& op_def, const vector<string>& g_outputs); MakeGradientForOp(const OperatorDef& op_def, const vector<string>& g_outputs);
#define GRADIENT_MAKER_CTOR(name) \ #define GRADIENT_MAKER_CTOR(name) \
......
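The renamed ``grad_inputs_``/``grad_outputs_`` members keep the existing GI/GO convention: GI(i) names the gradient of the i-th forward input as ``<input>_grad``, while GO(i) returns the incoming gradient of the i-th output or an empty string when it is missing. A hedged Python rendering of that convention:

```python
def GI(op_inputs, i):
    """Gradient name for the i-th input: '<input>_grad'."""
    return op_inputs[i] + '_grad' if i < len(op_inputs) else ''

def GO(grad_outputs, i):
    """Incoming gradient for the i-th output, or '' when it is missing."""
    return grad_outputs[i] if i < len(grad_outputs) else ''

print(GI(['x', 'w'], 1))   # w_grad
print(GO(['y_grad'], 0))   # y_grad
print(GO(['y_grad'], 1))   # '' (missing gradient)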
...@@ -24,7 +24,7 @@ class Registry { ...@@ -24,7 +24,7 @@ class Registry {
ObjType* Create(const SrcType& key, Args... args) { ObjType* Create(const SrcType& key, Args... args) {
CHECK(registry_.count(key)) CHECK(registry_.count(key))
<< "\nKey(" << key << ") has not registered yet."; << "\nKey(" << key << ") has not registered.";
return registry_[key](args...); return registry_[key](args...);
} }
......
...@@ -23,19 +23,18 @@ namespace autograd { ...@@ -23,19 +23,18 @@ namespace autograd {
void RegisterModule(py::module& m) { void RegisterModule(py::module& m) {
m.def( m.def(
"CreateGradientDefs", "CreateGradientDef",
[](const string& forward_def, const vector<string>& g_outputs) { [](const string& def_str, const vector<string>& grad_outputs) {
OperatorDef def; OperatorDef def;
if (!def.ParseFromString(forward_def)) CHECK(def.ParseFromString(def_str))
LOG(FATAL) << "Failed to parse the OperatorDef."; << "\nFailed to parse the OperatorDef.";
if (!GradientRegistry()->Has(def.type())) GradientPack pack = MakeGradientForOp(def, grad_outputs);
LOG(FATAL) << def.type() << "Op has no gradients."; vector<py::bytes> grad_defs;
Gradient grad = MakeGradientForOp(def, g_outputs); for (const auto& op_def : pack.grad_defs) {
vector<py::bytes> grad_ops; grad_defs.push_back(op_def.SerializeAsString());
for (const auto& e : grad.ops) }
grad_ops.push_back(e.SerializeAsString());
return std::tuple<vector<py::bytes>, vector<string>, vector<float>>( return std::tuple<vector<py::bytes>, vector<string>, vector<float>>(
grad_ops, grad.g_inputs, grad.defaults); grad_defs, pack.grad_inputs, pack.defaults);
}); });
} }
......
...@@ -96,16 +96,11 @@ void RegisterModule(py::module& m) { ...@@ -96,16 +96,11 @@ void RegisterModule(py::module& m) {
}); });
/*! \brief Activate the CuDNN engine */ /*! \brief Activate the CuDNN engine */
m.def("cudaEnableDNN", [](bool enabled) { m.def("cudaEnableDNN", [](bool enabled, bool benchmark) {
#ifdef USE_CUDA #ifdef USE_CUDA
CUDAContext::object()->cudnn_enabled_ = enabled; auto* cuda_object = CUDAContext::object();
#endif cuda_object->cudnn_enabled_ = enabled;
}); cuda_object->cudnn_benchmark_ = benchmark;
/*! \brief Activate the CuDNN benchmark */
m.def("cudaEnableDNNBenchmark", [](bool enabled) {
#ifdef USE_CUDA
CUDAContext::object()->cudnn_benchmark_ = enabled;
#endif #endif
}); });
......
...@@ -15,11 +15,11 @@ from __future__ import print_function as _print_function ...@@ -15,11 +15,11 @@ from __future__ import print_function as _print_function
from dragon.core.device.cuda import current_device from dragon.core.device.cuda import current_device
from dragon.core.device.cuda import enable_cudnn from dragon.core.device.cuda import enable_cudnn
from dragon.core.device.cuda import enable_cudnn_benchmark
from dragon.core.device.cuda import get_device_capability from dragon.core.device.cuda import get_device_capability
from dragon.core.device.cuda import is_available from dragon.core.device.cuda import is_available
from dragon.core.device.cuda import set_default_device from dragon.core.device.cuda import set_default_device
from dragon.core.device.cuda import set_device from dragon.core.device.cuda import set_device
from dragon.core.device.cuda import Stream
from dragon.core.device.cuda import synchronize from dragon.core.device.cuda import synchronize
__all__ = [_s for _s in dir() if not _s.startswith('_')] __all__ = [_s for _s in dir() if not _s.startswith('_')]
...@@ -8,8 +8,7 @@ ...@@ -8,8 +8,7 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""List the exported C++ API."""
"""List the exported CXX API."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
......
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Define the options for autograph utilities.""" """Define the options for autograph utilities."""
from __future__ import absolute_import from __future__ import absolute_import
......
...@@ -12,7 +12,6 @@ ...@@ -12,7 +12,6 @@
# <https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/eager/def_function.py> # <https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/eager/def_function.py>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Utilities to define a graph function with decorator.""" """Utilities to define a graph function with decorator."""
from __future__ import absolute_import from __future__ import absolute_import
...@@ -267,7 +266,7 @@ class FunctionGuard(object): ...@@ -267,7 +266,7 @@ class FunctionGuard(object):
executables = [function_lib.create_function(inputs, outputs)] executables = [function_lib.create_function(inputs, outputs)]
for obj in dummies: for obj in dummies:
if isinstance(obj, optimizer.Optimizer): if isinstance(obj, optimizer.Optimizer):
executables.append(function_lib.create_function(updater=obj)) executables.append(function_lib.create_function(optimizer=obj))
self.inputs = inputs self.inputs = inputs
self.outputs = returns self.outputs = returns
self.executables = executables self.executables = executables
......
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Translate the graph abstraction to a python function.""" """Translate the graph abstraction to a python function."""
from __future__ import absolute_import from __future__ import absolute_import
...@@ -288,7 +287,7 @@ class Function(object): ...@@ -288,7 +287,7 @@ class Function(object):
if len(kwargs) > 0 else self.callback(*args) if len(kwargs) > 0 else self.callback(*args)
def create_function(inputs=None, outputs=None, givens=None, updater=None): def create_function(inputs=None, outputs=None, givens=None, optimizer=None):
"""Create a callable graph from specified outputs. """Create a callable graph from specified outputs.
Tensors that catch any operators can be used to create a graph: Tensors that catch any operators can be used to create a graph:
...@@ -325,37 +324,36 @@ def create_function(inputs=None, outputs=None, givens=None, updater=None): ...@@ -325,37 +324,36 @@ def create_function(inputs=None, outputs=None, givens=None, updater=None):
bar = dragon.create_function(outputs=y, givens={x: x2}) bar = dragon.create_function(outputs=y, givens={x: x2})
``` ```
Specify ``updater`` to make a graph applying SGD updates: Specify ``optimizer`` to make a graph applying parameter updates:
```python ```python
x = dragon.Tensor('x', dtype='float32').set_value(1) x = dragon.Tensor('x', dtype='float32').set_value(1)
x_grad = dragon.Tensor('x_grad', dtype='float32').set_value(1) x_grad = dragon.Tensor('x_grad', dtype='float32').set_value(1)
# Define a updater to catch the operators optimizer = dragon.optimizers.SGD(base_lr=0.01)
updater = dragon.updaters.SGD(base_lr=0.01) optimizer.apply_gradients(values_and_grads=[(x, x_grad)])
updater.apply_gradients(values_and_grads=[(x, x_grad)])
# Compute x -= 0.01 * x_grad # Compute x -= 0.01 * x_grad
train_step = dragon.create_function(updater=updater) train_step = dragon.create_function(optimizer=optimizer)
train_step() train_step()
print(x.get_value()) print(x.get_value()) # 0.99
``` ```
Parameters Parameters
---------- ----------
inputs : Sequence[dragon.Tensor], optional inputs : Sequence[dragon.Tensor], optional
The inputs to feed. The input tensors.
outputs : Sequence[dragon.Tensor], optional outputs : Sequence[dragon.Tensor], optional
The outputs to fetch. The output tensors.
givens : Dict[dragon.Tensor, dragon.Tensor], optional givens : Dict[dragon.Tensor, dragon.Tensor], optional
The substitutions to apply. The optional substitutions.
updater : Updater, optional optimizer : dragon.optimizers.Optimizer, optional
The optional updater. The optional optimizer.
Returns Returns
------- -------
Function callable
The callable function. The callable function.
""" """
return Function().create(inputs, outputs, givens, updater) return Function().create(inputs, outputs, givens, optimizer)
...@@ -12,8 +12,7 @@ ...@@ -12,8 +12,7 @@
# <https://github.com/caffe2/caffe2/blob/master/caffe2/python/core.py> # <https://github.com/caffe2/caffe2/blob/master/caffe2/python/core.py>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Simple gradient maker implementation."""
"""Python-implemented gradient maker."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
...@@ -28,47 +27,38 @@ from dragon.core.proto import dragon_pb2 ...@@ -28,47 +27,38 @@ from dragon.core.proto import dragon_pb2
class GradientMaker(object): class GradientMaker(object):
"""Make def for the gradient based on rules.""" """The maker to generate grad defs to run backward."""
@classmethod @classmethod
def gen_def(cls, op_def, g_outputs): def gen_def(cls, op_def, grad_outputs):
"""Generate the OperatorDef from forward op.""" """Generate the grad def."""
grad_defs, g_inputs, defaults = backend.CreateGradientDefs( grad_defs, grad_inputs, defaults = backend.CreateGradientDef(
op_def.SerializeToString(), g_outputs) op_def.SerializeToString(), grad_outputs)
for i, grad_def in enumerate(grad_defs): for i, grad_def in enumerate(grad_defs):
new_def = dragon_pb2.OperatorDef() new_def = dragon_pb2.OperatorDef()
new_def.ParseFromString(grad_def) new_def.ParseFromString(grad_def)
grad_defs[i] = new_def grad_defs[i] = new_def
return grad_defs, g_inputs, defaults return grad_defs, grad_inputs, defaults
@classmethod @classmethod
def check(cls, op_def, inputs_to_grads, blacklist, targets): def check(cls, op_def, inputs_to_grads, targets):
"""Check if missing gradients. If missing, skip.""" """Check if missing gradients. If missing, skip."""
if op_def.type in backend.NO_GRADIENT_OPERATORS: if op_def.type in backend.NO_GRADIENT_OPERATORS:
for input in op_def.input: return True, []
blacklist.add(input) gen_grads, maybe_skip = [], False
return True, None for i, output in enumerate(op_def.output):
gen_grads = []
for idx, output in enumerate(op_def.output):
if output not in inputs_to_grads: if output not in inputs_to_grads:
if output in blacklist: maybe_skip = True
return True, gen_grads
if output in targets: if output in targets:
# Consider to generate virtual gradient for targets. gen_grads.append((output, i))
gen_grads.append((output, idx))
inputs_to_grads[output] = output + '_grad' inputs_to_grads[output] = output + '_grad'
elif len(op_def.output) == 1: return maybe_skip and len(gen_grads) == 0, gen_grads
# We can skip this op, obviously.
return True, gen_grads
# Pass, even if missing some grads.
return False, gen_grads
@classmethod @classmethod
def make(cls, op_defs, targets, input_grads=None): def make(cls, op_defs, targets, input_grads=None):
"""Make the backward op defs.""" """Make the grad defs."""
inputs_to_grads = {} if input_grads is None else input_grads inputs_to_grads = {} if input_grads is None else input_grads
inputs_count, grads_count = defaultdict(int), defaultdict(int) inputs_count, grads_count = defaultdict(int), defaultdict(int)
all_split_grads, blacklist = set(), set()
# PLAY for the forward. # PLAY for the forward.
for op_def in op_defs: for op_def in op_defs:
...@@ -77,89 +67,71 @@ class GradientMaker(object): ...@@ -77,89 +67,71 @@ class GradientMaker(object):
outputs = [output for output in op_def.output] outputs = [output for output in op_def.output]
for input in op_def.input: for input in op_def.input:
if input not in outputs: if input not in outputs:
# Avoid to count the duplicate input, # Avoid to count the duplicate input (i.e. the in-place output).
# (i.e. the in-place output).
inputs_count[input] += 1 inputs_count[input] += 1
# PLAY for the backward. # PLAY for the backward.
backward_defs = [] backward_defs, split_grads = [], set()
for op_def in op_defs[::-1]: for op_def in op_defs[::-1]:
# Collect inputs and outputs. # Generate def by registered gradient maker.
is_skip, gen_grads = cls.check( is_skip, gen_grads = cls.check(op_def, inputs_to_grads, targets)
op_def=op_def, grad_outputs = [inputs_to_grads.get(name, '') for name in op_def.output]
inputs_to_grads=inputs_to_grads, grad_defs, grad_inputs, defaults = cls.gen_def(op_def, grad_outputs)
blacklist=blacklist,
targets=targets,
)
# Missing grads are represented as ``None``.
g_outputs = [inputs_to_grads.get(name, '') for name in op_def.output]
grad_defs, g_inputs, defaults = cls.gen_def(op_def, g_outputs)
# Append operators. # Add defs.
if not is_skip: if not is_skip:
# GradientGenerateOp for input, grad_input in zip(op_def.input, grad_inputs):
inputs_to_grads[input] = grad_input
# Add def for ``GradientGenerateOp``
if len(gen_grads) > 0: if len(gen_grads) > 0:
op_inputs, op_outputs, values = [], [], [] inputs, outputs, values = [], [], []
for item in gen_grads: for name, i in gen_grads:
op_inputs.append(item[0]) inputs.append(name)
op_outputs.append(item[0] + '_grad') outputs.append(name + '_grad')
values.append(defaults[item[1]]) values.append(defaults[i])
gen_op = proto_util.make_operator_def( gen_op = proto_util.make_operator_def(
name=OpDef.get_name(), name=OpDef.get_name(),
op_type='GradientGenerate', op_type='GradientGenerate',
inputs=op_inputs, inputs=inputs,
outputs=op_outputs, outputs=outputs,
defaults=values, defaults=values,
) device_option=op_def.device_option
if op_def.HasField('device_option'): if op_def.HasField('device_option') else None)
gen_op.device_option.CopyFrom(op_def.device_option)
backward_defs.append(gen_op) backward_defs.append(gen_op)
# GradientOp # Add def for ``GradientOp``
for grad_def in grad_defs: for grad_def in grad_defs:
grad_def.name = OpDef.get_name() grad_def.name = OpDef.get_name()
backward_defs.append(grad_def) backward_defs.append(grad_def)
# Split and gather grads for multi-used input. # Split and gather gradient for multi-used inputs.
for grad_def in grad_defs: for grad_def in grad_defs:
for g_output_idx, g_output in enumerate(grad_def.output): for i, grad_name in enumerate(grad_def.output):
original_idx = -1 original_index = -1
for g_input_idx, g_input in enumerate(g_inputs): for j, name in enumerate(grad_inputs):
if g_output == g_input: if grad_name == name:
original_idx = g_input_idx original_index = j
# Ignore un-used && in-placed GI(?). if original_index == -1 or grad_name in grad_def.input:
if original_idx == -1:
continue continue
if g_output in grad_def.input: original_name = op_def.input[original_index]
if inputs_count[original_name] <= 1:
continue continue
# Found a split branch. # Detect a split branch.
original_name = op_def.input[original_idx] grad_name_v2 = grad_name + '_autosplit_%d' % grads_count[grad_name]
if inputs_count[original_name] > 1: if not is_skip:
# Split. split_grads.add(grad_name_v2)
split_name = g_output + '_autosplit_%d' % grads_count[g_output] grads_count[grad_name] += 1
if not is_skip: if grads_count[grad_name] == inputs_count[original_name]:
all_split_grads.add(split_name) gather_inputs = []
grads_count[g_output] += 1 for j in range(grads_count[grad_name]):
# Gather. if '%s_autosplit_%d' % (grad_name, j) in split_grads:
if grads_count[g_output] == inputs_count[original_name]: gather_inputs.append('%s_autosplit_%d' % (grad_name, j))
split_inputs = [] backward_defs.append(proto_util.make_operator_def(
for idx in range(grads_count[g_output]): name=OpDef.get_name(),
if '%s_autosplit_%d' % (g_output, idx) in all_split_grads: op_type='GradientGather',
split_inputs.append('%s_autosplit_%d' % (g_output, idx)) inputs=gather_inputs,
gather_def = proto_util.make_operator_def( outputs=[grad_name],
name=OpDef.get_name(), device_option=grad_def.device_option
op_type='GradientGather', if grad_def.HasField('device_option') else None))
inputs=split_inputs, grad_def.output[i] = grad_name_v2
outputs=[g_output],
)
if grad_def.HasField('device_option'):
gather_def.device_option.CopyFrom(grad_def.device_option)
backward_defs.append(gather_def)
grad_def.output[g_output_idx] = split_name
# Done.
if not is_skip:
for name, grad in zip(op_def.input, g_inputs):
if grad != '':
inputs_to_grads[name] = grad
return backward_defs return backward_defs
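When a target output has no incoming gradient, the maker seeds a virtual one: the output is mapped to ``<output>_grad`` and a ``GradientGenerate`` op fills that buffer with the maker's default value. A toy walk-through under those assumptions (dict stand-ins, not the protobuf defs):

```python
def seed_target_grads(op_outputs, inputs_to_grads, targets, defaults):
    """Build a GradientGenerate-like def for target outputs lacking a gradient."""
    gen_inputs, gen_outputs, values = [], [], []
    for i, output in enumerate(op_outputs):
        if output not in inputs_to_grads and output in targets:
            inputs_to_grads[output] = output + '_grad'
            gen_inputs.append(output)
            gen_outputs.append(output + '_grad')
            values.append(defaults[i])
    return {'type': 'GradientGenerate', 'inputs': gen_inputs,
            'outputs': gen_outputs, 'defaults': values}

grads = {}
print(seed_target_grads(['loss'], grads, {'loss'}, defaults=[1.0]))
# {'type': 'GradientGenerate', 'inputs': ['loss'], 'outputs': ['loss_grad'], 'defaults': [1.0]}
print(grads)  # {'loss': 'loss_grad'}
```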
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Define the helper for creating symbolic operators.""" """Define the helper for creating symbolic operators."""
from __future__ import absolute_import from __future__ import absolute_import
...@@ -87,14 +86,16 @@ class OpDef(object): ...@@ -87,14 +86,16 @@ class OpDef(object):
num_outputs = len(outputs) num_outputs = len(outputs)
# Construct Def. # Construct Def.
op_idx, op_name = OpDef.get_index_and_name() op_index, op_name = OpDef.get_index_and_name()
op_info._defs[op_idx] = proto_util.make_operator_def( op_info.add_def(
name=op_name, op_index, proto_util.make_operator_def(
op_type=op_type, name=op_name,
inputs=[input.id for input in inputs], op_type=op_type,
outputs=[output.id for output in outputs], inputs=[input.id for input in inputs],
device_option=proto_util.get_default_device_option(), outputs=[output.id for output in outputs],
**kwargs) device_option=proto_util.get_default_device_option(),
**kwargs
))
# Blend the op for outputs. # Blend the op for outputs.
for output in outputs: for output in outputs:
......
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Define the describing spec for symbolic operators.""" """Define the describing spec for symbolic operators."""
from __future__ import absolute_import from __future__ import absolute_import
......
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The graph executing tensor.""" """The graph executing tensor."""
from __future__ import absolute_import from __future__ import absolute_import
......
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""CUDA utilities.""" """CUDA utilities."""
from __future__ import absolute_import from __future__ import absolute_import
...@@ -20,7 +19,17 @@ from dragon.core.framework import config ...@@ -20,7 +19,17 @@ from dragon.core.framework import config
class Stream(backend.CudaStream): class Stream(backend.CudaStream):
"""The CUDA stream wrapper."""
def __init__(self, device_index): def __init__(self, device_index):
"""Create a ``Stream``.
Parameters
----------
device_index : int, required
The device index of stream.
"""
super(Stream, self).__init__(device_index) super(Stream, self).__init__(device_index)
@property @property
...@@ -36,7 +45,7 @@ class Stream(backend.CudaStream): ...@@ -36,7 +45,7 @@ class Stream(backend.CudaStream):
return super(Stream, self).ptr return super(Stream, self).ptr
def synchronize(self): def synchronize(self):
"""Synchronize the stream.""" """Wait for the dispatched kernels to complete."""
self.Synchronize() self.Synchronize()
...@@ -52,38 +61,28 @@ def current_device(): ...@@ -52,38 +61,28 @@ def current_device():
return backend.cudaGetDevice() return backend.cudaGetDevice()
def enable_cudnn(enabled=True): def enable_cudnn(enabled=True, benchmark=False):
"""Activate the CuDNN engine. """Enable the CuDNN library.
Parameters
----------
enabled : bool, optional, default=True
**True** to activate CuDNN.
"""
return backend.cudaEnableDNN(enabled)
def enable_cudnn_benchmark(enabled=True):
"""Activate the CuDNN benchmark.
Parameters Parameters
---------- ----------
enabled : bool, optional, default=True enabled : bool, optional, default=True
**True** to activate CuDNN benchmark. **True** to enable the CuDNN library.
benchmark : bool, optional, default=False
**True** to select algorithms according to benchmark.
""" """
return backend.cudaEnableDNNBenchmark(enabled) return backend.cudaEnableDNN(enabled, benchmark)
def get_device_capability(device_id=None): def get_device_capability(device_index=None):
"""Return the capability of specified device. """Return the capability of specified device.
If ``device_id`` is **None**, the current device will be selected. If ``device_index`` is **None**, the current device will be selected.
Parameters Parameters
---------- ----------
device_id : int, optional device_index : int, optional
The device index. The device index.
Returns Returns
...@@ -92,8 +91,8 @@ def get_device_capability(device_id=None): ...@@ -92,8 +91,8 @@ def get_device_capability(device_id=None):
The major and minor number. The major and minor number.
""" """
device_id = device_id if device_id else -1 device_index = device_index if device_index else -1
return backend.cudaGetDeviceCapability(device_id) return backend.cudaGetDeviceCapability(device_index)
def is_available(): def is_available():
...@@ -144,18 +143,18 @@ def set_device(device_index=0): ...@@ -144,18 +143,18 @@ def set_device(device_index=0):
return backend.cudaSetDevice(device_index) return backend.cudaSetDevice(device_index)
def synchronize(device_id=None, stream_id=0): def synchronize(device_index=None, stream_index=0):
"""Synchronize the specified stream. """Synchronize a specified CUDA stream.
If ``device_id`` is **None**, the current device will be selected. If ``device_index`` is **None**, the current device will be selected.
Parameters Parameters
---------- ----------
device_id : int, optional device_index : int, optional
The device index. The device index.
stream_id : int, optional, default=0 stream_index : int, optional, default=0
The stream index. The stream index.
""" """
device_id = device_id if device_id else -1 device_index = device_index if device_index else -1
return backend.cudaStreamSynchronize(device_id, stream_id) return backend.cudaStreamSynchronize(device_index, stream_index)
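A usage sketch of the updated ``dragon.cuda`` surface described above, assuming a CUDA build of Dragon is importable:

```python
import dragon

# Enable CuDNN and let it benchmark algorithms
# (replaces the removed enable_cudnn_benchmark()).
dragon.cuda.enable_cudnn(True, benchmark=True)

if dragon.cuda.is_available():
    major, minor = dragon.cuda.get_device_capability(device_index=0)
    stream = dragon.cuda.Stream(device_index=0)  # the newly exported stream wrapper
    # ... dispatch work on the stream ...
    stream.synchronize()             # wait for the dispatched kernels to complete
    dragon.cuda.synchronize(0, 0)    # or synchronize device 0, stream 0
```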
...@@ -12,7 +12,6 @@ ...@@ -12,7 +12,6 @@
# <https://github.com/pytorch/pytorch/blob/master/torch/distributed/distributed_c10d.py> # <https://github.com/pytorch/pytorch/blob/master/torch/distributed/distributed_c10d.py>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Distributed utilities equipped with Python.""" """Distributed utilities equipped with Python."""
from __future__ import absolute_import from __future__ import absolute_import
......
...@@ -12,7 +12,6 @@ ...@@ -12,7 +12,6 @@
# <https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/eager/backprop.py> # <https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/eager/backprop.py>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Do back-propagation from the executed operations.""" """Do back-propagation from the executed operations."""
from __future__ import absolute_import from __future__ import absolute_import
...@@ -115,9 +114,8 @@ class GradientTape(object): ...@@ -115,9 +114,8 @@ class GradientTape(object):
# Check the pushed tape. # Check the pushed tape.
if self._tape is None: if self._tape is None:
raise RuntimeError( raise RuntimeError(
'GradientTape.gradient can only be called ' 'GradientTape.gradient(...) can only be called '
'once on non-persistent tapes.' 'once on non-persistent tapes.')
)
if self._recording: if self._recording:
if not self._persistent: if not self._persistent:
self._pop_tape() self._pop_tape()
......
...@@ -12,7 +12,6 @@ ...@@ -12,7 +12,6 @@
# <https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/eager/context.py> # <https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/eager/context.py>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""State management for eager execution.""" """State management for eager execution."""
from __future__ import absolute_import from __future__ import absolute_import
......
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Execute tensor operations. """ """Execute tensor operations. """
from __future__ import absolute_import from __future__ import absolute_import
......
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The eager executing tensor.""" """The eager executing tensor."""
from __future__ import absolute_import from __future__ import absolute_import
......
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Define the global configurations.""" """Define the global configurations."""
from __future__ import absolute_import from __future__ import absolute_import
......
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Structure to represent a device.""" """Structure to represent a device."""
from __future__ import absolute_import from __future__ import absolute_import
......
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Constant mappings.""" """Constant mappings."""
from __future__ import absolute_import from __future__ import absolute_import
......
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Utilities to fly an operator.""" """Utilities to fly an operator."""
from __future__ import absolute_import from __future__ import absolute_import
......
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Define some helpful protocol buffer makers here.""" """Define some helpful protocol buffer makers here."""
from __future__ import absolute_import from __future__ import absolute_import
...@@ -101,10 +100,9 @@ def make_operator_def( ...@@ -101,10 +100,9 @@ def make_operator_def(
arg=None, arg=None,
**kwargs **kwargs
): ):
op_def = dragon_pb2.OperatorDef() op_def = dragon_pb2.OperatorDef(type=op_type, name=name)
op_def.type, op_def.name = op_type, name op_def.input.extend(inputs)
op_def.input.extend([str(tensor) for tensor in inputs]) op_def.output.extend(outputs)
op_def.output.extend([str(tensor) for tensor in outputs])
if device_option is not None: if device_option is not None:
op_def.device_option.CopyFrom(device_option) op_def.device_option.CopyFrom(device_option)
if 'random_seed' in kwargs: if 'random_seed' in kwargs:
......
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Structure to represent a tensor.""" """Structure to represent a tensor."""
from __future__ import absolute_import from __future__ import absolute_import
......
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Define the basic prototypes.""" """Define the basic prototypes."""
from __future__ import absolute_import from __future__ import absolute_import
......
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Generic interfaces of current default workspace.""" """Generic interfaces of current default workspace."""
from __future__ import absolute_import from __future__ import absolute_import
......
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Utilities for KPLRecord.""" """Utilities for KPLRecord."""
from __future__ import absolute_import from __future__ import absolute_import
......
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Utilities for TFRecord.""" """Utilities for TFRecord."""
from __future__ import absolute_import from __future__ import absolute_import
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The activation ops."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
...@@ -30,7 +31,7 @@ def dropout(inputs, prob=0.5, scale=True, **kwargs): ...@@ -30,7 +31,7 @@ def dropout(inputs, prob=0.5, scale=True, **kwargs):
The **Dropout** function is defined as: The **Dropout** function is defined as:
.. math:: \text{Dropout}(x) = x * \text{Bernoulli}(p=1 - prob) .. math:: \text{Dropout}(x) = x * (r \sim \mathcal{B}(1, 1 - \text{prob}))
Examples: Examples:
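For reference, a minimal NumPy sketch of the normalized Dropout formula above; this is an illustration, not Dragon's kernel, and it assumes ``scale=True`` rescales the kept values by ``1 / (1 - prob)``:

```python
import numpy as np

def dropout_sketch(x, prob=0.5, scale=True):
    # r ~ B(1, 1 - prob): keep each element with probability (1 - prob)
    r = np.random.binomial(1, 1. - prob, size=x.shape)
    y = x * r
    # Rescale the kept activations so the expectation matches the input
    return y / (1. - prob) if scale else y

print(dropout_sketch(np.ones((2, 3)), prob=0.5))
```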
...@@ -85,10 +86,11 @@ def drop_block2d( ...@@ -85,10 +86,11 @@ def drop_block2d(
The **DropBlock** function is defined as: The **DropBlock** function is defined as:
.. math:: .. math::
\text{DropBlock}(x) = x \cdot \text{Bernoulli}(\alpha\cdot\gamma) \\ \text{DropBlock}(x_{ijk}) =
\quad \\ \text{where}\quad \gamma = x_{ijk} * (r_{ik} \sim \mathcal{B}(1, \alpha\gamma)) \\ \quad \\
\frac{keep\_prob}{block\_size^{n}} \text{where}\quad \gamma =
\frac{feat\_size^{n}}{(feat\_size - block\_size + 1)^n} \frac{\text{keep\_prob}}{\text{block\_size}^{n}}
\frac{\text{feat\_size}^{n}}{(\text{feat\_size} - \text{block\_size} + 1)^n}
Set the ``decrement`` to schedule ``keep_prob`` from **1.0**. Set the ``decrement`` to schedule ``keep_prob`` from **1.0**.
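Plugging illustrative values into the :math:`\gamma` expression exactly as written above (the numbers below are assumptions chosen only to show the arithmetic):

```python
keep_prob, block_size, feat_size, n = 0.9, 7, 28, 2
gamma = (keep_prob / block_size ** n) * \
        (feat_size ** n / (feat_size - block_size + 1) ** n)
print(round(gamma, 4))  # ~0.0298; the op then scales it by alpha
```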
...@@ -103,7 +105,7 @@ def drop_block2d( ...@@ -103,7 +105,7 @@ def drop_block2d(
keep_prob : Union[float, dragon.Tensor], optional, default=0.9 keep_prob : Union[float, dragon.Tensor], optional, default=0.9
The keeping prob. The keeping prob.
alpha : float, optional, default=1. alpha : float, optional, default=1.
The scale factor to :math:`\gamma`. The value to :math:`\gamma`.
decrement : float, optional, default=0. decrement : float, optional, default=0.
The decrement value to ``keep_prob``. The decrement value to ``keep_prob``.
data_format : {'NCHW', 'NHWC'}, optional data_format : {'NCHW', 'NHWC'}, optional
...@@ -141,7 +143,7 @@ def drop_path(inputs, prob=0.2, increment=0., **kwargs): ...@@ -141,7 +143,7 @@ def drop_path(inputs, prob=0.2, increment=0., **kwargs):
The **DropPath** function is defined as: The **DropPath** function is defined as:
.. math:: \text{DropPath}(x) = x * \text{Bernoulli}(p=1 - prob) .. math:: \text{DropPath}(x_{ij}) = x_{ij} * (r_{i} \sim \mathcal{B}(1, 1 - \text{prob}))
Set the ``increment`` to schedule ``prob`` from **0.0** after each run. Set the ``increment`` to schedule ``prob`` from **0.0** after each run.
...@@ -185,10 +187,10 @@ def elu(inputs, alpha=1., **kwargs): ...@@ -185,10 +187,10 @@ def elu(inputs, alpha=1., **kwargs):
.. math:: .. math::
\text{ELU}(x) = \text{ELU}(x) =
\begin{cases} \begin{cases}
x, & \text{ if } x \geq 0 \\ x, & \text{ if } x \geq 0 \\
\alpha * (e^{x} - 1), & \text{ otherwise } \alpha * (\exp(x) - 1), & \text{ otherwise }
\end{cases} \end{cases}
Examples: Examples:
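A minimal NumPy analogue of the piecewise ELU definition above (illustration only):

```python
import numpy as np

def elu_sketch(x, alpha=1.):
    # x if x >= 0, else alpha * (exp(x) - 1)
    return np.where(x >= 0, x, alpha * (np.exp(x) - 1.))

print(elu_sketch(np.array([-1., 0., 1.])))  # approx. [-0.632, 0., 1.]
```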
...@@ -230,10 +232,10 @@ def leaky_relu(inputs, alpha=0.2, **kwargs): ...@@ -230,10 +232,10 @@ def leaky_relu(inputs, alpha=0.2, **kwargs):
.. math:: .. math::
\text{LeakyReLU}(x) = \text{LeakyReLU}(x) =
\begin{cases} \begin{cases}
x, & \text{ if } x \geq 0 \\ x, & \text{ if } x \geq 0 \\
\alpha * x, & \text{ otherwise } \alpha * x, & \text{ otherwise }
\end{cases} \end{cases}
Examples: Examples:
...@@ -273,7 +275,7 @@ def log_softmax(inputs, axis=-1, **kwargs): ...@@ -273,7 +275,7 @@ def log_softmax(inputs, axis=-1, **kwargs):
The **LogSoftmax** function is defined as: The **LogSoftmax** function is defined as:
.. math:: \text{LogSoftmax}(x) = \log(\frac{e^{x_{i}}}{\sum e^{x_{j}}}) .. math:: \text{LogSoftmax}(x) = \log(\frac{\exp(x_{i})}{\sum \exp(x_{j})})
The argument ``axis`` could be negative: The argument ``axis`` could be negative:
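A small NumPy sketch of the LogSoftmax formula above, written in the usual numerically stable form (an illustration, not the op itself):

```python
import numpy as np

def log_softmax_sketch(x, axis=-1):
    # log(exp(x_i) / sum_j exp(x_j)) == x_i - log(sum_j exp(x_j))
    x = x - x.max(axis=axis, keepdims=True)  # subtract the max for stability
    return x - np.log(np.exp(x).sum(axis=axis, keepdims=True))

print(log_softmax_sketch(np.array([[1., 2., 3.]])))
```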
...@@ -318,10 +320,10 @@ def prelu(inputs, channel_shared=False, data_format='NCHW', **kwargs): ...@@ -318,10 +320,10 @@ def prelu(inputs, channel_shared=False, data_format='NCHW', **kwargs):
.. math:: .. math::
\text{PReLU}(x) = \text{PReLU}(x) =
\begin{cases} \begin{cases}
x, & \text{ if } x \geq 0 \\ x, & \text{ if } x \geq 0 \\
weight * x, & \text{ otherwise } weight * x, & \text{ otherwise }
\end{cases} \end{cases}
Examples: Examples:
...@@ -365,10 +367,10 @@ def relu(inputs, **kwargs): ...@@ -365,10 +367,10 @@ def relu(inputs, **kwargs):
.. math:: .. math::
\text{ReLU}(x) = \text{ReLU}(x) =
\begin{cases} \begin{cases}
x, & \text{ if } x \geq 0 \\ x, & \text{ if } x \geq 0 \\
0, & \text{ otherwise } 0, & \text{ otherwise }
\end{cases} \end{cases}
Examples: Examples:
...@@ -451,7 +453,7 @@ def selu(inputs, alpha=1.67326, gamma=1.0507, **kwargs): ...@@ -451,7 +453,7 @@ def selu(inputs, alpha=1.67326, gamma=1.0507, **kwargs):
\text{SELU}(x) = \gamma * \text{SELU}(x) = \gamma *
\begin{cases} \begin{cases}
x, & \text{ if } x \geq 0 \\ x, & \text{ if } x \geq 0 \\
\alpha * (e^{x} - 1), & \text{ otherwise } \alpha * (\exp(x) - 1), & \text{ otherwise }
\end{cases} \end{cases}
Examples: Examples:
...@@ -496,7 +498,7 @@ def sigmoid(inputs, **kwargs): ...@@ -496,7 +498,7 @@ def sigmoid(inputs, **kwargs):
The **Sigmoid** function is defined as: The **Sigmoid** function is defined as:
.. math:: \text{Sigmoid}(x) = \frac{1}{1 + e^{-x}} .. math:: \text{Sigmoid}(x) = \frac{1}{1 + \exp(-x)}
Examples: Examples:
...@@ -533,7 +535,7 @@ def softmax(inputs, axis=-1, **kwargs): ...@@ -533,7 +535,7 @@ def softmax(inputs, axis=-1, **kwargs):
The **Softmax** function is defined as: The **Softmax** function is defined as:
.. math:: \text{Softmax}(x) = \frac{e^{x_{i}}}{\sum e^{x_{j}}} .. math:: \text{Softmax}(x_{i}) = \frac{\exp(x_{i})}{\sum_{j} \exp(x_{j})}
The argument ``axis`` could be negative: The argument ``axis`` could be negative:
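For comparison, a NumPy sketch of the Softmax formula above (illustrative; the ``axis`` argument plays the same role as the sum index :math:`j`):

```python
import numpy as np

def softmax_sketch(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))  # exp(x_i), stabilized
    return e / e.sum(axis=axis, keepdims=True)        # normalize over axis j

y = softmax_sketch(np.array([[1., 2., 3.]]), axis=-1)
print(y, y.sum())  # each row sums to 1
```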
...@@ -569,11 +571,11 @@ def softmax(inputs, axis=-1, **kwargs): ...@@ -569,11 +571,11 @@ def softmax(inputs, axis=-1, **kwargs):
@OpSchema.num_inputs(1) @OpSchema.num_inputs(1)
def tanh(inputs, **kwargs): def tanh(inputs, **kwargs):
r"""Apply the tanh function. r"""Compute the tanh of input.
The **Tanh** function is defined as: The **Tanh** function is defined as:
.. math:: \text{Tanh}(x) = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}} .. math:: \text{Tanh}(x) = \frac{\exp(x) - \exp(-x)}{\exp(x) + \exp(-x)}
Examples: Examples:
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The activation ops library."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The array ops."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
...@@ -814,8 +815,8 @@ def moments(inputs, axis=None, keep_dims=False, **kwargs): ...@@ -814,8 +815,8 @@ def moments(inputs, axis=None, keep_dims=False, **kwargs):
.. math:: .. math::
\begin{cases} \begin{cases}
\text{Mean}(x) = \frac{1}{n}\sum(x) \\ \text{mean} = \frac{1}{n}\sum(\text{input}) \\
\text{Variance}(x) = \frac{1}{n}\sum(x - \text{Mean}(x))^{2} \text{variance} = \frac{1}{n}\sum(\text{input} - \text{mean})^{2}
\end{cases} \end{cases}
The argument ``axis`` could be negative or **None**: The argument ``axis`` could be negative or **None**:
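The mean/variance pair above uses the biased (:math:`1/n`) estimators; a NumPy sketch for illustration:

```python
import numpy as np

x = np.array([[1., 2.], [3., 4.]])
mean = x.mean()                       # 2.5
variance = ((x - mean) ** 2).mean()   # 1.25, the 1/n (biased) variance
print(mean, variance)
```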
...@@ -910,7 +911,7 @@ def multinomial(inputs, num_samples=1, eps=0., normalize=False, **kwargs): ...@@ -910,7 +911,7 @@ def multinomial(inputs, num_samples=1, eps=0., normalize=False, **kwargs):
def nonzero(inputs, **kwargs): def nonzero(inputs, **kwargs):
r"""Return the index of non-zero elements. r"""Return the index of non-zero elements.
.. math:: \text{out} = \{i, \text{ if } \text{input}[i] \neq 0 .. math:: \text{out} = \{i\}, \text{ if } \text{input}[i] \neq 0
Parameters Parameters
---------- ----------
...@@ -937,10 +938,10 @@ def one_hot(inputs, depth, on_value=1, off_value=0, **kwargs): ...@@ -937,10 +938,10 @@ def one_hot(inputs, depth, on_value=1, off_value=0, **kwargs):
.. math:: .. math::
\text{out}[i][j] = \text{out}[i][j] =
\begin{cases} \begin{cases}
\text{off\_value}, & \text{ if } \text{input}[i] \neq j \\ \text{off\_value}, & \text{ if } \text{input}[i] \neq j \\
\text{on\_value}, & \text{ otherwise } \text{on\_value}, & \text{ otherwise }
\end{cases} \end{cases}
The max value of indices, i.e., the ``depth`` should be specified: The max value of indices, i.e., the ``depth`` should be specified:
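A NumPy sketch of the mapping above, assuming integer indices and the default ``on_value``/``off_value`` (illustration only):

```python
import numpy as np

def one_hot_sketch(indices, depth, on_value=1, off_value=0):
    out = np.full((len(indices), depth), off_value)
    out[np.arange(len(indices)), indices] = on_value  # on_value where input[i] == j
    return out

print(one_hot_sketch(np.array([0, 2, 1]), depth=3))
```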
...@@ -958,7 +959,7 @@ def one_hot(inputs, depth, on_value=1, off_value=0, **kwargs): ...@@ -958,7 +959,7 @@ def one_hot(inputs, depth, on_value=1, off_value=0, **kwargs):
Parameters Parameters
---------- ----------
inputs : dragon.Tensor inputs : dragon.Tensor
The tensor :math:`x`. The input tensor.
depth : int depth : int
The depth of representation. The depth of representation.
on_value : int, optional, default=1 on_value : int, optional, default=1
...@@ -1497,18 +1498,18 @@ def where(inputs, **kwargs): ...@@ -1497,18 +1498,18 @@ def where(inputs, **kwargs):
r"""Select the elements from two branches under the condition. r"""Select the elements from two branches under the condition.
.. math:: .. math::
y[i] = \text{out}[i] =
\begin{cases} \begin{cases}
a[i] & \text{ if } \text{cond}[i] \text{ is True } \\ \text{input1}[i] & \text{ if } \text{condition}[i] \text{ is True } \\
b[i], & \text{ otherwise } \text{input2}[i], & \text{ otherwise }
\end{cases} \end{cases}
Return the indices of **True** elements, if only the ``cond`` is given. Return the index of **True** elements, if only the ``condition`` is given.
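The selection rule above matches NumPy's ``where``, shown here only to illustrate the notation (both the three-input and the condition-only forms):

```python
import numpy as np

condition = np.array([True, False, True])
input1, input2 = np.array([1, 2, 3]), np.array([10, 20, 30])
print(np.where(condition, input1, input2))  # [ 1 20  3]
print(np.where(condition))                  # (array([0, 2]),), index of True elements
```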
Parameters Parameters
---------- ----------
inputs : Sequence[dragon.Tensor] inputs : Sequence[dragon.Tensor]
The tensor :math:`a`, :math:`b`, and :math:`\text{cond}`. The input1, input2 and condition tensor.
Returns Returns
------- -------
...@@ -1517,7 +1518,7 @@ def where(inputs, **kwargs): ...@@ -1517,7 +1518,7 @@ def where(inputs, **kwargs):
See Also See Also
-------- --------
`dragon.nonzero(...)`_ : Return the indices of non-zero elements. `dragon.nonzero(...)`_ : Return the index of non-zero elements.
""" """
if types.is_tensor(inputs) or len(inputs) == 1: if types.is_tensor(inputs) or len(inputs) == 1:
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""The array ops library."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The control flow ops."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The control flow ops library."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The distributed ops."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The distributed ops library."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The framework ops."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The init ops."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The init ops library."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The loss ops."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
...@@ -284,10 +285,10 @@ def smooth_l1_loss(inputs, beta=1., reduction='mean', **kwargs): ...@@ -284,10 +285,10 @@ def smooth_l1_loss(inputs, beta=1., reduction='mean', **kwargs):
.. math:: .. math::
\text{SmoothL1Loss}(x, y) = \text{SmoothL1Loss}(x, y) =
\begin{cases} \begin{cases}
0.5 * (x - y)^{2} / beta, & \text{ if } |x - y| < beta \\ 0.5 * (x - y)^{2} / beta, & \text{ if } |x - y| < beta \\
|x - y| - 0.5 * beta, & \text{ otherwise } |x - y| - 0.5 * beta, & \text{ otherwise }
\end{cases} \end{cases}
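For reference, a NumPy sketch of the piecewise definition above with a mean reduction (an illustration, not Dragon's kernel; the sample values are assumptions):

```python
import numpy as np

def smooth_l1_sketch(x, y, beta=1.):
    diff = np.abs(x - y)
    loss = np.where(diff < beta, 0.5 * diff ** 2 / beta, diff - 0.5 * beta)
    return loss.mean()

print(smooth_l1_sketch(np.array([0., 2.]), np.array([0.5, 0.])))  # 0.8125
```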
Parameters Parameters
---------- ----------
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The loss ops library."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The math ops."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
...@@ -25,7 +26,7 @@ from dragon.core.ops.utils import parse_args ...@@ -25,7 +26,7 @@ from dragon.core.ops.utils import parse_args
def abs(inputs, **kwargs): def abs(inputs, **kwargs):
r"""Compute the absolute value of input. r"""Compute the absolute value of input.
.. math:: \text{out} = \left| x \right| .. math:: \text{out} = \left| \text{input} \right|
Examples: Examples:
...@@ -37,7 +38,7 @@ def abs(inputs, **kwargs): ...@@ -37,7 +38,7 @@ def abs(inputs, **kwargs):
Parameters Parameters
---------- ----------
inputs : dragon.Tensor inputs : dragon.Tensor
The tensor :math:`x`. The input tensor.
Returns Returns
------- -------
...@@ -57,7 +58,7 @@ def abs(inputs, **kwargs): ...@@ -57,7 +58,7 @@ def abs(inputs, **kwargs):
def add(inputs, **kwargs): def add(inputs, **kwargs):
r"""Compute the element-wise addition. r"""Compute the element-wise addition.
.. math:: \text{out} = a + b .. math:: \text{out} = \text{input1} + \text{input2}
Examples: Examples:
...@@ -71,7 +72,7 @@ def add(inputs, **kwargs): ...@@ -71,7 +72,7 @@ def add(inputs, **kwargs):
Parameters Parameters
---------- ----------
inputs : Sequence[dragon.Tensor] inputs : Sequence[dragon.Tensor]
The tensor :math:`a` and :math:`b`. The input1 and input2 tensor.
Returns Returns
------- -------
...@@ -105,7 +106,7 @@ def affine(inputs, axis=1, num_axes=1, **kwargs): ...@@ -105,7 +106,7 @@ def affine(inputs, axis=1, num_axes=1, **kwargs):
Parameters Parameters
---------- ----------
inputs : Sequence[dragon.Tensor] inputs : Sequence[dragon.Tensor]
The **x**, **W** and **b**. The tensor **x**, **W** and **b**.
axis : int, optional, default=1 axis : int, optional, default=1
The start axis, can be negative. The start axis, can be negative.
num_axes : int, optional, default=1 num_axes : int, optional, default=1
...@@ -114,7 +115,7 @@ def affine(inputs, axis=1, num_axes=1, **kwargs): ...@@ -114,7 +115,7 @@ def affine(inputs, axis=1, num_axes=1, **kwargs):
Returns Returns
------- -------
dragon.Tensor dragon.Tensor
The **y**. The output tensor.
""" """
args = parse_args(locals()) args = parse_args(locals())
...@@ -133,16 +134,16 @@ def affine(inputs, axis=1, num_axes=1, **kwargs): ...@@ -133,16 +134,16 @@ def affine(inputs, axis=1, num_axes=1, **kwargs):
def axpby(inputs, outputs=None, alpha=1., beta=1., **kwargs): def axpby(inputs, outputs=None, alpha=1., beta=1., **kwargs):
r"""Compute the element-wise addition from input to output. r"""Compute the element-wise addition from input to output.
.. math:: y = \alpha x + \beta y .. math:: \text{out} = \alpha * \text{input} + \beta * \text{out}
If ``outputs`` is not provided, **zeros** will be used instead. If ``outputs`` is not provided, **zeros** will be used instead.
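A one-line NumPy sketch of the update above (illustrative; ``out`` starts from zeros when ``outputs`` is not provided):

```python
import numpy as np

alpha, beta = 2., 1.
x = np.array([1., 2., 3.])
out = np.zeros_like(x)          # the default when ``outputs`` is not given
out = alpha * x + beta * out
print(out)  # [2. 4. 6.]
```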
Parameters Parameters
---------- ----------
inputs : Union[dragon.Tensor, Sequence[dragon.Tensor]] inputs : Union[dragon.Tensor, Sequence[dragon.Tensor]]
The tensor :math:`x`. The input tensor(s).
outputs : Union[dragon.Tensor, Sequence[dragon.Tensor]], optional outputs : Union[dragon.Tensor, Sequence[dragon.Tensor]], optional
The tensor :math:`y`. The output tensor(s).
alpha : number, optional, default=1. alpha : number, optional, default=1.
The value to :math:`\alpha`. The value to :math:`\alpha`.
beta : number, optional, default=1. beta : number, optional, default=1.
...@@ -151,7 +152,7 @@ def axpby(inputs, outputs=None, alpha=1., beta=1., **kwargs): ...@@ -151,7 +152,7 @@ def axpby(inputs, outputs=None, alpha=1., beta=1., **kwargs):
Returns Returns
------- -------
Union[dragon.Tensor, Sequence[dragon.Tensor]] Union[dragon.Tensor, Sequence[dragon.Tensor]]
The tensor :math:`y`. The output tensor(s).
""" """
args = parse_args(locals()) args = parse_args(locals())
...@@ -175,7 +176,7 @@ def axpby(inputs, outputs=None, alpha=1., beta=1., **kwargs): ...@@ -175,7 +176,7 @@ def axpby(inputs, outputs=None, alpha=1., beta=1., **kwargs):
def bitwise_and(inputs, **kwargs): def bitwise_and(inputs, **kwargs):
r"""Compute the element-wise AND bitwise operation. r"""Compute the element-wise AND bitwise operation.
.. math:: \text{out} = a \mathbin{\&} b .. math:: \text{out} = \text{input1} \mathbin{\&} \text{input2}
Examples: Examples:
...@@ -189,7 +190,7 @@ def bitwise_and(inputs, **kwargs): ...@@ -189,7 +190,7 @@ def bitwise_and(inputs, **kwargs):
Parameters Parameters
---------- ----------
inputs : Sequence[dragon.Tensor] inputs : Sequence[dragon.Tensor]
The tensor :math:`a` and :math:`b`. The input1 and input2 tensor.
Returns Returns
------- -------
...@@ -204,7 +205,7 @@ def bitwise_and(inputs, **kwargs): ...@@ -204,7 +205,7 @@ def bitwise_and(inputs, **kwargs):
def bitwise_or(inputs, **kwargs): def bitwise_or(inputs, **kwargs):
r"""Compute the element-wise OR bitwise operation. r"""Compute the element-wise OR bitwise operation.
.. math:: \text{out} = a \mathbin{|} b .. math:: \text{out} = \text{input1} \mathbin{|} \text{input2}
Examples: Examples:
...@@ -218,7 +219,7 @@ def bitwise_or(inputs, **kwargs): ...@@ -218,7 +219,7 @@ def bitwise_or(inputs, **kwargs):
Parameters Parameters
---------- ----------
inputs : Sequence[dragon.Tensor] inputs : Sequence[dragon.Tensor]
The tensor :math:`a` and :math:`b`. The input1 and input2 tensor.
Returns Returns
------- -------
...@@ -233,7 +234,7 @@ def bitwise_or(inputs, **kwargs): ...@@ -233,7 +234,7 @@ def bitwise_or(inputs, **kwargs):
def bitwise_xor(inputs, **kwargs): def bitwise_xor(inputs, **kwargs):
r"""Compute the element-wise XOR bitwise operation. r"""Compute the element-wise XOR bitwise operation.
.. math:: \text{out} = a \oplus b .. math:: \text{out} = \text{input1} \oplus \text{input2}
Examples: Examples:
...@@ -247,7 +248,7 @@ def bitwise_xor(inputs, **kwargs): ...@@ -247,7 +248,7 @@ def bitwise_xor(inputs, **kwargs):
Parameters Parameters
---------- ----------
inputs : Sequence[dragon.Tensor] inputs : Sequence[dragon.Tensor]
The tensor :math:`a` and :math:`b`. The input1 and input2 tensor.
Returns Returns
------- -------
...@@ -262,7 +263,7 @@ def bitwise_xor(inputs, **kwargs): ...@@ -262,7 +263,7 @@ def bitwise_xor(inputs, **kwargs):
def ceil(inputs, **kwargs): def ceil(inputs, **kwargs):
r"""Compute the smallest integer not less than input. r"""Compute the smallest integer not less than input.
.. math:: \text{out} = \lceil x \rceil .. math:: \text{out} = \lceil \text{input} \rceil
Examples: Examples:
...@@ -274,7 +275,7 @@ def ceil(inputs, **kwargs): ...@@ -274,7 +275,7 @@ def ceil(inputs, **kwargs):
Parameters Parameters
---------- ----------
inputs : dragon.Tensor inputs : dragon.Tensor
The tensor :math:`x`. The input tensor.
Returns Returns
------- -------
...@@ -294,12 +295,12 @@ def ceil(inputs, **kwargs): ...@@ -294,12 +295,12 @@ def ceil(inputs, **kwargs):
def clip(inputs, low=None, high=None, **kwargs): def clip(inputs, low=None, high=None, **kwargs):
r"""Compute the clipped input according to the given bounds. r"""Compute the clipped input according to the given bounds.
.. math:: \text{out} = \min(\max(x, \text{low}), \text{high}) .. math:: \text{out} = \min(\max(\text{input}, \text{low}), \text{high})
Parameters Parameters
---------- ----------
inputs : dragon.Tensor inputs : dragon.Tensor
The tensor :math:`x`. The input tensor.
low : number, optional low : number, optional
The value to :math:`\text{low}`. The value to :math:`\text{low}`.
high : number, optional high : number, optional
...@@ -331,7 +332,7 @@ def clip(inputs, low=None, high=None, **kwargs): ...@@ -331,7 +332,7 @@ def clip(inputs, low=None, high=None, **kwargs):
def cos(inputs, **kwargs): def cos(inputs, **kwargs):
r"""Compute the cos of input. r"""Compute the cos of input.
.. math:: \text{out} = \cos(x) .. math:: \text{out} = \cos(\text{input})
Examples: Examples:
...@@ -343,7 +344,7 @@ def cos(inputs, **kwargs): ...@@ -343,7 +344,7 @@ def cos(inputs, **kwargs):
Parameters Parameters
---------- ----------
inputs : dragon.Tensor inputs : dragon.Tensor
The tensor :math:`x`. The input tensor.
Returns Returns
------- -------
...@@ -363,7 +364,7 @@ def cos(inputs, **kwargs): ...@@ -363,7 +364,7 @@ def cos(inputs, **kwargs):
def div(inputs, **kwargs): def div(inputs, **kwargs):
r"""Compute the element-wise division. r"""Compute the element-wise division.
.. math:: \text{out} = a \div b .. math:: \text{out} = \text{input1} \div \text{input2}
Examples: Examples:
...@@ -377,7 +378,7 @@ def div(inputs, **kwargs): ...@@ -377,7 +378,7 @@ def div(inputs, **kwargs):
Parameters Parameters
---------- ----------
inputs : Sequence[dragon.Tensor] inputs : Sequence[dragon.Tensor]
The tensor :math:`a` and :math:`b`. The input1 and input2 tensor.
Returns Returns
------- -------
...@@ -398,37 +399,37 @@ def div(inputs, **kwargs): ...@@ -398,37 +399,37 @@ def div(inputs, **kwargs):
def dot(inputs, **kwargs): def dot(inputs, **kwargs):
r"""Compute the dot product. r"""Compute the dot product.
.. math:: \text{out} = a \cdot b .. math:: \text{out} = \text{input1} \cdot \text{input2}
If ``rank(a)`` == ``rank(b)`` == 1, compute vector product: If ``rank(input1)`` == ``rank(input2)`` == 1, compute vector product:
```python ```python
x = dragon.ones((2,)) a = dragon.ones((2,))
y = dragon.ones((2,)) b = dragon.ones((2,))
print(dragon.math.dot([x, y])) # 2.0 print(dragon.math.dot([a, b])) # 2.0
``` ```
If ``rank(a)`` == ``rank(b)`` == 2, compute matrix multiplication: If ``rank(input1)`` == ``rank(input2)`` == 2, compute matrix multiplication:
```python ```python
x = dragon.ones((2, 3)) a = dragon.ones((2, 3))
y = dragon.ones((3, 2)) b = dragon.ones((3, 2))
print(dragon.math.dot([x, y])) # [[[3. 3.], [3. 3.]]] print(dragon.math.dot([a, b])) # [[[3. 3.], [3. 3.]]]
print(dragon.math.matmul([x, y])) # Equivalent print(dragon.math.matmul([a, b])) # Equivalent
``` ```
If ``rank(a)`` >= 2, ``rank(b)`` == 1, compute matrix-vector multiplication: If ``rank(input1)`` >= 2, ``rank(input2)`` == 1, compute matrix-vector multiplication:
```python ```python
x = dragon.ones((2, 3)) a = dragon.ones((2, 3))
y = dragon.ones((3,)) b = dragon.ones((3,))
print(dragon.math.dot([x, y])) # [[3. 3.]] print(dragon.math.dot([a, b])) # [[3. 3.]]
``` ```
Parameters Parameters
---------- ----------
inputs : Sequence[dragon.Tensor] inputs : Sequence[dragon.Tensor]
The tensor :math:`a` and :math:`b`. The input1 and input2 tensor.
Returns Returns
------- -------
...@@ -448,7 +449,7 @@ def dot(inputs, **kwargs): ...@@ -448,7 +449,7 @@ def dot(inputs, **kwargs):
def equal(inputs, **kwargs): def equal(inputs, **kwargs):
r"""Compute the element-wise equal comparison. r"""Compute the element-wise equal comparison.
.. math:: \text{out} = (a == b) .. math:: \text{out} = (\text{input1} == \text{input2})
Examples: Examples:
...@@ -463,7 +464,7 @@ def equal(inputs, **kwargs): ...@@ -463,7 +464,7 @@ def equal(inputs, **kwargs):
Parameters Parameters
---------- ----------
inputs : Sequence[dragon.Tensor] inputs : Sequence[dragon.Tensor]
The tensor :math:`a` and :math:`b`. The input1 and input2 tensor.
Returns Returns
------- -------
...@@ -484,7 +485,7 @@ def equal(inputs, **kwargs): ...@@ -484,7 +485,7 @@ def equal(inputs, **kwargs):
def exp(inputs, **kwargs): def exp(inputs, **kwargs):
r"""Compute the exponential of input. r"""Compute the exponential of input.
.. math:: \text{out} = e^{x} .. math:: \text{out} = \exp(\text{input})
Examples: Examples:
...@@ -496,7 +497,7 @@ def exp(inputs, **kwargs): ...@@ -496,7 +497,7 @@ def exp(inputs, **kwargs):
Parameters Parameters
---------- ----------
inputs : dragon.Tensor inputs : dragon.Tensor
The tensor :math:`x`. The input tensor.
Returns Returns
------- -------
...@@ -516,7 +517,7 @@ def exp(inputs, **kwargs): ...@@ -516,7 +517,7 @@ def exp(inputs, **kwargs):
def floor(inputs, **kwargs): def floor(inputs, **kwargs):
r"""Compute the largest integer not greater than input. r"""Compute the largest integer not greater than input.
.. math:: \text{out} = \lfloor x \rfloor .. math:: \text{out} = \lfloor \text{input} \rfloor
Examples: Examples:
...@@ -528,7 +529,7 @@ def floor(inputs, **kwargs): ...@@ -528,7 +529,7 @@ def floor(inputs, **kwargs):
Parameters Parameters
---------- ----------
inputs : dragon.Tensor inputs : dragon.Tensor
The tensor :math:`x`. The input tensor.
Returns Returns
------- -------
...@@ -587,7 +588,7 @@ def fully_connected(inputs, axis=1, transpose_w=True, **kwargs): ...@@ -587,7 +588,7 @@ def fully_connected(inputs, axis=1, transpose_w=True, **kwargs):
def greater(inputs, **kwargs): def greater(inputs, **kwargs):
r"""Compute the element-wise greater comparison. r"""Compute the element-wise greater comparison.
.. math:: \text{out} = (a > b) .. math:: \text{out} = (\text{input1} > \text{input2})
Examples: Examples:
...@@ -602,7 +603,7 @@ def greater(inputs, **kwargs): ...@@ -602,7 +603,7 @@ def greater(inputs, **kwargs):
Parameters Parameters
---------- ----------
inputs : Sequence[dragon.Tensor] inputs : Sequence[dragon.Tensor]
The tensor :math:`a` and :math:`b`. The input1 and input2 tensor.
Returns Returns
------- -------
...@@ -623,7 +624,7 @@ def greater(inputs, **kwargs): ...@@ -623,7 +624,7 @@ def greater(inputs, **kwargs):
def greater_equal(inputs, **kwargs): def greater_equal(inputs, **kwargs):
r"""Compute the element-wise greater-equal comparison. r"""Compute the element-wise greater-equal comparison.
.. math:: \text{out} = (a >= b) .. math:: \text{out} = (\text{input1} >= \text{input2})
Examples: Examples:
...@@ -638,7 +639,7 @@ def greater_equal(inputs, **kwargs): ...@@ -638,7 +639,7 @@ def greater_equal(inputs, **kwargs):
Parameters Parameters
---------- ----------
inputs : Sequence[dragon.Tensor] inputs : Sequence[dragon.Tensor]
The tensor :math:`a` and :math:`b`. The input1 and input2 tensor.
Returns Returns
------- -------
...@@ -659,7 +660,7 @@ def greater_equal(inputs, **kwargs): ...@@ -659,7 +660,7 @@ def greater_equal(inputs, **kwargs):
def invert(inputs, **kwargs): def invert(inputs, **kwargs):
r"""Invert each bit of input. r"""Invert each bit of input.
.. math:: \text{out} = \,\,\sim x .. math:: \text{out} = \,\,\sim \text{input}
Examples: Examples:
...@@ -676,7 +677,7 @@ def invert(inputs, **kwargs): ...@@ -676,7 +677,7 @@ def invert(inputs, **kwargs):
Parameters Parameters
---------- ----------
inputs : dragon.Tensor inputs : dragon.Tensor
The tensor :math:`x`. The input tensor.
Returns Returns
------- -------
...@@ -696,7 +697,7 @@ def invert(inputs, **kwargs): ...@@ -696,7 +697,7 @@ def invert(inputs, **kwargs):
def is_inf(inputs, **kwargs): def is_inf(inputs, **kwargs):
r"""Check if the elements of input are infinite. r"""Check if the elements of input are infinite.
.. math:: \text{out} = \text{isinf}(x) .. math:: \text{out} = \text{isinf}(\text{input})
Examples: Examples:
...@@ -708,7 +709,7 @@ def is_inf(inputs, **kwargs): ...@@ -708,7 +709,7 @@ def is_inf(inputs, **kwargs):
Parameters Parameters
---------- ----------
inputs : dragon.Tensor inputs : dragon.Tensor
The tensor :math:`x`. The input tensor.
Returns Returns
------- -------
...@@ -728,7 +729,7 @@ def is_inf(inputs, **kwargs): ...@@ -728,7 +729,7 @@ def is_inf(inputs, **kwargs):
def is_nan(inputs, **kwargs): def is_nan(inputs, **kwargs):
r"""Check if the elements of input are NaN. r"""Check if the elements of input are NaN.
.. math:: \text{out} = \text{isnan}(x) .. math:: \text{out} = \text{isnan}(\text{input})
Examples: Examples:
...@@ -740,7 +741,7 @@ def is_nan(inputs, **kwargs): ...@@ -740,7 +741,7 @@ def is_nan(inputs, **kwargs):
Parameters Parameters
---------- ----------
inputs : dragon.Tensor inputs : dragon.Tensor
The tensor :math:`x`. The input tensor.
Returns Returns
------- -------
...@@ -760,7 +761,7 @@ def is_nan(inputs, **kwargs): ...@@ -760,7 +761,7 @@ def is_nan(inputs, **kwargs):
def log(inputs, **kwargs): def log(inputs, **kwargs):
r"""Compute the logarithm of input. r"""Compute the logarithm of input.
.. math:: \text{out} = \log(x) .. math:: \text{out} = \log(\text{input})
Examples: Examples:
...@@ -772,7 +773,7 @@ def log(inputs, **kwargs): ...@@ -772,7 +773,7 @@ def log(inputs, **kwargs):
Parameters Parameters
---------- ----------
inputs : dragon.Tensor inputs : dragon.Tensor
The tensor :math:`x`. The input tensor.
Returns Returns
------- -------
...@@ -792,7 +793,7 @@ def log(inputs, **kwargs): ...@@ -792,7 +793,7 @@ def log(inputs, **kwargs):
def less(inputs, **kwargs): def less(inputs, **kwargs):
r"""Compute the element-wise less comparison. r"""Compute the element-wise less comparison.
.. math:: \text{out} = (a < b) .. math:: \text{out} = (\text{input1} < \text{input2})
Examples: Examples:
...@@ -807,7 +808,7 @@ def less(inputs, **kwargs): ...@@ -807,7 +808,7 @@ def less(inputs, **kwargs):
Parameters Parameters
---------- ----------
inputs : Sequence[dragon.Tensor] inputs : Sequence[dragon.Tensor]
The tensor :math:`a` and :math:`b`. The input1 and input2 tensor.
Returns Returns
------- -------
...@@ -828,7 +829,7 @@ def less(inputs, **kwargs): ...@@ -828,7 +829,7 @@ def less(inputs, **kwargs):
def less_equal(inputs, **kwargs): def less_equal(inputs, **kwargs):
r"""Compute the element-wise less-equal comparison. r"""Compute the element-wise less-equal comparison.
.. math:: \text{out} = (a <= b) .. math:: \text{out} = (\text{input1} <= \text{input2})
Examples: Examples:
...@@ -843,7 +844,7 @@ def less_equal(inputs, **kwargs): ...@@ -843,7 +844,7 @@ def less_equal(inputs, **kwargs):
Parameters Parameters
---------- ----------
inputs : Sequence[dragon.Tensor] inputs : Sequence[dragon.Tensor]
The tensor :math:`a` and :math:`b`. The input1 and input2 tensor.
Returns Returns
------- -------
...@@ -864,7 +865,7 @@ def less_equal(inputs, **kwargs): ...@@ -864,7 +865,7 @@ def less_equal(inputs, **kwargs):
def matmul(inputs, transpose_a=False, transpose_b=False, **kwargs): def matmul(inputs, transpose_a=False, transpose_b=False, **kwargs):
r"""Compute the matrix multiplication. r"""Compute the matrix multiplication.
.. math:: \text{out} = a \times b .. math:: y = a \times b
The rank of ``a`` and ``b`` should be equal and >= 2: The rank of ``a`` and ``b`` should be equal and >= 2:
...@@ -922,12 +923,12 @@ def matmul(inputs, transpose_a=False, transpose_b=False, **kwargs): ...@@ -922,12 +923,12 @@ def matmul(inputs, transpose_a=False, transpose_b=False, **kwargs):
def maximum(inputs, **kwargs): def maximum(inputs, **kwargs):
r"""Compute the maximum value of given two inputs. r"""Compute the maximum value of given two inputs.
.. math:: \text{out} = \max(a, b) .. math:: \text{out} = \max(\text{input1}, \text{input2})
Parameters Parameters
---------- ----------
inputs : Sequence[Union[dragon.Tensor, number]] inputs : Sequence[Union[dragon.Tensor, number]]
The tensor :math:`a` and :math:`b`. The input1 and input2 tensor.
Returns Returns
------- -------
...@@ -948,12 +949,12 @@ def maximum(inputs, **kwargs): ...@@ -948,12 +949,12 @@ def maximum(inputs, **kwargs):
def minimum(inputs, **kwargs): def minimum(inputs, **kwargs):
r"""Compute the minimum value of given two inputs. r"""Compute the minimum value of given two inputs.
.. math:: \text{out} = \min(a, b) .. math:: \text{out} = \min(\text{input1}, \text{input2})
Parameters Parameters
---------- ----------
inputs : Sequence[Union[dragon.Tensor, number]] inputs : Sequence[Union[dragon.Tensor, number]]
The tensor :math:`a` and :math:`b`. The input1 and input2 tensor.
Returns Returns
------- -------
...@@ -974,7 +975,7 @@ def minimum(inputs, **kwargs): ...@@ -974,7 +975,7 @@ def minimum(inputs, **kwargs):
def mul(inputs, **kwargs): def mul(inputs, **kwargs):
r"""Compute the element-wise multiplication. r"""Compute the element-wise multiplication.
.. math:: \text{out} = a \times b .. math:: \text{out} = \text{input1} \times \text{input2}
Examples: Examples:
...@@ -988,7 +989,7 @@ def mul(inputs, **kwargs): ...@@ -988,7 +989,7 @@ def mul(inputs, **kwargs):
Parameters Parameters
---------- ----------
inputs : Sequence[dragon.Tensor] inputs : Sequence[dragon.Tensor]
The tensor :math:`a` and :math:`b`. The input1 and input2 tensor.
Returns Returns
------- -------
...@@ -1009,7 +1010,7 @@ def mul(inputs, **kwargs): ...@@ -1009,7 +1010,7 @@ def mul(inputs, **kwargs):
def negative(inputs, **kwargs): def negative(inputs, **kwargs):
r"""Compute the element-wise negative. r"""Compute the element-wise negative.
.. math:: \text{out} = -x .. math:: \text{out} = -\text{input}
```python ```python
x = dragon.constant([-1, 0, 1]) x = dragon.constant([-1, 0, 1])
...@@ -1019,7 +1020,7 @@ def negative(inputs, **kwargs): ...@@ -1019,7 +1020,7 @@ def negative(inputs, **kwargs):
Parameters Parameters
---------- ----------
inputs : dragon.Tensor inputs : dragon.Tensor
The tensor :math:`x`. The input tensor.
Returns Returns
------- -------
...@@ -1039,7 +1040,7 @@ def negative(inputs, **kwargs): ...@@ -1039,7 +1040,7 @@ def negative(inputs, **kwargs):
def not_equal(inputs, **kwargs): def not_equal(inputs, **kwargs):
r"""Compute the element-wise not-equal comparison. r"""Compute the element-wise not-equal comparison.
.. math:: \text{out} = (a \neq b) .. math:: \text{out} = (\text{input1} \neq \text{input2})
Examples: Examples:
...@@ -1054,7 +1055,7 @@ def not_equal(inputs, **kwargs): ...@@ -1054,7 +1055,7 @@ def not_equal(inputs, **kwargs):
Parameters Parameters
---------- ----------
inputs : Sequence[dragon.Tensor] inputs : Sequence[dragon.Tensor]
The tensor :math:`a` and :math:`b`. The input1 and input2 tensor.
Returns Returns
------- -------
...@@ -1075,7 +1076,7 @@ def not_equal(inputs, **kwargs): ...@@ -1075,7 +1076,7 @@ def not_equal(inputs, **kwargs):
def pow(inputs, **kwargs): def pow(inputs, **kwargs):
r"""Compute the power of input. r"""Compute the power of input.
.. math:: \text{out} = x^{y} .. math:: \text{out} = \text{input}^{\text{exponent}}
The two inputs should be broadcast to each other: The two inputs should be broadcast to each other:
...@@ -1089,7 +1090,7 @@ def pow(inputs, **kwargs): ...@@ -1089,7 +1090,7 @@ def pow(inputs, **kwargs):
Parameters Parameters
---------- ----------
inputs : Sequence[dragon.Tensor] inputs : Sequence[dragon.Tensor]
The tensor :math:`x` and :math:`y`. The input and exponent tensor.
Returns Returns
------- -------
...@@ -1110,7 +1111,7 @@ def pow(inputs, **kwargs): ...@@ -1110,7 +1111,7 @@ def pow(inputs, **kwargs):
def reciprocal(inputs, **kwargs): def reciprocal(inputs, **kwargs):
r"""Compute the reciprocal of input. r"""Compute the reciprocal of input.
.. math:: \text{out} = \frac{1}{x} .. math:: \text{out} = \frac{1}{\text{input}}
Examples: Examples:
...@@ -1122,7 +1123,7 @@ def reciprocal(inputs, **kwargs): ...@@ -1122,7 +1123,7 @@ def reciprocal(inputs, **kwargs):
Parameters Parameters
---------- ----------
inputs : dragon.Tensor inputs : dragon.Tensor
The tensor :math:`x`. The input tensor.
Returns Returns
------- -------
...@@ -1142,7 +1143,7 @@ def reciprocal(inputs, **kwargs): ...@@ -1142,7 +1143,7 @@ def reciprocal(inputs, **kwargs):
def round(inputs, **kwargs): def round(inputs, **kwargs):
r"""Compute the nearest integer of input. r"""Compute the nearest integer of input.
.. math:: \text{out} = \lfloor x \rceil .. math:: \text{out} = \lfloor \text{input} \rceil
Examples: Examples:
...@@ -1154,7 +1155,7 @@ def round(inputs, **kwargs): ...@@ -1154,7 +1155,7 @@ def round(inputs, **kwargs):
Parameters Parameters
---------- ----------
inputs : dragon.Tensor inputs : dragon.Tensor
The tensor :math:`x`. The input tensor.
Returns Returns
------- -------
...@@ -1174,7 +1175,7 @@ def round(inputs, **kwargs): ...@@ -1174,7 +1175,7 @@ def round(inputs, **kwargs):
def rsqrt(inputs, **kwargs): def rsqrt(inputs, **kwargs):
r"""Compute the reciprocal square root of input. r"""Compute the reciprocal square root of input.
.. math:: \text{out} = \frac{1}{\sqrt{x}} .. math:: \text{out} = \frac{1}{\sqrt{\text{input}}}
Examples: Examples:
...@@ -1186,7 +1187,7 @@ def rsqrt(inputs, **kwargs): ...@@ -1186,7 +1187,7 @@ def rsqrt(inputs, **kwargs):
Parameters Parameters
---------- ----------
inputs : dragon.Tensor inputs : dragon.Tensor
The tensor :math:`x`. The input tensor.
Returns Returns
------- -------
...@@ -1207,11 +1208,11 @@ def sign(inputs, **kwargs): ...@@ -1207,11 +1208,11 @@ def sign(inputs, **kwargs):
r"""Compute the sign indication of input. r"""Compute the sign indication of input.
.. math:: .. math::
\text{out}_{i} = \text{out}[i] =
\begin{cases} \begin{cases}
-1, & \text{ if } x_{i} < 0 \\ -1, & \text{ if } \text{input}[i] < 0 \\
0, & \text{ if } x_{i} = 0 \\ 0, & \text{ if } \text{input}[i] = 0 \\
1, & \text{ if } x_{i} > 0 1, & \text{ if } \text{input}[i] > 0
\end{cases} \end{cases}
Examples: Examples:
...@@ -1224,7 +1225,7 @@ def sign(inputs, **kwargs): ...@@ -1224,7 +1225,7 @@ def sign(inputs, **kwargs):
Parameters Parameters
---------- ----------
inputs : dragon.Tensor inputs : dragon.Tensor
The tensor :math:`x`. The input tensor.
Returns Returns
------- -------
...@@ -1244,7 +1245,7 @@ def sign(inputs, **kwargs): ...@@ -1244,7 +1245,7 @@ def sign(inputs, **kwargs):
def sin(inputs, **kwargs): def sin(inputs, **kwargs):
r"""Compute the sin of input. r"""Compute the sin of input.
.. math:: \text{out} = \sin(x) .. math:: \text{out} = \sin(\text{input})
Examples: Examples:
...@@ -1256,7 +1257,7 @@ def sin(inputs, **kwargs): ...@@ -1256,7 +1257,7 @@ def sin(inputs, **kwargs):
Parameters Parameters
---------- ----------
inputs : dragon.Tensor inputs : dragon.Tensor
The tensor :math:`x`. The input tensor.
Returns Returns
------- -------
...@@ -1276,7 +1277,7 @@ def sin(inputs, **kwargs): ...@@ -1276,7 +1277,7 @@ def sin(inputs, **kwargs):
def sqrt(inputs, **kwargs): def sqrt(inputs, **kwargs):
r"""Compute the square root of input. r"""Compute the square root of input.
.. math:: \text{out} = \sqrt{x} .. math:: \text{out} = \sqrt{\text{input}}
Examples: Examples:
...@@ -1288,7 +1289,7 @@ def sqrt(inputs, **kwargs): ...@@ -1288,7 +1289,7 @@ def sqrt(inputs, **kwargs):
Parameters Parameters
---------- ----------
inputs : dragon.Tensor inputs : dragon.Tensor
The tensor :math:`x`. The input tensor.
Returns Returns
------- -------
...@@ -1308,7 +1309,7 @@ def sqrt(inputs, **kwargs): ...@@ -1308,7 +1309,7 @@ def sqrt(inputs, **kwargs):
def square(inputs, **kwargs): def square(inputs, **kwargs):
r"""Compute the square of input. r"""Compute the square of input.
.. math:: \text{out} = x^{2} .. math:: \text{out} = \text{input}^{2}
Examples: Examples:
...@@ -1320,7 +1321,7 @@ def square(inputs, **kwargs): ...@@ -1320,7 +1321,7 @@ def square(inputs, **kwargs):
Parameters Parameters
---------- ----------
inputs : dragon.Tensor inputs : dragon.Tensor
The tensor :math:`x`. The input tensor.
Returns Returns
------- -------
...@@ -1340,7 +1341,7 @@ def square(inputs, **kwargs): ...@@ -1340,7 +1341,7 @@ def square(inputs, **kwargs):
def sub(inputs, **kwargs): def sub(inputs, **kwargs):
r"""Compute the element-wise subtraction. r"""Compute the element-wise subtraction.
.. math:: \text{out} = a - b .. math:: \text{out} = \text{input1} - \text{input2}
Examples: Examples:
...@@ -1354,7 +1355,7 @@ def sub(inputs, **kwargs): ...@@ -1354,7 +1355,7 @@ def sub(inputs, **kwargs):
Parameters Parameters
---------- ----------
inputs : Sequence[dragon.Tensor] inputs : Sequence[dragon.Tensor]
The tensor :math:`a` and :math:`b`. The input1 and input2 tensor.
Returns Returns
------- -------
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The math ops library."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The metric ops."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The metric ops library."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The normalization ops."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The normalization ops library."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The rnn ops."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The rnn ops library."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
......
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Bind tensor methods executed eagerly.""" """Bind tensor methods executed eagerly."""
from __future__ import absolute_import from __future__ import absolute_import
......
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Bind tensor methods executed symbolically.""" """Bind tensor methods executed symbolically."""
from __future__ import absolute_import from __future__ import absolute_import
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The training ops library."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
......
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Utilities to simplify the exporting of operators.""" """Utilities to simplify the exporting of operators."""
from __future__ import absolute_import from __future__ import absolute_import
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The vision ops."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The vision ops library."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
......
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The Adam optimizers.""" """The Adam optimizers."""
from __future__ import absolute_import from __future__ import absolute_import
......
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The optimizer to update parameters.""" """The optimizer to update parameters."""
from __future__ import absolute_import from __future__ import absolute_import
......
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The RMSprop optimizers.""" """The RMSprop optimizers."""
from __future__ import absolute_import from __future__ import absolute_import
......
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The SGD optimizers.""" """The SGD optimizers."""
from __future__ import absolute_import from __future__ import absolute_import
......
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Define the common used math functions.""" """Define the common used math functions."""
from __future__ import absolute_import from __future__ import absolute_import
......
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Registry utilities.""" """Registry utilities."""
from __future__ import absolute_import from __future__ import absolute_import
......
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Define the common thread local structures.""" """Define the common thread local structures."""
from __future__ import absolute_import from __future__ import absolute_import
......
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Toolkit for manipulating the onnx api.""" """Toolkit for manipulating the onnx api."""
from __future__ import absolute_import from __future__ import absolute_import
......
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Utilities to a too simple ONNX exporting or importing.""" """Utilities to a too simple ONNX exporting or importing."""
from __future__ import absolute_import from __future__ import absolute_import
......
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Import the Keras API for TensorFlow.""" """Import the Keras API for TensorFlow."""
from __future__ import absolute_import from __future__ import absolute_import
......
...@@ -30,10 +30,10 @@ def elu(x, alpha=1., **kwargs): ...@@ -30,10 +30,10 @@ def elu(x, alpha=1., **kwargs):
.. math:: .. math::
\text{ELU}(x) = \text{ELU}(x) =
\begin{cases} \begin{cases}
x, & \text{ if } x \geq 0 \\ x, & \text{ if } x \geq 0 \\
\alpha * (e^{x} - 1), & \text{ otherwise } \alpha * (\exp(x) - 1), & \text{ otherwise }
\end{cases} \end{cases}
Examples: Examples:
...@@ -45,7 +45,7 @@ def elu(x, alpha=1., **kwargs): ...@@ -45,7 +45,7 @@ def elu(x, alpha=1., **kwargs):
Parameters Parameters
---------- ----------
x : dragon.Tensor x : dragon.Tensor
The tensor :math:`x`. The input tensor.
alpha : float, optional, default=1. alpha : float, optional, default=1.
The value to :math:`\alpha`. The value to :math:`\alpha`.
...@@ -63,7 +63,7 @@ def exponential(x): ...@@ -63,7 +63,7 @@ def exponential(x):
The **Exponential** function is defined as: The **Exponential** function is defined as:
.. math:: \text{out} = e^{x} .. math:: \text{Exp}(x) = \exp(x)
Examples: Examples:
...@@ -75,7 +75,7 @@ def exponential(x): ...@@ -75,7 +75,7 @@ def exponential(x):
Parameters Parameters
---------- ----------
x : dragon.Tensor x : dragon.Tensor
The tensor :math:`x`. The input tensor.
Returns Returns
------- -------
...@@ -139,7 +139,7 @@ def relu(x, alpha=0, max_value=None, **kwargs): ...@@ -139,7 +139,7 @@ def relu(x, alpha=0, max_value=None, **kwargs):
x : dragon.Tensor x : dragon.Tensor
The input tensor. The input tensor.
alpha : number, optional, default=0 alpha : number, optional, default=0
The valve of :math:`\alpha`. The value to :math:`\alpha`.
max_value : number, optional max_value : number, optional
The value to :math:`v_{max}`. The value to :math:`v_{max}`.
...@@ -159,10 +159,10 @@ def selu(x, **kwargs): ...@@ -159,10 +159,10 @@ def selu(x, **kwargs):
.. math:: .. math::
\text{SELU}(x) = 1.0507 * \text{SELU}(x) = 1.0507 *
\begin{cases} \begin{cases}
x, & \text{ if } x \geq 0 \\ x, & \text{ if } x \geq 0 \\
1.67326 * (e^{x} - 1), & \text{ otherwise } 1.67326 * (\exp(x) - 1), & \text{ otherwise }
\end{cases} \end{cases}
Examples: Examples:
...@@ -174,7 +174,7 @@ def selu(x, **kwargs): ...@@ -174,7 +174,7 @@ def selu(x, **kwargs):
Parameters Parameters
---------- ----------
x : dragon.Tensor x : dragon.Tensor
The tensor :math:`x`. The input tensor.
Returns Returns
------- -------
...@@ -190,7 +190,7 @@ def sigmoid(x, **kwargs): ...@@ -190,7 +190,7 @@ def sigmoid(x, **kwargs):
The **Sigmoid** function is defined as: The **Sigmoid** function is defined as:
.. math:: \text{Sigmoid}(x) = \frac{1}{1 + e^{-x}} .. math:: \text{Sigmoid}(x) = \frac{1}{1 + \exp(-x)}
Examples: Examples:
...@@ -218,7 +218,7 @@ def softmax(x, axis=-1, **kwargs): ...@@ -218,7 +218,7 @@ def softmax(x, axis=-1, **kwargs):
The **Softmax** function is defined as: The **Softmax** function is defined as:
.. math:: \text{Softmax}(x) = \frac{e^{x_{i}}}{\sum e^{x_{j}}} .. math:: \text{Softmax}(x_{i}) = \frac{\exp(x_{i})}{\sum_{j} \exp(x_{j})}
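A minimal NumPy sketch of this definition; subtracting the maximum first is an equivalent but numerically safer form:

```python
import numpy as np

def softmax(x, axis=-1):
    """exp(x_i) / sum_j exp(x_j), computed along the given axis."""
    x = np.asarray(x, dtype=np.float64)
    z = x - x.max(axis=axis, keepdims=True)  # shifting does not change the ratio
    e = np.exp(z)
    return e / e.sum(axis=axis, keepdims=True)

print(softmax([1.0, 2.0, 3.0]))  # approx. [0.090 0.245 0.665]
```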
Examples: Examples:
...@@ -230,7 +230,7 @@ def softmax(x, axis=-1, **kwargs): ...@@ -230,7 +230,7 @@ def softmax(x, axis=-1, **kwargs):
Parameters Parameters
---------- ----------
x : dragon.Tensor x : dragon.Tensor
The tensor :math:`x`. The input tensor.
axis : int, optional, default=-1 axis : int, optional, default=-1
The axis to reduce. The axis to reduce.
...@@ -248,7 +248,7 @@ def tanh(x, **kwargs): ...@@ -248,7 +248,7 @@ def tanh(x, **kwargs):
The **Tanh** function is defined as: The **Tanh** function is defined as:
.. math:: \text{Tanh}(x) = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}} .. math:: \text{Tanh}(x) = \frac{\exp(x) - \exp(-x)}{\exp(x) + \exp(-x)}
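Both definitions reduce to a few NumPy calls; a minimal sketch assuming float inputs:

```python
import numpy as np

def sigmoid(x):
    """1 / (1 + exp(-x))."""
    return 1.0 / (1.0 + np.exp(-np.asarray(x, dtype=np.float64)))

def tanh(x):
    """(exp(x) - exp(-x)) / (exp(x) + exp(-x)); identical to np.tanh."""
    x = np.asarray(x, dtype=np.float64)
    return (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x))

print(sigmoid(0.0), tanh(1.0))  # 0.5, approx. 0.762
```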
Examples: Examples:
...@@ -281,5 +281,4 @@ def get(identifier): ...@@ -281,5 +281,4 @@ def get(identifier):
else: else:
raise TypeError( raise TypeError(
'Could not interpret activation function identifier: {}.' 'Could not interpret activation function identifier: {}.'
.format(repr(identifier)) .format(repr(identifier)))
)
...@@ -29,10 +29,10 @@ class ELU(Layer): ...@@ -29,10 +29,10 @@ class ELU(Layer):
.. math:: .. math::
\text{ELU}(x) = \text{ELU}(x) =
\begin{cases} \begin{cases}
x, & \text{ if } x \geq 0 \\ x, & \text{ if } x \geq 0 \\
\alpha * (e^{x} - 1), & \text{ otherwise } \alpha * (\exp(x) - 1), & \text{ otherwise }
\end{cases} \end{cases}
Examples: Examples:
...@@ -72,10 +72,10 @@ class LeakyReLU(Layer): ...@@ -72,10 +72,10 @@ class LeakyReLU(Layer):
.. math:: .. math::
\text{LeakyReLU}(x) = \text{LeakyReLU}(x) =
\begin{cases} \begin{cases}
x, & \text{ if } x \geq 0 \\ x, & \text{ if } x \geq 0 \\
\alpha * x, & \text{ otherwise } \alpha * x, & \text{ otherwise }
\end{cases} \end{cases}
Examples: Examples:
...@@ -160,10 +160,10 @@ class SELU(Layer): ...@@ -160,10 +160,10 @@ class SELU(Layer):
.. math:: .. math::
\text{SELU}(x) = 1.0507 * \text{SELU}(x) = 1.0507 *
\begin{cases} \begin{cases}
x, & \text{ if } x \geq 0 \\ x, & \text{ if } x \geq 0 \\
1.67326 * (e^{x} - 1), & \text{ otherwise } 1.67326 * (\exp(x) - 1), & \text{ otherwise }
\end{cases} \end{cases}
Examples: Examples:
...@@ -188,7 +188,7 @@ class Softmax(Layer): ...@@ -188,7 +188,7 @@ class Softmax(Layer):
The **Softmax** function is defined as: The **Softmax** function is defined as:
.. math:: \text{Softmax}(x) = \frac{e^{x_{i}}}{\sum e^{x_{j}}} .. math:: \text{Softmax}(x_{i}) = \frac{\exp(x_{i})}{\sum_{j} \exp(x_{j})}
Examples: Examples:
......
...@@ -47,9 +47,7 @@ class _Merge(Layer): ...@@ -47,9 +47,7 @@ class _Merge(Layer):
class Add(_Merge): class Add(_Merge):
r"""The layer to add a sequence of inputs. """The layer to add a sequence of inputs.
.. math:: \text{out} = \sum(x)
Examples: Examples:
...@@ -75,7 +73,7 @@ class Add(_Merge): ...@@ -75,7 +73,7 @@ class Add(_Merge):
class Concatenate(_Merge): class Concatenate(_Merge):
r"""The layer to concatenate a sequence of inputs. """The layer to concatenate a sequence of inputs.
Examples: Examples:
...@@ -103,7 +101,7 @@ class Concatenate(_Merge): ...@@ -103,7 +101,7 @@ class Concatenate(_Merge):
class Maximum(_Merge): class Maximum(_Merge):
r"""The layer to compute the minimum of a sequence of inputs. """The layer to compute the minimum of a sequence of inputs.
Examples: Examples:
...@@ -126,7 +124,7 @@ class Maximum(_Merge): ...@@ -126,7 +124,7 @@ class Maximum(_Merge):
class Minimum(_Merge): class Minimum(_Merge):
r"""The layer to compute the minimum of a sequence of inputs. """The layer to compute the minimum of a sequence of inputs.
Examples: Examples:
...@@ -149,9 +147,7 @@ class Minimum(_Merge): ...@@ -149,9 +147,7 @@ class Minimum(_Merge):
class Multiply(_Merge): class Multiply(_Merge):
r"""The layer to multiply a sequence of inputs. """The layer to multiply a sequence of inputs.
.. math:: \text{out} = \prod(x)
Examples: Examples:
...@@ -174,9 +170,7 @@ class Multiply(_Merge): ...@@ -174,9 +170,7 @@ class Multiply(_Merge):
class Subtract(_Merge): class Subtract(_Merge):
r"""The layer to subtract two inputs. """The layer to subtract two inputs.
.. math:: \text{out} = x - y
Examples: Examples:
......
...@@ -12,7 +12,6 @@ ...@@ -12,7 +12,6 @@
# <https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/keras/regularizers.py> # <https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/keras/regularizers.py>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Built-in regularizers.""" """Built-in regularizers."""
from __future__ import absolute_import from __future__ import absolute_import
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
# <https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/array_ops.py> # <https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/array_ops.py>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The array ops."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
# <https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/bitwise_ops.py> # <https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/bitwise_ops.py>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The bitwise ops."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
...@@ -23,7 +24,7 @@ from dragon.core.ops import math_ops ...@@ -23,7 +24,7 @@ from dragon.core.ops import math_ops
def bitwise_and(x, y, name=None): def bitwise_and(x, y, name=None):
r"""Compute the element-wise AND bitwise operation. r"""Compute the element-wise AND bitwise operation.
.. math:: \text{out} = x \mathbin{\&} y .. math:: \text{out} = \text{input1} \mathbin{\&} \text{input2}
Examples: Examples:
...@@ -37,9 +38,9 @@ def bitwise_and(x, y, name=None): ...@@ -37,9 +38,9 @@ def bitwise_and(x, y, name=None):
Parameters Parameters
---------- ----------
x : dragon.Tensor x : dragon.Tensor
The tensor :math:`x`. The input1 tensor.
y : dragon.Tensor y : dragon.Tensor
The tensor :math:`y`. The input2 tensor.
name : str, optional name : str, optional
An optional name for the operation. An optional name for the operation.
...@@ -55,7 +56,7 @@ def bitwise_and(x, y, name=None): ...@@ -55,7 +56,7 @@ def bitwise_and(x, y, name=None):
def bitwise_or(x, y, name=None): def bitwise_or(x, y, name=None):
r"""Compute the element-wise OR bitwise operation. r"""Compute the element-wise OR bitwise operation.
.. math:: \text{out} = x \mathbin{|} y .. math:: \text{out} = \text{input1} \mathbin{|} \text{input2}
Examples: Examples:
...@@ -69,9 +70,9 @@ def bitwise_or(x, y, name=None): ...@@ -69,9 +70,9 @@ def bitwise_or(x, y, name=None):
Parameters Parameters
---------- ----------
x : dragon.Tensor x : dragon.Tensor
The tensor :math:`x`. The input1 tensor.
y : dragon.Tensor y : dragon.Tensor
The tensor :math:`y`. The input2 tensor.
name : str, optional name : str, optional
An optional name for the operation. An optional name for the operation.
...@@ -87,7 +88,7 @@ def bitwise_or(x, y, name=None): ...@@ -87,7 +88,7 @@ def bitwise_or(x, y, name=None):
def bitwise_xor(x, y, name=None): def bitwise_xor(x, y, name=None):
r"""Compute the element-wise XOR bitwise operation. r"""Compute the element-wise XOR bitwise operation.
.. math:: \text{out} = x \oplus y .. math:: \text{out} = \text{input1} \oplus \text{input2}
Examples: Examples:
...@@ -101,9 +102,9 @@ def bitwise_xor(x, y, name=None): ...@@ -101,9 +102,9 @@ def bitwise_xor(x, y, name=None):
Parameters Parameters
---------- ----------
x : dragon.Tensor x : dragon.Tensor
The tensor :math:`x`. The input1 tensor.
y : dragon.Tensor y : dragon.Tensor
The tensor :math:`y`. The input2 tensor.
name : str, optional name : str, optional
An optional name for the operation. An optional name for the operation.
...@@ -119,7 +120,7 @@ def bitwise_xor(x, y, name=None): ...@@ -119,7 +120,7 @@ def bitwise_xor(x, y, name=None):
def invert(x, name=None): def invert(x, name=None):
r"""Invert each bit of input. r"""Invert each bit of input.
.. math:: \text{out} = \,\,\sim x .. math:: \text{out} = \,\,\sim \text{input}
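These four operations map directly onto integer bitwise operators; a minimal NumPy sketch:

```python
import numpy as np

a = np.array([0b0011, 0b0101], dtype=np.uint8)
b = np.array([0b0101, 0b0011], dtype=np.uint8)

print(np.bitwise_and(a, b))  # [1 1]
print(np.bitwise_or(a, b))   # [7 7]
print(np.bitwise_xor(a, b))  # [6 6]
print(np.invert(a))          # [252 250], i.e. ~x on uint8
```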
Examples: Examples:
...@@ -136,7 +137,7 @@ def invert(x, name=None): ...@@ -136,7 +137,7 @@ def invert(x, name=None):
Parameters Parameters
---------- ----------
x : dragon.Tensor x : dragon.Tensor
The tensor :math:`x`. The input tensor.
name : str, optional name : str, optional
An optional name for the operation. An optional name for the operation.
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
# <https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/clip_ops.py> # <https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/clip_ops.py>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The clip ops."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Grad implementation."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
# <https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/init_ops.py> # <https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/init_ops.py>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The init ops."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
# <https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/linalg_ops.py> # <https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/linalg_ops.py>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The linalg ops."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
# <https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/math_ops.py> # <https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/math_ops.py>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The math ops."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
...@@ -25,7 +26,7 @@ from dragon.core.ops import math_ops ...@@ -25,7 +26,7 @@ from dragon.core.ops import math_ops
def abs(x, name=None): def abs(x, name=None):
r"""Compute the absolute value of input. r"""Compute the absolute value of input.
.. math:: \text{out} = \left| x \right| .. math:: \text{out} = \left| \text{input} \right|
Examples: Examples:
...@@ -52,7 +53,7 @@ def abs(x, name=None): ...@@ -52,7 +53,7 @@ def abs(x, name=None):
def add(x, y, name=None): def add(x, y, name=None):
r"""Compute the element-wise addition. r"""Compute the element-wise addition.
.. math:: \text{out} = x + y .. math:: \text{out} = \text{input1} + \text{input2}
```python ```python
x = tf.constant(1) x = tf.constant(1)
...@@ -64,9 +65,9 @@ def add(x, y, name=None): ...@@ -64,9 +65,9 @@ def add(x, y, name=None):
Parameters Parameters
---------- ----------
x : dragon.Tensor x : dragon.Tensor
The input tensor. The input1 tensor.
y : dragon.Tensor y : dragon.Tensor
The tensor :math:`y`. The input2 tensor.
name : str, optional name : str, optional
An optional name for the operation. An optional name for the operation.
...@@ -82,7 +83,7 @@ def add(x, y, name=None): ...@@ -82,7 +83,7 @@ def add(x, y, name=None):
def add_n(inputs, name=None): def add_n(inputs, name=None):
r"""Compute the element-wise sum on a sequence of inputs. r"""Compute the element-wise sum on a sequence of inputs.
.. math:: \text{out} = \sum(x) .. math:: \text{out} = \sum(\text{input}_{i})
Examples: Examples:
...@@ -214,7 +215,7 @@ def cast(x, dtype, name=None): ...@@ -214,7 +215,7 @@ def cast(x, dtype, name=None):
def ceil(x, name=None): def ceil(x, name=None):
r"""Compute the smallest integer not less than input. r"""Compute the smallest integer not less than input.
.. math:: \text{out} = \lceil x \rceil .. math:: \text{out} = \lceil \text{input} \rceil
Examples: Examples:
...@@ -242,7 +243,7 @@ def ceil(x, name=None): ...@@ -242,7 +243,7 @@ def ceil(x, name=None):
def cos(x, name=None): def cos(x, name=None):
r"""Compute the cos of input. r"""Compute the cos of input.
.. math:: \text{out} = \cos(x) .. math:: \text{out} = \cos(\text{input})
Examples: Examples:
...@@ -327,7 +328,7 @@ def cumsum(x, axis=0, exclusive=False, reverse=False, name=None): ...@@ -327,7 +328,7 @@ def cumsum(x, axis=0, exclusive=False, reverse=False, name=None):
def divide(x, y, name=None): def divide(x, y, name=None):
r"""Compute the element-wise division. r"""Compute the element-wise division.
.. math:: \text{out} = x \div y .. math:: \text{out} = \text{input1} \div \text{input2}
Examples: Examples:
...@@ -341,9 +342,9 @@ def divide(x, y, name=None): ...@@ -341,9 +342,9 @@ def divide(x, y, name=None):
Parameters Parameters
---------- ----------
x : dragon.Tensor x : dragon.Tensor
The input tensor. The input1 tensor.
y : dragon.Tensor y : dragon.Tensor
The tensor :math:`y`. The input2 tensor.
name : str, optional name : str, optional
An optional name for the operation. An optional name for the operation.
...@@ -359,7 +360,7 @@ def divide(x, y, name=None): ...@@ -359,7 +360,7 @@ def divide(x, y, name=None):
def equal(x, y, name=None): def equal(x, y, name=None):
r"""Compute the element-wise equal comparison. r"""Compute the element-wise equal comparison.
.. math:: \text{out} = (x == y) .. math:: \text{out} = (\text{input1} == \text{input2})
Examples: Examples:
...@@ -374,9 +375,9 @@ def equal(x, y, name=None): ...@@ -374,9 +375,9 @@ def equal(x, y, name=None):
Parameters Parameters
---------- ----------
x : dragon.Tensor x : dragon.Tensor
The input tensor. The input1 tensor.
y : dragon.Tensor y : dragon.Tensor
The tensor :math:`y`. The input2 tensor.
name : str, optional name : str, optional
An optional name for the operation. An optional name for the operation.
...@@ -392,7 +393,7 @@ def equal(x, y, name=None): ...@@ -392,7 +393,7 @@ def equal(x, y, name=None):
def exp(x, name=None): def exp(x, name=None):
r"""Compute the exponential of input. r"""Compute the exponential of input.
.. math:: \text{out} = e^{x} .. math:: \text{out} = \exp(\text{input})
Examples: Examples:
...@@ -420,7 +421,7 @@ def exp(x, name=None): ...@@ -420,7 +421,7 @@ def exp(x, name=None):
def floor(x, name=None): def floor(x, name=None):
r"""Compute the largest integer not greater than input. r"""Compute the largest integer not greater than input.
.. math:: \text{out} = \lfloor x \rfloor .. math:: \text{out} = \lfloor \text{input} \rfloor
Examples: Examples:
...@@ -448,7 +449,7 @@ def floor(x, name=None): ...@@ -448,7 +449,7 @@ def floor(x, name=None):
def greater(x, y, name=None): def greater(x, y, name=None):
r"""Compute the element-wise greater comparison. r"""Compute the element-wise greater comparison.
.. math:: \text{out} = (x > y) .. math:: \text{out} = (\text{input1} > \text{input2})
Examples: Examples:
...@@ -463,9 +464,9 @@ def greater(x, y, name=None): ...@@ -463,9 +464,9 @@ def greater(x, y, name=None):
Parameters Parameters
---------- ----------
x : dragon.Tensor x : dragon.Tensor
The input tensor. The input1 tensor.
y : dragon.Tensor y : dragon.Tensor
The tensor :math:`y`. The input2 tensor.
name : str, optional name : str, optional
An optional name for the operation. An optional name for the operation.
...@@ -481,7 +482,7 @@ def greater(x, y, name=None): ...@@ -481,7 +482,7 @@ def greater(x, y, name=None):
def greater_equal(x, y, name=None): def greater_equal(x, y, name=None):
r"""Compute the element-wise greater-equal comparison. r"""Compute the element-wise greater-equal comparison.
.. math:: \text{out} = (x >= y) .. math:: \text{out} = (\text{input1} >= \text{input2})
Examples: Examples:
...@@ -496,9 +497,9 @@ def greater_equal(x, y, name=None): ...@@ -496,9 +497,9 @@ def greater_equal(x, y, name=None):
Parameters Parameters
---------- ----------
x : dragon.Tensor x : dragon.Tensor
The input tensor. The input1 tensor.
y : dragon.Tensor y : dragon.Tensor
The tensor :math:`y`. The input2 tensor.
name : str, optional name : str, optional
An optional name for the operation. An optional name for the operation.
...@@ -514,7 +515,7 @@ def greater_equal(x, y, name=None): ...@@ -514,7 +515,7 @@ def greater_equal(x, y, name=None):
def is_inf(x, name=None): def is_inf(x, name=None):
r"""Check if the elements of input are infinite. r"""Check if the elements of input are infinite.
.. math:: \text{out} = \text{isinf}(x) .. math:: \text{out} = \text{isinf}(\text{input})
Examples: Examples:
...@@ -542,7 +543,7 @@ def is_inf(x, name=None): ...@@ -542,7 +543,7 @@ def is_inf(x, name=None):
def is_nan(x, name=None): def is_nan(x, name=None):
r"""Check if the elements of input are NaN. r"""Check if the elements of input are NaN.
.. math:: \text{out} = \text{isnan}(x) .. math:: \text{out} = \text{isnan}(\text{input})
Examples: Examples:
...@@ -570,7 +571,7 @@ def is_nan(x, name=None): ...@@ -570,7 +571,7 @@ def is_nan(x, name=None):
def less(x, y, name=None): def less(x, y, name=None):
r"""Compute the element-wise less comparison. r"""Compute the element-wise less comparison.
.. math:: \text{out} = (x < y) .. math:: \text{out} = (\text{input1} < \text{input2})
Examples: Examples:
...@@ -585,9 +586,9 @@ def less(x, y, name=None): ...@@ -585,9 +586,9 @@ def less(x, y, name=None):
Parameters Parameters
---------- ----------
x : dragon.Tensor x : dragon.Tensor
The input tensor. The input1 tensor.
y : dragon.Tensor y : dragon.Tensor
The tensor :math:`y`. The input2 tensor.
name : str, optional name : str, optional
An optional name for the operation. An optional name for the operation.
...@@ -603,7 +604,7 @@ def less(x, y, name=None): ...@@ -603,7 +604,7 @@ def less(x, y, name=None):
def less_equal(x, y, name=None): def less_equal(x, y, name=None):
r"""Compute the element-wise less-equal comparison. r"""Compute the element-wise less-equal comparison.
.. math:: \text{out} = (x <= y) .. math:: \text{out} = (\text{input1} <= \text{input2})
Examples: Examples:
...@@ -618,9 +619,9 @@ def less_equal(x, y, name=None): ...@@ -618,9 +619,9 @@ def less_equal(x, y, name=None):
Parameters Parameters
---------- ----------
x : dragon.Tensor x : dragon.Tensor
The input tensor. The input1 tensor.
y : dragon.Tensor y : dragon.Tensor
The tensor :math:`y`. The input2 tensor.
name : str, optional name : str, optional
An optional name for the operation. An optional name for the operation.
...@@ -636,7 +637,7 @@ def less_equal(x, y, name=None): ...@@ -636,7 +637,7 @@ def less_equal(x, y, name=None):
def log(x, name=None): def log(x, name=None):
r"""Compute the logarithm of input. r"""Compute the logarithm of input.
.. math:: \text{out} = \log(x) .. math:: \text{out} = \log(\text{input})
Examples: Examples:
...@@ -670,7 +671,7 @@ def matmul( ...@@ -670,7 +671,7 @@ def matmul(
): ):
r"""Compute the matrix multiplication. r"""Compute the matrix multiplication.
.. math:: \text{out} = a \times b .. math:: y = a \times b
The rank of ``a`` and ``b`` should be equal and >= 2: The rank of ``a`` and ``b`` should be equal and >= 2:
...@@ -725,7 +726,7 @@ def matmul( ...@@ -725,7 +726,7 @@ def matmul(
def multiply(x, y, name=None): def multiply(x, y, name=None):
r"""Compute the element-wise multiplication. r"""Compute the element-wise multiplication.
.. math:: \text{out} = x \times y .. math:: \text{out} = \text{input1} \times \text{input2}
Examples: Examples:
...@@ -739,9 +740,9 @@ def multiply(x, y, name=None): ...@@ -739,9 +740,9 @@ def multiply(x, y, name=None):
Parameters Parameters
---------- ----------
x : dragon.Tensor x : dragon.Tensor
The input tensor. The input1 tensor.
y : dragon.Tensor y : dragon.Tensor
The tensor :math:`y`. The input2 tensor.
name : str, optional name : str, optional
An optional name for the operation. An optional name for the operation.
...@@ -757,7 +758,7 @@ def multiply(x, y, name=None): ...@@ -757,7 +758,7 @@ def multiply(x, y, name=None):
def negative(x, name=None): def negative(x, name=None):
r"""Compute the element-wise negative. r"""Compute the element-wise negative.
.. math:: \text{out} = -x .. math:: \text{out} = -\text{input}
```python ```python
x = tf.constant([-1, 0, 1]) x = tf.constant([-1, 0, 1])
...@@ -783,7 +784,7 @@ def negative(x, name=None): ...@@ -783,7 +784,7 @@ def negative(x, name=None):
def not_equal(x, y, name=None): def not_equal(x, y, name=None):
r"""Compute the element-wise not-equal comparison. r"""Compute the element-wise not-equal comparison.
.. math:: \text{out} = (x != y) .. math:: \text{out} = (\text{input1} != \text{input2})
Examples: Examples:
...@@ -798,9 +799,9 @@ def not_equal(x, y, name=None): ...@@ -798,9 +799,9 @@ def not_equal(x, y, name=None):
Parameters Parameters
---------- ----------
x : dragon.Tensor x : dragon.Tensor
The input tensor. The input1 tensor.
y : dragon.Tensor y : dragon.Tensor
The tensor :math:`y`. The input2 tensor.
name : str, optional name : str, optional
An optional name for the operation. An optional name for the operation.
...@@ -816,7 +817,7 @@ def not_equal(x, y, name=None): ...@@ -816,7 +817,7 @@ def not_equal(x, y, name=None):
def pow(x, y, name=None): def pow(x, y, name=None):
r"""Compute the power of input. r"""Compute the power of input.
.. math:: \text{out} = x^{y} .. math:: \text{out} = \text{input}^{\text{exponent}}
The two inputs should be broadcast to each other: The two inputs should be broadcast to each other:
...@@ -830,9 +831,9 @@ def pow(x, y, name=None): ...@@ -830,9 +831,9 @@ def pow(x, y, name=None):
Parameters Parameters
---------- ----------
x : Union[dragon.Tensor, number] x : Union[dragon.Tensor, number]
The input tensor :math:`x`. The input tensor.
y : Union[dragon.Tensor, number] y : Union[dragon.Tensor, number]
The input tensor :math:`y`. The exponent tensor.
name : str, optional name : str, optional
An optional name for the operation. An optional name for the operation.
...@@ -897,7 +898,7 @@ def range(start, limit=None, delta=1, dtype='int64', name=None): ...@@ -897,7 +898,7 @@ def range(start, limit=None, delta=1, dtype='int64', name=None):
def reciprocal(x, name=None): def reciprocal(x, name=None):
r"""Compute the reciprocal of input. r"""Compute the reciprocal of input.
.. math:: \text{out} = \frac{1}{x} .. math:: \text{out} = \frac{1}{\text{input}}
Examples: Examples:
...@@ -1099,7 +1100,7 @@ def reduce_sum(input_tensor, axis=None, keepdims=False, name=None): ...@@ -1099,7 +1100,7 @@ def reduce_sum(input_tensor, axis=None, keepdims=False, name=None):
def round(x, name=None): def round(x, name=None):
r"""Compute the nearest integer of input. r"""Compute the nearest integer of input.
.. math:: \text{out} = \lfloor x \rceil .. math:: \text{out} = \lfloor \text{input} \rceil
Examples: Examples:
...@@ -1127,7 +1128,7 @@ def round(x, name=None): ...@@ -1127,7 +1128,7 @@ def round(x, name=None):
def rsqrt(x, name=None): def rsqrt(x, name=None):
r"""Compute the reciprocal square root of input. r"""Compute the reciprocal square root of input.
.. math:: \text{out} = \frac{1}{\sqrt{x}} .. math:: \text{out} = \frac{1}{\sqrt{\text{input}}}
Examples: Examples:
...@@ -1153,11 +1154,9 @@ def rsqrt(x, name=None): ...@@ -1153,11 +1154,9 @@ def rsqrt(x, name=None):
def sigmoid(x, name=None, **kwargs): def sigmoid(x, name=None, **kwargs):
r"""Apply the sigmoid function. r"""Compute the sigmoid function.
The **Sigmoid** function is defined as: .. math:: \text{out} = \frac{1}{1 + \exp(-\text{input})}
.. math:: \text{Sigmoid}(x) = \frac{1}{1 + e^{-x}}
Examples: Examples:
...@@ -1186,11 +1185,11 @@ def sign(x, name=None): ...@@ -1186,11 +1185,11 @@ def sign(x, name=None):
r"""Compute the sign indication of input. r"""Compute the sign indication of input.
.. math:: .. math::
\text{out}_{i} = \text{out}[i] =
\begin{cases} \begin{cases}
-1, & \text{ if } x_{i} < 0 \\ -1, & \text{ if } \text{input}[i] < 0 \\
0, & \text{ if } x_{i} = 0 \\ 0, & \text{ if } \text{input}[i] = 0 \\
1, & \text{ if } x_{i} > 0 1, & \text{ if } \text{input}[i] > 0
\end{cases} \end{cases}
Examples: Examples:
...@@ -1219,7 +1218,7 @@ def sign(x, name=None): ...@@ -1219,7 +1218,7 @@ def sign(x, name=None):
def sin(x, name=None): def sin(x, name=None):
r"""Compute the sin of input. r"""Compute the sin of input.
.. math:: \text{out} = \sin(x) .. math:: \text{out} = \sin(\text{input})
Examples: Examples:
...@@ -1247,7 +1246,7 @@ def sin(x, name=None): ...@@ -1247,7 +1246,7 @@ def sin(x, name=None):
def sqrt(x, name=None): def sqrt(x, name=None):
r"""Compute the square root of input. r"""Compute the square root of input.
.. math:: \text{out} = \sqrt{x} .. math:: \text{out} = \sqrt{\text{input}}
Examples: Examples:
...@@ -1275,7 +1274,7 @@ def sqrt(x, name=None): ...@@ -1275,7 +1274,7 @@ def sqrt(x, name=None):
def square(x, name=None): def square(x, name=None):
r"""Compute the square of input. r"""Compute the square of input.
.. math:: \text{out} = x^{2} .. math:: \text{out} = \text{input}^{2}
Examples: Examples:
...@@ -1303,7 +1302,7 @@ def square(x, name=None): ...@@ -1303,7 +1302,7 @@ def square(x, name=None):
def subtract(x, y, name=None): def subtract(x, y, name=None):
r"""Compute the element-wise subtraction. r"""Compute the element-wise subtraction.
.. math:: \text{out} = x - y .. math:: \text{out} = \text{input1} - \text{input2}
Examples: Examples:
...@@ -1317,9 +1316,9 @@ def subtract(x, y, name=None): ...@@ -1317,9 +1316,9 @@ def subtract(x, y, name=None):
Parameters Parameters
---------- ----------
x : dragon.Tensor x : dragon.Tensor
The input tensor. The input1 tensor.
y : dragon.Tensor y : dragon.Tensor
The tensor :math:`y`. The input2 tensor.
name : str, optional name : str, optional
An optional name for the operation. An optional name for the operation.
...@@ -1333,11 +1332,10 @@ def subtract(x, y, name=None): ...@@ -1333,11 +1332,10 @@ def subtract(x, y, name=None):
def tanh(x, name=None, **kwargs): def tanh(x, name=None, **kwargs):
r"""Apply the tanh function. r"""Compute the tanh of input.
The **Tanh** function is defined as:
.. math:: \text{Tanh}(x) = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}} .. math:: \text{out} = \frac{\exp(\text{input}) - \exp(-\text{input})}
{\exp(\text{input}) + \exp(-\text{input})}
Examples: Examples:
......
...@@ -9,6 +9,8 @@ ...@@ -9,6 +9,8 @@
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The nn components."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The nn ops implementation."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
...@@ -140,8 +141,8 @@ def moments(x, axes=None, keepdims=False, name=None): ...@@ -140,8 +141,8 @@ def moments(x, axes=None, keepdims=False, name=None):
.. math:: .. math::
\begin{cases} \begin{cases}
\text{Mean}(x) = \frac{1}{n}\sum(x) \\ \text{mean} = \frac{1}{n}\sum(\text{input}) \\
\text{Variance}(x) = \frac{1}{n}\sum(x - \text{Mean}(x))^{2} \text{variance} = \frac{1}{n}\sum(\text{input} - \text{mean})^{2}
\end{cases} \end{cases}
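A minimal NumPy sketch of these two statistics (biased variance, i.e. divided by n):

```python
import numpy as np

def moments(x, axes=None, keepdims=False):
    """Return the (mean, variance) of x over the given axes."""
    x = np.asarray(x, dtype=np.float64)
    mean = x.mean(axis=axes, keepdims=keepdims)
    variance = ((x - x.mean(axis=axes, keepdims=True)) ** 2).mean(axis=axes, keepdims=keepdims)
    return mean, variance

m, v = moments([1.0, 2.0, 3.0, 4.0])
print(m, v)  # 2.5 1.25
```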
The argument ``axis`` could be negative or **None**: The argument ``axis`` could be negative or **None**:
...@@ -164,7 +165,7 @@ def moments(x, axes=None, keepdims=False, name=None): ...@@ -164,7 +165,7 @@ def moments(x, axes=None, keepdims=False, name=None):
Parameters Parameters
---------- ----------
x : dragon.Tensor x : dragon.Tensor
The tensor :math:`x`. The input tensor.
axes : Union[int, Sequence[int]], optional axes : Union[int, Sequence[int]], optional
The axis to reduce. The axis to reduce.
keepdims : bool, optional, default=False keepdims : bool, optional, default=False
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The nn ops."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
...@@ -147,17 +148,7 @@ def convolution( ...@@ -147,17 +148,7 @@ def convolution(
name=None, name=None,
**kwargs **kwargs
): ):
r"""Apply the n-dimension convolution. """Apply the n-dimension convolution.
The spatial output dimension is computed as:
.. math::
\begin{cases}
\text{DK}_{size} = dilation *
(\text{K}_{size} - 1) + 1 \\
\text{Dim}_{out} = (\text{Dim}_{in} +
2 * pad - \text{DK}_{size}) / stride + 1
\end{cases}
Parameters Parameters
---------- ----------
...@@ -223,17 +214,7 @@ def conv_transpose( ...@@ -223,17 +214,7 @@ def conv_transpose(
dilations=None, dilations=None,
name=None, name=None,
): ):
r"""Apply the n-dimension deconvolution. """Apply the n-dimension deconvolution.
The spatial output dimension is computed as:
.. math::
\begin{cases}
\text{DK}_{size} = dilation *
(\text{K}_{size} - 1) + 1 \\
\text{Dim}_{out} = (\text{Dim}_{in} - 1) *
stride + \text{DK}_{size} - 2 * pad
\end{cases}
Parameters Parameters
---------- ----------
...@@ -306,17 +287,7 @@ def conv2d( ...@@ -306,17 +287,7 @@ def conv2d(
dilations=None, dilations=None,
name=None, name=None,
): ):
r"""Apply the 2d convolution. """Apply the 2d convolution.
The spatial output dimension is computed as:
.. math::
\begin{cases}
\text{DK}_{size} = dilation *
(\text{K}_{size} - 1) + 1 \\
\text{Dim}_{out} = (\text{Dim}_{in} +
2 * pad - \text{DK}_{size}) / stride + 1
\end{cases}
Parameters Parameters
---------- ----------
...@@ -354,17 +325,7 @@ def conv2d_transpose( ...@@ -354,17 +325,7 @@ def conv2d_transpose(
dilations=None, dilations=None,
name=None, name=None,
): ):
r"""Apply the 2d deconvolution. """Apply the 2d deconvolution.
The spatial output dimension is computed as:
.. math::
\begin{cases}
\text{DK}_{size} = dilation *
(\text{K}_{size} - 1) + 1 \\
\text{Dim}_{out} = (\text{Dim}_{in} - 1) *
stride + \text{DK}_{size} - 2 * pad
\end{cases}
Parameters Parameters
---------- ----------
...@@ -403,19 +364,9 @@ def depthwise_conv2d( ...@@ -403,19 +364,9 @@ def depthwise_conv2d(
dilations=None, dilations=None,
name=None, name=None,
): ):
r"""Apply the 2d depthwise convolution. """Apply the 2d depthwise convolution.
`[Chollet, 2016] <https://arxiv.org/abs/1610.02357>`_. `[Chollet, 2016] <https://arxiv.org/abs/1610.02357>`_.
The spatial output dimension is computed as:
.. math::
\begin{cases}
\text{DK}_{size} = dilation *
(\text{K}_{size} - 1) + 1 \\
\text{Dim}_{out} = (\text{Dim}_{in} +
2 * pad - \text{DK}_{size}) / stride + 1
\end{cases}
Parameters Parameters
---------- ----------
input : dragon.Tensor input : dragon.Tensor
...@@ -483,10 +434,10 @@ def elu(features, alpha=1., name=None, **kwargs): ...@@ -483,10 +434,10 @@ def elu(features, alpha=1., name=None, **kwargs):
.. math:: .. math::
\text{ELU}(x) = \text{ELU}(x) =
\begin{cases} \begin{cases}
x, & \text{ if } x \geq 0 \\ x, & \text{ if } x \geq 0 \\
\alpha * (e^{x} - 1), & \text{ otherwise } \alpha * (\exp(x) - 1), & \text{ otherwise }
\end{cases} \end{cases}
Parameters Parameters
---------- ----------
...@@ -517,10 +468,10 @@ def leaky_relu(features, alpha=0.2, name=None, **kwargs): ...@@ -517,10 +468,10 @@ def leaky_relu(features, alpha=0.2, name=None, **kwargs):
.. math:: .. math::
\text{LeakyReLU}(x) = \text{LeakyReLU}(x) =
\begin{cases} \begin{cases}
x, & \text{ if } x \geq 0 \\ x, & \text{ if } x \geq 0 \\
\alpha * x, & \text{ otherwise } \alpha * x, & \text{ otherwise }
\end{cases} \end{cases}
Parameters Parameters
---------- ----------
...@@ -598,7 +549,7 @@ def log_softmax(logits, axis=-1, name=None): ...@@ -598,7 +549,7 @@ def log_softmax(logits, axis=-1, name=None):
The **LogSoftmax** function is defined as: The **LogSoftmax** function is defined as:
.. math:: \text{LogSoftmax}(x) = \log(\frac{e^{x_{i}}}{\sum e^{x_{j}}}) .. math:: \text{LogSoftmax}(x_{i}) = \log(\frac{\exp(x_{i})}{\sum_{j} \exp(x_{j})})
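A minimal NumPy sketch, using the usual max shift so the exponentials stay finite:

```python
import numpy as np

def log_softmax(x, axis=-1):
    """log(exp(x_i) / sum_j exp(x_j)), computed along the given axis."""
    x = np.asarray(x, dtype=np.float64)
    z = x - x.max(axis=axis, keepdims=True)
    return z - np.log(np.exp(z).sum(axis=axis, keepdims=True))

print(log_softmax([1.0, 2.0, 3.0]))  # approx. [-2.408 -1.408 -0.408]
```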
The argument ``axis`` could be negative: The argument ``axis`` could be negative:
...@@ -795,10 +746,10 @@ def selu(features, name=None, **kwargs): ...@@ -795,10 +746,10 @@ def selu(features, name=None, **kwargs):
.. math:: .. math::
\text{SELU}(x) = 1.0507 * \text{SELU}(x) = 1.0507 *
\begin{cases} \begin{cases}
x, & \text{ if } x \geq 0 \\ x, & \text{ if } x \geq 0 \\
1.67326 * (e^{x} - 1), & \text{ otherwise } 1.67326 * (\exp(x) - 1), & \text{ otherwise }
\end{cases} \end{cases}
Examples: Examples:
...@@ -836,7 +787,7 @@ def softmax(logits, axis=-1, name=None, **kwargs): ...@@ -836,7 +787,7 @@ def softmax(logits, axis=-1, name=None, **kwargs):
The **Softmax** function is defined as: The **Softmax** function is defined as:
.. math:: \text{Softmax}(x) = \frac{e^{x_{i}}}{\sum e^{x_{j}}} .. math:: \text{Softmax}(x_{i}) = \frac{\exp(x_{i})}{\sum_{j} \exp(x_{j})}
The argument ``axis`` could be negative: The argument ``axis`` could be negative:
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The random ops."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
...@@ -26,9 +27,7 @@ def random_normal( ...@@ -26,9 +27,7 @@ def random_normal(
): ):
r"""Return a tensor initialized from normal distribution. r"""Return a tensor initialized from normal distribution.
The **Normal** distribution is defined as: .. math:: \text{out} \sim \mathcal{N}(\mu, \sigma)
.. math:: X \sim N(\mu, \sigma)
Parameters Parameters
---------- ----------
...@@ -65,9 +64,7 @@ def random_uniform( ...@@ -65,9 +64,7 @@ def random_uniform(
): ):
r"""Return a tensor initialized from the uniform distribution. r"""Return a tensor initialized from the uniform distribution.
The **Uniform** distribution is defined as: .. math:: \text{out} \sim \mathcal{U}(\alpha, \beta)
.. math:: X \sim U(\alpha, \beta)
Parameters Parameters
---------- ----------
...@@ -104,10 +101,8 @@ def truncated_normal( ...@@ -104,10 +101,8 @@ def truncated_normal(
): ):
r"""Return a tensor initialized from the truncated normal distribution. r"""Return a tensor initialized from the truncated normal distribution.
The **TruncatedNormal** distribution is defined as:
.. math:: .. math::
X \sim TN(\mu, \sigma, \mu - 2\sigma, \mu + 2\sigma) \text{out} \sim \mathcal{TN}(\mu, \sigma, \mu - 2\sigma, \mu + 2\sigma)
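A minimal NumPy sketch of this truncation, assuming values outside μ ± 2σ are simply redrawn (one possible realization, not necessarily how the library samples):

```python
import numpy as np

def truncated_normal(shape, mean=0.0, std=1.0, seed=0):
    """Draw from N(mean, std) and redraw anything beyond two standard deviations."""
    rng = np.random.default_rng(seed)
    out = rng.normal(mean, std, size=shape)
    while True:
        bad = np.abs(out - mean) > 2.0 * std
        if not bad.any():
            return out
        out[bad] = rng.normal(mean, std, size=int(bad.sum()))

samples = truncated_normal((10000,), mean=0.0, std=2.0)
print(samples.min() >= -4.0 and samples.max() <= 4.0)  # True
```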
Parameters Parameters
---------- ----------
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The standard ops."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The Variable class."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
......
...@@ -9,7 +9,6 @@ ...@@ -9,7 +9,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Deep learning and Reinforcement learning library for Researchers and Engineers""" """Deep learning and Reinforcement learning library for Researchers and Engineers"""
from __future__ import absolute_import as _absolute_import from __future__ import absolute_import as _absolute_import
......
...@@ -25,10 +25,10 @@ def leaky_relu(x, alpha=0.2, name="leaky_relu", **kwargs): ...@@ -25,10 +25,10 @@ def leaky_relu(x, alpha=0.2, name="leaky_relu", **kwargs):
.. math:: .. math::
\text{LeakyReLU}(x) = \text{LeakyReLU}(x) =
\begin{cases} \begin{cases}
x, & \text{ if } x \geq 0 \\ x, & \text{ if } x \geq 0 \\
\alpha * x, & \text{ otherwise } \alpha * x, & \text{ otherwise }
\end{cases} \end{cases}
Examples: Examples:
......
...@@ -9,7 +9,6 @@ ...@@ -9,7 +9,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""ResNet for ImageNet. """ResNet for ImageNet.
# Reference: # Reference:
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon
import unittest
from dragon.core.framework import config
from dragon.core.testing.unittest.common_utils import run_tests
from dragon.core.testing.unittest.common_utils import TEST_CUDA
class TestCUDA(unittest.TestCase):
"""Test the cuda utilities."""
@unittest.skipIf(not TEST_CUDA, 'CUDA unavailable')
def test_stream(self):
stream = dragon.cuda.Stream(device_index=0)
self.assertGreater(stream.ptr, 0)
stream.synchronize()
dragon.cuda.synchronize()
@unittest.skipIf(not TEST_CUDA, 'CUDA unavailable')
def test_device(self):
major, minor = dragon.cuda.get_device_capability(0)
self.assertGreaterEqual(major, 1)
self.assertGreaterEqual(minor, 0)
dragon.cuda.set_device(0)
self.assertEqual(dragon.cuda.current_device(), 0)
dragon.cuda.set_default_device(1)
self.assertEqual(config.config().device_type, 'cuda')
self.assertEqual(config.config().device_index, 1)
dragon.cuda.set_default_device(-1)
self.assertEqual(config.config().device_type, 'cpu')
self.assertEqual(config.config().device_index, 0)
if __name__ == '__main__':
run_tests()
...@@ -24,6 +24,41 @@ from dragon.core.testing.unittest.common_utils import run_tests ...@@ -24,6 +24,41 @@ from dragon.core.testing.unittest.common_utils import run_tests
from dragon.core.testing.unittest.common_utils import TEST_CUDA from dragon.core.testing.unittest.common_utils import TEST_CUDA
class TestGradientTape(unittest.TestCase):
"""Test the gradient tape."""
def test_pop_push(self):
with dragon.GradientTape() as tape:
tape.reset()
try:
tape._pop_tape()
except ValueError:
pass
try:
with tape.stop_recording():
pass
except ValueError:
pass
tape._push_tape()
with tape.stop_recording():
tape._tape = None
try:
tape.watch(self)
except RuntimeError:
pass
self.assertEqual(tape._recording, False)
try:
tape._tape = None
with tape.stop_recording():
pass
except ValueError:
pass
try:
tape._push_tape()
except ValueError:
pass
class TestTensor(unittest.TestCase): class TestTensor(unittest.TestCase):
"""Test the tensor class.""" """Test the tensor class."""
......
...@@ -19,6 +19,7 @@ import subprocess ...@@ -19,6 +19,7 @@ import subprocess
import argparse import argparse
TESTS_AND_SOURCES = [ TESTS_AND_SOURCES = [
('dragon/core/test_device', 'dragon.core'),
('dragon/core/test_framework', 'dragon.core'), ('dragon/core/test_framework', 'dragon.core'),
('dragon/core/test_ops', 'dragon.core'), ('dragon/core/test_ops', 'dragon.core'),
] ]
......
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Do back-propagation from the executed functions.""" """Do back-propagation from the executed functions."""
from __future__ import absolute_import from __future__ import absolute_import
......
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Execute tensor operations. """ """Execute tensor operations. """
from __future__ import absolute_import from __future__ import absolute_import
......
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Recording tape utilities.""" """Recording tape utilities."""
from __future__ import absolute_import from __future__ import absolute_import
......
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""The List of nn components.""" """The List of nn components."""
from __future__ import absolute_import as _absolute_import from __future__ import absolute_import as _absolute_import
......
...@@ -562,10 +562,10 @@ def elu(input, alpha=1., inplace=False): ...@@ -562,10 +562,10 @@ def elu(input, alpha=1., inplace=False):
.. math:: .. math::
\text{ELU}(x) = \text{ELU}(x) =
\begin{cases} \begin{cases}
x, & \text{ if } x \geq 0 \\ x, & \text{ if } x \geq 0 \\
alpha * (e^{x} - 1), & \text{ otherwise } \alpha * (\exp(x) - 1), & \text{ otherwise }
\end{cases} \end{cases}
See Also See Also
-------- --------
...@@ -576,7 +576,7 @@ def elu(input, alpha=1., inplace=False): ...@@ -576,7 +576,7 @@ def elu(input, alpha=1., inplace=False):
input : dragon.vm.torch.Tensor input : dragon.vm.torch.Tensor
The input tensor. The input tensor.
alpha : float, optional, default=1. alpha : float, optional, default=1.
The value of alpha. The value to :math:`\alpha`.
inplace : bool, optional, default=False inplace : bool, optional, default=False
Whether to do the operation in-place. Whether to do the operation in-place.
...@@ -743,10 +743,10 @@ def leaky_relu(input, negative_slope=0.01, inplace=False): ...@@ -743,10 +743,10 @@ def leaky_relu(input, negative_slope=0.01, inplace=False):
.. math:: .. math::
\text{LeakyReLU}(x) = \text{LeakyReLU}(x) =
\begin{cases} \begin{cases}
x, & \text{ if } x \geq 0 \\ x, & \text{ if } x \geq 0 \\
slope * x, & \text{ otherwise } slope * x, & \text{ otherwise }
\end{cases} \end{cases}
See Also See Also
-------- --------
...@@ -849,7 +849,7 @@ def log_softmax(input, dim): ...@@ -849,7 +849,7 @@ def log_softmax(input, dim):
The **LogSoftmax** function is defined as: The **LogSoftmax** function is defined as:
.. math:: \text{LogSoftmax}(x) = \log(\frac{e^{x_{i}}}{\sum e^{x_{j}}}) .. math:: \text{LogSoftmax}(x_{i}) = \log(\frac{\exp(x_{i})}{\sum_{j} \exp(x_{j})})
Parameters Parameters
---------- ----------
...@@ -1146,10 +1146,10 @@ def relu(input, inplace=False): ...@@ -1146,10 +1146,10 @@ def relu(input, inplace=False):
.. math:: .. math::
\text{ReLU}(x) = \text{ReLU}(x) =
\begin{cases} \begin{cases}
x, & \text{ if } x \geq 0 \\ x, & \text{ if } x \geq 0 \\
0, & \text{ otherwise } 0, & \text{ otherwise }
\end{cases} \end{cases}
Parameters Parameters
---------- ----------
...@@ -1175,10 +1175,10 @@ def relu6(input, inplace=False): ...@@ -1175,10 +1175,10 @@ def relu6(input, inplace=False):
.. math:: .. math::
\text{ReLU-6}(x) = \text{ReLU-6}(x) =
\begin{cases} \begin{cases}
\min(x, 6), & \text{ if } x \geq 0 \\ \min(x, 6), & \text{ if } x \geq 0 \\
0, & \text{ otherwise } 0, & \text{ otherwise }
\end{cases} \end{cases}
Parameters Parameters
---------- ----------
...@@ -1207,10 +1207,10 @@ def selu(input, inplace=False): ...@@ -1207,10 +1207,10 @@ def selu(input, inplace=False):
.. math:: .. math::
\text{SELU}(x) = 1.0507 * \text{SELU}(x) = 1.0507 *
\begin{cases} \begin{cases}
x, & \text{ if } x \geq 0 \\ x, & \text{ if } x \geq 0 \\
1.67326 * (e^{x} - 1), & \text{ otherwise } 1.67326 * (\exp(x) - 1), & \text{ otherwise }
\end{cases} \end{cases}
Parameters Parameters
---------- ----------
...@@ -1233,7 +1233,7 @@ def sigmoid(input, inplace=False): ...@@ -1233,7 +1233,7 @@ def sigmoid(input, inplace=False):
The **Sigmoid** function is defined as: The **Sigmoid** function is defined as:
.. math:: \text{Sigmoid}(x) = \frac{1}{1 + e^{-x}} .. math:: \text{Sigmoid}(x) = \frac{1}{1 + \exp(-x)}
Parameters Parameters
---------- ----------
...@@ -1325,10 +1325,10 @@ def smooth_l1_loss( ...@@ -1325,10 +1325,10 @@ def smooth_l1_loss(
.. math:: .. math::
\text{SmoothL1Loss}(x, y) = \text{SmoothL1Loss}(x, y) =
\begin{cases} \begin{cases}
0.5 * (x - y)^{2} / beta, & \text{ if } |x - y| < beta \\ 0.5 * (x - y)^{2} / beta, & \text{ if } |x - y| < beta \\
|x - y| - 0.5 * beta, & \text{ otherwise } |x - y| - 0.5 * beta, & \text{ otherwise }
\end{cases} \end{cases}
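A minimal NumPy sketch of this piecewise loss, element-wise and before any reduction, with ``beta`` as above:

```python
import numpy as np

def smooth_l1(x, y, beta=1.0):
    """0.5 * d**2 / beta where d = |x - y| < beta, else d - 0.5 * beta."""
    d = np.abs(np.asarray(x, dtype=np.float64) - np.asarray(y, dtype=np.float64))
    return np.where(d < beta, 0.5 * d ** 2 / beta, d - 0.5 * beta)

print(smooth_l1([0.0, 0.0], [0.5, 2.0]))  # [0.125 1.5]
```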
Parameters Parameters
---------- ----------
...@@ -1368,7 +1368,7 @@ def softmax(input, dim, inplace=False): ...@@ -1368,7 +1368,7 @@ def softmax(input, dim, inplace=False):
The **Softmax** function is defined as: The **Softmax** function is defined as:
.. math:: \text{Softmax}(x) = \frac{e^{x_{i}}}{\sum e^{x_{j}}} .. math:: \text{Softmax}(x_{i}) = \frac{\exp(x_{i})}{\sum_{j} \exp(x_{j})}
Parameters Parameters
---------- ----------
...@@ -1513,7 +1513,7 @@ def tanh(input, inplace=False): ...@@ -1513,7 +1513,7 @@ def tanh(input, inplace=False):
The **Tanh** function is defined as: The **Tanh** function is defined as:
.. math:: \text{Tanh}(x) = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}} .. math:: \text{Tanh}(x) = \frac{\exp(x) - \exp(-x)}{\exp(x) + \exp(-x)}
Parameters Parameters
---------- ----------
......
...@@ -28,10 +28,10 @@ class ELU(Module): ...@@ -28,10 +28,10 @@ class ELU(Module):
.. math:: .. math::
\text{ELU}(x) = \text{ELU}(x) =
\begin{cases} \begin{cases}
x, & \text{ if } x \geq 0 \\ x, & \text{ if } x \geq 0 \\
alpha * (e^{x} - 1), & \text{ otherwise } \alpha * (\exp(x) - 1), & \text{ otherwise }
\end{cases} \end{cases}
Examples: Examples:
...@@ -130,10 +130,10 @@ class LeakyReLU(Module): ...@@ -130,10 +130,10 @@ class LeakyReLU(Module):
.. math:: .. math::
\text{LeakyReLU}(x) = \text{LeakyReLU}(x) =
\begin{cases} \begin{cases}
x, & \text{ if } x \geq 0 \\ x, & \text{ if } x \geq 0 \\
slope * x, & \text{ otherwise } slope * x, & \text{ otherwise }
\end{cases} \end{cases}
Examples: Examples:
...@@ -177,7 +177,7 @@ class LogSoftmax(Module): ...@@ -177,7 +177,7 @@ class LogSoftmax(Module):
The **LogSoftmax** function is defined as: The **LogSoftmax** function is defined as:
.. math:: \text{LogSoftmax}(x) = \log(\frac{e^{x_{i}}}{\sum e^{x_{j}}}) .. math:: \text{LogSoftmax}(x_{i}) = \log(\frac{\exp(x_{i})}{\sum_{j} \exp(x_{j})})
Examples: Examples:
...@@ -215,10 +215,10 @@ class PReLU(Module): ...@@ -215,10 +215,10 @@ class PReLU(Module):
.. math:: .. math::
\text{PReLU}(x) = \text{PReLU}(x) =
\begin{cases} \begin{cases}
x, & \text{ if } x \geq 0 \\ x, & \text{ if } x \geq 0 \\
weight * x, & \text{ otherwise } weight * x, & \text{ otherwise }
\end{cases} \end{cases}
Examples: Examples:
...@@ -264,10 +264,10 @@ class ReLU(Module): ...@@ -264,10 +264,10 @@ class ReLU(Module):
.. math:: .. math::
\text{ReLU}(x) = \text{ReLU}(x) =
\begin{cases} \begin{cases}
x, & \text{ if } x \geq 0 \\ x, & \text{ if } x \geq 0 \\
0, & \text{ otherwise } 0, & \text{ otherwise }
\end{cases} \end{cases}
Examples: Examples:
...@@ -350,10 +350,10 @@ class SELU(Module): ...@@ -350,10 +350,10 @@ class SELU(Module):
.. math:: .. math::
\text{SELU}(x) = 1.0507 * \text{SELU}(x) = 1.0507 *
\begin{cases} \begin{cases}
x, & \text{ if } x \geq 0 \\ x, & \text{ if } x \geq 0 \\
1.67326 * (e^{x} - 1), & \text{ otherwise } 1.67326 * (\exp(x) - 1), & \text{ otherwise }
\end{cases} \end{cases}
Examples: Examples:
...@@ -390,7 +390,7 @@ class Sigmoid(Module): ...@@ -390,7 +390,7 @@ class Sigmoid(Module):
The **Sigmoid** function is defined as: The **Sigmoid** function is defined as:
.. math:: \text{Sigmoid}(x) = \frac{1}{1 + e^{-x}} .. math:: \text{Sigmoid}(x) = \frac{1}{1 + \exp(-x)}
Examples: Examples:
...@@ -427,7 +427,7 @@ class Softmax(Module): ...@@ -427,7 +427,7 @@ class Softmax(Module):
The **Softmax** function is defined as: The **Softmax** function is defined as:
.. math:: \text{Softmax}(x) = \frac{e^{x_{i}}}{\sum e^{x_{j}}} .. math:: \text{Softmax}(x_{i}) = \frac{\exp(x_{i})}{\sum_{j} \exp(x_{j})}
Examples: Examples:
...@@ -469,7 +469,7 @@ class Tanh(Module): ...@@ -469,7 +469,7 @@ class Tanh(Module):
The **Tanh** function is defined as: The **Tanh** function is defined as:
.. math:: \text{Tanh}(x) = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}} .. math:: \text{Tanh}(x) = \frac{\exp(x) - \exp(-x)}{\exp(x) + \exp(-x)}
Examples: Examples:
......
...@@ -24,10 +24,11 @@ class DropBlock2d(Module): ...@@ -24,10 +24,11 @@ class DropBlock2d(Module):
The **DropBlock** function is defined as: The **DropBlock** function is defined as:
.. math:: .. math::
\text{DropBlock}(x) = x \cdot \text{Bernoulli}(\alpha\cdot\gamma) \\ \text{DropBlock}(x_{ijk}) =
\quad \\ \text{where}\quad \gamma = x_{ijk} * (r_{ik} \sim \mathcal{B}(1, \alpha\gamma)) \\ \quad \\
\frac{keep\_prob}{block\_size^{n}} \text{where}\quad \gamma =
\frac{feat\_size^{n}}{(feat\_size - block\_size + 1)^n} \frac{\text{keep\_prob}}{\text{block\_size}^{n}}
\frac{\text{feat\_size}^{n}}{(\text{feat\_size} - \text{block\_size} + 1)^{n}}
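A quick arithmetic check of the γ term, following the formula as written above with illustrative numbers (keep_prob 0.9, 7×7 feature map, 3×3 block, n = 2):

```python
# gamma = keep_prob / block_size**n * feat_size**n / (feat_size - block_size + 1)**n
keep_prob, block_size, feat_size, n = 0.9, 3, 7, 2
gamma = (keep_prob / block_size ** n) * (feat_size ** n / (feat_size - block_size + 1) ** n)
print(round(gamma, 4))  # 0.196
```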
Examples: Examples:
...@@ -94,7 +95,7 @@ class Dropout(Module): ...@@ -94,7 +95,7 @@ class Dropout(Module):
The **Dropout** function is defined as: The **Dropout** function is defined as:
.. math:: \text{Dropout}(x) = x * \text{Bernoulli}(p=1 - prob) .. math:: \text{Dropout}(x) = x * (r \sim \mathcal{B}(1, 1 - \text{prob}))
Examples: Examples:
...@@ -140,7 +141,7 @@ class DropPath(Module): ...@@ -140,7 +141,7 @@ class DropPath(Module):
The **DropPath** function is defined as: The **DropPath** function is defined as:
.. math:: \text{DropPath}(x) = x * \text{Bernoulli}(p=1 - prob) .. math:: \text{DropPath}(x_{ij}) = x_{ij} * (r_{i} \sim \mathcal{B}(1, 1 - \text{prob}))
Examples: Examples:
......
...@@ -379,10 +379,10 @@ class SmoothL1Loss(_Loss): ...@@ -379,10 +379,10 @@ class SmoothL1Loss(_Loss):
.. math:: .. math::
\text{SmoothL1Loss}(x, y) = \text{SmoothL1Loss}(x, y) =
\begin{cases} \begin{cases}
0.5 * (x - y)^{2} / beta, & \text{ if } |x - y| < beta \\ 0.5 * (x - y)^{2} / beta, & \text{ if } |x - y| < beta \\
|x - y| - 0.5 * beta, & \text{ otherwise } |x - y| - 0.5 * beta, & \text{ otherwise }
\end{cases} \end{cases}
Examples: Examples:
......
...@@ -655,10 +655,10 @@ def one_hot(input, depth): ...@@ -655,10 +655,10 @@ def one_hot(input, depth):
.. math:: .. math::
\text{out}[i][j] = \text{out}[i][j] =
\begin{cases} \begin{cases}
\text{Val}_{off}, & \text{ if } \text{input}[i] \neq j \\ 0, & \text{ if } \text{input}[i] \neq j \\
\text{Val}_{on}, & \text{ otherwise } 1, & \text{ otherwise }
\end{cases} \end{cases}
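A minimal NumPy sketch of this encoding, assuming the indices lie in [0, depth):

```python
import numpy as np

def one_hot(indices, depth):
    """out[i][j] = 1 if indices[i] == j else 0."""
    indices = np.asarray(indices, dtype=np.int64)
    return (indices[:, None] == np.arange(depth)).astype(np.int64)

print(one_hot([0, 2, 1], depth=3))
# [[1 0 0]
#  [0 0 1]
#  [0 1 0]]
```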
Parameters Parameters
---------- ----------
...@@ -1007,10 +1007,10 @@ def where(condition, x, y): ...@@ -1007,10 +1007,10 @@ def where(condition, x, y):
.. math:: .. math::
\text{out}[i] = \text{out}[i] =
\begin{cases} \begin{cases}
\text{x}[i], & \text{ if } \text{cond}[i] \text{ is True } \\ \text{x}[i], & \text{ if } \text{cond}[i] \text{ is True } \\
\text{y}[i], & \text{ otherwise } \text{y}[i], & \text{ otherwise }
\end{cases} \end{cases}
Parameters Parameters
---------- ----------
......
...@@ -74,17 +74,17 @@ def axpby(input, alpha=1., beta=1., out=None): ...@@ -74,17 +74,17 @@ def axpby(input, alpha=1., beta=1., out=None):
).apply(input, out) ).apply(input, out)
def add(input, value, out=None): def add(input, other, out=None):
r"""Compute the element-wise addition. r"""Compute the element-wise addition.
.. math:: \text{out} = \text{input} + \text{value} .. math:: \text{out} = \text{input} + \text{other}
Parameters Parameters
---------- ----------
input : dragon.vm.torch.Tensor input : dragon.vm.torch.Tensor
The input tensor. The input tensor.
value : Union[dragon.vm.torch.Tensor, number] other : Union[dragon.vm.torch.Tensor, number]
The value to add. The tensor to add.
out : dragon.vm.torch.Tensor, optional out : dragon.vm.torch.Tensor, optional
The optional output tensor. The optional output tensor.
...@@ -94,13 +94,13 @@ def add(input, value, out=None): ...@@ -94,13 +94,13 @@ def add(input, value, out=None):
The output tensor. The output tensor.
""" """
return _binary_func(input, value, 'Add', out) return _binary_func(input, other, 'Add', out)
def bitwise_not(input, out=None): def bitwise_not(input, out=None):
r"""Compute the element-wise NOT bitwise operation. r"""Compute the element-wise NOT bitwise operation.
.. math:: \text{out} = \,\,\sim x .. math:: \text{out} = \,\,\sim \text{input}
Examples: Examples:
...@@ -117,7 +117,7 @@ def bitwise_not(input, out=None): ...@@ -117,7 +117,7 @@ def bitwise_not(input, out=None):
Parameters Parameters
---------- ----------
input : dragon.vm.torch.Tensor input : dragon.vm.torch.Tensor
The tensor :math:`x`. The input tensor.
out : dragon.vm.torch.Tensor, optional out : dragon.vm.torch.Tensor, optional
The optional output tensor. The optional output tensor.
...@@ -165,7 +165,7 @@ def bitwise_xor(input, other, out=None): ...@@ -165,7 +165,7 @@ def bitwise_xor(input, other, out=None):
def ceil(input, out=None): def ceil(input, out=None):
r"""Compute the smallest integer not less than input. r"""Compute the smallest integer not less than input.
.. math:: \text{out} = \lceil x \rceil .. math:: \text{out} = \lceil \text{input} \rceil
Examples: Examples:
...@@ -248,17 +248,17 @@ def cos(input, out=None): ...@@ -248,17 +248,17 @@ def cos(input, out=None):
return _unary_func(input, 'Cos', out) return _unary_func(input, 'Cos', out)
def div(input, value, out=None): def div(input, other, out=None):
r"""Compute the element-wise division. r"""Compute the element-wise division.
.. math:: \text{out} = \text{input} \div \text{value} .. math:: \text{out} = \text{input} \div \text{other}
Parameters Parameters
---------- ----------
input : dragon.vm.torch.Tensor input : dragon.vm.torch.Tensor
The input tensor. The input tensor.
value : Union[dragon.vm.torch.Tensor, number] other : Union[dragon.vm.torch.Tensor, number]
The value to divide. The tensor to divide.
out : dragon.vm.torch.Tensor, optional out : dragon.vm.torch.Tensor, optional
The optional output tensor. The optional output tensor.
...@@ -268,20 +268,20 @@ def div(input, value, out=None): ...@@ -268,20 +268,20 @@ def div(input, value, out=None):
The output tensor. The output tensor.
""" """
return _binary_func(input, value, 'Div', out) return _binary_func(input, other, 'Div', out)
def eq(input, other, out=None): def eq(input, other, out=None):
r"""Compute the element-wise equal comparison. r"""Compute the element-wise equal comparison.
.. math:: \text{out} = (\text{input} = \text{other}) .. math:: \text{out} = (\text{input} == \text{other})
Parameters Parameters
---------- ----------
input : dragon.vm.torch.Tensor input : dragon.vm.torch.Tensor
The input tensor. The input tensor.
other : Union[dragon.vm.torch.Tensor, number] other : Union[dragon.vm.torch.Tensor, number]
The value to compare. The tensor to compare.
out : dragon.vm.torch.Tensor, optional out : dragon.vm.torch.Tensor, optional
The optional output tensor. The optional output tensor.
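The comparison routines (`eq`, `ne`, `lt`, `le`, `gt`, `ge`) all follow the same pattern: `input` is broadcast against `other` and a boolean tensor is returned. A NumPy sketch of that semantics (not the library code):

```python
import numpy as np

a = np.array([1, 2, 3])
b = np.array([1, 0, 3])

print(a == b)   # eq: [ True False  True]
print(a != b)   # ne: [False  True False]
print(a >= 2)   # ge against a number, broadcast: [False  True  True]
```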
...@@ -297,7 +297,7 @@ def eq(input, other, out=None): ...@@ -297,7 +297,7 @@ def eq(input, other, out=None):
def exp(input, out=None): def exp(input, out=None):
r"""Compute the exponential of input. r"""Compute the exponential of input.
.. math:: \text{out} = e^{\text{input}} .. math:: \text{out} = \exp(\text{input})
Parameters Parameters
---------- ----------
...@@ -318,7 +318,7 @@ def exp(input, out=None): ...@@ -318,7 +318,7 @@ def exp(input, out=None):
def floor(input, out=None): def floor(input, out=None):
r"""Compute the largest integer not greater than input. r"""Compute the largest integer not greater than input.
.. math:: \text{out} = \lfloor x \rfloor .. math:: \text{out} = \lfloor \text{input} \rfloor
Examples: Examples:
...@@ -353,7 +353,7 @@ def ge(input, other, out=None): ...@@ -353,7 +353,7 @@ def ge(input, other, out=None):
input : dragon.vm.torch.Tensor input : dragon.vm.torch.Tensor
The input tensor. The input tensor.
other : Union[dragon.vm.torch.Tensor, number] other : Union[dragon.vm.torch.Tensor, number]
The value to compare. The tensor to compare.
out : dragon.vm.torch.Tensor, optional out : dragon.vm.torch.Tensor, optional
The optional output tensor. The optional output tensor.
...@@ -376,7 +376,7 @@ def gt(input, other, out=None): ...@@ -376,7 +376,7 @@ def gt(input, other, out=None):
input : dragon.vm.torch.Tensor input : dragon.vm.torch.Tensor
The input tensor. The input tensor.
other : Union[dragon.vm.torch.Tensor, number] other : Union[dragon.vm.torch.Tensor, number]
The value to compare. The tensor to compare.
out : dragon.vm.torch.Tensor, optional out : dragon.vm.torch.Tensor, optional
The optional output tensor. The optional output tensor.
...@@ -392,7 +392,7 @@ def gt(input, other, out=None): ...@@ -392,7 +392,7 @@ def gt(input, other, out=None):
def isinf(input): def isinf(input):
r"""Check if the elements of input are infinite. r"""Check if the elements of input are infinite.
.. math:: \text{out} = \text{isinf}(x) .. math:: \text{out} = \text{isinf}(\text{input})
Examples: Examples:
...@@ -418,7 +418,7 @@ def isinf(input): ...@@ -418,7 +418,7 @@ def isinf(input):
def isnan(input): def isnan(input):
r"""Check if the elements of input are NaN. r"""Check if the elements of input are NaN.
.. math:: \text{out} = \text{isnan}(x) .. math:: \text{out} = \text{isnan}(\text{input})
Examples: Examples:
...@@ -451,7 +451,7 @@ def le(input, other, out=None): ...@@ -451,7 +451,7 @@ def le(input, other, out=None):
input : dragon.vm.torch.Tensor input : dragon.vm.torch.Tensor
The input tensor. The input tensor.
other : Union[dragon.vm.torch.Tensor, number] other : Union[dragon.vm.torch.Tensor, number]
The value to compare. The tensor to compare.
out : dragon.vm.torch.Tensor, optional out : dragon.vm.torch.Tensor, optional
The optional output tensor. The optional output tensor.
...@@ -488,7 +488,7 @@ def log(input, out=None): ...@@ -488,7 +488,7 @@ def log(input, out=None):
def logsumexp(input, dim, keepdim=False, out=None): def logsumexp(input, dim, keepdim=False, out=None):
r"""Apply the composite of log, sum, and exp to input. r"""Apply the composite of log, sum, and exp to input.
.. math:: \text{LogSumExp}(x)_{i} = \log\sum_{j}\exp(x_{ij}) .. math:: \text{out}_{i} = \log\sum_{j}\exp(\text{input}_{ij})
Parameters Parameters
---------- ----------
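A small NumPy sketch of the reduction above; subtracting the per-row maximum, as done here, is the usual way to keep the sum of exponentials from overflowing (illustrative only):

```python
import numpy as np

def logsumexp(x, dim=-1, keepdim=False):
    """out_i = log(sum_j exp(x_ij)), computed with the max-subtraction trick."""
    m = np.max(x, axis=dim, keepdims=True)
    out = m + np.log(np.sum(np.exp(x - m), axis=dim, keepdims=True))
    return out if keepdim else np.squeeze(out, axis=dim)

x = np.array([[0., 1., 2.], [10., 10., 10.]])
print(logsumexp(x, dim=1))  # approx. [ 2.4076 11.0986]
```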
...@@ -520,7 +520,7 @@ def lt(input, other, out=None): ...@@ -520,7 +520,7 @@ def lt(input, other, out=None):
input : dragon.vm.torch.Tensor input : dragon.vm.torch.Tensor
The input tensor. The input tensor.
other : Union[dragon.vm.torch.Tensor, number] other : Union[dragon.vm.torch.Tensor, number]
The value to compare. The tensor to compare.
out : dragon.vm.torch.Tensor, optional out : dragon.vm.torch.Tensor, optional
The optional output tensor. The optional output tensor.
...@@ -594,7 +594,7 @@ def minimum(input, other, out=None): ...@@ -594,7 +594,7 @@ def minimum(input, other, out=None):
def mm(input, mat2, transpose_a=False, transpose_b=False, out=None): def mm(input, mat2, transpose_a=False, transpose_b=False, out=None):
r"""Compute matrix-matrix multiplication. r"""Compute matrix-matrix multiplication.
.. math:: \text{out} = a \times b .. math:: y = a \times b
Parameters Parameters
---------- ----------
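The `transpose_a` / `transpose_b` options simply transpose the corresponding operand before the product; a NumPy sketch of that behavior (not the operator kernel):

```python
import numpy as np

def mm(a, b, transpose_a=False, transpose_b=False):
    """Return a @ b, optionally transposing either operand first."""
    a = a.T if transpose_a else a
    b = b.T if transpose_b else b
    return a @ b

a = np.ones((2, 3))
b = np.ones((2, 4))
print(mm(a, b, transpose_a=True).shape)  # (3, 4)
```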
...@@ -623,17 +623,17 @@ def mm(input, mat2, transpose_a=False, transpose_b=False, out=None): ...@@ -623,17 +623,17 @@ def mm(input, mat2, transpose_a=False, transpose_b=False, out=None):
).apply(input, mat2, out) ).apply(input, mat2, out)
def mul(input, value, out=None): def mul(input, other, out=None):
r"""Compute the element-wise multiplication. r"""Compute the element-wise multiplication.
.. math:: \text{out} = \text{input} \times \text{value} .. math:: \text{out} = \text{input} \times \text{other}
Parameters Parameters
---------- ----------
input : dragon.vm.torch.Tensor input : dragon.vm.torch.Tensor
The input tensor. The input tensor.
value : Union[dragon.vm.torch.Tensor, number] other : Union[dragon.vm.torch.Tensor, number]
The value to multiply. The tensor to multiply.
out : dragon.vm.torch.Tensor, optional out : dragon.vm.torch.Tensor, optional
The optional output tensor. The optional output tensor.
...@@ -643,7 +643,7 @@ def mul(input, value, out=None): ...@@ -643,7 +643,7 @@ def mul(input, value, out=None):
The output tensor. The output tensor.
""" """
return _binary_func(input, value, 'Mul', out) return _binary_func(input, other, 'Mul', out)
def ne(input, other, out=None): def ne(input, other, out=None):
...@@ -656,7 +656,7 @@ def ne(input, other, out=None): ...@@ -656,7 +656,7 @@ def ne(input, other, out=None):
input : dragon.vm.torch.Tensor input : dragon.vm.torch.Tensor
The input tensor. The input tensor.
other : Union[dragon.vm.torch.Tensor, number] other : Union[dragon.vm.torch.Tensor, number]
The value to compare. The tensor to compare.
out : dragon.vm.torch.Tensor, optional out : dragon.vm.torch.Tensor, optional
The optional output tensor. The optional output tensor.
...@@ -693,7 +693,7 @@ def neg(input, out=None): ...@@ -693,7 +693,7 @@ def neg(input, out=None):
def pow(input, exponent, out=None): def pow(input, exponent, out=None):
r"""Compute the power of input. r"""Compute the power of input.
.. math:: \text{out} = x^{y} .. math:: \text{out} = \text{input}^{\text{exponent}}
The two inputs should be broadcast to each other: The two inputs should be broadcast to each other:
...@@ -707,9 +707,9 @@ def pow(input, exponent, out=None): ...@@ -707,9 +707,9 @@ def pow(input, exponent, out=None):
Parameters Parameters
---------- ----------
input : Union[dragon.vm.torch.Tensor, number] input : Union[dragon.vm.torch.Tensor, number]
The input tensor :math:`x`. The input tensor.
exponent : Union[dragon.vm.torch.Tensor, number] exponent : Union[dragon.vm.torch.Tensor, number]
The exponent value :math:`y`. The exponent tensor.
out : dragon.vm.torch.Tensor, optional out : dragon.vm.torch.Tensor, optional
The optional output tensor. The optional output tensor.
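Because `input` and `exponent` may each be a tensor or a number, the power follows ordinary broadcasting; a NumPy sketch of the three combinations (illustrative only):

```python
import numpy as np

x = np.array([1., 2., 3.])

print(np.power(x, 2.))   # tensor ** number -> [1. 4. 9.]
print(np.power(2., x))   # number ** tensor -> [2. 4. 8.]
print(np.power(x, x))    # tensor ** tensor, element-wise -> [ 1.  4. 27.]
```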
...@@ -810,11 +810,11 @@ def sign(input, out=None): ...@@ -810,11 +810,11 @@ def sign(input, out=None):
r"""Compute the sign indication of input. r"""Compute the sign indication of input.
.. math:: .. math::
\text{out}_{i} = \text{out}[i] =
\begin{cases} \begin{cases}
-1, & \text{ if } \text{input}_{i} < 0 \\ -1, & \text{ if } \text{input}[i] < 0 \\
0, & \text{ if } \text{input}_{i} = 0 \\ 0, & \text{ if } \text{input}[i] = 0 \\
1, & \text{ if } \text{input}_{i} > 0 1, & \text{ if } \text{input}[i] > 0
\end{cases} \end{cases}
Examples: Examples:
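The three-way case split above is exactly what NumPy's `sign` computes; a one-line sketch for reference:

```python
import numpy as np

print(np.sign(np.array([-2.5, 0., 4.])))  # [-1.  0.  1.]
```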
...@@ -896,17 +896,17 @@ def sqrt(input, out=None): ...@@ -896,17 +896,17 @@ def sqrt(input, out=None):
return _unary_func(input, 'Sqrt', out) return _unary_func(input, 'Sqrt', out)
def sub(input, value, out=None): def sub(input, other, out=None):
r"""Compute the element-wise subtraction. r"""Compute the element-wise subtraction.
.. math:: \text{out} = \text{input} - \text{value} .. math:: \text{out} = \text{input} - \text{other}
Parameters Parameters
---------- ----------
input : dragon.vm.torch.Tensor input : dragon.vm.torch.Tensor
The input tensor. The input tensor.
value : Union[dragon.vm.torch.Tensor, number] other : Union[dragon.vm.torch.Tensor, number]
The value to subtract. The tensor to subtract.
out : dragon.vm.torch.Tensor, optional out : dragon.vm.torch.Tensor, optional
The optional output tensor. The optional output tensor.
...@@ -916,7 +916,7 @@ def sub(input, value, out=None): ...@@ -916,7 +916,7 @@ def sub(input, value, out=None):
The output tensor. The output tensor.
""" """
return _binary_func(input, value, 'Sub', out) return _binary_func(input, other, 'Sub', out)
def _binary_func(input, value, op_type='', out=None): def _binary_func(input, value, op_type='', out=None):
......
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Bind tensor methods executed with backend.""" """Bind tensor methods executed with backend."""
from __future__ import absolute_import from __future__ import absolute_import
...@@ -849,13 +848,13 @@ def lt(self, other): ...@@ -849,13 +848,13 @@ def lt(self, other):
def masked_fill_(self, mask, value): def masked_fill_(self, mask, value):
r"""Fill self with the given value where ``mask`` is **1**. r"""Fill self with the value where mask is 1.
.. math:: .. math::
\text{Ref}[i] = \text{self}[i] =
\begin{cases} \begin{cases}
\text{Value}[i], & \text{ if } \text{Mask}[i] = 1 \\ \text{value}[i], & \text{ if } \text{mask}[i] = 1 \\
\text{Ref}[i], & \text{ otherwise } \text{self}[i], & \text{ otherwise }
\end{cases} \end{cases}
Parameters Parameters
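A NumPy sketch of the in-place fill described above (editorial illustration only; `value` is taken as a scalar here):

```python
import numpy as np

self_ = np.array([1., 2., 3., 4.])
mask = np.array([0, 1, 0, 1])

# self[i] = value where mask[i] == 1, otherwise self[i] is kept
self_[mask == 1] = -10.
print(self_)  # [  1. -10.   3. -10.]
```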
...@@ -1336,11 +1335,11 @@ def sign(self): ...@@ -1336,11 +1335,11 @@ def sign(self):
r"""Return a tensor taken the sign indication of elements. r"""Return a tensor taken the sign indication of elements.
.. math:: .. math::
\text{out}_{i} = \text{out}[i] =
\begin{cases} \begin{cases}
-1, & \text{ if } \text{self}_{i} < 0 \\ -1, & \text{ if } \text{self}[i] < 0 \\
0, & \text{ if } \text{self}_{i} = 0 \\ 0, & \text{ if } \text{self}[i] = 0 \\
1, & \text{ if } \text{self}_{i} > 0 1, & \text{ if } \text{self}[i] > 0
\end{cases} \end{cases}
Returns Returns
...@@ -1360,11 +1359,11 @@ def sign_(self): ...@@ -1360,11 +1359,11 @@ def sign_(self):
r"""Set to the sign indication of elements. r"""Set to the sign indication of elements.
.. math:: .. math::
\text{self}_{i} = \text{self}[i] =
\begin{cases} \begin{cases}
-1, & \text{ if } \text{self}_{i} < 0 \\ -1, & \text{ if } \text{self}[i] < 0 \\
0, & \text{ if } \text{self}_{i} = 0 \\ 0, & \text{ if } \text{self}[i] = 0 \\
1, & \text{ if } \text{self}_{i} > 0 1, & \text{ if } \text{self}[i] > 0
\end{cases} \end{cases}
Returns Returns
...@@ -1614,10 +1613,10 @@ def where(self, condition, y): ...@@ -1614,10 +1613,10 @@ def where(self, condition, y):
.. math:: .. math::
\text{out}[i] = \text{out}[i] =
\begin{cases} \begin{cases}
\text{self}[i], & \text{ if } \text{cond}[i] \text{ is True } \\ \text{self}[i], & \text{ if } \text{cond}[i] \text{ is True } \\
\text{y}[i], & \text{ otherwise } \text{y}[i], & \text{ otherwise }
\end{cases} \end{cases}
Parameters Parameters
---------- ----------
......
...@@ -1036,14 +1036,14 @@ class Tensor(object): ...@@ -1036,14 +1036,14 @@ class Tensor(object):
""" """
def masked_fill_(self, mask, value): def masked_fill_(self, mask, value):
r"""Fill self with the given value where ``mask`` is **1**. r"""Fill self with the value where mask is 1.
.. math:: .. math::
\text{Ref}[i] = \text{self}[i] =
\begin{cases} \begin{cases}
\text{Value}[i], & \text{ if } \text{Mask}[i] = 1 \\ \text{value}[i], & \text{ if } \text{mask}[i] = 1 \\
\text{Ref}[i], & \text{ otherwise } \text{self}[i], & \text{ otherwise }
\end{cases} \end{cases}
Parameters Parameters
---------- ----------
...@@ -1513,11 +1513,11 @@ class Tensor(object): ...@@ -1513,11 +1513,11 @@ class Tensor(object):
r"""Return a tensor taken the sign indication of elements. r"""Return a tensor taken the sign indication of elements.
.. math:: .. math::
\text{out}_{i} = \text{out}[i] =
\begin{cases} \begin{cases}
-1, & \text{ if } \text{self}_{i} < 0 \\ -1, & \text{ if } \text{self}[i] < 0 \\
0, & \text{ if } \text{self}_{i} = 0 \\ 0, & \text{ if } \text{self}[i] = 0 \\
1, & \text{ if } \text{self}_{i} > 0 1, & \text{ if } \text{self}[i] > 0
\end{cases} \end{cases}
Returns Returns
...@@ -1535,11 +1535,11 @@ class Tensor(object): ...@@ -1535,11 +1535,11 @@ class Tensor(object):
r"""Set to the sign indication of elements. r"""Set to the sign indication of elements.
.. math:: .. math::
\text{self}_{i} = \text{self}[i] =
\begin{cases} \begin{cases}
-1, & \text{ if } \text{self}_{i} < 0 \\ -1, & \text{ if } \text{self}[i] < 0 \\
0, & \text{ if } \text{self}_{i} = 0 \\ 0, & \text{ if } \text{self}[i] = 0 \\
1, & \text{ if } \text{self}_{i} > 0 1, & \text{ if } \text{self}[i] > 0
\end{cases} \end{cases}
Returns Returns
...@@ -1835,10 +1835,10 @@ class Tensor(object): ...@@ -1835,10 +1835,10 @@ class Tensor(object):
.. math:: .. math::
\text{out}[i] = \text{out}[i] =
\begin{cases} \begin{cases}
\text{self}[i], & \text{ if } \text{cond}[i] \text{ is True } \\ \text{self}[i], & \text{ if } \text{cond}[i] \text{ is True } \\
\text{y}[i], & \text{ otherwise } \text{y}[i], & \text{ otherwise }
\end{cases} \end{cases}
Parameters Parameters
---------- ----------
......
...@@ -12,7 +12,6 @@ ...@@ -12,7 +12,6 @@
# <https://github.com/pytorch/vision/blob/master/torchvision/models/alexnet.py> # <https://github.com/pytorch/vision/blob/master/torchvision/models/alexnet.py>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Recommend hyper-parameters: """Recommend hyper-parameters:
Nesterov-SGD, batch_size: 256, base_lr: 0.01, weight_decay: 0.0005 Nesterov-SGD, batch_size: 256, base_lr: 0.01, weight_decay: 0.0005
......
...@@ -12,7 +12,6 @@ ...@@ -12,7 +12,6 @@
# <https://github.com/pytorch/vision/blob/master/torchvision/models/inception.py> # <https://github.com/pytorch/vision/blob/master/torchvision/models/inception.py>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""We remove the aux-loss branch, comparing to the one in original model zoo. """We remove the aux-loss branch, comparing to the one in original model zoo.
Recommend hyper-parameters: Recommend hyper-parameters:
......
...@@ -12,7 +12,6 @@ ...@@ -12,7 +12,6 @@
# <https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py> # <https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""We add the zero-init-bn, comparing to the one in original model zoo. """We add the zero-init-bn, comparing to the one in original model zoo.
For more about zero-init, See, For more about zero-init, See,
......
...@@ -12,7 +12,6 @@ ...@@ -12,7 +12,6 @@
# <https://github.com/pytorch/vision/blob/master/torchvision/models/squeezenet.py> # <https://github.com/pytorch/vision/blob/master/torchvision/models/squeezenet.py>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Recommend hyper-parameters: """Recommend hyper-parameters:
Nesterov-SGD, batch_size: 512, base_lr: 0.04, weight_decay: 0.0002 Nesterov-SGD, batch_size: 512, base_lr: 0.04, weight_decay: 0.0002
......
...@@ -12,7 +12,6 @@ ...@@ -12,7 +12,6 @@
# <https://github.com/pytorch/vision/blob/master/torchvision/models/vgg.py> # <https://github.com/pytorch/vision/blob/master/torchvision/models/vgg.py>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Recommend hyper-parameters: """Recommend hyper-parameters:
Nesterov-SGD, batch_size: 256, base_lr: 0.01, weight_decay: 0.0005 Nesterov-SGD, batch_size: 256, base_lr: 0.01, weight_decay: 0.0005
......