Commit 6bfe3e73 by Ting PAN

Reimplement the general matrix multiplication

Summary:
This commit generalizes the fully-connected operation into GEMM,
and enhances the matmul operation via batched Dot, GEMV and GEMM.
The new representations and attributes are now consistent with ONNX.
1 parent 73ed1b96
Showing with 1540 additions and 662 deletions
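Before the diff itself, a minimal sketch of the call patterns this commit introduces, based on the signatures added below (shapes are illustrative; the optional ``C`` operand broadcasts against the output):

```python
import dragon

# GEMM replaces fully_connected: out = alpha * A @ B + beta * C.
a = dragon.ones((2, 3), 'float32')
b = dragon.ones((3, 4), 'float32')
c = dragon.ones((4,), 'float32')      # broadcast against the (2, 4) output
y = dragon.math.gemm([a, b, c], alpha=1.0, beta=1.0)

# matmul drops its transpose flags and follows ONNX/NumPy semantics,
# batching the computation when an input has rank >= 3.
aa = dragon.ones((4, 2, 3), 'float32')
bb = dragon.ones((4, 3, 5), 'float32')
yy = dragon.math.matmul([aa, bb])     # expected shape: (4, 2, 5)
```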
...@@ -313,8 +313,8 @@ class InnerProduct(Layer): ...@@ -313,8 +313,8 @@ class InnerProduct(Layer):
param = layer_param.inner_product_param param = layer_param.inner_product_param
self.arguments = { self.arguments = {
'axis': param.axis, 'axis': param.axis,
'out_channels': param.num_output, 'n': param.num_output,
'transpose_w': not param.transpose, 'transpose_b': not param.transpose,
} }
self.add_blob(filler=self.get_filler(param, 'weight_filler')) self.add_blob(filler=self.get_filler(param, 'weight_filler'))
if param.bias_term: if param.bias_term:
...@@ -322,7 +322,7 @@ class InnerProduct(Layer): ...@@ -322,7 +322,7 @@ class InnerProduct(Layer):
def __call__(self, bottom): def __call__(self, bottom):
inputs = [bottom] + [blob['data'] for blob in self._blobs] inputs = [bottom] + [blob['data'] for blob in self._blobs]
return math_ops.fully_connected(inputs, **self.arguments) return math_ops.gemm(inputs, **self.arguments)
class Input(Layer): class Input(Layer):
...@@ -409,7 +409,7 @@ class Normalize(Layer): ...@@ -409,7 +409,7 @@ class Normalize(Layer):
def __call__(self, bottom): def __call__(self, bottom):
norm_out = [normalization_ops.lp_normalize(bottom, **self.l2norm_arguments)] norm_out = [normalization_ops.lp_normalize(bottom, **self.l2norm_arguments)]
norm_out += [blob['data'] for blob in self._blobs] norm_out += [blob['data'] for blob in self._blobs]
return math_ops.affine(norm_out, **self.affine_arguments) return array_ops.channel_affine(norm_out, **self.affine_arguments)
class Permute(Layer): class Permute(Layer):
...@@ -583,7 +583,7 @@ class Scale(Layer): ...@@ -583,7 +583,7 @@ class Scale(Layer):
def __call__(self, bottom): def __call__(self, bottom):
inputs = [bottom] + [blob['data'] for blob in self._blobs] inputs = [bottom] + [blob['data'] for blob in self._blobs]
return math_ops.affine(inputs, **self.arguments) return array_ops.channel_affine(inputs, **self.arguments)
class Slice(Layer): class Slice(Layer):
......
...@@ -48,6 +48,9 @@ dragon.math ...@@ -48,6 +48,9 @@ dragon.math
`floor(...) <math/floor.html>`_ `floor(...) <math/floor.html>`_
: Compute the largest integer not greater than input. : Compute the largest integer not greater than input.
`gemm(...) <math/gemm.html>`_
: Compute the general matrix multiplication.
`greater(...) <math/greater.html>`_ `greater(...) <math/greater.html>`_
: Compute the element-wise greater comparison. : Compute the element-wise greater comparison.
...@@ -158,6 +161,7 @@ dragon.math ...@@ -158,6 +161,7 @@ dragon.math
math/equal math/equal
math/exp math/exp
math/floor math/floor
math/gemm
math/greater math/greater
math/greater_equal math/greater_equal
math/is_inf math/is_inf
......
fully_connected gemm
=============== ====
.. autofunction:: dragon.nn.fully_connected .. autofunction:: dragon.math.gemm
.. raw:: html .. raw:: html
<style> <style>
h1:before { h1:before {
content: "dragon.nn."; content: "dragon.math.";
color: #103d3e; color: #103d3e;
} }
</style> </style>
...@@ -74,9 +74,6 @@ dragon.nn ...@@ -74,9 +74,6 @@ dragon.nn
: Apply the exponential linear unit. : Apply the exponential linear unit.
`[Clevert et.al, 2015] <https://arxiv.org/abs/1511.07289>`_. `[Clevert et.al, 2015] <https://arxiv.org/abs/1511.07289>`_.
`fully_connected(...) <nn/fully_connected.html>`_
: Compute the dense matrix multiplication along the given axes.
`group_norm(...) <nn/group_norm.html>`_ `group_norm(...) <nn/group_norm.html>`_
: Apply the group normalization. : Apply the group normalization.
`[Wu & He, 2018] <https://arxiv.org/abs/1803.08494>`_. `[Wu & He, 2018] <https://arxiv.org/abs/1803.08494>`_.
...@@ -167,7 +164,6 @@ dragon.nn ...@@ -167,7 +164,6 @@ dragon.nn
nn/drop_block2d nn/drop_block2d
nn/drop_path nn/drop_path
nn/elu nn/elu
nn/fully_connected
nn/group_norm nn/group_norm
nn/hardsigmoid nn/hardsigmoid
nn/hardswish nn/hardswish
......
...@@ -79,7 +79,7 @@ Name Supported Reference ...@@ -79,7 +79,7 @@ Name Supported Reference
`Gather`_ |v| :func:`dragon.index_select` `Gather`_ |v| :func:`dragon.index_select`
`GatherElements`_ `GatherElements`_
`GatherND`_ `GatherND`_
`Gemm`_ |v| :func:`dragon.nn.fully_connected` `Gemm`_ |v| :func:`dragon.math.gemm`
`GlobalAveragePool`_ |v| :func:`dragon.nn.pool2d` `GlobalAveragePool`_ |v| :func:`dragon.nn.pool2d`
`GlobalLpPool`_ `GlobalLpPool`_
`GlobalMaxPool`_ |v| :func:`dragon.nn.pool2d` `GlobalMaxPool`_ |v| :func:`dragon.nn.pool2d`
......
...@@ -36,6 +36,9 @@ vm.torch ...@@ -36,6 +36,9 @@ vm.torch
`add(...) <torch/add.html>`_ `add(...) <torch/add.html>`_
: Compute the element-wise addition. : Compute the element-wise addition.
`addmm(...) <torch/addmm.html>`_
: Add input to the result of matrix-matrix multiplication.
`arange(...) <torch/arange.html>`_ `arange(...) <torch/arange.html>`_
: Return a tensor of evenly spaced values within a interval. : Return a tensor of evenly spaced values within a interval.
...@@ -51,12 +54,18 @@ vm.torch ...@@ -51,12 +54,18 @@ vm.torch
`axpby(...) <torch/axpby.html>`_ `axpby(...) <torch/axpby.html>`_
: Compute the element-wise addition from input to output. : Compute the element-wise addition from input to output.
`baddbmm(...) <torch/baddbmm.html>`_
: Add input to the result of batched matrix-matrix multiplication.
`bitwise_not(...) <torch/bitwise_not.html>`_ `bitwise_not(...) <torch/bitwise_not.html>`_
: Compute the element-wise NOT bitwise operation. : Compute the element-wise NOT bitwise operation.
`bitwise_xor(...) <torch/bitwise_xor.html>`_ `bitwise_xor(...) <torch/bitwise_xor.html>`_
: Compute the element-wise XOR bitwise operation. : Compute the element-wise XOR bitwise operation.
`bmm(...) <torch/bmm.html>`_
: Compute the batched matrix-matrix multiplication.
`cat(...) <torch/cat.html>`_ `cat(...) <torch/cat.html>`_
: Concatenate the inputs along the given dimension. : Concatenate the inputs along the given dimension.
...@@ -148,6 +157,9 @@ vm.torch ...@@ -148,6 +157,9 @@ vm.torch
`masked_select(...) <torch/logsumexp.html>`_ `masked_select(...) <torch/logsumexp.html>`_
: Select the input elements where mask is 1. : Select the input elements where mask is 1.
`matmul(...) <torch/matmul.html>`_
: Compute the matrix multiplication.
`max(...) <torch/max.html>`_ `max(...) <torch/max.html>`_
: Compute the max value of elements along the given dimension. : Compute the max value of elements along the given dimension.
...@@ -281,13 +293,16 @@ vm.torch ...@@ -281,13 +293,16 @@ vm.torch
torch/Tensor_ torch/Tensor_
torch/abs torch/abs
torch/add torch/add
torch/addmm
torch/arange torch/arange
torch/argmax torch/argmax
torch/argmin torch/argmin
torch/argsort torch/argsort
torch/axpby torch/axpby
torch/baddbmm
torch/bitwise_not torch/bitwise_not
torch/bitwise_xor torch/bitwise_xor
torch/bmm
torch/cat torch/cat
torch/ceil torch/ceil
torch/channel_affine torch/channel_affine
...@@ -321,6 +336,7 @@ vm.torch ...@@ -321,6 +336,7 @@ vm.torch
torch/logsumexp torch/logsumexp
torch/lt torch/lt
torch/masked_select torch/masked_select
torch/matmul
torch/max torch/max
torch/maximum torch/maximum
torch/mean torch/mean
......
...@@ -53,6 +53,10 @@ add\_ ...@@ -53,6 +53,10 @@ add\_
##### #####
.. automethod:: dragon.vm.torch.Tensor.add_ .. automethod:: dragon.vm.torch.Tensor.add_
addmm
#####
.. automethod:: dragon.vm.torch.Tensor.addmm
argmax argmax
###### ######
.. automethod:: dragon.vm.torch.Tensor.argmax .. automethod:: dragon.vm.torch.Tensor.argmax
...@@ -69,6 +73,14 @@ backward ...@@ -69,6 +73,14 @@ backward
######## ########
.. automethod:: dragon.vm.torch.Tensor.backward .. automethod:: dragon.vm.torch.Tensor.backward
baddbmm
#######
.. automethod:: dragon.vm.torch.Tensor.baddbmm
baddbmm\_
#########
.. automethod:: dragon.vm.torch.Tensor.baddbmm_
bitwise_not bitwise_not
########### ###########
.. automethod:: dragon.vm.torch.Tensor.bitwise_not .. automethod:: dragon.vm.torch.Tensor.bitwise_not
...@@ -85,6 +97,10 @@ bitwise_xor\_ ...@@ -85,6 +97,10 @@ bitwise_xor\_
############# #############
.. automethod:: dragon.vm.torch.Tensor.bitwise_xor_ .. automethod:: dragon.vm.torch.Tensor.bitwise_xor_
bmm
###
.. automethod:: dragon.vm.torch.Tensor.bmm
bool bool
#### ####
.. automethod:: dragon.vm.torch.Tensor.bool .. automethod:: dragon.vm.torch.Tensor.bool
...@@ -285,6 +301,14 @@ masked_fill\_ ...@@ -285,6 +301,14 @@ masked_fill\_
############# #############
.. automethod:: dragon.vm.torch.Tensor.masked_fill_ .. automethod:: dragon.vm.torch.Tensor.masked_fill_
masked_select
#############
.. automethod:: dragon.vm.torch.Tensor.masked_select
matmul
######
.. automethod:: dragon.vm.torch.Tensor.matmul
max max
### ###
.. automethod:: dragon.vm.torch.Tensor.max .. automethod:: dragon.vm.torch.Tensor.max
...@@ -293,10 +317,6 @@ maximum ...@@ -293,10 +317,6 @@ maximum
####### #######
.. automethod:: dragon.vm.torch.Tensor.maximum .. automethod:: dragon.vm.torch.Tensor.maximum
masked_select
#############
.. automethod:: dragon.vm.torch.Tensor.masked_select
mean mean
#### ####
.. automethod:: dragon.vm.torch.Tensor.mean .. automethod:: dragon.vm.torch.Tensor.mean
...@@ -535,11 +555,14 @@ zero\_ ...@@ -535,11 +555,14 @@ zero\_
.. _torch.abs(...): abs.html .. _torch.abs(...): abs.html
.. _torch.add(...): add.html .. _torch.add(...): add.html
.. _torch.addmm(...): addmm.html
.. _torch.argmax(...): argmax.html .. _torch.argmax(...): argmax.html
.. _torch.argmin(...): argmin.html .. _torch.argmin(...): argmin.html
.. _torch.argsort(...): argsort.html .. _torch.argsort(...): argsort.html
.. _torch.baddbmm(...): baddbmm.html
.. _torch.bitwise_not(...): bitwise_not.html .. _torch.bitwise_not(...): bitwise_not.html
.. _torch.bitwise_xor(...): bitwise_xor.html .. _torch.bitwise_xor(...): bitwise_xor.html
.. _torch.bmm(...): bmm.html
.. _torch.ceil(...): ceil.html .. _torch.ceil(...): ceil.html
.. _torch.clamp(...): clamp.html .. _torch.clamp(...): clamp.html
.. _torch.cos(...): cos.html .. _torch.cos(...): cos.html
...@@ -557,6 +580,7 @@ zero\_ ...@@ -557,6 +580,7 @@ zero\_
.. _torch.isnan(...): isnan.html .. _torch.isnan(...): isnan.html
.. _torch.le(...): le.html .. _torch.le(...): le.html
.. _torch.lt(...): lt.html .. _torch.lt(...): lt.html
.. _torch.matmul(...): matmul.html
.. _torch.max(...): max.html .. _torch.max(...): max.html
.. _torch.maximum(...): maximum.html .. _torch.maximum(...): maximum.html
.. _torch.min(...): min.html .. _torch.min(...): min.html
......
addmm
=====
.. autofunction:: dragon.vm.torch.addmm
.. raw:: html
<style>
h1:before {
content: "torch.";
color: #103d3e;
}
</style>
baddbmm
=======
.. autofunction:: dragon.vm.torch.baddbmm
.. raw:: html
<style>
h1:before {
content: "torch.";
color: #103d3e;
}
</style>
bmm
===
.. autofunction:: dragon.vm.torch.bmm
.. raw:: html
<style>
h1:before {
content: "torch.";
color: #103d3e;
}
</style>
matmul
======
.. autofunction:: dragon.vm.torch.matmul
.. raw:: html
<style>
h1:before {
content: "torch.";
color: #103d3e;
}
</style>
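The four new functions map onto familiar PyTorch entry points, and the Tensor methods documented above (``addmm``, ``baddbmm``, ``bmm``, ``matmul``) are their method forms. A short sketch of the intended call patterns, assuming the vm.torch frontend mirrors PyTorch's constructors and argument order (only the names and one-line descriptions are confirmed by this diff):

```python
from dragon.vm import torch  # assumes PyTorch-style constructors are available

a = torch.ones(2, 3)
b = torch.ones(3, 4)
c = torch.ones(2, 4)
y = torch.matmul(a, b)           # matrix-matrix product, shape (2, 4)
y = torch.addmm(c, a, b)         # c + a @ b, assumed torch.addmm(input, mat1, mat2) order

x = torch.ones(8, 2, 3)
w = torch.ones(8, 3, 4)
base = torch.ones(8, 2, 4)
y = torch.bmm(x, w)              # batched product, shape (8, 2, 4)
y = torch.baddbmm(base, x, w)    # base + x @ w per batch, assumed torch.baddbmm order
```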
...@@ -6,6 +6,24 @@ vm.torch.nn ...@@ -6,6 +6,24 @@ vm.torch.nn
Classes Classes
------- -------
`class AdaptiveAvgPool1d <nn/AdaptiveAvgPool1d.html>`_
: Apply the 1d adaptive average pooling.
`class AdaptiveAvgPool2d <nn/AdaptiveAvgPool2d.html>`_
: Apply the 2d adaptive average pooling.
`class AdaptiveAvgPool3d <nn/AdaptiveAvgPool3d.html>`_
: Apply the 3d adaptive average pooling.
`class AdaptiveMaxPool1d <nn/AdaptiveMaxPool1d.html>`_
: Apply the 1d adaptive max pooling.
`class AdaptiveMaxPool2d <nn/AdaptiveMaxPool2d.html>`_
: Apply the 2d adaptive max pooling.
`class AdaptiveMaxPool3d <nn/AdaptiveMaxPool3d.html>`_
: Apply the 3d adaptive max pooling.
`class AffineChannel <nn/AffineChannel.html>`_ `class AffineChannel <nn/AffineChannel.html>`_
: Apply affine transformation along the channels. : Apply affine transformation along the channels.
...@@ -238,6 +256,12 @@ vm.torch.nn ...@@ -238,6 +256,12 @@ vm.torch.nn
.. toctree:: .. toctree::
:hidden: :hidden:
nn/AdaptiveAvgPool1d
nn/AdaptiveAvgPool2d
nn/AdaptiveAvgPool3d
nn/AdaptiveMaxPool1d
nn/AdaptiveMaxPool2d
nn/AdaptiveMaxPool3d
nn/AffineChannel nn/AffineChannel
nn/AvgPool1d nn/AvgPool1d
nn/AvgPool2d nn/AvgPool2d
......
AdaptiveAvgPool1d
=================
.. autoclass:: dragon.vm.torch.nn.AdaptiveAvgPool1d
__init__
--------
.. automethod:: dragon.vm.torch.nn.AdaptiveAvgPool1d.__init__
.. _torch.nn.functional.adaptive_avg_pool1d(...): functional/adaptive_avg_pool1d.html
.. raw:: html
<style>
h1:before {
content: "torch.nn.";
color: #103d3e;
}
</style>
AdaptiveAvgPool2d
=================
.. autoclass:: dragon.vm.torch.nn.AdaptiveAvgPool2d
__init__
--------
.. automethod:: dragon.vm.torch.nn.AdaptiveAvgPool2d.__init__
.. _torch.nn.functional.adaptive_avg_pool2d(...): functional/adaptive_avg_pool2d.html
.. raw:: html
<style>
h1:before {
content: "torch.nn.";
color: #103d3e;
}
</style>
AdaptiveAvgPool3d
=================
.. autoclass:: dragon.vm.torch.nn.AdaptiveAvgPool3d
__init__
--------
.. automethod:: dragon.vm.torch.nn.AdaptiveAvgPool3d.__init__
.. _torch.nn.functional.adaptive_avg_pool3d(...): functional/adaptive_avg_pool3d.html
.. raw:: html
<style>
h1:before {
content: "torch.nn.";
color: #103d3e;
}
</style>
AdaptiveMaxPool1d
=================
.. autoclass:: dragon.vm.torch.nn.AdaptiveMaxPool1d
__init__
--------
.. automethod:: dragon.vm.torch.nn.AdaptiveMaxPool1d.__init__
.. _torch.nn.functional.adaptive_max_pool1d(...): functional/adaptive_max_pool1d.html
.. raw:: html
<style>
h1:before {
content: "torch.nn.";
color: #103d3e;
}
</style>
AdaptiveMaxPool2d
=================
.. autoclass:: dragon.vm.torch.nn.AdaptiveMaxPool2d
__init__
--------
.. automethod:: dragon.vm.torch.nn.AdaptiveMaxPool2d.__init__
.. _torch.nn.functional.adaptive_max_pool2d(...): functional/adaptive_max_pool2d.html
.. raw:: html
<style>
h1:before {
content: "torch.nn.";
color: #103d3e;
}
</style>
AdaptiveMaxPool3d
=================
.. autoclass:: dragon.vm.torch.nn.AdaptiveMaxPool3d
__init__
--------
.. automethod:: dragon.vm.torch.nn.AdaptiveMaxPool3d.__init__
.. _torch.nn.functional.adaptive_max_pool3d(...): functional/adaptive_max_pool3d.html
.. raw:: html
<style>
h1:before {
content: "torch.nn.";
color: #103d3e;
}
</style>
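A brief sketch of the new adaptive pooling modules, assuming they take a target output size as in PyTorch (the constructor argument and NCHW layout are assumptions; only the class names are confirmed here):

```python
from dragon.vm import torch
from dragon.vm.torch import nn

x = torch.ones(1, 64, 32, 32)    # assumed NCHW input
pool = nn.AdaptiveAvgPool2d(7)   # assumed PyTorch-style target size 7x7
y = pool(x)                      # expected shape: (1, 64, 7, 7)

gmp = nn.AdaptiveMaxPool2d(1)    # global max pooling as the degenerate case
z = gmp(x)                       # expected shape: (1, 64, 1, 1)
```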
...@@ -6,6 +6,24 @@ vm.torch.nn.functional ...@@ -6,6 +6,24 @@ vm.torch.nn.functional
Functions Functions
--------- ---------
`adaptive_avg_pool1d(...) <functional/adaptive_avg_pool1d.html>`_
: Apply the 1d adaptive average pooling to input.
`adaptive_avg_pool2d(...) <functional/adaptive_avg_pool2d.html>`_
: Apply the 2d adaptive average pooling to input.
`adaptive_avg_pool3d(...) <functional/adaptive_avg_pool3d.html>`_
: Apply the 3d adaptive average pooling to input.
`adaptive_max_pool1d(...) <functional/adaptive_max_pool1d.html>`_
: Apply the 1d adaptive max pooling to input.
`adaptive_max_pool2d(...) <functional/adaptive_max_pool2d.html>`_
: Apply the 2d adaptive max pooling to input.
`adaptive_max_pool3d(...) <functional/adaptive_max_pool3d.html>`_
: Apply the 3d adaptive max pooling to input.
`avg_pool1d(...) <functional/avg_pool1d.html>`_ `avg_pool1d(...) <functional/avg_pool1d.html>`_
: Apply the 1d average pooling to input. : Apply the 1d average pooling to input.
...@@ -167,6 +185,12 @@ vm.torch.nn.functional ...@@ -167,6 +185,12 @@ vm.torch.nn.functional
.. toctree:: .. toctree::
:hidden: :hidden:
functional/adaptive_avg_pool1d
functional/adaptive_avg_pool2d
functional/adaptive_avg_pool3d
functional/adaptive_max_pool1d
functional/adaptive_max_pool2d
functional/adaptive_max_pool3d
functional/avg_pool1d functional/avg_pool1d
functional/avg_pool2d functional/avg_pool2d
functional/avg_pool3d functional/avg_pool3d
......
adaptive_avg_pool1d
===================
.. autofunction:: dragon.vm.torch.nn.functional.adaptive_avg_pool1d
.. _torch.nn.AdaptiveAvgPool1d(...): ../AdaptiveAvgPool1d.html
.. raw:: html
<style>
h1:before {
content: "torch.nn.functional.";
color: #103d3e;
}
</style>
adaptive_avg_pool2d
===================
.. autofunction:: dragon.vm.torch.nn.functional.adaptive_avg_pool2d
.. _torch.nn.AdaptiveAvgPool2d(...): ../AdaptiveAvgPool2d.html
.. raw:: html
<style>
h1:before {
content: "torch.nn.functional.";
color: #103d3e;
}
</style>
adaptive_avg_pool3d
===================
.. autofunction:: dragon.vm.torch.nn.functional.adaptive_avg_pool3d
.. _torch.nn.AdaptiveAvgPool3d(...): ../AdaptiveAvgPool3d.html
.. raw:: html
<style>
h1:before {
content: "torch.nn.functional.";
color: #103d3e;
}
</style>
adaptive_max_pool1d
===================
.. autofunction:: dragon.vm.torch.nn.functional.adaptive_max_pool1d
.. _torch.nn.AdaptiveMaxPool1d(...): ../AdaptiveMaxPool1d.html
.. raw:: html
<style>
h1:before {
content: "torch.nn.functional.";
color: #103d3e;
}
</style>
adaptive_max_pool2d
===================
.. autofunction:: dragon.vm.torch.nn.functional.adaptive_max_pool2d
.. _torch.nn.AdaptiveMaxPool2d(...): ../AdaptiveMaxPool2d.html
.. raw:: html
<style>
h1:before {
content: "torch.nn.functional.";
color: #103d3e;
}
</style>
adaptive_max_pool3d
===================
.. autofunction:: dragon.vm.torch.nn.functional.adaptive_max_pool3d
.. _torch.nn.AdaptiveMaxPool3d(...): ../AdaptiveMaxPool3d.html
.. raw:: html
<style>
h1:before {
content: "torch.nn.functional.";
color: #103d3e;
}
</style>
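The functional counterparts follow the same pattern; a sketch assuming a PyTorch-style ``(input, output_size)`` signature (an assumption, since this diff only adds the names and doc stubs):

```python
from dragon.vm import torch
from dragon.vm.torch.nn import functional as F

x = torch.ones(1, 64, 32, 32)
y = F.adaptive_avg_pool2d(x, 7)        # assumed signature: (input, output_size)
z = F.adaptive_max_pool2d(x, (4, 4))   # tuple output sizes assumed to be accepted
```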
...@@ -185,7 +185,6 @@ const Map<string, Map<string, string>>& ONNXBackend::get_node_renamed_attrs() ...@@ -185,7 +185,6 @@ const Map<string, Map<string, string>>& ONNXBackend::get_node_renamed_attrs()
const { const {
const static Map<string, Map<string, string>> kPerNodeRenamedAttrs = { const static Map<string, Map<string, string>> kPerNodeRenamedAttrs = {
{"DepthToSpace", {{"blocksize", "block_size"}}}, {"DepthToSpace", {{"blocksize", "block_size"}}},
{"Gemm", {{"transB", "transW"}}},
{"RoiAlign", {"RoiAlign",
{ {
{"output_height", "pooled_h"}, {"output_height", "pooled_h"},
......
...@@ -180,19 +180,7 @@ ONNXImporterReturns ONNXBackend::GemmImporter( ...@@ -180,19 +180,7 @@ ONNXImporterReturns ONNXBackend::GemmImporter(
ONNXNode* onnx_node, ONNXNode* onnx_node,
const ConversionContext& ctx) { const ConversionContext& ctx) {
auto& attributes = onnx_node->attributes; auto& attributes = onnx_node->attributes;
auto alpha = attributes.get<float>("alpha", 1.f); attributes.AddRewrittenAttribute("axis")->set_i(-1);
auto beta = attributes.get<float>("beta", 1.f);
auto trans_a = attributes.get<int64_t>("transA", 0L);
// Remove the unsupported attributes
if (alpha != 1.f || beta != 1.f) {
LOG(FATAL) << "alpha/beta can not be set currently.";
}
if (trans_a) {
LOG(FATAL) << "Tranposed A is not supported currently.";
}
attributes.remove("alpha");
attributes.remove("beta");
attributes.remove("transA");
return GenericImporter(onnx_node, ctx); return GenericImporter(onnx_node, ctx);
} }
......
...@@ -98,7 +98,6 @@ DECLARE_ELEMENTWISE_OP(SignGradient); ...@@ -98,7 +98,6 @@ DECLARE_ELEMENTWISE_OP(SignGradient);
DECLARE_ELEMENTWISE_OP(SinGradient); DECLARE_ELEMENTWISE_OP(SinGradient);
DECLARE_ELEMENTWISE_OP(SqrtGradient); DECLARE_ELEMENTWISE_OP(SqrtGradient);
DECLARE_ELEMENTWISE_OP(SquareGradient); DECLARE_ELEMENTWISE_OP(SquareGradient);
// Binary ElementwiseOp // Binary ElementwiseOp
DECLARE_ELEMENTWISE_OP(Add); DECLARE_ELEMENTWISE_OP(Add);
DECLARE_ELEMENTWISE_OP(Sub); DECLARE_ELEMENTWISE_OP(Sub);
...@@ -122,7 +121,6 @@ DECLARE_ELEMENTWISE_OP(PowGradient); ...@@ -122,7 +121,6 @@ DECLARE_ELEMENTWISE_OP(PowGradient);
DECLARE_ELEMENTWISE_OP(DotGradient); DECLARE_ELEMENTWISE_OP(DotGradient);
DECLARE_ELEMENTWISE_OP(MinimumGradient); DECLARE_ELEMENTWISE_OP(MinimumGradient);
DECLARE_ELEMENTWISE_OP(MaximumGradient); DECLARE_ELEMENTWISE_OP(MaximumGradient);
#undef DECLARE_ELEMENTWISE_OP #undef DECLARE_ELEMENTWISE_OP
} // namespace dragon } // namespace dragon
......
#include "dragon/operators/math/fully_connected_op.h"
#include "dragon/core/workspace.h"
#include "dragon/utils/filler.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
template <class Context>
template <typename T>
void FullyConnectedOp<Context>::DoRunWithType() {
auto &X = Input(0), &W = Input(1), *Y = Output(0);
CANONICALIZE_AXIS_WITH_TENSOR(X);
// Determine the number of output channels
int64_t M = X.count(0, axis), K = X.count(axis), N;
if (out_channels_ <= 0) {
// Infer the "N" from the weights shape
N = W.count() / K;
CHECK_GT(N, 0) << "\nFailed to infer the N from "
<< "the weights shape: " << W.DimString();
} else {
// Use a fixed "N" from the argument
N = out_channels_;
}
vec64_t Y_dims(axis + 1);
for (int i = 0; i < axis + 1; i++) {
Y_dims[i] = i < axis ? X.dim(i) : N;
}
if (transW_ > 0) {
TENSOR_FILL(W, vec64_t({N, K}));
CHECK(W.ndim() == 2 && W.dim(1) == K)
<< "\nWeights dimensions should be [N, K].\n"
<< "Got X as (" << M << ", " << K << "), "
<< "and W as " << W.DimString();
} else {
TENSOR_FILL(W, vec64_t({K, N}));
CHECK(W.ndim() == 2 && W.dim(0) == K)
<< "\nWeights dimensions should be [K, N].\n"
<< "Got X as (" << M << ", " << K << "), "
<< "and W as " << W.DimString();
}
math::Gemm(
CblasNoTrans,
(CBLAS_TRANSPOSE)transW_,
M,
N,
K,
1.f,
X.template data<T, Context>(),
W.template data<T, Context>(),
0.f,
Y->Reshape(Y_dims)->template mutable_data<T, Context>(),
ctx());
if (InputSize() > 2) {
TENSOR_FILL(Input(2), vec64_t({N}));
kernel::BiasAdd(
M,
1,
N,
Y->template data<T, Context>(),
Input(2).template data<T, Context>(),
Y->template mutable_data<T, Context>(),
ctx());
}
}
template <class Context>
void FullyConnectedOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
template <class Context>
template <typename T>
void FullyConnectedGradientOp<Context>::DoRunWithType() {
auto &X = Input(0), &W = Input(1), &dY = Input(2);
auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
CANONICALIZE_AXIS_WITH_TENSOR(X);
// Determine the number of output channels
int64_t M = X.count(0, axis), K = X.count(axis), N;
if (out_channels_ <= 0) {
// Infer the "N" from the weights shape
N = W.count() / K;
CHECK_GT(N, 0) << "\nFailed to infer the N from "
<< "the weights shape: " << W.DimString();
} else {
// Use a fixed "N" from the argument
N = out_channels_;
}
if (dX->has_name()) {
if (transW_) {
math::Gemm(
CblasNoTrans,
CblasNoTrans,
M,
K,
N,
1.f,
dY.template data<T, Context>(),
W.template data<T, Context>(),
0.f,
dX->ReshapeLike(X)->template mutable_data<T, Context>(),
ctx());
} else {
math::Gemm(
CblasNoTrans,
CblasTrans,
M,
K,
N,
1.f,
dY.template data<T, Context>(),
W.template data<T, Context>(),
0.f,
dX->ReshapeLike(X)->template mutable_data<T, Context>(),
ctx());
}
}
if (dW->has_name()) {
if (transW_) {
math::Gemm(
CblasTrans,
CblasNoTrans,
N,
K,
M,
1.f,
dY.template data<T, Context>(),
X.template data<T, Context>(),
0.f,
dW->ReshapeLike(W)->template mutable_data<T, Context>(),
ctx());
} else {
math::Gemm(
CblasTrans,
CblasNoTrans,
K,
N,
M,
1.f,
X.template data<T, Context>(),
dY.template data<T, Context>(),
0.f,
dW->ReshapeLike(W)->template mutable_data<T, Context>(),
ctx());
}
}
if (dB->has_name()) {
vec32_t dims = {(int)M, (int)N}, axes = {0};
math::ReduceSum(
2,
dims.data(),
1,
axes.data(),
1.f,
dY.template data<T, Context>(),
dB->Reshape({N})->template mutable_data<T, Context>(),
ctx());
}
}
template <class Context>
void FullyConnectedGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU_OPERATOR(FullyConnected);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(FullyConnected);
#endif
DEPLOY_CPU_OPERATOR(FullyConnectedGradient);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(FullyConnectedGradient);
#endif
OPERATOR_SCHEMA(FullyConnected)
/* X, W, B */
.NumInputs(2, 3)
/* Y */
.NumOutputs(1);
OPERATOR_SCHEMA(FullyConnectedGradient)
/* X, W, dY */
.NumInputs(3)
/* dX, dW, dB */
.NumOutputs(3);
namespace {
class GradientMaker : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GradientMaker);
vector<OperatorDef> MakeDef() override {
return SingleDef(
def.type() + "Gradient",
"",
vector<string>({I(0), I(1), GO(0)}),
vector<string>({GI(0), GI(1), GI(2)}));
}
};
} // namespace
REGISTER_GRADIENT(FullyConnected, GradientMaker);
} // namespace dragon
#include "dragon/operators/math/gemm_op.h"
#include "dragon/core/workspace.h"
#include "dragon/utils/filler.h"
#include "dragon/utils/math_functions.h"
namespace dragon {
template <class Context>
template <typename T>
void GemmOp<Context>::DoRunWithType() {
auto &A = Input(0), &B = Input(1), *Y = Output(0);
CANONICALIZE_AXIS_WITH_TENSOR(A);
// Check matrix A
auto M = transA_ ? A.count(axis) : A.count(0, axis);
auto K = transA_ ? A.count(0, axis) : A.count(axis);
// Check matrix B
auto N = n_; // Init "N" from the argument
if (N <= 0) {
// Infer "N" from the B shape
N = B.count() / K;
CHECK_GT(N, 0) << "\nFailed to infer 'N' from "
<< "the B shape: " << B.DimString();
}
if (transB_ > 0) {
TENSOR_FILL(B, vec64_t({N, K}));
CHECK(B.ndim() == 2 && B.dim(1) == K)
<< "\nMatrixB's dimensions should be [N, K].\n"
<< "Got A as (" << M << ", " << K << "), "
<< "and B as " << B.DimString();
} else {
TENSOR_FILL(B, vec64_t({K, N}));
CHECK(B.ndim() == 2 && B.dim(0) == K)
<< "\nMatrixB's dimensions should be [K, N].\n"
<< "Got A as (" << M << ", " << K << "), "
<< "and B as " << B.DimString();
}
// Copy matrix C to Y if provided
vec64_t Y_dims(A.dims().begin(), A.dims().begin() + axis);
Y_dims.insert(transA_ ? Y_dims.begin() : Y_dims.end(), N);
if (InputSize() > 2) {
auto& C = Input(2);
if (C.ndim() == 0) {
TENSOR_FILL(C, vec64_t({N}));
}
if (math::utils::IsBinaryBroadcast(Y_dims, C.dims(), Y_dims)) {
math::Set(
C.ndim(),
C.dims().data(),
Y_dims.size(),
Y_dims.data(),
C.template data<T, Context>(),
Y->Reshape(Y_dims)->template mutable_data<T, Context>(),
ctx());
} else {
LOG(FATAL) << "Could not broadcast together with shapes: "
<< Tensor::DimString(Y_dims) << " " << C.DimString();
}
}
math::Gemm(
(CBLAS_TRANSPOSE)transA_,
(CBLAS_TRANSPOSE)transB_,
M,
N,
K,
alpha_,
A.template data<T, Context>(),
B.template data<T, Context>(),
InputSize() > 2 ? beta_ : 0.f,
Y->Reshape(Y_dims)->template mutable_data<T, Context>(),
ctx());
}
template <class Context>
void GemmOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
template <class Context>
template <typename T>
void GemmGradientOp<Context>::DoRunWithType() {
auto &A = Input(0), &B = Input(1), &dY = Input(3);
auto *dA = Output(0), *dB = Output(1), *dC = Output(2);
CANONICALIZE_AXIS_WITH_TENSOR(A);
// Check matrix A
auto M = transA_ ? A.count(axis) : A.count(0, axis);
auto K = transA_ ? A.count(0, axis) : A.count(axis);
// Check matrix B
auto N = n_; // Init "N" from the argument
if (N <= 0) {
// Infer "N" from the B shape
N = B.count() / K;
CHECK_GT(N, 0) << "\nFailed to infer 'N' from "
<< "the B shape: " << B.DimString();
}
if (dA->has_name()) {
if (transA_ > 0) {
math::Gemm(
transB_ ? CblasTrans : CblasNoTrans,
CblasTrans,
K,
M,
N,
alpha_,
B.template data<T, Context>(),
dY.template data<T, Context>(),
0.f,
dA->ReshapeLike(A)->template mutable_data<T, Context>(),
ctx());
} else {
math::Gemm(
CblasNoTrans,
transB_ ? CblasNoTrans : CblasTrans,
M,
K,
N,
alpha_,
dY.template data<T, Context>(),
B.template data<T, Context>(),
0.f,
dA->ReshapeLike(A)->template mutable_data<T, Context>(),
ctx());
}
}
if (dB->has_name()) {
if (transB_) {
math::Gemm(
CblasTrans,
transA_ ? CblasTrans : CblasNoTrans,
N,
K,
M,
alpha_,
dY.template data<T, Context>(),
A.template data<T, Context>(),
0.f,
dB->ReshapeLike(B)->template mutable_data<T, Context>(),
ctx());
} else {
math::Gemm(
transA_ ? CblasNoTrans : CblasTrans,
CblasNoTrans,
K,
N,
M,
alpha_,
A.template data<T, Context>(),
dY.template data<T, Context>(),
0.f,
dB->ReshapeLike(B)->template mutable_data<T, Context>(),
ctx());
}
}
if (dC->has_name()) {
auto& C = Input(2);
if (C.count() == dY.count()) {
math::Scale(
dY.count(),
beta_,
dY.template data<T, Context>(),
dC->ReshapeLike(C)->template mutable_data<T, Context>(),
ctx());
} else {
vec32_t Y_axes, C_axes;
math::utils::ComputeBinaryBroadcastAxes(
dY.dims(), C.dims(), dY.dims(), Y_axes, C_axes);
math::ReduceSum(
dY.ndim(),
vec32_t{dY.dims().begin(), dY.dims().end()}.data(),
C_axes.size(),
C_axes.data(),
beta_,
dY.template data<T, Context>(),
dC->ReshapeLike(C)->template mutable_data<T, Context>(),
ctx());
}
}
}
template <class Context>
void GemmGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU_OPERATOR(Gemm);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(Gemm);
#endif
DEPLOY_CPU_OPERATOR(GemmGradient);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(GemmGradient);
#endif
OPERATOR_SCHEMA(Gemm)
/* A, B, C */
.NumInputs(2, 3)
/* Y */
.NumOutputs(1);
OPERATOR_SCHEMA(GemmGradient)
/* A, B, C, dY */
.NumInputs(4)
/* dA, dB, dC */
.NumOutputs(3);
namespace {
class GradientMaker : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GradientMaker);
vector<OperatorDef> MakeDef() override {
return SingleDef(
def.type() + "Gradient",
"",
vector<string>({I(0), I(1), I(2), GO(0)}),
vector<string>({GI(0), GI(1), GI(2)}));
}
};
} // namespace
REGISTER_GRADIENT(Gemm, GradientMaker);
} // namespace dragon
...@@ -10,20 +10,23 @@ ...@@ -10,20 +10,23 @@
* ------------------------------------------------------------ * ------------------------------------------------------------
*/ */
#ifndef DRAGON_OPERATORS_MATH_FULLY_CONNECTED_OP_H_ #ifndef DRAGON_OPERATORS_MATH_GEMM_OP_H_
#define DRAGON_OPERATORS_MATH_FULLY_CONNECTED_OP_H_ #define DRAGON_OPERATORS_MATH_GEMM_OP_H_
#include "dragon/core/operator.h" #include "dragon/core/operator.h"
namespace dragon { namespace dragon {
template <class Context> template <class Context>
class FullyConnectedOp final : public Operator<Context> { class GemmOp final : public Operator<Context> {
public: public:
FullyConnectedOp(const OperatorDef& def, Workspace* ws) GemmOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws), : Operator<Context>(def, ws),
out_channels_(OP_SINGLE_ARG(int64_t, "out_channels", 0)), n_(OP_SINGLE_ARG(int64_t, "n", 0)),
transW_(OP_SINGLE_ARG(int64_t, "transW", 1)) {} alpha_(OP_SINGLE_ARG(float, "alpha", 1.f)),
beta_(OP_SINGLE_ARG(float, "beta", 1.f)),
transA_(OP_SINGLE_ARG(int64_t, "transA", 0)),
transB_(OP_SINGLE_ARG(int64_t, "transB", 0)) {}
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override; void RunOnDevice() override;
...@@ -32,16 +35,20 @@ class FullyConnectedOp final : public Operator<Context> { ...@@ -32,16 +35,20 @@ class FullyConnectedOp final : public Operator<Context> {
void DoRunWithType(); void DoRunWithType();
protected: protected:
int64_t out_channels_, transW_; float alpha_, beta_;
int64_t n_, transA_, transB_;
}; };
template <class Context> template <class Context>
class FullyConnectedGradientOp final : public Operator<Context> { class GemmGradientOp final : public Operator<Context> {
public: public:
FullyConnectedGradientOp(const OperatorDef& def, Workspace* ws) GemmGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws), : Operator<Context>(def, ws),
out_channels_(OP_SINGLE_ARG(int64_t, "out_channels", 0)), n_(OP_SINGLE_ARG(int64_t, "n", 0)),
transW_(OP_SINGLE_ARG(int64_t, "transW", 1)) {} alpha_(OP_SINGLE_ARG(float, "alpha", 1.f)),
beta_(OP_SINGLE_ARG(float, "beta", 1.f)),
transA_(OP_SINGLE_ARG(int64_t, "transA", 0)),
transB_(OP_SINGLE_ARG(int64_t, "transB", 0)) {}
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override; void RunOnDevice() override;
...@@ -50,9 +57,10 @@ class FullyConnectedGradientOp final : public Operator<Context> { ...@@ -50,9 +57,10 @@ class FullyConnectedGradientOp final : public Operator<Context> {
void DoRunWithType(); void DoRunWithType();
protected: protected:
int64_t out_channels_, transW_; float alpha_, beta_;
int64_t n_, transA_, transB_;
}; };
} // namespace dragon } // namespace dragon
#endif // DRAGON_OPERATORS_MATH_FULLY_CONNECTED_OP_H_ #endif // DRAGON_OPERATORS_MATH_GEMM_OP_H_
...@@ -20,37 +20,25 @@ namespace dragon { ...@@ -20,37 +20,25 @@ namespace dragon {
template <class Context> template <class Context>
class MatMulOp final : public Operator<Context> { class MatMulOp final : public Operator<Context> {
public: public:
MatMulOp(const OperatorDef& def, Workspace* ws) SIMPLE_CTOR_DTOR(MatMulOp);
: Operator<Context>(def, ws),
transA_(OP_SINGLE_ARG(int64_t, "transA", 0)),
transB_(OP_SINGLE_ARG(int64_t, "transB", 0)) {}
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override; void RunOnDevice() override;
template <typename T> template <typename T>
void DoRunWithType(); void DoRunWithType();
protected:
int64_t transA_, transB_;
}; };
template <class Context> template <class Context>
class MatMulGradientOp final : public Operator<Context> { class MatMulGradientOp final : public Operator<Context> {
public: public:
MatMulGradientOp(const OperatorDef& def, Workspace* ws) SIMPLE_CTOR_DTOR(MatMulGradientOp);
: Operator<Context>(def, ws),
transA_(OP_SINGLE_ARG(int64_t, "transA", 0)),
transB_(OP_SINGLE_ARG(int64_t, "transB", 0)) {}
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override; void RunOnDevice() override;
template <typename T> template <typename T>
void DoRunWithType(); void DoRunWithType();
protected:
int64_t transA_, transB_;
}; };
} // namespace dragon } // namespace dragon
......
...@@ -35,6 +35,7 @@ from dragon.core.ops.math_ops import dot ...@@ -35,6 +35,7 @@ from dragon.core.ops.math_ops import dot
from dragon.core.ops.math_ops import equal from dragon.core.ops.math_ops import equal
from dragon.core.ops.math_ops import exp from dragon.core.ops.math_ops import exp
from dragon.core.ops.math_ops import floor from dragon.core.ops.math_ops import floor
from dragon.core.ops.math_ops import gemm
from dragon.core.ops.math_ops import greater from dragon.core.ops.math_ops import greater
from dragon.core.ops.math_ops import greater_equal from dragon.core.ops.math_ops import greater_equal
from dragon.core.ops.math_ops import is_inf from dragon.core.ops.math_ops import is_inf
......
...@@ -33,7 +33,6 @@ from dragon.core.ops.activation_ops import relu6 ...@@ -33,7 +33,6 @@ from dragon.core.ops.activation_ops import relu6
from dragon.core.ops.activation_ops import selu from dragon.core.ops.activation_ops import selu
from dragon.core.ops.activation_ops import softmax from dragon.core.ops.activation_ops import softmax
from dragon.core.ops.activation_ops import swish from dragon.core.ops.activation_ops import swish
from dragon.core.ops.math_ops import fully_connected
from dragon.core.ops.normalization_ops import batch_norm from dragon.core.ops.normalization_ops import batch_norm
from dragon.core.ops.normalization_ops import group_norm from dragon.core.ops.normalization_ops import group_norm
from dragon.core.ops.normalization_ops import instance_norm from dragon.core.ops.normalization_ops import instance_norm
......
...@@ -414,30 +414,34 @@ def flatten_spec(args, inputs, outputs): ...@@ -414,30 +414,34 @@ def flatten_spec(args, inputs, outputs):
return outputs return outputs
@register('FullyConnected') @register('Gemm')
def fully_connected_spec(args, inputs, outputs): def gemm_spec(args, inputs, outputs):
outputs[0].dtype = inputs[0].dtype outputs[0].dtype = inputs[0].dtype
axis, out_channels = args['axis'], args.get('out_channels', None) axis, n = args['axis'], args.get('n', None)
while axis < 0: while axis < 0:
try: try:
axis += len(inputs[0].shape) axis += len(inputs[0].shape)
except TypeError: except TypeError:
return outputs break
out_shape = [None] * (axis + 1) out_shape = [None] * axis if axis >= 0 else None
if out_channels is None: if n is None:
try: try:
if args['transW']: if args['transB']:
out_channels = inputs[1].shape[0] n = inputs[1].shape[0]
else: else:
out_channels = inputs[1].shape[1] n = inputs[1].shape[1]
except (TypeError, IndexError): except (TypeError, IndexError):
out_channels = None n = None
try: try:
out_shape[axis] = out_channels if out_shape is None or inputs[0].shape is not None:
out_shape[:axis] = inputs[0].shape[:axis] out_shape = list(inputs[0].shape[:axis])
if args['transA']:
out_shape.insert(0, n)
else:
out_shape.append(n)
outputs[0].shape = out_shape
except (TypeError, IndexError): except (TypeError, IndexError):
pass pass
outputs[0].shape = out_shape
return outputs return outputs
...@@ -510,12 +514,25 @@ def masked_select_spec(args, inputs, outputs): ...@@ -510,12 +514,25 @@ def masked_select_spec(args, inputs, outputs):
@register('MatMul') @register('MatMul')
def matmul_spec(args, inputs, outputs): def matmul_spec(args, inputs, outputs):
outputs[0].dtype = inputs[0].dtype outputs[0].dtype = inputs[0].dtype
ta, tb = args['transA'], args['transB']
try: try:
a_shape = list(inputs[0].shape[:])
b_shape = list(inputs[1].shape[:]) b_shape = list(inputs[1].shape[:])
a_shape = out_shape = list(inputs[0].shape[:]) if len(a_shape) >= 2 and len(b_shape) >= 2:
out_shape[-2] = a_shape[-1] if ta else a_shape[-2] out_shape = [1] * max(len(a_shape), len(b_shape))
out_shape[-1] = b_shape[-2] if tb else b_shape[-1] a_shape = [1] * (len(out_shape) - len(a_shape)) + a_shape
b_shape = [1] * (len(out_shape) - len(b_shape)) + b_shape
for i in range(len(out_shape)):
try:
out_shape[i] = max(a_shape[i], b_shape[i])
except TypeError:
out_shape[i] = None
out_shape[-2] = a_shape[-2]
out_shape[-1] = b_shape[-1]
elif len(a_shape) == 1 and len(b_shape) == 1:
out_shape = []
else:
out_shape = a_shape if len(b_shape) == 1 else b_shape
out_shape.pop(-1 if len(b_shape) == 1 else -2)
except (TypeError, IndexError): except (TypeError, IndexError):
out_shape = None out_shape = None
outputs[0].shape = out_shape outputs[0].shape = out_shape
......
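The inference above mirrors NumPy's matmul shape rules: pad the shorter shape with leading ones, broadcast the batch dimensions, take the rows from A and the columns from B, and squeeze the pure-vector cases. A quick NumPy check of the cases it enumerates, for illustration only:

```python
import numpy as np

print(np.matmul(np.ones(3), np.ones(3)).shape)                      # () vector x vector
print(np.matmul(np.ones(3), np.ones((3, 4))).shape)                 # (4,) vector x matrix
print(np.matmul(np.ones((2, 3)), np.ones(3)).shape)                 # (2,) matrix x vector
print(np.matmul(np.ones((5, 1, 2, 3)), np.ones((4, 3, 6))).shape)   # (5, 4, 2, 6) broadcast batch
```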
...@@ -498,23 +498,30 @@ def floor(inputs, **kwargs): ...@@ -498,23 +498,30 @@ def floor(inputs, **kwargs):
@OpSchema.num_inputs(2, 3) @OpSchema.num_inputs(2, 3)
def fully_connected(inputs, axis=1, transpose_w=True, **kwargs): def gemm(
r"""Compute the dense matrix multiplication along the given axes. inputs,
alpha=1.0,
.. math:: y = Wx + b beta=1.0,
transpose_a=False,
The column of input matrix is determined by: transpose_b=False,
**kwargs
.. math:: \text{Col} = \text{DimSince}(\text{Input}, \text{Axis}) ):
r"""Compute the general matrix multiplication.
.. math:: \text{out} = \alpha AB + \beta C
Parameters Parameters
---------- ----------
inputs : Sequence[dragon.Tensor] inputs : Sequence[dragon.Tensor]
The tensor :math:`x`, :math:`W` and :math:`b`. The matrix :math:`A`, :math:`B` and optional :math:`C`.
axis : int, optional, default=1 alpha : float, optional, default=1.0
The start axis to compute, can be negative. The value to :math:`\alpha`.
transpose_w : bool, optional, default=True beta : float, optional, default=1.0
**True** to transpose :math:`W` before computation. The value to :math:`\beta`.
transpose_a : bool, optional, default=False
**True** to transpose :math:`A` before computation.
transpose_b : bool, optional, default=False
**True** to transpose :math:`B` before computation.
Returns Returns
------- -------
...@@ -523,15 +530,22 @@ def fully_connected(inputs, axis=1, transpose_w=True, **kwargs): ...@@ -523,15 +530,22 @@ def fully_connected(inputs, axis=1, transpose_w=True, **kwargs):
""" """
args = ArgHelper.parse(locals()) args = ArgHelper.parse(locals())
op_lib = math_ops_lib.FullyConnected args['axis'] = kwargs.get('axis', -1)
args['alpha'], args['beta'] = float(alpha), float(beta)
op_lib = math_ops_lib.Gemm
if context.executing_eagerly(): if context.executing_eagerly():
return op_lib \ return op_lib \
.instantiate(axis=axis, transpose_w=transpose_w) \ .instantiate(
.apply(inputs) axis=args['axis'],
alpha=args['alpha'],
beta=args['beta'],
transpose_a=transpose_a,
transpose_b=transpose_b,
).apply(inputs)
else: else:
args.pop('transpose_w') args['transA'] = args.pop('transpose_a')
args['transW'] = transpose_w args['transB'] = args.pop('transpose_b')
return op_lib.blend('FullyConnected', **args) return op_lib.blend(**args)
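A usage sketch for the function above, exercising the documented ``alpha``/``beta``/``transpose_b`` arguments (shapes are illustrative; ``C`` broadcasts against the output):

```python
import dragon

a = dragon.ones((4, 3), 'float32')   # A: (M, K)
b = dragon.ones((3, 5), 'float32')   # B: (K, N)
c = dragon.ones((5,), 'float32')     # C, broadcast against the (M, N) output

# out = 2.0 * A @ B + 0.5 * C
y = dragon.math.gemm([a, b, c], alpha=2.0, beta=0.5)

# Pass B as (N, K) and set transpose_b=True for a transposed weight layout.
bt = dragon.ones((5, 3), 'float32')
y2 = dragon.math.gemm([a, bt], transpose_b=True)
```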
@OpSchema.num_inputs(2) @OpSchema.num_inputs(2)
...@@ -812,42 +826,44 @@ def less_equal(inputs, **kwargs): ...@@ -812,42 +826,44 @@ def less_equal(inputs, **kwargs):
@OpSchema.num_inputs(2) @OpSchema.num_inputs(2)
def matmul(inputs, transpose_a=False, transpose_b=False, **kwargs): def matmul(inputs, **kwargs):
r"""Compute the matrix multiplication. r"""Compute the matrix multiplication.
.. math:: y = a \times b .. math:: \text{out} = \text{input1} \times \text{input2}
The rank of ``a`` and ``b`` should be equal and >= 2: The behavior depends on the shape of input tensors:
```python * If both tensors are 1d, computes the vector product.
# Ok, a typical matrix multiplication * If tensors are 1d and >=2d, computes the vector-matrix multiplication.
a = dragon.ones((2, 3), 'float32') * If tensors are >=2d and 1d, computes the matrix-vector multiplication.
b = dragon.ones((3, 3), 'float32') * If both tensors are >= 2d, computes the matrix-matrix multiplication.
print(dragon.math.matmul([a, b])) * If one tensor is >= 3d, applies batching and broadcasting to the computation.
# Compute a batch matrix multiplication if rank > 2 Examples:
aa = dragon.ones((4, 2, 3), 'float32')
bb = dragon.ones((4, 3, 3), 'float32')
print(dragon.math.matmul([aa, bb]))
```
If inputs are transposed, remember to transpose them back:
```python ```python
# Vector x Vector
a = dragon.ones((2,), 'float32')
b = dragon.ones((2,), 'float32')
print(dragon.math.matmul([a, b]))
# Vector x Matrix
a = dragon.ones((2,), 'float32')
b = dragon.ones((2, 3), 'float32')
print(dragon.math.matmul([a, b]))
# Matrix x Vector
a = dragon.ones((3, 2), 'float32') a = dragon.ones((3, 2), 'float32')
b = dragon.ones((3, 3), 'float32') b = dragon.ones((2,), 'float32')
print(dragon.math.matmul([a, b])) # ``a`` takes the wrong dimensions print(dragon.math.matmul([a, b]))
print(dragon.math.matmul([a, b], transpose_a=True)) # Ok # Matrix x Matrix
a = dragon.ones((2, 3), 'float32')
b = dragon.ones((3, 2), 'float32')
print(dragon.math.matmul([a, b]))
``` ```
Parameters Parameters
---------- ----------
inputs : Sequence[dragon.Tensor] inputs : Sequence[dragon.Tensor]
The matrix :math:`a` and :math:`b`. The input tensors.
transpose_a : bool, optional, default=False
**True** to transpose :math:`a` before computation.
transpose_b : bool, optional, default=False
**True** to transpose :math:`b` before computation.
Returns Returns
------- -------
...@@ -858,15 +874,9 @@ def matmul(inputs, transpose_a=False, transpose_b=False, **kwargs): ...@@ -858,15 +874,9 @@ def matmul(inputs, transpose_a=False, transpose_b=False, **kwargs):
args = ArgHelper.parse(locals()) args = ArgHelper.parse(locals())
op_lib = math_ops_lib.MatMul op_lib = math_ops_lib.MatMul
if context.executing_eagerly(): if context.executing_eagerly():
return op_lib \ return op_lib.instantiate().apply(inputs)
.instantiate(
transpose_a=transpose_a,
transpose_b=transpose_b,
).apply(inputs)
else: else:
args.pop('transpose_a') return op_lib.blend(**args)
args.pop('transpose_b')
return op_lib.blend(transA=transpose_a, transB=transpose_b, **args)
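A sketch of the batched and broadcast cases that the new docstring describes but does not show:

```python
import dragon

# Batched matrix-matrix: leading dimensions act as batch dimensions.
a = dragon.ones((8, 2, 3), 'float32')
b = dragon.ones((8, 3, 4), 'float32')
print(dragon.math.matmul([a, b]))   # expected shape: (8, 2, 4)

# Batch dimensions are broadcast against each other.
a = dragon.ones((5, 1, 2, 3), 'float32')
b = dragon.ones((4, 3, 6), 'float32')
print(dragon.math.matmul([a, b]))   # expected shape: (5, 4, 2, 6)
```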
@OpSchema.num_inputs(2) @OpSchema.num_inputs(2)
......
...@@ -80,20 +80,26 @@ class Clip(Operator): ...@@ -80,20 +80,26 @@ class Clip(Operator):
return self.dispatch(inputs, [self.alloc()]) return self.dispatch(inputs, [self.alloc()])
class FullyConnected(Operator): class Gemm(Operator):
"""FullyConnected operator.""" """FullyConnected operator."""
def __init__(self, key, dev, **kwargs): def __init__(self, key, dev, **kwargs):
super(FullyConnected, self).__init__(key, dev, **kwargs) super(Gemm, self).__init__(key, dev, **kwargs)
self.axis = kwargs.get('axis', 1) self.axis = kwargs.get('axis', -1)
self.transpose_w = kwargs.get('transpose_w', True) self.alpha = kwargs.get('alpha', 1.0)
self.beta = kwargs.get('beta', 1.0)
self.transpose_a = kwargs.get('transpose_a', False)
self.transpose_b = kwargs.get('transpose_b', False)
def attributes(self): def attributes(self):
return { return {
'op_type': 'FullyConnected', 'op_type': 'Gemm',
'arguments': { 'arguments': {
'axis': self.axis, 'axis': self.axis,
'transW': self.transpose_w, 'alpha': self.alpha,
'beta': self.beta,
'transA': self.transpose_a,
'transB': self.transpose_b,
} }
} }
...@@ -104,18 +110,10 @@ class FullyConnected(Operator): ...@@ -104,18 +110,10 @@ class FullyConnected(Operator):
class MatMul(Operator): class MatMul(Operator):
"""MatMul operator.""" """MatMul operator."""
def __init__(self, key, dev, **kwargs):
super(MatMul, self).__init__(key, dev, **kwargs)
self.transpose_a = kwargs.get('transpose_a', False)
self.transpose_b = kwargs.get('transpose_b', False)
def attributes(self): def attributes(self):
return { return {
'op_type': 'MatMul', 'op_type': 'MatMul',
'arguments': { 'arguments': {},
'transA': self.transpose_a,
'transB': self.transpose_b,
}
} }
def forward(self, inputs): def forward(self, inputs):
......
...@@ -136,6 +136,7 @@ def conv( ...@@ -136,6 +136,7 @@ def conv(
data_format=data_format, data_format=data_format,
bias=len(inputs) > 2, bias=len(inputs) > 2,
dtype=inputs[1].dtype, dtype=inputs[1].dtype,
input_shape=inputs[0].shape,
).apply(inputs) ).apply(inputs)
else: else:
return op_lib.blend(**args) return op_lib.blend(**args)
...@@ -465,6 +466,7 @@ def conv_transpose( ...@@ -465,6 +466,7 @@ def conv_transpose(
data_format=data_format, data_format=data_format,
bias=len(inputs) > 2, bias=len(inputs) > 2,
dtype=inputs[1].dtype, dtype=inputs[1].dtype,
input_shape=inputs[0].shape,
).apply(inputs) ).apply(inputs)
else: else:
return op_lib.blend(**args) return op_lib.blend(**args)
......
...@@ -44,13 +44,6 @@ def hardsigmoid_exporter(op_def, context): ...@@ -44,13 +44,6 @@ def hardsigmoid_exporter(op_def, context):
return node, const_tensors return node, const_tensors
@export_util.register('PRelu')
def prelu_exporter(op_def, context):
node, const_tensors = export_util.translate(**locals())
const_tensors = [helper.from_tensor(op_def.input[1], context.ws)]
return node, const_tensors
@export_util.register('Relu') @export_util.register('Relu')
def relu_exporter(op_def, context): def relu_exporter(op_def, context):
node, const_tensors = export_util.translate(**locals()) node, const_tensors = export_util.translate(**locals())
......
...@@ -81,24 +81,14 @@ def clip_exporter_v11(op_def, context): ...@@ -81,24 +81,14 @@ def clip_exporter_v11(op_def, context):
return node, const_tensors return node, const_tensors
@export_util.register('FullyConnected-7') @export_util.register('Gemm-7')
def fully_connected_exporter_v7(op_def, context): def gemm_exporter_v7(op_def, context):
node, const_tensors = export_util.translate(**locals()) return export_util.translate(**locals())
node.op_type = 'Gemm'
helper.add_attribute(node, 'alpha', 1.)
helper.add_attribute(node, 'beta', 1.)
for arg in op_def.arg:
if arg.name == 'transW':
helper.add_attribute(node, 'transB', arg.i)
# Weights and biases
const_tensors = [helper.from_tensor(name, context.ws)
for name in op_def.input[1:]]
return node, const_tensors
@export_util.register('FullyConnected') @export_util.register('Gemm')
def fully_connected_exporter(op_def, context): def gemm_exporter(op_def, context):
node, const_tensors = fully_connected_exporter_v7(op_def, context) node, const_tensors = gemm_exporter_v7(op_def, context)
helper.add_attribute(node, 'broadcast', 1) # Removed since opset 7 helper.add_attribute(node, 'broadcast', 1) # Removed since opset 7
return node, const_tensors return node, const_tensors
......
...@@ -29,8 +29,6 @@ def batch_norm_exporter(op_def, context): ...@@ -29,8 +29,6 @@ def batch_norm_exporter(op_def, context):
elif arg.name == 'momentum_desc': elif arg.name == 'momentum_desc':
momentum = helper.fetch_argument(op_def, arg, context.ws) momentum = helper.fetch_argument(op_def, arg, context.ws)
helper.add_attribute(node, 'momentum', float(momentum)) helper.add_attribute(node, 'momentum', float(momentum))
# Weight, bias, running mean and running variance
const_tensors = [helper.from_tensor(e, context.ws) for e in op_def.input[1:]]
return node, const_tensors return node, const_tensors
...@@ -48,8 +46,6 @@ def group_norm_exporter(op_def, context): ...@@ -48,8 +46,6 @@ def group_norm_exporter(op_def, context):
else: else:
helper.add_attribute(node, 'op_type', 'GroupNorm') helper.add_attribute(node, 'op_type', 'GroupNorm')
helper.add_attribute(node, 'group', arg.i) helper.add_attribute(node, 'group', arg.i)
# Weight and bias
const_tensors = [helper.from_tensor(e, context.ws) for e in op_def.input[1:]]
return node, const_tensors return node, const_tensors
......
...@@ -25,7 +25,7 @@ from dragon.vm.onnx.core.exporters import utils as export_util ...@@ -25,7 +25,7 @@ from dragon.vm.onnx.core.exporters import utils as export_util
'ConvTranspose', 'ConvTranspose',
'DepthwiseConv', 'DepthwiseConv',
]) ])
def convolution(op_def, context): def conv_exporter(op_def, context):
node, const_tensors = export_util.translate(**locals()) node, const_tensors = export_util.translate(**locals())
node.op_type = 'ConvTranspose' if 'Transpose' in op_def.type else 'Conv' node.op_type = 'ConvTranspose' if 'Transpose' in op_def.type else 'Conv'
if 'Depthwise' in op_def.type: if 'Depthwise' in op_def.type:
...@@ -58,8 +58,6 @@ def convolution(op_def, context): ...@@ -58,8 +58,6 @@ def convolution(op_def, context):
helper.add_attribute(node, 'output_shape', arg.ints) helper.add_attribute(node, 'output_shape', arg.ints)
elif arg.name == 'output_padding': elif arg.name == 'output_padding':
helper.add_attribute(node, 'output_padding', arg.ints) helper.add_attribute(node, 'output_padding', arg.ints)
# Weights and biases
const_tensors = [helper.from_tensor(e, context.ws) for e in op_def.input[1:]]
return node, const_tensors return node, const_tensors
......
...@@ -203,8 +203,7 @@ DRAGON_API void Gemv<float16, CPUContext>( ...@@ -203,8 +203,7 @@ DRAGON_API void Gemv<float16, CPUContext>(
const float16* x, const float16* x,
const float beta, const float beta,
float16* y, float16* y,
CPUContext* ctx, CPUContext* ctx) {
const std::string math_type) {
CPU_FP16_NOT_SUPPORTED; CPU_FP16_NOT_SUPPORTED;
} }
...@@ -219,8 +218,7 @@ DRAGON_API void Gemv<float16, CPUContext>( ...@@ -219,8 +218,7 @@ DRAGON_API void Gemv<float16, CPUContext>(
const T* x, \ const T* x, \
const float beta, \ const float beta, \
T* y, \ T* y, \
CPUContext* ctx, \ CPUContext* ctx) { \
const string math_type) { \
T _alpha_ = alpha, _beta_ = beta; \ T _alpha_ = alpha, _beta_ = beta; \
EigenVectorMap<T> y_vec(y, TransA == CblasNoTrans ? M : N); \ EigenVectorMap<T> y_vec(y, TransA == CblasNoTrans ? M : N); \
if (beta == 0.f) \ if (beta == 0.f) \
...@@ -260,8 +258,7 @@ DRAGON_API void Gemm<float16, CPUContext>( ...@@ -260,8 +258,7 @@ DRAGON_API void Gemm<float16, CPUContext>(
const float16* B, const float16* B,
const float beta, const float beta,
float16* C, float16* C,
CPUContext* ctx, CPUContext* ctx) {
const string math_type) {
CPU_FP16_NOT_SUPPORTED; CPU_FP16_NOT_SUPPORTED;
} }
...@@ -278,8 +275,7 @@ DRAGON_API void Gemm<float16, CPUContext>( ...@@ -278,8 +275,7 @@ DRAGON_API void Gemm<float16, CPUContext>(
const T* B, \ const T* B, \
const float beta, \ const float beta, \
T* C, \ T* C, \
CPUContext* ctx, \ CPUContext* ctx) { \
const string math_type) { \
T _alpha_ = alpha, _beta_ = beta; \ T _alpha_ = alpha, _beta_ = beta; \
auto C_mat = EigenMatrixMap<T>(C, N, M); \ auto C_mat = EigenMatrixMap<T>(C, N, M); \
if (beta == 0.f) \ if (beta == 0.f) \
...@@ -328,6 +324,105 @@ DEFINE_GEMM_FUNC(float); ...@@ -328,6 +324,105 @@ DEFINE_GEMM_FUNC(float);
DEFINE_GEMM_FUNC(double); DEFINE_GEMM_FUNC(double);
#undef DEFINE_GEMM_FUNC #undef DEFINE_GEMM_FUNC
template <>
DRAGON_API void GemmBatched<float16, CPUContext>(
const CBLAS_TRANSPOSE TransA,
const CBLAS_TRANSPOSE TransB,
const int batch_size,
const int M,
const int N,
const int K,
const float alpha,
const float16** A,
const float16** B,
const float beta,
float16** C,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
#define DEFINE_BATCHED_GEMM_FUNC(T) \
template <> \
DRAGON_API void GemmBatched<T, CPUContext>( \
const CBLAS_TRANSPOSE TransA, \
const CBLAS_TRANSPOSE TransB, \
const int batch_size, \
const int M, \
const int N, \
const int K, \
const float alpha, \
const T** A, \
const T** B, \
const float beta, \
T** C, \
CPUContext* ctx) { \
for (int i = 0; i < batch_size; ++i) { \
Gemm(TransA, TransB, M, N, K, alpha, A[i], B[i], beta, C[i], ctx); \
} \
}
DEFINE_BATCHED_GEMM_FUNC(float);
DEFINE_BATCHED_GEMM_FUNC(double);
#undef DEFINE_BATCHED_GEMM_FUNC
template <>
DRAGON_API void GemmStridedBatched<float16, CPUContext>(
const CBLAS_TRANSPOSE TransA,
const CBLAS_TRANSPOSE TransB,
const int batch_size,
const int M,
const int N,
const int K,
const int A_stride,
const int B_stride,
const int C_stride,
const float alpha,
const float16* A,
const float16* B,
const float beta,
float16* C,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
#define DEFINE_STRIDED_BATCHED_GEMM_FUNC(T) \
template <> \
DRAGON_API void GemmStridedBatched<T, CPUContext>( \
const CBLAS_TRANSPOSE TransA, \
const CBLAS_TRANSPOSE TransB, \
const int batch_size, \
const int M, \
const int N, \
const int K, \
const int A_stride, \
const int B_stride, \
const int C_stride, \
const float alpha, \
const T* A, \
const T* B, \
const float beta, \
T* C, \
CPUContext* ctx) { \
for (int i = 0; i < batch_size; ++i) { \
Gemm( \
TransA, \
TransB, \
M, \
N, \
K, \
alpha, \
A + i * A_stride, \
B + i * B_stride, \
beta, \
C + i * C_stride, \
ctx); \
} \
}
DEFINE_STRIDED_BATCHED_GEMM_FUNC(float);
DEFINE_STRIDED_BATCHED_GEMM_FUNC(double);
#undef DEFINE_STRIDED_BATCHED_GEMM_FUNC
} // namespace math } // namespace math
} // namespace dragon } // namespace dragon
...@@ -85,8 +85,7 @@ DRAGON_API void Gemv( ...@@ -85,8 +85,7 @@ DRAGON_API void Gemv(
const T* x, const T* x,
const float beta, const float beta,
T* y, T* y,
Context* ctx, Context* ctx);
const string math_type = "float32");
template <typename T, class Context> template <typename T, class Context>
DRAGON_API void Gemm( DRAGON_API void Gemm(
...@@ -100,8 +99,40 @@ DRAGON_API void Gemm( ...@@ -100,8 +99,40 @@ DRAGON_API void Gemm(
const T* B, const T* B,
const float beta, const float beta,
T* C, T* C,
Context* ctx, Context* ctx);
const string math_type = "float32");
template <typename T, class Context>
DRAGON_API void GemmBatched(
const CBLAS_TRANSPOSE TransA,
const CBLAS_TRANSPOSE TransB,
const int batch_size,
const int M,
const int N,
const int K,
const float alpha,
const T** A,
const T** B,
const float beta,
T** C,
Context* ctx);
template <typename T, class Context>
DRAGON_API void GemmStridedBatched(
const CBLAS_TRANSPOSE TransA,
const CBLAS_TRANSPOSE TransB,
const int batch_size,
const int M,
const int N,
const int K,
const int A_stride,
const int B_stride,
const int C_stride,
const float alpha,
const T* A,
const T* B,
const float beta,
T* C,
Context* ctx);
} // namespace math } // namespace math
......
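``GemmBatched`` and ``GemmStridedBatched`` each compute one independent GEMM per batch index; the strided variant walks flat buffers using per-matrix element strides. A NumPy sketch of the semantics (illustration only, transposition flags omitted; this is not the C++ API):

```python
import numpy as np

def gemm_strided_batched(batch, M, N, K, alpha, A, B, beta, C,
                         A_stride, B_stride, C_stride):
    """Reference semantics: C_i = alpha * A_i @ B_i + beta * C_i over flat buffers."""
    for i in range(batch):
        a = A[i * A_stride:(i + 1) * A_stride].reshape(M, K)
        b = B[i * B_stride:(i + 1) * B_stride].reshape(K, N)
        c = C[i * C_stride:(i + 1) * C_stride].reshape(M, N)
        c[:] = alpha * (a @ b) + beta * c

A = np.ones(2 * 2 * 3, 'float32')    # 2 batches of (2, 3)
B = np.ones(2 * 3 * 4, 'float32')    # 2 batches of (3, 4)
C = np.zeros(2 * 2 * 4, 'float32')   # 2 batches of (2, 4)
gemm_strided_batched(2, 2, 4, 3, 1.0, A, B, 0.0, C, 6, 12, 8)
print(C.reshape(2, 2, 4))            # every entry is 3.0 (= K)
```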
...@@ -24,21 +24,21 @@ void _Cast(const int n, const InputT* x, OutputT* y) { ...@@ -24,21 +24,21 @@ void _Cast(const int n, const InputT* x, OutputT* y) {
#define DEFINE_CAST_KERNEL_LAUNCHER(InputT, OutputT) \ #define DEFINE_CAST_KERNEL_LAUNCHER(InputT, OutputT) \
template <> \ template <> \
void Cast<InputT, OutputT, CPUContext>( \ DRAGON_API void Cast<InputT, OutputT, CPUContext>( \
const int n, const InputT* x, OutputT* y, CPUContext* ctx) { \ const int n, const InputT* x, OutputT* y, CPUContext* ctx) { \
_Cast(n, x, y); \ _Cast(n, x, y); \
} }
#define DEFINE_UNSUPPORTED_KERNEL_LAUNCHER(InputT, OutputT) \ #define DEFINE_UNSUPPORTED_KERNEL_LAUNCHER(InputT, OutputT) \
template <> \ template <> \
void Cast<InputT, OutputT, CPUContext>( \ DRAGON_API void Cast<InputT, OutputT, CPUContext>( \
const int n, const InputT* x, OutputT* y, CPUContext* ctx) { \ const int n, const InputT* x, OutputT* y, CPUContext* ctx) { \
LOG(FATAL) << "Unsupported conversion: " \ LOG(FATAL) << "Unsupported conversion: " \
<< types::to_string(TypeMeta::Make<InputT>()) << " -> " \ << types::to_string(TypeMeta::Make<InputT>()) << " -> " \
<< types::to_string(TypeMeta::Make<OutputT>()); \ << types::to_string(TypeMeta::Make<OutputT>()); \
} \ } \
template <> \ template <> \
void Cast<OutputT, InputT, CPUContext>( \ DRAGON_API void Cast<OutputT, InputT, CPUContext>( \
const int n, const OutputT* x, InputT* y, CPUContext* ctx) { \ const int n, const OutputT* x, InputT* y, CPUContext* ctx) { \
LOG(FATAL) << "Unsupported conversion: " \ LOG(FATAL) << "Unsupported conversion: " \
<< types::to_string(TypeMeta::Make<OutputT>()) << " -> " \ << types::to_string(TypeMeta::Make<OutputT>()) << " -> " \
......
...@@ -23,7 +23,7 @@ __global__ void _Cast(const int nthreads, const InputT* x, OutputT* y) { ...@@ -23,7 +23,7 @@ __global__ void _Cast(const int nthreads, const InputT* x, OutputT* y) {
#define DEFINE_CAST_KERNEL_LAUNCHER(InputT, OutputT) \ #define DEFINE_CAST_KERNEL_LAUNCHER(InputT, OutputT) \
template <> \ template <> \
void Cast<InputT, OutputT, CUDAContext>( \ DRAGON_API void Cast<InputT, OutputT, CUDAContext>( \
const int n, const InputT* x, OutputT* y, CUDAContext* ctx) { \ const int n, const InputT* x, OutputT* y, CUDAContext* ctx) { \
_Cast<<<CUDA_BLOCKS(n), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ _Cast<<<CUDA_BLOCKS(n), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
n, \ n, \
...@@ -33,14 +33,14 @@ __global__ void _Cast(const int nthreads, const InputT* x, OutputT* y) { ...@@ -33,14 +33,14 @@ __global__ void _Cast(const int nthreads, const InputT* x, OutputT* y) {
#define DEFINE_UNSUPPORTED_KERNEL_LAUNCHER(InputT, OutputT) \ #define DEFINE_UNSUPPORTED_KERNEL_LAUNCHER(InputT, OutputT) \
template <> \ template <> \
void Cast<InputT, OutputT, CUDAContext>( \ DRAGON_API void Cast<InputT, OutputT, CUDAContext>( \
const int n, const InputT* x, OutputT* y, CUDAContext* ctx) { \ const int n, const InputT* x, OutputT* y, CUDAContext* ctx) { \
LOG(FATAL) << "Unsupported conversion: " \ LOG(FATAL) << "Unsupported conversion: " \
<< types::to_string(TypeMeta::Make<InputT>()) << " -> " \ << types::to_string(TypeMeta::Make<InputT>()) << " -> " \
<< types::to_string(TypeMeta::Make<OutputT>()); \ << types::to_string(TypeMeta::Make<OutputT>()); \
} \ } \
template <> \ template <> \
void Cast<OutputT, InputT, CUDAContext>( \ DRAGON_API void Cast<OutputT, InputT, CUDAContext>( \
const int n, const OutputT* x, InputT* y, CUDAContext* ctx) { \ const int n, const OutputT* x, InputT* y, CUDAContext* ctx) { \
LOG(FATAL) << "Unsupported conversion: " \ LOG(FATAL) << "Unsupported conversion: " \
<< types::to_string(TypeMeta::Make<OutputT>()) << " -> " \ << types::to_string(TypeMeta::Make<OutputT>()) << " -> " \
......
...@@ -217,18 +217,18 @@ DEFINE_REDUCE_FUNC(Sum); ...@@ -217,18 +217,18 @@ DEFINE_REDUCE_FUNC(Sum);
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(name) \ #define DEFINE_KERNEL_LAUNCHER(name) \
template <> \ template <> \
void Reduce##name<float16, CPUContext>( \ DRAGON_API void Reduce##name<float16, CPUContext>( \
const int num_dims, \ const int num_dims, \
const int* dims, \ const int* dims, \
const int num_axes, \ const int num_axes, \
const int* axes, \ const int* axes, \
const float scale, \ const float scale, \
const float16* x, \ const float16* x, \
float16* y, \ float16* y, \
CPUContext* ctx) { \ CPUContext* ctx) { \
CPU_FP16_NOT_SUPPORTED; \ CPU_FP16_NOT_SUPPORTED; \
} }
DEFINE_KERNEL_LAUNCHER(Max); DEFINE_KERNEL_LAUNCHER(Max);
...@@ -258,7 +258,7 @@ DRAGON_API float16 Sum<float16, CPUContext>( ...@@ -258,7 +258,7 @@ DRAGON_API float16 Sum<float16, CPUContext>(
#define DEFINE_KERNEL_LAUNCHER(name, T) \ #define DEFINE_KERNEL_LAUNCHER(name, T) \
template <> \ template <> \
void Reduce##name<T, CPUContext>( \ DRAGON_API void Reduce##name<T, CPUContext>( \
const int num_dims, \ const int num_dims, \
const int* dims, \ const int* dims, \
const int num_axes, \ const int num_axes, \
...@@ -298,7 +298,7 @@ DEFINE_KERNEL_LAUNCHER(Sum, double); ...@@ -298,7 +298,7 @@ DEFINE_KERNEL_LAUNCHER(Sum, double);
*y = val * T(scale); \ *y = val * T(scale); \
} \ } \
template <> \ template <> \
T Sum<T, CPUContext>( \ DRAGON_API T Sum<T, CPUContext>( \
const int n, const float scale, const T* x, CPUContext* ctx) { \ const int n, const float scale, const T* x, CPUContext* ctx) { \
T val = ConstEigenVectorArrayMap<T>(x, n).sum(); \ T val = ConstEigenVectorArrayMap<T>(x, n).sum(); \
return val * T(scale); \ return val * T(scale); \
......
...@@ -174,7 +174,7 @@ DEFINE_REDUCE_DISPATCHER(Sum); ...@@ -174,7 +174,7 @@ DEFINE_REDUCE_DISPATCHER(Sum);
// We found that FP16 accumulator drops too many small values in // We found that FP16 accumulator drops too many small values in
// empirical experiments. // empirical experiments.
template <> template <>
void ReduceSum<float16, CUDAContext>( DRAGON_API void ReduceSum<float16, CUDAContext>(
const int num_dims, const int num_dims,
const int* dims, const int* dims,
const int num_axes, const int num_axes,
...@@ -199,7 +199,7 @@ void ReduceSum<float16, CUDAContext>( ...@@ -199,7 +199,7 @@ void ReduceSum<float16, CUDAContext>(
#define DEFINE_KERNEL_LAUNCHER(name, T, AccT, Reducer, kInit) \ #define DEFINE_KERNEL_LAUNCHER(name, T, AccT, Reducer, kInit) \
template <> \ template <> \
void Reduce##name<T, CUDAContext>( \ DRAGON_API void Reduce##name<T, CUDAContext>( \
const int num_dims, \ const int num_dims, \
const int* dims, \ const int* dims, \
const int num_axes, \ const int num_axes, \
......
...@@ -174,7 +174,8 @@ void ComputeBinaryBroadcastDims( ...@@ -174,7 +174,8 @@ void ComputeBinaryBroadcastDims(
const vec64_t& A_dims, const vec64_t& A_dims,
const vec64_t& B_dims, const vec64_t& B_dims,
vec64_t& A_broadcast_dims, vec64_t& A_broadcast_dims,
vec64_t& B_broadcast_dims) { vec64_t& B_broadcast_dims,
int64_t* C_broadcast_dims) {
auto num_dims = std::max(A_dims.size(), B_dims.size()); auto num_dims = std::max(A_dims.size(), B_dims.size());
A_broadcast_dims.resize(num_dims); A_broadcast_dims.resize(num_dims);
B_broadcast_dims.resize(num_dims); B_broadcast_dims.resize(num_dims);
...@@ -194,6 +195,16 @@ void ComputeBinaryBroadcastDims( ...@@ -194,6 +195,16 @@ void ComputeBinaryBroadcastDims(
B_dims.begin(), B_dims.begin(),
B_dims.end(), B_dims.end(),
B_broadcast_dims.begin() + num_dims - B_dims.size()); B_broadcast_dims.begin() + num_dims - B_dims.size());
if (C_broadcast_dims != nullptr) {
for (int i = 0; i < num_dims; ++i) {
if (A_broadcast_dims[i] == 0 || B_broadcast_dims[i] == 0) {
C_broadcast_dims[i] = 0;
} else {
C_broadcast_dims[i] =
std::max(A_broadcast_dims[i], B_broadcast_dims[i]);
}
}
}
} }
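The new optional `C_broadcast_dims` output records the result shape next to the right-aligned input shapes. A rough Python equivalent of the rule (illustrative, not the framework API):

```python
def compute_binary_broadcast_dims(a_dims, b_dims):
    """Right-align both shapes, pad with 1, and take the per-axis max;
    a zero dimension propagates as zero, mirroring the branch above."""
    num_dims = max(len(a_dims), len(b_dims))
    a_bc = [1] * (num_dims - len(a_dims)) + list(a_dims)
    b_bc = [1] * (num_dims - len(b_dims)) + list(b_dims)
    c_bc = [0 if a == 0 or b == 0 else max(a, b) for a, b in zip(a_bc, b_bc)]
    return a_bc, b_bc, c_bc

print(compute_binary_broadcast_dims((2, 1, 3), (4, 3)))
# ([2, 1, 3], [1, 4, 3], [2, 4, 3])
```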
void ComputeBinaryBroadcastStrides( void ComputeBinaryBroadcastStrides(
......
...@@ -304,7 +304,8 @@ DRAGON_API void ComputeBinaryBroadcastDims( ...@@ -304,7 +304,8 @@ DRAGON_API void ComputeBinaryBroadcastDims(
const vec64_t& A_dims, const vec64_t& A_dims,
const vec64_t& B_dims, const vec64_t& B_dims,
vec64_t& A_broadcast_dims, vec64_t& A_broadcast_dims,
vec64_t& B_broadcast_dims); vec64_t& B_broadcast_dims,
int64_t* C_broadcast_dims = nullptr);
DRAGON_API void ComputeBinaryBroadcastStrides( DRAGON_API void ComputeBinaryBroadcastStrides(
const vec64_t& A_dims, const vec64_t& A_dims,
...@@ -326,22 +327,22 @@ DRAGON_API void TransposeAxesForReduce( ...@@ -326,22 +327,22 @@ DRAGON_API void TransposeAxesForReduce(
const int* reduce_axes, const int* reduce_axes,
int* transpose_axes); int* transpose_axes);
template <typename dim_t, typename stride_t> template <typename DimT, typename StrideT>
inline void inline void
ComputeStrides(const int num_dims, const dim_t* dims, stride_t* strides) { ComputeStrides(const int num_dims, const DimT* dims, StrideT* strides) {
int64_t cur_stride = 1; int64_t cur_stride = 1;
for (int i = num_dims - 1; i >= 0; --i) { for (int i = num_dims - 1; i >= 0; --i) {
strides[i] = stride_t(cur_stride); strides[i] = StrideT(cur_stride);
cur_stride *= int64_t(dims[i]); cur_stride *= int64_t(dims[i]);
} }
} }
template <typename dim_t, typename axis_t, typename stride_t> template <typename DimT, typename AxisT, typename StrideT>
inline void ComputeTransposeStrides( inline void ComputeTransposeStrides(
const int num_dims, const int num_dims,
const dim_t* dims, const DimT* dims,
const axis_t* axes, const AxisT* axes,
stride_t* strides) { StrideT* strides) {
vec64_t buf(num_dims); vec64_t buf(num_dims);
int64_t cur_stride = 1; int64_t cur_stride = 1;
for (int i = num_dims - 1; i >= 0; --i) { for (int i = num_dims - 1; i >= 0; --i) {
...@@ -349,13 +350,25 @@ inline void ComputeTransposeStrides( ...@@ -349,13 +350,25 @@ inline void ComputeTransposeStrides(
cur_stride *= int64_t(dims[i]); cur_stride *= int64_t(dims[i]);
} }
for (int i = 0; i < num_dims; ++i) { for (int i = 0; i < num_dims; ++i) {
strides[i] = stride_t(buf[axes[i]]); strides[i] = StrideT(buf[axes[i]]);
} }
} }
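For reference, the renamed helpers compute ordinary row-major strides; a short Python sketch of both (parameter values are illustrative):

```python
def compute_strides(dims):
    """Row-major strides: strides[i] = product of dims[i + 1:]."""
    strides, cur = [0] * len(dims), 1
    for i in range(len(dims) - 1, -1, -1):
        strides[i] = cur
        cur *= dims[i]
    return strides

def compute_transpose_strides(dims, axes):
    """Source strides gathered in transposed order."""
    buf = compute_strides(dims)
    return [buf[axis] for axis in axes]

print(compute_strides([2, 3, 4]))                        # [12, 4, 1]
print(compute_transpose_strides([2, 3, 4], [2, 0, 1]))   # [1, 12, 4]
```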
template <typename dim_t, typename index_t> template <typename DimT, typename IndexT>
inline IndexT
GetIndexFromDims(const int num_dims, const DimT* dims, IndexT* index) {
IndexT ret = 0;
for (int i = 0; i < num_dims; ++i) {
if (dims[i] > 1) {
ret = ret * dims[i] + index[i];
}
}
return ret;
}
template <typename DimT, typename IndexT>
inline void inline void
IncreaseIndexInDims(const int num_dims, const dim_t* dims, index_t* index) { IncreaseIndexInDims(const int num_dims, const DimT* dims, IndexT* index) {
for (int i = num_dims - 1; i >= 0; --i) { for (int i = num_dims - 1; i >= 0; --i) {
++index[i]; ++index[i];
if (index[i] >= dims[i]) { if (index[i] >= dims[i]) {
......
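The new `GetIndexFromDims` flattens a multi-dimensional index while skipping broadcast (size-1) axes, and `IncreaseIndexInDims` is the usual odometer increment; the hunk is truncated, so the carry branch below follows the standard pattern rather than the exact source:

```python
def get_index_from_dims(dims, index):
    """Flatten ``index``, skipping broadcast (size-1) axes."""
    ret = 0
    for d, i in zip(dims, index):
        if d > 1:
            ret = ret * d + i
    return ret

def increase_index_in_dims(dims, index):
    """Advance ``index`` to the next row-major position (in place)."""
    for i in range(len(dims) - 1, -1, -1):
        index[i] += 1
        if index[i] < dims[i]:
            break
        index[i] = 0  # carry into the next axis

dims, index = [2, 1, 3], [0, 0, 0]
for _ in range(6):
    print(index, '->', get_index_from_dims(dims, index))
    increase_index_in_dims(dims, index)
```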
...@@ -116,11 +116,9 @@ class Dense(Layer): ...@@ -116,11 +116,9 @@ class Dense(Layer):
self.built = True self.built = True
def call(self, inputs): def call(self, inputs):
outputs = math_ops.fully_connected( outputs = math_ops.gemm(
[inputs, self.kernel] + [self.bias] [inputs, self.kernel] +
if self.use_bias else [], ([self.bias] if self.use_bias else []),
axis=-1,
transW=False,
) )
if self.activation is not None: if self.activation is not None:
return self.activation(outputs) return self.activation(outputs)
......
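With the generalized operator, the Dense forward reduces to a single GEMM over `[inputs, kernel, bias]`. A NumPy sketch of what that call computes under the default attributes (assuming `transpose_b=False` and a trailing feature axis):

```python
import numpy as np

x = np.random.rand(8, 32).astype('float32')        # (batch, in_features)
kernel = np.random.rand(32, 16).astype('float32')  # (in_features, units)
bias = np.zeros(16, dtype='float32')

# Equivalent of math_ops.gemm([x, kernel, bias]) for this layer:
y = x @ kernel + bias
print(y.shape)  # (8, 16)
```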
...@@ -703,38 +703,38 @@ def log(x, name=None): ...@@ -703,38 +703,38 @@ def log(x, name=None):
return math_ops.log(x, name=name) return math_ops.log(x, name=name)
def matmul( def matmul(a, b, name=None):
a,
b,
transpose_a=False,
transpose_b=False,
name=None,
):
r"""Compute the matrix multiplication. r"""Compute the matrix multiplication.
.. math:: y = a \times b .. math:: \text{out} = a \times b
The rank of ``a`` and ``b`` should be equal and >= 2: The behavior depends on the shape of input tensors:
```python * If both tensors are 1d, computes the vector product.
# Ok, a typical matrix multiplication * If tensors are 1d and >=2d, computes the vector-matrix multiplication.
a = tf.ones((2, 3), 'float32') * If tensors are >=2d and 1d, computes the matrix-vector multiplication.
b = tf.ones((3, 3), 'float32') * If both tensors are >= 2d, computes the matrix-matrix multiplication.
print(tf.linalg.matmul(a, b)) * If one tensor is >= 3d, applies batching and broadcasting to the computation.
# Compute a batch matrix multiplication if rank > 2 Examples:
aa = tf.ones((4, 2, 3), 'float32')
bb = tf.ones((4, 3, 3), 'float32')
print(tf.linalg.matmul(aa, bb))
```
If inputs are transposed, remember to transpose them back:
```python ```python
# Vector x Vector
a = tf.ones((2,), 'float32')
b = tf.ones((2,), 'float32')
print(tf.linalg.matmul(a, b))
# Vector x Matrix
a = tf.ones((2,), 'float32')
b = tf.ones((2, 3), 'float32')
print(tf.linalg.matmul(a, b))
# Matrix x Vector
a = tf.ones((3, 2), 'float32') a = tf.ones((3, 2), 'float32')
b = tf.ones((3, 3), 'float32') b = tf.ones((2,), 'float32')
print(tf.linalg.matmul(a, b)) # ``a`` takes the wrong dimensions print(tf.linalg.matmul(a, b))
print(tf.linalg.matmul(a, b, transpose_a=True)) # Ok # Matrix x Matrix
a = tf.ones((2, 3), 'float32')
b = tf.ones((3, 2), 'float32')
print(tf.linalg.matmul(a, b))
``` ```
Parameters Parameters
...@@ -743,10 +743,6 @@ def matmul( ...@@ -743,10 +743,6 @@ def matmul(
The matrix :math:`a`. The matrix :math:`a`.
b : dragon.Tensor b : dragon.Tensor
The matrix :math:`b`. The matrix :math:`b`.
transpose_a : bool, optional, default=False
**True** to transpose :math:`a` before computing.
transpose_b : bool, optional, default=False
**True** to transpose :math:`b` before computing.
name : str, optional name : str, optional
The operation name. The operation name.
...@@ -756,12 +752,7 @@ def matmul( ...@@ -756,12 +752,7 @@ def matmul(
The output tensor. The output tensor.
""" """
return math_ops.matmul( return math_ops.matmul([a, b], name=name)
[a, b],
transpose_a=transpose_a,
transpose_b=transpose_b,
name=name,
)
def multiply(x, y, name=None): def multiply(x, y, name=None):
......
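The rewritten docstring drops the transpose flags and adopts NumPy/ONNX-style rank and broadcasting rules, so NumPy itself is a convenient reference for each case:

```python
import numpy as np

print(np.matmul(np.ones(2), np.ones(2)))                  # vector x vector -> 2.0
print(np.matmul(np.ones(2), np.ones((2, 3))).shape)       # vector x matrix -> (3,)
print(np.matmul(np.ones((3, 2)), np.ones(2)).shape)       # matrix x vector -> (3,)
print(np.matmul(np.ones((2, 3)), np.ones((3, 4))).shape)  # matrix x matrix -> (2, 4)
print(np.matmul(np.ones((2, 1, 2, 3)),
                np.ones((1, 5, 3, 4))).shape)             # batched + broadcast -> (2, 5, 2, 4)
```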
...@@ -85,25 +85,25 @@ class Dense(layer.Layer): ...@@ -85,25 +85,25 @@ class Dense(layer.Layer):
raise AssertionError('The input dimension must be rank 2.' raise AssertionError('The input dimension must be rank 2.'
'Please reshape or flatten it.') 'Please reshape or flatten it.')
if self.in_channels: if self.in_channels:
shape = [self.n_units, self.in_channels] shape = [self.in_channels, self.n_units]
else: else:
self.in_channels = inputs_shape[1] self.in_channels = inputs_shape[1]
shape = [self.n_units, inputs_shape[1]] shape = [inputs_shape[1], self.n_units]
self.W = self.add_weight( self.W = self.add_weight(
name="weights", name='weights',
shape=shape, shape=shape,
init=self.W_init, init=self.W_init,
) )
if self.b_init: if self.b_init:
self.b = self.add_weight( self.b = self.add_weight(
name="biases", name='biases',
shape=[self.n_units], shape=[self.n_units],
init=self.b_init, init=self.b_init,
) )
def forward(self, inputs): def forward(self, inputs):
outputs = math_ops.fully_connected( outputs = math_ops.gemm(
[inputs, self.W] + ([self.b] if self.b_init else []), axis=1) [inputs, self.W] + ([self.b] if self.b_init else []))
if self.act: if self.act:
outputs = self.act(outputs) outputs = self.act(outputs)
return outputs return outputs
...@@ -281,17 +281,15 @@ class TestOpSpec(unittest.TestCase): ...@@ -281,17 +281,15 @@ class TestOpSpec(unittest.TestCase):
self.assertEqual(dragon.flatten( self.assertEqual(dragon.flatten(
self.sym4, axis=1, num_axes=-1).shape, (1, None)) self.sym4, axis=1, num_axes=-1).shape, (1, None))
def test_fully_connected(self): def test_gemm(self):
w = dragon.Tensor((3, 2)) w = dragon.Tensor((3, 2))
with dragon.graph_mode(): with dragon.graph_mode():
self.assertEqual(dragon.nn.fully_connected( self.assertEqual(dragon.math.gemm(
[self.sym1, w]).shape, (None, 3)) [self.sym1, w]).shape, None)
self.assertEqual(dragon.nn.fully_connected( self.assertEqual(dragon.math.gemm(
[self.sym1, w], transpose_w=False).shape, (None, 2)) [self.sym1, w], axis=1).shape, (None, 2))
self.assertEqual(dragon.nn.fully_connected( self.assertEqual(dragon.math.gemm(
[self.sym1, w], axis=-1).shape, None) [self.sym1, self.sym1]).shape, None)
self.assertEqual(dragon.nn.fully_connected(
[self.sym1, self.sym1]).shape, (None, None))
def test_index_select(self): def test_index_select(self):
with dragon.graph_mode(): with dragon.graph_mode():
...@@ -325,7 +323,9 @@ class TestOpSpec(unittest.TestCase): ...@@ -325,7 +323,9 @@ class TestOpSpec(unittest.TestCase):
self.assertEqual(dragon.math.matmul( self.assertEqual(dragon.math.matmul(
[self.sym1, self.sym3]).shape, None) [self.sym1, self.sym3]).shape, None)
self.assertEqual(dragon.math.matmul( self.assertEqual(dragon.math.matmul(
[self.sym2, self.sym3]).shape, None) [self.sym2, self.sym3]).shape, (None,))
self.assertEqual(dragon.math.matmul(
[self.sym3, self.sym2]).shape, (1,))
self.assertEqual(dragon.math.matmul( self.assertEqual(dragon.math.matmul(
[self.sym3, self.sym3]).shape, (1, None)) [self.sym3, self.sym3]).shape, (1, None))
self.assertEqual(dragon.math.matmul( self.assertEqual(dragon.math.matmul(
......
...@@ -1868,22 +1868,22 @@ class TestMathOps(OpTestCase): ...@@ -1868,22 +1868,22 @@ class TestMathOps(OpTestCase):
with dragon.device('cuda'): with dragon.device('cuda'):
self.test_floor() self.test_floor()
def test_fully_connected(self): def test_gemm(self):
entries = [((2, 3), (3, 4), (4,), False), entries = [((2, 3), (3, 4), (4,), False),
((2, 3), (4, 3), (4,), True)] ((2, 3), (4, 3), (4,), True)]
for execution in ('EAGER_MODE', 'GRAPH_MODE'): for execution in ('EAGER_MODE', 'GRAPH_MODE'):
with execution_context().mode(execution): with execution_context().mode(execution):
for x_shape, w_shape, b_shape, trans_w in entries: for x_shape, w_shape, b_shape, trans_b in entries:
data1, data2, data3 = arange(x_shape), arange(w_shape), arange(b_shape) data1, data2, data3 = arange(x_shape), arange(w_shape), arange(b_shape)
x, w, b = new_tensor(data1), new_tensor(data2), new_tensor(data3) x, w, b = new_tensor(data1), new_tensor(data2), new_tensor(data3)
with dragon.GradientTape() as tape: with dragon.GradientTape() as tape:
tape.watch([x, w, b]) tape.watch([x, w, b])
y = dragon.nn.fully_connected([x, w, b], transpose_w=trans_w) y = dragon.math.gemm([x, w, b], transpose_b=trans_b)
data4 = arange(y.shape) data4 = arange(y.shape)
dy = new_tensor(data4) dy = new_tensor(data4)
dx, dw, db = tape.gradient(y, [x, w, b], output_gradients=[dy]) dx, dw, db = tape.gradient(y, [x, w, b], output_gradients=[dy])
result = np.matmul(data1, data2.T if trans_w else data2) + data3 result = np.matmul(data1, data2.T if trans_b else data2) + data3
if trans_w: if trans_b:
grad1 = np.matmul(data4, data2) grad1 = np.matmul(data4, data2)
grad2 = np.matmul(data4.T, data1) grad2 = np.matmul(data4.T, data1)
else: else:
...@@ -1894,9 +1894,9 @@ class TestMathOps(OpTestCase): ...@@ -1894,9 +1894,9 @@ class TestMathOps(OpTestCase):
[result, grad1, grad2, reduce_like(data4, data3)]) [result, grad1, grad2, reduce_like(data4, data3)])
@unittest.skipIf(not TEST_CUDA, 'CUDA unavailable') @unittest.skipIf(not TEST_CUDA, 'CUDA unavailable')
def test_fully_connected_cuda(self): def test_gemm_cuda(self):
with dragon.device('cuda'): with dragon.device('cuda'):
self.test_fully_connected() self.test_gemm()
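The renamed test checks the gradients of `Y = X @ W + b` (or `X @ W.T + b` when `transpose_b` is set). For reference, the closed-form gradients the assertions encode, sketched with NumPy:

```python
import numpy as np

X = np.random.rand(2, 3).astype('float32')
W = np.random.rand(3, 4).astype('float32')
b = np.random.rand(4).astype('float32')
dY = np.random.rand(2, 4).astype('float32')

Y = X @ W + b
dX = dY @ W.T        # with transpose_b=True this becomes dY @ W
dW = X.T @ dY        # with transpose_b=True this becomes dY.T @ X
db = dY.sum(axis=0)  # the bias gradient reduces over the batch axis
print(dX.shape, dW.shape, db.shape)  # (2, 3) (3, 4) (4,)
```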
def test_greater(self): def test_greater(self):
for execution in ('EAGER_MODE', 'GRAPH_MODE'): for execution in ('EAGER_MODE', 'GRAPH_MODE'):
...@@ -1997,40 +1997,62 @@ class TestMathOps(OpTestCase): ...@@ -1997,40 +1997,62 @@ class TestMathOps(OpTestCase):
self.test_log() self.test_log()
def test_matmul(self): def test_matmul(self):
entries = [ entries = [((2, 3), (3, 4)),
((2, 3), (3, 4), False, False), ((1, 2, 3), (2, 3, 4)),
((2, 3), (4, 3), False, True), ((2, 2, 3), (1, 3, 4)),
((3, 2), (3, 4), True, False), ((2, 2, 3), (2, 3, 4)),
((3, 2), (4, 3), True, True)] ((2, 1, 2, 3), (2, 3, 4)),
((1, 2, 3), (2, 2, 3, 4)),
((2, 1, 2, 3), (1, 2, 3, 4))]
for execution in ('EAGER_MODE', 'GRAPH_MODE',):
with execution_context().mode(execution):
for a_shape, b_shape in entries:
data1, data2 = arange(a_shape), arange(b_shape)
a, b = new_tensor(data1), new_tensor(data2)
with dragon.GradientTape() as tape:
tape.watch([a, b])
y = dragon.math.matmul([a, b])
data3 = arange(y.shape)
dy = new_tensor(data3)
da, db = tape.gradient(y, [a, b], output_gradients=[dy])
grad1 = np.matmul(data3, transpose_last(data2, 2))
grad2 = np.matmul(transpose_last(data1, 2), data3)
self.assertEqual(
[y, da, db],
[np.matmul(data1, data2),
reduce_like(grad1, data1),
reduce_like(grad2, data2)])
entries = [((2,), (2,), (2, 1), (2, 1), (1, 1)),
((2,), (2, 3), (2, 1), (2, 3), (1, 3)),
((2, 3), (3,), (2, 3), (1, 3), (2, 1)),
((2,), (4, 2, 3), (1, 2, 1), (4, 2, 3), (4, 1, 3)),
((4, 2, 3), (3,), (4, 2, 3), (1, 1, 3), (4, 2, 1))]
for execution in ('EAGER_MODE', 'GRAPH_MODE'): for execution in ('EAGER_MODE', 'GRAPH_MODE'):
with execution_context().mode(execution): with execution_context().mode(execution):
for a_shape, b_shape, trans_a, trans_b in entries: for a_shape, b_shape, da_shape, db_shape, dy_shape in entries:
data1, data2 = arange(a_shape), arange(b_shape) data1, data2 = arange(a_shape), arange(b_shape)
data4 = data1 if len(a_shape) > len(b_shape) else data2
a, b = new_tensor(data1), new_tensor(data2) a, b = new_tensor(data1), new_tensor(data2)
with dragon.GradientTape() as tape: with dragon.GradientTape() as tape:
tape.watch([a, b]) tape.watch([a, b])
y = dragon.math.matmul([a, b], trans_a, trans_b) y = dragon.math.matmul([a, b])
data3 = arange(y.shape) data3 = arange(y.shape)
dy = new_tensor(data3) dy = new_tensor(data3)
da, db = tape.gradient(y, [a, b], output_gradients=[dy]) da, db = tape.gradient(y, [a, b], output_gradients=[dy])
if trans_a: grad1 = data3.reshape(dy_shape) * data2.reshape(db_shape)
if trans_b: grad2 = data1.reshape(da_shape) * data3.reshape(dy_shape)
grad1 = np.matmul(data2.T, data3.T) grad1_axes, grad2_axes = [], []
grad2 = np.matmul(data3.T, data1.T) for i in range(len(dy_shape)):
else: if da_shape[i] != db_shape[i]:
grad1 = np.matmul(data2, data3.T) if da_shape[i] == 1:
grad2 = np.matmul(data1, data3) grad1_axes.append(i)
else: if db_shape[i] == 1:
if trans_b: grad2_axes.append(i)
grad1 = np.matmul(data3, data2)
grad2 = np.matmul(data3.T, data1)
else:
grad1 = np.matmul(data3, data2.T)
grad2 = np.matmul(data1.T, data3)
self.assertEqual( self.assertEqual(
[y, da, db], [y, da, db],
[np.matmul(data1.T if trans_a else data1, [np.matmul(data1, data2),
data2.T if trans_b else data2), grad1, grad2]) reduce(grad1, tuple(grad1_axes)).reshape(data1.shape),
reduce(grad2, tuple(grad2_axes)).reshape(data2.shape)])
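For the vector and broadcast entries, the test reduces the full-rank gradients back to the input shapes. The rule being exercised, in NumPy terms (a sketch, not the operator implementation):

```python
import numpy as np

a = np.random.rand(2, 1, 2, 3).astype('float32')
b = np.random.rand(1, 5, 3, 4).astype('float32')
dy = np.random.rand(2, 5, 2, 4).astype('float32')

# Multiply by the swapped operand, then sum over the axes that
# were broadcast in the forward pass to recover the input shapes.
da_full = dy @ np.swapaxes(b, -1, -2)    # (2, 5, 2, 3)
db_full = np.swapaxes(a, -1, -2) @ dy    # (2, 5, 3, 4)
da = da_full.sum(axis=1, keepdims=True)  # a had size 1 on axis 1
db = db_full.sum(axis=0, keepdims=True)  # b had size 1 on axis 0
assert da.shape == a.shape and db.shape == b.shape
```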
@unittest.skipIf(not TEST_CUDA, 'CUDA unavailable') @unittest.skipIf(not TEST_CUDA, 'CUDA unavailable')
def test_matmul_cuda(self): def test_matmul_cuda(self):
...@@ -4145,6 +4167,16 @@ def reduce_like(data, other, reduction='sum'): ...@@ -4145,6 +4167,16 @@ def reduce_like(data, other, reduction='sum'):
return data return data
def transpose_last(data, num_axes=None, axes=None):
"""Transpose the last axes of data."""
if axes is None and num_axes is not None:
axes = list(range(num_axes))[::-1]
perm = list(range(len(data.shape)))
start_axis = len(perm) - len(axes)
perm[start_axis:] = [v + start_axis for v in axes]
return np.transpose(data, perm)
def uniform(shape, dtype='float32'): def uniform(shape, dtype='float32'):
"""Return the uniform data with given shape.""" """Return the uniform data with given shape."""
return np.random.uniform(-1., 1., size=shape).astype(dtype) return np.random.uniform(-1., 1., size=shape).astype(dtype)
......
...@@ -619,39 +619,54 @@ class TestModules(OpTestCase): ...@@ -619,39 +619,54 @@ class TestModules(OpTestCase):
self.assertEqual(m4(x), np.pad(data, pads, 'constant')) self.assertEqual(m4(x), np.pad(data, pads, 'constant'))
def test_pool1d(self): def test_pool1d(self):
entries = [((2, 2, 2,), (2,), 2, 1, 'MAX'), entries = [((2, 2, 2,), (2,), 2, 1, 'MaxPool1d'),
((2, 2, 2,), (2,), 2, 1, 'AVG')] ((2, 2, 2,), (2,), 2, 1, 'AvgPool1d'),
((2, 2, 2,), (1,), 1, 0, 'AdaptiveMaxPool1d'),
((2, 2, 2,), (1,), 1, 0, 'AdaptiveAvgPool1d')]
for x_shape, kernel_shape, strides, pads, mode in entries: for x_shape, kernel_shape, strides, pads, mode in entries:
data = arange(x_shape) * .1 data = arange(x_shape) * .1
module_cls = torch.nn.AvgPool1d if mode == 'AVG' else torch.nn.MaxPool1d module_cls = getattr(torch.nn, mode)
x = new_tensor(data) x = new_tensor(data)
m = module_cls(kernel_shape, strides, pads) if 'Adaptive' in mode:
m = module_cls(x_shape[-1])
else:
m = module_cls(kernel_shape, strides, pads)
y, _ = m(x), repr(m) y, _ = m(x), repr(m)
result = data / (np.prod(kernel_shape) if mode == 'AVG' else 1.) result = data / (np.prod(kernel_shape) if 'Avg' in mode else 1.)
self.assertEqual(y, result) self.assertEqual(y, result)
def test_pool2d(self): def test_pool2d(self):
entries = [((2, 2, 2, 2), (2, 2), 2, 1, 'MAX'), entries = [((2, 2, 2, 2), (2, 2), 2, 1, 'MaxPool2d'),
((2, 2, 2, 2), (2, 2), 2, 1, 'AVG')] ((2, 2, 2, 2), (2, 2), 2, 1, 'AvgPool2d'),
((2, 2, 2, 2), (1, 1), 1, 0, 'AdaptiveMaxPool2d'),
((2, 2, 2, 2), (1, 1), 1, 0, 'AdaptiveAvgPool2d')]
for x_shape, kernel_shape, strides, pads, mode in entries: for x_shape, kernel_shape, strides, pads, mode in entries:
data = arange(x_shape) * .1 data = arange(x_shape) * .1
module_cls = torch.nn.AvgPool2d if mode == 'AVG' else torch.nn.MaxPool2d module_cls = getattr(torch.nn, mode)
x = new_tensor(data) x = new_tensor(data)
m = module_cls(kernel_shape, strides, pads) if 'Adaptive' in mode:
m = module_cls(x_shape[-1])
else:
m = module_cls(kernel_shape, strides, pads)
y, _ = m(x), repr(m) y, _ = m(x), repr(m)
result = data / (np.prod(kernel_shape) if mode == 'AVG' else 1.) result = data / (np.prod(kernel_shape) if 'Avg' in mode else 1.)
self.assertEqual(y, result) self.assertEqual(y, result)
def test_pool3d(self): def test_pool3d(self):
entries = [((2, 2, 2, 2, 2), (2, 2, 2), 2, 1, 'MAX'), entries = [((2, 2, 2, 2, 2), (2, 2, 2), 2, 1, 'MaxPool3d'),
((2, 2, 2, 2, 2), (2, 2, 2), 2, 1, 'AVG')] ((2, 2, 2, 2, 2), (2, 2, 2), 2, 1, 'AvgPool3d'),
((2, 2, 2, 2, 2), (1, 1, 1), 1, 0, 'AdaptiveMaxPool3d'),
((2, 2, 2, 2, 2), (1, 1, 1), 1, 0, 'AdaptiveAvgPool3d')]
for x_shape, kernel_shape, strides, pads, mode in entries: for x_shape, kernel_shape, strides, pads, mode in entries:
data = arange(x_shape) * .1 data = arange(x_shape) * .1
module_cls = torch.nn.AvgPool3d if mode == 'AVG' else torch.nn.MaxPool3d module_cls = getattr(torch.nn, mode)
x = new_tensor(data) x = new_tensor(data)
m = module_cls(kernel_shape, strides, pads) if 'Adaptive' in mode:
m = module_cls(x_shape[-1])
else:
m = module_cls(kernel_shape, strides, pads)
y, _ = m(x), repr(m) y, _ = m(x), repr(m)
result = data / (np.prod(kernel_shape) if mode == 'AVG' else 1.) result = data / (np.prod(kernel_shape) if 'Avg' in mode else 1.)
self.assertEqual(y, result) self.assertEqual(y, result)
def test_prelu(self): def test_prelu(self):
......
...@@ -95,6 +95,16 @@ class TestTensorOps(OpTestCase): ...@@ -95,6 +95,16 @@ class TestTensorOps(OpTestCase):
a += b a += b
self.assertEqual(a, data1 + data2) self.assertEqual(a, data1 + data2)
def test_addmm(self):
entries = [((2, 3), (3, 4), (2, 4))]
for a_shape, b_shape, c_shape in entries:
data1, data2 = arange(a_shape), arange(b_shape)
data3 = arange(c_shape)
a, b = new_tensor(data1), new_tensor(data2)
c = new_tensor(data3)
y = c.addmm(a, b)
self.assertEqual(y, np.matmul(data1, data2) + data3)
def test_argmax(self): def test_argmax(self):
entries = [(0, True), (0, False), (1, True), (1, False), (None, False)] entries = [(0, True), (0, False), (1, True), (1, False), (None, False)]
for axis, keepdims in entries: for axis, keepdims in entries:
...@@ -115,6 +125,18 @@ class TestTensorOps(OpTestCase): ...@@ -115,6 +125,18 @@ class TestTensorOps(OpTestCase):
result = np.expand_dims(result, axis) result = np.expand_dims(result, axis)
self.assertEqual(x.argmin(axis, keepdims), result) self.assertEqual(x.argmin(axis, keepdims), result)
def test_baddbmm(self):
entries = [((2, 2, 3), (2, 3, 4), (2, 2, 4))]
for a_shape, b_shape, c_shape in entries:
data1, data2 = arange(a_shape), arange(b_shape)
data3 = arange(c_shape)
a, b = new_tensor(data1), new_tensor(data2)
c = new_tensor(data3)
y = c.baddbmm(a, b)
self.assertEqual(y, np.matmul(data1, data2) + data3)
c.baddbmm_(a, b)
self.assertEqual(c, np.matmul(data1, data2) + data3)
def test_bitwise_not(self): def test_bitwise_not(self):
for shape in self.unary_test_shapes: for shape in self.unary_test_shapes:
data = np.random.binomial(1, 0.5, shape).astype('bool') data = np.random.binomial(1, 0.5, shape).astype('bool')
...@@ -132,6 +154,18 @@ class TestTensorOps(OpTestCase): ...@@ -132,6 +154,18 @@ class TestTensorOps(OpTestCase):
a.bitwise_xor_(b) a.bitwise_xor_(b)
self.assertEqual(a, np.bitwise_xor(data1, data2)) self.assertEqual(a, np.bitwise_xor(data1, data2))
def test_bmm(self):
test_shapes = [((1, 2, 3), (2, 3, 4)),
((2, 2, 3), (1, 3, 4)),
((2, 2, 3), (2, 3, 4)),
((2, 1, 2, 3), (2, 3, 4)),
((1, 2, 3), (2, 2, 3, 4)),
((2, 1, 2, 3), (1, 2, 3, 4))]
for a_shape, b_shape in test_shapes:
data1, data2 = arange(a_shape), arange(b_shape, 1)
a, b = new_tensor(data1, False), new_tensor(data2, False)
self.assertEqual(a.bmm(b), np.matmul(data1, data2))
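The new tensor methods mirror their PyTorch namesakes; their reference semantics in NumPy, with `alpha` and `beta` left at the default of 1:

```python
import numpy as np

a = np.random.rand(2, 3).astype('float32')
b = np.random.rand(3, 4).astype('float32')
c = np.random.rand(2, 4).astype('float32')
print(np.allclose(a @ b + c, 1. * (a @ b) + 1. * c))  # addmm: alpha*(a@b) + beta*c

A = np.random.rand(2, 2, 3).astype('float32')
B = np.random.rand(2, 3, 4).astype('float32')
C = np.random.rand(2, 2, 4).astype('float32')
print((A @ B).shape)      # bmm: per-batch matrix product -> (2, 2, 4)
print((A @ B + C).shape)  # baddbmm with default alpha/beta -> (2, 2, 4)
```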
def test_ceil(self): def test_ceil(self):
data = np.array([1.4, 1.7, 2.0]) data = np.array([1.4, 1.7, 2.0])
x = new_tensor(data) x = new_tensor(data)
...@@ -334,6 +368,24 @@ class TestTensorOps(OpTestCase): ...@@ -334,6 +368,24 @@ class TestTensorOps(OpTestCase):
data[data > 2] = 0 data[data > 2] = 0
self.assertEqual(x, data) self.assertEqual(x, data)
def test_matmul(self):
test_shapes = [((2,), (2,)),
((2,), (2, 3)),
((2, 3), (3,)),
((2, 3), (3, 4)),
((2,), (4, 2, 3)),
((4, 2, 3), (3,)),
((1, 2, 3), (2, 3, 4)),
((2, 2, 3), (1, 3, 4)),
((2, 2, 3), (2, 3, 4)),
((2, 1, 2, 3), (2, 3, 4)),
((1, 2, 3), (2, 2, 3, 4)),
((2, 1, 2, 3), (1, 2, 3, 4))]
for a_shape, b_shape in test_shapes:
data1, data2 = arange(a_shape), arange(b_shape, 1)
a, b = new_tensor(data1, False), new_tensor(data2, False)
self.assertEqual(a.matmul(b), np.matmul(data1, data2))
def test_max(self): def test_max(self):
entries = [(0, True), (0, False), entries = [(0, True), (0, False),
(1, True), (1, False), (1, True), (1, False),
...@@ -382,20 +434,12 @@ class TestTensorOps(OpTestCase): ...@@ -382,20 +434,12 @@ class TestTensorOps(OpTestCase):
self.assertEqual(y, np.minimum(data1, data2)) self.assertEqual(y, np.minimum(data1, data2))
def test_mm(self): def test_mm(self):
entries = [ entries = [((2, 3), (3, 4))]
((2, 3), (3, 4), False, False), for a_shape, b_shape in entries:
((2, 3), (4, 3), False, True),
((3, 2), (3, 4), True, False),
((3, 2), (4, 3), True, True)]
for a_shape, b_shape, trans_a, trans_b in entries:
data1, data2 = arange(a_shape), arange(b_shape) data1, data2 = arange(a_shape), arange(b_shape)
a, b = new_tensor(data1), new_tensor(data2) a, b = new_tensor(data1), new_tensor(data2)
if trans_a or trans_b: y = a.mm(b)
y = torch.mm(a, b, trans_a, trans_b) self.assertEqual(y, np.matmul(data1, data2))
else:
y = a.mm(b)
self.assertEqual(y, np.matmul(data1.T if trans_a else data1,
data2.T if trans_b else data2))
def test_mul(self): def test_mul(self):
for a_shape, b_shape in self.binary_test_shapes: for a_shape, b_shape in self.binary_test_shapes:
......
...@@ -94,9 +94,12 @@ from dragon.vm.torch.core.ops.init.functional import zeros ...@@ -94,9 +94,12 @@ from dragon.vm.torch.core.ops.init.functional import zeros
from dragon.vm.torch.core.ops.init.functional import zeros_like from dragon.vm.torch.core.ops.init.functional import zeros_like
from dragon.vm.torch.core.ops.math.functional import abs from dragon.vm.torch.core.ops.math.functional import abs
from dragon.vm.torch.core.ops.math.functional import add from dragon.vm.torch.core.ops.math.functional import add
from dragon.vm.torch.core.ops.math.functional import addmm
from dragon.vm.torch.core.ops.math.functional import axpby from dragon.vm.torch.core.ops.math.functional import axpby
from dragon.vm.torch.core.ops.math.functional import baddbmm
from dragon.vm.torch.core.ops.math.functional import bitwise_not from dragon.vm.torch.core.ops.math.functional import bitwise_not
from dragon.vm.torch.core.ops.math.functional import bitwise_xor from dragon.vm.torch.core.ops.math.functional import bitwise_xor
from dragon.vm.torch.core.ops.math.functional import bmm
from dragon.vm.torch.core.ops.math.functional import ceil from dragon.vm.torch.core.ops.math.functional import ceil
from dragon.vm.torch.core.ops.math.functional import clamp from dragon.vm.torch.core.ops.math.functional import clamp
from dragon.vm.torch.core.ops.math.functional import cos from dragon.vm.torch.core.ops.math.functional import cos
...@@ -112,6 +115,7 @@ from dragon.vm.torch.core.ops.math.functional import le ...@@ -112,6 +115,7 @@ from dragon.vm.torch.core.ops.math.functional import le
from dragon.vm.torch.core.ops.math.functional import log from dragon.vm.torch.core.ops.math.functional import log
from dragon.vm.torch.core.ops.math.functional import logsumexp from dragon.vm.torch.core.ops.math.functional import logsumexp
from dragon.vm.torch.core.ops.math.functional import lt from dragon.vm.torch.core.ops.math.functional import lt
from dragon.vm.torch.core.ops.math.functional import matmul
from dragon.vm.torch.core.ops.math.functional import maximum from dragon.vm.torch.core.ops.math.functional import maximum
from dragon.vm.torch.core.ops.math.functional import minimum from dragon.vm.torch.core.ops.math.functional import minimum
from dragon.vm.torch.core.ops.math.functional import mm from dragon.vm.torch.core.ops.math.functional import mm
......
...@@ -76,6 +76,12 @@ from dragon.vm.torch.core.nn.modules.padding import ReplicationPad1d ...@@ -76,6 +76,12 @@ from dragon.vm.torch.core.nn.modules.padding import ReplicationPad1d
from dragon.vm.torch.core.nn.modules.padding import ReplicationPad2d from dragon.vm.torch.core.nn.modules.padding import ReplicationPad2d
from dragon.vm.torch.core.nn.modules.padding import ReplicationPad3d from dragon.vm.torch.core.nn.modules.padding import ReplicationPad3d
from dragon.vm.torch.core.nn.modules.padding import ZeroPad2d from dragon.vm.torch.core.nn.modules.padding import ZeroPad2d
from dragon.vm.torch.core.nn.modules.pooling import AdaptiveAvgPool1d
from dragon.vm.torch.core.nn.modules.pooling import AdaptiveAvgPool2d
from dragon.vm.torch.core.nn.modules.pooling import AdaptiveAvgPool3d
from dragon.vm.torch.core.nn.modules.pooling import AdaptiveMaxPool1d
from dragon.vm.torch.core.nn.modules.pooling import AdaptiveMaxPool2d
from dragon.vm.torch.core.nn.modules.pooling import AdaptiveMaxPool3d
from dragon.vm.torch.core.nn.modules.pooling import AvgPool1d from dragon.vm.torch.core.nn.modules.pooling import AvgPool1d
from dragon.vm.torch.core.nn.modules.pooling import AvgPool2d from dragon.vm.torch.core.nn.modules.pooling import AvgPool2d
from dragon.vm.torch.core.nn.modules.pooling import AvgPool3d from dragon.vm.torch.core.nn.modules.pooling import AvgPool3d
......
...@@ -14,6 +14,12 @@ from __future__ import absolute_import as _absolute_import ...@@ -14,6 +14,12 @@ from __future__ import absolute_import as _absolute_import
from __future__ import division as _division from __future__ import division as _division
from __future__ import print_function as _print_function from __future__ import print_function as _print_function
from dragon.vm.torch.core.nn.functional import adaptive_avg_pool1d
from dragon.vm.torch.core.nn.functional import adaptive_avg_pool2d
from dragon.vm.torch.core.nn.functional import adaptive_avg_pool3d
from dragon.vm.torch.core.nn.functional import adaptive_max_pool1d
from dragon.vm.torch.core.nn.functional import adaptive_max_pool2d
from dragon.vm.torch.core.nn.functional import adaptive_max_pool3d
from dragon.vm.torch.core.nn.functional import avg_pool1d from dragon.vm.torch.core.nn.functional import avg_pool1d
from dragon.vm.torch.core.nn.functional import avg_pool2d from dragon.vm.torch.core.nn.functional import avg_pool2d
from dragon.vm.torch.core.nn.functional import avg_pool3d from dragon.vm.torch.core.nn.functional import avg_pool3d
......
...@@ -76,7 +76,7 @@ class Function(object): ...@@ -76,7 +76,7 @@ class Function(object):
Parameters Parameters
---------- ----------
out : dragon.vm.torch.Tensor, optional out : dragon.vm.torch.Tensor, optional
The optional output tensor. The output tensor.
Returns Returns
------- -------
......
...@@ -86,7 +86,6 @@ class Pool(function.Function): ...@@ -86,7 +86,6 @@ class Pool(function.Function):
self.pads = kwargs.get('pads', 0) self.pads = kwargs.get('pads', 0)
self.ceil_mode = kwargs.get('ceil_mode', False) self.ceil_mode = kwargs.get('ceil_mode', False)
self.mode = kwargs.get('mode', 'MAX') self.mode = kwargs.get('mode', 'MAX')
self.global_pool = kwargs.get('global_pool', False)
def attributes(self): def attributes(self):
return { return {
...@@ -98,7 +97,6 @@ class Pool(function.Function): ...@@ -98,7 +97,6 @@ class Pool(function.Function):
'ceil_mode': self.ceil_mode, 'ceil_mode': self.ceil_mode,
'mode': self.mode, 'mode': self.mode,
'data_format': 'NCHW', 'data_format': 'NCHW',
'global_pool': self.global_pool,
} }
} }
...@@ -316,24 +314,6 @@ class L2Loss(Loss): ...@@ -316,24 +314,6 @@ class L2Loss(Loss):
} }
class Linear(function.Function):
"""Linear function."""
def attributes(self):
return {
'op_type': 'FullyConnected',
'arguments': {
'axis': -1,
'transW': True,
},
}
def forward(self, input, weight, bias=None, out=None):
inputs = [input, weight] + ([bias] if bias else [])
outputs = [out] if out else [self.alloc()]
return self.dispatch(inputs, outputs)
class LocalResponseNorm(function.Function): class LocalResponseNorm(function.Function):
"""LocalResponseNorm function.""" """LocalResponseNorm function."""
......
...@@ -48,7 +48,7 @@ class Identity(Module): ...@@ -48,7 +48,7 @@ class Identity(Module):
class Linear(Module): class Linear(Module):
r"""Apply the linear transformation. r"""Apply the linear transformation.
.. math:: y = Wx + b .. math:: \text{out} = \text{input} \times \text{weight}^{T} + \text{bias}
Examples: Examples:
......
...@@ -22,6 +22,18 @@ import itertools ...@@ -22,6 +22,18 @@ import itertools
from dragon.core.util import six from dragon.core.util import six
def _get_adaptive_pool_kwargs(input_sizes, output_sizes):
stride, kernel_size = [], []
for input_size, output_size in zip(input_sizes, output_sizes):
if output_size == 1:
stride.append(1)
kernel_size.append(input_size)
else:
stride.append(input_size // output_size)
kernel_size.append(input_size - (output_size - 1) * stride[-1])
return {'kernel_size': kernel_size, 'stride': stride}
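The helper derives one fixed kernel/stride pair per spatial axis from the input and target sizes. A stand-alone check that the derived windows produce the requested output length (the `pooled_size` formula assumes VALID padding):

```python
def get_adaptive_pool_kwargs(input_sizes, output_sizes):
    """Stand-alone copy of the helper above, for a quick sanity check."""
    stride, kernel_size = [], []
    for input_size, output_size in zip(input_sizes, output_sizes):
        if output_size == 1:
            stride.append(1)
            kernel_size.append(input_size)
        else:
            stride.append(input_size // output_size)
            kernel_size.append(input_size - (output_size - 1) * stride[-1])
    return {'kernel_size': kernel_size, 'stride': stride}

def pooled_size(input_size, kernel, stride):
    """Output length of a VALID pooling window."""
    return (input_size - kernel) // stride + 1

kwargs = get_adaptive_pool_kwargs([7, 7], [3, 3])
print(kwargs)  # {'kernel_size': [3, 3], 'stride': [2, 2]}
print([pooled_size(7, k, s)
       for k, s in zip(kwargs['kernel_size'], kwargs['stride'])])  # [3, 3]
```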
def _ntuple(n): def _ntuple(n):
def parse(x): def parse(x):
if isinstance(x, six.collections_abc.Sequence): if isinstance(x, six.collections_abc.Sequence):
......
...@@ -315,12 +315,16 @@ class OneHot(function.Function): ...@@ -315,12 +315,16 @@ class OneHot(function.Function):
def __init__(self, key, dev, **kwargs): def __init__(self, key, dev, **kwargs):
super(OneHot, self).__init__(key, dev, **kwargs) super(OneHot, self).__init__(key, dev, **kwargs)
self.depth = kwargs.get('depth', 1) self.depth = kwargs.get('depth', 1)
self.on_value = kwargs.get('on_value', 1)
self.off_value = kwargs.get('off_value', 0)
def attributes(self): def attributes(self):
return { return {
'op_type': 'OneHot', 'op_type': 'OneHot',
'arguments': { 'arguments': {
'depth': self.depth, 'depth': self.depth,
'on_value': self.on_value,
'off_value': self.off_value,
}, },
} }
......
...@@ -46,7 +46,7 @@ def argmax(input, dim=None, keepdim=False, out=None): ...@@ -46,7 +46,7 @@ def argmax(input, dim=None, keepdim=False, out=None):
keepdim : bool, optional, default=False keepdim : bool, optional, default=False
Keep the reduced dimension or not. Keep the reduced dimension or not.
out : dragon.vm.torch.Tensor, optional out : dragon.vm.torch.Tensor, optional
The optional output tensor. The output tensor.
Returns Returns
------- -------
...@@ -81,7 +81,7 @@ def argmin(input, dim=None, keepdim=False, out=None): ...@@ -81,7 +81,7 @@ def argmin(input, dim=None, keepdim=False, out=None):
keepdim : bool, optional, default=False keepdim : bool, optional, default=False
Keep the reduced dimension or not. Keep the reduced dimension or not.
out : dragon.vm.torch.Tensor, optional out : dragon.vm.torch.Tensor, optional
The optional output tensor. The output tensor.
Returns Returns
------- -------
...@@ -174,7 +174,7 @@ def cat(seq, dim=0, out=None): ...@@ -174,7 +174,7 @@ def cat(seq, dim=0, out=None):
dim : int, optional dim : int, optional
The dim to concatenate. The dim to concatenate.
out : dragon.vm.torch.Tensor, optional out : dragon.vm.torch.Tensor, optional
The optional output tensor. The output tensor.
Returns Returns
------- -------
...@@ -197,11 +197,11 @@ def channel_affine(input, weight, bias=None, dim=0, out=None): ...@@ -197,11 +197,11 @@ def channel_affine(input, weight, bias=None, dim=0, out=None):
weight : dragon.vm.torch.Tensor weight : dragon.vm.torch.Tensor
The weight tensor. The weight tensor.
bias : dragon.vm.torch.Tensor, optional bias : dragon.vm.torch.Tensor, optional
The optional bias. The bias tensor.
dim : int, optional, default=0 dim : int, optional, default=0
The start dimension to transform. The start dimension to transform.
out : dragon.vm.torch.Tensor, optional out : dragon.vm.torch.Tensor, optional
The optional output tensor. The output tensor.
Returns Returns
------- -------
...@@ -369,7 +369,7 @@ def cumsum(input, dim, out=None): ...@@ -369,7 +369,7 @@ def cumsum(input, dim, out=None):
dim : int dim : int
The cumulative dimension. The cumulative dimension.
out : dragon.vm.torch.Tensor, optional out : dragon.vm.torch.Tensor, optional
The optional output tensor. The output tensor.
Returns Returns
------- -------
...@@ -429,7 +429,7 @@ def flatten(input, start_dim=0, end_dim=-1, out=None): ...@@ -429,7 +429,7 @@ def flatten(input, start_dim=0, end_dim=-1, out=None):
end_dim : int, optional, default=-1 end_dim : int, optional, default=-1
The end dimension to flatten. The end dimension to flatten.
out : dragon.vm.torch.Tensor, optional out : dragon.vm.torch.Tensor, optional
The optional output tensor. The output tensor.
Returns Returns
------- -------
...@@ -465,7 +465,7 @@ def index_select(input, dim, index, out=None): ...@@ -465,7 +465,7 @@ def index_select(input, dim, index, out=None):
index : dragon.vm.torch.Tensor index : dragon.vm.torch.Tensor
The index tensor. The index tensor.
out : dragon.vm.torch.Tensor, optional out : dragon.vm.torch.Tensor, optional
The optional output tensor. The output tensor.
Returns Returns
------- -------
...@@ -523,7 +523,7 @@ def masked_select(input, mask, out=None): ...@@ -523,7 +523,7 @@ def masked_select(input, mask, out=None):
mask : dragon.vm.torch.Tensor mask : dragon.vm.torch.Tensor
The mask for selecting. The mask for selecting.
out : dragon.vm.torch.Tensor, optional out : dragon.vm.torch.Tensor, optional
The optional output tensor. The output tensor.
Returns Returns
------- -------
...@@ -566,7 +566,7 @@ def max(input, dim=None, keepdim=False, out=None): ...@@ -566,7 +566,7 @@ def max(input, dim=None, keepdim=False, out=None):
keepdim : bool, optional, default=False keepdim : bool, optional, default=False
Keep the reduced dimensions or not. Keep the reduced dimensions or not.
out : dragon.vm.torch.Tensor, optional out : dragon.vm.torch.Tensor, optional
The optional output tensor. The output tensor.
Returns Returns
------- -------
...@@ -606,7 +606,7 @@ def mean(input, dim=None, keepdim=False, out=None): ...@@ -606,7 +606,7 @@ def mean(input, dim=None, keepdim=False, out=None):
keepdim : bool, optional, default=False keepdim : bool, optional, default=False
Keep the reduced dimensions or not. Keep the reduced dimensions or not.
out : dragon.vm.torch.Tensor, optional out : dragon.vm.torch.Tensor, optional
The optional output tensor. The output tensor.
Returns Returns
------- -------
...@@ -646,7 +646,7 @@ def min(input, dim=None, keepdim=False, out=None): ...@@ -646,7 +646,7 @@ def min(input, dim=None, keepdim=False, out=None):
keepdim : bool, optional, default=False keepdim : bool, optional, default=False
Keep the reduced dimensions or not. Keep the reduced dimensions or not.
out : dragon.vm.torch.Tensor, optional out : dragon.vm.torch.Tensor, optional
The optional output tensor. The output tensor.
Returns Returns
------- -------
...@@ -721,7 +721,7 @@ def nonzero(input, out=None): ...@@ -721,7 +721,7 @@ def nonzero(input, out=None):
input : dragon.vm.torch.Tensor input : dragon.vm.torch.Tensor
The input tensor. The input tensor.
out : dragon.vm.torch.Tensor, optional out : dragon.vm.torch.Tensor, optional
The optional output tensor. The output tensor.
Returns Returns
------- -------
...@@ -732,7 +732,7 @@ def nonzero(input, out=None): ...@@ -732,7 +732,7 @@ def nonzero(input, out=None):
return _functions.NonZero.instantiate(input.device).apply(input, out) return _functions.NonZero.instantiate(input.device).apply(input, out)
def one_hot(input, depth): def one_hot(input, depth, on_value=1, off_value=0):
r"""Return the one-hot representation for input. r"""Return the one-hot representation for input.
.. math:: .. math::
...@@ -748,6 +748,10 @@ def one_hot(input, depth): ...@@ -748,6 +748,10 @@ def one_hot(input, depth):
The input tensor. The input tensor.
depth : int depth : int
The depth of channels. The depth of channels.
on_value : int, optional, default=1
The value for equal branch.
off_value : int, optional, default=0
The value for not-equal branch.
Returns Returns
------- -------
...@@ -755,7 +759,12 @@ def one_hot(input, depth): ...@@ -755,7 +759,12 @@ def one_hot(input, depth):
The output tensor. The output tensor.
""" """
return _functions.OneHot.instantiate(input.device, depth=depth).apply(input) return _functions.OneHot.instantiate(
input.device,
depth=depth,
on_value=on_value,
off_value=off_value,
).apply(input)
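With the new arguments, encodings other than the plain 0/1 pair are possible. A NumPy sketch of the resulting values (reference semantics only):

```python
import numpy as np

def one_hot(index, depth, on_value=1, off_value=0):
    """Reference semantics for the extended one_hot."""
    out = np.full(index.shape + (depth,), off_value)
    np.put_along_axis(out, index[..., None], on_value, axis=-1)
    return out

print(one_hot(np.array([0, 2, 1]), depth=3, on_value=5, off_value=-1))
# [[ 5 -1 -1]
#  [-1 -1  5]
#  [-1  5 -1]]
```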
def permute(input, dims): def permute(input, dims):
...@@ -812,7 +821,7 @@ def reshape(input, shape, out=None): ...@@ -812,7 +821,7 @@ def reshape(input, shape, out=None):
shape : Sequence[int] shape : Sequence[int]
The new shape. The new shape.
out : dragon.vm.torch.Tensor, optional out : dragon.vm.torch.Tensor, optional
The optional output tensor. The output tensor.
Returns Returns
------- -------
...@@ -986,7 +995,7 @@ def stack(seq, dim=0, out=None): ...@@ -986,7 +995,7 @@ def stack(seq, dim=0, out=None):
dim : int, optional, default=0 dim : int, optional, default=0
The dim to stack. The dim to stack.
out : dragon.vm.torch.Tensor, optional out : dragon.vm.torch.Tensor, optional
The optional output tensor. The output tensor.
Returns Returns
------- -------
...@@ -1030,7 +1039,7 @@ def sum(input, dim=None, keepdim=False, out=None): ...@@ -1030,7 +1039,7 @@ def sum(input, dim=None, keepdim=False, out=None):
keepdim : bool, optional, default=False keepdim : bool, optional, default=False
Keep the reduced dimensions or not. Keep the reduced dimensions or not.
out : dragon.vm.torch.Tensor, optional out : dragon.vm.torch.Tensor, optional
The optional output tensor. The output tensor.
Returns Returns
------- -------
......
...@@ -59,7 +59,7 @@ def arange( ...@@ -59,7 +59,7 @@ def arange(
step : number, optional, default=1 step : number, optional, default=1
The spacing between two elements. The spacing between two elements.
out : dragon.vm.torch.Tensor, optional out : dragon.vm.torch.Tensor, optional
The optional output tensor. The output tensor.
dtype : str, optional, default='int64' dtype : str, optional, default='int64'
The optional data type. The optional data type.
device : dragon.vm.torch.device, optional device : dragon.vm.torch.device, optional
...@@ -113,7 +113,7 @@ def eye( ...@@ -113,7 +113,7 @@ def eye(
m : int, optional m : int, optional
The number output cols. The number output cols.
out : dragon.vm.torch.Tensor, optional out : dragon.vm.torch.Tensor, optional
The optional output tensor. The output tensor.
dtype : str, optional, default='float32' dtype : str, optional, default='float32'
The optional data type. The optional data type.
device : dragon.vm.torch.device, optional device : dragon.vm.torch.device, optional
...@@ -175,7 +175,7 @@ def full( ...@@ -175,7 +175,7 @@ def full(
fill_value : number fill_value : number
The scalar to fill. The scalar to fill.
out : dragon.vm.torch.Tensor, optional out : dragon.vm.torch.Tensor, optional
The optional output tensor. The output tensor.
dtype : str, optional, default='int64' dtype : str, optional, default='int64'
The optional data type. The optional data type.
device : dragon.vm.torch.device, optional device : dragon.vm.torch.device, optional
...@@ -216,7 +216,7 @@ def full_like( ...@@ -216,7 +216,7 @@ def full_like(
fill_value : number fill_value : number
The scalar to fill. The scalar to fill.
out : dragon.vm.torch.Tensor, optional out : dragon.vm.torch.Tensor, optional
The optional output tensor. The output tensor.
dtype : str, optional, default='int64' dtype : str, optional, default='int64'
The optional data type. The optional data type.
device : dragon.vm.torch.device, optional device : dragon.vm.torch.device, optional
...@@ -268,7 +268,7 @@ def linspace( ...@@ -268,7 +268,7 @@ def linspace(
steps : int, optional, default=100 steps : int, optional, default=100
The number of values to generate. The number of values to generate.
out : dragon.vm.torch.Tensor, optional out : dragon.vm.torch.Tensor, optional
The optional output tensor. The output tensor.
dtype : str, optional, default='int64' dtype : str, optional, default='int64'
The optional data type. The optional data type.
dim : int, optional, default=0 dim : int, optional, default=0
...@@ -326,7 +326,7 @@ def ones(*size, **kwargs): ...@@ -326,7 +326,7 @@ def ones(*size, **kwargs):
size : int... size : int...
The size(s) indicating the out shape. The size(s) indicating the out shape.
out : dragon.vm.torch.Tensor, optional out : dragon.vm.torch.Tensor, optional
The optional output tensor. The output tensor.
dtype : str, optional, default='float32' dtype : str, optional, default='float32'
The optional data type. The optional data type.
device : dragon.vm.torch.device, optional device : dragon.vm.torch.device, optional
...@@ -378,7 +378,7 @@ def rand(*size, **kwargs): ...@@ -378,7 +378,7 @@ def rand(*size, **kwargs):
size : int... size : int...
The size(s) indicating the out shape. The size(s) indicating the out shape.
out : dragon.vm.torch.Tensor, optional out : dragon.vm.torch.Tensor, optional
The optional output tensor. The output tensor.
dtype : str, optional, default='float32' dtype : str, optional, default='float32'
The optional data type. The optional data type.
device : dragon.vm.torch.device, optional device : dragon.vm.torch.device, optional
...@@ -404,7 +404,7 @@ def randn(*size, **kwargs): ...@@ -404,7 +404,7 @@ def randn(*size, **kwargs):
size : int... size : int...
The size(s) indicating the out shape. The size(s) indicating the out shape.
out : dragon.vm.torch.Tensor, optional out : dragon.vm.torch.Tensor, optional
The optional output tensor. The output tensor.
dtype : str, optional, default='float32' dtype : str, optional, default='float32'
The optional data type. The optional data type.
device : dragon.vm.torch.device, optional device : dragon.vm.torch.device, optional
...@@ -436,7 +436,7 @@ def randperm(n, out=None, dtype='int64', device=None, requires_grad=False): ...@@ -436,7 +436,7 @@ def randperm(n, out=None, dtype='int64', device=None, requires_grad=False):
n: number n: number
The end of interval. The end of interval.
out : dragon.vm.torch.Tensor, optional out : dragon.vm.torch.Tensor, optional
The optional output tensor. The output tensor.
dtype : str, optional, default='int64' dtype : str, optional, default='int64'
The optional data type. The optional data type.
device : dragon.vm.torch.device, optional device : dragon.vm.torch.device, optional
...@@ -479,7 +479,7 @@ def zeros(*size, **kwargs): ...@@ -479,7 +479,7 @@ def zeros(*size, **kwargs):
size : int... size : int...
The size(s) indicating the out shape. The size(s) indicating the out shape.
out : dragon.vm.torch.Tensor, optional out : dragon.vm.torch.Tensor, optional
The optional output tensor. The output tensor.
dtype : str, optional, default='float32' dtype : str, optional, default='float32'
The optional data type. The optional data type.
device : dragon.vm.torch.device, optional device : dragon.vm.torch.device, optional
......
...@@ -77,36 +77,42 @@ class Clip(function.Function): ...@@ -77,36 +77,42 @@ class Clip(function.Function):
return self.dispatch([input], [self.alloc(out)]) return self.dispatch([input], [self.alloc(out)])
class UnaryFunc(function.Function): class Gemm(function.Function):
"""Unary function.""" """Gemm function."""
def __init__(self, key, dev, **kwargs): def __init__(self, key, dev, **kwargs):
super(UnaryFunc, self).__init__(key, dev, **kwargs) super(Gemm, self).__init__(key, dev, **kwargs)
self.op_type = kwargs.get('op_type', '') self.alpha = kwargs.get('alpha', 1.0)
self.beta = kwargs.get('beta', 1.0)
self.transA = kwargs.get('transA', False)
self.transB = kwargs.get('transB', False)
def attributes(self): def attributes(self):
return {'op_type': self.op_type, 'arguments': {}} return {
'op_type': 'Gemm',
'arguments': {
'axis': -1,
'alpha': self.alpha,
'beta': self.beta,
'transA': self.transA,
'transB': self.transB,
},
}
def forward(self, input, out=None): def forward(self, mat1, mat2, mat3=None, out=None):
return self.dispatch([input], [self.alloc(out)]) inputs = [mat1, mat2] + ([mat3] if mat3 else [])
return self.dispatch(inputs, [self.alloc(out)])
class MatMul(function.Function): class UnaryFunc(function.Function):
"""MatMul function.""" """Unary function."""
def __init__(self, key, dev, **kwargs): def __init__(self, key, dev, **kwargs):
super(MatMul, self).__init__(key, dev, **kwargs) super(UnaryFunc, self).__init__(key, dev, **kwargs)
self.transpose_a = kwargs.get('transpose_a', False) self.op_type = kwargs.get('op_type', '')
self.transpose_b = kwargs.get('transpose_b', False)
def attributes(self): def attributes(self):
return { return {'op_type': self.op_type, 'arguments': {}}
'op_type': 'MatMul',
'arguments': {
'transA': self.transpose_a,
'transB': self.transpose_b,
},
}
def forward(self, mat1, mat2, out=None): def forward(self, input, out=None):
return self.dispatch([mat1, mat2], [self.alloc(out)]) return self.dispatch([input], [self.alloc(out)])
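The `Gemm` function forwards ONNX-style attributes; for reference, the computation those attributes describe, sketched with NumPy for 2-D operands (`gemm_reference` is an illustrative name):

```python
import numpy as np

def gemm_reference(A, B, C=None, alpha=1.0, beta=1.0, transA=False, transB=False):
    """ONNX Gemm semantics: alpha * A' @ B' + beta * C, with optional transposes."""
    A = A.T if transA else A
    B = B.T if transB else B
    Y = alpha * (A @ B)
    if C is not None:
        Y = Y + beta * C
    return Y

A = np.random.rand(3, 2).astype('float32')
B = np.random.rand(3, 4).astype('float32')
C = np.random.rand(4).astype('float32')
print(gemm_reference(A, B, C, transA=True).shape)  # (2, 4)
```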
...@@ -85,6 +85,35 @@ def add_(self, other): ...@@ -85,6 +85,35 @@ def add_(self, other):
return math_funcs.add(self, other, self) return math_funcs.add(self, other, self)
def addmm(self, mat1, mat2, beta=1, alpha=1):
r"""Add the result of matrix-matrix multiplication.
.. math:: \text{out} = \alpha (\text{mat1} \times \text{mat2}) + \beta \text{self}
Parameters
----------
mat1 : dragon.vm.torch.Tensor
The first matrix.
mat2 : dragon.vm.torch.Tensor
The second matrix.
beta : float, optional, default=1
The value to :math:`\beta`.
alpha : float, optional, default=1
The value to :math:`\alpha`.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
See Also
--------
`torch.addmm(...)`_
"""
return math_funcs.addmm(self, mat1, mat2, beta=beta, alpha=alpha)
def argmax(self, dim=None, keepdim=False): def argmax(self, dim=None, keepdim=False):
"""Return the index of maximum elements. """Return the index of maximum elements.
...@@ -154,6 +183,71 @@ def argsort(self, dim=-1, descending=False): ...@@ -154,6 +183,71 @@ def argsort(self, dim=-1, descending=False):
return array_funcs.argsort(self, dim, descending) return array_funcs.argsort(self, dim, descending)
def baddbmm(self, batch1, batch2, beta=1, alpha=1):
r"""Add the result of batched matrix-matrix multiplication.
.. math::
\text{out}_{i} = \alpha (\text{batch1}_{i} \times \text{batch2}_{i}) +
\beta \text{self}_{i}
Parameters
----------
batch1 : dragon.vm.torch.Tensor
The first batch of matrices.
batch2 : dragon.vm.torch.Tensor
The second batch of matrices.
beta : float, optional, default=1
The value to :math:`\beta`.
alpha : float, optional, default=1
The value to :math:`\alpha`.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
See Also
--------
`torch.baddbmm(...)`_
"""
return math_funcs.baddbmm(self, batch1, batch2, beta=beta, alpha=alpha)
def baddbmm_(self, batch1, batch2, beta=1, alpha=1):
r"""Add the result of batched matrix-matrix multiplication.
.. math::
\text{self}_{i} = \alpha (\text{batch1}_{i} \times \text{batch2}_{i}) +
\beta \text{self}_{i}
Parameters
----------
batch1 : dragon.vm.torch.Tensor
The first batch of matrices.
batch2 : dragon.vm.torch.Tensor
The second batch of matrices.
beta : float, optional, default=1
The value to :math:`\beta`.
alpha : float, optional, default=1
The value to :math:`\alpha`.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
See Also
--------
`torch.baddbmm(...)`_
"""
return math_funcs.baddbmm(
self, batch1, batch2,
beta=beta, alpha=alpha, out=self,
)
def backward(self, gradient=None, retain_graph=False): def backward(self, gradient=None, retain_graph=False):
"""Compute the derivatives of this tensor w.r.t. graph leaves. """Compute the derivatives of this tensor w.r.t. graph leaves.
...@@ -254,6 +348,29 @@ def bitwise_xor_(self, other): ...@@ -254,6 +348,29 @@ def bitwise_xor_(self, other):
return math_funcs.bitwise_xor(self, other, self) return math_funcs.bitwise_xor(self, other, self)
def bmm(self, batch2):
r"""Compute the batched matrix multiplication.
.. math:: \text{out}_{i} = \text{self}_{i} \times \text{batch2}_{i}
Parameters
----------
batch2 : dragon.vm.torch.Tensor
The second batch of matrices.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
See Also
--------
`torch.bmm(...)`_
"""
return math_funcs.bmm(self, batch2)
def bool(self): def bool(self):
"""Return a bool tensor with the same data. """Return a bool tensor with the same data.
...@@ -719,50 +836,6 @@ def floor_(self): ...@@ -719,50 +836,6 @@ def floor_(self):
return math_funcs.floor(self, self) return math_funcs.floor(self, self)
def new_full(
self,
size,
fill_value,
dtype=None,
device=None,
requires_grad=False,
):
"""Return a tensor filled with a scalar.
Refer this tensor if ``dtype`` and ``device`` not provided.
Parameters
----------
size : Sequence[int]
The size of output tensor.
fill_value : number
The scalar to fill.
dtype : str, optional
The optional data type.
device : dragon.vm.torch.device, optional
The optional device of returned tensor.
requires_grad : bool, optional, default=False
**True** to record gradient for returned tensor.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
See Also
--------
`torch.full(...)`_
"""
return init_funcs.full(
size,
fill_value,
dtype=self.dtype if dtype is None else dtype,
device=self.device if device is None else device,
requires_grad=requires_grad,
)
def ge(self, other): def ge(self, other):
r"""Compute the element-wise greater-equal comparison. r"""Compute the element-wise greater-equal comparison.
...@@ -1104,6 +1177,29 @@ def masked_select(self, mask): ...@@ -1104,6 +1177,29 @@ def masked_select(self, mask):
return array_funcs.masked_select(self, mask) return array_funcs.masked_select(self, mask)
def matmul(self, tensor2):
r"""Compute the matrix multiplication.
.. math:: \text{out} = \text{self} \times \text{tensor2}
Parameters
----------
tensor2 : dragon.vm.torch.Tensor
The tensor to multiply.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
See Also
--------
`torch.matmul(...)`_
"""
return math_funcs.matmul(self, tensor2)
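A hedged sketch of ``matmul``; per the commit summary the operation is backed by batched Dot, GEMV and GEMM, so the expected shapes below follow the usual NumPy/ONNX matmul convention and are an assumption, not taken from the diff:
from dragon.vm import torch
m = torch.ones(3, 4)
w = torch.ones(4, 5)
v = torch.ones(4)
y1 = m.matmul(w)   # GEMM case, shape: (3, 5)
y2 = m.matmul(v)   # GEMV case, expected shape: (3,)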
def max(self, dim=None, keepdim=False): def max(self, dim=None, keepdim=False):
"""Compute the max value of elements along the given dimension. """Compute the max value of elements along the given dimension.
...@@ -1383,6 +1479,50 @@ def neg_(self): ...@@ -1383,6 +1479,50 @@ def neg_(self):
return math_funcs.neg(self, self) return math_funcs.neg(self, self)
def new_full(
self,
size,
fill_value,
dtype=None,
device=None,
requires_grad=False,
):
"""Return a tensor filled with a scalar.
Refer to this tensor if ``dtype`` and ``device`` are not provided.
Parameters
----------
size : Sequence[int]
The size of output tensor.
fill_value : number
The scalar to fill.
dtype : str, optional
The optional data type.
device : dragon.vm.torch.device, optional
The optional device of returned tensor.
requires_grad : bool, optional, default=False
**True** to record gradient for returned tensor.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
See Also
--------
`torch.full(...)`_
"""
return init_funcs.full(
size,
fill_value,
dtype=self.dtype if dtype is None else dtype,
device=self.device if device is None else device,
requires_grad=requires_grad,
)
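A minimal sketch of ``new_full``, using string dtypes as documented above (``ones`` assumed available):
from dragon.vm import torch
x = torch.ones(2, 3)
y = x.new_full((4, 4), 1.5)               # inherits x.dtype and x.device
z = x.new_full((4, 4), 0, dtype='int64')  # overrides the data type only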
def nonzero(self): def nonzero(self):
r"""Return the index of non-zero elements. r"""Return the index of non-zero elements.
...@@ -1735,7 +1875,7 @@ def sort(self, dim=-1, descending=False): ...@@ -1735,7 +1875,7 @@ def sort(self, dim=-1, descending=False):
def split(self, split_size_or_sections, dim=0): def split(self, split_size_or_sections, dim=0):
"""Return the splited chunks along the given dimension. """Return the split chunks along the given dimension.
Parameters Parameters
---------- ----------
...@@ -2132,14 +2272,18 @@ def _process_index(item): ...@@ -2132,14 +2272,18 @@ def _process_index(item):
Tensor.abs = abs Tensor.abs = abs
Tensor.add = add Tensor.add = add
Tensor.add_ = add_ Tensor.add_ = add_
Tensor.addmm = addmm
Tensor.argmax = argmax Tensor.argmax = argmax
Tensor.argmin = argmin Tensor.argmin = argmin
Tensor.argsort = argsort Tensor.argsort = argsort
Tensor.backward = backward Tensor.backward = backward
Tensor.baddbmm = baddbmm
Tensor.baddbmm_ = baddbmm_
Tensor.bitwise_not = bitwise_not Tensor.bitwise_not = bitwise_not
Tensor.bitwise_not_ = bitwise_not_ Tensor.bitwise_not_ = bitwise_not_
Tensor.bitwise_xor = bitwise_xor Tensor.bitwise_xor = bitwise_xor
Tensor.bitwise_xor_ = bitwise_xor_ Tensor.bitwise_xor_ = bitwise_xor_
Tensor.bmm = bmm
Tensor.bool = bool Tensor.bool = bool
Tensor.bool_ = bool_ Tensor.bool_ = bool_
Tensor.byte = byte Tensor.byte = byte
...@@ -2184,6 +2328,7 @@ Tensor.logsumexp = logsumexp ...@@ -2184,6 +2328,7 @@ Tensor.logsumexp = logsumexp
Tensor.lt = lt Tensor.lt = lt
Tensor.masked_fill_ = masked_fill_ Tensor.masked_fill_ = masked_fill_
Tensor.masked_select = masked_select Tensor.masked_select = masked_select
Tensor.matmul = matmul
Tensor.max = max Tensor.max = max
Tensor.maximum = maximum Tensor.maximum = maximum
Tensor.mean = mean Tensor.mean = mean
......
...@@ -270,6 +270,33 @@ class Tensor(object): ...@@ -270,6 +270,33 @@ class Tensor(object):
""" """
def addmm(self, mat1, mat2, beta=1, alpha=1):
r"""Add the result of matrix-matrix multiplication.
.. math:: \text{out} = \alpha (\text{mat1} \times \text{mat2}) + \beta \text{self}
Parameters
----------
mat1 : dragon.vm.torch.Tensor
The first matrix.
mat2 : dragon.vm.torch.Tensor
The second matrix.
beta : float, optional, default=1
The value of :math:`\beta`.
alpha : float, optional, default=1
The value of :math:`\alpha`.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
See Also
--------
`torch.addmm(...)`_
"""
def argmax(self, dim=None, keepdim=False): def argmax(self, dim=None, keepdim=False):
"""Return the index of maximum elements. """Return the index of maximum elements.
...@@ -345,6 +372,64 @@ class Tensor(object): ...@@ -345,6 +372,64 @@ class Tensor(object):
""" """
def baddbmm(self, batch1, batch2, beta=1, alpha=1):
r"""Add the result of batched matrix-matrix multiplication.
.. math::
\text{out}_{i} = \alpha (\text{batch1}_{i} \times \text{batch2}_{i}) +
\beta \text{self}_{i}
Parameters
----------
batch1 : dragon.vm.torch.Tensor
The first batch of matrices.
batch2 : dragon.vm.torch.Tensor
The second batch of matrices.
beta : float, optional, default=1
The value of :math:`\beta`.
alpha : float, optional, default=1
The value of :math:`\alpha`.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
See Also
--------
`torch.baddbmm(...)`_
"""
def baddbmm_(self, batch1, batch2, beta=1, alpha=1):
r"""Add the result of batched matrix-matrix multiplication.
.. math::
\text{self}_{i} = \alpha (\text{batch1}_{i} \times \text{batch2}_{i}) +
\beta \text{self}_{i}
Parameters
----------
batch1 : dragon.vm.torch.Tensor
The first batch of matrices.
batch2 : dragon.vm.torch.Tensor
The second batch of matrices.
beta : float, optional, default=1
The value of :math:`\beta`.
alpha : float, optional, default=1
The value of :math:`\alpha`.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
See Also
--------
`torch.baddbmm(...)`_
"""
def bitwise_not(self): def bitwise_not(self):
r"""Compute the element-wise NOT bitwise operation. r"""Compute the element-wise NOT bitwise operation.
...@@ -419,6 +504,27 @@ class Tensor(object): ...@@ -419,6 +504,27 @@ class Tensor(object):
""" """
def bmm(self, batch2):
r"""Compute the batched matrix multiplication.
.. math:: \text{out}_{i} = \text{self}_{i} \times \text{batch2}_{i}
Parameters
----------
batch2 : dragon.vm.torch.Tensor
The second batch of matrices.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
See Also
--------
`torch.bmm(...)`_
"""
def bool(self): def bool(self):
"""Return a bool tensor with the same data. """Return a bool tensor with the same data.
...@@ -1192,6 +1298,27 @@ class Tensor(object): ...@@ -1192,6 +1298,27 @@ class Tensor(object):
""" """
def matmul(self, tensor2):
r"""Compute the matrix multiplication.
.. math:: \text{out} = \text{self} \times \text{tensor2}
Parameters
----------
tensor2 : dragon.vm.torch.Tensor
The tensor to multiply.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
See Also
--------
`torch.matmul(...)`_
"""
def max(self, dim=None, keepdim=False): def max(self, dim=None, keepdim=False):
"""Compute the max value of elements along the given dimension. """Compute the max value of elements along the given dimension.
......