Commit 094c8c32 by Ting PAN

Add Reverse operator

Summary:
This commit adds the reverse (flip) operator.
1 parent bdf4e10f
......@@ -136,6 +136,9 @@ dragon
`reshape(...) <dragon/reshape.html>`_
: Change the dimensions of input.
`reverse(...) <dragon/reverse.html>`_
: Reverse elements along the given axis.
`roll(...) <dragon/roll.html>`_
: Roll elements along the given axis.
......@@ -237,6 +240,7 @@ dragon
dragon/repeat
dragon/reset_workspace
dragon/reshape
dragon/reverse
dragon/roll
dragon/scatter_add
dragon/scatter_elements
......
reverse
=======
.. autofunction:: dragon.reverse
.. raw:: html
<style>
h1:before {
content: "dragon.";
color: #103d3e;
}
</style>
......@@ -87,6 +87,9 @@ vm.tensorflow
`reshape(...) <tensorflow/reshape.html>`_
: Change the dimensions of input.
`reverse(...) <tensorflow/reverse.html>`_
: Reverse elements along the given axis.
`roll(...) <tensorflow/roll.html>`_
: Roll elements along the given axis.
......@@ -152,6 +155,7 @@ vm.tensorflow
tensorflow/pad
tensorflow/range
tensorflow/reshape
tensorflow/reverse
tensorflow/roll
tensorflow/shape
tensorflow/slice
......
reverse
=======
.. autofunction:: dragon.vm.tensorflow.reverse
.. raw:: html
<style>
h1:before {
content: "tf.";
color: #103d3e;
}
</style>
......@@ -111,6 +111,15 @@ vm.torch
`flatten(...) <torch/flatten.html>`_
: Return a tensor with dimensions flattened.
`flip(...) <torch/flip.html>`_
: Reverse elements along the given dimension.
`fliplr(...) <torch/fliplr.html>`_
: Reverse elements along the second dimension.
`flipud(...) <torch/flipud.html>`_
: Reverse elements along the first dimension.
`floor(...) <torch/floor.html>`_
: Compute the largest integer not greater than input.
......@@ -350,6 +359,9 @@ vm.torch
torch/exp
torch/eye
torch/flatten
torch/flip
torch/fliplr
torch/flipud
torch/floor
torch/from_numpy
torch/full
......
......@@ -233,6 +233,18 @@ flatten\_
#########
.. automethod:: dragon.vm.torch.Tensor.flatten_
flip
####
.. automethod:: dragon.vm.torch.Tensor.flip
fliplr
######
.. automethod:: dragon.vm.torch.Tensor.fliplr
flipud
######
.. automethod:: dragon.vm.torch.Tensor.flipud
float
#####
.. automethod:: dragon.vm.torch.Tensor.float
......@@ -650,6 +662,9 @@ zero\_
.. _torch.eq(...): eq.html
.. _torch.exp(...): exp.html
.. _torch.flatten(...): flatten.html
.. _torch.flip(...): flip.html
.. _torch.fliplr(...): fliplr.html
.. _torch.flipud(...): flipud.html
.. _torch.floor(...): floor.html
.. _torch.full(...): full.html
.. _torch.gather(...): gather.html
......
flip
====
.. autofunction:: dragon.vm.torch.flip
.. raw:: html
<style>
h1:before {
content: "torch.";
color: #103d3e;
}
</style>
fliplr
======
.. autofunction:: dragon.vm.torch.fliplr
.. raw:: html
<style>
h1:before {
content: "torch.";
color: #103d3e;
}
</style>
flipud
======
.. autofunction:: dragon.vm.torch.flipud
.. raw:: html
<style>
h1:before {
content: "torch.";
color: #103d3e;
}
</style>
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
namespace kernels {
namespace {
template <typename T>
void _Reverse(
const int num_dims,
const uint8_t* x_flips,
const int64_t* x_strides,
const int64_t* y_dims,
const T* x,
T* y) {
const auto N =
std::accumulate(y_dims, y_dims + num_dims, 1, std::multiplies<int64_t>());
vec64_t index(num_dims, 0);
int64_t xi;
for (int yi = 0; yi < N; ++yi) {
xi = 0;
for (int d = num_dims - 1; d >= 0; --d) {
xi += (x_flips[d] ? y_dims[d] - index[d] - 1 : index[d]) * x_strides[d];
}
y[yi] = x[xi];
math::utils::IncreaseIndexInDims(num_dims, y_dims, index.data());
}
}
} // namespace
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void Reverse<T, CPUContext>( \
const int num_dims, \
const uint8_t* x_flips, \
const int64_t* x_strides, \
const int64_t* y_dims, \
const T* x, \
T* y, \
CPUContext* ctx) { \
_Reverse(num_dims, x_flips, x_strides, y_dims, x, y); \
}
DEFINE_KERNEL_LAUNCHER(bool);
DEFINE_KERNEL_LAUNCHER(uint8_t);
DEFINE_KERNEL_LAUNCHER(int8_t);
DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernels
} // namespace dragon
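The CPU kernel above walks the output in row-major order and, for each flipped dimension, mirrors the coordinate before applying the input stride. A minimal NumPy sketch of that same index arithmetic, checked against `np.flip` (the helper name and shapes are illustrative, not part of the commit):

```python
import numpy as np

def reverse_gather(x, flips):
    # Reverse x along the dims flagged in `flips`, using the same
    # stride-and-mirror gather as the CPU kernel above.
    x = np.ascontiguousarray(x)
    strides = [s // x.itemsize for s in x.strides]  # element strides per dim
    y = np.empty_like(x)
    y_flat, x_flat = y.reshape(-1), x.reshape(-1)
    for yi, index in enumerate(np.ndindex(*x.shape)):
        xi = 0
        for d in reversed(range(x.ndim)):
            coord = x.shape[d] - index[d] - 1 if flips[d] else index[d]
            xi += coord * strides[d]
        y_flat[yi] = x_flat[xi]
    return y

x = np.arange(24, dtype=np.float32).reshape(2, 3, 4)
assert np.array_equal(reverse_gather(x, [0, 1, 1]), np.flip(x, axis=(1, 2)))
```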
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
namespace kernels {
namespace {
template <typename T, int D>
__global__ void _Reverse(
const int N,
const int num_dims,
const SimpleArray<uint8_t, D> X_flips,
const SimpleArray<int, D> X_strides,
const SimpleArray<int, D> Y_dims,
const T* x,
T* y) {
CUDA_1D_KERNEL_LOOP(yi, N) {
int xi = 0, tmp = yi;
for (int d = num_dims - 1; d >= 0; --d) {
int r;
FIXED_DIVISOR_DIV_MOD(Y_dims.data[d], tmp, &tmp, &r);
xi += (X_flips.data[d] ? Y_dims.data[d] - r - 1 : r) * X_strides.data[d];
}
y[yi] = x[xi];
}
}
} // namespace
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void Reverse<T, CUDAContext>( \
const int num_dims, \
const uint8_t* x_flips, \
const int64_t* x_strides, \
const int64_t* y_dims, \
const T* x, \
T* y, \
CUDAContext* ctx) { \
CUDA_TENSOR_DIMS_CHECK(num_dims); \
SimpleArray<uint8_t, CUDA_TENSOR_MAX_DIMS> X_flips; \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> X_strides, Y_dims; \
const auto N = std::accumulate( \
y_dims, y_dims + num_dims, 1, std::multiplies<int64_t>()); \
for (int i = 0; i < num_dims; ++i) { \
X_flips.data[i] = x_flips[i]; \
X_strides.data[i] = x_strides[i]; \
Y_dims.data[i] = y_dims[i]; \
} \
_Reverse<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, num_dims, X_flips, X_strides, Y_dims, x, y); \
}
DEFINE_KERNEL_LAUNCHER(bool);
DEFINE_KERNEL_LAUNCHER(uint8_t);
DEFINE_KERNEL_LAUNCHER(int8_t);
DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernels
} // namespace dragon
#endif // USE_CUDA
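The CUDA variant does not carry a running index vector per thread; it recovers each coordinate from the flat output index with repeated div/mod (`FIXED_DIVISOR_DIV_MOD`). A Python rendering of that decomposition, again only a sketch to confirm it matches `np.flip`:

```python
import numpy as np

def reverse_by_divmod(x, flips):
    # Same gather as the CPU sketch, but coordinates are peeled off the
    # flat output index innermost-first, mirroring the CUDA kernel loop.
    x = np.ascontiguousarray(x)
    strides = [s // x.itemsize for s in x.strides]
    x_flat = x.reshape(-1)
    y_flat = np.empty_like(x_flat)
    for yi in range(x.size):
        xi, tmp = 0, yi
        for d in reversed(range(x.ndim)):
            tmp, r = divmod(tmp, x.shape[d])  # quotient feeds the next dim
            xi += (x.shape[d] - r - 1 if flips[d] else r) * strides[d]
        y_flat[yi] = x_flat[xi]
    return y_flat.reshape(x.shape)

x = np.arange(24).reshape(2, 3, 4)
assert np.array_equal(reverse_by_divmod(x, [1, 0, 1]), np.flip(x, axis=(0, 2)))
```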
#include "dragon/operators/array/reverse_op.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
template <class Context>
template <typename T>
void ReverseOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0);
int num_dims = X.ndim();
vector<uint8_t> X_flips(num_dims, 0);
for (int i = 0; i < axes_.size(); ++i) {
int axis = axes_[i];
axis = axis < 0 ? axis + num_dims : axis;
CHECK(axis >= 0 && axis < num_dims)
<< "\nExcepted the <axis> in [-" << num_dims << ", " << num_dims
<< "), got " << axes_[i] << ".";
X_flips[axis] = 1;
}
kernels::Reverse(
num_dims,
X_flips.data(),
X.strides().data(),
X.dims().data(),
X.template data<T, Context>(),
Y->ReshapeLike(X)->template mutable_data<T, Context>(),
ctx());
}
DEPLOY_CPU_OPERATOR(Reverse);
REGISTER_CPU_OPERATOR(ReverseGradient, ReverseOp<CPUContext>);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(Reverse);
REGISTER_CUDA_OPERATOR(ReverseGradient, ReverseOp<CUDAContext>);
#endif
OPERATOR_SCHEMA(Reverse)
/* X */
.NumInputs(1)
/* Y */
.NumOutputs(1);
OPERATOR_SCHEMA(ReverseGradient)
/* dY */
.NumInputs(1)
/* dX */
.NumOutputs(1);
REGISTER_GRADIENT(Reverse, SimpleGradientMaker);
} // namespace dragon
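Before dispatching to the kernel, the operator canonicalizes negative axes and collapses them into a per-dimension flip mask. A hedged Python equivalent of that bookkeeping (names are illustrative; the real check lives in `DoRunWithType` above):

```python
def make_flip_mask(axes, num_dims):
    # One 0/1 flag per dimension, as expected by kernels::Reverse.
    flips = [0] * num_dims
    for axis in axes:
        canonical = axis + num_dims if axis < 0 else axis
        if not 0 <= canonical < num_dims:
            raise ValueError('Expected the <axis> in [-{0}, {0}), got {1}.'
                             .format(num_dims, axis))
        flips[canonical] = 1
    return flips

assert make_flip_mask([0, -1], 3) == [1, 0, 1]
```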
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, see,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_ARRAY_REVERSE_OP_H_
#define DRAGON_OPERATORS_ARRAY_REVERSE_OP_H_
#include "dragon/core/operator.h"
namespace dragon {
template <class Context>
class ReverseOp final : public Operator<Context> {
public:
ReverseOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws), axes_(OP_REPEATED_ARG(int64_t, "axes")) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override {
DispatchHelper<dtypes::Generic>::Call(this, Input(0));
}
template <typename T>
void DoRunWithType();
protected:
vec64_t axes_;
};
} // namespace dragon
#endif // DRAGON_OPERATORS_ARRAY_REVERSE_OP_H_
......@@ -11,11 +11,8 @@ void GroupNormOp<Context>::DoRunWithType() {
using ParamT = typename math::AccmulatorType<T>::type;
INITIALIZE_TENSOR_VIA_SPEC(Input(1), vec64_t({C_}), ParamT);
INITIALIZE_TENSOR_VIA_SPEC(Input(2), vec64_t({C_}), ParamT);
auto* X_mu = Buffer("X_mu")->Reshape({N_, G_});
auto* X_rsig = Buffer("X_rsig")->Reshape({N_, G_});
auto* X_scale = Buffer("X_scale")->Reshape({N_, C_});
auto* X_bias = Buffer("X_bias")->Reshape({N_, C_});
auto* x = Input(0).template data<T, Context>();
auto* mu = X_mu->template mutable_data<ParamT, Context>();
......@@ -36,6 +33,8 @@ void GroupNormOp<Context>::DoRunWithType() {
math::InvStd(N_ * G_, epsilon_, rsig, rsig, ctx());
// Fuse parameters to compute affine transformation
auto* scratch =
ctx()->workspace()->template data<ParamT, Context>({2 * N_ * C_})[0];
kernels::GroupNorm(
N_,
G_,
......@@ -47,8 +46,8 @@ void GroupNormOp<Context>::DoRunWithType() {
rsig,
Input(1).template data<ParamT, Context>(), // gamma
Input(2).template data<ParamT, Context>(), // beta
X_scale->template mutable_data<ParamT, Context>(),
X_bias->template mutable_data<ParamT, Context>(),
scratch,
scratch + N_ * C_,
Output(0)->template mutable_data<T, Context>(),
ctx());
}
......@@ -65,12 +64,11 @@ template <typename T>
void GroupNormGradientOp<Context>::DoRunWithType() {
using ParamT = typename math::AccmulatorType<T>::type;
auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
auto *X_mu = Buffer("X_mu"), *X_rsig = Buffer("X_rsig");
auto* X_scale = Buffer("X_scale")->Reshape({N_, G_});
auto* X_bias = Buffer("X_bias")->Reshape({N_, G_});
// Gradient w.r.t. gamma, beta and input
auto* scratch =
ctx()->workspace()->template data<ParamT, Context>({2 * N_ * G_})[0];
kernels::GroupNormGrad(
N_,
G_,
......@@ -82,8 +80,8 @@ void GroupNormGradientOp<Context>::DoRunWithType() {
X_rsig->template data<ParamT, Context>(),
Input(1).template data<ParamT, Context>(), // gamma
Input(2).template data<T, Context>(), // dy
X_scale->template mutable_data<ParamT, Context>(),
X_bias->template mutable_data<ParamT, Context>(),
scratch,
scratch + N_ * G_,
dW->Reshape({C_})->template mutable_data<ParamT, Context>(),
dB->Reshape({C_})->template mutable_data<ParamT, Context>(),
dX->template mutable_data<T, Context>(),
......@@ -120,7 +118,6 @@ OPERATOR_SCHEMA(GroupNormGradient)
.NumOutputs(3);
namespace {
class GradientMaker final : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GradientMaker);
......
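Aside on the GroupNorm hunk: it only replaces the persistent `X_scale`/`X_bias` buffers with a transient workspace scratch whose two halves hold the fused per-(N, C) scale and bias. Under the usual NCHW group-norm formulation, that fusion looks roughly like the NumPy sketch below (an illustration of the math, not the `kernels::GroupNorm` implementation):

```python
import numpy as np

def group_norm_fused(x, gamma, beta, groups, eps=1e-5):
    # y = x * scale + bias with scale = gamma * rsig and
    # bias = beta - mu * scale, both materialized per (N, C),
    # which is what the two halves of the scratch buffer hold.
    n, c = x.shape[:2]
    xg = x.reshape(n, groups, -1)                  # (N, G, (C/G) * spatial)
    mu = xg.mean(axis=2)                           # (N, G)
    rsig = 1.0 / np.sqrt(xg.var(axis=2) + eps)     # (N, G)
    rep = c // groups
    scale = gamma.reshape(1, c) * np.repeat(rsig, rep, axis=1)      # (N, C)
    bias = beta.reshape(1, c) - np.repeat(mu, rep, axis=1) * scale  # (N, C)
    shape = (n, c) + (1,) * (x.ndim - 2)
    return x * scale.reshape(shape) + bias.reshape(shape)

x = np.random.randn(2, 6, 4, 4).astype(np.float32)
y = group_norm_fused(x, np.ones(6, 'float32'), np.zeros(6, 'float32'), groups=3)
print(float(y.mean()), float(y.std()))  # roughly 0 and 1
```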
......@@ -76,6 +76,7 @@ from dragon.core.ops.array_ops import pad
from dragon.core.ops.array_ops import range
from dragon.core.ops.array_ops import repeat
from dragon.core.ops.array_ops import reshape
from dragon.core.ops.array_ops import reverse
from dragon.core.ops.array_ops import roll
from dragon.core.ops.array_ops import scatter_add
from dragon.core.ops.array_ops import scatter_elements
......
......@@ -456,6 +456,11 @@ def resize_args(**kwargs):
}
@register('Reverse')
def reverse_args(**kwargs):
return {'axes': kwargs.get('axes', None)}
@register('Recurrent')
def rnn_args(**kwargs):
return {
......
......@@ -1229,6 +1229,42 @@ def reshape(inputs, shape, copy=True, **kwargs):
@OpSchema.num_inputs(1)
def reverse(inputs, axis, **kwargs):
"""Reverse elements along the given axis.
:attr:`axis` could be negative:
```python
x = dragon.constant([[1, 2, 3], [4, 5, 6]])
# A negative axis is the last-k axis
print(dragon.reverse(x, axis=1)) # [[3, 2, 1], [6, 5, 4]]
print(dragon.reverse(x, axis=-1)) # Equivalent
# Also, axis could be a sequence of integers
print(dragon.reverse(x, axis=(0, 1))) # [[6, 5, 4], [3, 2, 1]]
```
Parameters
----------
inputs : dragon.Tensor
The input tensor.
axis : Union[int, Sequence[int]]
The axis to reverse.
Returns
-------
dragon.Tensor
The output tensor.
"""
axes = nest.flatten(axis) if axis is not None else axis
if context.executing_eagerly():
return OpLib.execute('Reverse', inputs, axes=axes)
return OpLib.add('Reverse', inputs, axes=axes, **kwargs)
@OpSchema.num_inputs(1)
@OpSchema.convert_arg('shift', name_v2='shifts')
def roll(inputs, shift, axis=None, **kwargs):
"""Roll elements along the given axis.
......
......@@ -502,6 +502,16 @@ void RepeatGrad(
Context* ctx);
template <typename T, class Context>
void Reverse(
const int num_dims,
const uint8_t* x_flips,
const int64_t* x_strides,
const int64_t* y_dims,
const T* x,
T* y,
Context* ctx);
template <typename T, class Context>
void Roll(
const int num_dims,
const int64_t* x_shifts,
......
......@@ -80,6 +80,7 @@ from dragon.vm.tensorflow.core.ops.array_ops import one_hot
from dragon.vm.tensorflow.core.ops.array_ops import pad
from dragon.vm.tensorflow.core.ops.array_ops import placeholder
from dragon.vm.tensorflow.core.ops.array_ops import reshape
from dragon.vm.tensorflow.core.ops.array_ops import reverse
from dragon.vm.tensorflow.core.ops.array_ops import roll
from dragon.vm.tensorflow.core.ops.array_ops import shape
from dragon.vm.tensorflow.core.ops.array_ops import slice
......
......@@ -503,6 +503,40 @@ def reshape(tensor, shape, name=None):
return array_ops.reshape(tensor, shape=shape, name=name)
def reverse(tensor, axis, name=None):
"""Reverse elements along the given axis.
:attr:`axis` could be negative:
```python
x = tf.constant([[1, 2, 3], [4, 5, 6]])
# A negative axis is the last-k axis
print(tf.reverse(x, axis=1)) # [[3, 2, 1], [6, 5, 4]]
print(tf.reverse(x, axis=-1)) # Equivalent
# Also, axis could be a sequence of integers
print(tf.reverse(x, axis=(0, 1))) # [[6, 5, 4], [3, 2, 1]]
```
Parameters
----------
tensor : dragon.Tensor
The input tensor.
axis : Union[int, Sequence[int]]
The axis to reverse.
name : str, optional
The operation name.
Returns
-------
dragon.Tensor
The output tensor.
"""
return array_ops.reverse(tensor, axis=axis, name=name)
def roll(input, shift, axis, name=None):
"""Roll elements along the given axis.
......
......@@ -938,6 +938,24 @@ class TestArrayOps(OpTestCase):
with dragon.device('cuda'):
self.test_reshape()
def test_reverse(self):
entries = [0, 1, (1, 2)]
for execution in ('EAGER_MODE', 'GRAPH_MODE'):
with execution_context().mode(execution):
for axis in entries:
data = arange((2, 3, 4))
x = new_tensor(data)
with dragon.GradientTape() as tape:
tape.watch(x)
y = dragon.reverse(x, axis)
dx = tape.gradient(y, [x], output_gradients=[x])[0]
self.assertEqual([y, dx], [np.flip(data, axis), np.flip(data, axis)])
@unittest.skipIf(not TEST_CUDA, 'CUDA unavailable')
def test_reverse_cuda(self):
with dragon.device('cuda'):
self.test_reverse()
def test_shape(self):
entries = [(2, 3), (2, 3, 3)]
for execution in ('EAGER_MODE', 'GRAPH_MODE'):
......
......@@ -288,6 +288,13 @@ class TestTensorOps(OpTestCase):
x.flatten_(-3, -2)
self.assertEqual(x, data.reshape((2, 3)))
def test_flip(self):
data = arange((2, 3, 4))
x = new_tensor(data)
self.assertEqual(x.flip((1, 2)), np.flip(data, (1, 2)))
self.assertEqual(x.fliplr(), np.fliplr(data))
self.assertEqual(x.flipud(), np.flipud(data))
def test_floor(self):
data = np.array([0.9, 1.4, 1.9])
x = new_tensor(data)
......
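The test above works because the gradient of a reverse is the same reverse applied to the upstream gradient, which is also why `ReverseGradient` reuses `ReverseOp` with a `SimpleGradientMaker`. A quick NumPy check of that adjoint identity:

```python
import numpy as np

# Flipping is a permutation and an involution, so its Jacobian transpose is
# itself: <flip(x), dy> == <x, flip(dy)> for any x, dy of matching shape.
rng = np.random.default_rng(0)
x = rng.standard_normal((2, 3, 4))
dy = rng.standard_normal((2, 3, 4))
axes = (1, 2)
lhs = np.sum(np.flip(x, axes) * dy)
rhs = np.sum(x * np.flip(dy, axes))
assert np.isclose(lhs, rhs)
```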
......@@ -55,6 +55,9 @@ from dragon.vm.torch.core.ops.array_ops import channel_normalize
from dragon.vm.torch.core.ops.array_ops import chunk
from dragon.vm.torch.core.ops.array_ops import cumsum
from dragon.vm.torch.core.ops.array_ops import flatten
from dragon.vm.torch.core.ops.array_ops import flip
from dragon.vm.torch.core.ops.array_ops import fliplr
from dragon.vm.torch.core.ops.array_ops import flipud
from dragon.vm.torch.core.ops.array_ops import gather
from dragon.vm.torch.core.ops.array_ops import index_select
from dragon.vm.torch.core.ops.array_ops import masked_select
......
......@@ -1557,22 +1557,29 @@ def multi_head_attention_forward(
assert embed_dim == embed_dim_to_check
assert src_len == value.size(0) and key.size(1) == value.size(1)
head_dim = embed_dim // num_heads
scaling = float(head_dim) ** -0.5
def to_qkv(input, weight, bias, num_proj=1):
"""Compute input projections via a single matmul."""
qkv_size = (tgt_len, bsz, num_proj * num_heads, head_dim)
outputs = linear(input, weight, bias).reshape_(qkv_size)
outputs = outputs.permute(1, 2, 0, 3)
return outputs if num_proj == 1 else outputs.chunk(num_proj, 1)
q, k, v = None, None, None
if not use_separate_proj_weight:
if (query is key) and (key is value):
# Parallelism for self attention
q, k, v = linear(query, in_proj_weight, in_proj_bias).chunk(3, dim=-1)
# Parallelism for self attention.
q, k, v = to_qkv(query, in_proj_weight, in_proj_bias, 3)
elif key is value:
# Parallelism for encode-decoder attention
# Parallelism for encoder-decoder attention.
q_proj_weight = in_proj_weight[:embed_dim, :]
kv_proj_weight = in_proj_weight[embed_dim:, :]
q_proj_bias = kv_proj_bias = in_proj_bias
if in_proj_bias is not None:
q_proj_bias = in_proj_bias[:embed_dim]
kv_proj_bias = in_proj_bias[embed_dim:]
q = linear(query, q_proj_weight, q_proj_bias)
k, v = linear(key, kv_proj_weight, kv_proj_bias).chunk(2, dim=-1)
q = to_qkv(query, q_proj_weight, q_proj_bias)
k, v = to_qkv(key, kv_proj_weight, kv_proj_bias, 2)
if q is None:
q_proj_bias = k_proj_bias = v_proj_bias = in_proj_bias
if use_separate_proj_weight and q_proj_weight is None:
......@@ -1583,37 +1590,28 @@ def multi_head_attention_forward(
q_proj_bias = in_proj_bias[:embed_dim]
k_proj_bias = in_proj_bias[embed_dim:embed_dim * 2]
v_proj_bias = in_proj_bias[embed_dim * 2:]
q = linear(query, q_proj_weight, q_proj_bias)
k = linear(key, k_proj_weight, k_proj_bias)
v = linear(value, v_proj_weight, v_proj_bias)
q *= scaling
q = q.reshape_((-1, bsz * num_heads, head_dim)).transpose(0, 1)
k = k.reshape_((-1, bsz * num_heads, head_dim)).transpose(0, 1)
v = v.reshape_((-1, bsz * num_heads, head_dim)).transpose(0, 1)
attn_weights = q.bmm(k.transpose(1, 2))
assert attn_weights.size() == (bsz * num_heads, tgt_len, src_len)
q = to_qkv(query, q_proj_weight, q_proj_bias)
k = to_qkv(key, k_proj_weight, k_proj_bias)
v = to_qkv(value, v_proj_weight, v_proj_bias)
q *= float(head_dim) ** -0.5
attn = q.bmm(k.transpose(-2, -1))
assert attn.size() == (bsz, num_heads, tgt_len, src_len)
if attn_mask is not None:
if attn_mask.dtype == 'bool' or attn_mask.dtype == 'uint8':
attn_weights.masked_fill_(attn_mask, float('-inf'))
attn.masked_fill_(attn_mask, float('-inf'))
else:
attn_weights += attn_mask
attn += attn_mask
if key_padding_mask is not None:
attn_weights.reshape_((bsz, num_heads, tgt_len, src_len))
if key_padding_mask.size() != attn_weights.size():
if key_padding_mask.size() != attn.size():
key_padding_mask.reshape_((bsz, 1, 1, src_len))
attn_weights.masked_fill_(key_padding_mask, float('-inf'))
attn_weights.reshape_((bsz * num_heads, tgt_len, src_len))
attn_weights = softmax(attn_weights, dim=-1, inplace=True)
attn_weights = dropout(attn_weights, p=dropout_p, training=training)
attn_output = attn_weights.bmm(v)
assert list(attn_output.size()) == [bsz * num_heads, tgt_len, head_dim]
attn_output = attn_output.transpose(0, 1).reshape_((tgt_len, bsz, embed_dim))
attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
if need_weights:
weights = attn_weights.reshape((bsz, num_heads, tgt_len, src_len))
return attn_output, weights.mean(dim=1)
else:
return attn_output, None
attn.masked_fill_(key_padding_mask, float('-inf'))
attn = softmax(attn, dim=-1, inplace=True)
attn = dropout(attn, p=dropout_p, training=training)
output = attn.bmm(v).permute(2, 0, 1, 3)
output = output.reshape_((tgt_len, bsz, embed_dim))
output = linear(output, out_proj_weight, out_proj_bias)
weights = attn.mean(dim=1) if need_weights else None
return output, weights
def nll_loss(
......
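In the attention refactor, `to_qkv` fuses each projection into one matmul and then rearranges the result into a `(batch, heads, length, head_dim)` layout, so the later `bmm` and mask operations work on 4D tensors without the old `(bsz * num_heads, ...)` reshapes. A NumPy walkthrough of just the shape bookkeeping (sizes are arbitrary and illustrative):

```python
import numpy as np

tgt_len, bsz, num_heads, head_dim, num_proj = 5, 2, 4, 8, 3
embed_dim = num_heads * head_dim

x = np.zeros((tgt_len, bsz, embed_dim))             # (L, B, E)
w = np.zeros((num_proj * embed_dim, embed_dim))     # fused q/k/v weight

out = x @ w.T                                       # (L, B, 3 * E)
out = out.reshape(tgt_len, bsz, num_proj * num_heads, head_dim)
out = out.transpose(1, 2, 0, 3)                     # (B, 3 * H, L, Dh)
q, k, v = np.split(out, num_proj, axis=1)           # each (B, H, L, Dh)

attn = q @ k.transpose(0, 1, 3, 2)                  # (B, H, L, L)
print(q.shape, attn.shape)  # (2, 4, 5, 8) (2, 4, 5, 5)
```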
......@@ -161,21 +161,26 @@ class TransformerDecoderLayer(Module):
tgt_key_padding_mask=None,
memory_key_padding_mask=None,
):
tgt2 = self.self_attn(tgt, tgt, tgt,
attn_mask=tgt_mask,
key_padding_mask=tgt_key_padding_mask,
need_weights=False)[0]
tgt = tgt + self.dropout1(tgt2)
tgt = self.norm1(tgt)
tgt2 = self.multihead_attn(tgt, memory, memory,
attn_mask=memory_mask,
key_padding_mask=memory_key_padding_mask,
need_weights=False)[0]
tgt = tgt + self.dropout2(tgt2)
tgt = self.norm2(tgt)
tgt2 = self.self_attn(
tgt, tgt, tgt,
attn_mask=tgt_mask,
key_padding_mask=tgt_key_padding_mask,
need_weights=False)[0]
tgt2 = self.dropout1(tgt2)
tgt2 += tgt
tgt = self.norm1(tgt2)
tgt2 = self.multihead_attn(
tgt, memory, memory,
attn_mask=memory_mask,
key_padding_mask=memory_key_padding_mask,
need_weights=False)[0]
tgt2 = self.dropout2(tgt2)
tgt2 += tgt
tgt = self.norm2(tgt2)
tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
tgt = tgt + self.dropout3(tgt2)
tgt = self.norm3(tgt)
tgt2 = self.dropout3(tgt2)
tgt2 += tgt
tgt = self.norm3(tgt2)
return tgt
......@@ -292,15 +297,18 @@ class TransformerEncoderLayer(Module):
self.activation = _get_activation_fn(activation)
def forward(self, src, src_mask=None, src_key_padding_mask=None):
src2 = self.self_attn(src, src, src,
attn_mask=src_mask,
key_padding_mask=src_key_padding_mask,
need_weights=False)[0]
src = src + self.dropout1(src2)
src = self.norm1(src)
src2 = self.self_attn(
src, src, src,
attn_mask=src_mask,
key_padding_mask=src_key_padding_mask,
need_weights=False)[0]
src2 = self.dropout1(src2)
src2 += src
src = self.norm1(src2)
src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
src = src + self.dropout2(src2)
src = self.norm2(src)
src2 = self.dropout2(src2)
src2 += src
src = self.norm2(src2)
return src
......
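The transformer-layer rewrite keeps the same post-norm residual structure; it only accumulates the residual in place on the sublayer output (`tgt2 += tgt`) instead of building `tgt + dropout(tgt2)` in a fresh tensor. The two forms are numerically identical, as a trivial NumPy sketch shows:

```python
import numpy as np

rng = np.random.default_rng(1)
src = rng.standard_normal((5, 2, 16))
sublayer_out = rng.standard_normal((5, 2, 16))  # e.g. attention output after dropout

old = src + sublayer_out        # old style: allocate the residual sum
new = sublayer_out.copy()
new += src                      # new style: accumulate in place

assert np.array_equal(old, new)  # same value; only the allocation pattern differs
```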
......@@ -392,6 +392,88 @@ def flatten(input, start_dim=0, end_dim=-1, out=None):
axis=start_dim, end_axis=end_dim)
def flip(input, dims):
"""Reverse elements along the given dimension.
:attr:`dims` could be negative:
```python
x = torch.tensor([[1, 2, 3], [4, 5, 6]])
# A negative dimension is the last-k dimension
print(torch.flip(x, dims=1)) # [[3, 2, 1], [6, 5, 4]]
print(torch.flip(x, dims=-1)) # Equivalent
# Also, dimension could be a sequence of integers
print(torch.flip(x, dims=(0, 1))) # [[6, 5, 4], [3, 2, 1]]
```
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
dims : Union[int, Sequence[int]]
The dimension to reverse.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
"""
return FunctionLib.apply(
'Reverse', input.device, [input],
axes=nest.flatten(dims) if dims is not None else dims)
def fliplr(input):
"""Reverse elements along the second dimension.
Examples:
```python
x = torch.tensor([[1, 2, 3], [4, 5, 6]])
print(torch.fliplr(x)) # [[3, 2, 1], [6, 5, 4]]
```
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
"""
return flip(input, 1)
def flipud(input):
"""Reverse elements along the first dimension.
Examples:
```python
x = torch.tensor([[1, 2, 3], [4, 5, 6]])
print(torch.flipud(x)) # [[4, 5, 6], [1, 2, 3]]
```
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
"""
return flip(input, 0)
def gather(input, dim, index, out=None):
"""Gather elements along the given dimension of index.
......@@ -559,10 +641,8 @@ def max(input, dim=None, keepdim=False, out=None):
The output tensor.
"""
if dim is None:
keepdim = False
else:
dim = nest.flatten(dim)
keepdim = keepdim if dim is not None else False
dim = nest.flatten(dim) if dim is not None else dim
return FunctionLib.apply(
'ReduceMax', input.device, [input], outputs=[out],
axes=dim, keepdims=keepdim)
......@@ -605,10 +685,8 @@ def mean(input, dim=None, keepdim=False, out=None):
The output tensor.
"""
if dim is None:
keepdim = False
else:
dim = nest.flatten(dim)
keepdim = keepdim if dim is not None else False
dim = nest.flatten(dim) if dim is not None else dim
return FunctionLib.apply(
'ReduceMean', input.device, [input], outputs=[out],
axes=dim, keepdims=keepdim)
......@@ -651,10 +729,8 @@ def min(input, dim=None, keepdim=False, out=None):
The output tensor.
"""
if dim is None:
keepdim = False
else:
dim = nest.flatten(dim)
keepdim = keepdim if dim is not None else False
dim = nest.flatten(dim) if dim is not None else dim
return FunctionLib.apply(
'ReduceMin', input.device, [input], outputs=[out],
axes=dim, keepdims=keepdim)
......@@ -1208,10 +1284,8 @@ def sum(input, dim=None, keepdim=False, out=None):
The output tensor.
"""
if dim is None:
keepdim = False
else:
dim = nest.flatten(dim)
keepdim = keepdim if dim is not None else False
dim = nest.flatten(dim) if dim is not None else dim
return FunctionLib.apply(
'ReduceSum', input.device, [input], outputs=[out],
axes=dim, keepdims=keepdim)
......
......@@ -828,7 +828,7 @@ def fill_(self, value):
def flatten(self, start_dim=0, end_dim=-1):
"""Return a new tensor with dimensions flattened.
"""Return a tensor with dimensions flattened.
Parameters
----------
......@@ -873,6 +873,59 @@ def flatten_(self, start_dim=0, end_dim=-1):
return array_ops.flatten(self, start_dim, end_dim, self)
def flip(self, dims):
"""Return a tensor with elements reversed along the given dimension.
Parameters
----------
dims : Union[int, Sequence[int]]
The dimension to reverse.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
See Also
--------
`torch.flip(...)`_
"""
return array_ops.flip(self, dims)
def fliplr(self):
"""Return a tensor with elements reversed along the second dimension.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
See Also
--------
`torch.fliplr(...)`_
"""
return array_ops.fliplr(self)
def flipud(self):
"""Return a tensor with elements reversed along the first dimension.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
See Also
--------
`torch.flipud(...)`_
"""
return array_ops.flipud(self)
def _float(self):
"""Return a float32 tensor with the same data.
......@@ -1630,7 +1683,7 @@ def multinomial(self, num_samples):
def narrow(self, dimension, start, length):
"""Return a new tensor that is a narrowed version of input tensor.
"""Return a narrowed tensor.
Parameters
----------
......@@ -2512,7 +2565,7 @@ def topk(self, k, dim=-1, largest=True, sorted=True):
def transpose(self, dim0, dim1):
"""Return a new tensor with two dimensions swapped.
"""Return a tensor with two dimensions swapped.
Parameters
----------
......@@ -2867,6 +2920,9 @@ Tensor.expand = expand
Tensor.fill_ = fill_
Tensor.flatten = flatten
Tensor.flatten_ = flatten_
Tensor.flip = flip
Tensor.fliplr = fliplr
Tensor.flipud = flipud
Tensor.float = _float
Tensor.float_ = _float_
Tensor.floor = floor
......
......@@ -1021,7 +1021,7 @@ class Tensor(object):
"""
def flatten(self, start_dim=0, end_dim=-1):
"""Return a new tensor with dimensions flattened.
"""Return a tensor with dimensions flattened.
Parameters
----------
......@@ -1062,6 +1062,53 @@ class Tensor(object):
"""
def flip(self, dims):
"""Return a tensor with elements reversed along the given dimension.
Parameters
----------
dims : Union[int, Sequence[int]]
The dimension to reverse.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
See Also
--------
`torch.flip(...)`_
"""
def fliplr(self):
"""Return a tensor with elements reversed along the second dimension.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
See Also
--------
`torch.fliplr(...)`_
"""
def flipud(self):
"""Return a tensor with elements reversed along the first dimension.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
See Also
--------
`torch.flipud(...)`_
"""
def float(self):
"""Return a float32 tensor with the same data.
......@@ -1723,7 +1770,7 @@ class Tensor(object):
"""
def narrow(self, dimension, start, length):
"""Return a new tensor that is a narrowed version of input tensor.
"""Return a narrowed tensor.
Parameters
----------
......@@ -2026,7 +2073,7 @@ class Tensor(object):
return self.fill_(1)
def permute(self, *dims):
"""Return a new tensor with the specific order of dimensions.
"""Return a tensor with the specific order of dimensions.
Parameters
----------
......@@ -2655,7 +2702,7 @@ class Tensor(object):
"""
def transpose(self, dim0, dim1):
"""Return a new tensor with two dimensions swapped.
"""Return a tensor with two dimensions swapped.
Parameters
----------
......@@ -2897,7 +2944,7 @@ class Tensor(object):
return self.reshape(shape)
def view_(self, *shape):
"""Change into a new shape with the same data.
"""Change into a new size with the same data.
Parameters
----------
......@@ -2917,8 +2964,7 @@ class Tensor(object):
return self.reshape_(shape)
def view_as(self, other):
"""Return a new tensor with the same data
but a different size as the given tensor.
"""Return a tensor with the same data but a different size.
Parameters
----------
......