Unify the boolean operators

Ting PAN
Commit 4eab1c68 authored May 20, 2019 by Ting PAN
Showing with 2239 additions and 829 deletions
Docs/api/python/contents/core/tensor.rst
Docs/api/python/contents/ops.rst
Dragon/include/core/operator_gradient.h
Dragon/include/core/typeid.h
Dragon/include/operators/array/masked_select_op.h
Dragon/include/operators/array/non_zero_op.h
Dragon/include/operators/array/one_hot_op.h
Dragon/include/operators/array/where_op.h
Dragon/include/operators/control_flow/compare_op.h
Dragon/include/utils/cub_device.h
Dragon/include/utils/op_kernel.h
Dragon/modules/python/py_dragon.h
Dragon/modules/python/py_module.cc
Dragon/python/dragon/core/helper.py
Dragon/python/dragon/core/tensor.py
Dragon/python/dragon/core/workspace.py
Dragon/python/dragon/operators/array.py
Dragon/python/dragon/operators/control_flow.py
Dragon/python/dragon/ops.py
Dragon/python/dragon/vm/torch/__init__.py
--- a/Docs/api/python/contents/core/tensor.rst
+++ b/Docs/api/python/contents/core/tensor.rst
@@ -64,7 +64,6 @@ List                              Brief
 `Tensor.__repr__`_                Return the information(name/shape).
 `Tensor.__getitem__`_             Return the value at the specific indices.
 `Tensor.__setitem__`_             Set the value at the specific indices.
-`Tensor.__call__`_                Return the expressions for displaying.
 ==============================    =============================================================================
@@ -95,7 +94,6 @@ API Reference
    .. automethod:: __repr__
    .. automethod:: __getitem__
    .. automethod:: __setitem__
-    .. automethod:: __call__
 .. _Tensor.Variable: #dragon.core.tensor.Tensor.Variable
 .. _Tensor.Placeholder: #dragon.core.tensor.Tensor.Placeholder

--- a/Docs/api/python/contents/ops.rst
+++ b/Docs/api/python/contents/ops.rst
@@ -135,7 +135,9 @@ Array
 ===============    ======================================================================
 List               Brief
 ===============    ======================================================================
+`Where`_           Select elements from either *x* or *y*.
 `IndexSelect`_     Select the elements according to the indices along the given axis.
+`MaskedSelect`_    Select the elements where *mask* is *1*.
 `Reduce`_          Reduce the inputs along the axis in given axes.
 `Sum`_             Compute the sum along the given axis.
 `Mean`_            Compute the mean along the given axis.
@@ -149,7 +151,7 @@ List               Brief
 `Repeat`_          Repeat the input along the given axis.
 `Transpose`_       Transpose the input according to the given permutations.
 `Tile`_            Tile the input according to the given multiples.
-`Pad`_             Pad the input according to the given paddings.
+`Pad`_             Pad the input according to the given sizes.
 `Crop`_            Crop the input according to the given starts and sizes.
 `OneHot`_          Generate the one-hot representation of inputs.
 `Flatten`_         Flatten the input along the given axes.
@@ -157,6 +159,7 @@ List               Brief
 `Squeeze`_         Remove the dimensions with size 1.
 `ExpandDims`_      Expand the new dimension with size 1 to specific axis.
 `Shape`_           Get the dynamic shape of a Tensor.
+`NonZero`_         Return the indices of non-zero elements.
 `Arange`_          Return evenly spaced values within a given interval.
 `Multinomial`_     Return indices sampled from the multinomial distribution.
 ===============    ======================================================================
@@ -170,6 +173,7 @@ List               Brief
 `Assign`_          Assign the *value* to *ref*.
 `MaskedAssign`_    Assign the *value* to *ref* where mask is *1*.
 `Equal`_           *Equal* Comparing between A and B.
+`NotEqual`_        *NotEqual* Comparing between A and B.
 `Less`_            *Less* Comparing between A and B.
 `LessEqual`_       *LessEqual* Comparing between A and B.
 `Greater`_         *Greater* Comparing between A and B.
@@ -284,7 +288,9 @@ List                 Brief
 .. _InstanceNorm: operators/norm.html#dragon.operators.norm.InstanceNorm
 .. _L2Norm: operators/norm.html#dragon.operators.norm.L2Norm
+.. _Where: operators/array.html#dragon.operators.array.Where
 .. _IndexSelect: operators/array.html#dragon.operators.array.IndexSelect
+.. _MaskedSelect: operators/array.html#dragon.operators.array.MaskedSelect
 .. _Crop: operators/array.html#dragon.operators.array.Crop
 .. _Reduce: operators/array.html#dragon.operators.array.Reduce
 .. _Sum: operators/array.html#dragon.operators.array.Sum
@@ -307,12 +313,14 @@ List                 Brief
 .. _ExpandDims: operators/array.html#dragon.operators.array.ExpandDims
 .. _Shape: operators/array.html#dragon.operators.array.Shape
 .. _Arange: operators/array.html#dragon.operators.array.Arange
+.. _NonZero: operators/array.html#dragon.operators.array.NonZero
 .. _Multinomial: operators/array.html#dragon.operators.array.Multinomial
 .. _Copy: operators/control_flow.html#dragon.operators.control_flow.Copy
 .. _Assign: operators/control_flow.html#dragon.operators.control_flow.Assign
 .. _MaskedAssign: operators/control_flow.html#dragon.operators.control_flow.MaskedAssign
 .. _Equal: operators/control_flow.html#dragon.operators.control_flow.Equal
+.. _NotEqual: operators/control_flow.html#dragon.operators.control_flow.NotEqual
 .. _Less: operators/control_flow.html#dragon.operators.control_flow.Less
 .. _LessEqual: operators/control_flow.html#dragon.operators.control_flow.LessEqual
 .. _Greater: operators/control_flow.html#dragon.operators.control_flow.Greater

--- a/Dragon/include/core/operator_gradient.h
+++ b/Dragon/include/core/operator_gradient.h
@@ -38,6 +38,7 @@ class GradientMakerBase {
        const vector<string>&       g_outputs)
        : def(def), g_outputs_(g_outputs),
          g_inputs_(def.input_size()) {}
    virtual ~GradientMakerBase() {}
    virtual bool CopyDeviceOption() const { return true; }
@@ -45,9 +46,9 @@ class GradientMakerBase {
    virtual bool CopyArguments() const { return true; }
    virtual Gradient Make() {
-        vector<OperatorDef> new_defs = MakeDef();
+        auto new_defs = MakeDef();
        if (def.has_uid()) {
-            // Attach the anchor to the name if having UID
+            // Attach the anchor to the name if the def has a UID
            for (int i = 0; i < new_defs.size(); i++)
                new_defs[i].set_name(def.name());
        } else {
@@ -57,14 +58,14 @@ class GradientMakerBase {
            for (int i = 0; i < new_defs.size(); i++)
                new_defs[i].add_arg()->CopyFrom(anchor);
        }
-        return Gradient(new_defs, g_inputs_, DefaultValues());
+        return Gradient(new_defs, g_inputs_, defaults());
    };
    virtual vector<OperatorDef> MakeDef() {
        return vector<OperatorDef>();
    }
-    virtual vector<float> DefaultValues() {
+    virtual vector<float> defaults() {
        return vector<float>(g_outputs_.size(), 1.f);
    }
@@ -135,15 +136,17 @@ class SimpleGradientMaker final : public GradientMakerBase {
    GRADIENT_MAKER_CTOR(SimpleGradientMaker);
    vector<OperatorDef> MakeDef() override {
        vector<string> inputs, outputs;
-        for (const auto& input : def.input()) {
+        for (const auto& input : def.input())
            inputs.push_back(input);
-        }
+        for (int i = 0; i < def.input_size(); ++i)
-        inputs.push_back(GO(0));
-        for (int i = 0; i < def.input_size(); i++) {
            outputs.push_back(GI(i));
-        }
+        inputs.push_back(GO(0));
-        return SingleDef(def.type() +
+        return SingleDef(
-            "Gradient", "", inputs, outputs);
+            def.type() + "Gradient",
+            "",
+            inputs,
+            outputs
+        );
    }
 };
@@ -162,7 +165,8 @@ class InplaceGradientMaker final : public GradientMakerBase {
            def.type() + "Gradient",          /*!   OpType   */
            "",                               /*!   OpName   */
            vector<string>({ O(0), GO(0) }),  /*!   Inputs   */
-            vector<string>({ GI(0) }));       /*!   Outputs  */
+            vector<string>({ GI(0) })         /*!   Outputs  */
+        );
    }
 };

--- a/Dragon/include/core/typeid.h
+++ b/Dragon/include/core/typeid.h
@@ -13,9 +13,9 @@
 #ifndef DRAGON_CORE_TYPEID_H_
 #define DRAGON_CORE_TYPEID_H_
+#include <map>
 #include <cstdlib>
 #include <iostream>
-#include <map>
 namespace dragon {
@@ -83,7 +83,7 @@ class TypeMeta {
    template <typename T>
    static void Ctor(void* ptr, size_t n) {
        T* typed_ptr = static_cast<T*>(ptr);
-        for (unsigned int i = 0; i < n; i++)
+        for (size_t i = 0; i < n; i++)
            new(typed_ptr + i) T;
    }
@@ -91,14 +91,14 @@ class TypeMeta {
    static void Copy(const void* src, void* dst, size_t n) {
        const T* typed_src = static_cast<const T*>(src);
        T* typed_dst = static_cast<T*>(dst);
-        for (unsigned int i = 0; i < n; i++)
+        for (size_t i = 0; i < n; ++i)
            typed_dst[i] = typed_src[i];
    }
    template <typename T>
    static void Dtor(void* ptr, size_t n) {
        T* typed_ptr = static_cast<T*>(ptr);
-        for (unsigned int i = 0; i < n; i++)
+        for (size_t i = 0; i < n; ++i)
            typed_ptr[i].~T();
    }
@@ -107,16 +107,23 @@ class TypeMeta {
    template <typename T>
    static typename FundMeta Make() {
-        return TypeMeta(Id<T>(), Itemsize<T>(),
+        return TypeMeta(
-            nullptr, nullptr, nullptr);
+            Id<T>(), Itemsize<T>(),
+            nullptr, nullptr, nullptr
+        );
    }
    template<typename T>
    static typename StructMeta Make() {
-        return TypeMeta(Id<T>(), Itemsize<T>(),
+        return TypeMeta(
-            Ctor<T>, Copy<T>, Dtor<T>);
+            Id<T>(), Itemsize<T>(),
+            Ctor<T>, Copy<T>, Dtor<T>
+        );
    }
+#undef FundMeta
+#undef StructMeta
 private:
    TypeMeta(
        TypeId              id,

--- a/Dragon/include/operators/array/masked_select_op.h
+++ b/Dragon/include/operators/array/masked_select_op.h
+/*!
+ * Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
+ *
+ * Licensed under the BSD 2-Clause License.
+ * You should have received a copy of the BSD 2-Clause License
+ * along with the software. If not, See,
+ *
+ *      <https://opensource.org/licenses/BSD-2-Clause>
+ *
+ * ------------------------------------------------------------
+ */
+#ifndef DRAGON_OPERATORS_ARRAY_MASKED_SELECT_OP_H_
+#define DRAGON_OPERATORS_ARRAY_MASKED_SELECT_OP_H_
+#include "core/operator.h"
+namespace dragon {
+template <class Context>
+class MaskedSelectOp final : public Operator<Context> {
+ public:
+    MaskedSelectOp(const OperatorDef& def, Workspace* ws)
+        : Operator<Context>(def, ws) {}
+    USE_OPERATOR_FUNCTIONS;
+    void RunOnDevice() override;
+    template <typename T> void RunImpl();
+};
+template <class Context>
+class MaskedSelectGradientOp final
+    : public Operator<Context> {
+ public:
+    MaskedSelectGradientOp(
+        const OperatorDef&      def,
+        Workspace*              ws)
+        : Operator<Context>(def, ws) {}
+    USE_OPERATOR_FUNCTIONS;
+    void RunOnDevice() override;
+    template <typename T> void RunImpl();
+};
+}  // namespace dragon
+#endif  // DRAGON_OPERATORS_ARRAY_MASKED_SELECT_OP_H_
\ No newline at end of file
--- a/Dragon/include/operators/array/non_zero_op.h
+++ b/Dragon/include/operators/array/non_zero_op.h
+/*!
+ * Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
+ *
+ * Licensed under the BSD 2-Clause License.
+ * You should have received a copy of the BSD 2-Clause License
+ * along with the software. If not, See,
+ *
+ *      <https://opensource.org/licenses/BSD-2-Clause>
+ *
+ * ------------------------------------------------------------
+ */
+#ifndef DRAGON_OPERATORS_ARRAY_NON_ZERO_OP_H_
+#define DRAGON_OPERATORS_ARRAY_NON_ZERO_OP_H_
+#include "core/operator.h"
+namespace dragon {
+template <class Context>
+class NonZeroOp final : public Operator<Context> {
+ public:
+    NonZeroOp(const OperatorDef& def, Workspace* ws)
+        : Operator<Context>(def, ws) {}
+    USE_OPERATOR_FUNCTIONS;
+    void RunOnDevice() override;
+    template <typename T> void RunImpl();
+ protected:
+    Tensor X_dims_;
+};
+}  // namespace dragon
+#endif  // DRAGON_OPERATORS_ARRAY_NON_ZERO_OP_H_
\ No newline at end of file
--- a/Dragon/include/operators/array/one_hot_op.h
+++ b/Dragon/include/operators/array/one_hot_op.h
@@ -18,7 +18,7 @@
 namespace dragon {
 template <class Context>
-class OneHotOp final : public Operator < Context > {
+class OneHotOp final : public Operator<Context> {
 public:
    OneHotOp(const OperatorDef& def, Workspace* ws)
        : Operator<Context>(def, ws),

--- a/Dragon/include/operators/array/where_op.h
+++ b/Dragon/include/operators/array/where_op.h
+/*!
+ * Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
+ *
+ * Licensed under the BSD 2-Clause License.
+ * You should have received a copy of the BSD 2-Clause License
+ * along with the software. If not, See,
+ *
+ *      <https://opensource.org/licenses/BSD-2-Clause>
+ *
+ * ------------------------------------------------------------
+ */
+#ifndef DRAGON_OPERATORS_ARRAY_WHERE_OP_H_
+#define DRAGON_OPERATORS_ARRAY_WHERE_OP_H_
+#include "core/operator.h"
+namespace dragon {
+template <class Context>
+class WhereOp final : public Operator<Context> {
+ public:
+    WhereOp(const OperatorDef& def, Workspace* ws)
+        : Operator<Context>(def, ws) {}
+    USE_OPERATOR_FUNCTIONS;
+    void RunOnDevice() override;
+    template <typename T> void RunImpl();
+};
+template <class Context>
+class WhereGradientOp final : public Operator<Context> {
+ public:
+    WhereGradientOp(const OperatorDef& def, Workspace* ws)
+        : Operator<Context>(def, ws) {}
+    USE_OPERATOR_FUNCTIONS;
+    void RunOnDevice() override;
+    template <typename T> void RunImpl();
+};
+}  // namespace dragon
+#endif  // DRAGON_OPERATORS_ARRAY_WHERE_OP_H_
\ No newline at end of file
--- a/Dragon/include/operators/control_flow/compare_op.h
+++ b/Dragon/include/operators/control_flow/compare_op.h
@@ -27,7 +27,9 @@ class CompareOp final : public Operator<Context> {
    USE_OPERATOR_FUNCTIONS;
    void RunOnDevice() override;
+    template <typename T> void RunImpl();
    template <typename T> void EqualRunImpl();
+    template <typename T> void NotEqualRunImpl();
    template <typename T> void LessRunImpl();
    template <typename T> void LessEqualRunImpl();
    template <typename T> void GreaterRunImpl();

--- a/Dragon/include/utils/cub_device.h
+++ b/Dragon/include/utils/cub_device.h
@@ -4,6 +4,8 @@
 #ifdef WITH_CUDA
 #include <cub/block/block_reduce.cuh>
+#include <cub/device/device_select.cuh>
+#include <cub/iterator/counting_input_iterator.cuh>
 #include "utils/cuda_device.h"

--- a/Dragon/include/utils/op_kernel.h
+++ b/Dragon/include/utils/op_kernel.h
@@ -17,6 +17,8 @@
 namespace dragon {
+class Tensor;
 namespace kernel {
 /*! activation.dropout */
@@ -258,37 +260,37 @@ void ClipGrad(
 template <typename T, class Context>
 void Maximum(
    const int               count,
-    const T*                x1,
+    const T*                a,
-    const T*                x2,
+    const T*                b,
    T*                      y,
    Context*                ctx);
 template <typename T, class Context>
 void BroadcastMaximum(
    const int               count,
-    const T*                x1,
+    const T*                a,
-    const T                 x2,
+    const T                 b,
    T*                      y,
    Context*                ctx);
 template <typename T, class Context>
 void MaximumGrad(
    const int               count,
-    const T*                x1,
+    const T*                a,
-    const T*                x2,
+    const T*                b,
    const T*                dy,
-    T*                      dx1,
+    T*                      da,
-    T*                      dx2,
+    T*                      db,
    Context*                ctx);
 template <typename T, class Context>
 void BroadcastMaximumGrad(
    const int               count,
-    const T*                x1,
+    const T*                a,
-    const T                 x2,
+    const T                 b,
    const T*                dy,
-    T*                      dx1,
+    T*                      da,
-    T*                      dx2,
+    T*                      db,
    Context*                ctx);
 /*! arithmetic.minimum */
@@ -296,37 +298,37 @@ void BroadcastMaximumGrad(
 template <typename T, class Context>
 void Minimum(
    const int               count,
-    const T*                x1,
+    const T*                a,
-    const T*                x2,
+    const T*                b,
    T*                      y,
    Context*                ctx);
 template <typename T, class Context>
 void BroadcastMinimum(
    const int               count,
-    const T*                x1,
+    const T*                a,
-    const T                 x2,
+    const T                 b,
    T*                      y,
    Context*                ctx);
 template <typename T, class Context>
 void MinimumGrad(
    const int               count,
-    const T*                x1,
+    const T*                a,
-    const T*                x2,
+    const T*                b,
    const T*                dy,
-    T*                      dx1,
+    T*                      da,
-    T*                      dx2,
+    T*                      db,
    Context*                ctx);
 template <typename T, class Context>
 void BroadcastMinimumGrad(
    const int               count,
-    const T*                x1,
+    const T*                a,
-    const T                 x2,
+    const T                 b,
    const T*                dy,
-    T*                      dx1,
+    T*                      da,
-    T*                      dx2,
+    T*                      db,
    Context*                ctx);
 /*! arithmetic.moments */
@@ -437,6 +439,38 @@ void IndexSelectGrad(
    T*                      dx,
    Context*                ctx);
+/*! array.masked_select */
+template <typename T, class Context>
+void MaskedSelect(
+    const int               count,
+    const uint8_t*          mask,
+    const T*                x,
+    Tensor*                 indices,
+    Tensor*                 scratch,
+    Tensor*                 y,
+    Context*                ctx);
+template <typename T, class Context>
+void MaskedSelectGrad(
+    const int               count,
+    const int               num_indices,
+    const int64_t*          indices,
+    const T*                dy,
+    T*                      dx,
+    Context*                ctx);
+/*! array.non_zero */
+template <class Context>
+void UnravelIndex(
+    const int               count,
+    const int               ndims,
+    const int*              dims,
+    const int64_t*          x,
+    int64_t*                y,
+    Context*                ctx);
 /*! array.pad */
 template <typename T, class Context>
@@ -602,6 +636,26 @@ void TransposeGrad(
    T*                      dx,
    Context*                ctx);
+/*! array.where */
+template <typename T, class Context>
+void Where(
+    const int               count,
+    const uint8_t*          mask,
+    const T*                a,
+    const T*                b,
+    T*                      y,
+    Context*                ctx);
+template <typename T, class Context>
+void WhereGrad(
+    const int               count,
+    const uint8_t*          mask,
+    const T*                dy,
+    T*                      da,
+    T*                      db,
+    Context*                ctx);
 /*! control_flow.assign */
 template <typename T, class Context>
@@ -618,6 +672,13 @@ void Assign(
 /*! control_flow.compare */
 template <typename T, class Context>
+void NotZero(
+    const int               count,
+    const T*                x,
+    bool*                   y,
+    Context*                ctx);
+template <typename T, class Context>
 void Equal(
    const int               count,
    const T*                a,
@@ -626,7 +687,7 @@ void Equal(
    Context*                ctx);
 template <typename T, class Context>
-void Less(
+void NotEqual(
    const int               count,
    const T*                a,
    const T*                b,
@@ -634,7 +695,7 @@ void Less(
    Context*                ctx);
 template <typename T, class Context>
-void LessEqual(
+void Less(
    const int               count,
    const T*                a,
    const T*                b,
@@ -642,7 +703,7 @@ void LessEqual(
    Context*                ctx);
 template <typename T, class Context>
-void Greater(
+void LessEqual(
    const int               count,
    const T*                a,
    const T*                b,
@@ -650,21 +711,19 @@ void Greater(
    Context*                ctx);
 template <typename T, class Context>
-void GreaterEqual(
+void Greater(
    const int               count,
    const T*                a,
    const T*                b,
    bool*                   y,
    Context*                ctx);
-/*! control_flow.masked_assign */
 template <typename T, class Context>
-void MaskedAssign(
+void GreaterEqual(
    const int               count,
-    const uint8_t*          mask,
+    const T*                a,
-    const T*                x,
+    const T*                b,
-    T*                      y,
+    bool*                   y,
    Context*                ctx);
 /*! loss.l1_loss */

--- a/Dragon/modules/python/py_dragon.h
+++ b/Dragon/modules/python/py_dragon.h
@@ -69,26 +69,29 @@ class NumpyFetcher : public TensorFetcherBase {
    pybind11::object Fetch(const Tensor& tensor) override {
        CHECK_GT(tensor.count(), 0);
        vector<npy_intp> npy_dims;
-        for (const auto dim : tensor.dims()) npy_dims.push_back(dim);
+        for (auto dim : tensor.dims()) npy_dims.push_back(dim);
        int npy_type = TypeMetaToNPY(tensor.meta());
-        if (npy_type == -1) {
+        CHECK(npy_type != -1)
-            LOG(FATAL) <<  "The data type of Tensor(" +
+            << "\nThe data type of Tensor(" << tensor.name()
-                tensor.name() + ") is unknown. Have you solved it ?";
+            << ") is unknown. Have you solved it?";
-        }
        CHECK(tensor.memory()) << "\nIllegal memory access.";
        // Create an empty array with the same shape
-        PyObject* array = PyArray_SimpleNew(
+        auto* array = PyArray_SimpleNew(
            tensor.ndim(), npy_dims.data(), npy_type);
        // Copy the tensor data to the numpy array
        if (tensor.memory_state() == MixedMemory::STATE_AT_CUDA) {
-            CUDAContext::MemcpyEx<CPUContext, CUDAContext>(tensor.nbytes(),
+            CUDAContext::MemcpyEx<CPUContext, CUDAContext>(
+                tensor.nbytes(),
                PyArray_DATA(reinterpret_cast<PyArrayObject*>(array)),
                tensor.raw_data<CUDAContext>(),
-                                             tensor.memory()->device_id());
+                tensor.memory()->device_id()
+            );
        } else {
-            CPUContext::Memcpy<CPUContext, CPUContext>(tensor.nbytes(),
+            CPUContext::Memcpy<CPUContext, CPUContext>(
+                tensor.nbytes(),
                PyArray_DATA(reinterpret_cast<PyArrayObject*>(array)),
-                                        tensor.raw_data<CPUContext>());
+                tensor.raw_data<CPUContext>()
+            );
        }
        return pybind11::reinterpret_steal<pybind11::object>(array);
    }
@@ -108,14 +111,14 @@ class NumpyFeeder : public TensorFeederBase {
        const DeviceOption&         option,
        PyArrayObject*              original_array,
        Tensor*                     tensor) override {
-        PyArrayObject* array = PyArray_GETCONTIGUOUS(original_array);
+        auto* array = PyArray_GETCONTIGUOUS(original_array);
-        const TypeMeta& meta = TypeNPYToMeta(PyArray_TYPE(array));
+        const auto& meta = TypeNPYToMeta(PyArray_TYPE(array));
        if (meta.id() == 0) LOG(FATAL) << "Unsupported DType.";
        tensor->SetMeta(meta);
        int ndim = PyArray_NDIM(array);
-        npy_intp* npy_dims = PyArray_DIMS(array);
+        vec64_t dims(ndim);
-        vector<int64_t> dims;
+        auto* npy_dims = PyArray_DIMS(array);
-        for (int i = 0; i < ndim; i++) dims.push_back(npy_dims[i]);
+        for (int i = 0; i < ndim; i++) dims[i] = npy_dims[i];
        tensor->Reshape(dims);
        if (option.device_type() == PROTO_CUDA) {
 #ifdef WITH_CUDA
@@ -123,16 +126,17 @@ class NumpyFeeder : public TensorFeederBase {
                tensor->nbytes(),
                tensor->raw_mutable_data<CUDAContext>(),
                static_cast<void*>(PyArray_DATA(array)),
-                                       option.device_id());
+                option.device_id()
+            );
 #else
            LOG(FATAL) << "CUDA was not compiled.";
 #endif
        } else {
-            auto* data = tensor->raw_mutable_data<CPUContext>();
            CPUContext::Memcpy<CPUContext, CPUContext>(
                tensor->nbytes(),
                tensor->raw_mutable_data<CPUContext>(),
-              static_cast<void*>(PyArray_DATA(array)));
+                static_cast<void*>(PyArray_DATA(array))
+            );
        }
        Py_XDECREF(array);
    }

--- a/Dragon/modules/python/py_module.cc
+++ b/Dragon/modules/python/py_module.cc
@@ -142,40 +142,44 @@ PYBIND11_MODULE(libdragon, m) {
            DeviceOption dst_ctx, src_ctx;
            dst_ctx.ParseFromString(dev1);
            src_ctx.ParseFromString(dev2);
-            Tensor* srcT = self->GetTensor(other);
+            auto* src = self->GetTensor(other);
-            Tensor* dstT = self->CreateTensor(name);
+            auto* dst = self->CreateTensor(name);
-            dstT->ReshapeLike(*srcT);
+            const auto& meta = src->meta();
-            const TypeMeta& meta = srcT->meta();
+            dst->ReshapeLike(*src);
            if (dst_ctx.device_type() == PROTO_CUDA) {
                if (src_ctx.device_type() == PROTO_CUDA) {
                    // CUDA <- CUDA
                    CUDAContext::MemcpyEx<CUDAContext, CUDAContext>(
-                        srcT->nbytes(),
+                        src->nbytes(),
-                        dstT->raw_mutable_data<CUDAContext>(meta),
+                        dst->raw_mutable_data<CUDAContext>(meta),
-                        srcT->raw_data<CUDAContext>(),
+                        src->raw_data<CUDAContext>(),
-                        src_ctx.device_id());
+                        src_ctx.device_id()
+                    );
                } else {
                    // CUDA <- CPU
                    CUDAContext::MemcpyEx<CUDAContext, CPUContext>(
-                        srcT->nbytes(),
+                        src->nbytes(),
-                        dstT->raw_mutable_data<CUDAContext>(meta),
+                        dst->raw_mutable_data<CUDAContext>(meta),
-                        srcT->raw_data<CPUContext>(),
+                        src->raw_data<CPUContext>(),
-                        dst_ctx.device_id());
+                        dst_ctx.device_id()
+                    );
                }
            } else {
                if (src_ctx.device_type() == PROTO_CUDA) {
                    // CPU <- CUDA
                    CUDAContext::MemcpyEx<CPUContext, CUDAContext>(
-                        srcT->nbytes(),
+                        src->nbytes(),
-                        dstT->raw_mutable_data<CPUContext>(meta),
+                        dst->raw_mutable_data<CPUContext>(meta),
-                        srcT->raw_data<CUDAContext>(),
+                        src->raw_data<CUDAContext>(),
-                        src_ctx.device_id());
+                        src_ctx.device_id()
+                    );
                } else {
                    // CPU <- CPU
                    CPUContext::Memcpy<CUDAContext, CUDAContext>(
-                        srcT->nbytes(),
+                        src->nbytes(),
-                        dstT->raw_mutable_data<CPUContext>(meta),
+                        dst->raw_mutable_data<CPUContext>(meta),
-                        srcT->raw_data<CPUContext>());
+                        src->raw_data<CPUContext>()
+                    );
                }
            }
        })
@@ -188,7 +192,7 @@ PYBIND11_MODULE(libdragon, m) {
            Tensor* tensor = self->GetTensor(name);
            CHECK_GT(tensor->count(), 0);
            vector<npy_intp> dims;
-            for (const auto dim : tensor->dims()) dims.push_back(dim);
+            for (auto dim : tensor->dims()) dims.push_back(dim);
            int npy_type = TypeMetaToNPY(tensor->meta());
            if (npy_type == -1) {
                LOG(FATAL) << "Tensor(" + tensor->name() + ") "

--- a/Dragon/python/dragon/core/helper.py
+++ b/Dragon/python/dragon/core/helper.py
@@ -35,7 +35,7 @@ class OperatorHelper(object):
        'Relu', 'PRelu', 'Elu', 'SElu', 'Sigmoid', 'Tanh', 'Softmax',
        'Dropout', 'DropPath', 'DropBlock2d',
        'Add', 'Sub', 'Mul', 'Div', 'Clip', 'Log', 'Exp', 'Pow', 'Square', 'Sqrt',
-        'Accumulate', 'Affine', 'Copy', 'Compare', 'StopGradient', 'MPIBroadcast',
+        'Accumulate', 'Affine', 'Copy', 'StopGradient', 'MPIBroadcast',
        'BatchNorm', 'GroupNorm', 'L2Norm', 'LRN', 'BiasAdd',
    )
@@ -107,6 +107,12 @@ class OperatorHelper(object):
                len(outputs[0].shape) < len(inputs[1].shape):
                    outputs[0].shape = inputs[1].shape
        except:
+            try:
+                outputs[0].shape = inputs[1].shape[:]
+                if outputs[0].shape != inputs[0].shape and \
+                        len(outputs[0].shape) < len(inputs[0].shape):
+                    outputs[0].shape = inputs[0].shape
+            except:
                pass
        return outputs
@@ -391,19 +397,31 @@ class OperatorHelper(object):
    ###############################################
    @classmethod
+    def _apply_Where(cls, arguments, inputs, outputs):
+        return cls._apply_Maximum(arguments, inputs, outputs)
+    @classmethod
    def _apply_IndexSelect(cls, arguments, inputs, outputs):
        outputs[0].dtype = inputs[0].dtype
        axis = arguments['axis']
        try:
+            try: index_shape = inputs[1].shape[:]
+            except: index_shape = [None]
            outputs[0].shape = \
                inputs[0].shape[:axis] + \
-                    inputs[1].shape[:] + \
+                    index_shape[:] + \
                inputs[0].shape[axis + 1:]
        except:
            pass
        return outputs
    @classmethod
+    def _apply_MaskedSelect(cls, arguments, inputs, outputs):
+        outputs[0].dtype = inputs[0].dtype
+        outputs[0].shape = [None]
+        return outputs
+    @classmethod
    def _apply_RandomPick(cls, arguments, inputs, outputs):
        outputs[0].dtype = inputs[0].dtype
        outputs[1].dtype = 'int32'
@@ -779,6 +797,25 @@ class OperatorHelper(object):
            pass
        return outputs
+    @classmethod
+    def _apply_NonZero(cls, arguments, inputs, outputs):
+        outputs[0].dtype = 'int64'
+        try:
+            outputs[0].shape = [None, len(inputs[0].shape)]
+        except:
+            pass
+        return outputs
+    ###############################################
+    #                                             #
+    #                Control Flow                 #
+    #                                             #
+    ###############################################
+    @classmethod
+    def _apply_Compare(cls, arguments, inputs, outputs):
+        return cls._apply_Maximum(arguments, inputs, outputs)
    ###############################################
    #                                             #
    #                    Vision                   #

--- a/Dragon/python/dragon/core/tensor.py
+++ b/Dragon/python/dragon/core/tensor.py
@@ -462,7 +462,7 @@ class Tensor(object):
        Parameters
        ----------
-        item : int or slice
+        item : int, slice or Tensor
            The indices.
        Returns
@@ -471,6 +471,11 @@ class Tensor(object):
            The output tensor.
        """
+        if isinstance(item, Tensor):
+            return self.CreateOperator(
+                'MaskedSelect', [self, item],
+            )
+        else:
            starts, sizes = self._process_indices(item)
            output = self.CreateOperator('Crop', self, starts=starts, sizes=sizes)
            if self.shape is not None:
@@ -774,11 +779,11 @@ class Tensor(object):
    ###############################################
    def set_value(self, new_value, **kwargs):
-        """Feed the values to C++ backend. [**Theano Style**]
+        """Feed values to the backend.
        Parameters
        ----------
-        new_value : number, list or numpy.ndarray
+        new_value : array_like
            The values to set.
        Returns
@@ -795,12 +800,12 @@ class Tensor(object):
        return self
    def get_value(self):
-        """Fetch the values from C++ backend. [**Theano Style**]
+        """Copy values from the backend.
        Returns
        -------
-        numpy.ndarray or number
+        numpy.ndarray
-            The values of this tensor in the backend.
+            The copied values.
        See Also
        --------
@@ -827,7 +832,7 @@ class Tensor(object):
        return self.CreateOperator('Copy', **arguments)
    def reshape(self, shape, **kwargs):
-        """Reshape the dimensions of input. [**Theano Style**]
+        """Reshape the dimensions of input.
        Parameters
        ----------
@@ -841,8 +846,7 @@ class Tensor(object):
        """
        if not isinstance(shape, (tuple, list)): shape = [shape]
-        return Tensor.CreateOperator(
+        return self.CreateOperator('Reshape', inputs=self, shape=shape)
-            'Reshape', inputs=self, shape=shape, **kwargs)
    def dimshuffle(self, *args, **kwargs):
        """Shuffle the dimensions. [**Theano Style**]

--- a/Dragon/python/dragon/core/workspace.py
+++ b/Dragon/python/dragon/core/workspace.py
@@ -425,7 +425,7 @@ def FeedTensor(
    ----------
    tensor : Tensor or str
        The tensor to feed.
-    array : number, list, tuple, or numpy.ndarray
+    array : array_like
        The values to feed.
    force_cpu : boolean, optional, default=False
        Whether force to feed to cpu context.

--- a/Dragon/python/dragon/operators/array.py
+++ b/Dragon/python/dragon/operators/array.py
@@ -16,6 +16,30 @@ from __future__ import print_function
 from . import *
+@OpSchema.Inputs(1, 3)
+def Where(inputs, **kwargs):
+    """Select elements from either ``x`` or ``y``, depending on ``condition``.
+    Return the indices of *True* elements, if only the ``condition`` is given.
+    **Type Constraints**: (*bool*, *int8*, *uint8*, *int32*, *int64*, *float16*, *float32*, *float64*)
+    Parameters
+    ----------
+    inputs : sequence of Tensor
+        The ``x``, ``y``, and ``condition``.
+    Returns
+    -------
+    Tensor
+        The output tensor.
+    """
+    if isinstance(inputs, Tensor) or len(inputs) == 1:
+        return NonZero(inputs, **kwargs)
+    return Tensor.CreateOperator('Where', **ParseArgs(locals()))
 @OpSchema.Inputs(1)
 def IndexSelect(inputs, indices, axis=0, **kwargs):
    """Select the elements according to the indices along the given axis.
@@ -26,7 +50,7 @@ def IndexSelect(inputs, indices, axis=0, **kwargs):
    ----------
    inputs : Tensor
        The input tensor.
-    indices : Tensor
+    indices : sequence or Tensor
        The indices to select elements.
    axis : int, optional
        The axis of indices.
@@ -46,6 +70,31 @@ def IndexSelect(inputs, indices, axis=0, **kwargs):
 @OpSchema.Inputs(1)
+def MaskedSelect(inputs, mask, **kwargs):
+    """Select the elements where ``mask`` is *1*.
+    **Type Constraints**: (*bool*, *int8*, *uint8*, *int32*, *int64*, *float16*, *float32*, *float64*)
+    Parameters
+    ----------
+    inputs : Tensor
+        The input tensor.
+    mask : Tensor
+        The mask, with the same size as ``inputs``.
+    Returns
+    -------
+    Tensor
+        The output tensor.
+    """
+    arguments = ParseArgs(locals())
+    arguments['mask'] = None
+    arguments['inputs'] = [arguments['inputs'], mask]
+    return Tensor.CreateOperator('MaskedSelect', **arguments)
+@OpSchema.Inputs(1)
 @ArgumentHelper.RepeatedDesc('starts')
 @ArgumentHelper.RepeatedDesc('sizes')
 def Crop(
@@ -748,6 +797,26 @@ def Arange(start, stop=None, step=1, dtype='float32', **kwargs):
 @OpSchema.Inputs(1)
+def NonZero(inputs, **kwargs):
+    """Return the indices of non-zero elements.
+    **Type Constraints**: (*bool*, *int8*, *uint8*, *int32*, *int64*, *float16*, *float32*, *float64*)
+    Parameters
+    ----------
+    inputs : Tensor
+        The input tensor.
+    Returns
+    -------
+    Tensor
+        An *int64* tensor containing the indices.
+    """
+    return Tensor.CreateOperator('NonZero', **ParseArgs(locals()))
+@OpSchema.Inputs(1)
 def Multinomial(inputs, num_samples=1, eps=0., normalize=False, **kwargs):
    """Return a tensor where each row contains ``num_samples``,
    sampled from the multinomial distribution.

--- a/Dragon/python/dragon/operators/control_flow.py
+++ b/Dragon/python/dragon/operators/control_flow.py
@@ -125,6 +125,32 @@ def Equal(inputs, to_uint8=False, **kwargs):
    return Tensor.CreateOperator('Compare', operation='EQ', **arguments)
+@OpSchema.Inputs(2)
+def NotEqual(inputs, to_uint8=False, **kwargs):
+    """*NotEqual* comparing between A and B.
+    Set ``to_uint8`` if you expect the *uint8* results instead of *bool*.
+    **Type Constraints**: (*bool*, *int8*, *uint8*, *int32*, *int64*, *float16*, *float32*, *float64*)
+    Parameters
+    ----------
+    inputs : sequence of Tensor
+        The inputs, represent A and B respectively.
+    to_uint8 : bool
+        *True* to convert to *uint8* results.
+    Returns
+    -------
+    Tensor
+        The comparing results.
+    """
+    arguments = ParseArgs(locals())
+    return Tensor.CreateOperator('Compare', operation='NE', **arguments)
 @OpSchema.ConvertConstantInputs()
 @OpSchema.Inputs(2)
 def Less(inputs, to_uint8=False, **kwargs):

--- a/Dragon/python/dragon/ops.py
+++ b/Dragon/python/dragon/ops.py
@@ -114,7 +114,7 @@ LayerNorm = _norm_ops.LayerNorm
 InstanceNorm = _norm_ops.InstanceNorm
 L2Norm = _norm_ops.L2Norm
-# NDArray
+# Array
 Crop = _array_ops.Crop
 Reduce = _array_ops.Reduce
 Sum = _array_ops.Sum
@@ -130,7 +130,9 @@ Transpose = _array_ops.Transpose
 Repeat = _array_ops.Repeat
 Tile = _array_ops.Tile
 Pad = _array_ops.Pad
+Where = _array_ops.Where
 IndexSelect = _array_ops.IndexSelect
+MaskedSelect = _array_ops.MaskedSelect
 OneHot = _array_ops.OneHot
 Flatten = _array_ops.Flatten
 Reshape = _array_ops.Reshape
@@ -139,15 +141,17 @@ Squeeze = _array_ops.Squeeze
 Shape = _array_ops.Shape
 Arange = _array_ops.Arange
 Multinomial = _array_ops.Multinomial
+NonZero = _array_ops.NonZero
 # Control Flow
 Copy = _control_flow_ops.Copy
 Assign = _control_flow_ops.Assign
 MaskedAssign = _control_flow_ops.MaskedAssign
-Equal = _control_flow_ops.Equal
 Less = _control_flow_ops.Less
-LessEqual = _control_flow_ops.LessEqual
+Equal = _control_flow_ops.Equal
 Greater = _control_flow_ops.Greater
+LessEqual = _control_flow_ops.LessEqual
+NotEqual = _control_flow_ops.NotEqual
 GreaterEqual = _control_flow_ops.GreaterEqual
 # Misc

--- a/Dragon/python/dragon/vm/torch/__init__.py
+++ b/Dragon/python/dragon/vm/torch/__init__.py
@@ -18,7 +18,7 @@ import dragon.vm.torch.ops.tensor
 # Import Core Methods
 from dragon.vm.torch.tensor import *
-from dragon.vm.torch.c_api import Size, from_numpy
+from dragon.vm.torch.c_api import Size, from_numpy, device
 from dragon.vm.torch.serialization import save, load
 # Import Subpackages

--- a/Dragon/python/dragon/vm/torch/c_api.py
+++ b/Dragon/python/dragon/vm/torch/c_api.py
@@ -67,10 +67,11 @@ def from_numpy(data):
    """
    if not isinstance(data, numpy.ndarray):
        raise TypeError('The data should be a numpy.ndarray.')
-    if str(data.dtype) not in _mapping.TENSOR_TYPE_TO_TORCH_TENSOR:
+    tensor_types = _mapping.TENSOR_TYPE_TO_TORCH_TENSOR
-        raise ValueError('Unsupported type({}) to torch tensor.'.format(data.dtype))
+    if str(data.dtype) not in tensor_types:
+        raise ValueError('Unsupported data type: ' + str(data.dtype))
    module = importlib.import_module('dragon.vm.torch.tensor')
-    return getattr(module, _mapping.TENSOR_TYPE_TO_TORCH_TENSOR[str(data.dtype)])(data)
+    return getattr(module, tensor_types[str(data.dtype)])(data, copy=False)
 def from_dragon(tensor, own_storage=False):

--- a/Dragon/python/dragon/vm/torch/ops/builtin.py
+++ b/Dragon/python/dragon/vm/torch/ops/builtin.py
@@ -23,8 +23,8 @@ from dragon.vm.torch.ops.modules.control_flow import (
 )
 from dragon.vm.torch.ops.modules.arithmetic import (
-    Fundamental, Log, Exp, Sqrt,
+    Fundamental, Accumulate,
-    Accumulate,
+    Log, Exp, Sqrt,
    MM, FullyConnected,
    Maximum, Minimum, Clamp,
 )
@@ -35,9 +35,11 @@ from dragon.vm.torch.ops.modules.init import (
 from dragon.vm.torch.ops.modules.array import (
    Reshape, Squeeze, UnSqueeze, Permute,
-    Indexing, IndexSelect,
+    Indexing, Repeat, Concat, Stack,
-    Repeat, Concat, Stack,
+    IndexSelect, MaskedSelect,
-    Reduce, ArgReduce, OneHot, Multinomial,
+    Reduce, ArgReduce,
+    NonZero, Where,
+    OneHot, Multinomial,
 )
 from dragon.vm.torch.ops.modules.update import (
@@ -50,17 +52,16 @@ from dragon.vm.torch.ops.modules.vision import (
 __all__ = [
-    'accumulate',
+    'add', 'sub', 'mul', 'div', 'accumulate',
-    'add', 'sub', 'mul', 'div',
    'maximum', 'minimum', 'clamp',
    'log', 'exp', 'sqrt',
    'mm', 'xw_plus_b',
    'squeeze', 'unsqueeze',
    'mean', 'sum', 'min', 'max', 'topk',
-    'argmin', 'argmax',
+    'nonzero', 'where', 'argmin', 'argmax',
-    'gt', 'lt', 'eq', 'ge', 'le',
+    'gt', 'lt', 'eq', 'ne', 'ge', 'le',
    'cat', 'stack', 'narrow',
-    'index_select',
+    'index_select', 'masked_select',
    'one_hot', 'multinomial',
    'rand', 'randn',
    'ones', 'ones_like',
@@ -525,6 +526,30 @@ def _assign(output, starts, sizes, input):
    return module.forward(input, output, starts, sizes)
+def where(condition, x, y):
+    """Select elements from either ``x`` or ``y``, depending on ``condition``.
+    Parameters
+    ----------
+    condition : dragon.vm.torch.Tensor
+        The byte condition tensor.
+    x : dragon.vm.torch.Tensor
+        The elements for *1*.
+    y : dragon.vm.torch.Tensor
+        The elements for *0*.
+    Returns
+    -------
+    dragon.vm.torch.Tensor
+        The output tensor.
+    """
+    dev = MakeDevice(inputs=[condition, x, y])
+    key = 'Where/{}'.format(dev)
+    module = get_module(Where, key, dev)
+    return module.forward(condition, x, y)
 def _masked_assign(output, mask, input):
    if not isinstance(input, Tensor):
        if isinstance(input, (tuple, list)):
@@ -569,7 +594,7 @@ def squeeze(input, dim=None, out=None):
 def unsqueeze(input, dim, out=None):
-    """Returns a tensor with a dimension of size 1 inserted at the specified position.
+    """Return a tensor with a dimension of size 1 inserted at the specified position.
    Parameters
    ----------
@@ -866,6 +891,27 @@ def eq(input, other, out=None):
    return _compare(input, other, 'EQ', out)
+def ne(input, other, out=None):
+    """Compute *input* != *other* element-wise.
+    Parameters
+    ----------
+    input : dragon.vm.torch.Tensor
+        The input tensor.
+    other : dragon.vm.torch.Tensor, number
+        The other tensor.
+    out : dragon.vm.torch.Tensor, optional
+        The optional output tensor.
+    Returns
+    -------
+    dragon.vm.torch.Tensor
+        The output byte tensor.
+    """
+    return _compare(input, other, 'NE', out)
 def cat(seq, dim=0, out=None):
    """Concatenate the inputs along the given axis.
@@ -908,7 +954,7 @@ def stack(seq, dim=0, out=None):
        The output tensor.
    """
-    dev = MakeDevice(inputs=seq, outputs=[out] if out else [])
+    dev = MakeDevice(seq, [out] if out else [])
    key = 'Stack/{}/dim:{}'.format(dev, dim)
    module = get_module(Stack, key, dev, axis=dim)
    return module.forward(seq, out)
@@ -940,6 +986,30 @@ def index_select(input, dim, index, out=None):
    return module.forward(input, index, out)
+def masked_select(input, mask, out=None):
+    """Select the input values where mask is *1*.
+    Parameters
+    ----------
+    input : dragon.vm.torch.Tensor
+        The values.
+    mask : dragon.vm.torch.Tensor
+        The mask to select values.
+    out : dragon.vm.torch.Tensor, optional
+        The optional output tensor.
+    Returns
+    -------
+    dragon.vm.torch.Tensor
+        The output tensor.
+    """
+    dev = MakeDevice([input, mask], [out] if out else [])
+    key = 'MaskedSelect/{}'.format(dev)
+    module = get_module(MaskedSelect, key, dev)
+    return module.forward(input, mask, out)
 def narrow(input, dimension, start, length):
    """Return a new tensor that is a narrowed version of input tensor.
@@ -965,6 +1035,28 @@ def narrow(input, dimension, start, length):
    return _index(input, starts, sizes)
+def nonzero(input, out=None):
+    """Return the indices of non-zero elements.
+    Parameters
+    ----------
+    input : dragon.vm.torch.Tensor
+        The input tensor.
+    out : dragon.vm.torch.Tensor, optional
+        The optional output tensor.
+    Returns
+    -------
+    dragon.vm.torch.Tensor
+        The *int64* tensor containing the indices.
+    """
+    dev = MakeDevice(inputs=[input])
+    key = 'NonZero/{}'.format(dev)
+    module = get_module(NonZero, key, dev)
+    return module.forward(input, out)
 def one_hot(input, depth):
    """Return a one-hot tensor according to the given input.

--- a/Dragon/python/dragon/vm/torch/ops/modules/array.py
+++ b/Dragon/python/dragon/vm/torch/ops/modules/array.py
@@ -134,6 +134,23 @@ class IndexSelect(BaseModule):
        return self.run(inputs, outputs)
+class MaskedSelect(BaseModule):
+    def __init__(self, key, dev, **kwargs):
+        super(MaskedSelect, self).__init__(key, dev, **kwargs)
+        self.register_op()
+    def register_op(self):
+        self.op_meta = {
+            'op_type': 'MaskedSelect',
+            'arguments': {},
+        }
+    def forward(self, x, mask, y):
+        inputs = [x, mask]; self.unify_devices(inputs)
+        outputs = [y] if y else [self.register_output()]
+        return self.run(inputs, outputs)
 class Reduce(BaseModule):
    def __init__(self, key, dev, **kwargs):
        super(Reduce, self).__init__(key, dev, **kwargs)
@@ -326,6 +343,36 @@ class Repeat(BaseModule):
        return self.run(inputs, outputs, callback=callback)
+class NonZero(BaseModule):
+    def __init__(self, key, dev, **kwargs):
+        super(NonZero, self).__init__(key, dev, **kwargs)
+        self.register_op()
+    def register_op(self):
+        self.op_meta = {
+            'op_type': 'NonZero',
+            'arguments': {},
+        }
+    def forward(self, x, y):
+        inputs = [x]; self.unify_devices(inputs)
+        outputs = [y] if y else [self.register_output()]
+        with no_grad(): return self.run(inputs, outputs)
+class Where(BaseModule):
+    def __init__(self, key, dev, **kwargs):
+        super(Where, self).__init__(key, dev, **kwargs)
+        self.register_op()
+    def register_op(self):
+        self.op_meta = {'op_type': 'Where', 'arguments': {}}
+    def forward(self, condition, x, y):
+        self.unify_devices([condition, x, y])
+        return self.run([x, y, condition], [self.register_output()])
 class OneHot(BaseModule):
    def __init__(self, key, dev, **kwargs):
        super(OneHot, self).__init__(key, dev, **kwargs)

--- a/Dragon/python/dragon/vm/torch/ops/tensor.py
+++ b/Dragon/python/dragon/vm/torch/ops/tensor.py
@@ -23,11 +23,12 @@ from dragon.vm.torch.ops.builtin import (
    _fundamental, _rfundamental,
    log, exp, sqrt, clamp,
    _reshape, squeeze, unsqueeze,
-    _permute, _repeat, narrow,
+    _permute, _repeat, narrow, _index,
-    _index, index_select,
    _assign, _masked_assign,
+    index_select, masked_select,
    mean, sum, max, min,
-    gt, lt, eq, ge, le,
+    gt, lt, eq, ne, ge, le,
+    where, nonzero,
 )
@@ -84,10 +85,14 @@ Tensor.ge = lambda *args, **kwargs: ge(*args, **kwargs)
 Tensor.lt = lambda *args, **kwargs: lt(*args, **kwargs)
 Tensor.le = lambda *args, **kwargs: le(*args, **kwargs)
 Tensor.eq = lambda *args, **kwargs: eq(*args, **kwargs)
-Tensor.index_select = lambda *args, **kwargs: index_select(*args, **kwargs)
+Tensor.ne = lambda *args, **kwargs: ne(*args, **kwargs)
+Tensor.nonzero = lambda *args, **kwargs: nonzero(*args, **kwargs)
+Tensor.where = lambda self, condition, y: where(condition, self, y)
 Tensor.narrow = lambda *args, **kwargs: narrow(*args, **kwargs)
 Tensor._index = lambda *args, **kwargs: _index(*args, **kwargs)
 Tensor._assign = lambda *args, **kwargs: _assign(*args, **kwargs)
+Tensor.index_select = lambda *args, **kwargs: index_select(*args, **kwargs)
+Tensor.masked_select = lambda *args, **kwargs: masked_select(*args, **kwargs)
 Tensor.half = lambda self: _type_to(self, dtype='float16', inplace=False)
@@ -104,5 +109,5 @@ Tensor.int = lambda self: _type_to(self, dtype='int32', inplace=False)
 Tensor.int_ = lambda self: _type_to(self, dtype='int32', inplace=True)
 Tensor.long = lambda self: _type_to(self, dtype='int64', inplace=False)
 Tensor.long_ = lambda self: _type_to(self, dtype='int64', inplace=True)
-Tensor.type = lambda self, dtype=None: _type_to(self, dtype=dtype) \
+Tensor.type = lambda self, dtype = None: _type_to(self, dtype=dtype) \
    if dtype is not None else 'torch.' + self._type2str()
\ No newline at end of file
--- a/Dragon/python/dragon/vm/torch/tensor.py
+++ b/Dragon/python/dragon/vm/torch/tensor.py
--- a/Dragon/src/kernels/arithmetic/maximum_op_kernel.cc
+++ b/Dragon/src/kernels/arithmetic/maximum_op_kernel.cc
 #include "utils/op_kernel.h"
+#include "utils/eigen_utils.h"
 #include "utils/omp_alternative.h"
 namespace dragon {
@@ -10,15 +11,12 @@ namespace kernel {
 template <typename T>
 void _Maximum(
    const int               count,
-    const T*                x1,
+    const T*                a,
-    const T*                x2,
+    const T*                b,
    T*                      y) {
-#ifdef WITH_OMP
+    EigenVectorArrayMap<T>(y, count) = \
-    #pragma omp parallel for num_threads(OMP_THREADS(count))
+        ConstEigenVectorArrayMap<T>(a, count).max(
-#endif
+            ConstEigenVectorArrayMap<T>(b, count));
-    for (int i = 0; i < count; ++i) {
-        y[i] = std::max(x1[i], x2[i]);
-    }
 }
 /* <T = ?, Device = CPU> */
@@ -26,15 +24,11 @@ void _Maximum(
 template <typename T>
 void _BroadcastMaximum(
    const int               count,
-    const T*                x1,
+    const T*                a,
-    const T                 x2,
+    const T                 b,
    T*                      y) {
-#ifdef WITH_OMP
+    EigenVectorArrayMap<T>(y, count) = \
-    #pragma omp parallel for num_threads(OMP_THREADS(count))
+        ConstEigenVectorArrayMap<T>(a, count).max(b);
-#endif
-    for (int i = 0; i < count; ++i) {
-        y[i] = std::max(x1[i], x2);
-    }
 }
 /* <T = ?, Device = CPU> */
@@ -42,18 +36,19 @@ void _BroadcastMaximum(
 template <typename T>
 void _MaximumGrad(
    const int               count,
-    const T*                x1,
+    const T*                a,
-    const T*                x2,
+    const T*                b,
    const T*                dy,
-    T*                      dx1,
+    T*                      da,
-    T*                      dx2) {
+    T*                      db) {
+    const T kZero = T(0);
 #ifdef WITH_OMP
    #pragma omp parallel for num_threads(OMP_THREADS(count))
 #endif
    for (int i = 0; i < count; ++i) {
-        const bool dy_to_dx1 = x1[i] > x2[i];
+        const bool dy_to_da = a[i] > b[i];
-        dx1[i] = dy_to_dx1 ? dy[i] : 0;
+        da[i] = dy_to_da ? dy[i] : kZero;
-        dx2[i] = dy_to_dx1 ? 0 : dy[i];
+        db[i] = dy_to_da ? kZero : dy[i];
    }
 }
@@ -62,16 +57,17 @@ void _MaximumGrad(
 template <typename T>
 void _BroadcastMaximumGrad(
    const int               count,
-    const T*                x1,
+    const T*                a,
-    const T                 x2,
+    const T                 b,
    const T*                dy,
-    T*                      dx1,
+    T*                      da,
-    T*                      dx2) {
+    T*                      db) {
+    const T kZero = T(0);
 #ifdef WITH_OMP
    #pragma omp parallel for num_threads(OMP_THREADS(count))
 #endif
    for (int i = 0; i < count; ++i) {
-        dx1[i] = (x1[i] > x2) ? dy[i] : 0;
+        da[i] = a[i] > b ? dy[i] : kZero;
    }
 }
@@ -80,23 +76,23 @@ void _BroadcastMaximumGrad(
 #define DEFINE_MAXIMUM_KERNEL_LAUNCHER(name, T, T2) \
    template <> void name<T, CPUContext>( \
        const int               count, \
-        const T*                x1, \
+        const T*                a, \
-        const T2                x2, \
+        const T2                b, \
        T*                      y, \
        CPUContext*             ctx) { \
-        _##name(count, x1, x2, y); \
+        _##name(count, a, b, y); \
    }
 #define DEFINE_MAXIMUM_GRAD_KERNEL_LAUNCHER(name, T, T2) \
    template <> void name<T, CPUContext>( \
        const int               count, \
-        const T*                x1, \
+        const T*                a, \
-        const T2                x2, \
+        const T2                b, \
        const T*                dy, \
-        T*                      dx1, \
+        T*                      da, \
-        T*                      dx2, \
+        T*                      db, \
        CPUContext*             ctx) { \
-        _##name(count, x1, x2, dy, dx1, dx2); \
+        _##name(count, a, b, dy, da, db); \
    }
 DEFINE_MAXIMUM_KERNEL_LAUNCHER(Maximum, int8_t, int8_t*);
@@ -129,8 +125,8 @@ DEFINE_MAXIMUM_GRAD_KERNEL_LAUNCHER(BroadcastMaximumGrad, double, double);
 template <> void Maximum<float16, CPUContext>(
    const int               count,
-    const float16*          x1,
+    const float16*          a,
-    const float16*          x2,
+    const float16*          b,
    float16*                y,
    CPUContext*             ctx) {
    CPU_FP16_NOT_SUPPORTED;
@@ -138,8 +134,8 @@ template <> void Maximum<float16, CPUContext>(
 template <> void BroadcastMaximum<float16, CPUContext>(
    const int               count,
-    const float16*          x1,
+    const float16*          a,
-    const float16           x2,
+    const float16           b,
    float16*                y,
    CPUContext*             ctx) {
    CPU_FP16_NOT_SUPPORTED;
@@ -147,22 +143,22 @@ template <> void BroadcastMaximum<float16, CPUContext>(
 template <> void MaximumGrad<float16, CPUContext>(
    const int               count,
-    const float16*          x1,
+    const float16*          a,
-    const float16*          x2,
+    const float16*          b,
    const float16*          dy,
-    float16*                dx1,
+    float16*                da,
-    float16*                dx2,
+    float16*                db,
    CPUContext*             ctx) {
    CPU_FP16_NOT_SUPPORTED;
 }
 template <> void BroadcastMaximumGrad<float16, CPUContext>(
    const int               count,
-    const float16*          x1,
+    const float16*          a,
-    const float16           x2,
+    const float16           b,
    const float16*          dy,
-    float16*                dx1,
+    float16*                da,
-    float16*                dx2,
+    float16*                db,
    CPUContext*             ctx) {
    CPU_FP16_NOT_SUPPORTED;
 }

--- a/Dragon/src/kernels/arithmetic/maximum_op_kernel.cu
+++ b/Dragon/src/kernels/arithmetic/maximum_op_kernel.cu
@@ -13,11 +13,11 @@ namespace kernel {
 template <typename T>
 __global__ void _Maximum(
    const int               nthreads,
-    const T*                x1,
+    const T*                a,
-    const T*                x2,
+    const T*                b,
    T*                      y) {
    CUDA_1D_KERNEL_LOOP(i, nthreads) {
-        y[i] = max(x1[i], x2[i]);
+        y[i] = max(a[i], b[i]);
    }
 }
@@ -25,12 +25,12 @@ __global__ void _Maximum(
 template<> __global__ void _Maximum<half>(
    const int               nthreads,
-    const half*             x1,
+    const half*             a,
-    const half*             x2,
+    const half*             b,
    half*                   y) {
    CUDA_1D_KERNEL_LOOP(i, nthreads) {
 #if __CUDA_ARCH__ >= 530
-        y[i] = __hgt(x1[i], x2[i]) ? x1[i] : x2[i];
+        y[i] = __hgt(a[i], b[i]) ? a[i] : b[i];
 #endif
    }
 }
@@ -40,11 +40,11 @@ template<> __global__ void _Maximum<half>(
 template <typename T>
 __global__ void _BroadcastMaximum(
    const int               nthreads,
-    const T*                x1,
+    const T*                a,
-    const T                 x2,
+    const T                 b,
    T*                      y) {
    CUDA_1D_KERNEL_LOOP(i, nthreads) {
-        y[i] = max(x1[i], x2);
+        y[i] = max(a[i], b);
    }
 }
@@ -52,12 +52,12 @@ __global__ void _BroadcastMaximum(
 template<>  __global__ void _BroadcastMaximum<half>(
    const int               nthreads,
-    const half*             x1,
+    const half*             a,
-    const half              x2,
+    const half              b,
    half*                   y) {
    CUDA_1D_KERNEL_LOOP(i, nthreads) {
 #if __CUDA_ARCH__ >= 530
-        y[i] = __hgt(x1[i], x2) ? x1[i] : x2;
+        y[i] = __hgt(a[i], b) ? a[i] : b;
 #endif
    }
 }
@@ -67,15 +67,16 @@ template<>  __global__ void _BroadcastMaximum<half>(
 template <typename T>
 __global__ void _MaximumGrad(
    const int               nthreads,
-    const T*                x1,
+    const T*                a,
-    const T*                x2,
+    const T*                b,
    const T*                dy,
-    T*                      dx1,
+    T*                      da,
-    T*                      dx2) {
+    T*                      db) {
+    const T kZero = T(0);
    CUDA_1D_KERNEL_LOOP(i, nthreads) {
-        const bool dy_to_dx1 = x1[i] > x2[i];
+        const bool dy_to_da = a[i] > b[i];
-        dx1[i] = dy_to_dx1 ? dy[i] : T(0);
+        da[i] = dy_to_da ? dy[i] : kZero;
-        dx2[i] = dy_to_dx1 ? T(0) : dy[i];
+        db[i] = dy_to_da ? kZero : dy[i];
    }
 }
@@ -83,18 +84,19 @@ __global__ void _MaximumGrad(
 template<> __global__ void _MaximumGrad<half>(
    const int               nthreads,
-    const half*             x1,
+    const half*             a,
-    const half*             x2,
+    const half*             b,
    const half*             dy,
-    half*                   dx1,
+    half*                   da,
-    half*                   dx2) {
+    half*                   db) {
-    CUDA_1D_KERNEL_LOOP(i, nthreads) {
 #if __CUDA_ARCH__ >= 530
-        const bool dy_to_dx1 = __hgt(x1[i], x2[i]);
+    const half kZero = __float2half(0.f);
-        dx1[i] = dy_to_dx1 ? dy[i] : __float2half(0.f);
+    CUDA_1D_KERNEL_LOOP(i, nthreads) {
-        dx2[i] = dy_to_dx1 ? __float2half(0.f) : dy[i];
+        const bool dy_to_da = __hgt(a[i], b[i]);
-#endif
+        da[i] = dy_to_da ? dy[i] : kZero;
+        db[i] = dy_to_da ? kZero : dy[i];
    }
+#endif
 }
 /* <T = ?, Device = CUDA> */
@@ -102,13 +104,14 @@ template<> __global__ void _MaximumGrad<half>(
 template <typename T>
 __global__ void _BroadcastMaximumGrad(
    const int               nthreads,
-    const T*                x1,
+    const T*                a,
-    const T                 x2,
+    const T                 b,
    const T*                dy,
-    T*                      dx1,
+    T*                      da,
-    T*                      dx2) {
+    T*                      db) {
+    const T kZero = T(0);
    CUDA_1D_KERNEL_LOOP(i, nthreads) {
-        dx1[i] = (x1[i] > x2) ? dy[i] : T(0);
+        da[i] = (a[i] > b) ? dy[i] : kZero;
    }
 }
@@ -116,17 +119,17 @@ __global__ void _BroadcastMaximumGrad(
 template<> __global__ void _BroadcastMaximumGrad<half>(
    const int               nthreads,
-    const half*             x1,
+    const half*             a,
-    const half              x2,
+    const half              b,
    const half*             dy,
-    half*                   dx1,
+    half*                   da,
-    half*                   dx2) {
+    half*                   db) {
-    CUDA_1D_KERNEL_LOOP(i, nthreads) {
 #if __CUDA_ARCH__ >= 530
-        dx1[i] = __hgt(x1[i], x2) ?
+    const half kZero = __float2half(0.f);
-            dy[i] : __float2half(0.f);
+    CUDA_1D_KERNEL_LOOP(i, nthreads) {
-#endif
+        da[i] = __hgt(a[i], b) ? dy[i] : kZero;
    }
+#endif
 }
 /* Kernel Launchers */
@@ -134,30 +137,30 @@ template<> __global__ void _BroadcastMaximumGrad<half>(
 #define DEFINE_MAXIMUM_KERNEL_LAUNCHER(name, T, T2) \
    template <> void name<T, CUDAContext>( \
        const int               count, \
-        const T*                x1, \
+        const T*                a, \
-        const T2                x2, \
+        const T2                b, \
        T*                      y, \
        CUDAContext*            ctx) { \
        _##name \
            <<< CUDA_BLOCKS(count), CUDA_THREADS, \
                0, ctx->cuda_stream() >>>( \
-            count, x1, x2, y \
+            count, a, b, y \
        ); \
    }
 #define DEFINE_MAXIMUM_GRAD_KERNEL_LAUNCHER(name, T, T2) \
    template <> void name<T, CUDAContext>( \
        const int               count, \
-        const T*                x1, \
+        const T*                a, \
-        const T2                x2, \
+        const T2                b, \
        const T*                dy, \
-        T*                      dx1, \
+        T*                      da, \
-        T*                      dx2, \
+        T*                      db, \
        CUDAContext*            ctx) { \
        _##name \
            <<< CUDA_BLOCKS(count), CUDA_THREADS, \
                0, ctx->cuda_stream() >>>( \
-            count, x1, x2, dy, dx1, dx2 \
+            count, a, b, dy, da, db \
        ); \
    }
@@ -191,73 +194,73 @@ DEFINE_MAXIMUM_GRAD_KERNEL_LAUNCHER(BroadcastMaximumGrad, double, double);
 template <> void Maximum<float16, CUDAContext>(
    const int               count,
-    const float16*          x1,
+    const float16*          a,
-    const float16*          x2,
+    const float16*          b,
    float16*                y,
    CUDAContext*            ctx) {
    _Maximum \
        <<< CUDA_BLOCKS(count), CUDA_THREADS,
            0, ctx->cuda_stream() >>>(
        count,
-        reinterpret_cast<const half*>(x1),
+        reinterpret_cast<const half*>(a),
-        reinterpret_cast<const half*>(x2),
+        reinterpret_cast<const half*>(b),
        reinterpret_cast<half*>(y)
    );
 }
 template <> void BroadcastMaximum<float16, CUDAContext>(
    const int               count,
-    const float16*          x1,
+    const float16*          a,
-    const float16           x2,
+    const float16           b,
    float16*                y,
    CUDAContext*            ctx) {
    _BroadcastMaximum \
        <<< CUDA_BLOCKS(count), CUDA_THREADS,
            0, ctx->cuda_stream() >>>(
        count,
-        reinterpret_cast<const half*>(x1),
+        reinterpret_cast<const half*>(a),
-        cast::to<half>(x2),
+        cast::to<half>(b),
        reinterpret_cast<half*>(y)
    );
 }
 template <> void MaximumGrad<float16, CUDAContext>(
    const int               count,
-    const float16*          x1,
+    const float16*          a,
-    const float16*          x2,
+    const float16*          b,
    const float16*          dy,
-    float16*                dx1,
+    float16*                da,
-    float16*                dx2,
+    float16*                db,
    CUDAContext*            ctx) {
    _MaximumGrad \
        <<< CUDA_BLOCKS(count), CUDA_THREADS,
            0, ctx->cuda_stream() >>>(
        count,
-        reinterpret_cast<const half*>(x1),
+        reinterpret_cast<const half*>(a),
-        reinterpret_cast<const half*>(x2),
+        reinterpret_cast<const half*>(b),
        reinterpret_cast<const half*>(dy),
-        reinterpret_cast<half*>(dx1),
+        reinterpret_cast<half*>(da),
-        reinterpret_cast<half*>(dx2)
+        reinterpret_cast<half*>(db)
    );
 }
 template <> void BroadcastMaximumGrad<float16, CUDAContext>(
    const int               count,
-    const float16*          x1,
+    const float16*          a,
-    const float16           x2,
+    const float16           b,
    const float16*          dy,
-    float16*                dx1,
+    float16*                da,
-    float16*                dx2,
+    float16*                db,
    CUDAContext*            ctx) {
    _BroadcastMaximumGrad \
        <<< CUDA_BLOCKS(count), CUDA_THREADS,
            0, ctx->cuda_stream() >>>(
        count,
-        reinterpret_cast<const half*>(x1),
+        reinterpret_cast<const half*>(a),
-        cast::to<half>(x2),
+        cast::to<half>(b),
        reinterpret_cast<const half*>(dy),
-        reinterpret_cast<half*>(dx1),
+        reinterpret_cast<half*>(da),
-        reinterpret_cast<half*>(dx2)
+        reinterpret_cast<half*>(db)
    );
 }

--- a/Dragon/src/kernels/arithmetic/minimum_op_kernel.cc
+++ b/Dragon/src/kernels/arithmetic/minimum_op_kernel.cc
 #include "utils/op_kernel.h"
+#include "utils/eigen_utils.h"
 #include "utils/omp_alternative.h"
 namespace dragon {
@@ -10,15 +11,12 @@ namespace kernel {
 template <typename T>
 void _Minimum(
    const int               count,
-    const T*                x1,
+    const T*                a,
-    const T*                x2,
+    const T*                b,
    T*                      y) {
-#ifdef WITH_OMP
+    EigenVectorArrayMap<T>(y, count) = \
-    #pragma omp parallel for num_threads(OMP_THREADS(count))
+        ConstEigenVectorArrayMap<T>(a, count).min(
-#endif
+            ConstEigenVectorArrayMap<T>(b, count));
-    for (int i = 0; i < count; ++i) {
-        y[i] = std::min(x1[i], x2[i]);
-    }
 }
 /* <T = ?, Device = CPU> */
@@ -26,15 +24,11 @@ void _Minimum(
 template <typename T>
 void _BroadcastMinimum(
    const int               count,
-    const T*                x1,
+    const T*                a,
-    const T                 x2,
+    const T                 b,
    T*                      y) {
-#ifdef WITH_OMP
+    EigenVectorArrayMap<T>(y, count) = \
-    #pragma omp parallel for num_threads(OMP_THREADS(count))
+        ConstEigenVectorArrayMap<T>(a, count).min(b);
-#endif
-    for (int i = 0; i < count; ++i) {
-        y[i] = std::min(x1[i], x2);
-    }
 }
 /* <T = float32, Device = CPU> */
@@ -42,18 +36,19 @@ void _BroadcastMinimum(
 template <typename T>
 void _MinimumGrad(
    const int               count,
-    const T*                x1,
+    const T*                a,
-    const T*                x2,
+    const T*                b,
    const T*                dy,
-    T*                      dx1,
+    T*                      da,
-    T*                      dx2) {
+    T*                      db) {
+    const T kZero = T(0);
 #ifdef WITH_OMP
    #pragma omp parallel for num_threads(OMP_THREADS(count))
 #endif
    for (int i = 0; i < count; ++i) {
-        const bool dy_to_dx1 = x1[i] < x2[i];
+        const bool dy_to_da = a[i] < b[i];
-        dx1[i] = dy_to_dx1 ? dy[i] : 0;
+        da[i] = dy_to_da ? dy[i] : kZero;
-        dx2[i] = dy_to_dx1 ? 0 : dy[i];
+        db[i] = dy_to_da ? kZero : dy[i];
    }
 }
@@ -62,16 +57,17 @@ void _MinimumGrad(
 template <typename T>
 void _BroadcastMinimumGrad(
    const int               count,
-    const T*                x1,
+    const T*                a,
-    const T                 x2,
+    const T                 b,
    const T*                dy,
-    T*                      dx1,
+    T*                      da,
-    T*                      dx2) {
+    T*                      db) {
+    const T kZero = T(0);
 #ifdef WITH_OMP
    #pragma omp parallel for num_threads(OMP_THREADS(count))
 #endif
    for (int i = 0; i < count; ++i) {
-        dx1[i] = (x1[i] < x2) ? dy[i] : 0;
+        da[i] = a[i] < b ? dy[i] : kZero;
    }
 }
@@ -80,23 +76,23 @@ void _BroadcastMinimumGrad(
 #define DEFINE_MINIMUM_KERNEL_LAUNCHER(name, T, T2) \
    template <> void name<T, CPUContext>( \
        const int               count, \
-        const T*                x1, \
+        const T*                a, \
-        const T2                x2, \
+        const T2                b, \
        T*                      y, \
        CPUContext*             ctx) { \
-        _##name(count, x1, x2, y); \
+        _##name(count, a, b, y); \
    }
 #define DEFINE_MINIMUM_GRAD_KERNEL_LAUNCHER(name, T, T2) \
    template <> void name<T, CPUContext>( \
        const int               count, \
-        const T*                x1, \
+        const T*                a, \
-        const T2                x2, \
+        const T2                b, \
        const T*                dy, \
-        T*                      dx1, \
+        T*                      da, \
-        T*                      dx2, \
+        T*                      db, \
        CPUContext*             ctx) { \
-        _##name(count, x1, x2, dy, dx1, dx2); \
+        _##name(count, a, b, dy, da, db); \
    }
 DEFINE_MINIMUM_KERNEL_LAUNCHER(Minimum, int8_t, int8_t*);
@@ -129,8 +125,8 @@ DEFINE_MINIMUM_GRAD_KERNEL_LAUNCHER(BroadcastMinimumGrad, double, double);
 template <> void Minimum<float16, CPUContext>(
    const int               count,
-    const float16*          x1,
+    const float16*          a,
-    const float16*          x2,
+    const float16*          b,
    float16*                y,
    CPUContext*             ctx) {
    CPU_FP16_NOT_SUPPORTED;
@@ -138,8 +134,8 @@ template <> void Minimum<float16, CPUContext>(
 template <> void BroadcastMinimum<float16, CPUContext>(
    const int               count,
-    const float16*          x1,
+    const float16*          a,
-    const float16           x2,
+    const float16           b,
    float16*                y,
    CPUContext*             ctx) {
    CPU_FP16_NOT_SUPPORTED;
@@ -147,22 +143,22 @@ template <> void BroadcastMinimum<float16, CPUContext>(
 template <> void MinimumGrad<float16, CPUContext>(
    const int               count,
-    const float16*          x1,
+    const float16*          a,
-    const float16*          x2,
+    const float16*          b,
    const float16*          dy,
-    float16*                dx1,
+    float16*                da,
-    float16*                dx2,
+    float16*                db,
    CPUContext*             ctx) {
    CPU_FP16_NOT_SUPPORTED;
 }
 template <> void BroadcastMinimumGrad<float16, CPUContext>(
    const int               count,
-    const float16*          x1,
+    const float16*          a,
-    const float16           x2,
+    const float16           b,
    const float16*          dy,
-    float16*                dx1,
+    float16*                da,
-    float16*                dx2,
+    float16*                db,
    CPUContext*             ctx) {
    CPU_FP16_NOT_SUPPORTED;
 }

--- a/Dragon/src/kernels/arithmetic/minimum_op_kernel.cu
+++ b/Dragon/src/kernels/arithmetic/minimum_op_kernel.cu
@@ -13,11 +13,11 @@ namespace kernel {
 template <typename T>
 __global__ void _Minimum(
    const int               nthreads,
-    const T*                x1,
+    const T*                a,
-    const T*                x2,
+    const T*                b,
    T*                      y) {
    CUDA_1D_KERNEL_LOOP(i, nthreads) {
-        y[i] = min(x1[i], x2[i]);
+        y[i] = min(a[i], b[i]);
    }
 }
@@ -25,12 +25,12 @@ __global__ void _Minimum(
 template<> __global__ void _Minimum<half>(
    const int               nthreads,
-    const half*             x1,
+    const half*             a,
-    const half*             x2,
+    const half*             b,
    half*                   y) {
    CUDA_1D_KERNEL_LOOP(i, nthreads) {
 #if __CUDA_ARCH__ >= 530
-        y[i] = __hlt(x1[i], x2[i]) ? x1[i] : x2[i];
+        y[i] = __hlt(a[i], b[i]) ? a[i] : b[i];
 #endif
    }
 }
@@ -40,11 +40,11 @@ template<> __global__ void _Minimum<half>(
 template <typename T>
 __global__ void _BroadcastMinimum(
    const int               nthreads,
-    const T*                x1,
+    const T*                a,
-    const T                 x2,
+    const T                 b,
    T*                      y) {
    CUDA_1D_KERNEL_LOOP(i, nthreads) {
-        y[i] = min(x1[i], x2);
+        y[i] = min(a[i], b);
    }
 }
@@ -52,12 +52,12 @@ __global__ void _BroadcastMinimum(
 template<> __global__ void _BroadcastMinimum<half>(
    const int               nthreads,
-    const half*             x1,
+    const half*             a,
-    const half              x2,
+    const half              b,
    half*                   y) {
    CUDA_1D_KERNEL_LOOP(i, nthreads) {
 #if __CUDA_ARCH__ >= 530
-        y[i] = __hlt(x1[i], x2) ? x1[i] : x2;
+        y[i] = __hlt(a[i], b) ? a[i] : b;
 #endif
    }
 }
@@ -67,15 +67,16 @@ template<> __global__ void _BroadcastMinimum<half>(
 template <typename T>
 __global__ void _MinimumGrad(
    const int               nthreads,
-    const T*                x1,
+    const T*                a,
-    const T*                x2,
+    const T*                b,
    const T*                dy,
-    T*                      dx1,
+    T*                      da,
-    T*                      dx2) {
+    T*                      db) {
+    const T kZero = T(0);
    CUDA_1D_KERNEL_LOOP(i, nthreads) {
-        const bool dy_to_dx1 = x1[i] < x2[i];
+        const bool dy_to_da = a[i] < b[i];
-        dx1[i] = dy_to_dx1 ? dy[i] : T(0);
+        da[i] = dy_to_da ? dy[i] : kZero;
-        dx2[i] = dy_to_dx1 ? T(0) : dy[i];
+        db[i] = dy_to_da ? kZero : dy[i];
    }
 }
@@ -83,18 +84,19 @@ __global__ void _MinimumGrad(
 template<> __global__ void _MinimumGrad<half>(
    const int               nthreads,
-    const half*             x1,
+    const half*             a,
-    const half*             x2,
+    const half*             b,
    const half*             dy,
-    half*                   dx1,
+    half*                   da,
-    half*                   dx2) {
+    half*                   db) {
-    CUDA_1D_KERNEL_LOOP(i, nthreads) {
 #if __CUDA_ARCH__ >= 530
-        const bool dy_to_dx1 = __hlt(x1[i], x2[i]);
+    const half kZero = __float2half(0.f);
-        dx1[i] = dy_to_dx1 ? dy[i] : __float2half(0.f);
+    CUDA_1D_KERNEL_LOOP(i, nthreads) {
-        dx2[i] = dy_to_dx1 ? __float2half(0.f) : dy[i];
+        const bool dy_to_da = __hlt(a[i], b[i]);
-#endif
+        da[i] = dy_to_da ? dy[i] : kZero;
+        db[i] = dy_to_da ? kZero : dy[i];
    }
+#endif
 }
 /* <T = ?, Device = CUDA> */
@@ -102,13 +104,14 @@ template<> __global__ void _MinimumGrad<half>(
 template <typename T>
 __global__ void _BroadcastMinimumGrad(
    const int               nthreads,
-    const T*                x1,
+    const T*                a,
-    const T                 x2,
+    const T                 b,
    const T*                dy,
-    T*                      dx1,
+    T*                      da,
-    T*                      dx2) {
+    T*                      db) {
+    const T kZero = T(0);
    CUDA_1D_KERNEL_LOOP(i, nthreads) {
-        dx1[i] = (x1[i] < x2) ? dy[i] : T(0);
+        da[i] = (a[i] < b) ? dy[i] : kZero;
    }
 }
@@ -116,17 +119,17 @@ __global__ void _BroadcastMinimumGrad(
 template<> __global__ void _BroadcastMinimumGrad<half>(
    const int               nthreads,
-    const half*             x1,
+    const half*             a,
-    const half              x2,
+    const half              b,
    const half*             dy,
-    half*                   dx1,
+    half*                   da,
-    half*                   dx2) {
+    half*                   db) {
-    CUDA_1D_KERNEL_LOOP(i, nthreads) {
 #if __CUDA_ARCH__ >= 530
-        dx1[i] = (__hlt(x1[i], x2)) ?
+    const half kZero = __float2half(0.f);
-            dy[i] : __float2half(0.f);
+    CUDA_1D_KERNEL_LOOP(i, nthreads) {
-#endif
+        da[i] = __hlt(a[i], b) ? dy[i] : kZero;
    }
+#endif
 }
 /* Kernel Launchers */
@@ -134,30 +137,30 @@ template<> __global__ void _BroadcastMinimumGrad<half>(
 #define DEFINE_MINIMUM_KERNEL_LAUNCHER(name, T, T2) \
    template <> void name<T, CUDAContext>( \
        const int               count, \
-        const T*                x1, \
+        const T*                a, \
-        const T2                x2, \
+        const T2                b, \
        T*                      y, \
        CUDAContext*            ctx) { \
        _##name \
            <<< CUDA_BLOCKS(count), CUDA_THREADS, \
                0, ctx->cuda_stream() >>>( \
-            count, x1, x2, y \
+            count, a, b, y \
        ); \
    }
 #define DEFINE_MINIMUM_GRAD_KERNEL_LAUNCHER(name, T, T2) \
    template <> void name<T, CUDAContext>( \
        const int               count, \
-        const T*                x1, \
+        const T*                a, \
-        const T2                x2, \
+        const T2                b, \
        const T*                dy, \
-        T*                      dx1, \
+        T*                      da, \
-        T*                      dx2, \
+        T*                      db, \
        CUDAContext*            ctx) { \
        _##name \
            <<< CUDA_BLOCKS(count), CUDA_THREADS, \
                0, ctx->cuda_stream() >>>( \
-            count, x1, x2, dy, dx1, dx2 \
+            count, a, b, dy, da, db \
        ); \
    }
@@ -191,73 +194,73 @@ DEFINE_MINIMUM_GRAD_KERNEL_LAUNCHER(BroadcastMinimumGrad, double, double);
 template <> void Minimum<float16, CUDAContext>(
    const int               count,
-    const float16*          x1,
+    const float16*          a,
-    const float16*          x2,
+    const float16*          b,
    float16*                y,
    CUDAContext*            ctx) {
    _Minimum \
        <<< CUDA_BLOCKS(count), CUDA_THREADS,
            0, ctx->cuda_stream() >>>(
        count,
-        reinterpret_cast<const half*>(x1),
+        reinterpret_cast<const half*>(a),
-        reinterpret_cast<const half*>(x2),
+        reinterpret_cast<const half*>(b),
        reinterpret_cast<half*>(y)
    );
 }
 template <> void BroadcastMinimum<float16, CUDAContext>(
    const int               count,
-    const float16*          x1,
+    const float16*          a,
-    const float16           x2,
+    const float16           b,
    float16*                y,
    CUDAContext*            ctx) {
    _BroadcastMinimum \
        <<< CUDA_BLOCKS(count), CUDA_THREADS,
            0, ctx->cuda_stream() >>>(
        count,
-        reinterpret_cast<const half*>(x1),
+        reinterpret_cast<const half*>(a),
-        cast::to<half>(x2),
+        cast::to<half>(b),
        reinterpret_cast<half*>(y)
    );
 }
 template <> void MinimumGrad<float16, CUDAContext>(
    const int               count,
-    const float16*          x1,
+    const float16*          a,
-    const float16*          x2,
+    const float16*          b,
    const float16*          dy,
-    float16*                dx1,
+    float16*                da,
-    float16*                dx2,
+    float16*                db,
    CUDAContext*            ctx) {
    _MinimumGrad \
        <<< CUDA_BLOCKS(count), CUDA_THREADS,
            0, ctx->cuda_stream() >>>(
        count,
-        reinterpret_cast<const half*>(x1),
+        reinterpret_cast<const half*>(a),
-        reinterpret_cast<const half*>(x2),
+        reinterpret_cast<const half*>(b),
        reinterpret_cast<const half*>(dy),
-        reinterpret_cast<half*>(dx1),
+        reinterpret_cast<half*>(da),
-        reinterpret_cast<half*>(dx2)
+        reinterpret_cast<half*>(db)
    );
 }
 template <> void BroadcastMinimumGrad<float16, CUDAContext>(
    const int               count,
-    const float16*          x1,
+    const float16*          a,
-    const float16           x2,
+    const float16           b,
    const float16*          dy,
-    float16*                dx1,
+    float16*                da,
-    float16*                dx2,
+    float16*                db,
    CUDAContext*            ctx) {
    _BroadcastMinimumGrad \
        <<< CUDA_BLOCKS(count), CUDA_THREADS,
            0, ctx->cuda_stream() >>>(
        count,
-        reinterpret_cast<const half*>(x1),
+        reinterpret_cast<const half*>(a),
-        cast::to<half>(x2),
+        cast::to<half>(b),
        reinterpret_cast<const half*>(dy),
-        reinterpret_cast<half*>(dx1),
+        reinterpret_cast<half*>(da),
-        reinterpret_cast<half*>(dx2)
+        reinterpret_cast<half*>(db)
    );
 }

--- a/Dragon/src/kernels/array/index_select_op_kernel.cc
+++ b/Dragon/src/kernels/array/index_select_op_kernel.cc
+#include "utils/cast.h"
 #include "utils/op_kernel.h"
 #include "utils/math_functions.h"
-#include "utils/omp_alternative.h"
 namespace dragon {
@@ -49,6 +49,8 @@ void _IndexSelectGrad(
    T*                      dx,
    CPUContext*             ctx) {
    int64_t x_offset, select_idx;
+    auto nelements = outer_dim * axis_dim * inner_dim;
+    math::Set(nelements, cast::to<T>(0.f), dx, ctx);
    for (int n = 0; n < outer_dim; ++n) {
        for (int i = 0; i < num_indices; ++i) {
            select_idx = indices[i];

--- a/Dragon/src/kernels/array/index_select_op_kernel.cu
+++ b/Dragon/src/kernels/array/index_select_op_kernel.cu
 #ifdef WITH_CUDA
 #include "core/context_cuda.h"
+#include "utils/cast.h"
 #include "utils/op_kernel.h"
-#include "utils/cub_device.h"
+#include "utils/math_functions.h"
 namespace dragon {
@@ -134,6 +135,8 @@ template <> __global__ void _IndexSelectGrad<half>(
        T*                      dx, \
        CUDAContext*            ctx) { \
        auto nthreads = outer_dim * inner_dim; \
+        auto nelements = outer_dim * axis_dim * inner_dim; \
+        math::Set(nelements, cast::to<T>(0.f), dx, ctx); \
        _IndexSelectGrad \
            <<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
                0, ctx->cuda_stream() >>>( \
@@ -169,6 +172,8 @@ template <> void IndexSelectGrad<float16, CUDAContext>(
    float16*                dx,
    CUDAContext*            ctx) {
    auto nthreads = outer_dim * inner_dim;
+    auto nelements = outer_dim * axis_dim * inner_dim;
+    math::Set(nelements, cast::to<float16>(0.f), dx, ctx);
    _IndexSelectGrad
        <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
            0, ctx->cuda_stream() >>>(

--- a/Dragon/src/kernels/array/masked_select_op_kernel.cc
+++ b/Dragon/src/kernels/array/masked_select_op_kernel.cc
+#include "core/tensor.h"
+#include "utils/cast.h"
+#include "utils/op_kernel.h"
+#include "utils/math_functions.h"
+namespace dragon {
+namespace kernel {
+/* <T = ?, Device = CPU>
+ *
+ * Scatter the first num_indices values of dy into dx:
+ * element i of dy is stored at dx[indices[i]].
+ * Positions of dx that no index refers to are left untouched here.
+ */
+template <typename T>
+void _MaskedSelectGrad(
+    const int               num_indices,
+    const int64_t*          indices,
+    const T*                dy,
+    T*                      dx) {
+    for (int i = 0; i < num_indices; ++i) {
+        dx[indices[i]] = dy[i];
+    }
+}
+/* Kernel Launchers
+ *
+ * DEFINE_MASKED_KERNEL_LAUNCHER(T) specializes MaskedSelect<T, CPUContext>:
+ *   1. count the non-zero entries of mask[0, count);
+ *   2. reshape indices (and y, unless y is nullptr) to that length;
+ *   3. walk the mask once, appending each selected position to indices
+ *      and copying every maximal run of consecutive selected elements
+ *      from x into y with a single math::Copy.
+ * "last" holds the start of the run currently open, or -1 when none is.
+ * The loop deliberately runs one step past the end (i == count) so that a
+ * run reaching the last element is still flushed before the break.
+ * scratch is not referenced by this CPU specialization.
+ */
+#define DEFINE_MASKED_KERNEL_LAUNCHER(T) \
+    template <> void MaskedSelect<T, CPUContext>( \
+        const int               count, \
+        const uint8_t*          mask, \
+        const T*                x, \
+        Tensor*                 indices, \
+        Tensor*                 scratch, \
+        Tensor*                 y, \
+        CPUContext*             ctx) { \
+        int64_t nelements = 0; \
+        int64_t n, last = -1, y_ofs = 0; \
+        for (int i = 0; i < count; ++i) \
+            if (mask[i]) ++nelements; \
+        auto* value = y == nullptr ? nullptr : y \
+            ->Reshape({ nelements }) \
+            ->mutable_data<T, CPUContext>(); \
+        auto* index = indices \
+            ->Reshape({ nelements }) \
+            ->mutable_data<int64_t, CPUContext>(); \
+        for (int64_t i = 0;; ++i) { \
+            if (last != -1 && ((i >= count) || !mask[i])) { \
+                n = i - last; \
+                if (value != nullptr) { \
+                    auto* src = x + last; \
+                    auto* dst = value + y_ofs; \
+                    math::Copy(n, src, dst, ctx); \
+                } \
+                y_ofs += n; last = -1; \
+            } \
+            if (i >= count) break; \
+            if (mask[i]) { \
+                *(index++) = i; \
+                if (last == -1) last = i; \
+            } \
+        } \
+    }
+#define DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(T) /* specializes MaskedSelectGrad<T, CPUContext> */ \
+    template <> void MaskedSelectGrad<T, CPUContext>( \
+        const int               count, \
+        const int               num_indices, \
+        const int64_t*          indices, \
+        const T*                dy, \
+        T*                      dx, \
+        CPUContext*             ctx) { \
+        math::Set(count, cast::to<T>(0.f), dx, ctx); /* zero all count elements of dx first */ \
+        _MaskedSelectGrad(num_indices, indices, dy, dx); /* then write dy[i] to dx[indices[i]] */ \
+    }
+DEFINE_MASKED_KERNEL_LAUNCHER(bool);  // one MaskedSelect<T, CPUContext> specialization per element type
+DEFINE_MASKED_KERNEL_LAUNCHER(int8_t);
+DEFINE_MASKED_KERNEL_LAUNCHER(uint8_t);
+DEFINE_MASKED_KERNEL_LAUNCHER(int);
+DEFINE_MASKED_KERNEL_LAUNCHER(int64_t);
+DEFINE_MASKED_KERNEL_LAUNCHER(float16);
+DEFINE_MASKED_KERNEL_LAUNCHER(float);
+DEFINE_MASKED_KERNEL_LAUNCHER(double);
+DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(bool);  // matching MaskedSelectGrad<T, CPUContext> specializations
+DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(int8_t);
+DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(uint8_t);
+DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(int);
+DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(int64_t);
+DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(float16);
+DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(float);
+DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(double);
+#undef DEFINE_MASKED_KERNEL_LAUNCHER  // the helper macros are only needed for the lists above
+#undef DEFINE_MASKED_GRAD_KERNEL_LAUNCHER
+}  // namespace kernel
+}  // namespace dragon
\ No newline at end of file
--- a/Dragon/src/kernels/array/masked_select_op_kernel.cu
+++ b/Dragon/src/kernels/array/masked_select_op_kernel.cu
+#ifdef WITH_CUDA
+#include "core/tensor.h"
+#include "core/context_cuda.h"
+#include "utils/cast.h"
+#include "utils/op_kernel.h"
+#include "utils/cub_device.h"
+#include "utils/math_functions.h"
+namespace dragon {
+namespace kernel {
+/* <T = ?, Device = CUDA>
+ *
+ * Gather kernel: y[i] = x[indices[i]] for every i in [0, nthreads).
+ * Iteration over i is provided by CUDA_1D_KERNEL_LOOP (defined elsewhere).
+ */
+template <typename T>
+__global__ void _MaskedSelectByIndex(
+    const int               nthreads,
+    const int64_t*          indices,
+    const T*                x,
+    T*                      y) {
+    CUDA_1D_KERNEL_LOOP(i, nthreads) {
+        y[i] = x[indices[i]];
+    }
+}
+template <typename T>  // scatter kernel, the reverse of _MaskedSelectByIndex
+__global__ void _MaskedSelectGrad(
+    const int               nthreads,
+    const int64_t*          indices,
+    const T*                dy,
+    T*                      dx) {
+    CUDA_1D_KERNEL_LOOP(i, nthreads) {
+        dx[indices[i]] = dy[i];  // only indexed positions are written; the rest of dx is not touched here
+    }
+}
+/* Kernel Launchers
+ *
+ * DEFINE_MASKED_KERNEL_LAUNCHER(T) specializes MaskedSelect<T, CUDAContext>:
+ *   - indices is reshaped to count + 1 int64 slots; the first count slots
+ *     receive the selected positions and the extra slot is reinterpreted
+ *     as an int that receives the number of selected items;
+ *   - cub::DeviceSelect::Flagged is called twice over a counting iterator
+ *     starting at 0 with mask as the flags: first with a null storage
+ *     pointer (presumably the usual CUB size query that fills nbytes),
+ *     then with scratch reshaped to nbytes as the temporary storage;
+ *   - after ctx->FinishDeviceCompution() the count is copied to the host
+ *     and indices is reshaped to that length;
+ *   - if y is nullptr only the indices are produced; otherwise y is
+ *     reshaped and filled by _MaskedSelectByIndex.
+ * NOTE(review): the device pointer i is taken before the final
+ * indices->Reshape({ nelements }) and used afterwards; this assumes the
+ * shrinking Reshape does not move the buffer -- confirm against Tensor.
+ */
+#define DEFINE_MASKED_KERNEL_LAUNCHER(T) \
+    template <> void MaskedSelect<T, CUDAContext>( \
+        const int               count, \
+        const uint8_t*          mask, \
+        const T*                x, \
+        Tensor*                 indices, \
+        Tensor*                 scratch, \
+        Tensor*                 y, \
+        CUDAContext*            ctx) { \
+        auto* i = indices \
+            ->Reshape({ count + 1 }) \
+            ->mutable_data<int64_t, CUDAContext>(); \
+        auto* n = (int*)(i + count); \
+        size_t nbytes = 0; int nelements; \
+        cub::CountingInputIterator<int> itr(0); \
+        cub::DeviceSelect::Flagged( \
+            nullptr, nbytes, \
+            itr, mask, i, n, count, \
+            ctx->cuda_stream() \
+        ); \
+        auto* storage = scratch \
+            ->Reshape({ (int64_t)nbytes }) \
+            ->mutable_data<uint8_t, CUDAContext>(); \
+        cub::DeviceSelect::Flagged( \
+            storage, nbytes, \
+            itr, mask, i, n, count, \
+            ctx->cuda_stream() \
+        ); \
+        ctx->FinishDeviceCompution(); \
+        ctx->Memcpy<CPUContext, CUDAContext>( \
+            sizeof(int), &nelements, n); \
+        indices->Reshape({ nelements }); \
+        if (y == nullptr) return; \
+        auto* value = y \
+            ->Reshape({ nelements }) \
+            ->mutable_data<T, CUDAContext>(); \
+        _MaskedSelectByIndex \
+            <<< CUDA_BLOCKS(nelements), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
+            nelements, i, x, value \
+        ); \
+    }
+#define DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(T) /* specializes MaskedSelectGrad<T, CUDAContext> */ \
+    template <> void MaskedSelectGrad<T, CUDAContext>( \
+        const int               count, \
+        const int               num_indices, \
+        const int64_t*          indices, \
+        const T*                dy, \
+        T*                      dx, \
+        CUDAContext*            ctx) { \
+        math::Set(count, cast::to<T>(0.f), dx, ctx); /* zero all count elements of dx first */ \
+        _MaskedSelectGrad /* then scatter: one loop iteration per index, on the context's stream */ \
+            <<< CUDA_BLOCKS(num_indices), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
+            num_indices, indices, dy, dx \
+        ); \
+    }
+DEFINE_MASKED_KERNEL_LAUNCHER(bool);  // one MaskedSelect<T, CUDAContext> specialization per element type
+DEFINE_MASKED_KERNEL_LAUNCHER(int8_t);
+DEFINE_MASKED_KERNEL_LAUNCHER(uint8_t);
+DEFINE_MASKED_KERNEL_LAUNCHER(int);
+DEFINE_MASKED_KERNEL_LAUNCHER(int64_t);
+DEFINE_MASKED_KERNEL_LAUNCHER(float16);
+DEFINE_MASKED_KERNEL_LAUNCHER(float);
+DEFINE_MASKED_KERNEL_LAUNCHER(double);
+DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(bool);  // matching MaskedSelectGrad<T, CUDAContext> specializations
+DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(int8_t);
+DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(uint8_t);
+DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(int);
+DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(int64_t);
+DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(float16);
+DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(float);
+DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(double);
+#undef DEFINE_MASKED_KERNEL_LAUNCHER  // the helper macros are only needed for the lists above
+#undef DEFINE_MASKED_GRAD_KERNEL_LAUNCHER
+}  // namespace kernel
+}  // namespace dragon
+#endif  // WITH_CUDA
\ No newline at end of file
--- a/Dragon/src/kernels/array/non_zero_op_kernel.cc
+++ b/Dragon/src/kernels/array/non_zero_op_kernel.cc
+#include "utils/op_kernel.h"
+namespace dragon {
+namespace kernel {
+#define FIXED_DIVISOR_DIV_MOD(d, n, q, r) /* *q = n / d and *r = n % d */ \
+    do {                                  \
+        const auto n_copy = n;            /* snapshot n: callers pass q pointing at n itself */ \
+        *q = n_copy / d;                  \
+        *r = n_copy % d;                  \
+    } while (0)
+template <> void UnravelIndex<CPUContext>(  // converts each flat index x[i] into ndims coordinates
+    const int               count,
+    const int               ndims,
+    const int*              dims,
+    const int64_t*          x,
+    int64_t*                y,
+    CPUContext*             ctx) {  // ctx is not used in this body
+    int tmp, d; int64_t* Y;  // NOTE(review): x[i] is int64_t but is narrowed into the int tmp below
+    for (int i = 0; i < count; ++i) {
+        tmp = x[i]; Y = y + i * ndims;  // Y is the ndims-wide output row for index i
+        for (d = ndims - 1; d >= 0; --d) {  // peel off the last dimension first
+            FIXED_DIVISOR_DIV_MOD(dims[d], tmp, &tmp, (Y + d));  // Y[d] = tmp % dims[d]; tmp = tmp / dims[d]
+        }
+    }
+}
+#undef FIXED_DIVISOR_DIV_MOD
+}  // namespace kernel
+}  // namespace dragon
\ No newline at end of file
--- a/Dragon/src/kernels/array/non_zero_op_kernel.cu
+++ b/Dragon/src/kernels/array/non_zero_op_kernel.cu
+#ifdef WITH_CUDA
+#include "core/context_cuda.h"
+#include "utils/op_kernel.h"
+namespace dragon {
+namespace kernel {
+#define FIXED_DIVISOR_DIV_MOD(d, n, q, r) /* *q = n / d and *r = n % d */ \
+    do {                                  \
+        const auto n_copy = n;            /* snapshot n: callers pass q pointing at n itself */ \
+        *q = n_copy / d;                  \
+        *r = n_copy % d;                  \
+    } while (0)
+/* <Device = CUDA> (this kernel is not templated on an element type)
+ *
+ * For every i in [0, nthreads), decompose the flat index x[i] into ndims
+ * coordinates and store them in y[i * ndims .. i * ndims + ndims - 1].
+ * Dimensions are processed from last to first: each step stores the
+ * remainder by dims[d] at position d and carries the quotient onward.
+ * dims[d] is read through __ldg when __CUDA_ARCH__ >= 350 and through a
+ * plain load otherwise.
+ * NOTE(review): x[i] is int64_t but is narrowed into the int tmp.
+ */
+__global__ void _UnravelIndex(
+    const int               nthreads,
+    const int               ndims,
+    const int*              dims,
+    const int64_t*          x,
+    int64_t*                y) {
+    CUDA_1D_KERNEL_LOOP(i, nthreads) {
+        int tmp = x[i], d;
+        int64_t* Y = y + i * ndims;
+#pragma unroll
+        for (d = ndims - 1; d >= 0; --d) {
+#if __CUDA_ARCH__ >= 350
+            FIXED_DIVISOR_DIV_MOD(__ldg(dims + d), tmp, &tmp, (Y + d));
+#else
+            FIXED_DIVISOR_DIV_MOD(dims[d], tmp, &tmp, (Y + d));
+#endif
+        }
+    }
+}
+template <> void UnravelIndex<CUDAContext>(  // host-side launcher for the _UnravelIndex kernel
+    const int               count,
+    const int               ndims,
+    const int*              dims,
+    const int64_t*          x,
+    int64_t*                y,
+    CUDAContext*            ctx) {
+    _UnravelIndex  // launched on ctx's stream; the '\' ending the next line only splices two lines
+        <<< CUDA_BLOCKS(count), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>(
+        count, ndims, dims, x, y
+    );
+}
+#undef FIXED_DIVISOR_DIV_MOD
+}  // namespace kernel
+}  // namepsace dragon
+#endif  // WITH_CUDA
\ No newline at end of file
--- a/Dragon/src/kernels/array/where_op_kernel.cc
+++ b/Dragon/src/kernels/array/where_op_kernel.cc
+#include "utils/op_kernel.h"
+#include "utils/omp_alternative.h"
+namespace dragon {
+namespace kernel {
+/* <T = ?, Device = CPU>
+ *
+ * Element-wise select: y[i] = a[i] where mask[i] is non-zero, else b[i],
+ * for i in [0, count). The loop is OpenMP-parallel when WITH_OMP is defined.
+ */
+template <typename T>
+void _Where(
+    const int               count,
+    const uint8_t*          mask,
+    const T*                a,
+    const T*                b,
+    T*                      y) {
+#ifdef WITH_OMP
+    #pragma omp parallel for num_threads(OMP_THREADS(count))
+#endif
+    for (int i = 0; i < count; ++i) {
+        y[i] = mask[i] ? a[i] : b[i];
+    }
+}
+template <typename T>  // gradient of _Where: routes dy[i] to da or db depending on mask[i]
+void _WhereGrad(
+    const int               count,
+    const uint8_t*          mask,
+    const T*                dy,
+    T*                      da,
+    T*                      db) {
+    const T kZero = T(0);  // value written to whichever side was not selected
+#ifdef WITH_OMP
+    #pragma omp parallel for num_threads(OMP_THREADS(count))
+#endif
+    for (int i = 0; i < count; ++i) {
+        const bool dy_to_da = mask[i];  // non-zero mask: the forward pass took a[i]
+        da[i] = dy_to_da ? dy[i] : kZero;
+        db[i] = dy_to_da ? kZero : dy[i];
+    }
+}
+/* Kernel Launchers
+ *
+ * DEFINE_WHERE_KERNEL_LAUNCHER(T) specializes Where<T, CPUContext> as a
+ * direct forward to _Where; the ctx argument is not used by the body.
+ */
+#define DEFINE_WHERE_KERNEL_LAUNCHER(T) \
+    template<> void Where<T, CPUContext>( \
+        const int               count, \
+        const uint8_t*          mask, \
+        const T*                a, \
+        const T*                b, \
+        T*                      y, \
+        CPUContext*             ctx) { \
+        _Where(count, mask, a, b, y); \
+    }
+#define DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(T) /* defines WhereGrad<T, CPUContext> */ \
+    template <> void WhereGrad<T, CPUContext>( \
+        const int               count, \
+        const uint8_t*          mask, \
+        const T*                dy, \
+        T*                      da, \
+        T*                      db, \
+        CPUContext*             ctx) { \
+        _WhereGrad(count, mask, dy, da, db); /* ctx is not referenced on the CPU path */ \
+    }
+DEFINE_WHERE_KERNEL_LAUNCHER(bool);  // Where<T, CPUContext> for each listed T, float16 included
+DEFINE_WHERE_KERNEL_LAUNCHER(int8_t);
+DEFINE_WHERE_KERNEL_LAUNCHER(uint8_t);
+DEFINE_WHERE_KERNEL_LAUNCHER(int);
+DEFINE_WHERE_KERNEL_LAUNCHER(int64_t);
+DEFINE_WHERE_KERNEL_LAUNCHER(float16);
+DEFINE_WHERE_KERNEL_LAUNCHER(float);
+DEFINE_WHERE_KERNEL_LAUNCHER(double);
+DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(bool);  // float16 is absent from this list; WhereGrad<float16, CPUContext> is specialized by hand
+DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(int8_t);
+DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(uint8_t);
+DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(int);
+DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(int64_t);
+DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(float);
+DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(double);
+template <> void WhereGrad<float16, CPUContext>(  // float16 on CPU: no gradient is computed, the body is only the macro below
+    const int               count,
+    const uint8_t*          mask,
+    const float16*          dy,
+    float16*                da,
+    float16*                db,
+    CPUContext*             ctx) {
+    CPU_FP16_NOT_SUPPORTED;  // macro defined elsewhere; presumably reports missing fp16 CPU support - TODO confirm
+}
+#undef DEFINE_WHERE_KERNEL_LAUNCHER
+#undef DEFINE_WHERE_GRAD_KERNEL_LAUNCHER
+}  // namespace kernel
+}  // namespace dragon
\ No newline at end of file
--- a/Dragon/src/kernels/array/where_op_kernel.cu
+++ b/Dragon/src/kernels/array/where_op_kernel.cu
+#ifdef WITH_CUDA
+#include "core/context_cuda.h"
+#include "utils/op_kernel.h"
+namespace dragon {
+namespace kernel {
+/* <T = ?, Device = CUDA>: element-wise select, y[i] = mask[i] ? a[i] : b[i] */
+template<typename T>
+__global__ void _Where(
+    const int               nthreads,
+    const uint8_t*          mask,
+    const T*                a,
+    const T*                b,
+    T*                      y) {
+    CUDA_1D_KERNEL_LOOP(i, nthreads) {  // macro defined elsewhere; presumably visits each i in [0, nthreads) - TODO confirm
+        y[i] = mask[i] ? a[i] : b[i];  // non-zero mask byte picks a[i], zero picks b[i]
+    }
+}
+template <typename T>  // Gradient of _Where: dy[i] goes to da[i] or db[i] depending on mask[i]; the other gets zero.
+__global__ void _WhereGrad(
+    const int               nthreads,
+    const uint8_t*          mask,
+    const T*                dy,
+    T*                      da,
+    T*                      db) {
+    const T kZero = T(0);  // value stored into the branch that was not selected
+    CUDA_1D_KERNEL_LOOP(i, nthreads) {
+        da[i] = mask[i] ? dy[i] : kZero;  // non-zero mask: gradient belongs to a
+        db[i] = mask[i] ? kZero : dy[i];  // zero mask: gradient belongs to b
+    }
+}
+/* <T = float16, Device = CUDA>: dy[i] goes to da[i] or db[i] depending on mask[i]; the other gets zero */
+template<> __global__ void _WhereGrad<half>(
+    const int               nthreads,
+    const uint8_t*          mask,
+    const half*             dy,
+    half*                   da,
+    half*                   db) {
+    // No __CUDA_ARCH__ guard: the body only converts and copies half values
+    // (no half arithmetic or comparison), so da/db are written on every
+    // architecture instead of being silently left untouched below sm_53.
+    const half kZero = __float2half(0.f);
+    CUDA_1D_KERNEL_LOOP(i, nthreads) {
+        const bool dy_to_da = mask[i];  // non-zero mask: the gradient belongs to a
+        da[i] = dy_to_da ? dy[i] : kZero;
+        db[i] = dy_to_da ? kZero : dy[i];
+    }
+}
+/* Kernel Launchers: this macro defines Where<T, CUDAContext>, which launches _Where on ctx->cuda_stream() */
+#define DEFINE_WHERE_KERNEL_LAUNCHER(T) \
+    template<> void Where<T, CUDAContext>( \
+        const int               count, \
+        const uint8_t*          mask, \
+        const T*                a, \
+        const T*                b, \
+        T*                      y, \
+        CUDAContext*            ctx) { \
+        _Where \
+            <<< CUDA_BLOCKS(count), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
+            count, mask, a, b, y \
+        ); \
+    }
+#define DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(T) /* defines WhereGrad<T, CUDAContext> */ \
+    template <> void WhereGrad<T, CUDAContext>( \
+        const int               count, \
+        const uint8_t*          mask, \
+        const T*                dy, \
+        T*                      da, \
+        T*                      db, \
+        CUDAContext*            ctx) { \
+        _WhereGrad \
+            <<< CUDA_BLOCKS(count), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( /* no dynamic shared memory; enqueued on ctx's stream */ \
+            count, mask, dy, da, db \
+        ); \
+    }
+DEFINE_WHERE_KERNEL_LAUNCHER(bool);  // Where<T, CUDAContext> for each listed T, float16 included
+DEFINE_WHERE_KERNEL_LAUNCHER(int8_t);
+DEFINE_WHERE_KERNEL_LAUNCHER(uint8_t);
+DEFINE_WHERE_KERNEL_LAUNCHER(int);
+DEFINE_WHERE_KERNEL_LAUNCHER(int64_t);
+DEFINE_WHERE_KERNEL_LAUNCHER(float16);
+DEFINE_WHERE_KERNEL_LAUNCHER(float);
+DEFINE_WHERE_KERNEL_LAUNCHER(double);
+DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(bool);  // float16 is absent from this list; WhereGrad<float16, CUDAContext> is specialized by hand
+DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(int8_t);
+DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(uint8_t);
+DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(int);
+DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(int64_t);
+DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(float);
+DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(double);
+template <> void WhereGrad<float16, CUDAContext>(  // float16 launcher: passes half pointers so _WhereGrad<half> is selected
+    const int               count,
+    const uint8_t*          mask,
+    const float16*          dy,
+    float16*                da,
+    float16*                db,
+    CUDAContext*            ctx) {
+    _WhereGrad
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(  // no dynamic shared memory; enqueued on ctx's stream
+        count,
+        mask,
+        reinterpret_cast<const half*>(dy),  // assumes float16 is layout-compatible with CUDA half - TODO confirm
+        reinterpret_cast<half*>(da),
+        reinterpret_cast<half*>(db)
+    );
+}
+#undef DEFINE_WHERE_KERNEL_LAUNCHER
+#undef DEFINE_WHERE_GRAD_KERNEL_LAUNCHER
+}  // namespace kernel
+}  // namespace dragon
+#endif  // WITH_CUDA
\ No newline at end of file
--- a/Dragon/src/kernels/control_flow/compare_op_kernel.cc
+++ b/Dragon/src/kernels/control_flow/compare_op_kernel.cc
 #include "utils/op_kernel.h"
-#include "utils/omp_alternative.h"
+#include "utils/eigen_utils.h"
 namespace dragon {
 namespace kernel {
-/* <T = ?, Device = CPU> */
+/* Kernel Launchers */
-template <typename T>
-void _EqualInteger(
-    const int               count,
-    const T*                a,
-    const T*                b,
-    bool*                   y) {
-#ifdef WITH_OMP
-    #pragma omp parallel for num_threads(OMP_THREADS(count))
-#endif
-    for (int i = 0; i < count; ++i) {
-        y[i] = a[i] == b[i] ? true : false;
-    }
-}
-template <typename T>
-void _EqualFloat(
-    const int               count,
-    const T*                a,
-    const T*                b,
-    bool*                   y) {
-#ifdef WITH_OMP
-    #pragma omp parallel for num_threads(OMP_THREADS(count))
-#endif
-    for (int i = 0; i < count; ++i) {
-        y[i] = fabs(a[i] - b[i]) < 1e-15 ? true : false;
-    }
-}
-/* <T = ?, Device = CPU> */
-template <typename T>
-void _Less(
-    const int               count,
-    const T*                a,
-    const T*                b,
-    bool*                   y) {
-#ifdef WITH_OMP
-    #pragma omp parallel for num_threads(OMP_THREADS(count))
-#endif
-    for (int i = 0; i < count; ++i) {
-        y[i] = a[i] < b[i] ? true : false;
-    }
-}
-/* <T = ?, Device = CPU> */
-template <typename T>
-void _LessEqual(
-    const int               count,
-    const T*                a,
-    const T*                b,
-    bool*                   y) {
-#ifdef WITH_OMP
-#pragma omp parallel for num_threads(OMP_THREADS(count))
-#endif
-    for (int i = 0; i < count; ++i) {
-        y[i] = a[i] <= b[i] ? true : false;
-    }
-}
-/* <T = ?, Device = CPU> */
-template <typename T>
-void _Greater(
-    const int               count,
-    const T*                a,
-    const T*                b,
-    bool*                   y) {
-#ifdef WITH_OMP
-    #pragma omp parallel for num_threads(OMP_THREADS(count))
-#endif
-    for (int i = 0; i < count; ++i) {
-        y[i] = a[i] > b[i] ? true : false;
-    }
-}
-/* <T = ?, Device = CPU> */
-template <typename T>
+#define DEFINE_NOTZERO_KERNEL_LAUNCHER(T) \
-void _GreaterEqual(
+    template <> void NotZero<T, CPUContext>( \
-    const int               count,
+        const int               count, \
-    const T*                a,
+        const T*                x, \
-    const T*                b,
+        bool*                   y, \
-    bool*                   y) {
+        CPUContext*             ctx) { \
-#ifdef WITH_OMP
+        EigenVectorArrayMap<bool>(y, count) = \
-#pragma omp parallel for num_threads(OMP_THREADS(count))
+            ConstEigenVectorArrayMap<T>(x, count) != T(0); \
-#endif
-    for (int i = 0; i < count; ++i) {
-        y[i] = a[i] >= b[i] ? true : false;
    }
-}
-/* Kernel Launchers */
-#define DEFINE_COMPARE_WARPPER(T, OP, IMPL) \
+#define DEFINE_COMPARE_KERNEL_LAUNCHER(T, OP, expr) \
    template <> void OP<T, CPUContext>( \
        const int               count, \
        const T*                a, \
        const T*                b, \
        bool*                   y, \
        CPUContext*             ctx) { \
-        IMPL(count, a, b, y); \
+        EigenVectorArrayMap<bool>(y, count) = \
+            ConstEigenVectorArrayMap<T>(a, count) expr \
+                ConstEigenVectorArrayMap<T>(b, count); \
    }
-DEFINE_COMPARE_WARPPER(bool, Equal, _EqualInteger);
+DEFINE_NOTZERO_KERNEL_LAUNCHER(bool);
-DEFINE_COMPARE_WARPPER(int8_t, Equal, _EqualInteger);
+DEFINE_NOTZERO_KERNEL_LAUNCHER(int8_t);
-DEFINE_COMPARE_WARPPER(uint8_t, Equal, _EqualInteger);
+DEFINE_NOTZERO_KERNEL_LAUNCHER(uint8_t);
-DEFINE_COMPARE_WARPPER(int, Equal, _EqualInteger);
+DEFINE_NOTZERO_KERNEL_LAUNCHER(int);
-DEFINE_COMPARE_WARPPER(int64_t, Equal, _EqualInteger);
+DEFINE_NOTZERO_KERNEL_LAUNCHER(int64_t);
-DEFINE_COMPARE_WARPPER(float, Equal, _EqualFloat);
+DEFINE_NOTZERO_KERNEL_LAUNCHER(float);
-DEFINE_COMPARE_WARPPER(double, Equal, _EqualFloat);
+DEFINE_NOTZERO_KERNEL_LAUNCHER(double);
-DEFINE_COMPARE_WARPPER(bool, Less, _Less);
+DEFINE_COMPARE_KERNEL_LAUNCHER(bool, Equal, ==);
-DEFINE_COMPARE_WARPPER(int8_t, Less, _Less);
+DEFINE_COMPARE_KERNEL_LAUNCHER(int8_t, Equal, ==);
-DEFINE_COMPARE_WARPPER(uint8_t, Less, _Less);
+DEFINE_COMPARE_KERNEL_LAUNCHER(uint8_t, Equal, ==);
-DEFINE_COMPARE_WARPPER(int, Less, _Less);
+DEFINE_COMPARE_KERNEL_LAUNCHER(int, Equal, ==);
-DEFINE_COMPARE_WARPPER(int64_t, Less, _Less);
+DEFINE_COMPARE_KERNEL_LAUNCHER(int64_t, Equal, ==);
-DEFINE_COMPARE_WARPPER(float, Less, _Less);
+DEFINE_COMPARE_KERNEL_LAUNCHER(float, Equal, ==);
-DEFINE_COMPARE_WARPPER(double, Less, _Less);
+DEFINE_COMPARE_KERNEL_LAUNCHER(double, Equal, ==);
-DEFINE_COMPARE_WARPPER(bool, LessEqual, _LessEqual);
+DEFINE_COMPARE_KERNEL_LAUNCHER(bool, NotEqual, !=);
-DEFINE_COMPARE_WARPPER(int8_t, LessEqual, _LessEqual);
+DEFINE_COMPARE_KERNEL_LAUNCHER(int8_t, NotEqual, !=);
-DEFINE_COMPARE_WARPPER(uint8_t, LessEqual, _LessEqual);
+DEFINE_COMPARE_KERNEL_LAUNCHER(uint8_t, NotEqual, !=);
-DEFINE_COMPARE_WARPPER(int, LessEqual, _LessEqual);
+DEFINE_COMPARE_KERNEL_LAUNCHER(int, NotEqual, !=);
-DEFINE_COMPARE_WARPPER(int64_t, LessEqual, _LessEqual);
+DEFINE_COMPARE_KERNEL_LAUNCHER(int64_t, NotEqual, !=);
-DEFINE_COMPARE_WARPPER(float, LessEqual, _LessEqual);
+DEFINE_COMPARE_KERNEL_LAUNCHER(float, NotEqual, != );
-DEFINE_COMPARE_WARPPER(double, LessEqual, _LessEqual);
+DEFINE_COMPARE_KERNEL_LAUNCHER(double, NotEqual, !=);
-DEFINE_COMPARE_WARPPER(bool, Greater, _Greater);
+DEFINE_COMPARE_KERNEL_LAUNCHER(bool, Less, <);
-DEFINE_COMPARE_WARPPER(int8_t, Greater, _Greater);
+DEFINE_COMPARE_KERNEL_LAUNCHER(int8_t, Less, <);
-DEFINE_COMPARE_WARPPER(uint8_t, Greater, _Greater);
+DEFINE_COMPARE_KERNEL_LAUNCHER(uint8_t, Less, <);
-DEFINE_COMPARE_WARPPER(int, Greater, _Greater);
+DEFINE_COMPARE_KERNEL_LAUNCHER(int, Less, <);
-DEFINE_COMPARE_WARPPER(int64_t, Greater, _Greater);
+DEFINE_COMPARE_KERNEL_LAUNCHER(int64_t, Less, <);
-DEFINE_COMPARE_WARPPER(float, Greater, _Greater);
+DEFINE_COMPARE_KERNEL_LAUNCHER(float, Less, <);
-DEFINE_COMPARE_WARPPER(double, Greater, _Greater);
+DEFINE_COMPARE_KERNEL_LAUNCHER(double, Less, <);
-DEFINE_COMPARE_WARPPER(bool, GreaterEqual, _GreaterEqual);
+DEFINE_COMPARE_KERNEL_LAUNCHER(bool, LessEqual, <=);
-DEFINE_COMPARE_WARPPER(int8_t, GreaterEqual, _GreaterEqual);
+DEFINE_COMPARE_KERNEL_LAUNCHER(int8_t, LessEqual, <=);
-DEFINE_COMPARE_WARPPER(uint8_t, GreaterEqual, _GreaterEqual);
+DEFINE_COMPARE_KERNEL_LAUNCHER(uint8_t, LessEqual, <=);
-DEFINE_COMPARE_WARPPER(int, GreaterEqual, _GreaterEqual);
+DEFINE_COMPARE_KERNEL_LAUNCHER(int, LessEqual, <=);
-DEFINE_COMPARE_WARPPER(int64_t, GreaterEqual, _GreaterEqual);
+DEFINE_COMPARE_KERNEL_LAUNCHER(int64_t, LessEqual, <=);
-DEFINE_COMPARE_WARPPER(float, GreaterEqual, _GreaterEqual);
+DEFINE_COMPARE_KERNEL_LAUNCHER(float, LessEqual, <=);
-DEFINE_COMPARE_WARPPER(double, GreaterEqual, _GreaterEqual);
+DEFINE_COMPARE_KERNEL_LAUNCHER(double, LessEqual, <=);
+DEFINE_COMPARE_KERNEL_LAUNCHER(bool, Greater, >);
+DEFINE_COMPARE_KERNEL_LAUNCHER(int8_t, Greater, >);
+DEFINE_COMPARE_KERNEL_LAUNCHER(uint8_t, Greater, >);
+DEFINE_COMPARE_KERNEL_LAUNCHER(int, Greater, >);
+DEFINE_COMPARE_KERNEL_LAUNCHER(int64_t, Greater, >);
+DEFINE_COMPARE_KERNEL_LAUNCHER(float, Greater, >);
+DEFINE_COMPARE_KERNEL_LAUNCHER(double, Greater, >);
+DEFINE_COMPARE_KERNEL_LAUNCHER(bool, GreaterEqual, >=);
+DEFINE_COMPARE_KERNEL_LAUNCHER(int8_t, GreaterEqual, >=);
+DEFINE_COMPARE_KERNEL_LAUNCHER(uint8_t, GreaterEqual, >=);
+DEFINE_COMPARE_KERNEL_LAUNCHER(int, GreaterEqual, >=);
+DEFINE_COMPARE_KERNEL_LAUNCHER(int64_t, GreaterEqual, >=);
+DEFINE_COMPARE_KERNEL_LAUNCHER(float, GreaterEqual, >=);
+DEFINE_COMPARE_KERNEL_LAUNCHER(double, GreaterEqual, >=);
+template <> void NotZero<float16, CPUContext>(  // float16 on CPU: nothing is computed, the body is only the macro below
+    const int               count,
+    const float16*          x,
+    bool*                   y,
+    CPUContext* ctx) {
+    CPU_FP16_NOT_SUPPORTED;  // macro defined elsewhere; presumably reports missing fp16 CPU support - TODO confirm
+}
 template <> void Equal<float16, CPUContext>(
    const int               count,
@@ -160,6 +102,15 @@ template <> void Equal<float16, CPUContext>(
    CPU_FP16_NOT_SUPPORTED;
 }
+template <> void NotEqual<float16, CPUContext>(  // float16 on CPU: nothing is computed, the body is only the macro below
+    const int               count,
+    const float16*          a,
+    const float16*          b,
+    bool*                   y,
+    CPUContext*             ctx) {
+    CPU_FP16_NOT_SUPPORTED;  // macro defined elsewhere; presumably reports missing fp16 CPU support - TODO confirm
+}
 template <> void Less<float16, CPUContext>(
    const int               count,
    const float16*          a,
@@ -196,7 +147,8 @@ template <> void GreaterEqual<float16, CPUContext>(
    CPU_FP16_NOT_SUPPORTED;
 }
-#undef DEFINE_COMPARE_WARPPER
+#undef DEFINE_NOTZERO_KERNEL_LAUNCHER
+#undef DEFINE_COMPARE_KERNEL_LAUNCHER
 }  // namespace kernel

--- a/Dragon/src/kernels/control_flow/compare_op_kernel.cu
+++ b/Dragon/src/kernels/control_flow/compare_op_kernel.cu
@@ -10,6 +10,31 @@ namespace kernel {
 /* <T = ?, Device = CUDA> */
 template <typename T>
+__global__ void _NotZero(  // y[i] = (x[i] != T(0)) for every index the loop visits
+    const int               nthreads,
+    const T*                x,
+    bool*                   y) {
+    const T kZero = T(0);  // comparison constant built once, outside the loop
+    CUDA_1D_KERNEL_LOOP(i, nthreads) {
+        y[i] = x[i] != kZero ? true : false;
+    }
+}
+template<> __global__ void _NotZero<half>(  // half specialization: compares against zero with the __hne intrinsic
+    const int               nthreads,
+    const half*             x,
+    bool*                   y) {
+#if __CUDA_ARCH__ >= 530  // below this architecture the kernel body is empty and y is not written
+    const half kZero = __float2half(0.f);
+    CUDA_1D_KERNEL_LOOP(i, nthreads) {
+        y[i] = __hne(x[i], kZero) ? true : false;
+    }
+#endif
+}
+/* <T = ?, Device = CUDA> */
+template <typename T>
 __global__ void _EqualInteger(
    const int               nthreads,
    const T*                a,
@@ -46,6 +71,42 @@ __global__ void _EqualFloat(
 /* <T = ?, Device = CUDA> */
 template <typename T>
+__global__ void _NotEqualInteger(  // exact element-wise test: y[i] = (a[i] != b[i])
+    const int               nthreads,
+    const T*                a,
+    const T*                b,
+    bool*                   y) {
+    CUDA_1D_KERNEL_LOOP(i, nthreads) {
+        y[i] = a[i] != b[i] ? true : false;
+    }
+}
+__global__ void _NotEqualHalf(  // half-precision y[i] = (a[i] != b[i]) via the __hne intrinsic
+    const int               nthreads,
+    const half*             a,
+    const half*             b,
+    bool*                   y) {
+    CUDA_1D_KERNEL_LOOP(i, nthreads) {
+#if __CUDA_ARCH__ >= 530  // below this architecture the loop body is empty and y is not written
+        y[i] = __hne(a[i], b[i]) ? true : false;
+#endif
+    }
+}
+template <typename T>  // Floating-point "a != b" under a 1e-15 tolerance; NaN operands report not-equal.
+__global__ void _NotEqualFloat(
+    const int               nthreads,
+    const T*                a,
+    const T*                b,
+    bool*                   y) {
+    CUDA_1D_KERNEL_LOOP(i, nthreads) {
+        y[i] = !(fabs(a[i] - b[i]) < 1e-15);  // negated closeness test, so NaN and a gap of exactly 1e-15 yield true
+    }
+}
+/* <T = ?, Device = CUDA> */
+template <typename T>
 __global__ void _Less(
    const int               nthreads,
    const T*                a,
@@ -145,7 +206,20 @@ __global__ void _GreaterEqualHalf(
 /* Kernel Launchers */
-#define DEFINE_COMPARE_WARPPER(T, OP, IMPL) \
+#define DEFINE_NOTZERO_KERNEL_LAUNCHER(T) /* defines NotZero<T, CUDAContext> */ \
+    template <> void NotZero<T, CUDAContext>( \
+        const int               count, \
+        const T*                x, \
+        bool*                   y, \
+        CUDAContext*            ctx) { \
+        _NotZero \
+            <<< CUDA_BLOCKS(count), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( /* no dynamic shared memory; enqueued on ctx's stream */ \
+            count, x, y \
+        ); \
+    }
+#define DEFINE_COMPARE_KERNEL_LAUNCHER(T, OP, IMPL) \
    template <> void OP<T, CUDAContext>( \
        const int               count, \
        const T*                a, \
@@ -159,7 +233,7 @@ __global__ void _GreaterEqualHalf(
        ); \
    }
-#define DEFINE_COMPARE_FP16_WARPPER(OP) \
+#define DEFINE_COMPARE_FP16_KERNEL_LAUNCHER(OP) \
    template <> void OP<float16, CUDAContext>( \
        const int               count, \
        const float16*          a, \
@@ -176,53 +250,85 @@ __global__ void _GreaterEqualHalf(
        ); \
    }
-DEFINE_COMPARE_WARPPER(bool, Equal, _EqualInteger);
+DEFINE_NOTZERO_KERNEL_LAUNCHER(bool);
-DEFINE_COMPARE_WARPPER(int8_t, Equal, _EqualInteger);
+DEFINE_NOTZERO_KERNEL_LAUNCHER(int8_t);
-DEFINE_COMPARE_WARPPER(uint8_t, Equal, _EqualInteger);
+DEFINE_NOTZERO_KERNEL_LAUNCHER(uint8_t);
-DEFINE_COMPARE_WARPPER(int, Equal, _EqualInteger);
+DEFINE_NOTZERO_KERNEL_LAUNCHER(int);
-DEFINE_COMPARE_WARPPER(int64_t, Equal, _EqualInteger);
+DEFINE_NOTZERO_KERNEL_LAUNCHER(int64_t);
-DEFINE_COMPARE_WARPPER(float, Equal, _EqualFloat);
+DEFINE_NOTZERO_KERNEL_LAUNCHER(float);
-DEFINE_COMPARE_WARPPER(double, Equal, _EqualFloat);
+DEFINE_NOTZERO_KERNEL_LAUNCHER(double);
-DEFINE_COMPARE_FP16_WARPPER(Equal);
+DEFINE_COMPARE_KERNEL_LAUNCHER(bool, Equal, _EqualInteger);
-DEFINE_COMPARE_WARPPER(bool, Less, _Less);
+DEFINE_COMPARE_KERNEL_LAUNCHER(int8_t, Equal, _EqualInteger);
-DEFINE_COMPARE_WARPPER(int8_t, Less, _Less);
+DEFINE_COMPARE_KERNEL_LAUNCHER(uint8_t, Equal, _EqualInteger);
-DEFINE_COMPARE_WARPPER(uint8_t, Less, _Less);
+DEFINE_COMPARE_KERNEL_LAUNCHER(int, Equal, _EqualInteger);
-DEFINE_COMPARE_WARPPER(int, Less, _Less);
+DEFINE_COMPARE_KERNEL_LAUNCHER(int64_t, Equal, _EqualInteger);
-DEFINE_COMPARE_WARPPER(int64_t, Less, _Less);
+DEFINE_COMPARE_KERNEL_LAUNCHER(float, Equal, _EqualFloat);
-DEFINE_COMPARE_WARPPER(float, Less, _Less);
+DEFINE_COMPARE_KERNEL_LAUNCHER(double, Equal, _EqualFloat);
-DEFINE_COMPARE_WARPPER(double, Less, _Less);
+DEFINE_COMPARE_FP16_KERNEL_LAUNCHER(Equal);
-DEFINE_COMPARE_FP16_WARPPER(Less);
+DEFINE_COMPARE_KERNEL_LAUNCHER(bool, NotEqual, _NotEqualInteger);
-DEFINE_COMPARE_WARPPER(bool, LessEqual, _LessEqual);
+DEFINE_COMPARE_KERNEL_LAUNCHER(int8_t, NotEqual, _NotEqualInteger);
-DEFINE_COMPARE_WARPPER(int8_t, LessEqual, _LessEqual);
+DEFINE_COMPARE_KERNEL_LAUNCHER(uint8_t, NotEqual, _NotEqualInteger);
-DEFINE_COMPARE_WARPPER(uint8_t, LessEqual, _LessEqual);
+DEFINE_COMPARE_KERNEL_LAUNCHER(int, NotEqual, _NotEqualInteger);
-DEFINE_COMPARE_WARPPER(int, LessEqual, _LessEqual);
+DEFINE_COMPARE_KERNEL_LAUNCHER(int64_t, NotEqual, _NotEqualInteger);
-DEFINE_COMPARE_WARPPER(int64_t, LessEqual, _LessEqual);
+DEFINE_COMPARE_KERNEL_LAUNCHER(float, NotEqual, _NotEqualFloat);
-DEFINE_COMPARE_WARPPER(float, LessEqual, _LessEqual);
+DEFINE_COMPARE_KERNEL_LAUNCHER(double, NotEqual, _NotEqualFloat);
-DEFINE_COMPARE_WARPPER(double, LessEqual, _LessEqual);
+DEFINE_COMPARE_FP16_KERNEL_LAUNCHER(NotEqual);
-DEFINE_COMPARE_FP16_WARPPER(LessEqual);
+DEFINE_COMPARE_KERNEL_LAUNCHER(bool, Less, _Less);
-DEFINE_COMPARE_WARPPER(bool, Greater, _Greater);
+DEFINE_COMPARE_KERNEL_LAUNCHER(int8_t, Less, _Less);
-DEFINE_COMPARE_WARPPER(int8_t, Greater, _Greater);
+DEFINE_COMPARE_KERNEL_LAUNCHER(uint8_t, Less, _Less);
-DEFINE_COMPARE_WARPPER(uint8_t, Greater, _Greater);
+DEFINE_COMPARE_KERNEL_LAUNCHER(int, Less, _Less);
-DEFINE_COMPARE_WARPPER(int, Greater, _Greater);
+DEFINE_COMPARE_KERNEL_LAUNCHER(int64_t, Less, _Less);
-DEFINE_COMPARE_WARPPER(int64_t, Greater, _Greater);
+DEFINE_COMPARE_KERNEL_LAUNCHER(float, Less, _Less);
-DEFINE_COMPARE_WARPPER(float, Greater, _Greater);
+DEFINE_COMPARE_KERNEL_LAUNCHER(double, Less, _Less);
-DEFINE_COMPARE_WARPPER(double, Greater, _Greater);
+DEFINE_COMPARE_FP16_KERNEL_LAUNCHER(Less);
-DEFINE_COMPARE_FP16_WARPPER(Greater);
+DEFINE_COMPARE_KERNEL_LAUNCHER(bool, LessEqual, _LessEqual);
-DEFINE_COMPARE_WARPPER(bool, GreaterEqual, _GreaterEqual);
+DEFINE_COMPARE_KERNEL_LAUNCHER(int8_t, LessEqual, _LessEqual);
-DEFINE_COMPARE_WARPPER(int8_t, GreaterEqual, _GreaterEqual);
+DEFINE_COMPARE_KERNEL_LAUNCHER(uint8_t, LessEqual, _LessEqual);
-DEFINE_COMPARE_WARPPER(uint8_t, GreaterEqual, _GreaterEqual);
+DEFINE_COMPARE_KERNEL_LAUNCHER(int, LessEqual, _LessEqual);
-DEFINE_COMPARE_WARPPER(int, GreaterEqual, _GreaterEqual);
+DEFINE_COMPARE_KERNEL_LAUNCHER(int64_t, LessEqual, _LessEqual);
-DEFINE_COMPARE_WARPPER(int64_t, GreaterEqual, _GreaterEqual);
+DEFINE_COMPARE_KERNEL_LAUNCHER(float, LessEqual, _LessEqual);
-DEFINE_COMPARE_WARPPER(float, GreaterEqual, _GreaterEqual);
+DEFINE_COMPARE_KERNEL_LAUNCHER(double, LessEqual, _LessEqual);
-DEFINE_COMPARE_WARPPER(double, GreaterEqual, _GreaterEqual);
+DEFINE_COMPARE_FP16_KERNEL_LAUNCHER(LessEqual);
-DEFINE_COMPARE_FP16_WARPPER(GreaterEqual);
+DEFINE_COMPARE_KERNEL_LAUNCHER(bool, Greater, _Greater);
-#undef DEFINE_COMPARE_WARPPER
+DEFINE_COMPARE_KERNEL_LAUNCHER(int8_t, Greater, _Greater);
-#undef DEFINE_COMPARE_FP16_WARPPER
+DEFINE_COMPARE_KERNEL_LAUNCHER(uint8_t, Greater, _Greater);
+DEFINE_COMPARE_KERNEL_LAUNCHER(int, Greater, _Greater);
+DEFINE_COMPARE_KERNEL_LAUNCHER(int64_t, Greater, _Greater);
+DEFINE_COMPARE_KERNEL_LAUNCHER(float, Greater, _Greater);
+DEFINE_COMPARE_KERNEL_LAUNCHER(double, Greater, _Greater);
+DEFINE_COMPARE_FP16_KERNEL_LAUNCHER(Greater);
+DEFINE_COMPARE_KERNEL_LAUNCHER(bool, GreaterEqual, _GreaterEqual);
+DEFINE_COMPARE_KERNEL_LAUNCHER(int8_t, GreaterEqual, _GreaterEqual);
+DEFINE_COMPARE_KERNEL_LAUNCHER(uint8_t, GreaterEqual, _GreaterEqual);
+DEFINE_COMPARE_KERNEL_LAUNCHER(int, GreaterEqual, _GreaterEqual);
+DEFINE_COMPARE_KERNEL_LAUNCHER(int64_t, GreaterEqual, _GreaterEqual);
+DEFINE_COMPARE_KERNEL_LAUNCHER(float, GreaterEqual, _GreaterEqual);
+DEFINE_COMPARE_KERNEL_LAUNCHER(double, GreaterEqual, _GreaterEqual);
+DEFINE_COMPARE_FP16_KERNEL_LAUNCHER(GreaterEqual);
+template <> void NotZero<float16, CUDAContext>(  // float16 launcher: passes a half pointer so _NotZero<half> is selected
+    const int               count,
+    const float16*          x,
+    bool*                   y,
+    CUDAContext*            ctx) {
+    _NotZero
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(  // no dynamic shared memory; enqueued on ctx's stream
+        count,
+        reinterpret_cast<const half*>(x),  // assumes float16 is layout-compatible with CUDA half - TODO confirm
+        y
+    );
+}
+#undef DEFINE_NOTZERO_KERNEL_LAUNCHER
+#undef DEFINE_COMPARE_KERNEL_LAUNCHER
+#undef DEFINE_COMPARE_FP16_KERNEL_LAUNCHER
 }  // namespace kernel

--- a/Dragon/src/kernels/control_flow/masked_assign_op_kernel.cc
+++ b/Dragon/src/kernels/control_flow/masked_assign_op_kernel.cc
-#include "utils/op_kernel.h"
-#include "utils/math_utils.h"
-#include "utils/omp_alternative.h"
-namespace dragon {
-namespace kernel {
-/* <T = ?, Device = CPU> */
-template <typename T>
-void _MaskedAssign(
-    const int               count,
-    const uint8_t*          mask,
-    const T*                x,
-    T*                      y) {
-#ifdef WITH_OMP
-    #pragma omp parallel for num_threads(OMP_THREADS(count))
-#endif
-    for (int i = 0; i < count; ++i) {
-        y[i] = mask[i] ? x[i] : y[i];
-    }
-}
-/* Kernel Launchers */
-#define DEFINE_ASSIGN_KERNEL_LAUNCHER(T) \
-    template<> void MaskedAssign<T, CPUContext>( \
-        const int               count, \
-        const uint8_t*          mask, \
-        const T*                x, \
-        T*                      y, \
-        CPUContext*             ctx) { \
-        _MaskedAssign(count, mask, x, y); \
-    }
-DEFINE_ASSIGN_KERNEL_LAUNCHER(bool);
-DEFINE_ASSIGN_KERNEL_LAUNCHER(int8_t);
-DEFINE_ASSIGN_KERNEL_LAUNCHER(uint8_t);
-DEFINE_ASSIGN_KERNEL_LAUNCHER(int);
-DEFINE_ASSIGN_KERNEL_LAUNCHER(int64_t);
-DEFINE_ASSIGN_KERNEL_LAUNCHER(float16);
-DEFINE_ASSIGN_KERNEL_LAUNCHER(float);
-DEFINE_ASSIGN_KERNEL_LAUNCHER(double);
-#undef DEFINE_ASSIGN_KERNEL_LAUNCHER
-}  // namespace kernel
-}  // namepsace dragon
\ No newline at end of file
--- a/Dragon/src/kernels/control_flow/masked_assign_op_kernel.cu
+++ b/Dragon/src/kernels/control_flow/masked_assign_op_kernel.cu
-#ifdef WITH_CUDA
-#include "core/context_cuda.h"
-#include "utils/op_kernel.h"
-namespace dragon {
-namespace kernel {
-/* <T = ?, Device = CUDA> */
-template<typename T>
-__global__ void _MaskedAssign(
-    const int               nthreads,
-    const uint8_t*          mask,
-    const T*                x,
-    T*                      y) {
-    CUDA_1D_KERNEL_LOOP(i, nthreads) {
-        y[i] = mask[i] ? x[i] : y[i];
-    }
-}
-/* Kernel Launchers */
-#define DEFINE_ASSIGN_KERNEL_LAUNCHER(T) \
-    template<> void MaskedAssign<T, CUDAContext>( \
-        const int               count, \
-        const uint8_t*          mask, \
-        const T*                x, \
-        T*                      y, \
-        CUDAContext*            ctx) { \
-        _MaskedAssign \
-            <<< CUDA_BLOCKS(count), CUDA_THREADS, \
-                0, ctx->cuda_stream() >>>( \
-            count, mask, x, y \
-        ); \
-    }
-DEFINE_ASSIGN_KERNEL_LAUNCHER(bool);
-DEFINE_ASSIGN_KERNEL_LAUNCHER(int8_t);
-DEFINE_ASSIGN_KERNEL_LAUNCHER(uint8_t);
-DEFINE_ASSIGN_KERNEL_LAUNCHER(int);
-DEFINE_ASSIGN_KERNEL_LAUNCHER(int64_t);
-DEFINE_ASSIGN_KERNEL_LAUNCHER(float16);
-DEFINE_ASSIGN_KERNEL_LAUNCHER(float);
-DEFINE_ASSIGN_KERNEL_LAUNCHER(double);
-#undef DEFINE_ASSIGN_KERNEL_LAUNCHER
-}  // namespace kernel
-}  // namepsace dragon
-#endif  // WITH_CUDA
\ No newline at end of file
--- a/Dragon/src/kernels/loss/l1_loss_op_kernel.cc
+++ b/Dragon/src/kernels/loss/l1_loss_op_kernel.cc
@@ -5,7 +5,7 @@ namespace dragon {
 namespace kernel {
-/*! AbsGrad <T = float32, Device = CPU> */
+/* <T = float32, Device = CPU> */
 template<> void AbsGrad<float, CPUContext>(
    const int               count,

--- a/Dragon/src/kernels/loss/l1_loss_op_kernel.cu
+++ b/Dragon/src/kernels/loss/l1_loss_op_kernel.cu
@@ -7,7 +7,7 @@ namespace dragon {
 namespace kernel {
-/*! AbsGrad <T = float32, Device = CUDA> */
+/* <T = float32, Device = CUDA> */
 template <typename T>
 __global__ void _AbsGrad(

--- a/Dragon/src/kernels/loss/nll_loss_op_kernel.cc
+++ b/Dragon/src/kernels/loss/nll_loss_op_kernel.cc
@@ -5,7 +5,7 @@ namespace dragon {
 namespace kernel {
-/*! <Tx = ?, Ty = ?, Device = CPU> */
+/* <Tx = ?, Ty = ?, Device = CPU> */
 template <typename Tx, typename Ty>
 void _NLLLoss(
@@ -39,7 +39,7 @@ void _NLLLoss(
    }
 }
-/*! <Tx = float32, Ty = float32, Device = CPU> */
+/* <Tx = float32, Ty = float32, Device = CPU> */
 template <> void NLLLoss<float, float, CPUContext>(
    const int               outer_dim,
@@ -58,7 +58,7 @@ template <> void NLLLoss<float, float, CPUContext>(
    );
 }
-/*! <Tx = float32, Ty = int64, Device = CPU> */
+/* <Tx = float32, Ty = int64, Device = CPU> */
 template <> void NLLLoss<float, int64_t, CPUContext>(
    const int               outer_dim,
@@ -77,7 +77,7 @@ template <> void NLLLoss<float, int64_t, CPUContext>(
    );
 }
-/*! <Tx = ?, Ty = ?, Device = CPU> */
+/* <Tx = ?, Ty = ?, Device = CPU> */
 template <typename Tx, typename Ty>
 void _NLLLossGrad(
@@ -108,7 +108,7 @@ void _NLLLossGrad(
    }
 }
-/*! <Tx = float32, Ty = float32, Device = CPU> */
+/* <Tx = float32, Ty = float32, Device = CPU> */
 template<> void NLLLossGrad<float, float, CPUContext>(
    const int               outer_dim,
@@ -127,7 +127,7 @@ template<> void NLLLossGrad<float, float, CPUContext>(
    );
 }
-/*! <Tx = float32, Ty = int64, Device = CPU> */
+/* <Tx = float32, Ty = int64, Device = CPU> */
 template<> void NLLLossGrad<float, int64_t, CPUContext>(
    const int               outer_dim,

--- a/Dragon/src/kernels/loss/nll_loss_op_kernel.cu
+++ b/Dragon/src/kernels/loss/nll_loss_op_kernel.cu
@@ -7,7 +7,7 @@ namespace dragon {
 namespace kernel {
-/*! <Tx = float32, Ty = ?, Device = CUDA> */
+/* <Tx = float32, Ty = ?, Device = CUDA> */
 template <typename Tx, typename Ty>
 __global__ void _NLLLoss(
@@ -40,7 +40,7 @@ __global__ void _NLLLoss(
    }
 }
-/*! <Tx = float32, Ty = float32, Device = CUDA> */
+/* <Tx = float32, Ty = float32, Device = CUDA> */
 template <> void NLLLoss<float, float, CUDAContext>(
    const int               outer_dim,
@@ -62,7 +62,7 @@ template <> void NLLLoss<float, float, CUDAContext>(
     );
 }
-/*! <Tx = float32, Ty = int64, Device = CUDA> */
+/* <Tx = float32, Ty = int64, Device = CUDA> */
 template <> void NLLLoss<float, int64_t, CUDAContext>(
    const int               outer_dim,
@@ -84,7 +84,7 @@ template <> void NLLLoss<float, int64_t, CUDAContext>(
    );
 }
-/*! <Tx = ?, Ty = ?, Device = CUDA> */
+/* <Tx = ?, Ty = ?, Device = CUDA> */
 template <typename Tx, typename Ty>
 __global__ void _NLLLossGrad(
@@ -114,7 +114,7 @@ __global__ void _NLLLossGrad(
    }
 }
-/*! <Tx = float32, Ty = float32, Device = CUDA> */
+/* <Tx = float32, Ty = float32, Device = CUDA> */
 template<> void NLLLossGrad<float, float, CUDAContext>(
    const int               outer_dim,
@@ -136,7 +136,7 @@ template<> void NLLLossGrad<float, float, CUDAContext>(
    );
 }
-/*! <Tx = float32, Ty = int64, Device = CUDA> */
+/* <Tx = float32, Ty = int64, Device = CUDA> */
 template<> void NLLLossGrad<float, int64_t, CUDAContext>(
    const int               outer_dim,

--- a/Dragon/src/kernels/loss/sigmoid_ce_loss_op_kernel.cc
+++ b/Dragon/src/kernels/loss/sigmoid_ce_loss_op_kernel.cc
@@ -5,7 +5,7 @@ namespace dragon {
 namespace kernel {
-/*! SigmoidCrossEntropy <T = float32, Device = CPU> */
+/* <T = float32, Device = CPU> */
 template <> void SigmoidCrossEntropy<float, CPUContext>(
    const int               count,
@@ -35,7 +35,7 @@ template <> void SigmoidCrossEntropy<float, CPUContext>(
    }
 }
-/*! SigmoidCrossEntropyGrad <T = float32, Device = CPU> */
+/* <T = float32, Device = CPU> */
 template <> void SigmoidCrossEntropyGrad<float, CPUContext>(
    const int               count,

--- a/Dragon/src/kernels/loss/sigmoid_focal_loss_op_kernel.cc
+++ b/Dragon/src/kernels/loss/sigmoid_focal_loss_op_kernel.cc
@@ -4,7 +4,7 @@ namespace dragon {
 namespace kernel {
-/*! SigmoidFocalLoss <Tx = ?, Ty = ?, Device = CPU> */
+/* <Tx = ?, Ty = ?, Device = CPU> */
 template <typename Tx, typename Ty>
 void _SigmoidFocalLoss(
@@ -56,7 +56,7 @@ void _SigmoidFocalLoss(
    }
 }
-/*! SigmoidFocalLoss <Tx = float32, Ty = float32, Device = CPU> */
+/* <Tx = float32, Ty = float32, Device = CPU> */
 template <> void SigmoidFocalLoss<float, float, CPUContext>(
    const int               outer_dim,
@@ -78,7 +78,7 @@ template <> void SigmoidFocalLoss<float, float, CPUContext>(
    );
 }
-/*! SigmoidFocalLoss <Tx = float32, Ty = int64, Device = CPU> */
+/* <Tx = float32, Ty = int64, Device = CPU> */
 template <> void SigmoidFocalLoss<float, int64_t, CPUContext>(
    const int               outer_dim,
@@ -100,7 +100,7 @@ template <> void SigmoidFocalLoss<float, int64_t, CPUContext>(
    );
 }
-/*! SigmoidFocalLossGrad <Tx = ?, Ty = ?, Device = CPU> */
+/* <Tx = ?, Ty = ?, Device = CPU> */
 template <typename Tx, typename Ty>
 void _SigmoidFocalLossGrad(
@@ -153,7 +153,7 @@ void _SigmoidFocalLossGrad(
    }
 }
-/*! SigmoidFocalLossGrad <Tx = float32, Ty = float32, Device = CPU> */
+/* <Tx = float32, Ty = float32, Device = CPU> */
 template <> void SigmoidFocalLossGrad<float, float, CPUContext>(
    const int               outer_dim,
@@ -175,7 +175,7 @@ template <> void SigmoidFocalLossGrad<float, float, CPUContext>(
    );
 }
-/*! SigmoidFocalLossGrad <Tx = float32, Ty = int64_t, Device = CPU> */
+/* <Tx = float32, Ty = int64, Device = CPU> */
 template <> void SigmoidFocalLossGrad<float, int64_t, CPUContext>(
    const int               outer_dim,

--- a/Dragon/src/kernels/loss/sigmoid_focal_loss_op_kernel.cu
+++ b/Dragon/src/kernels/loss/sigmoid_focal_loss_op_kernel.cu
@@ -7,7 +7,7 @@ namespace dragon {
 namespace kernel {
-/*! SigmoidFocalLoss <Tx = ?, Ty = ?, Device = CUDA> */
+/* <Tx = ?, Ty = ?, Device = CUDA> */
 template <typename Tx, typename Ty>
 __global__ void _SigmoidFocalLoss(
@@ -54,7 +54,7 @@ __global__ void _SigmoidFocalLoss(
    }
 }
-/*! SigmoidFocalLoss <Tx = float32, Ty = float32, Device = CUDA> */
+/* <Tx = float32, Ty = float32, Device = CUDA> */
 template <> void SigmoidFocalLoss<float, float, CUDAContext>(
    const int               outer_dim,
@@ -79,7 +79,7 @@ template <> void SigmoidFocalLoss<float, float, CUDAContext>(
    );
 }
-/*! SigmoidFocalLoss <Tx = float32, Ty = int64, Device = CUDA> */
+/* <Tx = float32, Ty = int64, Device = CUDA> */
 template <> void SigmoidFocalLoss<float, int64_t, CUDAContext>(
    const int               outer_dim,

--- a/Dragon/src/kernels/loss/smooth_l1_loss_op_kernel.cc
+++ b/Dragon/src/kernels/loss/smooth_l1_loss_op_kernel.cc
@@ -5,7 +5,7 @@ namespace dragon {
 namespace kernel {
-/*! SmoothL1 <T = float32, Device = CPU> */
+/* <T = float32, Device = CPU> */
 template<> void SmoothL1<float, CPUContext>(
    const int               count,
@@ -27,7 +27,7 @@ template<> void SmoothL1<float, CPUContext>(
    }
 }
-/*! SmoothL1Grad <T = float32, Device = CPU> */
+/* <T = float32, Device = CPU> */
 template<> void SmoothL1Grad<float, CPUContext>(
    const int               count,

--- a/Dragon/src/kernels/loss/smooth_l1_loss_op_kernel.cu
+++ b/Dragon/src/kernels/loss/smooth_l1_loss_op_kernel.cu
@@ -7,7 +7,7 @@ namespace dragon {
 namespace kernel {
-/*! SmoothL1 <T = float32, Device = CUDA> */
+/* <T = float32, Device = CUDA> */
 template <typename T>
 __global__ void _SmoothL1(
@@ -39,7 +39,7 @@ template<> void SmoothL1<float, CUDAContext>(
     );
 }
-/*! SmoothL1Grad <T = float32, Device = CUDA> */
+/* <T = float32, Device = CUDA> */
 template <typename T>
 __global__ void _SmoothL1Grad(

--- a/Dragon/src/kernels/loss/softmax_ce_loss_op_kernel.cc
+++ b/Dragon/src/kernels/loss/softmax_ce_loss_op_kernel.cc
@@ -5,7 +5,7 @@ namespace dragon {
 namespace kernel {
-/*! SoftmaxCrossEntropy <T = float32, Device = CPU> */
+/* <T = float32, Device = CPU> */
 template <> void SoftmaxCrossEntropy<float, CPUContext>(
    const int               count,

--- a/Dragon/src/kernels/loss/softmax_ce_loss_op_kernel.cu
+++ b/Dragon/src/kernels/loss/softmax_ce_loss_op_kernel.cu
@@ -7,7 +7,7 @@ namespace dragon {
 namespace kernel {
-/*! SoftmaxCrossEntropy <T = float32, Device = CUDA> */
+/* <T = float32, Device = CUDA> */
 template <typename T>
 __global__ void _SoftmaxCrossEntropy(

--- a/Dragon/src/kernels/loss/softmax_focal_loss_op_kernel.cc
+++ b/Dragon/src/kernels/loss/softmax_focal_loss_op_kernel.cc
@@ -4,7 +4,7 @@ namespace dragon {
 namespace kernel {
-/*! SoftmaxFocalLoss <Tx = ?, Ty = ?, Device = CPU> */
+/* <Tx = ?, Ty = ?, Device = CPU> */
 template <typename Tx, typename Ty>
 void _SoftmaxFocalLoss(
@@ -47,7 +47,7 @@ void _SoftmaxFocalLoss(
    }
 }
-/*! SoftmaxFocalLoss <Tx = float32, Ty = float32, Device = CPU> */
+/* <Tx = float32, Ty = float32, Device = CPU> */
 template <> void SoftmaxFocalLoss<float, float, CPUContext>(
    const int               outer_dim,
@@ -72,7 +72,7 @@ template <> void SoftmaxFocalLoss<float, float, CPUContext>(
    );
 }
-/*! SoftmaxFocalLoss <Tx = float32, Ty = int64, Device = CPU> */
+/* <Tx = float32, Ty = int64, Device = CPU> */
 template <> void SoftmaxFocalLoss<float, int64_t, CPUContext>(
    const int               outer_dim,
@@ -97,7 +97,7 @@ template <> void SoftmaxFocalLoss<float, int64_t, CPUContext>(
    );
 }
-/*! SoftmaxFocalLossGrad <Tx = ?, Ty = ?, Device = CPU> */
+/* <Tx = ?, Ty = ?, Device = CPU> */
 template <typename Tx, typename Ty>
 void _SoftmaxFocalLossGrad(
@@ -154,7 +154,7 @@ void _SoftmaxFocalLossGrad(
    }
 }
-/*! SoftmaxFocalLossGrad <Tx = float32, Ty = float32, Device = CPU> */
+/* <Tx = float32, Ty = float32, Device = CPU> */
 template<> void SoftmaxFocalLossGrad<float, float, CPUContext>(
    const int               outer_dim,
@@ -179,7 +179,7 @@ template<> void SoftmaxFocalLossGrad<float, float, CPUContext>(
    );
 }
-/*! SoftmaxFocalLossGrad <Tx = float32, Ty = int64, Device = CPU> */
+/* <Tx = float32, Ty = int64, Device = CPU> */
 template<> void SoftmaxFocalLossGrad<float, int64_t, CPUContext>(
    const int               outer_dim,

--- a/Dragon/src/kernels/loss/softmax_focal_loss_op_kernel.cu
+++ b/Dragon/src/kernels/loss/softmax_focal_loss_op_kernel.cu
@@ -7,7 +7,7 @@ namespace dragon {
 namespace kernel {
-/*! SoftmaxFocalLoss <Tx = ?, Ty = ?, Device = CUDA> */
+/* <Tx = ?, Ty = ?, Device = CUDA> */
 template <typename Tx, typename Ty>
 __global__ void _SoftmaxFocalLoss(
@@ -48,7 +48,7 @@ __global__ void _SoftmaxFocalLoss(
    }
 }
-/*! SoftmaxFocalLoss <Tx = float32, Ty = float32, Device = CUDA> */
+/* <Tx = float32, Ty = float32, Device = CUDA> */
 template <> void SoftmaxFocalLoss<float, float, CUDAContext>(
    const int               outer_dim,
@@ -76,7 +76,7 @@ template <> void SoftmaxFocalLoss<float, float, CUDAContext>(
    );
 }
-/*! SoftmaxFocalLoss <Tx = float32, Ty = int64, Device = CUDA> */
+/* <Tx = float32, Ty = int64, Device = CUDA> */
 template <> void SoftmaxFocalLoss<float, int64_t, CUDAContext>(
    const int               outer_dim,
@@ -104,7 +104,7 @@ template <> void SoftmaxFocalLoss<float, int64_t, CUDAContext>(
    );
 }
-/*! SoftmaxFocalLossGrad <Tx = ?, Ty = ?, Device = CUDA> */
+/* <Tx = ?, Ty = ?, Device = CUDA> */
 template <typename Tx, typename Ty>
 __global__ void _SoftmaxFocalLossGrad(
@@ -160,7 +160,7 @@ __global__ void _SoftmaxFocalLossGrad(
    }
 }
-/*! SoftmaxFocalLossGrad <Tx = float32, Ty = float32, Device = CUDA> */
+/* <Tx = float32, Ty = float32, Device = CUDA> */
 template<> void SoftmaxFocalLossGrad<float, float, CUDAContext>(
    const int               outer_dim,
@@ -188,7 +188,7 @@ template<> void SoftmaxFocalLossGrad<float, float, CUDAContext>(
    );
 }
-/*! SoftmaxFocalLossGrad <Tx = float32, Ty = int64, Device = CUDA> */
+/* <Tx = float32, Ty = int64, Device = CUDA> */
 template<> void SoftmaxFocalLossGrad<float, int64_t, CUDAContext>(
    const int               outer_dim,

--- a/Dragon/src/kernels/loss/sparse_softmax_ce_loss_op_kernel.cc
+++ b/Dragon/src/kernels/loss/sparse_softmax_ce_loss_op_kernel.cc
@@ -4,7 +4,7 @@ namespace dragon {
 namespace kernel {
-/*! <Tx = ?, Ty = ?, Device = CPU> */
+/* <Tx = ?, Ty = ?, Device = CPU> */
 template <typename Tx, typename Ty>
 void _SparseSoftmaxCrossEntropy(
@@ -40,7 +40,7 @@ void _SparseSoftmaxCrossEntropy(
    }
 }
-/*! SparseSoftmaxCrossEntropy <Tx = float32, Ty = float32, Device = CPU> */
+/* <Tx = float32, Ty = float32, Device = CPU> */
 template <> void SparseSoftmaxCrossEntropy<float, float, CPUContext>(
    const int               outer_dim,
@@ -59,7 +59,7 @@ template <> void SparseSoftmaxCrossEntropy<float, float, CPUContext>(
    );
 }
-/*! <Tx = float32, Ty = int64, Device = CPU> */
+/* <Tx = float32, Ty = int64, Device = CPU> */
 template <> void SparseSoftmaxCrossEntropy<float, int64_t, CPUContext>(
    const int               outer_dim,
@@ -78,7 +78,7 @@ template <> void SparseSoftmaxCrossEntropy<float, int64_t, CPUContext>(
    );
 }
-/*! <Tx = ?, Ty = ?, Device = CPU> */
+/* <Tx = ?, Ty = ?, Device = CPU> */
 template <typename Tx, typename Ty>
 void _SparseSoftmaxCrossEntropyGrad(
@@ -112,7 +112,7 @@ void _SparseSoftmaxCrossEntropyGrad(
    }
 }
-/*! <Tx = float32, Ty = float32, Device = CPU> */
+/* <Tx = float32, Ty = float32, Device = CPU> */
 template<> void SparseSoftmaxCrossEntropyGrad<float, float, CPUContext>(
    const int               outer_dim,
@@ -131,7 +131,7 @@ template<> void SparseSoftmaxCrossEntropyGrad<float, float, CPUContext>(
    );
 }
-/*! <Tx = float32, Ty = int64, Device = CPU> */
+/* <Tx = float32, Ty = int64, Device = CPU> */
 template<> void SparseSoftmaxCrossEntropyGrad<float, int64_t, CPUContext>(
    const int               outer_dim,

--- a/Dragon/src/kernels/loss/sparse_softmax_ce_loss_op_kernel.cu
+++ b/Dragon/src/kernels/loss/sparse_softmax_ce_loss_op_kernel.cu
@@ -8,7 +8,7 @@ namespace dragon {
 namespace kernel {
-/*! <Tx = ?, Ty = ?, Device = CUDA> */
+/* <Tx = ?, Ty = ?, Device = CUDA> */
 template <typename Tx, typename Ty>
 __global__ void _SparseSoftmaxCrossEntropy(
@@ -44,7 +44,7 @@ __global__ void _SparseSoftmaxCrossEntropy(
    }
 }
-/*! <Tx = float32, Ty = float32, Device = CUDA> */
+/* <Tx = float32, Ty = float32, Device = CUDA> */
 template <> void SparseSoftmaxCrossEntropy<float, float, CUDAContext>(
    const int               outer_dim,
@@ -66,7 +66,7 @@ template <> void SparseSoftmaxCrossEntropy<float, float, CUDAContext>(
    );
 }
-/*! <Tx = float32, Ty = int64, Device = CUDA> */
+/* <Tx = float32, Ty = int64, Device = CUDA> */
 template <> void SparseSoftmaxCrossEntropy<float, int64_t, CUDAContext>(
    const int               outer_dim,
@@ -88,7 +88,7 @@ template <> void SparseSoftmaxCrossEntropy<float, int64_t, CUDAContext>(
    );
 }
-/*! <Tx = ?, Ty = ?, Device = CUDA> */
+/* <Tx = ?, Ty = ?, Device = CUDA> */
 template <typename Tx, typename Ty>
 __global__ void _SparseSoftmaxCrossEntropyGrad(
@@ -121,7 +121,7 @@ __global__ void _SparseSoftmaxCrossEntropyGrad(
    }
 }
-/*! <Tx = float32, Ty = float32, Device = CUDA> */
+/* <Tx = float32, Ty = float32, Device = CUDA> */
 template<> void SparseSoftmaxCrossEntropyGrad<float, float, CUDAContext>(
    const int               outer_dim,
@@ -143,7 +143,7 @@ template<> void SparseSoftmaxCrossEntropyGrad<float, float, CUDAContext>(
    );
 }
-/*! <Tx = float32, Ty = int64, Device = CUDA> */
+/* <Tx = float32, Ty = int64, Device = CUDA> */
 template<> void SparseSoftmaxCrossEntropyGrad<float, int64_t, CUDAContext>(
    const int               outer_dim,

--- a/Dragon/src/kernels/misc/astype_op_kernel.cc
+++ b/Dragon/src/kernels/misc/astype_op_kernel.cc
@@ -6,7 +6,7 @@ namespace dragon {
 namespace kernel {
-/*! Astype <Ta = ?, Tb = ?, Device = CPU> */
+/* <Ta = ?, Tb = ?, Device = CPU> */
 template <typename Ta, typename Tb>
 void _TypeA2B(const int count, const Ta* a, Tb* b) {

--- a/Dragon/src/kernels/misc/astype_op_kernel.cu
+++ b/Dragon/src/kernels/misc/astype_op_kernel.cu
@@ -7,7 +7,7 @@ namespace dragon {
 namespace kernel {
-/*! Astype <Ta = ?, Tb = ?, Device = CUDA> */
+/* <Ta = ?, Tb = ?, Device = CUDA> */
 template <typename Ta, typename Tb>
 __global__ void _TypeA2B(
@@ -49,7 +49,7 @@ DEFINE_TYPE_A_TO_ALL(int64_t);
 DEFINE_TYPE_A_TO_ALL(float);
 DEFINE_TYPE_A_TO_ALL(double);
-/*! Astype <Ta = float16, Tb = float32, Device = CUDA> */
+/* <Ta = float16, Tb = float32, Device = CUDA> */
 template<> __global__ void _TypeA2B<half, float>(
    const int               nthreads,
@@ -72,7 +72,7 @@ template <> void TypeA2B<float16, float, CUDAContext>(
    );
 }
-/*! Astype <Ta = float32, Tb = float16, Device = CUDA> */
+/* <Ta = float32, Tb = float16, Device = CUDA> */
 template<> __global__ void _TypeA2B<float, half>(
    const int               nthreads,
@@ -95,7 +95,7 @@ template <> void TypeA2B<float, float16, CUDAContext>(
    );
 }
-/*! Astype <Ta = float16, Tb = float16, Device = CUDA> */
+/* <Ta = float16, Tb = float16, Device = CUDA> */
 template<> __global__ void _TypeA2B<half, half>(
    const int               nthreads,

--- a/Dragon/src/kernels/misc/gradient_op_kernel.cc
+++ b/Dragon/src/kernels/misc/gradient_op_kernel.cc
@@ -5,7 +5,7 @@ namespace dragon {
 namespace kernel {
-/*! GradientTwoSum <T = ?, Device = CUDA> */
+/* <T = ?, Device = CPU> */
 template <typename T>
 void _GradientTwoSum(

--- a/Dragon/src/kernels/misc/gradient_op_kernel.cu
+++ b/Dragon/src/kernels/misc/gradient_op_kernel.cu
@@ -7,7 +7,7 @@ namespace dragon {
 namespace kernel {
-/*! GradientTwoSum <T = ?, Device = CUDA> */
+/* <T = ?, Device = CUDA> */
 template <typename T>
 __global__ void _GradientTwoSum(
@@ -20,7 +20,7 @@ __global__ void _GradientTwoSum(
    }
 }
-/*! GradientTwoSum <T = float16, Device = CUDA> */
+/* <T = float16, Device = CUDA> */
 template <> __global__ void _GradientTwoSum<half>(
    const int               nthreads,

--- a/Dragon/src/kernels/misc/image_data_op_kernel.cc
+++ b/Dragon/src/kernels/misc/image_data_op_kernel.cc
@@ -5,7 +5,7 @@ namespace dragon {
 namespace kernel {
-/*! ImageData <Tx = ?, Ty = ?, Device = CPU> */
+/* <Tx = ?, Ty = ?, Device = CPU> */
 template <typename Tx, typename Ty>
 void _ImageDataNCHW(
@@ -55,7 +55,7 @@ void _ImageDataNHWC(
    }
 }
-/*! ImageData <Tx = float32, Ty = float32, Device = CPU> */
+/* <Tx = float32, Ty = float32, Device = CPU> */
 template <> void ImageData<float, float, CPUContext>(
    const int               N,
@@ -77,7 +77,7 @@ template <> void ImageData<float, float, CPUContext>(
    }
 }
-/*! ImageData <Tx = uint8, Ty = float32, Device = CPU> */
+/* <Tx = uint8, Ty = float32, Device = CPU> */
 template <> void ImageData<uint8_t, float, CPUContext>(
    const int               N,
@@ -99,7 +99,7 @@ template <> void ImageData<uint8_t, float, CPUContext>(
    }
 }
-/*! ImageData <Tx = float32, Ty = float16, Device = CPU> */
+/* <Tx = float32, Ty = float16, Device = CPU> */
 template <> void ImageData<float, float16, CPUContext>(
    const int               N,
@@ -115,7 +115,7 @@ template <> void ImageData<float, float16, CPUContext>(
    CPU_FP16_NOT_SUPPORTED;
 }
-/*! ImageData <Tx = uint8, Ty = float16, Device = CPU> */
+/* <Tx = uint8, Ty = float16, Device = CPU> */
 template <> void ImageData<uint8_t, float16, CPUContext>(
    const int               N,

--- a/Dragon/src/kernels/misc/image_data_op_kernel.cu
+++ b/Dragon/src/kernels/misc/image_data_op_kernel.cu
@@ -7,7 +7,7 @@ namespace dragon {
 namespace kernel {
-/*! ImageData <Tx = ?, Ty = ?, Device = CUDA> */
+/* <Tx = ?, Ty = ?, Device = CUDA> */
 template <typename Tx, typename Ty>
 __global__ void _ImageDataNCHW(
@@ -60,7 +60,7 @@ __global__ void _ImageDataNHWC(
    }
 }
-/*! ImageData <Tx = float32, Ty = float32, Device = CUDA> */
+/* <Tx = float32, Ty = float32, Device = CUDA> */
 template <> void ImageData<float, float, CUDAContext>(
    const int               N,
@@ -91,7 +91,7 @@ template <> void ImageData<float, float, CUDAContext>(
    }
 }
-/*! ImageData <Tx = uint8, Ty = float32, Device = CUDA> */
+/* <Tx = uint8, Ty = float32, Device = CUDA> */
 template <> void ImageData<uint8_t, float, CUDAContext>(
    const int               N,
@@ -122,7 +122,7 @@ template <> void ImageData<uint8_t, float, CUDAContext>(
    }
 }
-/*! ImageData <Tx = ?, Ty = float16, Device = CUDA> */
+/* <Tx = ?, Ty = float16, Device = CUDA> */
 template <typename Tx, typename Ty>
 __global__ void _ImageDataHalfNCHW(
@@ -175,7 +175,7 @@ __global__ void _ImageDataHalfNHWC(
    }
 }
-/*! ImageData <Tx = float32, Ty = float16, Device = CUDA> */
+/* <Tx = float32, Ty = float16, Device = CUDA> */
 template <> void ImageData<float, float16, CUDAContext>(
    const int               N,

--- a/Dragon/src/kernels/norm/batch_norm_op_kernel.cc
+++ b/Dragon/src/kernels/norm/batch_norm_op_kernel.cc
@@ -16,7 +16,7 @@ namespace dragon {
 namespace kernel {
-/*! BatchNormBackwardTraining <T = ?, Device = CPU> */
+/* <T = ?, Device = CPU> */
 template <typename Tx, typename Tp, StorageOrder kOrder>
 void _BatchNormInternalGrad(
@@ -68,7 +68,7 @@ void _BatchNormTrainingGrad(
    }
 }
-/*! BatchNormBackwardInference <T = ?, Device = CPU> */
+/* <T = ?, Device = CPU> */
 template <typename Tx, typename Tp, StorageOrder kOrder>
 void _BatchNormWGrad(

--- a/Dragon/src/kernels/norm/batch_norm_op_kernel.cu
+++ b/Dragon/src/kernels/norm/batch_norm_op_kernel.cu
@@ -19,7 +19,7 @@ namespace kernel {
 #define L(x, i) __ldg(x + i)
-/*! BatchNormBackwardTraining <T = ?, Device = CUDA> */
+/* <T = ?, Device = CUDA> */
 template <typename Tx, typename Tp, StorageOrder kOrder>
 __global__ void _BatchNormInternalGrad(
@@ -107,7 +107,7 @@ __global__ void _BatchNormTrainingGrad(
    }
 }
-/*! BatchNormBackwardInference <T = ?, Device = CUDA> */
+/* <T = ?, Device = CUDA> */
 template <typename Tx, typename Tp, StorageOrder kOrder>
 __global__ void _BatchNormWGrad(

--- a/Dragon/src/kernels/norm/group_norm_op_kernel.cu
+++ b/Dragon/src/kernels/norm/group_norm_op_kernel.cu
@@ -20,7 +20,7 @@ namespace kernel {
 #define L(x, i) __ldg(x + i)
 #define LF(x, i) __half2float(__ldg(x + i))
-/*! GroupNormForward <T = ?, Device = CUDA> */
+/* <T = ?, Device = CUDA> */
 template <typename T>
 __global__ void _GroupNormFusedParams(
@@ -157,7 +157,7 @@ template<> __global__ void _GroupNormForwardNHWC<half, float>(
 #endif
 }
-/*! GroupNormBackward <T = ?, Device = CUDA> */
+/* <T = ?, Device = CUDA> */
 template <typename Tx, typename Tp, StorageOrder kOrder>
 __global__ void _GroupNormWGrad(

--- a/Dragon/src/kernels/recurrent/lstm_cell_op_kernel.cc
+++ b/Dragon/src/kernels/recurrent/lstm_cell_op_kernel.cc
@@ -5,7 +5,7 @@ namespace dragon {
 namespace kernel {
-/*! LSTMCell <T = float32, Device = CPU> */
+/* <T = float32, Device = CPU> */
 template <typename T>
 T _s(T x) { return T(1) / (T(1) + exp(-x)); }
@@ -34,7 +34,7 @@ template <> void LSTMCell<float, CPUContext>(
    }
 }
-/*! LSTMCellGrad <T = float32, Device = CPU> */
+/* <T = float32, Device = CPU> */
 template <> void LSTMCellGrad<float, CPUContext>(
    const int               N,

--- a/Dragon/src/kernels/recurrent/lstm_cell_op_kernel.cu
+++ b/Dragon/src/kernels/recurrent/lstm_cell_op_kernel.cu
@@ -7,7 +7,7 @@ namespace dragon {
 namespace kernel {
-/*! LSTMCell <T = float32, Device = CUDA> */
+/* <T = float32, Device = CUDA> */
 template <typename T>
 __global__ void _LSTMCellAct(
@@ -70,7 +70,7 @@ template <> void LSTMCell<float, CUDAContext>(
    );
 }
-/*! LSTMCellGrad <T = float32, Device = CUDA> */
+/* <T = float32, Device = CUDA> */
 template <typename T>
 __global__ void _LSTMCellGateGrad(

--- a/Dragon/src/kernels/vision/bias_add_op_kernel.cc
+++ b/Dragon/src/kernels/vision/bias_add_op_kernel.cc
@@ -5,7 +5,7 @@ namespace dragon {
 namespace kernel {
-/*! BiasAdd <T = float32, Device = CPU> */
+/* <T = float32, Device = CPU> */
 template<> void BiasAdd<float, CPUContext>(
    const int               outer_dim,

--- a/Dragon/src/kernels/vision/bias_add_op_kernel.cu
+++ b/Dragon/src/kernels/vision/bias_add_op_kernel.cu
@@ -7,7 +7,7 @@ namespace dragon {
 namespace kernel {
-/*! BiasAdd <T = float32, Device = CUDA> */
+/* <T = float32, Device = CUDA> */
 template <typename T>
 __global__ void _BiasAddNCHW(

--- a/Dragon/src/kernels/vision/depthwise_conv_op_kernel.cc
+++ b/Dragon/src/kernels/vision/depthwise_conv_op_kernel.cc
@@ -4,7 +4,7 @@ namespace dragon {
 namespace kernel {
-/*! DepthwiseConv2d <T = float32, Device = CPU> */
+/* <T = float32, Device = CPU> */
 template <typename T>
 void _DepthwiseConv2dNCHW(

--- a/Dragon/src/kernels/vision/drop_block_op_kernel.cu
+++ b/Dragon/src/kernels/vision/drop_block_op_kernel.cu
@@ -8,7 +8,7 @@ namespace dragon {
 namespace kernel {
-/* < Device = CUDA> */
+/* <Device = CUDA> */
 __global__ void _DropBlock2dNCHW(
    const int               nthreads,

--- a/Dragon/src/kernels/vision/roi_align_op_kernel.cc
+++ b/Dragon/src/kernels/vision/roi_align_op_kernel.cc
@@ -5,7 +5,7 @@ namespace dragon {
 namespace kernel {
-/*! ROIAlign <T = float32, Device = CPU> */
+/* <T = float32, Device = CPU> */
 template <typename T>
 T _ROIAlignIntp(
@@ -115,7 +115,7 @@ template<> void ROIAlign<float, CPUContext>(
    }  // End n
 }
-/*! ROIAlign <T = float16, Device = CPU> */
+/* <T = float16, Device = CPU> */
 template<> void ROIAlign<float16, CPUContext>(
    const int               C,
@@ -133,7 +133,7 @@ template<> void ROIAlign<float16, CPUContext>(
    CPU_FP16_NOT_SUPPORTED;
 }
-/*! ROIAlignGrad <T = float32, Device = CPU> */
+/* <T = float32, Device = CPU> */
 template<> void ROIAlignGrad<float, CPUContext>(
    const int               C,

--- a/Dragon/src/kernels/vision/roi_align_op_kernel.cu
+++ b/Dragon/src/kernels/vision/roi_align_op_kernel.cu
@@ -7,7 +7,7 @@ namespace dragon {
 namespace kernel {
-/*! <T = float32, Device = CUDA> */
+/* <T = float32, Device = CUDA> */
 template <typename T>
 __device__ T _ROIAlignIntp(
@@ -106,10 +106,10 @@ __global__ void _ROIAlign(
        for (int iy = 0; iy < grid_h; iy++) {
            const T y = roi_hstart + ph * bin_h +
-                (T)(iy + .5f) * bin_h / (T)(grid_h);
+                (T)(iy + .5f) * bin_h / (T)grid_h;
            for (int ix = 0; ix < grid_w; ix++) {
                const T x = roi_wstart + pw * bin_w +
-                    (T)(ix + .5f) * bin_w / (T)(grid_w);
+                    (T)(ix + .5f) * bin_w / (T)grid_w;
                intp_val += _ROIAlignIntp(X, H, W, y, x);
            }
        }
@@ -143,7 +143,7 @@ template<> void ROIAlign<float, CUDAContext>(
    );
 }
-/*! <T = float32, Device = CUDA> */
+/* <T = float32, Device = CUDA> */
 template <typename T>
 __device__ void _ROIAlignIntpGrad(

--- a/Dragon/src/kernels/vision/roi_align_op_kernel.fp16.cu
+++ b/Dragon/src/kernels/vision/roi_align_op_kernel.fp16.cu
@@ -7,10 +7,10 @@ namespace dragon {
 namespace kernel {
-/*! ROIAlign <T = float16, Device = CUDA> */
+/* <T = float16, Device = CUDA> */
-__device__ float _ROIAlignInterpolate(
+__device__ float _ROIAlignIntp(
-    const half*             Xdata,
+    const half*             X,
    const int               H,
    const int               W,
    float                   y,
@@ -41,12 +41,12 @@ __device__ float _ROIAlignInterpolate(
    const float ly = y - y_low;
    const float lx = x - x_low;
-    const float hy = 1. - ly, hx = 1. - lx;
+    const float hy = 1.f - ly, hx = 1.f - lx;
    const float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
-    const float v1 = __half2float(__ldg(Xdata + (y_low * W + x_low)));
+    const float v1 = __half2float(__ldg(X + (y_low * W + x_low)));
-    const float v2 = __half2float(__ldg(Xdata + (y_low * W + x_high)));
+    const float v2 = __half2float(__ldg(X + (y_low * W + x_high)));
-    const float v3 = __half2float(__ldg(Xdata + (y_high * W + x_low)));
+    const float v3 = __half2float(__ldg(X + (y_high * W + x_low)));
-    const float v4 = __half2float(__ldg(Xdata + (y_high * W + x_high)));
+    const float v4 = __half2float(__ldg(X + (y_high * W + x_high)));
    const float value = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4;
 #else
    const float value = 0.f;
@@ -63,58 +63,55 @@ __global__ void _ROIAlignHalf(
    const int               pool_w,
    const int               sampling_ratio,
    const float             spatial_scale,
-    const half*             Xdata,
+    const half*             xdata,
    const float*            rois,
-    half*                   Ydata) {
+    half*                   ydata) {
-    CUDA_1D_KERNEL_LOOP(y_idx, nthreads) {
+    CUDA_1D_KERNEL_LOOP(yi, nthreads) {
 #if __CUDA_ARCH__ >= 530
-        int pw = y_idx % pool_w;
+        int pw = yi % pool_w;
-        int ph = (y_idx / pool_w) % pool_h;
+        int ph = (yi / pool_w) % pool_h;
-        int c = (y_idx / pool_w / pool_h) % C;
+        int c = (yi / pool_w / pool_h) % C;
-        int n = y_idx / pool_w / pool_h / C;
+        int n = yi / pool_w / pool_h / C;
-        const float* offset_rois = rois + n * 5;
+        const float* roi = rois + n * 5;
-        int roi_batch_ind = offset_rois[0];
+        int batch_ind = roi[0];
-        if (roi_batch_ind < 0) {
+        if (batch_ind < 0) {
-            Ydata[y_idx] = __float2half(0.f);
+            ydata[yi] = __float2half(0.f);
            continue;
        }
-        float roi_start_w = offset_rois[1] * spatial_scale;
+        float roi_wstart = roi[1] * spatial_scale;
-        float roi_start_h = offset_rois[2] * spatial_scale;
+        float roi_hstart = roi[2] * spatial_scale;
-        float roi_end_w = offset_rois[3] * spatial_scale;
+        float roi_wend = roi[3] * spatial_scale;
-        float roi_end_h = offset_rois[4] * spatial_scale;
+        float roi_hend = roi[4] * spatial_scale;
-        float roi_width = max(roi_end_w - roi_start_w, 1.f);
+        float roi_w = max(roi_wend - roi_wstart, 1.f);
-        float roi_height = max(roi_end_h - roi_start_h, 1.f);
+        float roi_h = max(roi_hend - roi_hstart, 1.f);
-        float bin_size_h = (float)roi_height / (float)pool_h;
+        float bin_h = roi_h / (float)pool_h;
-        float bin_size_w = (float)roi_width / (float)pool_w;
+        float bin_w = roi_w / (float)pool_w;
-        const half* offset_Xdata = Xdata + (roi_batch_ind * C + c) * H * W;
+        const half* X = xdata + (batch_ind * C + c) * H * W;
-        int roi_bin_grid_h = (sampling_ratio > 0) ?
+        int grid_h = (sampling_ratio > 0) ?
-            sampling_ratio : ceil(roi_height / pool_h);
+            sampling_ratio : ceil(roi_h / pool_h);
-        int roi_bin_grid_w = (sampling_ratio > 0) ? 
+        int grid_w = (sampling_ratio > 0) ? 
-            sampling_ratio : ceil(roi_width / pool_w);
+            sampling_ratio : ceil(roi_w / pool_w);
-        float output_val = 0.;
+        float intp_val = 0.f;
-        const float num_bin_grids = roi_bin_grid_h * roi_bin_grid_w;
+        for (int iy = 0; iy < grid_h; iy++) {
-        for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+            const float y = roi_hstart + ph * bin_h +
-            const float y = roi_start_h + ph * bin_size_h +
+                (float)(iy + .5f) * bin_h / (float)grid_h;
-                static_cast<float>(iy + .5f) * bin_size_h /
+            for (int ix = 0; ix < grid_w; ix++) {
-                    static_cast<float>(roi_bin_grid_h);
+                const float x = roi_wstart + pw * bin_w +
-            for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+                    (float)(ix + .5f) * bin_w / (float)grid_w;
-                const float x = roi_start_w + pw * bin_size_w +
+                intp_val += _ROIAlignIntp(X, H, W, y, x);
-                    static_cast<float>(ix + .5f) * bin_size_w /
-                        static_cast<float>(roi_bin_grid_w);
-                output_val += _ROIAlignInterpolate(
-                    offset_Xdata, H, W, y, x);
            }
        }
-        output_val /= num_bin_grids;
+        ydata[yi] = __float2half(
-        Ydata[y_idx] = __float2half(output_val);
+            intp_val / float(grid_h * grid_w)
+        );
 #endif
    }
 }
@@ -135,11 +132,16 @@ template<> void ROIAlign<float16, CUDAContext>(
    auto nthreads = num_rois * C  * pool_h * pool_w;
    _ROIAlignHalf
        <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
-            0, ctx->cuda_stream() >>>
+            0, ctx->cuda_stream() >>>(
-        (nthreads, C, H, W, pool_h, pool_w,
+        nthreads,
-            sampling_ratio, spatial_scale,
+        C, H, W,
-                reinterpret_cast<const half*>(x), rois,
+        pool_h, pool_w,
-                    reinterpret_cast<half*>(y));
+        sampling_ratio,
+        spatial_scale,
+        reinterpret_cast<const half*>(x),
+        rois,
+        reinterpret_cast<half*>(y)
+    );
 }
 }  // namespace kernel

--- a/Dragon/src/kernels/vision/roi_pool_op_kernel.cc
+++ b/Dragon/src/kernels/vision/roi_pool_op_kernel.cc
@@ -5,7 +5,7 @@ namespace dragon {
 namespace kernel {
-/*! ROIPool <T = float32, Device = CPU> */
+/* <T = float32, Device = CPU> */
 template<> void ROIPool<float, CPUContext>(
    const int               C,
@@ -83,7 +83,7 @@ template<> void ROIPool<float, CPUContext>(
    }  // End n
 }
-/*! ROIPool <T = float16, Device = CPU> */
+/* <T = float16, Device = CPU> */
 template<> void ROIPool<float16, CPUContext>(
    const int               C,
@@ -101,7 +101,7 @@ template<> void ROIPool<float16, CPUContext>(
    CPU_FP16_NOT_SUPPORTED;
 }
-/*! ROIPoolGrad <T = float32, Device = CPU> */
+/* <T = float32, Device = CPU> */
 template<> void ROIPoolGrad<float, CPUContext>(
    const int               N,

--- a/Dragon/src/operators/array/arange_op.cc
+++ b/Dragon/src/operators/array/arange_op.cc
@@ -52,7 +52,9 @@ DEPLOY_CUDA(Arange);
 #endif
 OPERATOR_SCHEMA(Arange)
-    .NumInputs(0).NumOutputs(1);
+    .NumInputs(0)
+     /* Y */
+    .NumOutputs(1);
 NO_GRADIENT(Arange);

--- a/Dragon/src/operators/array/index_select_op.cc
+++ b/Dragon/src/operators/array/index_select_op.cc
@@ -16,7 +16,7 @@ namespace dragon {
 template <class Context> template <typename T>
 void IndexSelectOp<Context>::RunImpl() {
    auto* x = X(0).template data<T, Context>();
-    auto* i = X(1).template mutable_data<int64_t, Context>();
+    auto* i = X(1).template data<int64_t, Context>();
    auto* y = Y(0)->template mutable_data<T, Context>();
    kernel::IndexSelect(
@@ -67,12 +67,6 @@ void IndexSelectGradientOp<Context>::RunImpl() {
    auto* dy = X(2).template data<T, Context>();
    auto* dx = Y(0)->template mutable_data<T, Context>();
-    math::Set(
-        X(0).count(),
-        cast::to<T>(0.f),
-        dx, ctx()
-    );
    kernel::IndexSelectGrad(
        outer_dim_,
        inner_dim_,

--- a/Dragon/src/operators/array/masked_select_op.cc
+++ b/Dragon/src/operators/array/masked_select_op.cc
+#include "core/workspace.h"
+#include "utils/op_kernel.h"
+#include "utils/math_utils.h"
+#include "utils/math_functions.h"
+#include "operators/array/masked_select_op.h"
+namespace dragon {
+/* Forward pass for element type T.
+ * Passes the input, its byte mask, and two workspace tensors to
+ * kernel::MaskedSelect, with Y(0) as the output tensor. */
+template <class Context> template <typename T>
+void MaskedSelectOp<Context>::RunImpl() {
+    const T* input = X(0).template data<T, Context>();
+    // The mask is consumed as raw bytes; RunOnDevice only admits bool/uint8.
+    const auto* mask_bytes = static_cast<const uint8_t*>(
+        X(1).template raw_data<Context>());
+    // A shared buffer plus a tensor named via unique_name("indices");
+    // MaskedSelectGradientOp looks a tensor up by unique_name("indices") too.
+    auto* workspace_buffer = ws()->CreateTensor("/share/data");
+    auto* selected = ws()->CreateTensor(unique_name("indices"));
+    kernel::MaskedSelect(
+        X(0).count(),
+        mask_bytes, input,
+        selected, workspace_buffer,
+        Y(0), ctx());
+}
+/* Validates the mask (X(1)) against the input (X(0)) and dispatches
+ * RunImpl on the data type of X(0). */
+template <class Context>
+void MaskedSelectOp<Context>::RunOnDevice() {
+    // The mask must have exactly as many elements as the input.
+    CHECK_EQ(X(0).count(), X(1).count())
+        << "\nSize of mask and input should be equal.";
+    // RunImpl reads the mask as uint8 bytes, so only 1-byte mask types pass.
+    CHECK(XIsType(X(1), bool) || XIsType(X(1), uint8_t))
+        << "\nExpected bool or uint8 mask.";
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+               float16, float, double>
+    >::Call(this, X(0));
+}
+/* Backward pass for element type T.
+ * Hands dY (X(1)), the index tensor fetched from the workspace, and the
+ * dX buffer (Y(0)) to kernel::MaskedSelectGrad. */
+template <class Context> template <typename T>
+void MaskedSelectGradientOp<Context>::RunImpl() {
+    const T* grad_out = X(1).template data<T, Context>();
+    T* grad_in = Y(0)->template mutable_data<T, Context>();
+    // Looked up by name rather than passed as an input.
+    auto* index_tensor = ws()->GetTensor(unique_name("indices"));
+    const int64_t* picked =
+        index_tensor->template data<int64_t, Context>();
+    kernel::MaskedSelectGrad(
+        X(0).count(), X(1).count(),
+        picked, grad_out,
+        grad_in, ctx());
+}
+/* Shapes dX (Y(0)) like X(0) and dispatches RunImpl on the type of X(0). */
+template <class Context>
+void MaskedSelectGradientOp<Context>::RunOnDevice() {
+    using SupportedTypes = TensorTypes<
+        bool, int8_t, uint8_t, int, int64_t,
+        float16, float, double>;
+    // The gradient output takes the shape of the forward input.
+    Y(0)->ReshapeLike(X(0));
+    DispatchHelper<SupportedTypes>::Call(this, X(0));
+}
+DEPLOY_CPU(MaskedSelect);
+#ifdef WITH_CUDA
+DEPLOY_CUDA(MaskedSelect);
+#endif  // WITH_CUDA
+DEPLOY_CPU(MaskedSelectGradient);
+#ifdef WITH_CUDA
+DEPLOY_CUDA(MaskedSelectGradient);
+#endif  // WITH_CUDA
+OPERATOR_SCHEMA(MaskedSelect)
+     /* Inputs: X (data), M (bool/uint8 mask with as many elements as X) */
+    .NumInputs(2)
+     /* Output: Y */
+    .NumOutputs(1);
+OPERATOR_SCHEMA(MaskedSelectGradient)
+     /* Inputs: X (forward input, I(0)), dY (GO(0) in GradientMaker) */
+    .NumInputs(2)
+     /* Output: dX (reshaped like X in RunOnDevice) */
+    .NumOutputs(1);
+namespace {
+/* Builds the single "<type>Gradient" operator definition for MaskedSelect. */
+class GradientMaker final : public GradientMakerBase {
+ public:
+    GRADIENT_MAKER_CTOR(GradientMaker);
+    vector<OperatorDef> MakeDef() override {
+        // I(0) and GO(0) feed the gradient op; GI(0) names its output.
+        const vector<string> inputs({ I(0), GO(0) });
+        const vector<string> outputs({ GI(0) });
+        return SingleDef(
+            def.type() + "Gradient", "",
+            inputs, outputs);
+    }
+};
+}  // namespace
+REGISTER_GRADIENT(MaskedSelect, GradientMaker);
+}  // namespace dragon
\ No newline at end of file
--- a/Dragon/src/operators/array/non_zero_op.cc
+++ b/Dragon/src/operators/array/non_zero_op.cc
+#include "core/workspace.h"
+#include "utils/op_kernel.h"
+#include "utils/math_functions.h"
+#include "operators/array/non_zero_op.h"
+/* Copies the elements of `vec` into `tensor` as a 1-D CPU buffer of type T.
+ * `vec` is evaluated exactly once, arguments are parenthesized, and the
+ * body is wrapped in do/while(0) so the macro behaves as one statement
+ * (safe inside an unbraced if/else). Invoke it with a trailing semicolon. */
+#define TENSOR_FROM_VEC(tensor, vec, T) \
+    do { \
+        const auto& vec_ref_ = (vec); \
+        (tensor).Reshape({ (int64_t)vec_ref_.size() }); \
+        auto* data = (tensor).template mutable_data<T, CPUContext>(); \
+        for (size_t i = 0; i < vec_ref_.size(); i++) \
+            data[i] = (T)vec_ref_[i]; \
+    } while (0)
+namespace dragon {
+/* Forward pass for element type T, in three kernel steps:
+ *   1. kernel::NotZero writes a byte mask for the input,
+ *   2. kernel::MaskedSelect (called without data or output tensor)
+ *      fills the flat index tensor,
+ *   3. kernel::UnravelIndex writes Y(0), shaped (num indices, ndim). */
+template <class Context> template <typename T>
+void NonZeroOp<Context>::RunImpl() {
+    const auto rank = X(0).ndim();
+    const auto total = X(0).count();
+    const T* input = X(0).template data<T, Context>();
+    // Workspace tensors reused by name across operators.
+    auto* workspace_buffer = ws()->CreateTensor("/share/data");
+    auto* flat_indices = ws()->CreateTensor("/share/buffer/grad:0");
+    auto* mask_tensor = ws()->CreateTensor("/share/buffer/grad:1");
+    uint8_t* mask = mask_tensor
+        ->Reshape({ total })
+        ->template mutable_data<uint8_t, Context>();
+    // The same byte buffer is written as bool and then read as uint8.
+    kernel::NotZero(total, input, reinterpret_cast<bool*>(mask), ctx());
+    kernel::MaskedSelect(
+        total,
+        mask, static_cast<T*>(nullptr),
+        flat_indices, workspace_buffer,
+        static_cast<Tensor*>(nullptr), ctx());
+    // The number of rows in the output is whatever MaskedSelect collected.
+    const auto num_selected = flat_indices->count();
+    int64_t* coords = Y(0)
+        ->Reshape({ num_selected, static_cast<int64_t>(rank) })
+        ->template mutable_data<int64_t, Context>();
+    kernel::UnravelIndex(
+        num_selected, rank,
+        X_dims_.template data<int, Context>(),
+        flat_indices->template data<int64_t, Context>(),
+        coords, ctx());
+}
+/* Caches the dimensions of X(0) in X_dims_ (as int) and dispatches
+ * RunImpl on the data type of X(0). */
+template <class Context>
+void NonZeroOp<Context>::RunOnDevice() {
+    using SupportedTypes = TensorTypes<
+        bool, int8_t, uint8_t, int, int64_t,
+        float16, float, double>;
+    // RunImpl hands X_dims_ to kernel::UnravelIndex.
+    TENSOR_FROM_VEC(X_dims_, X(0).dims(), int);
+    DispatchHelper<SupportedTypes>::Call(this, X(0));
+}
+DEPLOY_CPU(NonZero);
+#ifdef WITH_CUDA
+DEPLOY_CUDA(NonZero);
+#endif  // WITH_CUDA
+OPERATOR_SCHEMA(NonZero)
+     /* Input: X */
+    .NumInputs(1)
+     /* Output: Y, int64, reshaped to (num indices, ndim of X) in RunImpl */
+    .NumOutputs(1);
+NO_GRADIENT(NonZero);
+}  // namespace dragon
\ No newline at end of file
--- a/Dragon/src/operators/array/where_op.cc
+++ b/Dragon/src/operators/array/where_op.cc
+#include "core/workspace.h"
+#include "utils/op_kernel.h"
+#include "utils/math_utils.h"
+#include "utils/math_functions.h"
+#include "operators/array/where_op.h"
+namespace dragon {
+/* Forward pass for element type T.
+ * When A (X(0)) and B (X(1)) differ in element count, the smaller one is
+ * expanded into a workspace buffer by a row- or column-wise broadcast;
+ * the mask (X(2)) must then match the output size before A, B and the
+ * mask are passed to kernel::Where. */
+template <class Context> template <typename T>
+void WhereOp<Context>::RunImpl() {
+    const T* a = nullptr;
+    const T* b = nullptr;
+    auto* mask = X(2).template raw_data<Context>();
+    const bool a_is_smaller = X(0).count() < X(1).count();
+    const bool b_is_smaller = X(0).count() > X(1).count();
+    if (!a_is_smaller && !b_is_smaller) {
+        // Equal element counts: both operands are used as they are.
+        Y(0)->ReshapeLike(X(0));
+        a = X(0).template data<T, Context>();
+        b = X(1).template data<T, Context>();
+    } else {
+        auto& small = a_is_smaller ? X(0) : X(1);
+        auto& large = a_is_smaller ? X(1) : X(0);
+        Y(0)->ReshapeLike(large);
+        // Workspace buffer that receives the expanded smaller operand.
+        T* expanded = ws()
+            ->template data<T, Context>
+                ({ large.count() })[0];
+        if (a_is_smaller) {
+            a = expanded;
+            b = large.template data<T, Context>();
+        } else {
+            b = expanded;
+            a = large.template data<T, Context>();
+        }
+        const T* source = small.template data<T, Context>();
+        // The predicates always take (A, B) in this order and fill
+        // rows/cols on success; axis stays -1 when neither matches.
+        int rows, cols;
+        int axis = -1;
+        if (utils::IsRowwiseBroadcast(
+                X(0).dims(), X(1).dims(),
+                    &rows, &cols)) {
+            axis = 0;
+        } else if (utils::IsColwiseBroadcast(
+                X(0).dims(), X(1).dims(),
+                    &rows, &cols)) {
+            axis = 1;
+        }
+        if (axis >= 0) {
+            math::BroadcastSet(
+                rows, cols, axis, source,
+                expanded, ctx()
+            );
+        } else {
+            LOG(FATAL)
+                << "Could not broadcast "
+                << small.DimString()
+                << " to "
+                << large.DimString();
+        }
+    }
+    CHECK_EQ(Y(0)->count(), X(2).count())
+        << "\nSize of mask and input should be equal.";
+    T* out = Y(0)->template mutable_data<T, Context>();
+    kernel::Where(
+        Y(0)->count(),
+        static_cast<const uint8_t*>(mask),
+        a, b, out, ctx()
+    );
+}
+/* Checks the mask type (X(2)) and dispatches RunImpl on the type of X(0). */
+template <class Context>
+void WhereOp<Context>::RunOnDevice() {
+    // RunImpl reads the mask as uint8 bytes, so only 1-byte mask types pass.
+    CHECK(XIsType(X(2), bool) || XIsType(X(2), uint8_t))
+        << "\nExpected bool or uint8 mask.";
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+               float16, float, double>
+    >::Call(this, X(0));
+}
+/* Backward pass for element type T.
+ * kernel::WhereGrad receives dY (the last input) and the mask (X(2)) and
+ * writes into the dA/dB buffers. When A and B differ in element count,
+ * the smaller input's gradient is first written at full size into a
+ * workspace buffer and then summed down with kernel::ReduceSum. */
+template <class Context> template <typename T>
+void WhereGradientOp<Context>::RunImpl() {
+    T *da = nullptr, *db = nullptr;
+    auto* dy = X(-1).template data<T, Context>();
+    auto* mask = X(2).template raw_data<Context>();
+    // Initialized so nothing indeterminate is read when neither broadcast
+    // pattern matches; type == -1 marks "no pattern found".
+    int rows = 0, cols = 0, type = -1;
+    if (utils::IsRowwiseBroadcast(
+            X(0).dims(), X(1).dims(),
+                &rows, &cols)) {
+        type = 0;
+    } else if (utils::IsColwiseBroadcast(
+            X(0).dims(), X(1).dims(),
+                &rows, &cols)) {
+        type = 1;
+    }
+    vec32_t dims = { rows, cols };
+    vec32_t axes = { type };
+    if (X(0).count() < X(1).count()) {
+        // A was the broadcast operand; fail loudly instead of reducing
+        // along an undefined axis (mirrors the forward op's message).
+        if (type < 0) {
+            LOG(FATAL)
+                << "Could not broadcast "
+                << X(0).DimString()
+                << " to "
+                << X(1).DimString();
+        }
+        da = ws()
+            ->template data<T, Context>
+                ({ X(1).count() })[0];
+        db = Y(1)->template mutable_data<T, Context>();
+        auto* ra = Y(0)->template mutable_data<T, Context>();
+        kernel::WhereGrad(
+            X(-1).count(),
+            (const uint8_t*)mask,
+            dy, da, db, ctx()
+        );
+        // Sum the full-size gradient down into dA.
+        kernel::ReduceSum(
+            2, dims.data(),
+            1, axes.data(),
+            1.f, da,
+            ra, ctx()
+        );
+    } else if (X(0).count() > X(1).count()) {
+        // B was the broadcast operand.
+        if (type < 0) {
+            LOG(FATAL)
+                << "Could not broadcast "
+                << X(1).DimString()
+                << " to "
+                << X(0).DimString();
+        }
+        db = ws()
+            ->template data<T, Context>
+                ({ X(0).count() })[0];
+        da = Y(0)->template mutable_data<T, Context>();
+        auto* rb = Y(1)->template mutable_data<T, Context>();
+        kernel::WhereGrad(
+            X(-1).count(),
+            (const uint8_t*)mask,
+            dy, da, db, ctx()
+        );
+        // Sum the full-size gradient down into dB.
+        kernel::ReduceSum(
+            2, dims.data(),
+            1, axes.data(),
+            1.f, db,
+            rb, ctx()
+        );
+    } else {
+        // Equal element counts: gradients are written straight to dA/dB.
+        da = Y(0)->template mutable_data<T, Context>();
+        db = Y(1)->template mutable_data<T, Context>();
+        kernel::WhereGrad(
+            Y(0)->count(),
+            (const uint8_t*)mask,
+            dy, da, db, ctx()
+        );
+    }
+}
+/* Validates the mask (X(2)) against dY (the last input), shapes dA/dB
+ * like A/B, and dispatches RunImpl on the data type of X(0). */
+template <class Context>
+void WhereGradientOp<Context>::RunOnDevice() {
+    // The mask must have exactly as many elements as dY.
+    CHECK_EQ(X(-1).count(), X(2).count())
+        << "\nSize of mask and input should be equal.";
+    Y(0)->ReshapeLike(X(0));
+    Y(1)->ReshapeLike(X(1));
+    // RunImpl reads the mask as uint8 bytes, so only 1-byte mask types pass.
+    CHECK(XIsType(X(2), bool) || XIsType(X(2), uint8_t))
+        << "\nExpected bool or uint8 mask.";
+    // Unlike the forward op, bool data is not in the dispatch list.
+    DispatchHelper<TensorTypes
+        <int8_t, uint8_t, int, int64_t,
+            float16, float, double>
+    >::Call(this, X(0));
+}
+DEPLOY_CPU(Where);
+#ifdef WITH_CUDA
+DEPLOY_CUDA(Where);
+#endif  // WITH_CUDA
+DEPLOY_CPU(WhereGradient);
+#ifdef WITH_CUDA
+DEPLOY_CUDA(WhereGradient);
+#endif  // WITH_CUDA
+OPERATOR_SCHEMA(Where)
+     /* Inputs: A, B, M (bool/uint8 mask sized like the output) */
+    .NumInputs(3)
+     /* Output: Y (shaped like the larger of A and B) */
+    .NumOutputs(1);
+OPERATOR_SCHEMA(WhereGradient)
+     /* Inputs: A, B, M, dY (GO(0) in GradientMaker) */
+    .NumInputs(4)
+     /* Outputs: dA, dB (reshaped like A and B in RunOnDevice) */
+    .NumOutputs(2);
+namespace {
+/* Builds the single "<type>Gradient" operator definition for Where.
+ * Marked final: it is a file-local leaf class, matching the GradientMaker
+ * in masked_select_op.cc. */
+class GradientMaker final : public GradientMakerBase {
+ public:
+    GRADIENT_MAKER_CTOR(GradientMaker);
+    vector<OperatorDef> MakeDef() override {
+        // I(0), I(1), I(2) and GO(0) feed the gradient op;
+        // GI(0) and GI(1) name its two outputs.
+        return SingleDef(def.type() + "Gradient", "",
+            vector<string>({ I(0), I(1), I(2), GO(0) }),
+            vector<string>({ GI(0), GI(1) })
+        );
+    }
+};
+}  // namespace
+REGISTER_GRADIENT(Where, GradientMaker);
+}  // namespace dragon
\ No newline at end of file
--- a/Dragon/src/operators/control_flow/compare_op.cc
+++ b/Dragon/src/operators/control_flow/compare_op.cc
--- a/Dragon/src/operators/control_flow/masked_assign_op.cc
+++ b/Dragon/src/operators/control_flow/masked_assign_op.cc
--- a/Dragon/src/operators/norm/batch_norm.cc
+++ b/Dragon/src/operators/norm/batch_norm.cc
--- a/Dragon/src/operators/norm/group_norm.cc
+++ b/Dragon/src/operators/norm/group_norm.cc
--- a/Dragon/src/operators/recurrent/lstm_cell_op.cc
+++ b/Dragon/src/operators/recurrent/lstm_cell_op.cc
--- a/Dragon/src/operators/vision/bias_add_op.cc
+++ b/Dragon/src/operators/vision/bias_add_op.cc
--- a/Dragon/src/operators/vision/drop_block2d_op.cc
+++ b/Dragon/src/operators/vision/drop_block2d_op.cc