Commit 1d03e8e2 by Ting PAN

Optimize GatherOp

1 parent c5def39b
Showing with 341 additions and 366 deletions
......@@ -283,14 +283,16 @@ code.docutils.literal:hover {
dt {
font-weight: 700;
background: #e7f2fa;
background: #f7f7f7;
border-bottom: solid #0079b2;
border-radius: 1px;
border-radius: 8px;
margin-bottom: 20px;
padding: 8px;
width: 75%;
}
dt:target, .highlighted {
background-color: #e7f2fa;
background-color: #f7f7f7;
border-bottom: 3px solid #c7254e;
}
......@@ -299,7 +301,7 @@ dt:target:before {
content: '';
display: block;
height: 65px;
margin: -20px 0 0;
margin: -20px -8px 8px;
}
dl.method dt {
......
......@@ -5,8 +5,8 @@
.. toctree::
:hidden:
Quick Shortcut
--------------
Quick Reference
---------------
========================== =============================================================================
List Brief
......
......@@ -5,8 +5,8 @@
.. toctree::
:hidden:
Quick Shortcut
--------------
Quick Reference
---------------
============================== =============================================================================
List Brief
......
......@@ -5,8 +5,8 @@
.. toctree::
:hidden:
Quick Shortcut
--------------
Quick Reference
---------------
============================== =============================================================================
List Brief
......
......@@ -5,8 +5,8 @@
.. toctree::
:hidden:
Quick Shortcut
--------------
Quick Reference
---------------
==================== =============================================================================
List Brief
......
......@@ -5,8 +5,8 @@
.. toctree::
:hidden:
Quick Shortcut
--------------
Quick Reference
---------------
==================== =============================================================================
List Brief
......
......@@ -5,8 +5,8 @@
.. toctree::
:hidden:
Quick Shortcut
--------------
Quick Reference
---------------
==================== =============================================================================
List Brief
......
......@@ -5,8 +5,8 @@
.. toctree::
:hidden:
Quick Shortcut
--------------
Quick Reference
---------------
==================== =============================================================================
List Brief
......
......@@ -5,8 +5,8 @@
.. toctree::
:hidden:
Quick Shortcut
--------------
Quick Reference
---------------
==================== =============================================================================
List Brief
......
......@@ -5,8 +5,8 @@
.. toctree::
:hidden:
Quick Shortcut
--------------
Quick Reference
---------------
==================== =============================================================================
List Brief
......
......@@ -112,8 +112,8 @@ List Brief
================================= =============================================================================
Quick Shortcut
--------------
Quick Reference
---------------
==================== =============================================================================
List Brief
......
......@@ -5,8 +5,8 @@
.. toctree::
:hidden:
Quick Shortcut
--------------
Quick Reference
---------------
========================= ============================================================================
List Brief
......
......@@ -5,8 +5,8 @@
.. toctree::
:hidden:
Quick Shortcut
--------------
Quick Reference
---------------
========================= =============================================================================
List Brief
......
......@@ -5,8 +5,8 @@
.. toctree::
:hidden:
Quick Shortcut
--------------
Quick Reference
---------------
==================== =============================================================================
List Brief
......
......@@ -6,8 +6,8 @@
:hidden:
Quick Shortcut
--------------
Quick Reference
---------------
============================== =======================================================================
List Brief
......
......@@ -39,15 +39,15 @@ class GatherGradientOp final : public Operator<Context> {
GatherGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
axis(OperatorBase::Arg<int64_t>("axis", 0)),
acc_grad(OperatorBase::Arg<bool>("acc_gradient", false)) {}
zero_grad(OperatorBase::Arg<bool>("zero_grad", true)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T> void RunWithType();
protected:
bool zero_grad;
int64_t axis, outer_dim, inner_dim, x_slice_dim, y_slice_dim;
bool acc_grad;
};
} // namespace dragon
......
......@@ -601,32 +601,23 @@ void ArgMin(
/*! ndarray.gather */
template <typename T, class Context>
void CanonicalAxis(
const int count,
const int dim,
T* y,
Context* ctx);
template <typename T, class Context>
void Gather(
const int count,
const int outer_dim,
const int inner_dim,
const int x_slice_dim,
const int y_slice_dim,
const int* indices,
const int64_t* indices,
const T* x,
T* y,
Context* ctx);
template <typename T, class Context>
void GatherGrad(
const int count,
const int outer_dim,
const int inner_dim,
const int x_slice_dim,
const int y_slice_dim,
const int* indices,
const int64_t* indices,
const T* dy,
T* dx,
Context* ctx);
......
......@@ -3,7 +3,7 @@
#include "core/common.h"
#include "utils/proto_utils.h"
#include "utils/caffemodel.h"
#include "contrib/onnx/onnx_backend.h"
#include "onnx/onnx_backend.h"
#include "dragon.h"
......
......@@ -11,7 +11,7 @@
#ifndef DRAGON_PYTHON_PY_ONNX_H_
#define DRAGON_PYTHON_PY_ONNX_H_
#include "contrib/onnx/onnx_backend.h"
#include "onnx/onnx_backend.h"
#include "py_dragon.h"
......
......@@ -270,7 +270,7 @@ def ExportMetaGraph(prefix=''):
These text files will be saved as the following format:
``prefix/Graph_xxx.metatxt``
*prefix/Graph.metatxt*
Note that an empty prefix will lead to invalid exporting.
......@@ -293,12 +293,12 @@ def SetLoggingLevel(level):
Parameters
----------
level : str
The level, ``DEBUG``, ``INFO``, ``WARNING``, ``ERROR`` or ``FATAL``.
level : {'DEBUG', 'INFO', 'WARNING', 'ERROR', 'FATAL'}, required
The logging level.
Notes
-----
The default level is ``INFO``.
The default level is *INFO*.
"""
C.SetLogLevelCC(level)
......
......@@ -391,9 +391,12 @@ class OperatorHelper(object):
@classmethod
def _apply_Gather(cls, arguments, inputs, outputs):
outputs[0].dtype = inputs[0].dtype
axis = arguments['axis']
try:
outputs[0].shape = inputs[0].shape[:]
outputs[0].shape[arguments['axis']] = None
outputs[0].shape = \
inputs[0].shape[:axis] + \
inputs[1].shape[:] + \
inputs[0].shape[axis + 1:]
except:
pass
return outputs
......
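The rewritten shape inference applies the general gather rule: the gathered axis is replaced by the full shape of the indices tensor, rather than being reset to None as before. A minimal NumPy sketch of the same rule (gather_output_shape is a hypothetical helper, not part of this codebase):

import numpy as np

# Gather output shape: x.shape[:axis] + indices.shape + x.shape[axis + 1:]
def gather_output_shape(x_shape, indices_shape, axis):
    return list(x_shape[:axis]) + list(indices_shape) + list(x_shape[axis + 1:])

x = np.zeros((2, 5, 3))
idx = np.zeros((4,), dtype='int64')
assert gather_output_shape(x.shape, idx.shape, axis=1) == [2, 4, 3]
assert np.take(x, idx, axis=1).shape == (2, 4, 3)  # NumPy follows the same rule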
......@@ -17,10 +17,10 @@ from . import *
@OpSchema.Inputs(1)
def Gather(inputs, indices, axis=0, acc_gradient=False, **kwargs):
def Gather(inputs, indices, axis=0, zero_grad=True, **kwargs):
"""Gather the input according to the indices along the given axis.
**Type Constraints**: (*int32*, *float32*)
**Type Constraints**: (*bool*, *int8*, *uint8*, *int32*, *int64*, *float16*, *float32*, *float64*)
Parameters
----------
......@@ -30,7 +30,7 @@ def Gather(inputs, indices, axis=0, acc_gradient=False, **kwargs):
The indices to form output tensor.
axis : int, optional
The start axis, can be negative.
acc_gradient : bool, optional
zero_grad : bool, optional
Whether to zero the gradients before accumulating.
Returns
......@@ -40,24 +40,10 @@ def Gather(inputs, indices, axis=0, acc_gradient=False, **kwargs):
"""
arguments = ParseArgs(locals())
arguments['inputs'], arguments['indices'] = [arguments['inputs'],
Tensor.Convert(indices, dtype='int32')], None
output = Tensor.CreateOperator('Gather', **arguments)
try:
output.shape = inputs.shape[:]
if not isinstance(indices, Tensor):
if not isinstance(indices, (list, tuple)):
indices = [indices]
output.shape[axis] = len(indices)
else:
output.shape[axis] = None
except:
pass
return output
arguments['inputs'], arguments['indices'] = \
[arguments['inputs'], Tensor.Convert(
indices, dtype='int64')], None
return Tensor.CreateOperator('Gather', **arguments)
@OpSchema.Inputs(1)
......
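A hedged usage sketch of the updated op (ops and the float32 tensor x are assumed here, not defined by this commit). Indices are now converted to int64, and negative indices are wrapped inside the kernel instead of by the removed CanonicalAxis pass:

# Hypothetical call; assumes x has shape (N, 5, C).
y = ops.Gather(x, indices=[0, 2, -1], axis=1)  # -1 selects the last slice
# y has shape (N, 3, C); pass zero_grad=False to accumulate into an existing dX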
......@@ -283,9 +283,7 @@ def Pool2d(
@OpSchema.Inputs(2)
def ROIPool(inputs, pool_h, pool_w, spatial_scale=1.0, **kwargs):
"""Max RoI Pooling. `[Girshick, 2015] <https://arxiv.org/abs/1504.08083>`_.
The first dimension of input must be ``1``.
"""Max RoIPooling. `[Girshick, 2015] <https://arxiv.org/abs/1504.08083>`_.
**Type Constraints**: (*float16*, *float32*)
......@@ -311,9 +309,7 @@ def ROIPool(inputs, pool_h, pool_w, spatial_scale=1.0, **kwargs):
@OpSchema.Inputs(2)
def ROIAlign(inputs, pool_h=0, pool_w=0, spatial_scale=1.0, sampling_ratio=2, **kwargs):
"""AVG ROIAlign. `[He et.al, 2017] <https://arxiv.org/abs/1703.06870>`_.
The first dimension of input must be ``1``.
"""AVG RoIAlign. `[He et.al, 2017] <https://arxiv.org/abs/1703.06870>`_.
**Type Constraints**: (*float16*, *float32*)
......
......@@ -20,7 +20,7 @@ from multiprocessing import Process
class BlobFetcher(Process):
"""BlobFetcher is deployed to queue blobs from `DataTransformer`_.
It is supported to form ``NHWC`` image blobs and ``1D`` label blobs.
It supports forming *NHWC* image blobs and *1D* label blobs.
"""
def __init__(self, **kwargs):
......
......@@ -26,7 +26,7 @@ from .blob_fetcher import BlobFetcher
class DataBatch(object):
"""DataBatch aims to prefetch data by ``Triple-Buffering``.
"""DataBatch aims to prefetch data by *Triple-Buffering*.
It takes full advantage of Python's processes and threads,
which provides a remarkable I/O speedup for scalable distributed training.
......
......@@ -236,4 +236,4 @@ class Parameters(object):
_param_names = param_name_dict()
layers = Layers()
params = Parameters()
params = Parameters()
\ No newline at end of file
......@@ -354,15 +354,14 @@ class Function(object):
# Store for future development
self.meta_graph = meta_graph
self.graph_name = meta_graph.name
# Call c api to create graph
ws.CreateGraph(meta_graph)
self.graph_name = ws.CreateGraph(meta_graph)
# Bind a lambda callback to run this graph
callback_inputs = self.inputs if explicit_inputs else []
self.callback = lambda *args, **kwargs: \
ws.RunGraph(meta_graph.name, (callback_inputs, args), self.outputs, **kwargs)
ws.RunGraph(self.graph_name, (callback_inputs, args), self.outputs, **kwargs)
# Self return
return self
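The likely motivation for binding self.graph_name to the return value (inferred, not stated in the commit): the workspace may register the graph under a name other than meta_graph.name, so the run callback must use whatever name CreateGraph actually returns:

# Sketch of the contract being relied on.
graph_name = ws.CreateGraph(meta_graph)  # may differ from meta_graph.name
ws.RunGraph(graph_name, (inputs, args), outputs)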
......@@ -386,7 +385,7 @@ def function(inputs=None, outputs=None, givens=None, updater=None):
----------
inputs : sequence of Tensor, optional
The inputs to feed.
outputs : sequence of Tensor, optional
The outputs to fetch.
givens : dict of Tensor, optional
The substitutions to use.
......
......@@ -60,6 +60,7 @@ class Gather(BaseModule):
'n_inputs': 2, 'n_outputs': 1,
'arguments': {
'axis': self.axis,
'zero_grad': True,
}
}
......
......@@ -188,16 +188,16 @@ inline void RetrieveRoIs(
template <typename T>
inline int roi_level(
const int min_level, // e.g. 2
const int max_level, // e.g. 5
const int canonical_level, // e.g. 4
const int canonical_scale, // e.g. 224
const int min_level,
const int max_level,
const int canonical_level,
const int canonical_scale,
T* roi) {
T w = roi[3] - roi[1] + 1;
T h = roi[4] - roi[2] + 1;
// Refer to the settings of the paper
int level = canonical_level + (int)std::log(
std::max(std::sqrt(w * h), (T)1) / (T)canonical_scale);
int level = canonical_level + std::log2(
std::max(std::sqrt(w * h), (T)1) / (T)canonical_scale);
return std::min(max_level, std::max(min_level, level));
}
......
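The std::log to std::log2 change aligns the level assignment with the rule from the FPN paper, which the removed inline comments (min_level = 2, max_level = 5, canonical_level = 4, canonical_scale = 224) were illustrating:

k = \min\big(k_{\max},\; \max\big(k_{\min},\; k_0 + \lfloor \log_2(\sqrt{wh} / s_0) \rfloor\big)\big)

with k_0 the canonical level and s_0 the canonical scale; the natural logarithm understated the level step for large RoIs. One remaining divergence: the implicit int conversion truncates toward zero rather than flooring, so an RoI slightly smaller than the canonical scale still lands on the canonical level rather than one below.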
......@@ -80,7 +80,7 @@ void ProposalOp<Context>::RunWithType(
anchors_.Reshape({ A, 4 });
rcnn::GenerateAnchors<BT>(strides[i],
(int)ratios.size(), 1, &ratios[0], &scales[0],
(int)ratios.size(), 1, &ratios[0], &scales[i],
anchors_.template mutable_data<BT, CPUContext>());
rcnn::GenerateGridAnchors<BT>(
......
......@@ -6,134 +6,93 @@ namespace dragon {
namespace kernel {
/*! CanonicalAxis <T = int32, Device = CPU> */
template <> void CanonicalAxis<int, CPUContext>(
const int count,
const int dim,
int* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) if (y[i] < 0) y[i] += dim;
}
/*! Gather <T = ?, Device = CPU> */
template <typename T>
void _Gather(
const int count,
const int outer_dim,
const int inner_dim,
const int x_slice_dim,
const int y_slice_dim,
const int* indices,
const int64_t* indices,
const T* x,
T* y,
CPUContext* ctx) {
int64_t x_offset, y_offset, x_idx_offset, y_idx_offset;
for (int i = 0; i < y_slice_dim; ++i) {
y_idx_offset = i;
x_idx_offset = indices[y_idx_offset];
for (int n = 0; n < outer_dim; ++n) {
x_offset = (n * x_slice_dim + x_idx_offset) * inner_dim;
y_offset = (n * y_slice_dim + y_idx_offset) * inner_dim;
int64_t x_offset, select_idx;
for (int n = 0; n < outer_dim; ++n) {
for (int i = 0; i < y_slice_dim; ++i) {
select_idx = indices[i];
select_idx = select_idx >= 0 ?
select_idx : select_idx + x_slice_dim;
x_offset = (n * x_slice_dim + select_idx) * inner_dim;
ctx->Copy<T, CPUContext, CPUContext>(
inner_dim, y + y_offset, x + x_offset);
inner_dim, y, x + x_offset);
y += inner_dim;
}
}
}
/*! Gather <T = float32, Device = CPU> */
template <> void Gather<float, CPUContext>(
const int count,
const int outer_dim,
const int inner_dim,
const int x_slice_dim,
const int y_slice_dim,
const int* indices,
const float* x,
float* y,
CPUContext* ctx) {
_Gather<float>(count, outer_dim, inner_dim,
x_slice_dim, y_slice_dim, indices, x, y, ctx);
}
/*! Gather <T = int32, Device = CPU> */
template <> void Gather<int, CPUContext>(
const int count,
const int outer_dim,
const int inner_dim,
const int x_slice_dim,
const int y_slice_dim,
const int* indices,
const int* x,
int* y,
CPUContext* ctx) {
_Gather<int>(count, outer_dim, inner_dim,
x_slice_dim, y_slice_dim, indices, x, y, ctx);
}
/*! GatherGrad <T = ?, Device = CPU> */
template <typename T>
void _GatherGrad(
const int count,
const int outer_dim,
const int inner_dim,
const int x_slice_dim,
const int y_slice_dim,
const int* indices,
const int64_t* indices,
const T* dy,
T* dx,
CPUContext* ctx) {
int64_t x_offset, y_offset, x_idx_offset, y_idx_offset;
for (int i = 0; i < y_slice_dim; ++i) {
y_idx_offset = i;
x_idx_offset = indices[y_idx_offset];
for (int n = 0; n < outer_dim; ++n) {
x_offset = (n * x_slice_dim + x_idx_offset) * inner_dim;
y_offset = (n * y_slice_dim + y_idx_offset) * inner_dim;
int64_t x_offset, select_idx;
for (int n = 0; n < outer_dim; ++n) {
for (int i = 0; i < y_slice_dim; ++i) {
select_idx = indices[i];
select_idx = select_idx >= 0 ?
select_idx : select_idx + x_slice_dim;
x_offset = (n * x_slice_dim + select_idx) * inner_dim;
math::Add<T, CPUContext>(inner_dim,
dy + y_offset, dx + x_offset, dx + x_offset, ctx);
dy, dx + x_offset, dx + x_offset, ctx);
dy += inner_dim;
}
}
}
/*! GatherGrad <T = float32, Device = CPU> */
template <> void GatherGrad<float, CPUContext>(
const int count,
const int outer_dim,
const int inner_dim,
const int x_slice_dim,
const int y_slice_dim,
const int* indices,
const float* dy,
float* dx,
CPUContext* ctx) {
_GatherGrad<float>(count, outer_dim, inner_dim,
x_slice_dim, y_slice_dim, indices, dy, dx, ctx);
}
/*! GatherGrad <T = int32, Device = CPU> */
/*! Kernel Launchers */
#define DEFINE_GATHER_KERNEL_LAUNCHER(name, T) \
template <> void name<T, CPUContext>( \
const int outer_dim, \
const int inner_dim, \
const int x_slice_dim, \
const int y_slice_dim, \
const int64_t* indices, \
const T* x, \
T* y, \
CPUContext* ctx) { \
_##name<T> \
(outer_dim, inner_dim, x_slice_dim, \
y_slice_dim, indices, x, y, ctx); \
}
template <> void GatherGrad<int, CPUContext>(
const int count,
const int outer_dim,
const int inner_dim,
const int x_slice_dim,
const int y_slice_dim,
const int* indices,
const int* dy,
int* dx,
CPUContext* ctx) {
_GatherGrad<int>(count, outer_dim, inner_dim,
x_slice_dim, y_slice_dim, indices, dy, dx, ctx);
}
DEFINE_GATHER_KERNEL_LAUNCHER(Gather, bool);
DEFINE_GATHER_KERNEL_LAUNCHER(Gather, int8_t);
DEFINE_GATHER_KERNEL_LAUNCHER(Gather, uint8_t);
DEFINE_GATHER_KERNEL_LAUNCHER(Gather, int);
DEFINE_GATHER_KERNEL_LAUNCHER(Gather, int64_t);
DEFINE_GATHER_KERNEL_LAUNCHER(Gather, float16);
DEFINE_GATHER_KERNEL_LAUNCHER(Gather, float);
DEFINE_GATHER_KERNEL_LAUNCHER(Gather, double);
DEFINE_GATHER_KERNEL_LAUNCHER(GatherGrad, int8_t);
DEFINE_GATHER_KERNEL_LAUNCHER(GatherGrad, uint8_t);
DEFINE_GATHER_KERNEL_LAUNCHER(GatherGrad, int);
DEFINE_GATHER_KERNEL_LAUNCHER(GatherGrad, int64_t);
DEFINE_GATHER_KERNEL_LAUNCHER(GatherGrad, float16);
DEFINE_GATHER_KERNEL_LAUNCHER(GatherGrad, float);
DEFINE_GATHER_KERNEL_LAUNCHER(GatherGrad, double);
#undef DEFINE_GATHER_KERNEL_LAUNCHER
} // namespace kernel
......
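The restructured CPU kernel performs outer_dim * y_slice_dim contiguous copies of inner_dim elements, wrapping negative indices inline; this replaces the deleted CanonicalAxis pass, which mutated the indices buffer in place. A NumPy reference for the semantics, viewing x as (outer_dim, x_slice_dim, inner_dim):

import numpy as np

def gather_ref(x3, indices):
    # x3: (outer_dim, x_slice_dim, inner_dim); indices may be negative
    idx = np.asarray(indices)
    idx = np.where(idx < 0, idx + x3.shape[1], idx)  # same wrap as the kernel
    return x3[:, idx, :]  # (outer_dim, y_slice_dim, inner_dim)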
......@@ -2,160 +2,176 @@
#include "core/context_cuda.h"
#include "utils/op_kernel.h"
#include "utils/cub_device.h"
namespace dragon {
namespace kernel {
/*! CanonicalAxis <T = int32, Device = CUDA> */
template <typename T>
__global__ void _CanonicalAxis(
const int count,
const int dim,
T* y) {
CUDA_1D_KERNEL_LOOP(idx, count) {
if (y[idx] < 0) y[idx] += dim;
}
}
template <> void CanonicalAxis<int, CUDAContext>(
const int count,
const int dim,
int* y,
CUDAContext* ctx) {
_CanonicalAxis<int>
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >
(count, dim, y);
}
/*! Gather <T = ?, Device = CUDA> */
template <typename T>
__global__ void _Gather(
const int count,
const int outer_dim,
const int nthreads,
const int inner_dim,
const int x_slice_dim,
const int y_slice_dim,
const int* indices,
const int64_t* indices,
const T* x,
T* y) {
CUDA_1D_KERNEL_LOOP(idx, count) {
const int outer_idx = idx / inner_dim / y_slice_dim;
const int slice_idx = idx % inner_dim;
const int y_idx_offset = (idx / inner_dim) % y_slice_dim;
const int x_idx_offset = indices[y_idx_offset];
const int x_idx = (outer_idx * x_slice_dim + x_idx_offset)
* inner_dim + slice_idx;
y[idx] = x[x_idx];
CUDA_1D_KERNEL_LOOP(y_idx, nthreads) {
const int outer_idx = y_idx / inner_dim / y_slice_dim;
const int inner_idx = y_idx % inner_dim;
#if __CUDA_ARCH__ >= 350
int select_idx = __ldg(indices +
((y_idx / inner_dim) % y_slice_dim));
#else
int select_idx = indices[
(y_idx / inner_dim) % y_slice_dim];
#endif
select_idx = select_idx >= 0 ?
select_idx : select_idx + x_slice_dim;
const int x_idx = (outer_idx * x_slice_dim + select_idx)
* inner_dim + inner_idx;
y[y_idx] = x[x_idx];
}
}
/*! Gather <T = float32, Device = CUDA> */
template <> void Gather<float, CUDAContext>(
const int count,
const int outer_dim,
const int inner_dim,
const int x_slice_dim,
const int y_slice_dim,
const int* indices,
const float* x,
float* y,
CUDAContext* ctx) {
_Gather<float>
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >
(count, outer_dim, inner_dim,
x_slice_dim, y_slice_dim,
indices, x, y);
}
/*! Gather <T = int32, Device = CUDA> */
template <> void Gather<int, CUDAContext>(
const int count,
const int outer_dim,
const int inner_dim,
const int x_slice_dim,
const int y_slice_dim,
const int* indices,
const int* x,
int* y,
CUDAContext* ctx) {
_Gather<int>
<< <CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >
(count, outer_dim, inner_dim,
x_slice_dim, y_slice_dim,
indices, x, y);
}
/*! GatherGrad <T = ?, Device = CUDA> */
template <typename T>
__global__ void _GatherGrad(
const int count,
const int outer_dim,
const int nthreads,
const int inner_dim,
const int x_slice_dim,
const int y_slice_dim,
const int* indices,
const int64_t* indices,
const T* dy,
T* dx) {
CUDA_1D_KERNEL_LOOP(idx, count) {
const int outer_idx = idx / inner_dim / y_slice_dim;
const int slice_idx = idx % inner_dim;
const int y_idx_offset = (idx / inner_dim) % y_slice_dim;
const int x_idx_offset = indices[y_idx_offset];
const int x_idx = (outer_idx * x_slice_dim + x_idx_offset)
* inner_dim + slice_idx;
atomicAdd(dx + x_idx, dy[idx]);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
const int outer_idx = i / inner_dim;
const int inner_idx = i % inner_dim;
for (int j = 0; j < y_slice_dim; ++j) {
#if __CUDA_ARCH__ >= 350
int select_idx = __ldg(indices + j);
#else
int select_idx = indices[j];
#endif
select_idx = select_idx >= 0 ?
select_idx : select_idx + x_slice_dim;
const int x_idx = (outer_idx * x_slice_dim + select_idx)
* inner_dim + inner_idx;
const int y_idx = (outer_idx * y_slice_dim + j)
* inner_dim + inner_idx;
dx[x_idx] += dy[y_idx];
}
}
}
/*! GatherGrad <T = float32, Device = CUDA> */
/*! GatherGrad <T = float16, Device = CUDA> */
template <> void GatherGrad<float, CUDAContext>(
const int count,
const int outer_dim,
template <> __global__ void _GatherGrad<half>(
const int nthreads,
const int inner_dim,
const int x_slice_dim,
const int y_slice_dim,
const int* indices,
const float* dy,
float* dx,
CUDAContext* ctx) {
_GatherGrad<float>
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >
(count, outer_dim, inner_dim,
x_slice_dim, y_slice_dim,
indices, dy, dx);
const int64_t* indices,
const half* dy,
half* dx) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530
const int outer_idx = i / inner_dim;
const int inner_idx = i % inner_dim;
for (int j = 0; j < y_slice_dim; ++j) {
int select_idx = __ldg(indices + j);
select_idx = select_idx >= 0 ?
select_idx : select_idx + x_slice_dim;
const int x_idx = (outer_idx * x_slice_dim + select_idx)
* inner_dim + inner_idx;
const int y_idx = (outer_idx * y_slice_dim + j)
* inner_dim + inner_idx;
dx[x_idx] = __hadd(dx[x_idx], dy[y_idx]);
}
#endif
}
}
/*! GatherGrad <T = int32, Device = CUDA> */
/*! Kernel Launchers */
#define DEFINE_GATHER_KERNEL_LAUNCHER(T) \
template <> void Gather<T, CUDAContext>( \
const int outer_dim, \
const int inner_dim, \
const int x_slice_dim, \
const int y_slice_dim, \
const int64_t* indices, \
const T* x, \
T* y, \
CUDAContext* ctx) { \
auto nthreads = outer_dim * y_slice_dim * inner_dim; \
_Gather<T> \
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >> > \
(nthreads, inner_dim, x_slice_dim, \
y_slice_dim, indices, x, y); \
}
#define DEFINE_GATHER_GRAD_KERNEL_LAUNCHER(T) \
template <> void GatherGrad<T, CUDAContext>( \
const int outer_dim, \
const int inner_dim, \
const int x_slice_dim, \
const int y_slice_dim, \
const int64_t* indices, \
const T* dy, \
T* dx, \
CUDAContext* ctx) { \
auto nthreads = outer_dim * inner_dim; \
_GatherGrad<T> \
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >> > \
(nthreads, inner_dim, x_slice_dim, \
y_slice_dim, indices, dy, dx); \
}
template <> void GatherGrad<int, CUDAContext>(
const int count,
DEFINE_GATHER_KERNEL_LAUNCHER(bool);
DEFINE_GATHER_KERNEL_LAUNCHER(int8_t);
DEFINE_GATHER_KERNEL_LAUNCHER(uint8_t);
DEFINE_GATHER_KERNEL_LAUNCHER(int);
DEFINE_GATHER_KERNEL_LAUNCHER(int64_t);
DEFINE_GATHER_KERNEL_LAUNCHER(float16);
DEFINE_GATHER_KERNEL_LAUNCHER(float);
DEFINE_GATHER_KERNEL_LAUNCHER(double);
DEFINE_GATHER_GRAD_KERNEL_LAUNCHER(int8_t);
DEFINE_GATHER_GRAD_KERNEL_LAUNCHER(uint8_t);
DEFINE_GATHER_GRAD_KERNEL_LAUNCHER(int);
DEFINE_GATHER_GRAD_KERNEL_LAUNCHER(int64_t);
DEFINE_GATHER_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GATHER_GRAD_KERNEL_LAUNCHER(double);
template <> void GatherGrad<float16, CUDAContext>(
const int outer_dim,
const int inner_dim,
const int x_slice_dim,
const int y_slice_dim,
const int* indices,
const int* dy,
int* dx,
const int64_t* indices,
const float16* dy,
float16* dx,
CUDAContext* ctx) {
_GatherGrad<int>
<< < CUDA_BLOCKS(count), CUDA_THREADS,
auto nthreads = outer_dim * inner_dim;
_GatherGrad<half>
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >
(count, outer_dim, inner_dim,
x_slice_dim, y_slice_dim,
indices, dy, dx);
(nthreads, inner_dim, x_slice_dim,
y_slice_dim, indices,
reinterpret_cast<const half*>(dy),
reinterpret_cast<half*>(dx));
}
#undef DEFINE_GATHER_KERNEL_LAUNCHER
#undef DEFINE_GATHER_GRAD_KERNEL_LAUNCHER
} // namespace kernel
} // namespace dragon
......
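A design note on the rewritten CUDA GatherGrad: it launches outer_dim * inner_dim threads and loops over the selected indices serially inside each thread, so every dx element is owned by exactly one thread. That removes the old atomicAdd, makes duplicate indices accumulate deterministically, and is what enables the half path (__hadd only needs sm_53, whereas a half atomicAdd is not generally available on these targets). The per-thread work, as a Python reference where one outer iteration stands in for one CUDA thread:

for i in range(outer_dim * inner_dim):
    outer_idx, inner_idx = divmod(i, inner_dim)
    for j, sel in enumerate(indices):  # serial within a thread: no races
        sel = sel + x_slice_dim if sel < 0 else sel
        dx[(outer_idx * x_slice_dim + sel) * inner_dim + inner_idx] += \
            dy[(outer_idx * y_slice_dim + j) * inner_dim + inner_idx]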
#include "contrib/onnx/onnx_backend.h"
#include "onnx/onnx_backend.h"
namespace dragon {
......
#include "core/operator_schema.h"
#include "utils/proto_utils.h"
#include "contrib/onnx/onnx_backend.h"
#include "onnx/onnx_backend.h"
namespace dragon {
......
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* Codes are based on:
*
* <https://github.com/pytorch/pytorch/blob/master/caffe2/onnx/backend.h>
*
* ------------------------------------------------------------
*/
#ifndef DRAGON_CONTRIB_ONNX_ONNX_BACKEND_H_
#define DRAGON_CONTRIB_ONNX_ONNX_BACKEND_H_
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* Codes are based on:
*
* <https://github.com/pytorch/pytorch/blob/master/caffe2/onnx/backend.h>
*
* ------------------------------------------------------------
*/
#ifndef DRAGON_ONNX_ONNX_BACKEND_H_
#define DRAGON_ONNX_ONNX_BACKEND_H_
#include "core/common.h"
#include "proto/onnx.pb.h"
......@@ -228,4 +228,4 @@ class ONNXBackend {
} // namespace dragon
#endif // DRAGON_CONTRIB_ONNX_ONNX_BACKEND_H_
\ No newline at end of file
#endif // DRAGON_ONNX_ONNX_BACKEND_H_
\ No newline at end of file
#include "utils/map_utils.h"
#include "contrib/onnx/onnx_backend.h"
#include "onnx/onnx_backend.h"
namespace dragon {
......
#include "contrib/onnx/onnx_backend.h"
#include "onnx/onnx_backend.h"
namespace dragon {
......
......@@ -57,7 +57,7 @@ void MaximumOp<Context>::RunOnDevice() {
else if (XIsType(Input(0), double)) RunWithType<double>();
else LOG(FATAL) << DTypeHelper(Input(0), {
"int8", "uint8", "int32", "int64",
"float16", "float32", "float64",
"float16", "float32", "float64",
});
}
......
......@@ -13,12 +13,10 @@ namespace dragon {
template <class Context> template <typename T>
void GatherOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* indices = Input(1).template mutable_data<int, Context>();
auto* indices = Input(1).template mutable_data<int64_t, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::CanonicalAxis(Input(1).count(), x_slice_dim, indices, ctx());
kernel::Gather(Output(0)->count(),
kernel::Gather(
outer_dim, inner_dim,
x_slice_dim, y_slice_dim,
indices, Xdata, Ydata, ctx());
......@@ -28,22 +26,38 @@ template <class Context>
void GatherOp<Context>::RunOnDevice() {
DETERMINE_RUNTIME_ARGUMENTS(Input(0));
output_dims = Input(0).dims();
x_slice_dim = Input(0).dim(axis);
output_dims[axis] = y_slice_dim = Input(1).count();
y_slice_dim = Input(1).count();
outer_dim = Input(0).count(0, axis);
inner_dim = Input(0).count(axis + 1);
CHECK_GT(y_slice_dim, 0) << "\nLength of indices must be > 0.";
const auto& s1 = Input(0).dims().begin();
const auto& e1 = s1 + axis, s3 = e1 + 1;
const auto& e3 = Input(0).dims().end();
const auto& s2 = Input(1).dims().begin();
const auto& e2 = Input(1).dims().end();
output_dims.assign(s1, e1);
output_dims.insert(output_dims.end(), s2, e2);
output_dims.insert(output_dims.end(), s3, e3);
Output(0)->Reshape(output_dims);
CHECK(Input(1).template IsType<int>())
<< "\nThe type of indices should be int32.";
CHECK(Input(1).template IsType<int64_t>())
<< "\nThe type of indices should be int64.";
if (XIsType(Input(0), float)) RunWithType<float>();
if (XIsType(Input(0), bool)) RunWithType<bool>();
else if (XIsType(Input(0), int8_t)) RunWithType<int8_t>();
else if (XIsType(Input(0), uint8_t)) RunWithType<uint8_t>();
else if (XIsType(Input(0), int)) RunWithType<int>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "int32" });
else if (XIsType(Input(0), int64_t)) RunWithType<int64_t>();
else if (XIsType(Input(0), float16)) RunWithType<float16>();
else if (XIsType(Input(0), float)) RunWithType<float>();
else if (XIsType(Input(0), double)) RunWithType<double>();
else LOG(FATAL) << DTypeHelper(Input(0), {
"bool", "int8", "uint8", "int32", "int64",
"float16", "float32", "float64",
});
}
DEPLOY_CPU(Gather);
......@@ -54,18 +68,17 @@ OPERATOR_SCHEMA(Gather).NumInputs(2).NumOutputs(1);
template <class Context> template <typename T>
void GatherGradientOp<Context>::RunWithType() {
auto* indices = Input(1).template data<int, Context>();
auto* indices = Input(1).template data<int64_t, Context>();
auto* dYdata = Input(-1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
T* dXdata = nullptr;
if (!acc_grad) {
dXdata = Output(0)->template mutable_data<T, Context>();
math::Set(Output(0)->count(), cast::to<T>(0.f), dXdata, ctx());
} else {
dXdata = Output(0)->template mutable_data<T, Context>();
// Optionally zero the gradients
if (zero_grad) {
math::Set(Output(0)->count(),
cast::to<T>(0.f), dXdata, ctx());
}
kernel::GatherGrad(Input(-1).count(),
kernel::GatherGrad(
outer_dim, inner_dim,
x_slice_dim, y_slice_dim,
indices, dYdata, dXdata, ctx());
......@@ -82,12 +95,20 @@ void GatherGradientOp<Context>::RunOnDevice() {
Output(0)->ReshapeLike(Input(0));
CHECK(Input(1).template IsType<int>())
<< "\nThe type of indices should be int32.";
CHECK(Input(1).template IsType<int64_t>())
<< "\nThe type of indices should be int64.";
if (XIsType(Input(0), float)) RunWithType<float>();
if (XIsType(Input(0), int8_t)) RunWithType<int8_t>();
else if (XIsType(Input(0), uint8_t)) RunWithType<uint8_t>();
else if (XIsType(Input(0), int)) RunWithType<int>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "int32" });
else if (XIsType(Input(0), int64_t)) RunWithType<int64_t>();
else if (XIsType(Input(0), float16)) RunWithType<float16>();
else if (XIsType(Input(0), float)) RunWithType<float>();
else if (XIsType(Input(0), double)) RunWithType<double>();
else LOG(FATAL) << DTypeHelper(Input(0), {
"int8", "uint8", "int32", "int64",
"float16", "float32", "float64",
});
}
DEPLOY_CPU(GatherGradient);
......
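The flag flip preserves default behavior: the old acc_gradient=False default zeroed dX before scattering, and the new zero_grad=True default does the same. Because the kernel always accumulates with math::Add, zero_grad=False adds this gradient into whatever dX already holds. A NumPy reference (the serial loop mirrors the kernel and handles duplicate indices correctly):

import numpy as np

def gather_grad_ref(dX3, dY3, indices, zero_grad=True):
    # dX3: (outer, x_slice, inner); dY3: (outer, y_slice, inner)
    if zero_grad:
        dX3[...] = 0  # fresh gradient (the default)
    for j, sel in enumerate(indices):  # duplicates accumulate
        sel = sel + dX3.shape[1] if sel < 0 else sel
        dX3[:, sel, :] += dY3[:, j, :]
    return dX3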
......@@ -15,6 +15,27 @@ void DropBlock2dOp<Context>::RunWithType() {
Output(0)->count(), Ydata, Xdata);
}
} else if (phase() == "TRAIN") {
if (data_format == "NCHW") {
n = Input(0).dim(0), c = Input(0).dim(1);
h = Input(0).dim(2), w = Input(0).dim(3);
} else if (data_format == "NHWC") {
n = Input(0).dim(0), c = Input(0).dim(-1);
h = Input(0).dim(1), w = Input(0).dim(2);
}
seed_h = h - block_size + 1;
seed_w = w - block_size + 1;
CHECK(seed_h > 0 && seed_w > 0)
<< "\nExcepted block_size <= feat_size.";
if (decrement > 0 && apply_prob > keep_prob()) {
apply_prob -= decrement;
} else { apply_prob = keep_prob(); }
gamma = (1.f - apply_prob) / (block_size * block_size);
gamma *= (alpha * (h * w) / (seed_h * seed_w));
auto* mask = ws()->CreateTensor(mount_name(
"drop_block/mask"))->ReshapeLike(Input(0));
auto* norm = ws()->CreateTensor(mount_name(
......@@ -58,29 +79,8 @@ void DropBlock2dOp<Context>::RunWithType() {
template <class Context>
void DropBlock2dOp<Context>::RunOnDevice() {
if (data_format == "NCHW") {
n = Input(0).dim(0), c = Input(0).dim(1);
h = Input(0).dim(2), w = Input(0).dim(3);
} else if (data_format == "NHWC") {
n = Input(0).dim(0), c = Input(0).dim(-1);
h = Input(0).dim(1), w = Input(0).dim(2);
}
seed_h = h - block_size + 1;
seed_w = w - block_size + 1;
CHECK(seed_h > 0 && seed_w > 0)
<< "\nExcepted block_size <= feat_size.";
Output(0)->ReshapeLike(Input(0));
if (decrement > 0 && apply_prob > keep_prob()) {
apply_prob -= decrement;
} else { apply_prob = keep_prob(); }
gamma = (1.f - apply_prob) / (block_size * block_size);
gamma *= (alpha * (h * w) / (seed_h * seed_w));
if (XIsType(Input(0), float)) RunWithType<float>();
else if (XIsType(Input(0), float16)) RunWithType<float16>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
......
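For reference, the computation hoisted into RunWithType is the drop-rate formula from the DropBlock paper (Ghiasi et al., 2018), extended here with an alpha multiplier and a linearly scheduled apply_prob in place of a fixed keep_prob:

\gamma = \frac{1 - p_{\text{keep}}}{\text{block\_size}^2} \cdot \frac{HW}{(H - \text{block\_size} + 1)(W - \text{block\_size} + 1)}

Moving it out of RunOnDevice means the dims, seed region, and gamma are only computed on the TRAIN path, where the mask is actually sampled.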