SeetaResearch / Dragon
Commit 1d03e8e2 authored Jan 19, 2019 by Ting PAN
Optimize GatherOp
1 parent c5def39b
Showing 41 changed files with 335 additions and 1225 deletions
Docs/api/python/_static/css/dragon.css
Docs/api/python/contents/config.rst
Docs/api/python/contents/core/tensor.rst
Docs/api/python/contents/core/tensor_utils.rst
Docs/api/python/contents/memonger.rst
Docs/api/python/contents/tools/db.rst
Docs/api/python/contents/tools/im2db.rst
Docs/api/python/contents/tools/summary_writer.rst
Docs/api/python/contents/tools/tensorboard.rst
Docs/api/python/contents/updaters.rst
Docs/api/python/contents/vm/caffe/layer.rst
Docs/api/python/contents/vm/caffe/misc.rst
Docs/api/python/contents/vm/caffe/net.rst
Docs/api/python/contents/vm/caffe/solver.rst
Docs/api/python/contents/vm/theano/compile.rst
Dragon/include/operators/ndarray/gather_op.h
Dragon/include/utils/op_kernel.h
Dragon/modules/cxx/dragon.cc
Dragon/modules/python/py_onnx.h
Dragon/python/dragon/config.py
Dragon/python/dragon/core/helper.py
Dragon/python/dragon/operators/ndarray.py
Dragon/python/dragon/operators/vision.py
Dragon/python/dragon/utils/vision/blob_fetcher.py
Dragon/python/dragon/utils/vision/data_batch.py
Dragon/python/dragon/vm/caffe/model_libs.py
Dragon/python/dragon/vm/caffe/net_spec.py
Dragon/python/dragon/vm/theano/compile/function.py
Dragon/python/dragon/vm/torch/ops/modules/axis.py
Dragon/src/contrib/rcnn/bbox_utils.h
Dragon/src/contrib/rcnn/proposal_op.cc
Dragon/src/kernels/ndarray/gather_op_kernel.cc
Dragon/src/kernels/ndarray/gather_op_kernel.cu
Dragon/src/contrib/onnx/onnx_attibute.cc → Dragon/src/onnx/onnx_attibute.cc
Dragon/src/contrib/onnx/onnx_backend.cc → Dragon/src/onnx/onnx_backend.cc
Dragon/src/contrib/onnx/onnx_backend.h → Dragon/src/onnx/onnx_backend.h
Dragon/src/contrib/onnx/onnx_importer.cc → Dragon/src/onnx/onnx_importer.cc
Dragon/src/contrib/onnx/onnx_initializer.cc → Dragon/src/onnx/onnx_initializer.cc
Dragon/src/operators/arithmetic/maximum_op.cc
Dragon/src/operators/ndarray/gather_op.cc
Dragon/src/operators/vision/drop_block2d_op.cc
Docs/api/python/_static/css/dragon.css
...
@@ -283,14 +283,16 @@ code.docutils.literal:hover {
 dt {
     font-weight: 700;
-    background: #e7f2fa;
+    background: #f7f7f7;
+    border-bottom: solid #0079b2;
-    border-radius: 1px;
+    border-radius: 8px;
     margin-bottom: 20px;
     padding: 8px;
     width: 75%;
 }
 dt:target, .highlighted {
-    background-color: #e7f2fa;
+    background-color: #f7f7f7;
+    border-bottom: 3px solid #c7254e;
 }
...
@@ -299,7 +301,7 @@ dt:target:before {
     content: '';
     display: block;
     height: 65px;
-    margin: -20px 0 0;
+    margin: -20px -8px 8px;
 }
 dl.method dt {
...
Docs/api/python/contents/config.rst
...
@@ -5,8 +5,8 @@
 .. toctree::
     :hidden:

-Quick Shortcut
---------------
+Quick Reference
+--------------

 ========================== =============================================================================
 List Brief
...
Docs/api/python/contents/core/tensor.rst
...
@@ -5,8 +5,8 @@
 .. toctree::
     :hidden:

-Quick Shortcut
---------------
+Quick Reference
+--------------

 ============================== =============================================================================
 List Brief
...
Docs/api/python/contents/core/tensor_utils.rst
...
@@ -5,8 +5,8 @@
 .. toctree::
     :hidden:

-Quick Shortcut
---------------
+Quick Reference
+--------------

 ============================== =============================================================================
 List Brief
...
Docs/api/python/contents/memonger.rst
...
@@ -5,8 +5,8 @@
 .. toctree::
     :hidden:

-Quick Shortcut
---------------
+Quick Reference
+--------------

 ==================== =============================================================================
 List Brief
...
Docs/api/python/contents/tools/db.rst
...
@@ -5,8 +5,8 @@
 .. toctree::
     :hidden:

-Quick Shortcut
---------------
+Quick Reference
+--------------

 ==================== =============================================================================
 List Brief
...
Docs/api/python/contents/tools/im2db.rst
...
@@ -5,8 +5,8 @@
 .. toctree::
     :hidden:

-Quick Shortcut
---------------
+Quick Reference
+--------------

 ==================== =============================================================================
 List Brief
...
Docs/api/python/contents/tools/summary_writer.rst
...
@@ -5,8 +5,8 @@
 .. toctree::
     :hidden:

-Quick Shortcut
---------------
+Quick Reference
+--------------

 ==================== =============================================================================
 List Brief
...
Docs/api/python/contents/tools/tensorboard.rst
...
@@ -5,8 +5,8 @@
 .. toctree::
     :hidden:

-Quick Shortcut
---------------
+Quick Reference
+--------------

 ==================== =============================================================================
 List Brief
...
Docs/api/python/contents/updaters.rst
...
@@ -5,8 +5,8 @@
 .. toctree::
     :hidden:

-Quick Shortcut
---------------
+Quick Reference
+--------------

 ==================== =============================================================================
 List Brief
...
Docs/api/python/contents/vm/caffe/layer.rst
...
@@ -112,8 +112,8 @@ List Brief
 ================================= =============================================================================

-Quick Shortcut
---------------
+Quick Reference
+--------------

 ==================== =============================================================================
 List Brief
...
Docs/api/python/contents/vm/caffe/misc.rst
...
@@ -5,8 +5,8 @@
 .. toctree::
     :hidden:

-Quick Shortcut
---------------
+Quick Reference
+--------------

 ========================= ============================================================================
 List Brief
...
Docs/api/python/contents/vm/caffe/net.rst
...
@@ -5,8 +5,8 @@
 .. toctree::
     :hidden:

-Quick Shortcut
---------------
+Quick Reference
+--------------

 ========================= =============================================================================
 List Brief
...
Docs/api/python/contents/vm/caffe/solver.rst
...
@@ -5,8 +5,8 @@
 .. toctree::
     :hidden:

-Quick Shortcut
---------------
+Quick Reference
+--------------

 ==================== =============================================================================
 List Brief
...
Docs/api/python/contents/vm/theano/compile.rst
...
@@ -6,8 +6,8 @@
     :hidden:

-Quick Shortcut
---------------
+Quick Reference
+--------------

 ============================== =======================================================================
 List Brief
...
Dragon/include/operators/ndarray/gather_op.h
...
@@ -39,15 +39,15 @@ class GatherGradientOp final : public Operator<Context> {
     GatherGradientOp(const OperatorDef& def, Workspace* ws)
         : Operator<Context>(def, ws),
           axis(OperatorBase::Arg<int64_t>("axis", 0)),
-          acc_grad(OperatorBase::Arg<bool>("acc_gradient", false)) {}
+          zero_grad(OperatorBase::Arg<bool>("zero_grad", true)) {}
     USE_OPERATOR_FUNCTIONS;

     void RunOnDevice() override;
     template <typename T> void RunWithType();

  protected:
+    bool zero_grad;
     int64_t axis, outer_dim, inner_dim, x_slice_dim, y_slice_dim;
-    bool acc_grad;
 };

}  // namespace dragon
...
Dragon/include/utils/op_kernel.h
...
@@ -601,32 +601,23 @@ void ArgMin(

/*! ndarray.gather */

-template <typename T, class Context>
-void CanonicalAxis(
-    const int count,
-    const int dim,
-    T* y,
-    Context* ctx);
-
 template <typename T, class Context>
 void Gather(
-    const int count,
     const int outer_dim,
     const int inner_dim,
     const int x_slice_dim,
     const int y_slice_dim,
-    const int* indices,
+    const int64_t* indices,
     const T* x,
     T* y,
     Context* ctx);

 template <typename T, class Context>
 void GatherGrad(
-    const int count,
     const int outer_dim,
     const int inner_dim,
     const int x_slice_dim,
     const int y_slice_dim,
-    const int* indices,
+    const int64_t* indices,
     const T* dy,
     T* dx,
     Context* ctx);
...
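For orientation, the four dimension arguments in these declarations describe the input flattened around the gather axis: outer_dim is the product of the dimensions before the axis, x_slice_dim is the size of the gathered axis, y_slice_dim is the number of indices, and inner_dim is the product of the dimensions after the axis. A small NumPy sketch of that decomposition (illustrative only, not part of this commit):

import numpy as np

x = np.random.rand(2, 3, 4, 5).astype('float32')
axis = 2
indices = np.array([3, 0], dtype=np.int64)     # int64, matching the new signature

outer_dim = int(np.prod(x.shape[:axis]))       # 2 * 3 = 6
x_slice_dim = x.shape[axis]                    # 4
y_slice_dim = indices.size                     # 2
inner_dim = int(np.prod(x.shape[axis + 1:]))   # 5

y = np.take(x, indices, axis=axis)
assert y.shape == (2, 3, 2, 5)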
Dragon/modules/cxx/dragon.cc
...
@@ -3,7 +3,7 @@
 #include "core/common.h"
 #include "utils/proto_utils.h"
 #include "utils/caffemodel.h"
-#include "contrib/onnx/onnx_backend.h"
+#include "onnx/onnx_backend.h"
 #include "dragon.h"
...
Dragon/modules/python/py_onnx.h
...
@@ -11,7 +11,7 @@
 #ifndef DRAGON_PYTHON_PY_ONNX_H_
 #define DRAGON_PYTHON_PY_ONNX_H_
-#include "contrib/onnx/onnx_backend.h"
+#include "onnx/onnx_backend.h"
 #include "py_dragon.h"
...
Dragon/python/dragon/config.py
...
@@ -270,7 +270,7 @@ def ExportMetaGraph(prefix=''):
     These text files will be saved as the following format:

-    ``prefix/Graph_xxx.metatxt``
+    *prefix/Graph.metatxt*

     Note that an empty prefix will leads to invalid exporting.
...
@@ -293,12 +293,12 @@ def SetLoggingLevel(level):
     Parameters
     ----------
-    level : str
-        The level, ``DEBUG``, ``INFO``, ``WARNING``, ``ERROR`` or ``FATAL``.
+    level : {'DEBUG', 'INFO', 'WARNING', 'ERROR', 'FATAL'}, required
+        The logging level.

     Notes
     -----
-    The default level is ``INFO``.
+    The default level is *INFO*.

     """
     C.SetLogLevelCC(level)
...
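A minimal usage sketch of the function documented above, assuming the module is importable as dragon.config (it is defined in Dragon/python/dragon/config.py):

from dragon import config

config.SetLoggingLevel('DEBUG')   # one of DEBUG, INFO, WARNING, ERROR, FATAL
config.SetLoggingLevel('INFO')    # INFO is the default level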
Dragon/python/dragon/core/helper.py
...
@@ -391,9 +391,12 @@ class OperatorHelper(object):
     @classmethod
     def _apply_Gather(cls, arguments, inputs, outputs):
         outputs[0].dtype = inputs[0].dtype
+        axis = arguments['axis']
         try:
-            outputs[0].shape = inputs[0].shape[:]
-            outputs[0].shape[arguments['axis']] = None
+            outputs[0].shape = \
+                inputs[0].shape[:axis] + \
+                inputs[1].shape[:] + \
+                inputs[0].shape[axis + 1:]
         except:
             pass
         return outputs
...
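The new _apply_Gather splices the indices shape into the input shape at the gather axis, i.e. out.shape == x.shape[:axis] + indices.shape + x.shape[axis + 1:]. A quick NumPy check of that rule (illustration only):

import numpy as np

x = np.zeros((4, 5, 6), dtype='float32')
indices = np.array([0, 2, 3], dtype=np.int64)   # shape (3,)
out = np.take(x, indices, axis=1)
assert out.shape == (4, 3, 6)                   # (4,) + (3,) + (6,)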
Dragon/python/dragon/operators/ndarray.py
...
@@ -17,10 +17,10 @@ from . import *
 @OpSchema.Inputs(1)
-def Gather(inputs, indices, axis=0, acc_gradient=False, **kwargs):
+def Gather(inputs, indices, axis=0, zero_grad=True, **kwargs):
     """Gather the input according to the indices along the given axis.

-    **Type Constraints**: (*int32*, *float32*)
+    **Type Constraints**: (*bool*, *int8*, *uint8*, *int32*, *int64*, *float16*, *float32*, *float64*)

     Parameters
     ----------
...
@@ -30,7 +30,7 @@ def Gather(inputs, indices, axis=0, acc_gradient=False, **kwargs):
         The indices to form output tensor.
     axis : int, optional
         The start axis, can be negative.
-    acc_gradient : bool, optional
+    zero_grad : bool, optional
         Whether to accumulate the gradients.

     Returns
...
@@ -40,24 +40,10 @@ def Gather(inputs, indices, axis=0, acc_gradient=False, **kwargs):
     """
     arguments = ParseArgs(locals())
-    arguments['inputs'], arguments['indices'] = \
-        [arguments['inputs'], Tensor.Convert(indices, dtype='int32')], None
-    output = Tensor.CreateOperator('Gather', **arguments)
-    try:
-        output.shape = inputs.shape[:]
-        if not isinstance(indices, Tensor):
-            if not isinstance(indices, (list, tuple)):
-                indices = [indices]
-            output.shape[axis] = len(indices)
-        else:
-            output.shape[axis] = None
-    except:
-        pass
-    return output
+    arguments['inputs'], arguments['indices'] = \
+        [arguments['inputs'], Tensor.Convert(indices, dtype='int64')], None
+    return Tensor.CreateOperator('Gather', **arguments)

 @OpSchema.Inputs(1)
...
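With this change the Python wrapper no longer guesses the output shape itself; shape inference is delegated to OperatorHelper._apply_Gather (see helper.py above), and indices are always converted to an int64 tensor. A hypothetical call sketch; the dragon.ops alias and the Tensor constructor arguments are assumptions for illustration, not verified against this revision:

import dragon.ops as ops
from dragon.core.tensor import Tensor

x = Tensor(shape=[8, 32], dtype='float32')    # assumed constructor form
y = ops.Gather(x, indices=[0, 2, 5], axis=0)  # zero_grad defaults to True
# expected inferred shape: [3, 32]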
Dragon/python/dragon/operators/vision.py
...
@@ -283,9 +283,7 @@ def Pool2d(
 @OpSchema.Inputs(2)
 def ROIPool(inputs, pool_h, pool_w, spatial_scale=1.0, **kwargs):
-    """Max RoI Pooling. `[Girshick, 2015] <https://arxiv.org/abs/1504.08083>`_.
-
-    The first dimension of input must be ``1``.
+    """Max RoIPooling. `[Girshick, 2015] <https://arxiv.org/abs/1504.08083>`_.

     **Type Constraints**: (*float16*, *float32*)
...
@@ -311,9 +309,7 @@ def ROIPool(inputs, pool_h, pool_w, spatial_scale=1.0, **kwargs):
 @OpSchema.Inputs(2)
 def ROIAlign(inputs, pool_h=0, pool_w=0, spatial_scale=1.0, sampling_ratio=2, **kwargs):
-    """AVG ROIAlign. `[He et.al, 2017] <https://arxiv.org/abs/1703.06870>`_.
-
-    The first dimension of input must be ``1``.
+    """AVG RoIAlign. `[He et.al, 2017] <https://arxiv.org/abs/1703.06870>`_.

     **Type Constraints**: (*float16*, *float32*)
...
Dragon/python/dragon/utils/vision/blob_fetcher.py
...
@@ -20,7 +20,7 @@ from multiprocessing import Process
 class BlobFetcher(Process):
     """BlobFetcher is deployed to queue blobs from `DataTransformer`_.

-    It is supported to form ``NHWC`` image blobs and ``1D`` label blobs.
+    It is supported to form *NHWC* image blobs and *1d* label blobs.

     """
     def __init__(self, **kwargs):
...
Dragon/python/dragon/utils/vision/data_batch.py
...
@@ -26,7 +26,7 @@ from .blob_fetcher import BlobFetcher
 class DataBatch(object):
-    """DataBatch aims to prefetch data by ``Triple-Buffering``.
+    """DataBatch aims to prefetch data by *Triple-Buffering*.

     It takes full advantages of the Process/Thread of Python,
     which provides remarkable I/O speed up for scalable distributed training.
...
Dragon/python/dragon/vm/caffe/model_libs.py
deleted 100644 → 0
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# Codes are based on:
#
# <https://github.com/weiliu89/caffe/blob/ssd/python/caffe/model_libs.py>
#
# ------------------------------------------------------------
import
os
from
dragon.vm.caffe
import
layers
as
L
from
dragon.vm.caffe
import
params
as
P
from
dragon.vm.caffe.proto
import
caffe_pb2
def
check_if_exist
(
path
):
return
os
.
path
.
exists
(
path
)
def
make_if_not_exist
(
path
):
if
not
os
.
path
.
exists
(
path
):
os
.
makedirs
(
path
)
def
UnpackVariable
(
var
,
num
):
if
type
(
var
)
is
list
and
len
(
var
)
==
num
:
return
var
else
:
ret
=
[]
if
type
(
var
)
is
list
:
assert
len
(
var
)
==
1
for
i
in
range
(
0
,
num
):
ret
.
append
(
var
[
0
])
else
:
for
i
in
range
(
0
,
num
):
ret
.
append
(
var
)
return
ret
def
ConvBNLayer
(
net
,
from_layer
,
out_layer
,
use_bn
,
use_relu
,
num_output
,
kernel_size
,
pad
,
stride
,
dilation
=
1
,
use_scale
=
True
,
lr_mult
=
1
,
conv_prefix
=
''
,
conv_postfix
=
''
,
bn_prefix
=
''
,
bn_postfix
=
'_bn'
,
scale_prefix
=
''
,
scale_postfix
=
'_scale'
,
bias_prefix
=
''
,
bias_postfix
=
'_bias'
,
**
bn_params
):
if
use_bn
:
# parameters for convolution layer with batchnorm.
kwargs
=
{
'param'
:
[
dict
(
lr_mult
=
lr_mult
,
decay_mult
=
1
)],
'weight_filler'
:
dict
(
type
=
'gaussian'
,
std
=
0.01
),
'bias_term'
:
False
,
}
eps
=
bn_params
.
get
(
'eps'
,
1e-3
)
moving_average_fraction
=
bn_params
.
get
(
'moving_average_fraction'
,
0.9
)
use_global_stats
=
bn_params
.
get
(
'use_global_stats'
,
False
)
# parameters for batchnorm layer.
bn_kwargs
=
{
'param'
:
[
dict
(
lr_mult
=
0
,
decay_mult
=
0
),
dict
(
lr_mult
=
0
,
decay_mult
=
0
),
dict
(
lr_mult
=
0
,
decay_mult
=
0
)],
}
bn_lr_mult
=
lr_mult
if
use_global_stats
:
# only specify if use_global_stats is explicitly provided;
# otherwise, use_global_stats_ = this->phase_ == TEST;
bn_kwargs
=
{
'param'
:
[
dict
(
lr_mult
=
0
,
decay_mult
=
0
),
dict
(
lr_mult
=
0
,
decay_mult
=
0
),
dict
(
lr_mult
=
0
,
decay_mult
=
0
)],
'eps'
:
eps
,
'use_global_stats'
:
use_global_stats
,
}
# not updating scale/bias parameters
bn_lr_mult
=
0
# parameters for scale bias layer after batchnorm.
if
use_scale
:
sb_kwargs
=
{
'bias_term'
:
True
}
else
:
kwargs
=
{
'param'
:
[
dict
(
lr_mult
=
lr_mult
,
decay_mult
=
1
),
dict
(
lr_mult
=
2
*
lr_mult
,
decay_mult
=
0
)],
'weight_filler'
:
dict
(
type
=
'xavier'
),
'bias_filler'
:
dict
(
type
=
'constant'
,
value
=
0
)
}
conv_name
=
'{}{}{}'
.
format
(
conv_prefix
,
out_layer
,
conv_postfix
)
[
kernel_h
,
kernel_w
]
=
UnpackVariable
(
kernel_size
,
2
)
[
pad_h
,
pad_w
]
=
UnpackVariable
(
pad
,
2
)
[
stride_h
,
stride_w
]
=
UnpackVariable
(
stride
,
2
)
if
kernel_h
==
kernel_w
:
net
[
conv_name
]
=
L
.
Convolution
(
net
[
from_layer
],
num_output
=
num_output
,
kernel_size
=
kernel_h
,
pad
=
pad_h
,
stride
=
stride_h
,
**
kwargs
)
else
:
net
[
conv_name
]
=
L
.
Convolution
(
net
[
from_layer
],
num_output
=
num_output
,
kernel_h
=
kernel_h
,
kernel_w
=
kernel_w
,
pad_h
=
pad_h
,
pad_w
=
pad_w
,
stride_h
=
stride_h
,
stride_w
=
stride_w
,
**
kwargs
)
if
dilation
>
1
:
net
.
update
(
conv_name
,
{
'dilation'
:
dilation
})
if
use_bn
:
bn_name
=
'{}{}{}'
.
format
(
bn_prefix
,
out_layer
,
bn_postfix
)
net
[
bn_name
]
=
L
.
BatchNorm
(
net
[
conv_name
],
in_place
=
True
,
**
bn_kwargs
)
if
use_scale
:
sb_name
=
'{}{}{}'
.
format
(
scale_prefix
,
out_layer
,
scale_postfix
)
net
[
sb_name
]
=
L
.
Scale
(
net
[
bn_name
],
in_place
=
True
,
**
sb_kwargs
)
else
:
bias_name
=
'{}{}{}'
.
format
(
bias_prefix
,
out_layer
,
bias_postfix
)
net
[
bias_name
]
=
L
.
Bias
(
net
[
bn_name
],
in_place
=
True
,
**
bias_kwargs
)
if
use_relu
:
relu_name
=
'{}_relu'
.
format
(
conv_name
)
net
[
relu_name
]
=
L
.
ReLU
(
net
[
conv_name
],
in_place
=
True
)
def
ResBody
(
net
,
from_layer
,
block_name
,
out2a
,
out2b
,
out2c
,
stride
,
use_branch1
,
dilation
=
1
,
**
bn_param
):
# ResBody(net, 'pool1', '2a', 64, 64, 256, 1, True)
conv_prefix
=
'res{}_'
.
format
(
block_name
)
conv_postfix
=
''
bn_prefix
=
'bn{}_'
.
format
(
block_name
)
bn_postfix
=
''
scale_prefix
=
'scale{}_'
.
format
(
block_name
)
scale_postfix
=
''
use_scale
=
True
if
use_branch1
:
branch_name
=
'branch1'
ConvBNLayer
(
net
,
from_layer
,
branch_name
,
use_bn
=
True
,
use_relu
=
False
,
num_output
=
out2c
,
kernel_size
=
1
,
pad
=
0
,
stride
=
stride
,
use_scale
=
use_scale
,
conv_prefix
=
conv_prefix
,
conv_postfix
=
conv_postfix
,
bn_prefix
=
bn_prefix
,
bn_postfix
=
bn_postfix
,
scale_prefix
=
scale_prefix
,
scale_postfix
=
scale_postfix
,
**
bn_param
)
branch1
=
'{}{}'
.
format
(
conv_prefix
,
branch_name
)
else
:
branch1
=
from_layer
branch_name
=
'branch2a'
ConvBNLayer
(
net
,
from_layer
,
branch_name
,
use_bn
=
True
,
use_relu
=
True
,
num_output
=
out2a
,
kernel_size
=
1
,
pad
=
0
,
stride
=
stride
,
use_scale
=
use_scale
,
conv_prefix
=
conv_prefix
,
conv_postfix
=
conv_postfix
,
bn_prefix
=
bn_prefix
,
bn_postfix
=
bn_postfix
,
scale_prefix
=
scale_prefix
,
scale_postfix
=
scale_postfix
,
**
bn_param
)
out_name
=
'{}{}'
.
format
(
conv_prefix
,
branch_name
)
branch_name
=
'branch2b'
if
dilation
==
1
:
ConvBNLayer
(
net
,
out_name
,
branch_name
,
use_bn
=
True
,
use_relu
=
True
,
num_output
=
out2b
,
kernel_size
=
3
,
pad
=
1
,
stride
=
1
,
use_scale
=
use_scale
,
conv_prefix
=
conv_prefix
,
conv_postfix
=
conv_postfix
,
bn_prefix
=
bn_prefix
,
bn_postfix
=
bn_postfix
,
scale_prefix
=
scale_prefix
,
scale_postfix
=
scale_postfix
,
**
bn_param
)
else
:
pad
=
int
((
3
+
(
dilation
-
1
)
*
2
)
-
1
)
/
2
ConvBNLayer
(
net
,
out_name
,
branch_name
,
use_bn
=
True
,
use_relu
=
True
,
num_output
=
out2b
,
kernel_size
=
3
,
pad
=
pad
,
stride
=
1
,
use_scale
=
use_scale
,
dilation
=
dilation
,
conv_prefix
=
conv_prefix
,
conv_postfix
=
conv_postfix
,
bn_prefix
=
bn_prefix
,
bn_postfix
=
bn_postfix
,
scale_prefix
=
scale_prefix
,
scale_postfix
=
scale_postfix
,
**
bn_param
)
out_name
=
'{}{}'
.
format
(
conv_prefix
,
branch_name
)
branch_name
=
'branch2c'
ConvBNLayer
(
net
,
out_name
,
branch_name
,
use_bn
=
True
,
use_relu
=
False
,
num_output
=
out2c
,
kernel_size
=
1
,
pad
=
0
,
stride
=
1
,
use_scale
=
use_scale
,
conv_prefix
=
conv_prefix
,
conv_postfix
=
conv_postfix
,
bn_prefix
=
bn_prefix
,
bn_postfix
=
bn_postfix
,
scale_prefix
=
scale_prefix
,
scale_postfix
=
scale_postfix
,
**
bn_param
)
branch2
=
'{}{}'
.
format
(
conv_prefix
,
branch_name
)
res_name
=
'res{}'
.
format
(
block_name
)
net
[
res_name
]
=
L
.
Eltwise
(
net
[
branch1
],
net
[
branch2
])
relu_name
=
'{}_relu'
.
format
(
res_name
)
net
[
relu_name
]
=
L
.
ReLU
(
net
[
res_name
],
in_place
=
True
)
def
InceptionTower
(
net
,
from_layer
,
tower_name
,
layer_params
,
**
bn_param
):
use_scale
=
False
for
param
in
layer_params
:
tower_layer
=
'{}/{}'
.
format
(
tower_name
,
param
[
'name'
])
del
param
[
'name'
]
if
'pool'
in
tower_layer
:
net
[
tower_layer
]
=
L
.
Pooling
(
net
[
from_layer
],
**
param
)
else
:
param
.
update
(
bn_param
)
ConvBNLayer
(
net
,
from_layer
,
tower_layer
,
use_bn
=
True
,
use_relu
=
True
,
use_scale
=
use_scale
,
**
param
)
from_layer
=
tower_layer
return
net
[
from_layer
]
def
CreateAnnotatedDataLayer
(
source
,
batch_size
=
32
,
backend
=
P
.
Data
.
LMDB
,
output_label
=
True
,
train
=
True
,
label_map_file
=
''
,
anno_type
=
None
,
transform_param
=
{},
batch_sampler
=
[{}]):
if
train
:
kwargs
=
{
'include'
:
dict
(
phase
=
caffe_pb2
.
Phase
.
Value
(
'TRAIN'
)),
'transform_param'
:
transform_param
,
}
else
:
kwargs
=
{
'include'
:
dict
(
phase
=
caffe_pb2
.
Phase
.
Value
(
'TEST'
)),
'transform_param'
:
transform_param
,
}
ntop
=
1
if
output_label
:
ntop
=
2
annotated_data_param
=
{
'label_map_file'
:
label_map_file
,
'batch_sampler'
:
batch_sampler
,
}
if
anno_type
is
not
None
:
annotated_data_param
.
update
({
'anno_type'
:
anno_type
})
return
L
.
AnnotatedData
(
name
=
"data"
,
annotated_data_param
=
annotated_data_param
,
data_param
=
dict
(
batch_size
=
batch_size
,
backend
=
backend
,
source
=
source
),
ntop
=
ntop
,
**
kwargs
)
def
VGGNetBody
(
net
,
from_layer
,
need_fc
=
True
,
fully_conv
=
False
,
reduced
=
False
,
dilated
=
False
,
nopool
=
False
,
dropout
=
True
,
freeze_layers
=
[],
dilate_pool4
=
False
):
kwargs
=
{
'param'
:
[
dict
(
lr_mult
=
1
,
decay_mult
=
1
),
dict
(
lr_mult
=
2
,
decay_mult
=
0
)],
'weight_filler'
:
dict
(
type
=
'xavier'
),
'bias_filler'
:
dict
(
type
=
'constant'
,
value
=
0
)}
assert
from_layer
in
net
.
keys
()
net
.
conv1_1
=
L
.
Convolution
(
net
[
from_layer
],
num_output
=
64
,
pad
=
1
,
kernel_size
=
3
,
**
kwargs
)
net
.
relu1_1
=
L
.
ReLU
(
net
.
conv1_1
,
in_place
=
True
)
net
.
conv1_2
=
L
.
Convolution
(
net
.
relu1_1
,
num_output
=
64
,
pad
=
1
,
kernel_size
=
3
,
**
kwargs
)
net
.
relu1_2
=
L
.
ReLU
(
net
.
conv1_2
,
in_place
=
True
)
if
nopool
:
name
=
'conv1_3'
net
[
name
]
=
L
.
Convolution
(
net
.
relu1_2
,
num_output
=
64
,
pad
=
1
,
kernel_size
=
3
,
stride
=
2
,
**
kwargs
)
else
:
name
=
'pool1'
net
.
pool1
=
L
.
Pooling
(
net
.
relu1_2
,
pool
=
P
.
Pooling
.
MAX
,
kernel_size
=
2
,
stride
=
2
)
net
.
conv2_1
=
L
.
Convolution
(
net
[
name
],
num_output
=
128
,
pad
=
1
,
kernel_size
=
3
,
**
kwargs
)
net
.
relu2_1
=
L
.
ReLU
(
net
.
conv2_1
,
in_place
=
True
)
net
.
conv2_2
=
L
.
Convolution
(
net
.
relu2_1
,
num_output
=
128
,
pad
=
1
,
kernel_size
=
3
,
**
kwargs
)
net
.
relu2_2
=
L
.
ReLU
(
net
.
conv2_2
,
in_place
=
True
)
if
nopool
:
name
=
'conv2_3'
net
[
name
]
=
L
.
Convolution
(
net
.
relu2_2
,
num_output
=
128
,
pad
=
1
,
kernel_size
=
3
,
stride
=
2
,
**
kwargs
)
else
:
name
=
'pool2'
net
[
name
]
=
L
.
Pooling
(
net
.
relu2_2
,
pool
=
P
.
Pooling
.
MAX
,
kernel_size
=
2
,
stride
=
2
)
net
.
conv3_1
=
L
.
Convolution
(
net
[
name
],
num_output
=
256
,
pad
=
1
,
kernel_size
=
3
,
**
kwargs
)
net
.
relu3_1
=
L
.
ReLU
(
net
.
conv3_1
,
in_place
=
True
)
net
.
conv3_2
=
L
.
Convolution
(
net
.
relu3_1
,
num_output
=
256
,
pad
=
1
,
kernel_size
=
3
,
**
kwargs
)
net
.
relu3_2
=
L
.
ReLU
(
net
.
conv3_2
,
in_place
=
True
)
net
.
conv3_3
=
L
.
Convolution
(
net
.
relu3_2
,
num_output
=
256
,
pad
=
1
,
kernel_size
=
3
,
**
kwargs
)
net
.
relu3_3
=
L
.
ReLU
(
net
.
conv3_3
,
in_place
=
True
)
if
nopool
:
name
=
'conv3_4'
net
[
name
]
=
L
.
Convolution
(
net
.
relu3_3
,
num_output
=
256
,
pad
=
1
,
kernel_size
=
3
,
stride
=
2
,
**
kwargs
)
else
:
name
=
'pool3'
net
[
name
]
=
L
.
Pooling
(
net
.
relu3_3
,
pool
=
P
.
Pooling
.
MAX
,
kernel_size
=
2
,
stride
=
2
)
net
.
conv4_1
=
L
.
Convolution
(
net
[
name
],
num_output
=
512
,
pad
=
1
,
kernel_size
=
3
,
**
kwargs
)
net
.
relu4_1
=
L
.
ReLU
(
net
.
conv4_1
,
in_place
=
True
)
net
.
conv4_2
=
L
.
Convolution
(
net
.
relu4_1
,
num_output
=
512
,
pad
=
1
,
kernel_size
=
3
,
**
kwargs
)
net
.
relu4_2
=
L
.
ReLU
(
net
.
conv4_2
,
in_place
=
True
)
net
.
conv4_3
=
L
.
Convolution
(
net
.
relu4_2
,
num_output
=
512
,
pad
=
1
,
kernel_size
=
3
,
**
kwargs
)
net
.
relu4_3
=
L
.
ReLU
(
net
.
conv4_3
,
in_place
=
True
)
if
nopool
:
name
=
'conv4_4'
net
[
name
]
=
L
.
Convolution
(
net
.
relu4_3
,
num_output
=
512
,
pad
=
1
,
kernel_size
=
3
,
stride
=
2
,
**
kwargs
)
else
:
name
=
'pool4'
if
dilate_pool4
:
net
[
name
]
=
L
.
Pooling
(
net
.
relu4_3
,
pool
=
P
.
Pooling
.
MAX
,
kernel_size
=
3
,
stride
=
1
,
pad
=
1
)
dilation
=
2
else
:
net
[
name
]
=
L
.
Pooling
(
net
.
relu4_3
,
pool
=
P
.
Pooling
.
MAX
,
kernel_size
=
2
,
stride
=
2
)
dilation
=
1
kernel_size
=
3
pad
=
int
(
int
((
kernel_size
+
(
dilation
-
1
)
*
(
kernel_size
-
1
))
-
1
)
/
2
)
net
.
conv5_1
=
L
.
Convolution
(
net
[
name
],
num_output
=
512
,
pad
=
pad
,
kernel_size
=
kernel_size
,
dilation
=
dilation
,
**
kwargs
)
net
.
relu5_1
=
L
.
ReLU
(
net
.
conv5_1
,
in_place
=
True
)
net
.
conv5_2
=
L
.
Convolution
(
net
.
relu5_1
,
num_output
=
512
,
pad
=
pad
,
kernel_size
=
kernel_size
,
dilation
=
dilation
,
**
kwargs
)
net
.
relu5_2
=
L
.
ReLU
(
net
.
conv5_2
,
in_place
=
True
)
net
.
conv5_3
=
L
.
Convolution
(
net
.
relu5_2
,
num_output
=
512
,
pad
=
pad
,
kernel_size
=
kernel_size
,
dilation
=
dilation
,
**
kwargs
)
net
.
relu5_3
=
L
.
ReLU
(
net
.
conv5_3
,
in_place
=
True
)
if
need_fc
:
if
dilated
:
if
nopool
:
name
=
'conv5_4'
net
[
name
]
=
L
.
Convolution
(
net
.
relu5_3
,
num_output
=
512
,
pad
=
1
,
kernel_size
=
3
,
stride
=
1
,
**
kwargs
)
else
:
name
=
'pool5'
net
[
name
]
=
L
.
Pooling
(
net
.
relu5_3
,
pool
=
P
.
Pooling
.
MAX
,
pad
=
1
,
kernel_size
=
3
,
stride
=
1
)
else
:
if
nopool
:
name
=
'conv5_4'
net
[
name
]
=
L
.
Convolution
(
net
.
relu5_3
,
num_output
=
512
,
pad
=
1
,
kernel_size
=
3
,
stride
=
2
,
**
kwargs
)
else
:
name
=
'pool5'
net
[
name
]
=
L
.
Pooling
(
net
.
relu5_3
,
pool
=
P
.
Pooling
.
MAX
,
kernel_size
=
2
,
stride
=
2
)
if
fully_conv
:
if
dilated
:
if
reduced
:
dilation
=
dilation
*
6
kernel_size
=
3
num_output
=
1024
else
:
dilation
=
dilation
*
2
kernel_size
=
7
num_output
=
4096
else
:
if
reduced
:
dilation
=
dilation
*
3
kernel_size
=
3
num_output
=
1024
else
:
kernel_size
=
7
num_output
=
4096
pad
=
int
(
int
((
kernel_size
+
(
dilation
-
1
)
*
(
kernel_size
-
1
))
-
1
)
/
2
)
net
.
fc6
=
L
.
Convolution
(
net
[
name
],
num_output
=
num_output
,
pad
=
pad
,
kernel_size
=
kernel_size
,
dilation
=
dilation
,
**
kwargs
)
net
.
relu6
=
L
.
ReLU
(
net
.
fc6
,
in_place
=
True
)
if
dropout
:
net
.
drop6
=
L
.
Dropout
(
net
.
relu6
,
dropout_ratio
=
0.5
,
in_place
=
True
)
if
reduced
:
net
.
fc7
=
L
.
Convolution
(
net
.
relu6
,
num_output
=
1024
,
kernel_size
=
1
,
**
kwargs
)
else
:
net
.
fc7
=
L
.
Convolution
(
net
.
relu6
,
num_output
=
4096
,
kernel_size
=
1
,
**
kwargs
)
net
.
relu7
=
L
.
ReLU
(
net
.
fc7
,
in_place
=
True
)
if
dropout
:
net
.
drop7
=
L
.
Dropout
(
net
.
relu7
,
dropout_ratio
=
0.5
,
in_place
=
True
)
else
:
net
.
fc6
=
L
.
InnerProduct
(
net
.
pool5
,
num_output
=
4096
)
net
.
relu6
=
L
.
ReLU
(
net
.
fc6
,
in_place
=
True
)
if
dropout
:
net
.
drop6
=
L
.
Dropout
(
net
.
relu6
,
dropout_ratio
=
0.5
,
in_place
=
True
)
net
.
fc7
=
L
.
InnerProduct
(
net
.
relu6
,
num_output
=
4096
)
net
.
relu7
=
L
.
ReLU
(
net
.
fc7
,
in_place
=
True
)
if
dropout
:
net
.
drop7
=
L
.
Dropout
(
net
.
relu7
,
dropout_ratio
=
0.5
,
in_place
=
True
)
# Update freeze layers.
kwargs
[
'param'
]
=
[
dict
(
lr_mult
=
0
,
decay_mult
=
0
),
dict
(
lr_mult
=
0
,
decay_mult
=
0
)]
layers
=
net
.
keys
()
for
freeze_layer
in
freeze_layers
:
if
freeze_layer
in
layers
:
net
.
update
(
freeze_layer
,
kwargs
)
return
net
def
ResNet101Body
(
net
,
from_layer
,
use_pool5
=
True
,
use_dilation_conv5
=
False
,
**
bn_param
):
conv_prefix
=
''
conv_postfix
=
''
bn_prefix
=
'bn_'
bn_postfix
=
''
scale_prefix
=
'scale_'
scale_postfix
=
''
ConvBNLayer
(
net
,
from_layer
,
'conv1'
,
use_bn
=
True
,
use_relu
=
True
,
num_output
=
64
,
kernel_size
=
7
,
pad
=
3
,
stride
=
2
,
conv_prefix
=
conv_prefix
,
conv_postfix
=
conv_postfix
,
bn_prefix
=
bn_prefix
,
bn_postfix
=
bn_postfix
,
scale_prefix
=
scale_prefix
,
scale_postfix
=
scale_postfix
,
**
bn_param
)
net
.
pool1
=
L
.
Pooling
(
net
.
conv1
,
pool
=
P
.
Pooling
.
MAX
,
kernel_size
=
3
,
stride
=
2
)
ResBody
(
net
,
'pool1'
,
'2a'
,
out2a
=
64
,
out2b
=
64
,
out2c
=
256
,
stride
=
1
,
use_branch1
=
True
,
**
bn_param
)
ResBody
(
net
,
'res2a'
,
'2b'
,
out2a
=
64
,
out2b
=
64
,
out2c
=
256
,
stride
=
1
,
use_branch1
=
False
,
**
bn_param
)
ResBody
(
net
,
'res2b'
,
'2c'
,
out2a
=
64
,
out2b
=
64
,
out2c
=
256
,
stride
=
1
,
use_branch1
=
False
,
**
bn_param
)
ResBody
(
net
,
'res2c'
,
'3a'
,
out2a
=
128
,
out2b
=
128
,
out2c
=
512
,
stride
=
2
,
use_branch1
=
True
,
**
bn_param
)
from_layer
=
'res3a'
for
i
in
range
(
1
,
4
):
block_name
=
'3b{}'
.
format
(
i
)
ResBody
(
net
,
from_layer
,
block_name
,
out2a
=
128
,
out2b
=
128
,
out2c
=
512
,
stride
=
1
,
use_branch1
=
False
,
**
bn_param
)
from_layer
=
'res{}'
.
format
(
block_name
)
ResBody
(
net
,
from_layer
,
'4a'
,
out2a
=
256
,
out2b
=
256
,
out2c
=
1024
,
stride
=
2
,
use_branch1
=
True
,
**
bn_param
)
from_layer
=
'res4a'
for
i
in
range
(
1
,
23
):
block_name
=
'4b{}'
.
format
(
i
)
ResBody
(
net
,
from_layer
,
block_name
,
out2a
=
256
,
out2b
=
256
,
out2c
=
1024
,
stride
=
1
,
use_branch1
=
False
,
**
bn_param
)
from_layer
=
'res{}'
.
format
(
block_name
)
stride
=
2
dilation
=
1
if
use_dilation_conv5
:
stride
=
1
dilation
=
2
ResBody
(
net
,
from_layer
,
'5a'
,
out2a
=
512
,
out2b
=
512
,
out2c
=
2048
,
stride
=
stride
,
use_branch1
=
True
,
dilation
=
dilation
,
**
bn_param
)
ResBody
(
net
,
'res5a'
,
'5b'
,
out2a
=
512
,
out2b
=
512
,
out2c
=
2048
,
stride
=
1
,
use_branch1
=
False
,
dilation
=
dilation
,
**
bn_param
)
ResBody
(
net
,
'res5b'
,
'5c'
,
out2a
=
512
,
out2b
=
512
,
out2c
=
2048
,
stride
=
1
,
use_branch1
=
False
,
dilation
=
dilation
,
**
bn_param
)
if
use_pool5
:
net
.
pool5
=
L
.
Pooling
(
net
.
res5c
,
pool
=
P
.
Pooling
.
AVE
,
global_pooling
=
True
)
return
net
def
ResNet152Body
(
net
,
from_layer
,
use_pool5
=
True
,
use_dilation_conv5
=
False
,
**
bn_param
):
conv_prefix
=
''
conv_postfix
=
''
bn_prefix
=
'bn_'
bn_postfix
=
''
scale_prefix
=
'scale_'
scale_postfix
=
''
ConvBNLayer
(
net
,
from_layer
,
'conv1'
,
use_bn
=
True
,
use_relu
=
True
,
num_output
=
64
,
kernel_size
=
7
,
pad
=
3
,
stride
=
2
,
conv_prefix
=
conv_prefix
,
conv_postfix
=
conv_postfix
,
bn_prefix
=
bn_prefix
,
bn_postfix
=
bn_postfix
,
scale_prefix
=
scale_prefix
,
scale_postfix
=
scale_postfix
,
**
bn_param
)
net
.
pool1
=
L
.
Pooling
(
net
.
conv1
,
pool
=
P
.
Pooling
.
MAX
,
kernel_size
=
3
,
stride
=
2
)
ResBody
(
net
,
'pool1'
,
'2a'
,
out2a
=
64
,
out2b
=
64
,
out2c
=
256
,
stride
=
1
,
use_branch1
=
True
,
**
bn_param
)
ResBody
(
net
,
'res2a'
,
'2b'
,
out2a
=
64
,
out2b
=
64
,
out2c
=
256
,
stride
=
1
,
use_branch1
=
False
,
**
bn_param
)
ResBody
(
net
,
'res2b'
,
'2c'
,
out2a
=
64
,
out2b
=
64
,
out2c
=
256
,
stride
=
1
,
use_branch1
=
False
,
**
bn_param
)
ResBody
(
net
,
'res2c'
,
'3a'
,
out2a
=
128
,
out2b
=
128
,
out2c
=
512
,
stride
=
2
,
use_branch1
=
True
,
**
bn_param
)
from_layer
=
'res3a'
for
i
in
range
(
1
,
8
):
block_name
=
'3b{}'
.
format
(
i
)
ResBody
(
net
,
from_layer
,
block_name
,
out2a
=
128
,
out2b
=
128
,
out2c
=
512
,
stride
=
1
,
use_branch1
=
False
,
**
bn_param
)
from_layer
=
'res{}'
.
format
(
block_name
)
ResBody
(
net
,
from_layer
,
'4a'
,
out2a
=
256
,
out2b
=
256
,
out2c
=
1024
,
stride
=
2
,
use_branch1
=
True
,
**
bn_param
)
from_layer
=
'res4a'
for
i
in
range
(
1
,
36
):
block_name
=
'4b{}'
.
format
(
i
)
ResBody
(
net
,
from_layer
,
block_name
,
out2a
=
256
,
out2b
=
256
,
out2c
=
1024
,
stride
=
1
,
use_branch1
=
False
,
**
bn_param
)
from_layer
=
'res{}'
.
format
(
block_name
)
stride
=
2
dilation
=
1
if
use_dilation_conv5
:
stride
=
1
dilation
=
2
ResBody
(
net
,
from_layer
,
'5a'
,
out2a
=
512
,
out2b
=
512
,
out2c
=
2048
,
stride
=
stride
,
use_branch1
=
True
,
dilation
=
dilation
,
**
bn_param
)
ResBody
(
net
,
'res5a'
,
'5b'
,
out2a
=
512
,
out2b
=
512
,
out2c
=
2048
,
stride
=
1
,
use_branch1
=
False
,
dilation
=
dilation
,
**
bn_param
)
ResBody
(
net
,
'res5b'
,
'5c'
,
out2a
=
512
,
out2b
=
512
,
out2c
=
2048
,
stride
=
1
,
use_branch1
=
False
,
dilation
=
dilation
,
**
bn_param
)
if
use_pool5
:
net
.
pool5
=
L
.
Pooling
(
net
.
res5c
,
pool
=
P
.
Pooling
.
AVE
,
global_pooling
=
True
)
return
net
def
InceptionV3Body
(
net
,
from_layer
,
output_pred
=
False
,
**
bn_param
):
# scale is fixed to 1, thus we ignore it.
use_scale
=
False
out_layer
=
'conv'
ConvBNLayer
(
net
,
from_layer
,
out_layer
,
use_bn
=
True
,
use_relu
=
True
,
num_output
=
32
,
kernel_size
=
3
,
pad
=
0
,
stride
=
2
,
use_scale
=
use_scale
,
**
bn_param
)
from_layer
=
out_layer
out_layer
=
'conv_1'
ConvBNLayer
(
net
,
from_layer
,
out_layer
,
use_bn
=
True
,
use_relu
=
True
,
num_output
=
32
,
kernel_size
=
3
,
pad
=
0
,
stride
=
1
,
use_scale
=
use_scale
,
**
bn_param
)
from_layer
=
out_layer
out_layer
=
'conv_2'
ConvBNLayer
(
net
,
from_layer
,
out_layer
,
use_bn
=
True
,
use_relu
=
True
,
num_output
=
64
,
kernel_size
=
3
,
pad
=
1
,
stride
=
1
,
use_scale
=
use_scale
,
**
bn_param
)
from_layer
=
out_layer
out_layer
=
'pool'
net
[
out_layer
]
=
L
.
Pooling
(
net
[
from_layer
],
pool
=
P
.
Pooling
.
MAX
,
kernel_size
=
3
,
stride
=
2
,
pad
=
0
)
from_layer
=
out_layer
out_layer
=
'conv_3'
ConvBNLayer
(
net
,
from_layer
,
out_layer
,
use_bn
=
True
,
use_relu
=
True
,
num_output
=
80
,
kernel_size
=
1
,
pad
=
0
,
stride
=
1
,
use_scale
=
use_scale
,
**
bn_param
)
from_layer
=
out_layer
out_layer
=
'conv_4'
ConvBNLayer
(
net
,
from_layer
,
out_layer
,
use_bn
=
True
,
use_relu
=
True
,
num_output
=
192
,
kernel_size
=
3
,
pad
=
0
,
stride
=
1
,
use_scale
=
use_scale
,
**
bn_param
)
from_layer
=
out_layer
out_layer
=
'pool_1'
net
[
out_layer
]
=
L
.
Pooling
(
net
[
from_layer
],
pool
=
P
.
Pooling
.
MAX
,
kernel_size
=
3
,
stride
=
2
,
pad
=
0
)
from_layer
=
out_layer
# inceptions with 1x1, 3x3, 5x5 convolutions
for
inception_id
in
range
(
0
,
3
):
if
inception_id
==
0
:
out_layer
=
'mixed'
tower_2_conv_num_output
=
32
else
:
out_layer
=
'mixed_{}'
.
format
(
inception_id
)
tower_2_conv_num_output
=
64
towers
=
[]
tower_name
=
'{}'
.
format
(
out_layer
)
tower
=
InceptionTower
(
net
,
from_layer
,
tower_name
,
[
dict
(
name
=
'conv'
,
num_output
=
64
,
kernel_size
=
1
,
pad
=
0
,
stride
=
1
),
],
**
bn_param
)
towers
.
append
(
tower
)
tower_name
=
'{}/tower'
.
format
(
out_layer
)
tower
=
InceptionTower
(
net
,
from_layer
,
tower_name
,
[
dict
(
name
=
'conv'
,
num_output
=
48
,
kernel_size
=
1
,
pad
=
0
,
stride
=
1
),
dict
(
name
=
'conv_1'
,
num_output
=
64
,
kernel_size
=
5
,
pad
=
2
,
stride
=
1
),
],
**
bn_param
)
towers
.
append
(
tower
)
tower_name
=
'{}/tower_1'
.
format
(
out_layer
)
tower
=
InceptionTower
(
net
,
from_layer
,
tower_name
,
[
dict
(
name
=
'conv'
,
num_output
=
64
,
kernel_size
=
1
,
pad
=
0
,
stride
=
1
),
dict
(
name
=
'conv_1'
,
num_output
=
96
,
kernel_size
=
3
,
pad
=
1
,
stride
=
1
),
dict
(
name
=
'conv_2'
,
num_output
=
96
,
kernel_size
=
3
,
pad
=
1
,
stride
=
1
),
],
**
bn_param
)
towers
.
append
(
tower
)
tower_name
=
'{}/tower_2'
.
format
(
out_layer
)
tower
=
InceptionTower
(
net
,
from_layer
,
tower_name
,
[
dict
(
name
=
'pool'
,
pool
=
P
.
Pooling
.
AVE
,
kernel_size
=
3
,
pad
=
1
,
stride
=
1
),
dict
(
name
=
'conv'
,
num_output
=
tower_2_conv_num_output
,
kernel_size
=
1
,
pad
=
0
,
stride
=
1
),
],
**
bn_param
)
towers
.
append
(
tower
)
out_layer
=
'{}/join'
.
format
(
out_layer
)
net
[
out_layer
]
=
L
.
Concat
(
*
towers
,
axis
=
1
)
from_layer
=
out_layer
# inceptions with 1x1, 3x3(in sequence) convolutions
out_layer
=
'mixed_3'
towers
=
[]
tower_name
=
'{}'
.
format
(
out_layer
)
tower
=
InceptionTower
(
net
,
from_layer
,
tower_name
,
[
dict
(
name
=
'conv'
,
num_output
=
384
,
kernel_size
=
3
,
pad
=
0
,
stride
=
2
),
],
**
bn_param
)
towers
.
append
(
tower
)
tower_name
=
'{}/tower'
.
format
(
out_layer
)
tower
=
InceptionTower
(
net
,
from_layer
,
tower_name
,
[
dict
(
name
=
'conv'
,
num_output
=
64
,
kernel_size
=
1
,
pad
=
0
,
stride
=
1
),
dict
(
name
=
'conv_1'
,
num_output
=
96
,
kernel_size
=
3
,
pad
=
1
,
stride
=
1
),
dict
(
name
=
'conv_2'
,
num_output
=
96
,
kernel_size
=
3
,
pad
=
0
,
stride
=
2
),
],
**
bn_param
)
towers
.
append
(
tower
)
tower_name
=
'{}'
.
format
(
out_layer
)
tower
=
InceptionTower
(
net
,
from_layer
,
tower_name
,
[
dict
(
name
=
'pool'
,
pool
=
P
.
Pooling
.
MAX
,
kernel_size
=
3
,
pad
=
0
,
stride
=
2
),
],
**
bn_param
)
towers
.
append
(
tower
)
out_layer
=
'{}/join'
.
format
(
out_layer
)
net
[
out_layer
]
=
L
.
Concat
(
*
towers
,
axis
=
1
)
from_layer
=
out_layer
# inceptions with 1x1, 7x1, 1x7 convolutions
for
inception_id
in
range
(
4
,
8
):
if
inception_id
==
4
:
num_output
=
128
elif
inception_id
==
5
or
inception_id
==
6
:
num_output
=
160
elif
inception_id
==
7
:
num_output
=
192
out_layer
=
'mixed_{}'
.
format
(
inception_id
)
towers
=
[]
tower_name
=
'{}'
.
format
(
out_layer
)
tower
=
InceptionTower
(
net
,
from_layer
,
tower_name
,
[
dict
(
name
=
'conv'
,
num_output
=
192
,
kernel_size
=
1
,
pad
=
0
,
stride
=
1
),
],
**
bn_param
)
towers
.
append
(
tower
)
tower_name
=
'{}/tower'
.
format
(
out_layer
)
tower
=
InceptionTower
(
net
,
from_layer
,
tower_name
,
[
dict
(
name
=
'conv'
,
num_output
=
num_output
,
kernel_size
=
1
,
pad
=
0
,
stride
=
1
),
dict
(
name
=
'conv_1'
,
num_output
=
num_output
,
kernel_size
=
[
1
,
7
],
pad
=
[
0
,
3
],
stride
=
[
1
,
1
]),
dict
(
name
=
'conv_2'
,
num_output
=
192
,
kernel_size
=
[
7
,
1
],
pad
=
[
3
,
0
],
stride
=
[
1
,
1
]),
],
**
bn_param
)
towers
.
append
(
tower
)
tower_name
=
'{}/tower_1'
.
format
(
out_layer
)
tower
=
InceptionTower
(
net
,
from_layer
,
tower_name
,
[
dict
(
name
=
'conv'
,
num_output
=
num_output
,
kernel_size
=
1
,
pad
=
0
,
stride
=
1
),
dict
(
name
=
'conv_1'
,
num_output
=
num_output
,
kernel_size
=
[
7
,
1
],
pad
=
[
3
,
0
],
stride
=
[
1
,
1
]),
dict
(
name
=
'conv_2'
,
num_output
=
num_output
,
kernel_size
=
[
1
,
7
],
pad
=
[
0
,
3
],
stride
=
[
1
,
1
]),
dict
(
name
=
'conv_3'
,
num_output
=
num_output
,
kernel_size
=
[
7
,
1
],
pad
=
[
3
,
0
],
stride
=
[
1
,
1
]),
dict
(
name
=
'conv_4'
,
num_output
=
192
,
kernel_size
=
[
1
,
7
],
pad
=
[
0
,
3
],
stride
=
[
1
,
1
]),
],
**
bn_param
)
towers
.
append
(
tower
)
tower_name
=
'{}/tower_2'
.
format
(
out_layer
)
tower
=
InceptionTower
(
net
,
from_layer
,
tower_name
,
[
dict
(
name
=
'pool'
,
pool
=
P
.
Pooling
.
AVE
,
kernel_size
=
3
,
pad
=
1
,
stride
=
1
),
dict
(
name
=
'conv'
,
num_output
=
192
,
kernel_size
=
1
,
pad
=
0
,
stride
=
1
),
],
**
bn_param
)
towers
.
append
(
tower
)
out_layer
=
'{}/join'
.
format
(
out_layer
)
net
[
out_layer
]
=
L
.
Concat
(
*
towers
,
axis
=
1
)
from_layer
=
out_layer
# inceptions with 1x1, 3x3, 1x7, 7x1 filters
out_layer
=
'mixed_8'
towers
=
[]
tower_name
=
'{}/tower'
.
format
(
out_layer
)
tower
=
InceptionTower
(
net
,
from_layer
,
tower_name
,
[
dict
(
name
=
'conv'
,
num_output
=
192
,
kernel_size
=
1
,
pad
=
0
,
stride
=
1
),
dict
(
name
=
'conv_1'
,
num_output
=
320
,
kernel_size
=
3
,
pad
=
0
,
stride
=
2
),
],
**
bn_param
)
towers
.
append
(
tower
)
tower_name
=
'{}/tower_1'
.
format
(
out_layer
)
tower
=
InceptionTower
(
net
,
from_layer
,
tower_name
,
[
dict
(
name
=
'conv'
,
num_output
=
192
,
kernel_size
=
1
,
pad
=
0
,
stride
=
1
),
dict
(
name
=
'conv_1'
,
num_output
=
192
,
kernel_size
=
[
1
,
7
],
pad
=
[
0
,
3
],
stride
=
[
1
,
1
]),
dict
(
name
=
'conv_2'
,
num_output
=
192
,
kernel_size
=
[
7
,
1
],
pad
=
[
3
,
0
],
stride
=
[
1
,
1
]),
dict
(
name
=
'conv_3'
,
num_output
=
192
,
kernel_size
=
3
,
pad
=
0
,
stride
=
2
),
],
**
bn_param
)
towers
.
append
(
tower
)
tower_name
=
'{}'
.
format
(
out_layer
)
tower
=
InceptionTower
(
net
,
from_layer
,
tower_name
,
[
dict
(
name
=
'pool'
,
pool
=
P
.
Pooling
.
MAX
,
kernel_size
=
3
,
pad
=
0
,
stride
=
2
),
],
**
bn_param
)
towers
.
append
(
tower
)
out_layer
=
'{}/join'
.
format
(
out_layer
)
net
[
out_layer
]
=
L
.
Concat
(
*
towers
,
axis
=
1
)
from_layer
=
out_layer
for
inception_id
in
range
(
9
,
11
):
num_output
=
384
num_output2
=
448
if
inception_id
==
9
:
pool
=
P
.
Pooling
.
AVE
else
:
pool
=
P
.
Pooling
.
MAX
out_layer
=
'mixed_{}'
.
format
(
inception_id
)
towers
=
[]
tower_name
=
'{}'
.
format
(
out_layer
)
tower
=
InceptionTower
(
net
,
from_layer
,
tower_name
,
[
dict
(
name
=
'conv'
,
num_output
=
320
,
kernel_size
=
1
,
pad
=
0
,
stride
=
1
),
],
**
bn_param
)
towers
.
append
(
tower
)
tower_name
=
'{}/tower'
.
format
(
out_layer
)
tower
=
InceptionTower
(
net
,
from_layer
,
tower_name
,
[
dict
(
name
=
'conv'
,
num_output
=
num_output
,
kernel_size
=
1
,
pad
=
0
,
stride
=
1
),
],
**
bn_param
)
subtowers
=
[]
subtower_name
=
'{}/mixed'
.
format
(
tower_name
)
subtower
=
InceptionTower
(
net
,
'{}/conv'
.
format
(
tower_name
),
subtower_name
,
[
dict
(
name
=
'conv'
,
num_output
=
num_output
,
kernel_size
=
[
1
,
3
],
pad
=
[
0
,
1
],
stride
=
[
1
,
1
]),
],
**
bn_param
)
subtowers
.
append
(
subtower
)
subtower
=
InceptionTower
(
net
,
'{}/conv'
.
format
(
tower_name
),
subtower_name
,
[
dict
(
name
=
'conv_1'
,
num_output
=
num_output
,
kernel_size
=
[
3
,
1
],
pad
=
[
1
,
0
],
stride
=
[
1
,
1
]),
],
**
bn_param
)
subtowers
.
append
(
subtower
)
net
[
subtower_name
]
=
L
.
Concat
(
*
subtowers
,
axis
=
1
)
towers
.
append
(
net
[
subtower_name
])
tower_name
=
'{}/tower_1'
.
format
(
out_layer
)
tower
=
InceptionTower
(
net
,
from_layer
,
tower_name
,
[
dict
(
name
=
'conv'
,
num_output
=
num_output2
,
kernel_size
=
1
,
pad
=
0
,
stride
=
1
),
dict
(
name
=
'conv_1'
,
num_output
=
num_output
,
kernel_size
=
3
,
pad
=
1
,
stride
=
1
),
],
**
bn_param
)
subtowers
=
[]
subtower_name
=
'{}/mixed'
.
format
(
tower_name
)
subtower
=
InceptionTower
(
net
,
'{}/conv_1'
.
format
(
tower_name
),
subtower_name
,
[
dict
(
name
=
'conv'
,
num_output
=
num_output
,
kernel_size
=
[
1
,
3
],
pad
=
[
0
,
1
],
stride
=
[
1
,
1
]),
],
**
bn_param
)
subtowers
.
append
(
subtower
)
subtower
=
InceptionTower
(
net
,
'{}/conv_1'
.
format
(
tower_name
),
subtower_name
,
[
dict
(
name
=
'conv_1'
,
num_output
=
num_output
,
kernel_size
=
[
3
,
1
],
pad
=
[
1
,
0
],
stride
=
[
1
,
1
]),
],
**
bn_param
)
subtowers
.
append
(
subtower
)
net
[
subtower_name
]
=
L
.
Concat
(
*
subtowers
,
axis
=
1
)
towers
.
append
(
net
[
subtower_name
])
tower_name
=
'{}/tower_2'
.
format
(
out_layer
)
tower
=
InceptionTower
(
net
,
from_layer
,
tower_name
,
[
dict
(
name
=
'pool'
,
pool
=
pool
,
kernel_size
=
3
,
pad
=
1
,
stride
=
1
),
dict
(
name
=
'conv'
,
num_output
=
192
,
kernel_size
=
1
,
pad
=
0
,
stride
=
1
),
],
**
bn_param
)
towers
.
append
(
tower
)
out_layer
=
'{}/join'
.
format
(
out_layer
)
net
[
out_layer
]
=
L
.
Concat
(
*
towers
,
axis
=
1
)
from_layer
=
out_layer
if
output_pred
:
net
.
pool_3
=
L
.
Pooling
(
net
[
from_layer
],
pool
=
P
.
Pooling
.
AVE
,
kernel_size
=
8
,
pad
=
0
,
stride
=
1
)
net
.
softmax
=
L
.
InnerProduct
(
net
.
pool_3
,
num_output
=
1008
)
net
.
softmax_prob
=
L
.
Softmax
(
net
.
softmax
)
return
net
def
CreateMultiBoxHead
(
net
,
data_layer
=
"data"
,
num_classes
=
[],
from_layers
=
[],
use_objectness
=
False
,
use_iou
=
False
,
normalizations
=
[],
use_batchnorm
=
True
,
lr_mult
=
1
,
use_scale
=
True
,
min_sizes
=
[],
max_sizes
=
[],
prior_variance
=
[
0.1
],
aspect_ratios
=
[],
steps
=
[],
img_height
=
0
,
img_width
=
0
,
share_location
=
True
,
flip
=
True
,
clip
=
True
,
offset
=
0.5
,
inter_layer_depth
=
[],
kernel_size
=
1
,
pad
=
0
,
conf_postfix
=
''
,
loc_postfix
=
''
,
**
bn_param
):
assert
num_classes
,
"must provide num_classes"
assert
num_classes
>
0
,
"num_classes must be positive number"
if
normalizations
:
assert
len
(
from_layers
)
==
len
(
normalizations
),
"from_layers and normalizations should have same length"
assert
len
(
from_layers
)
==
len
(
min_sizes
),
"from_layers and min_sizes should have same length"
if
max_sizes
:
assert
len
(
from_layers
)
==
len
(
max_sizes
),
"from_layers and max_sizes should have same length"
if
aspect_ratios
:
assert
len
(
from_layers
)
==
len
(
aspect_ratios
),
"from_layers and aspect_ratios should have same length"
if
steps
:
assert
len
(
from_layers
)
==
len
(
steps
),
"from_layers and steps should have same length"
net_layers
=
net
.
keys
()
assert
data_layer
in
net_layers
,
"data_layer is not in net's layers"
if
inter_layer_depth
:
assert
len
(
from_layers
)
==
len
(
inter_layer_depth
),
"from_layers and inter_layer_depth should have same length"
num
=
len
(
from_layers
)
priorbox_layers
=
[]
loc_layers
=
[]
conf_layers
=
[]
iou_layers
=
[]
objectness_layers
=
[]
for
i
in
range
(
0
,
num
):
from_layer
=
from_layers
[
i
]
# Get the normalize value.
if
normalizations
:
if
normalizations
[
i
]
!=
-
1
:
norm_name
=
"{}_norm"
.
format
(
from_layer
)
net
[
norm_name
]
=
L
.
Normalize
(
net
[
from_layer
],
scale_filler
=
dict
(
type
=
"constant"
,
value
=
normalizations
[
i
]),
across_spatial
=
False
,
channel_shared
=
False
)
from_layer
=
norm_name
# Add intermediate layers.
if
inter_layer_depth
:
if
inter_layer_depth
[
i
]
>
0
:
inter_name
=
"{}_inter"
.
format
(
from_layer
)
ConvBNLayer
(
net
,
from_layer
,
inter_name
,
use_bn
=
use_batchnorm
,
use_relu
=
True
,
lr_mult
=
lr_mult
,
num_output
=
inter_layer_depth
[
i
],
kernel_size
=
3
,
pad
=
1
,
stride
=
1
,
**
bn_param
)
from_layer
=
inter_name
# Estimate number of priors per location given provided parameters.
min_size
=
min_sizes
[
i
]
if
type
(
min_size
)
is
not
list
:
min_size
=
[
min_size
]
aspect_ratio
=
[]
if
len
(
aspect_ratios
)
>
i
:
aspect_ratio
=
aspect_ratios
[
i
]
if
type
(
aspect_ratio
)
is
not
list
:
aspect_ratio
=
[
aspect_ratio
]
max_size
=
[]
if
len
(
max_sizes
)
>
i
:
max_size
=
max_sizes
[
i
]
if
type
(
max_size
)
is
not
list
:
max_size
=
[
max_size
]
if
max_size
:
assert
len
(
max_size
)
==
len
(
min_size
),
"max_size and min_size should have same length."
if
max_size
:
num_priors_per_location
=
(
2
+
len
(
aspect_ratio
))
*
len
(
min_size
)
else
:
num_priors_per_location
=
(
1
+
len
(
aspect_ratio
))
*
len
(
min_size
)
if
flip
:
num_priors_per_location
+=
len
(
aspect_ratio
)
*
len
(
min_size
)
step
=
[]
if
len
(
steps
)
>
i
:
step
=
steps
[
i
]
# Create location prediction layer.
name
=
"{}_mbox_loc{}"
.
format
(
from_layer
,
loc_postfix
)
num_loc_output
=
num_priors_per_location
*
4
;
if
not
share_location
:
num_loc_output
*=
num_classes
ConvBNLayer
(
net
,
from_layer
,
name
,
use_bn
=
use_batchnorm
,
use_relu
=
False
,
lr_mult
=
lr_mult
,
num_output
=
num_loc_output
,
kernel_size
=
kernel_size
,
pad
=
pad
,
stride
=
1
,
**
bn_param
)
permute_name
=
"{}_perm"
.
format
(
name
)
net
[
permute_name
]
=
L
.
Permute
(
net
[
name
],
order
=
[
0
,
2
,
3
,
1
])
flatten_name
=
"{}_flat"
.
format
(
name
)
net
[
flatten_name
]
=
L
.
Flatten
(
net
[
permute_name
],
axis
=
1
)
loc_layers
.
append
(
net
[
flatten_name
])
# Create confidence prediction layer.
name
=
"{}_mbox_conf{}"
.
format
(
from_layer
,
conf_postfix
)
num_conf_output
=
num_priors_per_location
*
num_classes
;
ConvBNLayer
(
net
,
from_layer
,
name
,
use_bn
=
use_batchnorm
,
use_relu
=
False
,
lr_mult
=
lr_mult
,
num_output
=
num_conf_output
,
kernel_size
=
kernel_size
,
pad
=
pad
,
stride
=
1
,
**
bn_param
)
permute_name
=
"{}_perm"
.
format
(
name
)
net
[
permute_name
]
=
L
.
Permute
(
net
[
name
],
order
=
[
0
,
2
,
3
,
1
])
flatten_name
=
"{}_flat"
.
format
(
name
)
net
[
flatten_name
]
=
L
.
Flatten
(
net
[
permute_name
],
axis
=
1
)
conf_layers
.
append
(
net
[
flatten_name
])
# Create iou prediction layer.
if
use_iou
:
name
=
"{}_mbox_iou{}"
.
format
(
from_layer
,
conf_postfix
)
num_iou_output
=
num_priors_per_location
ConvBNLayer
(
net
,
from_layer
,
name
,
use_bn
=
use_batchnorm
,
use_relu
=
False
,
lr_mult
=
lr_mult
,
num_output
=
num_iou_output
,
kernel_size
=
kernel_size
,
pad
=
pad
,
stride
=
1
,
**
bn_param
)
permute_name
=
"{}_perm"
.
format
(
name
)
net
[
permute_name
]
=
L
.
Permute
(
net
[
name
],
order
=
[
0
,
2
,
3
,
1
])
flatten_name
=
"{}_flat"
.
format
(
name
)
net
[
flatten_name
]
=
L
.
Flatten
(
net
[
permute_name
],
axis
=
1
)
iou_layers
.
append
(
net
[
flatten_name
])
# Create prior generation layer.
name
=
"{}_mbox_priorbox"
.
format
(
from_layer
)
priorbox_param
=
{
'min_size'
:
min_size
,
'clip'
:
clip
,
'offset'
:
offset
}
if
max_size
:
priorbox_param
.
update
({
'max_size'
:
max_size
})
if
aspect_ratio
:
priorbox_param
.
update
({
'aspect_ratio'
:
aspect_ratio
,
'flip'
:
flip
})
if
step
:
priorbox_param
.
update
({
'step'
:
step
})
if
img_height
!=
0
and
img_width
!=
0
:
if
img_height
==
img_width
:
priorbox_param
.
update
({
'img_size'
:
img_height
})
else
:
priorbox_param
.
update
({
'img_h'
:
img_height
,
'img_w'
:
img_width
})
net
[
name
]
=
L
.
Python
(
net
[
from_layer
],
net
[
'im_info'
],
module
=
'layers.prior_box_layer'
,
layer
=
'PriorBoxLayer'
,
param_str
=
str
(
priorbox_param
))
priorbox_layers
.
append
(
net
[
name
])
# Create objectness prediction layer.
if
use_objectness
:
name
=
"{}_mbox_objectness"
.
format
(
from_layer
)
num_obj_output
=
num_priors_per_location
*
2
;
ConvBNLayer
(
net
,
from_layer
,
name
,
use_bn
=
use_batchnorm
,
use_relu
=
False
,
lr_mult
=
lr_mult
,
num_output
=
num_obj_output
,
kernel_size
=
kernel_size
,
pad
=
pad
,
stride
=
1
,
**
bn_param
)
permute_name
=
"{}_perm"
.
format
(
name
)
net
[
permute_name
]
=
L
.
Permute
(
net
[
name
],
order
=
[
0
,
2
,
3
,
1
])
flatten_name
=
"{}_flat"
.
format
(
name
)
net
[
flatten_name
]
=
L
.
Flatten
(
net
[
permute_name
],
axis
=
1
)
objectness_layers
.
append
(
net
[
flatten_name
])
# Concatenate priorbox, loc, and conf layers.
mbox_layers
=
[]
name
=
"mbox_loc"
net
[
name
]
=
L
.
Concat
(
*
loc_layers
,
axis
=
1
)
net
[
'mbox_loc_reshape'
]
=
L
.
Reshape
(
net
[
name
],
shape
=
{
'dim'
:
[
0
,
-
1
,
4
]})
mbox_layers
.
append
(
net
[
'mbox_loc_reshape'
])
name
=
"mbox_conf"
net
[
name
]
=
L
.
Concat
(
*
conf_layers
,
axis
=
1
)
net
[
'mbox_conf_reshape'
]
=
L
.
Reshape
(
net
[
name
],
shape
=
{
'dim'
:
[
0
,
-
1
,
num_classes
]})
mbox_layers
.
append
(
net
[
'mbox_conf_reshape'
])
if
use_iou
:
name
=
"mbox_iou"
net
[
name
]
=
L
.
Concat
(
*
iou_layers
,
axis
=
1
)
net
[
'mbox_iou_reshape'
]
=
L
.
Reshape
(
net
[
name
],
shape
=
{
'dim'
:
[
0
,
-
1
]})
mbox_layers
.
append
(
net
[
'mbox_iou_reshape'
])
name
=
"mbox_priorbox"
net
[
name
]
=
L
.
Concat
(
*
priorbox_layers
,
axis
=
0
)
mbox_layers
.
append
(
net
[
name
])
if
use_objectness
:
name
=
"mbox_objectness"
net
[
name
]
=
L
.
Concat
(
*
objectness_layers
,
axis
=
1
)
mbox_layers
.
append
(
net
[
name
])
return
mbox_layers
Dragon/python/dragon/vm/caffe/net_spec.py
Dragon/python/dragon/vm/theano/compile/function.py
...
@@ -354,15 +354,14 @@ class Function(object):
         # Store for future development
         self.meta_graph = meta_graph
-        self.graph_name = meta_graph.name

         # Call c api to create graph
-        ws.CreateGraph(meta_graph)
+        self.graph_name = ws.CreateGraph(meta_graph)

         # Bind a lambda callback to run this graph
         callback_inputs = self.inputs if explicit_inputs else []
         self.callback = lambda *args, **kwargs: \
-            ws.RunGraph(meta_graph.name, (callback_inputs, args), self.outputs, **kwargs)
+            ws.RunGraph(self.graph_name, (callback_inputs, args), self.outputs, **kwargs)

         # Self return
         return self
...
@@ -386,7 +385,7 @@ def function(inputs=None, outputs=None, givens=None, updater=None):
     ----------
     inputs : sequence of Tensor, optional
         The inputs to feed.
-    inputs : sequence of Tensor, optional
+    outputs : sequence of Tensor, optional
         The outputs to fetch.
     givens : dict of Tensor, optional
         The substitutions to use.
...
Dragon/python/dragon/vm/torch/ops/modules/axis.py
...
@@ -60,6 +60,7 @@ class Gather(BaseModule):
             'n_inputs': 2, 'n_outputs': 1,
             'arguments': {
                 'axis': self.axis,
+                'zero_grad': True,
             }
         }
...
Dragon/src/contrib/rcnn/bbox_utils.h
...
@@ -188,15 +188,15 @@ inline void RetrieveRoIs(
 template <typename T>
 inline int roi_level(
-    const int min_level,         // e.g. 2
-    const int max_level,         // e.g. 5
-    const int canonical_level,   // e.g. 4
-    const int canonical_scale,   // e.g. 224
+    const int min_level,
+    const int max_level,
+    const int canonical_level,
+    const int canonical_scale,
     T* roi) {
     T w = roi[3] - roi[1] + 1;
     T h = roi[4] - roi[2] + 1;
     // Refer the settings of paper
-    int level = canonical_level + (int)std::log(
+    int level = canonical_level + std::log2(
         std::max(std::sqrt(w * h), (T)1) / (T)canonical_scale);
     return std::min(max_level, std::max(min_level, level));
 }
...
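The corrected helper now follows the FPN-style level assignment, level = canonical_level + log2(sqrt(w * h) / canonical_scale), clamped to [min_level, max_level]; the old code used the natural logarithm instead of log2. A small Python rendering of the same arithmetic (for illustration only):

import math

def roi_level(min_level, max_level, canonical_level, canonical_scale, w, h):
    # Mirrors the C++ helper above; the float result is truncated to int.
    level = int(canonical_level +
                math.log2(max(math.sqrt(w * h), 1.0) / canonical_scale))
    return min(max_level, max(min_level, level))

print(roi_level(2, 5, 4, 224, 112, 112))  # small RoI -> 3
print(roi_level(2, 5, 4, 224, 448, 448))  # large RoI -> 5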
Dragon/src/contrib/rcnn/proposal_op.cc
...
@@ -80,7 +80,7 @@ void ProposalOp<Context>::RunWithType(
         anchors_.Reshape({ A, 4 });
         rcnn::GenerateAnchors<BT>(strides[i],
-            (int)ratios.size(), 1, &ratios[0], &scales[0],
+            (int)ratios.size(), 1, &ratios[0], &scales[i],
             anchors_.template mutable_data<BT, CPUContext>());
         rcnn::GenerateGridAnchors<BT>(
...
Dragon/src/kernels/ndarray/gather_op_kernel.cc

...

@@ -6,134 +6,93 @@ namespace dragon {

namespace kernel {

/*! CanonicalAxis <T = int32, Device = CPU> */

template <> void CanonicalAxis<int, CPUContext>(
    const int count,
    const int dim,
    int* y,
    CPUContext* ctx) {
#ifdef WITH_OMP
    #pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
    for (int i = 0; i < count; ++i)
        if (y[i] < 0) y[i] += dim;
}

/*! Gather <T = ?, Device = CPU> */

template <typename T>
void _Gather(
    const int count,
    const int outer_dim,
    const int inner_dim,
    const int x_slice_dim,
    const int y_slice_dim,
    const int* indices,
    const int64_t* indices,
    const T* x,
    T* y,
    CPUContext* ctx) {
    int64_t x_offset, y_offset, x_idx_offset, y_idx_offset;
    for (int i = 0; i < y_slice_dim; ++i) {
        y_idx_offset = i;
        x_idx_offset = indices[y_idx_offset];
    int64_t x_offset, select_idx;
    for (int n = 0; n < outer_dim; ++n) {
        x_offset = (n * x_slice_dim + x_idx_offset) * inner_dim;
        y_offset = (n * y_slice_dim + y_idx_offset) * inner_dim;
        for (int i = 0; i < y_slice_dim; ++i) {
            select_idx = indices[i];
            select_idx = select_idx >= 0 ?
                select_idx : select_idx + x_slice_dim;
            x_offset = (n * x_slice_dim + select_idx) * inner_dim;
            ctx->Copy<T, CPUContext, CPUContext>(
                inner_dim, y + y_offset, x + x_offset);
                inner_dim, y, x + x_offset);
            y += inner_dim;
        }
    }
}

/*! Gather <T = float32, Device = CPU> */

template <> void Gather<float, CPUContext>(
    const int count,
    const int outer_dim,
    const int inner_dim,
    const int x_slice_dim,
    const int y_slice_dim,
    const int* indices,
    const float* x,
    float* y,
    CPUContext* ctx) {
    _Gather<float>(count, outer_dim, inner_dim,
        x_slice_dim, y_slice_dim, indices, x, y, ctx);
}

/*! Gather <T = int32, Device = CPU> */

template <> void Gather<int, CPUContext>(
    const int count,
    const int outer_dim,
    const int inner_dim,
    const int x_slice_dim,
    const int y_slice_dim,
    const int* indices,
    const int* x,
    int* y,
    CPUContext* ctx) {
    _Gather<int>(count, outer_dim, inner_dim,
        x_slice_dim, y_slice_dim, indices, x, y, ctx);
}

/*! GatherGrad <T = ?, Device = CPU> */

template <typename T>
void _GatherGrad(
    const int count,
    const int outer_dim,
    const int inner_dim,
    const int x_slice_dim,
    const int y_slice_dim,
    const int* indices,
    const int64_t* indices,
    const T* dy,
    T* dx,
    CPUContext* ctx) {
    int64_t x_offset, y_offset, x_idx_offset, y_idx_offset;
    for (int i = 0; i < y_slice_dim; ++i) {
        y_idx_offset = i;
        x_idx_offset = indices[y_idx_offset];
    int64_t x_offset, select_idx;
    for (int n = 0; n < outer_dim; ++n) {
        x_offset = (n * x_slice_dim + x_idx_offset) * inner_dim;
        y_offset = (n * y_slice_dim + y_idx_offset) * inner_dim;
        for (int i = 0; i < y_slice_dim; ++i) {
            select_idx = indices[i];
            select_idx = select_idx >= 0 ?
                select_idx : select_idx + x_slice_dim;
            x_offset = (n * x_slice_dim + select_idx) * inner_dim;
            math::Add<T, CPUContext>(inner_dim,
                dy + y_offset, dx + x_offset, dx + x_offset, ctx);
                dy, dx + x_offset, dx + x_offset, ctx);
            dy += inner_dim;
        }
    }
}

/*! GatherGrad <T = float32, Device = CPU> */

template <> void GatherGrad<float, CPUContext>(
    const int count,
    const int outer_dim,
    const int inner_dim,
    const int x_slice_dim,
    const int y_slice_dim,
    const int* indices,
    const float* dy,
    float* dx,
    CPUContext* ctx) {
    _GatherGrad<float>(count, outer_dim, inner_dim,
        x_slice_dim, y_slice_dim, indices, dy, dx, ctx);
}

/*! GatherGrad <T = int32, Device = CPU> */

/*! Kernel Launchers */

#define DEFINE_GATHER_KERNEL_LAUNCHER(name, T) \
    template <> void name<T, CPUContext>( \
        const int outer_dim, \
        const int inner_dim, \
        const int x_slice_dim, \
        const int y_slice_dim, \
        const int64_t* indices, \
        const T* x, \
        T* y, \
        CPUContext* ctx) { \
        _##name<T> \
            (outer_dim, inner_dim, x_slice_dim, \
                y_slice_dim, indices, x, y, ctx); \
    }

template <> void GatherGrad<int, CPUContext>(
    const int count,
    const int outer_dim,
    const int inner_dim,
    const int x_slice_dim,
    const int y_slice_dim,
    const int* indices,
    const int* dy,
    int* dx,
    CPUContext* ctx) {
    _GatherGrad<int>(count, outer_dim, inner_dim,
        x_slice_dim, y_slice_dim, indices, dy, dx, ctx);
}

DEFINE_GATHER_KERNEL_LAUNCHER(Gather, bool);
DEFINE_GATHER_KERNEL_LAUNCHER(Gather, int8_t);
DEFINE_GATHER_KERNEL_LAUNCHER(Gather, uint8_t);
DEFINE_GATHER_KERNEL_LAUNCHER(Gather, int);
DEFINE_GATHER_KERNEL_LAUNCHER(Gather, int64_t);
DEFINE_GATHER_KERNEL_LAUNCHER(Gather, float16);
DEFINE_GATHER_KERNEL_LAUNCHER(Gather, float);
DEFINE_GATHER_KERNEL_LAUNCHER(Gather, double);

DEFINE_GATHER_KERNEL_LAUNCHER(GatherGrad, int8_t);
DEFINE_GATHER_KERNEL_LAUNCHER(GatherGrad, uint8_t);
DEFINE_GATHER_KERNEL_LAUNCHER(GatherGrad, int);
DEFINE_GATHER_KERNEL_LAUNCHER(GatherGrad, int64_t);
DEFINE_GATHER_KERNEL_LAUNCHER(GatherGrad, float16);
DEFINE_GATHER_KERNEL_LAUNCHER(GatherGrad, float);
DEFINE_GATHER_KERNEL_LAUNCHER(GatherGrad, double);

#undef DEFINE_GATHER_KERNEL_LAUNCHER

}  // namespace kernel
...
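For reference, the (outer_dim, inner_dim, x_slice_dim, y_slice_dim) decomposition treats the input as a 3-D view [outer_dim, x_slice_dim, inner_dim] and gathers y_slice_dim slices along the middle axis; gathering rows of an [N, D] matrix corresponds to outer_dim = 1, x_slice_dim = N, inner_dim = D. A standalone sketch of the same indexing (plain C++, independent of Dragon's CPUContext):

    #include <cstdint>
    #include <cstring>

    // Gather along the "slice" axis of an [outer, x_slice, inner] view.
    // Negative indices count back from x_slice, as in the kernel above.
    template <typename T>
    void GatherCPU(int outer_dim, int inner_dim,
                   int x_slice_dim, int y_slice_dim,
                   const int64_t* indices, const T* x, T* y) {
        for (int n = 0; n < outer_dim; ++n) {
            for (int i = 0; i < y_slice_dim; ++i) {
                int64_t idx = indices[i];
                if (idx < 0) idx += x_slice_dim;
                const T* src = x + (n * x_slice_dim + idx) * inner_dim;
                std::memcpy(y, src, sizeof(T) * inner_dim);
                y += inner_dim;  // output slices are written contiguously
            }
        }
    }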
Dragon/src/kernels/ndarray/gather_op_kernel.cu
...
...
@@ -2,160 +2,176 @@
#include "core/context_cuda.h"
#include "utils/op_kernel.h"
#include "utils/cub_device.h"
namespace dragon {
namespace kernel {
/*! CanonicalAxis <T = int32, Device = CUDA> */
template <typename T>
__global__ void _CanonicalAxis(
const int count,
const int dim,
T* y) {
CUDA_1D_KERNEL_LOOP(idx, count) {
if (y[idx] < 0) y[idx] += dim;
}
}
template <> void CanonicalAxis<int, CUDAContext>(
const int count,
const int dim,
int* y,
CUDAContext* ctx) {
_CanonicalAxis<int>
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >
(count, dim, y);
}
/*! Gather <T = ?, Device = CUDA> */
template <typename T>
__global__ void _Gather(
const int count,
const int outer_dim,
const int nthreads,
const int inner_dim,
const int x_slice_dim,
const int y_slice_dim,
    const int* indices,
    const int64_t* indices,
const T* x,
T* y) {
CUDA_1D_KERNEL_LOOP(idx, count) {
const int outer_idx = idx / inner_dim / y_slice_dim;
const int slice_idx = idx % inner_dim;
const int y_idx_offset = (idx / inner_dim) % y_slice_dim;
const int x_idx_offset = indices[y_idx_offset];
const int x_idx = (outer_idx * x_slice_dim + x_idx_offset)
* inner_dim + slice_idx;
y[idx] = x[x_idx];
CUDA_1D_KERNEL_LOOP(y_idx, nthreads) {
const int outer_idx = y_idx / inner_dim / y_slice_dim;
const int inner_idx = y_idx % inner_dim;
#if __CUDA_ARCH__ >= 350
int select_idx = __ldg(indices +
((y_idx / inner_dim) % y_slice_dim));
#else
int select_idx = indices[
(y_idx / inner_dim) % y_slice_dim];
#endif
select_idx = select_idx >= 0 ?
select_idx : select_idx + x_slice_dim;
const int x_idx = (outer_idx * x_slice_dim + select_idx)
* inner_dim + inner_idx;
y[y_idx] = x[x_idx];
}
}
/*! Gather <T = float32, Device = CUDA> */
template <> void Gather<float, CUDAContext>(
const int count,
const int outer_dim,
const int inner_dim,
const int x_slice_dim,
const int y_slice_dim,
const int* indices,
const float* x,
float* y,
CUDAContext* ctx) {
_Gather<float>
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >
(count, outer_dim, inner_dim,
x_slice_dim, y_slice_dim,
indices, x, y);
}
/*! Gather <T = int32, Device = CUDA> */
template <> void Gather<int, CUDAContext>(
const int count,
const int outer_dim,
const int inner_dim,
const int x_slice_dim,
const int y_slice_dim,
const int* indices,
const int* x,
int* y,
CUDAContext* ctx) {
_Gather<int>
<< <CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >
(count, outer_dim, inner_dim,
x_slice_dim, y_slice_dim,
indices, x, y);
}
/*! GatherGrad <T = ?, Device = CUDA> */
template <typename T>
__global__ void _GatherGrad(
const int count,
const int outer_dim,
const int nthreads,
const int inner_dim,
const int x_slice_dim,
const int y_slice_dim,
    const int* indices,
    const int64_t* indices,
const T* dy,
T* dx) {
CUDA_1D_KERNEL_LOOP(idx, count) {
const int outer_idx = idx / inner_dim / y_slice_dim;
const int slice_idx = idx % inner_dim;
const int y_idx_offset = (idx / inner_dim) % y_slice_dim;
const int x_idx_offset = indices[y_idx_offset];
const int x_idx = (outer_idx * x_slice_dim + x_idx_offset)
* inner_dim + slice_idx;
atomicAdd(dx + x_idx, dy[idx]);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
const int outer_idx = i / inner_dim;
const int inner_idx = i % inner_dim;
for (int j = 0; j < y_slice_dim; ++j) {
#if __CUDA_ARCH__ >= 350
int select_idx = __ldg(indices + j);
#else
int select_idx = indices[j];
#endif
select_idx = select_idx >= 0 ?
select_idx : select_idx + x_slice_dim;
const int x_idx = (outer_idx * x_slice_dim + select_idx)
* inner_dim + inner_idx;
const int y_idx = (outer_idx * y_slice_dim + j)
* inner_dim + inner_idx;
dx[x_idx] += dy[y_idx];
}
}
}
/*! GatherGrad <T = float32, Device = CUDA> */
/*! GatherGrad <T = float16, Device = CUDA> */
template <> void GatherGrad<float, CUDAContext>(
const int count,
const int outer_dim,
template <> __global__ void _GatherGrad<half>(
const int nthreads,
const int inner_dim,
const int x_slice_dim,
const int y_slice_dim,
const int* indices,
const float* dy,
float* dx,
CUDAContext* ctx) {
_GatherGrad<float>
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >
(count, outer_dim, inner_dim,
x_slice_dim, y_slice_dim,
indices, dy, dx);
const int64_t* indices,
const half* dy,
half* dx) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530
const int outer_idx = i / inner_dim;
const int inner_idx = i % inner_dim;
for (int j = 0; j < y_slice_dim; ++j) {
int select_idx = __ldg(indices + j);
select_idx = select_idx >= 0 ?
select_idx : select_idx + x_slice_dim;
const int x_idx = (outer_idx * x_slice_dim + select_idx)
* inner_dim + inner_idx;
const int y_idx = (outer_idx * y_slice_dim + j)
* inner_dim + inner_idx;
dx[x_idx] = __hadd(dx[x_idx], dy[y_idx]);
}
#endif
}
}
/*! GatherGrad <T = int32, Device = CUDA> */
/*! Kernel Launchers */
#define DEFINE_GATHER_KERNEL_LAUNCHER(T) \
template <> void Gather<T, CUDAContext>( \
const int outer_dim, \
const int inner_dim, \
const int x_slice_dim, \
const int y_slice_dim, \
const int64_t* indices, \
const T* x, \
T* y, \
CUDAContext* ctx) { \
auto nthreads = outer_dim * y_slice_dim * inner_dim; \
_Gather<T> \
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >> > \
(nthreads, inner_dim, x_slice_dim, \
y_slice_dim, indices, x, y); \
}
template <> void GatherGrad<int, CUDAContext>(
const int count,
#define DEFINE_GATHER_GRAD_KERNEL_LAUNCHER(T) \
template <> void GatherGrad<T, CUDAContext>( \
const int outer_dim, \
const int inner_dim, \
const int x_slice_dim, \
const int y_slice_dim, \
const int64_t* indices, \
const T* dy, \
T* dx, \
CUDAContext* ctx) { \
auto nthreads = outer_dim * inner_dim; \
_GatherGrad<T> \
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >> > \
(nthreads, inner_dim, x_slice_dim, \
y_slice_dim, indices, dy, dx); \
}
DEFINE_GATHER_KERNEL_LAUNCHER(bool);
DEFINE_GATHER_KERNEL_LAUNCHER(int8_t);
DEFINE_GATHER_KERNEL_LAUNCHER(uint8_t);
DEFINE_GATHER_KERNEL_LAUNCHER(int);
DEFINE_GATHER_KERNEL_LAUNCHER(int64_t);
DEFINE_GATHER_KERNEL_LAUNCHER(float16);
DEFINE_GATHER_KERNEL_LAUNCHER(float);
DEFINE_GATHER_KERNEL_LAUNCHER(double);
DEFINE_GATHER_GRAD_KERNEL_LAUNCHER(int8_t);
DEFINE_GATHER_GRAD_KERNEL_LAUNCHER(uint8_t);
DEFINE_GATHER_GRAD_KERNEL_LAUNCHER(int);
DEFINE_GATHER_GRAD_KERNEL_LAUNCHER(int64_t);
DEFINE_GATHER_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GATHER_GRAD_KERNEL_LAUNCHER(double);
template <> void GatherGrad<float16, CUDAContext>(
const int outer_dim,
const int inner_dim,
const int x_slice_dim,
const int y_slice_dim,
    const int* indices,
    const int* dy,
    int* dx,
    const int64_t* indices,
    const float16* dy,
    float16* dx,
CUDAContext* ctx) {
_GatherGrad<int>
<< < CUDA_BLOCKS(count), CUDA_THREADS,
auto nthreads = outer_dim * inner_dim;
_GatherGrad<half>
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >
(count, outer_dim, inner_dim,
x_slice_dim, y_slice_dim,
indices, dy, dx);
(nthreads, inner_dim, x_slice_dim,
y_slice_dim, indices,
reinterpret_cast<const half*>(dy),
reinterpret_cast<half*>(dx));
}
#undef DEFINE_GATHER_KERNEL_LAUNCHER
#undef DEFINE_GATHER_GRAD_KERNEL_LAUNCHER
} // namespace kernel
} // namespace dragon
...
...
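Note the shift in the backward kernel's parallelization: the previous version launched one thread per output element and relied on atomicAdd into dx, while the new version launches one thread per (outer, inner) position and loops over the gathered indices serially, so repeated indices accumulate without atomics (which is presumably also what enables the half-precision specialization built on __hadd). A serial C++ rendering of that per-thread loop, for reference only:

    #include <cstdint>

    // Mirrors the new _GatherGrad mapping: the outer loop plays the role of
    // one CUDA thread per (outer_idx, inner_idx); dx is assumed pre-zeroed
    // (or holding gradients to accumulate into).
    template <typename T>
    void GatherGradReference(int outer_dim, int inner_dim,
                             int x_slice_dim, int y_slice_dim,
                             const int64_t* indices, const T* dy, T* dx) {
        const int nthreads = outer_dim * inner_dim;
        for (int i = 0; i < nthreads; ++i) {
            const int outer_idx = i / inner_dim;
            const int inner_idx = i % inner_dim;
            for (int j = 0; j < y_slice_dim; ++j) {
                int64_t idx = indices[j];
                if (idx < 0) idx += x_slice_dim;
                const int64_t x_i = (outer_idx * x_slice_dim + idx) * inner_dim + inner_idx;
                const int64_t y_i = (outer_idx * y_slice_dim + j) * inner_dim + inner_idx;
                dx[x_i] += dy[y_i];  // no race: each "thread" owns its inner_idx column
            }
        }
    }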
Dragon/src/contrib/onnx/onnx_attibute.cc → Dragon/src/onnx/onnx_attibute.cc

#include "contrib/onnx/onnx_backend.h"
#include "onnx/onnx_backend.h"

namespace dragon {
...
Dragon/src/contrib/onnx/onnx_backend.cc → Dragon/src/onnx/onnx_backend.cc

#include "core/operator_schema.h"
#include "utils/proto_utils.h"
#include "contrib/onnx/onnx_backend.h"
#include "onnx/onnx_backend.h"

namespace dragon {
...
Dragon/src/contrib/onnx/onnx_backend.h → Dragon/src/onnx/onnx_backend.h

/*!
 * Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
 *
 * Licensed under the BSD 2-Clause License.
 * You should have received a copy of the BSD 2-Clause License
 * along with the software. If not, See,
 *
 *      <https://opensource.org/licenses/BSD-2-Clause>
 *
 * Codes are based on:
 *
 *      <https://github.com/pytorch/pytorch/blob/master/caffe2/onnx/backend.h>
 *
 * ------------------------------------------------------------
 */

#ifndef DRAGON_CONTRIB_ONNX_ONNX_BACKEND_H_
#define DRAGON_CONTRIB_ONNX_ONNX_BACKEND_H_
#ifndef DRAGON_ONNX_ONNX_BACKEND_H_
#define DRAGON_ONNX_ONNX_BACKEND_H_

#include "core/common.h"
#include "proto/onnx.pb.h"
...

@@ -228,4 +228,4 @@ class ONNXBackend {
}  // namespace dragon

#endif  // DRAGON_CONTRIB_ONNX_ONNX_BACKEND_H_
\ No newline at end of file
#endif  // DRAGON_ONNX_ONNX_BACKEND_H_
\ No newline at end of file
Dragon/src/contrib/onnx/onnx_importer.cc → Dragon/src/onnx/onnx_importer.cc

#include "utils/map_utils.h"
#include "contrib/onnx/onnx_backend.h"
#include "onnx/onnx_backend.h"

namespace dragon {
...
Dragon/src/contrib/onnx/onnx_initializer.cc → Dragon/src/onnx/onnx_initializer.cc

#include "contrib/onnx/onnx_backend.h"
#include "onnx/onnx_backend.h"

namespace dragon {
...
Dragon/src/operators/arithmetic/maximum_op.cc
Dragon/src/operators/ndarray/gather_op.cc
...
...
@@ -13,12 +13,10 @@ namespace dragon {
template <class Context> template <typename T>
void GatherOp<Context>::RunWithType() {
    auto* Xdata = Input(0).template data<T, Context>();
    auto* indices = Input(1).template mutable_data<int, Context>();
    auto* indices = Input(1).template mutable_data<int64_t, Context>();
    auto* Ydata = Output(0)->template mutable_data<T, Context>();
    kernel::CanonicalAxis(Input(1).count(), x_slice_dim, indices, ctx());
    kernel::Gather(Output(0)->count(),
    kernel::Gather(
        outer_dim, inner_dim, x_slice_dim, y_slice_dim,
        indices, Xdata, Ydata, ctx());
...
@@ -28,22 +26,38 @@ template <class Context>
void GatherOp<Context>::RunOnDevice() {
    DETERMINE_RUNTIME_ARGUMENTS(Input(0));
    output_dims = Input(0).dims();
    x_slice_dim = Input(0).dim(axis);
    output_dims[axis] = y_slice_dim = Input(1).count();
    y_slice_dim = Input(1).count();
    outer_dim = Input(0).count(0, axis);
    inner_dim = Input(0).count(axis + 1);
    CHECK_GT(y_slice_dim, 0) << "\nLength of indices must > 0.";
    const auto& s1 = Input(0).dims().begin();
    const auto& e1 = s1 + axis, s3 = e1 + 1;
    const auto& e3 = Input(0).dims().end();
    const auto& s2 = Input(1).dims().begin();
    const auto& e2 = Input(1).dims().end();
    output_dims.assign(s1, e1);
    output_dims.insert(output_dims.end(), s2, e2);
    output_dims.insert(output_dims.end(), s3, e3);
    Output(0)->Reshape(output_dims);
    CHECK(Input(1).template IsType<int>())
        << "\nThe type of indices should be int32.";
    CHECK(Input(1).template IsType<int64_t>())
        << "\nThe type of indices should be int64.";
    if (XIsType(Input(0), float)) RunWithType<float>();
    if (XIsType(Input(0), bool)) RunWithType<bool>();
    else if (XIsType(Input(0), int8_t)) RunWithType<int8_t>();
    else if (XIsType(Input(0), uint8_t)) RunWithType<uint8_t>();
    else if (XIsType(Input(0), int)) RunWithType<int>();
    else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "int32" });
    else if (XIsType(Input(0), int64_t)) RunWithType<int64_t>();
    else if (XIsType(Input(0), float16)) RunWithType<float16>();
    else if (XIsType(Input(0), float)) RunWithType<float>();
    else if (XIsType(Input(0), double)) RunWithType<double>();
    else LOG(FATAL) << DTypeHelper(Input(0), {
        "bool", "int8", "uint8", "int32", "int64",
        "float16", "float32", "float64",
    });
}

DEPLOY_CPU(Gather);
...
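The rewritten RunOnDevice builds the output shape as X.dims[:axis] + indices.dims + X.dims[axis+1:] (instead of overwriting dims[axis] with indices.count()), so multi-dimensional index tensors keep their shape in the output. A small sketch of that shape rule (an illustrative helper, not Dragon API):

    #include <cstdint>
    #include <vector>

    // Output shape of Gather(X, indices, axis):
    // X.dims[:axis] ++ indices.dims ++ X.dims[axis+1:]
    std::vector<int64_t> GatherOutputDims(
        const std::vector<int64_t>& x_dims,
        const std::vector<int64_t>& idx_dims,
        int axis) {
        std::vector<int64_t> out(x_dims.begin(), x_dims.begin() + axis);
        out.insert(out.end(), idx_dims.begin(), idx_dims.end());
        out.insert(out.end(), x_dims.begin() + axis + 1, x_dims.end());
        return out;
    }

    // e.g. X: [2, 5, 3], indices: [4, 6], axis = 1  ->  [2, 4, 6, 3]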
@@ -54,18 +68,17 @@ OPERATOR_SCHEMA(Gather).NumInputs(2).NumOutputs(1);
template <class Context> template <typename T>
void GatherGradientOp<Context>::RunWithType() {
    auto* indices = Input(1).template data<int, Context>();
    auto* indices = Input(1).template data<int64_t, Context>();
    auto* dYdata = Input(-1).template data<T, Context>();
    auto* dXdata = Output(0)->template mutable_data<T, Context>();
    T* dXdata = nullptr;
    if (!acc_grad) {
        dXdata = Output(0)->template mutable_data<T, Context>();
        math::Set(Output(0)->count(), cast::to<T>(0.f), dXdata, ctx());
    } else {
        dXdata = Output(0)->template mutable_data<T, Context>();
    // Zero the gradients Optionally
    if (zero_grad) {
        math::Set(Output(0)->count(), cast::to<T>(0.f), dXdata, ctx());
    }
    kernel::GatherGrad(Input(-1).count(),
    kernel::GatherGrad(
        outer_dim, inner_dim, x_slice_dim, y_slice_dim,
        indices, dYdata, dXdata, ctx());
...
@@ -82,12 +95,20 @@ void GatherGradientOp<Context>::RunOnDevice() {
    Output(0)->ReshapeLike(Input(0));
    CHECK(Input(1).template IsType<int>())
        << "\nThe type of indices should be int32.";
    CHECK(Input(1).template IsType<int64_t>())
        << "\nThe type of indices should be int64.";
    if (XIsType(Input(0), float)) RunWithType<float>();
    if (XIsType(Input(0), int8_t)) RunWithType<int8_t>();
    else if (XIsType(Input(0), uint8_t)) RunWithType<uint8_t>();
    else if (XIsType(Input(0), int)) RunWithType<int>();
    else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "int32" });
    else if (XIsType(Input(0), int64_t)) RunWithType<int64_t>();
    else if (XIsType(Input(0), float16)) RunWithType<float16>();
    else if (XIsType(Input(0), float)) RunWithType<float>();
    else if (XIsType(Input(0), double)) RunWithType<double>();
    else LOG(FATAL) << DTypeHelper(Input(0), {
        "int8", "uint8", "int32", "int64",
        "float16", "float32", "float64",
    });
}

DEPLOY_CPU(GatherGradient);
...
Dragon/src/operators/vision/drop_block2d_op.cc
...
...
@@ -15,6 +15,27 @@ void DropBlock2dOp<Context>::RunWithType() {
            Output(0)->count(), Ydata, Xdata);
        }
    }
    else if (phase() == "TRAIN") {
        if (data_format == "NCHW") {
            n = Input(0).dim(0), c = Input(0).dim(1);
            h = Input(0).dim(2), w = Input(0).dim(3);
        } else if (data_format == "NHWC") {
            n = Input(0).dim(0), c = Input(0).dim(-1);
            h = Input(0).dim(1), w = Input(0).dim(2);
        }
        seed_h = h - block_size + 1;
        seed_w = w - block_size + 1;
        CHECK(seed_h > 0 && seed_w > 0)
            << "\nExcepted block_size <= feat_size.";
        if (decrement > 0 && apply_prob > keep_prob()) {
            apply_prob -= decrement;
        } else { apply_prob = keep_prob(); }
        gamma = (1.f - apply_prob) / (block_size * block_size);
        gamma *= (alpha * (h * w) / (seed_h * seed_w));
        auto* mask = ws()->CreateTensor(
            mount_name("drop_block/mask"))->ReshapeLike(Input(0));
        auto* norm = ws()->CreateTensor(mount_name(
...
@@ -58,29 +79,8 @@ void DropBlock2dOp<Context>::RunWithType() {
template <class Context>
void DropBlock2dOp<Context>::RunOnDevice() {
    if (data_format == "NCHW") {
        n = Input(0).dim(0), c = Input(0).dim(1);
        h = Input(0).dim(2), w = Input(0).dim(3);
    } else if (data_format == "NHWC") {
        n = Input(0).dim(0), c = Input(0).dim(-1);
        h = Input(0).dim(1), w = Input(0).dim(2);
    }
    seed_h = h - block_size + 1;
    seed_w = w - block_size + 1;
    CHECK(seed_h > 0 && seed_w > 0)
        << "\nExcepted block_size <= feat_size.";
    Output(0)->ReshapeLike(Input(0));
    if (decrement > 0 && apply_prob > keep_prob()) {
        apply_prob -= decrement;
    } else { apply_prob = keep_prob(); }
    gamma = (1.f - apply_prob) / (block_size * block_size);
    gamma *= (alpha * (h * w) / (seed_h * seed_w));
    if (XIsType(Input(0), float)) RunWithType<float>();
    else if (XIsType(Input(0), float16)) RunWithType<float16>();
    else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
...
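The gamma computed here follows the DropBlock paper: the per-seed drop probability is scaled so that, once each seed grows into a block_size x block_size square, the expected fraction of dropped activations matches (1 - keep_prob). A quick numeric check of that formula (alpha is a Dragon-specific multiplier; the values below are assumptions for illustration only):

    #include <cstdio>

    int main() {
        const float keep_prob = 0.9f, alpha = 1.0f;
        const int block_size = 7, h = 56, w = 56;
        const int seed_h = h - block_size + 1;   // 50
        const int seed_w = w - block_size + 1;   // 50
        float gamma = (1.f - keep_prob) / (block_size * block_size);
        gamma *= alpha * (h * w) / float(seed_h * seed_w);
        std::printf("gamma = %f\n", gamma);      // ~0.00256 per seed position
        return 0;
    }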