Commit 1d03e8e2 by Ting PAN

Optimize GatherOp

1 parent c5def39b
Showing with 341 additions and 1230 deletions
......@@ -283,14 +283,16 @@ code.docutils.literal:hover {
dt {
font-weight: 700;
background: #e7f2fa;
background: #f7f7f7;
border-bottom: solid #0079b2;
border-radius: 1px;
border-radius: 8px;
margin-bottom: 20px;
padding: 8px;
width: 75%;
dt:target, .highlighted {
background-color: #e7f2fa;
background-color: #f7f7f7;
border-bottom: 3px solid #c7254e;
......@@ -299,7 +301,7 @@ dt:target:before {
content: '';
display: block;
height: 65px;
margin: -20px 0 0;
margin: -20px -8px 8px;
dl.method dt {
......@@ -5,8 +5,8 @@
.. toctree::
Quick Shortcut
Quick Reference
========================== =============================================================================
List Brief
......@@ -5,8 +5,8 @@
.. toctree::
Quick Shortcut
Quick Reference
============================== =============================================================================
List Brief
......@@ -5,8 +5,8 @@
.. toctree::
Quick Shortcut
Quick Reference
============================== =============================================================================
List Brief
......@@ -5,8 +5,8 @@
.. toctree::
Quick Shortcut
Quick Reference
==================== =============================================================================
List Brief
......@@ -5,8 +5,8 @@
.. toctree::
Quick Shortcut
Quick Reference
==================== =============================================================================
List Brief
......@@ -5,8 +5,8 @@
.. toctree::
Quick Shortcut
Quick Reference
==================== =============================================================================
List Brief
......@@ -5,8 +5,8 @@
.. toctree::
Quick Shortcut
Quick Reference
==================== =============================================================================
List Brief
......@@ -5,8 +5,8 @@
.. toctree::
Quick Shortcut
Quick Reference
==================== =============================================================================
List Brief
......@@ -5,8 +5,8 @@
.. toctree::
Quick Shortcut
Quick Reference
==================== =============================================================================
List Brief
......@@ -112,8 +112,8 @@ List Brief
================================= =============================================================================
Quick Shortcut
Quick Reference
==================== =============================================================================
List Brief
......@@ -5,8 +5,8 @@
.. toctree::
Quick Shortcut
Quick Reference
========================= ============================================================================
List Brief
......@@ -5,8 +5,8 @@
.. toctree::
Quick Shortcut
Quick Reference
========================= =============================================================================
List Brief
......@@ -5,8 +5,8 @@
.. toctree::
Quick Shortcut
Quick Reference
==================== =============================================================================
List Brief
......@@ -6,8 +6,8 @@
Quick Shortcut
Quick Reference
============================== =======================================================================
List Brief
......@@ -39,15 +39,15 @@ class GatherGradientOp final : public Operator<Context> {
GatherGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
axis(OperatorBase::Arg<int64_t>("axis", 0)),
acc_grad(OperatorBase::Arg<bool>("acc_gradient", false)) {}
zero_grad(OperatorBase::Arg<bool>("zero_grad", true)) {}
void RunOnDevice() override;
template <typename T> void RunWithType();
bool zero_grad;
int64_t axis, outer_dim, inner_dim, x_slice_dim, y_slice_dim;
bool acc_grad;
} // namespace dragon
......@@ -601,32 +601,23 @@ void ArgMin(
/*! ndarray.gather */
template <typename T, class Context>
void CanonicalAxis(
const int count,
const int dim,
T* y,
Context* ctx);
template <typename T, class Context>
void Gather(
const int count,
const int outer_dim,
const int inner_dim,
const int x_slice_dim,
const int y_slice_dim,
const int* indices,
const int64_t* indices,
const T* x,
T* y,
Context* ctx);
template <typename T, class Context>
void GatherGrad(
const int count,
const int outer_dim,
const int inner_dim,
const int x_slice_dim,
const int y_slice_dim,
const int* indices,
const int64_t* indices,
const T* dy,
T* dx,
Context* ctx);
......@@ -3,7 +3,7 @@
#include "core/common.h"
#include "utils/proto_utils.h"
#include "utils/caffemodel.h"
#include "contrib/onnx/onnx_backend.h"
#include "onnx/onnx_backend.h"
#include "dragon.h"
......@@ -11,7 +11,7 @@
#include "contrib/onnx/onnx_backend.h"
#include "onnx/onnx_backend.h"
#include "py_dragon.h"
......@@ -270,7 +270,7 @@ def ExportMetaGraph(prefix=''):
These text files will be saved as the following format:
Note that an empty prefix will lead to invalid exporting.
......@@ -293,12 +293,12 @@ def SetLoggingLevel(level):
level : str
The level, ``DEBUG``, ``INFO``, ``WARNING``, ``ERROR`` or ``FATAL``.
level : {'DEBUG', 'INFO', 'WARNING', 'ERROR', 'FATAL'}, required
The logging level.
The default level is ``INFO``.
The default level is *INFO*.
......@@ -391,9 +391,12 @@ class OperatorHelper(object):
def _apply_Gather(cls, arguments, inputs, outputs):
outputs[0].dtype = inputs[0].dtype
axis = arguments['axis']
outputs[0].shape = inputs[0].shape[:]
outputs[0].shape[arguments['axis']] = None
outputs[0].shape = \
inputs[0].shape[:axis] + \
inputs[1].shape[:] + \
inputs[0].shape[axis + 1:]
return outputs
......@@ -17,10 +17,10 @@ from . import *
def Gather(inputs, indices, axis=0, acc_gradient=False, **kwargs):
def Gather(inputs, indices, axis=0, zero_grad=True, **kwargs):
"""Gather the input according to the indices along the given axis.
**Type Constraints**: (*int32*, *float32*)
**Type Constraints**: (*bool*, *int8*, *uint8*, *int32*, *int64*, *float16*, *float32*, *float64*)
......@@ -30,7 +30,7 @@ def Gather(inputs, indices, axis=0, acc_gradient=False, **kwargs):
The indices to form output tensor.
axis : int, optional
The start axis, can be negative.
acc_gradient : bool, optional
zero_grad : bool, optional
Whether to zero the gradients before accumulating them.
......@@ -40,24 +40,10 @@ def Gather(inputs, indices, axis=0, acc_gradient=False, **kwargs):
arguments = ParseArgs(locals())
arguments['inputs'], arguments['indices'] = [arguments['inputs'],
Tensor.Convert(indices, dtype='int32')], None
output = Tensor.CreateOperator('Gather', **arguments)
output.shape = inputs.shape[:]
if not isinstance(indices, Tensor):
if not isinstance(indices, (list, tuple)):
indices = [indices]
output.shape[axis] = len(indices)
output.shape[axis] = None
return output
arguments['inputs'], arguments['indices'] = \
[arguments['inputs'], Tensor.Convert(
indices, dtype='int64')], None
return Tensor.CreateOperator('Gather', **arguments)
......@@ -283,9 +283,7 @@ def Pool2d(
def ROIPool(inputs, pool_h, pool_w, spatial_scale=1.0, **kwargs):
"""Max RoI Pooling. `[Girshick, 2015] <>`_.
The first dimension of input must be ``1``.
"""Max RoIPooling. `[Girshick, 2015] <>`_.
**Type Constraints**: (*float16*, *float32*)
......@@ -311,9 +309,7 @@ def ROIPool(inputs, pool_h, pool_w, spatial_scale=1.0, **kwargs):
def ROIAlign(inputs, pool_h=0, pool_w=0, spatial_scale=1.0, sampling_ratio=2, **kwargs):
"""AVG ROIAlign. `[He, 2017] <>`_.
The first dimension of input must be ``1``.
"""AVG RoIAlign. `[He, 2017] <>`_.
**Type Constraints**: (*float16*, *float32*)
......@@ -20,7 +20,7 @@ from multiprocessing import Process
class BlobFetcher(Process):
"""BlobFetcher is deployed to queue blobs from `DataTransformer`_.
It is supported to form ``NHWC`` image blobs and ``1D`` label blobs.
It supports forming *NHWC* image blobs and *1d* label blobs.
def __init__(self, **kwargs):
......@@ -26,7 +26,7 @@ from .blob_fetcher import BlobFetcher
class DataBatch(object):
"""DataBatch aims to prefetch data by ``Triple-Buffering``.
"""DataBatch aims to prefetch data by *Triple-Buffering*.
It takes full advantage of the Process/Thread of Python,
which provides a remarkable I/O speed-up for scalable distributed training.
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
# <>
# Codes are based on:
# <>
# ------------------------------------------------------------
import os
from dragon.vm.caffe import layers as L
from dragon.vm.caffe import params as P
from dragon.vm.caffe.proto import caffe_pb2
def check_if_exist(path):
    """Tell whether ``path`` refers to an existing file-system entry.

    Parameters
    ----------
    path : str
        The file or directory path to probe.

    Returns
    -------
    bool
        *True* if the path exists, *False* otherwise.

    """
    found = os.path.exists(path)
    return found
def make_if_not_exist(path):
    """Create the directory ``path`` unless something already exists there.

    Missing parent directories are created as well.

    Parameters
    ----------
    path : str
        The directory to create.

    """
    if not os.path.exists(path):
        os.makedirs(path)
def UnpackVariable(var, num):
    """Expand ``var`` into a list holding ``num`` values.

    Parameters
    ----------
    var : object or list
        A scalar, a one-element list, or a list that already has ``num`` items.
    num : int
        The number of values wanted.

    Returns
    -------
    list
        ``var`` itself when it is a list of length ``num``; otherwise
        the single value repeated ``num`` times.

    Raises
    ------
    AssertionError
        If ``var`` is a list whose length is neither ``num`` nor 1.

    """
    if type(var) is list:
        if len(var) == num:
            # Already fully specified: hand the caller's list back untouched.
            return var
        assert len(var) == 1
        return [var[0]] * num
    return [var] * num
# Append a convolution layer to ``net`` on top of ``net[from_layer]``, optionally
# followed by in-place BatchNorm, Scale/Bias and ReLU layers.
#
# The convolution is stored as ``conv_prefix + out_layer + conv_postfix``; the
# BatchNorm, Scale and Bias layers use their own prefix/postfix pair around
# ``out_layer``, and the ReLU is stored as ``<conv_name>_relu``.
# ``kernel_size``, ``pad`` and ``stride`` each go through ``UnpackVariable(..., 2)``
# to obtain a (height, width) pair.
#
# NOTE(review): this text looks truncated. The signature is never closed, the
# dict literals have no closing brace, branch bodies are not indented, several
# ``else:`` lines seem to be missing, and ``bn_params`` / ``bias_kwargs`` are read
# without a visible definition. Confirm against the upstream file.
def ConvBNLayer(net, from_layer, out_layer, use_bn, use_relu, num_output,
kernel_size, pad, stride, dilation=1, use_scale=True, lr_mult=1,
conv_prefix='', conv_postfix='', bn_prefix='', bn_postfix='_bn',
scale_prefix='', scale_postfix='_scale', bias_prefix='', bias_postfix='_bias',
if use_bn:
# parameters for convolution layer with batchnorm.
kwargs = {
'param': [dict(lr_mult=lr_mult, decay_mult=1)],
'weight_filler': dict(type='gaussian', std=0.01),
'bias_term': False,
# batchnorm hyper-parameters, each falling back to a default when not supplied.
eps = bn_params.get('eps', 1e-3)
moving_average_fraction = bn_params.get('moving_average_fraction', 0.9)
use_global_stats = bn_params.get('use_global_stats', False)
# parameters for batchnorm layer.
bn_kwargs = {
'param': [
dict(lr_mult=0, decay_mult=0),
dict(lr_mult=0, decay_mult=0),
dict(lr_mult=0, decay_mult=0)],
bn_lr_mult = lr_mult
if use_global_stats:
# only specify if use_global_stats is explicitly provided;
# otherwise, use_global_stats_ = this->phase_ == TEST;
bn_kwargs = {
'param': [
dict(lr_mult=0, decay_mult=0),
dict(lr_mult=0, decay_mult=0),
dict(lr_mult=0, decay_mult=0)],
'eps': eps,
'use_global_stats': use_global_stats,
# not updating scale/bias parameters
bn_lr_mult = 0
# parameters for scale bias layer after batchnorm.
if use_scale:
sb_kwargs = {
'bias_term': True}
# NOTE(review): presumably the settings for the no-batchnorm case; the
# ``else:`` that would pair with ``if use_bn:`` is not visible here.
kwargs = {
'param': [
dict(lr_mult=lr_mult, decay_mult=1),
dict(lr_mult=2 * lr_mult, decay_mult=0)],
'weight_filler': dict(type='xavier'),
'bias_filler': dict(type='constant', value=0)
conv_name = '{}{}{}'.format(conv_prefix, out_layer, conv_postfix)
[kernel_h, kernel_w] = UnpackVariable(kernel_size, 2)
[pad_h, pad_w] = UnpackVariable(pad, 2)
[stride_h, stride_w] = UnpackVariable(stride, 2)
# square kernels are written with the compact kernel_size/pad/stride fields.
if kernel_h == kernel_w:
net[conv_name] = L.Convolution(net[from_layer], num_output=num_output,
kernel_size=kernel_h, pad=pad_h, stride=stride_h, **kwargs)
# NOTE(review): presumably the ``else:`` branch, spelling out the per-axis
# fields; as written this assignment would always overwrite the one above.
net[conv_name] = L.Convolution(net[from_layer], num_output=num_output,
kernel_h=kernel_h, kernel_w=kernel_w, pad_h=pad_h, pad_w=pad_w,
stride_h=stride_h, stride_w=stride_w, **kwargs)
# dilation is written into the layer only when it is greater than 1.
if dilation > 1:
net.update(conv_name, {'dilation': dilation})
if use_bn:
bn_name = '{}{}{}'.format(bn_prefix, out_layer, bn_postfix)
net[bn_name] = L.BatchNorm(net[conv_name], in_place=True, **bn_kwargs)
if use_scale:
sb_name = '{}{}{}'.format(scale_prefix, out_layer, scale_postfix)
net[sb_name] = L.Scale(net[bn_name], in_place=True, **sb_kwargs)
# NOTE(review): presumably the ``else:`` of ``if use_scale:`` (bias only).
bias_name = '{}{}{}'.format(bias_prefix, out_layer, bias_postfix)
net[bias_name] = L.Bias(net[bn_name], in_place=True, **bias_kwargs)
# the ReLU takes net[conv_name] as its bottom and runs in-place.
if use_relu:
relu_name = '{}_relu'.format(conv_name)
net[relu_name] = L.ReLU(net[conv_name], in_place=True)
def ResBody(net, from_layer, block_name, out2a, out2b, out2c, stride, use_branch1, dilation=1, **bn_param):
    """Append one bottleneck residual block named ``res<block_name>`` to ``net``.

    The block is ``branch2a`` (1x1) -> ``branch2b`` (3x3) -> ``branch2c`` (1x1),
    summed element-wise with a shortcut and followed by an in-place ReLU
    stored as ``res<block_name>_relu``.

    Example: ``ResBody(net, 'pool1', '2a', 64, 64, 256, 1, True)``.

    Parameters
    ----------
    net : NetSpec
        The net to extend in place.
    from_layer : str
        Name of the input layer.
    block_name : str
        Block tag used to build the layer names, e.g. ``'2a'``.
    out2a, out2b, out2c : int
        Output channels of the three branch2 convolutions.
    stride : int
        Stride of ``branch1`` and ``branch2a``.
    use_branch1 : bool
        Whether the shortcut is a 1x1 projection (``branch1``) rather than
        ``from_layer`` itself.
    dilation : int, optional
        Dilation of the 3x3 ``branch2b`` convolution.
    **bn_param
        Forwarded to every ``ConvBNLayer`` call.

    """
    conv_prefix = 'res{}_'.format(block_name)
    conv_postfix = ''
    bn_prefix = 'bn{}_'.format(block_name)
    bn_postfix = ''
    scale_prefix = 'scale{}_'.format(block_name)
    scale_postfix = ''
    use_scale = True

    if use_branch1:
        branch_name = 'branch1'
        ConvBNLayer(net, from_layer, branch_name, use_bn=True, use_relu=False,
            num_output=out2c, kernel_size=1, pad=0, stride=stride, use_scale=use_scale,
            conv_prefix=conv_prefix, conv_postfix=conv_postfix,
            bn_prefix=bn_prefix, bn_postfix=bn_postfix,
            scale_prefix=scale_prefix, scale_postfix=scale_postfix, **bn_param)
        branch1 = '{}{}'.format(conv_prefix, branch_name)
    else:
        # Identity shortcut.
        branch1 = from_layer

    branch_name = 'branch2a'
    ConvBNLayer(net, from_layer, branch_name, use_bn=True, use_relu=True,
        num_output=out2a, kernel_size=1, pad=0, stride=stride, use_scale=use_scale,
        conv_prefix=conv_prefix, conv_postfix=conv_postfix,
        bn_prefix=bn_prefix, bn_postfix=bn_postfix,
        scale_prefix=scale_prefix, scale_postfix=scale_postfix, **bn_param)
    out_name = '{}{}'.format(conv_prefix, branch_name)

    branch_name = 'branch2b'
    if dilation == 1:
        ConvBNLayer(net, out_name, branch_name, use_bn=True, use_relu=True,
            num_output=out2b, kernel_size=3, pad=1, stride=1, use_scale=use_scale,
            conv_prefix=conv_prefix, conv_postfix=conv_postfix,
            bn_prefix=bn_prefix, bn_postfix=bn_postfix,
            scale_prefix=scale_prefix, scale_postfix=scale_postfix, **bn_param)
    else:
        # Floor division keeps the pad an integer on Python 3 as well;
        # a plain ``/`` would hand a float to the layer parameter.
        pad = int((3 + (dilation - 1) * 2) - 1) // 2
        ConvBNLayer(net, out_name, branch_name, use_bn=True, use_relu=True,
            num_output=out2b, kernel_size=3, pad=pad, stride=1, use_scale=use_scale,
            dilation=dilation, conv_prefix=conv_prefix, conv_postfix=conv_postfix,
            bn_prefix=bn_prefix, bn_postfix=bn_postfix,
            scale_prefix=scale_prefix, scale_postfix=scale_postfix, **bn_param)
    out_name = '{}{}'.format(conv_prefix, branch_name)

    branch_name = 'branch2c'
    ConvBNLayer(net, out_name, branch_name, use_bn=True, use_relu=False,
        num_output=out2c, kernel_size=1, pad=0, stride=1, use_scale=use_scale,
        conv_prefix=conv_prefix, conv_postfix=conv_postfix,
        bn_prefix=bn_prefix, bn_postfix=bn_postfix,
        scale_prefix=scale_prefix, scale_postfix=scale_postfix, **bn_param)
    branch2 = '{}{}'.format(conv_prefix, branch_name)

    res_name = 'res{}'.format(block_name)
    net[res_name] = L.Eltwise(net[branch1], net[branch2])
    relu_name = '{}_relu'.format(res_name)
    net[relu_name] = L.ReLU(net[res_name], in_place=True)
def InceptionTower(net, from_layer, tower_name, layer_params, **bn_param):
    """Chain a sequence of layers into ``net`` and return the last one.

    Each entry of ``layer_params`` becomes the layer ``<tower_name>/<name>``,
    fed by the previous entry (the first one by ``from_layer``). An entry
    whose full layer name contains ``'pool'`` becomes a Pooling layer; any
    other entry becomes a ``ConvBNLayer`` with batchnorm and ReLU.

    Parameters
    ----------
    net : NetSpec
        The net to extend in place.
    from_layer : str
        Name of the input layer.
    tower_name : str
        Prefix for the created layer names.
    layer_params : list of dict
        Per-layer keyword arguments; each must carry a ``'name'`` key.
        The dicts are left unmodified.
    **bn_param
        Extra keyword arguments merged into every ``ConvBNLayer`` call.

    Returns
    -------
    The top of the last layer created (``net[<last layer name>]``).

    """
    use_scale = False
    for param in layer_params:
        # Work on a copy so the caller's dicts keep their 'name' entry
        # and can be reused.
        layer_args = dict(param)
        tower_layer = '{}/{}'.format(tower_name, layer_args.pop('name'))
        if 'pool' in tower_layer:
            net[tower_layer] = L.Pooling(net[from_layer], **layer_args)
        else:
            layer_args.update(bn_param)
            ConvBNLayer(net, from_layer, tower_layer, use_bn=True, use_relu=True,
                use_scale=use_scale, **layer_args)
        from_layer = tower_layer
    return net[from_layer]
def CreateAnnotatedDataLayer(source, batch_size=32, backend=P.Data.LMDB,
        output_label=True, train=True, label_map_file='', anno_type=None,
        transform_param={}, batch_sampler=[{}]):
    """Create an ``AnnotatedData`` layer named ``data``.

    Parameters
    ----------
    source : str
        The database path.
    batch_size : int, optional
        The batch size.
    backend : optional
        The database backend, ``P.Data.LMDB`` by default.
    output_label : bool, optional
        Whether the layer has a second (label) top.
    train : bool, optional
        Whether the layer is included in the *TRAIN* phase (else *TEST*).
    label_map_file : str, optional
        The label map file path.
    anno_type : optional
        The annotation type; left out of the layer parameter when *None*.
    transform_param : dict, optional
        The transformation parameter, passed through as is.
    batch_sampler : list of dict, optional
        The batch samplers, passed through as is.

    Returns
    -------
    The result of ``L.AnnotatedData(...)``.

    """
    phase = 'TRAIN' if train else 'TEST'
    kwargs = {
        'include': dict(phase=caffe_pb2.Phase.Value(phase)),
        'transform_param': transform_param,
    }
    ntop = 2 if output_label else 1
    annotated_data_param = {
        'label_map_file': label_map_file,
        'batch_sampler': batch_sampler,
    }
    if anno_type is not None:
        annotated_data_param.update({'anno_type': anno_type})
    return L.AnnotatedData(name="data", annotated_data_param=annotated_data_param,
        data_param=dict(batch_size=batch_size, backend=backend, source=source),
        ntop=ntop, **kwargs)
def VGGNetBody(net, from_layer, need_fc=True, fully_conv=False, reduced=False,
        dilated=False, nopool=False, dropout=True, freeze_layers=[], dilate_pool4=False):
    """Append the VGG-16 body to ``net`` on top of ``net[from_layer]``.

    Parameters
    ----------
    net : NetSpec
        The net to extend in place; must already contain ``from_layer``.
    from_layer : str
        Name of the input layer.
    need_fc : bool, optional
        Whether to add the stage-5 down-sampling layer plus ``fc6`` / ``fc7``.
    fully_conv : bool, optional
        Whether ``fc6`` / ``fc7`` are convolutions instead of InnerProducts.
    reduced : bool, optional
        With ``fully_conv``: use 1024 outputs and a 3x3 ``fc6`` kernel
        instead of 4096 outputs and a 7x7 kernel.
    dilated : bool, optional
        Keep stride 1 in the stage-5 down-sampling layer and enlarge the
        ``fc6`` dilation accordingly.
    nopool : bool, optional
        Replace every pooling layer by a stride-2 (stride-1 for a dilated
        stage 5) 3x3 convolution named ``conv<stage>_<n+1>``.
    dropout : bool, optional
        Whether to add in-place dropout (ratio 0.5) after ``relu6`` / ``relu7``.
    freeze_layers : list of str, optional
        Layers whose ``lr_mult`` / ``decay_mult`` are set to 0.
    dilate_pool4 : bool, optional
        Use a stride-1 ``pool4`` and dilation 2 in the ``conv5_x`` layers.

    Returns
    -------
    NetSpec
        The same ``net``.

    """
    kwargs = {
        'param': [dict(lr_mult=1, decay_mult=1), dict(lr_mult=2, decay_mult=0)],
        'weight_filler': dict(type='xavier'),
        'bias_filler': dict(type='constant', value=0)}

    assert from_layer in net.keys()

    def _conv_relu(bottom, stage, index, num_output, pad=1, kernel_size=3, **extra):
        """Add ``conv<stage>_<index>`` and its in-place ReLU; return the ReLU top."""
        conv_name = 'conv{}_{}'.format(stage, index)
        relu_name = 'relu{}_{}'.format(stage, index)
        extra.update(kwargs)
        net[conv_name] = L.Convolution(bottom, num_output=num_output, pad=pad,
            kernel_size=kernel_size, **extra)
        net[relu_name] = L.ReLU(net[conv_name], in_place=True)
        return net[relu_name]

    # Defined up front: only the pool4 branch may change it, and the conv5_x
    # layers need a value in the ``nopool`` case too.
    dilation = 1
    bottom = net[from_layer]
    for stage, num_output, num_convs in ((1, 64, 2), (2, 128, 2), (3, 256, 3), (4, 512, 3)):
        for index in range(1, num_convs + 1):
            bottom = _conv_relu(bottom, stage, index, num_output)
        if nopool:
            name = 'conv{}_{}'.format(stage, num_convs + 1)
            net[name] = L.Convolution(bottom, num_output=num_output, pad=1,
                kernel_size=3, stride=2, **kwargs)
        else:
            name = 'pool{}'.format(stage)
            if stage == 4 and dilate_pool4:
                net[name] = L.Pooling(bottom, pool=P.Pooling.MAX, kernel_size=3, stride=1, pad=1)
                dilation = 2
            else:
                net[name] = L.Pooling(bottom, pool=P.Pooling.MAX, kernel_size=2, stride=2)
        bottom = net[name]

    kernel_size = 3
    pad = int(int((kernel_size + (dilation - 1) * (kernel_size - 1)) - 1) / 2)
    for index in range(1, 4):
        bottom = _conv_relu(bottom, 5, index, 512, pad=pad,
            kernel_size=kernel_size, dilation=dilation)

    if need_fc:
        if nopool:
            name = 'conv5_4'
            net[name] = L.Convolution(bottom, num_output=512, pad=1, kernel_size=3,
                stride=1 if dilated else 2, **kwargs)
        else:
            name = 'pool5'
            if dilated:
                net[name] = L.Pooling(bottom, pool=P.Pooling.MAX, pad=1, kernel_size=3, stride=1)
            else:
                net[name] = L.Pooling(bottom, pool=P.Pooling.MAX, kernel_size=2, stride=2)

        if fully_conv:
            if reduced:
                dilation = dilation * (6 if dilated else 3)
                kernel_size = 3
                num_output = 1024
            else:
                if dilated:
                    dilation = dilation * 2
                kernel_size = 7
                num_output = 4096
            pad = int(int((kernel_size + (dilation - 1) * (kernel_size - 1)) - 1) / 2)
            net.fc6 = L.Convolution(net[name], num_output=num_output, pad=pad,
                kernel_size=kernel_size, dilation=dilation, **kwargs)
            net.relu6 = L.ReLU(net.fc6, in_place=True)
            if dropout:
                net.drop6 = L.Dropout(net.relu6, dropout_ratio=0.5, in_place=True)
            # fc7 has the same width as fc6: 1024 if reduced, 4096 otherwise.
            net.fc7 = L.Convolution(net.relu6, num_output=num_output, kernel_size=1, **kwargs)
            net.relu7 = L.ReLU(net.fc7, in_place=True)
            if dropout:
                net.drop7 = L.Dropout(net.relu7, dropout_ratio=0.5, in_place=True)
        else:
            # Read the stage-5 output by name: with ``nopool`` it is
            # ``conv5_4`` and no ``pool5`` layer exists.
            net.fc6 = L.InnerProduct(net[name], num_output=4096)
            net.relu6 = L.ReLU(net.fc6, in_place=True)
            if dropout:
                net.drop6 = L.Dropout(net.relu6, dropout_ratio=0.5, in_place=True)
            net.fc7 = L.InnerProduct(net.relu6, num_output=4096)
            net.relu7 = L.ReLU(net.fc7, in_place=True)
            if dropout:
                net.drop7 = L.Dropout(net.relu7, dropout_ratio=0.5, in_place=True)

    # Update freeze layers.
    kwargs['param'] = [dict(lr_mult=0, decay_mult=0), dict(lr_mult=0, decay_mult=0)]
    layers = net.keys()
    for freeze_layer in freeze_layers:
        if freeze_layer in layers:
            net.update(freeze_layer, kwargs)

    return net
def ResNet101Body(net, from_layer, use_pool5=True, use_dilation_conv5=False, **bn_param):
    """Append the ResNet-101 body to ``net`` on top of ``from_layer``.

    Stage layout: ``conv1`` + ``pool1``, 3 blocks in stage 2, ``3a`` plus
    ``3b1``..``3b3``, ``4a`` plus ``4b1``..``4b22``, and 3 blocks in stage 5.

    Parameters
    ----------
    net : NetSpec
        The net to extend in place.
    from_layer : str
        Name of the input layer.
    use_pool5 : bool, optional
        Whether to add the global average pooling ``pool5``.
    use_dilation_conv5 : bool, optional
        Use stride 1 and dilation 2 in stage 5 instead of stride 2.
    **bn_param
        Forwarded to ``ConvBNLayer`` / ``ResBody``.

    Returns
    -------
    NetSpec
        The same ``net``.

    """
    ConvBNLayer(net, from_layer, 'conv1', use_bn=True, use_relu=True,
        num_output=64, kernel_size=7, pad=3, stride=2,
        conv_prefix='', conv_postfix='',
        bn_prefix='bn_', bn_postfix='',
        scale_prefix='scale_', scale_postfix='', **bn_param)

    net.pool1 = L.Pooling(net.conv1, pool=P.Pooling.MAX, kernel_size=3, stride=2)

    # Stage 2.
    ResBody(net, 'pool1', '2a', out2a=64, out2b=64, out2c=256, stride=1, use_branch1=True, **bn_param)
    ResBody(net, 'res2a', '2b', out2a=64, out2b=64, out2c=256, stride=1, use_branch1=False, **bn_param)
    ResBody(net, 'res2b', '2c', out2a=64, out2b=64, out2c=256, stride=1, use_branch1=False, **bn_param)

    # Stage 3.
    ResBody(net, 'res2c', '3a', out2a=128, out2b=128, out2c=512, stride=2, use_branch1=True, **bn_param)
    from_layer = 'res3a'
    for index in range(1, 4):
        block = '3b{}'.format(index)
        ResBody(net, from_layer, block, out2a=128, out2b=128, out2c=512, stride=1, use_branch1=False, **bn_param)
        from_layer = 'res{}'.format(block)

    # Stage 4.
    ResBody(net, from_layer, '4a', out2a=256, out2b=256, out2c=1024, stride=2, use_branch1=True, **bn_param)
    from_layer = 'res4a'
    for index in range(1, 23):
        block = '4b{}'.format(index)
        ResBody(net, from_layer, block, out2a=256, out2b=256, out2c=1024, stride=1, use_branch1=False, **bn_param)
        from_layer = 'res{}'.format(block)

    # Stage 5: either down-sample, or keep the resolution and dilate.
    stride, dilation = (1, 2) if use_dilation_conv5 else (2, 1)
    ResBody(net, from_layer, '5a', out2a=512, out2b=512, out2c=2048, stride=stride, use_branch1=True, dilation=dilation, **bn_param)
    ResBody(net, 'res5a', '5b', out2a=512, out2b=512, out2c=2048, stride=1, use_branch1=False, dilation=dilation, **bn_param)
    ResBody(net, 'res5b', '5c', out2a=512, out2b=512, out2c=2048, stride=1, use_branch1=False, dilation=dilation, **bn_param)

    if use_pool5:
        net.pool5 = L.Pooling(net.res5c, pool=P.Pooling.AVE, global_pooling=True)

    return net
def ResNet152Body(net, from_layer, use_pool5=True, use_dilation_conv5=False, **bn_param):
    """Append the ResNet-152 body to ``net`` on top of ``from_layer``.

    Stage layout: ``conv1`` + ``pool1``, 3 blocks in stage 2, ``3a`` plus
    ``3b1``..``3b7``, ``4a`` plus ``4b1``..``4b35``, and 3 blocks in stage 5.

    Parameters
    ----------
    net : NetSpec
        The net to extend in place.
    from_layer : str
        Name of the input layer.
    use_pool5 : bool, optional
        Whether to add the global average pooling ``pool5``.
    use_dilation_conv5 : bool, optional
        Use stride 1 and dilation 2 in stage 5 instead of stride 2.
    **bn_param
        Forwarded to ``ConvBNLayer`` / ``ResBody``.

    Returns
    -------
    NetSpec
        The same ``net``.

    """
    ConvBNLayer(net, from_layer, 'conv1', use_bn=True, use_relu=True,
        num_output=64, kernel_size=7, pad=3, stride=2,
        conv_prefix='', conv_postfix='',
        bn_prefix='bn_', bn_postfix='',
        scale_prefix='scale_', scale_postfix='', **bn_param)

    net.pool1 = L.Pooling(net.conv1, pool=P.Pooling.MAX, kernel_size=3, stride=2)

    # Stage 2.
    ResBody(net, 'pool1', '2a', out2a=64, out2b=64, out2c=256, stride=1, use_branch1=True, **bn_param)
    ResBody(net, 'res2a', '2b', out2a=64, out2b=64, out2c=256, stride=1, use_branch1=False, **bn_param)
    ResBody(net, 'res2b', '2c', out2a=64, out2b=64, out2c=256, stride=1, use_branch1=False, **bn_param)

    # Stage 3.
    ResBody(net, 'res2c', '3a', out2a=128, out2b=128, out2c=512, stride=2, use_branch1=True, **bn_param)
    from_layer = 'res3a'
    for index in range(1, 8):
        block = '3b{}'.format(index)
        ResBody(net, from_layer, block, out2a=128, out2b=128, out2c=512, stride=1, use_branch1=False, **bn_param)
        from_layer = 'res{}'.format(block)

    # Stage 4.
    ResBody(net, from_layer, '4a', out2a=256, out2b=256, out2c=1024, stride=2, use_branch1=True, **bn_param)
    from_layer = 'res4a'
    for index in range(1, 36):
        block = '4b{}'.format(index)
        ResBody(net, from_layer, block, out2a=256, out2b=256, out2c=1024, stride=1, use_branch1=False, **bn_param)
        from_layer = 'res{}'.format(block)

    # Stage 5: either down-sample, or keep the resolution and dilate.
    stride, dilation = (1, 2) if use_dilation_conv5 else (2, 1)
    ResBody(net, from_layer, '5a', out2a=512, out2b=512, out2c=2048, stride=stride, use_branch1=True, dilation=dilation, **bn_param)
    ResBody(net, 'res5a', '5b', out2a=512, out2b=512, out2c=2048, stride=1, use_branch1=False, dilation=dilation, **bn_param)
    ResBody(net, 'res5b', '5c', out2a=512, out2b=512, out2c=2048, stride=1, use_branch1=False, dilation=dilation, **bn_param)

    if use_pool5:
        net.pool5 = L.Pooling(net.res5c, pool=P.Pooling.AVE, global_pooling=True)

    return net
# Build the Inception-V3 body on ``net`` starting from ``from_layer`` and
# return ``net``.
#
# Layout visible below: a stem of five conv layers (``conv`` .. ``conv_4``) and
# two max-pools (``pool``, ``pool_1``), then the blocks ``mixed`` .. ``mixed_10``.
# Each block builds several ``InceptionTower`` chains from the same input and
# ends in a ``<block>/join`` Concat along axis 1, which feeds the next block.
# With ``output_pred`` set, an 8x8 average pool (``pool_3``), a 1008-way
# InnerProduct (``softmax``) and a Softmax (``softmax_prob``) are appended.
#
# NOTE(review): this text looks truncated. The stem ``ConvBNLayer`` calls are
# never closed, ``towers`` / ``subtowers`` are concatenated although nothing
# visibly appends to them, branch bodies are not indented and several ``else:``
# lines seem to be missing. Confirm against the upstream file.
def InceptionV3Body(net, from_layer, output_pred=False, **bn_param):
# scale is fixed to 1, thus we ignore it.
use_scale = False
out_layer = 'conv'
ConvBNLayer(net, from_layer, out_layer, use_bn=True, use_relu=True,
num_output=32, kernel_size=3, pad=0, stride=2, use_scale=use_scale,
from_layer = out_layer
out_layer = 'conv_1'
ConvBNLayer(net, from_layer, out_layer, use_bn=True, use_relu=True,
num_output=32, kernel_size=3, pad=0, stride=1, use_scale=use_scale,
from_layer = out_layer
out_layer = 'conv_2'
ConvBNLayer(net, from_layer, out_layer, use_bn=True, use_relu=True,
num_output=64, kernel_size=3, pad=1, stride=1, use_scale=use_scale,
from_layer = out_layer
out_layer = 'pool'
net[out_layer] = L.Pooling(net[from_layer], pool=P.Pooling.MAX,
kernel_size=3, stride=2, pad=0)
from_layer = out_layer
out_layer = 'conv_3'
ConvBNLayer(net, from_layer, out_layer, use_bn=True, use_relu=True,
num_output=80, kernel_size=1, pad=0, stride=1, use_scale=use_scale,
from_layer = out_layer
out_layer = 'conv_4'
ConvBNLayer(net, from_layer, out_layer, use_bn=True, use_relu=True,
num_output=192, kernel_size=3, pad=0, stride=1, use_scale=use_scale,
from_layer = out_layer
out_layer = 'pool_1'
net[out_layer] = L.Pooling(net[from_layer], pool=P.Pooling.MAX,
kernel_size=3, stride=2, pad=0)
from_layer = out_layer
# inceptions with 1x1, 3x3, 5x5 convolutions
for inception_id in range(0, 3):
# NOTE(review): presumably the first block is named 'mixed' (32 pooled-tower
# outputs) and the others 'mixed_<id>' (64) -- the ``else:`` is not visible.
if inception_id == 0:
out_layer = 'mixed'
tower_2_conv_num_output = 32
out_layer = 'mixed_{}'.format(inception_id)
tower_2_conv_num_output = 64
towers = []
tower_name = '{}'.format(out_layer)
tower = InceptionTower(net, from_layer, tower_name, [
dict(name='conv', num_output=64, kernel_size=1, pad=0, stride=1),
], **bn_param)
tower_name = '{}/tower'.format(out_layer)
tower = InceptionTower(net, from_layer, tower_name, [
dict(name='conv', num_output=48, kernel_size=1, pad=0, stride=1),
dict(name='conv_1', num_output=64, kernel_size=5, pad=2, stride=1),
], **bn_param)
tower_name = '{}/tower_1'.format(out_layer)
tower = InceptionTower(net, from_layer, tower_name, [
dict(name='conv', num_output=64, kernel_size=1, pad=0, stride=1),
dict(name='conv_1', num_output=96, kernel_size=3, pad=1, stride=1),
dict(name='conv_2', num_output=96, kernel_size=3, pad=1, stride=1),
], **bn_param)
tower_name = '{}/tower_2'.format(out_layer)
tower = InceptionTower(net, from_layer, tower_name, [
dict(name='pool', pool=P.Pooling.AVE, kernel_size=3, pad=1, stride=1),
dict(name='conv', num_output=tower_2_conv_num_output, kernel_size=1, pad=0, stride=1),
], **bn_param)
out_layer = '{}/join'.format(out_layer)
net[out_layer] = L.Concat(*towers, axis=1)
from_layer = out_layer
# inceptions with 1x1, 3x3(in sequence) convolutions
out_layer = 'mixed_3'
towers = []
tower_name = '{}'.format(out_layer)
tower = InceptionTower(net, from_layer, tower_name, [
dict(name='conv', num_output=384, kernel_size=3, pad=0, stride=2),
], **bn_param)
tower_name = '{}/tower'.format(out_layer)
tower = InceptionTower(net, from_layer, tower_name, [
dict(name='conv', num_output=64, kernel_size=1, pad=0, stride=1),
dict(name='conv_1', num_output=96, kernel_size=3, pad=1, stride=1),
dict(name='conv_2', num_output=96, kernel_size=3, pad=0, stride=2),
], **bn_param)
tower_name = '{}'.format(out_layer)
tower = InceptionTower(net, from_layer, tower_name, [
dict(name='pool', pool=P.Pooling.MAX, kernel_size=3, pad=0, stride=2),
], **bn_param)
out_layer = '{}/join'.format(out_layer)
net[out_layer] = L.Concat(*towers, axis=1)
from_layer = out_layer
# inceptions with 1x1, 7x1, 1x7 convolutions
for inception_id in range(4, 8):
# width of the factorized 7x7 towers: 128 for block 4, 160 for 5-6, 192 for 7.
if inception_id == 4:
num_output = 128
elif inception_id == 5 or inception_id == 6:
num_output = 160
elif inception_id == 7:
num_output = 192
out_layer = 'mixed_{}'.format(inception_id)
towers = []
tower_name = '{}'.format(out_layer)
tower = InceptionTower(net, from_layer, tower_name, [
dict(name='conv', num_output=192, kernel_size=1, pad=0, stride=1),
], **bn_param)
tower_name = '{}/tower'.format(out_layer)
tower = InceptionTower(net, from_layer, tower_name, [
dict(name='conv', num_output=num_output, kernel_size=1, pad=0, stride=1),
dict(name='conv_1', num_output=num_output, kernel_size=[1, 7], pad=[0, 3], stride=[1, 1]),
dict(name='conv_2', num_output=192, kernel_size=[7, 1], pad=[3, 0], stride=[1, 1]),
], **bn_param)
tower_name = '{}/tower_1'.format(out_layer)
tower = InceptionTower(net, from_layer, tower_name, [
dict(name='conv', num_output=num_output, kernel_size=1, pad=0, stride=1),
dict(name='conv_1', num_output=num_output, kernel_size=[7, 1], pad=[3, 0], stride=[1, 1]),
dict(name='conv_2', num_output=num_output, kernel_size=[1, 7], pad=[0, 3], stride=[1, 1]),
dict(name='conv_3', num_output=num_output, kernel_size=[7, 1], pad=[3, 0], stride=[1, 1]),
dict(name='conv_4', num_output=192, kernel_size=[1, 7], pad=[0, 3], stride=[1, 1]),
], **bn_param)
tower_name = '{}/tower_2'.format(out_layer)
tower = InceptionTower(net, from_layer, tower_name, [
dict(name='pool', pool=P.Pooling.AVE, kernel_size=3, pad=1, stride=1),
dict(name='conv', num_output=192, kernel_size=1, pad=0, stride=1),
], **bn_param)
out_layer = '{}/join'.format(out_layer)
net[out_layer] = L.Concat(*towers, axis=1)
from_layer = out_layer
# inceptions with 1x1, 3x3, 1x7, 7x1 filters
out_layer = 'mixed_8'
towers = []
tower_name = '{}/tower'.format(out_layer)
tower = InceptionTower(net, from_layer, tower_name, [
dict(name='conv', num_output=192, kernel_size=1, pad=0, stride=1),
dict(name='conv_1', num_output=320, kernel_size=3, pad=0, stride=2),
], **bn_param)
tower_name = '{}/tower_1'.format(out_layer)
tower = InceptionTower(net, from_layer, tower_name, [
dict(name='conv', num_output=192, kernel_size=1, pad=0, stride=1),
dict(name='conv_1', num_output=192, kernel_size=[1, 7], pad=[0, 3], stride=[1, 1]),
dict(name='conv_2', num_output=192, kernel_size=[7, 1], pad=[3, 0], stride=[1, 1]),
dict(name='conv_3', num_output=192, kernel_size=3, pad=0, stride=2),
], **bn_param)
tower_name = '{}'.format(out_layer)
tower = InceptionTower(net, from_layer, tower_name, [
dict(name='pool', pool=P.Pooling.MAX, kernel_size=3, pad=0, stride=2),
], **bn_param)
out_layer = '{}/join'.format(out_layer)
net[out_layer] = L.Concat(*towers, axis=1)
from_layer = out_layer
# the last two blocks: each of the two wide towers ends in a '<tower>/mixed'
# Concat of a 1x3 and a 3x1 convolution that read the same input layer.
for inception_id in range(9, 11):
num_output = 384
num_output2 = 448
# NOTE(review): presumably average pooling for block 9 and max pooling for
# block 10 -- the ``else:`` is not visible.
if inception_id == 9:
pool = P.Pooling.AVE
pool = P.Pooling.MAX
out_layer = 'mixed_{}'.format(inception_id)
towers = []
tower_name = '{}'.format(out_layer)
tower = InceptionTower(net, from_layer, tower_name, [
dict(name='conv', num_output=320, kernel_size=1, pad=0, stride=1),
], **bn_param)
tower_name = '{}/tower'.format(out_layer)
tower = InceptionTower(net, from_layer, tower_name, [
dict(name='conv', num_output=num_output, kernel_size=1, pad=0, stride=1),
], **bn_param)
subtowers = []
subtower_name = '{}/mixed'.format(tower_name)
subtower = InceptionTower(net, '{}/conv'.format(tower_name), subtower_name, [
dict(name='conv', num_output=num_output, kernel_size=[1, 3], pad=[0, 1], stride=[1, 1]),
], **bn_param)
subtower = InceptionTower(net, '{}/conv'.format(tower_name), subtower_name, [
dict(name='conv_1', num_output=num_output, kernel_size=[3, 1], pad=[1, 0], stride=[1, 1]),
], **bn_param)
net[subtower_name] = L.Concat(*subtowers, axis=1)
tower_name = '{}/tower_1'.format(out_layer)
tower = InceptionTower(net, from_layer, tower_name, [
dict(name='conv', num_output=num_output2, kernel_size=1, pad=0, stride=1),
dict(name='conv_1', num_output=num_output, kernel_size=3, pad=1, stride=1),
], **bn_param)
subtowers = []
subtower_name = '{}/mixed'.format(tower_name)
subtower = InceptionTower(net, '{}/conv_1'.format(tower_name), subtower_name, [
dict(name='conv', num_output=num_output, kernel_size=[1, 3], pad=[0, 1], stride=[1, 1]),
], **bn_param)
subtower = InceptionTower(net, '{}/conv_1'.format(tower_name), subtower_name, [
dict(name='conv_1', num_output=num_output, kernel_size=[3, 1], pad=[1, 0], stride=[1, 1]),
], **bn_param)
net[subtower_name] = L.Concat(*subtowers, axis=1)
tower_name = '{}/tower_2'.format(out_layer)
tower = InceptionTower(net, from_layer, tower_name, [
dict(name='pool', pool=pool, kernel_size=3, pad=1, stride=1),
dict(name='conv', num_output=192, kernel_size=1, pad=0, stride=1),
], **bn_param)
out_layer = '{}/join'.format(out_layer)
net[out_layer] = L.Concat(*towers, axis=1)
from_layer = out_layer
# optional classification head on top of the last join.
if output_pred:
net.pool_3 = L.Pooling(net[from_layer], pool=P.Pooling.AVE, kernel_size=8, pad=0, stride=1)
net.softmax = L.InnerProduct(net.pool_3, num_output=1008)
net.softmax_prob = L.Softmax(net.softmax)
return net
# Build the multibox prediction head on `net`.
#
# For every entry of `from_layers` this adds, in order: an optional Normalize
# layer, an optional 3x3 intermediate ConvBN layer, a location predictor, a
# confidence predictor, an optional IoU predictor, a Python prior-box layer and
# an optional objectness predictor. Each predictor output is permuted with
# order [0, 2, 3, 1] and flattened from axis 1. Afterwards the per-layer
# results are concatenated into "mbox_loc", "mbox_conf", "mbox_iou" (if
# use_iou), "mbox_priorbox" and "mbox_objectness" (if use_objectness).
#
# Arguments (as used by the visible code):
#   net               -- net spec that is read and extended in place.
#   data_layer        -- only checked for presence in net.keys().
#   num_classes       -- multiplies the confidence output width (and the
#                        location width when share_location is False).
#   from_layers       -- names of the source layers; all per-layer lists below
#                        must match its length when they are provided.
#   normalizations    -- per-layer scale for L.Normalize; -1 skips the layer.
#   inter_layer_depth -- per-layer channel count of the intermediate conv;
#                        values <= 0 skip it.
#   min_sizes, max_sizes, aspect_ratios, steps
#                     -- per-layer prior-box settings; scalars are wrapped
#                        into one-element lists.
#   img_height, img_width, flip, clip, offset
#                     -- forwarded into the prior-box param dict.
#   kernel_size, pad  -- geometry of every predictor conv.
#   conf_postfix, loc_postfix
#                     -- suffixes of the predictor layer names.
#   **bn_param        -- forwarded to ConvBNLayer.
#
# Returns the list `mbox_layers`.
#
# NOTE(review): `use_scale` and `prior_variance` are not referenced anywhere in
# the body shown here -- confirm whether they are intentionally unused.
# NOTE(review): as shown, nothing is ever appended to priorbox_layers,
# loc_layers, conf_layers, iou_layers, objectness_layers or mbox_layers, so the
# Concat calls receive no inputs and the function returns an empty list. The
# append statements appear to be missing from this view -- verify against the
# original file.
def CreateMultiBoxHead(net, data_layer="data", num_classes=[], from_layers=[],
use_objectness=False, use_iou=False, normalizations=[], use_batchnorm=True, lr_mult=1,
use_scale=True, min_sizes=[], max_sizes=[], prior_variance = [0.1],
aspect_ratios=[], steps=[], img_height=0, img_width=0, share_location=True,
flip=True, clip=True, offset=0.5, inter_layer_depth=[], kernel_size=1, pad=0,
conf_postfix='', loc_postfix='', **bn_param):
# Validate the arguments: every optional per-layer list must be as long as
# from_layers.
# NOTE(review): the default for num_classes is a list, yet it is compared with
# 0 and used as a multiplier below -- presumably callers always pass an int.
assert num_classes, "must provide num_classes"
assert num_classes > 0, "num_classes must be positive number"
if normalizations:
assert len(from_layers) == len(normalizations), "from_layers and normalizations should have same length"
assert len(from_layers) == len(min_sizes), "from_layers and min_sizes should have same length"
if max_sizes:
assert len(from_layers) == len(max_sizes), "from_layers and max_sizes should have same length"
if aspect_ratios:
assert len(from_layers) == len(aspect_ratios), "from_layers and aspect_ratios should have same length"
if steps:
assert len(from_layers) == len(steps), "from_layers and steps should have same length"
net_layers = net.keys()
assert data_layer in net_layers, "data_layer is not in net's layers"
if inter_layer_depth:
assert len(from_layers) == len(inter_layer_depth), "from_layers and inter_layer_depth should have same length"
num = len(from_layers)
# Per-source-layer outputs, meant to be concatenated after the loop.
priorbox_layers = []
loc_layers = []
conf_layers = []
iou_layers = []
objectness_layers = []
for i in range(0, num):
from_layer = from_layers[i]
# Get the normalize value.
# A value of -1 means "do not normalize this layer"; otherwise the predictors
# below read from the new "<from_layer>_norm" layer.
if normalizations:
if normalizations[i] != -1:
norm_name = "{}_norm".format(from_layer)
net[norm_name] = L.Normalize(net[from_layer], scale_filler=dict(type="constant", value=normalizations[i]),
across_spatial=False, channel_shared=False)
from_layer = norm_name
# Add intermediate layers.
# The predictors below read from "<from_layer>_inter" when this is added.
if inter_layer_depth:
if inter_layer_depth[i] > 0:
inter_name = "{}_inter".format(from_layer)
ConvBNLayer(net, from_layer, inter_name, use_bn=use_batchnorm, use_relu=True, lr_mult=lr_mult,
num_output=inter_layer_depth[i], kernel_size=3, pad=1, stride=1, **bn_param)
from_layer = inter_name
# Estimate number of priors per location given provided parameters.
# Scalars are normalized to one-element lists first.
min_size = min_sizes[i]
if type(min_size) is not list: min_size = [min_size]
aspect_ratio = []
if len(aspect_ratios) > i:
aspect_ratio = aspect_ratios[i]
if type(aspect_ratio) is not list: aspect_ratio = [aspect_ratio]
max_size = []
if len(max_sizes) > i:
max_size = max_sizes[i]
if type(max_size) is not list: max_size = [max_size]
if max_size:
assert len(max_size) == len(min_size), "max_size and min_size should have same length."
# NOTE(review): as shown, the second assignment below always overwrites the
# first, so the max_size case has no effect; an `else:` between the two lines
# appears to be missing -- verify against the original file.
if max_size:
num_priors_per_location = (2 + len(aspect_ratio)) * len(min_size)
num_priors_per_location = (1 + len(aspect_ratio)) * len(min_size)
# Flipping adds one more prior per aspect ratio and min_size.
if flip:
num_priors_per_location += len(aspect_ratio) * len(min_size)
step = []
if len(steps) > i: step = steps[i]
# Create location prediction layer.
# Four outputs per prior, multiplied by num_classes when locations are not
# shared.
name = "{}_mbox_loc{}".format(from_layer, loc_postfix)
num_loc_output = num_priors_per_location * 4;
if not share_location:
num_loc_output *= num_classes
ConvBNLayer(net, from_layer, name, use_bn=use_batchnorm, use_relu=False, lr_mult=lr_mult,
num_output=num_loc_output, kernel_size=kernel_size, pad=pad, stride=1, **bn_param)
permute_name = "{}_perm".format(name)
net[permute_name] = L.Permute(net[name], order=[0, 2, 3, 1])
flatten_name = "{}_flat".format(name)
net[flatten_name] = L.Flatten(net[permute_name], axis=1)
# Create confidence prediction layer.
# num_classes outputs per prior.
name = "{}_mbox_conf{}".format(from_layer, conf_postfix)
num_conf_output = num_priors_per_location * num_classes;
ConvBNLayer(net, from_layer, name, use_bn=use_batchnorm, use_relu=False, lr_mult=lr_mult,
num_output=num_conf_output, kernel_size=kernel_size, pad=pad, stride=1, **bn_param)
permute_name = "{}_perm".format(name)
net[permute_name] = L.Permute(net[name], order=[0, 2, 3, 1])
flatten_name = "{}_flat".format(name)
net[flatten_name] = L.Flatten(net[permute_name], axis=1)
# Create iou prediction layer.
# One output per prior; the layer name reuses conf_postfix.
if use_iou:
name = "{}_mbox_iou{}".format(from_layer, conf_postfix)
num_iou_output = num_priors_per_location
ConvBNLayer(net, from_layer, name, use_bn=use_batchnorm, use_relu=False, lr_mult=lr_mult,
num_output=num_iou_output, kernel_size=kernel_size, pad=pad, stride=1, **bn_param)
permute_name = "{}_perm".format(name)
net[permute_name] = L.Permute(net[name], order=[0, 2, 3, 1])
flatten_name = "{}_flat".format(name)
net[flatten_name] = L.Flatten(net[permute_name], axis=1)
# Create prior generation layer.
# The settings are collected in a dict and handed to the Python layer as its
# stringified param_str; optional keys are only added when they are set.
name = "{}_mbox_priorbox".format(from_layer)
priorbox_param = {'min_size': min_size,
'clip': clip,
'offset': offset}
if max_size:
priorbox_param.update({'max_size': max_size})
if aspect_ratio:
priorbox_param.update({'aspect_ratio': aspect_ratio, 'flip': flip})
if step:
priorbox_param.update({'step': step})
# NOTE(review): as shown, 'img_h'/'img_w' are added even for square images,
# next to 'img_size'; an `else:` appears to be missing here as well -- verify
# against the original file.
if img_height != 0 and img_width != 0:
if img_height == img_width:
priorbox_param.update({'img_size': img_height})
priorbox_param.update({'img_h': img_height, 'img_w': img_width})
# The prior-box layer takes the source feature layer and net['im_info'] as
# inputs.
net[name] = L.Python(net[from_layer], net['im_info'], module='layers.prior_box_layer',
layer='PriorBoxLayer', param_str=str(priorbox_param))
# Create objectness prediction layer.
# Two outputs per prior.
if use_objectness:
name = "{}_mbox_objectness".format(from_layer)
num_obj_output = num_priors_per_location * 2;
ConvBNLayer(net, from_layer, name, use_bn=use_batchnorm, use_relu=False, lr_mult=lr_mult,
num_output=num_obj_output, kernel_size=kernel_size, pad=pad, stride=1, **bn_param)
permute_name = "{}_perm".format(name)
net[permute_name] = L.Permute(net[name], order=[0, 2, 3, 1])
flatten_name = "{}_flat".format(name)
net[flatten_name] = L.Flatten(net[permute_name], axis=1)
# Concatenate priorbox, loc, and conf layers.
# Predictions are joined along axis 1 and prior boxes along axis 0; the
# location, confidence and IoU concatenations additionally get a reshaped view
# with dims [0, -1, 4], [0, -1, num_classes] and [0, -1] respectively.
mbox_layers = []
name = "mbox_loc"
net[name] = L.Concat(*loc_layers, axis=1)
net['mbox_loc_reshape'] = L.Reshape(net[name], shape={'dim': [0, -1, 4]})
name = "mbox_conf"
net[name] = L.Concat(*conf_layers, axis=1)
net['mbox_conf_reshape'] = L.Reshape(net[name], shape={'dim': [0, -1, num_classes]})
if use_iou:
name = "mbox_iou"
net[name] = L.Concat(*iou_layers, axis=1)
net['mbox_iou_reshape'] = L.Reshape(net[name], shape={'dim': [0, -1]})
name = "mbox_priorbox"
net[name] = L.Concat(*priorbox_layers, axis=0)
if use_objectness:
name = "mbox_objectness"
net[name] = L.Concat(*objectness_layers, axis=1)
return mbox_layers
......@@ -236,4 +236,4 @@ class Parameters(object):
_param_names = param_name_dict()
layers = Layers()
params = Parameters()
params = Parameters()
\ No newline at end of file
......@@ -354,15 +354,14 @@ class Function(object):
# Store for future development
self.meta_graph = meta_graph
self.graph_name =
# Call c api to create graph
self.graph_name = ws.CreateGraph(meta_graph)
# Bind a lambda callback to run this graph
callback_inputs = self.inputs if explicit_inputs else []
self.callback = lambda *args, **kwargs: \
ws.RunGraph(, (callback_inputs, args), self.outputs, **kwargs)
ws.RunGraph(self.graph_name, (callback_inputs, args), self.outputs, **kwargs)
# Self return
return self
......@@ -386,7 +385,7 @@ def function(inputs=None, outputs=None, givens=None, updater=None):
inputs : sequence of Tensor, optional
The inputs to feed.
inputs : sequence of Tensor, optional
outputs : sequence of Tensor, optional
The outputs to fetch.
givens : dict of Tensor, optional
The substitutions to use.
......@@ -60,6 +60,7 @@ class Gather(BaseModule):
'n_inputs': 2, 'n_outputs': 1,
'arguments': {
'axis': self.axis,
'zero_grad': True,
......@@ -188,16 +188,16 @@ inline void RetrieveRoIs(
template <typename T>
inline int roi_level(
const int min_level, // e.g. 2
const int max_level, // e.g. 5
const int canonical_level, // e.g. 4
const int canonical_scale, // e.g. 224
const int min_level,
const int max_level,
const int canonical_level,
const int canonical_scale,
T* roi) {
T w = roi[3] - roi[1] + 1;
T h = roi[4] - roi[2] + 1;
// Refer to the settings of the paper
int level = canonical_level + (int)std::log(
std::max(std::sqrt(w * h), (T)1) / (T)canonical_scale);
int level = canonical_level + std::log2(
std::max(std::sqrt(w * h), (T)1) / (T)canonical_scale);
return std::min(max_level, std::max(min_level, level));
......@@ -80,7 +80,7 @@ void ProposalOp<Context>::RunWithType(
anchors_.Reshape({ A, 4 });
(int)ratios.size(), 1, &ratios[0], &scales[0],
(int)ratios.size(), 1, &ratios[0], &scales[i],
anchors_.template mutable_data<BT, CPUContext>());
......@@ -6,134 +6,93 @@ namespace dragon {
namespace kernel {
/*! CanonicalAxis <T = int32, Device = CPU> */
template <> void CanonicalAxis<int, CPUContext>(
const int count,
const int dim,
int* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
for (int i = 0; i < count; ++i) if (y[i] < 0) y[i] += dim;
/*! Gather <T = ?, Device = CPU> */
template <typename T>
void _Gather(
const int count,
const int outer_dim,
const int inner_dim,
const int x_slice_dim,
const int y_slice_dim,
const int* indices,
const int64_t* indices,
const T* x,
T* y,
CPUContext* ctx) {
int64_t x_offset, y_offset, x_idx_offset, y_idx_offset;
for (int i = 0; i < y_slice_dim; ++i) {
y_idx_offset = i;
x_idx_offset = indices[y_idx_offset];
for (int n = 0; n < outer_dim; ++n) {
x_offset = (n * x_slice_dim + x_idx_offset) * inner_dim;
y_offset = (n * y_slice_dim + y_idx_offset) * inner_dim;
int64_t x_offset, select_idx;
for (int n = 0; n < outer_dim; ++n) {
for (int i = 0; i < y_slice_dim; ++i) {
select_idx = indices[i];
select_idx = select_idx >= 0 ?
select_idx : select_idx + x_slice_dim;
x_offset = (n * x_slice_dim + select_idx) * inner_dim;
ctx->Copy<T, CPUContext, CPUContext>(
inner_dim, y + y_offset, x + x_offset);
inner_dim, y, x + x_offset);
y += inner_dim;
/*! Gather <T = float32, Device = CPU> */
template <> void Gather<float, CPUContext>(
const int count,
const int outer_dim,
const int inner_dim,
const int x_slice_dim,
const int y_slice_dim,
const int* indices,
const float* x,
float* y,
CPUContext* ctx) {
_Gather<float>(count, outer_dim, inner_dim,
x_slice_dim, y_slice_dim, indices, x, y, ctx);
/*! Gather <T = int32, Device = CPU> */
template <> void Gather<int, CPUContext>(
const int count,
const int outer_dim,
const int inner_dim,
const int x_slice_dim,
const int y_slice_dim,
const int* indices,
const int* x,
int* y,
CPUContext* ctx) {
_Gather<int>(count, outer_dim, inner_dim,
x_slice_dim, y_slice_dim, indices, x, y, ctx);
/*! GatherGrad <T = ?, Device = CPU> */
template <typename T>
void _GatherGrad(
const int count,
const int outer_dim,
const int inner_dim,
const int x_slice_dim,
const int y_slice_dim,
const int* indices,
const int64_t* indices,
const T* dy,
T* dx,
CPUContext* ctx) {
int64_t x_offset, y_offset, x_idx_offset, y_idx_offset;
for (int i = 0; i < y_slice_dim; ++i) {
y_idx_offset = i;
x_idx_offset = indices[y_idx_offset];
for (int n = 0; n < outer_dim; ++n) {
x_offset = (n * x_slice_dim + x_idx_offset) * inner_dim;
y_offset = (n * y_slice_dim + y_idx_offset) * inner_dim;
int64_t x_offset, select_idx;
for (int n = 0; n < outer_dim; ++n) {
for (int i = 0; i < y_slice_dim; ++i) {
select_idx = indices[i];
select_idx = select_idx >= 0 ?
select_idx : select_idx + x_slice_dim;
x_offset = (n * x_slice_dim + select_idx) * inner_dim;
math::Add<T, CPUContext>(inner_dim,
dy + y_offset, dx + x_offset, dx + x_offset, ctx);
dy, dx + x_offset, dx + x_offset, ctx);
dy += inner_dim;
/*! GatherGrad <T = float32, Device = CPU> */
template <> void GatherGrad<float, CPUContext>(
const int count,
const int outer_dim,
const int inner_dim,
const int x_slice_dim,
const int y_slice_dim,
const int* indices,
const float* dy,
float* dx,
CPUContext* ctx) {
_GatherGrad<float>(count, outer_dim, inner_dim,
x_slice_dim, y_slice_dim, indices, dy, dx, ctx);
/*! GatherGrad <T = int32, Device = CPU> */
/*! Kernel Launchers */
template <> void name<T, CPUContext>( \
const int outer_dim, \
const int inner_dim, \
const int x_slice_dim, \
const int y_slice_dim, \
const int64_t* indices, \
const T* x, \
T* y, \
CPUContext* ctx) { \
_##name<T> \
(outer_dim, inner_dim, x_slice_dim, \
y_slice_dim, indices, x, y, ctx); \
template <> void GatherGrad<int, CPUContext>(
const int count,
const int outer_dim,
const int inner_dim,
const int x_slice_dim,
const int y_slice_dim,
const int* indices,
const int* dy,
int* dx,
CPUContext* ctx) {
_GatherGrad<int>(count, outer_dim, inner_dim,
x_slice_dim, y_slice_dim, indices, dy, dx, ctx);
} // namespace kernel
......@@ -2,160 +2,176 @@
#include "core/context_cuda.h"
#include "utils/op_kernel.h"
#include "utils/cub_device.h"
namespace dragon {
namespace kernel {
/*! CanonicalAxis <T = int32, Device = CUDA> */
template <typename T>
__global__ void _CanonicalAxis(
const int count,
const int dim,
T* y) {
CUDA_1D_KERNEL_LOOP(idx, count) {
if (y[idx] < 0) y[idx] += dim;
template <> void CanonicalAxis<int, CUDAContext>(
const int count,
const int dim,
int* y,
CUDAContext* ctx) {
0, ctx->cuda_stream() >> >
(count, dim, y);
/*! Gather <T = ?, Device = CUDA> */
template <typename T>
__global__ void _Gather(
const int count,
const int outer_dim,
const int nthreads,
const int inner_dim,
const int x_slice_dim,
const int y_slice_dim,
const int* indices,
const int64_t* indices,
const T* x,
T* y) {
CUDA_1D_KERNEL_LOOP(idx, count) {
const int outer_idx = idx / inner_dim / y_slice_dim;
const int slice_idx = idx % inner_dim;
const int y_idx_offset = (idx / inner_dim) % y_slice_dim;
const int x_idx_offset = indices[y_idx_offset];
const int x_idx = (outer_idx * x_slice_dim + x_idx_offset)
* inner_dim + slice_idx;
y[idx] = x[x_idx];
CUDA_1D_KERNEL_LOOP(y_idx, nthreads) {
const int outer_idx = y_idx / inner_dim / y_slice_dim;
const int inner_idx = y_idx % inner_dim;
#if __CUDA_ARCH__ >= 350
int select_idx = __ldg(indices +
((y_idx / inner_dim) % y_slice_dim));
int select_idx = indices[
(y_idx / inner_dim) % y_slice_dim];
select_idx = select_idx >= 0 ?
select_idx : select_idx + x_slice_dim;
const int x_idx = (outer_idx * x_slice_dim + select_idx)
* inner_dim + inner_idx;
y[y_idx] = x[x_idx];
/*! Gather <T = float32, Device = CUDA> */
template <> void Gather<float, CUDAContext>(
const int count,
const int outer_dim,
const int inner_dim,
const int x_slice_dim,
const int y_slice_dim,
const int* indices,
const float* x,
float* y,
CUDAContext* ctx) {
0, ctx->cuda_stream() >> >
(count, outer_dim, inner_dim,
x_slice_dim, y_slice_dim,
indices, x, y);
/*! Gather <T = int32, Device = CUDA> */
template <> void Gather<int, CUDAContext>(
const int count,
const int outer_dim,
const int inner_dim,
const int x_slice_dim,
const int y_slice_dim,
const int* indices,
const int* x,
int* y,
CUDAContext* ctx) {
0, ctx->cuda_stream() >> >
(count, outer_dim, inner_dim,
x_slice_dim, y_slice_dim,
indices, x, y);
/*! GatherGrad <T = ?, Device = CUDA> */
template <typename T>
__global__ void _GatherGrad(
const int count,
const int outer_dim,
const int nthreads,
const int inner_dim,
const int x_slice_dim,
const int y_slice_dim,
const int* indices,
const int64_t* indices,
const T* dy,
T* dx) {
CUDA_1D_KERNEL_LOOP(idx, count) {
const int outer_idx = idx / inner_dim / y_slice_dim;
const int slice_idx = idx % inner_dim;
const int y_idx_offset = (idx / inner_dim) % y_slice_dim;
const int x_idx_offset = indices[y_idx_offset];
const int x_idx = (outer_idx * x_slice_dim + x_idx_offset)
* inner_dim + slice_idx;
atomicAdd(dx + x_idx, dy[idx]);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
const int outer_idx = i / inner_dim;
const int inner_idx = i % inner_dim;
for (int j = 0; j < y_slice_dim; ++j) {
#if __CUDA_ARCH__ >= 350
int select_idx = __ldg(indices + j);
int select_idx = indices[j];
select_idx = select_idx >= 0 ?
select_idx : select_idx + x_slice_dim;
const int x_idx = (outer_idx * x_slice_dim + select_idx)
* inner_dim + inner_idx;
const int y_idx = (outer_idx * y_slice_dim + j)
* inner_dim + inner_idx;
dx[x_idx] += dy[y_idx];
/*! GatherGrad <T = float32, Device = CUDA> */
/*! GatherGrad <T = float16, Device = CUDA> */
template <> void GatherGrad<float, CUDAContext>(
const int count,
const int outer_dim,
template <> __global__ void _GatherGrad<half>(
const int nthreads,
const int inner_dim,
const int x_slice_dim,
const int y_slice_dim,
const int* indices,
const float* dy,
float* dx,
CUDAContext* ctx) {
0, ctx->cuda_stream() >> >
(count, outer_dim, inner_dim,
x_slice_dim, y_slice_dim,
indices, dy, dx);
const int64_t* indices,
const half* dy,
half* dx) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530
const int outer_idx = i / inner_dim;
const int inner_idx = i % inner_dim;
for (int j = 0; j < y_slice_dim; ++j) {
int select_idx = __ldg(indices + j);
select_idx = select_idx >= 0 ?
select_idx : select_idx + x_slice_dim;
const int x_idx = (outer_idx * x_slice_dim + select_idx)
* inner_dim + inner_idx;
const int y_idx = (outer_idx * y_slice_dim + j)
* inner_dim + inner_idx;
dx[x_idx] = __hadd(dx[x_idx], dy[y_idx]);
/*! GatherGrad <T = int32, Device = CUDA> */
/*! Kernel Launchers */
template <> void Gather<T, CUDAContext>( \
const int outer_dim, \
const int inner_dim, \
const int x_slice_dim, \
const int y_slice_dim, \
const int64_t* indices, \
const T* x, \
T* y, \
CUDAContext* ctx) { \
auto nthreads = outer_dim * y_slice_dim * inner_dim; \
_Gather<T> \
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >> > \
(nthreads, inner_dim, x_slice_dim, \
y_slice_dim, indices, x, y); \
template <> void GatherGrad<T, CUDAContext>( \
const int outer_dim, \
const int inner_dim, \
const int x_slice_dim, \
const int y_slice_dim, \
const int64_t* indices, \
const T* dy, \
T* dx, \
CUDAContext* ctx) { \
auto nthreads = outer_dim * inner_dim; \
_GatherGrad<T> \
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >> > \
(nthreads, inner_dim, x_slice_dim, \
y_slice_dim, indices, dy, dx); \
template <> void GatherGrad<int, CUDAContext>(
const int count,
template <> void GatherGrad<float16, CUDAContext>(
const int outer_dim,
const int inner_dim,
const int x_slice_dim,
const int y_slice_dim,
const int* indices,
const int* dy,
int* dx,
const int64_t* indices,
const float16* dy,
float16* dx,
CUDAContext* ctx) {
auto nthreads = outer_dim * inner_dim;
0, ctx->cuda_stream() >> >
(count, outer_dim, inner_dim,
x_slice_dim, y_slice_dim,
indices, dy, dx);
(nthreads, inner_dim, x_slice_dim,
y_slice_dim, indices,
reinterpret_cast<const half*>(dy),
} // namespace kernel
} // namespace dragon
#include "contrib/onnx/onnx_backend.h"
#include "onnx/onnx_backend.h"
namespace dragon {
#include "core/operator_schema.h"
#include "utils/proto_utils.h"
#include "contrib/onnx/onnx_backend.h"
#include "onnx/onnx_backend.h"
namespace dragon {
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
* <https://opensource.org/licenses/BSD-2-Clause>
* Codes are based on:
* <>
* ------------------------------------------------------------
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
* <https://opensource.org/licenses/BSD-2-Clause>
* Codes are based on:
* <>
* ------------------------------------------------------------
#include "core/common.h"
#include "proto/onnx.pb.h"
......@@ -228,4 +228,4 @@ class ONNXBackend {
} // namespace dragon
\ No newline at end of file
\ No newline at end of file
#include "utils/map_utils.h"
#include "contrib/onnx/onnx_backend.h"
#include "onnx/onnx_backend.h"
namespace dragon {
#include "contrib/onnx/onnx_backend.h"
#include "onnx/onnx_backend.h"
namespace dragon {
......@@ -57,7 +57,7 @@ void MaximumOp<Context>::RunOnDevice() {
else if (XIsType(Input(0), double)) RunWithType<double>();
else LOG(FATAL) << DTypeHelper(Input(0), {
"int8", "uint8", "int32", "int64",
"float16", "float32", "float64",
"float16", "float32", "float64",
......@@ -13,12 +13,10 @@ namespace dragon {
template <class Context> template <typename T>
void GatherOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* indices = Input(1).template mutable_data<int, Context>();
auto* indices = Input(1).template mutable_data<int64_t, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::CanonicalAxis(Input(1).count(), x_slice_dim, indices, ctx());
outer_dim, inner_dim,
x_slice_dim, y_slice_dim,
indices, Xdata, Ydata, ctx());
......@@ -28,22 +26,38 @@ template <class Context>
void GatherOp<Context>::RunOnDevice() {
output_dims = Input(0).dims();
x_slice_dim = Input(0).dim(axis);
output_dims[axis] = y_slice_dim = Input(1).count();
y_slice_dim = Input(1).count();
outer_dim = Input(0).count(0, axis);
inner_dim = Input(0).count(axis + 1);
CHECK_GT(y_slice_dim, 0) << "\nLength of indices must > 0.";
const auto& s1 = Input(0).dims().begin();
const auto& e1 = s1 + axis, s3 = e1 + 1;
const auto& e3 = Input(0).dims().end();
const auto& s2 = Input(1).dims().begin();
const auto& e2 = Input(1).dims().end();
output_dims.assign(s1, e1);
output_dims.insert(output_dims.end(), s2, e2);
output_dims.insert(output_dims.end(), s3, e3);
CHECK(Input(1).template IsType<int>())
<< "\nThe type of indices should be int32.";
CHECK(Input(1).template IsType<int64_t>())
<< "\nThe type of indices should be int64.";
if (XIsType(Input(0), float)) RunWithType<float>();
if (XIsType(Input(0), bool)) RunWithType<bool>();
else if (XIsType(Input(0), int8_t)) RunWithType<int8_t>();
else if (XIsType(Input(0), uint8_t)) RunWithType<uint8_t>();
else if (XIsType(Input(0), int)) RunWithType<int>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "int32" });
else if (XIsType(Input(0), int64_t)) RunWithType<int64_t>();
else if (XIsType(Input(0), float16)) RunWithType<float16>();
else if (XIsType(Input(0), float)) RunWithType<float>();
else if (XIsType(Input(0), double)) RunWithType<double>();
else LOG(FATAL) << DTypeHelper(Input(0), {
"bool", "int8", "uint8", "int32", "int64",
"float16", "float32", "float64",
......@@ -54,18 +68,17 @@ OPERATOR_SCHEMA(Gather).NumInputs(2).NumOutputs(1);
template <class Context> template <typename T>
void GatherGradientOp<Context>::RunWithType() {
auto* indices = Input(1).template data<int, Context>();
auto* indices = Input(1).template data<int64_t, Context>();
auto* dYdata = Input(-1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
T* dXdata = nullptr;
if (!acc_grad) {
dXdata = Output(0)->template mutable_data<T, Context>();
math::Set(Output(0)->count(), cast::to<T>(0.f), dXdata, ctx());
} else {
dXdata = Output(0)->template mutable_data<T, Context>();
// Optionally zero the gradients
if (zero_grad) {
cast::to<T>(0.f), dXdata, ctx());
outer_dim, inner_dim,
x_slice_dim, y_slice_dim,
indices, dYdata, dXdata, ctx());
......@@ -82,12 +95,20 @@ void GatherGradientOp<Context>::RunOnDevice() {
CHECK(Input(1).template IsType<int>())
<< "\nThe type of indices should be int32.";
CHECK(Input(1).template IsType<int64_t>())
<< "\nThe type of indices should be int64.";
if (XIsType(Input(0), float)) RunWithType<float>();
if (XIsType(Input(0), int8_t)) RunWithType<int8_t>();
else if (XIsType(Input(0), uint8_t)) RunWithType<uint8_t>();
else if (XIsType(Input(0), int)) RunWithType<int>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "int32" });
else if (XIsType(Input(0), int64_t)) RunWithType<int64_t>();
else if (XIsType(Input(0), float16)) RunWithType<float16>();
else if (XIsType(Input(0), float)) RunWithType<float>();
else if (XIsType(Input(0), double)) RunWithType<double>();
else LOG(FATAL) << DTypeHelper(Input(0), {
"int8", "uint8", "int32", "int64",
"float16", "float32", "float64",
......@@ -15,6 +15,27 @@ void DropBlock2dOp<Context>::RunWithType() {
Output(0)->count(), Ydata, Xdata);
} else if (phase() == "TRAIN") {
if (data_format == "NCHW") {
n = Input(0).dim(0), c = Input(0).dim(1);
h = Input(0).dim(2), w = Input(0).dim(3);
} else if (data_format == "NHWC") {
n = Input(0).dim(0), c = Input(0).dim(-1);
h = Input(0).dim(1), w = Input(0).dim(2);
seed_h = h - block_size + 1;
seed_w = w - block_size + 1;
CHECK(seed_h > 0 && seed_w > 0)
<< "\nExcepted block_size <= feat_size.";
if (decrement > 0 && apply_prob > keep_prob()) {
apply_prob -= decrement;
} else { apply_prob = keep_prob(); }
gamma = (1.f - apply_prob) / (block_size * block_size);
gamma *= (alpha * (h * w) / (seed_h * seed_w));
auto* mask = ws()->CreateTensor(mount_name(
auto* norm = ws()->CreateTensor(mount_name(
......@@ -58,29 +79,8 @@ void DropBlock2dOp<Context>::RunWithType() {
template <class Context>
void DropBlock2dOp<Context>::RunOnDevice() {
if (data_format == "NCHW") {
n = Input(0).dim(0), c = Input(0).dim(1);
h = Input(0).dim(2), w = Input(0).dim(3);
} else if (data_format == "NHWC") {
n = Input(0).dim(0), c = Input(0).dim(-1);
h = Input(0).dim(1), w = Input(0).dim(2);
seed_h = h - block_size + 1;
seed_w = w - block_size + 1;
CHECK(seed_h > 0 && seed_w > 0)
<< "\nExcepted block_size <= feat_size.";
if (decrement > 0 && apply_prob > keep_prob()) {
apply_prob -= decrement;
} else { apply_prob = keep_prob(); }
gamma = (1.f - apply_prob) / (block_size * block_size);
gamma *= (alpha * (h * w) / (seed_h * seed_w));
if (XIsType(Input(0), float)) RunWithType<float>();
else if (XIsType(Input(0), float16)) RunWithType<float16>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!