Commit 1d03e8e2 by Ting PAN

Optimize GatherOp

1 parent c5def39b
Showing with 335 additions and 1225 deletions
...@@ -283,14 +283,16 @@ code.docutils.literal:hover { ...@@ -283,14 +283,16 @@ code.docutils.literal:hover {
dt { dt {
font-weight: 700; font-weight: 700;
background: #e7f2fa; background: #f7f7f7;
border-bottom: solid #0079b2; border-bottom: solid #0079b2;
border-radius: 1px; border-radius: 8px;
margin-bottom: 20px; margin-bottom: 20px;
padding: 8px;
width: 75%;
} }
dt:target, .highlighted { dt:target, .highlighted {
background-color: #e7f2fa; background-color: #f7f7f7;
border-bottom: 3px solid #c7254e; border-bottom: 3px solid #c7254e;
} }
...@@ -299,7 +301,7 @@ dt:target:before { ...@@ -299,7 +301,7 @@ dt:target:before {
content: ''; content: '';
display: block; display: block;
height: 65px; height: 65px;
margin: -20px 0 0; margin: -20px -8px 8px;
} }
dl.method dt { dl.method dt {
......
...@@ -5,8 +5,8 @@ ...@@ -5,8 +5,8 @@
.. toctree:: .. toctree::
:hidden: :hidden:
Quick Shortcut Quick Reference
-------------- ---------------
========================== ============================================================================= ========================== =============================================================================
List Brief List Brief
......
...@@ -5,8 +5,8 @@ ...@@ -5,8 +5,8 @@
.. toctree:: .. toctree::
:hidden: :hidden:
Quick Shortcut Quick Reference
-------------- ---------------
============================== ============================================================================= ============================== =============================================================================
List Brief List Brief
......
...@@ -5,8 +5,8 @@ ...@@ -5,8 +5,8 @@
.. toctree:: .. toctree::
:hidden: :hidden:
Quick Shortcut Quick Reference
-------------- ---------------
============================== ============================================================================= ============================== =============================================================================
List Brief List Brief
......
...@@ -5,8 +5,8 @@ ...@@ -5,8 +5,8 @@
.. toctree:: .. toctree::
:hidden: :hidden:
Quick Shortcut Quick Reference
-------------- ---------------
==================== ============================================================================= ==================== =============================================================================
List Brief List Brief
......
...@@ -5,8 +5,8 @@ ...@@ -5,8 +5,8 @@
.. toctree:: .. toctree::
:hidden: :hidden:
Quick Shortcut Quick Reference
-------------- ---------------
==================== ============================================================================= ==================== =============================================================================
List Brief List Brief
......
...@@ -5,8 +5,8 @@ ...@@ -5,8 +5,8 @@
.. toctree:: .. toctree::
:hidden: :hidden:
Quick Shortcut Quick Reference
-------------- ---------------
==================== ============================================================================= ==================== =============================================================================
List Brief List Brief
......
...@@ -5,8 +5,8 @@ ...@@ -5,8 +5,8 @@
.. toctree:: .. toctree::
:hidden: :hidden:
Quick Shortcut Quick Reference
-------------- ---------------
==================== ============================================================================= ==================== =============================================================================
List Brief List Brief
......
...@@ -5,8 +5,8 @@ ...@@ -5,8 +5,8 @@
.. toctree:: .. toctree::
:hidden: :hidden:
Quick Shortcut Quick Reference
-------------- ---------------
==================== ============================================================================= ==================== =============================================================================
List Brief List Brief
......
...@@ -5,8 +5,8 @@ ...@@ -5,8 +5,8 @@
.. toctree:: .. toctree::
:hidden: :hidden:
Quick Shortcut Quick Reference
-------------- ---------------
==================== ============================================================================= ==================== =============================================================================
List Brief List Brief
......
...@@ -112,8 +112,8 @@ List Brief ...@@ -112,8 +112,8 @@ List Brief
================================= ============================================================================= ================================= =============================================================================
Quick Shortcut Quick Reference
-------------- ---------------
==================== ============================================================================= ==================== =============================================================================
List Brief List Brief
......
...@@ -5,8 +5,8 @@ ...@@ -5,8 +5,8 @@
.. toctree:: .. toctree::
:hidden: :hidden:
Quick Shortcut Quick Reference
-------------- ---------------
========================= ============================================================================ ========================= ============================================================================
List Brief List Brief
......
...@@ -5,8 +5,8 @@ ...@@ -5,8 +5,8 @@
.. toctree:: .. toctree::
:hidden: :hidden:
Quick Shortcut Quick Reference
-------------- ---------------
========================= ============================================================================= ========================= =============================================================================
List Brief List Brief
......
...@@ -5,8 +5,8 @@ ...@@ -5,8 +5,8 @@
.. toctree:: .. toctree::
:hidden: :hidden:
Quick Shortcut Quick Reference
-------------- ---------------
==================== ============================================================================= ==================== =============================================================================
List Brief List Brief
......
...@@ -6,8 +6,8 @@ ...@@ -6,8 +6,8 @@
:hidden: :hidden:
Quick Shortcut Quick Reference
-------------- ---------------
============================== ======================================================================= ============================== =======================================================================
List Brief List Brief
......
...@@ -39,15 +39,15 @@ class GatherGradientOp final : public Operator<Context> { ...@@ -39,15 +39,15 @@ class GatherGradientOp final : public Operator<Context> {
GatherGradientOp(const OperatorDef& def, Workspace* ws) GatherGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws), : Operator<Context>(def, ws),
axis(OperatorBase::Arg<int64_t>("axis", 0)), axis(OperatorBase::Arg<int64_t>("axis", 0)),
acc_grad(OperatorBase::Arg<bool>("acc_gradient", false)) {} zero_grad(OperatorBase::Arg<bool>("zero_grad", true)) {}
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override; void RunOnDevice() override;
template <typename T> void RunWithType(); template <typename T> void RunWithType();
protected: protected:
bool zero_grad;
int64_t axis, outer_dim, inner_dim, x_slice_dim, y_slice_dim; int64_t axis, outer_dim, inner_dim, x_slice_dim, y_slice_dim;
bool acc_grad;
}; };
} // namespace dragon } // namespace dragon
......
...@@ -601,32 +601,23 @@ void ArgMin( ...@@ -601,32 +601,23 @@ void ArgMin(
/*! ndarray.gather */ /*! ndarray.gather */
template <typename T, class Context> template <typename T, class Context>
void CanonicalAxis(
const int count,
const int dim,
T* y,
Context* ctx);
template <typename T, class Context>
void Gather( void Gather(
const int count,
const int outer_dim, const int outer_dim,
const int inner_dim, const int inner_dim,
const int x_slice_dim, const int x_slice_dim,
const int y_slice_dim, const int y_slice_dim,
const int* indices, const int64_t* indices,
const T* x, const T* x,
T* y, T* y,
Context* ctx); Context* ctx);
template <typename T, class Context> template <typename T, class Context>
void GatherGrad( void GatherGrad(
const int count,
const int outer_dim, const int outer_dim,
const int inner_dim, const int inner_dim,
const int x_slice_dim, const int x_slice_dim,
const int y_slice_dim, const int y_slice_dim,
const int* indices, const int64_t* indices,
const T* dy, const T* dy,
T* dx, T* dx,
Context* ctx); Context* ctx);
......
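The kernel declarations drop the flat count argument and take int64_t indices; the layout decomposition into outer, slice, and inner dimensions is unchanged, and negative indices are now wrapped inside the kernels instead of by a separate CanonicalAxis pass. A pure-Python reference of the forward semantics (a sketch of the indexing math only, not part of the commit):

    def gather_reference(x, outer_dim, inner_dim, x_slice_dim, indices):
        """y[n, i, k] = x[n, indices[i], k], with x flattened to length
        outer_dim * x_slice_dim * inner_dim; negative indices wrap."""
        y_slice_dim = len(indices)
        y = [0] * (outer_dim * y_slice_dim * inner_dim)
        for n in range(outer_dim):
            for i, idx in enumerate(indices):
                idx = idx if idx >= 0 else idx + x_slice_dim
                x_off = (n * x_slice_dim + idx) * inner_dim
                y_off = (n * y_slice_dim + i) * inner_dim
                y[y_off:y_off + inner_dim] = x[x_off:x_off + inner_dim]
        return y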
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
#include "core/common.h" #include "core/common.h"
#include "utils/proto_utils.h" #include "utils/proto_utils.h"
#include "utils/caffemodel.h" #include "utils/caffemodel.h"
#include "contrib/onnx/onnx_backend.h" #include "onnx/onnx_backend.h"
#include "dragon.h" #include "dragon.h"
......
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
#ifndef DRAGON_PYTHON_PY_ONNX_H_ #ifndef DRAGON_PYTHON_PY_ONNX_H_
#define DRAGON_PYTHON_PY_ONNX_H_ #define DRAGON_PYTHON_PY_ONNX_H_
#include "contrib/onnx/onnx_backend.h" #include "onnx/onnx_backend.h"
#include "py_dragon.h" #include "py_dragon.h"
......
...@@ -270,7 +270,7 @@ def ExportMetaGraph(prefix=''): ...@@ -270,7 +270,7 @@ def ExportMetaGraph(prefix=''):
These text files will be saved in the following format: These text files will be saved in the following format:
``prefix/Graph_xxx.metatxt`` *prefix/Graph.metatxt*
Note that an empty prefix will lead to invalid exporting. Note that an empty prefix will lead to invalid exporting.
...@@ -293,12 +293,12 @@ def SetLoggingLevel(level): ...@@ -293,12 +293,12 @@ def SetLoggingLevel(level):
Parameters Parameters
---------- ----------
level : str level : {'DEBUG', 'INFO', 'WARNING', 'ERROR', 'FATAL'}, required
The level, ``DEBUG``, ``INFO``, ``WARNING``, ``ERROR`` or ``FATAL``. The logging level.
Notes Notes
----- -----
The default level is ``INFO``. The default level is *INFO*.
""" """
C.SetLogLevelCC(level) C.SetLogLevelCC(level)
......
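For reference, a typical call sequence for the two helpers touched above (a sketch assuming they are exposed under dragon.config, as the surrounding hunks suggest):

    import dragon

    # Dump graph text files under an explicit prefix; an empty prefix
    # makes the export invalid, as the docstring above notes.
    dragon.config.ExportMetaGraph(prefix='/tmp/dragon_graphs/')

    # Raise the verbosity while debugging; the default level is INFO.
    dragon.config.SetLoggingLevel('DEBUG')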
...@@ -391,9 +391,12 @@ class OperatorHelper(object): ...@@ -391,9 +391,12 @@ class OperatorHelper(object):
@classmethod @classmethod
def _apply_Gather(cls, arguments, inputs, outputs): def _apply_Gather(cls, arguments, inputs, outputs):
outputs[0].dtype = inputs[0].dtype outputs[0].dtype = inputs[0].dtype
axis = arguments['axis']
try: try:
outputs[0].shape = inputs[0].shape[:] outputs[0].shape = \
outputs[0].shape[arguments['axis']] = None inputs[0].shape[:axis] + \
inputs[1].shape[:] + \
inputs[0].shape[axis + 1:]
except: except:
pass pass
return outputs return outputs
......
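The new shape inference splices the full shape of the indices tensor into the input shape, rather than keeping the input rank and clearing the gathered dimension. A quick check with hypothetical shapes:

    # x: (4, 8, 16), indices: (3, 2), axis = 1
    x_shape, idx_shape, axis = [4, 8, 16], [3, 2], 1
    out_shape = x_shape[:axis] + idx_shape + x_shape[axis + 1:]
    assert out_shape == [4, 3, 2, 16]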
...@@ -17,10 +17,10 @@ from . import * ...@@ -17,10 +17,10 @@ from . import *
@OpSchema.Inputs(1) @OpSchema.Inputs(1)
def Gather(inputs, indices, axis=0, acc_gradient=False, **kwargs): def Gather(inputs, indices, axis=0, zero_grad=True, **kwargs):
"""Gather the input according to the indices along the given axis. """Gather the input according to the indices along the given axis.
**Type Constraints**: (*int32*, *float32*) **Type Constraints**: (*bool*, *int8*, *uint8*, *int32*, *int64*, *float16*, *float32*, *float64*)
Parameters Parameters
---------- ----------
...@@ -30,7 +30,7 @@ def Gather(inputs, indices, axis=0, acc_gradient=False, **kwargs): ...@@ -30,7 +30,7 @@ def Gather(inputs, indices, axis=0, acc_gradient=False, **kwargs):
The indices to form the output tensor. The indices to form the output tensor.
axis : int, optional axis : int, optional
The start axis, can be negative. The start axis, can be negative.
acc_gradient : bool, optional zero_grad : bool, optional
Whether to accumulate the gradients. Whether to zero the gradients before accumulating.
Returns Returns
...@@ -40,24 +40,10 @@ def Gather(inputs, indices, axis=0, acc_gradient=False, **kwargs): ...@@ -40,24 +40,10 @@ def Gather(inputs, indices, axis=0, acc_gradient=False, **kwargs):
""" """
arguments = ParseArgs(locals()) arguments = ParseArgs(locals())
arguments['inputs'], arguments['indices'] = \
arguments['inputs'], arguments['indices'] = [arguments['inputs'], [arguments['inputs'], Tensor.Convert(
Tensor.Convert(indices, dtype='int32')], None indices, dtype='int64')], None
return Tensor.CreateOperator('Gather', **arguments)
output = Tensor.CreateOperator('Gather', **arguments)
try:
output.shape = inputs.shape[:]
if not isinstance(indices, Tensor):
if not isinstance(indices, (list, tuple)):
indices = [indices]
output.shape[axis] = len(indices)
else:
output.shape[axis] = None
except:
pass
return output
@OpSchema.Inputs(1) @OpSchema.Inputs(1)
......
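A minimal usage sketch of the updated front-end op (the dragon.ops entry point and the Tensor(...).Variable() declaration are assumptions here, not part of the commit); indices are now converted to an int64 tensor, and zero_grad replaces acc_gradient:

    import dragon as dg

    x = dg.Tensor('x', dtype='float32').Variable()
    # Gather slices 0 and 2 along axis 0; `indices` may also be a Tensor.
    y = dg.ops.Gather(x, indices=[0, 2], axis=0, zero_grad=True)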
...@@ -283,9 +283,7 @@ def Pool2d( ...@@ -283,9 +283,7 @@ def Pool2d(
@OpSchema.Inputs(2) @OpSchema.Inputs(2)
def ROIPool(inputs, pool_h, pool_w, spatial_scale=1.0, **kwargs): def ROIPool(inputs, pool_h, pool_w, spatial_scale=1.0, **kwargs):
"""Max RoI Pooling. `[Girshick, 2015] <https://arxiv.org/abs/1504.08083>`_. """Max RoIPooling. `[Girshick, 2015] <https://arxiv.org/abs/1504.08083>`_.
The first dimension of input must be ``1``.
**Type Constraints**: (*float16*, *float32*) **Type Constraints**: (*float16*, *float32*)
...@@ -311,9 +309,7 @@ def ROIPool(inputs, pool_h, pool_w, spatial_scale=1.0, **kwargs): ...@@ -311,9 +309,7 @@ def ROIPool(inputs, pool_h, pool_w, spatial_scale=1.0, **kwargs):
@OpSchema.Inputs(2) @OpSchema.Inputs(2)
def ROIAlign(inputs, pool_h=0, pool_w=0, spatial_scale=1.0, sampling_ratio=2, **kwargs): def ROIAlign(inputs, pool_h=0, pool_w=0, spatial_scale=1.0, sampling_ratio=2, **kwargs):
"""AVG ROIAlign. `[He et.al, 2017] <https://arxiv.org/abs/1703.06870>`_. """AVG RoIAlign. `[He et.al, 2017] <https://arxiv.org/abs/1703.06870>`_.
The first dimension of input must be ``1``.
**Type Constraints**: (*float16*, *float32*) **Type Constraints**: (*float16*, *float32*)
......
...@@ -20,7 +20,7 @@ from multiprocessing import Process ...@@ -20,7 +20,7 @@ from multiprocessing import Process
class BlobFetcher(Process): class BlobFetcher(Process):
"""BlobFetcher is deployed to queue blobs from `DataTransformer`_. """BlobFetcher is deployed to queue blobs from `DataTransformer`_.
It is supported to form ``NHWC`` image blobs and ``1D`` label blobs. It supports forming *NHWC* image blobs and *1d* label blobs.
""" """
def __init__(self, **kwargs): def __init__(self, **kwargs):
......
...@@ -26,7 +26,7 @@ from .blob_fetcher import BlobFetcher ...@@ -26,7 +26,7 @@ from .blob_fetcher import BlobFetcher
class DataBatch(object): class DataBatch(object):
"""DataBatch aims to prefetch data by ``Triple-Buffering``. """DataBatch aims to prefetch data by *Triple-Buffering*.
It takes full advantage of Python's processes and threads, It takes full advantage of Python's processes and threads,
which provides a remarkable I/O speedup for scalable distributed training. which provides a remarkable I/O speedup for scalable distributed training.
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# Codes are based on:
#
# <https://github.com/weiliu89/caffe/blob/ssd/python/caffe/model_libs.py>
#
# ------------------------------------------------------------
import os
from dragon.vm.caffe import layers as L
from dragon.vm.caffe import params as P
from dragon.vm.caffe.proto import caffe_pb2
def check_if_exist(path):
return os.path.exists(path)
def make_if_not_exist(path):
if not os.path.exists(path):
os.makedirs(path)
def UnpackVariable(var, num):
if type(var) is list and len(var) == num:
return var
else:
ret = []
if type(var) is list:
assert len(var) == 1
for i in range(0, num):
ret.append(var[0])
else:
for i in range(0, num):
ret.append(var)
return ret
def ConvBNLayer(net, from_layer, out_layer, use_bn, use_relu, num_output,
kernel_size, pad, stride, dilation=1, use_scale=True, lr_mult=1,
conv_prefix='', conv_postfix='', bn_prefix='', bn_postfix='_bn',
scale_prefix='', scale_postfix='_scale', bias_prefix='', bias_postfix='_bias',
**bn_params):
if use_bn:
# parameters for convolution layer with batchnorm.
kwargs = {
'param': [dict(lr_mult=lr_mult, decay_mult=1)],
'weight_filler': dict(type='gaussian', std=0.01),
'bias_term': False,
}
eps = bn_params.get('eps', 1e-3)
moving_average_fraction = bn_params.get('moving_average_fraction', 0.9)
use_global_stats = bn_params.get('use_global_stats', False)
# parameters for batchnorm layer.
bn_kwargs = {
'param': [
dict(lr_mult=0, decay_mult=0),
dict(lr_mult=0, decay_mult=0),
dict(lr_mult=0, decay_mult=0)],
}
bn_lr_mult = lr_mult
if use_global_stats:
# only specify if use_global_stats is explicitly provided;
# otherwise, use_global_stats_ = this->phase_ == TEST;
bn_kwargs = {
'param': [
dict(lr_mult=0, decay_mult=0),
dict(lr_mult=0, decay_mult=0),
dict(lr_mult=0, decay_mult=0)],
'eps': eps,
'use_global_stats': use_global_stats,
}
# not updating scale/bias parameters
bn_lr_mult = 0
# parameters for scale bias layer after batchnorm.
if use_scale:
sb_kwargs = {
'bias_term': True}
else:
kwargs = {
'param': [
dict(lr_mult=lr_mult, decay_mult=1),
dict(lr_mult=2 * lr_mult, decay_mult=0)],
'weight_filler': dict(type='xavier'),
'bias_filler': dict(type='constant', value=0)
}
conv_name = '{}{}{}'.format(conv_prefix, out_layer, conv_postfix)
[kernel_h, kernel_w] = UnpackVariable(kernel_size, 2)
[pad_h, pad_w] = UnpackVariable(pad, 2)
[stride_h, stride_w] = UnpackVariable(stride, 2)
if kernel_h == kernel_w:
net[conv_name] = L.Convolution(net[from_layer], num_output=num_output,
kernel_size=kernel_h, pad=pad_h, stride=stride_h, **kwargs)
else:
net[conv_name] = L.Convolution(net[from_layer], num_output=num_output,
kernel_h=kernel_h, kernel_w=kernel_w, pad_h=pad_h, pad_w=pad_w,
stride_h=stride_h, stride_w=stride_w, **kwargs)
if dilation > 1:
net.update(conv_name, {'dilation': dilation})
if use_bn:
bn_name = '{}{}{}'.format(bn_prefix, out_layer, bn_postfix)
net[bn_name] = L.BatchNorm(net[conv_name], in_place=True, **bn_kwargs)
if use_scale:
sb_name = '{}{}{}'.format(scale_prefix, out_layer, scale_postfix)
net[sb_name] = L.Scale(net[bn_name], in_place=True, **sb_kwargs)
else:
bias_name = '{}{}{}'.format(bias_prefix, out_layer, bias_postfix)
net[bias_name] = L.Bias(net[bn_name], in_place=True, **bias_kwargs)
if use_relu:
relu_name = '{}_relu'.format(conv_name)
net[relu_name] = L.ReLU(net[conv_name], in_place=True)
def ResBody(net, from_layer, block_name, out2a, out2b, out2c, stride, use_branch1, dilation=1, **bn_param):
# ResBody(net, 'pool1', '2a', 64, 64, 256, 1, True)
conv_prefix = 'res{}_'.format(block_name)
conv_postfix = ''
bn_prefix = 'bn{}_'.format(block_name)
bn_postfix = ''
scale_prefix = 'scale{}_'.format(block_name)
scale_postfix = ''
use_scale = True
if use_branch1:
branch_name = 'branch1'
ConvBNLayer(net, from_layer, branch_name, use_bn=True, use_relu=False,
num_output=out2c, kernel_size=1, pad=0, stride=stride, use_scale=use_scale,
conv_prefix=conv_prefix, conv_postfix=conv_postfix,
bn_prefix=bn_prefix, bn_postfix=bn_postfix,
scale_prefix=scale_prefix, scale_postfix=scale_postfix, **bn_param)
branch1 = '{}{}'.format(conv_prefix, branch_name)
else:
branch1 = from_layer
branch_name = 'branch2a'
ConvBNLayer(net, from_layer, branch_name, use_bn=True, use_relu=True,
num_output=out2a, kernel_size=1, pad=0, stride=stride, use_scale=use_scale,
conv_prefix=conv_prefix, conv_postfix=conv_postfix,
bn_prefix=bn_prefix, bn_postfix=bn_postfix,
scale_prefix=scale_prefix, scale_postfix=scale_postfix, **bn_param)
out_name = '{}{}'.format(conv_prefix, branch_name)
branch_name = 'branch2b'
if dilation == 1:
ConvBNLayer(net, out_name, branch_name, use_bn=True, use_relu=True,
num_output=out2b, kernel_size=3, pad=1, stride=1, use_scale=use_scale,
conv_prefix=conv_prefix, conv_postfix=conv_postfix,
bn_prefix=bn_prefix, bn_postfix=bn_postfix,
scale_prefix=scale_prefix, scale_postfix=scale_postfix, **bn_param)
else:
pad = int(((3 + (dilation - 1) * 2) - 1) / 2)
ConvBNLayer(net, out_name, branch_name, use_bn=True, use_relu=True,
num_output=out2b, kernel_size=3, pad=pad, stride=1, use_scale=use_scale,
dilation=dilation, conv_prefix=conv_prefix, conv_postfix=conv_postfix,
bn_prefix=bn_prefix, bn_postfix=bn_postfix,
scale_prefix=scale_prefix, scale_postfix=scale_postfix, **bn_param)
out_name = '{}{}'.format(conv_prefix, branch_name)
branch_name = 'branch2c'
ConvBNLayer(net, out_name, branch_name, use_bn=True, use_relu=False,
num_output=out2c, kernel_size=1, pad=0, stride=1, use_scale=use_scale,
conv_prefix=conv_prefix, conv_postfix=conv_postfix,
bn_prefix=bn_prefix, bn_postfix=bn_postfix,
scale_prefix=scale_prefix, scale_postfix=scale_postfix, **bn_param)
branch2 = '{}{}'.format(conv_prefix, branch_name)
res_name = 'res{}'.format(block_name)
net[res_name] = L.Eltwise(net[branch1], net[branch2])
relu_name = '{}_relu'.format(res_name)
net[relu_name] = L.ReLU(net[res_name], in_place=True)
def InceptionTower(net, from_layer, tower_name, layer_params, **bn_param):
use_scale = False
for param in layer_params:
tower_layer = '{}/{}'.format(tower_name, param['name'])
del param['name']
if 'pool' in tower_layer:
net[tower_layer] = L.Pooling(net[from_layer], **param)
else:
param.update(bn_param)
ConvBNLayer(net, from_layer, tower_layer, use_bn=True, use_relu=True,
use_scale=use_scale, **param)
from_layer = tower_layer
return net[from_layer]
def CreateAnnotatedDataLayer(source, batch_size=32, backend=P.Data.LMDB,
output_label=True, train=True, label_map_file='', anno_type=None,
transform_param={}, batch_sampler=[{}]):
if train:
kwargs = {
'include': dict(phase=caffe_pb2.Phase.Value('TRAIN')),
'transform_param': transform_param,
}
else:
kwargs = {
'include': dict(phase=caffe_pb2.Phase.Value('TEST')),
'transform_param': transform_param,
}
ntop = 1
if output_label:
ntop = 2
annotated_data_param = {
'label_map_file': label_map_file,
'batch_sampler': batch_sampler,
}
if anno_type is not None:
annotated_data_param.update({'anno_type': anno_type})
return L.AnnotatedData(name="data", annotated_data_param=annotated_data_param,
data_param=dict(batch_size=batch_size, backend=backend, source=source),
ntop=ntop, **kwargs)
def VGGNetBody(net, from_layer, need_fc=True, fully_conv=False, reduced=False,
dilated=False, nopool=False, dropout=True, freeze_layers=[], dilate_pool4=False):
kwargs = {
'param': [dict(lr_mult=1, decay_mult=1), dict(lr_mult=2, decay_mult=0)],
'weight_filler': dict(type='xavier'),
'bias_filler': dict(type='constant', value=0)}
assert from_layer in net.keys()
net.conv1_1 = L.Convolution(net[from_layer], num_output=64, pad=1, kernel_size=3, **kwargs)
net.relu1_1 = L.ReLU(net.conv1_1, in_place=True)
net.conv1_2 = L.Convolution(net.relu1_1, num_output=64, pad=1, kernel_size=3, **kwargs)
net.relu1_2 = L.ReLU(net.conv1_2, in_place=True)
if nopool:
name = 'conv1_3'
net[name] = L.Convolution(net.relu1_2, num_output=64, pad=1, kernel_size=3, stride=2, **kwargs)
else:
name = 'pool1'
net.pool1 = L.Pooling(net.relu1_2, pool=P.Pooling.MAX, kernel_size=2, stride=2)
net.conv2_1 = L.Convolution(net[name], num_output=128, pad=1, kernel_size=3, **kwargs)
net.relu2_1 = L.ReLU(net.conv2_1, in_place=True)
net.conv2_2 = L.Convolution(net.relu2_1, num_output=128, pad=1, kernel_size=3, **kwargs)
net.relu2_2 = L.ReLU(net.conv2_2, in_place=True)
if nopool:
name = 'conv2_3'
net[name] = L.Convolution(net.relu2_2, num_output=128, pad=1, kernel_size=3, stride=2, **kwargs)
else:
name = 'pool2'
net[name] = L.Pooling(net.relu2_2, pool=P.Pooling.MAX, kernel_size=2, stride=2)
net.conv3_1 = L.Convolution(net[name], num_output=256, pad=1, kernel_size=3, **kwargs)
net.relu3_1 = L.ReLU(net.conv3_1, in_place=True)
net.conv3_2 = L.Convolution(net.relu3_1, num_output=256, pad=1, kernel_size=3, **kwargs)
net.relu3_2 = L.ReLU(net.conv3_2, in_place=True)
net.conv3_3 = L.Convolution(net.relu3_2, num_output=256, pad=1, kernel_size=3, **kwargs)
net.relu3_3 = L.ReLU(net.conv3_3, in_place=True)
if nopool:
name = 'conv3_4'
net[name] = L.Convolution(net.relu3_3, num_output=256, pad=1, kernel_size=3, stride=2, **kwargs)
else:
name = 'pool3'
net[name] = L.Pooling(net.relu3_3, pool=P.Pooling.MAX, kernel_size=2, stride=2)
net.conv4_1 = L.Convolution(net[name], num_output=512, pad=1, kernel_size=3, **kwargs)
net.relu4_1 = L.ReLU(net.conv4_1, in_place=True)
net.conv4_2 = L.Convolution(net.relu4_1, num_output=512, pad=1, kernel_size=3, **kwargs)
net.relu4_2 = L.ReLU(net.conv4_2, in_place=True)
net.conv4_3 = L.Convolution(net.relu4_2, num_output=512, pad=1, kernel_size=3, **kwargs)
net.relu4_3 = L.ReLU(net.conv4_3, in_place=True)
if nopool:
name = 'conv4_4'
net[name] = L.Convolution(net.relu4_3, num_output=512, pad=1, kernel_size=3, stride=2, **kwargs)
else:
name = 'pool4'
if dilate_pool4:
net[name] = L.Pooling(net.relu4_3, pool=P.Pooling.MAX, kernel_size=3, stride=1, pad=1)
dilation = 2
else:
net[name] = L.Pooling(net.relu4_3, pool=P.Pooling.MAX, kernel_size=2, stride=2)
dilation = 1
kernel_size = 3
pad = int(int((kernel_size + (dilation - 1) * (kernel_size - 1)) - 1) / 2)
net.conv5_1 = L.Convolution(net[name], num_output=512, pad=pad, kernel_size=kernel_size, dilation=dilation, **kwargs)
net.relu5_1 = L.ReLU(net.conv5_1, in_place=True)
net.conv5_2 = L.Convolution(net.relu5_1, num_output=512, pad=pad, kernel_size=kernel_size, dilation=dilation, **kwargs)
net.relu5_2 = L.ReLU(net.conv5_2, in_place=True)
net.conv5_3 = L.Convolution(net.relu5_2, num_output=512, pad=pad, kernel_size=kernel_size, dilation=dilation, **kwargs)
net.relu5_3 = L.ReLU(net.conv5_3, in_place=True)
if need_fc:
if dilated:
if nopool:
name = 'conv5_4'
net[name] = L.Convolution(net.relu5_3, num_output=512, pad=1, kernel_size=3, stride=1, **kwargs)
else:
name = 'pool5'
net[name] = L.Pooling(net.relu5_3, pool=P.Pooling.MAX, pad=1, kernel_size=3, stride=1)
else:
if nopool:
name = 'conv5_4'
net[name] = L.Convolution(net.relu5_3, num_output=512, pad=1, kernel_size=3, stride=2, **kwargs)
else:
name = 'pool5'
net[name] = L.Pooling(net.relu5_3, pool=P.Pooling.MAX, kernel_size=2, stride=2)
if fully_conv:
if dilated:
if reduced:
dilation = dilation * 6
kernel_size = 3
num_output = 1024
else:
dilation = dilation * 2
kernel_size = 7
num_output = 4096
else:
if reduced:
dilation = dilation * 3
kernel_size = 3
num_output = 1024
else:
kernel_size = 7
num_output = 4096
pad = int(int((kernel_size + (dilation - 1) * (kernel_size - 1)) - 1) / 2)
net.fc6 = L.Convolution(net[name], num_output=num_output, pad=pad, kernel_size=kernel_size, dilation=dilation, **kwargs)
net.relu6 = L.ReLU(net.fc6, in_place=True)
if dropout:
net.drop6 = L.Dropout(net.relu6, dropout_ratio=0.5, in_place=True)
if reduced:
net.fc7 = L.Convolution(net.relu6, num_output=1024, kernel_size=1, **kwargs)
else:
net.fc7 = L.Convolution(net.relu6, num_output=4096, kernel_size=1, **kwargs)
net.relu7 = L.ReLU(net.fc7, in_place=True)
if dropout:
net.drop7 = L.Dropout(net.relu7, dropout_ratio=0.5, in_place=True)
else:
net.fc6 = L.InnerProduct(net.pool5, num_output=4096)
net.relu6 = L.ReLU(net.fc6, in_place=True)
if dropout:
net.drop6 = L.Dropout(net.relu6, dropout_ratio=0.5, in_place=True)
net.fc7 = L.InnerProduct(net.relu6, num_output=4096)
net.relu7 = L.ReLU(net.fc7, in_place=True)
if dropout:
net.drop7 = L.Dropout(net.relu7, dropout_ratio=0.5, in_place=True)
# Update freeze layers.
kwargs['param'] = [dict(lr_mult=0, decay_mult=0), dict(lr_mult=0, decay_mult=0)]
layers = net.keys()
for freeze_layer in freeze_layers:
if freeze_layer in layers:
net.update(freeze_layer, kwargs)
return net
def ResNet101Body(net, from_layer, use_pool5=True, use_dilation_conv5=False, **bn_param):
conv_prefix = ''
conv_postfix = ''
bn_prefix = 'bn_'
bn_postfix = ''
scale_prefix = 'scale_'
scale_postfix = ''
ConvBNLayer(net, from_layer, 'conv1', use_bn=True, use_relu=True,
num_output=64, kernel_size=7, pad=3, stride=2,
conv_prefix=conv_prefix, conv_postfix=conv_postfix,
bn_prefix=bn_prefix, bn_postfix=bn_postfix,
scale_prefix=scale_prefix, scale_postfix=scale_postfix, **bn_param)
net.pool1 = L.Pooling(net.conv1, pool=P.Pooling.MAX, kernel_size=3, stride=2)
ResBody(net, 'pool1', '2a', out2a=64, out2b=64, out2c=256, stride=1, use_branch1=True, **bn_param)
ResBody(net, 'res2a', '2b', out2a=64, out2b=64, out2c=256, stride=1, use_branch1=False, **bn_param)
ResBody(net, 'res2b', '2c', out2a=64, out2b=64, out2c=256, stride=1, use_branch1=False, **bn_param)
ResBody(net, 'res2c', '3a', out2a=128, out2b=128, out2c=512, stride=2, use_branch1=True, **bn_param)
from_layer = 'res3a'
for i in range(1, 4):
block_name = '3b{}'.format(i)
ResBody(net, from_layer, block_name, out2a=128, out2b=128, out2c=512, stride=1, use_branch1=False, **bn_param)
from_layer = 'res{}'.format(block_name)
ResBody(net, from_layer, '4a', out2a=256, out2b=256, out2c=1024, stride=2, use_branch1=True, **bn_param)
from_layer = 'res4a'
for i in range(1, 23):
block_name = '4b{}'.format(i)
ResBody(net, from_layer, block_name, out2a=256, out2b=256, out2c=1024, stride=1, use_branch1=False, **bn_param)
from_layer = 'res{}'.format(block_name)
stride = 2
dilation = 1
if use_dilation_conv5:
stride = 1
dilation = 2
ResBody(net, from_layer, '5a', out2a=512, out2b=512, out2c=2048, stride=stride, use_branch1=True, dilation=dilation, **bn_param)
ResBody(net, 'res5a', '5b', out2a=512, out2b=512, out2c=2048, stride=1, use_branch1=False, dilation=dilation, **bn_param)
ResBody(net, 'res5b', '5c', out2a=512, out2b=512, out2c=2048, stride=1, use_branch1=False, dilation=dilation, **bn_param)
if use_pool5:
net.pool5 = L.Pooling(net.res5c, pool=P.Pooling.AVE, global_pooling=True)
return net
def ResNet152Body(net, from_layer, use_pool5=True, use_dilation_conv5=False, **bn_param):
conv_prefix = ''
conv_postfix = ''
bn_prefix = 'bn_'
bn_postfix = ''
scale_prefix = 'scale_'
scale_postfix = ''
ConvBNLayer(net, from_layer, 'conv1', use_bn=True, use_relu=True,
num_output=64, kernel_size=7, pad=3, stride=2,
conv_prefix=conv_prefix, conv_postfix=conv_postfix,
bn_prefix=bn_prefix, bn_postfix=bn_postfix,
scale_prefix=scale_prefix, scale_postfix=scale_postfix, **bn_param)
net.pool1 = L.Pooling(net.conv1, pool=P.Pooling.MAX, kernel_size=3, stride=2)
ResBody(net, 'pool1', '2a', out2a=64, out2b=64, out2c=256, stride=1, use_branch1=True, **bn_param)
ResBody(net, 'res2a', '2b', out2a=64, out2b=64, out2c=256, stride=1, use_branch1=False, **bn_param)
ResBody(net, 'res2b', '2c', out2a=64, out2b=64, out2c=256, stride=1, use_branch1=False, **bn_param)
ResBody(net, 'res2c', '3a', out2a=128, out2b=128, out2c=512, stride=2, use_branch1=True, **bn_param)
from_layer = 'res3a'
for i in range(1, 8):
block_name = '3b{}'.format(i)
ResBody(net, from_layer, block_name, out2a=128, out2b=128, out2c=512, stride=1, use_branch1=False, **bn_param)
from_layer = 'res{}'.format(block_name)
ResBody(net, from_layer, '4a', out2a=256, out2b=256, out2c=1024, stride=2, use_branch1=True, **bn_param)
from_layer = 'res4a'
for i in range(1, 36):
block_name = '4b{}'.format(i)
ResBody(net, from_layer, block_name, out2a=256, out2b=256, out2c=1024, stride=1, use_branch1=False, **bn_param)
from_layer = 'res{}'.format(block_name)
stride = 2
dilation = 1
if use_dilation_conv5:
stride = 1
dilation = 2
ResBody(net, from_layer, '5a', out2a=512, out2b=512, out2c=2048, stride=stride, use_branch1=True, dilation=dilation, **bn_param)
ResBody(net, 'res5a', '5b', out2a=512, out2b=512, out2c=2048, stride=1, use_branch1=False, dilation=dilation, **bn_param)
ResBody(net, 'res5b', '5c', out2a=512, out2b=512, out2c=2048, stride=1, use_branch1=False, dilation=dilation, **bn_param)
if use_pool5:
net.pool5 = L.Pooling(net.res5c, pool=P.Pooling.AVE, global_pooling=True)
return net
def InceptionV3Body(net, from_layer, output_pred=False, **bn_param):
# scale is fixed to 1, thus we ignore it.
use_scale = False
out_layer = 'conv'
ConvBNLayer(net, from_layer, out_layer, use_bn=True, use_relu=True,
num_output=32, kernel_size=3, pad=0, stride=2, use_scale=use_scale,
**bn_param)
from_layer = out_layer
out_layer = 'conv_1'
ConvBNLayer(net, from_layer, out_layer, use_bn=True, use_relu=True,
num_output=32, kernel_size=3, pad=0, stride=1, use_scale=use_scale,
**bn_param)
from_layer = out_layer
out_layer = 'conv_2'
ConvBNLayer(net, from_layer, out_layer, use_bn=True, use_relu=True,
num_output=64, kernel_size=3, pad=1, stride=1, use_scale=use_scale,
**bn_param)
from_layer = out_layer
out_layer = 'pool'
net[out_layer] = L.Pooling(net[from_layer], pool=P.Pooling.MAX,
kernel_size=3, stride=2, pad=0)
from_layer = out_layer
out_layer = 'conv_3'
ConvBNLayer(net, from_layer, out_layer, use_bn=True, use_relu=True,
num_output=80, kernel_size=1, pad=0, stride=1, use_scale=use_scale,
**bn_param)
from_layer = out_layer
out_layer = 'conv_4'
ConvBNLayer(net, from_layer, out_layer, use_bn=True, use_relu=True,
num_output=192, kernel_size=3, pad=0, stride=1, use_scale=use_scale,
**bn_param)
from_layer = out_layer
out_layer = 'pool_1'
net[out_layer] = L.Pooling(net[from_layer], pool=P.Pooling.MAX,
kernel_size=3, stride=2, pad=0)
from_layer = out_layer
# inceptions with 1x1, 3x3, 5x5 convolutions
for inception_id in range(0, 3):
if inception_id == 0:
out_layer = 'mixed'
tower_2_conv_num_output = 32
else:
out_layer = 'mixed_{}'.format(inception_id)
tower_2_conv_num_output = 64
towers = []
tower_name = '{}'.format(out_layer)
tower = InceptionTower(net, from_layer, tower_name, [
dict(name='conv', num_output=64, kernel_size=1, pad=0, stride=1),
], **bn_param)
towers.append(tower)
tower_name = '{}/tower'.format(out_layer)
tower = InceptionTower(net, from_layer, tower_name, [
dict(name='conv', num_output=48, kernel_size=1, pad=0, stride=1),
dict(name='conv_1', num_output=64, kernel_size=5, pad=2, stride=1),
], **bn_param)
towers.append(tower)
tower_name = '{}/tower_1'.format(out_layer)
tower = InceptionTower(net, from_layer, tower_name, [
dict(name='conv', num_output=64, kernel_size=1, pad=0, stride=1),
dict(name='conv_1', num_output=96, kernel_size=3, pad=1, stride=1),
dict(name='conv_2', num_output=96, kernel_size=3, pad=1, stride=1),
], **bn_param)
towers.append(tower)
tower_name = '{}/tower_2'.format(out_layer)
tower = InceptionTower(net, from_layer, tower_name, [
dict(name='pool', pool=P.Pooling.AVE, kernel_size=3, pad=1, stride=1),
dict(name='conv', num_output=tower_2_conv_num_output, kernel_size=1, pad=0, stride=1),
], **bn_param)
towers.append(tower)
out_layer = '{}/join'.format(out_layer)
net[out_layer] = L.Concat(*towers, axis=1)
from_layer = out_layer
# inceptions with 1x1, 3x3(in sequence) convolutions
out_layer = 'mixed_3'
towers = []
tower_name = '{}'.format(out_layer)
tower = InceptionTower(net, from_layer, tower_name, [
dict(name='conv', num_output=384, kernel_size=3, pad=0, stride=2),
], **bn_param)
towers.append(tower)
tower_name = '{}/tower'.format(out_layer)
tower = InceptionTower(net, from_layer, tower_name, [
dict(name='conv', num_output=64, kernel_size=1, pad=0, stride=1),
dict(name='conv_1', num_output=96, kernel_size=3, pad=1, stride=1),
dict(name='conv_2', num_output=96, kernel_size=3, pad=0, stride=2),
], **bn_param)
towers.append(tower)
tower_name = '{}'.format(out_layer)
tower = InceptionTower(net, from_layer, tower_name, [
dict(name='pool', pool=P.Pooling.MAX, kernel_size=3, pad=0, stride=2),
], **bn_param)
towers.append(tower)
out_layer = '{}/join'.format(out_layer)
net[out_layer] = L.Concat(*towers, axis=1)
from_layer = out_layer
# inceptions with 1x1, 7x1, 1x7 convolutions
for inception_id in range(4, 8):
if inception_id == 4:
num_output = 128
elif inception_id == 5 or inception_id == 6:
num_output = 160
elif inception_id == 7:
num_output = 192
out_layer = 'mixed_{}'.format(inception_id)
towers = []
tower_name = '{}'.format(out_layer)
tower = InceptionTower(net, from_layer, tower_name, [
dict(name='conv', num_output=192, kernel_size=1, pad=0, stride=1),
], **bn_param)
towers.append(tower)
tower_name = '{}/tower'.format(out_layer)
tower = InceptionTower(net, from_layer, tower_name, [
dict(name='conv', num_output=num_output, kernel_size=1, pad=0, stride=1),
dict(name='conv_1', num_output=num_output, kernel_size=[1, 7], pad=[0, 3], stride=[1, 1]),
dict(name='conv_2', num_output=192, kernel_size=[7, 1], pad=[3, 0], stride=[1, 1]),
], **bn_param)
towers.append(tower)
tower_name = '{}/tower_1'.format(out_layer)
tower = InceptionTower(net, from_layer, tower_name, [
dict(name='conv', num_output=num_output, kernel_size=1, pad=0, stride=1),
dict(name='conv_1', num_output=num_output, kernel_size=[7, 1], pad=[3, 0], stride=[1, 1]),
dict(name='conv_2', num_output=num_output, kernel_size=[1, 7], pad=[0, 3], stride=[1, 1]),
dict(name='conv_3', num_output=num_output, kernel_size=[7, 1], pad=[3, 0], stride=[1, 1]),
dict(name='conv_4', num_output=192, kernel_size=[1, 7], pad=[0, 3], stride=[1, 1]),
], **bn_param)
towers.append(tower)
tower_name = '{}/tower_2'.format(out_layer)
tower = InceptionTower(net, from_layer, tower_name, [
dict(name='pool', pool=P.Pooling.AVE, kernel_size=3, pad=1, stride=1),
dict(name='conv', num_output=192, kernel_size=1, pad=0, stride=1),
], **bn_param)
towers.append(tower)
out_layer = '{}/join'.format(out_layer)
net[out_layer] = L.Concat(*towers, axis=1)
from_layer = out_layer
# inceptions with 1x1, 3x3, 1x7, 7x1 filters
out_layer = 'mixed_8'
towers = []
tower_name = '{}/tower'.format(out_layer)
tower = InceptionTower(net, from_layer, tower_name, [
dict(name='conv', num_output=192, kernel_size=1, pad=0, stride=1),
dict(name='conv_1', num_output=320, kernel_size=3, pad=0, stride=2),
], **bn_param)
towers.append(tower)
tower_name = '{}/tower_1'.format(out_layer)
tower = InceptionTower(net, from_layer, tower_name, [
dict(name='conv', num_output=192, kernel_size=1, pad=0, stride=1),
dict(name='conv_1', num_output=192, kernel_size=[1, 7], pad=[0, 3], stride=[1, 1]),
dict(name='conv_2', num_output=192, kernel_size=[7, 1], pad=[3, 0], stride=[1, 1]),
dict(name='conv_3', num_output=192, kernel_size=3, pad=0, stride=2),
], **bn_param)
towers.append(tower)
tower_name = '{}'.format(out_layer)
tower = InceptionTower(net, from_layer, tower_name, [
dict(name='pool', pool=P.Pooling.MAX, kernel_size=3, pad=0, stride=2),
], **bn_param)
towers.append(tower)
out_layer = '{}/join'.format(out_layer)
net[out_layer] = L.Concat(*towers, axis=1)
from_layer = out_layer
for inception_id in range(9, 11):
num_output = 384
num_output2 = 448
if inception_id == 9:
pool = P.Pooling.AVE
else:
pool = P.Pooling.MAX
out_layer = 'mixed_{}'.format(inception_id)
towers = []
tower_name = '{}'.format(out_layer)
tower = InceptionTower(net, from_layer, tower_name, [
dict(name='conv', num_output=320, kernel_size=1, pad=0, stride=1),
], **bn_param)
towers.append(tower)
tower_name = '{}/tower'.format(out_layer)
tower = InceptionTower(net, from_layer, tower_name, [
dict(name='conv', num_output=num_output, kernel_size=1, pad=0, stride=1),
], **bn_param)
subtowers = []
subtower_name = '{}/mixed'.format(tower_name)
subtower = InceptionTower(net, '{}/conv'.format(tower_name), subtower_name, [
dict(name='conv', num_output=num_output, kernel_size=[1, 3], pad=[0, 1], stride=[1, 1]),
], **bn_param)
subtowers.append(subtower)
subtower = InceptionTower(net, '{}/conv'.format(tower_name), subtower_name, [
dict(name='conv_1', num_output=num_output, kernel_size=[3, 1], pad=[1, 0], stride=[1, 1]),
], **bn_param)
subtowers.append(subtower)
net[subtower_name] = L.Concat(*subtowers, axis=1)
towers.append(net[subtower_name])
tower_name = '{}/tower_1'.format(out_layer)
tower = InceptionTower(net, from_layer, tower_name, [
dict(name='conv', num_output=num_output2, kernel_size=1, pad=0, stride=1),
dict(name='conv_1', num_output=num_output, kernel_size=3, pad=1, stride=1),
], **bn_param)
subtowers = []
subtower_name = '{}/mixed'.format(tower_name)
subtower = InceptionTower(net, '{}/conv_1'.format(tower_name), subtower_name, [
dict(name='conv', num_output=num_output, kernel_size=[1, 3], pad=[0, 1], stride=[1, 1]),
], **bn_param)
subtowers.append(subtower)
subtower = InceptionTower(net, '{}/conv_1'.format(tower_name), subtower_name, [
dict(name='conv_1', num_output=num_output, kernel_size=[3, 1], pad=[1, 0], stride=[1, 1]),
], **bn_param)
subtowers.append(subtower)
net[subtower_name] = L.Concat(*subtowers, axis=1)
towers.append(net[subtower_name])
tower_name = '{}/tower_2'.format(out_layer)
tower = InceptionTower(net, from_layer, tower_name, [
dict(name='pool', pool=pool, kernel_size=3, pad=1, stride=1),
dict(name='conv', num_output=192, kernel_size=1, pad=0, stride=1),
], **bn_param)
towers.append(tower)
out_layer = '{}/join'.format(out_layer)
net[out_layer] = L.Concat(*towers, axis=1)
from_layer = out_layer
if output_pred:
net.pool_3 = L.Pooling(net[from_layer], pool=P.Pooling.AVE, kernel_size=8, pad=0, stride=1)
net.softmax = L.InnerProduct(net.pool_3, num_output=1008)
net.softmax_prob = L.Softmax(net.softmax)
return net
def CreateMultiBoxHead(net, data_layer="data", num_classes=[], from_layers=[],
use_objectness=False, use_iou=False, normalizations=[], use_batchnorm=True, lr_mult=1,
use_scale=True, min_sizes=[], max_sizes=[], prior_variance = [0.1],
aspect_ratios=[], steps=[], img_height=0, img_width=0, share_location=True,
flip=True, clip=True, offset=0.5, inter_layer_depth=[], kernel_size=1, pad=0,
conf_postfix='', loc_postfix='', **bn_param):
assert num_classes, "must provide num_classes"
assert num_classes > 0, "num_classes must be positive number"
if normalizations:
assert len(from_layers) == len(normalizations), "from_layers and normalizations should have same length"
assert len(from_layers) == len(min_sizes), "from_layers and min_sizes should have same length"
if max_sizes:
assert len(from_layers) == len(max_sizes), "from_layers and max_sizes should have same length"
if aspect_ratios:
assert len(from_layers) == len(aspect_ratios), "from_layers and aspect_ratios should have same length"
if steps:
assert len(from_layers) == len(steps), "from_layers and steps should have same length"
net_layers = net.keys()
assert data_layer in net_layers, "data_layer is not in net's layers"
if inter_layer_depth:
assert len(from_layers) == len(inter_layer_depth), "from_layers and inter_layer_depth should have same length"
num = len(from_layers)
priorbox_layers = []
loc_layers = []
conf_layers = []
iou_layers = []
objectness_layers = []
for i in range(0, num):
from_layer = from_layers[i]
# Get the normalize value.
if normalizations:
if normalizations[i] != -1:
norm_name = "{}_norm".format(from_layer)
net[norm_name] = L.Normalize(net[from_layer], scale_filler=dict(type="constant", value=normalizations[i]),
across_spatial=False, channel_shared=False)
from_layer = norm_name
# Add intermediate layers.
if inter_layer_depth:
if inter_layer_depth[i] > 0:
inter_name = "{}_inter".format(from_layer)
ConvBNLayer(net, from_layer, inter_name, use_bn=use_batchnorm, use_relu=True, lr_mult=lr_mult,
num_output=inter_layer_depth[i], kernel_size=3, pad=1, stride=1, **bn_param)
from_layer = inter_name
# Estimate number of priors per location given provided parameters.
min_size = min_sizes[i]
if type(min_size) is not list: min_size = [min_size]
aspect_ratio = []
if len(aspect_ratios) > i:
aspect_ratio = aspect_ratios[i]
if type(aspect_ratio) is not list: aspect_ratio = [aspect_ratio]
max_size = []
if len(max_sizes) > i:
max_size = max_sizes[i]
if type(max_size) is not list: max_size = [max_size]
if max_size:
assert len(max_size) == len(min_size), "max_size and min_size should have same length."
if max_size:
num_priors_per_location = (2 + len(aspect_ratio)) * len(min_size)
else:
num_priors_per_location = (1 + len(aspect_ratio)) * len(min_size)
if flip:
num_priors_per_location += len(aspect_ratio) * len(min_size)
step = []
if len(steps) > i: step = steps[i]
# Create location prediction layer.
name = "{}_mbox_loc{}".format(from_layer, loc_postfix)
num_loc_output = num_priors_per_location * 4;
if not share_location:
num_loc_output *= num_classes
ConvBNLayer(net, from_layer, name, use_bn=use_batchnorm, use_relu=False, lr_mult=lr_mult,
num_output=num_loc_output, kernel_size=kernel_size, pad=pad, stride=1, **bn_param)
permute_name = "{}_perm".format(name)
net[permute_name] = L.Permute(net[name], order=[0, 2, 3, 1])
flatten_name = "{}_flat".format(name)
net[flatten_name] = L.Flatten(net[permute_name], axis=1)
loc_layers.append(net[flatten_name])
# Create confidence prediction layer.
name = "{}_mbox_conf{}".format(from_layer, conf_postfix)
num_conf_output = num_priors_per_location * num_classes;
ConvBNLayer(net, from_layer, name, use_bn=use_batchnorm, use_relu=False, lr_mult=lr_mult,
num_output=num_conf_output, kernel_size=kernel_size, pad=pad, stride=1, **bn_param)
permute_name = "{}_perm".format(name)
net[permute_name] = L.Permute(net[name], order=[0, 2, 3, 1])
flatten_name = "{}_flat".format(name)
net[flatten_name] = L.Flatten(net[permute_name], axis=1)
conf_layers.append(net[flatten_name])
# Create iou prediction layer.
if use_iou:
name = "{}_mbox_iou{}".format(from_layer, conf_postfix)
num_iou_output = num_priors_per_location
ConvBNLayer(net, from_layer, name, use_bn=use_batchnorm, use_relu=False, lr_mult=lr_mult,
num_output=num_iou_output, kernel_size=kernel_size, pad=pad, stride=1, **bn_param)
permute_name = "{}_perm".format(name)
net[permute_name] = L.Permute(net[name], order=[0, 2, 3, 1])
flatten_name = "{}_flat".format(name)
net[flatten_name] = L.Flatten(net[permute_name], axis=1)
iou_layers.append(net[flatten_name])
# Create prior generation layer.
name = "{}_mbox_priorbox".format(from_layer)
priorbox_param = {'min_size': min_size,
'clip': clip,
'offset': offset}
if max_size:
priorbox_param.update({'max_size': max_size})
if aspect_ratio:
priorbox_param.update({'aspect_ratio': aspect_ratio, 'flip': flip})
if step:
priorbox_param.update({'step': step})
if img_height != 0 and img_width != 0:
if img_height == img_width:
priorbox_param.update({'img_size': img_height})
else:
priorbox_param.update({'img_h': img_height, 'img_w': img_width})
net[name] = L.Python(net[from_layer], net['im_info'], module='layers.prior_box_layer',
layer='PriorBoxLayer', param_str=str(priorbox_param))
priorbox_layers.append(net[name])
# Create objectness prediction layer.
if use_objectness:
name = "{}_mbox_objectness".format(from_layer)
num_obj_output = num_priors_per_location * 2;
ConvBNLayer(net, from_layer, name, use_bn=use_batchnorm, use_relu=False, lr_mult=lr_mult,
num_output=num_obj_output, kernel_size=kernel_size, pad=pad, stride=1, **bn_param)
permute_name = "{}_perm".format(name)
net[permute_name] = L.Permute(net[name], order=[0, 2, 3, 1])
flatten_name = "{}_flat".format(name)
net[flatten_name] = L.Flatten(net[permute_name], axis=1)
objectness_layers.append(net[flatten_name])
# Concatenate priorbox, loc, and conf layers.
mbox_layers = []
name = "mbox_loc"
net[name] = L.Concat(*loc_layers, axis=1)
net['mbox_loc_reshape'] = L.Reshape(net[name], shape={'dim': [0, -1, 4]})
mbox_layers.append(net['mbox_loc_reshape'])
name = "mbox_conf"
net[name] = L.Concat(*conf_layers, axis=1)
net['mbox_conf_reshape'] = L.Reshape(net[name], shape={'dim': [0, -1, num_classes]})
mbox_layers.append(net['mbox_conf_reshape'])
if use_iou:
name = "mbox_iou"
net[name] = L.Concat(*iou_layers, axis=1)
net['mbox_iou_reshape'] = L.Reshape(net[name], shape={'dim': [0, -1]})
mbox_layers.append(net['mbox_iou_reshape'])
name = "mbox_priorbox"
net[name] = L.Concat(*priorbox_layers, axis=0)
mbox_layers.append(net[name])
if use_objectness:
name = "mbox_objectness"
net[name] = L.Concat(*objectness_layers, axis=1)
mbox_layers.append(net[name])
return mbox_layers
...@@ -354,15 +354,14 @@ class Function(object): ...@@ -354,15 +354,14 @@ class Function(object):
# Store for future development # Store for future development
self.meta_graph = meta_graph self.meta_graph = meta_graph
self.graph_name = meta_graph.name
# Call c api to create graph # Call c api to create graph
ws.CreateGraph(meta_graph) self.graph_name = ws.CreateGraph(meta_graph)
# Bind a lambda callback to run this graph # Bind a lambda callback to run this graph
callback_inputs = self.inputs if explicit_inputs else [] callback_inputs = self.inputs if explicit_inputs else []
self.callback = lambda *args, **kwargs: \ self.callback = lambda *args, **kwargs: \
ws.RunGraph(meta_graph.name, (callback_inputs, args), self.outputs, **kwargs) ws.RunGraph(self.graph_name, (callback_inputs, args), self.outputs, **kwargs)
# Self return # Self return
return self return self
...@@ -386,7 +385,7 @@ def function(inputs=None, outputs=None, givens=None, updater=None): ...@@ -386,7 +385,7 @@ def function(inputs=None, outputs=None, givens=None, updater=None):
---------- ----------
inputs : sequence of Tensor, optional inputs : sequence of Tensor, optional
The inputs to feed. The inputs to feed.
inputs : sequence of Tensor, optional outputs : sequence of Tensor, optional
The outputs to fetch. The outputs to fetch.
givens : dict of Tensor, optional givens : dict of Tensor, optional
The substitutions to use. The substitutions to use.
......
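The callback now runs the graph under the name returned by ws.CreateGraph rather than the local meta_graph.name, so whatever name the workspace actually registers is the one that gets executed. Front-end usage is unchanged; a sketch, assuming the dragon.vm.theano entry point:

    import dragon as dg
    import dragon.vm.theano as theano

    x = dg.Tensor('x', dtype='float32').Variable()
    y = dg.ops.Relu(x)
    # Compiling binds the callback to the graph name returned by the workspace.
    f = theano.function(inputs=[x], outputs=y)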
...@@ -60,6 +60,7 @@ class Gather(BaseModule): ...@@ -60,6 +60,7 @@ class Gather(BaseModule):
'n_inputs': 2, 'n_outputs': 1, 'n_inputs': 2, 'n_outputs': 1,
'arguments': { 'arguments': {
'axis': self.axis, 'axis': self.axis,
'zero_grad': True,
} }
} }
......
...@@ -188,15 +188,15 @@ inline void RetrieveRoIs( ...@@ -188,15 +188,15 @@ inline void RetrieveRoIs(
template <typename T> template <typename T>
inline int roi_level( inline int roi_level(
const int min_level, // e.g. 2 const int min_level,
const int max_level, // e.g. 5 const int max_level,
const int canonical_level, // e.g. 4 const int canonical_level,
const int canonical_scale, // e.g. 224 const int canonical_scale,
T* roi) { T* roi) {
T w = roi[3] - roi[1] + 1; T w = roi[3] - roi[1] + 1;
T h = roi[4] - roi[2] + 1; T h = roi[4] - roi[2] + 1;
// Refer to the settings of the paper // Refer to the settings of the paper
int level = canonical_level + (int)std::log( int level = canonical_level + std::log2(
std::max(std::sqrt(w * h), (T)1) / (T)canonical_scale); std::max(std::sqrt(w * h), (T)1) / (T)canonical_scale);
return std::min(max_level, std::max(min_level, level)); return std::min(max_level, std::max(min_level, level));
} }
......
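The switch to std::log2 matches the FPN level assignment k = k0 + log2(sqrt(w * h) / s0); the previous std::log was a natural logarithm. Checking the formula with the constants that were removed from the comments (2, 5, 4, 224):

    import math

    def roi_level(min_level, max_level, canonical_level, canonical_scale, w, h):
        # Mirrors the updated C++: base-2 log of the RoI scale relative to the
        # canonical scale, truncated to int and clamped to [min_level, max_level].
        level = canonical_level + int(
            math.log2(max(math.sqrt(w * h), 1.0) / canonical_scale))
        return min(max_level, max(min_level, level))

    assert roi_level(2, 5, 4, 224, 224, 224) == 4   # canonical RoI -> level 4
    assert roi_level(2, 5, 4, 224, 448, 448) == 5   # 2x larger     -> level 5
    assert roi_level(2, 5, 4, 224, 112, 112) == 3   # 2x smaller    -> level 3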
...@@ -80,7 +80,7 @@ void ProposalOp<Context>::RunWithType( ...@@ -80,7 +80,7 @@ void ProposalOp<Context>::RunWithType(
anchors_.Reshape({ A, 4 }); anchors_.Reshape({ A, 4 });
rcnn::GenerateAnchors<BT>(strides[i], rcnn::GenerateAnchors<BT>(strides[i],
(int)ratios.size(), 1, &ratios[0], &scales[0], (int)ratios.size(), 1, &ratios[0], &scales[i],
anchors_.template mutable_data<BT, CPUContext>()); anchors_.template mutable_data<BT, CPUContext>());
rcnn::GenerateGridAnchors<BT>( rcnn::GenerateGridAnchors<BT>(
......
...@@ -6,134 +6,93 @@ namespace dragon { ...@@ -6,134 +6,93 @@ namespace dragon {
namespace kernel { namespace kernel {
/*! CanonicalAxis <T = int32, Device = CPU> */
template <> void CanonicalAxis<int, CPUContext>(
const int count,
const int dim,
int* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) if (y[i] < 0) y[i] += dim;
}
/*! Gather <T = ?, Device = CPU> */ /*! Gather <T = ?, Device = CPU> */
template <typename T> template <typename T>
void _Gather( void _Gather(
const int count,
const int outer_dim, const int outer_dim,
const int inner_dim, const int inner_dim,
const int x_slice_dim, const int x_slice_dim,
const int y_slice_dim, const int y_slice_dim,
const int* indices, const int64_t* indices,
const T* x, const T* x,
T* y, T* y,
CPUContext* ctx) { CPUContext* ctx) {
int64_t x_offset, y_offset, x_idx_offset, y_idx_offset; int64_t x_offset, select_idx;
for (int i = 0; i < y_slice_dim; ++i) {
y_idx_offset = i;
x_idx_offset = indices[y_idx_offset];
for (int n = 0; n < outer_dim; ++n) { for (int n = 0; n < outer_dim; ++n) {
x_offset = (n * x_slice_dim + x_idx_offset) * inner_dim; for (int i = 0; i < y_slice_dim; ++i) {
y_offset = (n * y_slice_dim + y_idx_offset) * inner_dim; select_idx = indices[i];
select_idx = select_idx >= 0 ?
select_idx : select_idx + x_slice_dim;
x_offset = (n * x_slice_dim + select_idx) * inner_dim;
ctx->Copy<T, CPUContext, CPUContext>( ctx->Copy<T, CPUContext, CPUContext>(
inner_dim, y + y_offset, x + x_offset); inner_dim, y, x + x_offset);
y += inner_dim;
} }
} }
} }
/*! Gather <T = float32, Device = CPU> */
template <> void Gather<float, CPUContext>(
const int count,
const int outer_dim,
const int inner_dim,
const int x_slice_dim,
const int y_slice_dim,
const int* indices,
const float* x,
float* y,
CPUContext* ctx) {
_Gather<float>(count, outer_dim, inner_dim,
x_slice_dim, y_slice_dim, indices, x, y, ctx);
}
/*! Gather <T = int32, Device = CPU> */
template <> void Gather<int, CPUContext>(
const int count,
const int outer_dim,
const int inner_dim,
const int x_slice_dim,
const int y_slice_dim,
const int* indices,
const int* x,
int* y,
CPUContext* ctx) {
_Gather<int>(count, outer_dim, inner_dim,
x_slice_dim, y_slice_dim, indices, x, y, ctx);
}
/*! GatherGrad <T = ?, Device = CPU> */ /*! GatherGrad <T = ?, Device = CPU> */
template <typename T> template <typename T>
void _GatherGrad( void _GatherGrad(
const int count,
const int outer_dim, const int outer_dim,
const int inner_dim, const int inner_dim,
const int x_slice_dim, const int x_slice_dim,
const int y_slice_dim, const int y_slice_dim,
const int* indices, const int64_t* indices,
const T* dy, const T* dy,
T* dx, T* dx,
CPUContext* ctx) { CPUContext* ctx) {
int64_t x_offset, y_offset, x_idx_offset, y_idx_offset; int64_t x_offset, select_idx;
for (int i = 0; i < y_slice_dim; ++i) {
y_idx_offset = i;
x_idx_offset = indices[y_idx_offset];
for (int n = 0; n < outer_dim; ++n) { for (int n = 0; n < outer_dim; ++n) {
x_offset = (n * x_slice_dim + x_idx_offset) * inner_dim; for (int i = 0; i < y_slice_dim; ++i) {
y_offset = (n * y_slice_dim + y_idx_offset) * inner_dim; select_idx = indices[i];
select_idx = select_idx >= 0 ?
select_idx : select_idx + x_slice_dim;
x_offset = (n * x_slice_dim + select_idx) * inner_dim;
math::Add<T, CPUContext>(inner_dim, math::Add<T, CPUContext>(inner_dim,
dy + y_offset, dx + x_offset, dx + x_offset, ctx); dy, dx + x_offset, dx + x_offset, ctx);
dy += inner_dim;
} }
} }
} }
/*! GatherGrad <T = float32, Device = CPU> */ /*! Kernel Launchers */
template <> void GatherGrad<float, CPUContext>( #define DEFINE_GATHER_KERNEL_LAUNCHER(name, T) \
const int count, template <> void name<T, CPUContext>( \
const int outer_dim, const int outer_dim, \
const int inner_dim, const int inner_dim, \
const int x_slice_dim, const int x_slice_dim, \
const int y_slice_dim, const int y_slice_dim, \
const int* indices, const int64_t* indices, \
const float* dy, const T* x, \
float* dx, T* y, \
CPUContext* ctx) { CPUContext* ctx) { \
_GatherGrad<float>(count, outer_dim, inner_dim, _##name<T> \
x_slice_dim, y_slice_dim, indices, dy, dx, ctx); (outer_dim, inner_dim, x_slice_dim, \
} y_slice_dim, indices, x, y, ctx); \
}
/*! GatherGrad <T = int32, Device = CPU> */
template <> void GatherGrad<int, CPUContext>( DEFINE_GATHER_KERNEL_LAUNCHER(Gather, bool);
const int count, DEFINE_GATHER_KERNEL_LAUNCHER(Gather, int8_t);
const int outer_dim, DEFINE_GATHER_KERNEL_LAUNCHER(Gather, uint8_t);
const int inner_dim, DEFINE_GATHER_KERNEL_LAUNCHER(Gather, int);
const int x_slice_dim, DEFINE_GATHER_KERNEL_LAUNCHER(Gather, int64_t);
const int y_slice_dim, DEFINE_GATHER_KERNEL_LAUNCHER(Gather, float16);
const int* indices, DEFINE_GATHER_KERNEL_LAUNCHER(Gather, float);
const int* dy, DEFINE_GATHER_KERNEL_LAUNCHER(Gather, double);
int* dx,
CPUContext* ctx) { DEFINE_GATHER_KERNEL_LAUNCHER(GatherGrad, int8_t);
_GatherGrad<int>(count, outer_dim, inner_dim, DEFINE_GATHER_KERNEL_LAUNCHER(GatherGrad, uint8_t);
x_slice_dim, y_slice_dim, indices, dy, dx, ctx); DEFINE_GATHER_KERNEL_LAUNCHER(GatherGrad, int);
} DEFINE_GATHER_KERNEL_LAUNCHER(GatherGrad, int64_t);
DEFINE_GATHER_KERNEL_LAUNCHER(GatherGrad, float16);
DEFINE_GATHER_KERNEL_LAUNCHER(GatherGrad, float);
DEFINE_GATHER_KERNEL_LAUNCHER(GatherGrad, double);
#undef DEFINE_GATHER_KERNEL_LAUNCHER
} // namespace kernel } // namespace kernel
......
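The rewritten CPU backward walks dy contiguously and accumulates into dx, so duplicate indices sum their gradients, and the per-type launchers are now stamped out by the macro. A pure-Python reference of the accumulation (sketch only; dx is assumed to be pre-zeroed when zero_grad is set):

    def gather_grad_reference(dy, dx, outer_dim, inner_dim, x_slice_dim, indices):
        """dx[n, indices[i], k] += dy[n, i, k]; duplicate indices accumulate."""
        p = 0  # walks dy contiguously, like the C++ loops
        for n in range(outer_dim):
            for idx in indices:
                idx = idx if idx >= 0 else idx + x_slice_dim
                x_off = (n * x_slice_dim + idx) * inner_dim
                for k in range(inner_dim):
                    dx[x_off + k] += dy[p]
                    p += 1
        return dx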
...@@ -2,160 +2,176 @@ ...@@ -2,160 +2,176 @@
#include "core/context_cuda.h" #include "core/context_cuda.h"
#include "utils/op_kernel.h" #include "utils/op_kernel.h"
#include "utils/cub_device.h"
namespace dragon { namespace dragon {
namespace kernel { namespace kernel {
-/*! CanonicalAxis <T = int32, Device = CUDA> */
-
-template <typename T>
-__global__ void _CanonicalAxis(
-    const int count,
-    const int dim,
-    T* y) {
-    CUDA_1D_KERNEL_LOOP(idx, count) {
-        if (y[idx] < 0) y[idx] += dim;
-    }
-}
-
-template <> void CanonicalAxis<int, CUDAContext>(
-    const int count,
-    const int dim,
-    int* y,
-    CUDAContext* ctx) {
-    _CanonicalAxis<int>
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-             0, ctx->cuda_stream() >> >
-        (count, dim, y);
-}
-
 /*! Gather <T = ?, Device = CUDA> */

 template <typename T>
 __global__ void _Gather(
-    const int count,
-    const int outer_dim,
+    const int nthreads,
     const int inner_dim,
     const int x_slice_dim,
     const int y_slice_dim,
-    const int* indices,
+    const int64_t* indices,
     const T* x,
     T* y) {
-    CUDA_1D_KERNEL_LOOP(idx, count) {
-        const int outer_idx = idx / inner_dim / y_slice_dim;
-        const int slice_idx = idx % inner_dim;
-        const int y_idx_offset = (idx / inner_dim) % y_slice_dim;
-        const int x_idx_offset = indices[y_idx_offset];
-        const int x_idx = (outer_idx * x_slice_dim + x_idx_offset)
-            * inner_dim + slice_idx;
-        y[idx] = x[x_idx];
+    CUDA_1D_KERNEL_LOOP(y_idx, nthreads) {
+        const int outer_idx = y_idx / inner_dim / y_slice_dim;
+        const int inner_idx = y_idx % inner_dim;
+#if __CUDA_ARCH__ >= 350
+        int select_idx = __ldg(indices +
+            ((y_idx / inner_dim) % y_slice_dim));
+#else
+        int select_idx = indices[
+            (y_idx / inner_dim) % y_slice_dim];
+#endif
+        select_idx = select_idx >= 0 ?
+            select_idx : select_idx + x_slice_dim;
+        const int x_idx = (outer_idx * x_slice_dim + select_idx)
+            * inner_dim + inner_idx;
+        y[y_idx] = x[x_idx];
     }
 }
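The kernel's addressing is a base conversion of the flat output index into (outer, slice, inner) coordinates and back into a flat input index. A small host-side sketch of the same arithmetic (values and names are illustrative only):

#include <cstdio>

// Decompose a flat output index the way _Gather does, then rebuild the
// matching flat input index.
int main() {
    const int inner_dim = 4, x_slice_dim = 10, y_slice_dim = 3;
    const long long indices[y_slice_dim] = {7, -1, 2};   // -1 wraps to 9
    for (int y_idx = 0; y_idx < 2 * y_slice_dim * inner_dim; ++y_idx) {
        const int outer_idx = y_idx / inner_dim / y_slice_dim;
        const int inner_idx = y_idx % inner_dim;
        long long select = indices[(y_idx / inner_dim) % y_slice_dim];
        if (select < 0) select += x_slice_dim;
        const long long x_idx =
            (outer_idx * x_slice_dim + select) * inner_dim + inner_idx;
        printf("y[%d] <- x[%lld]\n", y_idx, x_idx);
    }
    return 0;
}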
-/*! Gather <T = float32, Device = CUDA> */
-
-template <> void Gather<float, CUDAContext>(
-    const int count,
-    const int outer_dim,
-    const int inner_dim,
-    const int x_slice_dim,
-    const int y_slice_dim,
-    const int* indices,
-    const float* x,
-    float* y,
-    CUDAContext* ctx) {
-    _Gather<float>
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-             0, ctx->cuda_stream() >> >
-        (count, outer_dim, inner_dim,
-            x_slice_dim, y_slice_dim,
-                indices, x, y);
-}
-
-/*! Gather <T = int32, Device = CUDA> */
-
-template <> void Gather<int, CUDAContext>(
-    const int count,
-    const int outer_dim,
-    const int inner_dim,
-    const int x_slice_dim,
-    const int y_slice_dim,
-    const int* indices,
-    const int* x,
-    int* y,
-    CUDAContext* ctx) {
-    _Gather<int>
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-             0, ctx->cuda_stream() >> >
-        (count, outer_dim, inner_dim,
-            x_slice_dim, y_slice_dim,
-                indices, x, y);
-}
 /*! GatherGrad <T = ?, Device = CUDA> */

 template <typename T>
 __global__ void _GatherGrad(
-    const int count,
-    const int outer_dim,
+    const int nthreads,
     const int inner_dim,
     const int x_slice_dim,
     const int y_slice_dim,
-    const int* indices,
+    const int64_t* indices,
     const T* dy,
     T* dx) {
-    CUDA_1D_KERNEL_LOOP(idx, count) {
-        const int outer_idx = idx / inner_dim / y_slice_dim;
-        const int slice_idx = idx % inner_dim;
-        const int y_idx_offset = (idx / inner_dim) % y_slice_dim;
-        const int x_idx_offset = indices[y_idx_offset];
-        const int x_idx = (outer_idx * x_slice_dim + x_idx_offset)
-            * inner_dim + slice_idx;
-        atomicAdd(dx + x_idx, dy[idx]);
+    CUDA_1D_KERNEL_LOOP(i, nthreads) {
+        const int outer_idx = i / inner_dim;
+        const int inner_idx = i % inner_dim;
+        for (int j = 0; j < y_slice_dim; ++j) {
+#if __CUDA_ARCH__ >= 350
+            int select_idx = __ldg(indices + j);
+#else
+            int select_idx = indices[j];
+#endif
+            select_idx = select_idx >= 0 ?
+                select_idx : select_idx + x_slice_dim;
+            const int x_idx = (outer_idx * x_slice_dim + select_idx)
+                * inner_dim + inner_idx;
+            const int y_idx = (outer_idx * y_slice_dim + j)
+                * inner_dim + inner_idx;
+            dx[x_idx] += dy[y_idx];
+        }
     }
 }
-/*! GatherGrad <T = float32, Device = CUDA> */
-
-template <> void GatherGrad<float, CUDAContext>(
-    const int count,
-    const int outer_dim,
-    const int inner_dim,
-    const int x_slice_dim,
-    const int y_slice_dim,
-    const int* indices,
-    const float* dy,
-    float* dx,
-    CUDAContext* ctx) {
-    _GatherGrad<float>
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-             0, ctx->cuda_stream() >> >
-        (count, outer_dim, inner_dim,
-            x_slice_dim, y_slice_dim,
-                indices, dy, dx);
-}
+/*! GatherGrad <T = float16, Device = CUDA> */
+
+template <> __global__ void _GatherGrad<half>(
+    const int nthreads,
+    const int inner_dim,
+    const int x_slice_dim,
+    const int y_slice_dim,
+    const int64_t* indices,
+    const half* dy,
+    half* dx) {
+    CUDA_1D_KERNEL_LOOP(i, nthreads) {
+#if __CUDA_ARCH__ >= 530
+        const int outer_idx = i / inner_dim;
+        const int inner_idx = i % inner_dim;
+        for (int j = 0; j < y_slice_dim; ++j) {
+            int select_idx = __ldg(indices + j);
+            select_idx = select_idx >= 0 ?
+                select_idx : select_idx + x_slice_dim;
+            const int x_idx = (outer_idx * x_slice_dim + select_idx)
+                * inner_dim + inner_idx;
+            const int y_idx = (outer_idx * y_slice_dim + j)
+                * inner_dim + inner_idx;
+            dx[x_idx] = __hadd(dx[x_idx], dy[y_idx]);
+        }
+#endif
+    }
+}
-/*! GatherGrad <T = int32, Device = CUDA> */
-
-template <> void GatherGrad<int, CUDAContext>(
-    const int count,
-    const int outer_dim,
-    const int inner_dim,
-    const int x_slice_dim,
-    const int y_slice_dim,
-    const int* indices,
-    const int* dy,
-    int* dx,
-    CUDAContext* ctx) {
-    _GatherGrad<int>
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-             0, ctx->cuda_stream() >> >
-        (count, outer_dim, inner_dim,
-            x_slice_dim, y_slice_dim,
-                indices, dy, dx);
-}
+/*! Kernel Launchers */
+
+#define DEFINE_GATHER_KERNEL_LAUNCHER(T) \
+    template <> void Gather<T, CUDAContext>( \
+        const int outer_dim, \
+        const int inner_dim, \
+        const int x_slice_dim, \
+        const int y_slice_dim, \
+        const int64_t* indices, \
+        const T* x, \
+        T* y, \
+        CUDAContext* ctx) { \
+        auto nthreads = outer_dim * y_slice_dim * inner_dim; \
+        _Gather<T> \
+            << < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
+                 0, ctx->cuda_stream() >> > \
+            (nthreads, inner_dim, x_slice_dim, \
+                y_slice_dim, indices, x, y); \
+    }
+
+#define DEFINE_GATHER_GRAD_KERNEL_LAUNCHER(T) \
+    template <> void GatherGrad<T, CUDAContext>( \
+        const int outer_dim, \
+        const int inner_dim, \
+        const int x_slice_dim, \
+        const int y_slice_dim, \
+        const int64_t* indices, \
+        const T* dy, \
+        T* dx, \
+        CUDAContext* ctx) { \
+        auto nthreads = outer_dim * inner_dim; \
+        _GatherGrad<T> \
+            << < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
+                 0, ctx->cuda_stream() >> > \
+            (nthreads, inner_dim, x_slice_dim, \
+                y_slice_dim, indices, dy, dx); \
+    }
+
+DEFINE_GATHER_KERNEL_LAUNCHER(bool);
+DEFINE_GATHER_KERNEL_LAUNCHER(int8_t);
+DEFINE_GATHER_KERNEL_LAUNCHER(uint8_t);
+DEFINE_GATHER_KERNEL_LAUNCHER(int);
+DEFINE_GATHER_KERNEL_LAUNCHER(int64_t);
+DEFINE_GATHER_KERNEL_LAUNCHER(float16);
+DEFINE_GATHER_KERNEL_LAUNCHER(float);
+DEFINE_GATHER_KERNEL_LAUNCHER(double);
+
+DEFINE_GATHER_GRAD_KERNEL_LAUNCHER(int8_t);
+DEFINE_GATHER_GRAD_KERNEL_LAUNCHER(uint8_t);
+DEFINE_GATHER_GRAD_KERNEL_LAUNCHER(int);
+DEFINE_GATHER_GRAD_KERNEL_LAUNCHER(int64_t);
+DEFINE_GATHER_GRAD_KERNEL_LAUNCHER(float);
+DEFINE_GATHER_GRAD_KERNEL_LAUNCHER(double);
+
+template <> void GatherGrad<float16, CUDAContext>(
+    const int outer_dim,
+    const int inner_dim,
+    const int x_slice_dim,
+    const int y_slice_dim,
+    const int64_t* indices,
+    const float16* dy,
+    float16* dx,
+    CUDAContext* ctx) {
+    auto nthreads = outer_dim * inner_dim;
+    _GatherGrad<half>
+        << < CUDA_BLOCKS(nthreads), CUDA_THREADS,
+             0, ctx->cuda_stream() >> >
+        (nthreads, inner_dim, x_slice_dim,
+            y_slice_dim, indices,
+                reinterpret_cast<const half*>(dy),
+                    reinterpret_cast<half*>(dx));
+}
+#undef DEFINE_GATHER_KERNEL_LAUNCHER
+#undef DEFINE_GATHER_GRAD_KERNEL_LAUNCHER

 }  // namespace kernel

 }  // namespace dragon
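The two launchers size their grids differently: the forward pass runs one thread per output element (outer_dim * y_slice_dim * inner_dim), while the backward pass runs one thread per (outer, inner) pair and loops over the gathered slices, which is what lets it drop atomicAdd. A small sketch of that sizing arithmetic, assuming CUDA_BLOCKS is the usual ceiling division by the block size (the constant 1024 below is an assumption, not taken from the diff):

#include <cstdio>

// Assumed block size; the real CUDA_THREADS constant is defined elsewhere in Dragon.
constexpr int kThreads = 1024;

// Presumed behaviour of CUDA_BLOCKS: ceiling division of work items by block size.
constexpr int Blocks(int nthreads) {
    return (nthreads + kThreads - 1) / kThreads;
}

int main() {
    const int outer_dim = 8, inner_dim = 512, y_slice_dim = 300;
    const int fwd = outer_dim * y_slice_dim * inner_dim;  // one thread per y element
    const int bwd = outer_dim * inner_dim;                // one thread per (outer, inner) pair
    printf("forward : %d threads in %d blocks\n", fwd, Blocks(fwd));
    printf("backward: %d threads in %d blocks\n", bwd, Blocks(bwd));
    return 0;
}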
...
-#include "contrib/onnx/onnx_backend.h"
+#include "onnx/onnx_backend.h"

 namespace dragon {
...
 #include "core/operator_schema.h"
 #include "utils/proto_utils.h"
-#include "contrib/onnx/onnx_backend.h"
+#include "onnx/onnx_backend.h"

 namespace dragon {
...
 /*!
  * Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
  *
  * Licensed under the BSD 2-Clause License.
  * You should have received a copy of the BSD 2-Clause License
  * along with the software. If not, See,
  *
  *     <https://opensource.org/licenses/BSD-2-Clause>
  *
  * Codes are based on:
  *
  *     <https://github.com/pytorch/pytorch/blob/master/caffe2/onnx/backend.h>
  *
  * ------------------------------------------------------------
  */

-#ifndef DRAGON_CONTRIB_ONNX_ONNX_BACKEND_H_
-#define DRAGON_CONTRIB_ONNX_ONNX_BACKEND_H_
+#ifndef DRAGON_ONNX_ONNX_BACKEND_H_
+#define DRAGON_ONNX_ONNX_BACKEND_H_

 #include "core/common.h"
 #include "proto/onnx.pb.h"
@@ -228,4 +228,4 @@ class ONNXBackend {

 }  // namespace dragon

-#endif  // DRAGON_CONTRIB_ONNX_ONNX_BACKEND_H_
+#endif  // DRAGON_ONNX_ONNX_BACKEND_H_
\ No newline at end of file
#include "utils/map_utils.h" #include "utils/map_utils.h"
#include "contrib/onnx/onnx_backend.h" #include "onnx/onnx_backend.h"
namespace dragon { namespace dragon {
......
#include "contrib/onnx/onnx_backend.h" #include "onnx/onnx_backend.h"
namespace dragon { namespace dragon {
......
@@ -13,12 +13,10 @@ namespace dragon {
 template <class Context> template <typename T>
 void GatherOp<Context>::RunWithType() {
     auto* Xdata = Input(0).template data<T, Context>();
-    auto* indices = Input(1).template mutable_data<int, Context>();
+    auto* indices = Input(1).template mutable_data<int64_t, Context>();
     auto* Ydata = Output(0)->template mutable_data<T, Context>();

-    kernel::CanonicalAxis(Input(1).count(), x_slice_dim, indices, ctx());
-
-    kernel::Gather(Output(0)->count(),
+    kernel::Gather(
         outer_dim, inner_dim,
             x_slice_dim, y_slice_dim,
                 indices, Xdata, Ydata, ctx());
@@ -28,22 +26,38 @@ template <class Context>
 void GatherOp<Context>::RunOnDevice() {
     DETERMINE_RUNTIME_ARGUMENTS(Input(0));

-    output_dims = Input(0).dims();
     x_slice_dim = Input(0).dim(axis);
-    output_dims[axis] = y_slice_dim = Input(1).count();
+    y_slice_dim = Input(1).count();
     outer_dim = Input(0).count(0, axis);
     inner_dim = Input(0).count(axis + 1);
     CHECK_GT(y_slice_dim, 0) << "\nLength of indices must > 0.";

+    const auto& s1 = Input(0).dims().begin();
+    const auto& e1 = s1 + axis, s3 = e1 + 1;
+    const auto& e3 = Input(0).dims().end();
+    const auto& s2 = Input(1).dims().begin();
+    const auto& e2 = Input(1).dims().end();
+    output_dims.assign(s1, e1);
+    output_dims.insert(output_dims.end(), s2, e2);
+    output_dims.insert(output_dims.end(), s3, e3);
+
     Output(0)->Reshape(output_dims);

-    CHECK(Input(1).template IsType<int>())
-        << "\nThe type of indices should be int32.";
+    CHECK(Input(1).template IsType<int64_t>())
+        << "\nThe type of indices should be int64.";

-    if (XIsType(Input(0), float)) RunWithType<float>();
+    if (XIsType(Input(0), bool)) RunWithType<bool>();
+    else if (XIsType(Input(0), int8_t)) RunWithType<int8_t>();
+    else if (XIsType(Input(0), uint8_t)) RunWithType<uint8_t>();
     else if (XIsType(Input(0), int)) RunWithType<int>();
-    else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "int32" });
+    else if (XIsType(Input(0), int64_t)) RunWithType<int64_t>();
+    else if (XIsType(Input(0), float16)) RunWithType<float16>();
+    else if (XIsType(Input(0), float)) RunWithType<float>();
+    else if (XIsType(Input(0), double)) RunWithType<double>();
+    else LOG(FATAL) << DTypeHelper(Input(0), {
+        "bool", "int8", "uint8", "int32", "int64",
+            "float16", "float32", "float64",
+    });
 }

 DEPLOY_CPU(Gather);
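The new RunOnDevice builds the output shape by splicing the index tensor's shape into the input shape in place of the gathered axis, i.e. y.shape = x.shape[:axis] + indices.shape + x.shape[axis+1:], which is what the s1/e1, s2/e2, s3/e3 iterators above express. A small illustrative sketch of the same splice (std::vector stand-ins, not the operator's actual types):

#include <cstdint>
#include <iostream>
#include <vector>

// Splice the indices' shape into the input shape at `axis`, as the optimized
// GatherOp does when reshaping its output. Purely illustrative helper.
std::vector<int64_t> GatherOutputDims(
    const std::vector<int64_t>& x_dims,
    const std::vector<int64_t>& index_dims,
    int axis) {
    std::vector<int64_t> out(x_dims.begin(), x_dims.begin() + axis);
    out.insert(out.end(), index_dims.begin(), index_dims.end());
    out.insert(out.end(), x_dims.begin() + axis + 1, x_dims.end());
    return out;
}

int main() {
    // An index tensor of shape (5, 2) gathered along axis 1 of a (4, 10, 3)
    // tensor yields an output of shape (4, 5, 2, 3).
    for (int64_t d : GatherOutputDims({4, 10, 3}, {5, 2}, 1)) std::cout << d << ' ';
    std::cout << '\n';
    return 0;
}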
@@ -54,18 +68,17 @@ OPERATOR_SCHEMA(Gather).NumInputs(2).NumOutputs(1);
 template <class Context> template <typename T>
 void GatherGradientOp<Context>::RunWithType() {
-    auto* indices = Input(1).template data<int, Context>();
+    auto* indices = Input(1).template data<int64_t, Context>();
     auto* dYdata = Input(-1).template data<T, Context>();
+    auto* dXdata = Output(0)->template mutable_data<T, Context>();

-    T* dXdata = nullptr;
-    if (!acc_grad) {
-        dXdata = Output(0)->template mutable_data<T, Context>();
-        math::Set(Output(0)->count(), cast::to<T>(0.f), dXdata, ctx());
-    } else {
-        dXdata = Output(0)->template mutable_data<T, Context>();
+    // Zero the gradients Optionally
+    if (zero_grad) {
+        math::Set(Output(0)->count(),
+            cast::to<T>(0.f), dXdata, ctx());
     }

-    kernel::GatherGrad(Input(-1).count(),
+    kernel::GatherGrad(
         outer_dim, inner_dim,
             x_slice_dim, y_slice_dim,
                 indices, dYdata, dXdata, ctx());
@@ -82,12 +95,20 @@ void GatherGradientOp<Context>::RunOnDevice() {

     Output(0)->ReshapeLike(Input(0));

-    CHECK(Input(1).template IsType<int>())
-        << "\nThe type of indices should be int32.";
+    CHECK(Input(1).template IsType<int64_t>())
+        << "\nThe type of indices should be int64.";

-    if (XIsType(Input(0), float)) RunWithType<float>();
+    if (XIsType(Input(0), int8_t)) RunWithType<int8_t>();
+    else if (XIsType(Input(0), uint8_t)) RunWithType<uint8_t>();
     else if (XIsType(Input(0), int)) RunWithType<int>();
-    else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "int32" });
+    else if (XIsType(Input(0), int64_t)) RunWithType<int64_t>();
+    else if (XIsType(Input(0), float16)) RunWithType<float16>();
+    else if (XIsType(Input(0), float)) RunWithType<float>();
+    else if (XIsType(Input(0), double)) RunWithType<double>();
+    else LOG(FATAL) << DTypeHelper(Input(0), {
+        "int8", "uint8", "int32", "int64",
+            "float16", "float32", "float64",
+    });
 }

 DEPLOY_CPU(GatherGradient);
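The gradient op now exposes a zero_grad switch in place of acc_grad: when it is set, dX is cleared before the scatter-add, otherwise the kernel accumulates into whatever is already in dX. A host-side sketch of that contract (the function name and container types are illustrative only):

#include <algorithm>
#include <cstdint>
#include <vector>

// Scatter-add reference for the gather gradient: dy has shape
// (outer_dim, y_slice_dim, inner_dim) and is accumulated into dx of shape
// (outer_dim, x_slice_dim, inner_dim). Mirrors the zero_grad switch.
void ReferenceGatherGrad(
    const std::vector<float>& dy,
    const std::vector<int64_t>& indices,
    int outer_dim, int x_slice_dim, int inner_dim,
    bool zero_grad,
    std::vector<float>* dx) {
    if (zero_grad) std::fill(dx->begin(), dx->end(), 0.f);
    const int y_slice_dim = static_cast<int>(indices.size());
    for (int n = 0; n < outer_dim; ++n) {
        for (int i = 0; i < y_slice_dim; ++i) {
            int64_t select = indices[i];
            if (select < 0) select += x_slice_dim;
            for (int k = 0; k < inner_dim; ++k) {
                (*dx)[(n * x_slice_dim + select) * inner_dim + k] +=
                    dy[(n * y_slice_dim + i) * inner_dim + k];
            }
        }
    }
}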
...
@@ -15,6 +15,27 @@ void DropBlock2dOp<Context>::RunWithType() {
                 Output(0)->count(), Ydata, Xdata);
         }
     } else if (phase() == "TRAIN") {
+        if (data_format == "NCHW") {
+            n = Input(0).dim(0), c = Input(0).dim(1);
+            h = Input(0).dim(2), w = Input(0).dim(3);
+        } else if (data_format == "NHWC") {
+            n = Input(0).dim(0), c = Input(0).dim(-1);
+            h = Input(0).dim(1), w = Input(0).dim(2);
+        }
+        seed_h = h - block_size + 1;
+        seed_w = w - block_size + 1;
+        CHECK(seed_h > 0 && seed_w > 0)
+            << "\nExcepted block_size <= feat_size.";
+        if (decrement > 0 && apply_prob > keep_prob()) {
+            apply_prob -= decrement;
+        } else { apply_prob = keep_prob(); }
+        gamma = (1.f - apply_prob) / (block_size * block_size);
+        gamma *= (alpha * (h * w) / (seed_h * seed_w));
+
         auto* mask = ws()->CreateTensor(mount_name(
             "drop_block/mask"))->ReshapeLike(Input(0));
         auto* norm = ws()->CreateTensor(mount_name(
@@ -58,29 +79,8 @@ void DropBlock2dOp<Context>::RunWithType() {

 template <class Context>
 void DropBlock2dOp<Context>::RunOnDevice() {
-    if (data_format == "NCHW") {
-        n = Input(0).dim(0), c = Input(0).dim(1);
-        h = Input(0).dim(2), w = Input(0).dim(3);
-    } else if (data_format == "NHWC") {
-        n = Input(0).dim(0), c = Input(0).dim(-1);
-        h = Input(0).dim(1), w = Input(0).dim(2);
-    }
-    seed_h = h - block_size + 1;
-    seed_w = w - block_size + 1;
-    CHECK(seed_h > 0 && seed_w > 0)
-        << "\nExcepted block_size <= feat_size.";
-
     Output(0)->ReshapeLike(Input(0));

-    if (decrement > 0 && apply_prob > keep_prob()) {
-        apply_prob -= decrement;
-    } else { apply_prob = keep_prob(); }
-
-    gamma = (1.f - apply_prob) / (block_size * block_size);
-    gamma *= (alpha * (h * w) / (seed_h * seed_w));
-
     if (XIsType(Input(0), float)) RunWithType<float>();
     else if (XIsType(Input(0), float16)) RunWithType<float16>();
     else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
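For the DropBlock change: the seed region and drop probability gamma are now derived inside RunWithType, with gamma following the usual DropBlock correction gamma = (1 - keep_prob) / block_size^2 * (feat_area / seed_area), scaled by the alpha knob shown in the diff. A small numeric sketch of that formula (the sample values are made up):

#include <cstdio>

int main() {
    // Assumed sample values; only the formula mirrors the diff.
    const float keep_prob = 0.9f, alpha = 1.f;
    const int h = 56, w = 56, block_size = 7;
    const int seed_h = h - block_size + 1;   // 50
    const int seed_w = w - block_size + 1;   // 50
    float gamma = (1.f - keep_prob) / (block_size * block_size);
    gamma *= alpha * (h * w) / float(seed_h * seed_w);
    printf("seed region: %dx%d, gamma = %f\n", seed_h, seed_w, gamma);
    return 0;
}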