Commit a739c49b by Ting PAN

Optimize the CuDNNDepthwiseConv2d

1 parent 0936a502
...@@ -107,7 +107,7 @@ class CuDNNConv2dOp final : public Conv2dOp<Context> { ...@@ -107,7 +107,7 @@ class CuDNNConv2dOp final : public Conv2dOp<Context> {
}; };
template <class Context> template <class Context>
class CuDNNConv2dGradientOp final : public Conv2dGradientOp<Context> { class CuDNNConv2dGradientOp : public Conv2dGradientOp<Context> {
public: public:
CuDNNConv2dGradientOp(const OperatorDef& def, Workspace* ws) CuDNNConv2dGradientOp(const OperatorDef& def, Workspace* ws)
: Conv2dGradientOp<Context>(def, ws), : Conv2dGradientOp<Context>(def, ws),
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
#ifndef DRAGON_OPERATORS_VISION_DEPTHWISE_CONV_OP_H_ #ifndef DRAGON_OPERATORS_VISION_DEPTHWISE_CONV_OP_H_
#define DRAGON_OPERATORS_VISION_DEPTHWISE_CONV_OP_H_ #define DRAGON_OPERATORS_VISION_DEPTHWISE_CONV_OP_H_
#include "operators/vision/conv_op_base.h" #include "operators/vision/conv_op.h"
namespace dragon { namespace dragon {
...@@ -62,10 +62,10 @@ template <class Context> ...@@ -62,10 +62,10 @@ template <class Context>
class CuDNNDepthwiseConv2dOp final class CuDNNDepthwiseConv2dOp final
: public DepthwiseConv2dOp<Context> { : public DepthwiseConv2dOp<Context> {
public: public:
CuDNNDepthwiseConv2dOp( CuDNNDepthwiseConv2dOp(
const OperatorDef& def, const OperatorDef& def,
Workspace* ws) Workspace* ws)
: DepthwiseConv2dOp<Context>(def, ws) { : DepthwiseConv2dOp<Context>(def, ws) {
CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc)); CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc));
CUDNN_CHECK(cudnnCreateTensorDescriptor(&output_desc)); CUDNN_CHECK(cudnnCreateTensorDescriptor(&output_desc));
} }
...@@ -86,12 +86,12 @@ class CuDNNDepthwiseConv2dOp final ...@@ -86,12 +86,12 @@ class CuDNNDepthwiseConv2dOp final
template <class Context> template <class Context>
class CuDNNDepthwiseConv2dGradientOp final class CuDNNDepthwiseConv2dGradientOp final
: public DepthwiseConv2dGradientOp<Context> { : public CuDNNConv2dGradientOp<Context> {
public: public:
CuDNNDepthwiseConv2dGradientOp( CuDNNDepthwiseConv2dGradientOp(
const OperatorDef& def, const OperatorDef& def,
Workspace* ws) Workspace* ws)
: DepthwiseConv2dGradientOp<Context>(def, ws) { : CuDNNConv2dGradientOp<Context>(def, ws) {
CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc)); CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc));
CUDNN_CHECK(cudnnCreateTensorDescriptor(&input_desc)); CUDNN_CHECK(cudnnCreateTensorDescriptor(&input_desc));
} }
......
...@@ -941,22 +941,20 @@ void GroupNormBackward( ...@@ -941,22 +941,20 @@ void GroupNormBackward(
template <typename T, class Context> template <typename T, class Context>
void LSTMCell( void LSTMCell(
const int count,
const int N, const int N,
const int C, const int C,
const T* cx, const T* cx,
T* xact, T* actx,
T* c, T* c,
T* h, T* h,
Context* ctx); Context* ctx);
template <typename T, class Context> template <typename T, class Context>
void LSTMCellGrad( void LSTMCellGrad(
const int count,
const int N, const int N,
const int C, const int C,
const T* cx, const T* cx,
const T* xact, const T* actx,
const T* c, const T* c,
const T* dc, const T* dc,
const T* dh, const T* dh,
......
...@@ -637,6 +637,35 @@ class OperatorHelper(object): ...@@ -637,6 +637,35 @@ class OperatorHelper(object):
return outputs return outputs
@classmethod @classmethod
def _apply_Flatten(cls, arguments, inputs, outputs):
outputs[0].dtype = inputs[0].dtype
keep_axes = arguments['keep_axes']
axis, num_axes = arguments['axis'], arguments['num_axes']
try:
fake_shape = inputs[0].shape[:]
fake_shape = [1 if dim is None else dim for dim in fake_shape]
if keep_axes is not None:
keep_axes = min(keep_axes, len(inputs.shape))
total_count = numpy.prod(fake_shape)
outputs[0].shape = []
for i in range(keep_axes - 1):
outputs[0].shape.append(inputs[0].shape[i])
total_count *= fake_shape[i]
if total_count != 1:
outputs[0].shape.append(total_count)
else:
if num_axes == -1:
num_axes = len(inputs[0].shape) - axis
num_axes = max(num_axes, 1)
num_flatten = numpy.prod(fake_shape[axis : axis + num_axes])
outputs[0].shape = \
inputs[0].shape[: axis] + [num_flatten] \
+ inputs[0].shape[axis + num_axes:]
except:
pass
return outputs
@classmethod
def _apply_Reshape(cls, arguments, inputs, outputs): def _apply_Reshape(cls, arguments, inputs, outputs):
outputs[0].dtype = inputs[0].dtype outputs[0].dtype = inputs[0].dtype
shape = arguments['dims'] shape = arguments['dims']
......
...@@ -507,17 +507,21 @@ class Tensor(object): ...@@ -507,17 +507,21 @@ class Tensor(object):
def _from_constants(self, value): def _from_constants(self, value):
if not isinstance(value, numpy.ndarray): if not isinstance(value, numpy.ndarray):
try: try:
value = numpy.array(value, dtype=self.dtype value = numpy.array(
if self.dtype else 'float32') value, dtype=self.dtype
if self.dtype else 'float32')
except: except:
raise TypeError( raise TypeError(
'Can not convert the value to Tensor or numpy array.') 'Can not convert the value to Tensor or numpy array.')
ref_tensor = Tensor.Ref( return Tensor.Ref(
name=_workspace.GetDummyName('Constant', name=_workspace.GetDummyName(
domain='Tensor', zero_based=False), basename='Constant',
shape=list(value.shape), dtype=str(value.dtype)) domain='Tensor',
ref_tensor.set_value(value) zero_based=False,
return ref_tensor ),
shape=list(value.shape),
dtype=str(value.dtype),
).set_value(value)
def __add__(self, other): def __add__(self, other):
"""Calculate x + y. """Calculate x + y.
......
...@@ -770,8 +770,9 @@ def _stringify_tensor(obj): ...@@ -770,8 +770,9 @@ def _stringify_tensor(obj):
class _DefaultWorkspaceStack(_tls.Stack): class _DefaultWorkspaceStack(_tls.Stack):
"""A thread-local stack of objects for """A thread-local stack of objects for
providing an implicit default workspace.""" providing an implicit default workspace.
"""
def __init__(self): def __init__(self):
super(_DefaultWorkspaceStack, self).__init__() super(_DefaultWorkspaceStack, self).__init__()
self._global_default_workspace = None self._global_default_workspace = None
......
...@@ -165,7 +165,7 @@ def Tanh(inputs, **kwargs): ...@@ -165,7 +165,7 @@ def Tanh(inputs, **kwargs):
@OpSchema.Inputs(1) @OpSchema.Inputs(1)
@ArgumentHelper.Desc('prob', as_target=False) @ArgumentHelper.Desc('prob', as_target=True)
def Dropout(inputs, prob=0.5, scale=True, **kwargs): def Dropout(inputs, prob=0.5, scale=True, **kwargs):
"""Randomly set a unit into zero. `[Srivastava et.al, 2014] <http://jmlr.org/papers/v15/srivastava14a.html>`_. """Randomly set a unit into zero. `[Srivastava et.al, 2014] <http://jmlr.org/papers/v15/srivastava14a.html>`_.
......
...@@ -516,4 +516,4 @@ def MovingAverage(inputs, decay, **kwargs): ...@@ -516,4 +516,4 @@ def MovingAverage(inputs, decay, **kwargs):
The outputs, i.e., the *y*. The outputs, i.e., the *y*.
""" """
return Accumulate(inputs, 1 - decay, decay, **kwargs) return Accumulate(inputs, 1. - decay, decay, **kwargs)
\ No newline at end of file \ No newline at end of file
...@@ -38,9 +38,11 @@ def Gather(inputs, indices, axis=0, **kwargs): ...@@ -38,9 +38,11 @@ def Gather(inputs, indices, axis=0, **kwargs):
""" """
arguments = ParseArgs(locals()) arguments = ParseArgs(locals())
if not isinstance(indices, Tensor):
indices = Tensor.Ref('', dtype='int64') \
._from_constants(indices)
arguments['inputs'], arguments['indices'] = \ arguments['inputs'], arguments['indices'] = \
[arguments['inputs'], Tensor.Convert( [arguments['inputs'], indices], None
indices, dtype='int64')], None
return Tensor.CreateOperator('Gather', **arguments) return Tensor.CreateOperator('Gather', **arguments)
...@@ -48,9 +50,13 @@ def Gather(inputs, indices, axis=0, **kwargs): ...@@ -48,9 +50,13 @@ def Gather(inputs, indices, axis=0, **kwargs):
@ArgumentHelper.RepeatedDesc('starts') @ArgumentHelper.RepeatedDesc('starts')
@ArgumentHelper.RepeatedDesc('sizes') @ArgumentHelper.RepeatedDesc('sizes')
def Crop( def Crop(
inputs, starts=None, sizes=None, inputs,
start_axis=None, offsets=None, starts=None,
shape_like=None, **kwargs sizes=None,
start_axis=None,
offsets=None,
shape_like=None,
**kwargs
): ):
"""Crop the input according to the given starts and sizes. """Crop the input according to the given starts and sizes.
...@@ -274,7 +280,14 @@ def Mean(inputs, axes=None, keep_dims=False, **kwargs): ...@@ -274,7 +280,14 @@ def Mean(inputs, axes=None, keep_dims=False, **kwargs):
@OpSchema.Inputs(1) @OpSchema.Inputs(1)
def _ArgReduce(inputs, axis=None, operation='ARGMAX', top_k=1, keep_dims=False, **kwargs): def _ArgReduce(
inputs,
axis=None,
operation='ARGMAX',
top_k=1,
keep_dims=False,
**kwargs
):
arguments = ParseArgs(locals()) arguments = ParseArgs(locals())
arguments['axis'] = arguments['axis'] if arguments else INT_MAX arguments['axis'] = arguments['axis'] if arguments else INT_MAX
return Tensor.CreateOperator('ArgReduce', num_outputs=2, **arguments) return Tensor.CreateOperator('ArgReduce', num_outputs=2, **arguments)
...@@ -577,33 +590,7 @@ def Flatten(inputs, axis=0, num_axes=-1, keep_axes=None, **kwargs): ...@@ -577,33 +590,7 @@ def Flatten(inputs, axis=0, num_axes=-1, keep_axes=None, **kwargs):
>>> [24] >>> [24]
""" """
arguments = ParseArgs(locals()) return Tensor.CreateOperator(op_type='Flatten', **ParseArgs(locals()))
output = Tensor.CreateOperator(op_type='Flatten', **arguments)
if inputs.shape is not None:
fake_shape = inputs.shape[:]
fake_shape = [1 if dim is None else dim for dim in fake_shape]
if keep_axes is not None:
if keep_axes > len(inputs.shape):
raise ValueError(
'The total number of axes is {}, can not keep {}.'
.format(len(inputs.shape), keep_axes))
total_count = np.prod(fake_shape)
output.shape = []
for i in range(keep_axes - 1):
output.shape.append(inputs.shape[i])
total_count *= fake_shape[i]
if total_count != 1:
output.shape.append(total_count)
else:
if num_axes == -1: num_axes = len(inputs.shape) - axis
elif num_axes == 0:
raise ValueError('num_axes must > 0 or be -1.')
num_flatten = np.prod(fake_shape[axis : axis + num_axes])
output.shape = inputs.shape[: axis] + [num_flatten] + inputs.shape[axis + num_axes :]
return output
@OpSchema.Inputs(1) @OpSchema.Inputs(1)
...@@ -676,20 +663,7 @@ def Squeeze(inputs, axis=None, **kwargs): ...@@ -676,20 +663,7 @@ def Squeeze(inputs, axis=None, **kwargs):
>>> print(Squeeze(a, axis=0).shape) >>> print(Squeeze(a, axis=0).shape)
""" """
arguments = ParseArgs(locals()) return Tensor.CreateOperator(op_type='Squeeze', **ParseArgs(locals()))
output = Tensor.CreateOperator(op_type='Squeeze', **arguments)
if inputs.shape is not None:
output_shape = []
if axis: axis += (0 if axis >= 0 else len(inputs.shape))
for idx, dim in enumerate(inputs.shape[:]):
if dim != 1 or \
(axis and dim == 1 and idx != axis):
output_shape.append(dim)
output.shape = output_shape
return output
@OpSchema.Inputs(1) @OpSchema.Inputs(1)
......
...@@ -84,8 +84,13 @@ def LMDBData(**kwargs): ...@@ -84,8 +84,13 @@ def LMDBData(**kwargs):
@OpSchema.Inputs(1) @OpSchema.Inputs(1)
def ImageData( def ImageData(
inputs, mean_values=None, std_values=None, inputs,
dtype='float32', data_format='NCHW', **kwargs): mean_values=None,
std_values=None,
dtype='float32',
data_format='NCHW',
**kwargs
):
"""Process the images from 4D raw data. """Process the images from 4D raw data.
Note that we assume the data format of raw data is **NHWC**. Note that we assume the data format of raw data is **NHWC**.
......
...@@ -19,8 +19,12 @@ from .activation import Softmax ...@@ -19,8 +19,12 @@ from .activation import Softmax
@OpSchema.Inputs(2) @OpSchema.Inputs(2)
def NLLLoss( def NLLLoss(
inputs, axis=1, normalization='VALID', inputs,
ignore_labels=(), **kwargs): axis=1,
normalization='VALID',
ignore_labels=(),
**kwargs
):
"""Compute the negative likelihood loss with sparse labels. """Compute the negative likelihood loss with sparse labels.
**Type Constraints**: **Type Constraints**:
...@@ -36,7 +40,7 @@ def NLLLoss( ...@@ -36,7 +40,7 @@ def NLLLoss(
axis : int, optional axis : int, optional
The axis to apply softmax, can be negative. The axis to apply softmax, can be negative.
normalization : {'UNIT', 'FULL', 'VALID', 'BATCH_SIZE', 'NONE'}, optional normalization : {'UNIT', 'FULL', 'VALID', 'BATCH_SIZE', 'NONE'}, optional
The normalization method. The method of normalization.
ignore_labels : sequence of int, optional, default=() ignore_labels : sequence of int, optional, default=()
The label id to ignore. The label id to ignore.
...@@ -55,8 +59,12 @@ def NLLLoss( ...@@ -55,8 +59,12 @@ def NLLLoss(
@OpSchema.Inputs(2) @OpSchema.Inputs(2)
def SparseSoftmaxCrossEntropy( def SparseSoftmaxCrossEntropy(
inputs, axis=1, normalization='VALID', inputs,
ignore_labels=(), **kwargs): axis=1,
normalization='VALID',
ignore_labels=(),
**kwargs
):
"""Compute the softmax cross entropy with sparse labels. """Compute the softmax cross entropy with sparse labels.
**Type Constraints**: **Type Constraints**:
...@@ -72,7 +80,7 @@ def SparseSoftmaxCrossEntropy( ...@@ -72,7 +80,7 @@ def SparseSoftmaxCrossEntropy(
axis : int, optional axis : int, optional
The axis to apply softmax, can be negative. The axis to apply softmax, can be negative.
normalization : {'UNIT', 'FULL', 'VALID', 'BATCH_SIZE', 'NONE'}, optional normalization : {'UNIT', 'FULL', 'VALID', 'BATCH_SIZE', 'NONE'}, optional
The normalization method. The method of normalization.
ignore_labels : sequence of int, optional, default=() ignore_labels : sequence of int, optional, default=()
The label id to ignore. The label id to ignore.
...@@ -100,7 +108,7 @@ def SigmoidCrossEntropy(inputs, normalization='VALID', **kwargs): ...@@ -100,7 +108,7 @@ def SigmoidCrossEntropy(inputs, normalization='VALID', **kwargs):
inputs : sequence of Tensor inputs : sequence of Tensor
The inputs, represent [logits, targets]. The inputs, represent [logits, targets].
normalization : {'UNIT', 'FULL', 'VALID', 'BATCH_SIZE', 'NONE'}, optional normalization : {'UNIT', 'FULL', 'VALID', 'BATCH_SIZE', 'NONE'}, optional
The normalization method. The method of normalization.
Returns Returns
------- -------
...@@ -128,7 +136,7 @@ def SoftmaxCrossEntropy(inputs, axis=1, normalization='FULL', **kwargs): ...@@ -128,7 +136,7 @@ def SoftmaxCrossEntropy(inputs, axis=1, normalization='FULL', **kwargs):
axis : int, optional axis : int, optional
The axis to apply softmax, can be negative. The axis to apply softmax, can be negative.
normalization : {'UNIT', 'FULL', 'BATCH_SIZE', 'NONE'}, optional normalization : {'UNIT', 'FULL', 'BATCH_SIZE', 'NONE'}, optional
The normalization method. The method of normalization.
Returns Returns
------- -------
...@@ -158,7 +166,7 @@ def SmoothL1Loss(inputs, beta=1.0, normalization='BATCH_SIZE', **kwargs): ...@@ -158,7 +166,7 @@ def SmoothL1Loss(inputs, beta=1.0, normalization='BATCH_SIZE', **kwargs):
beta : float, optional beta : float, optional
The transition point from L1 to L2 loss The transition point from L1 to L2 loss
normalization : {'FULL', 'BATCH_SIZE', 'NONE'}, optional normalization : {'FULL', 'BATCH_SIZE', 'NONE'}, optional
The normalization method. The method of normalization.
Returns Returns
------- -------
...@@ -182,7 +190,7 @@ def L1Loss(inputs, scale=1., normalization='BATCH_SIZE', **kwargs): ...@@ -182,7 +190,7 @@ def L1Loss(inputs, scale=1., normalization='BATCH_SIZE', **kwargs):
scale : float, optional scale : float, optional
The scale factor applying on the reduced loss. The scale factor applying on the reduced loss.
normalization : {'FULL', 'BATCH_SIZE', 'NONE'}, optional normalization : {'FULL', 'BATCH_SIZE', 'NONE'}, optional
The normalization method. The method of normalization.
Returns Returns
------- -------
...@@ -206,7 +214,7 @@ def L2Loss(inputs, scale=1., normalization='BATCH_SIZE', **kwargs): ...@@ -206,7 +214,7 @@ def L2Loss(inputs, scale=1., normalization='BATCH_SIZE', **kwargs):
scale : float, optional scale : float, optional
The scale factor applying on the reduced loss. The scale factor applying on the reduced loss.
normalization : {'FULL', 'BATCH_SIZE', 'NONE'}, optional normalization : {'FULL', 'BATCH_SIZE', 'NONE'}, optional
The normalization method. The method of normalization.
Returns Returns
------- -------
...@@ -219,8 +227,14 @@ def L2Loss(inputs, scale=1., normalization='BATCH_SIZE', **kwargs): ...@@ -219,8 +227,14 @@ def L2Loss(inputs, scale=1., normalization='BATCH_SIZE', **kwargs):
@OpSchema.Inputs(2) @OpSchema.Inputs(2)
def SigmoidFocalLoss( def SigmoidFocalLoss(
inputs, axis=1, normalization='VALID', inputs,
alpha=0.25, gamma=2.0, neg_id=0, **kwargs): axis=1,
normalization='VALID',
alpha=0.25,
gamma=2.0,
neg_id=0,
**kwargs
):
"""Compute the sigmoid focal loss with sparse labels. `[Lin et.al, 2017] <https://arxiv.org/abs/1708.02002>`_. """Compute the sigmoid focal loss with sparse labels. `[Lin et.al, 2017] <https://arxiv.org/abs/1708.02002>`_.
**Type Constraints**: *float32* **Type Constraints**: *float32*
...@@ -232,7 +246,7 @@ def SigmoidFocalLoss( ...@@ -232,7 +246,7 @@ def SigmoidFocalLoss(
axis : int, optional axis : int, optional
The axis to apply softmax, can be negative. The axis to apply softmax, can be negative.
normalization : {'UNIT', 'FULL', 'VALID', 'BATCH_SIZE', 'NONE'}, optional normalization : {'UNIT', 'FULL', 'VALID', 'BATCH_SIZE', 'NONE'}, optional
The normalization method. The method of normalization.
alpha : float, optional, default=0.25 alpha : float, optional, default=0.25
The scale factor on the rare class. The scale factor on the rare class.
gamma : float, optional, default=2.0 gamma : float, optional, default=2.0
...@@ -255,8 +269,15 @@ def SigmoidFocalLoss( ...@@ -255,8 +269,15 @@ def SigmoidFocalLoss(
@OpSchema.Inputs(2) @OpSchema.Inputs(2)
def SoftmaxFocalLoss( def SoftmaxFocalLoss(
inputs, axis=1, normalization='VALID', ignore_labels=(), inputs,
alpha=0.25, gamma=2.0, neg_id=0, **kwargs): axis=1,
normalization='VALID',
ignore_labels=(),
alpha=0.25,
gamma=2.0,
neg_id=0,
**kwargs
):
"""Compute the softmax focal loss with sparse labels. `[Lin et.al, 2017] <https://arxiv.org/abs/1708.02002>`_. """Compute the softmax focal loss with sparse labels. `[Lin et.al, 2017] <https://arxiv.org/abs/1708.02002>`_.
**Type Constraints**: *float32* **Type Constraints**: *float32*
...@@ -268,7 +289,7 @@ def SoftmaxFocalLoss( ...@@ -268,7 +289,7 @@ def SoftmaxFocalLoss(
axis : int, optional axis : int, optional
The axis to apply softmax, can be negative. The axis to apply softmax, can be negative.
normalization : {'UNIT', 'FULL', 'VALID', 'BATCH_SIZE', 'NONE'}, optional normalization : {'UNIT', 'FULL', 'VALID', 'BATCH_SIZE', 'NONE'}, optional
The normalization method. The method of normalization.
ignore_labels : sequence of int, optional, default=() ignore_labels : sequence of int, optional, default=()
The label id to ignore. The label id to ignore.
alpha : float, optional, default=0.25 alpha : float, optional, default=0.25
...@@ -293,8 +314,12 @@ def SoftmaxFocalLoss( ...@@ -293,8 +314,12 @@ def SoftmaxFocalLoss(
@OpSchema.Inputs(2) @OpSchema.Inputs(2)
def CTCLoss( def CTCLoss(
inputs, blank_first=True, padding_mask=-1, inputs,
use_softmax=True, **kwargs): blank_first=True,
padding_mask=-1,
use_softmax=True,
**kwargs
):
"""Compute the ctc loss with batched variable length of labels. `[Graves & Gomez, 2006] <http://www.cs.utoronto.ca/~graves/icml_2006.pdf>`_. """Compute the ctc loss with batched variable length of labels. `[Graves & Gomez, 2006] <http://www.cs.utoronto.ca/~graves/icml_2006.pdf>`_.
The data format of inputs should be *[T, N, C]*. The data format of inputs should be *[T, N, C]*.
...@@ -329,5 +354,6 @@ def CTCLoss( ...@@ -329,5 +354,6 @@ def CTCLoss(
""" """
arguments = ParseArgs(locals()) arguments = ParseArgs(locals())
if use_softmax: arguments['inputs'][0] = Softmax(arguments['inputs'][0], axis=2) if use_softmax: arguments['inputs'][0] = \
Softmax(arguments['inputs'][0], axis=2)
return Tensor.CreateOperator('CTCLoss', **arguments) return Tensor.CreateOperator('CTCLoss', **arguments)
\ No newline at end of file
...@@ -18,8 +18,13 @@ from . import * ...@@ -18,8 +18,13 @@ from . import *
@OpSchema.Inputs(5) @OpSchema.Inputs(5)
def BatchNorm( def BatchNorm(
inputs, axis=-1, momentum=0.9, eps=1e-5, inputs,
use_stats=-1, **kwargs): axis=-1,
momentum=0.9,
eps=1e-5,
use_stats=-1,
**kwargs
):
"""Batch Normalization. `[Ioffe & Szegedy, 2015] <https://arxiv.org/abs/1502.03167>`_. """Batch Normalization. `[Ioffe & Szegedy, 2015] <https://arxiv.org/abs/1502.03167>`_.
We enforce the number of inputs should be *5*, i.e., We enforce the number of inputs should be *5*, i.e.,
......
...@@ -29,8 +29,16 @@ class RNN(RNNBase): ...@@ -29,8 +29,16 @@ class RNN(RNNBase):
>>> outputs, hidden = rnn(x) >>> outputs, hidden = rnn(x)
""" """
def __init__(self, input_size, hidden_size, nonlinearity='relu', def __init__(
num_layers=1, bidirectional=False, dropout=0, name=None): self,
input_size,
hidden_size,
nonlinearity='relu',
num_layers=1,
bidirectional=False,
dropout=0,
name=None,
):
"""Construct a RNN instance. """Construct a RNN instance.
Parameters Parameters
...@@ -57,8 +65,10 @@ class RNN(RNNBase): ...@@ -57,8 +65,10 @@ class RNN(RNNBase):
""" """
mode = 'rnn_relu' if nonlinearity == 'relu' else 'rnn_tanh' mode = 'rnn_relu' if nonlinearity == 'relu' else 'rnn_tanh'
super(RNN, self).__init__(mode, input_size, hidden_size, super(RNN, self).__init__(
num_layers, bidirectional, dropout, name) mode, input_size, hidden_size,
num_layers, bidirectional, dropout, name,
)
class LSTM(RNNBase): class LSTM(RNNBase):
...@@ -73,8 +83,15 @@ class LSTM(RNNBase): ...@@ -73,8 +83,15 @@ class LSTM(RNNBase):
>>> outputs, hidden = rnn(x) >>> outputs, hidden = rnn(x)
""" """
def __init__(self, input_size, hidden_size, def __init__(
num_layers=1, bidirectional=False, dropout=0, name=None): self,
input_size,
hidden_size,
num_layers=1,
bidirectional=False,
dropout=0,
name=None,
):
"""Construct a LSTM instance. """Construct a LSTM instance.
Parameters Parameters
...@@ -98,8 +115,10 @@ class LSTM(RNNBase): ...@@ -98,8 +115,10 @@ class LSTM(RNNBase):
The wrapper of general RNN. The wrapper of general RNN.
""" """
super(LSTM, self).__init__('lstm', input_size, hidden_size, super(LSTM, self).__init__(
num_layers, bidirectional, dropout, name) 'lstm', input_size, hidden_size,
num_layers, bidirectional, dropout, name,
)
class GRU(RNNBase): class GRU(RNNBase):
...@@ -114,8 +133,15 @@ class GRU(RNNBase): ...@@ -114,8 +133,15 @@ class GRU(RNNBase):
>>> outputs, hidden = rnn(x) >>> outputs, hidden = rnn(x)
""" """
def __init__(self, input_size, hidden_size, def __init__(
num_layers=1, bidirectional=False, dropout=0, name=None): self,
input_size,
hidden_size,
num_layers=1,
bidirectional=False,
dropout=0,
name=None,
):
"""Construct a GRU instance. """Construct a GRU instance.
Parameters Parameters
...@@ -139,8 +165,10 @@ class GRU(RNNBase): ...@@ -139,8 +165,10 @@ class GRU(RNNBase):
The wrapper of general RNN. The wrapper of general RNN.
""" """
super(GRU, self).__init__('gru', input_size, hidden_size, super(GRU, self).__init__(
num_layers, bidirectional, dropout, name) 'gru', input_size, hidden_size,
num_layers, bidirectional, dropout, name,
)
@OpSchema.Inputs(2) @OpSchema.Inputs(2)
...@@ -160,4 +188,5 @@ def LSTMCell(inputs, **kwargs): ...@@ -160,4 +188,5 @@ def LSTMCell(inputs, **kwargs):
The outputs, ``h`` and ``c`` respectively. The outputs, ``h`` and ``c`` respectively.
""" """
return Tensor.CreateOperator('LSTMCell', num_outputs=2, **ParseArgs(locals())) return Tensor.CreateOperator(
\ No newline at end of file 'LSTMCell', num_outputs=2, **ParseArgs(locals()))
\ No newline at end of file
...@@ -33,9 +33,17 @@ def _normalize_pads(value, rank): ...@@ -33,9 +33,17 @@ def _normalize_pads(value, rank):
@OpSchema.Inputs(2, 3) @OpSchema.Inputs(2, 3)
def Conv2d( def Conv2d(
inputs, num_output, kernel_shape, inputs,
strides=1, pads=0, dilations=1, group=1, num_output,
padding='VALID', data_format='NCHW', **kwargs): kernel_shape,
strides=1,
pads=0,
dilations=1,
group=1,
padding='VALID',
data_format='NCHW',
**kwargs
):
"""2D Convolution. """2D Convolution.
The spatial output dimension of convolution can be computed as follows: The spatial output dimension of convolution can be computed as follows:
...@@ -99,8 +107,15 @@ def Conv2d( ...@@ -99,8 +107,15 @@ def Conv2d(
@OpSchema.Inputs(2, 3) @OpSchema.Inputs(2, 3)
def DepthwiseConv2d( def DepthwiseConv2d(
inputs, num_output, kernel_shape=3, strides=1, pads=0, inputs,
padding='VALID', data_format='NCHW', **kwargs): num_output,
kernel_shape=3,
strides=1,
pads=0,
padding='VALID',
data_format='NCHW',
**kwargs
):
"""Depthwise 2D Convolution. `[Chollet, 2016] <https://arxiv.org/abs/1610.02357>`_. """Depthwise 2D Convolution. `[Chollet, 2016] <https://arxiv.org/abs/1610.02357>`_.
Set ``padding`` to *VALID* will use the value of ``pads``. Set ``padding`` to *VALID* will use the value of ``pads``.
...@@ -149,10 +164,19 @@ def DepthwiseConv2d( ...@@ -149,10 +164,19 @@ def DepthwiseConv2d(
@ArgumentHelper.RepeatedDesc('output_padding') @ArgumentHelper.RepeatedDesc('output_padding')
@ArgumentHelper.RepeatedDesc('output_shape') @ArgumentHelper.RepeatedDesc('output_shape')
def ConvTranspose2d( def ConvTranspose2d(
inputs, num_output, kernel_shape, inputs,
strides=1, pads=0, dilations=1, group=1, num_output,
output_padding=None, output_shape=None, kernel_shape,
padding='VALID', data_format='NCHW', **kwargs): strides=1,
pads=0,
dilations=1,
group=1,
output_padding=None,
output_shape=None,
padding='VALID',
data_format='NCHW',
**kwargs
):
"""2D Deconvolution. """2D Deconvolution.
The spatial output dimension of deconvolution can be computed as follows: The spatial output dimension of deconvolution can be computed as follows:
...@@ -224,8 +248,17 @@ def ConvTranspose2d( ...@@ -224,8 +248,17 @@ def ConvTranspose2d(
@OpSchema.Inputs(1) @OpSchema.Inputs(1)
def Pool2d( def Pool2d(
inputs, kernel_shape, strides, pads=0, padding='VALID', ceil_mode=True, inputs,
mode='MAX', data_format='NCHW', global_pooling=False, **kwargs): kernel_shape,
strides,
pads=0,
padding='VALID',
ceil_mode=True,
mode='MAX',
data_format='NCHW',
global_pooling=False,
**kwargs
):
"""2D Pooling, MAX or AVG. """2D Pooling, MAX or AVG.
The spatial output dimension of pooling can be computed as follows: The spatial output dimension of pooling can be computed as follows:
...@@ -308,7 +341,14 @@ def ROIPool(inputs, pool_h, pool_w, spatial_scale=1.0, **kwargs): ...@@ -308,7 +341,14 @@ def ROIPool(inputs, pool_h, pool_w, spatial_scale=1.0, **kwargs):
@OpSchema.Inputs(2) @OpSchema.Inputs(2)
def ROIAlign(inputs, pool_h=0, pool_w=0, spatial_scale=1.0, sampling_ratio=2, **kwargs): def ROIAlign(
inputs,
pool_h=0,
pool_w=0,
spatial_scale=1.0,
sampling_ratio=2,
**kwargs
):
"""AVG RoIAlign. `[He et.al, 2017] <https://arxiv.org/abs/1703.06870>`_. """AVG RoIAlign. `[He et.al, 2017] <https://arxiv.org/abs/1703.06870>`_.
**Type Constraints**: (*float16*, *float32*) **Type Constraints**: (*float16*, *float32*)
...@@ -337,8 +377,15 @@ def ROIAlign(inputs, pool_h=0, pool_w=0, spatial_scale=1.0, sampling_ratio=2, ** ...@@ -337,8 +377,15 @@ def ROIAlign(inputs, pool_h=0, pool_w=0, spatial_scale=1.0, sampling_ratio=2, **
@OpSchema.Inputs(1) @OpSchema.Inputs(1)
def LRN( def LRN(
inputs, local_size=5, alpha=0.0001, beta=0.75, k=2.0, inputs,
mode='ACROSS_CHANNELS', data_format='NCHW', **kwargs): local_size=5,
alpha=0.0001,
beta=0.75,
k=2.0,
mode='ACROSS_CHANNELS',
data_format='NCHW',
**kwargs
):
"""Local Response Normalization. `[Krizhevsky et.al, 2012] <http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks>`_. """Local Response Normalization. `[Krizhevsky et.al, 2012] <http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks>`_.
**Type Constraints**: (*float16*, *float32*) **Type Constraints**: (*float16*, *float32*)
...@@ -379,8 +426,14 @@ def LRN( ...@@ -379,8 +426,14 @@ def LRN(
@OpSchema.Inputs(1) @OpSchema.Inputs(1)
@ArgumentHelper.RepeatedDesc('dsize') @ArgumentHelper.RepeatedDesc('dsize')
def NNResize( def NNResize(
inputs, dsize, shape_like=None, inputs,
fy=-1.0, fx=-1.0, data_format='NCHW', **kwargs): dsize,
shape_like=None,
fy=-1.0,
fx=-1.0,
data_format='NCHW',
**kwargs
):
"""Resize the image with Nearest-Neighbor method. """Resize the image with Nearest-Neighbor method.
Set ``dsize`` to None if you want to use ``shape_like`` or ``fy/fx``. Set ``dsize`` to None if you want to use ``shape_like`` or ``fy/fx``.
...@@ -430,8 +483,14 @@ def NNResize( ...@@ -430,8 +483,14 @@ def NNResize(
@OpSchema.Inputs(1) @OpSchema.Inputs(1)
@ArgumentHelper.RepeatedDesc('dsize') @ArgumentHelper.RepeatedDesc('dsize')
def BilinearResize( def BilinearResize(
inputs, dsize, shape_like=None, inputs,
fy=-1.0, fx=-1.0, data_format='NCHW', **kwargs): dsize,
shape_like=None,
fy=-1.0,
fx=-1.0,
data_format='NCHW',
**kwargs
):
"""Resize the image with Bi-linear method. """Resize the image with Bi-linear method.
Set ``dsize`` to None if you want to use ``shape_like`` or ``fy/fx``. Set ``dsize`` to None if you want to use ``shape_like`` or ``fy/fx``.
...@@ -508,8 +567,14 @@ def BiasAdd(inputs, data_format='NCHW', **kwargs): ...@@ -508,8 +567,14 @@ def BiasAdd(inputs, data_format='NCHW', **kwargs):
@OpSchema.Inputs(1) @OpSchema.Inputs(1)
@ArgumentHelper.Desc('keep_prob', as_target=False) @ArgumentHelper.Desc('keep_prob', as_target=False)
def DropBlock2d( def DropBlock2d(
inputs, block_size=7, keep_prob=0.9, alpha=1., inputs,
decrement=0., data_format='NCHW', **kwargs): block_size=7,
keep_prob=0.9,
alpha=1.,
decrement=0.,
data_format='NCHW',
**kwargs
):
"""Randomly drop the outputs according to the spatial blocks. `[Ghiasi et.al, 2018] <https://arxiv.org/abs/1810.12890>`_. """Randomly drop the outputs according to the spatial blocks. `[Ghiasi et.al, 2018] <https://arxiv.org/abs/1810.12890>`_.
Set the ``decrement`` to schedule ``keep_prob`` for each iteration. Set the ``decrement`` to schedule ``keep_prob`` for each iteration.
......
...@@ -53,6 +53,8 @@ class DataBatch(object): ...@@ -53,6 +53,8 @@ class DataBatch(object):
The value to fill when padding is valid. The value to fill when padding is valid.
crop_size : int, optional, default=0 crop_size : int, optional, default=0
The cropping size. The cropping size.
cutout_size : int, optional, default=0
The square size to cutout.
mirror : bool, optional, default=False mirror : bool, optional, default=False
Whether to mirror(flip horizontally) images. Whether to mirror(flip horizontally) images.
color_augmentation : bool, optional, default=False color_augmentation : bool, optional, default=False
......
...@@ -47,6 +47,8 @@ class DataTransformer(multiprocessing.Process): ...@@ -47,6 +47,8 @@ class DataTransformer(multiprocessing.Process):
The value to fill when padding is valid. The value to fill when padding is valid.
crop_size : int, optional, default=0 crop_size : int, optional, default=0
The cropping size. The cropping size.
cutout_size : int, optional, default=0
The square size to cutout.
mirror : bool, optional, default=False mirror : bool, optional, default=False
Whether to mirror(flip horizontally) images. Whether to mirror(flip horizontally) images.
color_augmentation : bool, optional, default=False color_augmentation : bool, optional, default=False
...@@ -65,6 +67,7 @@ class DataTransformer(multiprocessing.Process): ...@@ -65,6 +67,7 @@ class DataTransformer(multiprocessing.Process):
self._padding = kwargs.get('padding', 0) self._padding = kwargs.get('padding', 0)
self._fill_value = kwargs.get('fill_value', 127) self._fill_value = kwargs.get('fill_value', 127)
self._crop_size = kwargs.get('crop_size', 0) self._crop_size = kwargs.get('crop_size', 0)
self._cutout_size = kwargs.get('cutout_size', 0)
self._mirror = kwargs.get('mirror', False) self._mirror = kwargs.get('mirror', False)
self._color_aug = kwargs.get('color_augmentation', False) self._color_aug = kwargs.get('color_augmentation', False)
self._min_random_scale = kwargs.get('min_random_scale', 1.0) self._min_random_scale = kwargs.get('min_random_scale', 1.0)
...@@ -127,6 +130,13 @@ class DataTransformer(multiprocessing.Process): ...@@ -127,6 +130,13 @@ class DataTransformer(multiprocessing.Process):
im = im[h_off : h_off + self._crop_size, im = im[h_off : h_off + self._crop_size,
w_off : w_off + self._crop_size, :] w_off : w_off + self._crop_size, :]
# CutOut
if self._cutout_size > 0:
h_off = numpy.random.randint(im.shape[0])
w_off = numpy.random.randint(im.shape[1])
im[h_off : h_off + self._cutout_size,
w_off : w_off + self._cutout_size, :] = self._fill_value
# Random mirror # Random mirror
if self._mirror: if self._mirror:
if numpy.random.randint(0, 2) > 0: if numpy.random.randint(0, 2) > 0:
......
...@@ -22,5 +22,4 @@ from dragon.vm.tensorflow.ops.random_ops import * ...@@ -22,5 +22,4 @@ from dragon.vm.tensorflow.ops.random_ops import *
from dragon.vm.tensorflow.ops.math_ops import * from dragon.vm.tensorflow.ops.math_ops import *
from dragon.vm.tensorflow.ops.array_ops import * from dragon.vm.tensorflow.ops.array_ops import *
from dragon.vm.tensorflow.ops.control_flow_ops import * from dragon.vm.tensorflow.ops.control_flow_ops import *
from dragon.vm.tensorflow.ops.nn_ops import *
from dragon.vm.tensorflow.ops.gradients_impl import gradients from dragon.vm.tensorflow.ops.gradients_impl import gradients
\ No newline at end of file
...@@ -8,40 +8,39 @@ namespace kernel { ...@@ -8,40 +8,39 @@ namespace kernel {
/*! LSTMCell <T = float32, Device = CPU> */ /*! LSTMCell <T = float32, Device = CPU> */
template <typename T> template <typename T>
T _SigmoidUnit(T x) { return T(1) / (T(1) + exp(-x)); } T _s(T x) { return T(1) / (T(1) + exp(-x)); }
template <> void LSTMCell<float, CPUContext>( template <> void LSTMCell<float, CPUContext>(
const int count,
const int N, const int N,
const int C, const int C,
const float* cx, const float* cx,
float* xact, float* actx,
float* c, float* c,
float* h, float* h,
CPUContext* ctx) { CPUContext* ctx) {
float i, f, o, c_; float i, f, o, c_;
int f_offset = C, o_offset = 2 * C, c_offset = 3 * C, x_offset = 4 * C; int f_offset = C, o_offset = 2 * C,
c_offset = 3 * C, x_offset = 4 * C;
for (int n = 0; n < N; ++n) { for (int n = 0; n < N; ++n) {
for (int idx = 0; idx < C; ++idx) { for (int idx = 0; idx < C; ++idx) {
xact[idx] = i = _SigmoidUnit<float>(xact[idx]); actx[idx] = i = _s<float>(actx[idx]);
xact[idx + f_offset] = f = _SigmoidUnit<float>(xact[idx + f_offset]); actx[idx + f_offset] = f = _s<float>(actx[idx + f_offset]);
xact[idx + o_offset] = o = _SigmoidUnit<float>(xact[idx + o_offset]); actx[idx + o_offset] = o = _s<float>(actx[idx + o_offset]);
xact[idx + c_offset] = c_ = tanh(xact[idx + c_offset]); actx[idx + c_offset] = c_ = tanh(actx[idx + c_offset]);
c_ = c[idx] = f * cx[idx] + i * c_; c_ = c[idx] = f * cx[idx] + i * c_;
h[idx] = o * tanh(c_); h[idx] = o * tanh(c_);
} }
cx += C; xact += x_offset; c += C; h += C; cx += C; actx += x_offset; c += C; h += C;
} }
} }
/*! LSTMCellGrad <T = float32, Device = CPU> */ /*! LSTMCellGrad <T = float32, Device = CPU> */
template <> void LSTMCellGrad<float, CPUContext>( template <> void LSTMCellGrad<float, CPUContext>(
const int count,
const int N, const int N,
const int C, const int C,
const float* cx, const float* cx,
const float* xact, const float* actx,
const float* c, const float* c,
const float* dc, const float* dc,
const float* dh, const float* dh,
...@@ -49,21 +48,19 @@ template <> void LSTMCellGrad<float, CPUContext>( ...@@ -49,21 +48,19 @@ template <> void LSTMCellGrad<float, CPUContext>(
float* dx, float* dx,
CPUContext* ctx) { CPUContext* ctx) {
float i, f, o, g, tanh_c, dcx_sum_term; float i, f, o, g, tanh_c, dcx_sum_term;
int f_offset = C, int f_offset = C, o_offset = 2 * C,
o_offset = 2 * C, c_offset = 3 * C, x_offset = 4 * C;
c_offset = 3 * C,
x_offset = 4 * C;
for (int n = 0; n < N; ++n) { for (int n = 0; n < N; ++n) {
for (int idx = 0; idx < C; ++idx) { for (int idx = 0; idx < C; ++idx) {
i = xact[idx]; i = actx[idx];
f = xact[idx + f_offset]; f = actx[idx + f_offset];
o = xact[idx + o_offset]; o = actx[idx + o_offset];
g = xact[idx + c_offset]; g = actx[idx + c_offset];
// BPTT compute the dc_{t-1} at the time of t // BPTT compute the dc_{t-1} at the time of t
// dc_{t-1} = dl / d(h_{t}) * d(h_{t}) / d(c_{t}) * d(c_{t}) / d(c_{t-1}) // dc_{t-1} = dl / d(h_{t}) * d(h_{t}) / d(c_{t}) * d(c_{t}) / d(c_{t-1})
// + d(c_{t+1}) / d(c_{t}) * d(c_{t}) / d(c_{t-1}) // + d(c_{t+1}) / d(c_{t}) * d(c_{t}) / d(c_{t-1})
// = (dl / d(h_{t}) * d(h_{t}) / d(c_{t}) + d(c_{t+1}) / d(c_{t})) // = (dl / d(h_{t}) * d(h_{t}) / d(c_{t}) + d(c_{t+1}) / d(c_{t}))
// * d(c_{t}) / d(c_{t-1}) // * d(c_{t}) / d(c_{t-1})
tanh_c = tanh(c[idx]); tanh_c = tanh(c[idx]);
dcx_sum_term = dh[idx] * o * (1 - tanh_c * tanh_c) + dc[idx]; dcx_sum_term = dh[idx] * o * (1 - tanh_c * tanh_c) + dc[idx];
dcx[idx] = dcx_sum_term * f; dcx[idx] = dcx_sum_term * f;
...@@ -72,7 +69,8 @@ template <> void LSTMCellGrad<float, CPUContext>( ...@@ -72,7 +69,8 @@ template <> void LSTMCellGrad<float, CPUContext>(
dx[idx + o_offset] = dh[idx] * tanh_c * o * (1 - o); dx[idx + o_offset] = dh[idx] * tanh_c * o * (1 - o);
dx[idx + c_offset] = dcx_sum_term * i * (1 - g * g); dx[idx + c_offset] = dcx_sum_term * i * (1 - g * g);
} }
cx += C; xact += x_offset; c += C; dc += C; dh += C; cx += C; actx += x_offset;
c += C; dc += C; dh += C;
dcx += C; dx += x_offset; dcx += C; dx += x_offset;
} }
} }
......
...@@ -11,94 +11,91 @@ namespace kernel { ...@@ -11,94 +11,91 @@ namespace kernel {
template <typename T> template <typename T>
__global__ void _LSTMCellAct( __global__ void _LSTMCellAct(
const int count, const int nthreads,
const int c_offset, const int c_offset,
const int x_offset, const int x_offset,
T* xact) { T* actx) {
CUDA_1D_KERNEL_LOOP(idx, count) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
const int offset = idx % x_offset; const int offset = i % x_offset;
xact[idx] = offset < c_offset ? actx[i] = offset < c_offset ?
((T)1 / ((T)1 + exp(-xact[idx]))) (T(1) / (T(1) + exp(-actx[i])))
: tanh(xact[idx]); : tanh(actx[i]);
} }
} }
template <typename T> template <typename T>
__global__ void _LSTMCellGate( __global__ void _LSTMCellGate(
const int count, const int nthreads,
const int hidden_size, const int hidden_size,
const int o_offset, // 2 * hidden_size const int o_offset,
const int c_offset, // 3 * hidden_size const int c_offset,
const int x_offset, // 4 * hidden_size const int x_offset,
const T* cx, const T* cx,
const T* xact, const T* actx,
T* c, T* c,
T* h) { T* h) {
CUDA_1D_KERNEL_LOOP(idx, count) { CUDA_1D_KERNEL_LOOP(idx, nthreads) {
const int n = idx / hidden_size; const int n = idx / hidden_size;
const int offset = idx % hidden_size; const int offset = idx % hidden_size;
const T* x = xact + n * x_offset; const T* actx_ = actx + n * x_offset;
const T i = x[offset]; const T i = actx_[offset];
const T f = x[offset + hidden_size]; const T f = actx_[offset + hidden_size];
const T o = x[offset + o_offset]; const T o = actx_[offset + o_offset];
T c_ = x[offset + c_offset]; T c_ = actx_[offset + c_offset];
c_ = c[idx] = f * cx[idx] + i * c_; c_ = c[idx] = f * cx[idx] + i * c_;
h[idx] = o * tanh(c_); h[idx] = o * tanh(c_);
} }
} }
template <> void LSTMCell<float, CUDAContext>( template <> void LSTMCell<float, CUDAContext>(
const int count,
const int N, const int N,
const int C, const int C,
const float* cx, const float* cx,
float* xact, float* actx,
float* c, float* c,
float* h, float* h,
CUDAContext* ctx) { CUDAContext* ctx) {
const int o_offset = 2 * C, auto o_offset = 2 * C, c_offset = 3 * C,
c_offset = 3 * C, x_offset = 4 * C, NC = N * C;
x_offset = 4 * C;
_LSTMCellAct<float> _LSTMCellAct<float>
<< < CUDA_BLOCKS(count * 4), CUDA_THREADS, << < CUDA_BLOCKS(NC * 4), CUDA_THREADS,
0, ctx->cuda_stream() >> > 0, ctx->cuda_stream() >> >
(count * 4, c_offset, x_offset, xact); (NC * 4, c_offset, x_offset, actx);
_LSTMCellGate<float> _LSTMCellGate<float>
<< < CUDA_BLOCKS(count), CUDA_THREADS, << < CUDA_BLOCKS(NC), CUDA_THREADS,
0, ctx->cuda_stream() >> > 0, ctx->cuda_stream() >> >
(count, C, o_offset, c_offset, x_offset, (NC, C, o_offset, c_offset,
cx, xact, c, h); x_offset, cx, actx, c, h);
} }
/*! LSTMCellGrad <T = float32, Device = CUDA> */ /*! LSTMCellGrad <T = float32, Device = CUDA> */
template <typename T> template <typename T>
__global__ void _LSTMCellGateGrad( __global__ void _LSTMCellGateGrad(
const int count, const int nthreads,
const int hidden_size, const int hidden_size,
const int o_offset, const int o_offset,
const int c_offset, const int c_offset,
const int x_offset, const int x_offset,
const T* cx, const T* cx,
const T* xact, const T* actx,
const T* c, const T* c,
const T* dc, const T* dc,
const T* dh, const T* dh,
T* dcx, T* dcx,
T* dx) { T* dx) {
CUDA_1D_KERNEL_LOOP(idx, count) { CUDA_1D_KERNEL_LOOP(idx, nthreads) {
const int n = idx / hidden_size; const int n = idx / hidden_size;
const int offset = idx % hidden_size; const int offset = idx % hidden_size;
const T* xact_ = xact + n * x_offset; const T* actx_ = actx + n * x_offset;
T* dx_ = dx + n * x_offset; T* dx_ = dx + n * x_offset;
const T i = xact_[offset]; const T i = actx_[offset];
const T f = xact_[offset + hidden_size]; const T f = actx_[offset + hidden_size];
const T o = xact_[offset + o_offset]; const T o = actx_[offset + o_offset];
const T g = xact_[offset + c_offset]; const T g = actx_[offset + c_offset];
const T tanh_c = tanh(c[idx]); const T tanh_c = tanh(c[idx]);
const T dcx_sum_term = const T dcx_sum_term =
dh[idx] * o * (1 - tanh_c * tanh_c) + dc[idx]; dh[idx] * o * (T(1) - tanh_c * tanh_c) + dc[idx];
dcx[idx] = dcx_sum_term * f; dcx[idx] = dcx_sum_term * f;
dx_[offset] = dcx_sum_term * g; dx_[offset] = dcx_sum_term * g;
dx_[offset + hidden_size] = dcx_sum_term * cx[idx]; dx_[offset + hidden_size] = dcx_sum_term * cx[idx];
...@@ -109,44 +106,44 @@ __global__ void _LSTMCellGateGrad( ...@@ -109,44 +106,44 @@ __global__ void _LSTMCellGateGrad(
template <typename T> template <typename T>
__global__ void _LSTMCellActGrad( __global__ void _LSTMCellActGrad(
const int count, const int nthreads,
const int c_offset, const int c_offset,
const int x_offset, const int x_offset,
const T* xact, const T* actx,
T* dx) { T* dx) {
CUDA_1D_KERNEL_LOOP(idx, count) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
const int offset = idx % x_offset; const T val = actx[i];
const T val = xact[idx]; const int offset = i % x_offset;
if (offset < c_offset) dx[idx] = dx[idx] * val * (T(1) - val); if (offset < c_offset) {
else dx[idx] = dx[idx] * (T(1) - val * val); dx[i] = dx[i] * val * (T(1) - val);
} else {
dx[i] = dx[i] * (T(1) - val * val);
}
} }
} }
template <> void LSTMCellGrad<float, CUDAContext>( template <> void LSTMCellGrad<float, CUDAContext>(
const int count,
const int N, const int N,
const int C, const int C,
const float* cx, const float* cx,
const float* xact, const float* actx,
const float* c, const float* c,
const float* dc, const float* dc,
const float* dh, const float* dh,
float* dcx, float* dcx,
float* dx, float* dx,
CUDAContext* ctx) { CUDAContext* ctx) {
const int o_offset = 2 * C, auto o_offset = 2 * C, c_offset = 3 * C,
c_offset = 3 * C, x_offset = 4 * C, NC = N * C;
x_offset = 4 * C;
_LSTMCellGateGrad<float> _LSTMCellGateGrad<float>
<< < CUDA_BLOCKS(count), CUDA_THREADS, << < CUDA_BLOCKS(NC), CUDA_THREADS,
0, ctx->cuda_stream() >> > 0, ctx->cuda_stream() >> >
(count, C, o_offset, c_offset, x_offset, (NC, C, o_offset, c_offset, x_offset,
cx, xact, c, dc, dh, dcx, dx); cx, actx, c, dc, dh, dcx, dx);
_LSTMCellActGrad<float> _LSTMCellActGrad<float>
<< < CUDA_BLOCKS(count * 4), CUDA_THREADS, << < CUDA_BLOCKS(NC * 4), CUDA_THREADS,
0, ctx->cuda_stream() >> > 0, ctx->cuda_stream() >> >
(count * 4, c_offset, x_offset, xact, dx); (NC * 4, c_offset, x_offset, actx, dx);
} }
} // namespace kernel } // namespace kernel
......
...@@ -17,8 +17,9 @@ void FlattenOp<Context>::RunOnDevice() { ...@@ -17,8 +17,9 @@ void FlattenOp<Context>::RunOnDevice() {
vector<int64_t> output_dims; vector<int64_t> output_dims;
if (keep_axes != INT_MAX) { if (keep_axes != INT_MAX) {
CHECK_LE(keep_axes, Input(0).ndim()) CHECK_LE(keep_axes, Input(0).ndim())
<< "\nThe total number of axes is " + Input(0).ndim() << "\nThe total number of axes is "
<< ", can not keep " + keep_axes << " ."; << Input(0).ndim() << ", can not keep "
<< keep_axes << " .";
int i = 0; int i = 0;
for (; i < keep_axes - 1; i++) for (; i < keep_axes - 1; i++)
output_dims.push_back(Input(0).dim(i)); output_dims.push_back(Input(0).dim(i));
......
...@@ -12,9 +12,10 @@ void LSTMCellOp<Context>::RunWithType() { ...@@ -12,9 +12,10 @@ void LSTMCellOp<Context>::RunWithType() {
auto* Hdata = Output(0)->template mutable_data<T, Context>(); auto* Hdata = Output(0)->template mutable_data<T, Context>();
auto* Cdata = Output(1)->template mutable_data<T, Context>(); auto* Cdata = Output(1)->template mutable_data<T, Context>();
kernel::LSTMCell(Input(1).count(), Input(1).dim(0), kernel::LSTMCell(
Input(1).ndim() == 2 ? Input(1).dim(1) : Input(1).dim(2), Input(1).dim(0), Input(1).ndim() == 2 ?
HXdata, Xdata, Cdata, Hdata, ctx()); Input(1).dim(1) : Input(1).dim(2),
HXdata, Xdata, Cdata, Hdata, ctx());
} }
template <class Context> template <class Context>
...@@ -47,10 +48,11 @@ void LSTMCellGradientOp<Context>::RunWithType() { ...@@ -47,10 +48,11 @@ void LSTMCellGradientOp<Context>::RunWithType() {
cast::to<T>(0.f), dCdata, ctx()); cast::to<T>(0.f), dCdata, ctx());
} }
kernel::LSTMCellGrad(Input(1).count(), Input(1).dim(0), kernel::LSTMCellGrad(
Input(1).ndim() == 2 ? Input(1).dim(1) : Input(1).dim(2), Input(1).dim(0), Input(1).ndim() == 2 ?
HXdata, Xdata, Cdata, dCdata, dHdata, Input(1).dim(1) : Input(1).dim(2),
dHXdata, dXdata, ctx()); HXdata, Xdata, Cdata, dCdata, dHdata,
dHXdata, dXdata, ctx());
} }
template <class Context> template <class Context>
......
...@@ -113,6 +113,11 @@ template <class Context> ...@@ -113,6 +113,11 @@ template <class Context>
void CuDNNDepthwiseConv2dGradientOp<Context>::RunOnDevice() { void CuDNNDepthwiseConv2dGradientOp<Context>::RunOnDevice() {
group = channels = data_format == "NCHW" ? group = channels = data_format == "NCHW" ?
Input(0).dim(1) : Input(0).dim(-1); Input(0).dim(1) : Input(0).dim(-1);
#if CUDNN_VERSION_MIN(7, 0, 0)
// The group implementation of CuDNN is faster
// Enable if CuDNN >= 7.0
return CuDNNConv2dGradientOp<Context>::RunOnDevice();
#endif
GradientReshape(); GradientReshape();
if (XIsType(Input(0), float)) RunWithType<float>(); if (XIsType(Input(0), float)) RunWithType<float>();
......
Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!