Commit 2c90589f by Ting PAN

Refactor vision operators

Summary:
This commit adds 1D and 3D support for vision operators
via a generalized ND implementation.
1 parent 60e5d25a
Showing with 1442 additions and 480 deletions
......@@ -84,7 +84,7 @@ class ArgMax(Layer):
param = layer_param.argmax_param
if param.top_k != 1:
raise ValueError('Top-k argmax is not supported.')
self.arguments = {'axis': param.axis, 'keep_dims': True}
self.arguments = {'axis': param.axis, 'keepdims': True}
def __call__(self, bottom):
return array_ops.argmax(bottom, **self.arguments)
......
......@@ -118,5 +118,8 @@ class Data(Layer):
'num_outputs': 2,
}
data, label = framework_ops.python_plugin([], **args)
data.shape = (self.data_args['batch_size'],
None, None, len(self.norm_args['mean']))
label.shape = (self.data_args['batch_size'], None)
data = array_ops.channel_normalize(data, **self.norm_args)
return data, label
......@@ -78,11 +78,12 @@ class Convolution(Layer):
def __call__(self, bottom):
inputs = [bottom] + [blob['data'] for blob in self._blobs]
return vision_ops.conv2d(inputs, **self.arguments)
conv_op = 'conv{}d'.format(len(bottom.shape) - 2)
return getattr(vision_ops, conv_op)(inputs, **self.arguments)
class Deconvolution(Convolution):
r"""Apply the 2d deconvolution.
r"""Apply the n-dimension deconvolution.
Examples:
......@@ -117,7 +118,8 @@ class Deconvolution(Convolution):
def __call__(self, bottom):
inputs = [bottom] + [blob['data'] for blob in self._blobs]
return vision_ops.conv2d_transpose(inputs, **self.arguments)
conv_op = 'conv{}d_transpose'.format(len(bottom.shape) - 2)
return getattr(vision_ops, conv_op)(inputs, **self.arguments)
class LRN(Layer):
......@@ -186,7 +188,7 @@ class Pooling(Layer):
'ceil_mode': True,
'mode': {0: 'MAX', 1: 'AVG'}[param.pool],
'data_format': 'NCHW',
'global_pooling': param.global_pooling,
'global_pool': param.global_pooling,
}
if not param.HasField('kernel_h'):
self.arguments['kernel_shape'] = [param.kernel_size]
......@@ -202,7 +204,8 @@ class Pooling(Layer):
self.arguments['strides'] = [param.stride_h, param.stride_w]
def __call__(self, bottom):
return vision_ops.pool2d(bottom, **self.arguments)
pool_op = 'pool{}d'.format(len(bottom.shape) - 2)
return getattr(vision_ops, pool_op)(bottom, **self.arguments)
class ROIAlign(Layer):
......
......@@ -28,12 +28,30 @@ dragon.nn
`bias_add(...) <nn/bias_add.html>`_
: Add the bias across channels to input.
`conv(...) <nn/conv.html>`_
: Apply the n-dimension convolution.
`conv_transpose(...) <nn/conv_transpose.html>`_
: Apply the n-dimension deconvolution.
`conv1d(...) <nn/conv1d.html>`_
: Apply the 1d convolution.
`conv1d_transpose(...) <nn/conv1d_transpose.html>`_
: Apply the 1d deconvolution.
`conv2d(...) <nn/conv2d.html>`_
: Apply the 2d convolution.
`conv2d_transpose(...) <nn/conv2d_transpose.html>`_
: Apply the 2d deconvolution.
`conv3d(...) <nn/conv3d.html>`_
: Apply the 3d convolution.
`conv3d_transpose(...) <nn/conv3d_transpose.html>`_
: Apply the 3d deconvolution.
`depthwise_conv2d(...) <nn/depthwise_conv2d.html>`_
: Apply the 2d depthwise convolution.
......@@ -92,9 +110,15 @@ dragon.nn
: Apply the parametric rectified linear unit.
`[He et.al, 2015] <https://arxiv.org/abs/1502.01852>`_.
`pool1d(...) <nn/pool1d.html>`_
: Apply the 1d pooling.
`pool2d(...) <nn/pool2d.html>`_
: Apply the 2d pooling.
`pool3d(...) <nn/pool3d.html>`_
: Apply the 3d pooling.
`relu(...) <nn/relu.html>`_
: Apply the rectified linear unit.
`[Nair & Hinton, 2010] <http://www.csri.utoronto.ca/~hinton/absps/reluICML.pdf>`_.
......@@ -129,8 +153,14 @@ dragon.nn
nn/RNN
nn/batch_norm
nn/bias_add
nn/conv
nn/conv_transpose
nn/conv1d
nn/conv1d_transpose
nn/conv2d
nn/conv2d_transpose
nn/conv3d
nn/conv3d_transpose
nn/depthwise_conv2d
nn/depth_to_space
nn/dropout
......@@ -146,7 +176,10 @@ dragon.nn
nn/leaky_relu
nn/local_response_norm
nn/log_softmax
nn/pool
nn/pool1d
nn/pool2d
nn/pool3d
nn/prelu
nn/relu
nn/relu6
......
conv
====
.. autofunction:: dragon.nn.conv
.. raw:: html
<style>
h1:before {
content: "dragon.nn.";
color: #103d3e;
}
</style>
conv1d
======
.. autofunction:: dragon.nn.conv1d
.. raw:: html
<style>
h1:before {
content: "dragon.nn.";
color: #103d3e;
}
</style>
conv1d_transpose
================
.. autofunction:: dragon.nn.conv1d_transpose
.. raw:: html
<style>
h1:before {
content: "dragon.nn.";
color: #103d3e;
}
</style>
conv3d
======
.. autofunction:: dragon.nn.conv3d
.. raw:: html
<style>
h1:before {
content: "dragon.nn.";
color: #103d3e;
}
</style>
conv3d_transpose
================
.. autofunction:: dragon.nn.conv3d_transpose
.. raw:: html
<style>
h1:before {
content: "dragon.nn.";
color: #103d3e;
}
</style>
conv_transpose
==============
.. autofunction:: dragon.nn.conv_transpose
.. raw:: html
<style>
h1:before {
content: "dragon.nn.";
color: #103d3e;
}
</style>
pool
====
.. autofunction:: dragon.nn.pool
.. raw:: html
<style>
h1:before {
content: "dragon.nn.";
color: #103d3e;
}
</style>
pool1d
======
.. autofunction:: dragon.nn.pool1d
.. raw:: html
<style>
h1:before {
content: "dragon.nn.";
color: #103d3e;
}
</style>
pool3d
======
.. autofunction:: dragon.nn.pool3d
.. raw:: html
<style>
h1:before {
content: "dragon.nn.";
color: #103d3e;
}
</style>
......@@ -91,7 +91,7 @@ Name Supported Reference
`InstanceNormalization`_ |v| :func:`dragon.nn.instance_norm`
`IsInf`_ |v| :func:`dragon.math.is_inf`
`IsNaN`_ |v| :func:`dragon.math.is_nan`
`LRN`_ |v| :func:`dragon.nn.lrn`
`LRN`_ |v| :func:`dragon.nn.local_response_norm`
`LSTM`_ |v| :func:`dragon.nn.LSTM`
`LeakyRelu`_ |v| :func:`dragon.nn.leaky_relu`
`Less`_ |v| :func:`dragon.math.less`
......@@ -183,7 +183,7 @@ Name Supported Reference
`TopK`_ |v| :func:`dragon.math.top_k`
`Transpose`_ |v| :func:`dragon.transpose`
`Unique`_ |v| :func:`dragon.unique`
`Unsqueeze`_ |v| :func:`dragon.unsqueeze`
`Unsqueeze`_ |v| :func:`dragon.expand_dims`
`Upsample`_ |v| :func:`dragon.vision.resize`
`Where`_ |v| :func:`dragon.where`
`Xor`_ |v| :func:`dragon.bitwise.bitwise_xor`
......
......@@ -7,106 +7,172 @@ layers
-------
`class Add <layers/Add.html>`_
: The layer to add a sequence of inputs.
: Layer to add a sequence of inputs.
`class AveragePooling1D <layers/AveragePooling1D.html>`_
: 1D average pooling layer.
`class AveragePooling2D <layers/AveragePooling2D.html>`_
: The average 2d pooling layer.
: 2D average pooling layer.
`class AveragePooling3D <layers/AveragePooling3D.html>`_
: 3D average pooling layer.
`class BatchNormalization <layers/BatchNormalization.html>`_
: The batch normalization layer.
: Batch normalization layer.
`[Ioffe & Szegedy, 2015] <https://arxiv.org/abs/1502.03167>`_.
`class Concatenate <layers/Concatenate.html>`_
: The layer to concatenate a sequence of inputs.
: Layer to concatenate a sequence of inputs.
`class Conv1D <layers/Conv1D.html>`_
: 1D convolution layer.
`class Conv1DTranspose <layers/Conv1DTranspose.html>`_
: 1D deconvolution layer.
`class Conv2D <layers/Conv2D.html>`_
: The 2d convolution layer.
: 2D convolution layer.
`class Conv2DTranspose <layers/Conv2DTranspose.html>`_
: The 2d deconvolution layer.
: 2D deconvolution layer.
`class Conv3D <layers/Conv3D.html>`_
: 3D convolution layer.
`class Conv3DTranspose <layers/Conv3DTranspose.html>`_
: 3D deconvolution layer.
`class Dense <layers/Dense.html>`_
: The fully-connected layer.
: Fully-connected layer.
`class DepthwiseConv2D <layers/DepthwiseConv2D.html>`_
: The 2d depthwise convolution layer.
: 2D depthwise convolution layer.
`[Chollet, 2016] <https://arxiv.org/abs/1610.02357>`_.
`class Dropout <layers/Dropout.html>`_
: The dropout layer.
: Layer to apply the dropout function.
`[Srivastava et.al, 2014] <http://jmlr.org/papers/v15/srivastava14a.html>`_.
`class ELU <layers/ELU.html>`_
: The layer to apply the exponential linear unit.
: Layer to apply the exponential linear unit.
`[Clevert et.al, 2015] <https://arxiv.org/abs/1511.07289>`_.
`class Flatten <layers/Flatten.html>`_
: The layer to reshape input into a matrix.
: Layer to reshape input into a matrix.
`class GlobalAveragePooling1D <layers/GlobalAveragePooling1D.html>`_
: 1D global average pooling layer.
`class GlobalAveragePooling2D <layers/GlobalAveragePooling2D.html>`_
: The global average 2d pooling layer.
: 2D global average pooling layer.
`class GlobalAveragePooling3D <layers/GlobalAveragePooling3D.html>`_
: 3D global average pooling layer.
`class GlobalMaxPool1D <layers/GlobalMaxPool1D.html>`_
: 1D global max pooling layer.
`class GlobalMaxPool2D <layers/GlobalMaxPool2D.html>`_
: The global max 2d pooling layer.
: 2D global max pooling layer.
`class GlobalMaxPool3D <layers/GlobalMaxPool3D.html>`_
: 3D global max pooling layer.
`class Layer <layers/Layer.html>`_
: The base class of layers.
`class LeakyReLU <layers/LeakyReLU.html>`_
: The layer to apply the leaky rectified linear unit.
: Layer to apply the leaky rectified linear unit.
`class Maximum <layers/Maximum.html>`_
: The layer to compute the maximum of a sequence of inputs.
: Layer to compute the maximum of a sequence of inputs.
`class MaxPool1D <layers/MaxPool1D.html>`_
: 1D max pooling layer.
`class MaxPool2D <layers/MaxPool2D.html>`_
: The max 2d pooling layer.
: 2D max pooling layer.
`class MaxPool3D <layers/MaxPool3D.html>`_
: 3D max pooling layer.
`class Minimum <layers/Minimum.html>`_
: The layer to compute the minimum of a sequence of inputs.
: Layer to compute the minimum of a sequence of inputs.
`class Multiply <layers/Multiply.html>`_
: The layer to multiply a sequence of inputs.
: Layer to multiply a sequence of inputs.
`class Permute <layers/Permute.html>`_
: The layer to permute the dimensions of input.
: Layer to permute the dimensions of input.
`class Reshape <layers/Reshape.html>`_
: The layer to change the dimensions of input.
: Layer to change the dimensions of input.
`class ReLU <layers/ReLU.html>`_
: The layer to apply the rectified linear unit.
: Layer to apply the rectified linear unit.
`[Nair & Hinton, 2010] <http://www.csri.utoronto.ca/~hinton/absps/reluICML.pdf>`_.
`class SELU <layers/SELU.html>`_
: Apply the scaled exponential linear unit.
: Layer to apply the scaled exponential linear unit.
`[Klambauer et.al, 2017] <https://arxiv.org/abs/1706.02515>`_.
`class Softmax <layers/Softmax.html>`_
: The layer to apply the softmax function.
: Layer to apply the softmax function.
`class Subtract <layers/Subtract.html>`_
: The layer to subtract two inputs.
: Layer to subtract two inputs.
`class UpSampling1D <layers/UpSampling1D.html>`_
: 1D upsampling layer.
`class UpSampling2D <layers/UpSampling2D.html>`_
: 2D upsampling layer.
`class UpSampling3D <layers/UpSampling3D.html>`_
: 3D upsampling layer.
`class ZeroPadding1D <layers/ZeroPadding1D.html>`_
: 1D zero padding layer.
`class ZeroPadding2D <layers/ZeroPadding2D.html>`_
: 2D zero padding layer.
`class ZeroPadding3D <layers/ZeroPadding3D.html>`_
: 3D zero padding layer.
.. toctree::
:hidden:
layers/Add
layers/AveragePooling1D
layers/AveragePooling2D
layers/AveragePooling3D
layers/BatchNormalization
layers/Concatenate
layers/Conv1D
layers/Conv1DTranspose
layers/Conv2D
layers/Conv2DTranspose
layers/Conv3D
layers/Conv3DTranspose
layers/Dense
layers/DepthwiseConv2D
layers/Dropout
layers/ELU
layers/Flatten
layers/GlobalAveragePooling1D
layers/GlobalAveragePooling2D
layers/GlobalAveragePooling3D
layers/GlobalMaxPool1D
layers/GlobalMaxPool2D
layers/GlobalMaxPool3D
layers/Layer
layers/LeakyReLU
layers/Maximum
layers/MaxPool1D
layers/MaxPool2D
layers/MaxPool3D
layers/Minimum
layers/Multiply
layers/Permute
......@@ -115,6 +181,12 @@ layers
layers/SELU
layers/Softmax
layers/Subtract
layers/UpSampling1D
layers/UpSampling2D
layers/UpSampling3D
layers/ZeroPadding1D
layers/ZeroPadding2D
layers/ZeroPadding3D
.. raw:: html
......
AveragePooling1D
================
.. autoclass:: dragon.vm.tensorflow.keras.layers.AveragePooling1D
__init__
--------
.. automethod:: dragon.vm.tensorflow.keras.layers.AveragePooling1D.__init__
.. raw:: html
<style>
h1:before {
content: "tf.keras.layers.";
color: #103d3e;
}
</style>
AveragePooling3D
================
.. autoclass:: dragon.vm.tensorflow.keras.layers.AveragePooling3D
__init__
--------
.. automethod:: dragon.vm.tensorflow.keras.layers.AveragePooling3D.__init__
.. raw:: html
<style>
h1:before {
content: "tf.keras.layers.";
color: #103d3e;
}
</style>
Conv1D
======
.. autoclass:: dragon.vm.tensorflow.keras.layers.Conv1D
__init__
--------
.. automethod:: dragon.vm.tensorflow.keras.layers.Conv1D.__init__
.. raw:: html
<style>
h1:before {
content: "tf.keras.layers.";
color: #103d3e;
}
</style>
Conv1DTranspose
===============
.. autoclass:: dragon.vm.tensorflow.keras.layers.Conv1DTranspose
__init__
--------
.. automethod:: dragon.vm.tensorflow.keras.layers.Conv1DTranspose.__init__
.. raw:: html
<style>
h1:before {
content: "tf.keras.layers.";
color: #103d3e;
}
</style>
Conv3D
======
.. autoclass:: dragon.vm.tensorflow.keras.layers.Conv3D
__init__
--------
.. automethod:: dragon.vm.tensorflow.keras.layers.Conv3D.__init__
.. raw:: html
<style>
h1:before {
content: "tf.keras.layers.";
color: #103d3e;
}
</style>
Conv3DTranspose
===============
.. autoclass:: dragon.vm.tensorflow.keras.layers.Conv3DTranspose
__init__
--------
.. automethod:: dragon.vm.tensorflow.keras.layers.Conv3DTranspose.__init__
.. raw:: html
<style>
h1:before {
content: "tf.keras.layers.";
color: #103d3e;
}
</style>
GlobalAveragePooling1D
======================
.. autoclass:: dragon.vm.tensorflow.keras.layers.GlobalAveragePooling1D
__init__
--------
.. automethod:: dragon.vm.tensorflow.keras.layers.GlobalAveragePooling1D.__init__
.. raw:: html
<style>
h1:before {
content: "tf.keras.layers.";
color: #103d3e;
}
</style>
GlobalAveragePooling3D
======================
.. autoclass:: dragon.vm.tensorflow.keras.layers.GlobalAveragePooling3D
__init__
--------
.. automethod:: dragon.vm.tensorflow.keras.layers.GlobalAveragePooling3D.__init__
.. raw:: html
<style>
h1:before {
content: "tf.keras.layers.";
color: #103d3e;
}
</style>
GlobalMaxPool1D
===============
.. autoclass:: dragon.vm.tensorflow.keras.layers.GlobalMaxPooling1D
__init__
--------
.. automethod:: dragon.vm.tensorflow.keras.layers.GlobalMaxPooling1D.__init__
.. raw:: html
<style>
h1:before {
content: "tf.keras.layers.";
color: #103d3e;
}
</style>
GlobalMaxPool3D
===============
.. autoclass:: dragon.vm.tensorflow.keras.layers.GlobalMaxPooling3D
__init__
--------
.. automethod:: dragon.vm.tensorflow.keras.layers.GlobalMaxPooling3D.__init__
.. raw:: html
<style>
h1:before {
content: "tf.keras.layers.";
color: #103d3e;
}
</style>
MaxPool1D
=========
.. autoclass:: dragon.vm.tensorflow.keras.layers.MaxPooling1D
__init__
--------
.. automethod:: dragon.vm.tensorflow.keras.layers.MaxPooling1D.__init__
.. raw:: html
<style>
h1:before {
content: "tf.keras.layers.";
color: #103d3e;
}
</style>
MaxPool3D
=========
.. autoclass:: dragon.vm.tensorflow.keras.layers.MaxPooling3D
__init__
--------
.. automethod:: dragon.vm.tensorflow.keras.layers.MaxPooling3D.__init__
.. raw:: html
<style>
h1:before {
content: "tf.keras.layers.";
color: #103d3e;
}
</style>
UpSampling1D
============
.. autoclass:: dragon.vm.tensorflow.keras.layers.UpSampling1D
__init__
--------
.. automethod:: dragon.vm.tensorflow.keras.layers.UpSampling1D.__init__
.. raw:: html
<style>
h1:before {
content: "tf.keras.layers.";
color: #103d3e;
}
</style>
UpSampling2D
============
.. autoclass:: dragon.vm.tensorflow.keras.layers.UpSampling2D
__init__
--------
.. automethod:: dragon.vm.tensorflow.keras.layers.UpSampling2D.__init__
.. raw:: html
<style>
h1:before {
content: "tf.keras.layers.";
color: #103d3e;
}
</style>
UpSampling3D
============
.. autoclass:: dragon.vm.tensorflow.keras.layers.UpSampling3D
__init__
--------
.. automethod:: dragon.vm.tensorflow.keras.layers.UpSampling3D.__init__
.. raw:: html
<style>
h1:before {
content: "tf.keras.layers.";
color: #103d3e;
}
</style>
ZeroPadding1D
=============
.. autoclass:: dragon.vm.tensorflow.keras.layers.ZeroPadding1D
__init__
--------
.. automethod:: dragon.vm.tensorflow.keras.layers.ZeroPadding1D.__init__
.. raw:: html
<style>
h1:before {
content: "tf.keras.layers.";
color: #103d3e;
}
</style>
ZeroPadding2D
=============
.. autoclass:: dragon.vm.tensorflow.keras.layers.ZeroPadding2D
__init__
--------
.. automethod:: dragon.vm.tensorflow.keras.layers.ZeroPadding2D.__init__
.. raw:: html
<style>
h1:before {
content: "tf.keras.layers.";
color: #103d3e;
}
</style>
ZeroPadding3D
=============
.. autoclass:: dragon.vm.tensorflow.keras.layers.ZeroPadding3D
__init__
--------
.. automethod:: dragon.vm.tensorflow.keras.layers.ZeroPadding3D.__init__
.. raw:: html
<style>
h1:before {
content: "tf.keras.layers.";
color: #103d3e;
}
</style>
......@@ -9,19 +9,40 @@ vm.tensorflow.nn
`avg_pool(...) <nn/avg_pool.html>`_
: Apply the n-dimension average pooling.
`avg_pool1d(...) <nn/avg_pool1d.html>`_
: Apply the 1d average pooling.
`avg_pool2d(...) <nn/avg_pool2d.html>`_
: Apply the 2d average pooling.
`avg_pool3d(...) <nn/avg_pool3d.html>`_
: Apply the 3d average pooling.
`batch_normalization(...) <nn/batch_normalization.html>`_
: Apply the batch normalization.
`[Ioffe & Szegedy, 2015] <https://arxiv.org/abs/1502.03167>`_.
`bias_add(...) <nn/bias_add.html>`_
: Add the bias across channels to input.
`conv1d(...) <nn/conv1d.html>`_
: Apply the 1d convolution.
`conv1d_transpose(...) <nn/conv1d_transpose.html>`_
: Apply the 1d deconvolution.
`conv2d(...) <nn/conv2d.html>`_
: Apply the 2d convolution.
`conv2d_transpose(...) <nn/conv2d_transpose.html>`_
: Apply the 2d deconvolution.
`conv3d(...) <nn/conv3d.html>`_
: Apply the 3d convolution.
`conv3d_transpose(...) <nn/conv3d_transpose.html>`_
: Apply the 3d deconvolution.
`convolution(...) <nn/convolution.html>`_
: Apply the n-dimension convolution.
......@@ -56,9 +77,15 @@ vm.tensorflow.nn
`max_pool(...) <nn/max_pool.html>`_
: Apply the n-dimension max pooling.
`max_pool(...) <nn/max_pool.html>`_
`max_pool1d(...) <nn/max_pool1d.html>`_
: Apply the 1d max pooling.
`max_pool2d(...) <nn/max_pool2d.html>`_
: Apply the 2d max pooling.
`max_pool3d(...) <nn/max_pool3d.html>`_
: Apply the 3d max pooling.
`moments(...) <nn/moments.html>`_
: Compute the mean and variance of input along the given axes.
......@@ -94,10 +121,17 @@ vm.tensorflow.nn
:hidden:
nn/avg_pool
nn/avg_pool1d
nn/avg_pool2d
nn/avg_pool3d
nn/batch_normalization
nn/bias_add
nn/conv1d
nn/conv1d_transpose
nn/conv2d
nn/conv2d_transpose
nn/conv3d
nn/conv3d_transpose
nn/convolution
nn/conv_transpose
nn/depthwise_conv2d
......@@ -108,7 +142,9 @@ vm.tensorflow.nn
nn/local_response_normalization
nn/log_softmax
nn/max_pool
nn/max_pool1d
nn/max_pool2d
nn/max_pool3d
nn/moments
nn/relu
nn/relu6
......
avg_pool1d
==========
.. autofunction:: dragon.vm.tensorflow.nn.avg_pool1d
.. raw:: html
<style>
h1:before {
content: "tf.nn.";
color: #103d3e;
}
</style>
avg_pool3d
==========
.. autofunction:: dragon.vm.tensorflow.nn.avg_pool3d
.. raw:: html
<style>
h1:before {
content: "tf.nn.";
color: #103d3e;
}
</style>
bias_add
========
.. autofunction:: dragon.vm.tensorflow.nn.bias_add
.. raw:: html
<style>
h1:before {
content: "tf.nn.";
color: #103d3e;
}
</style>
conv1d
======
.. autofunction:: dragon.vm.tensorflow.nn.conv1d
.. raw:: html
<style>
h1:before {
content: "tf.nn.";
color: #103d3e;
}
</style>
conv1d_transpose
================
.. autofunction:: dragon.vm.tensorflow.nn.conv1d_transpose
.. raw:: html
<style>
h1:before {
content: "tf.nn.";
color: #103d3e;
}
</style>
conv3d
======
.. autofunction:: dragon.vm.tensorflow.nn.conv3d
.. raw:: html
<style>
h1:before {
content: "tf.nn.";
color: #103d3e;
}
</style>
conv3d_transpose
================
.. autofunction:: dragon.vm.tensorflow.nn.conv3d_transpose
.. raw:: html
<style>
h1:before {
content: "tf.nn.";
color: #103d3e;
}
</style>
max_pool1d
==========
.. autofunction:: dragon.vm.tensorflow.nn.max_pool1d
.. raw:: html
<style>
h1:before {
content: "tf.nn.";
color: #103d3e;
}
</style>
max_pool3d
==========
.. autofunction:: dragon.vm.tensorflow.nn.max_pool3d
.. raw:: html
<style>
h1:before {
content: "tf.nn.";
color: #103d3e;
}
</style>
......@@ -7,51 +7,51 @@ vm.tensorlayer.layers
-------
`class BatchNorm <layers/BatchNorm.html>`_
: The layer to apply the batch normalization.
: Batch normalization layer.
`[Ioffe & Szegedy, 2015] <https://arxiv.org/abs/1502.03167>`_.
`class Concat <layers/Concat.html>`_
: The layer to concat tensors according to the given axis.
: Layer to concat tensors according to the given axis.
`class Conv2d <layers/Conv2d.html>`_
: The 2d convolution layer.
: 2d convolution layer.
`class Dense <layers/Dense.html>`_
: The fully connected layer.
: Fully connected layer.
`class Elementwise <layers/Elementwise.html>`_
: The layer to combine inputs by applying element-wise operation.
: Layer to combine inputs by applying element-wise operation.
`class Flatten <layers/Flatten.html>`_
: The layer to reshape input into a matrix.
: Layer to reshape input into a matrix.
`class GlobalMaxPool2d <layers/GlobalMaxPool2d.html>`_
: The global max 2d pooling layer.
: 2d global max pooling layer.
`class GlobalMeanPool2d <layers/GlobalMeanPool2d.html>`_
: The global mean 2d pooling layer.
: 2d global mean pooling layer.
`class MaxPool2d <layers/MaxPool2d.html>`_
: The max 2d pooling layer.
: 2d max pooling layer.
`class MeanPool2d <layers/MeanPool2d.html>`_
: The mean 2d pooling layer.
: 2d mean pooling layer.
`class Layer <layers/Layer.html>`_
: The base layer class.
`class LayerList <layers/LayerList.html>`_
: The sequential layer to stack a group of layers.
: Layer to stack a group of layers.
`class Relu <layers/Relu.html>`_
: The layer to apply the rectified linear unit.
: Layer to apply the rectified linear unit.
`[Nair & Hinton, 2010] <http://www.csri.utoronto.ca/~hinton/absps/reluICML.pdf>`_.
`class Reshape <layers/Reshape.html>`_
: The layer to change the dimensions of input.
: Layer to change the dimensions of input.
`class Transpose <layers/Transpose.html>`_
: The layer to permute the dimensions of input.
: Layer to permute the dimensions of input.
Functions
---------
......
......@@ -164,7 +164,7 @@ vm.torch
: Compute the minimum value of inputs.
`mm(...) <torch/mm.html>`_
: Perform a matrix multiplication.
: Compute the matrix-matrix multiplication.
`mul(...) <torch/mul.html>`_
: Compute the element-wise multiplication.
......
......@@ -245,6 +245,14 @@ int\_
######
.. automethod:: dragon.vm.torch.Tensor.int_
isinf
#####
.. automethod:: dragon.vm.torch.Tensor.isinf
isnan
#####
.. automethod:: dragon.vm.torch.Tensor.isnan
is_floating_point
#################
.. automethod:: dragon.vm.torch.Tensor.is_floating_point
......@@ -281,6 +289,10 @@ max
###
.. automethod:: dragon.vm.torch.Tensor.max
maximum
#######
.. automethod:: dragon.vm.torch.Tensor.maximum
masked_select
#############
.. automethod:: dragon.vm.torch.Tensor.masked_select
......@@ -293,6 +305,14 @@ min
###
.. automethod:: dragon.vm.torch.Tensor.min
minimum
#######
.. automethod:: dragon.vm.torch.Tensor.minimum
mm
###
.. automethod:: dragon.vm.torch.Tensor.mm
mul
###
.. automethod:: dragon.vm.torch.Tensor.mul
......@@ -429,6 +449,10 @@ sort
####
.. automethod:: dragon.vm.torch.Tensor.sort
split
#####
.. automethod:: dragon.vm.torch.Tensor.split
sqrt
####
.. automethod:: dragon.vm.torch.Tensor.sqrt
......@@ -529,8 +553,15 @@ zero\_
.. _torch.full(...): full.html
.. _torch.ge(...): ge.html
.. _torch.gt(...): gt.html
.. _torch.isinf(...): isinf.html
.. _torch.isnan(...): isnan.html
.. _torch.le(...): le.html
.. _torch.lt(...): lt.html
.. _torch.max(...): max.html
.. _torch.maximum(...): maximum.html
.. _torch.min(...): min.html
.. _torch.minimum(...): minimum.html
.. _torch.mm(...): mm.html
.. _torch.mul(...): mul.html
.. _torch.ne(...): ne.html
.. _torch.neg(...): neg.html
......@@ -544,6 +575,7 @@ zero\_
.. _torch.sign(...): sign.html
.. _torch.sin(...): sin.html
.. _torch.sort(...): sort.html
.. _torch.split(...): split.html
.. _torch.sqrt(...): sqrt.html
.. _torch.squeeze(...): squeeze.html
.. _torch.sub(...): sub.html
......
......@@ -9,9 +9,15 @@ vm.torch.nn
`class AffineChannel <nn/AffineChannel.html>`_
: Apply affine transformation along the channels.
`class AvgPool1d <nn/AvgPool1d.html>`_
: Apply the 1d average pooling.
`class AvgPool2d <nn/AvgPool2d.html>`_
: Apply the 2d average pooling.
`class AvgPool3d <nn/AvgPool3d.html>`_
: Apply the 3d average pooling.
`class BatchNorm1d <nn/BatchNorm1d.html>`_
: Apply the batch normalization over 2d input.
`[Ioffe & Szegedy, 2015] <https://arxiv.org/abs/1502.03167>`_.
......@@ -36,12 +42,24 @@ vm.torch.nn
`class ConstantPad2d <nn/ConstantPad2d.html>`_
: Pad input according to the last 3-dimensions with a constant.
`class Conv1d <nn/Conv1d.html>`_
: Apply the 1d convolution.
`class Conv2d <nn/Conv2d.html>`_
: Apply the 2d convolution.
`class Conv3d <nn/Conv3d.html>`_
: Apply the 3d convolution.
`class ConvTranspose1d <nn/ConvTranspose1d.html>`_
: Apply the 1d deconvolution.
`class ConvTranspose2d <nn/ConvTranspose2d.html>`_
: Apply the 2d deconvolution.
`class ConvTranspose3d <nn/ConvTranspose3d.html>`_
: Apply the 3d deconvolution.
`class CrossEntropyLoss <nn/CrossEntropyLoss.html>`_
: Compute the softmax cross entropy with sparse labels.
......@@ -91,6 +109,9 @@ vm.torch.nn
: Apply the hard swish function.
`[Howard et.al, 2019] <https://arxiv.org/abs/1905.02244>`_.
`class Identity <nn/Identity.html>`_
: Apply the identity transformation.
`class KLDivLoss <nn/KLDivLoss.html>`_
: Compute the Kullback-Leibler divergence.
......@@ -118,8 +139,14 @@ vm.torch.nn
: Apply a long short-term memory (LSTM) cell.
`[Hochreiter & Schmidhuber, 1997] <https://doi.org/10.1162>`_.
`class MaxPool1d <nn/MaxPool1d.html>`_
: Apply the 1d max pooling.
`class MaxPool2d <nn/MaxPool2d.html>`_
: Apply the 2d MaxPool2d pooling.
: Apply the 2d max pooling.
`class MaxPool3d <nn/MaxPool3d.html>`_
: Apply the 3d max pooling.
`class Module <nn/Module.html>`_
: The base class of modules.
......@@ -212,7 +239,9 @@ vm.torch.nn
:hidden:
nn/AffineChannel
nn/AvgPool1d
nn/AvgPool2d
nn/AvgPool3d
nn/BatchNorm1d
nn/BatchNorm2d
nn/BatchNorm3d
......@@ -220,8 +249,12 @@ vm.torch.nn
nn/ConstantPad1d
nn/ConstantPad2d
nn/ConstantPad3d
nn/Conv1d
nn/Conv2d
nn/Conv3d
nn/ConvTranspose1d
nn/ConvTranspose2d
nn/ConvTranspose3d
nn/CrossEntropyLoss
nn/CTCLoss
nn/DepthwiseConv2d
......@@ -235,6 +268,7 @@ vm.torch.nn
nn/GumbelSoftmax
nn/Hardsigmoid
nn/Hardswish
nn/Identity
nn/KLDivLoss
nn/L1Loss
nn/LeakyReLU
......@@ -243,7 +277,9 @@ vm.torch.nn
nn/LogSoftmax
nn/LSTM
nn/LSTMCell
nn/MaxPool1d
nn/MaxPool2d
nn/MaxPool3d
nn/Module
nn/MSELoss
nn/NLLLoss
......
AvgPool1d
=========
.. autoclass:: dragon.vm.torch.nn.AvgPool1d
__init__
--------
.. automethod:: dragon.vm.torch.nn.AvgPool1d.__init__
.. _torch.nn.functional.avg_pool1d(...): functional/avg_pool1d.html
.. raw:: html
<style>
h1:before {
content: "torch.nn.";
color: #103d3e;
}
</style>
AvgPool3d
=========
.. autoclass:: dragon.vm.torch.nn.AvgPool3d
__init__
--------
.. automethod:: dragon.vm.torch.nn.AvgPool3d.__init__
.. _torch.nn.functional.avg_pool3d(...): functional/avg_pool3d.html
.. raw:: html
<style>
h1:before {
content: "torch.nn.";
color: #103d3e;
}
</style>
Conv1d
======
.. autoclass:: dragon.vm.torch.nn.Conv1d
__init__
--------
.. automethod:: dragon.vm.torch.nn.Conv1d.__init__
.. _torch.nn.functional.conv1d(...): functional/conv1d.html
.. raw:: html
<style>
h1:before {
content: "torch.nn.";
color: #103d3e;
}
</style>
Conv3d
======
.. autoclass:: dragon.vm.torch.nn.Conv3d
__init__
--------
.. automethod:: dragon.vm.torch.nn.Conv3d.__init__
.. _torch.nn.functional.conv3d(...): functional/conv3d.html
.. raw:: html
<style>
h1:before {
content: "torch.nn.";
color: #103d3e;
}
</style>
ConvTranspose1d
===============
.. autoclass:: dragon.vm.torch.nn.ConvTranspose1d
__init__
--------
.. automethod:: dragon.vm.torch.nn.ConvTranspose1d.__init__
.. _torch.nn.functional.conv_transpose1d(...): functional/conv_transpose1d.html
.. raw:: html
<style>
h1:before {
content: "torch.nn.";
color: #103d3e;
}
</style>
ConvTranspose3d
===============
.. autoclass:: dragon.vm.torch.nn.ConvTranspose3d
__init__
--------
.. automethod:: dragon.vm.torch.nn.ConvTranspose3d.__init__
.. _torch.nn.functional.conv_transpose3d(...): functional/conv_transpose3d.html
.. raw:: html
<style>
h1:before {
content: "torch.nn.";
color: #103d3e;
}
</style>
Identity
========
.. autoclass:: dragon.vm.torch.nn.Identity
__init__
--------
.. automethod:: dragon.vm.torch.nn.Identity.__init__
.. raw:: html
<style>
h1:before {
content: "torch.nn.";
color: #103d3e;
}
</style>
MaxPool1d
=========
.. autoclass:: dragon.vm.torch.nn.MaxPool1d
__init__
--------
.. automethod:: dragon.vm.torch.nn.MaxPool1d.__init__
.. _torch.nn.functional.max_pool1d(...): functional/max_pool1d.html
.. raw:: html
<style>
h1:before {
content: "torch.nn.";
color: #103d3e;
}
</style>
MaxPool3d
=========
.. autoclass:: dragon.vm.torch.nn.MaxPool3d
__init__
--------
.. automethod:: dragon.vm.torch.nn.MaxPool3d.__init__
.. _torch.nn.functional.max_pool3d(...): functional/max_pool3d.html
.. raw:: html
<style>
h1:before {
content: "torch.nn.";
color: #103d3e;
}
</style>
......@@ -7,7 +7,7 @@ __init__
--------
.. automethod:: dragon.vm.torch.nn.SyncBatchNorm.__init__
.. _torch.nn.functional.batch_norm(...): functional/batch_norm.html
.. _torch.nn.functional.sync_batch_norm(...): functional/sync_batch_norm.html
.. raw:: html
......
......@@ -6,9 +6,15 @@ vm.torch.nn.functional
Functions
---------
`avg_pool1d(...) <functional/avg_pool1d.html>`_
: Apply the 1d average pooling to input.
`avg_pool2d(...) <functional/avg_pool2d.html>`_
: Apply the 2d average pooling to input.
`avg_pool3d(...) <functional/avg_pool3d.html>`_
: Apply the 3d average pooling to input.
`batch_norm(...) <functional/batch_norm.html>`_
: Apply the batch normalization to input.
`[Ioffe & Szegedy, 2015] <https://arxiv.org/abs/1502.03167>`_.
......@@ -16,12 +22,24 @@ vm.torch.nn.functional
`binary_cross_entropy_with_logits(...) <functional/binary_cross_entropy_with_logits.html>`_
: Compute the sigmoid cross entropy with contiguous target.
`conv1d(...) <functional/conv1d.html>`_
: Apply 1d convolution to the input.
`conv2d(...) <functional/conv2d.html>`_
: Apply 2d convolution to the input.
`conv3d(...) <functional/conv3d.html>`_
: Apply 3d convolution to the input.
`conv_transpose1d(...) <functional/conv_transpose1d.html>`_
: Apply 1d deconvolution to the input.
`conv_transpose2d(...) <functional/conv_transpose2d.html>`_
: Apply 2d deconvolution to the input.
`conv_transpose3d(...) <functional/conv_transpose3d.html>`_
: Apply 3d deconvolution to the input.
`cross_entropy(...) <functional/cross_entropy.html>`_
: Compute the softmax cross entropy with sparse labels.
......@@ -82,9 +100,15 @@ vm.torch.nn.functional
`interpolate(...) <functional/interpolate.html>`_
: Resize input via interpolating neighborhoods.
`max_pool1d(...) <functional/max_pool1d.html>`_
: Apply the 1d max pooling to input.
`max_pool2d(...) <functional/max_pool2d.html>`_
: Apply the 2d max pooling to input.
`max_pool3d(...) <functional/max_pool3d.html>`_
: Apply the 3d max pooling to input.
`mse_loss(...) <functional/mse_loss.html>`_
: Compute the element-wise squared error.
......@@ -143,11 +167,17 @@ vm.torch.nn.functional
.. toctree::
:hidden:
functional/avg_pool1d
functional/avg_pool2d
functional/avg_pool3d
functional/batch_norm
functional/binary_cross_entropy_with_logits
functional/conv1d
functional/conv2d
functional/conv3d
functional/conv_transpose1d
functional/conv_transpose2d
functional/conv_transpose3d
functional/cross_entropy
functional/ctc_loss
functional/depthwise_conv2d
......@@ -165,7 +195,9 @@ vm.torch.nn.functional
functional/local_response_norm
functional/log_softmax
functional/interpolate
functional/max_pool1d
functional/max_pool2d
functional/max_pool3d
functional/mse_loss
functional/nll_loss
functional/normalize
......
avg_pool1d
==========
.. autofunction:: dragon.vm.torch.nn.functional.avg_pool1d
.. _torch.nn.AvgPool1d(...): ../AvgPool1d.html
.. raw:: html
<style>
h1:before {
content: "torch.nn.functional.";
color: #103d3e;
}
</style>
avg_pool3d
==========
.. autofunction:: dragon.vm.torch.nn.functional.avg_pool3d
.. _torch.nn.AvgPool3d(...): ../AvgPool3d.html
.. raw:: html
<style>
h1:before {
content: "torch.nn.functional.";
color: #103d3e;
}
</style>
conv1d
======
.. autofunction:: dragon.vm.torch.nn.functional.conv1d
.. _torch.nn.Conv1d(...): ../Conv1d.html
.. raw:: html
<style>
h1:before {
content: "torch.nn.functional.";
color: #103d3e;
}
</style>
conv3d
======
.. autofunction:: dragon.vm.torch.nn.functional.conv3d
.. _torch.nn.Conv3d(...): ../Conv3d.html
.. raw:: html
<style>
h1:before {
content: "torch.nn.functional.";
color: #103d3e;
}
</style>
conv_transpose1d
================
.. autofunction:: dragon.vm.torch.nn.functional.conv_transpose1d
.. _torch.nn.ConvTranspose1d(...): ../ConvTranspose1d.html
.. raw:: html
<style>
h1:before {
content: "torch.nn.functional.";
color: #103d3e;
}
</style>
conv_transpose3d
================
.. autofunction:: dragon.vm.torch.nn.functional.conv_transpose3d
.. _torch.nn.ConvTranspose3d(...): ../ConvTranspose3d.html
.. raw:: html
<style>
h1:before {
content: "torch.nn.functional.";
color: #103d3e;
}
</style>
max_pool1d
==========
.. autofunction:: dragon.vm.torch.nn.functional.max_pool1d
.. _torch.nn.MaxPool1d(...): ../MaxPool1d.html
.. raw:: html
<style>
h1:before {
content: "torch.nn.functional.";
color: #103d3e;
}
</style>
max_pool3d
==========
.. autofunction:: dragon.vm.torch.nn.functional.max_pool3d
.. _torch.nn.MaxPool3d(...): ../MaxPool3d.html
.. raw:: html
<style>
h1:before {
content: "torch.nn.functional.";
color: #103d3e;
}
</style>
......@@ -19,18 +19,13 @@
namespace dragon {
typedef enum {
NCHW = 0,
NHWC = 1,
} StorageOrder;
/*!
* \brief Memory to manage both the host and device data.
*/
class DRAGON_API UnifiedMemory {
public:
/*!
* \brief The device-aware state for data mutation.
* \brief The device-aware state for data consistency.
*/
enum State {
/*! \brief Initial state */
......@@ -60,10 +55,10 @@ class DRAGON_API UnifiedMemory {
/*! \brief Switch to the given cuda device */
void SwitchToCUDADevice(int device);
/*! \brief Involve the state to CPUContext */
/*! \brief Involve the state to cpu */
void ToCPU(size_t size = 0);
/*! \brief Involve the state to CUDAContext */
/*! \brief Involve the state to cuda */
void ToCUDA(size_t size = 0);
/*! \brief Return the memory state */
......@@ -86,7 +81,7 @@ class DRAGON_API UnifiedMemory {
return size_t(0);
}
/*! \brief Return the number of memory chunks */
/*! \brief Return the number of batch chunks */
size_t num_chunks() const {
return num_chunks_;
}
......@@ -181,10 +176,10 @@ class DRAGON_API UnifiedMemory {
/*! \brief The cnml data pointer */
void* cnml_ptr_ = nullptr;
/*! \brief The binding cpu tensor for cnml */
/*! \brief The binding cnml cpu tensor */
cnmlCpuTensor_t cnml_cpu_tensor_ = nullptr;
/*! \brief The binding mlu tensor for cnml */
/*! \brief The binding cnml mlu tensor */
cnmlTensor_t cnml_mlu_tensor_ = nullptr;
DISABLE_COPY_AND_ASSIGN(UnifiedMemory);
......
......@@ -43,6 +43,12 @@ typedef struct {
#endif
/*! \brief Order in which the values are laid out in memory */
typedef enum {
NCHW = 0,
NHWC = 1,
} StorageOrder;
/*! \brief Array that packs a fixed number of elements */
template <typename T, int N>
struct SimpleArray {
......
# ---[ General sources
add_subdirectory(activation)
add_subdirectory(array)
add_subdirectory(control_flow)
add_subdirectory(math)
add_subdirectory(normalization)
add_subdirectory(recurrent)
......
......@@ -13,14 +13,14 @@ namespace {
template <typename T, int D>
__global__ void _Transpose(
const int nthreads,
const int ndims,
const int num_dims,
const SimpleArray<int, D> x_strides,
const SimpleArray<int, D> y_dims,
const T* x,
T* y) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
int xi = 0, tmp = yi;
for (int d = ndims - 1; d >= 0; --d) {
for (int d = num_dims - 1; d >= 0; --d) {
int r;
FIXED_DIVISOR_DIV_MOD(y_dims.data[d], tmp, &tmp, &r);
xi += r * x_strides.data[d];
......@@ -32,14 +32,14 @@ __global__ void _Transpose(
template <typename T, int D>
__global__ void _TransposeGrad(
const int nthreads,
const int ndims,
const int num_dims,
const SimpleArray<int, D> x_strides,
const SimpleArray<int, D> y_dims,
const T* dy,
T* dx) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
int xi = 0, tmp = yi;
for (int d = ndims - 1; d >= 0; --d) {
for (int d = num_dims - 1; d >= 0; --d) {
int r;
FIXED_DIVISOR_DIV_MOD(y_dims.data[d], tmp, &tmp, &r);
xi += r * x_strides.data[d];
......
# ---[ General sources
file(GLOB SOURCES *.cc)
set(MODULE_SOURCES ${MODULE_SOURCES} ${SOURCES})
# ---[ CUDA sources
if (USE_CUDA)
file(GLOB CUDA_SOURCES *.cu)
set(KERNEL_CUDA_SOURCES ${KERNEL_CUDA_SOURCES} ${CUDA_SOURCES})
endif()
# ---[ Submit to the parent scope
set(MODULE_SOURCES ${MODULE_SOURCES} PARENT_SCOPE)
set(KERNEL_CUDA_SOURCES ${KERNEL_CUDA_SOURCES} PARENT_SCOPE)
......@@ -23,7 +23,7 @@ _CosGrad<half>(const int nthreads, const half* dy, const half* x, half* dx) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
dx[i] = __float2half(-__half2float(dy[i]) * sin(__half2float(x[i])));
}
}
} // CosGrad
template <>
__global__ void _CosGrad<half2>(
......@@ -36,7 +36,7 @@ __global__ void _CosGrad<half2>(
const float2 grad = __half22float2(dy[i]);
dx[i] = __floats2half2_rn(-grad.x * sin(val.x), -grad.y * sin(val.y));
}
}
} // CosGrad
template <typename T>
__global__ void _SinGrad(const int nthreads, const T* dy, const T* x, T* dx) {
......@@ -51,7 +51,7 @@ _SinGrad<half>(const int nthreads, const half* dy, const half* x, half* dx) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
dx[i] = __float2half(__half2float(dy[i]) * cos(__half2float(x[i])));
}
}
} // SinGrad
template <>
__global__ void _SinGrad<half2>(
......@@ -64,7 +64,7 @@ __global__ void _SinGrad<half2>(
const float2 grad = __half22float2(dy[i]);
dx[i] = __floats2half2_rn(grad.x * cos(val.x), grad.y * cos(val.y));
}
}
} // SinGrad
template <typename T>
__global__ void
......@@ -84,7 +84,7 @@ __global__ void _ReciprocalGrad<half>(
dx[i] = __float2half(
-__half2float(dy[i]) * math::utils::Square(__half2float(y[i])));
}
}
} // ReciprocalGrad
template <>
__global__ void _ReciprocalGrad<half2>(
......@@ -98,7 +98,7 @@ __global__ void _ReciprocalGrad<half2>(
dx[i] =
__floats2half2_rn(-grad.x * (val.x * val.x), -grad.y * (val.y * val.y));
}
}
} // ReciprocalGrad
template <typename T>
__global__ void _RsqrtGrad(const int nthreads, const T* dy, const T* y, T* dx) {
......@@ -114,7 +114,7 @@ _RsqrtGrad<half>(const int nthreads, const half* dy, const half* y, half* dx) {
dx[i] = __float2half(
-0.5f * __half2float(dy[i]) * math::utils::Cube(__half2float(y[i])));
}
}
} // ReciprocalGrad
template <>
__global__ void _RsqrtGrad<half2>(
......@@ -129,7 +129,7 @@ __global__ void _RsqrtGrad<half2>(
-0.5f * grad.x * (val.x * val.x * val.x),
-0.5f * grad.y * (val.y * val.y * val.y));
}
}
} // ReciprocalGrad
} // namespace
......
......@@ -176,35 +176,35 @@ void _Moments(
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T, ScalarT, AccT) \
template <> \
void Moments<T, AccT, CUDAContext>( \
const int num_dims, \
const int* dims, \
const int num_axes, \
const int* axes, \
const T* x, \
AccT* mean, \
AccT* var, \
CUDAContext* ctx) { \
_Moments( \
num_dims, \
dims, \
num_axes, \
axes, \
reinterpret_cast<const ScalarT*>(x), \
mean, \
var, \
ctx); \
#define DEFINE_KERNEL_LAUNCHER(T, AccT) \
template <> \
void Moments<T, AccT, CUDAContext>( \
const int num_dims, \
const int* dims, \
const int num_axes, \
const int* axes, \
const T* x, \
AccT* mean, \
AccT* var, \
CUDAContext* ctx) { \
_Moments( \
num_dims, \
dims, \
num_axes, \
axes, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
mean, \
var, \
ctx); \
}
DEFINE_KERNEL_LAUNCHER(int8_t, int8_t, float);
DEFINE_KERNEL_LAUNCHER(uint8_t, uint8_t, float);
DEFINE_KERNEL_LAUNCHER(int, int, float);
DEFINE_KERNEL_LAUNCHER(int64_t, int64_t, double);
DEFINE_KERNEL_LAUNCHER(float16, half, float);
DEFINE_KERNEL_LAUNCHER(float, float, float);
DEFINE_KERNEL_LAUNCHER(double, double, double);
DEFINE_KERNEL_LAUNCHER(int8_t, float);
DEFINE_KERNEL_LAUNCHER(uint8_t, float);
DEFINE_KERNEL_LAUNCHER(int, float);
DEFINE_KERNEL_LAUNCHER(int64_t, double);
DEFINE_KERNEL_LAUNCHER(float16, float);
DEFINE_KERNEL_LAUNCHER(float, float);
DEFINE_KERNEL_LAUNCHER(double, double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
#include "dragon/core/memory.h"
#include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
......
#include "dragon/core/memory.h"
#include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
......
......@@ -147,7 +147,7 @@ __global__ void _L2NormalizeGrad(
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(name, T, ScalarT, AccT) \
#define DEFINE_KERNEL_LAUNCHER(name, T, AccT) \
template <> \
void name<T, CUDAContext>( \
const int outer_dim, \
......@@ -159,18 +159,18 @@ __global__ void _L2NormalizeGrad(
T* y, \
CUDAContext* ctx) { \
const auto nblocks = outer_dim * inner_dim; \
_##name<ScalarT, AccT> \
_##name<math::ScalarType<T>::type, AccT> \
<<<CUDA_2D_BLOCKS(nblocks), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
nblocks, \
inner_dim, \
reduce_dim, \
AccT(normalizer), \
AccT(epsilon), \
reinterpret_cast<const ScalarT*>(x), \
reinterpret_cast<ScalarT*>(y)); \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \
}
#define DEFINE_GRAD_KERNEL_LAUNCHER(name, T, ScalarT, AccT) \
#define DEFINE_GRAD_KERNEL_LAUNCHER(name, T, AccT) \
template <> \
void name<T, CUDAContext>( \
const int outer_dim, \
......@@ -183,30 +183,30 @@ __global__ void _L2NormalizeGrad(
T* dx, \
CUDAContext* ctx) { \
const auto nblocks = outer_dim * inner_dim; \
_##name<ScalarT, AccT> \
_##name<math::ScalarType<T>::type, AccT> \
<<<CUDA_2D_BLOCKS(nblocks), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
nblocks, \
inner_dim, \
reduce_dim, \
AccT(normalizer), \
AccT(epsilon), \
reinterpret_cast<const ScalarT*>(dy), \
reinterpret_cast<const ScalarT*>(x), \
reinterpret_cast<ScalarT*>(dx)); \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<math::ScalarType<T>::type*>(dx)); \
}
DEFINE_KERNEL_LAUNCHER(L1Normalize, float16, half, float);
DEFINE_KERNEL_LAUNCHER(L1Normalize, float, float, float);
DEFINE_KERNEL_LAUNCHER(L1Normalize, double, double, double);
DEFINE_KERNEL_LAUNCHER(L2Normalize, float16, half, float);
DEFINE_KERNEL_LAUNCHER(L2Normalize, float, float, float);
DEFINE_KERNEL_LAUNCHER(L2Normalize, double, double, double);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, float16, half, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, float, float, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, double, double, double);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, float16, half, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, float, float, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, double, double, double);
DEFINE_KERNEL_LAUNCHER(L1Normalize, float16, float);
DEFINE_KERNEL_LAUNCHER(L1Normalize, float, float);
DEFINE_KERNEL_LAUNCHER(L1Normalize, double, double);
DEFINE_KERNEL_LAUNCHER(L2Normalize, float16, float);
DEFINE_KERNEL_LAUNCHER(L2Normalize, float, float);
DEFINE_KERNEL_LAUNCHER(L2Normalize, double, double);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, float16, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, float, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, double, double);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, float16, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, float, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, double, double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -10,15 +10,18 @@ namespace kernel {
namespace {
#if __CUDA_ARCH__ >= 350
#define LDG(x, i) __ldg(x + i)
#else
#define LDG(x, i) x[i]
#endif
template <typename T>
__global__ void
_BiasAdd(const int nthreads, const int axis_dim, const T* x, const T* b, T* y) {
auto Plus = math::PlusFunctor<T>();
CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 350
y[i] = math::PlusFunctor<T>()(x[i], __ldg(b + i % axis_dim));
#else
y[i] = math::PlusFunctor<T>()(x[i], b[i % axis_dim]);
#endif
y[i] = Plus(x[i], LDG(b, i % axis_dim));
}
}
......@@ -30,12 +33,9 @@ __global__ void _BiasAdd(
const T* x,
const T* b,
T* y) {
auto Plus = math::PlusFunctor<T>();
CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 350
y[i] = math::PlusFunctor<T>()(x[i], __ldg(b + (i / inner_dim) % axis_dim));
#else
y[i] = math::PlusFunctor<T>()(x[i], b[(i / inner_dim) % axis_dim]);
#endif
y[i] = Plus(x[i], LDG(b, (i / inner_dim) % axis_dim));
}
}
......@@ -43,64 +43,48 @@ __global__ void _BiasAdd(
/* ------------------- Launcher Separator ------------------- */
template <>
void BiasAdd<float16, CUDAContext>(
const int outer_dim,
const int inner_dim,
const int axis_dim,
const float16* x,
const float16* b,
float16* y,
CUDAContext* ctx) {
const auto nthreads = outer_dim * axis_dim * inner_dim;
if (inner_dim == 1) {
_BiasAdd<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
nthreads,
axis_dim,
reinterpret_cast<const half*>(x),
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
} else {
_BiasAdd<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
nthreads,
inner_dim,
axis_dim,
reinterpret_cast<const half*>(x),
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
}
}
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void BiasAdd<T, CUDAContext>( \
const int outer_dim, \
const int inner_dim, \
const int axis_dim, \
const T* x, \
const T* b, \
T* y, \
CUDAContext* ctx) { \
const auto nthreads = outer_dim * axis_dim * inner_dim; \
if (inner_dim == 1) { \
_BiasAdd<<< \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>(nthreads, axis_dim, x, b, y); \
} else { \
_BiasAdd<<< \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>(nthreads, inner_dim, axis_dim, x, b, y); \
} \
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void BiasAdd<T, CUDAContext>( \
const int outer_dim, \
const int inner_dim, \
const int axis_dim, \
const T* x, \
const T* b, \
T* y, \
CUDAContext* ctx) { \
const auto nthreads = outer_dim * axis_dim * inner_dim; \
if (inner_dim == 1) { \
_BiasAdd<<< \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>( \
nthreads, \
axis_dim, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<const math::ScalarType<T>::type*>(b), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \
} else { \
_BiasAdd<<< \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>( \
nthreads, \
inner_dim, \
axis_dim, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<const math::ScalarType<T>::type*>(b), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \
} \
}
DEFINE_KERNEL_LAUNCHER(int8_t);
DEFINE_KERNEL_LAUNCHER(uint8_t);
DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
......
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......@@ -23,40 +24,36 @@ void _DepthwiseConv2dNCHW(
const int dilation_h,
const int dilation_w,
const T* x,
const T* w,
const T* filter,
T* y) {
T sum_val;
int ih, iw, xi, wi;
int yc_offset, xc_start, yc_start;
int ih_start, yh_start, iw_start;
for (int n = 0; n < N; ++n) {
for (int c = 0; c < C; ++c) {
yc_offset = n * C + c;
xc_start = yc_offset * H * W;
yc_start = yc_offset * out_h;
for (int oh = 0; oh < out_h; ++oh) {
ih_start = oh * stride_h - pad_h;
yh_start = (yc_start + oh) * out_w;
for (int ow = 0; ow < out_w; ++ow) {
sum_val = T(0);
wi = c * kernel_h * kernel_w;
iw_start = ow * stride_w - pad_w;
for (int kh = 0; kh < kernel_h; ++kh) {
for (int kw = 0; kw < kernel_w; ++kw) {
ih = ih_start + kh * dilation_h;
iw = iw_start + kw * dilation_w;
if (ih >= 0 && ih < H && iw >= 0 && iw < W) {
xi = xc_start + ih * W + iw;
sum_val += x[xi] * w[wi];
const int base_offset = n * C + c;
const int x_offset = base_offset * H * W;
const int y_offset = base_offset * out_h * out_w;
for (int h_out = 0; h_out < out_h; ++h_out) {
const int hstart = h_out * stride_h - pad_h;
for (int w_out = 0; w_out < out_w; ++w_out) {
T val = T(0);
int fi = c * kernel_h * kernel_w;
const int wstart = w_out * stride_w - pad_w;
for (int h_k = 0; h_k < kernel_h; ++h_k) {
for (int w_k = 0; w_k < kernel_w; ++w_k) {
const int h = hstart + h_k * dilation_h;
const int w = wstart + w_k * dilation_w;
if (math::utils::IsAGeZeroAndALtB(h, H) &&
math::utils::IsAGeZeroAndALtB(w, W)) {
const int xi = x_offset + h * W + w;
val += x[xi] * filter[fi];
}
++wi;
} // End kw
} // End kh
y[yh_start + ow] = sum_val;
} // End ow
} // End oh
}
} // End c && n
++fi;
} // End w_k
} // End h_k
y[y_offset + h_out * out_w + w_out] = val;
} // End w_out
} // End h_out
} // End c
} // End n
}
template <typename T>
......@@ -76,40 +73,34 @@ void _DepthwiseConv2dNHWC(
const int dilation_h,
const int dilation_w,
const T* x,
const T* w,
const T* filter,
T* y) {
T sum_val;
int ih, iw, xi, wi;
int xn_start, yn_start;
int ih_start, yh_start;
int iw_start, yw_start;
for (int n = 0; n < N; ++n) {
xn_start = n * H;
yn_start = n * out_h;
for (int oh = 0; oh < out_h; ++oh) {
ih_start = oh * stride_h - pad_h;
yh_start = (yn_start + oh) * out_w;
for (int ow = 0; ow < out_w; ++ow) {
iw_start = ow * stride_w - pad_w;
yw_start = (yh_start + ow) * C;
const int x_offset = n * H * W * C;
const int y_offset = n * out_h * out_w * C;
for (int h_out = 0; h_out < out_h; ++h_out) {
const int hstart = h_out * stride_h - pad_h;
for (int w_out = 0; w_out < out_w; ++w_out) {
const int wstart = w_out * stride_w - pad_w;
for (int c = 0; c < C; ++c) {
sum_val = T(0);
wi = c * kernel_h * kernel_w;
for (int kh = 0; kh < kernel_h; ++kh) {
for (int kw = 0; kw < kernel_w; ++kw) {
ih = ih_start + kh * dilation_h;
iw = iw_start + kw * dilation_w;
if (ih >= 0 && ih < H && iw >= 0 && iw < W) {
xi = ((xn_start + ih) * W + iw) * C + c;
sum_val += x[xi] * w[wi];
T val = T(0);
int fi = c * kernel_h * kernel_w;
for (int h_k = 0; h_k < kernel_h; ++h_k) {
for (int w_k = 0; w_k < kernel_w; ++w_k) {
const int h = hstart + h_k * dilation_h;
const int w = wstart + w_k * dilation_w;
if (math::utils::IsAGeZeroAndALtB(h, H) &&
math::utils::IsAGeZeroAndALtB(w, W)) {
const int xi = x_offset + (h * W + w) * C + c;
val += x[xi] * filter[fi];
}
++wi;
} // End kw
} // End kh
y[yw_start + c] = sum_val;
++fi;
} // End w_k
} // End h_k
y[y_offset + ((h_out * out_w) + w_out) * C + c] = val;
} // End c
} // End ow
} // End oh
} // End w_out
} // End h_out
} // End n
}
......@@ -144,7 +135,7 @@ void DepthwiseConv2d<float16, CPUContext>(
const int dilation_w,
const string& data_format,
const float16* x,
const float16* w,
const float16* filter,
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
......@@ -168,7 +159,7 @@ void DepthwiseConv2d<float, CPUContext>(
const int dilation_w,
const string& data_format,
const float* x,
const float* w,
const float* filter,
float* y,
CPUContext* ctx) {
DISPATCH_DATA_KERNEL(
......@@ -188,7 +179,7 @@ void DepthwiseConv2d<float, CPUContext>(
dilation_h,
dilation_w,
x,
w,
filter,
y);
}
......@@ -211,7 +202,7 @@ void DepthwiseConv2d<float, CPUContext>(
const int dilation_w, \
const string& data_format, \
const T* dy, \
const T* w, \
const T* filter, \
T* dx, \
CPUContext* ctx) { \
NOT_IMPLEMENTED; \
......@@ -235,7 +226,7 @@ void DepthwiseConv2d<float, CPUContext>(
const string& data_format, \
const T* dy, \
const T* x, \
T* dw, \
T* dfilter, \
CPUContext* ctx) { \
NOT_IMPLEMENTED; \
}
......
......@@ -71,13 +71,13 @@ void _RoiAlign(
continue;
}
const float roi_start_w = roi[1] * spatial_scale;
const float roi_start_h = roi[2] * spatial_scale;
const float roi_end_w = roi[3] * spatial_scale;
const float roi_end_h = roi[4] * spatial_scale;
const float roi_wstart = roi[1] * spatial_scale;
const float roi_hstart = roi[2] * spatial_scale;
const float roi_wend = roi[3] * spatial_scale;
const float roi_hend = roi[4] * spatial_scale;
const float roi_w = std::max(roi_end_w - roi_start_w, 1.f);
const float roi_h = std::max(roi_end_h - roi_start_h, 1.f);
const float roi_w = std::max(roi_wend - roi_wstart, 1.f);
const float roi_h = std::max(roi_hend - roi_hstart, 1.f);
const float bin_h = roi_h / (float)out_h;
const float bin_w = roi_w / (float)out_w;
......@@ -94,10 +94,10 @@ void _RoiAlign(
for (int c = 0; c < C; ++c) {
yi = 0;
for (int oh = 0; oh < out_h; ++oh) {
hstart = roi_start_h + oh * bin_h;
for (int ow = 0; ow < out_w; ++ow) {
wstart = roi_start_w + ow * bin_w;
for (int h_out = 0; h_out < out_h; ++h_out) {
hstart = roi_hstart + h_out * bin_h;
for (int w_out = 0; w_out < out_w; ++w_out) {
wstart = roi_wstart + w_out * bin_w;
val = T(0);
for (int i = 0; i < grid_h; ++i) {
h = hstart + (i + .5f) * bin_h / (float)grid_h;
......@@ -108,7 +108,7 @@ void _RoiAlign(
} // End i
offset_y[yi++] = val / num_grids;
}
} // End oh && ow
} // End h_out && w_out
offset_x += x_inner_dim;
offset_y += y_inner_dim;
} // End c
......
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/conversions.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......@@ -54,6 +54,7 @@ _RoiAlignIntp(const int H, const int W, float h, float w, const T* x) {
return t + (b - t) * v;
}
template <typename T>
__device__ void _RoiAlignIntpParam(
const int H,
const int W,
......@@ -63,8 +64,8 @@ __device__ void _RoiAlignIntpParam(
int& bi,
int& li,
int& ri,
float& v,
float& u) {
T& v,
T& u) {
if (h < -1.f || h > H || w < -1.f || w > W) {
li = ri = ti = bi = -1;
return;
......@@ -108,8 +109,8 @@ __global__ void _RoiAlign(
const float* rois,
T* y) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int ow = yi % out_w;
const int oh = (yi / out_w) % out_h;
const int w_out = yi % out_w;
const int h_out = (yi / out_w) % out_h;
const int c = (yi / out_w / out_h) % C;
const int n = yi / out_w / out_h / C;
......@@ -121,26 +122,25 @@ __global__ void _RoiAlign(
continue;
}
const float roi_start_w = roi[1] * spatial_scale;
const float roi_start_h = roi[2] * spatial_scale;
const float roi_end_w = roi[3] * spatial_scale;
const float roi_end_h = roi[4] * spatial_scale;
const float roi_wstart = roi[1] * spatial_scale;
const float roi_hstart = roi[2] * spatial_scale;
const float roi_wend = roi[3] * spatial_scale;
const float roi_hend = roi[4] * spatial_scale;
const float roi_w = max(roi_end_w - roi_start_w, 1.f);
const float roi_h = max(roi_end_h - roi_start_h, 1.f);
const float roi_w = max(roi_wend - roi_wstart, 1.f);
const float roi_h = max(roi_hend - roi_hstart, 1.f);
const float bin_h = roi_h / (float)out_h;
const float bin_w = roi_w / (float)out_w;
const float hstart = roi_hstart + h_out * bin_h;
const float wstart = roi_wstart + w_out * bin_w;
const int grid_h =
sampling_ratio > 0 ? sampling_ratio : ceil(roi_h / out_h);
const int grid_w =
sampling_ratio > 0 ? sampling_ratio : ceil(roi_w / out_w);
const float hstart = roi_start_h + oh * bin_h;
const float wstart = roi_start_w + ow * bin_w;
const T* offset_x = x + (batch_ind * C + c) * H * W;
AccT val = AccT(0);
for (int i = 0; i < grid_h; i++) {
const float h = hstart + (i + .5f) * bin_h / grid_h;
......@@ -149,7 +149,6 @@ __global__ void _RoiAlign(
val += _RoiAlignIntp(H, W, h, w, offset_x);
}
}
y[yi] = convert::To<T>(val / AccT(grid_h * grid_w));
}
}
......@@ -168,8 +167,8 @@ __global__ void _RoiAlignGrad(
const float* rois,
AccT* dx) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int ow = yi % out_w;
const int oh = (yi / out_w) % out_h;
const int w_out = yi % out_w;
const int h_out = (yi / out_w) % out_h;
const int c = (yi / out_w / out_h) % C;
const int n = yi / out_w / out_h / C;
......@@ -178,24 +177,23 @@ __global__ void _RoiAlignGrad(
if (batch_ind < 0) continue;
const float roi_start_w = roi[1] * spatial_scale;
const float roi_start_h = roi[2] * spatial_scale;
const float roi_end_w = roi[3] * spatial_scale;
const float roi_end_h = roi[4] * spatial_scale;
const float roi_wstart = roi[1] * spatial_scale;
const float roi_hstart = roi[2] * spatial_scale;
const float roi_wend = roi[3] * spatial_scale;
const float roi_hend = roi[4] * spatial_scale;
const float roi_w = max(roi_end_w - roi_start_w, 1.f);
const float roi_h = max(roi_end_h - roi_start_h, 1.f);
const float roi_w = max(roi_wend - roi_wstart, 1.f);
const float roi_h = max(roi_hend - roi_hstart, 1.f);
const float bin_h = roi_h / (float)out_h;
const float bin_w = roi_w / (float)out_w;
const float hstart = roi_hstart + h_out * bin_h;
const float wstart = roi_wstart + w_out * bin_w;
const int grid_h =
sampling_ratio > 0 ? sampling_ratio : ceil(roi_h / out_h);
const int grid_w =
sampling_ratio > 0 ? sampling_ratio : ceil(roi_w / out_w);
const float hstart = roi_start_h + oh * bin_h;
const float wstart = roi_start_w + ow * bin_w;
const float dyi = convert::To<float>(dy[yi]) / float(grid_h * grid_w);
float* offset_dx = dx + (batch_ind * C + c) * H * W;
......@@ -209,10 +207,10 @@ __global__ void _RoiAlignGrad(
if (li >= 0 && ri >= 0 && ti >= 0 && bi >= 0) {
const float db = dyi * v;
const float dt = dyi * (1.f - v);
atomicAdd(offset_dx + ti * W + li, (1.f - u) * dt);
atomicAdd(offset_dx + ti * W + ri, u * dt);
atomicAdd(offset_dx + bi * W + li, (1.f - u) * db);
atomicAdd(offset_dx + bi * W + ri, u * db);
math::utils::AtomicAdd(offset_dx + ti * W + li, (1.f - u) * dt);
math::utils::AtomicAdd(offset_dx + ti * W + ri, u * dt);
math::utils::AtomicAdd(offset_dx + bi * W + li, (1.f - u) * db);
math::utils::AtomicAdd(offset_dx + bi * W + ri, u * db);
}
} // End i
} // End j
......@@ -225,9 +223,9 @@ __global__ void _RoiAlignGrad(
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T, ScalarT) \
#define DEFINE_KERNEL_LAUNCHER(name, InputT, OutputT) \
template <> \
void RoiAlign<T, CUDAContext>( \
void name<InputT, CUDAContext>( \
const int C, \
const int H, \
const int W, \
......@@ -236,12 +234,12 @@ __global__ void _RoiAlignGrad(
const int num_rois, \
const float spatial_scale, \
const int sampling_ratio, \
const T* x, \
const InputT* x, \
const float* rois, \
T* y, \
OutputT* y, \
CUDAContext* ctx) { \
auto nthreads = num_rois * C * out_h * out_w; \
_RoiAlign<ScalarT, float> \
_##name<math::ScalarType<InputT>::type, float> \
<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
nthreads, \
C, \
......@@ -251,53 +249,18 @@ __global__ void _RoiAlignGrad(
out_w, \
spatial_scale, \
sampling_ratio, \
reinterpret_cast<const ScalarT*>(x), \
reinterpret_cast<const math::ScalarType<InputT>::type*>(x), \
rois, \
reinterpret_cast<ScalarT*>(y)); \
}
#define DEFINE_GRAD_KERNEL_LAUNCHER(T, ScalarT) \
template <> \
void RoiAlignGrad<T, CUDAContext>( \
const int C, \
const int H, \
const int W, \
const int out_h, \
const int out_w, \
const int num_rois, \
const float spatial_scale, \
const int sampling_ratio, \
const T* dy, \
const float* rois, \
float* dx, \
CUDAContext* ctx) { \
auto nthreads = num_rois * C * out_h * out_w; \
_RoiAlignGrad<<< \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>( \
nthreads, \
C, \
H, \
W, \
out_h, \
out_w, \
spatial_scale, \
sampling_ratio, \
reinterpret_cast<const ScalarT*>(dy), \
rois, \
dx); \
reinterpret_cast<math::ScalarType<OutputT>::type*>(y)); \
}
DEFINE_KERNEL_LAUNCHER(float16, half);
DEFINE_KERNEL_LAUNCHER(float, float);
DEFINE_KERNEL_LAUNCHER(double, double);
DEFINE_GRAD_KERNEL_LAUNCHER(float16, half);
DEFINE_GRAD_KERNEL_LAUNCHER(float, float);
DEFINE_GRAD_KERNEL_LAUNCHER(double, double);
DEFINE_KERNEL_LAUNCHER(RoiAlign, float16, float16);
DEFINE_KERNEL_LAUNCHER(RoiAlign, float, float);
DEFINE_KERNEL_LAUNCHER(RoiAlign, double, double);
DEFINE_KERNEL_LAUNCHER(RoiAlignGrad, float16, float); // RoiAlignGrad
DEFINE_KERNEL_LAUNCHER(RoiAlignGrad, float, float); // RoiAlignGrad
DEFINE_KERNEL_LAUNCHER(RoiAlignGrad, double, float); // RoiAlignGrad
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -37,51 +37,51 @@ void _RoiPool(
continue;
}
const int roi_start_w = std::round(roi[1] * spatial_scale);
const int roi_start_h = std::round(roi[2] * spatial_scale);
const int roi_end_w = std::round(roi[3] * spatial_scale);
const int roi_end_h = std::round(roi[4] * spatial_scale);
const int roi_wstart = std::round(roi[1] * spatial_scale);
const int roi_hstart = std::round(roi[2] * spatial_scale);
const int roi_wend = std::round(roi[3] * spatial_scale);
const int roi_hend = std::round(roi[4] * spatial_scale);
const int roi_w = std::max(roi_end_w - roi_start_w + 1, 1);
const int roi_h = std::max(roi_end_h - roi_start_h + 1, 1);
const int roi_w = std::max(roi_wend - roi_wstart + 1, 1);
const int roi_h = std::max(roi_hend - roi_hstart + 1, 1);
const float bin_h = (float)roi_h / (float)out_h;
const float bin_w = (float)roi_w / (float)out_w;
T val;
bool empty;
int xi, maxi, yi;
int xi, yi, mask_val;
int hstart, wstart, hend, wend;
const T* offset_x = x + batch_ind * x_cols;
for (int c = 0; c < C; ++c) {
yi = 0;
for (int oh = 0; oh < out_h; ++oh) {
hstart = (int)(bin_h * oh);
hstart = std::min(std::max(hstart + roi_start_h, 0), H);
hend = (int)ceil(bin_h * (oh + 1));
hend = std::min(std::max(hend + roi_start_h, 0), H);
for (int h_out = 0; h_out < out_h; ++h_out) {
hstart = (int)(bin_h * h_out);
hstart = std::min(std::max(hstart + roi_hstart, 0), H);
hend = (int)ceil(bin_h * (h_out + 1));
hend = std::min(std::max(hend + roi_hstart, 0), H);
empty = hend == hstart;
for (int ow = 0; ow < out_w; ++ow) {
wstart = (int)(bin_w * ow);
wstart = std::min(std::max(wstart + roi_start_w, 0), W);
wend = (int)ceil(bin_w * (ow + 1));
wend = std::min(std::max(wend + roi_start_w, 0), W);
for (int w_out = 0; w_out < out_w; ++w_out) {
wstart = (int)(bin_w * w_out);
wstart = std::min(std::max(wstart + roi_wstart, 0), W);
wend = (int)ceil(bin_w * (w_out + 1));
wend = std::min(std::max(wend + roi_wstart, 0), W);
empty = empty || (wend == wstart);
maxi = empty ? -1 : 0;
mask_val = empty ? -1 : 0;
val = empty ? T(0) : offset_x[0];
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
xi = h * W + w;
if (offset_x[xi] > offset_y[yi]) {
maxi = xi;
mask_val = xi;
val = offset_x[xi];
}
} // End w
} // End h
offset_y[yi] = val;
offset_mask[yi++] = maxi;
offset_mask[yi++] = mask_val;
}
} // End oh && ow
} // End h_out && w_out
offset_x += x_inner_dim;
offset_y += y_inner_dim;
offset_mask += y_inner_dim;
......@@ -141,7 +141,7 @@ void RoiPool<float16, CPUContext>(
const float spatial_scale, \
const T* dy, \
const float* rois, \
const int* mask, \
int* mask, \
float* dx, \
CPUContext* ctx) { \
NOT_IMPLEMENTED; \
......
......@@ -31,8 +31,8 @@ __global__ void _RoiPool(
T* y) {
auto Greater = math::GreaterFunctor<T>();
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int ow = yi % out_w;
const int oh = (yi / out_w) % out_h;
const int w_out = yi % out_w;
const int h_out = (yi / out_w) % out_h;
const int c = (yi / out_w / out_h) % C;
const int n = yi / out_w / out_h / C;
......@@ -45,25 +45,25 @@ __global__ void _RoiPool(
continue;
}
const int roi_start_w = round(roi[1] * spatial_scale);
const int roi_start_h = round(roi[2] * spatial_scale);
const int roi_end_w = round(roi[3] * spatial_scale);
const int roi_end_h = round(roi[4] * spatial_scale);
const int roi_wstart = round(roi[1] * spatial_scale);
const int roi_hstart = round(roi[2] * spatial_scale);
const int roi_wend = round(roi[3] * spatial_scale);
const int roi_hend = round(roi[4] * spatial_scale);
const int roi_w = max(roi_end_w - roi_start_w + 1, 1);
const int roi_h = max(roi_end_h - roi_start_h + 1, 1);
const int roi_w = max(roi_wend - roi_wstart + 1, 1);
const int roi_h = max(roi_hend - roi_hstart + 1, 1);
const float bin_h = (float)roi_h / (float)out_h;
const float bin_w = (float)roi_w / (float)out_w;
int hstart = floor(bin_h * oh);
int wstart = floor(bin_w * ow);
int hend = ceil(bin_h * (oh + 1));
int wend = ceil(bin_w * (ow + 1));
int hstart = floor(bin_h * h_out);
int wstart = floor(bin_w * w_out);
int hend = ceil(bin_h * (h_out + 1));
int wend = ceil(bin_w * (w_out + 1));
hstart = min(max(hstart + roi_start_h, 0), H);
hend = min(max(hend + roi_start_h, 0), H);
wstart = min(max(wstart + roi_start_w, 0), W);
wend = min(max(wend + roi_start_w, 0), W);
hstart = min(max(hstart + roi_hstart, 0), H);
hend = min(max(hend + roi_hstart, 0), H);
wstart = min(max(wstart + roi_wstart, 0), W);
wend = min(max(wend + roi_wstart, 0), W);
const bool empty = (hend <= hstart) || (wend <= wstart);
int max_idx = empty ? -1 : 0;
......@@ -106,7 +106,8 @@ __global__ void _RoiPoolGrad(
AccT* offset_dx = dx + (batch_ind * C + c) * H * W;
if (LDG(mask, yi) != -1) {
atomicAdd(offset_dx + LDG(mask, yi), convert::To<AccT>(dy[yi]));
math::utils::AtomicAdd(
offset_dx + LDG(mask, yi), convert::To<AccT>(dy[yi]));
}
}
}
......@@ -117,78 +118,43 @@ __global__ void _RoiPoolGrad(
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T, ScalarT) \
template <> \
void RoiPool<T, CUDAContext>( \
const int C, \
const int H, \
const int W, \
const int out_h, \
const int out_w, \
const int num_rois, \
const float spatial_scale, \
const T* x, \
const float* rois, \
int* mask, \
T* y, \
CUDAContext* ctx) { \
auto nthreads = num_rois * C * out_h * out_w; \
_RoiPool<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
nthreads, \
C, \
H, \
W, \
out_h, \
out_w, \
spatial_scale, \
reinterpret_cast<const ScalarT*>(x), \
rois, \
mask, \
reinterpret_cast<ScalarT*>(y)); \
#define DEFINE_KERNEL_LAUNCHER(name, InputT, OutputT) \
template <> \
void name<InputT, CUDAContext>( \
const int C, \
const int H, \
const int W, \
const int out_h, \
const int out_w, \
const int num_rois, \
const float spatial_scale, \
const InputT* x, \
const float* rois, \
int* mask, \
OutputT* y, \
CUDAContext* ctx) { \
auto nthreads = num_rois * C * out_h * out_w; \
_##name<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
nthreads, \
C, \
H, \
W, \
out_h, \
out_w, \
spatial_scale, \
reinterpret_cast<const math::ScalarType<InputT>::type*>(x), \
rois, \
mask, \
reinterpret_cast<math::ScalarType<OutputT>::type*>(y)); \
}
#define DEFINE_GRAD_KERNEL_LAUNCHER(T, ScalarT) \
template <> \
void RoiPoolGrad<T, CUDAContext>( \
const int C, \
const int H, \
const int W, \
const int out_h, \
const int out_w, \
const int num_rois, \
const float spatial_scale, \
const T* dy, \
const float* rois, \
const int* mask, \
float* dx, \
CUDAContext* ctx) { \
auto nthreads = num_rois * C * out_h * out_w; \
_RoiPoolGrad<<< \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>( \
nthreads, \
C, \
H, \
W, \
out_h, \
out_w, \
spatial_scale, \
reinterpret_cast<const ScalarT*>(dy), \
rois, \
mask, \
dx); \
}
DEFINE_KERNEL_LAUNCHER(float16, half);
DEFINE_KERNEL_LAUNCHER(float, float);
DEFINE_KERNEL_LAUNCHER(double, double);
DEFINE_GRAD_KERNEL_LAUNCHER(float16, half);
DEFINE_GRAD_KERNEL_LAUNCHER(float, float);
DEFINE_GRAD_KERNEL_LAUNCHER(double, double);
DEFINE_KERNEL_LAUNCHER(RoiPool, float16, float16);
DEFINE_KERNEL_LAUNCHER(RoiPool, float, float);
DEFINE_KERNEL_LAUNCHER(RoiPool, double, double);
DEFINE_KERNEL_LAUNCHER(RoiPoolGrad, float16, float); // RoiPoolGrad
DEFINE_KERNEL_LAUNCHER(RoiPoolGrad, float, float); // RoiPoolGrad
DEFINE_KERNEL_LAUNCHER(RoiPoolGrad, double, float); // RoiPoolGrad
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -63,15 +63,15 @@ if (USE_OPENMP)
endif()
if (USE_CUDA)
if (USE_SHARED_LIBS)
target_link_libraries_v2(dragon cudart)
target_link_libraries_v2(dragon cublas)
target_link_libraries_v2(dragon curand)
target_link_libraries_v2(dragonrt cudart)
target_link_libraries_v2(dragonrt cublas)
target_link_libraries_v2(dragonrt curand)
else()
target_link_libraries_v2(dragon cudart_static)
target_link_libraries_v2(dragon cublas_static)
target_link_libraries_v2(dragon curand_static)
target_link_libraries_v2(dragonrt cudart_static)
target_link_libraries_v2(dragonrt cublas_static)
target_link_libraries_v2(dragonrt curand_static)
if (CUDA_VERSION VERSION_GREATER "10.0")
target_link_libraries_v2(dragon cublasLt_static)
target_link_libraries_v2(dragonrt cublasLt_static)
endif()
endif()
endif()
......
......@@ -197,9 +197,7 @@ const Map<string, Map<string, string>>& ONNXBackend::get_node_renamed_attrs()
}
const Map<string, string>& ONNXBackend::get_renamed_attrs() const {
const static Map<string, string> kRenamedAttrs{
{"keepdims", "keep_dims"},
};
const static Map<string, string> kRenamedAttrs;
return kRenamedAttrs;
}
......
......@@ -138,17 +138,12 @@ ONNXImporterReturns ONNXBackend::ConvPoolImporter(
attributes.AddRewrittenAttribute("mode")->set_s("AVG");
} else if (onnx_op_type == "GlobalMaxPool") {
attributes.AddRewrittenAttribute("mode")->set_s("MAX");
attributes.AddRewrittenAttribute("global_pooling")->set_i(1);
attributes.AddRewrittenAttribute("global_pool")->set_i(1);
} else if (onnx_op_type == "GlobalAveragePool") {
attributes.AddRewrittenAttribute("mode")->set_s("AVG");
attributes.AddRewrittenAttribute("global_pooling")->set_i(1);
attributes.AddRewrittenAttribute("global_pool")->set_i(1);
}
auto returns = GenericImporter(onnx_node, ctx);
// Determine the op type
OperatorDef* op_def = returns.GetOp(0);
auto ks = attributes.get<ONNX_INTS>("kernel_shape");
*(op_def->mutable_type()) += (str::to(ks.size() > 0 ? ks.size() : 2) + "d");
return returns;
return GenericImporter(onnx_node, ctx);
}
ONNXImporterReturns ONNXBackend::GenericImporter(
......@@ -251,7 +246,11 @@ ONNXImporterReturns ONNXBackend::ResizeImporter(
auto node = NodeProto(onnx_node->node);
auto onnx_node_v2 = ONNXNode(node);
auto& attributes = onnx_node_v2.attributes;
auto coord_mode = attributes.get<string>("coordinate_transformation_mode");
attributes.remove("coordinate_transformation_mode");
if (coord_mode == "align_corners") {
attributes.AddRewrittenAttribute("align_corners")->set_i(1);
}
if (ctx.opset_version() >= 9) {
node.mutable_input()->Clear();
node.add_input(onnx_node->node.input(0));
......
Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!