Commit bd84b7fd by Ting PAN

Add masked AssignOp

1 parent e90a8f1a
......@@ -168,6 +168,7 @@ List Brief
=============== ======================================================================
`Copy`_ Copy the *value* to *ref*.
`Assign`_ Assign the *value* to *ref*.
`MaskedAssign`_ Assign the *value* to *ref* where mask is *1*.
`Equal`_ *Equal* Comparing between A and B.
`Less`_ *Less* Comparing between A and B.
`LessEqual`_ *LessEqual* Comparing between A and B.
......@@ -308,8 +309,9 @@ List Brief
.. _Arange: operators/array.html#dragon.operators.array.Arange
.. _Multinomial: operators/array.html#dragon.operators.array.Multinomial
.. _Copy: operators/control_flow.html#dAragon.operators.control_flow.Copy
.. _Assign: operators/control_flow.html#dAragon.operators.control_flow.Assign
.. _Copy: operators/control_flow.html#dragon.operators.control_flow.Copy
.. _Assign: operators/control_flow.html#dragon.operators.control_flow.Assign
.. _MaskedAssign: operators/control_flow.html#dragon.operators.control_flow.MaskedAssign
.. _Equal: operators/control_flow.html#dragon.operators.control_flow.Equal
.. _Less: operators/control_flow.html#dragon.operators.control_flow.Less
.. _LessEqual: operators/control_flow.html#dragon.operators.control_flow.LessEqual
......
......@@ -72,7 +72,7 @@ class CUDAObject {
if (streams.size() <= (unsigned)stream_id)
streams.resize(stream_id + 1, nullptr);
if (!streams[stream_id]) {
DeviceGuard guard(device_id);
CUDADeviceGuard guard(device_id);
unsigned int flags = !stream_id ?
cudaStreamDefault :
cudaStreamNonBlocking;
......@@ -97,7 +97,7 @@ class CUDAObject {
if (handles.size() <= (unsigned)stream_id)
handles.resize(stream_id + 1, nullptr);
if (!handles[stream_id]) {
DeviceGuard guard(device_id);
CUDADeviceGuard guard(device_id);
CUBLAS_CHECK(cublasCreate_v2(&handles[stream_id]));
CUBLAS_CHECK(cublasSetStream_v2(
handles[stream_id],
......@@ -120,7 +120,7 @@ class CUDAObject {
if (handles.size() <= (unsigned)stream_id)
handles.resize(stream_id + 1, nullptr);
if (!handles[stream_id]) {
DeviceGuard guard(device_id);
CUDADeviceGuard guard(device_id);
CUDNN_CHECK(cudnnCreate(&handles[stream_id]));
CUDNN_CHECK(cudnnSetStream(
handles[stream_id],
......@@ -292,7 +292,7 @@ class CUDAContext {
/*! \brief Return the internal cuda random generator */
curandGenerator_t& curand_generator() {
if (!curand_generator_) {
DeviceGuard guard(device_id_);
CUDADeviceGuard guard(device_id_);
CURAND_CHECK(curandCreateGenerator(
&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT));
CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(
......
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_CONTROL_FLOW_MASKED_ASSIGN_OP_H_
#define DRAGON_OPERATORS_CONTROL_FLOW_MASKED_ASSIGN_OP_H_
#include "core/operator.h"
namespace dragon {
template <class Context>
class MaskedAssignOp final : public Operator<Context> {
public:
MaskedAssignOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T> void RunImpl();
};
} // namespace dragon
#endif // DRAGON_OPERATORS_CONTROL_FLOW_MASKED_ASSIGN_OP_H_
\ No newline at end of file
......@@ -135,19 +135,19 @@ struct CUDADeviceProps {
vector<cudaDeviceProp> props;
};
inline const cudaDeviceProp& GetDeviceProperty(
const int device_id) {
inline const cudaDeviceProp& GetCUDADeviceProp(
int device_id) {
static CUDADeviceProps props;
CHECK_LT(device_id, (int)props.props.size())
<< "Invalid device id: " << device_id
<< "\nInvalid device id: " << device_id
<< "\nDetected " << props.props.size()
<< " eligible cuda devices.";
<< " devices.";
return props.props[device_id];
}
inline bool CUDA_TRUE_FP16_AVAILABLE() {
int device = CUDA_GET_DEVICE();
auto& prop = GetDeviceProperty(device);
auto& prop = GetCUDADeviceProp(device);
return prop.major >= 6;
}
......@@ -156,21 +156,26 @@ inline bool TENSOR_CORE_AVAILABLE() {
return false;
#else
int device = CUDA_GET_DEVICE();
auto& prop = GetDeviceProperty(device);
auto& prop = GetCUDADeviceProp(device);
return prop.major >= 7;
#endif
}
class DeviceGuard {
class CUDADeviceGuard {
public:
DeviceGuard(int new_id) : prev_id(CUDA_GET_DEVICE()) {
if (prev_id != new_id) CUDA_CHECK(cudaSetDevice(new_id));
CUDADeviceGuard(int new_id)
: prev_id_(CUDA_GET_DEVICE()) {
if (prev_id_ != new_id) {
CUDA_CHECK(cudaSetDevice(new_id));
}
}
~DeviceGuard() { CUDA_CHECK(cudaSetDevice(prev_id)); }
~CUDADeviceGuard() {
CUDA_CHECK(cudaSetDevice(prev_id_));
}
private:
int prev_id;
int prev_id_;
};
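// RAII note: the constructor switches the calling thread to <new_id>
// only if it differs from the current device, and the destructor
// restores the previous device, so a scope such as
// { CUDADeviceGuard g(1); /* work on device 1 */ } is self-undoing.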
#else
......
......@@ -100,38 +100,38 @@ void CuDNNSetTensor3dDesc(
template <typename T>
void CuDNNSetTensorDesc(
cudnnTensorDescriptor_t* desc,
const vec64_t& dims);
const vec64_t& dims);
template <typename T>
void CuDNNSetTensor4dDesc(
cudnnTensorDescriptor_t* desc,
const string& data_format,
const vec64_t& dims);
const vec64_t& dims);
template <typename T>
void CuDNNSetTensor4dDescWithGroup(
cudnnTensorDescriptor_t* desc,
const string& data_format,
const vec64_t& dims,
const vec64_t& dims,
const int64_t group);
template <typename T>
void CuDNNSetTensor5dDesc(
cudnnTensorDescriptor_t* desc,
const string& data_format,
const vec64_t& dims);
const vec64_t& dims);
template <typename T>
void CuDNNSetTensor3dDesc(
cudnnTensorDescriptor_t* desc,
const string& data_format,
const vec64_t& dims);
const vec64_t& dims);
template <typename T>
void CuDNNSetTensorDesc(
cudnnTensorDescriptor_t* desc,
const vec64_t& dims,
const vec64_t& strides);
const vec64_t& dims,
const vec64_t& strides);
} // namespace dragon
......
......@@ -657,6 +657,16 @@ void GreaterEqual(
bool* y,
Context* ctx);
/*! control_flow.masked_assign */
template <typename T, class Context>
void MaskedAssign(
const int count,
const uint8_t* mask,
const T* x,
T* y,
Context* ctx);
/*! loss.l1_loss */
template <typename T, class Context>
......
......@@ -488,9 +488,9 @@ class Tensor(object):
Parameters
----------
key : int or slice
key : int, slice or Tensor
The indices.
value : Tensor, number or sequence
value : number, sequence or Tensor
The value.
Returns
......@@ -498,11 +498,20 @@ class Tensor(object):
None
"""
starts, sizes = self._process_indices(key)
if not isinstance(value, Tensor):
value = self._from_constant(value)
return self.CreateOperator('Assign', [value],
existing_outputs=[self], starts=starts, sizes=sizes)
if isinstance(key, Tensor):
return self.CreateOperator(
'MaskedAssign', [value, key],
existing_outputs=[self],
)
else:
starts, sizes = self._process_indices(key)
return self.CreateOperator(
'Assign', [value],
starts=starts, sizes=sizes,
existing_outputs=[self],
)
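# A hedged sketch of what the masked branch computes, written against
# NumPy (array names here are illustrative, not part of this API):
#
#   ref = numpy.array([1., 2., 3., 4.], 'float32')
#   mask = numpy.array([1, 0, 1, 0], 'uint8')   # same size as ref
#   ref[mask.astype(bool)] = 0.                 # i.e. ref[mask] = 0.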
def _from_constant(self, value, name=None):
if not isinstance(value, numpy.ndarray):
......
......@@ -75,10 +75,36 @@ def Assign(inputs, starts=None, sizes=None, **kwargs):
@OpSchema.ConvertConstantInputs()
@OpSchema.Inputs(2)
def MaskedAssign(inputs, mask, **kwargs):
"""Assign the ``value`` to ``ref`` where ``mask`` is *1*.
**Type Constraints**: (*bool*, *int8*, *uint8*, *int32*, *int64*, *float16*, *float32*, *float64*)
Parameters
----------
inputs : sequence of Tensor
The ``ref`` and ``value`` respectively.
mask : Tensor
The mask, with the same size as ``ref``.
Returns
-------
Tensor
The ``ref``.
"""
arguments = ParseArgs(locals())
arguments['existing_outputs'] = [arguments['inputs'][0]]
arguments['inputs'] = [arguments['inputs'][1], mask]
return Tensor.CreateOperator('MaskedAssign', **arguments)
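# Hedged usage sketch, assuming ``ref``, ``value`` and ``mask`` are
# existing Tensors of the same size (this function is re-exported as
# dragon.ops.MaskedAssign):
#
#   ref = ops.MaskedAssign([ref, value], mask)
#
# ``ref`` is bound as the existing output, so it is updated in place
# and keeps its old entries wherever ``mask`` is 0.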
@OpSchema.ConvertConstantInputs()
@OpSchema.Inputs(2)
def Equal(inputs, to_uint8=False, **kwargs):
"""``Equal`` comparing between A and B.
"""*Equal* comparing between A and B.
Set ``to_uint8`` if you expect the ``uint8`` results instead of ``bool``.
Set ``to_uint8`` if you expect the *uint8* results instead of *bool*.
**Type Constraints**: (*bool*, *int8*, *uint8*, *int32*, *int64*, *float16*, *float32*, *float64*)
......@@ -87,7 +113,7 @@ def Equal(inputs, to_uint8=False, **kwargs):
inputs : sequence of Tensor
The inputs, represent A and B respectively.
to_uint8 : bool
``True`` to convert to ``uint8`` results.
*True* to convert to *uint8* results.
Returns
-------
......@@ -102,9 +128,9 @@ def Equal(inputs, to_uint8=False, **kwargs):
@OpSchema.ConvertConstantInputs()
@OpSchema.Inputs(2)
def Less(inputs, to_uint8=False, **kwargs):
"""``Less`` comparing between A and B.
"""*Less* comparing between A and B.
Set ``to_uint8`` if you expect the ``uint8`` results instead of ``bool``.
Set ``to_uint8`` if you expect the *uint8* results instead of *bool*.
**Type Constraints**: (*bool*, *int8*, *uint8*, *int32*, *int64*, *float16*, *float32*, *float64*)
......@@ -113,7 +139,7 @@ def Less(inputs, to_uint8=False, **kwargs):
inputs : sequence of Tensor
The inputs, represent A and B respectively.
to_uint8 : bool
``True`` to convert to ``uint8`` results.
*True* to convert to *uint8* results.
Returns
-------
......@@ -128,9 +154,9 @@ def Less(inputs, to_uint8=False, **kwargs):
@OpSchema.ConvertConstantInputs()
@OpSchema.Inputs(2)
def LessEqual(inputs, to_uint8=False, **kwargs):
"""``LessEqual`` comparing between A and B.
"""*LessEqual* comparing between A and B.
Set ``to_uint8`` if you expect the ``uint8`` results instead of ``bool``.
Set ``to_uint8`` if you expect the *uint8* results instead of *bool*.
**Type Constraints**: (*bool*, *int8*, *uint8*, *int32*, *int64*, *float16*, *float32*, *float64*)
......@@ -139,7 +165,7 @@ def LessEqual(inputs, to_uint8=False, **kwargs):
inputs : sequence of Tensor
The inputs, represent A and B respectively.
to_uint8 : bool
``True`` to convert to ``uint8`` results.
*True* to convert to *uint8* results.
Returns
-------
......@@ -154,9 +180,9 @@ def LessEqual(inputs, to_uint8=False, **kwargs):
@OpSchema.ConvertConstantInputs()
@OpSchema.Inputs(2)
def Greater(inputs, to_uint8=False, **kwargs):
"""``Greater`` comparing between A and B.
"""*Greater* comparing between A and B.
Set ``to_uint8`` if you expect the ``uint8`` results instead of ``bool``.
Set ``to_uint8`` if you expect the *uint8* results instead of *bool*.
**Type Constraints**: (*bool*, *int8*, *uint8*, *int32*, *int64*, *float16*, *float32*, *float64*)
......@@ -165,7 +191,7 @@ def Greater(inputs, to_uint8=False, **kwargs):
inputs : sequence of Tensor
The inputs, represent A and B respectively.
to_uint8 : bool
``True`` to convert to ``uint8`` results.
*True* to convert to *uint8* results.
Returns
-------
......@@ -180,9 +206,9 @@ def Greater(inputs, to_uint8=False, **kwargs):
@OpSchema.ConvertConstantInputs()
@OpSchema.Inputs(2)
def GreaterEqual(inputs, to_uint8=False, **kwargs):
"""``GreaterEqual`` comparing between A and B.
"""*GreaterEqual* comparing between A and B.
Set ``to_uint8`` if you expect the ``uint8`` results instead of ``bool``.
Set ``to_uint8`` if you expect the *uint8* results instead of *bool*.
**Type Constraints**: (*bool*, *int8*, *uint8*, *int32*, *int64*, *float16*, *float32*, *float64*)
......@@ -191,7 +217,7 @@ def GreaterEqual(inputs, to_uint8=False, **kwargs):
inputs : sequence of Tensor
The inputs, represent A and B respectively.
to_uint8 : bool
``True`` to convert to ``uint8`` results.
*True* to convert to *uint8* results.
Returns
-------
......
......@@ -143,6 +143,7 @@ Multinomial = _array_ops.Multinomial
# Control Flow
Copy = _control_flow_ops.Copy
Assign = _control_flow_ops.Assign
MaskedAssign = _control_flow_ops.MaskedAssign
Equal = _control_flow_ops.Equal
Less = _control_flow_ops.Less
LessEqual = _control_flow_ops.LessEqual
......
......@@ -70,8 +70,8 @@ class DataTransformer(multiprocessing.Process):
self._cutout_size = kwargs.get('cutout_size', 0)
self._mirror = kwargs.get('mirror', False)
self._color_aug = kwargs.get('color_augmentation', False)
self._min_random_scale = kwargs.get('min_random_scale', 1.0)
self._max_random_scale = kwargs.get('max_random_scale', 1.0)
self._min_rand_scale = kwargs.get('min_random_scale', 1.0)
self._max_rand_scale = kwargs.get('max_random_scale', 1.0)
self._force_color = kwargs.get('force_color', False)
self._phase = kwargs.get('phase', 'TRAIN')
self._random_seed = _cfg.GetRandomSeed()
......@@ -102,12 +102,16 @@ class DataTransformer(multiprocessing.Process):
im = im.reshape((datum.height, datum.width, datum.channels))
# Random scale
random_scale = numpy.random.uniform() * (
self._max_random_scale - self._min_random_scale) \
+ self._min_random_scale
if random_scale != 1.0:
im = cv2.resize(im, None, fx=random_scale,
fy=random_scale, interpolation=cv2.INTER_LINEAR)
rand_scale = numpy.random.uniform() * (
self._max_rand_scale - self._min_rand_scale
) + self._min_rand_scale
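# e.g. min_random_scale=0.5, max_random_scale=1.5 samples the
# scale uniformly from [0.5, 1.5)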
if rand_scale != 1.0:
im = cv2.resize(
im, None,
fx=rand_scale,
fy=rand_scale,
interpolation=cv2.INTER_LINEAR,
)
# Padding
if self._padding > 0:
......@@ -149,7 +153,7 @@ class DataTransformer(multiprocessing.Process):
# Gray Transformation
if self._force_color:
if im.shape[2] == 1:
# duplicate to 3 channels
# Duplicate to 3 channels
im = numpy.concatenate([im, im, im], axis=2)
# Color Augmentation
......
......@@ -338,11 +338,13 @@ class Module(object):
def run(self, inputs, outputs, auto_grad=True, callback=None):
if self._module_def is None: self._gen_module_def()
meta = (self.module_key, self._module_def)
return RunOperator(
inputs, outputs, meta,
auto_grad=auto_grad,
callback_on_run=callback)
inputs=inputs,
outputs=outputs,
meta=(self.module_key, self._module_def),
auto_grad=auto_grad,
callback_on_run=callback,
)
def train(self, mode=True):
self.training = mode
......
......@@ -17,7 +17,10 @@ from dragon.core import mpi
from dragon.vm.torch.tensor import Tensor, _LeafTensor, _Device
from dragon.vm.torch.ops.primitive import MakeDevice, WrapScalar
from dragon.vm.torch.ops.factory import get_module
from dragon.vm.torch.ops.modules.control_flow import Compare
from dragon.vm.torch.ops.modules.control_flow import (
Assign, MaskedAssign, Compare
)
from dragon.vm.torch.ops.modules.arithmetic import (
Fundamental, Log, Exp, Sqrt,
......@@ -32,9 +35,8 @@ from dragon.vm.torch.ops.modules.init import (
from dragon.vm.torch.ops.modules.array import (
Reshape, Squeeze, UnSqueeze, Permute,
Indexing, Assigning,
Indexing, IndexSelect,
Repeat, Concat, Stack,
IndexSelect,
Reduce, ArgReduce, OneHot, Multinomial,
)
......@@ -48,8 +50,8 @@ from dragon.vm.torch.ops.modules.vision import (
__all__ = [
'add', 'sub', 'mul', 'div',
'accumulate',
'add', 'sub', 'mul', 'div',
'maximum', 'minimum', 'clamp',
'log', 'exp', 'sqrt',
'mm', 'xw_plus_b',
......@@ -59,9 +61,12 @@ __all__ = [
'gt', 'lt', 'eq', 'ge', 'le',
'cat', 'stack', 'narrow',
'index_select',
'one_hot', 'multinomial', 'rand', 'randn',
'zeros', 'zeros_like', 'ones', 'ones_like',
'nn_resize', 'bilinear_resize', 'roi_pool', 'roi_align',
'one_hot', 'multinomial',
'rand', 'randn',
'ones', 'ones_like',
'zeros', 'zeros_like',
'nn_resize', 'bilinear_resize',
'roi_pool', 'roi_align',
]
......@@ -409,52 +414,64 @@ def xw_plus_b(x, w, bias=None, transW=True, out=None):
def _reshape(input, shape, shape_like=None):
if shape_like is not None: shape = shape_like.shape
dev = MakeDevice(inputs=[input]); n_dim = len(shape)
key = 'Reshape/{}/n_dim:{}'.format(dev, n_dim)
module = get_module(Reshape, key, dev, n_dim=n_dim)
dev = MakeDevice(inputs=[input]); ndim = len(shape)
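# The key encodes the device and the rank, so get_module can reuse
# one cached Reshape module per distinct call signature.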
key = 'Reshape/{}/ndim:{}'.format(dev, ndim)
module = get_module(Reshape, key, dev, ndim=ndim)
return module.forward(input, shape)
def _permute(input, perm):
dev = MakeDevice(inputs=[input]); n_perm = len(perm)
key = 'Permute/{}/n_perm:{}'.format(dev, n_perm)
module = get_module(Permute, key, dev, n_perm=n_perm)
dev = MakeDevice(inputs=[input]); nperm = len(perm)
key = 'Permute/{}/nperm:{}'.format(dev, nperm)
module = get_module(Permute, key, dev, nperm=nperm)
return module.forward(input, perm)
def _repeat(input, times):
dev = MakeDevice(inputs=[input]); n_times = len(times)
key = 'Repeat/{}/n_times:{}'.format(dev, n_times)
module = get_module(Repeat, key, dev, n_times=n_times)
dev = MakeDevice(inputs=[input]); ntimes = len(times)
key = 'Repeat/{}/ntimes:{}'.format(dev, ntimes)
module = get_module(Repeat, key, dev, ntimes=ntimes)
return module.forward(input, times)
def _fill(input, shape, value):
dev = MakeDevice(inputs=[input]); n_dim = len(shape)
key = 'Fill/{}/dtype:{}/n_dim:{}/value:{}'.format(
dev, input.dtype, n_dim, value)
module = get_module(Fill, key, dev, n_dim=n_dim,
value=value, dtype=input.dtype)
dev = MakeDevice(inputs=[input]); ndim = len(shape)
key = 'Fill/{}/dtype:{}/ndim:{}/value:{}' \
.format(dev, input.dtype, ndim, value)
module = get_module(
Fill, key, dev,
ndim=ndim,
value=value,
dtype=input.dtype,
)
return module.forward(input, shape)
def _uniform(input, shape, low, high):
dev = MakeDevice(inputs=[input]); n_dim = len(shape)
key = 'Uniform/{}/dtype:{}/n_dim:{}/low:{}/high:{}'.format(
dev, input.dtype, n_dim, float(low), float(high))
dev = MakeDevice(inputs=[input]); ndim = len(shape)
key = 'Uniform/{}/dtype:{}/ndim:{}/low:{}/high:{}'.format(
dev, input.dtype, ndim, float(low), float(high))
module = get_module(
RandomUniform, key, dev, n_dim=n_dim,
low=low, high=high, dtype=input.dtype)
RandomUniform, key, dev,
ndim=ndim,
low=low,
high=high,
dtype=input.dtype,
)
return module.forward(input, shape)
def _normal(input, shape, mean, std):
dev = MakeDevice(inputs=[input]); n_dim = len(shape)
key = 'Normal/{}/dtype:{}/n_dim:{}/mean:{}/std:{}'.format(
dev, input.dtype, n_dim, float(mean), float(std))
dev = MakeDevice(inputs=[input]); ndim = len(shape)
key = 'Normal/{}/dtype:{}/ndim:{}/mean:{}/std:{}'.format(
dev, input.dtype, ndim, float(mean), float(std))
module = get_module(
RandomNormal, key, dev, n_dim=n_dim,
mean=mean, std=std, dtype=input.dtype)
RandomNormal, key, dev,
ndim=ndim,
mean=mean,
std=std,
dtype=input.dtype,
)
return module.forward(input, shape)
......@@ -464,44 +481,62 @@ def _reduce(input, operation, dim=None, keepdim=False, out=None):
key = '{}/{}/dim:{}/keepdim:{}'.format(
operation, dev, dim, int(keepdim))
module = get_module(
Reduce, key, dev, operation=operation,
dim=dim, keepdim=keepdim)
Reduce, key, dev,
dim=dim,
keepdim=keepdim,
operation=operation,
)
return module.forward(input, out)
def _arg_reduce(input, operation, dim=None, keepdim=False, top_k=1, out=None):
def _arg_reduce(input, operation, dim=None, keepdim=False, topk=1, out=None):
if dim is None: keepdim = False
dev = MakeDevice(inputs=[input])
key = '{}/{}/dim:{}/keepdim:{}/top_k:{}'.format(
operation, dev, dim, int(keepdim), top_k)
key = '{}/{}/dim:{}/keepdim:{}/topk:{}'.format(
operation, dev, dim, int(keepdim), topk)
module = get_module(
ArgReduce, key, dev,
operation=operation, axis=dim,
keepdim=keepdim, top_k=top_k)
axis=dim,
topk=topk,
keepdim=keepdim,
operation=operation,
)
return module.forward(input, out)
def _indexing(input, starts, sizes):
n_starts, n_sizes = len(starts), len(sizes)
def _index(input, starts, sizes):
nstarts, nsizes = len(starts), len(sizes)
dev = MakeDevice(inputs=[input])
key = 'Index/{}/n_starts:{}/n_sizes:{}'.format(dev, n_starts, n_sizes)
module = get_module(Indexing, key, dev, n_starts=n_starts, n_sizes=n_sizes)
key = 'Index/{}/nstarts:{}/nsizes:{}'.format(dev, nstarts, nsizes)
module = get_module(Indexing, key, dev, nstarts=nstarts, nsizes=nsizes)
return module.forward(input, starts, sizes)
def _assigning(output, input, starts, sizes):
def _assign(output, starts, sizes, input):
if not isinstance(input, Tensor):
if isinstance(input, (tuple, list)):
input = Tensor(input, dtype=output.dtype, device=output.device)
else:
input = WrapScalar(input, output.dtype, output.device)
n_starts, n_sizes = len(starts), len(sizes)
nstarts, nsizes = len(starts), len(sizes)
dev = MakeDevice(inputs=[input])
key = 'Assign/{}/n_starts:{}/n_sizes:{}'.format(dev, n_starts, n_sizes)
module = get_module(Assigning, key, dev, n_starts=n_starts, n_sizes=n_sizes)
key = 'Assign/{}/nstarts:{}/nsizes:{}'.format(dev, nstarts, nsizes)
module = get_module(Assign, key, dev, nstarts=nstarts, nsizes=nsizes)
return module.forward(input, output, starts, sizes)
def _masked_assign(output, mask, input):
if not isinstance(input, Tensor):
if isinstance(input, (tuple, list)):
input = Tensor(input, dtype=output.dtype, device=output.device)
else:
input = WrapScalar(input, output.dtype, output.device)
dev = MakeDevice(inputs=[input])
key = 'MaskedAssign/{}'.format(dev)
module = get_module(MaskedAssign, key, dev)
return module.forward(input, output, mask)
def _compare(input, other, operation, out=None):
if not isinstance(other, Tensor):
other = WrapScalar(other, input.dtype, input.device)
......@@ -927,7 +962,7 @@ def narrow(input, dimension, start, length):
"""
sizes = list(input.shape[:]); starts = [0] * len(sizes)
starts[dimension], sizes[dimension] = start, length
return _indexing(input, starts, sizes)
return _index(input, starts, sizes)
def one_hot(input, depth):
......@@ -1159,8 +1194,13 @@ def _update(
):
dev = MakeDevice(inputs=[param])
key = '{}/{}/{}/{}'.format(op_type, dev, slot, param.name)
module = get_module(Update, key, dev, op_type=op_type,
lr_mult=lr_mult, decay_mult=decay_mult, slot=slot)
module = get_module(
Update, key, dev,
op_type=op_type,
lr_mult=lr_mult,
decay_mult=decay_mult,
slot=slot,
)
return module.forward(param, grad)
......@@ -1183,8 +1223,12 @@ def _resize_2d(input, op_type, dsize, fx, fy):
dev = MakeDevice(inputs=[input])
key = '{}/{}/dsize:{}/fx:{}/fy:{}'.format(
op_type, dev, '2' if dsize else 'none', fx, fy)
module = get_module(Resize2d, key, dev,
op_type=op_type, dsize=dsize, fx=fx, fy=fy)
module = get_module(
Resize2d, key, dev,
dsize=dsize,
fx=fx, fy=fy,
op_type=op_type,
)
return module.forward(input, dsize)
......
......@@ -27,8 +27,8 @@ class Indexing(BaseModule):
"""
def __init__(self, key, dev, **kwargs):
super(Indexing, self).__init__(key, dev, **kwargs)
self.n_starts = kwargs.get('n_starts', 0)
self.n_sizes = kwargs.get('n_sizes', 0)
self.nstarts = kwargs.get('nstarts', 0)
self.nsizes = kwargs.get('nsizes', 0)
self.register_op()
def register_op(self):
......@@ -37,61 +37,25 @@ class Indexing(BaseModule):
'arguments': {
'starts_desc': [
'${{ANCHOR}}/starts[{}]'.format(n)
for n in range(self.n_starts)],
for n in range(self.nstarts)],
'sizes_desc': [
'${{ANCHOR}}/sizes[{}]'.format(n)
for n in range(self.n_sizes)],
for n in range(self.nsizes)],
},
}
def update_arguments(self, A, starts, sizes):
def update_args(self, A, starts, sizes):
for i, e in enumerate(starts):
self.set_argument_i64('{}/starts[{}]'.format(A, i), e)
self.set_argument_i64('{}/sizes[{}]'.format(A, i), sizes[i])
self.set_arg_i64('{}/starts[{}]'.format(A, i), e)
self.set_arg_i64('{}/sizes[{}]'.format(A, i), sizes[i])
def forward(self, x, starts, sizes):
inputs = [x]; self.unify_devices(inputs)
outputs = [self.register_output()]
callback = lambda A: self.update_arguments(A, starts, sizes)
callback = lambda A: self.update_args(A, starts, sizes)
return self.run(inputs, outputs, callback=callback)
class Assigning(BaseModule):
"""This module imports the *AssignOp* from backend.
Arbitrary length of starts and sizes will be take.
"""
def __init__(self, key, dev, **kwargs):
super(Assigning, self).__init__(key, dev, **kwargs)
self.n_starts = kwargs.get('n_starts', 0)
self.n_sizes = kwargs.get('n_sizes', 0)
self.register_op()
def register_op(self):
self.op_meta = {
'op_type': 'Assign',
'arguments': {
'starts_desc': [
'${{ANCHOR}}/starts[{}]'.format(n)
for n in range(self.n_starts)],
'sizes_desc': [
'${{ANCHOR}}/sizes[{}]'.format(n)
for n in range(self.n_sizes)],
},
}
def update_arguments(self, A, starts, sizes):
for i, e in enumerate(starts):
self.set_argument_i64('{}/starts[{}]'.format(A, i), e)
self.set_argument_i64('{}/sizes[{}]'.format(A, i), sizes[i])
def forward(self, x, y, starts, sizes):
self.unify_devices([x, y])
callback = lambda A: self.update_arguments(A, starts, sizes)
return self.run([x], [y], callback=callback, auto_grad=False)
class Concat(BaseModule):
"""This module imports the *ConcatOp* from backend.
......@@ -200,18 +164,19 @@ class ArgReduce(BaseModule):
self.operation = kwargs.get('operation', 'ARGMAX')
self.axis = kwargs.get('axis', None)
self.keepdim = kwargs.get('keepdim', True)
self.top_k = kwargs.get('top_k', 1)
self.topk = kwargs.get('topk', 1)
self.register_op()
def register_op(self):
self.op_meta = {
'op_type': 'ArgReduce',
'arguments': {
'operation': self.operation if 'ARG' in self.operation \
'operation': self.operation
if 'ARG' in self.operation \
else 'ARG' + self.operation,
'axis': self.axis if self.axis else 2147483647,
'keep_dims': self.keepdim,
'top_k': self.top_k,
'top_k': self.topk,
},
}
......@@ -241,7 +206,7 @@ class ArgReduce(BaseModule):
class Reshape(BaseModule):
def __init__(self, key, dev, **kwargs):
super(Reshape, self).__init__(key, dev, **kwargs)
self.n_dim = kwargs.get('n_dim', 0)
self.ndim = kwargs.get('ndim', 0)
self.register_op()
def register_op(self):
......@@ -250,19 +215,19 @@ class Reshape(BaseModule):
'arguments': {
'dims_desc': [
'${{ANCHOR}}/dims[{}]'.format(n)
for n in range(self.n_dim)
for n in range(self.ndim)
],
},
}
def update_arguments(self, A, shape):
def update_args(self, A, shape):
for i, e in enumerate(shape):
self.set_argument_i64('{}/dims[{}]'.format(A, i), e)
self.set_arg_i64('{}/dims[{}]'.format(A, i), e)
def forward(self, x, shape):
inputs = [x]; self.unify_devices(inputs)
outputs = [_ReferenceTensor(x)]
callback = lambda A: self.update_arguments(A, shape)
callback = lambda A: self.update_args(A, shape)
return self.run(inputs, outputs, callback=callback)
......@@ -275,7 +240,9 @@ class Squeeze(BaseModule):
def register_op(self):
self.op_meta = {
'op_type': 'Squeeze',
'arguments': {'axis': self.dim},
'arguments': {
'axis': self.dim,
},
}
def forward(self, x, out=None):
......@@ -293,7 +260,9 @@ class UnSqueeze(BaseModule):
def register_op(self):
self.op_meta = {
'op_type': 'ExpandDims',
'arguments': {'axis': self.dim},
'arguments': {
'axis': self.dim,
},
}
def forward(self, x, out=None):
......@@ -305,7 +274,7 @@ class UnSqueeze(BaseModule):
class Permute(BaseModule):
def __init__(self, key, dev, **kwargs):
super(Permute, self).__init__(key, dev, **kwargs)
self.n_perm = kwargs.get('n_perm', 0)
self.nperm = kwargs.get('nperm', 0)
self.register_op()
def register_op(self):
......@@ -313,26 +282,26 @@ class Permute(BaseModule):
'op_type': 'Transpose',
'arguments': {
'perm_desc': ['${{ANCHOR}}/perm[{}]'.format(n)
for n in range(self.n_perm)],
for n in range(self.nperm)],
},
}
def update_arguments(self, A, perm):
def update_args(self, A, perm):
if perm:
for i, e in enumerate(perm):
self.set_argument_i64('{}/perm[{}]'.format(A, i), e)
self.set_arg_i64('{}/perm[{}]'.format(A, i), e)
def forward(self, x, perm):
inputs = [x]; self.unify_devices(inputs)
outputs = [self.register_output()]
callback = lambda A: self.update_arguments(A, perm)
callback = lambda A: self.update_args(A, perm)
return self.run(inputs, outputs, callback=callback)
class Repeat(BaseModule):
def __init__(self, key, dev, **kwargs):
super(Repeat, self).__init__(key, dev, **kwargs)
self.n_times = kwargs.get('n_times', 0)
self.ntimes = kwargs.get('ntimes', 0)
self.register_op()
def register_op(self):
......@@ -341,19 +310,19 @@ class Repeat(BaseModule):
'arguments': {
'multiples_desc': [
'${{ANCHOR}}/multiples[{}]'.format(n)
for n in range(self.n_times)
for n in range(self.ntimes)
],
},
}
def update_arguments(self, A, times):
def update_args(self, A, times):
for i, d in enumerate(times):
self.set_argument_i64('{}/multiples[{}]'.format(A, i), d)
self.set_arg_i64('{}/multiples[{}]'.format(A, i), d)
def forward(self, x, times):
inputs = [x]; self.unify_devices(inputs)
outputs = [self.register_output()]
callback = lambda A: self.update_arguments(A, times)
callback = lambda A: self.update_args(A, times)
return self.run(inputs, outputs, callback=callback)
......@@ -409,7 +378,6 @@ class Multinomial(BaseModule):
def __init__(self, key, dev, **kwargs):
super(Multinomial, self).__init__(key, dev, **kwargs)
self.num_samples = kwargs.get('num_samples', 1)
self.normalize = kwargs.get('normalize', False)
self.register_op()
def register_op(self):
......@@ -417,7 +385,7 @@ class Multinomial(BaseModule):
'op_type': 'Multinomial',
'arguments': {
'num_samples': self.num_samples,
'normalize': self.normalize,
'normalize': False,
},
}
......
......@@ -14,9 +14,9 @@ from __future__ import division
from __future__ import print_function
import numpy
from dragon.core import proto_utils as _proto_utils
from dragon.core import workspace as _workspace
from dragon.core import proto_utils as _proto_utils
from dragon.vm.torch.module import Module
......@@ -25,10 +25,14 @@ class BaseModule(Module):
super(BaseModule, self).__init__()
self._module_key = key
self._device = dev
self._args_dev = _proto_utils.\
GetDeviceOption('cpu').SerializeToString()
self._arg_dev = _proto_utils \
.GetDeviceOption('cpu')\
.SerializeToString()
def set_argument_i64(self, name, value):
_workspace.get_default_workspace()\
.FeedTensor(name, numpy.array(
value, dtype=numpy.int64), self._args_dev)
\ No newline at end of file
def set_arg_i64(self, name, value):
_workspace.get_default_workspace() \
.FeedTensor(
name,
numpy.array(value, 'int64'),
self._arg_dev,
)
\ No newline at end of file
......@@ -46,4 +46,53 @@ class Compare(BaseModule):
def forward(self, x1, x2, y):
inputs = [x1, x2]; self.unify_devices(inputs)
outputs = [y] if y else [self.register_output()]
return self.run(inputs, outputs)
\ No newline at end of file
return self.run(inputs, outputs)
class Assign(BaseModule):
"""This module imports the *AssignOp* from backend.
Arbitrary numbers of starts and sizes are accepted.
"""
def __init__(self, key, dev, **kwargs):
super(Assign, self).__init__(key, dev, **kwargs)
self.nstarts = kwargs.get('nstarts', 0)
self.nsizes = kwargs.get('nsizes', 0)
self.register_op()
def register_op(self):
self.op_meta = {
'op_type': 'Assign',
'arguments': {
'starts_desc': [
'${{ANCHOR}}/starts[{}]'.format(n)
for n in range(self.nstarts)],
'sizes_desc': [
'${{ANCHOR}}/sizes[{}]'.format(n)
for n in range(self.nsizes)],
},
}
def update_args(self, A, starts, sizes):
for i, e in enumerate(starts):
self.set_arg_i64('{}/starts[{}]'.format(A, i), e)
self.set_arg_i64('{}/sizes[{}]'.format(A, i), sizes[i])
def forward(self, x, y, starts, sizes):
self.unify_devices([x, y])
callback = lambda A: self.update_args(A, starts, sizes)
return self.run([x], [y], callback=callback, auto_grad=False)
class MaskedAssign(BaseModule):
def __init__(self, key, dev, **kwargs):
super(MaskedAssign, self).__init__(key, dev, **kwargs)
self.register_op()
def register_op(self):
self.op_meta = {'op_type': 'MaskedAssign', 'arguments': {}}
def forward(self, x, y, mask):
self.unify_devices([x, y])
return self.run([x, mask], [y])
\ No newline at end of file
......@@ -19,16 +19,16 @@ from dragon.vm.torch.ops.modules.base import BaseModule
class _InitModule(BaseModule):
def __init__(self, key, dev, **kwargs):
super(_InitModule, self).__init__(key, dev, **kwargs)
self.n_dim = kwargs.get('n_dim', 0)
self.ndim = kwargs.get('ndim', 0)
self.dtype = kwargs.get('dtype', 'float32')
def update_arguments(self, A, shape):
def update_args(self, A, shape):
for i, e in enumerate(shape):
self.set_argument_i64('{}/dims[{}]'.format(A, i), e)
self.set_arg_i64('{}/dims[{}]'.format(A, i), e)
def forward(self, x, shape):
outputs = [x]; self.unify_devices(outputs)
callback = lambda A: self.update_arguments(A, shape)
callback = lambda A: self.update_args(A, shape)
return self.run([], outputs, callback=callback)
......@@ -46,7 +46,7 @@ class Fill(_InitModule):
'value': float(self.value),
'dims_desc': [
'${{ANCHOR}}/dims[{}]'.format(n)
for n in range(self.n_dim)
for n in range(self.ndim)
],
},
}
......@@ -68,7 +68,7 @@ class RandomNormal(_InitModule):
'std': float(self.std),
'dims_desc': [
'${{ANCHOR}}/dims[{}]'.format(n)
for n in range(self.n_dim)
for n in range(self.ndim)
],
},
}
......@@ -90,7 +90,7 @@ class RandomUniform(_InitModule):
'high': float(self.high),
'dims_desc': [
'${{ANCHOR}}/dims[{}]'.format(n)
for n in range(self.n_dim)
for n in range(self.ndim)
],
},
}
\ No newline at end of file
......@@ -13,7 +13,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon.core.mpi as mpi
from dragon.core import mpi as _mpi
from dragon.vm.torch.ops.modules.base import BaseModule
......@@ -50,11 +50,13 @@ class Collective(BaseModule):
self.register_op()
def register_op(self):
idx, group = mpi.AllowParallel()
idx, group = _mpi.AllowParallel()
if idx == -1:
raise RuntimeError('The mpi node({}) dost not in '
'parallel groups. \nSet it using mpi.Parallel([..]).'.format(mpi.Rank()))
mpi_comm, mpi_group = mpi.CreateGroup(root=group[0], incl=group)
raise RuntimeError(
'The mpi node({}) is not in any parallel group.\n'
'Set it using mpi.Parallel([..]).'.format(_mpi.Rank())
)
mpi_comm, mpi_group = _mpi.CreateGroup(root=group[0], incl=group)
self.op_meta = {
'op_type': 'CollectiveUpdate',
'arguments': {
......@@ -78,7 +80,10 @@ class Accumulate(BaseModule):
def register_op(self):
self.op_meta = {
'op_type': 'Accumulate',
'arguments': {'alpha': 1., 'beta': 1.},
'arguments': {
'alpha': 1.,
'beta': 1.,
},
}
def forward(self, grads):
......
......@@ -19,10 +19,10 @@ from dragon.vm.torch.ops.modules.base import BaseModule
class Resize2d(BaseModule):
def __init__(self, key, dev, **kwargs):
super(Resize2d, self).__init__(key, dev, **kwargs)
self.op_type = kwargs.get('op_type', 'NNResize')
self.dsize = kwargs.get('dsize', None)
self.fx = kwargs.get('fx', None)
self.fy = kwargs.get('fy', None)
self.op_type = kwargs.get('op_type', 'NNResize')
self.register_op()
def register_op(self):
......@@ -38,15 +38,15 @@ class Resize2d(BaseModule):
},
}
def update_arguments(self, A, dsize):
def update_args(self, A, dsize):
if self.dsize:
for i, e in enumerate(dsize):
self.set_argument_i64('{}/dsize[{}]'.format(A, i), e)
self.set_arg_i64('{}/dsize[{}]'.format(A, i), e)
def forward(self, input, dsize=None):
inputs = [input]; self.unify_devices(inputs)
outputs = [self.register_output()]
callback = lambda A: self.update_arguments(A, dsize)
callback = lambda A: self.update_args(A, dsize)
return self.run(inputs, outputs, callback=callback)
......@@ -62,7 +62,8 @@ class RoIPool(BaseModule):
self.op_meta = {
'op_type': 'ROIPool',
'arguments': {
'pool_h': self.pool_h, 'pool_w': self.pool_w,
'pool_h': self.pool_h,
'pool_w': self.pool_w,
'spatial_scale': self.spatial_scale,
},
}
......@@ -86,7 +87,8 @@ class RoIAlign(BaseModule):
self.op_meta = {
'op_type': 'ROIAlign',
'arguments': {
'pool_h': self.pool_h, 'pool_w': self.pool_w,
'pool_h': self.pool_h,
'pool_w': self.pool_w,
'spatial_scale': self.spatial_scale,
'sampling_ratio': self.sampling_ratio,
},
......
......@@ -23,9 +23,9 @@ from dragon.vm.torch.ops.builtin import (
_fundamental, _rfundamental,
log, exp, sqrt, clamp,
_reshape, squeeze, unsqueeze,
_permute, _repeat,
_indexing, _assigning,
narrow, index_select,
_permute, _repeat, narrow,
_index, index_select,
_assign, _masked_assign,
mean, sum, max, min,
gt, lt, eq, ge, le,
)
......@@ -41,6 +41,7 @@ def _type_to(input, dtype='float32', inplace=False):
Tensor.fill_ = lambda self, value: _fill(self, self.shape, value)
Tensor.masked_fill_ = lambda *args, **kwargs: _masked_assign(*args, **kwargs)
Tensor.uniform_ = lambda self, low=0, high=1: _uniform(self, self.shape, low, high)
Tensor.normal_ = lambda self, mean=0, std=1: _normal(self, self.shape, mean, std)
Tensor.multinomial = lambda *args, **kwargs: multinomial(*args, **kwargs)
......@@ -85,8 +86,8 @@ Tensor.le = lambda *args, **kwargs: le(*args, **kwargs)
Tensor.eq = lambda *args, **kwargs: eq(*args, **kwargs)
Tensor.index_select = lambda *args, **kwargs: index_select(*args, **kwargs)
Tensor.narrow = lambda *args, **kwargs: narrow(*args, **kwargs)
Tensor._indexing = lambda *args, **kwargs: _indexing(*args, **kwargs)
Tensor._assigning = lambda *args, **kwargs: _assigning(*args, **kwargs)
Tensor._index = lambda *args, **kwargs: _index(*args, **kwargs)
Tensor._assign = lambda *args, **kwargs: _assign(*args, **kwargs)
Tensor.half = lambda self: _type_to(self, dtype='float16', inplace=False)
......
......@@ -533,16 +533,16 @@ class Tensor(object):
"""
starts, sizes = self._process_indices(item)
return self._indexing(starts, sizes)
return self._index(starts, sizes)
def __setitem__(self, key, value):
"""Set the value at the specific indices.
Parameters
----------
key : int, slice
key : int, slice or dragon.vm.torch.Tensor
The indices.
value : dragon.vm.torch.Tensor, number or sequence
value : number, sequence or dragon.vm.torch.Tensor
The value.
Returns
......@@ -550,8 +550,11 @@ class Tensor(object):
None
"""
starts, sizes = self._process_indices(key)
return self._assigning(value, starts, sizes)
if isinstance(key, Tensor):
return self.masked_fill_(key, value)
else:
starts, sizes = self._process_indices(key)
return self._assign(starts, sizes, value)
def __hash__(self):
return id(self)
......@@ -886,7 +889,7 @@ class Tensor(object):
return self
def fill_(self, value):
"""Fills self tensor with the specified value.
"""Fill self with the given value.
Parameters
----------
......@@ -901,6 +904,24 @@ class Tensor(object):
"""
raise NotImplementedError('Refer torch.ops.tensor.fill_')
def masked_fill_(self, mask, value):
"""Fill self with the given value where ``mask`` is *1*.
Parameters
----------
mask : dragon.vm.torch.Tensor
The mask.
value : number
The value to fill.
Returns
-------
dragon.vm.torch.Tensor
The self.
"""
raise NotImplementedError('Refer torch.ops.tensor.masked_fill_')
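# Hedged example, assuming the uint8 mask comes from a comparison op
# as wired in dragon.vm.torch.ops.builtin (names are illustrative):
#
#   mask = x.gt(0.5)         # uint8 mask with the same size as x
#   x.masked_fill_(mask, 0)  # in-place: x[i] = 0 wherever mask[i] is 1
#   x[mask] = 0              # __setitem__ route; same MaskedAssign op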
def zero_(self):
"""Fills self tensor with zeros.
......
......@@ -123,7 +123,7 @@ void MixedMemory::SwitchToCUDADevice(int device_id) {
if (device_id != ptr_device_) {
// Move the memory to another device
void* new_ptr_ = nullptr;
DeviceGuard gurad(device_id);
CUDADeviceGuard guard(device_id);
new_ptr_ = CUDAContext::New(nbytes_);
CUDAContext::MemcpyEx<CUDAContext, CUDAContext>(
nbytes_, new_ptr_, cuda_ptr_, ptr_device_);
......
......@@ -4,24 +4,31 @@ namespace dragon {
bool OpSchema::Verify(const OperatorDef& def) const {
if (ignore_verify_) return true;
string indicator = "[" + def.name() + ", " + def.type() + "]\n";
if (def.input_size() < min_input_ || def.input_size() > max_input_) {
LOG(FATAL) << indicator << "Input size: " << def.input_size()
<< " is not in range [min=" << min_input_
<< ", max=" << max_input_ << "]";
auto header = "[" + def.name() + ", " + def.type() + "]\n";
if (def.input_size() < min_input_ ||
def.input_size() > max_input_) {
LOG(FATAL)
<< header << "Input size: " << def.input_size()
<< " is not in range [min=" << min_input_
<< ", max=" << max_input_ << "]";
}
if (def.output_size() < min_output_ || def.output_size() > max_output_) {
LOG(FATAL) << indicator << "Output size: " << def.output_size()
<< " is not in range [min=" << min_output_
<< ", max=" << max_output_ << "]";
if (def.output_size() < min_output_ ||
def.output_size() > max_output_) {
LOG(FATAL)
<< header << "Output size: " << def.output_size()
<< " is not in range [min=" << min_output_
<< ", max=" << max_output_ << "]";
}
for (int in = 0; in < def.input_size(); in++) {
if (def.input(in) == "NULL") continue;
for (int out = 0; out < def.output_size(); out++) {
if (def.output(out) == "NULL") continue;
if (def.input(in) == def.output(out) && (!CheckInplace(in, out)))
LOG(FATAL) << indicator << "Input(" << in << ") and "
<< "Output(" << out << ") can not be set to inplace.";
for (int i = 0; i < def.input_size(); ++i) {
if (def.input(i) == "NULL") continue;
for (int j = 0; j < def.output_size(); ++j) {
if (def.output(j) == "NULL") continue;
if (def.input(i) == def.output(j) &&
!CheckInplace(i, j))
LOG(FATAL)
<< header << "Input(" << i
<< ") and Output(" << j << ") "
<< "can not be set to inplace.";
}
}
return true;
......
......@@ -54,7 +54,7 @@ __global__ void _Assign(
const T* x, \
T* y, \
CUDAContext* ctx) { \
_Assign<T> \
_Assign \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
count, \
......
#include "utils/op_kernel.h"
#include "utils/math_utils.h"
#include "utils/omp_alternative.h"
namespace dragon {
namespace kernel {
/* <T = ?, Device = CPU> */
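// Leaves y[i] untouched where mask[i] is 0 and overwrites it with
// x[i] where mask[i] is non-zero.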
template <typename T>
void _MaskedAssign(
const int count,
const uint8_t* mask,
const T* x,
T* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
y[i] = mask[i] ? x[i] : y[i];
}
}
/* Kernel Launchers */
#define DEFINE_ASSIGN_KERNEL_LAUNCHER(T) \
template<> void MaskedAssign<T, CPUContext>( \
const int count, \
const uint8_t* mask, \
const T* x, \
T* y, \
CPUContext* ctx) { \
_MaskedAssign(count, mask, x, y); \
}
DEFINE_ASSIGN_KERNEL_LAUNCHER(bool);
DEFINE_ASSIGN_KERNEL_LAUNCHER(int8_t);
DEFINE_ASSIGN_KERNEL_LAUNCHER(uint8_t);
DEFINE_ASSIGN_KERNEL_LAUNCHER(int);
DEFINE_ASSIGN_KERNEL_LAUNCHER(int64_t);
DEFINE_ASSIGN_KERNEL_LAUNCHER(float16);
DEFINE_ASSIGN_KERNEL_LAUNCHER(float);
DEFINE_ASSIGN_KERNEL_LAUNCHER(double);
#undef DEFINE_ASSIGN_KERNEL_LAUNCHER
} // namespace kernel
} // namespace dragon
\ No newline at end of file
#ifdef WITH_CUDA
#include "core/context_cuda.h"
#include "utils/op_kernel.h"
namespace dragon {
namespace kernel {
/* <T = ?, Device = CUDA> */
template<typename T>
__global__ void _MaskedAssign(
const int nthreads,
const uint8_t* mask,
const T* x,
T* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = mask[i] ? x[i] : y[i];
}
}
/* Kernel Launchers */
#define DEFINE_ASSIGN_KERNEL_LAUNCHER(T) \
template<> void MaskedAssign<T, CUDAContext>( \
const int count, \
const uint8_t* mask, \
const T* x, \
T* y, \
CUDAContext* ctx) { \
_MaskedAssign \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
count, mask, x, y \
); \
}
DEFINE_ASSIGN_KERNEL_LAUNCHER(bool);
DEFINE_ASSIGN_KERNEL_LAUNCHER(int8_t);
DEFINE_ASSIGN_KERNEL_LAUNCHER(uint8_t);
DEFINE_ASSIGN_KERNEL_LAUNCHER(int);
DEFINE_ASSIGN_KERNEL_LAUNCHER(int64_t);
DEFINE_ASSIGN_KERNEL_LAUNCHER(float16);
DEFINE_ASSIGN_KERNEL_LAUNCHER(float);
DEFINE_ASSIGN_KERNEL_LAUNCHER(double);
#undef DEFINE_ASSIGN_KERNEL_LAUNCHER
} // namespace kernel
} // namespace dragon
#endif // WITH_CUDA
\ No newline at end of file
......@@ -49,7 +49,7 @@ void AssignOp<Context>::RunImpl() {
} else if (X(0).count() == X_.count()) {
x = X(0).template data<T, Context>();
} else {
LOG(FATAL)
LOG(FATAL)
<< "Could not assign "
<< X(0).DimString()
<< " to "
......@@ -146,7 +146,10 @@ DEPLOY_CUDA(Assign);
#endif
OPERATOR_SCHEMA(Assign)
.NumInputs(1).NumOutputs(1);
/* V */
.NumInputs(1)
/* X */
.NumOutputs(1);
NO_GRADIENT(Assign);
......
#include "core/workspace.h"
#include "utils/op_kernel.h"
#include "utils/math_utils.h"
#include "utils/math_functions.h"
#include "operators/control_flow/masked_assign_op.h"
namespace dragon {
template <class Context> template <typename T>
void MaskedAssignOp<Context>::RunImpl() {
const T* x = nullptr;
auto* mask = X(1).template raw_data<Context>();
auto* y = Y(0)->template mutable_data<T, Context>();
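// If the value has fewer elements than the output, try to broadcast
// it first: e.g. a (1, C) row or an (N, 1) column is tiled into a
// full-size scratch buffer before the masked copy.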
if (X(0).count() < Y(0)->count()) {
int rows, cols;
auto* scratch = ws()
->template data<T, Context>
({ Y(0)->count() })[0];
auto* rx = X(0).template data<T, Context>();
if (utils::IsRowwiseBroadcast(
Y(0)->dims(), X(0).dims(),
&rows, &cols)) {
math::BroadcastSet(
rows, cols, 0,
rx, scratch, ctx()
);
} else if (utils::IsColwiseBroadcast(
Y(0)->dims(), X(0).dims(),
&rows, &cols)) {
math::BroadcastSet(
rows, cols, 1,
rx, scratch, ctx()
);
} else {
LOG(FATAL)
<< "Could not broadcast "
<< X(0).DimString()
<< " to "
<< Y(0)->DimString();
}
x = scratch;
} else if (X(0).count() == Y(0)->count()) {
x = X(0).template data<T, Context>();
} else {
LOG(FATAL)
<< "Could not assign "
<< X(0).DimString()
<< " to "
<< Y(0)->DimString();
}
kernel::MaskedAssign(
Y(0)->count(),
(const uint8_t*)mask,
x, y, ctx()
);
}
template <class Context>
void MaskedAssignOp<Context>::RunOnDevice() {
CHECK_EQ(X(1).count(), Y(0)->count())
<< "\nSize of mask and input should be equal.";
CHECK(XIsType(X(1), bool) || XIsType(X(1), uint8_t))
<< "\nExcepted bool or uint8 mask.";
if (XIsType(X(0), bool)) {
RunImpl<bool>();
} else if (XIsType(X(0), int8_t)) {
RunImpl<int8_t>();
} else if (XIsType(X(0), uint8_t)) {
RunImpl<uint8_t>();
} else if (XIsType(X(0), int)) {
RunImpl<int>();
} else if (XIsType(X(0), int64_t)) {
RunImpl<int64_t>();
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else if (XIsType(X(0), float)) {
RunImpl<float>();
} else if (XIsType(X(0), double)) {
RunImpl<double>();
} else {
LOG(FATAL) << DTypeString(X(0), {
"bool", "int8", "uint8", "int32", "int64",
"float16", "float32", "float64",
});
}
}
DEPLOY_CPU(MaskedAssign);
#ifdef WITH_CUDA
DEPLOY_CUDA(MaskedAssign);
#endif
OPERATOR_SCHEMA(MaskedAssign)
/* V, M */
.NumInputs(2)
/* X */
.NumOutputs(1);
NO_GRADIENT(MaskedAssign);
} // namespace dragon
\ No newline at end of file