Commit 4eab1c68 by Ting PAN

Unify the boolean operators

1 parent d1f714ea
Showing with 2255 additions and 845 deletions
......@@ -64,7 +64,6 @@ List Brief
`Tensor.__repr__`_ Return the information (name/shape).
`Tensor.__getitem__`_ Return the value at the specific indices.
`Tensor.__setitem__`_ Set the value at the specific indices.
`Tensor.__call__`_ Return the expressions for displaying.
============================== =============================================================================
......@@ -95,7 +94,6 @@ API Reference
.. automethod:: __repr__
.. automethod:: __getitem__
.. automethod:: __setitem__
.. automethod:: __call__
.. _Tensor.Variable: #dragon.core.tensor.Tensor.Variable
.. _Tensor.Placeholder: #dragon.core.tensor.Tensor.Placeholder
......
......@@ -135,7 +135,9 @@ Array
=============== ======================================================================
List Brief
=============== ======================================================================
`Where`_ Select elements from either *x* or *y*.
`IndexSelect`_ Select the elements according to the indices along the given axis.
`MaskedSelect`_ Select the elements where *mask* is *1*.
`Reduce`_ Reduce the inputs along the given axes.
`Sum`_ Compute the sum along the given axis.
`Mean`_ Compute the mean along the given axis.
......@@ -149,7 +151,7 @@ List Brief
`Repeat`_ Repeat the input along the given axis.
`Transpose`_ Transpose the input according to the given permutations.
`Tile`_ Tile the input according to the given multiples.
`Pad`_ Pad the input according to the given paddings.
`Pad`_ Pad the input according to the given sizes.
`Crop`_ Crop the input according to the given starts and sizes.
`OneHot`_ Generate the one-hot representation of inputs.
`Flatten`_ Flatten the input along the given axes.
......@@ -157,6 +159,7 @@ List Brief
`Squeeze`_ Remove the dimensions with size 1.
`ExpandDims`_ Expand a new dimension of size 1 at the specific axis.
`Shape`_ Get the dynamic shape of a Tensor.
`NonZero`_ Return the indices of non-zero elements.
`Arange`_ Return evenly spaced values within a given interval.
`Multinomial`_ Return indices sampled from the multinomial distribution.
=============== ======================================================================
......@@ -170,6 +173,7 @@ List Brief
`Assign`_ Assign the *value* to *ref*.
`MaskedAssign`_ Assign the *value* to *ref* where mask is *1*.
`Equal`_ *Equal* Comparing between A and B.
`NotEqual`_ *NotEqual* Comparing between A and B.
`Less`_ *Less* Comparing between A and B.
`LessEqual`_ *LessEqual* Comparing between A and B.
`Greater`_ *Greater* Comparing between A and B.
......@@ -284,7 +288,9 @@ List Brief
.. _InstanceNorm: operators/norm.html#dragon.operators.norm.InstanceNorm
.. _L2Norm: operators/norm.html#dragon.operators.norm.L2Norm
.. _Where: operators/array.html#dragon.operators.array.Where
.. _IndexSelect: operators/array.html#dragon.operators.array.IndexSelect
.. _MaskedSelect: operators/array.html#dragon.operators.array.MaskedSelect
.. _Crop: operators/array.html#dragon.operators.array.Crop
.. _Reduce: operators/array.html#dragon.operators.array.Reduce
.. _Sum: operators/array.html#dragon.operators.array.Sum
......@@ -307,12 +313,14 @@ List Brief
.. _ExpandDims: operators/array.html#dragon.operators.array.ExpandDims
.. _Shape: operators/array.html#dragon.operators.array.Shape
.. _Arange: operators/array.html#dragon.operators.array.Arange
.. _NonZero: operators/array.html#dragon.operators.array.NonZero
.. _Multinomial: operators/array.html#dragon.operators.array.Multinomial
.. _Copy: operators/control_flow.html#dragon.operators.control_flow.Copy
.. _Assign: operators/control_flow.html#dragon.operators.control_flow.Assign
.. _MaskedAssign: operators/control_flow.html#dragon.operators.control_flow.MaskedAssign
.. _Equal: operators/control_flow.html#dragon.operators.control_flow.Equal
.. _NotEqual: operators/control_flow.html#dragon.operators.control_flow.NotEqual
.. _Less: operators/control_flow.html#dragon.operators.control_flow.Less
.. _LessEqual: operators/control_flow.html#dragon.operators.control_flow.LessEqual
.. _Greater: operators/control_flow.html#dragon.operators.control_flow.Greater
......
......@@ -38,6 +38,7 @@ class GradientMakerBase {
const vector<string>& g_outputs)
: def(def), g_outputs_(g_outputs),
g_inputs_(def.input_size()) {}
virtual ~GradientMakerBase() {}
virtual bool CopyDeviceOption() const { return true; }
......@@ -45,9 +46,9 @@ class GradientMakerBase {
virtual bool CopyArguments() const { return true; }
virtual Gradient Make() {
vector<OperatorDef> new_defs = MakeDef();
auto new_defs = MakeDef();
if (def.has_uid()) {
// Attach the anchor to the name if having UID
// Attach the anchor to name if having UID
for (int i = 0; i < new_defs.size(); i++)
new_defs[i].set_name(def.name());
} else {
......@@ -57,14 +58,14 @@ class GradientMakerBase {
for (int i = 0; i < new_defs.size(); i++)
new_defs[i].add_arg()->CopyFrom(anchor);
}
return Gradient(new_defs, g_inputs_, DefaultValues());
return Gradient(new_defs, g_inputs_, defaults());
};
virtual vector<OperatorDef> MakeDef() {
return vector<OperatorDef>();
}
virtual vector<float> DefaultValues() {
virtual vector<float> defaults() {
return vector<float>(g_outputs_.size(), 1.f);
}
......@@ -135,15 +136,17 @@ class SimpleGradientMaker final : public GradientMakerBase {
GRADIENT_MAKER_CTOR(SimpleGradientMaker);
vector<OperatorDef> MakeDef() override {
vector<string> inputs, outputs;
for (const auto& input : def.input()) {
for (const auto& input : def.input())
inputs.push_back(input);
}
inputs.push_back(GO(0));
for (int i = 0; i < def.input_size(); i++) {
for (int i = 0; i < def.input_size(); ++i)
outputs.push_back(GI(i));
}
return SingleDef(def.type() +
"Gradient", "", inputs, outputs);
inputs.push_back(GO(0));
return SingleDef(
def.type() + "Gradient",
"",
inputs,
outputs
);
}
};
......@@ -162,7 +165,8 @@ class InplaceGradientMaker final : public GradientMakerBase {
def.type() + "Gradient", /*! OpType */
"", /*! OpName */
vector<string>({ O(0), GO(0) }), /*! Inputs */
vector<string>({ GI(0) })); /*! Outputs */
vector<string>({ GI(0) }) /*! Outputs */
);
}
};
......
......@@ -13,9 +13,9 @@
#ifndef DRAGON_CORE_TYPEID_H_
#define DRAGON_CORE_TYPEID_H_
#include <map>
#include <cstdlib>
#include <iostream>
#include <map>
namespace dragon {
......@@ -83,7 +83,7 @@ class TypeMeta {
template <typename T>
static void Ctor(void* ptr, size_t n) {
T* typed_ptr = static_cast<T*>(ptr);
for (unsigned int i = 0; i < n; i++)
for (size_t i = 0; i < n; i++)
new(typed_ptr + i) T;
}
......@@ -91,14 +91,14 @@ class TypeMeta {
static void Copy(const void* src, void* dst, size_t n) {
const T* typed_src = static_cast<const T*>(src);
T* typed_dst = static_cast<T*>(dst);
for (unsigned int i = 0; i < n; i++)
for (size_t i = 0; i < n; ++i)
typed_dst[i] = typed_src[i];
}
template <typename T>
static void Dtor(void* ptr, size_t n) {
T* typed_ptr = static_cast<T*>(ptr);
for (unsigned int i = 0; i < n; i++)
for (size_t i = 0; i < n; ++i)
typed_ptr[i].~T();
}
......@@ -107,16 +107,23 @@ class TypeMeta {
template <typename T>
static typename FundMeta Make() {
return TypeMeta(Id<T>(), Itemsize<T>(),
nullptr, nullptr, nullptr);
return TypeMeta(
Id<T>(), Itemsize<T>(),
nullptr, nullptr, nullptr
);
}
template<typename T>
static typename StructMeta Make() {
return TypeMeta(Id<T>(), Itemsize<T>(),
Ctor<T>, Copy<T>, Dtor<T>);
return TypeMeta(
Id<T>(), Itemsize<T>(),
Ctor<T>, Copy<T>, Dtor<T>
);
}
#undef FundMeta
#undef StructMeta
private:
TypeMeta(
TypeId id,
......
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_ARRAY_MASKED_SELECT_OP_H_
#define DRAGON_OPERATORS_ARRAY_MASKED_SELECT_OP_H_
#include "core/operator.h"
namespace dragon {
template <class Context>
class MaskedSelectOp final : public Operator<Context> {
public:
MaskedSelectOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T> void RunImpl();
};
template <class Context>
class MaskedSelectGradientOp final
: public Operator<Context> {
public:
MaskedSelectGradientOp(
const OperatorDef& def,
Workspace* ws)
: Operator<Context>(def, ws) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T> void RunImpl();
};
} // namespace dragon
#endif // DRAGON_OPERATORS_ARRAY_MASKED_SELECT_OP_H_
\ No newline at end of file
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_ARRAY_NON_ZERO_OP_H_
#define DRAGON_OPERATORS_ARRAY_NON_ZERO_OP_H_
#include "core/operator.h"
namespace dragon {
template <class Context>
class NonZeroOp final : public Operator<Context> {
public:
NonZeroOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T> void RunImpl();
protected:
Tensor X_dims_;
};
} // namespace dragon
#endif // DRAGON_OPERATORS_ARRAY_NON_ZERO_OP_H_
\ No newline at end of file
......@@ -18,7 +18,7 @@
namespace dragon {
template <class Context>
class OneHotOp final : public Operator < Context > {
class OneHotOp final : public Operator<Context> {
public:
OneHotOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
......
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_ARRAY_WHERE_OP_H_
#define DRAGON_OPERATORS_ARRAY_WHERE_OP_H_
#include "core/operator.h"
namespace dragon {
template <class Context>
class WhereOp final : public Operator<Context> {
public:
WhereOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T> void RunImpl();
};
template <class Context>
class WhereGradientOp final : public Operator<Context> {
public:
WhereGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T> void RunImpl();
};
} // namespace dragon
#endif // DRAGON_OPERATORS_ARRAY_WHERE_OP_H_
\ No newline at end of file
......@@ -27,7 +27,9 @@ class CompareOp final : public Operator<Context> {
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T> void RunImpl();
template <typename T> void EqualRunImpl();
template <typename T> void NotEqualRunImpl();
template <typename T> void LessRunImpl();
template <typename T> void LessEqualRunImpl();
template <typename T> void GreaterRunImpl();
......
......@@ -4,6 +4,8 @@
#ifdef WITH_CUDA
#include <cub/block/block_reduce.cuh>
#include <cub/device/device_select.cuh>
#include <cub/iterator/counting_input_iterator.cuh>
#include "utils/cuda_device.h"
......
......@@ -17,6 +17,8 @@
namespace dragon {
class Tensor;
namespace kernel {
/*! activation.dropout */
......@@ -258,37 +260,37 @@ void ClipGrad(
template <typename T, class Context>
void Maximum(
const int count,
const T* x1,
const T* x2,
const T* a,
const T* b,
T* y,
Context* ctx);
template <typename T, class Context>
void BroadcastMaximum(
const int count,
const T* x1,
const T x2,
const T* a,
const T b,
T* y,
Context* ctx);
template <typename T, class Context>
void MaximumGrad(
const int count,
const T* x1,
const T* x2,
const T* a,
const T* b,
const T* dy,
T* dx1,
T* dx2,
T* da,
T* db,
Context* ctx);
template <typename T, class Context>
void BroadcastMaximumGrad(
const int count,
const T* x1,
const T x2,
const T* a,
const T b,
const T* dy,
T* dx1,
T* dx2,
T* da,
T* db,
Context* ctx);
/*! arithmetic.minimum */
......@@ -296,37 +298,37 @@ void BroadcastMaximumGrad(
template <typename T, class Context>
void Minimum(
const int count,
const T* x1,
const T* x2,
const T* a,
const T* b,
T* y,
Context* ctx);
template <typename T, class Context>
void BroadcastMinimum(
const int count,
const T* x1,
const T x2,
const T* a,
const T b,
T* y,
Context* ctx);
template <typename T, class Context>
void MinimumGrad(
const int count,
const T* x1,
const T* x2,
const T* a,
const T* b,
const T* dy,
T* dx1,
T* dx2,
T* da,
T* db,
Context* ctx);
template <typename T, class Context>
void BroadcastMinimumGrad(
const int count,
const T* x1,
const T x2,
const T* a,
const T b,
const T* dy,
T* dx1,
T* dx2,
T* da,
T* db,
Context* ctx);
/*! arithmetic.moments */
......@@ -437,6 +439,38 @@ void IndexSelectGrad(
T* dx,
Context* ctx);
/*! array.masked_select */
template <typename T, class Context>
void MaskedSelect(
const int count,
const uint8_t* mask,
const T* x,
Tensor* indices,
Tensor* scratch,
Tensor* y,
Context* ctx);
template <typename T, class Context>
void MaskedSelectGrad(
const int count,
const int num_indices,
const int64_t* indices,
const T* dy,
T* dx,
Context* ctx);
/*! array.non_zero */
template <class Context>
void UnravelIndex(
const int count,
const int ndims,
const int* dims,
const int64_t* x,
int64_t* y,
Context* ctx);
/*! array.pad */
template <typename T, class Context>
......@@ -602,6 +636,26 @@ void TransposeGrad(
T* dx,
Context* ctx);
/*! array.where */
template <typename T, class Context>
void Where(
const int count,
const uint8_t* mask,
const T* a,
const T* b,
T* y,
Context* ctx);
template <typename T, class Context>
void WhereGrad(
const int count,
const uint8_t* mask,
const T* dy,
T* da,
T* db,
Context* ctx);
/*! control_flow.assign */
template <typename T, class Context>
......@@ -618,6 +672,13 @@ void Assign(
/*! control_flow.compare */
template <typename T, class Context>
void NotZero(
const int count,
const T* x,
bool* y,
Context* ctx);
template <typename T, class Context>
void Equal(
const int count,
const T* a,
......@@ -626,7 +687,7 @@ void Equal(
Context* ctx);
template <typename T, class Context>
void Less(
void NotEqual(
const int count,
const T* a,
const T* b,
......@@ -634,7 +695,7 @@ void Less(
Context* ctx);
template <typename T, class Context>
void LessEqual(
void Less(
const int count,
const T* a,
const T* b,
......@@ -642,7 +703,7 @@ void LessEqual(
Context* ctx);
template <typename T, class Context>
void Greater(
void LessEqual(
const int count,
const T* a,
const T* b,
......@@ -650,21 +711,19 @@ void Greater(
Context* ctx);
template <typename T, class Context>
void GreaterEqual(
void Greater(
const int count,
const T* a,
const T* b,
bool* y,
Context* ctx);
/*! control_flow.masked_assign */
template <typename T, class Context>
void MaskedAssign(
void GreaterEqual(
const int count,
const uint8_t* mask,
const T* x,
T* y,
const T* a,
const T* b,
bool* y,
Context* ctx);
/*! loss.l1_loss */
......
......@@ -69,26 +69,29 @@ class NumpyFetcher : public TensorFetcherBase {
pybind11::object Fetch(const Tensor& tensor) override {
CHECK_GT(tensor.count(), 0);
vector<npy_intp> npy_dims;
for (const auto dim : tensor.dims()) npy_dims.push_back(dim);
for (auto dim : tensor.dims()) npy_dims.push_back(dim);
int npy_type = TypeMetaToNPY(tensor.meta());
if (npy_type == -1) {
LOG(FATAL) << "The data type of Tensor(" +
tensor.name() + ") is unknown. Have you solved it ?";
}
CHECK(npy_type != -1)
<< "\nThe data type of Tensor(" << tensor.name()
<< ") is unknown. Have you solved it?";
CHECK(tensor.memory()) << "\nIllegal memory access.";
// Create an empty array with the same shape
PyObject* array = PyArray_SimpleNew(
auto* array = PyArray_SimpleNew(
tensor.ndim(), npy_dims.data(), npy_type);
// Copy the tensor data to the numpy array
if (tensor.memory_state() == MixedMemory::STATE_AT_CUDA) {
CUDAContext::MemcpyEx<CPUContext, CUDAContext>(tensor.nbytes(),
PyArray_DATA(reinterpret_cast<PyArrayObject*>(array)),
tensor.raw_data<CUDAContext>(),
tensor.memory()->device_id());
CUDAContext::MemcpyEx<CPUContext, CUDAContext>(
tensor.nbytes(),
PyArray_DATA(reinterpret_cast<PyArrayObject*>(array)),
tensor.raw_data<CUDAContext>(),
tensor.memory()->device_id()
);
} else {
CPUContext::Memcpy<CPUContext, CPUContext>(tensor.nbytes(),
PyArray_DATA(reinterpret_cast<PyArrayObject*>(array)),
tensor.raw_data<CPUContext>());
CPUContext::Memcpy<CPUContext, CPUContext>(
tensor.nbytes(),
PyArray_DATA(reinterpret_cast<PyArrayObject*>(array)),
tensor.raw_data<CPUContext>()
);
}
return pybind11::reinterpret_steal<pybind11::object>(array);
}
......@@ -108,31 +111,32 @@ class NumpyFeeder : public TensorFeederBase {
const DeviceOption& option,
PyArrayObject* original_array,
Tensor* tensor) override {
PyArrayObject* array = PyArray_GETCONTIGUOUS(original_array);
const TypeMeta& meta = TypeNPYToMeta(PyArray_TYPE(array));
auto* array = PyArray_GETCONTIGUOUS(original_array);
const auto& meta = TypeNPYToMeta(PyArray_TYPE(array));
if (meta.id() == 0) LOG(FATAL) << "Unsupported DType.";
tensor->SetMeta(meta);
int ndim = PyArray_NDIM(array);
npy_intp* npy_dims = PyArray_DIMS(array);
vector<int64_t> dims;
for (int i = 0; i < ndim; i++) dims.push_back(npy_dims[i]);
vec64_t dims(ndim);
auto* npy_dims = PyArray_DIMS(array);
for (int i = 0; i < ndim; i++) dims[i] = npy_dims[i];
tensor->Reshape(dims);
if (option.device_type() == PROTO_CUDA) {
#ifdef WITH_CUDA
CUDAContext::MemcpyEx<CUDAContext, CPUContext>(
tensor->nbytes(),
tensor->raw_mutable_data<CUDAContext>(),
static_cast<void*>(PyArray_DATA(array)),
option.device_id());
tensor->nbytes(),
tensor->raw_mutable_data<CUDAContext>(),
static_cast<void*>(PyArray_DATA(array)),
option.device_id()
);
#else
LOG(FATAL) << "CUDA was not compiled.";
#endif
} else {
auto* data = tensor->raw_mutable_data<CPUContext>();
CPUContext::Memcpy<CPUContext, CPUContext>(
tensor->nbytes(),
tensor->nbytes(),
tensor->raw_mutable_data<CPUContext>(),
static_cast<void*>(PyArray_DATA(array)));
static_cast<void*>(PyArray_DATA(array))
);
}
Py_XDECREF(array);
}
......
......@@ -142,40 +142,44 @@ PYBIND11_MODULE(libdragon, m) {
DeviceOption dst_ctx, src_ctx;
dst_ctx.ParseFromString(dev1);
src_ctx.ParseFromString(dev2);
Tensor* srcT = self->GetTensor(other);
Tensor* dstT = self->CreateTensor(name);
dstT->ReshapeLike(*srcT);
const TypeMeta& meta = srcT->meta();
auto* src = self->GetTensor(other);
auto* dst = self->CreateTensor(name);
const auto& meta = src->meta();
dst->ReshapeLike(*src);
if (dst_ctx.device_type() == PROTO_CUDA) {
if (src_ctx.device_type() == PROTO_CUDA) {
// CUDA <- CUDA
CUDAContext::MemcpyEx<CUDAContext, CUDAContext>(
srcT->nbytes(),
dstT->raw_mutable_data<CUDAContext>(meta),
srcT->raw_data<CUDAContext>(),
src_ctx.device_id());
src->nbytes(),
dst->raw_mutable_data<CUDAContext>(meta),
src->raw_data<CUDAContext>(),
src_ctx.device_id()
);
} else {
// CUDA <- CPU
CUDAContext::MemcpyEx<CUDAContext, CPUContext>(
srcT->nbytes(),
dstT->raw_mutable_data<CUDAContext>(meta),
srcT->raw_data<CPUContext>(),
dst_ctx.device_id());
src->nbytes(),
dst->raw_mutable_data<CUDAContext>(meta),
src->raw_data<CPUContext>(),
dst_ctx.device_id()
);
}
} else {
if (src_ctx.device_type() == PROTO_CUDA) {
// CPU <- CUDA
CUDAContext::MemcpyEx<CPUContext, CUDAContext>(
srcT->nbytes(),
dstT->raw_mutable_data<CPUContext>(meta),
srcT->raw_data<CUDAContext>(),
src_ctx.device_id());
src->nbytes(),
dst->raw_mutable_data<CPUContext>(meta),
src->raw_data<CUDAContext>(),
src_ctx.device_id()
);
} else {
// CPU <- CPU
CPUContext::Memcpy<CUDAContext, CUDAContext>(
srcT->nbytes(),
dstT->raw_mutable_data<CPUContext>(meta),
srcT->raw_data<CPUContext>());
src->nbytes(),
dst->raw_mutable_data<CPUContext>(meta),
src->raw_data<CPUContext>()
);
}
}
})
......@@ -188,7 +192,7 @@ PYBIND11_MODULE(libdragon, m) {
Tensor* tensor = self->GetTensor(name);
CHECK_GT(tensor->count(), 0);
vector<npy_intp> dims;
for (const auto dim : tensor->dims()) dims.push_back(dim);
for (auto dim : tensor->dims()) dims.push_back(dim);
int npy_type = TypeMetaToNPY(tensor->meta());
if (npy_type == -1) {
LOG(FATAL) << "Tensor(" + tensor->name() + ") "
......
......@@ -35,7 +35,7 @@ class OperatorHelper(object):
'Relu', 'PRelu', 'Elu', 'SElu', 'Sigmoid', 'Tanh', 'Softmax',
'Dropout', 'DropPath', 'DropBlock2d',
'Add', 'Sub', 'Mul', 'Div', 'Clip', 'Log', 'Exp', 'Pow', 'Square', 'Sqrt',
'Accumulate', 'Affine', 'Copy', 'Compare', 'StopGradient', 'MPIBroadcast',
'Accumulate', 'Affine', 'Copy', 'StopGradient', 'MPIBroadcast',
'BatchNorm', 'GroupNorm', 'L2Norm', 'LRN', 'BiasAdd',
)
......@@ -107,7 +107,13 @@ class OperatorHelper(object):
len(outputs[0].shape) < len(inputs[1].shape):
outputs[0].shape = inputs[1].shape
except:
pass
try:
outputs[0].shape = inputs[1].shape[:]
if outputs[0].shape != inputs[0].shape and \
len(outputs[0].shape) < len(inputs[0].shape):
outputs[0].shape = inputs[0].shape
except:
pass
return outputs
@classmethod
......@@ -391,19 +397,31 @@ class OperatorHelper(object):
###############################################
@classmethod
def _apply_Where(cls, arguments, inputs, outputs):
return cls._apply_Maximum(arguments, inputs, outputs)
@classmethod
def _apply_IndexSelect(cls, arguments, inputs, outputs):
outputs[0].dtype = inputs[0].dtype
axis = arguments['axis']
try:
try: index_shape = inputs[1].shape[:]
except: index_shape = [None]
outputs[0].shape = \
inputs[0].shape[:axis] + \
inputs[1].shape[:] + \
inputs[0].shape[axis + 1:]
index_shape[:] + \
inputs[0].shape[axis + 1:]
except:
pass
return outputs
@classmethod
def _apply_MaskedSelect(cls, arguments, inputs, outputs):
outputs[0].dtype = inputs[0].dtype
outputs[0].shape = [None]
return outputs
@classmethod
def _apply_RandomPick(cls, arguments, inputs, outputs):
outputs[0].dtype = inputs[0].dtype
outputs[1].dtype = 'int32'
......@@ -779,6 +797,25 @@ class OperatorHelper(object):
pass
return outputs
@classmethod
def _apply_NonZero(cls, arguments, inputs, outputs):
outputs[0].dtype = 'int64'
try:
outputs[0].shape = [None, len(inputs[0].shape)]
except:
pass
return outputs
###############################################
# #
# Control Flow #
# #
###############################################
@classmethod
def _apply_Compare(cls, arguments, inputs, outputs):
return cls._apply_Maximum(arguments, inputs, outputs)
###############################################
# #
# Vision #
......
......@@ -462,7 +462,7 @@ class Tensor(object):
Parameters
----------
item : int or slice
item : int, slice or Tensor
The indices.
Returns
......@@ -471,17 +471,22 @@ class Tensor(object):
The output tensor.
"""
starts, sizes = self._process_indices(item)
output = self.CreateOperator('Crop', self, starts=starts, sizes=sizes)
if self.shape is not None:
output_shape, squeeze_shape = self.shape[:], []
for ix in range(len(sizes)):
output_shape[ix] = sizes[ix]
for dim in output_shape:
if dim != -1: squeeze_shape.append(dim)
if len(squeeze_shape) == 0: output.shape = []
else: output.shape = squeeze_shape[:]
return output
if isinstance(item, Tensor):
return self.CreateOperator(
'MaskedSelect', [self, item],
)
else:
starts, sizes = self._process_indices(item)
output = self.CreateOperator('Crop', self, starts=starts, sizes=sizes)
if self.shape is not None:
output_shape, squeeze_shape = self.shape[:], []
for ix in range(len(sizes)):
output_shape[ix] = sizes[ix]
for dim in output_shape:
if dim != -1: squeeze_shape.append(dim)
if len(squeeze_shape) == 0: output.shape = []
else: output.shape = squeeze_shape[:]
return output
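With the Tensor branch above, boolean-mask indexing on a symbolic Tensor lowers to *MaskedSelect* instead of *Crop*. A minimal sketch, assuming `x` is an existing dragon Tensor and `mask` a boolean/uint8 Tensor of the same shape (e.g. produced by one of the comparison operators):
y = x[mask]  # dispatches to CreateOperator('MaskedSelect', [x, mask])
# The number of selected elements is data-dependent, so the shape helper records y's shape as [None]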
def __setitem__(self, key, value):
"""Set the value at the specific indices.
......@@ -774,11 +779,11 @@ class Tensor(object):
###############################################
def set_value(self, new_value, **kwargs):
"""Feed the values to C++ backend. [**Theano Style**]
"""Feed values to the backend.
Parameters
----------
new_value : number, list or numpy.ndarray
new_value : array_like
The values to set.
Returns
......@@ -795,12 +800,12 @@ class Tensor(object):
return self
def get_value(self):
"""Fetch the values from C++ backend. [**Theano Style**]
"""Copy values from the backend.
Returns
-------
numpy.ndarray or number
The values of this tensor in the backend.
numpy.ndarray
The copied values.
See Also
--------
......@@ -827,7 +832,7 @@ class Tensor(object):
return self.CreateOperator('Copy', **arguments)
def reshape(self, shape, **kwargs):
"""Reshape the dimensions of input. [**Theano Style**]
"""Reshape the dimensions of input.
Parameters
----------
......@@ -841,8 +846,7 @@ class Tensor(object):
"""
if not isinstance(shape, (tuple, list)): shape = [shape]
return Tensor.CreateOperator(
'Reshape', inputs=self, shape=shape, **kwargs)
return self.CreateOperator('Reshape', inputs=self, shape=shape)
def dimshuffle(self, *args, **kwargs):
"""Shuffle the dimensions. [**Theano Style**]
......
......@@ -425,7 +425,7 @@ def FeedTensor(
----------
tensor : Tensor or str
The tensor to feed.
array : number, list, tuple, or numpy.ndarray
array : array_like
The values to feed.
force_cpu : boolean, optional, default=False
Whether to force feeding to the cpu context.
......
......@@ -16,6 +16,30 @@ from __future__ import print_function
from . import *
@OpSchema.Inputs(1, 3)
def Where(inputs, **kwargs):
"""Select elements from either ``x`` or ``y``, depending on ``condition``.
Return the indices of the *True* elements if only the ``condition`` is given.
**Type Constraints**: (*bool*, *int8*, *uint8*, *int32*, *int64*, *float16*, *float32*, *float64*)
Parameters
----------
inputs : sequence of Tensor
The ``x``, ``y``, and ``condition``.
Returns
-------
Tensor
The output tensor.
"""
if isinstance(inputs, Tensor) or len(inputs) == 1:
return NonZero(inputs, **kwargs)
return Tensor.CreateOperator('Where', **ParseArgs(locals()))
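A quick sketch of the two call forms, assuming `x`, `y` and `cond` are dragon Tensors built elsewhere:
z = Where([x, y, cond])  # elementwise select, inputs in the documented (x, y, condition) order
idx = Where(cond)        # single input: falls back to NonZero and returns the indices of True elements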
@OpSchema.Inputs(1)
def IndexSelect(inputs, indices, axis=0, **kwargs):
"""Select the elements according to the indices along the given axis.
......@@ -26,7 +50,7 @@ def IndexSelect(inputs, indices, axis=0, **kwargs):
----------
inputs : Tensor
The input tensor.
indices : Tensor
indices : sequence or Tensor
The indices to select elements.
axis : int, optional
The axis of indices.
......@@ -46,6 +70,31 @@ def IndexSelect(inputs, indices, axis=0, **kwargs):
@OpSchema.Inputs(1)
def MaskedSelect(inputs, mask, **kwargs):
"""Select the the elements where ``mask`` is *1*.
**Type Constraints**: (*bool*, *int8*, *uint8*, *int32*, *int64*, *float16*, *float32*, *float64*)
Parameters
----------
inputs : Tensor
The input tensor.
mask : Tensor
The mask, with the same size as ``inputs``.
Returns
-------
Tensor
The output tensor.
"""
arguments = ParseArgs(locals())
arguments['mask'] = None
arguments['inputs'] = [arguments['inputs'], mask]
return Tensor.CreateOperator('MaskedSelect', **arguments)
@OpSchema.Inputs(1)
@ArgumentHelper.RepeatedDesc('starts')
@ArgumentHelper.RepeatedDesc('sizes')
def Crop(
......@@ -748,6 +797,26 @@ def Arange(start, stop=None, step=1, dtype='float32', **kwargs):
@OpSchema.Inputs(1)
def NonZero(inputs, **kwargs):
"""Return the indices of non-zero elements.
**Type Constraints**: (*bool*, *int8*, *uint8*, *int32*, *int64*, *float16*, *float32*, *float64*)
Parameters
----------
inputs : Tensor
The input tensor.
Returns
-------
Tensor
An *int64* tensor containing the indices.
"""
return Tensor.CreateOperator('NonZero', **ParseArgs(locals()))
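The shape helper registers the result as `[None, ndim]`: one row per non-zero element, one column per input dimension, the same layout as `numpy.argwhere`. A numpy analogy (not the dragon API) of the expected contents:
import numpy as np
a = np.array([[0, 1], [2, 0]])
print(np.argwhere(a))  # [[0 1]
                       #  [1 0]]  -> shape (num_nonzero, ndim)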
@OpSchema.Inputs(1)
def Multinomial(inputs, num_samples=1, eps=0., normalize=False, **kwargs):
"""Return a tensor where each row contains ``num_samples``,
sampled from the multinomial distribution.
......
......@@ -125,6 +125,32 @@ def Equal(inputs, to_uint8=False, **kwargs):
return Tensor.CreateOperator('Compare', operation='EQ', **arguments)
@OpSchema.Inputs(2)
def NotEqual(inputs, to_uint8=False, **kwargs):
"""*NotEqual* comparing between A and B.
Set ``to_uint8`` if you expect *uint8* results instead of *bool*.
**Type Constraints**: (*bool*, *int8*, *uint8*, *int32*, *int64*, *float16*, *float32*, *float64*)
Parameters
----------
inputs : sequence of Tensor
The inputs, represent A and B respectively.
to_uint8 : bool
*True* to convert to *uint8* results.
Returns
-------
Tensor
The comparing results.
"""
arguments = ParseArgs(locals())
return Tensor.CreateOperator('Compare', operation='NE', **arguments)
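A minimal sketch, assuming `a` and `b` are dragon Tensors of the same shape built elsewhere:
m = NotEqual([a, b])                  # bool results
m8 = NotEqual([a, b], to_uint8=True)  # uint8 results, convenient as a select/assign mask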
@OpSchema.ConvertConstantInputs()
@OpSchema.Inputs(2)
def Less(inputs, to_uint8=False, **kwargs):
......
......@@ -114,7 +114,7 @@ LayerNorm = _norm_ops.LayerNorm
InstanceNorm = _norm_ops.InstanceNorm
L2Norm = _norm_ops.L2Norm
# NDArray
# Array
Crop = _array_ops.Crop
Reduce = _array_ops.Reduce
Sum = _array_ops.Sum
......@@ -130,7 +130,9 @@ Transpose = _array_ops.Transpose
Repeat = _array_ops.Repeat
Tile = _array_ops.Tile
Pad = _array_ops.Pad
Where = _array_ops.Where
IndexSelect = _array_ops.IndexSelect
MaskedSelect = _array_ops.MaskedSelect
OneHot = _array_ops.OneHot
Flatten = _array_ops.Flatten
Reshape = _array_ops.Reshape
......@@ -139,15 +141,17 @@ Squeeze = _array_ops.Squeeze
Shape = _array_ops.Shape
Arange = _array_ops.Arange
Multinomial = _array_ops.Multinomial
NonZero = _array_ops.NonZero
# Control Flow
Copy = _control_flow_ops.Copy
Assign = _control_flow_ops.Assign
MaskedAssign = _control_flow_ops.MaskedAssign
Equal = _control_flow_ops.Equal
Less = _control_flow_ops.Less
LessEqual = _control_flow_ops.LessEqual
Equal = _control_flow_ops.Equal
Greater = _control_flow_ops.Greater
LessEqual = _control_flow_ops.LessEqual
NotEqual = _control_flow_ops.NotEqual
GreaterEqual = _control_flow_ops.GreaterEqual
# Misc
......
......@@ -18,7 +18,7 @@ import dragon.vm.torch.ops.tensor
# Import Core Methods
from dragon.vm.torch.tensor import *
from dragon.vm.torch.c_api import Size, from_numpy
from dragon.vm.torch.c_api import Size, from_numpy, device
from dragon.vm.torch.serialization import save, load
# Import Subpackages
......
......@@ -67,10 +67,11 @@ def from_numpy(data):
"""
if not isinstance(data, numpy.ndarray):
raise TypeError('The data should be a numpy.ndarray.')
if str(data.dtype) not in _mapping.TENSOR_TYPE_TO_TORCH_TENSOR:
raise ValueError('Unsupported type({}) to torch tensor.'.format(data.dtype))
tensor_types = _mapping.TENSOR_TYPE_TO_TORCH_TENSOR
if str(data.dtype) not in tensor_types:
raise ValueError('Unsupported data type: ' + str(data.dtype))
module = importlib.import_module('dragon.vm.torch.tensor')
return getattr(module, _mapping.TENSOR_TYPE_TO_TORCH_TENSOR[str(data.dtype)])(data)
return getattr(module, tensor_types[str(data.dtype)])(data, copy=False)
def from_dragon(tensor, own_storage=False):
......
......@@ -23,8 +23,8 @@ from dragon.vm.torch.ops.modules.control_flow import (
)
from dragon.vm.torch.ops.modules.arithmetic import (
Fundamental, Log, Exp, Sqrt,
Accumulate,
Fundamental, Accumulate,
Log, Exp, Sqrt,
MM, FullyConnected,
Maximum, Minimum, Clamp,
)
......@@ -35,9 +35,11 @@ from dragon.vm.torch.ops.modules.init import (
from dragon.vm.torch.ops.modules.array import (
Reshape, Squeeze, UnSqueeze, Permute,
Indexing, IndexSelect,
Repeat, Concat, Stack,
Reduce, ArgReduce, OneHot, Multinomial,
Indexing, Repeat, Concat, Stack,
IndexSelect, MaskedSelect,
Reduce, ArgReduce,
NonZero, Where,
OneHot, Multinomial,
)
from dragon.vm.torch.ops.modules.update import (
......@@ -50,17 +52,16 @@ from dragon.vm.torch.ops.modules.vision import (
__all__ = [
'accumulate',
'add', 'sub', 'mul', 'div',
'add', 'sub', 'mul', 'div', 'accumulate',
'maximum', 'minimum', 'clamp',
'log', 'exp', 'sqrt',
'mm', 'xw_plus_b',
'squeeze', 'unsqueeze',
'mean', 'sum', 'min', 'max', 'topk',
'argmin', 'argmax',
'gt', 'lt', 'eq', 'ge', 'le',
'nonzero', 'where', 'argmin', 'argmax',
'gt', 'lt', 'eq', 'ne', 'ge', 'le',
'cat', 'stack', 'narrow',
'index_select',
'index_select', 'masked_select',
'one_hot', 'multinomial',
'rand', 'randn',
'ones', 'ones_like',
......@@ -525,6 +526,30 @@ def _assign(output, starts, sizes, input):
return module.forward(input, output, starts, sizes)
def where(condition, x, y):
"""Select elements from either ``x`` or ``y``, depending on ``condition``.
Parameters
----------
condition : dragon.vm.torch.Tensor
The byte condition tensor.
x : dragon.vm.torch.Tensor
The elements taken where ``condition`` is *1*.
y : dragon.vm.torch.Tensor
The elements taken where ``condition`` is *0*.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
"""
dev = MakeDevice(inputs=[condition, x, y])
key = 'Where/{}'.format(dev)
module = get_module(Where, key, dev)
return module.forward(condition, x, y)
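A usage sketch, assuming `cond`, `x` and `y` are existing dragon.vm.torch tensors of the same shape:
z = where(cond, x, y)  # z[i] = x[i] if cond[i] else y[i]
z = x.where(cond, y)   # method form, bound onto Tensor later in this commit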
def _masked_assign(output, mask, input):
if not isinstance(input, Tensor):
if isinstance(input, (tuple, list)):
......@@ -569,7 +594,7 @@ def squeeze(input, dim=None, out=None):
def unsqueeze(input, dim, out=None):
"""Returns a tensor with a dimension of size 1 inserted at the specified position.
"""Return a tensor with a dimension of size 1 inserted at the specified position.
Parameters
----------
......@@ -866,6 +891,27 @@ def eq(input, other, out=None):
return _compare(input, other, 'EQ', out)
def ne(input, other, out=None):
"""Compute *input* != *other* element-wise.
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
other : dragon.vm.torch.Tensor, number
The other tensor.
out : dragon.vm.torch.Tensor, optional
The optional output tensor.
Returns
-------
dragon.vm.torch.Tensor
The output byte tensor.
"""
return _compare(input, other, 'NE', out)
def cat(seq, dim=0, out=None):
"""Concatenate the inputs along the given axis.
......@@ -908,7 +954,7 @@ def stack(seq, dim=0, out=None):
The output tensor.
"""
dev = MakeDevice(inputs=seq, outputs=[out] if out else [])
dev = MakeDevice(seq, [out] if out else [])
key = 'Stack/{}/dim:{}'.format(dev, dim)
module = get_module(Stack, key, dev, axis=dim)
return module.forward(seq, out)
......@@ -940,6 +986,30 @@ def index_select(input, dim, index, out=None):
return module.forward(input, index, out)
def masked_select(input, mask, out=None):
"""Select the input values where mask is *1*.
Parameters
----------
input : dragon.vm.torch.Tensor
The values.
mask : dragon.vm.torch.Tensor
The mask to select values.
out : dragon.vm.torch.Tensor, optional
The optional output tensor.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
"""
dev = MakeDevice([input, mask], [out] if out else [])
key = 'MaskedSelect/{}'.format(dev)
module = get_module(MaskedSelect, key, dev)
return module.forward(input, mask, out)
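A sketch tying the unified comparisons to selection, assuming `x` and `other` are existing dragon.vm.torch tensors of the same shape:
mask = x.ne(other)             # byte mask from the new `ne` comparison
vals = masked_select(x, mask)  # flat tensor holding the selected values
vals = x.masked_select(mask)   # equivalent method form, bound later in this commit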
def narrow(input, dimension, start, length):
"""Return a new tensor that is a narrowed version of input tensor.
......@@ -965,6 +1035,28 @@ def narrow(input, dimension, start, length):
return _index(input, starts, sizes)
def nonzero(input, out=None):
"""Return the indices of non-zero elements.
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
out : dragon.vm.torch.Tensor, optional
The optional output tensor.
Returns
-------
dragon.vm.torch.Tensor
The *int64* tensor of indices.
"""
dev = MakeDevice(inputs=[input])
key = 'NonZero/{}'.format(dev)
module = get_module(NonZero, key, dev)
return module.forward(input, out)
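A short sketch, assuming `x` is an existing dragon.vm.torch tensor:
idx = nonzero(x.gt(0))  # int64 indices of the elements where x > 0
idx = x.nonzero()       # method form, bound later in this commit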
def one_hot(input, depth):
"""Return a ont hot tensor according to given input.
......
......@@ -134,6 +134,23 @@ class IndexSelect(BaseModule):
return self.run(inputs, outputs)
class MaskedSelect(BaseModule):
def __init__(self, key, dev, **kwargs):
super(MaskedSelect, self).__init__(key, dev, **kwargs)
self.register_op()
def register_op(self):
self.op_meta = {
'op_type': 'MaskedSelect',
'arguments': {},
}
def forward(self, x, mask, y):
inputs = [x, mask]; self.unify_devices(inputs)
outputs = [y] if y else [self.register_output()]
return self.run(inputs, outputs)
class Reduce(BaseModule):
def __init__(self, key, dev, **kwargs):
super(Reduce, self).__init__(key, dev, **kwargs)
......@@ -326,6 +343,36 @@ class Repeat(BaseModule):
return self.run(inputs, outputs, callback=callback)
class NonZero(BaseModule):
def __init__(self, key, dev, **kwargs):
super(NonZero, self).__init__(key, dev, **kwargs)
self.register_op()
def register_op(self):
self.op_meta = {
'op_type': 'NonZero',
'arguments': {},
}
def forward(self, x, y):
inputs = [x]; self.unify_devices(inputs)
outputs = [y] if y else [self.register_output()]
with no_grad(): return self.run(inputs, outputs)
class Where(BaseModule):
def __init__(self, key, dev, **kwargs):
super(Where, self).__init__(key, dev, **kwargs)
self.register_op()
def register_op(self):
self.op_meta = {'op_type': 'Where', 'arguments': {}}
def forward(self, condition, x, y):
self.unify_devices([condition, x, y])
return self.run([x, y, condition], [self.register_output()])
class OneHot(BaseModule):
def __init__(self, key, dev, **kwargs):
super(OneHot, self).__init__(key, dev, **kwargs)
......
......@@ -23,11 +23,12 @@ from dragon.vm.torch.ops.builtin import (
_fundamental, _rfundamental,
log, exp, sqrt, clamp,
_reshape, squeeze, unsqueeze,
_permute, _repeat, narrow,
_index, index_select,
_permute, _repeat, narrow, _index,
_assign, _masked_assign,
index_select, masked_select,
mean, sum, max, min,
gt, lt, eq, ge, le,
gt, lt, eq, ne, ge, le,
where, nonzero,
)
......@@ -84,10 +85,14 @@ Tensor.ge = lambda *args, **kwargs: ge(*args, **kwargs)
Tensor.lt = lambda *args, **kwargs: lt(*args, **kwargs)
Tensor.le = lambda *args, **kwargs: le(*args, **kwargs)
Tensor.eq = lambda *args, **kwargs: eq(*args, **kwargs)
Tensor.index_select = lambda *args, **kwargs: index_select(*args, **kwargs)
Tensor.ne = lambda *args, **kwargs: ne(*args, **kwargs)
Tensor.nonzero = lambda *args, **kwargs: nonzero(*args, **kwargs)
Tensor.where = lambda self, condition, y: where(condition, self, y)
Tensor.narrow = lambda *args, **kwargs: narrow(*args, **kwargs)
Tensor._index = lambda *args, **kwargs: _index(*args, **kwargs)
Tensor._assign = lambda *args, **kwargs: _assign(*args, **kwargs)
Tensor.index_select = lambda *args, **kwargs: index_select(*args, **kwargs)
Tensor.masked_select = lambda *args, **kwargs: masked_select(*args, **kwargs)
Tensor.half = lambda self: _type_to(self, dtype='float16', inplace=False)
......@@ -104,5 +109,5 @@ Tensor.int = lambda self: _type_to(self, dtype='int32', inplace=False)
Tensor.int_ = lambda self: _type_to(self, dtype='int32', inplace=True)
Tensor.long = lambda self: _type_to(self, dtype='int64', inplace=False)
Tensor.long_ = lambda self: _type_to(self, dtype='int64', inplace=True)
Tensor.type = lambda self, dtype=None: _type_to(self, dtype=dtype) \
Tensor.type = lambda self, dtype = None: _type_to(self, dtype=dtype) \
if dtype is not None else 'torch.' + self._type2str()
\ No newline at end of file
#include "utils/op_kernel.h"
#include "utils/eigen_utils.h"
#include "utils/omp_alternative.h"
namespace dragon {
......@@ -10,15 +11,12 @@ namespace kernel {
template <typename T>
void _Maximum(
const int count,
const T* x1,
const T* x2,
const T* a,
const T* b,
T* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
y[i] = std::max(x1[i], x2[i]);
}
EigenVectorArrayMap<T>(y, count) = \
ConstEigenVectorArrayMap<T>(a, count).max(
ConstEigenVectorArrayMap<T>(b, count));
}
/* <T = ?, Device = CPU> */
......@@ -26,15 +24,11 @@ void _Maximum(
template <typename T>
void _BroadcastMaximum(
const int count,
const T* x1,
const T x2,
const T* a,
const T b,
T* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
y[i] = std::max(x1[i], x2);
}
EigenVectorArrayMap<T>(y, count) = \
ConstEigenVectorArrayMap<T>(a, count).max(b);
}
/* <T = ?, Device = CPU> */
......@@ -42,18 +36,19 @@ void _BroadcastMaximum(
template <typename T>
void _MaximumGrad(
const int count,
const T* x1,
const T* x2,
const T* a,
const T* b,
const T* dy,
T* dx1,
T* dx2) {
T* da,
T* db) {
const T kZero = T(0);
#ifdef WITH_OMP
#pragma omp parallel for num_threads(OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
const bool dy_to_dx1 = x1[i] > x2[i];
dx1[i] = dy_to_dx1 ? dy[i] : 0;
dx2[i] = dy_to_dx1 ? 0 : dy[i];
const bool dy_to_da = a[i] > b[i];
da[i] = dy_to_da ? dy[i] : kZero;
db[i] = dy_to_da ? kZero : dy[i];
}
}
......@@ -62,16 +57,17 @@ void _MaximumGrad(
template <typename T>
void _BroadcastMaximumGrad(
const int count,
const T* x1,
const T x2,
const T* a,
const T b,
const T* dy,
T* dx1,
T* dx2) {
T* da,
T* db) {
const T kZero = T(0);
#ifdef WITH_OMP
#pragma omp parallel for num_threads(OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
dx1[i] = (x1[i] > x2) ? dy[i] : 0;
da[i] = a[i] > b ? dy[i] : kZero;
}
}
......@@ -80,23 +76,23 @@ void _BroadcastMaximumGrad(
#define DEFINE_MAXIMUM_KERNEL_LAUNCHER(name, T, T2) \
template <> void name<T, CPUContext>( \
const int count, \
const T* x1, \
const T2 x2, \
const T* a, \
const T2 b, \
T* y, \
CPUContext* ctx) { \
_##name(count, x1, x2, y); \
_##name(count, a, b, y); \
}
#define DEFINE_MAXIMUM_GRAD_KERNEL_LAUNCHER(name, T, T2) \
template <> void name<T, CPUContext>( \
const int count, \
const T* x1, \
const T2 x2, \
const T* a, \
const T2 b, \
const T* dy, \
T* dx1, \
T* dx2, \
T* da, \
T* db, \
CPUContext* ctx) { \
_##name(count, x1, x2, dy, dx1, dx2); \
_##name(count, a, b, dy, da, db); \
}
DEFINE_MAXIMUM_KERNEL_LAUNCHER(Maximum, int8_t, int8_t*);
......@@ -129,8 +125,8 @@ DEFINE_MAXIMUM_GRAD_KERNEL_LAUNCHER(BroadcastMaximumGrad, double, double);
template <> void Maximum<float16, CPUContext>(
const int count,
const float16* x1,
const float16* x2,
const float16* a,
const float16* b,
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
......@@ -138,8 +134,8 @@ template <> void Maximum<float16, CPUContext>(
template <> void BroadcastMaximum<float16, CPUContext>(
const int count,
const float16* x1,
const float16 x2,
const float16* a,
const float16 b,
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
......@@ -147,22 +143,22 @@ template <> void BroadcastMaximum<float16, CPUContext>(
template <> void MaximumGrad<float16, CPUContext>(
const int count,
const float16* x1,
const float16* x2,
const float16* a,
const float16* b,
const float16* dy,
float16* dx1,
float16* dx2,
float16* da,
float16* db,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
template <> void BroadcastMaximumGrad<float16, CPUContext>(
const int count,
const float16* x1,
const float16 x2,
const float16* a,
const float16 b,
const float16* dy,
float16* dx1,
float16* dx2,
float16* da,
float16* db,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
......
......@@ -13,11 +13,11 @@ namespace kernel {
template <typename T>
__global__ void _Maximum(
const int nthreads,
const T* x1,
const T* x2,
const T* a,
const T* b,
T* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = max(x1[i], x2[i]);
y[i] = max(a[i], b[i]);
}
}
......@@ -25,12 +25,12 @@ __global__ void _Maximum(
template<> __global__ void _Maximum<half>(
const int nthreads,
const half* x1,
const half* x2,
const half* a,
const half* b,
half* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530
y[i] = __hgt(x1[i], x2[i]) ? x1[i] : x2[i];
y[i] = __hgt(a[i], b[i]) ? a[i] : b[i];
#endif
}
}
......@@ -40,11 +40,11 @@ template<> __global__ void _Maximum<half>(
template <typename T>
__global__ void _BroadcastMaximum(
const int nthreads,
const T* x1,
const T x2,
const T* a,
const T b,
T* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = max(x1[i], x2);
y[i] = max(a[i], b);
}
}
......@@ -52,12 +52,12 @@ __global__ void _BroadcastMaximum(
template<> __global__ void _BroadcastMaximum<half>(
const int nthreads,
const half* x1,
const half x2,
const half* a,
const half b,
half* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530
y[i] = __hgt(x1[i], x2) ? x1[i] : x2;
y[i] = __hgt(a[i], b) ? a[i] : b;
#endif
}
}
......@@ -67,15 +67,16 @@ template<> __global__ void _BroadcastMaximum<half>(
template <typename T>
__global__ void _MaximumGrad(
const int nthreads,
const T* x1,
const T* x2,
const T* a,
const T* b,
const T* dy,
T* dx1,
T* dx2) {
T* da,
T* db) {
const T kZero = T(0);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
const bool dy_to_dx1 = x1[i] > x2[i];
dx1[i] = dy_to_dx1 ? dy[i] : T(0);
dx2[i] = dy_to_dx1 ? T(0) : dy[i];
const bool dy_to_da = a[i] > b[i];
da[i] = dy_to_da ? dy[i] : kZero;
db[i] = dy_to_da ? kZero : dy[i];
}
}
......@@ -83,18 +84,19 @@ __global__ void _MaximumGrad(
template<> __global__ void _MaximumGrad<half>(
const int nthreads,
const half* x1,
const half* x2,
const half* a,
const half* b,
const half* dy,
half* dx1,
half* dx2) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
half* da,
half* db) {
#if __CUDA_ARCH__ >= 530
const bool dy_to_dx1 = __hgt(x1[i], x2[i]);
dx1[i] = dy_to_dx1 ? dy[i] : __float2half(0.f);
dx2[i] = dy_to_dx1 ? __float2half(0.f) : dy[i];
#endif
const half kZero = __float2half(0.f);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
const bool dy_to_da = __hgt(a[i], b[i]);
da[i] = dy_to_da ? dy[i] : kZero;
db[i] = dy_to_da ? kZero : dy[i];
}
#endif
}
/* <T = ?, Device = CUDA> */
......@@ -102,13 +104,14 @@ template<> __global__ void _MaximumGrad<half>(
template <typename T>
__global__ void _BroadcastMaximumGrad(
const int nthreads,
const T* x1,
const T x2,
const T* a,
const T b,
const T* dy,
T* dx1,
T* dx2) {
T* da,
T* db) {
const T kZero = T(0);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
dx1[i] = (x1[i] > x2) ? dy[i] : T(0);
da[i] = (a[i] > b) ? dy[i] : kZero;
}
}
......@@ -116,17 +119,17 @@ __global__ void _BroadcastMaximumGrad(
template<> __global__ void _BroadcastMaximumGrad<half>(
const int nthreads,
const half* x1,
const half x2,
const half* a,
const half b,
const half* dy,
half* dx1,
half* dx2) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
half* da,
half* db) {
#if __CUDA_ARCH__ >= 530
dx1[i] = __hgt(x1[i], x2) ?
dy[i] : __float2half(0.f);
#endif
const half kZero = __float2half(0.f);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
da[i] = __hgt(a[i], b) ? dy[i] : kZero;
}
#endif
}
/* Kernel Launchers */
......@@ -134,30 +137,30 @@ template<> __global__ void _BroadcastMaximumGrad<half>(
#define DEFINE_MAXIMUM_KERNEL_LAUNCHER(name, T, T2) \
template <> void name<T, CUDAContext>( \
const int count, \
const T* x1, \
const T2 x2, \
const T* a, \
const T2 b, \
T* y, \
CUDAContext* ctx) { \
_##name \
<<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
count, x1, x2, y \
count, a, b, y \
); \
}
#define DEFINE_MAXIMUM_GRAD_KERNEL_LAUNCHER(name, T, T2) \
template <> void name<T, CUDAContext>( \
const int count, \
const T* x1, \
const T2 x2, \
const T* a, \
const T2 b, \
const T* dy, \
T* dx1, \
T* dx2, \
T* da, \
T* db, \
CUDAContext* ctx) { \
_##name \
<<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
count, x1, x2, dy, dx1, dx2 \
count, a, b, dy, da, db \
); \
}
......@@ -191,73 +194,73 @@ DEFINE_MAXIMUM_GRAD_KERNEL_LAUNCHER(BroadcastMaximumGrad, double, double);
template <> void Maximum<float16, CUDAContext>(
const int count,
const float16* x1,
const float16* x2,
const float16* a,
const float16* b,
float16* y,
CUDAContext* ctx) {
_Maximum \
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count,
reinterpret_cast<const half*>(x1),
reinterpret_cast<const half*>(x2),
reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y)
);
}
template <> void BroadcastMaximum<float16, CUDAContext>(
const int count,
const float16* x1,
const float16 x2,
const float16* a,
const float16 b,
float16* y,
CUDAContext* ctx) {
_BroadcastMaximum \
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count,
reinterpret_cast<const half*>(x1),
cast::to<half>(x2),
reinterpret_cast<const half*>(a),
cast::to<half>(b),
reinterpret_cast<half*>(y)
);
}
template <> void MaximumGrad<float16, CUDAContext>(
const int count,
const float16* x1,
const float16* x2,
const float16* a,
const float16* b,
const float16* dy,
float16* dx1,
float16* dx2,
float16* da,
float16* db,
CUDAContext* ctx) {
_MaximumGrad \
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count,
reinterpret_cast<const half*>(x1),
reinterpret_cast<const half*>(x2),
reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(b),
reinterpret_cast<const half*>(dy),
reinterpret_cast<half*>(dx1),
reinterpret_cast<half*>(dx2)
reinterpret_cast<half*>(da),
reinterpret_cast<half*>(db)
);
}
template <> void BroadcastMaximumGrad<float16, CUDAContext>(
const int count,
const float16* x1,
const float16 x2,
const float16* a,
const float16 b,
const float16* dy,
float16* dx1,
float16* dx2,
float16* da,
float16* db,
CUDAContext* ctx) {
_BroadcastMaximumGrad \
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count,
reinterpret_cast<const half*>(x1),
cast::to<half>(x2),
reinterpret_cast<const half*>(a),
cast::to<half>(b),
reinterpret_cast<const half*>(dy),
reinterpret_cast<half*>(dx1),
reinterpret_cast<half*>(dx2)
reinterpret_cast<half*>(da),
reinterpret_cast<half*>(db)
);
}
......
#include "utils/op_kernel.h"
#include "utils/eigen_utils.h"
#include "utils/omp_alternative.h"
namespace dragon {
......@@ -10,15 +11,12 @@ namespace kernel {
template <typename T>
void _Minimum(
const int count,
const T* x1,
const T* x2,
const T* a,
const T* b,
T* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
y[i] = std::min(x1[i], x2[i]);
}
EigenVectorArrayMap<T>(y, count) = \
ConstEigenVectorArrayMap<T>(a, count).min(
ConstEigenVectorArrayMap<T>(b, count));
}
/* <T = ?, Device = CPU> */
......@@ -26,15 +24,11 @@ void _Minimum(
template <typename T>
void _BroadcastMinimum(
const int count,
const T* x1,
const T x2,
const T* a,
const T b,
T* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
y[i] = std::min(x1[i], x2);
}
EigenVectorArrayMap<T>(y, count) = \
ConstEigenVectorArrayMap<T>(a, count).min(b);
}
/* <T = ?, Device = CPU> */
......@@ -42,18 +36,19 @@ void _BroadcastMinimum(
template <typename T>
void _MinimumGrad(
const int count,
const T* x1,
const T* x2,
const T* a,
const T* b,
const T* dy,
T* dx1,
T* dx2) {
T* da,
T* db) {
const T kZero = T(0);
#ifdef WITH_OMP
#pragma omp parallel for num_threads(OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
const bool dy_to_dx1 = x1[i] < x2[i];
dx1[i] = dy_to_dx1 ? dy[i] : 0;
dx2[i] = dy_to_dx1 ? 0 : dy[i];
const bool dy_to_da = a[i] < b[i];
da[i] = dy_to_da ? dy[i] : kZero;
db[i] = dy_to_da ? kZero : dy[i];
}
}
......@@ -62,16 +57,17 @@ void _MinimumGrad(
template <typename T>
void _BroadcastMinimumGrad(
const int count,
const T* x1,
const T x2,
const T* a,
const T b,
const T* dy,
T* dx1,
T* dx2) {
T* da,
T* db) {
const T kZero = T(0);
#ifdef WITH_OMP
#pragma omp parallel for num_threads(OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
dx1[i] = (x1[i] < x2) ? dy[i] : 0;
da[i] = a[i] < b ? dy[i] : kZero;
}
}
......@@ -80,23 +76,23 @@ void _BroadcastMinimumGrad(
#define DEFINE_MINIMUM_KERNEL_LAUNCHER(name, T, T2) \
template <> void name<T, CPUContext>( \
const int count, \
const T* x1, \
const T2 x2, \
const T* a, \
const T2 b, \
T* y, \
CPUContext* ctx) { \
_##name(count, x1, x2, y); \
_##name(count, a, b, y); \
}
#define DEFINE_MINIMUM_GRAD_KERNEL_LAUNCHER(name, T, T2) \
template <> void name<T, CPUContext>( \
const int count, \
const T* x1, \
const T2 x2, \
const T* a, \
const T2 b, \
const T* dy, \
T* dx1, \
T* dx2, \
T* da, \
T* db, \
CPUContext* ctx) { \
_##name(count, x1, x2, dy, dx1, dx2); \
_##name(count, a, b, dy, da, db); \
}
DEFINE_MINIMUM_KERNEL_LAUNCHER(Minimum, int8_t, int8_t*);
......@@ -129,8 +125,8 @@ DEFINE_MINIMUM_GRAD_KERNEL_LAUNCHER(BroadcastMinimumGrad, double, double);
template <> void Minimum<float16, CPUContext>(
const int count,
const float16* x1,
const float16* x2,
const float16* a,
const float16* b,
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
......@@ -138,8 +134,8 @@ template <> void Minimum<float16, CPUContext>(
template <> void BroadcastMinimum<float16, CPUContext>(
const int count,
const float16* x1,
const float16 x2,
const float16* a,
const float16 b,
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
......@@ -147,22 +143,22 @@ template <> void BroadcastMinimum<float16, CPUContext>(
template <> void MinimumGrad<float16, CPUContext>(
const int count,
const float16* x1,
const float16* x2,
const float16* a,
const float16* b,
const float16* dy,
float16* dx1,
float16* dx2,
float16* da,
float16* db,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
template <> void BroadcastMinimumGrad<float16, CPUContext>(
const int count,
const float16* x1,
const float16 x2,
const float16* a,
const float16 b,
const float16* dy,
float16* dx1,
float16* dx2,
float16* da,
float16* db,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
......
......@@ -13,11 +13,11 @@ namespace kernel {
template <typename T>
__global__ void _Minimum(
const int nthreads,
const T* x1,
const T* x2,
const T* a,
const T* b,
T* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = min(x1[i], x2[i]);
y[i] = min(a[i], b[i]);
}
}
......@@ -25,12 +25,12 @@ __global__ void _Minimum(
template<> __global__ void _Minimum<half>(
const int nthreads,
const half* x1,
const half* x2,
const half* a,
const half* b,
half* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530
y[i] = __hlt(x1[i], x2[i]) ? x1[i] : x2[i];
y[i] = __hlt(a[i], b[i]) ? a[i] : b[i];
#endif
}
}
......@@ -40,11 +40,11 @@ template<> __global__ void _Minimum<half>(
template <typename T>
__global__ void _BroadcastMinimum(
const int nthreads,
const T* x1,
const T x2,
const T* a,
const T b,
T* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = min(x1[i], x2);
y[i] = min(a[i], b);
}
}
......@@ -52,12 +52,12 @@ __global__ void _BroadcastMinimum(
template<> __global__ void _BroadcastMinimum<half>(
const int nthreads,
const half* x1,
const half x2,
const half* a,
const half b,
half* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530
y[i] = __hlt(x1[i], x2) ? x1[i] : x2;
y[i] = __hlt(a[i], b) ? a[i] : b;
#endif
}
}
......@@ -67,15 +67,16 @@ template<> __global__ void _BroadcastMinimum<half>(
template <typename T>
__global__ void _MinimumGrad(
const int nthreads,
const T* x1,
const T* x2,
const T* a,
const T* b,
const T* dy,
T* dx1,
T* dx2) {
T* da,
T* db) {
const T kZero = T(0);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
const bool dy_to_dx1 = x1[i] < x2[i];
dx1[i] = dy_to_dx1 ? dy[i] : T(0);
dx2[i] = dy_to_dx1 ? T(0) : dy[i];
const bool dy_to_da = a[i] < b[i];
da[i] = dy_to_da ? dy[i] : kZero;
db[i] = dy_to_da ? kZero : dy[i];
}
}
......@@ -83,18 +84,19 @@ __global__ void _MinimumGrad(
template<> __global__ void _MinimumGrad<half>(
const int nthreads,
const half* x1,
const half* x2,
const half* a,
const half* b,
const half* dy,
half* dx1,
half* dx2) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
half* da,
half* db) {
#if __CUDA_ARCH__ >= 530
const bool dy_to_dx1 = __hlt(x1[i], x2[i]);
dx1[i] = dy_to_dx1 ? dy[i] : __float2half(0.f);
dx2[i] = dy_to_dx1 ? __float2half(0.f) : dy[i];
#endif
const half kZero = __float2half(0.f);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
const bool dy_to_da = __hlt(a[i], b[i]);
da[i] = dy_to_da ? dy[i] : kZero;
db[i] = dy_to_da ? kZero : dy[i];
}
#endif
}
/* <T = ?, Device = CUDA> */
......@@ -102,13 +104,14 @@ template<> __global__ void _MinimumGrad<half>(
template <typename T>
__global__ void _BroadcastMinimumGrad(
const int nthreads,
const T* x1,
const T x2,
const T* a,
const T b,
const T* dy,
T* dx1,
T* dx2) {
T* da,
T* db) {
const T kZero = T(0);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
dx1[i] = (x1[i] < x2) ? dy[i] : T(0);
da[i] = (a[i] < b) ? dy[i] : kZero;
}
}
......@@ -116,17 +119,17 @@ __global__ void _BroadcastMinimumGrad(
template<> __global__ void _BroadcastMinimumGrad<half>(
const int nthreads,
const half* x1,
const half x2,
const half* a,
const half b,
const half* dy,
half* dx1,
half* dx2) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
half* da,
half* db) {
#if __CUDA_ARCH__ >= 530
dx1[i] = (__hlt(x1[i], x2)) ?
dy[i] : __float2half(0.f);
#endif
const half kZero = __float2half(0.f);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
da[i] = __hlt(a[i], b) ? dy[i] : kZero;
}
#endif
}
/* Kernel Launchers */
......@@ -134,30 +137,30 @@ template<> __global__ void _BroadcastMinimumGrad<half>(
#define DEFINE_MINIMUM_KERNEL_LAUNCHER(name, T, T2) \
template <> void name<T, CUDAContext>( \
const int count, \
const T* x1, \
const T2 x2, \
const T* a, \
const T2 b, \
T* y, \
CUDAContext* ctx) { \
_##name \
<<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
count, x1, x2, y \
count, a, b, y \
); \
}
#define DEFINE_MINIMUM_GRAD_KERNEL_LAUNCHER(name, T, T2) \
template <> void name<T, CUDAContext>( \
const int count, \
const T* x1, \
const T2 x2, \
const T* a, \
const T2 b, \
const T* dy, \
T* dx1, \
T* dx2, \
T* da, \
T* db, \
CUDAContext* ctx) { \
_##name \
<<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
count, x1, x2, dy, dx1, dx2 \
count, a, b, dy, da, db \
); \
}
......@@ -191,73 +194,73 @@ DEFINE_MINIMUM_GRAD_KERNEL_LAUNCHER(BroadcastMinimumGrad, double, double);
template <> void Minimum<float16, CUDAContext>(
const int count,
const float16* x1,
const float16* x2,
const float16* a,
const float16* b,
float16* y,
CUDAContext* ctx) {
    _Minimum

<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count,
reinterpret_cast<const half*>(x1),
reinterpret_cast<const half*>(x2),
reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y)
);
}
template <> void BroadcastMinimum<float16, CUDAContext>(
const int count,
const float16* x1,
const float16 x2,
const float16* a,
const float16 b,
float16* y,
CUDAContext* ctx) {
    _BroadcastMinimum
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count,
reinterpret_cast<const half*>(x1),
cast::to<half>(x2),
reinterpret_cast<const half*>(a),
cast::to<half>(b),
reinterpret_cast<half*>(y)
);
}
template <> void MinimumGrad<float16, CUDAContext>(
const int count,
const float16* x1,
const float16* x2,
const float16* a,
const float16* b,
const float16* dy,
float16* dx1,
float16* dx2,
float16* da,
float16* db,
CUDAContext* ctx) {
    _MinimumGrad
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count,
reinterpret_cast<const half*>(x1),
reinterpret_cast<const half*>(x2),
reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(b),
reinterpret_cast<const half*>(dy),
reinterpret_cast<half*>(dx1),
reinterpret_cast<half*>(dx2)
reinterpret_cast<half*>(da),
reinterpret_cast<half*>(db)
);
}
template <> void BroadcastMinimumGrad<float16, CUDAContext>(
const int count,
const float16* x1,
const float16 x2,
const float16* a,
const float16 b,
const float16* dy,
float16* dx1,
float16* dx2,
float16* da,
float16* db,
CUDAContext* ctx) {
    _BroadcastMinimumGrad
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count,
reinterpret_cast<const half*>(x1),
cast::to<half>(x2),
reinterpret_cast<const half*>(a),
cast::to<half>(b),
reinterpret_cast<const half*>(dy),
reinterpret_cast<half*>(dx1),
reinterpret_cast<half*>(dx2)
reinterpret_cast<half*>(da),
reinterpret_cast<half*>(db)
);
}
......
#include "utils/cast.h"
#include "utils/op_kernel.h"
#include "utils/math_functions.h"
#include "utils/omp_alternative.h"
namespace dragon {
......@@ -49,6 +49,8 @@ void _IndexSelectGrad(
T* dx,
CPUContext* ctx) {
int64_t x_offset, select_idx;
auto nelements = outer_dim * axis_dim * inner_dim;
math::Set(nelements, cast::to<T>(0.f), dx, ctx);
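  // The gradient buffer is zero-filled first because the scatter below only
  // writes the rows selected by `indices`; all other rows must stay zero.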
for (int n = 0; n < outer_dim; ++n) {
for (int i = 0; i < num_indices; ++i) {
select_idx = indices[i];
......
#ifdef WITH_CUDA
#include "core/context_cuda.h"
#include "utils/cast.h"
#include "utils/op_kernel.h"
#include "utils/cub_device.h"
#include "utils/math_functions.h"
namespace dragon {
......@@ -134,6 +135,8 @@ template <> __global__ void _IndexSelectGrad<half>(
T* dx, \
CUDAContext* ctx) { \
auto nthreads = outer_dim * inner_dim; \
auto nelements = outer_dim * axis_dim * inner_dim; \
math::Set(nelements, cast::to<T>(0.f), dx, ctx); \
_IndexSelectGrad \
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
......@@ -169,6 +172,8 @@ template <> void IndexSelectGrad<float16, CUDAContext>(
float16* dx,
CUDAContext* ctx) {
auto nthreads = outer_dim * inner_dim;
auto nelements = outer_dim * axis_dim * inner_dim;
math::Set(nelements, cast::to<float16>(0.f), dx, ctx);
_IndexSelectGrad
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
......
#include "core/tensor.h"
#include "utils/cast.h"
#include "utils/op_kernel.h"
#include "utils/math_functions.h"
namespace dragon {
namespace kernel {
/* <T = ?, Device = CPU> */
template <typename T>
void _MaskedSelectGrad(
const int num_indices,
const int64_t* indices,
const T* dy,
T* dx) {
for (int i = 0; i < num_indices; ++i) {
dx[indices[i]] = dy[i];
}
}
/* Kernel Launchers */
#define DEFINE_MASKED_KERNEL_LAUNCHER(T) \
template <> void MaskedSelect<T, CPUContext>( \
const int count, \
const uint8_t* mask, \
const T* x, \
Tensor* indices, \
Tensor* scratch, \
Tensor* y, \
CPUContext* ctx) { \
int64_t nelements = 0; \
int64_t n, last = -1, y_ofs = 0; \
for (int i = 0; i < count; ++i) \
if (mask[i]) ++nelements; \
auto* value = y == nullptr ? nullptr : y \
->Reshape({ nelements }) \
->mutable_data<T, CPUContext>(); \
auto* index = indices \
->Reshape({ nelements }) \
->mutable_data<int64_t, CPUContext>(); \
for (int64_t i = 0;; ++i) { \
if (last != -1 && ((i >= count) || !mask[i])) { \
n = i - last; \
if (value != nullptr) { \
auto* src = x + last; \
auto* dst = value + y_ofs; \
math::Copy(n, src, dst, ctx); \
} \
y_ofs += n; last = -1; \
} \
if (i >= count) break; \
if (mask[i]) { \
*(index++) = i; \
if (last == -1) last = i; \
} \
} \
}
#define DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(T) \
template <> void MaskedSelectGrad<T, CPUContext>( \
const int count, \
const int num_indices, \
const int64_t* indices, \
const T* dy, \
T* dx, \
CPUContext* ctx) { \
math::Set(count, cast::to<T>(0.f), dx, ctx); \
_MaskedSelectGrad(num_indices, indices, dy, dx); \
}
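/*
 * A worked example of the CPU MaskedSelect path above (illustrative only):
 * with count = 5, mask = {0, 1, 1, 0, 1} and x = {10, 20, 30, 40, 50},
 * two contiguous runs [1, 2] and [4, 4] are detected and copied, giving
 * indices = {1, 2, 4} and y = {20, 30, 50} (both reshaped to length 3).
 * The grad launcher then scatters dy back through the saved indices into a
 * zero-filled dx.
 */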
DEFINE_MASKED_KERNEL_LAUNCHER(bool);
DEFINE_MASKED_KERNEL_LAUNCHER(int8_t);
DEFINE_MASKED_KERNEL_LAUNCHER(uint8_t);
DEFINE_MASKED_KERNEL_LAUNCHER(int);
DEFINE_MASKED_KERNEL_LAUNCHER(int64_t);
DEFINE_MASKED_KERNEL_LAUNCHER(float16);
DEFINE_MASKED_KERNEL_LAUNCHER(float);
DEFINE_MASKED_KERNEL_LAUNCHER(double);
DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(bool);
DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(int8_t);
DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(uint8_t);
DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(int);
DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(int64_t);
DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(float16);
DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(float);
DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_MASKED_KERNEL_LAUNCHER
#undef DEFINE_MASKED_GRAD_KERNEL_LAUNCHER
} // namespace kernel
} // namespace dragon
\ No newline at end of file
#ifdef WITH_CUDA
#include "core/tensor.h"
#include "core/context_cuda.h"
#include "utils/cast.h"
#include "utils/op_kernel.h"
#include "utils/cub_device.h"
#include "utils/math_functions.h"
namespace dragon {
namespace kernel {
/* <T = ?, Device = CUDA> */
template <typename T>
__global__ void _MaskedSelectByIndex(
const int nthreads,
const int64_t* indices,
const T* x,
T* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = x[indices[i]];
}
}
template <typename T>
__global__ void _MaskedSelectGrad(
const int nthreads,
const int64_t* indices,
const T* dy,
T* dx) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
dx[indices[i]] = dy[i];
}
}
/* Kernel Launchers */
#define DEFINE_MASKED_KERNEL_LAUNCHER(T) \
template <> void MaskedSelect<T, CUDAContext>( \
const int count, \
const uint8_t* mask, \
const T* x, \
Tensor* indices, \
Tensor* scratch, \
Tensor* y, \
CUDAContext* ctx) { \
auto* i = indices \
->Reshape({ count + 1 }) \
->mutable_data<int64_t, CUDAContext>(); \
auto* n = (int*)(i + count); \
size_t nbytes = 0; int nelements; \
cub::CountingInputIterator<int> itr(0); \
cub::DeviceSelect::Flagged( \
nullptr, nbytes, \
itr, mask, i, n, count, \
ctx->cuda_stream() \
); \
auto* storage = scratch \
->Reshape({ (int64_t)nbytes }) \
->mutable_data<uint8_t, CUDAContext>(); \
cub::DeviceSelect::Flagged( \
storage, nbytes, \
itr, mask, i, n, count, \
ctx->cuda_stream() \
); \
ctx->FinishDeviceCompution(); \
ctx->Memcpy<CPUContext, CUDAContext>( \
sizeof(int), &nelements, n); \
indices->Reshape({ nelements }); \
if (y == nullptr) return; \
auto* value = y \
->Reshape({ nelements }) \
->mutable_data<T, CUDAContext>(); \
_MaskedSelectByIndex \
<<< CUDA_BLOCKS(nelements), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
nelements, i, x, value \
); \
}
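/*
 * The CUDA MaskedSelect launcher above follows cub's standard two-call
 * pattern: the first cub::DeviceSelect::Flagged call, given a null
 * temporary-storage pointer, only reports the required workspace size in
 * `nbytes`; the second call performs the actual selection. Because a
 * CountingInputIterator is used as the input, the "selected values" written
 * to `i` are the positions where the mask is set, and `n` receives the
 * number of selected positions.
 */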
#define DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(T) \
template <> void MaskedSelectGrad<T, CUDAContext>( \
const int count, \
const int num_indices, \
const int64_t* indices, \
const T* dy, \
T* dx, \
CUDAContext* ctx) { \
math::Set(count, cast::to<T>(0.f), dx, ctx); \
_MaskedSelectGrad \
<<< CUDA_BLOCKS(num_indices), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
num_indices, indices, dy, dx \
); \
}
DEFINE_MASKED_KERNEL_LAUNCHER(bool);
DEFINE_MASKED_KERNEL_LAUNCHER(int8_t);
DEFINE_MASKED_KERNEL_LAUNCHER(uint8_t);
DEFINE_MASKED_KERNEL_LAUNCHER(int);
DEFINE_MASKED_KERNEL_LAUNCHER(int64_t);
DEFINE_MASKED_KERNEL_LAUNCHER(float16);
DEFINE_MASKED_KERNEL_LAUNCHER(float);
DEFINE_MASKED_KERNEL_LAUNCHER(double);
DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(bool);
DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(int8_t);
DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(uint8_t);
DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(int);
DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(int64_t);
DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(float16);
DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(float);
DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_MASKED_KERNEL_LAUNCHER
#undef DEFINE_MASKED_GRAD_KERNEL_LAUNCHER
} // namespace kernel
} // namespace dragon
#endif // WITH_CUDA
\ No newline at end of file
#include "utils/op_kernel.h"
namespace dragon {
namespace kernel {
#define FIXED_DIVISOR_DIV_MOD(d, n, q, r) \
do { \
const auto n_copy = n; \
*q = n_copy / d; \
*r = n_copy % d; \
} while (0)
template <> void UnravelIndex<CPUContext>(
const int count,
const int ndims,
const int* dims,
const int64_t* x,
int64_t* y,
CPUContext* ctx) {
int tmp, d; int64_t* Y;
for (int i = 0; i < count; ++i) {
tmp = x[i]; Y = y + i * ndims;
for (d = ndims - 1; d >= 0; --d) {
FIXED_DIVISOR_DIV_MOD(dims[d], tmp, &tmp, (Y + d));
}
}
}
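// A worked example (illustrative only): with ndims = 3, dims = {2, 3, 4} and
// a flat index x[0] = 17, the loop above peels off the fastest dimension
// first: 17 % 4 = 1, 4 % 3 = 1, 1 % 2 = 1, so y = {1, 1, 1}
// (and indeed 1 * 12 + 1 * 4 + 1 = 17).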
#undef FIXED_DIVISOR_DIV_MOD
} // namespace kernel
} // namespace dragon
\ No newline at end of file
#ifdef WITH_CUDA
#include "core/context_cuda.h"
#include "utils/op_kernel.h"
namespace dragon {
namespace kernel {
#define FIXED_DIVISOR_DIV_MOD(d, n, q, r) \
do { \
const auto n_copy = n; \
*q = n_copy / d; \
*r = n_copy % d; \
} while (0)
/* <T = ?, Device = CUDA> */
__global__ void _UnravelIndex(
const int nthreads,
const int ndims,
const int* dims,
const int64_t* x,
int64_t* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
int tmp = x[i], d;
int64_t* Y = y + i * ndims;
#pragma unroll
for (d = ndims - 1; d >= 0; --d) {
#if __CUDA_ARCH__ >= 350
FIXED_DIVISOR_DIV_MOD(__ldg(dims + d), tmp, &tmp, (Y + d));
#else
FIXED_DIVISOR_DIV_MOD(dims[d], tmp, &tmp, (Y + d));
#endif
}
}
}
template <> void UnravelIndex<CUDAContext>(
const int count,
const int ndims,
const int* dims,
const int64_t* x,
int64_t* y,
CUDAContext* ctx) {
_UnravelIndex
        <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count, ndims, dims, x, y
);
}
#undef FIXED_DIVISOR_DIV_MOD
} // namespace kernel
} // namespace dragon
#endif // WITH_CUDA
\ No newline at end of file
#include "utils/op_kernel.h"
#include "utils/omp_alternative.h"
namespace dragon {
namespace kernel {
/* <T = ?, Device = CPU> */
template <typename T>
void _Where(
const int count,
const uint8_t* mask,
const T* a,
const T* b,
T* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
y[i] = mask[i] ? a[i] : b[i];
}
}
template <typename T>
void _WhereGrad(
const int count,
const uint8_t* mask,
const T* dy,
T* da,
T* db) {
const T kZero = T(0);
#ifdef WITH_OMP
#pragma omp parallel for num_threads(OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
const bool dy_to_da = mask[i];
da[i] = dy_to_da ? dy[i] : kZero;
db[i] = dy_to_da ? kZero : dy[i];
}
}
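/*
 * Semantics of the pair above (illustrative only): Where picks a[i] when
 * mask[i] is nonzero and b[i] otherwise, e.g. mask = {1, 0, 1},
 * a = {1, 2, 3}, b = {7, 8, 9} gives y = {1, 8, 3}. WhereGrad routes the
 * incoming gradient dy entirely to da where the mask is set and to db
 * elsewhere, so da = {dy[0], 0, dy[2]} and db = {0, dy[1], 0}.
 */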
/* Kernel Launchers */
#define DEFINE_WHERE_KERNEL_LAUNCHER(T) \
template<> void Where<T, CPUContext>( \
const int count, \
const uint8_t* mask, \
const T* a, \
const T* b, \
T* y, \
CPUContext* ctx) { \
_Where(count, mask, a, b, y); \
}
#define DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(T) \
template <> void WhereGrad<T, CPUContext>( \
const int count, \
const uint8_t* mask, \
const T* dy, \
T* da, \
T* db, \
CPUContext* ctx) { \
_WhereGrad(count, mask, dy, da, db); \
}
DEFINE_WHERE_KERNEL_LAUNCHER(bool);
DEFINE_WHERE_KERNEL_LAUNCHER(int8_t);
DEFINE_WHERE_KERNEL_LAUNCHER(uint8_t);
DEFINE_WHERE_KERNEL_LAUNCHER(int);
DEFINE_WHERE_KERNEL_LAUNCHER(int64_t);
DEFINE_WHERE_KERNEL_LAUNCHER(float16);
DEFINE_WHERE_KERNEL_LAUNCHER(float);
DEFINE_WHERE_KERNEL_LAUNCHER(double);
DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(bool);
DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(int8_t);
DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(uint8_t);
DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(int);
DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(int64_t);
DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(double);
template <> void WhereGrad<float16, CPUContext>(
const int count,
const uint8_t* mask,
const float16* dy,
float16* da,
float16* db,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
#undef DEFINE_WHERE_KERNEL_LAUNCHER
#undef DEFINE_WHERE_GRAD_KERNEL_LAUNCHER
} // namespace kernel
} // namespace dragon
\ No newline at end of file
#ifdef WITH_CUDA
#include "core/context_cuda.h"
#include "utils/op_kernel.h"
namespace dragon {
namespace kernel {
/* <T = ?, Device = CUDA> */
template<typename T>
__global__ void _Where(
const int nthreads,
const uint8_t* mask,
const T* a,
const T* b,
T* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = mask[i] ? a[i] : b[i];
}
}
template <typename T>
__global__ void _WhereGrad(
const int nthreads,
const uint8_t* mask,
const T* dy,
T* da,
T* db) {
const T kZero = T(0);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
da[i] = mask[i] ? dy[i] : kZero;
db[i] = mask[i] ? kZero : dy[i];
}
}
template<> __global__ void _WhereGrad<half>(
const int nthreads,
const uint8_t* mask,
const half* dy,
half* da,
half* db) {
#if __CUDA_ARCH__ >= 530
const half kZero = __float2half(0.f);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
const bool dy_to_da = mask[i];
da[i] = dy_to_da ? dy[i] : kZero;
db[i] = dy_to_da ? kZero : dy[i];
}
#endif
}
/* Kernel Launchers */
#define DEFINE_WHERE_KERNEL_LAUNCHER(T) \
template<> void Where<T, CUDAContext>( \
const int count, \
const uint8_t* mask, \
const T* a, \
const T* b, \
T* y, \
CUDAContext* ctx) { \
_Where \
<<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
count, mask, a, b, y \
); \
}
#define DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(T) \
template <> void WhereGrad<T, CUDAContext>( \
const int count, \
const uint8_t* mask, \
const T* dy, \
T* da, \
T* db, \
CUDAContext* ctx) { \
_WhereGrad \
<<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
count, mask, dy, da, db \
); \
}
DEFINE_WHERE_KERNEL_LAUNCHER(bool);
DEFINE_WHERE_KERNEL_LAUNCHER(int8_t);
DEFINE_WHERE_KERNEL_LAUNCHER(uint8_t);
DEFINE_WHERE_KERNEL_LAUNCHER(int);
DEFINE_WHERE_KERNEL_LAUNCHER(int64_t);
DEFINE_WHERE_KERNEL_LAUNCHER(float16);
DEFINE_WHERE_KERNEL_LAUNCHER(float);
DEFINE_WHERE_KERNEL_LAUNCHER(double);
DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(bool);
DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(int8_t);
DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(uint8_t);
DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(int);
DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(int64_t);
DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(double);
template <> void WhereGrad<float16, CUDAContext>(
const int count,
const uint8_t* mask,
const float16* dy,
float16* da,
float16* db,
CUDAContext* ctx) {
_WhereGrad
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count,
mask,
reinterpret_cast<const half*>(dy),
reinterpret_cast<half*>(da),
reinterpret_cast<half*>(db)
);
}
#undef DEFINE_WHERE_KERNEL_LAUNCHER
#undef DEFINE_WHERE_GRAD_KERNEL_LAUNCHER
} // namespace kernel
} // namespace dragon
#endif // WITH_CUDA
\ No newline at end of file
#include "utils/op_kernel.h"
#include "utils/omp_alternative.h"
#include "utils/eigen_utils.h"
namespace dragon {
namespace kernel {
/* <T = ?, Device = CPU> */
template <typename T>
void _EqualInteger(
const int count,
const T* a,
const T* b,
bool* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
y[i] = a[i] == b[i] ? true : false;
}
}
template <typename T>
void _EqualFloat(
const int count,
const T* a,
const T* b,
bool* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
y[i] = fabs(a[i] - b[i]) < 1e-15 ? true : false;
}
}
/* <T = ?, Device = CPU> */
template <typename T>
void _Less(
const int count,
const T* a,
const T* b,
bool* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
y[i] = a[i] < b[i] ? true : false;
}
}
/* <T = ?, Device = CPU> */
template <typename T>
void _LessEqual(
const int count,
const T* a,
const T* b,
bool* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
y[i] = a[i] <= b[i] ? true : false;
}
}
/* <T = ?, Device = CPU> */
template <typename T>
void _Greater(
const int count,
const T* a,
const T* b,
bool* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
y[i] = a[i] > b[i] ? true : false;
}
}
/* <T = ?, Device = CPU> */
/* Kernel Launchers */
template <typename T>
void _GreaterEqual(
const int count,
const T* a,
const T* b,
bool* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
y[i] = a[i] >= b[i] ? true : false;
#define DEFINE_NOTZERO_KERNEL_LAUNCHER(T) \
template <> void NotZero<T, CPUContext>( \
const int count, \
const T* x, \
bool* y, \
CPUContext* ctx) { \
EigenVectorArrayMap<bool>(y, count) = \
ConstEigenVectorArrayMap<T>(x, count) != T(0); \
}
}
/* Kernel Launchers */
#define DEFINE_COMPARE_WARPPER(T, OP, IMPL) \
#define DEFINE_COMPARE_KERNEL_LAUNCHER(T, OP, expr) \
template <> void OP<T, CPUContext>( \
const int count, \
const T* a, \
const T* b, \
bool* y, \
CPUContext* ctx) { \
IMPL(count, a, b, y); \
EigenVectorArrayMap<bool>(y, count) = \
ConstEigenVectorArrayMap<T>(a, count) expr \
ConstEigenVectorArrayMap<T>(b, count); \
}
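/*
 * A sketch of how the Eigen-based launcher above behaves (illustrative
 * only): Equal<int, CPUContext> with a = {1, 2, 3} and b = {1, 0, 3}
 * writes y = {true, false, true}, while NotEqual writes the complement;
 * each launcher evaluates the whole comparison as one vectorized Eigen
 * array expression instead of an explicit element loop.
 */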
DEFINE_COMPARE_WARPPER(bool, Equal, _EqualInteger);
DEFINE_COMPARE_WARPPER(int8_t, Equal, _EqualInteger);
DEFINE_COMPARE_WARPPER(uint8_t, Equal, _EqualInteger);
DEFINE_COMPARE_WARPPER(int, Equal, _EqualInteger);
DEFINE_COMPARE_WARPPER(int64_t, Equal, _EqualInteger);
DEFINE_COMPARE_WARPPER(float, Equal, _EqualFloat);
DEFINE_COMPARE_WARPPER(double, Equal, _EqualFloat);
DEFINE_COMPARE_WARPPER(bool, Less, _Less);
DEFINE_COMPARE_WARPPER(int8_t, Less, _Less);
DEFINE_COMPARE_WARPPER(uint8_t, Less, _Less);
DEFINE_COMPARE_WARPPER(int, Less, _Less);
DEFINE_COMPARE_WARPPER(int64_t, Less, _Less);
DEFINE_COMPARE_WARPPER(float, Less, _Less);
DEFINE_COMPARE_WARPPER(double, Less, _Less);
DEFINE_COMPARE_WARPPER(bool, LessEqual, _LessEqual);
DEFINE_COMPARE_WARPPER(int8_t, LessEqual, _LessEqual);
DEFINE_COMPARE_WARPPER(uint8_t, LessEqual, _LessEqual);
DEFINE_COMPARE_WARPPER(int, LessEqual, _LessEqual);
DEFINE_COMPARE_WARPPER(int64_t, LessEqual, _LessEqual);
DEFINE_COMPARE_WARPPER(float, LessEqual, _LessEqual);
DEFINE_COMPARE_WARPPER(double, LessEqual, _LessEqual);
DEFINE_COMPARE_WARPPER(bool, Greater, _Greater);
DEFINE_COMPARE_WARPPER(int8_t, Greater, _Greater);
DEFINE_COMPARE_WARPPER(uint8_t, Greater, _Greater);
DEFINE_COMPARE_WARPPER(int, Greater, _Greater);
DEFINE_COMPARE_WARPPER(int64_t, Greater, _Greater);
DEFINE_COMPARE_WARPPER(float, Greater, _Greater);
DEFINE_COMPARE_WARPPER(double, Greater, _Greater);
DEFINE_COMPARE_WARPPER(bool, GreaterEqual, _GreaterEqual);
DEFINE_COMPARE_WARPPER(int8_t, GreaterEqual, _GreaterEqual);
DEFINE_COMPARE_WARPPER(uint8_t, GreaterEqual, _GreaterEqual);
DEFINE_COMPARE_WARPPER(int, GreaterEqual, _GreaterEqual);
DEFINE_COMPARE_WARPPER(int64_t, GreaterEqual, _GreaterEqual);
DEFINE_COMPARE_WARPPER(float, GreaterEqual, _GreaterEqual);
DEFINE_COMPARE_WARPPER(double, GreaterEqual, _GreaterEqual);
DEFINE_NOTZERO_KERNEL_LAUNCHER(bool);
DEFINE_NOTZERO_KERNEL_LAUNCHER(int8_t);
DEFINE_NOTZERO_KERNEL_LAUNCHER(uint8_t);
DEFINE_NOTZERO_KERNEL_LAUNCHER(int);
DEFINE_NOTZERO_KERNEL_LAUNCHER(int64_t);
DEFINE_NOTZERO_KERNEL_LAUNCHER(float);
DEFINE_NOTZERO_KERNEL_LAUNCHER(double);
DEFINE_COMPARE_KERNEL_LAUNCHER(bool, Equal, ==);
DEFINE_COMPARE_KERNEL_LAUNCHER(int8_t, Equal, ==);
DEFINE_COMPARE_KERNEL_LAUNCHER(uint8_t, Equal, ==);
DEFINE_COMPARE_KERNEL_LAUNCHER(int, Equal, ==);
DEFINE_COMPARE_KERNEL_LAUNCHER(int64_t, Equal, ==);
DEFINE_COMPARE_KERNEL_LAUNCHER(float, Equal, ==);
DEFINE_COMPARE_KERNEL_LAUNCHER(double, Equal, ==);
DEFINE_COMPARE_KERNEL_LAUNCHER(bool, NotEqual, !=);
DEFINE_COMPARE_KERNEL_LAUNCHER(int8_t, NotEqual, !=);
DEFINE_COMPARE_KERNEL_LAUNCHER(uint8_t, NotEqual, !=);
DEFINE_COMPARE_KERNEL_LAUNCHER(int, NotEqual, !=);
DEFINE_COMPARE_KERNEL_LAUNCHER(int64_t, NotEqual, !=);
DEFINE_COMPARE_KERNEL_LAUNCHER(float, NotEqual, !=);
DEFINE_COMPARE_KERNEL_LAUNCHER(double, NotEqual, !=);
DEFINE_COMPARE_KERNEL_LAUNCHER(bool, Less, <);
DEFINE_COMPARE_KERNEL_LAUNCHER(int8_t, Less, <);
DEFINE_COMPARE_KERNEL_LAUNCHER(uint8_t, Less, <);
DEFINE_COMPARE_KERNEL_LAUNCHER(int, Less, <);
DEFINE_COMPARE_KERNEL_LAUNCHER(int64_t, Less, <);
DEFINE_COMPARE_KERNEL_LAUNCHER(float, Less, <);
DEFINE_COMPARE_KERNEL_LAUNCHER(double, Less, <);
DEFINE_COMPARE_KERNEL_LAUNCHER(bool, LessEqual, <=);
DEFINE_COMPARE_KERNEL_LAUNCHER(int8_t, LessEqual, <=);
DEFINE_COMPARE_KERNEL_LAUNCHER(uint8_t, LessEqual, <=);
DEFINE_COMPARE_KERNEL_LAUNCHER(int, LessEqual, <=);
DEFINE_COMPARE_KERNEL_LAUNCHER(int64_t, LessEqual, <=);
DEFINE_COMPARE_KERNEL_LAUNCHER(float, LessEqual, <=);
DEFINE_COMPARE_KERNEL_LAUNCHER(double, LessEqual, <=);
DEFINE_COMPARE_KERNEL_LAUNCHER(bool, Greater, >);
DEFINE_COMPARE_KERNEL_LAUNCHER(int8_t, Greater, >);
DEFINE_COMPARE_KERNEL_LAUNCHER(uint8_t, Greater, >);
DEFINE_COMPARE_KERNEL_LAUNCHER(int, Greater, >);
DEFINE_COMPARE_KERNEL_LAUNCHER(int64_t, Greater, >);
DEFINE_COMPARE_KERNEL_LAUNCHER(float, Greater, >);
DEFINE_COMPARE_KERNEL_LAUNCHER(double, Greater, >);
DEFINE_COMPARE_KERNEL_LAUNCHER(bool, GreaterEqual, >=);
DEFINE_COMPARE_KERNEL_LAUNCHER(int8_t, GreaterEqual, >=);
DEFINE_COMPARE_KERNEL_LAUNCHER(uint8_t, GreaterEqual, >=);
DEFINE_COMPARE_KERNEL_LAUNCHER(int, GreaterEqual, >=);
DEFINE_COMPARE_KERNEL_LAUNCHER(int64_t, GreaterEqual, >=);
DEFINE_COMPARE_KERNEL_LAUNCHER(float, GreaterEqual, >=);
DEFINE_COMPARE_KERNEL_LAUNCHER(double, GreaterEqual, >=);
template <> void NotZero<float16, CPUContext>(
const int count,
const float16* x,
bool* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
template <> void Equal<float16, CPUContext>(
const int count,
......@@ -160,6 +102,15 @@ template <> void Equal<float16, CPUContext>(
CPU_FP16_NOT_SUPPORTED;
}
template <> void NotEqual<float16, CPUContext>(
const int count,
const float16* a,
const float16* b,
bool* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
template <> void Less<float16, CPUContext>(
const int count,
const float16* a,
......@@ -196,7 +147,8 @@ template <> void GreaterEqual<float16, CPUContext>(
CPU_FP16_NOT_SUPPORTED;
}
#undef DEFINE_COMPARE_WARPPER
#undef DEFINE_NOTZERO_KERNEL_LAUNCHER
#undef DEFINE_COMPARE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -10,6 +10,31 @@ namespace kernel {
/* <T = ?, Device = CUDA> */
template <typename T>
__global__ void _NotZero(
const int nthreads,
const T* x,
bool* y) {
const T kZero = T(0);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = x[i] != kZero ? true : false;
}
}
template<> __global__ void _NotZero<half>(
const int nthreads,
const half* x,
bool* y) {
#if __CUDA_ARCH__ >= 530
const half kZero = __float2half(0.f);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = __hne(x[i], kZero) ? true : false;
}
#endif
}
/* <T = ?, Device = CUDA> */
template <typename T>
__global__ void _EqualInteger(
const int nthreads,
const T* a,
......@@ -46,6 +71,42 @@ __global__ void _EqualFloat(
/* <T = ?, Device = CUDA> */
template <typename T>
__global__ void _NotEqualInteger(
const int nthreads,
const T* a,
const T* b,
bool* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = a[i] != b[i] ? true : false;
}
}
__global__ void _NotEqualHalf(
const int nthreads,
const half* a,
const half* b,
bool* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530
y[i] = __hne(a[i], b[i]) ? true : false;
#endif
}
}
template <typename T>
__global__ void _NotEqualFloat(
const int nthreads,
const T* a,
const T* b,
bool* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = fabs(a[i] - b[i]) > 1e-15 ? true : false;
}
}
/* <T = ?, Device = CUDA> */
template <typename T>
__global__ void _Less(
const int nthreads,
const T* a,
......@@ -145,7 +206,20 @@ __global__ void _GreaterEqualHalf(
/* Kernel Launchers */
#define DEFINE_COMPARE_WARPPER(T, OP, IMPL) \
#define DEFINE_NOTZERO_KERNEL_LAUNCHER(T) \
template <> void NotZero<T, CUDAContext>( \
const int count, \
const T* x, \
bool* y, \
CUDAContext* ctx) { \
_NotZero \
<<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
count, x, y \
); \
}
#define DEFINE_COMPARE_KERNEL_LAUNCHER(T, OP, IMPL) \
template <> void OP<T, CUDAContext>( \
const int count, \
const T* a, \
......@@ -159,7 +233,7 @@ __global__ void _GreaterEqualHalf(
); \
}
#define DEFINE_COMPARE_FP16_WARPPER(OP) \
#define DEFINE_COMPARE_FP16_KERNEL_LAUNCHER(OP) \
template <> void OP<float16, CUDAContext>( \
const int count, \
const float16* a, \
......@@ -176,53 +250,85 @@ __global__ void _GreaterEqualHalf(
); \
}
DEFINE_COMPARE_WARPPER(bool, Equal, _EqualInteger);
DEFINE_COMPARE_WARPPER(int8_t, Equal, _EqualInteger);
DEFINE_COMPARE_WARPPER(uint8_t, Equal, _EqualInteger);
DEFINE_COMPARE_WARPPER(int, Equal, _EqualInteger);
DEFINE_COMPARE_WARPPER(int64_t, Equal, _EqualInteger);
DEFINE_COMPARE_WARPPER(float, Equal, _EqualFloat);
DEFINE_COMPARE_WARPPER(double, Equal, _EqualFloat);
DEFINE_COMPARE_FP16_WARPPER(Equal);
DEFINE_COMPARE_WARPPER(bool, Less, _Less);
DEFINE_COMPARE_WARPPER(int8_t, Less, _Less);
DEFINE_COMPARE_WARPPER(uint8_t, Less, _Less);
DEFINE_COMPARE_WARPPER(int, Less, _Less);
DEFINE_COMPARE_WARPPER(int64_t, Less, _Less);
DEFINE_COMPARE_WARPPER(float, Less, _Less);
DEFINE_COMPARE_WARPPER(double, Less, _Less);
DEFINE_COMPARE_FP16_WARPPER(Less);
DEFINE_COMPARE_WARPPER(bool, LessEqual, _LessEqual);
DEFINE_COMPARE_WARPPER(int8_t, LessEqual, _LessEqual);
DEFINE_COMPARE_WARPPER(uint8_t, LessEqual, _LessEqual);
DEFINE_COMPARE_WARPPER(int, LessEqual, _LessEqual);
DEFINE_COMPARE_WARPPER(int64_t, LessEqual, _LessEqual);
DEFINE_COMPARE_WARPPER(float, LessEqual, _LessEqual);
DEFINE_COMPARE_WARPPER(double, LessEqual, _LessEqual);
DEFINE_COMPARE_FP16_WARPPER(LessEqual);
DEFINE_COMPARE_WARPPER(bool, Greater, _Greater);
DEFINE_COMPARE_WARPPER(int8_t, Greater, _Greater);
DEFINE_COMPARE_WARPPER(uint8_t, Greater, _Greater);
DEFINE_COMPARE_WARPPER(int, Greater, _Greater);
DEFINE_COMPARE_WARPPER(int64_t, Greater, _Greater);
DEFINE_COMPARE_WARPPER(float, Greater, _Greater);
DEFINE_COMPARE_WARPPER(double, Greater, _Greater);
DEFINE_COMPARE_FP16_WARPPER(Greater);
DEFINE_COMPARE_WARPPER(bool, GreaterEqual, _GreaterEqual);
DEFINE_COMPARE_WARPPER(int8_t, GreaterEqual, _GreaterEqual);
DEFINE_COMPARE_WARPPER(uint8_t, GreaterEqual, _GreaterEqual);
DEFINE_COMPARE_WARPPER(int, GreaterEqual, _GreaterEqual);
DEFINE_COMPARE_WARPPER(int64_t, GreaterEqual, _GreaterEqual);
DEFINE_COMPARE_WARPPER(float, GreaterEqual, _GreaterEqual);
DEFINE_COMPARE_WARPPER(double, GreaterEqual, _GreaterEqual);
DEFINE_COMPARE_FP16_WARPPER(GreaterEqual);
#undef DEFINE_COMPARE_WARPPER
#undef DEFINE_COMPARE_FP16_WARPPER
DEFINE_NOTZERO_KERNEL_LAUNCHER(bool);
DEFINE_NOTZERO_KERNEL_LAUNCHER(int8_t);
DEFINE_NOTZERO_KERNEL_LAUNCHER(uint8_t);
DEFINE_NOTZERO_KERNEL_LAUNCHER(int);
DEFINE_NOTZERO_KERNEL_LAUNCHER(int64_t);
DEFINE_NOTZERO_KERNEL_LAUNCHER(float);
DEFINE_NOTZERO_KERNEL_LAUNCHER(double);
DEFINE_COMPARE_KERNEL_LAUNCHER(bool, Equal, _EqualInteger);
DEFINE_COMPARE_KERNEL_LAUNCHER(int8_t, Equal, _EqualInteger);
DEFINE_COMPARE_KERNEL_LAUNCHER(uint8_t, Equal, _EqualInteger);
DEFINE_COMPARE_KERNEL_LAUNCHER(int, Equal, _EqualInteger);
DEFINE_COMPARE_KERNEL_LAUNCHER(int64_t, Equal, _EqualInteger);
DEFINE_COMPARE_KERNEL_LAUNCHER(float, Equal, _EqualFloat);
DEFINE_COMPARE_KERNEL_LAUNCHER(double, Equal, _EqualFloat);
DEFINE_COMPARE_FP16_KERNEL_LAUNCHER(Equal);
DEFINE_COMPARE_KERNEL_LAUNCHER(bool, NotEqual, _NotEqualInteger);
DEFINE_COMPARE_KERNEL_LAUNCHER(int8_t, NotEqual, _NotEqualInteger);
DEFINE_COMPARE_KERNEL_LAUNCHER(uint8_t, NotEqual, _NotEqualInteger);
DEFINE_COMPARE_KERNEL_LAUNCHER(int, NotEqual, _NotEqualInteger);
DEFINE_COMPARE_KERNEL_LAUNCHER(int64_t, NotEqual, _NotEqualInteger);
DEFINE_COMPARE_KERNEL_LAUNCHER(float, NotEqual, _NotEqualFloat);
DEFINE_COMPARE_KERNEL_LAUNCHER(double, NotEqual, _NotEqualFloat);
DEFINE_COMPARE_FP16_KERNEL_LAUNCHER(NotEqual);
DEFINE_COMPARE_KERNEL_LAUNCHER(bool, Less, _Less);
DEFINE_COMPARE_KERNEL_LAUNCHER(int8_t, Less, _Less);
DEFINE_COMPARE_KERNEL_LAUNCHER(uint8_t, Less, _Less);
DEFINE_COMPARE_KERNEL_LAUNCHER(int, Less, _Less);
DEFINE_COMPARE_KERNEL_LAUNCHER(int64_t, Less, _Less);
DEFINE_COMPARE_KERNEL_LAUNCHER(float, Less, _Less);
DEFINE_COMPARE_KERNEL_LAUNCHER(double, Less, _Less);
DEFINE_COMPARE_FP16_KERNEL_LAUNCHER(Less);
DEFINE_COMPARE_KERNEL_LAUNCHER(bool, LessEqual, _LessEqual);
DEFINE_COMPARE_KERNEL_LAUNCHER(int8_t, LessEqual, _LessEqual);
DEFINE_COMPARE_KERNEL_LAUNCHER(uint8_t, LessEqual, _LessEqual);
DEFINE_COMPARE_KERNEL_LAUNCHER(int, LessEqual, _LessEqual);
DEFINE_COMPARE_KERNEL_LAUNCHER(int64_t, LessEqual, _LessEqual);
DEFINE_COMPARE_KERNEL_LAUNCHER(float, LessEqual, _LessEqual);
DEFINE_COMPARE_KERNEL_LAUNCHER(double, LessEqual, _LessEqual);
DEFINE_COMPARE_FP16_KERNEL_LAUNCHER(LessEqual);
DEFINE_COMPARE_KERNEL_LAUNCHER(bool, Greater, _Greater);
DEFINE_COMPARE_KERNEL_LAUNCHER(int8_t, Greater, _Greater);
DEFINE_COMPARE_KERNEL_LAUNCHER(uint8_t, Greater, _Greater);
DEFINE_COMPARE_KERNEL_LAUNCHER(int, Greater, _Greater);
DEFINE_COMPARE_KERNEL_LAUNCHER(int64_t, Greater, _Greater);
DEFINE_COMPARE_KERNEL_LAUNCHER(float, Greater, _Greater);
DEFINE_COMPARE_KERNEL_LAUNCHER(double, Greater, _Greater);
DEFINE_COMPARE_FP16_KERNEL_LAUNCHER(Greater);
DEFINE_COMPARE_KERNEL_LAUNCHER(bool, GreaterEqual, _GreaterEqual);
DEFINE_COMPARE_KERNEL_LAUNCHER(int8_t, GreaterEqual, _GreaterEqual);
DEFINE_COMPARE_KERNEL_LAUNCHER(uint8_t, GreaterEqual, _GreaterEqual);
DEFINE_COMPARE_KERNEL_LAUNCHER(int, GreaterEqual, _GreaterEqual);
DEFINE_COMPARE_KERNEL_LAUNCHER(int64_t, GreaterEqual, _GreaterEqual);
DEFINE_COMPARE_KERNEL_LAUNCHER(float, GreaterEqual, _GreaterEqual);
DEFINE_COMPARE_KERNEL_LAUNCHER(double, GreaterEqual, _GreaterEqual);
DEFINE_COMPARE_FP16_KERNEL_LAUNCHER(GreaterEqual);
template <> void NotZero<float16, CUDAContext>(
const int count,
const float16* x,
bool* y,
CUDAContext* ctx) {
_NotZero
        <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count,
reinterpret_cast<const half*>(x),
y
);
}
#undef DEFINE_NOTZERO_KERNEL_LAUNCHER
#undef DEFINE_COMPARE_KERNEL_LAUNCHER
#undef DEFINE_COMPARE_FP16_KERNEL_LAUNCHER
} // namespace kernel
......
#include "utils/op_kernel.h"
#include "utils/math_utils.h"
#include "utils/omp_alternative.h"
namespace dragon {
namespace kernel {
/* <T = ?, Device = CPU> */
template <typename T>
void _MaskedAssign(
const int count,
const uint8_t* mask,
const T* x,
T* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
y[i] = mask[i] ? x[i] : y[i];
}
}
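// A worked example (illustrative only): with mask = {1, 0, 1}, x = {7, 8, 9}
// and y previously holding {0, 0, 0}, the loop above leaves y = {7, 0, 9};
// masked-off positions keep whatever y already contained.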
/* Kernel Launchers */
#define DEFINE_ASSIGN_KERNEL_LAUNCHER(T) \
template<> void MaskedAssign<T, CPUContext>( \
const int count, \
const uint8_t* mask, \
const T* x, \
T* y, \
CPUContext* ctx) { \
_MaskedAssign(count, mask, x, y); \
}
DEFINE_ASSIGN_KERNEL_LAUNCHER(bool);
DEFINE_ASSIGN_KERNEL_LAUNCHER(int8_t);
DEFINE_ASSIGN_KERNEL_LAUNCHER(uint8_t);
DEFINE_ASSIGN_KERNEL_LAUNCHER(int);
DEFINE_ASSIGN_KERNEL_LAUNCHER(int64_t);
DEFINE_ASSIGN_KERNEL_LAUNCHER(float16);
DEFINE_ASSIGN_KERNEL_LAUNCHER(float);
DEFINE_ASSIGN_KERNEL_LAUNCHER(double);
#undef DEFINE_ASSIGN_KERNEL_LAUNCHER
} // namespace kernel
} // namespace dragon
\ No newline at end of file
#ifdef WITH_CUDA
#include "core/context_cuda.h"
#include "utils/op_kernel.h"
namespace dragon {
namespace kernel {
/* <T = ?, Device = CUDA> */
template<typename T>
__global__ void _MaskedAssign(
const int nthreads,
const uint8_t* mask,
const T* x,
T* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = mask[i] ? x[i] : y[i];
}
}
/* Kernel Launchers */
#define DEFINE_ASSIGN_KERNEL_LAUNCHER(T) \
template<> void MaskedAssign<T, CUDAContext>( \
const int count, \
const uint8_t* mask, \
const T* x, \
T* y, \
CUDAContext* ctx) { \
_MaskedAssign \
<<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
count, mask, x, y \
); \
}
DEFINE_ASSIGN_KERNEL_LAUNCHER(bool);
DEFINE_ASSIGN_KERNEL_LAUNCHER(int8_t);
DEFINE_ASSIGN_KERNEL_LAUNCHER(uint8_t);
DEFINE_ASSIGN_KERNEL_LAUNCHER(int);
DEFINE_ASSIGN_KERNEL_LAUNCHER(int64_t);
DEFINE_ASSIGN_KERNEL_LAUNCHER(float16);
DEFINE_ASSIGN_KERNEL_LAUNCHER(float);
DEFINE_ASSIGN_KERNEL_LAUNCHER(double);
#undef DEFINE_ASSIGN_KERNEL_LAUNCHER
} // namespace kernel
} // namespace dragon
#endif // WITH_CUDA
\ No newline at end of file
......@@ -5,7 +5,7 @@ namespace dragon {
namespace kernel {
/*! AbsGrad <T = float32, Device = CPU> */
/* <T = float32, Device = CPU> */
template<> void AbsGrad<float, CPUContext>(
const int count,
......
......@@ -7,7 +7,7 @@ namespace dragon {
namespace kernel {
/*! AbsGrad <T = float32, Device = CUDA> */
/* <T = float32, Device = CUDA> */
template <typename T>
__global__ void _AbsGrad(
......
......@@ -5,7 +5,7 @@ namespace dragon {
namespace kernel {
/*! <Tx = ?, Ty = ?, Device = CPU> */
/* <Tx = ?, Ty = ?, Device = CPU> */
template <typename Tx, typename Ty>
void _NLLLoss(
......@@ -39,7 +39,7 @@ void _NLLLoss(
}
}
/*! <Tx = float32, Ty = float32, Device = CPU> */
/* <Tx = float32, Ty = float32, Device = CPU> */
template <> void NLLLoss<float, float, CPUContext>(
const int outer_dim,
......@@ -58,7 +58,7 @@ template <> void NLLLoss<float, float, CPUContext>(
);
}
/*! <Tx = float32, Ty = int64, Device = CPU> */
/* <Tx = float32, Ty = int64, Device = CPU> */
template <> void NLLLoss<float, int64_t, CPUContext>(
const int outer_dim,
......@@ -77,7 +77,7 @@ template <> void NLLLoss<float, int64_t, CPUContext>(
);
}
/*! <Tx = ?, Ty = ?, Device = CPU> */
/* <Tx = ?, Ty = ?, Device = CPU> */
template <typename Tx, typename Ty>
void _NLLLossGrad(
......@@ -108,7 +108,7 @@ void _NLLLossGrad(
}
}
/*! <Tx = float32, Ty = float32, Device = CPU> */
/* <Tx = float32, Ty = float32, Device = CPU> */
template<> void NLLLossGrad<float, float, CPUContext>(
const int outer_dim,
......@@ -127,7 +127,7 @@ template<> void NLLLossGrad<float, float, CPUContext>(
);
}
/*! <Tx = float32, Ty = int64, Device = CPU> */
/* <Tx = float32, Ty = int64, Device = CPU> */
template<> void NLLLossGrad<float, int64_t, CPUContext>(
const int outer_dim,
......
......@@ -7,7 +7,7 @@ namespace dragon {
namespace kernel {
/*! <Tx = float32, Ty = ?, Device = CUDA> */
/* <Tx = float32, Ty = ?, Device = CUDA> */
template <typename Tx, typename Ty>
__global__ void _NLLLoss(
......@@ -40,7 +40,7 @@ __global__ void _NLLLoss(
}
}
/*! <Tx = float32, Ty = float32, Device = CUDA> */
/* <Tx = float32, Ty = float32, Device = CUDA> */
template <> void NLLLoss<float, float, CUDAContext>(
const int outer_dim,
......@@ -62,7 +62,7 @@ template <> void NLLLoss<float, float, CUDAContext>(
);
}
/*! <Tx = float32, Ty = int64, Device = CUDA> */
/* <Tx = float32, Ty = int64, Device = CUDA> */
template <> void NLLLoss<float, int64_t, CUDAContext>(
const int outer_dim,
......@@ -84,7 +84,7 @@ template <> void NLLLoss<float, int64_t, CUDAContext>(
);
}
/*! <Tx = ?, Ty = ?, Device = CUDA> */
/* <Tx = ?, Ty = ?, Device = CUDA> */
template <typename Tx, typename Ty>
__global__ void _NLLLossGrad(
......@@ -114,7 +114,7 @@ __global__ void _NLLLossGrad(
}
}
/*! <Tx = float32, Ty = float32, Device = CUDA> */
/* <Tx = float32, Ty = float32, Device = CUDA> */
template<> void NLLLossGrad<float, float, CUDAContext>(
const int outer_dim,
......@@ -136,7 +136,7 @@ template<> void NLLLossGrad<float, float, CUDAContext>(
);
}
/*! <Tx = float32, Ty = int64, Device = CUDA> */
/* <Tx = float32, Ty = int64, Device = CUDA> */
template<> void NLLLossGrad<float, int64_t, CUDAContext>(
const int outer_dim,
......
......@@ -5,7 +5,7 @@ namespace dragon {
namespace kernel {
/*! SigmoidCrossEntropy <T = float32, Device = CPU> */
/* <T = float32, Device = CPU> */
template <> void SigmoidCrossEntropy<float, CPUContext>(
const int count,
......@@ -35,7 +35,7 @@ template <> void SigmoidCrossEntropy<float, CPUContext>(
}
}
/*! SigmoidCrossEntropyGrad <T = float32, Device = CPU> */
/* <T = float32, Device = CPU> */
template <> void SigmoidCrossEntropyGrad<float, CPUContext>(
const int count,
......
......@@ -4,7 +4,7 @@ namespace dragon {
namespace kernel {
/*! SigmoidFocalLoss <Tx = ?, Ty = ?, Device = CPU> */
/* <Tx = ?, Ty = ?, Device = CPU> */
template <typename Tx, typename Ty>
void _SigmoidFocalLoss(
......@@ -56,7 +56,7 @@ void _SigmoidFocalLoss(
}
}
/*! SigmoidFocalLoss <Tx = float32, Ty = float32, Device = CPU> */
/* <Tx = float32, Ty = float32, Device = CPU> */
template <> void SigmoidFocalLoss<float, float, CPUContext>(
const int outer_dim,
......@@ -78,7 +78,7 @@ template <> void SigmoidFocalLoss<float, float, CPUContext>(
);
}
/*! SigmoidFocalLoss <Tx = float32, Ty = int64, Device = CPU> */
/* <Tx = float32, Ty = int64, Device = CPU> */
template <> void SigmoidFocalLoss<float, int64_t, CPUContext>(
const int outer_dim,
......@@ -100,7 +100,7 @@ template <> void SigmoidFocalLoss<float, int64_t, CPUContext>(
);
}
/*! SigmoidFocalLossGrad <Tx = ?, Ty = ?, Device = CPU> */
/* <Tx = ?, Ty = ?, Device = CPU> */
template <typename Tx, typename Ty>
void _SigmoidFocalLossGrad(
......@@ -153,7 +153,7 @@ void _SigmoidFocalLossGrad(
}
}
/*! SigmoidFocalLossGrad <Tx = float32, Ty = float32, Device = CPU> */
/* <Tx = float32, Ty = float32, Device = CPU> */
template <> void SigmoidFocalLossGrad<float, float, CPUContext>(
const int outer_dim,
......@@ -175,7 +175,7 @@ template <> void SigmoidFocalLossGrad<float, float, CPUContext>(
);
}
/*! SigmoidFocalLossGrad <Tx = float32, Ty = int64_t, Device = CPU> */
/* <Tx = float32, Ty = int64_t, Device = CPU> */
template <> void SigmoidFocalLossGrad<float, int64_t, CPUContext>(
const int outer_dim,
......
......@@ -7,7 +7,7 @@ namespace dragon {
namespace kernel {
/*! SigmoidFocalLoss <Tx = ?, Ty = ?, Device = CUDA> */
/* <Tx = ?, Ty = ?, Device = CUDA> */
template <typename Tx, typename Ty>
__global__ void _SigmoidFocalLoss(
......@@ -54,7 +54,7 @@ __global__ void _SigmoidFocalLoss(
}
}
/*! SigmoidFocalLoss <Tx = float32, Ty = float32, Device = CUDA> */
/* <Tx = float32, Ty = float32, Device = CUDA> */
template <> void SigmoidFocalLoss<float, float, CUDAContext>(
const int outer_dim,
......@@ -79,7 +79,7 @@ template <> void SigmoidFocalLoss<float, float, CUDAContext>(
);
}
/*! SigmoidFocalLoss <Tx = float32, Ty = int64, Device = CUDA> */
/* <Tx = float32, Ty = int64, Device = CUDA> */
template <> void SigmoidFocalLoss<float, int64_t, CUDAContext>(
const int outer_dim,
......
......@@ -5,7 +5,7 @@ namespace dragon {
namespace kernel {
/*! SmoothL1 <T = float32, Device = CPU> */
/* <T = float32, Device = CPU> */
template<> void SmoothL1<float, CPUContext>(
const int count,
......@@ -27,7 +27,7 @@ template<> void SmoothL1<float, CPUContext>(
}
}
/*! SmoothL1Grad <T = float32, Device = CPU> */
/* <T = float32, Device = CPU> */
template<> void SmoothL1Grad<float, CPUContext>(
const int count,
......
......@@ -7,7 +7,7 @@ namespace dragon {
namespace kernel {
/*! SmoothL1 <T = float32, Device = CUDA> */
/* <T = float32, Device = CUDA> */
template <typename T>
__global__ void _SmoothL1(
......@@ -39,7 +39,7 @@ template<> void SmoothL1<float, CUDAContext>(
);
}
/*! SmoothL1Grad <T = float32, Device = CUDA> */
/* <T = float32, Device = CUDA> */
template <typename T>
__global__ void _SmoothL1Grad(
......
......@@ -5,7 +5,7 @@ namespace dragon {
namespace kernel {
/*! SoftmaxCrossEntropy <T = float32, Device = CPU> */
/* <T = float32, Device = CPU> */
template <> void SoftmaxCrossEntropy<float, CPUContext>(
const int count,
......
......@@ -7,7 +7,7 @@ namespace dragon {
namespace kernel {
/*! SoftmaxCrossEntropy <T = float32, Device = CUDA> */
/* <T = float32, Device = CUDA> */
template <typename T>
__global__ void _SoftmaxCrossEntropy(
......
......@@ -4,7 +4,7 @@ namespace dragon {
namespace kernel {
/*! SoftmaxFocalLoss <Tx = ?, Ty = ?, Device = CPU> */
/* <Tx = ?, Ty = ?, Device = CPU> */
template <typename Tx, typename Ty>
void _SoftmaxFocalLoss(
......@@ -47,7 +47,7 @@ void _SoftmaxFocalLoss(
}
}
/*! SoftmaxFocalLoss <Tx = float32, Ty = float32, Device = CPU> */
/* <Tx = float32, Ty = float32, Device = CPU> */
template <> void SoftmaxFocalLoss<float, float, CPUContext>(
const int outer_dim,
......@@ -72,7 +72,7 @@ template <> void SoftmaxFocalLoss<float, float, CPUContext>(
);
}
/*! SoftmaxFocalLoss <Tx = float32, Ty = int64, Device = CPU> */
/* <Tx = float32, Ty = int64, Device = CPU> */
template <> void SoftmaxFocalLoss<float, int64_t, CPUContext>(
const int outer_dim,
......@@ -97,7 +97,7 @@ template <> void SoftmaxFocalLoss<float, int64_t, CPUContext>(
);
}
/*! SoftmaxFocalLossGrad <Tx = ?, Ty = ?, Device = CPU> */
/* <Tx = ?, Ty = ?, Device = CPU> */
template <typename Tx, typename Ty>
void _SoftmaxFocalLossGrad(
......@@ -154,7 +154,7 @@ void _SoftmaxFocalLossGrad(
}
}
/*! SoftmaxFocalLossGrad <Tx = float32, Ty = float32, Device = CPU> */
/* <Tx = float32, Ty = float32, Device = CPU> */
template<> void SoftmaxFocalLossGrad<float, float, CPUContext>(
const int outer_dim,
......@@ -179,7 +179,7 @@ template<> void SoftmaxFocalLossGrad<float, float, CPUContext>(
);
}
/*! SoftmaxFocalLossGrad <Tx = float32, Ty = int64, Device = CPU> */
/* <Tx = float32, Ty = int64, Device = CPU> */
template<> void SoftmaxFocalLossGrad<float, int64_t, CPUContext>(
const int outer_dim,
......
......@@ -7,7 +7,7 @@ namespace dragon {
namespace kernel {
/*! SoftmaxFocalLoss <Tx = ?, Ty = ?, Device = CUDA> */
/* <Tx = ?, Ty = ?, Device = CUDA> */
template <typename Tx, typename Ty>
__global__ void _SoftmaxFocalLoss(
......@@ -48,7 +48,7 @@ __global__ void _SoftmaxFocalLoss(
}
}
/*! SoftmaxFocalLoss <Tx = float32, Ty = float32, Device = CUDA> */
/* <Tx = float32, Ty = float32, Device = CUDA> */
template <> void SoftmaxFocalLoss<float, float, CUDAContext>(
const int outer_dim,
......@@ -76,7 +76,7 @@ template <> void SoftmaxFocalLoss<float, float, CUDAContext>(
);
}
/*! SoftmaxFocalLoss <Tx = float32, Ty = int64, Device = CUDA> */
/* <Tx = float32, Ty = int64, Device = CUDA> */
template <> void SoftmaxFocalLoss<float, int64_t, CUDAContext>(
const int outer_dim,
......@@ -104,7 +104,7 @@ template <> void SoftmaxFocalLoss<float, int64_t, CUDAContext>(
);
}
/*! SoftmaxFocalLossGrad <Tx = ?, Ty = ?, Device = CUDA> */
/* <Tx = ?, Ty = ?, Device = CUDA> */
template <typename Tx, typename Ty>
__global__ void _SoftmaxFocalLossGrad(
......@@ -160,7 +160,7 @@ __global__ void _SoftmaxFocalLossGrad(
}
}
/*! SoftmaxFocalLossGrad <Tx = float32, Ty = float32, Device = CUDA> */
/* <Tx = float32, Ty = float32, Device = CUDA> */
template<> void SoftmaxFocalLossGrad<float, float, CUDAContext>(
const int outer_dim,
......@@ -188,7 +188,7 @@ template<> void SoftmaxFocalLossGrad<float, float, CUDAContext>(
);
}
/*! SoftmaxFocalLossGrad <Tx = float32, Ty = int64, Device = CUDA> */
/* <Tx = float32, Ty = int64, Device = CUDA> */
template<> void SoftmaxFocalLossGrad<float, int64_t, CUDAContext>(
const int outer_dim,
......
......@@ -4,7 +4,7 @@ namespace dragon {
namespace kernel {
/*! <Tx = ?, Ty = ?, Device = CPU> */
/* <Tx = ?, Ty = ?, Device = CPU> */
template <typename Tx, typename Ty>
void _SparseSoftmaxCrossEntropy(
......@@ -40,7 +40,7 @@ void _SparseSoftmaxCrossEntropy(
}
}
/*! SparseSoftmaxCrossEntropy <Tx = float32, Ty = float32, Device = CPU> */
/* <Tx = float32, Ty = float32, Device = CPU> */
template <> void SparseSoftmaxCrossEntropy<float, float, CPUContext>(
const int outer_dim,
......@@ -59,7 +59,7 @@ template <> void SparseSoftmaxCrossEntropy<float, float, CPUContext>(
);
}
/*! <Tx = float32, Ty = int64, Device = CPU> */
/* <Tx = float32, Ty = int64, Device = CPU> */
template <> void SparseSoftmaxCrossEntropy<float, int64_t, CPUContext>(
const int outer_dim,
......@@ -78,7 +78,7 @@ template <> void SparseSoftmaxCrossEntropy<float, int64_t, CPUContext>(
);
}
/*! <Tx = ?, Ty = ?, Device = CPU> */
/* <Tx = ?, Ty = ?, Device = CPU> */
template <typename Tx, typename Ty>
void _SparseSoftmaxCrossEntropyGrad(
......@@ -112,7 +112,7 @@ void _SparseSoftmaxCrossEntropyGrad(
}
}
/*! <Tx = float32, Ty = float32, Device = CPU> */
/* <Tx = float32, Ty = float32, Device = CPU> */
template<> void SparseSoftmaxCrossEntropyGrad<float, float, CPUContext>(
const int outer_dim,
......@@ -131,7 +131,7 @@ template<> void SparseSoftmaxCrossEntropyGrad<float, float, CPUContext>(
);
}
/*! <Tx = float32, Ty = int64, Device = CPU> */
/* <Tx = float32, Ty = int64, Device = CPU> */
template<> void SparseSoftmaxCrossEntropyGrad<float, int64_t, CPUContext>(
const int outer_dim,
......
......@@ -8,7 +8,7 @@ namespace dragon {
namespace kernel {
/*! <Tx = ?, Ty = ?, Device = CUDA> */
/* <Tx = ?, Ty = ?, Device = CUDA> */
template <typename Tx, typename Ty>
__global__ void _SparseSoftmaxCrossEntropy(
......@@ -44,7 +44,7 @@ __global__ void _SparseSoftmaxCrossEntropy(
}
}
/*! <Tx = float32, Ty = float32, Device = CUDA> */
/* <Tx = float32, Ty = float32, Device = CUDA> */
template <> void SparseSoftmaxCrossEntropy<float, float, CUDAContext>(
const int outer_dim,
......@@ -66,7 +66,7 @@ template <> void SparseSoftmaxCrossEntropy<float, float, CUDAContext>(
);
}
/*! <Tx = float32, Ty = int64, Device = CUDA> */
/* <Tx = float32, Ty = int64, Device = CUDA> */
template <> void SparseSoftmaxCrossEntropy<float, int64_t, CUDAContext>(
const int outer_dim,
......@@ -88,7 +88,7 @@ template <> void SparseSoftmaxCrossEntropy<float, int64_t, CUDAContext>(
);
}
/*! <Tx = ?, Ty = ?, Device = CUDA> */
/* <Tx = ?, Ty = ?, Device = CUDA> */
template <typename Tx, typename Ty>
__global__ void _SparseSoftmaxCrossEntropyGrad(
......@@ -121,7 +121,7 @@ __global__ void _SparseSoftmaxCrossEntropyGrad(
}
}
/*! <Tx = float32, Ty = float32, Device = CUDA> */
/* <Tx = float32, Ty = float32, Device = CUDA> */
template<> void SparseSoftmaxCrossEntropyGrad<float, float, CUDAContext>(
const int outer_dim,
......@@ -143,7 +143,7 @@ template<> void SparseSoftmaxCrossEntropyGrad<float, float, CUDAContext>(
);
}
/*! <Tx = float32, Ty = int64, Device = CUDA> */
/* <Tx = float32, Ty = int64, Device = CUDA> */
template<> void SparseSoftmaxCrossEntropyGrad<float, int64_t, CUDAContext>(
const int outer_dim,
......
......@@ -6,7 +6,7 @@ namespace dragon {
namespace kernel {
/*! Astype <Ta = ?, Tb = ?, Device = CPU> */
/* <Ta = ?, Tb = ?, Device = CPU> */
template <typename Ta, typename Tb>
void _TypeA2B(const int count, const Ta* a, Tb* b) {
......
......@@ -7,7 +7,7 @@ namespace dragon {
namespace kernel {
/*! Astype <Ta = ?, Tb = ?, Device = CUDA> */
/* <Ta = ?, Tb = ?, Device = CUDA> */
template <typename Ta, typename Tb>
__global__ void _TypeA2B(
......@@ -49,7 +49,7 @@ DEFINE_TYPE_A_TO_ALL(int64_t);
DEFINE_TYPE_A_TO_ALL(float);
DEFINE_TYPE_A_TO_ALL(double);
/*! Astype <Ta = float16, Tb = float32, Device = CUDA> */
/* <Ta = float16, Tb = float32, Device = CUDA> */
template<> __global__ void _TypeA2B<half, float>(
const int nthreads,
......@@ -72,7 +72,7 @@ template <> void TypeA2B<float16, float, CUDAContext>(
);
}
/*! Astype <Ta = float32, Tb = float16, Device = CUDA> */
/* <Ta = float32, Tb = float16, Device = CUDA> */
template<> __global__ void _TypeA2B<float, half>(
const int nthreads,
......@@ -95,7 +95,7 @@ template <> void TypeA2B<float, float16, CUDAContext>(
);
}
/*! Astype <Ta = float16, Tb = float16, Device = CUDA> */
/* <Ta = float16, Tb = float16, Device = CUDA> */
template<> __global__ void _TypeA2B<half, half>(
const int nthreads,
......
......@@ -5,7 +5,7 @@ namespace dragon {
namespace kernel {
/*! GradientTwoSum <T = ?, Device = CUDA> */
/* <T = ?, Device = CUDA> */
template <typename T>
void _GradientTwoSum(
......
......@@ -7,7 +7,7 @@ namespace dragon {
namespace kernel {
/*! GradientTwoSum <T = ?, Device = CUDA> */
/* <T = ?, Device = CUDA> */
template <typename T>
__global__ void _GradientTwoSum(
......@@ -20,7 +20,7 @@ __global__ void _GradientTwoSum(
}
}
/*! GradientTwoSum <T = float16, Device = CUDA> */
/* <T = float16, Device = CUDA> */
template <> __global__ void _GradientTwoSum<half>(
const int nthreads,
......
......@@ -5,7 +5,7 @@ namespace dragon {
namespace kernel {
/*! ImageData <Tx = ?, Ty = ?, Device = CPU> */
/* <Tx = ?, Ty = ?, Device = CPU> */
template <typename Tx, typename Ty>
void _ImageDataNCHW(
......@@ -55,7 +55,7 @@ void _ImageDataNHWC(
}
}
/*! ImageData <Tx = float32, Ty = float32, Device = CPU> */
/* <Tx = float32, Ty = float32, Device = CPU> */
template <> void ImageData<float, float, CPUContext>(
const int N,
......@@ -77,7 +77,7 @@ template <> void ImageData<float, float, CPUContext>(
}
}
/*! ImageData <Tx = uint8, Ty = float32, Device = CPU> */
/* <Tx = uint8, Ty = float32, Device = CPU> */
template <> void ImageData<uint8_t, float, CPUContext>(
const int N,
......@@ -99,7 +99,7 @@ template <> void ImageData<uint8_t, float, CPUContext>(
}
}
/*! ImageData <Tx = float32, Ty = float16, Device = CPU> */
/* <Tx = float32, Ty = float16, Device = CPU> */
template <> void ImageData<float, float16, CPUContext>(
const int N,
......@@ -115,7 +115,7 @@ template <> void ImageData<float, float16, CPUContext>(
CPU_FP16_NOT_SUPPORTED;
}
/*! ImageData <Tx = uint8, Ty = float16, Device = CPU> */
/* <Tx = uint8, Ty = float16, Device = CPU> */
template <> void ImageData<uint8_t, float16, CPUContext>(
const int N,
......
......@@ -7,7 +7,7 @@ namespace dragon {
namespace kernel {
/*! ImageData <Tx = ?, Ty = ?, Device = CUDA> */
/* <Tx = ?, Ty = ?, Device = CUDA> */
template <typename Tx, typename Ty>
__global__ void _ImageDataNCHW(
......@@ -60,7 +60,7 @@ __global__ void _ImageDataNHWC(
}
}
/*! ImageData <Tx = float32, Ty = float32, Device = CUDA> */
/* <Tx = float32, Ty = float32, Device = CUDA> */
template <> void ImageData<float, float, CUDAContext>(
const int N,
......@@ -91,7 +91,7 @@ template <> void ImageData<float, float, CUDAContext>(
}
}
/*! ImageData <Tx = uint8, Ty = float32, Device = CUDA> */
/* <Tx = uint8, Ty = float32, Device = CUDA> */
template <> void ImageData<uint8_t, float, CUDAContext>(
const int N,
......@@ -122,7 +122,7 @@ template <> void ImageData<uint8_t, float, CUDAContext>(
}
}
/*! ImageData <Tx = ?, Ty = float16, Device = CUDA> */
/* <Tx = ?, Ty = float16, Device = CUDA> */
template <typename Tx, typename Ty>
__global__ void _ImageDataHalfNCHW(
......@@ -175,7 +175,7 @@ __global__ void _ImageDataHalfNHWC(
}
}
/*! ImageData <Tx = float32, Ty = float16, Device = CUDA> */
/* <Tx = float32, Ty = float16, Device = CUDA> */
template <> void ImageData<float, float16, CUDAContext>(
const int N,
......
......@@ -16,7 +16,7 @@ namespace dragon {
namespace kernel {
/*! BatchNormBackwardTraining <T = ?, Device = CPU> */
/* <T = ?, Device = CPU> */
template <typename Tx, typename Tp, StorageOrder kOrder>
void _BatchNormInternalGrad(
......@@ -68,7 +68,7 @@ void _BatchNormTrainingGrad(
}
}
/*! BatchNormBackwardInference <T = ?, Device = CPU> */
/* <T = ?, Device = CPU> */
template <typename Tx, typename Tp, StorageOrder kOrder>
void _BatchNormWGrad(
......
......@@ -19,7 +19,7 @@ namespace kernel {
#define L(x, i) __ldg(x + i)
/*! BatchNormBackwardTraining <T = ?, Device = CUDA> */
/* <T = ?, Device = CUDA> */
template <typename Tx, typename Tp, StorageOrder kOrder>
__global__ void _BatchNormInternalGrad(
......@@ -107,7 +107,7 @@ __global__ void _BatchNormTrainingGrad(
}
}
/*! BatchNormBackwardInference <T = ?, Device = CUDA> */
/* <T = ?, Device = CUDA> */
template <typename Tx, typename Tp, StorageOrder kOrder>
__global__ void _BatchNormWGrad(
......
......@@ -20,7 +20,7 @@ namespace kernel {
#define L(x, i) __ldg(x + i)
#define LF(x, i) __half2float(__ldg(x + i))
/*! GroupNormForward <T = ?, Device = CUDA> */
/* <T = ?, Device = CUDA> */
template <typename T>
__global__ void _GroupNormFusedParams(
......@@ -157,7 +157,7 @@ template<> __global__ void _GroupNormForwardNHWC<half, float>(
#endif
}
/*! GroupNormBackward <T = ?, Device = CUDA> */
/* <T = ?, Device = CUDA> */
template <typename Tx, typename Tp, StorageOrder kOrder>
__global__ void _GroupNormWGrad(
......
......@@ -5,7 +5,7 @@ namespace dragon {
namespace kernel {
/*! LSTMCell <T = float32, Device = CPU> */
/* <T = float32, Device = CPU> */
template <typename T>
T _s(T x) { return T(1) / (T(1) + exp(-x)); }
......@@ -34,7 +34,7 @@ template <> void LSTMCell<float, CPUContext>(
}
}
/*! LSTMCellGrad <T = float32, Device = CPU> */
/* <T = float32, Device = CPU> */
template <> void LSTMCellGrad<float, CPUContext>(
const int N,
......
......@@ -7,7 +7,7 @@ namespace dragon {
namespace kernel {
/*! LSTMCell <T = float32, Device = CUDA> */
/* <T = float32, Device = CUDA> */
template <typename T>
__global__ void _LSTMCellAct(
......@@ -70,7 +70,7 @@ template <> void LSTMCell<float, CUDAContext>(
);
}
/*! LSTMCellGrad <T = float32, Device = CUDA> */
/* <T = float32, Device = CUDA> */
template <typename T>
__global__ void _LSTMCellGateGrad(
......
......@@ -5,7 +5,7 @@ namespace dragon {
namespace kernel {
/*! BiasAdd <T = float32, Device = CPU> */
/* <T = float32, Device = CPU> */
template<> void BiasAdd<float, CPUContext>(
const int outer_dim,
......
......@@ -7,7 +7,7 @@ namespace dragon {
namespace kernel {
/*! BiasAdd <T = float32, Device = CUDA> */
/* <T = float32, Device = CUDA> */
template <typename T>
__global__ void _BiasAddNCHW(
......
......@@ -4,7 +4,7 @@ namespace dragon {
namespace kernel {
/*! DepthwiseConv2d <T = float32, Device = CPU> */
/* <T = float32, Device = CPU> */
template <typename T>
void _DepthwiseConv2dNCHW(
......
......@@ -8,7 +8,7 @@ namespace dragon {
namespace kernel {
/* < Device = CUDA> */
/* <Device = CUDA> */
__global__ void _DropBlock2dNCHW(
const int nthreads,
......
......@@ -5,7 +5,7 @@ namespace dragon {
namespace kernel {
/*! ROIAlign <T = float32, Device = CPU> */
/* <T = float32, Device = CPU> */
template <typename T>
T _ROIAlignIntp(
......@@ -115,7 +115,7 @@ template<> void ROIAlign<float, CPUContext>(
} // End n
}
/*! ROIAlign <T = float16, Device = CPU> */
/* <T = float16, Device = CPU> */
template<> void ROIAlign<float16, CPUContext>(
const int C,
......@@ -133,7 +133,7 @@ template<> void ROIAlign<float16, CPUContext>(
CPU_FP16_NOT_SUPPORTED;
}
/*! ROIAlignGrad <T = float32, Device = CPU> */
/* <T = float32, Device = CPU> */
template<> void ROIAlignGrad<float, CPUContext>(
const int C,
......
......@@ -7,7 +7,7 @@ namespace dragon {
namespace kernel {
/*! <T = float32, Device = CUDA> */
/* <T = float32, Device = CUDA> */
template <typename T>
__device__ T _ROIAlignIntp(
......@@ -106,10 +106,10 @@ __global__ void _ROIAlign(
for (int iy = 0; iy < grid_h; iy++) {
const T y = roi_hstart + ph * bin_h +
(T)(iy + .5f) * bin_h / (T)(grid_h);
(T)(iy + .5f) * bin_h / (T)grid_h;
for (int ix = 0; ix < grid_w; ix++) {
const T x = roi_wstart + pw * bin_w +
(T)(ix + .5f) * bin_w / (T)(grid_w);
(T)(ix + .5f) * bin_w / (T)grid_w;
intp_val += _ROIAlignIntp(X, H, W, y, x);
}
}
......@@ -143,7 +143,7 @@ template<> void ROIAlign<float, CUDAContext>(
);
}
/*! <T = float32, Device = CUDA> */
/* <T = float32, Device = CUDA> */
template <typename T>
__device__ void _ROIAlignIntpGrad(
......
......@@ -7,10 +7,10 @@ namespace dragon {
namespace kernel {
/*! ROIAlign <T = float16, Device = CUDA> */
/* <T = float16, Device = CUDA> */
__device__ float _ROIAlignInterpolate(
const half* Xdata,
__device__ float _ROIAlignIntp(
const half* X,
const int H,
const int W,
float y,
......@@ -41,12 +41,12 @@ __device__ float _ROIAlignInterpolate(
const float ly = y - y_low;
const float lx = x - x_low;
const float hy = 1. - ly, hx = 1. - lx;
const float hy = 1.f - ly, hx = 1.f - lx;
const float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
const float v1 = __half2float(__ldg(Xdata + (y_low * W + x_low)));
const float v2 = __half2float(__ldg(Xdata + (y_low * W + x_high)));
const float v3 = __half2float(__ldg(Xdata + (y_high * W + x_low)));
const float v4 = __half2float(__ldg(Xdata + (y_high * W + x_high)));
const float v1 = __half2float(__ldg(X + (y_low * W + x_low)));
const float v2 = __half2float(__ldg(X + (y_low * W + x_high)));
const float v3 = __half2float(__ldg(X + (y_high * W + x_low)));
const float v4 = __half2float(__ldg(X + (y_high * W + x_high)));
const float value = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4;
#else
const float value = 0.f;
......@@ -63,58 +63,55 @@ __global__ void _ROIAlignHalf(
const int pool_w,
const int sampling_ratio,
const float spatial_scale,
const half* Xdata,
const half* xdata,
const float* rois,
half* Ydata) {
CUDA_1D_KERNEL_LOOP(y_idx, nthreads) {
half* ydata) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
#if __CUDA_ARCH__ >= 530
int pw = y_idx % pool_w;
int ph = (y_idx / pool_w) % pool_h;
int c = (y_idx / pool_w / pool_h) % C;
int n = y_idx / pool_w / pool_h / C;
int pw = yi % pool_w;
int ph = (yi / pool_w) % pool_h;
int c = (yi / pool_w / pool_h) % C;
int n = yi / pool_w / pool_h / C;
const float* offset_rois = rois + n * 5;
int roi_batch_ind = offset_rois[0];
const float* roi = rois + n * 5;
int batch_ind = roi[0];
if (roi_batch_ind < 0) {
Ydata[y_idx] = __float2half(0.f);
if (batch_ind < 0) {
ydata[yi] = __float2half(0.f);
continue;
}
float roi_start_w = offset_rois[1] * spatial_scale;
float roi_start_h = offset_rois[2] * spatial_scale;
float roi_end_w = offset_rois[3] * spatial_scale;
float roi_end_h = offset_rois[4] * spatial_scale;
float roi_wstart = roi[1] * spatial_scale;
float roi_hstart = roi[2] * spatial_scale;
float roi_wend = roi[3] * spatial_scale;
float roi_hend = roi[4] * spatial_scale;
float roi_width = max(roi_end_w - roi_start_w, 1.f);
float roi_height = max(roi_end_h - roi_start_h, 1.f);
float bin_size_h = (float)roi_height / (float)pool_h;
float bin_size_w = (float)roi_width / (float)pool_w;
float roi_w = max(roi_wend - roi_wstart, 1.f);
float roi_h = max(roi_hend - roi_hstart, 1.f);
float bin_h = roi_h / (float)pool_h;
float bin_w = roi_w / (float)pool_w;
const half* offset_Xdata = Xdata + (roi_batch_ind * C + c) * H * W;
const half* X = xdata + (batch_ind * C + c) * H * W;
int roi_bin_grid_h = (sampling_ratio > 0) ?
sampling_ratio : ceil(roi_height / pool_h);
int roi_bin_grid_w = (sampling_ratio > 0) ?
sampling_ratio : ceil(roi_width / pool_w);
int grid_h = (sampling_ratio > 0) ?
sampling_ratio : ceil(roi_h / pool_h);
int grid_w = (sampling_ratio > 0) ?
sampling_ratio : ceil(roi_w / pool_w);
float output_val = 0.;
const float num_bin_grids = roi_bin_grid_h * roi_bin_grid_w;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
const float y = roi_start_h + ph * bin_size_h +
static_cast<float>(iy + .5f) * bin_size_h /
static_cast<float>(roi_bin_grid_h);
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const float x = roi_start_w + pw * bin_size_w +
static_cast<float>(ix + .5f) * bin_size_w /
static_cast<float>(roi_bin_grid_w);
output_val += _ROIAlignInterpolate(
offset_Xdata, H, W, y, x);
float intp_val = 0.f;
for (int iy = 0; iy < grid_h; iy++) {
const float y = roi_hstart + ph * bin_h +
(float)(iy + .5f) * bin_h / (float)grid_h;
for (int ix = 0; ix < grid_w; ix++) {
const float x = roi_wstart + pw * bin_w +
(float)(ix + .5f) * bin_w / (float)grid_w;
intp_val += _ROIAlignIntp(X, H, W, y, x);
}
}
output_val /= num_bin_grids;
Ydata[y_idx] = __float2half(output_val);
ydata[yi] = __float2half(
intp_val / float(grid_h * grid_w)
);
#endif
}
}
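The index arithmetic at the top of the kernel decodes each flat output index `yi` into `(n, c, ph, pw)` under NCHW layout, and the final write averages the interpolated samples over the `grid_h * grid_w` sampling grid. A host-side C++ sketch of that decoding (the function name is illustrative, not a framework API):

// Decode a flat NCHW pooled-output index exactly as _ROIAlignHalf does.
void DecodePooledIndex(int yi, int C, int pool_h, int pool_w,
                       int* n, int* c, int* ph, int* pw) {
    *pw = yi % pool_w;
    *ph = (yi / pool_w) % pool_h;
    *c  = (yi / pool_w / pool_h) % C;
    *n  = yi / pool_w / pool_h / C;
}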
......@@ -135,11 +132,16 @@ template<> void ROIAlign<float16, CUDAContext>(
auto nthreads = num_rois * C * pool_h * pool_w;
_ROIAlignHalf
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>
(nthreads, C, H, W, pool_h, pool_w,
sampling_ratio, spatial_scale,
reinterpret_cast<const half*>(x), rois,
reinterpret_cast<half*>(y));
0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
pool_h, pool_w,
sampling_ratio,
spatial_scale,
reinterpret_cast<const half*>(x),
rois,
reinterpret_cast<half*>(y)
);
}
} // namespace kernel
......
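The launch above runs one thread per output element, with `nthreads = num_rois * C * pool_h * pool_w`. The CUDA_BLOCKS / CUDA_THREADS macros are framework-internal and not part of this diff; a hedged sketch of the ceil-division they presumably perform:

// Assumed block-count arithmetic: round nthreads up to whole blocks.
inline int CeilDivBlocks(int nthreads, int threads_per_block) {
    return (nthreads + threads_per_block - 1) / threads_per_block;
}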
......@@ -5,7 +5,7 @@ namespace dragon {
namespace kernel {
/*! ROIPool <T = float32, Device = CPU> */
/* <T = float32, Device = CPU> */
template<> void ROIPool<float, CPUContext>(
const int C,
......@@ -83,7 +83,7 @@ template<> void ROIPool<float, CPUContext>(
} // End n
}
/*! ROIPool <T = float16, Device = CPU> */
/* <T = float16, Device = CPU> */
template<> void ROIPool<float16, CPUContext>(
const int C,
......@@ -101,7 +101,7 @@ template<> void ROIPool<float16, CPUContext>(
CPU_FP16_NOT_SUPPORTED;
}
/*! ROIPoolGrad <T = float32, Device = CPU> */
/* <T = float32, Device = CPU> */
template<> void ROIPoolGrad<float, CPUContext>(
const int N,
......
......@@ -52,7 +52,9 @@ DEPLOY_CUDA(Arange);
#endif
OPERATOR_SCHEMA(Arange)
.NumInputs(0).NumOutputs(1);
.NumInputs(0)
/* Y */
.NumOutputs(1);
NO_GRADIENT(Arange);
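The reformatted schema makes explicit that Arange consumes no tensor inputs and produces a single output Y: the values are generated from operator arguments alone. A purely illustrative sketch of that semantics (not the operator's implementation):

#include <vector>

// Evenly spaced values in [start, stop) with a given step.
std::vector<float> ArangeRef(float start, float stop, float step) {
    std::vector<float> y;
    for (float v = start; v < stop; v += step) y.push_back(v);
    return y;
}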
......
......@@ -16,7 +16,7 @@ namespace dragon {
template <class Context> template <typename T>
void IndexSelectOp<Context>::RunImpl() {
auto* x = X(0).template data<T, Context>();
auto* i = X(1).template mutable_data<int64_t, Context>();
auto* i = X(1).template data<int64_t, Context>();
auto* y = Y(0)->template mutable_data<T, Context>();
kernel::IndexSelect(
......@@ -67,12 +67,6 @@ void IndexSelectGradientOp<Context>::RunImpl() {
auto* dy = X(2).template data<T, Context>();
auto* dx = Y(0)->template mutable_data<T, Context>();
math::Set(
X(0).count(),
cast::to<T>(0.f),
dx, ctx()
);
kernel::IndexSelectGrad(
outer_dim_,
inner_dim_,
......
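The explicit math::Set zero-fill before kernel::IndexSelectGrad is dropped here, presumably because the gradient kernel now clears dX itself (its body is not shown in this diff). For orientation, a standalone sketch of index-select gradient semantics along one axis; names and layout assumptions are illustrative:

#include <cstdint>
#include <cstring>
#include <vector>

// dX holds outer_dim x axis_dim x inner_dim elements; dY holds one slice per
// selected index. Unselected slices stay zero; repeated indices accumulate.
void IndexSelectGradRef(int outer_dim, int axis_dim, int inner_dim,
                        const std::vector<int64_t>& indices,
                        const float* dy, float* dx) {
    std::memset(dx, 0, sizeof(float) * outer_dim * axis_dim * inner_dim);
    const int num_idx = (int)indices.size();
    for (int o = 0; o < outer_dim; ++o) {
        for (int j = 0; j < num_idx; ++j) {
            const float* src = dy + (o * num_idx + j) * inner_dim;
            float* dst = dx + (o * axis_dim + indices[j]) * inner_dim;
            for (int k = 0; k < inner_dim; ++k) dst[k] += src[k];
        }
    }
}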
#include "core/workspace.h"
#include "utils/op_kernel.h"
#include "utils/math_utils.h"
#include "utils/math_functions.h"
#include "operators/array/masked_select_op.h"
namespace dragon {
template <class Context> template <typename T>
void MaskedSelectOp<Context>::RunImpl() {
auto* x = X(0).template data<T, Context>();
auto* mask = X(1).template raw_data<Context>();
auto* scratch = ws()->CreateTensor("/share/data");
auto* indices = ws()->CreateTensor(unique_name("indices"));
kernel::MaskedSelect(
X(0).count(),
(const uint8_t*)mask, x,
indices, scratch,
Y(0), ctx()
);
}
template <class Context>
void MaskedSelectOp<Context>::RunOnDevice() {
CHECK_EQ(X(0).count(), X(1).count())
<< "\nSize of mask and input should be equal.";
CHECK(XIsType(X(1), bool) || XIsType(X(1), uint8_t))
<< "\nExcepted bool or uint8 mask.";
DispatchHelper<TensorTypes
<bool, int8_t, uint8_t, int, int64_t,
float16, float, double>
>::Call(this, X(0));
}
template <class Context> template <typename T>
void MaskedSelectGradientOp<Context>::RunImpl() {
auto* dy = X(1).template data<T, Context>();
auto* dx = Y(0)->template mutable_data<T, Context>();
auto* i = ws()
->GetTensor(unique_name("indices"))
->template data<int64_t, Context>();
kernel::MaskedSelectGrad(
X(0).count(),
X(1).count(),
i, dy,
dx, ctx()
);
}
template <class Context>
void MaskedSelectGradientOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0));
DispatchHelper<TensorTypes
<bool, int8_t, uint8_t, int, int64_t,
float16, float, double>
>::Call(this, X(0));
}
DEPLOY_CPU(MaskedSelect);
#ifdef WITH_CUDA
DEPLOY_CUDA(MaskedSelect);
#endif
DEPLOY_CPU(MaskedSelectGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(MaskedSelectGradient);
#endif
OPERATOR_SCHEMA(MaskedSelect)
/* X, M */
.NumInputs(2)
/* Y */
.NumOutputs(1);
OPERATOR_SCHEMA(MaskedSelectGradient)
/* X, dY */
.NumInputs(2)
/* dX */
.NumOutputs(1);
namespace {
class GradientMaker final : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GradientMaker);
vector<OperatorDef> MakeDef() override {
return SingleDef(def.type() + "Gradient", "",
vector<string>({ I(0), GO(0) }),
vector<string>({ GI(0) })
);
}
};
} // namespace
REGISTER_GRADIENT(MaskedSelect, GradientMaker);
} // namespace dragon
\ No newline at end of file
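The forward pass records the flat indices at which the mask is non-zero (the `indices` workspace tensor) so that the gradient pass can scatter dY back into dX. A framework-free sketch of both directions; function names here are illustrative, not the kernels themselves:

#include <cstdint>
#include <cstring>
#include <vector>

// Forward: y = x[mask != 0], also returning the selected flat indices.
std::vector<int64_t> MaskedSelectRef(int count, const uint8_t* mask,
                                     const float* x, std::vector<float>* y) {
    std::vector<int64_t> indices;
    for (int i = 0; i < count; ++i) {
        if (mask[i]) { indices.push_back(i); y->push_back(x[i]); }
    }
    return indices;
}

// Backward: dx is zero except at the recorded indices, which receive dy.
void MaskedSelectGradRef(int count, const std::vector<int64_t>& indices,
                         const float* dy, float* dx) {
    std::memset(dx, 0, sizeof(float) * count);
    for (size_t j = 0; j < indices.size(); ++j) dx[indices[j]] = dy[j];
}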
#include "core/workspace.h"
#include "utils/op_kernel.h"
#include "utils/math_functions.h"
#include "operators/array/non_zero_op.h"
#define TENSOR_FROM_VEC(tensor, vec, T) \
{ \
tensor.Reshape({ (int64_t)vec.size() }); \
auto* data = tensor.template mutable_data<T, CPUContext>(); \
for (int i = 0; i < vec.size(); i++) data[i] = (T)vec[i]; \
}
namespace dragon {
template <class Context> template <typename T>
void NonZeroOp<Context>::RunImpl() {
auto ndim = X(0).ndim();
auto nelements = X(0).count();
auto* x = X(0).template data<T, Context>();
auto* scratch = ws()->CreateTensor("/share/data");
auto* indices = ws()->CreateTensor("/share/buffer/grad:0");
auto* mask = ws()
->CreateTensor("/share/buffer/grad:1")
->Reshape({ nelements })
->template mutable_data<uint8_t, Context>();
kernel::NotZero(nelements, x, (bool*)mask, ctx());
kernel::MaskedSelect(
nelements,
mask, (T*)nullptr,
indices, scratch,
(Tensor*)nullptr, ctx()
);
nelements = indices->count();
auto* y = Y(0)
->Reshape({ nelements, (int64_t)ndim })
->template mutable_data<int64_t, Context>();
kernel::UnravelIndex(
nelements, ndim,
X_dims_.template data<int, Context>(),
indices->template data<int64_t, Context>(),
y, ctx()
);
}
template <class Context>
void NonZeroOp<Context>::RunOnDevice() {
TENSOR_FROM_VEC(X_dims_, X(0).dims(), int);
DispatchHelper<TensorTypes
<bool, int8_t, uint8_t, int, int64_t,
float16, float, double>
>::Call(this, X(0));
}
DEPLOY_CPU(NonZero);
#ifdef WITH_CUDA
DEPLOY_CUDA(NonZero);
#endif
OPERATOR_SCHEMA(NonZero)
/* X */
.NumInputs(1)
/* Y */
.NumOutputs(1);
NO_GRADIENT(NonZero);
} // namespace dragon
\ No newline at end of file
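NonZero builds a byte mask of non-zero elements, selects their flat indices, and finally unravels each flat index into an ndim-dimensional coordinate row of shape [nelements, ndim]. The unravel step is ordinary mixed-radix decoding; a standalone sketch (illustrative, not the kernel):

#include <cstdint>
#include <vector>

// Convert flat row-major indices into [num, ndim] coordinate rows.
void UnravelIndexRef(const std::vector<int64_t>& flat,
                     const std::vector<int>& dims, int64_t* coords) {
    const int ndim = (int)dims.size();
    for (size_t i = 0; i < flat.size(); ++i) {
        int64_t r = flat[i];
        for (int d = ndim - 1; d >= 0; --d) {
            coords[i * ndim + d] = r % dims[d];
            r /= dims[d];
        }
    }
}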
#include "core/workspace.h"
#include "utils/op_kernel.h"
#include "utils/math_utils.h"
#include "utils/math_functions.h"
#include "operators/array/where_op.h"
namespace dragon {
template <class Context> template <typename T>
void WhereOp<Context>::RunImpl() {
const T *a = nullptr, *b = nullptr;
auto* mask = X(2).template raw_data<Context>();
if (X(0).count() < X(1).count()) {
int rows, cols;
Y(0)->ReshapeLike(X(1));
a = ws()
->template data<T, Context>
({ X(1).count() })[0];
b = X(1).template data<T, Context>();
auto* ra = X(0).template data<T, Context>();
if (utils::IsRowwiseBroadcast(
X(0).dims(), X(1).dims(),
&rows, &cols)) {
math::BroadcastSet(
rows, cols, 0, ra,
const_cast<T*>(a), ctx()
);
} else if (utils::IsColwiseBroadcast(
X(0).dims(), X(1).dims(),
&rows, &cols)) {
math::BroadcastSet(
rows, cols, 1, ra,
const_cast<T*>(a), ctx()
);
} else {
LOG(FATAL)
<< "Could not broadcast "
<< X(0).DimString()
<< " to "
<< X(1).DimString();
}
} else if (X(0).count() > X(1).count()) {
int rows, cols;
Y(0)->ReshapeLike(X(0));
b = ws()
->template data<T, Context>
({ X(0).count() })[0];
a = X(0).template data<T, Context>();
auto* rb = X(1).template data<T, Context>();
if (utils::IsRowwiseBroadcast(
X(0).dims(), X(1).dims(),
&rows, &cols)) {
math::BroadcastSet(
rows, cols, 0, rb,
const_cast<T*>(b), ctx()
);
} else if (utils::IsColwiseBroadcast(
X(0).dims(), X(1).dims(),
&rows, &cols)) {
math::BroadcastSet(
rows, cols, 1, rb,
const_cast<T*>(b), ctx()
);
} else {
LOG(FATAL)
<< "Could not broadcast "
<< X(1).DimString()
<< " to "
<< X(0).DimString();
}
} else {
Y(0)->ReshapeLike(X(0));
a = X(0).template data<T, Context>();
b = X(1).template data<T, Context>();
}
CHECK_EQ(Y(0)->count(), X(2).count())
<< "\nSize of mask and input should be equal.";
auto* y = Y(0)->template mutable_data<T, Context>();
kernel::Where(
Y(0)->count(),
(const uint8_t*)mask,
a, b, y, ctx()
);
}
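When A and B differ in size, the smaller operand is materialized to the larger shape before selection: a rowwise broadcast repeats a length-`cols` vector down `rows` rows, while a colwise broadcast repeats a length-`rows` vector across `cols` columns. A framework-free sketch of the assumed math::BroadcastSet semantics (the real routine is not shown in this diff):

// type == 0: rowwise (x holds `cols` values), type == 1: colwise (`rows` values).
void BroadcastSetRef(int rows, int cols, int type, const float* x, float* y) {
    for (int i = 0; i < rows; ++i) {
        for (int j = 0; j < cols; ++j) {
            y[i * cols + j] = (type == 0) ? x[j] : x[i];
        }
    }
}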
template <class Context>
void WhereOp<Context>::RunOnDevice() {
CHECK(XIsType(X(2), bool) || XIsType(X(2), uint8_t))
<< "\nExcepted bool or uint8 mask.";
DispatchHelper<TensorTypes
<bool, int8_t, uint8_t, int, int64_t,
float16, float, double>
>::Call(this, X(0));
}
template <class Context> template <typename T>
void WhereGradientOp<Context>::RunImpl() {
T *da = nullptr, *db = nullptr;
auto* dy = X(-1).template data<T, Context>();
auto* mask = X(2).template raw_data<Context>();
int rows, cols, type;
if (utils::IsRowwiseBroadcast(
X(0).dims(), X(1).dims(),
&rows, &cols)) {
type = 0;
} else if (utils::IsColwiseBroadcast(
X(0).dims(), X(1).dims(),
&rows, &cols)) {
type = 1;
}
vec32_t dims = { rows, cols };
vec32_t axes = { type };
if (X(0).count() < X(1).count()) {
da = ws()
->template data<T, Context>
({ X(1).count() })[0];
db = Y(1)->template mutable_data<T, Context>();
auto* ra = Y(0)->template mutable_data<T, Context>();
kernel::WhereGrad(
X(-1).count(),
(const uint8_t*)mask,
dy, da, db, ctx()
);
kernel::ReduceSum(
2, dims.data(),
1, axes.data(),
1.f, da,
ra, ctx()
);
} else if (X(0).count() > X(1).count()) {
db = ws()
->template data<T, Context>
({ X(0).count() })[0];
da = Y(0)->template mutable_data<T, Context>();
auto* rb = Y(1)->template mutable_data<T, Context>();
kernel::WhereGrad(
X(-1).count(),
(const uint8_t*)mask,
dy, da, db, ctx()
);
kernel::ReduceSum(
2, dims.data(),
1, axes.data(),
1.f, db,
rb, ctx()
);
} else {
da = Y(0)->template mutable_data<T, Context>();
db = Y(1)->template mutable_data<T, Context>();
kernel::WhereGrad(
Y(0)->count(),
(const uint8_t*)mask,
dy, da, db, ctx()
);
}
}
template <class Context>
void WhereGradientOp<Context>::RunOnDevice() {
CHECK_EQ(X(-1).count(), X(2).count())
<< "\nSize of mask and input should be equal.";
Y(0)->ReshapeLike(X(0));
Y(1)->ReshapeLike(X(1));
CHECK(XIsType(X(2), bool) || XIsType(X(2), uint8_t))
<< "\nExcepted bool or uint8 mask.";
DispatchHelper<TensorTypes
<int8_t, uint8_t, int, int64_t,
float16, float, double>
>::Call(this, X(0));
}
DEPLOY_CPU(Where);
#ifdef WITH_CUDA
DEPLOY_CUDA(Where);
#endif
DEPLOY_CPU(WhereGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(WhereGradient);
#endif
OPERATOR_SCHEMA(Where)
/* A, B, M */
.NumInputs(3)
/* Y */
.NumOutputs(1);
OPERATOR_SCHEMA(WhereGradient)
/* A, B, M, dY */
.NumInputs(4)
/* dA, dB */
.NumOutputs(2);
namespace {
class GradientMaker : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GradientMaker);
vector<OperatorDef> MakeDef() override {
return SingleDef(def.type() + "Gradient", "",
vector<string>({ I(0), I(1), I(2), GO(0) }),
vector<string>({ GI(0), GI(1) })
);
}
};
} // namespace
REGISTER_GRADIENT(Where, GradientMaker);
} // namespace dragon
\ No newline at end of file
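Taken together, Where and WhereGradient are elementwise selections driven by the mask: the forward pass picks from A where the mask is non-zero and from B otherwise, and the backward pass routes dY to dA or dB accordingly, followed by a ReduceSum over the broadcast axes when one operand was expanded. A minimal, hedged C++ sketch of the assumed elementwise part (not the kernels themselves):

#include <cstdint>

// Forward: y[i] = mask[i] ? a[i] : b[i]
void WhereRef(int count, const uint8_t* mask,
              const float* a, const float* b, float* y) {
    for (int i = 0; i < count; ++i) y[i] = mask[i] ? a[i] : b[i];
}

// Backward: dY flows to dA where the mask is set, to dB elsewhere.
void WhereGradRef(int count, const uint8_t* mask,
                  const float* dy, float* da, float* db) {
    for (int i = 0; i < count; ++i) {
        da[i] = mask[i] ? dy[i] : 0.f;
        db[i] = mask[i] ? 0.f : dy[i];
    }
}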