Commit 4eab1c68 by Ting PAN

Unify the boolean operators

1 parent d1f714ea
Showing with 2239 additions and 829 deletions
...@@ -64,7 +64,6 @@ List Brief ...@@ -64,7 +64,6 @@ List Brief
`Tensor.__repr__`_ Return the information(name/shape). `Tensor.__repr__`_ Return the information(name/shape).
`Tensor.__getitem__`_ Return the value at the specific indices. `Tensor.__getitem__`_ Return the value at the specific indices.
`Tensor.__setitem__`_ Set the value at the specific indices. `Tensor.__setitem__`_ Set the value at the specific indices.
`Tensor.__call__`_ Return the expressions for displaying.
============================== ============================================================================= ============================== =============================================================================
...@@ -95,7 +94,6 @@ API Reference ...@@ -95,7 +94,6 @@ API Reference
.. automethod:: __repr__ .. automethod:: __repr__
.. automethod:: __getitem__ .. automethod:: __getitem__
.. automethod:: __setitem__ .. automethod:: __setitem__
.. automethod:: __call__
.. _Tensor.Variable: #dragon.core.tensor.Tensor.Variable .. _Tensor.Variable: #dragon.core.tensor.Tensor.Variable
.. _Tensor.Placeholder: #dragon.core.tensor.Tensor.Placeholder .. _Tensor.Placeholder: #dragon.core.tensor.Tensor.Placeholder
......
...@@ -135,7 +135,9 @@ Array ...@@ -135,7 +135,9 @@ Array
=============== ====================================================================== =============== ======================================================================
List Brief List Brief
=============== ====================================================================== =============== ======================================================================
`Where`_ Select elements from either *x* or *y*.
`IndexSelect`_ Select the elements according to the indices along the given axis. `IndexSelect`_ Select the elements according to the indices along the given axis.
`MaskedSelect`_ Select the the elements where *mask* is *1*.
`Reduce`_ Reduce the inputs along the axis in given axes. `Reduce`_ Reduce the inputs along the axis in given axes.
`Sum`_ Compute the sum along the given axis. `Sum`_ Compute the sum along the given axis.
`Mean`_ Compute the mean along the given axis. `Mean`_ Compute the mean along the given axis.
...@@ -149,7 +151,7 @@ List Brief ...@@ -149,7 +151,7 @@ List Brief
`Repeat`_ Repeat the input along the given axis. `Repeat`_ Repeat the input along the given axis.
`Transpose`_ Transpose the input according to the given permutations. `Transpose`_ Transpose the input according to the given permutations.
`Tile`_ Tile the input according to the given multiples. `Tile`_ Tile the input according to the given multiples.
`Pad`_ Pad the input according to the given paddings. `Pad`_ Pad the input according to the given sizes.
`Crop`_ Crop the input according to the given starts and sizes. `Crop`_ Crop the input according to the given starts and sizes.
`OneHot`_ Generate the one-hot representation of inputs. `OneHot`_ Generate the one-hot representation of inputs.
`Flatten`_ Flatten the input along the given axes. `Flatten`_ Flatten the input along the given axes.
...@@ -157,6 +159,7 @@ List Brief ...@@ -157,6 +159,7 @@ List Brief
`Squeeze`_ Remove the dimensions with size 1. `Squeeze`_ Remove the dimensions with size 1.
`ExpandDims`_ Expand the new dimension with size 1 to specific axis. `ExpandDims`_ Expand the new dimension with size 1 to specific axis.
`Shape`_ Get the dynamic shape of a Tensor. `Shape`_ Get the dynamic shape of a Tensor.
`NonZero`_ Return the indices of non-zero elements.
`Arange`_ Return evenly spaced values within a given interval. `Arange`_ Return evenly spaced values within a given interval.
`Multinomial`_ Return indices sampled from the multinomial distribution. `Multinomial`_ Return indices sampled from the multinomial distribution.
=============== ====================================================================== =============== ======================================================================
...@@ -170,6 +173,7 @@ List Brief ...@@ -170,6 +173,7 @@ List Brief
`Assign`_ Assign the *value* to *ref*. `Assign`_ Assign the *value* to *ref*.
`MaskedAssign`_ Assign the *value* to *ref* where mask is *1*. `MaskedAssign`_ Assign the *value* to *ref* where mask is *1*.
`Equal`_ *Equal* Comparing between A and B. `Equal`_ *Equal* Comparing between A and B.
`NotEqual`_ *NotEqual* Comparing between A and B.
`Less`_ *Less* Comparing between A and B. `Less`_ *Less* Comparing between A and B.
`LessEqual`_ *LessEqual* Comparing between A and B. `LessEqual`_ *LessEqual* Comparing between A and B.
`Greater`_ *Greater* Comparing between A and B. `Greater`_ *Greater* Comparing between A and B.
...@@ -284,7 +288,9 @@ List Brief ...@@ -284,7 +288,9 @@ List Brief
.. _InstanceNorm: operators/norm.html#dragon.operators.norm.InstanceNorm .. _InstanceNorm: operators/norm.html#dragon.operators.norm.InstanceNorm
.. _L2Norm: operators/norm.html#dragon.operators.norm.L2Norm .. _L2Norm: operators/norm.html#dragon.operators.norm.L2Norm
.. _Where: operators/array.html#dragon.operators.array.Where
.. _IndexSelect: operators/array.html#dragon.operators.array.IndexSelect .. _IndexSelect: operators/array.html#dragon.operators.array.IndexSelect
.. _MaskedSelect: operators/array.html#dragon.operators.array.MaskedSelect
.. _Crop: operators/array.html#dragon.operators.array.Crop .. _Crop: operators/array.html#dragon.operators.array.Crop
.. _Reduce: operators/array.html#dragon.operators.array.Reduce .. _Reduce: operators/array.html#dragon.operators.array.Reduce
.. _Sum: operators/array.html#dragon.operators.array.Sum .. _Sum: operators/array.html#dragon.operators.array.Sum
...@@ -307,12 +313,14 @@ List Brief ...@@ -307,12 +313,14 @@ List Brief
.. _ExpandDims: operators/array.html#dragon.operators.array.ExpandDims .. _ExpandDims: operators/array.html#dragon.operators.array.ExpandDims
.. _Shape: operators/array.html#dragon.operators.array.Shape .. _Shape: operators/array.html#dragon.operators.array.Shape
.. _Arange: operators/array.html#dragon.operators.array.Arange .. _Arange: operators/array.html#dragon.operators.array.Arange
.. _NonZero: operators/array.html#dragon.operators.array.NonZero
.. _Multinomial: operators/array.html#dragon.operators.array.Multinomial .. _Multinomial: operators/array.html#dragon.operators.array.Multinomial
.. _Copy: operators/control_flow.html#dragon.operators.control_flow.Copy .. _Copy: operators/control_flow.html#dragon.operators.control_flow.Copy
.. _Assign: operators/control_flow.html#dragon.operators.control_flow.Assign .. _Assign: operators/control_flow.html#dragon.operators.control_flow.Assign
.. _MaskedAssign: operators/control_flow.html#dragon.operators.control_flow.MaskedAssign .. _MaskedAssign: operators/control_flow.html#dragon.operators.control_flow.MaskedAssign
.. _Equal: operators/control_flow.html#dragon.operators.control_flow.Equal .. _Equal: operators/control_flow.html#dragon.operators.control_flow.Equal
.. _NotEqual: operators/control_flow.html#dragon.operators.control_flow.NotEqual
.. _Less: operators/control_flow.html#dragon.operators.control_flow.Less .. _Less: operators/control_flow.html#dragon.operators.control_flow.Less
.. _LessEqual: operators/control_flow.html#dragon.operators.control_flow.LessEqual .. _LessEqual: operators/control_flow.html#dragon.operators.control_flow.LessEqual
.. _Greater: operators/control_flow.html#dragon.operators.control_flow.Greater .. _Greater: operators/control_flow.html#dragon.operators.control_flow.Greater
......
...@@ -38,6 +38,7 @@ class GradientMakerBase { ...@@ -38,6 +38,7 @@ class GradientMakerBase {
const vector<string>& g_outputs) const vector<string>& g_outputs)
: def(def), g_outputs_(g_outputs), : def(def), g_outputs_(g_outputs),
g_inputs_(def.input_size()) {} g_inputs_(def.input_size()) {}
virtual ~GradientMakerBase() {} virtual ~GradientMakerBase() {}
virtual bool CopyDeviceOption() const { return true; } virtual bool CopyDeviceOption() const { return true; }
...@@ -45,9 +46,9 @@ class GradientMakerBase { ...@@ -45,9 +46,9 @@ class GradientMakerBase {
virtual bool CopyArguments() const { return true; } virtual bool CopyArguments() const { return true; }
virtual Gradient Make() { virtual Gradient Make() {
vector<OperatorDef> new_defs = MakeDef(); auto new_defs = MakeDef();
if (def.has_uid()) { if (def.has_uid()) {
// Attach the anchor to the name if having UID // Attach the anchor to name if having UID
for (int i = 0; i < new_defs.size(); i++) for (int i = 0; i < new_defs.size(); i++)
new_defs[i].set_name(def.name()); new_defs[i].set_name(def.name());
} else { } else {
...@@ -57,14 +58,14 @@ class GradientMakerBase { ...@@ -57,14 +58,14 @@ class GradientMakerBase {
for (int i = 0; i < new_defs.size(); i++) for (int i = 0; i < new_defs.size(); i++)
new_defs[i].add_arg()->CopyFrom(anchor); new_defs[i].add_arg()->CopyFrom(anchor);
} }
return Gradient(new_defs, g_inputs_, DefaultValues()); return Gradient(new_defs, g_inputs_, defaults());
}; };
virtual vector<OperatorDef> MakeDef() { virtual vector<OperatorDef> MakeDef() {
return vector<OperatorDef>(); return vector<OperatorDef>();
} }
virtual vector<float> DefaultValues() { virtual vector<float> defaults() {
return vector<float>(g_outputs_.size(), 1.f); return vector<float>(g_outputs_.size(), 1.f);
} }
...@@ -135,15 +136,17 @@ class SimpleGradientMaker final : public GradientMakerBase { ...@@ -135,15 +136,17 @@ class SimpleGradientMaker final : public GradientMakerBase {
GRADIENT_MAKER_CTOR(SimpleGradientMaker); GRADIENT_MAKER_CTOR(SimpleGradientMaker);
vector<OperatorDef> MakeDef() override { vector<OperatorDef> MakeDef() override {
vector<string> inputs, outputs; vector<string> inputs, outputs;
for (const auto& input : def.input()) { for (const auto& input : def.input())
inputs.push_back(input); inputs.push_back(input);
} for (int i = 0; i < def.input_size(); ++i)
inputs.push_back(GO(0));
for (int i = 0; i < def.input_size(); i++) {
outputs.push_back(GI(i)); outputs.push_back(GI(i));
} inputs.push_back(GO(0));
return SingleDef(def.type() + return SingleDef(
"Gradient", "", inputs, outputs); def.type() + "Gradient",
"",
inputs,
outputs
);
} }
}; };
...@@ -162,7 +165,8 @@ class InplaceGradientMaker final : public GradientMakerBase { ...@@ -162,7 +165,8 @@ class InplaceGradientMaker final : public GradientMakerBase {
def.type() + "Gradient", /*! OpType */ def.type() + "Gradient", /*! OpType */
"", /*! OpName */ "", /*! OpName */
vector<string>({ O(0), GO(0) }), /*! Inputs */ vector<string>({ O(0), GO(0) }), /*! Inputs */
vector<string>({ GI(0) })); /*! Outputs */ vector<string>({ GI(0) }) /*! Outputs */
);
} }
}; };
......
...@@ -13,9 +13,9 @@ ...@@ -13,9 +13,9 @@
#ifndef DRAGON_CORE_TYPEID_H_ #ifndef DRAGON_CORE_TYPEID_H_
#define DRAGON_CORE_TYPEID_H_ #define DRAGON_CORE_TYPEID_H_
#include <map>
#include <cstdlib> #include <cstdlib>
#include <iostream> #include <iostream>
#include <map>
namespace dragon { namespace dragon {
...@@ -83,7 +83,7 @@ class TypeMeta { ...@@ -83,7 +83,7 @@ class TypeMeta {
template <typename T> template <typename T>
static void Ctor(void* ptr, size_t n) { static void Ctor(void* ptr, size_t n) {
T* typed_ptr = static_cast<T*>(ptr); T* typed_ptr = static_cast<T*>(ptr);
for (unsigned int i = 0; i < n; i++) for (size_t i = 0; i < n; i++)
new(typed_ptr + i) T; new(typed_ptr + i) T;
} }
...@@ -91,14 +91,14 @@ class TypeMeta { ...@@ -91,14 +91,14 @@ class TypeMeta {
static void Copy(const void* src, void* dst, size_t n) { static void Copy(const void* src, void* dst, size_t n) {
const T* typed_src = static_cast<const T*>(src); const T* typed_src = static_cast<const T*>(src);
T* typed_dst = static_cast<T*>(dst); T* typed_dst = static_cast<T*>(dst);
for (unsigned int i = 0; i < n; i++) for (size_t i = 0; i < n; ++i)
typed_dst[i] = typed_src[i]; typed_dst[i] = typed_src[i];
} }
template <typename T> template <typename T>
static void Dtor(void* ptr, size_t n) { static void Dtor(void* ptr, size_t n) {
T* typed_ptr = static_cast<T*>(ptr); T* typed_ptr = static_cast<T*>(ptr);
for (unsigned int i = 0; i < n; i++) for (size_t i = 0; i < n; ++i)
typed_ptr[i].~T(); typed_ptr[i].~T();
} }
...@@ -107,16 +107,23 @@ class TypeMeta { ...@@ -107,16 +107,23 @@ class TypeMeta {
template <typename T> template <typename T>
static typename FundMeta Make() { static typename FundMeta Make() {
return TypeMeta(Id<T>(), Itemsize<T>(), return TypeMeta(
nullptr, nullptr, nullptr); Id<T>(), Itemsize<T>(),
nullptr, nullptr, nullptr
);
} }
template<typename T> template<typename T>
static typename StructMeta Make() { static typename StructMeta Make() {
return TypeMeta(Id<T>(), Itemsize<T>(), return TypeMeta(
Ctor<T>, Copy<T>, Dtor<T>); Id<T>(), Itemsize<T>(),
Ctor<T>, Copy<T>, Dtor<T>
);
} }
#undef FundMeta
#undef StructMeta
private: private:
TypeMeta( TypeMeta(
TypeId id, TypeId id,
......
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_ARRAY_MASKED_SELECT_OP_H_
#define DRAGON_OPERATORS_ARRAY_MASKED_SELECT_OP_H_
#include "core/operator.h"
namespace dragon {
template <class Context>
class MaskedSelectOp final : public Operator<Context> {
public:
MaskedSelectOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T> void RunImpl();
};
template <class Context>
class MaskedSelectGradientOp final
: public Operator<Context> {
public:
MaskedSelectGradientOp(
const OperatorDef& def,
Workspace* ws)
: Operator<Context>(def, ws) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T> void RunImpl();
};
} // namespace dragon
#endif // DRAGON_OPERATORS_ARRAY_MASKED_SELECT_OP_H_
\ No newline at end of file
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_ARRAY_NON_ZERO_OP_H_
#define DRAGON_OPERATORS_ARRAY_NON_ZERO_OP_H_
#include "core/operator.h"
namespace dragon {
template <class Context>
class NonZeroOp final : public Operator<Context> {
public:
NonZeroOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T> void RunImpl();
protected:
Tensor X_dims_;
};
} // namespace dragon
#endif // DRAGON_OPERATORS_ARRAY_NON_ZERO_OP_H_
\ No newline at end of file
...@@ -18,7 +18,7 @@ ...@@ -18,7 +18,7 @@
namespace dragon { namespace dragon {
template <class Context> template <class Context>
class OneHotOp final : public Operator < Context > { class OneHotOp final : public Operator<Context> {
public: public:
OneHotOp(const OperatorDef& def, Workspace* ws) OneHotOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws), : Operator<Context>(def, ws),
......
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_ARRAY_WHERE_OP_H_
#define DRAGON_OPERATORS_ARRAY_WHERE_OP_H_
#include "core/operator.h"
namespace dragon {
template <class Context>
class WhereOp final : public Operator<Context> {
public:
WhereOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T> void RunImpl();
};
template <class Context>
class WhereGradientOp final : public Operator<Context> {
public:
WhereGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T> void RunImpl();
};
} // namespace dragon
#endif // DRAGON_OPERATORS_ARRAY_WHERE_OP_H_
\ No newline at end of file
...@@ -27,7 +27,9 @@ class CompareOp final : public Operator<Context> { ...@@ -27,7 +27,9 @@ class CompareOp final : public Operator<Context> {
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override; void RunOnDevice() override;
template <typename T> void RunImpl();
template <typename T> void EqualRunImpl(); template <typename T> void EqualRunImpl();
template <typename T> void NotEqualRunImpl();
template <typename T> void LessRunImpl(); template <typename T> void LessRunImpl();
template <typename T> void LessEqualRunImpl(); template <typename T> void LessEqualRunImpl();
template <typename T> void GreaterRunImpl(); template <typename T> void GreaterRunImpl();
......
...@@ -4,6 +4,8 @@ ...@@ -4,6 +4,8 @@
#ifdef WITH_CUDA #ifdef WITH_CUDA
#include <cub/block/block_reduce.cuh> #include <cub/block/block_reduce.cuh>
#include <cub/device/device_select.cuh>
#include <cub/iterator/counting_input_iterator.cuh>
#include "utils/cuda_device.h" #include "utils/cuda_device.h"
......
...@@ -17,6 +17,8 @@ ...@@ -17,6 +17,8 @@
namespace dragon { namespace dragon {
class Tensor;
namespace kernel { namespace kernel {
/*! activation.dropout */ /*! activation.dropout */
...@@ -258,37 +260,37 @@ void ClipGrad( ...@@ -258,37 +260,37 @@ void ClipGrad(
template <typename T, class Context> template <typename T, class Context>
void Maximum( void Maximum(
const int count, const int count,
const T* x1, const T* a,
const T* x2, const T* b,
T* y, T* y,
Context* ctx); Context* ctx);
template <typename T, class Context> template <typename T, class Context>
void BroadcastMaximum( void BroadcastMaximum(
const int count, const int count,
const T* x1, const T* a,
const T x2, const T b,
T* y, T* y,
Context* ctx); Context* ctx);
template <typename T, class Context> template <typename T, class Context>
void MaximumGrad( void MaximumGrad(
const int count, const int count,
const T* x1, const T* a,
const T* x2, const T* b,
const T* dy, const T* dy,
T* dx1, T* da,
T* dx2, T* db,
Context* ctx); Context* ctx);
template <typename T, class Context> template <typename T, class Context>
void BroadcastMaximumGrad( void BroadcastMaximumGrad(
const int count, const int count,
const T* x1, const T* a,
const T x2, const T b,
const T* dy, const T* dy,
T* dx1, T* da,
T* dx2, T* db,
Context* ctx); Context* ctx);
/*! arithmetic.minimum */ /*! arithmetic.minimum */
...@@ -296,37 +298,37 @@ void BroadcastMaximumGrad( ...@@ -296,37 +298,37 @@ void BroadcastMaximumGrad(
template <typename T, class Context> template <typename T, class Context>
void Minimum( void Minimum(
const int count, const int count,
const T* x1, const T* a,
const T* x2, const T* b,
T* y, T* y,
Context* ctx); Context* ctx);
template <typename T, class Context> template <typename T, class Context>
void BroadcastMinimum( void BroadcastMinimum(
const int count, const int count,
const T* x1, const T* a,
const T x2, const T b,
T* y, T* y,
Context* ctx); Context* ctx);
template <typename T, class Context> template <typename T, class Context>
void MinimumGrad( void MinimumGrad(
const int count, const int count,
const T* x1, const T* a,
const T* x2, const T* b,
const T* dy, const T* dy,
T* dx1, T* da,
T* dx2, T* db,
Context* ctx); Context* ctx);
template <typename T, class Context> template <typename T, class Context>
void BroadcastMinimumGrad( void BroadcastMinimumGrad(
const int count, const int count,
const T* x1, const T* a,
const T x2, const T b,
const T* dy, const T* dy,
T* dx1, T* da,
T* dx2, T* db,
Context* ctx); Context* ctx);
/*! arithmetic.moments */ /*! arithmetic.moments */
...@@ -437,6 +439,38 @@ void IndexSelectGrad( ...@@ -437,6 +439,38 @@ void IndexSelectGrad(
T* dx, T* dx,
Context* ctx); Context* ctx);
/*! array.masked_select */
template <typename T, class Context>
void MaskedSelect(
const int count,
const uint8_t* mask,
const T* x,
Tensor* indices,
Tensor* scratch,
Tensor* y,
Context* ctx);
template <typename T, class Context>
void MaskedSelectGrad(
const int count,
const int num_indices,
const int64_t* indices,
const T* dy,
T* dx,
Context* ctx);
/*! array.non_zero */
template <class Context>
void UnravelIndex(
const int count,
const int ndims,
const int* dims,
const int64_t* x,
int64_t* y,
Context* ctx);
/*! array.pad */ /*! array.pad */
template <typename T, class Context> template <typename T, class Context>
...@@ -602,6 +636,26 @@ void TransposeGrad( ...@@ -602,6 +636,26 @@ void TransposeGrad(
T* dx, T* dx,
Context* ctx); Context* ctx);
/*! array.where */
template <typename T, class Context>
void Where(
const int count,
const uint8_t* mask,
const T* a,
const T* b,
T* y,
Context* ctx);
template <typename T, class Context>
void WhereGrad(
const int count,
const uint8_t* mask,
const T* dy,
T* da,
T* db,
Context* ctx);
/*! control_flow.assgin */ /*! control_flow.assgin */
template <typename T, class Context> template <typename T, class Context>
...@@ -618,6 +672,13 @@ void Assign( ...@@ -618,6 +672,13 @@ void Assign(
/*! control_flow.compare */ /*! control_flow.compare */
template <typename T, class Context> template <typename T, class Context>
void NotZero(
const int count,
const T* x,
bool* y,
Context* ctx);
template <typename T, class Context>
void Equal( void Equal(
const int count, const int count,
const T* a, const T* a,
...@@ -626,7 +687,7 @@ void Equal( ...@@ -626,7 +687,7 @@ void Equal(
Context* ctx); Context* ctx);
template <typename T, class Context> template <typename T, class Context>
void Less( void NotEqual(
const int count, const int count,
const T* a, const T* a,
const T* b, const T* b,
...@@ -634,7 +695,7 @@ void Less( ...@@ -634,7 +695,7 @@ void Less(
Context* ctx); Context* ctx);
template <typename T, class Context> template <typename T, class Context>
void LessEqual( void Less(
const int count, const int count,
const T* a, const T* a,
const T* b, const T* b,
...@@ -642,7 +703,7 @@ void LessEqual( ...@@ -642,7 +703,7 @@ void LessEqual(
Context* ctx); Context* ctx);
template <typename T, class Context> template <typename T, class Context>
void Greater( void LessEqual(
const int count, const int count,
const T* a, const T* a,
const T* b, const T* b,
...@@ -650,21 +711,19 @@ void Greater( ...@@ -650,21 +711,19 @@ void Greater(
Context* ctx); Context* ctx);
template <typename T, class Context> template <typename T, class Context>
void GreaterEqual( void Greater(
const int count, const int count,
const T* a, const T* a,
const T* b, const T* b,
bool* y, bool* y,
Context* ctx); Context* ctx);
/*! control_flow.masked_assign */
template <typename T, class Context> template <typename T, class Context>
void MaskedAssign( void GreaterEqual(
const int count, const int count,
const uint8_t* mask, const T* a,
const T* x, const T* b,
T* y, bool* y,
Context* ctx); Context* ctx);
/*! loss.l1_loss */ /*! loss.l1_loss */
......
...@@ -69,26 +69,29 @@ class NumpyFetcher : public TensorFetcherBase { ...@@ -69,26 +69,29 @@ class NumpyFetcher : public TensorFetcherBase {
pybind11::object Fetch(const Tensor& tensor) override { pybind11::object Fetch(const Tensor& tensor) override {
CHECK_GT(tensor.count(), 0); CHECK_GT(tensor.count(), 0);
vector<npy_intp> npy_dims; vector<npy_intp> npy_dims;
for (const auto dim : tensor.dims()) npy_dims.push_back(dim); for (auto dim : tensor.dims()) npy_dims.push_back(dim);
int npy_type = TypeMetaToNPY(tensor.meta()); int npy_type = TypeMetaToNPY(tensor.meta());
if (npy_type == -1) { CHECK(npy_type != -1)
LOG(FATAL) << "The data type of Tensor(" + << "\nThe data type of Tensor(" << tensor.name()
tensor.name() + ") is unknown. Have you solved it ?"; << ") is unknown. Have you solved it?";
}
CHECK(tensor.memory()) << "\nIllegal memory access."; CHECK(tensor.memory()) << "\nIllegal memory access.";
// Create a empty array with the same shape // Create a empty array with the same shape
PyObject* array = PyArray_SimpleNew( auto* array = PyArray_SimpleNew(
tensor.ndim(), npy_dims.data(), npy_type); tensor.ndim(), npy_dims.data(), npy_type);
// Copy the tensor data to the numpy array // Copy the tensor data to the numpy array
if (tensor.memory_state() == MixedMemory::STATE_AT_CUDA) { if (tensor.memory_state() == MixedMemory::STATE_AT_CUDA) {
CUDAContext::MemcpyEx<CPUContext, CUDAContext>(tensor.nbytes(), CUDAContext::MemcpyEx<CPUContext, CUDAContext>(
tensor.nbytes(),
PyArray_DATA(reinterpret_cast<PyArrayObject*>(array)), PyArray_DATA(reinterpret_cast<PyArrayObject*>(array)),
tensor.raw_data<CUDAContext>(), tensor.raw_data<CUDAContext>(),
tensor.memory()->device_id()); tensor.memory()->device_id()
);
} else { } else {
CPUContext::Memcpy<CPUContext, CPUContext>(tensor.nbytes(), CPUContext::Memcpy<CPUContext, CPUContext>(
tensor.nbytes(),
PyArray_DATA(reinterpret_cast<PyArrayObject*>(array)), PyArray_DATA(reinterpret_cast<PyArrayObject*>(array)),
tensor.raw_data<CPUContext>()); tensor.raw_data<CPUContext>()
);
} }
return pybind11::reinterpret_steal<pybind11::object>(array); return pybind11::reinterpret_steal<pybind11::object>(array);
} }
...@@ -108,14 +111,14 @@ class NumpyFeeder : public TensorFeederBase { ...@@ -108,14 +111,14 @@ class NumpyFeeder : public TensorFeederBase {
const DeviceOption& option, const DeviceOption& option,
PyArrayObject* original_array, PyArrayObject* original_array,
Tensor* tensor) override { Tensor* tensor) override {
PyArrayObject* array = PyArray_GETCONTIGUOUS(original_array); auto* array = PyArray_GETCONTIGUOUS(original_array);
const TypeMeta& meta = TypeNPYToMeta(PyArray_TYPE(array)); const auto& meta = TypeNPYToMeta(PyArray_TYPE(array));
if (meta.id() == 0) LOG(FATAL) << "Unsupported DType."; if (meta.id() == 0) LOG(FATAL) << "Unsupported DType.";
tensor->SetMeta(meta); tensor->SetMeta(meta);
int ndim = PyArray_NDIM(array); int ndim = PyArray_NDIM(array);
npy_intp* npy_dims = PyArray_DIMS(array); vec64_t dims(ndim);
vector<int64_t> dims; auto* npy_dims = PyArray_DIMS(array);
for (int i = 0; i < ndim; i++) dims.push_back(npy_dims[i]); for (int i = 0; i < ndim; i++) dims[i] = npy_dims[i];
tensor->Reshape(dims); tensor->Reshape(dims);
if (option.device_type() == PROTO_CUDA) { if (option.device_type() == PROTO_CUDA) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
...@@ -123,16 +126,17 @@ class NumpyFeeder : public TensorFeederBase { ...@@ -123,16 +126,17 @@ class NumpyFeeder : public TensorFeederBase {
tensor->nbytes(), tensor->nbytes(),
tensor->raw_mutable_data<CUDAContext>(), tensor->raw_mutable_data<CUDAContext>(),
static_cast<void*>(PyArray_DATA(array)), static_cast<void*>(PyArray_DATA(array)),
option.device_id()); option.device_id()
);
#else #else
LOG(FATAL) << "CUDA was not compiled."; LOG(FATAL) << "CUDA was not compiled.";
#endif #endif
} else { } else {
auto* data = tensor->raw_mutable_data<CPUContext>();
CPUContext::Memcpy<CPUContext, CPUContext>( CPUContext::Memcpy<CPUContext, CPUContext>(
tensor->nbytes(), tensor->nbytes(),
tensor->raw_mutable_data<CPUContext>(), tensor->raw_mutable_data<CPUContext>(),
static_cast<void*>(PyArray_DATA(array))); static_cast<void*>(PyArray_DATA(array))
);
} }
Py_XDECREF(array); Py_XDECREF(array);
} }
......
...@@ -142,40 +142,44 @@ PYBIND11_MODULE(libdragon, m) { ...@@ -142,40 +142,44 @@ PYBIND11_MODULE(libdragon, m) {
DeviceOption dst_ctx, src_ctx; DeviceOption dst_ctx, src_ctx;
dst_ctx.ParseFromString(dev1); dst_ctx.ParseFromString(dev1);
src_ctx.ParseFromString(dev2); src_ctx.ParseFromString(dev2);
Tensor* srcT = self->GetTensor(other); auto* src = self->GetTensor(other);
Tensor* dstT = self->CreateTensor(name); auto* dst = self->CreateTensor(name);
dstT->ReshapeLike(*srcT); const auto& meta = src->meta();
const TypeMeta& meta = srcT->meta(); dst->ReshapeLike(*src);
if (dst_ctx.device_type() == PROTO_CUDA) { if (dst_ctx.device_type() == PROTO_CUDA) {
if (src_ctx.device_type() == PROTO_CUDA) { if (src_ctx.device_type() == PROTO_CUDA) {
// CUDA <- CUDA // CUDA <- CUDA
CUDAContext::MemcpyEx<CUDAContext, CUDAContext>( CUDAContext::MemcpyEx<CUDAContext, CUDAContext>(
srcT->nbytes(), src->nbytes(),
dstT->raw_mutable_data<CUDAContext>(meta), dst->raw_mutable_data<CUDAContext>(meta),
srcT->raw_data<CUDAContext>(), src->raw_data<CUDAContext>(),
src_ctx.device_id()); src_ctx.device_id()
);
} else { } else {
// CUDA <- CPU // CUDA <- CPU
CUDAContext::MemcpyEx<CUDAContext, CPUContext>( CUDAContext::MemcpyEx<CUDAContext, CPUContext>(
srcT->nbytes(), src->nbytes(),
dstT->raw_mutable_data<CUDAContext>(meta), dst->raw_mutable_data<CUDAContext>(meta),
srcT->raw_data<CPUContext>(), src->raw_data<CPUContext>(),
dst_ctx.device_id()); dst_ctx.device_id()
);
} }
} else { } else {
if (src_ctx.device_type() == PROTO_CUDA) { if (src_ctx.device_type() == PROTO_CUDA) {
// CPU <- CUDA // CPU <- CUDA
CUDAContext::MemcpyEx<CPUContext, CUDAContext>( CUDAContext::MemcpyEx<CPUContext, CUDAContext>(
srcT->nbytes(), src->nbytes(),
dstT->raw_mutable_data<CPUContext>(meta), dst->raw_mutable_data<CPUContext>(meta),
srcT->raw_data<CUDAContext>(), src->raw_data<CUDAContext>(),
src_ctx.device_id()); src_ctx.device_id()
);
} else { } else {
// CPU <- CPU // CPU <- CPU
CPUContext::Memcpy<CUDAContext, CUDAContext>( CPUContext::Memcpy<CUDAContext, CUDAContext>(
srcT->nbytes(), src->nbytes(),
dstT->raw_mutable_data<CPUContext>(meta), dst->raw_mutable_data<CPUContext>(meta),
srcT->raw_data<CPUContext>()); src->raw_data<CPUContext>()
);
} }
} }
}) })
...@@ -188,7 +192,7 @@ PYBIND11_MODULE(libdragon, m) { ...@@ -188,7 +192,7 @@ PYBIND11_MODULE(libdragon, m) {
Tensor* tensor = self->GetTensor(name); Tensor* tensor = self->GetTensor(name);
CHECK_GT(tensor->count(), 0); CHECK_GT(tensor->count(), 0);
vector<npy_intp> dims; vector<npy_intp> dims;
for (const auto dim : tensor->dims()) dims.push_back(dim); for (auto dim : tensor->dims()) dims.push_back(dim);
int npy_type = TypeMetaToNPY(tensor->meta()); int npy_type = TypeMetaToNPY(tensor->meta());
if (npy_type == -1) { if (npy_type == -1) {
LOG(FATAL) << "Tensor(" + tensor->name() + ") " LOG(FATAL) << "Tensor(" + tensor->name() + ") "
......
...@@ -35,7 +35,7 @@ class OperatorHelper(object): ...@@ -35,7 +35,7 @@ class OperatorHelper(object):
'Relu', 'PRelu', 'Elu', 'SElu', 'Sigmoid', 'Tanh', 'Softmax', 'Relu', 'PRelu', 'Elu', 'SElu', 'Sigmoid', 'Tanh', 'Softmax',
'Dropout', 'DropPath', 'DropBlock2d', 'Dropout', 'DropPath', 'DropBlock2d',
'Add', 'Sub', 'Mul', 'Div', 'Clip', 'Log', 'Exp', 'Pow', 'Square', 'Sqrt', 'Add', 'Sub', 'Mul', 'Div', 'Clip', 'Log', 'Exp', 'Pow', 'Square', 'Sqrt',
'Accumulate', 'Affine', 'Copy', 'Compare', 'StopGradient', 'MPIBroadcast', 'Accumulate', 'Affine', 'Copy', 'StopGradient', 'MPIBroadcast',
'BatchNorm', 'GroupNorm', 'L2Norm', 'LRN', 'BiasAdd', 'BatchNorm', 'GroupNorm', 'L2Norm', 'LRN', 'BiasAdd',
) )
...@@ -107,6 +107,12 @@ class OperatorHelper(object): ...@@ -107,6 +107,12 @@ class OperatorHelper(object):
len(outputs[0].shape) < len(inputs[1].shape): len(outputs[0].shape) < len(inputs[1].shape):
outputs[0].shape = inputs[1].shape outputs[0].shape = inputs[1].shape
except: except:
try:
outputs[0].shape = inputs[1].shape[:]
if outputs[0].shape != inputs[0].shape and \
len(outputs[0].shape) < len(inputs[0].shape):
outputs[0].shape = inputs[0].shape
except:
pass pass
return outputs return outputs
...@@ -391,19 +397,31 @@ class OperatorHelper(object): ...@@ -391,19 +397,31 @@ class OperatorHelper(object):
############################################### ###############################################
@classmethod @classmethod
def _apply_Where(cls, arguments, inputs, outputs):
return cls._apply_Maximum(arguments, inputs, outputs)
@classmethod
def _apply_IndexSelect(cls, arguments, inputs, outputs): def _apply_IndexSelect(cls, arguments, inputs, outputs):
outputs[0].dtype = inputs[0].dtype outputs[0].dtype = inputs[0].dtype
axis = arguments['axis'] axis = arguments['axis']
try: try:
try: index_shape = inputs[1].shape[:]
except: index_shape = [None]
outputs[0].shape = \ outputs[0].shape = \
inputs[0].shape[:axis] + \ inputs[0].shape[:axis] + \
inputs[1].shape[:] + \ index_shape[:] + \
inputs[0].shape[axis + 1:] inputs[0].shape[axis + 1:]
except: except:
pass pass
return outputs return outputs
@classmethod @classmethod
def _apply_MaskedSelect(cls, arguments, inputs, outputs):
outputs[0].dtype = inputs[0].dtype
outputs[0].shape = [None]
return outputs
@classmethod
def _apply_RandomPick(cls, arguments, inputs, outputs): def _apply_RandomPick(cls, arguments, inputs, outputs):
outputs[0].dtype = inputs[0].dtype outputs[0].dtype = inputs[0].dtype
outputs[1].dtype = 'int32' outputs[1].dtype = 'int32'
...@@ -779,6 +797,25 @@ class OperatorHelper(object): ...@@ -779,6 +797,25 @@ class OperatorHelper(object):
pass pass
return outputs return outputs
@classmethod
def _apply_NonZero(cls, arguments, inputs, outputs):
outputs[0].dtype = 'int64'
try:
outputs[0].shape = [None, len(inputs[0].shape)]
except:
pass
return outputs
###############################################
# #
# Control Flow #
# #
###############################################
@classmethod
def _apply_Compare(cls, arguments, inputs, outputs):
return cls._apply_Maximum(arguments, inputs, outputs)
############################################### ###############################################
# # # #
# Vision # # Vision #
......
...@@ -462,7 +462,7 @@ class Tensor(object): ...@@ -462,7 +462,7 @@ class Tensor(object):
Parameters Parameters
---------- ----------
item : int or slice item : int, slice or Tensor
The indices. The indices.
Returns Returns
...@@ -471,6 +471,11 @@ class Tensor(object): ...@@ -471,6 +471,11 @@ class Tensor(object):
The output tensor. The output tensor.
""" """
if isinstance(item, Tensor):
return self.CreateOperator(
'MaskedSelect', [self, item],
)
else:
starts, sizes = self._process_indices(item) starts, sizes = self._process_indices(item)
output = self.CreateOperator('Crop', self, starts=starts, sizes=sizes) output = self.CreateOperator('Crop', self, starts=starts, sizes=sizes)
if self.shape is not None: if self.shape is not None:
...@@ -774,11 +779,11 @@ class Tensor(object): ...@@ -774,11 +779,11 @@ class Tensor(object):
############################################### ###############################################
def set_value(self, new_value, **kwargs): def set_value(self, new_value, **kwargs):
"""Feed the values to C++ backend. [**Theano Style**] """Feed values to the backend.
Parameters Parameters
---------- ----------
new_value : number, list or numpy.ndarray new_value : array_like
The values to set. The values to set.
Returns Returns
...@@ -795,12 +800,12 @@ class Tensor(object): ...@@ -795,12 +800,12 @@ class Tensor(object):
return self return self
def get_value(self): def get_value(self):
"""Fetch the values from C++ backend. [**Theano Style**] """Copy values from the backend.
Returns Returns
------- -------
numpy.ndarray or number numpy.ndarray
The values of this tensor in the backend. The copied values.
See Also See Also
-------- --------
...@@ -827,7 +832,7 @@ class Tensor(object): ...@@ -827,7 +832,7 @@ class Tensor(object):
return self.CreateOperator('Copy', **arguments) return self.CreateOperator('Copy', **arguments)
def reshape(self, shape, **kwargs): def reshape(self, shape, **kwargs):
"""Reshape the dimensions of input. [**Theano Style**] """Reshape the dimensions of input.
Parameters Parameters
---------- ----------
...@@ -841,8 +846,7 @@ class Tensor(object): ...@@ -841,8 +846,7 @@ class Tensor(object):
""" """
if not isinstance(shape, (tuple, list)): shape = [shape] if not isinstance(shape, (tuple, list)): shape = [shape]
return Tensor.CreateOperator( return self.CreateOperator('Reshape', inputs=self, shape=shape)
'Reshape', inputs=self, shape=shape, **kwargs)
def dimshuffle(self, *args, **kwargs): def dimshuffle(self, *args, **kwargs):
"""Shuffle the dimensions. [**Theano Style**] """Shuffle the dimensions. [**Theano Style**]
......
...@@ -425,7 +425,7 @@ def FeedTensor( ...@@ -425,7 +425,7 @@ def FeedTensor(
---------- ----------
tensor : Tensor or str tensor : Tensor or str
The tensor to feed. The tensor to feed.
array : number, list, tuple, or numpy.ndarray array : array_like
The values to feed. The values to feed.
force_cpu : boolean, optional, default=False force_cpu : boolean, optional, default=False
Whether force to feed to cpu context. Whether force to feed to cpu context.
......
...@@ -16,6 +16,30 @@ from __future__ import print_function ...@@ -16,6 +16,30 @@ from __future__ import print_function
from . import * from . import *
@OpSchema.Inputs(1, 3)
def Where(inputs, **kwargs):
"""Select elements from either ``x`` or ``y``, depending on ``condition``.
Return the indices of *True* elements, if only the ``condition`` is given.
**Type Constraints**: (*bool*, *int8*, *uint8*, *int32*, *int64*, *float16*, *float32*, *float64*)
Parameters
----------
inputs : sequence of Tensor
The ``x``, ``y``, and ``condition``.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
"""
if isinstance(inputs, Tensor) or len(inputs) == 1:
return NonZero(inputs, **kwargs)
return Tensor.CreateOperator('Where', **ParseArgs(locals()))
@OpSchema.Inputs(1) @OpSchema.Inputs(1)
def IndexSelect(inputs, indices, axis=0, **kwargs): def IndexSelect(inputs, indices, axis=0, **kwargs):
"""Select the elements according to the indices along the given axis. """Select the elements according to the indices along the given axis.
...@@ -26,7 +50,7 @@ def IndexSelect(inputs, indices, axis=0, **kwargs): ...@@ -26,7 +50,7 @@ def IndexSelect(inputs, indices, axis=0, **kwargs):
---------- ----------
inputs : Tensor inputs : Tensor
The input tensor. The input tensor.
indices : Tensor indices : sequence or Tensor
The indices to select elements. The indices to select elements.
axis : int, optional axis : int, optional
The axis of indices. The axis of indices.
...@@ -46,6 +70,31 @@ def IndexSelect(inputs, indices, axis=0, **kwargs): ...@@ -46,6 +70,31 @@ def IndexSelect(inputs, indices, axis=0, **kwargs):
@OpSchema.Inputs(1) @OpSchema.Inputs(1)
def MaskedSelect(inputs, mask, **kwargs):
"""Select the the elements where ``mask`` is *1*.
**Type Constraints**: (*bool*, *int8*, *uint8*, *int32*, *int64*, *float16*, *float32*, *float64*)
Parameters
----------
inputs : Tensor
The input tensor.
mask : Tensor
The mask, with the same size as ``inputs``.
Returns
-------
Tensor
The output tensor.
"""
arguments = ParseArgs(locals())
arguments['mask'] = None
arguments['inputs'] = [arguments['inputs'], mask]
return Tensor.CreateOperator('MaskedSelect', **arguments)
@OpSchema.Inputs(1)
@ArgumentHelper.RepeatedDesc('starts') @ArgumentHelper.RepeatedDesc('starts')
@ArgumentHelper.RepeatedDesc('sizes') @ArgumentHelper.RepeatedDesc('sizes')
def Crop( def Crop(
...@@ -748,6 +797,26 @@ def Arange(start, stop=None, step=1, dtype='float32', **kwargs): ...@@ -748,6 +797,26 @@ def Arange(start, stop=None, step=1, dtype='float32', **kwargs):
@OpSchema.Inputs(1) @OpSchema.Inputs(1)
def NonZero(inputs, **kwargs):
"""Return the indices of non-zero elements.
**Type Constraints**: (*bool*, *int8*, *uint8*, *int32*, *int64*, *float16*, *float32*, *float64*)
Parameters
----------
inputs : Tensor
The input tensor.
Returns
-------
Tensor
A *int64* tensor contains the indices.
"""
return Tensor.CreateOperator('NonZero', **ParseArgs(locals()))
@OpSchema.Inputs(1)
def Multinomial(inputs, num_samples=1, eps=0., normalize=False, **kwargs): def Multinomial(inputs, num_samples=1, eps=0., normalize=False, **kwargs):
"""Return a tensor where each row contains ``num_samples``, """Return a tensor where each row contains ``num_samples``,
sampled from the multinomial distribution. sampled from the multinomial distribution.
......
...@@ -125,6 +125,32 @@ def Equal(inputs, to_uint8=False, **kwargs): ...@@ -125,6 +125,32 @@ def Equal(inputs, to_uint8=False, **kwargs):
return Tensor.CreateOperator('Compare', operation='EQ', **arguments) return Tensor.CreateOperator('Compare', operation='EQ', **arguments)
@OpSchema.Inputs(2)
def NotEqual(inputs, to_uint8=False, **kwargs):
"""*NotEqual* comparing between A and B.
Set ``to_uint8`` if you expect the *uint8* results instead of *bool*.
**Type Constraints**: (*bool*, *int8*, *uint8*, *int32*, *int64*, *float16*, *float32*, *float64*)
Parameters
----------
inputs : sequence of Tensor
The inputs, represent A and B respectively.
to_uint8 : bool
*True* to convert to *uint8* results.
Returns
-------
Tensor
The comparing results.
"""
arguments = ParseArgs(locals())
return Tensor.CreateOperator('Compare', operation='NE', **arguments)
@OpSchema.ConvertConstantInputs() @OpSchema.ConvertConstantInputs()
@OpSchema.Inputs(2) @OpSchema.Inputs(2)
def Less(inputs, to_uint8=False, **kwargs): def Less(inputs, to_uint8=False, **kwargs):
......
...@@ -114,7 +114,7 @@ LayerNorm = _norm_ops.LayerNorm ...@@ -114,7 +114,7 @@ LayerNorm = _norm_ops.LayerNorm
InstanceNorm = _norm_ops.InstanceNorm InstanceNorm = _norm_ops.InstanceNorm
L2Norm = _norm_ops.L2Norm L2Norm = _norm_ops.L2Norm
# NDArray # Array
Crop = _array_ops.Crop Crop = _array_ops.Crop
Reduce = _array_ops.Reduce Reduce = _array_ops.Reduce
Sum = _array_ops.Sum Sum = _array_ops.Sum
...@@ -130,7 +130,9 @@ Transpose = _array_ops.Transpose ...@@ -130,7 +130,9 @@ Transpose = _array_ops.Transpose
Repeat = _array_ops.Repeat Repeat = _array_ops.Repeat
Tile = _array_ops.Tile Tile = _array_ops.Tile
Pad = _array_ops.Pad Pad = _array_ops.Pad
Where = _array_ops.Where
IndexSelect = _array_ops.IndexSelect IndexSelect = _array_ops.IndexSelect
MaskedSelect = _array_ops.MaskedSelect
OneHot = _array_ops.OneHot OneHot = _array_ops.OneHot
Flatten = _array_ops.Flatten Flatten = _array_ops.Flatten
Reshape = _array_ops.Reshape Reshape = _array_ops.Reshape
...@@ -139,15 +141,17 @@ Squeeze = _array_ops.Squeeze ...@@ -139,15 +141,17 @@ Squeeze = _array_ops.Squeeze
Shape = _array_ops.Shape Shape = _array_ops.Shape
Arange = _array_ops.Arange Arange = _array_ops.Arange
Multinomial = _array_ops.Multinomial Multinomial = _array_ops.Multinomial
NonZero = _array_ops.NonZero
# Control Flow # Control Flow
Copy = _control_flow_ops.Copy Copy = _control_flow_ops.Copy
Assign = _control_flow_ops.Assign Assign = _control_flow_ops.Assign
MaskedAssign = _control_flow_ops.MaskedAssign MaskedAssign = _control_flow_ops.MaskedAssign
Equal = _control_flow_ops.Equal
Less = _control_flow_ops.Less Less = _control_flow_ops.Less
LessEqual = _control_flow_ops.LessEqual Equal = _control_flow_ops.Equal
Greater = _control_flow_ops.Greater Greater = _control_flow_ops.Greater
LessEqual = _control_flow_ops.LessEqual
NotEqual = _control_flow_ops.NotEqual
GreaterEqual = _control_flow_ops.GreaterEqual GreaterEqual = _control_flow_ops.GreaterEqual
# Misc # Misc
......
...@@ -18,7 +18,7 @@ import dragon.vm.torch.ops.tensor ...@@ -18,7 +18,7 @@ import dragon.vm.torch.ops.tensor
# Import Core Methods # Import Core Methods
from dragon.vm.torch.tensor import * from dragon.vm.torch.tensor import *
from dragon.vm.torch.c_api import Size, from_numpy from dragon.vm.torch.c_api import Size, from_numpy, device
from dragon.vm.torch.serialization import save, load from dragon.vm.torch.serialization import save, load
# Import Subpackages # Import Subpackages
......
...@@ -67,10 +67,11 @@ def from_numpy(data): ...@@ -67,10 +67,11 @@ def from_numpy(data):
""" """
if not isinstance(data, numpy.ndarray): if not isinstance(data, numpy.ndarray):
raise TypeError('The data should be a numpy.ndarray.') raise TypeError('The data should be a numpy.ndarray.')
if str(data.dtype) not in _mapping.TENSOR_TYPE_TO_TORCH_TENSOR: tensor_types = _mapping.TENSOR_TYPE_TO_TORCH_TENSOR
raise ValueError('Unsupported type({}) to torch tensor.'.format(data.dtype)) if str(data.dtype) not in tensor_types:
raise ValueError('Unsupported data type: ' + str(data.dtype))
module = importlib.import_module('dragon.vm.torch.tensor') module = importlib.import_module('dragon.vm.torch.tensor')
return getattr(module, _mapping.TENSOR_TYPE_TO_TORCH_TENSOR[str(data.dtype)])(data) return getattr(module, tensor_types[str(data.dtype)])(data, copy=False)
def from_dragon(tensor, own_storage=False): def from_dragon(tensor, own_storage=False):
......
...@@ -23,8 +23,8 @@ from dragon.vm.torch.ops.modules.control_flow import ( ...@@ -23,8 +23,8 @@ from dragon.vm.torch.ops.modules.control_flow import (
) )
from dragon.vm.torch.ops.modules.arithmetic import ( from dragon.vm.torch.ops.modules.arithmetic import (
Fundamental, Log, Exp, Sqrt, Fundamental, Accumulate,
Accumulate, Log, Exp, Sqrt,
MM, FullyConnected, MM, FullyConnected,
Maximum, Minimum, Clamp, Maximum, Minimum, Clamp,
) )
...@@ -35,9 +35,11 @@ from dragon.vm.torch.ops.modules.init import ( ...@@ -35,9 +35,11 @@ from dragon.vm.torch.ops.modules.init import (
from dragon.vm.torch.ops.modules.array import ( from dragon.vm.torch.ops.modules.array import (
Reshape, Squeeze, UnSqueeze, Permute, Reshape, Squeeze, UnSqueeze, Permute,
Indexing, IndexSelect, Indexing, Repeat, Concat, Stack,
Repeat, Concat, Stack, IndexSelect, MaskedSelect,
Reduce, ArgReduce, OneHot, Multinomial, Reduce, ArgReduce,
NonZero, Where,
OneHot, Multinomial,
) )
from dragon.vm.torch.ops.modules.update import ( from dragon.vm.torch.ops.modules.update import (
...@@ -50,17 +52,16 @@ from dragon.vm.torch.ops.modules.vision import ( ...@@ -50,17 +52,16 @@ from dragon.vm.torch.ops.modules.vision import (
__all__ = [ __all__ = [
'accumulate', 'add', 'sub', 'mul', 'div', 'accumulate',
'add', 'sub', 'mul', 'div',
'maximum', 'minimum', 'clamp', 'maximum', 'minimum', 'clamp',
'log', 'exp', 'sqrt', 'log', 'exp', 'sqrt',
'mm', 'xw_plus_b', 'mm', 'xw_plus_b',
'squeeze', 'unsqueeze', 'squeeze', 'unsqueeze',
'mean', 'sum', 'min', 'max', 'topk', 'mean', 'sum', 'min', 'max', 'topk',
'argmin', 'argmax', 'nonzero', 'where', 'argmin', 'argmax',
'gt', 'lt', 'eq', 'ge', 'le', 'gt', 'lt', 'eq', 'ne', 'ge', 'le',
'cat', 'stack', 'narrow', 'cat', 'stack', 'narrow',
'index_select', 'index_select', 'masked_select',
'one_hot', 'multinomial', 'one_hot', 'multinomial',
'rand', 'randn', 'rand', 'randn',
'ones', 'ones_like', 'ones', 'ones_like',
...@@ -525,6 +526,30 @@ def _assign(output, starts, sizes, input): ...@@ -525,6 +526,30 @@ def _assign(output, starts, sizes, input):
return module.forward(input, output, starts, sizes) return module.forward(input, output, starts, sizes)
def where(condition, x, y):
"""Select elements from either ``x`` or ``y``, depending on ``condition``.
Parameters
----------
condition : dragon.vm.torch.Tensor
The byte condition tensor.
x : dragon.vm.torch.Tensor
The elements for *1*.
y : dragon.vm.torch.Tensor
The elements for *0*.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
"""
dev = MakeDevice(inputs=[condition, x, y])
key = 'Where/{}'.format(dev)
module = get_module(Where, key, dev)
return module.forward(condition, x, y)
def _masked_assign(output, mask, input): def _masked_assign(output, mask, input):
if not isinstance(input, Tensor): if not isinstance(input, Tensor):
if isinstance(input, (tuple, list)): if isinstance(input, (tuple, list)):
...@@ -569,7 +594,7 @@ def squeeze(input, dim=None, out=None): ...@@ -569,7 +594,7 @@ def squeeze(input, dim=None, out=None):
def unsqueeze(input, dim, out=None): def unsqueeze(input, dim, out=None):
"""Returns a tensor with a dimension of size 1 inserted at the specified position. """Return a tensor with a dimension of size 1 inserted at the specified position.
Parameters Parameters
---------- ----------
...@@ -866,6 +891,27 @@ def eq(input, other, out=None): ...@@ -866,6 +891,27 @@ def eq(input, other, out=None):
return _compare(input, other, 'EQ', out) return _compare(input, other, 'EQ', out)
def ne(input, other, out=None):
"""Compute *input* != *other* element-wise.
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
other : dragon.vm.torch.Tensor, number
The other tensor.
out : dragon.vm.torch.Tensor, optional
The optional output tensor.
Returns
-------
dragon.vm.torch.Tensor
The output byte tensor.
"""
return _compare(input, other, 'NE', out)
def cat(seq, dim=0, out=None): def cat(seq, dim=0, out=None):
"""Concatenate the inputs along the given axis. """Concatenate the inputs along the given axis.
...@@ -908,7 +954,7 @@ def stack(seq, dim=0, out=None): ...@@ -908,7 +954,7 @@ def stack(seq, dim=0, out=None):
The output tensor. The output tensor.
""" """
dev = MakeDevice(inputs=seq, outputs=[out] if out else []) dev = MakeDevice(seq, [out] if out else [])
key = 'Stack/{}/dim:{}'.format(dev, dim) key = 'Stack/{}/dim:{}'.format(dev, dim)
module = get_module(Stack, key, dev, axis=dim) module = get_module(Stack, key, dev, axis=dim)
return module.forward(seq, out) return module.forward(seq, out)
...@@ -940,6 +986,30 @@ def index_select(input, dim, index, out=None): ...@@ -940,6 +986,30 @@ def index_select(input, dim, index, out=None):
return module.forward(input, index, out) return module.forward(input, index, out)
def masked_select(input, mask, out=None):
"""Select the input values where mask is *1*.
Parameters
----------
input : dragon.vm.torch.Tensor
The values.
mask : dragon.vm.torch.Tensor
The mask to select values.
out : dragon.vm.torch.Tensor, optional
The optional output tensor.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
"""
dev = MakeDevice([input, mask], [out] if out else [])
key = 'MaskedSelect/{}'.format(dev)
module = get_module(MaskedSelect, key, dev)
return module.forward(input, mask, out)
def narrow(input, dimension, start, length): def narrow(input, dimension, start, length):
"""Return a new tensor that is a narrowed version of input tensor. """Return a new tensor that is a narrowed version of input tensor.
...@@ -965,6 +1035,28 @@ def narrow(input, dimension, start, length): ...@@ -965,6 +1035,28 @@ def narrow(input, dimension, start, length):
return _index(input, starts, sizes) return _index(input, starts, sizes)
def nonzero(input, out=None):
"""Return the indices of non-zero elements.
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
out : dragon.vm.torch.Tensor, optional
The optional output tensor.
Returns
-------
dragon.vm.torch.FloatTensor
The output tensor.
"""
dev = MakeDevice(inputs=[input])
key = 'NonZero/{}'.format(dev)
module = get_module(NonZero, key, dev)
return module.forward(input, out)
def one_hot(input, depth): def one_hot(input, depth):
"""Return a ont hot tensor according to given input. """Return a ont hot tensor according to given input.
......
...@@ -134,6 +134,23 @@ class IndexSelect(BaseModule): ...@@ -134,6 +134,23 @@ class IndexSelect(BaseModule):
return self.run(inputs, outputs) return self.run(inputs, outputs)
class MaskedSelect(BaseModule):
def __init__(self, key, dev, **kwargs):
super(MaskedSelect, self).__init__(key, dev, **kwargs)
self.register_op()
def register_op(self):
self.op_meta = {
'op_type': 'MaskedSelect',
'arguments': {},
}
def forward(self, x, mask, y):
inputs = [x, mask]; self.unify_devices(inputs)
outputs = [y] if y else [self.register_output()]
return self.run(inputs, outputs)
class Reduce(BaseModule): class Reduce(BaseModule):
def __init__(self, key, dev, **kwargs): def __init__(self, key, dev, **kwargs):
super(Reduce, self).__init__(key, dev, **kwargs) super(Reduce, self).__init__(key, dev, **kwargs)
...@@ -326,6 +343,36 @@ class Repeat(BaseModule): ...@@ -326,6 +343,36 @@ class Repeat(BaseModule):
return self.run(inputs, outputs, callback=callback) return self.run(inputs, outputs, callback=callback)
class NonZero(BaseModule):
def __init__(self, key, dev, **kwargs):
super(NonZero, self).__init__(key, dev, **kwargs)
self.register_op()
def register_op(self):
self.op_meta = {
'op_type': 'NonZero',
'arguments': {},
}
def forward(self, x, y):
inputs = [x]; self.unify_devices(inputs)
outputs = [y] if y else [self.register_output()]
with no_grad(): return self.run(inputs, outputs)
class Where(BaseModule):
def __init__(self, key, dev, **kwargs):
super(Where, self).__init__(key, dev, **kwargs)
self.register_op()
def register_op(self):
self.op_meta = {'op_type': 'Where', 'arguments': {}}
def forward(self, condition, x, y):
self.unify_devices([condition, x, y])
return self.run([x, y, condition], [self.register_output()])
class OneHot(BaseModule): class OneHot(BaseModule):
def __init__(self, key, dev, **kwargs): def __init__(self, key, dev, **kwargs):
super(OneHot, self).__init__(key, dev, **kwargs) super(OneHot, self).__init__(key, dev, **kwargs)
......
...@@ -23,11 +23,12 @@ from dragon.vm.torch.ops.builtin import ( ...@@ -23,11 +23,12 @@ from dragon.vm.torch.ops.builtin import (
_fundamental, _rfundamental, _fundamental, _rfundamental,
log, exp, sqrt, clamp, log, exp, sqrt, clamp,
_reshape, squeeze, unsqueeze, _reshape, squeeze, unsqueeze,
_permute, _repeat, narrow, _permute, _repeat, narrow, _index,
_index, index_select,
_assign, _masked_assign, _assign, _masked_assign,
index_select, masked_select,
mean, sum, max, min, mean, sum, max, min,
gt, lt, eq, ge, le, gt, lt, eq, ne, ge, le,
where, nonzero,
) )
...@@ -84,10 +85,14 @@ Tensor.ge = lambda *args, **kwargs: ge(*args, **kwargs) ...@@ -84,10 +85,14 @@ Tensor.ge = lambda *args, **kwargs: ge(*args, **kwargs)
Tensor.lt = lambda *args, **kwargs: lt(*args, **kwargs) Tensor.lt = lambda *args, **kwargs: lt(*args, **kwargs)
Tensor.le = lambda *args, **kwargs: le(*args, **kwargs) Tensor.le = lambda *args, **kwargs: le(*args, **kwargs)
Tensor.eq = lambda *args, **kwargs: eq(*args, **kwargs) Tensor.eq = lambda *args, **kwargs: eq(*args, **kwargs)
Tensor.index_select = lambda *args, **kwargs: index_select(*args, **kwargs) Tensor.ne = lambda *args, **kwargs: ne(*args, **kwargs)
Tensor.nonzero = lambda *args, **kwargs: nonzero(*args, **kwargs)
Tensor.where = lambda self, condition, y: where(condition, self, y)
Tensor.narrow = lambda *args, **kwargs: narrow(*args, **kwargs) Tensor.narrow = lambda *args, **kwargs: narrow(*args, **kwargs)
Tensor._index = lambda *args, **kwargs: _index(*args, **kwargs) Tensor._index = lambda *args, **kwargs: _index(*args, **kwargs)
Tensor._assign = lambda *args, **kwargs: _assign(*args, **kwargs) Tensor._assign = lambda *args, **kwargs: _assign(*args, **kwargs)
Tensor.index_select = lambda *args, **kwargs: index_select(*args, **kwargs)
Tensor.masked_select = lambda *args, **kwargs: masked_select(*args, **kwargs)
Tensor.half = lambda self: _type_to(self, dtype='float16', inplace=False) Tensor.half = lambda self: _type_to(self, dtype='float16', inplace=False)
...@@ -104,5 +109,5 @@ Tensor.int = lambda self: _type_to(self, dtype='int32', inplace=False) ...@@ -104,5 +109,5 @@ Tensor.int = lambda self: _type_to(self, dtype='int32', inplace=False)
Tensor.int_ = lambda self: _type_to(self, dtype='int32', inplace=True) Tensor.int_ = lambda self: _type_to(self, dtype='int32', inplace=True)
Tensor.long = lambda self: _type_to(self, dtype='int64', inplace=False) Tensor.long = lambda self: _type_to(self, dtype='int64', inplace=False)
Tensor.long_ = lambda self: _type_to(self, dtype='int64', inplace=True) Tensor.long_ = lambda self: _type_to(self, dtype='int64', inplace=True)
Tensor.type = lambda self, dtype=None: _type_to(self, dtype=dtype) \ Tensor.type = lambda self, dtype = None: _type_to(self, dtype=dtype) \
if dtype is not None else 'torch.' + self._type2str() if dtype is not None else 'torch.' + self._type2str()
\ No newline at end of file
#include "utils/op_kernel.h" #include "utils/op_kernel.h"
#include "utils/eigen_utils.h"
#include "utils/omp_alternative.h" #include "utils/omp_alternative.h"
namespace dragon { namespace dragon {
...@@ -10,15 +11,12 @@ namespace kernel { ...@@ -10,15 +11,12 @@ namespace kernel {
template <typename T> template <typename T>
void _Maximum( void _Maximum(
const int count, const int count,
const T* x1, const T* a,
const T* x2, const T* b,
T* y) { T* y) {
#ifdef WITH_OMP EigenVectorArrayMap<T>(y, count) = \
#pragma omp parallel for num_threads(OMP_THREADS(count)) ConstEigenVectorArrayMap<T>(a, count).max(
#endif ConstEigenVectorArrayMap<T>(b, count));
for (int i = 0; i < count; ++i) {
y[i] = std::max(x1[i], x2[i]);
}
} }
/* <T = ?, Device = CPU> */ /* <T = ?, Device = CPU> */
...@@ -26,15 +24,11 @@ void _Maximum( ...@@ -26,15 +24,11 @@ void _Maximum(
template <typename T> template <typename T>
void _BroadcastMaximum( void _BroadcastMaximum(
const int count, const int count,
const T* x1, const T* a,
const T x2, const T b,
T* y) { T* y) {
#ifdef WITH_OMP EigenVectorArrayMap<T>(y, count) = \
#pragma omp parallel for num_threads(OMP_THREADS(count)) ConstEigenVectorArrayMap<T>(a, count).max(b);
#endif
for (int i = 0; i < count; ++i) {
y[i] = std::max(x1[i], x2);
}
} }
/* <T = ?, Device = CPU> */ /* <T = ?, Device = CPU> */
...@@ -42,18 +36,19 @@ void _BroadcastMaximum( ...@@ -42,18 +36,19 @@ void _BroadcastMaximum(
template <typename T> template <typename T>
void _MaximumGrad( void _MaximumGrad(
const int count, const int count,
const T* x1, const T* a,
const T* x2, const T* b,
const T* dy, const T* dy,
T* dx1, T* da,
T* dx2) { T* db) {
const T kZero = T(0);
#ifdef WITH_OMP #ifdef WITH_OMP
#pragma omp parallel for num_threads(OMP_THREADS(count)) #pragma omp parallel for num_threads(OMP_THREADS(count))
#endif #endif
for (int i = 0; i < count; ++i) { for (int i = 0; i < count; ++i) {
const bool dy_to_dx1 = x1[i] > x2[i]; const bool dy_to_da = a[i] > b[i];
dx1[i] = dy_to_dx1 ? dy[i] : 0; da[i] = dy_to_da ? dy[i] : kZero;
dx2[i] = dy_to_dx1 ? 0 : dy[i]; db[i] = dy_to_da ? kZero : dy[i];
} }
} }
...@@ -62,16 +57,17 @@ void _MaximumGrad( ...@@ -62,16 +57,17 @@ void _MaximumGrad(
template <typename T> template <typename T>
void _BroadcastMaximumGrad( void _BroadcastMaximumGrad(
const int count, const int count,
const T* x1, const T* a,
const T x2, const T b,
const T* dy, const T* dy,
T* dx1, T* da,
T* dx2) { T* db) {
const T kZero = T(0);
#ifdef WITH_OMP #ifdef WITH_OMP
#pragma omp parallel for num_threads(OMP_THREADS(count)) #pragma omp parallel for num_threads(OMP_THREADS(count))
#endif #endif
for (int i = 0; i < count; ++i) { for (int i = 0; i < count; ++i) {
dx1[i] = (x1[i] > x2) ? dy[i] : 0; da[i] = a[i] > b ? dy[i] : kZero;
} }
} }
...@@ -80,23 +76,23 @@ void _BroadcastMaximumGrad( ...@@ -80,23 +76,23 @@ void _BroadcastMaximumGrad(
#define DEFINE_MAXIMUM_KERNEL_LAUNCHER(name, T, T2) \ #define DEFINE_MAXIMUM_KERNEL_LAUNCHER(name, T, T2) \
template <> void name<T, CPUContext>( \ template <> void name<T, CPUContext>( \
const int count, \ const int count, \
const T* x1, \ const T* a, \
const T2 x2, \ const T2 b, \
T* y, \ T* y, \
CPUContext* ctx) { \ CPUContext* ctx) { \
_##name(count, x1, x2, y); \ _##name(count, a, b, y); \
} }
#define DEFINE_MAXIMUM_GRAD_KERNEL_LAUNCHER(name, T, T2) \ #define DEFINE_MAXIMUM_GRAD_KERNEL_LAUNCHER(name, T, T2) \
template <> void name<T, CPUContext>( \ template <> void name<T, CPUContext>( \
const int count, \ const int count, \
const T* x1, \ const T* a, \
const T2 x2, \ const T2 b, \
const T* dy, \ const T* dy, \
T* dx1, \ T* da, \
T* dx2, \ T* db, \
CPUContext* ctx) { \ CPUContext* ctx) { \
_##name(count, x1, x2, dy, dx1, dx2); \ _##name(count, a, b, dy, da, db); \
} }
DEFINE_MAXIMUM_KERNEL_LAUNCHER(Maximum, int8_t, int8_t*); DEFINE_MAXIMUM_KERNEL_LAUNCHER(Maximum, int8_t, int8_t*);
...@@ -129,8 +125,8 @@ DEFINE_MAXIMUM_GRAD_KERNEL_LAUNCHER(BroadcastMaximumGrad, double, double); ...@@ -129,8 +125,8 @@ DEFINE_MAXIMUM_GRAD_KERNEL_LAUNCHER(BroadcastMaximumGrad, double, double);
template <> void Maximum<float16, CPUContext>( template <> void Maximum<float16, CPUContext>(
const int count, const int count,
const float16* x1, const float16* a,
const float16* x2, const float16* b,
float16* y, float16* y,
CPUContext* ctx) { CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED; CPU_FP16_NOT_SUPPORTED;
...@@ -138,8 +134,8 @@ template <> void Maximum<float16, CPUContext>( ...@@ -138,8 +134,8 @@ template <> void Maximum<float16, CPUContext>(
template <> void BroadcastMaximum<float16, CPUContext>( template <> void BroadcastMaximum<float16, CPUContext>(
const int count, const int count,
const float16* x1, const float16* a,
const float16 x2, const float16 b,
float16* y, float16* y,
CPUContext* ctx) { CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED; CPU_FP16_NOT_SUPPORTED;
...@@ -147,22 +143,22 @@ template <> void BroadcastMaximum<float16, CPUContext>( ...@@ -147,22 +143,22 @@ template <> void BroadcastMaximum<float16, CPUContext>(
template <> void MaximumGrad<float16, CPUContext>( template <> void MaximumGrad<float16, CPUContext>(
const int count, const int count,
const float16* x1, const float16* a,
const float16* x2, const float16* b,
const float16* dy, const float16* dy,
float16* dx1, float16* da,
float16* dx2, float16* db,
CPUContext* ctx) { CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED; CPU_FP16_NOT_SUPPORTED;
} }
template <> void BroadcastMaximumGrad<float16, CPUContext>( template <> void BroadcastMaximumGrad<float16, CPUContext>(
const int count, const int count,
const float16* x1, const float16* a,
const float16 x2, const float16 b,
const float16* dy, const float16* dy,
float16* dx1, float16* da,
float16* dx2, float16* db,
CPUContext* ctx) { CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED; CPU_FP16_NOT_SUPPORTED;
} }
......
...@@ -13,11 +13,11 @@ namespace kernel { ...@@ -13,11 +13,11 @@ namespace kernel {
template <typename T> template <typename T>
__global__ void _Maximum( __global__ void _Maximum(
const int nthreads, const int nthreads,
const T* x1, const T* a,
const T* x2, const T* b,
T* y) { T* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = max(x1[i], x2[i]); y[i] = max(a[i], b[i]);
} }
} }
...@@ -25,12 +25,12 @@ __global__ void _Maximum( ...@@ -25,12 +25,12 @@ __global__ void _Maximum(
template<> __global__ void _Maximum<half>( template<> __global__ void _Maximum<half>(
const int nthreads, const int nthreads,
const half* x1, const half* a,
const half* x2, const half* b,
half* y) { half* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 530
y[i] = __hgt(x1[i], x2[i]) ? x1[i] : x2[i]; y[i] = __hgt(a[i], b[i]) ? a[i] : b[i];
#endif #endif
} }
} }
...@@ -40,11 +40,11 @@ template<> __global__ void _Maximum<half>( ...@@ -40,11 +40,11 @@ template<> __global__ void _Maximum<half>(
template <typename T> template <typename T>
__global__ void _BroadcastMaximum( __global__ void _BroadcastMaximum(
const int nthreads, const int nthreads,
const T* x1, const T* a,
const T x2, const T b,
T* y) { T* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = max(x1[i], x2); y[i] = max(a[i], b);
} }
} }
...@@ -52,12 +52,12 @@ __global__ void _BroadcastMaximum( ...@@ -52,12 +52,12 @@ __global__ void _BroadcastMaximum(
template<> __global__ void _BroadcastMaximum<half>( template<> __global__ void _BroadcastMaximum<half>(
const int nthreads, const int nthreads,
const half* x1, const half* a,
const half x2, const half b,
half* y) { half* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 530
y[i] = __hgt(x1[i], x2) ? x1[i] : x2; y[i] = __hgt(a[i], b) ? a[i] : b;
#endif #endif
} }
} }
...@@ -67,15 +67,16 @@ template<> __global__ void _BroadcastMaximum<half>( ...@@ -67,15 +67,16 @@ template<> __global__ void _BroadcastMaximum<half>(
template <typename T> template <typename T>
__global__ void _MaximumGrad( __global__ void _MaximumGrad(
const int nthreads, const int nthreads,
const T* x1, const T* a,
const T* x2, const T* b,
const T* dy, const T* dy,
T* dx1, T* da,
T* dx2) { T* db) {
const T kZero = T(0);
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
const bool dy_to_dx1 = x1[i] > x2[i]; const bool dy_to_da = a[i] > b[i];
dx1[i] = dy_to_dx1 ? dy[i] : T(0); da[i] = dy_to_da ? dy[i] : kZero;
dx2[i] = dy_to_dx1 ? T(0) : dy[i]; db[i] = dy_to_da ? kZero : dy[i];
} }
} }
...@@ -83,18 +84,19 @@ __global__ void _MaximumGrad( ...@@ -83,18 +84,19 @@ __global__ void _MaximumGrad(
template<> __global__ void _MaximumGrad<half>( template<> __global__ void _MaximumGrad<half>(
const int nthreads, const int nthreads,
const half* x1, const half* a,
const half* x2, const half* b,
const half* dy, const half* dy,
half* dx1, half* da,
half* dx2) { half* db) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 530
const bool dy_to_dx1 = __hgt(x1[i], x2[i]); const half kZero = __float2half(0.f);
dx1[i] = dy_to_dx1 ? dy[i] : __float2half(0.f); CUDA_1D_KERNEL_LOOP(i, nthreads) {
dx2[i] = dy_to_dx1 ? __float2half(0.f) : dy[i]; const bool dy_to_da = __hgt(a[i], b[i]);
#endif da[i] = dy_to_da ? dy[i] : kZero;
db[i] = dy_to_da ? kZero : dy[i];
} }
#endif
} }
/* <T = ?, Device = CUDA> */ /* <T = ?, Device = CUDA> */
...@@ -102,13 +104,14 @@ template<> __global__ void _MaximumGrad<half>( ...@@ -102,13 +104,14 @@ template<> __global__ void _MaximumGrad<half>(
template <typename T> template <typename T>
__global__ void _BroadcastMaximumGrad( __global__ void _BroadcastMaximumGrad(
const int nthreads, const int nthreads,
const T* x1, const T* a,
const T x2, const T b,
const T* dy, const T* dy,
T* dx1, T* da,
T* dx2) { T* db) {
const T kZero = T(0);
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
dx1[i] = (x1[i] > x2) ? dy[i] : T(0); da[i] = (a[i] > b) ? dy[i] : kZero;
} }
} }
...@@ -116,17 +119,17 @@ __global__ void _BroadcastMaximumGrad( ...@@ -116,17 +119,17 @@ __global__ void _BroadcastMaximumGrad(
template<> __global__ void _BroadcastMaximumGrad<half>( template<> __global__ void _BroadcastMaximumGrad<half>(
const int nthreads, const int nthreads,
const half* x1, const half* a,
const half x2, const half b,
const half* dy, const half* dy,
half* dx1, half* da,
half* dx2) { half* db) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 530
dx1[i] = __hgt(x1[i], x2) ? const half kZero = __float2half(0.f);
dy[i] : __float2half(0.f); CUDA_1D_KERNEL_LOOP(i, nthreads) {
#endif da[i] = __hgt(a[i], b) ? dy[i] : kZero;
} }
#endif
} }
/* Kernel Launchers */ /* Kernel Launchers */
...@@ -134,30 +137,30 @@ template<> __global__ void _BroadcastMaximumGrad<half>( ...@@ -134,30 +137,30 @@ template<> __global__ void _BroadcastMaximumGrad<half>(
#define DEFINE_MAXIMUM_KERNEL_LAUNCHER(name, T, T2) \ #define DEFINE_MAXIMUM_KERNEL_LAUNCHER(name, T, T2) \
template <> void name<T, CUDAContext>( \ template <> void name<T, CUDAContext>( \
const int count, \ const int count, \
const T* x1, \ const T* a, \
const T2 x2, \ const T2 b, \
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
_##name \ _##name \
<<< CUDA_BLOCKS(count), CUDA_THREADS, \ <<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \ 0, ctx->cuda_stream() >>>( \
count, x1, x2, y \ count, a, b, y \
); \ ); \
} }
#define DEFINE_MAXIMUM_GRAD_KERNEL_LAUNCHER(name, T, T2) \ #define DEFINE_MAXIMUM_GRAD_KERNEL_LAUNCHER(name, T, T2) \
template <> void name<T, CUDAContext>( \ template <> void name<T, CUDAContext>( \
const int count, \ const int count, \
const T* x1, \ const T* a, \
const T2 x2, \ const T2 b, \
const T* dy, \ const T* dy, \
T* dx1, \ T* da, \
T* dx2, \ T* db, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
_##name \ _##name \
<<< CUDA_BLOCKS(count), CUDA_THREADS, \ <<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \ 0, ctx->cuda_stream() >>>( \
count, x1, x2, dy, dx1, dx2 \ count, a, b, dy, da, db \
); \ ); \
} }
...@@ -191,73 +194,73 @@ DEFINE_MAXIMUM_GRAD_KERNEL_LAUNCHER(BroadcastMaximumGrad, double, double); ...@@ -191,73 +194,73 @@ DEFINE_MAXIMUM_GRAD_KERNEL_LAUNCHER(BroadcastMaximumGrad, double, double);
template <> void Maximum<float16, CUDAContext>( template <> void Maximum<float16, CUDAContext>(
const int count, const int count,
const float16* x1, const float16* a,
const float16* x2, const float16* b,
float16* y, float16* y,
CUDAContext* ctx) { CUDAContext* ctx) {
_Maximum \ _Maximum \
<<< CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>( 0, ctx->cuda_stream() >>>(
count, count,
reinterpret_cast<const half*>(x1), reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(x2), reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y) reinterpret_cast<half*>(y)
); );
} }
template <> void BroadcastMaximum<float16, CUDAContext>( template <> void BroadcastMaximum<float16, CUDAContext>(
const int count, const int count,
const float16* x1, const float16* a,
const float16 x2, const float16 b,
float16* y, float16* y,
CUDAContext* ctx) { CUDAContext* ctx) {
_BroadcastMaximum \ _BroadcastMaximum \
<<< CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>( 0, ctx->cuda_stream() >>>(
count, count,
reinterpret_cast<const half*>(x1), reinterpret_cast<const half*>(a),
cast::to<half>(x2), cast::to<half>(b),
reinterpret_cast<half*>(y) reinterpret_cast<half*>(y)
); );
} }
template <> void MaximumGrad<float16, CUDAContext>( template <> void MaximumGrad<float16, CUDAContext>(
const int count, const int count,
const float16* x1, const float16* a,
const float16* x2, const float16* b,
const float16* dy, const float16* dy,
float16* dx1, float16* da,
float16* dx2, float16* db,
CUDAContext* ctx) { CUDAContext* ctx) {
_MaximumGrad \ _MaximumGrad \
<<< CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>( 0, ctx->cuda_stream() >>>(
count, count,
reinterpret_cast<const half*>(x1), reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(x2), reinterpret_cast<const half*>(b),
reinterpret_cast<const half*>(dy), reinterpret_cast<const half*>(dy),
reinterpret_cast<half*>(dx1), reinterpret_cast<half*>(da),
reinterpret_cast<half*>(dx2) reinterpret_cast<half*>(db)
); );
} }
template <> void BroadcastMaximumGrad<float16, CUDAContext>( template <> void BroadcastMaximumGrad<float16, CUDAContext>(
const int count, const int count,
const float16* x1, const float16* a,
const float16 x2, const float16 b,
const float16* dy, const float16* dy,
float16* dx1, float16* da,
float16* dx2, float16* db,
CUDAContext* ctx) { CUDAContext* ctx) {
_BroadcastMaximumGrad \ _BroadcastMaximumGrad \
<<< CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>( 0, ctx->cuda_stream() >>>(
count, count,
reinterpret_cast<const half*>(x1), reinterpret_cast<const half*>(a),
cast::to<half>(x2), cast::to<half>(b),
reinterpret_cast<const half*>(dy), reinterpret_cast<const half*>(dy),
reinterpret_cast<half*>(dx1), reinterpret_cast<half*>(da),
reinterpret_cast<half*>(dx2) reinterpret_cast<half*>(db)
); );
} }
......
#include "utils/op_kernel.h" #include "utils/op_kernel.h"
#include "utils/eigen_utils.h"
#include "utils/omp_alternative.h" #include "utils/omp_alternative.h"
namespace dragon { namespace dragon {
...@@ -10,15 +11,12 @@ namespace kernel { ...@@ -10,15 +11,12 @@ namespace kernel {
template <typename T> template <typename T>
void _Minimum( void _Minimum(
const int count, const int count,
const T* x1, const T* a,
const T* x2, const T* b,
T* y) { T* y) {
#ifdef WITH_OMP EigenVectorArrayMap<T>(y, count) = \
#pragma omp parallel for num_threads(OMP_THREADS(count)) ConstEigenVectorArrayMap<T>(a, count).min(
#endif ConstEigenVectorArrayMap<T>(b, count));
for (int i = 0; i < count; ++i) {
y[i] = std::min(x1[i], x2[i]);
}
} }
/* <T = ?, Device = CPU> */ /* <T = ?, Device = CPU> */
...@@ -26,15 +24,11 @@ void _Minimum( ...@@ -26,15 +24,11 @@ void _Minimum(
template <typename T> template <typename T>
void _BroadcastMinimum( void _BroadcastMinimum(
const int count, const int count,
const T* x1, const T* a,
const T x2, const T b,
T* y) { T* y) {
#ifdef WITH_OMP EigenVectorArrayMap<T>(y, count) = \
#pragma omp parallel for num_threads(OMP_THREADS(count)) ConstEigenVectorArrayMap<T>(a, count).min(b);
#endif
for (int i = 0; i < count; ++i) {
y[i] = std::min(x1[i], x2);
}
} }
/* <T = float32, Device = CPU> */ /* <T = float32, Device = CPU> */
...@@ -42,18 +36,19 @@ void _BroadcastMinimum( ...@@ -42,18 +36,19 @@ void _BroadcastMinimum(
template <typename T> template <typename T>
void _MinimumGrad( void _MinimumGrad(
const int count, const int count,
const T* x1, const T* a,
const T* x2, const T* b,
const T* dy, const T* dy,
T* dx1, T* da,
T* dx2) { T* db) {
const T kZero = T(0);
#ifdef WITH_OMP #ifdef WITH_OMP
#pragma omp parallel for num_threads(OMP_THREADS(count)) #pragma omp parallel for num_threads(OMP_THREADS(count))
#endif #endif
for (int i = 0; i < count; ++i) { for (int i = 0; i < count; ++i) {
const bool dy_to_dx1 = x1[i] < x2[i]; const bool dy_to_da = a[i] < b[i];
dx1[i] = dy_to_dx1 ? dy[i] : 0; da[i] = dy_to_da ? dy[i] : kZero;
dx2[i] = dy_to_dx1 ? 0 : dy[i]; db[i] = dy_to_da ? kZero : dy[i];
} }
} }
...@@ -62,16 +57,17 @@ void _MinimumGrad( ...@@ -62,16 +57,17 @@ void _MinimumGrad(
template <typename T> template <typename T>
void _BroadcastMinimumGrad( void _BroadcastMinimumGrad(
const int count, const int count,
const T* x1, const T* a,
const T x2, const T b,
const T* dy, const T* dy,
T* dx1, T* da,
T* dx2) { T* db) {
const T kZero = T(0);
#ifdef WITH_OMP #ifdef WITH_OMP
#pragma omp parallel for num_threads(OMP_THREADS(count)) #pragma omp parallel for num_threads(OMP_THREADS(count))
#endif #endif
for (int i = 0; i < count; ++i) { for (int i = 0; i < count; ++i) {
dx1[i] = (x1[i] < x2) ? dy[i] : 0; da[i] = a[i] < b ? dy[i] : kZero;
} }
} }
...@@ -80,23 +76,23 @@ void _BroadcastMinimumGrad( ...@@ -80,23 +76,23 @@ void _BroadcastMinimumGrad(
#define DEFINE_MINIMUM_KERNEL_LAUNCHER(name, T, T2) \ #define DEFINE_MINIMUM_KERNEL_LAUNCHER(name, T, T2) \
template <> void name<T, CPUContext>( \ template <> void name<T, CPUContext>( \
const int count, \ const int count, \
const T* x1, \ const T* a, \
const T2 x2, \ const T2 b, \
T* y, \ T* y, \
CPUContext* ctx) { \ CPUContext* ctx) { \
_##name(count, x1, x2, y); \ _##name(count, a, b, y); \
} }
#define DEFINE_MINIMUM_GRAD_KERNEL_LAUNCHER(name, T, T2) \ #define DEFINE_MINIMUM_GRAD_KERNEL_LAUNCHER(name, T, T2) \
template <> void name<T, CPUContext>( \ template <> void name<T, CPUContext>( \
const int count, \ const int count, \
const T* x1, \ const T* a, \
const T2 x2, \ const T2 b, \
const T* dy, \ const T* dy, \
T* dx1, \ T* da, \
T* dx2, \ T* db, \
CPUContext* ctx) { \ CPUContext* ctx) { \
_##name(count, x1, x2, dy, dx1, dx2); \ _##name(count, a, b, dy, da, db); \
} }
DEFINE_MINIMUM_KERNEL_LAUNCHER(Minimum, int8_t, int8_t*); DEFINE_MINIMUM_KERNEL_LAUNCHER(Minimum, int8_t, int8_t*);
...@@ -129,8 +125,8 @@ DEFINE_MINIMUM_GRAD_KERNEL_LAUNCHER(BroadcastMinimumGrad, double, double); ...@@ -129,8 +125,8 @@ DEFINE_MINIMUM_GRAD_KERNEL_LAUNCHER(BroadcastMinimumGrad, double, double);
template <> void Minimum<float16, CPUContext>( template <> void Minimum<float16, CPUContext>(
const int count, const int count,
const float16* x1, const float16* a,
const float16* x2, const float16* b,
float16* y, float16* y,
CPUContext* ctx) { CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED; CPU_FP16_NOT_SUPPORTED;
...@@ -138,8 +134,8 @@ template <> void Minimum<float16, CPUContext>( ...@@ -138,8 +134,8 @@ template <> void Minimum<float16, CPUContext>(
template <> void BroadcastMinimum<float16, CPUContext>( template <> void BroadcastMinimum<float16, CPUContext>(
const int count, const int count,
const float16* x1, const float16* a,
const float16 x2, const float16 b,
float16* y, float16* y,
CPUContext* ctx) { CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED; CPU_FP16_NOT_SUPPORTED;
...@@ -147,22 +143,22 @@ template <> void BroadcastMinimum<float16, CPUContext>( ...@@ -147,22 +143,22 @@ template <> void BroadcastMinimum<float16, CPUContext>(
template <> void MinimumGrad<float16, CPUContext>( template <> void MinimumGrad<float16, CPUContext>(
const int count, const int count,
const float16* x1, const float16* a,
const float16* x2, const float16* b,
const float16* dy, const float16* dy,
float16* dx1, float16* da,
float16* dx2, float16* db,
CPUContext* ctx) { CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED; CPU_FP16_NOT_SUPPORTED;
} }
template <> void BroadcastMinimumGrad<float16, CPUContext>( template <> void BroadcastMinimumGrad<float16, CPUContext>(
const int count, const int count,
const float16* x1, const float16* a,
const float16 x2, const float16 b,
const float16* dy, const float16* dy,
float16* dx1, float16* da,
float16* dx2, float16* db,
CPUContext* ctx) { CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED; CPU_FP16_NOT_SUPPORTED;
} }
......
...@@ -13,11 +13,11 @@ namespace kernel { ...@@ -13,11 +13,11 @@ namespace kernel {
template <typename T> template <typename T>
__global__ void _Minimum( __global__ void _Minimum(
const int nthreads, const int nthreads,
const T* x1, const T* a,
const T* x2, const T* b,
T* y) { T* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = min(x1[i], x2[i]); y[i] = min(a[i], b[i]);
} }
} }
...@@ -25,12 +25,12 @@ __global__ void _Minimum( ...@@ -25,12 +25,12 @@ __global__ void _Minimum(
template<> __global__ void _Minimum<half>( template<> __global__ void _Minimum<half>(
const int nthreads, const int nthreads,
const half* x1, const half* a,
const half* x2, const half* b,
half* y) { half* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 530
y[i] = __hlt(x1[i], x2[i]) ? x1[i] : x2[i]; y[i] = __hlt(a[i], b[i]) ? a[i] : b[i];
#endif #endif
} }
} }
...@@ -40,11 +40,11 @@ template<> __global__ void _Minimum<half>( ...@@ -40,11 +40,11 @@ template<> __global__ void _Minimum<half>(
template <typename T> template <typename T>
__global__ void _BroadcastMinimum( __global__ void _BroadcastMinimum(
const int nthreads, const int nthreads,
const T* x1, const T* a,
const T x2, const T b,
T* y) { T* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = min(x1[i], x2); y[i] = min(a[i], b);
} }
} }
...@@ -52,12 +52,12 @@ __global__ void _BroadcastMinimum( ...@@ -52,12 +52,12 @@ __global__ void _BroadcastMinimum(
template<> __global__ void _BroadcastMinimum<half>( template<> __global__ void _BroadcastMinimum<half>(
const int nthreads, const int nthreads,
const half* x1, const half* a,
const half x2, const half b,
half* y) { half* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 530
y[i] = __hlt(x1[i], x2) ? x1[i] : x2; y[i] = __hlt(a[i], b) ? a[i] : b;
#endif #endif
} }
} }
...@@ -67,15 +67,16 @@ template<> __global__ void _BroadcastMinimum<half>( ...@@ -67,15 +67,16 @@ template<> __global__ void _BroadcastMinimum<half>(
template <typename T> template <typename T>
__global__ void _MinimumGrad( __global__ void _MinimumGrad(
const int nthreads, const int nthreads,
const T* x1, const T* a,
const T* x2, const T* b,
const T* dy, const T* dy,
T* dx1, T* da,
T* dx2) { T* db) {
const T kZero = T(0);
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
const bool dy_to_dx1 = x1[i] < x2[i]; const bool dy_to_da = a[i] < b[i];
dx1[i] = dy_to_dx1 ? dy[i] : T(0); da[i] = dy_to_da ? dy[i] : kZero;
dx2[i] = dy_to_dx1 ? T(0) : dy[i]; db[i] = dy_to_da ? kZero : dy[i];
} }
} }
...@@ -83,18 +84,19 @@ __global__ void _MinimumGrad( ...@@ -83,18 +84,19 @@ __global__ void _MinimumGrad(
template<> __global__ void _MinimumGrad<half>( template<> __global__ void _MinimumGrad<half>(
const int nthreads, const int nthreads,
const half* x1, const half* a,
const half* x2, const half* b,
const half* dy, const half* dy,
half* dx1, half* da,
half* dx2) { half* db) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 530
const bool dy_to_dx1 = __hlt(x1[i], x2[i]); const half kZero = __float2half(0.f);
dx1[i] = dy_to_dx1 ? dy[i] : __float2half(0.f); CUDA_1D_KERNEL_LOOP(i, nthreads) {
dx2[i] = dy_to_dx1 ? __float2half(0.f) : dy[i]; const bool dy_to_da = __hlt(a[i], b[i]);
#endif da[i] = dy_to_da ? dy[i] : kZero;
db[i] = dy_to_da ? kZero : dy[i];
} }
#endif
} }
/* <T = ?, Device = CUDA> */ /* <T = ?, Device = CUDA> */
...@@ -102,13 +104,14 @@ template<> __global__ void _MinimumGrad<half>( ...@@ -102,13 +104,14 @@ template<> __global__ void _MinimumGrad<half>(
template <typename T> template <typename T>
__global__ void _BroadcastMinimumGrad( __global__ void _BroadcastMinimumGrad(
const int nthreads, const int nthreads,
const T* x1, const T* a,
const T x2, const T b,
const T* dy, const T* dy,
T* dx1, T* da,
T* dx2) { T* db) {
const T kZero = T(0);
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
dx1[i] = (x1[i] < x2) ? dy[i] : T(0); da[i] = (a[i] < b) ? dy[i] : kZero;
} }
} }
...@@ -116,17 +119,17 @@ __global__ void _BroadcastMinimumGrad( ...@@ -116,17 +119,17 @@ __global__ void _BroadcastMinimumGrad(
template<> __global__ void _BroadcastMinimumGrad<half>( template<> __global__ void _BroadcastMinimumGrad<half>(
const int nthreads, const int nthreads,
const half* x1, const half* a,
const half x2, const half b,
const half* dy, const half* dy,
half* dx1, half* da,
half* dx2) { half* db) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 530
dx1[i] = (__hlt(x1[i], x2)) ? const half kZero = __float2half(0.f);
dy[i] : __float2half(0.f); CUDA_1D_KERNEL_LOOP(i, nthreads) {
#endif da[i] = __hlt(a[i], b) ? dy[i] : kZero;
} }
#endif
} }
/* Kernel Launchers */ /* Kernel Launchers */
...@@ -134,30 +137,30 @@ template<> __global__ void _BroadcastMinimumGrad<half>( ...@@ -134,30 +137,30 @@ template<> __global__ void _BroadcastMinimumGrad<half>(
#define DEFINE_MINIMUM_KERNEL_LAUNCHER(name, T, T2) \ #define DEFINE_MINIMUM_KERNEL_LAUNCHER(name, T, T2) \
template <> void name<T, CUDAContext>( \ template <> void name<T, CUDAContext>( \
const int count, \ const int count, \
const T* x1, \ const T* a, \
const T2 x2, \ const T2 b, \
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
_##name \ _##name \
<<< CUDA_BLOCKS(count), CUDA_THREADS, \ <<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \ 0, ctx->cuda_stream() >>>( \
count, x1, x2, y \ count, a, b, y \
); \ ); \
} }
#define DEFINE_MINIMUM_GRAD_KERNEL_LAUNCHER(name, T, T2) \ #define DEFINE_MINIMUM_GRAD_KERNEL_LAUNCHER(name, T, T2) \
template <> void name<T, CUDAContext>( \ template <> void name<T, CUDAContext>( \
const int count, \ const int count, \
const T* x1, \ const T* a, \
const T2 x2, \ const T2 b, \
const T* dy, \ const T* dy, \
T* dx1, \ T* da, \
T* dx2, \ T* db, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
_##name \ _##name \
<<< CUDA_BLOCKS(count), CUDA_THREADS, \ <<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \ 0, ctx->cuda_stream() >>>( \
count, x1, x2, dy, dx1, dx2 \ count, a, b, dy, da, db \
); \ ); \
} }
...@@ -191,73 +194,73 @@ DEFINE_MINIMUM_GRAD_KERNEL_LAUNCHER(BroadcastMinimumGrad, double, double); ...@@ -191,73 +194,73 @@ DEFINE_MINIMUM_GRAD_KERNEL_LAUNCHER(BroadcastMinimumGrad, double, double);
template <> void Minimum<float16, CUDAContext>( template <> void Minimum<float16, CUDAContext>(
const int count, const int count,
const float16* x1, const float16* a,
const float16* x2, const float16* b,
float16* y, float16* y,
CUDAContext* ctx) { CUDAContext* ctx) {
_Minimum \ _Minimum \
<<< CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>( 0, ctx->cuda_stream() >>>(
count, count,
reinterpret_cast<const half*>(x1), reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(x2), reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y) reinterpret_cast<half*>(y)
); );
} }
template <> void BroadcastMinimum<float16, CUDAContext>( template <> void BroadcastMinimum<float16, CUDAContext>(
const int count, const int count,
const float16* x1, const float16* a,
const float16 x2, const float16 b,
float16* y, float16* y,
CUDAContext* ctx) { CUDAContext* ctx) {
_BroadcastMinimum \ _BroadcastMinimum \
<<< CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>( 0, ctx->cuda_stream() >>>(
count, count,
reinterpret_cast<const half*>(x1), reinterpret_cast<const half*>(a),
cast::to<half>(x2), cast::to<half>(b),
reinterpret_cast<half*>(y) reinterpret_cast<half*>(y)
); );
} }
template <> void MinimumGrad<float16, CUDAContext>( template <> void MinimumGrad<float16, CUDAContext>(
const int count, const int count,
const float16* x1, const float16* a,
const float16* x2, const float16* b,
const float16* dy, const float16* dy,
float16* dx1, float16* da,
float16* dx2, float16* db,
CUDAContext* ctx) { CUDAContext* ctx) {
_MinimumGrad \ _MinimumGrad \
<<< CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>( 0, ctx->cuda_stream() >>>(
count, count,
reinterpret_cast<const half*>(x1), reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(x2), reinterpret_cast<const half*>(b),
reinterpret_cast<const half*>(dy), reinterpret_cast<const half*>(dy),
reinterpret_cast<half*>(dx1), reinterpret_cast<half*>(da),
reinterpret_cast<half*>(dx2) reinterpret_cast<half*>(db)
); );
} }
template <> void BroadcastMinimumGrad<float16, CUDAContext>( template <> void BroadcastMinimumGrad<float16, CUDAContext>(
const int count, const int count,
const float16* x1, const float16* a,
const float16 x2, const float16 b,
const float16* dy, const float16* dy,
float16* dx1, float16* da,
float16* dx2, float16* db,
CUDAContext* ctx) { CUDAContext* ctx) {
_BroadcastMinimumGrad \ _BroadcastMinimumGrad \
<<< CUDA_BLOCKS(count), CUDA_THREADS, <<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>( 0, ctx->cuda_stream() >>>(
count, count,
reinterpret_cast<const half*>(x1), reinterpret_cast<const half*>(a),
cast::to<half>(x2), cast::to<half>(b),
reinterpret_cast<const half*>(dy), reinterpret_cast<const half*>(dy),
reinterpret_cast<half*>(dx1), reinterpret_cast<half*>(da),
reinterpret_cast<half*>(dx2) reinterpret_cast<half*>(db)
); );
} }
......
#include "utils/cast.h"
#include "utils/op_kernel.h" #include "utils/op_kernel.h"
#include "utils/math_functions.h" #include "utils/math_functions.h"
#include "utils/omp_alternative.h"
namespace dragon { namespace dragon {
...@@ -49,6 +49,8 @@ void _IndexSelectGrad( ...@@ -49,6 +49,8 @@ void _IndexSelectGrad(
T* dx, T* dx,
CPUContext* ctx) { CPUContext* ctx) {
int64_t x_offset, select_idx; int64_t x_offset, select_idx;
auto nelements = outer_dim * axis_dim * inner_dim;
math::Set(nelements, cast::to<T>(0.f), dx, ctx);
for (int n = 0; n < outer_dim; ++n) { for (int n = 0; n < outer_dim; ++n) {
for (int i = 0; i < num_indices; ++i) { for (int i = 0; i < num_indices; ++i) {
select_idx = indices[i]; select_idx = indices[i];
......
#ifdef WITH_CUDA #ifdef WITH_CUDA
#include "core/context_cuda.h" #include "core/context_cuda.h"
#include "utils/cast.h"
#include "utils/op_kernel.h" #include "utils/op_kernel.h"
#include "utils/cub_device.h" #include "utils/math_functions.h"
namespace dragon { namespace dragon {
...@@ -134,6 +135,8 @@ template <> __global__ void _IndexSelectGrad<half>( ...@@ -134,6 +135,8 @@ template <> __global__ void _IndexSelectGrad<half>(
T* dx, \ T* dx, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
auto nthreads = outer_dim * inner_dim; \ auto nthreads = outer_dim * inner_dim; \
auto nelements = outer_dim * axis_dim * inner_dim; \
math::Set(nelements, cast::to<T>(0.f), dx, ctx); \
_IndexSelectGrad \ _IndexSelectGrad \
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \ <<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \ 0, ctx->cuda_stream() >>>( \
...@@ -169,6 +172,8 @@ template <> void IndexSelectGrad<float16, CUDAContext>( ...@@ -169,6 +172,8 @@ template <> void IndexSelectGrad<float16, CUDAContext>(
float16* dx, float16* dx,
CUDAContext* ctx) { CUDAContext* ctx) {
auto nthreads = outer_dim * inner_dim; auto nthreads = outer_dim * inner_dim;
auto nelements = outer_dim * axis_dim * inner_dim;
math::Set(nelements, cast::to<float16>(0.f), dx, ctx);
_IndexSelectGrad _IndexSelectGrad
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>( 0, ctx->cuda_stream() >>>(
......
#include "core/tensor.h"
#include "utils/cast.h"
#include "utils/op_kernel.h"
#include "utils/math_functions.h"
namespace dragon {
namespace kernel {
/* <T = ?, Device = CPU> */
template <typename T>
void _MaskedSelectGrad(
const int num_indices,
const int64_t* indices,
const T* dy,
T* dx) {
for (int i = 0; i < num_indices; ++i) {
dx[indices[i]] = dy[i];
}
}
/* Kernel Launchers */
#define DEFINE_MASKED_KERNEL_LAUNCHER(T) \
template <> void MaskedSelect<T, CPUContext>( \
const int count, \
const uint8_t* mask, \
const T* x, \
Tensor* indices, \
Tensor* scratch, \
Tensor* y, \
CPUContext* ctx) { \
int64_t nelements = 0; \
int64_t n, last = -1, y_ofs = 0; \
for (int i = 0; i < count; ++i) \
if (mask[i]) ++nelements; \
auto* value = y == nullptr ? nullptr : y \
->Reshape({ nelements }) \
->mutable_data<T, CPUContext>(); \
auto* index = indices \
->Reshape({ nelements }) \
->mutable_data<int64_t, CPUContext>(); \
for (int64_t i = 0;; ++i) { \
if (last != -1 && ((i >= count) || !mask[i])) { \
n = i - last; \
if (value != nullptr) { \
auto* src = x + last; \
auto* dst = value + y_ofs; \
math::Copy(n, src, dst, ctx); \
} \
y_ofs += n; last = -1; \
} \
if (i >= count) break; \
if (mask[i]) { \
*(index++) = i; \
if (last == -1) last = i; \
} \
} \
}
#define DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(T) \
template <> void MaskedSelectGrad<T, CPUContext>( \
const int count, \
const int num_indices, \
const int64_t* indices, \
const T* dy, \
T* dx, \
CPUContext* ctx) { \
math::Set(count, cast::to<T>(0.f), dx, ctx); \
_MaskedSelectGrad(num_indices, indices, dy, dx); \
}
DEFINE_MASKED_KERNEL_LAUNCHER(bool);
DEFINE_MASKED_KERNEL_LAUNCHER(int8_t);
DEFINE_MASKED_KERNEL_LAUNCHER(uint8_t);
DEFINE_MASKED_KERNEL_LAUNCHER(int);
DEFINE_MASKED_KERNEL_LAUNCHER(int64_t);
DEFINE_MASKED_KERNEL_LAUNCHER(float16);
DEFINE_MASKED_KERNEL_LAUNCHER(float);
DEFINE_MASKED_KERNEL_LAUNCHER(double);
DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(bool);
DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(int8_t);
DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(uint8_t);
DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(int);
DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(int64_t);
DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(float16);
DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(float);
DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_MASKED_KERNEL_LAUNCHER
#undef DEFINE_MASKED_GRAD_KERNEL_LAUNCHER
} // namespace kernel
} // namepsace dragon
\ No newline at end of file
#ifdef WITH_CUDA
#include "core/tensor.h"
#include "core/context_cuda.h"
#include "utils/cast.h"
#include "utils/op_kernel.h"
#include "utils/cub_device.h"
#include "utils/math_functions.h"
namespace dragon {
namespace kernel {
/* <T = ?, Device = CUDA> */
template <typename T>
__global__ void _MaskedSelectByIndex(
const int nthreads,
const int64_t* indices,
const T* x,
T* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = x[indices[i]];
}
}
template <typename T>
__global__ void _MaskedSelectGrad(
const int nthreads,
const int64_t* indices,
const T* dy,
T* dx) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
dx[indices[i]] = dy[i];
}
}
/* Kernel Launchers */
#define DEFINE_MASKED_KERNEL_LAUNCHER(T) \
template <> void MaskedSelect<T, CUDAContext>( \
const int count, \
const uint8_t* mask, \
const T* x, \
Tensor* indices, \
Tensor* scratch, \
Tensor* y, \
CUDAContext* ctx) { \
auto* i = indices \
->Reshape({ count + 1 }) \
->mutable_data<int64_t, CUDAContext>(); \
auto* n = (int*)(i + count); \
size_t nbytes = 0; int nelements; \
cub::CountingInputIterator<int> itr(0); \
cub::DeviceSelect::Flagged( \
nullptr, nbytes, \
itr, mask, i, n, count, \
ctx->cuda_stream() \
); \
auto* storage = scratch \
->Reshape({ (int64_t)nbytes }) \
->mutable_data<uint8_t, CUDAContext>(); \
cub::DeviceSelect::Flagged( \
storage, nbytes, \
itr, mask, i, n, count, \
ctx->cuda_stream() \
); \
ctx->FinishDeviceCompution(); \
ctx->Memcpy<CPUContext, CUDAContext>( \
sizeof(int), &nelements, n); \
indices->Reshape({ nelements }); \
if (y == nullptr) return; \
auto* value = y \
->Reshape({ nelements }) \
->mutable_data<T, CUDAContext>(); \
_MaskedSelectByIndex \
<<< CUDA_BLOCKS(nelements), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
nelements, i, x, value \
); \
}
#define DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(T) \
template <> void MaskedSelectGrad<T, CUDAContext>( \
const int count, \
const int num_indices, \
const int64_t* indices, \
const T* dy, \
T* dx, \
CUDAContext* ctx) { \
math::Set(count, cast::to<T>(0.f), dx, ctx); \
_MaskedSelectGrad \
<<< CUDA_BLOCKS(num_indices), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
num_indices, indices, dy, dx \
); \
}
DEFINE_MASKED_KERNEL_LAUNCHER(bool);
DEFINE_MASKED_KERNEL_LAUNCHER(int8_t);
DEFINE_MASKED_KERNEL_LAUNCHER(uint8_t);
DEFINE_MASKED_KERNEL_LAUNCHER(int);
DEFINE_MASKED_KERNEL_LAUNCHER(int64_t);
DEFINE_MASKED_KERNEL_LAUNCHER(float16);
DEFINE_MASKED_KERNEL_LAUNCHER(float);
DEFINE_MASKED_KERNEL_LAUNCHER(double);
DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(bool);
DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(int8_t);
DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(uint8_t);
DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(int);
DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(int64_t);
DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(float16);
DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(float);
DEFINE_MASKED_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_MASKED_KERNEL_LAUNCHER
#undef DEFINE_MASKED_GRAD_KERNEL_LAUNCHER
} // namespace kernel
} // namepsace dragon
#endif // WITH_CUDA
\ No newline at end of file
#include "utils/op_kernel.h"
namespace dragon {
namespace kernel {
#define FIXED_DIVISOR_DIV_MOD(d, n, q, r) \
do { \
const auto n_copy = n; \
*q = n_copy / d; \
*r = n_copy % d; \
} while (0)
template <> void UnravelIndex<CPUContext>(
const int count,
const int ndims,
const int* dims,
const int64_t* x,
int64_t* y,
CPUContext* ctx) {
int tmp, d; int64_t* Y;
for (int i = 0; i < count; ++i) {
tmp = x[i]; Y = y + i * ndims;
for (d = ndims - 1; d >= 0; --d) {
FIXED_DIVISOR_DIV_MOD(dims[d], tmp, &tmp, (Y + d));
}
}
}
#undef FIXED_DIVISOR_DIV_MOD
} // namespace kernel
} // namepsace dragon
\ No newline at end of file
#ifdef WITH_CUDA
#include "core/context_cuda.h"
#include "utils/op_kernel.h"
namespace dragon {
namespace kernel {
#define FIXED_DIVISOR_DIV_MOD(d, n, q, r) \
do { \
const auto n_copy = n; \
*q = n_copy / d; \
*r = n_copy % d; \
} while (0)
/* <T = ?, Device = CUDA> */
__global__ void _UnravelIndex(
const int nthreads,
const int ndims,
const int* dims,
const int64_t* x,
int64_t* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
int tmp = x[i], d;
int64_t* Y = y + i * ndims;
#pragma unroll
for (d = ndims - 1; d >= 0; --d) {
#if __CUDA_ARCH__ >= 350
FIXED_DIVISOR_DIV_MOD(__ldg(dims + d), tmp, &tmp, (Y + d));
#else
FIXED_DIVISOR_DIV_MOD(dims[d], tmp, &tmp, (Y + d));
#endif
}
}
}
template <> void UnravelIndex<CUDAContext>(
const int count,
const int ndims,
const int* dims,
const int64_t* x,
int64_t* y,
CUDAContext* ctx) {
_UnravelIndex
<<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >>>(
count, ndims, dims, x, y
);
}
#undef FIXED_DIVISOR_DIV_MOD
} // namespace kernel
} // namepsace dragon
#endif // WITH_CUDA
\ No newline at end of file
#include "utils/op_kernel.h"
#include "utils/omp_alternative.h"
namespace dragon {
namespace kernel {
/* <T = ?, Device = CPU> */
template <typename T>
void _Where(
const int count,
const uint8_t* mask,
const T* a,
const T* b,
T* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
y[i] = mask[i] ? a[i] : b[i];
}
}
template <typename T>
void _WhereGrad(
const int count,
const uint8_t* mask,
const T* dy,
T* da,
T* db) {
const T kZero = T(0);
#ifdef WITH_OMP
#pragma omp parallel for num_threads(OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
const bool dy_to_da = mask[i];
da[i] = dy_to_da ? dy[i] : kZero;
db[i] = dy_to_da ? kZero : dy[i];
}
}
/* Kernel Launchers */
#define DEFINE_WHERE_KERNEL_LAUNCHER(T) \
template<> void Where<T, CPUContext>( \
const int count, \
const uint8_t* mask, \
const T* a, \
const T* b, \
T* y, \
CPUContext* ctx) { \
_Where(count, mask, a, b, y); \
}
#define DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(T) \
template <> void WhereGrad<T, CPUContext>( \
const int count, \
const uint8_t* mask, \
const T* dy, \
T* da, \
T* db, \
CPUContext* ctx) { \
_WhereGrad(count, mask, dy, da, db); \
}
DEFINE_WHERE_KERNEL_LAUNCHER(bool);
DEFINE_WHERE_KERNEL_LAUNCHER(int8_t);
DEFINE_WHERE_KERNEL_LAUNCHER(uint8_t);
DEFINE_WHERE_KERNEL_LAUNCHER(int);
DEFINE_WHERE_KERNEL_LAUNCHER(int64_t);
DEFINE_WHERE_KERNEL_LAUNCHER(float16);
DEFINE_WHERE_KERNEL_LAUNCHER(float);
DEFINE_WHERE_KERNEL_LAUNCHER(double);
DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(bool);
DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(int8_t);
DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(uint8_t);
DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(int);
DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(int64_t);
DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(double);
template <> void WhereGrad<float16, CPUContext>(
const int count,
const uint8_t* mask,
const float16* dy,
float16* da,
float16* db,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
#undef DEFINE_WHERE_KERNEL_LAUNCHER
#undef DEFINE_WHERE_GRAD_KERNEL_LAUNCHER
} // namespace kernel
} // namepsace dragon
\ No newline at end of file
#ifdef WITH_CUDA
#include "core/context_cuda.h"
#include "utils/op_kernel.h"
namespace dragon {
namespace kernel {
/* <T = ?, Device = CUDA> */
template<typename T>
__global__ void _Where(
const int nthreads,
const uint8_t* mask,
const T* a,
const T* b,
T* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = mask[i] ? a[i] : b[i];
}
}
template <typename T>
__global__ void _WhereGrad(
const int nthreads,
const uint8_t* mask,
const T* dy,
T* da,
T* db) {
const T kZero = T(0);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
da[i] = mask[i] ? dy[i] : kZero;
db[i] = mask[i] ? kZero : dy[i];
}
}
template<> __global__ void _WhereGrad<half>(
const int nthreads,
const uint8_t* mask,
const half* dy,
half* da,
half* db) {
#if __CUDA_ARCH__ >= 530
const half kZero = __float2half(0.f);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
const bool dy_to_da = mask[i];
da[i] = dy_to_da ? dy[i] : kZero;
db[i] = dy_to_da ? kZero : dy[i];
}
#endif
}
/* Kernel Launchers */
#define DEFINE_WHERE_KERNEL_LAUNCHER(T) \
template<> void Where<T, CUDAContext>( \
const int count, \
const uint8_t* mask, \
const T* a, \
const T* b, \
T* y, \
CUDAContext* ctx) { \
_Where \
<<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
count, mask, a, b, y \
); \
}
#define DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(T) \
template <> void WhereGrad<T, CUDAContext>( \
const int count, \
const uint8_t* mask, \
const T* dy, \
T* da, \
T* db, \
CUDAContext* ctx) { \
_WhereGrad \
<<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
count, mask, dy, da, db \
); \
}
DEFINE_WHERE_KERNEL_LAUNCHER(bool);
DEFINE_WHERE_KERNEL_LAUNCHER(int8_t);
DEFINE_WHERE_KERNEL_LAUNCHER(uint8_t);
DEFINE_WHERE_KERNEL_LAUNCHER(int);
DEFINE_WHERE_KERNEL_LAUNCHER(int64_t);
DEFINE_WHERE_KERNEL_LAUNCHER(float16);
DEFINE_WHERE_KERNEL_LAUNCHER(float);
DEFINE_WHERE_KERNEL_LAUNCHER(double);
DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(bool);
DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(int8_t);
DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(uint8_t);
DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(int);
DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(int64_t);
DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_WHERE_GRAD_KERNEL_LAUNCHER(double);
template <> void WhereGrad<float16, CUDAContext>(
const int count,
const uint8_t* mask,
const float16* dy,
float16* da,
float16* db,
CUDAContext* ctx) {
_WhereGrad
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count,
mask,
reinterpret_cast<const half*>(dy),
reinterpret_cast<half*>(da),
reinterpret_cast<half*>(db)
);
}
#undef DEFINE_WHERE_KERNEL_LAUNCHER
#undef DEFINE_WHERE_GRAD_KERNEL_LAUNCHER
} // namespace kernel
} // namepsace dragon
#endif // WITH_CUDA
\ No newline at end of file
#include "utils/op_kernel.h" #include "utils/op_kernel.h"
#include "utils/omp_alternative.h" #include "utils/eigen_utils.h"
namespace dragon { namespace dragon {
namespace kernel { namespace kernel {
/* <T = ?, Device = CPU> */ /* Kernel Launchers */
template <typename T>
void _EqualInteger(
const int count,
const T* a,
const T* b,
bool* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
y[i] = a[i] == b[i] ? true : false;
}
}
template <typename T>
void _EqualFloat(
const int count,
const T* a,
const T* b,
bool* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
y[i] = fabs(a[i] - b[i]) < 1e-15 ? true : false;
}
}
/* <T = ?, Device = CPU> */
template <typename T>
void _Less(
const int count,
const T* a,
const T* b,
bool* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
y[i] = a[i] < b[i] ? true : false;
}
}
/* <T = ?, Device = CPU> */
template <typename T>
void _LessEqual(
const int count,
const T* a,
const T* b,
bool* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
y[i] = a[i] <= b[i] ? true : false;
}
}
/* <T = ?, Device = CPU> */
template <typename T>
void _Greater(
const int count,
const T* a,
const T* b,
bool* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
y[i] = a[i] > b[i] ? true : false;
}
}
/* <T = ?, Device = CPU> */
template <typename T> #define DEFINE_NOTZERO_KERNEL_LAUNCHER(T) \
void _GreaterEqual( template <> void NotZero<T, CPUContext>( \
const int count, const int count, \
const T* a, const T* x, \
const T* b, bool* y, \
bool* y) { CPUContext* ctx) { \
#ifdef WITH_OMP EigenVectorArrayMap<bool>(y, count) = \
#pragma omp parallel for num_threads(OMP_THREADS(count)) ConstEigenVectorArrayMap<T>(x, count) != T(0); \
#endif
for (int i = 0; i < count; ++i) {
y[i] = a[i] >= b[i] ? true : false;
} }
}
/* Kernel Launchers */
#define DEFINE_COMPARE_WARPPER(T, OP, IMPL) \ #define DEFINE_COMPARE_KERNEL_LAUNCHER(T, OP, expr) \
template <> void OP<T, CPUContext>( \ template <> void OP<T, CPUContext>( \
const int count, \ const int count, \
const T* a, \ const T* a, \
const T* b, \ const T* b, \
bool* y, \ bool* y, \
CPUContext* ctx) { \ CPUContext* ctx) { \
IMPL(count, a, b, y); \ EigenVectorArrayMap<bool>(y, count) = \
ConstEigenVectorArrayMap<T>(a, count) expr \
ConstEigenVectorArrayMap<T>(b, count); \
} }
DEFINE_COMPARE_WARPPER(bool, Equal, _EqualInteger); DEFINE_NOTZERO_KERNEL_LAUNCHER(bool);
DEFINE_COMPARE_WARPPER(int8_t, Equal, _EqualInteger); DEFINE_NOTZERO_KERNEL_LAUNCHER(int8_t);
DEFINE_COMPARE_WARPPER(uint8_t, Equal, _EqualInteger); DEFINE_NOTZERO_KERNEL_LAUNCHER(uint8_t);
DEFINE_COMPARE_WARPPER(int, Equal, _EqualInteger); DEFINE_NOTZERO_KERNEL_LAUNCHER(int);
DEFINE_COMPARE_WARPPER(int64_t, Equal, _EqualInteger); DEFINE_NOTZERO_KERNEL_LAUNCHER(int64_t);
DEFINE_COMPARE_WARPPER(float, Equal, _EqualFloat); DEFINE_NOTZERO_KERNEL_LAUNCHER(float);
DEFINE_COMPARE_WARPPER(double, Equal, _EqualFloat); DEFINE_NOTZERO_KERNEL_LAUNCHER(double);
DEFINE_COMPARE_WARPPER(bool, Less, _Less); DEFINE_COMPARE_KERNEL_LAUNCHER(bool, Equal, ==);
DEFINE_COMPARE_WARPPER(int8_t, Less, _Less); DEFINE_COMPARE_KERNEL_LAUNCHER(int8_t, Equal, ==);
DEFINE_COMPARE_WARPPER(uint8_t, Less, _Less); DEFINE_COMPARE_KERNEL_LAUNCHER(uint8_t, Equal, ==);
DEFINE_COMPARE_WARPPER(int, Less, _Less); DEFINE_COMPARE_KERNEL_LAUNCHER(int, Equal, ==);
DEFINE_COMPARE_WARPPER(int64_t, Less, _Less); DEFINE_COMPARE_KERNEL_LAUNCHER(int64_t, Equal, ==);
DEFINE_COMPARE_WARPPER(float, Less, _Less); DEFINE_COMPARE_KERNEL_LAUNCHER(float, Equal, ==);
DEFINE_COMPARE_WARPPER(double, Less, _Less); DEFINE_COMPARE_KERNEL_LAUNCHER(double, Equal, ==);
DEFINE_COMPARE_WARPPER(bool, LessEqual, _LessEqual); DEFINE_COMPARE_KERNEL_LAUNCHER(bool, NotEqual, !=);
DEFINE_COMPARE_WARPPER(int8_t, LessEqual, _LessEqual); DEFINE_COMPARE_KERNEL_LAUNCHER(int8_t, NotEqual, !=);
DEFINE_COMPARE_WARPPER(uint8_t, LessEqual, _LessEqual); DEFINE_COMPARE_KERNEL_LAUNCHER(uint8_t, NotEqual, !=);
DEFINE_COMPARE_WARPPER(int, LessEqual, _LessEqual); DEFINE_COMPARE_KERNEL_LAUNCHER(int, NotEqual, !=);
DEFINE_COMPARE_WARPPER(int64_t, LessEqual, _LessEqual); DEFINE_COMPARE_KERNEL_LAUNCHER(int64_t, NotEqual, !=);
DEFINE_COMPARE_WARPPER(float, LessEqual, _LessEqual); DEFINE_COMPARE_KERNEL_LAUNCHER(float, NotEqual, != );
DEFINE_COMPARE_WARPPER(double, LessEqual, _LessEqual); DEFINE_COMPARE_KERNEL_LAUNCHER(double, NotEqual, !=);
DEFINE_COMPARE_WARPPER(bool, Greater, _Greater); DEFINE_COMPARE_KERNEL_LAUNCHER(bool, Less, <);
DEFINE_COMPARE_WARPPER(int8_t, Greater, _Greater); DEFINE_COMPARE_KERNEL_LAUNCHER(int8_t, Less, <);
DEFINE_COMPARE_WARPPER(uint8_t, Greater, _Greater); DEFINE_COMPARE_KERNEL_LAUNCHER(uint8_t, Less, <);
DEFINE_COMPARE_WARPPER(int, Greater, _Greater); DEFINE_COMPARE_KERNEL_LAUNCHER(int, Less, <);
DEFINE_COMPARE_WARPPER(int64_t, Greater, _Greater); DEFINE_COMPARE_KERNEL_LAUNCHER(int64_t, Less, <);
DEFINE_COMPARE_WARPPER(float, Greater, _Greater); DEFINE_COMPARE_KERNEL_LAUNCHER(float, Less, <);
DEFINE_COMPARE_WARPPER(double, Greater, _Greater); DEFINE_COMPARE_KERNEL_LAUNCHER(double, Less, <);
DEFINE_COMPARE_WARPPER(bool, GreaterEqual, _GreaterEqual); DEFINE_COMPARE_KERNEL_LAUNCHER(bool, LessEqual, <=);
DEFINE_COMPARE_WARPPER(int8_t, GreaterEqual, _GreaterEqual); DEFINE_COMPARE_KERNEL_LAUNCHER(int8_t, LessEqual, <=);
DEFINE_COMPARE_WARPPER(uint8_t, GreaterEqual, _GreaterEqual); DEFINE_COMPARE_KERNEL_LAUNCHER(uint8_t, LessEqual, <=);
DEFINE_COMPARE_WARPPER(int, GreaterEqual, _GreaterEqual); DEFINE_COMPARE_KERNEL_LAUNCHER(int, LessEqual, <=);
DEFINE_COMPARE_WARPPER(int64_t, GreaterEqual, _GreaterEqual); DEFINE_COMPARE_KERNEL_LAUNCHER(int64_t, LessEqual, <=);
DEFINE_COMPARE_WARPPER(float, GreaterEqual, _GreaterEqual); DEFINE_COMPARE_KERNEL_LAUNCHER(float, LessEqual, <=);
DEFINE_COMPARE_WARPPER(double, GreaterEqual, _GreaterEqual); DEFINE_COMPARE_KERNEL_LAUNCHER(double, LessEqual, <=);
DEFINE_COMPARE_KERNEL_LAUNCHER(bool, Greater, >);
DEFINE_COMPARE_KERNEL_LAUNCHER(int8_t, Greater, >);
DEFINE_COMPARE_KERNEL_LAUNCHER(uint8_t, Greater, >);
DEFINE_COMPARE_KERNEL_LAUNCHER(int, Greater, >);
DEFINE_COMPARE_KERNEL_LAUNCHER(int64_t, Greater, >);
DEFINE_COMPARE_KERNEL_LAUNCHER(float, Greater, >);
DEFINE_COMPARE_KERNEL_LAUNCHER(double, Greater, >);
DEFINE_COMPARE_KERNEL_LAUNCHER(bool, GreaterEqual, >=);
DEFINE_COMPARE_KERNEL_LAUNCHER(int8_t, GreaterEqual, >=);
DEFINE_COMPARE_KERNEL_LAUNCHER(uint8_t, GreaterEqual, >=);
DEFINE_COMPARE_KERNEL_LAUNCHER(int, GreaterEqual, >=);
DEFINE_COMPARE_KERNEL_LAUNCHER(int64_t, GreaterEqual, >=);
DEFINE_COMPARE_KERNEL_LAUNCHER(float, GreaterEqual, >=);
DEFINE_COMPARE_KERNEL_LAUNCHER(double, GreaterEqual, >=);
template <> void NotZero<float16, CPUContext>(
const int count,
const float16* x,
bool* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
template <> void Equal<float16, CPUContext>( template <> void Equal<float16, CPUContext>(
const int count, const int count,
...@@ -160,6 +102,15 @@ template <> void Equal<float16, CPUContext>( ...@@ -160,6 +102,15 @@ template <> void Equal<float16, CPUContext>(
CPU_FP16_NOT_SUPPORTED; CPU_FP16_NOT_SUPPORTED;
} }
template <> void NotEqual<float16, CPUContext>(
const int count,
const float16* a,
const float16* b,
bool* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
template <> void Less<float16, CPUContext>( template <> void Less<float16, CPUContext>(
const int count, const int count,
const float16* a, const float16* a,
...@@ -196,7 +147,8 @@ template <> void GreaterEqual<float16, CPUContext>( ...@@ -196,7 +147,8 @@ template <> void GreaterEqual<float16, CPUContext>(
CPU_FP16_NOT_SUPPORTED; CPU_FP16_NOT_SUPPORTED;
} }
#undef DEFINE_COMPARE_WARPPER #undef DEFINE_NOTZERO_KERNEL_LAUNCHER
#undef DEFINE_COMPARE_KERNEL_LAUNCHER
} // namespace kernel } // namespace kernel
......
...@@ -10,6 +10,31 @@ namespace kernel { ...@@ -10,6 +10,31 @@ namespace kernel {
/* <T = ?, Device = CUDA> */ /* <T = ?, Device = CUDA> */
template <typename T> template <typename T>
__global__ void _NotZero(
const int nthreads,
const T* x,
bool* y) {
const T kZero = T(0);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = x[i] != kZero ? true : false;
}
}
template<> __global__ void _NotZero<half>(
const int nthreads,
const half* x,
bool* y) {
#if __CUDA_ARCH__ >= 530
const half kZero = __float2half(0.f);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = __hne(x[i], kZero) ? true : false;
}
#endif
}
/* <T = ?, Device = CUDA> */
template <typename T>
__global__ void _EqualInteger( __global__ void _EqualInteger(
const int nthreads, const int nthreads,
const T* a, const T* a,
...@@ -46,6 +71,42 @@ __global__ void _EqualFloat( ...@@ -46,6 +71,42 @@ __global__ void _EqualFloat(
/* <T = ?, Device = CUDA> */ /* <T = ?, Device = CUDA> */
template <typename T> template <typename T>
__global__ void _NotEqualInteger(
const int nthreads,
const T* a,
const T* b,
bool* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = a[i] != b[i] ? true : false;
}
}
__global__ void _NotEqualHalf(
const int nthreads,
const half* a,
const half* b,
bool* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530
y[i] = __hne(a[i], b[i]) ? true : false;
#endif
}
}
template <typename T>
__global__ void _NotEqualFloat(
const int nthreads,
const T* a,
const T* b,
bool* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = fabs(a[i] - b[i]) > 1e-15 ? true : false;
}
}
/* <T = ?, Device = CUDA> */
template <typename T>
__global__ void _Less( __global__ void _Less(
const int nthreads, const int nthreads,
const T* a, const T* a,
...@@ -145,7 +206,20 @@ __global__ void _GreaterEqualHalf( ...@@ -145,7 +206,20 @@ __global__ void _GreaterEqualHalf(
/* Kernel Launchers */ /* Kernel Launchers */
#define DEFINE_COMPARE_WARPPER(T, OP, IMPL) \ #define DEFINE_NOTZERO_KERNEL_LAUNCHER(T) \
template <> void NotZero<T, CUDAContext>( \
const int count, \
const T* x, \
bool* y, \
CUDAContext* ctx) { \
_NotZero \
<<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
count, x, y \
); \
}
#define DEFINE_COMPARE_KERNEL_LAUNCHER(T, OP, IMPL) \
template <> void OP<T, CUDAContext>( \ template <> void OP<T, CUDAContext>( \
const int count, \ const int count, \
const T* a, \ const T* a, \
...@@ -159,7 +233,7 @@ __global__ void _GreaterEqualHalf( ...@@ -159,7 +233,7 @@ __global__ void _GreaterEqualHalf(
); \ ); \
} }
#define DEFINE_COMPARE_FP16_WARPPER(OP) \ #define DEFINE_COMPARE_FP16_KERNEL_LAUNCHER(OP) \
template <> void OP<float16, CUDAContext>( \ template <> void OP<float16, CUDAContext>( \
const int count, \ const int count, \
const float16* a, \ const float16* a, \
...@@ -176,53 +250,85 @@ __global__ void _GreaterEqualHalf( ...@@ -176,53 +250,85 @@ __global__ void _GreaterEqualHalf(
); \ ); \
} }
DEFINE_COMPARE_WARPPER(bool, Equal, _EqualInteger); DEFINE_NOTZERO_KERNEL_LAUNCHER(bool);
DEFINE_COMPARE_WARPPER(int8_t, Equal, _EqualInteger); DEFINE_NOTZERO_KERNEL_LAUNCHER(int8_t);
DEFINE_COMPARE_WARPPER(uint8_t, Equal, _EqualInteger); DEFINE_NOTZERO_KERNEL_LAUNCHER(uint8_t);
DEFINE_COMPARE_WARPPER(int, Equal, _EqualInteger); DEFINE_NOTZERO_KERNEL_LAUNCHER(int);
DEFINE_COMPARE_WARPPER(int64_t, Equal, _EqualInteger); DEFINE_NOTZERO_KERNEL_LAUNCHER(int64_t);
DEFINE_COMPARE_WARPPER(float, Equal, _EqualFloat); DEFINE_NOTZERO_KERNEL_LAUNCHER(float);
DEFINE_COMPARE_WARPPER(double, Equal, _EqualFloat); DEFINE_NOTZERO_KERNEL_LAUNCHER(double);
DEFINE_COMPARE_FP16_WARPPER(Equal);
DEFINE_COMPARE_KERNEL_LAUNCHER(bool, Equal, _EqualInteger);
DEFINE_COMPARE_WARPPER(bool, Less, _Less); DEFINE_COMPARE_KERNEL_LAUNCHER(int8_t, Equal, _EqualInteger);
DEFINE_COMPARE_WARPPER(int8_t, Less, _Less); DEFINE_COMPARE_KERNEL_LAUNCHER(uint8_t, Equal, _EqualInteger);
DEFINE_COMPARE_WARPPER(uint8_t, Less, _Less); DEFINE_COMPARE_KERNEL_LAUNCHER(int, Equal, _EqualInteger);
DEFINE_COMPARE_WARPPER(int, Less, _Less); DEFINE_COMPARE_KERNEL_LAUNCHER(int64_t, Equal, _EqualInteger);
DEFINE_COMPARE_WARPPER(int64_t, Less, _Less); DEFINE_COMPARE_KERNEL_LAUNCHER(float, Equal, _EqualFloat);
DEFINE_COMPARE_WARPPER(float, Less, _Less); DEFINE_COMPARE_KERNEL_LAUNCHER(double, Equal, _EqualFloat);
DEFINE_COMPARE_WARPPER(double, Less, _Less); DEFINE_COMPARE_FP16_KERNEL_LAUNCHER(Equal);
DEFINE_COMPARE_FP16_WARPPER(Less);
DEFINE_COMPARE_KERNEL_LAUNCHER(bool, NotEqual, _NotEqualInteger);
DEFINE_COMPARE_WARPPER(bool, LessEqual, _LessEqual); DEFINE_COMPARE_KERNEL_LAUNCHER(int8_t, NotEqual, _NotEqualInteger);
DEFINE_COMPARE_WARPPER(int8_t, LessEqual, _LessEqual); DEFINE_COMPARE_KERNEL_LAUNCHER(uint8_t, NotEqual, _NotEqualInteger);
DEFINE_COMPARE_WARPPER(uint8_t, LessEqual, _LessEqual); DEFINE_COMPARE_KERNEL_LAUNCHER(int, NotEqual, _NotEqualInteger);
DEFINE_COMPARE_WARPPER(int, LessEqual, _LessEqual); DEFINE_COMPARE_KERNEL_LAUNCHER(int64_t, NotEqual, _NotEqualInteger);
DEFINE_COMPARE_WARPPER(int64_t, LessEqual, _LessEqual); DEFINE_COMPARE_KERNEL_LAUNCHER(float, NotEqual, _NotEqualFloat);
DEFINE_COMPARE_WARPPER(float, LessEqual, _LessEqual); DEFINE_COMPARE_KERNEL_LAUNCHER(double, NotEqual, _NotEqualFloat);
DEFINE_COMPARE_WARPPER(double, LessEqual, _LessEqual); DEFINE_COMPARE_FP16_KERNEL_LAUNCHER(NotEqual);
DEFINE_COMPARE_FP16_WARPPER(LessEqual);
DEFINE_COMPARE_KERNEL_LAUNCHER(bool, Less, _Less);
DEFINE_COMPARE_WARPPER(bool, Greater, _Greater); DEFINE_COMPARE_KERNEL_LAUNCHER(int8_t, Less, _Less);
DEFINE_COMPARE_WARPPER(int8_t, Greater, _Greater); DEFINE_COMPARE_KERNEL_LAUNCHER(uint8_t, Less, _Less);
DEFINE_COMPARE_WARPPER(uint8_t, Greater, _Greater); DEFINE_COMPARE_KERNEL_LAUNCHER(int, Less, _Less);
DEFINE_COMPARE_WARPPER(int, Greater, _Greater); DEFINE_COMPARE_KERNEL_LAUNCHER(int64_t, Less, _Less);
DEFINE_COMPARE_WARPPER(int64_t, Greater, _Greater); DEFINE_COMPARE_KERNEL_LAUNCHER(float, Less, _Less);
DEFINE_COMPARE_WARPPER(float, Greater, _Greater); DEFINE_COMPARE_KERNEL_LAUNCHER(double, Less, _Less);
DEFINE_COMPARE_WARPPER(double, Greater, _Greater); DEFINE_COMPARE_FP16_KERNEL_LAUNCHER(Less);
DEFINE_COMPARE_FP16_WARPPER(Greater);
DEFINE_COMPARE_KERNEL_LAUNCHER(bool, LessEqual, _LessEqual);
DEFINE_COMPARE_WARPPER(bool, GreaterEqual, _GreaterEqual); DEFINE_COMPARE_KERNEL_LAUNCHER(int8_t, LessEqual, _LessEqual);
DEFINE_COMPARE_WARPPER(int8_t, GreaterEqual, _GreaterEqual); DEFINE_COMPARE_KERNEL_LAUNCHER(uint8_t, LessEqual, _LessEqual);
DEFINE_COMPARE_WARPPER(uint8_t, GreaterEqual, _GreaterEqual); DEFINE_COMPARE_KERNEL_LAUNCHER(int, LessEqual, _LessEqual);
DEFINE_COMPARE_WARPPER(int, GreaterEqual, _GreaterEqual); DEFINE_COMPARE_KERNEL_LAUNCHER(int64_t, LessEqual, _LessEqual);
DEFINE_COMPARE_WARPPER(int64_t, GreaterEqual, _GreaterEqual); DEFINE_COMPARE_KERNEL_LAUNCHER(float, LessEqual, _LessEqual);
DEFINE_COMPARE_WARPPER(float, GreaterEqual, _GreaterEqual); DEFINE_COMPARE_KERNEL_LAUNCHER(double, LessEqual, _LessEqual);
DEFINE_COMPARE_WARPPER(double, GreaterEqual, _GreaterEqual); DEFINE_COMPARE_FP16_KERNEL_LAUNCHER(LessEqual);
DEFINE_COMPARE_FP16_WARPPER(GreaterEqual);
DEFINE_COMPARE_KERNEL_LAUNCHER(bool, Greater, _Greater);
#undef DEFINE_COMPARE_WARPPER DEFINE_COMPARE_KERNEL_LAUNCHER(int8_t, Greater, _Greater);
#undef DEFINE_COMPARE_FP16_WARPPER DEFINE_COMPARE_KERNEL_LAUNCHER(uint8_t, Greater, _Greater);
DEFINE_COMPARE_KERNEL_LAUNCHER(int, Greater, _Greater);
DEFINE_COMPARE_KERNEL_LAUNCHER(int64_t, Greater, _Greater);
DEFINE_COMPARE_KERNEL_LAUNCHER(float, Greater, _Greater);
DEFINE_COMPARE_KERNEL_LAUNCHER(double, Greater, _Greater);
DEFINE_COMPARE_FP16_KERNEL_LAUNCHER(Greater);
DEFINE_COMPARE_KERNEL_LAUNCHER(bool, GreaterEqual, _GreaterEqual);
DEFINE_COMPARE_KERNEL_LAUNCHER(int8_t, GreaterEqual, _GreaterEqual);
DEFINE_COMPARE_KERNEL_LAUNCHER(uint8_t, GreaterEqual, _GreaterEqual);
DEFINE_COMPARE_KERNEL_LAUNCHER(int, GreaterEqual, _GreaterEqual);
DEFINE_COMPARE_KERNEL_LAUNCHER(int64_t, GreaterEqual, _GreaterEqual);
DEFINE_COMPARE_KERNEL_LAUNCHER(float, GreaterEqual, _GreaterEqual);
DEFINE_COMPARE_KERNEL_LAUNCHER(double, GreaterEqual, _GreaterEqual);
DEFINE_COMPARE_FP16_KERNEL_LAUNCHER(GreaterEqual);
template <> void NotZero<float16, CUDAContext>(
const int count,
const float16* x,
bool* y,
CUDAContext* ctx) {
_NotZero
<<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >>>(
count,
reinterpret_cast<const half*>(x),
y
);
}
#undef DEFINE_NOTZERO_KERNEL_LAUNCHER
#undef DEFINE_COMPARE_KERNEL_LAUNCHER
#undef DEFINE_COMPARE_FP16_KERNEL_LAUNCHER
} // namespace kernel } // namespace kernel
......
#include "utils/op_kernel.h"
#include "utils/math_utils.h"
#include "utils/omp_alternative.h"
namespace dragon {
namespace kernel {
/* <T = ?, Device = CPU> */
template <typename T>
void _MaskedAssign(
const int count,
const uint8_t* mask,
const T* x,
T* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
y[i] = mask[i] ? x[i] : y[i];
}
}
/* Kernel Launchers */
#define DEFINE_ASSIGN_KERNEL_LAUNCHER(T) \
template<> void MaskedAssign<T, CPUContext>( \
const int count, \
const uint8_t* mask, \
const T* x, \
T* y, \
CPUContext* ctx) { \
_MaskedAssign(count, mask, x, y); \
}
DEFINE_ASSIGN_KERNEL_LAUNCHER(bool);
DEFINE_ASSIGN_KERNEL_LAUNCHER(int8_t);
DEFINE_ASSIGN_KERNEL_LAUNCHER(uint8_t);
DEFINE_ASSIGN_KERNEL_LAUNCHER(int);
DEFINE_ASSIGN_KERNEL_LAUNCHER(int64_t);
DEFINE_ASSIGN_KERNEL_LAUNCHER(float16);
DEFINE_ASSIGN_KERNEL_LAUNCHER(float);
DEFINE_ASSIGN_KERNEL_LAUNCHER(double);
#undef DEFINE_ASSIGN_KERNEL_LAUNCHER
} // namespace kernel
} // namepsace dragon
\ No newline at end of file
#ifdef WITH_CUDA
#include "core/context_cuda.h"
#include "utils/op_kernel.h"
namespace dragon {
namespace kernel {
/* <T = ?, Device = CUDA> */
template<typename T>
__global__ void _MaskedAssign(
const int nthreads,
const uint8_t* mask,
const T* x,
T* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = mask[i] ? x[i] : y[i];
}
}
/* Kernel Launchers */
#define DEFINE_ASSIGN_KERNEL_LAUNCHER(T) \
template<> void MaskedAssign<T, CUDAContext>( \
const int count, \
const uint8_t* mask, \
const T* x, \
T* y, \
CUDAContext* ctx) { \
_MaskedAssign \
<<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
count, mask, x, y \
); \
}
DEFINE_ASSIGN_KERNEL_LAUNCHER(bool);
DEFINE_ASSIGN_KERNEL_LAUNCHER(int8_t);
DEFINE_ASSIGN_KERNEL_LAUNCHER(uint8_t);
DEFINE_ASSIGN_KERNEL_LAUNCHER(int);
DEFINE_ASSIGN_KERNEL_LAUNCHER(int64_t);
DEFINE_ASSIGN_KERNEL_LAUNCHER(float16);
DEFINE_ASSIGN_KERNEL_LAUNCHER(float);
DEFINE_ASSIGN_KERNEL_LAUNCHER(double);
#undef DEFINE_ASSIGN_KERNEL_LAUNCHER
} // namespace kernel
} // namepsace dragon
#endif // WITH_CUDA
\ No newline at end of file
...@@ -5,7 +5,7 @@ namespace dragon { ...@@ -5,7 +5,7 @@ namespace dragon {
namespace kernel { namespace kernel {
/*! AbsGrad <T = float32, Device = CPU> */ /* <T = float32, Device = CPU> */
template<> void AbsGrad<float, CPUContext>( template<> void AbsGrad<float, CPUContext>(
const int count, const int count,
......
...@@ -7,7 +7,7 @@ namespace dragon { ...@@ -7,7 +7,7 @@ namespace dragon {
namespace kernel { namespace kernel {
/*! AbsGrad <T = float32, Device = CUDA> */ /* <T = float32, Device = CUDA> */
template <typename T> template <typename T>
__global__ void _AbsGrad( __global__ void _AbsGrad(
......
...@@ -5,7 +5,7 @@ namespace dragon { ...@@ -5,7 +5,7 @@ namespace dragon {
namespace kernel { namespace kernel {
/*! <Tx = ?, Ty = ?, Device = CPU> */ /* <Tx = ?, Ty = ?, Device = CPU> */
template <typename Tx, typename Ty> template <typename Tx, typename Ty>
void _NLLLoss( void _NLLLoss(
...@@ -39,7 +39,7 @@ void _NLLLoss( ...@@ -39,7 +39,7 @@ void _NLLLoss(
} }
} }
/*! <Tx = float32, Ty = float32, Device = CPU> */ /* <Tx = float32, Ty = float32, Device = CPU> */
template <> void NLLLoss<float, float, CPUContext>( template <> void NLLLoss<float, float, CPUContext>(
const int outer_dim, const int outer_dim,
...@@ -58,7 +58,7 @@ template <> void NLLLoss<float, float, CPUContext>( ...@@ -58,7 +58,7 @@ template <> void NLLLoss<float, float, CPUContext>(
); );
} }
/*! <Tx = float32, Ty = int64, Device = CPU> */ /* <Tx = float32, Ty = int64, Device = CPU> */
template <> void NLLLoss<float, int64_t, CPUContext>( template <> void NLLLoss<float, int64_t, CPUContext>(
const int outer_dim, const int outer_dim,
...@@ -77,7 +77,7 @@ template <> void NLLLoss<float, int64_t, CPUContext>( ...@@ -77,7 +77,7 @@ template <> void NLLLoss<float, int64_t, CPUContext>(
); );
} }
/*! <Tx = ?, Ty = ?, Device = CPU> */ /* <Tx = ?, Ty = ?, Device = CPU> */
template <typename Tx, typename Ty> template <typename Tx, typename Ty>
void _NLLLossGrad( void _NLLLossGrad(
...@@ -108,7 +108,7 @@ void _NLLLossGrad( ...@@ -108,7 +108,7 @@ void _NLLLossGrad(
} }
} }
/*! <Tx = float32, Ty = float32, Device = CPU> */ /* <Tx = float32, Ty = float32, Device = CPU> */
template<> void NLLLossGrad<float, float, CPUContext>( template<> void NLLLossGrad<float, float, CPUContext>(
const int outer_dim, const int outer_dim,
...@@ -127,7 +127,7 @@ template<> void NLLLossGrad<float, float, CPUContext>( ...@@ -127,7 +127,7 @@ template<> void NLLLossGrad<float, float, CPUContext>(
); );
} }
/*! <Tx = float32, Ty = int64, Device = CPU> */ /* <Tx = float32, Ty = int64, Device = CPU> */
template<> void NLLLossGrad<float, int64_t, CPUContext>( template<> void NLLLossGrad<float, int64_t, CPUContext>(
const int outer_dim, const int outer_dim,
......
...@@ -7,7 +7,7 @@ namespace dragon { ...@@ -7,7 +7,7 @@ namespace dragon {
namespace kernel { namespace kernel {
/*! <Tx = float32, Ty = ?, Device = CUDA> */ /* <Tx = float32, Ty = ?, Device = CUDA> */
template <typename Tx, typename Ty> template <typename Tx, typename Ty>
__global__ void _NLLLoss( __global__ void _NLLLoss(
...@@ -40,7 +40,7 @@ __global__ void _NLLLoss( ...@@ -40,7 +40,7 @@ __global__ void _NLLLoss(
} }
} }
/*! <Tx = float32, Ty = float32, Device = CUDA> */ /* <Tx = float32, Ty = float32, Device = CUDA> */
template <> void NLLLoss<float, float, CUDAContext>( template <> void NLLLoss<float, float, CUDAContext>(
const int outer_dim, const int outer_dim,
...@@ -62,7 +62,7 @@ template <> void NLLLoss<float, float, CUDAContext>( ...@@ -62,7 +62,7 @@ template <> void NLLLoss<float, float, CUDAContext>(
); );
} }
/*! <Tx = float32, Ty = int64, Device = CUDA> */ /* <Tx = float32, Ty = int64, Device = CUDA> */
template <> void NLLLoss<float, int64_t, CUDAContext>( template <> void NLLLoss<float, int64_t, CUDAContext>(
const int outer_dim, const int outer_dim,
...@@ -84,7 +84,7 @@ template <> void NLLLoss<float, int64_t, CUDAContext>( ...@@ -84,7 +84,7 @@ template <> void NLLLoss<float, int64_t, CUDAContext>(
); );
} }
/*! <Tx = ?, Ty = ?, Device = CUDA> */ /* <Tx = ?, Ty = ?, Device = CUDA> */
template <typename Tx, typename Ty> template <typename Tx, typename Ty>
__global__ void _NLLLossGrad( __global__ void _NLLLossGrad(
...@@ -114,7 +114,7 @@ __global__ void _NLLLossGrad( ...@@ -114,7 +114,7 @@ __global__ void _NLLLossGrad(
} }
} }
/*! <Tx = float32, Ty = float32, Device = CUDA> */ /* <Tx = float32, Ty = float32, Device = CUDA> */
template<> void NLLLossGrad<float, float, CUDAContext>( template<> void NLLLossGrad<float, float, CUDAContext>(
const int outer_dim, const int outer_dim,
...@@ -136,7 +136,7 @@ template<> void NLLLossGrad<float, float, CUDAContext>( ...@@ -136,7 +136,7 @@ template<> void NLLLossGrad<float, float, CUDAContext>(
); );
} }
/*! <Tx = float32, Ty = int64, Device = CUDA> */ /* <Tx = float32, Ty = int64, Device = CUDA> */
template<> void NLLLossGrad<float, int64_t, CUDAContext>( template<> void NLLLossGrad<float, int64_t, CUDAContext>(
const int outer_dim, const int outer_dim,
......
...@@ -5,7 +5,7 @@ namespace dragon { ...@@ -5,7 +5,7 @@ namespace dragon {
namespace kernel { namespace kernel {
/*! SigmoidCrossEntropy <T = float32, Device = CPU> */ /* <T = float32, Device = CPU> */
template <> void SigmoidCrossEntropy<float, CPUContext>( template <> void SigmoidCrossEntropy<float, CPUContext>(
const int count, const int count,
...@@ -35,7 +35,7 @@ template <> void SigmoidCrossEntropy<float, CPUContext>( ...@@ -35,7 +35,7 @@ template <> void SigmoidCrossEntropy<float, CPUContext>(
} }
} }
/*! SigmoidCrossEntropyGrad <T = float32, Device = CPU> */ /* <T = float32, Device = CPU> */
template <> void SigmoidCrossEntropyGrad<float, CPUContext>( template <> void SigmoidCrossEntropyGrad<float, CPUContext>(
const int count, const int count,
......
...@@ -4,7 +4,7 @@ namespace dragon { ...@@ -4,7 +4,7 @@ namespace dragon {
namespace kernel { namespace kernel {
/*! SigmoidFocalLoss <Tx = ?, Ty = ?, Device = CPU> */ /* <Tx = ?, Ty = ?, Device = CPU> */
template <typename Tx, typename Ty> template <typename Tx, typename Ty>
void _SigmoidFocalLoss( void _SigmoidFocalLoss(
...@@ -56,7 +56,7 @@ void _SigmoidFocalLoss( ...@@ -56,7 +56,7 @@ void _SigmoidFocalLoss(
} }
} }
/*! SigmoidFocalLoss <Tx = float32, Ty = float32, Device = CPU> */ /* <Tx = float32, Ty = float32, Device = CPU> */
template <> void SigmoidFocalLoss<float, float, CPUContext>( template <> void SigmoidFocalLoss<float, float, CPUContext>(
const int outer_dim, const int outer_dim,
...@@ -78,7 +78,7 @@ template <> void SigmoidFocalLoss<float, float, CPUContext>( ...@@ -78,7 +78,7 @@ template <> void SigmoidFocalLoss<float, float, CPUContext>(
); );
} }
/*! SigmoidFocalLoss <Tx = float32, Ty = int64, Device = CPU> */ /* <Tx = float32, Ty = int64, Device = CPU> */
template <> void SigmoidFocalLoss<float, int64_t, CPUContext>( template <> void SigmoidFocalLoss<float, int64_t, CPUContext>(
const int outer_dim, const int outer_dim,
...@@ -100,7 +100,7 @@ template <> void SigmoidFocalLoss<float, int64_t, CPUContext>( ...@@ -100,7 +100,7 @@ template <> void SigmoidFocalLoss<float, int64_t, CPUContext>(
); );
} }
/*! SigmoidFocalLossGrad <Tx = ?, Ty = ?, Device = CPU> */ /* <Tx = ?, Ty = ?, Device = CPU> */
template <typename Tx, typename Ty> template <typename Tx, typename Ty>
void _SigmoidFocalLossGrad( void _SigmoidFocalLossGrad(
...@@ -153,7 +153,7 @@ void _SigmoidFocalLossGrad( ...@@ -153,7 +153,7 @@ void _SigmoidFocalLossGrad(
} }
} }
/*! SigmoidFocalLossGrad <Tx = float32, Ty = float32, Device = CPU> */ /* <Tx = float32, Ty = float32, Device = CPU> */
template <> void SigmoidFocalLossGrad<float, float, CPUContext>( template <> void SigmoidFocalLossGrad<float, float, CPUContext>(
const int outer_dim, const int outer_dim,
...@@ -175,7 +175,7 @@ template <> void SigmoidFocalLossGrad<float, float, CPUContext>( ...@@ -175,7 +175,7 @@ template <> void SigmoidFocalLossGrad<float, float, CPUContext>(
); );
} }
/*! SigmoidFocalLossGrad <Tx = float32, Ty = int64_t, Device = CPU> */ /* <Tx = float32, Ty = int64_t, Device = CPU> */
template <> void SigmoidFocalLossGrad<float, int64_t, CPUContext>( template <> void SigmoidFocalLossGrad<float, int64_t, CPUContext>(
const int outer_dim, const int outer_dim,
......
...@@ -7,7 +7,7 @@ namespace dragon { ...@@ -7,7 +7,7 @@ namespace dragon {
namespace kernel { namespace kernel {
/*! SigmoidFocalLoss <Tx = ?, Ty = ?, Device = CUDA> */ /* <Tx = ?, Ty = ?, Device = CUDA> */
template <typename Tx, typename Ty> template <typename Tx, typename Ty>
__global__ void _SigmoidFocalLoss( __global__ void _SigmoidFocalLoss(
...@@ -54,7 +54,7 @@ __global__ void _SigmoidFocalLoss( ...@@ -54,7 +54,7 @@ __global__ void _SigmoidFocalLoss(
} }
} }
/*! SigmoidFocalLoss <Tx = float32, Ty = float32, Device = CUDA> */ /* <Tx = float32, Ty = float32, Device = CUDA> */
template <> void SigmoidFocalLoss<float, float, CUDAContext>( template <> void SigmoidFocalLoss<float, float, CUDAContext>(
const int outer_dim, const int outer_dim,
...@@ -79,7 +79,7 @@ template <> void SigmoidFocalLoss<float, float, CUDAContext>( ...@@ -79,7 +79,7 @@ template <> void SigmoidFocalLoss<float, float, CUDAContext>(
); );
} }
/*! SigmoidFocalLoss <Tx = float32, Ty = int64, Device = CUDA> */ /* <Tx = float32, Ty = int64, Device = CUDA> */
template <> void SigmoidFocalLoss<float, int64_t, CUDAContext>( template <> void SigmoidFocalLoss<float, int64_t, CUDAContext>(
const int outer_dim, const int outer_dim,
......
...@@ -5,7 +5,7 @@ namespace dragon { ...@@ -5,7 +5,7 @@ namespace dragon {
namespace kernel { namespace kernel {
/*! SmoothL1 <T = float32, Device = CPU> */ /* <T = float32, Device = CPU> */
template<> void SmoothL1<float, CPUContext>( template<> void SmoothL1<float, CPUContext>(
const int count, const int count,
...@@ -27,7 +27,7 @@ template<> void SmoothL1<float, CPUContext>( ...@@ -27,7 +27,7 @@ template<> void SmoothL1<float, CPUContext>(
} }
} }
/*! SmoothL1Grad <T = float32, Device = CPU> */ /* <T = float32, Device = CPU> */
template<> void SmoothL1Grad<float, CPUContext>( template<> void SmoothL1Grad<float, CPUContext>(
const int count, const int count,
......
...@@ -7,7 +7,7 @@ namespace dragon { ...@@ -7,7 +7,7 @@ namespace dragon {
namespace kernel { namespace kernel {
/*! SmoothL1 <T = float32, Device = CUDA> */ /* <T = float32, Device = CUDA> */
template <typename T> template <typename T>
__global__ void _SmoothL1( __global__ void _SmoothL1(
...@@ -39,7 +39,7 @@ template<> void SmoothL1<float, CUDAContext>( ...@@ -39,7 +39,7 @@ template<> void SmoothL1<float, CUDAContext>(
); );
} }
/*! SmoothL1Grad <T = float32, Device = CUDA> */ /* <T = float32, Device = CUDA> */
template <typename T> template <typename T>
__global__ void _SmoothL1Grad( __global__ void _SmoothL1Grad(
......
...@@ -5,7 +5,7 @@ namespace dragon { ...@@ -5,7 +5,7 @@ namespace dragon {
namespace kernel { namespace kernel {
/*! SoftmaxCrossEntropy <T = float32, Device = CPU> */ /* <T = float32, Device = CPU> */
template <> void SoftmaxCrossEntropy<float, CPUContext>( template <> void SoftmaxCrossEntropy<float, CPUContext>(
const int count, const int count,
......
...@@ -7,7 +7,7 @@ namespace dragon { ...@@ -7,7 +7,7 @@ namespace dragon {
namespace kernel { namespace kernel {
/*! SoftmaxCrossEntropy <T = float32, Device = CUDA> */ /* <T = float32, Device = CUDA> */
template <typename T> template <typename T>
__global__ void _SoftmaxCrossEntropy( __global__ void _SoftmaxCrossEntropy(
......
...@@ -4,7 +4,7 @@ namespace dragon { ...@@ -4,7 +4,7 @@ namespace dragon {
namespace kernel { namespace kernel {
/*! SoftmaxFocalLoss <Tx = ?, Ty = ?, Device = CPU> */ /* <Tx = ?, Ty = ?, Device = CPU> */
template <typename Tx, typename Ty> template <typename Tx, typename Ty>
void _SoftmaxFocalLoss( void _SoftmaxFocalLoss(
...@@ -47,7 +47,7 @@ void _SoftmaxFocalLoss( ...@@ -47,7 +47,7 @@ void _SoftmaxFocalLoss(
} }
} }
/*! SoftmaxFocalLoss <Tx = float32, Ty = float32, Device = CPU> */ /* <Tx = float32, Ty = float32, Device = CPU> */
template <> void SoftmaxFocalLoss<float, float, CPUContext>( template <> void SoftmaxFocalLoss<float, float, CPUContext>(
const int outer_dim, const int outer_dim,
...@@ -72,7 +72,7 @@ template <> void SoftmaxFocalLoss<float, float, CPUContext>( ...@@ -72,7 +72,7 @@ template <> void SoftmaxFocalLoss<float, float, CPUContext>(
); );
} }
/*! SoftmaxFocalLoss <Tx = float32, Ty = int64, Device = CPU> */ /* <Tx = float32, Ty = int64, Device = CPU> */
template <> void SoftmaxFocalLoss<float, int64_t, CPUContext>( template <> void SoftmaxFocalLoss<float, int64_t, CPUContext>(
const int outer_dim, const int outer_dim,
...@@ -97,7 +97,7 @@ template <> void SoftmaxFocalLoss<float, int64_t, CPUContext>( ...@@ -97,7 +97,7 @@ template <> void SoftmaxFocalLoss<float, int64_t, CPUContext>(
); );
} }
/*! SoftmaxFocalLossGrad <Tx = ?, Ty = ?, Device = CPU> */ /* <Tx = ?, Ty = ?, Device = CPU> */
template <typename Tx, typename Ty> template <typename Tx, typename Ty>
void _SoftmaxFocalLossGrad( void _SoftmaxFocalLossGrad(
...@@ -154,7 +154,7 @@ void _SoftmaxFocalLossGrad( ...@@ -154,7 +154,7 @@ void _SoftmaxFocalLossGrad(
} }
} }
/*! SoftmaxFocalLossGrad <Tx = float32, Ty = float32, Device = CPU> */ /* <Tx = float32, Ty = float32, Device = CPU> */
template<> void SoftmaxFocalLossGrad<float, float, CPUContext>( template<> void SoftmaxFocalLossGrad<float, float, CPUContext>(
const int outer_dim, const int outer_dim,
...@@ -179,7 +179,7 @@ template<> void SoftmaxFocalLossGrad<float, float, CPUContext>( ...@@ -179,7 +179,7 @@ template<> void SoftmaxFocalLossGrad<float, float, CPUContext>(
); );
} }
/*! SoftmaxFocalLossGrad <Tx = float32, Ty = int64, Device = CPU> */ /* <Tx = float32, Ty = int64, Device = CPU> */
template<> void SoftmaxFocalLossGrad<float, int64_t, CPUContext>( template<> void SoftmaxFocalLossGrad<float, int64_t, CPUContext>(
const int outer_dim, const int outer_dim,
......
...@@ -7,7 +7,7 @@ namespace dragon { ...@@ -7,7 +7,7 @@ namespace dragon {
namespace kernel { namespace kernel {
/*! SoftmaxFocalLoss <Tx = ?, Ty = ?, Device = CUDA> */ /* <Tx = ?, Ty = ?, Device = CUDA> */
template <typename Tx, typename Ty> template <typename Tx, typename Ty>
__global__ void _SoftmaxFocalLoss( __global__ void _SoftmaxFocalLoss(
...@@ -48,7 +48,7 @@ __global__ void _SoftmaxFocalLoss( ...@@ -48,7 +48,7 @@ __global__ void _SoftmaxFocalLoss(
} }
} }
/*! SoftmaxFocalLoss <Tx = float32, Ty = float32, Device = CUDA> */ /* <Tx = float32, Ty = float32, Device = CUDA> */
template <> void SoftmaxFocalLoss<float, float, CUDAContext>( template <> void SoftmaxFocalLoss<float, float, CUDAContext>(
const int outer_dim, const int outer_dim,
...@@ -76,7 +76,7 @@ template <> void SoftmaxFocalLoss<float, float, CUDAContext>( ...@@ -76,7 +76,7 @@ template <> void SoftmaxFocalLoss<float, float, CUDAContext>(
); );
} }
/*! SoftmaxFocalLoss <Tx = float32, Ty = int64, Device = CUDA> */ /* <Tx = float32, Ty = int64, Device = CUDA> */
template <> void SoftmaxFocalLoss<float, int64_t, CUDAContext>( template <> void SoftmaxFocalLoss<float, int64_t, CUDAContext>(
const int outer_dim, const int outer_dim,
...@@ -104,7 +104,7 @@ template <> void SoftmaxFocalLoss<float, int64_t, CUDAContext>( ...@@ -104,7 +104,7 @@ template <> void SoftmaxFocalLoss<float, int64_t, CUDAContext>(
); );
} }
/*! SoftmaxFocalLossGrad <Tx = ?, Ty = ?, Device = CUDA> */ /* <Tx = ?, Ty = ?, Device = CUDA> */
template <typename Tx, typename Ty> template <typename Tx, typename Ty>
__global__ void _SoftmaxFocalLossGrad( __global__ void _SoftmaxFocalLossGrad(
...@@ -160,7 +160,7 @@ __global__ void _SoftmaxFocalLossGrad( ...@@ -160,7 +160,7 @@ __global__ void _SoftmaxFocalLossGrad(
} }
} }
/*! SoftmaxFocalLossGrad <Tx = float32, Ty = float32, Device = CUDA> */ /* <Tx = float32, Ty = float32, Device = CUDA> */
template<> void SoftmaxFocalLossGrad<float, float, CUDAContext>( template<> void SoftmaxFocalLossGrad<float, float, CUDAContext>(
const int outer_dim, const int outer_dim,
...@@ -188,7 +188,7 @@ template<> void SoftmaxFocalLossGrad<float, float, CUDAContext>( ...@@ -188,7 +188,7 @@ template<> void SoftmaxFocalLossGrad<float, float, CUDAContext>(
); );
} }
/*! SoftmaxFocalLossGrad <Tx = float32, Ty = int64, Device = CUDA> */ /* <Tx = float32, Ty = int64, Device = CUDA> */
template<> void SoftmaxFocalLossGrad<float, int64_t, CUDAContext>( template<> void SoftmaxFocalLossGrad<float, int64_t, CUDAContext>(
const int outer_dim, const int outer_dim,
......
...@@ -4,7 +4,7 @@ namespace dragon { ...@@ -4,7 +4,7 @@ namespace dragon {
namespace kernel { namespace kernel {
/*! <Tx = ?, Ty = ?, Device = CPU> */ /* <Tx = ?, Ty = ?, Device = CPU> */
template <typename Tx, typename Ty> template <typename Tx, typename Ty>
void _SparseSoftmaxCrossEntropy( void _SparseSoftmaxCrossEntropy(
...@@ -40,7 +40,7 @@ void _SparseSoftmaxCrossEntropy( ...@@ -40,7 +40,7 @@ void _SparseSoftmaxCrossEntropy(
} }
} }
/*! SparseSoftmaxCrossEntropy <Tx = float32, Ty = float32, Device = CPU> */ /* <Tx = float32, Ty = float32, Device = CPU> */
template <> void SparseSoftmaxCrossEntropy<float, float, CPUContext>( template <> void SparseSoftmaxCrossEntropy<float, float, CPUContext>(
const int outer_dim, const int outer_dim,
...@@ -59,7 +59,7 @@ template <> void SparseSoftmaxCrossEntropy<float, float, CPUContext>( ...@@ -59,7 +59,7 @@ template <> void SparseSoftmaxCrossEntropy<float, float, CPUContext>(
); );
} }
/*! <Tx = float32, Ty = int64, Device = CPU> */ /* <Tx = float32, Ty = int64, Device = CPU> */
template <> void SparseSoftmaxCrossEntropy<float, int64_t, CPUContext>( template <> void SparseSoftmaxCrossEntropy<float, int64_t, CPUContext>(
const int outer_dim, const int outer_dim,
...@@ -78,7 +78,7 @@ template <> void SparseSoftmaxCrossEntropy<float, int64_t, CPUContext>( ...@@ -78,7 +78,7 @@ template <> void SparseSoftmaxCrossEntropy<float, int64_t, CPUContext>(
); );
} }
/*! <Tx = ?, Ty = ?, Device = CPU> */ /* <Tx = ?, Ty = ?, Device = CPU> */
template <typename Tx, typename Ty> template <typename Tx, typename Ty>
void _SparseSoftmaxCrossEntropyGrad( void _SparseSoftmaxCrossEntropyGrad(
...@@ -112,7 +112,7 @@ void _SparseSoftmaxCrossEntropyGrad( ...@@ -112,7 +112,7 @@ void _SparseSoftmaxCrossEntropyGrad(
} }
} }
/*! <Tx = float32, Ty = float32, Device = CPU> */ /* <Tx = float32, Ty = float32, Device = CPU> */
template<> void SparseSoftmaxCrossEntropyGrad<float, float, CPUContext>( template<> void SparseSoftmaxCrossEntropyGrad<float, float, CPUContext>(
const int outer_dim, const int outer_dim,
...@@ -131,7 +131,7 @@ template<> void SparseSoftmaxCrossEntropyGrad<float, float, CPUContext>( ...@@ -131,7 +131,7 @@ template<> void SparseSoftmaxCrossEntropyGrad<float, float, CPUContext>(
); );
} }
/*! <Tx = float32, Ty = int64, Device = CPU> */ /* <Tx = float32, Ty = int64, Device = CPU> */
template<> void SparseSoftmaxCrossEntropyGrad<float, int64_t, CPUContext>( template<> void SparseSoftmaxCrossEntropyGrad<float, int64_t, CPUContext>(
const int outer_dim, const int outer_dim,
......
...@@ -8,7 +8,7 @@ namespace dragon { ...@@ -8,7 +8,7 @@ namespace dragon {
namespace kernel { namespace kernel {
/*! <Tx = ?, Ty = ?, Device = CUDA> */ /* <Tx = ?, Ty = ?, Device = CUDA> */
template <typename Tx, typename Ty> template <typename Tx, typename Ty>
__global__ void _SparseSoftmaxCrossEntropy( __global__ void _SparseSoftmaxCrossEntropy(
...@@ -44,7 +44,7 @@ __global__ void _SparseSoftmaxCrossEntropy( ...@@ -44,7 +44,7 @@ __global__ void _SparseSoftmaxCrossEntropy(
} }
} }
/*! <Tx = float32, Ty = float32, Device = CUDA> */ /* <Tx = float32, Ty = float32, Device = CUDA> */
template <> void SparseSoftmaxCrossEntropy<float, float, CUDAContext>( template <> void SparseSoftmaxCrossEntropy<float, float, CUDAContext>(
const int outer_dim, const int outer_dim,
...@@ -66,7 +66,7 @@ template <> void SparseSoftmaxCrossEntropy<float, float, CUDAContext>( ...@@ -66,7 +66,7 @@ template <> void SparseSoftmaxCrossEntropy<float, float, CUDAContext>(
); );
} }
/*! <Tx = float32, Ty = int64, Device = CUDA> */ /* <Tx = float32, Ty = int64, Device = CUDA> */
template <> void SparseSoftmaxCrossEntropy<float, int64_t, CUDAContext>( template <> void SparseSoftmaxCrossEntropy<float, int64_t, CUDAContext>(
const int outer_dim, const int outer_dim,
...@@ -88,7 +88,7 @@ template <> void SparseSoftmaxCrossEntropy<float, int64_t, CUDAContext>( ...@@ -88,7 +88,7 @@ template <> void SparseSoftmaxCrossEntropy<float, int64_t, CUDAContext>(
); );
} }
/*! <Tx = ?, Ty = ?, Device = CUDA> */ /* <Tx = ?, Ty = ?, Device = CUDA> */
template <typename Tx, typename Ty> template <typename Tx, typename Ty>
__global__ void _SparseSoftmaxCrossEntropyGrad( __global__ void _SparseSoftmaxCrossEntropyGrad(
...@@ -121,7 +121,7 @@ __global__ void _SparseSoftmaxCrossEntropyGrad( ...@@ -121,7 +121,7 @@ __global__ void _SparseSoftmaxCrossEntropyGrad(
} }
} }
/*! <Tx = float32, Ty = float32, Device = CUDA> */ /* <Tx = float32, Ty = float32, Device = CUDA> */
template<> void SparseSoftmaxCrossEntropyGrad<float, float, CUDAContext>( template<> void SparseSoftmaxCrossEntropyGrad<float, float, CUDAContext>(
const int outer_dim, const int outer_dim,
...@@ -143,7 +143,7 @@ template<> void SparseSoftmaxCrossEntropyGrad<float, float, CUDAContext>( ...@@ -143,7 +143,7 @@ template<> void SparseSoftmaxCrossEntropyGrad<float, float, CUDAContext>(
); );
} }
/*! <Tx = float32, Ty = int64, Device = CUDA> */ /* <Tx = float32, Ty = int64, Device = CUDA> */
template<> void SparseSoftmaxCrossEntropyGrad<float, int64_t, CUDAContext>( template<> void SparseSoftmaxCrossEntropyGrad<float, int64_t, CUDAContext>(
const int outer_dim, const int outer_dim,
......
...@@ -6,7 +6,7 @@ namespace dragon { ...@@ -6,7 +6,7 @@ namespace dragon {
namespace kernel { namespace kernel {
/*! Astype <Ta = ?, Tb = ?, Device = CPU> */ /* <Ta = ?, Tb = ?, Device = CPU> */
template <typename Ta, typename Tb> template <typename Ta, typename Tb>
void _TypeA2B(const int count, const Ta* a, Tb* b) { void _TypeA2B(const int count, const Ta* a, Tb* b) {
......
...@@ -7,7 +7,7 @@ namespace dragon { ...@@ -7,7 +7,7 @@ namespace dragon {
namespace kernel { namespace kernel {
/*! Astype <Ta = ?, Tb = ?, Device = CUDA> */ /* <Ta = ?, Tb = ?, Device = CUDA> */
template <typename Ta, typename Tb> template <typename Ta, typename Tb>
__global__ void _TypeA2B( __global__ void _TypeA2B(
...@@ -49,7 +49,7 @@ DEFINE_TYPE_A_TO_ALL(int64_t); ...@@ -49,7 +49,7 @@ DEFINE_TYPE_A_TO_ALL(int64_t);
DEFINE_TYPE_A_TO_ALL(float); DEFINE_TYPE_A_TO_ALL(float);
DEFINE_TYPE_A_TO_ALL(double); DEFINE_TYPE_A_TO_ALL(double);
/*! Astype <Ta = float16, Tb = float32, Device = CUDA> */ /* <Ta = float16, Tb = float32, Device = CUDA> */
template<> __global__ void _TypeA2B<half, float>( template<> __global__ void _TypeA2B<half, float>(
const int nthreads, const int nthreads,
...@@ -72,7 +72,7 @@ template <> void TypeA2B<float16, float, CUDAContext>( ...@@ -72,7 +72,7 @@ template <> void TypeA2B<float16, float, CUDAContext>(
); );
} }
/*! Astype <Ta = float32, Tb = float16, Device = CUDA> */ /* <Ta = float32, Tb = float16, Device = CUDA> */
template<> __global__ void _TypeA2B<float, half>( template<> __global__ void _TypeA2B<float, half>(
const int nthreads, const int nthreads,
...@@ -95,7 +95,7 @@ template <> void TypeA2B<float, float16, CUDAContext>( ...@@ -95,7 +95,7 @@ template <> void TypeA2B<float, float16, CUDAContext>(
); );
} }
/*! Astype <Ta = float16, Tb = float16, Device = CUDA> */ /* <Ta = float16, Tb = float16, Device = CUDA> */
template<> __global__ void _TypeA2B<half, half>( template<> __global__ void _TypeA2B<half, half>(
const int nthreads, const int nthreads,
......
...@@ -5,7 +5,7 @@ namespace dragon { ...@@ -5,7 +5,7 @@ namespace dragon {
namespace kernel { namespace kernel {
/*! GradientTwoSum <T = ?, Device = CUDA> */ /* <T = ?, Device = CUDA> */
template <typename T> template <typename T>
void _GradientTwoSum( void _GradientTwoSum(
......
...@@ -7,7 +7,7 @@ namespace dragon { ...@@ -7,7 +7,7 @@ namespace dragon {
namespace kernel { namespace kernel {
/*! GradientTwoSum <T = ?, Device = CUDA> */ /* <T = ?, Device = CUDA> */
template <typename T> template <typename T>
__global__ void _GradientTwoSum( __global__ void _GradientTwoSum(
...@@ -20,7 +20,7 @@ __global__ void _GradientTwoSum( ...@@ -20,7 +20,7 @@ __global__ void _GradientTwoSum(
} }
} }
/*! GradientTwoSum <T = float16, Device = CUDA> */ /* <T = float16, Device = CUDA> */
template <> __global__ void _GradientTwoSum<half>( template <> __global__ void _GradientTwoSum<half>(
const int nthreads, const int nthreads,
......
...@@ -5,7 +5,7 @@ namespace dragon { ...@@ -5,7 +5,7 @@ namespace dragon {
namespace kernel { namespace kernel {
/*! ImageData <Tx = ?, Ty = ?, Device = CPU> */ /* <Tx = ?, Ty = ?, Device = CPU> */
template <typename Tx, typename Ty> template <typename Tx, typename Ty>
void _ImageDataNCHW( void _ImageDataNCHW(
...@@ -55,7 +55,7 @@ void _ImageDataNHWC( ...@@ -55,7 +55,7 @@ void _ImageDataNHWC(
} }
} }
/*! ImageData <Tx = float32, Ty = float32, Device = CPU> */ /* <Tx = float32, Ty = float32, Device = CPU> */
template <> void ImageData<float, float, CPUContext>( template <> void ImageData<float, float, CPUContext>(
const int N, const int N,
...@@ -77,7 +77,7 @@ template <> void ImageData<float, float, CPUContext>( ...@@ -77,7 +77,7 @@ template <> void ImageData<float, float, CPUContext>(
} }
} }
/*! ImageData <Tx = uint8, Ty = float32, Device = CPU> */ /* <Tx = uint8, Ty = float32, Device = CPU> */
template <> void ImageData<uint8_t, float, CPUContext>( template <> void ImageData<uint8_t, float, CPUContext>(
const int N, const int N,
...@@ -99,7 +99,7 @@ template <> void ImageData<uint8_t, float, CPUContext>( ...@@ -99,7 +99,7 @@ template <> void ImageData<uint8_t, float, CPUContext>(
} }
} }
/*! ImageData <Tx = float32, Ty = float16, Device = CPU> */ /* <Tx = float32, Ty = float16, Device = CPU> */
template <> void ImageData<float, float16, CPUContext>( template <> void ImageData<float, float16, CPUContext>(
const int N, const int N,
...@@ -115,7 +115,7 @@ template <> void ImageData<float, float16, CPUContext>( ...@@ -115,7 +115,7 @@ template <> void ImageData<float, float16, CPUContext>(
CPU_FP16_NOT_SUPPORTED; CPU_FP16_NOT_SUPPORTED;
} }
/*! ImageData <Tx = uint8, Ty = float16, Device = CPU> */ /* <Tx = uint8, Ty = float16, Device = CPU> */
template <> void ImageData<uint8_t, float16, CPUContext>( template <> void ImageData<uint8_t, float16, CPUContext>(
const int N, const int N,
......
...@@ -7,7 +7,7 @@ namespace dragon { ...@@ -7,7 +7,7 @@ namespace dragon {
namespace kernel { namespace kernel {
/*! ImageData <Tx = ?, Ty = ?, Device = CUDA> */ /* <Tx = ?, Ty = ?, Device = CUDA> */
template <typename Tx, typename Ty> template <typename Tx, typename Ty>
__global__ void _ImageDataNCHW( __global__ void _ImageDataNCHW(
...@@ -60,7 +60,7 @@ __global__ void _ImageDataNHWC( ...@@ -60,7 +60,7 @@ __global__ void _ImageDataNHWC(
} }
} }
/*! ImageData <Tx = float32, Ty = float32, Device = CUDA> */ /* <Tx = float32, Ty = float32, Device = CUDA> */
template <> void ImageData<float, float, CUDAContext>( template <> void ImageData<float, float, CUDAContext>(
const int N, const int N,
...@@ -91,7 +91,7 @@ template <> void ImageData<float, float, CUDAContext>( ...@@ -91,7 +91,7 @@ template <> void ImageData<float, float, CUDAContext>(
} }
} }
/*! ImageData <Tx = uint8, Ty = float32, Device = CUDA> */ /* <Tx = uint8, Ty = float32, Device = CUDA> */
template <> void ImageData<uint8_t, float, CUDAContext>( template <> void ImageData<uint8_t, float, CUDAContext>(
const int N, const int N,
...@@ -122,7 +122,7 @@ template <> void ImageData<uint8_t, float, CUDAContext>( ...@@ -122,7 +122,7 @@ template <> void ImageData<uint8_t, float, CUDAContext>(
} }
} }
/*! ImageData <Tx = ?, Ty = float16, Device = CUDA> */ /* <Tx = ?, Ty = float16, Device = CUDA> */
template <typename Tx, typename Ty> template <typename Tx, typename Ty>
__global__ void _ImageDataHalfNCHW( __global__ void _ImageDataHalfNCHW(
...@@ -175,7 +175,7 @@ __global__ void _ImageDataHalfNHWC( ...@@ -175,7 +175,7 @@ __global__ void _ImageDataHalfNHWC(
} }
} }
/*! ImageData <Tx = float32, Ty = float16, Device = CUDA> */ /* <Tx = float32, Ty = float16, Device = CUDA> */
template <> void ImageData<float, float16, CUDAContext>( template <> void ImageData<float, float16, CUDAContext>(
const int N, const int N,
......
...@@ -16,7 +16,7 @@ namespace dragon { ...@@ -16,7 +16,7 @@ namespace dragon {
namespace kernel { namespace kernel {
/*! BatchNormBackwardTraining <T = ?, Device = CPU> */ /* <T = ?, Device = CPU> */
template <typename Tx, typename Tp, StorageOrder kOrder> template <typename Tx, typename Tp, StorageOrder kOrder>
void _BatchNormInternalGrad( void _BatchNormInternalGrad(
...@@ -68,7 +68,7 @@ void _BatchNormTrainingGrad( ...@@ -68,7 +68,7 @@ void _BatchNormTrainingGrad(
} }
} }
/*! BatchNormBackwardInference <T = ?, Device = CPU> */ /* <T = ?, Device = CPU> */
template <typename Tx, typename Tp, StorageOrder kOrder> template <typename Tx, typename Tp, StorageOrder kOrder>
void _BatchNormWGrad( void _BatchNormWGrad(
......
...@@ -19,7 +19,7 @@ namespace kernel { ...@@ -19,7 +19,7 @@ namespace kernel {
#define L(x, i) __ldg(x + i) #define L(x, i) __ldg(x + i)
/*! BatchNormBackwardTraining <T = ?, Device = CUDA> */ /* <T = ?, Device = CUDA> */
template <typename Tx, typename Tp, StorageOrder kOrder> template <typename Tx, typename Tp, StorageOrder kOrder>
__global__ void _BatchNormInternalGrad( __global__ void _BatchNormInternalGrad(
...@@ -107,7 +107,7 @@ __global__ void _BatchNormTrainingGrad( ...@@ -107,7 +107,7 @@ __global__ void _BatchNormTrainingGrad(
} }
} }
/*! BatchNormBackwardInference <T = ?, Device = CUDA> */ /* <T = ?, Device = CUDA> */
template <typename Tx, typename Tp, StorageOrder kOrder> template <typename Tx, typename Tp, StorageOrder kOrder>
__global__ void _BatchNormWGrad( __global__ void _BatchNormWGrad(
......
...@@ -20,7 +20,7 @@ namespace kernel { ...@@ -20,7 +20,7 @@ namespace kernel {
#define L(x, i) __ldg(x + i) #define L(x, i) __ldg(x + i)
#define LF(x, i) __half2float(__ldg(x + i)) #define LF(x, i) __half2float(__ldg(x + i))
/*! GroupNormForward <T = ?, Device = CUDA> */ /* <T = ?, Device = CUDA> */
template <typename T> template <typename T>
__global__ void _GroupNormFusedParams( __global__ void _GroupNormFusedParams(
...@@ -157,7 +157,7 @@ template<> __global__ void _GroupNormForwardNHWC<half, float>( ...@@ -157,7 +157,7 @@ template<> __global__ void _GroupNormForwardNHWC<half, float>(
#endif #endif
} }
/*! GroupNormBackward <T = ?, Device = CUDA> */ /* <T = ?, Device = CUDA> */
template <typename Tx, typename Tp, StorageOrder kOrder> template <typename Tx, typename Tp, StorageOrder kOrder>
__global__ void _GroupNormWGrad( __global__ void _GroupNormWGrad(
......
...@@ -5,7 +5,7 @@ namespace dragon { ...@@ -5,7 +5,7 @@ namespace dragon {
namespace kernel { namespace kernel {
/*! LSTMCell <T = float32, Device = CPU> */ /* <T = float32, Device = CPU> */
template <typename T> template <typename T>
T _s(T x) { return T(1) / (T(1) + exp(-x)); } T _s(T x) { return T(1) / (T(1) + exp(-x)); }
...@@ -34,7 +34,7 @@ template <> void LSTMCell<float, CPUContext>( ...@@ -34,7 +34,7 @@ template <> void LSTMCell<float, CPUContext>(
} }
} }
/*! LSTMCellGrad <T = float32, Device = CPU> */ /* <T = float32, Device = CPU> */
template <> void LSTMCellGrad<float, CPUContext>( template <> void LSTMCellGrad<float, CPUContext>(
const int N, const int N,
......
...@@ -7,7 +7,7 @@ namespace dragon { ...@@ -7,7 +7,7 @@ namespace dragon {
namespace kernel { namespace kernel {
/*! LSTMCell <T = float32, Device = CUDA> */ /* <T = float32, Device = CUDA> */
template <typename T> template <typename T>
__global__ void _LSTMCellAct( __global__ void _LSTMCellAct(
...@@ -70,7 +70,7 @@ template <> void LSTMCell<float, CUDAContext>( ...@@ -70,7 +70,7 @@ template <> void LSTMCell<float, CUDAContext>(
); );
} }
/*! LSTMCellGrad <T = float32, Device = CUDA> */ /* <T = float32, Device = CUDA> */
template <typename T> template <typename T>
__global__ void _LSTMCellGateGrad( __global__ void _LSTMCellGateGrad(
......
...@@ -5,7 +5,7 @@ namespace dragon { ...@@ -5,7 +5,7 @@ namespace dragon {
namespace kernel { namespace kernel {
/*! BiasAdd <T = float32, Device = CPU> */ /* <T = float32, Device = CPU> */
template<> void BiasAdd<float, CPUContext>( template<> void BiasAdd<float, CPUContext>(
const int outer_dim, const int outer_dim,
......
...@@ -7,7 +7,7 @@ namespace dragon { ...@@ -7,7 +7,7 @@ namespace dragon {
namespace kernel { namespace kernel {
/*! BiasAdd <T = float32, Device = CUDA> */ /* <T = float32, Device = CUDA> */
template <typename T> template <typename T>
__global__ void _BiasAddNCHW( __global__ void _BiasAddNCHW(
......
...@@ -4,7 +4,7 @@ namespace dragon { ...@@ -4,7 +4,7 @@ namespace dragon {
namespace kernel { namespace kernel {
/*! DepthwiseConv2d <T = float32, Device = CPU> */ /* <T = float32, Device = CPU> */
template <typename T> template <typename T>
void _DepthwiseConv2dNCHW( void _DepthwiseConv2dNCHW(
......
...@@ -8,7 +8,7 @@ namespace dragon { ...@@ -8,7 +8,7 @@ namespace dragon {
namespace kernel { namespace kernel {
/* < Device = CUDA> */ /* <Device = CUDA> */
__global__ void _DropBlock2dNCHW( __global__ void _DropBlock2dNCHW(
const int nthreads, const int nthreads,
......
...@@ -5,7 +5,7 @@ namespace dragon { ...@@ -5,7 +5,7 @@ namespace dragon {
namespace kernel { namespace kernel {
/*! ROIAlign <T = float32, Device = CPU> */ /* <T = float32, Device = CPU> */
template <typename T> template <typename T>
T _ROIAlignIntp( T _ROIAlignIntp(
...@@ -115,7 +115,7 @@ template<> void ROIAlign<float, CPUContext>( ...@@ -115,7 +115,7 @@ template<> void ROIAlign<float, CPUContext>(
} // End n } // End n
} }
/*! ROIAlign <T = float16, Device = CPU> */ /* <T = float16, Device = CPU> */
template<> void ROIAlign<float16, CPUContext>( template<> void ROIAlign<float16, CPUContext>(
const int C, const int C,
...@@ -133,7 +133,7 @@ template<> void ROIAlign<float16, CPUContext>( ...@@ -133,7 +133,7 @@ template<> void ROIAlign<float16, CPUContext>(
CPU_FP16_NOT_SUPPORTED; CPU_FP16_NOT_SUPPORTED;
} }
/*! ROIAlignGrad <T = float32, Device = CPU> */ /* <T = float32, Device = CPU> */
template<> void ROIAlignGrad<float, CPUContext>( template<> void ROIAlignGrad<float, CPUContext>(
const int C, const int C,
......
...@@ -7,7 +7,7 @@ namespace dragon { ...@@ -7,7 +7,7 @@ namespace dragon {
namespace kernel { namespace kernel {
/*! <T = float32, Device = CUDA> */ /* <T = float32, Device = CUDA> */
template <typename T> template <typename T>
__device__ T _ROIAlignIntp( __device__ T _ROIAlignIntp(
...@@ -106,10 +106,10 @@ __global__ void _ROIAlign( ...@@ -106,10 +106,10 @@ __global__ void _ROIAlign(
for (int iy = 0; iy < grid_h; iy++) { for (int iy = 0; iy < grid_h; iy++) {
const T y = roi_hstart + ph * bin_h + const T y = roi_hstart + ph * bin_h +
(T)(iy + .5f) * bin_h / (T)(grid_h); (T)(iy + .5f) * bin_h / (T)grid_h;
for (int ix = 0; ix < grid_w; ix++) { for (int ix = 0; ix < grid_w; ix++) {
const T x = roi_wstart + pw * bin_w + const T x = roi_wstart + pw * bin_w +
(T)(ix + .5f) * bin_w / (T)(grid_w); (T)(ix + .5f) * bin_w / (T)grid_w;
intp_val += _ROIAlignIntp(X, H, W, y, x); intp_val += _ROIAlignIntp(X, H, W, y, x);
} }
} }
...@@ -143,7 +143,7 @@ template<> void ROIAlign<float, CUDAContext>( ...@@ -143,7 +143,7 @@ template<> void ROIAlign<float, CUDAContext>(
); );
} }
/*! <T = float32, Device = CUDA> */ /* <T = float32, Device = CUDA> */
template <typename T> template <typename T>
__device__ void _ROIAlignIntpGrad( __device__ void _ROIAlignIntpGrad(
......
...@@ -7,10 +7,10 @@ namespace dragon { ...@@ -7,10 +7,10 @@ namespace dragon {
namespace kernel { namespace kernel {
/*! ROIAlign <T = float16, Device = CUDA> */ /* <T = float16, Device = CUDA> */
__device__ float _ROIAlignInterpolate( __device__ float _ROIAlignIntp(
const half* Xdata, const half* X,
const int H, const int H,
const int W, const int W,
float y, float y,
...@@ -41,12 +41,12 @@ __device__ float _ROIAlignInterpolate( ...@@ -41,12 +41,12 @@ __device__ float _ROIAlignInterpolate(
const float ly = y - y_low; const float ly = y - y_low;
const float lx = x - x_low; const float lx = x - x_low;
const float hy = 1. - ly, hx = 1. - lx; const float hy = 1.f - ly, hx = 1.f - lx;
const float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; const float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
const float v1 = __half2float(__ldg(Xdata + (y_low * W + x_low))); const float v1 = __half2float(__ldg(X + (y_low * W + x_low)));
const float v2 = __half2float(__ldg(Xdata + (y_low * W + x_high))); const float v2 = __half2float(__ldg(X + (y_low * W + x_high)));
const float v3 = __half2float(__ldg(Xdata + (y_high * W + x_low))); const float v3 = __half2float(__ldg(X + (y_high * W + x_low)));
const float v4 = __half2float(__ldg(Xdata + (y_high * W + x_high))); const float v4 = __half2float(__ldg(X + (y_high * W + x_high)));
const float value = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; const float value = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4;
#else #else
const float value = 0.f; const float value = 0.f;
...@@ -63,58 +63,55 @@ __global__ void _ROIAlignHalf( ...@@ -63,58 +63,55 @@ __global__ void _ROIAlignHalf(
const int pool_w, const int pool_w,
const int sampling_ratio, const int sampling_ratio,
const float spatial_scale, const float spatial_scale,
const half* Xdata, const half* xdata,
const float* rois, const float* rois,
half* Ydata) { half* ydata) {
CUDA_1D_KERNEL_LOOP(y_idx, nthreads) { CUDA_1D_KERNEL_LOOP(yi, nthreads) {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 530
int pw = y_idx % pool_w; int pw = yi % pool_w;
int ph = (y_idx / pool_w) % pool_h; int ph = (yi / pool_w) % pool_h;
int c = (y_idx / pool_w / pool_h) % C; int c = (yi / pool_w / pool_h) % C;
int n = y_idx / pool_w / pool_h / C; int n = yi / pool_w / pool_h / C;
const float* offset_rois = rois + n * 5; const float* roi = rois + n * 5;
int roi_batch_ind = offset_rois[0]; int batch_ind = roi[0];
if (roi_batch_ind < 0) { if (batch_ind < 0) {
Ydata[y_idx] = __float2half(0.f); ydata[yi] = __float2half(0.f);
continue; continue;
} }
float roi_start_w = offset_rois[1] * spatial_scale; float roi_wstart = roi[1] * spatial_scale;
float roi_start_h = offset_rois[2] * spatial_scale; float roi_hstart = roi[2] * spatial_scale;
float roi_end_w = offset_rois[3] * spatial_scale; float roi_wend = roi[3] * spatial_scale;
float roi_end_h = offset_rois[4] * spatial_scale; float roi_hend = roi[4] * spatial_scale;
float roi_width = max(roi_end_w - roi_start_w, 1.f); float roi_w = max(roi_wend - roi_wstart, 1.f);
float roi_height = max(roi_end_h - roi_start_h, 1.f); float roi_h = max(roi_hend - roi_hstart, 1.f);
float bin_size_h = (float)roi_height / (float)pool_h; float bin_h = roi_h / (float)pool_h;
float bin_size_w = (float)roi_width / (float)pool_w; float bin_w = roi_w / (float)pool_w;
const half* offset_Xdata = Xdata + (roi_batch_ind * C + c) * H * W; const half* X = xdata + (batch_ind * C + c) * H * W;
int roi_bin_grid_h = (sampling_ratio > 0) ? int grid_h = (sampling_ratio > 0) ?
sampling_ratio : ceil(roi_height / pool_h); sampling_ratio : ceil(roi_h / pool_h);
int roi_bin_grid_w = (sampling_ratio > 0) ? int grid_w = (sampling_ratio > 0) ?
sampling_ratio : ceil(roi_width / pool_w); sampling_ratio : ceil(roi_w / pool_w);
float output_val = 0.; float intp_val = 0.f;
const float num_bin_grids = roi_bin_grid_h * roi_bin_grid_w;
for (int iy = 0; iy < grid_h; iy++) {
for (int iy = 0; iy < roi_bin_grid_h; iy++) { const float y = roi_hstart + ph * bin_h +
const float y = roi_start_h + ph * bin_size_h + (float)(iy + .5f) * bin_h / (float)grid_h;
static_cast<float>(iy + .5f) * bin_size_h / for (int ix = 0; ix < grid_w; ix++) {
static_cast<float>(roi_bin_grid_h); const float x = roi_wstart + pw * bin_w +
for (int ix = 0; ix < roi_bin_grid_w; ix++) { (float)(ix + .5f) * bin_w / (float)grid_w;
const float x = roi_start_w + pw * bin_size_w + intp_val += _ROIAlignIntp(X, H, W, y, x);
static_cast<float>(ix + .5f) * bin_size_w /
static_cast<float>(roi_bin_grid_w);
output_val += _ROIAlignInterpolate(
offset_Xdata, H, W, y, x);
} }
} }
output_val /= num_bin_grids; ydata[yi] = __float2half(
Ydata[y_idx] = __float2half(output_val); intp_val / float(grid_h * grid_w)
);
#endif #endif
} }
} }
...@@ -135,11 +132,16 @@ template<> void ROIAlign<float16, CUDAContext>( ...@@ -135,11 +132,16 @@ template<> void ROIAlign<float16, CUDAContext>(
auto nthreads = num_rois * C * pool_h * pool_w; auto nthreads = num_rois * C * pool_h * pool_w;
_ROIAlignHalf _ROIAlignHalf
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS, <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>> 0, ctx->cuda_stream() >>>(
(nthreads, C, H, W, pool_h, pool_w, nthreads,
sampling_ratio, spatial_scale, C, H, W,
reinterpret_cast<const half*>(x), rois, pool_h, pool_w,
reinterpret_cast<half*>(y)); sampling_ratio,
spatial_scale,
reinterpret_cast<const half*>(x),
rois,
reinterpret_cast<half*>(y)
);
} }
} // namespace kernel } // namespace kernel
......
...@@ -5,7 +5,7 @@ namespace dragon { ...@@ -5,7 +5,7 @@ namespace dragon {
namespace kernel { namespace kernel {
/*! ROIPool <T = float32, Device = CPU> */ /* <T = float32, Device = CPU> */
template<> void ROIPool<float, CPUContext>( template<> void ROIPool<float, CPUContext>(
const int C, const int C,
...@@ -83,7 +83,7 @@ template<> void ROIPool<float, CPUContext>( ...@@ -83,7 +83,7 @@ template<> void ROIPool<float, CPUContext>(
} // End n } // End n
} }
/*! ROIPool <T = float16, Device = CPU> */ /* <T = float16, Device = CPU> */
template<> void ROIPool<float16, CPUContext>( template<> void ROIPool<float16, CPUContext>(
const int C, const int C,
...@@ -101,7 +101,7 @@ template<> void ROIPool<float16, CPUContext>( ...@@ -101,7 +101,7 @@ template<> void ROIPool<float16, CPUContext>(
CPU_FP16_NOT_SUPPORTED; CPU_FP16_NOT_SUPPORTED;
} }
/*! ROIPoolGrad <T = float32, Device = CPU> */ /* <T = float32, Device = CPU> */
template<> void ROIPoolGrad<float, CPUContext>( template<> void ROIPoolGrad<float, CPUContext>(
const int N, const int N,
......
...@@ -52,7 +52,9 @@ DEPLOY_CUDA(Arange); ...@@ -52,7 +52,9 @@ DEPLOY_CUDA(Arange);
#endif #endif
OPERATOR_SCHEMA(Arange) OPERATOR_SCHEMA(Arange)
.NumInputs(0).NumOutputs(1); .NumInputs(0)
/* Y */
.NumOutputs(1);
NO_GRADIENT(Arange); NO_GRADIENT(Arange);
......
...@@ -16,7 +16,7 @@ namespace dragon { ...@@ -16,7 +16,7 @@ namespace dragon {
template <class Context> template <typename T> template <class Context> template <typename T>
void IndexSelectOp<Context>::RunImpl() { void IndexSelectOp<Context>::RunImpl() {
auto* x = X(0).template data<T, Context>(); auto* x = X(0).template data<T, Context>();
auto* i = X(1).template mutable_data<int64_t, Context>(); auto* i = X(1).template data<int64_t, Context>();
auto* y = Y(0)->template mutable_data<T, Context>(); auto* y = Y(0)->template mutable_data<T, Context>();
kernel::IndexSelect( kernel::IndexSelect(
...@@ -67,12 +67,6 @@ void IndexSelectGradientOp<Context>::RunImpl() { ...@@ -67,12 +67,6 @@ void IndexSelectGradientOp<Context>::RunImpl() {
auto* dy = X(2).template data<T, Context>(); auto* dy = X(2).template data<T, Context>();
auto* dx = Y(0)->template mutable_data<T, Context>(); auto* dx = Y(0)->template mutable_data<T, Context>();
math::Set(
X(0).count(),
cast::to<T>(0.f),
dx, ctx()
);
kernel::IndexSelectGrad( kernel::IndexSelectGrad(
outer_dim_, outer_dim_,
inner_dim_, inner_dim_,
......
#include "core/workspace.h"
#include "utils/op_kernel.h"
#include "utils/math_utils.h"
#include "utils/math_functions.h"
#include "operators/array/masked_select_op.h"
namespace dragon {
template <class Context> template <typename T>
void MaskedSelectOp<Context>::RunImpl() {
auto* x = X(0).template data<T, Context>();
auto* mask = X(1).template raw_data<Context>();
auto* scratch = ws()->CreateTensor("/share/data");
auto* indices = ws()->CreateTensor(unique_name("indices"));
kernel::MaskedSelect(
X(0).count(),
(const uint8_t*)mask, x,
indices, scratch,
Y(0), ctx()
);
}
template <class Context>
void MaskedSelectOp<Context>::RunOnDevice() {
CHECK_EQ(X(0).count(), X(1).count())
<< "\nSize of mask and input should be equal.";
CHECK(XIsType(X(1), bool) || XIsType(X(1), uint8_t))
<< "\nExcepted bool or uint8 mask.";
DispatchHelper<TensorTypes
<bool, int8_t, uint8_t, int, int64_t,
float16, float, double>
>::Call(this, X(0));
}
template <class Context> template <typename T>
void MaskedSelectGradientOp<Context>::RunImpl() {
auto* dy = X(1).template data<T, Context>();
auto* dx = Y(0)->template mutable_data<T, Context>();
auto* i = ws()
->GetTensor(unique_name("indices"))
->template data<int64_t, Context>();
kernel::MaskedSelectGrad(
X(0).count(),
X(1).count(),
i, dy,
dx, ctx()
);
}
template <class Context>
void MaskedSelectGradientOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0));
DispatchHelper<TensorTypes
<bool, int8_t, uint8_t, int, int64_t,
float16, float, double>
>::Call(this, X(0));
}
DEPLOY_CPU(MaskedSelect);
#ifdef WITH_CUDA
DEPLOY_CUDA(MaskedSelect);
#endif
DEPLOY_CPU(MaskedSelectGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(MaskedSelectGradient);
#endif
OPERATOR_SCHEMA(MaskedSelect)
/* X, M */
.NumInputs(2)
/* Y */
.NumOutputs(1);
OPERATOR_SCHEMA(MaskedSelectGradient)
/* X, dY */
.NumInputs(2)
/* dX */
.NumOutputs(1);
namespace {
class GradientMaker final : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GradientMaker);
vector<OperatorDef> MakeDef() override {
return SingleDef(def.type() + "Gradient", "",
vector<string>({ I(0), GO(0) }),
vector<string>({ GI(0)} )
);
}
};
} // namespace
REGISTER_GRADIENT(MaskedSelect, GradientMaker);
} // namespace dragon
\ No newline at end of file
#include "core/workspace.h"
#include "utils/op_kernel.h"
#include "utils/math_functions.h"
#include "operators/array/non_zero_op.h"
#define TENSOR_FROM_VEC(tensor, vec, T) \
{ \
tensor.Reshape({ (int64_t)vec.size() }); \
auto* data = tensor.template mutable_data<T, CPUContext>(); \
for (int i = 0; i < vec.size(); i++) data[i] = (T)vec[i]; \
}
namespace dragon {
template <class Context> template <typename T>
void NonZeroOp<Context>::RunImpl() {
auto ndim = X(0).ndim();
auto nelements = X(0).count();
auto* x = X(0).template data<T, Context>();
auto* scratch = ws()->CreateTensor("/share/data");
auto* indices = ws()->CreateTensor("/share/buffer/grad:0");
auto* mask = ws()
->CreateTensor("/share/buffer/grad:1")
->Reshape({ nelements })
->template mutable_data<uint8_t, Context>();
kernel::NotZero(nelements, x, (bool*)mask, ctx());
kernel::MaskedSelect(
nelements,
mask, (T*)nullptr,
indices, scratch,
(Tensor*)nullptr, ctx()
);
nelements = indices->count();
auto* y = Y(0)
->Reshape({ nelements, (int64_t)ndim })
->template mutable_data<int64_t, Context>();
kernel::UnravelIndex(
nelements, ndim,
X_dims_.template data<int, Context>(),
indices->template data<int64_t, Context>(),
y, ctx()
);
}
template <class Context>
void NonZeroOp<Context>::RunOnDevice() {
TENSOR_FROM_VEC(X_dims_, X(0).dims(), int);
DispatchHelper<TensorTypes
<bool, int8_t, uint8_t, int, int64_t,
float16, float, double>
>::Call(this, X(0));
}
DEPLOY_CPU(NonZero);
#ifdef WITH_CUDA
DEPLOY_CUDA(NonZero);
#endif
OPERATOR_SCHEMA(NonZero)
/* X */
.NumInputs(1)
/* Y */
.NumOutputs(1);
NO_GRADIENT(NonZero);
} // namespace dragon
\ No newline at end of file
#include "core/workspace.h"
#include "utils/op_kernel.h"
#include "utils/math_utils.h"
#include "utils/math_functions.h"
#include "operators/array/where_op.h"
namespace dragon {
template <class Context> template <typename T>
void WhereOp<Context>::RunImpl() {
const T *a = nullptr, *b = nullptr;
auto* mask = X(2).template raw_data<Context>();
if (X(0).count() < X(1).count()) {
int rows, cols;
Y(0)->ReshapeLike(X(1));
a = ws()
->template data<T, Context>
({ X(1).count() })[0];
b = X(1).template data<T, Context>();
auto* ra = X(0).template data<T, Context>();
if (utils::IsRowwiseBroadcast(
X(0).dims(), X(1).dims(),
&rows, &cols)) {
math::BroadcastSet(
rows, cols, 0, ra,
const_cast<T*>(a), ctx()
);
} else if (utils::IsColwiseBroadcast(
X(0).dims(), X(1).dims(),
&rows, &cols)) {
math::BroadcastSet(
rows, cols, 1, ra,
const_cast<T*>(a), ctx()
);
} else {
LOG(FATAL)
<< "Could not broadcast "
<< X(0).DimString()
<< " to "
<< X(1).DimString();
}
} else if (X(0).count() > X(1).count()) {
int rows, cols;
Y(0)->ReshapeLike(X(0));
b = ws()
->template data<T, Context>
({ X(0).count() })[0];
a = X(0).template data<T, Context>();
auto* rb = X(1).template data<T, Context>();
if (utils::IsRowwiseBroadcast(
X(0).dims(), X(1).dims(),
&rows, &cols)) {
math::BroadcastSet(
rows, cols, 0, rb,
const_cast<T*>(b), ctx()
);
} else if (utils::IsColwiseBroadcast(
X(0).dims(), X(1).dims(),
&rows, &cols)) {
math::BroadcastSet(
rows, cols, 1, rb,
const_cast<T*>(b), ctx()
);
} else {
LOG(FATAL)
<< "Could not broadcast "
<< X(1).DimString()
<< " to "
<< X(0).DimString();
}
} else {
Y(0)->ReshapeLike(X(0));
a = X(0).template data<T, Context>();
b = X(1).template data<T, Context>();
}
CHECK_EQ(Y(0)->count(), X(2).count())
<< "\nSize of mask and input should be equal.";
auto* y = Y(0)->template mutable_data<T, Context>();
kernel::Where(
Y(0)->count(),
(const uint8_t*)mask,
a, b, y, ctx()
);
}
template <class Context>
void WhereOp<Context>::RunOnDevice() {
CHECK(XIsType(X(2), bool) || XIsType(X(2), uint8_t))
<< "\nExcepted bool or uint8 mask.";
DispatchHelper<TensorTypes
<bool, int8_t, uint8_t, int, int64_t,
float16, float, double>
>::Call(this, X(0));
}
template <class Context> template <typename T>
void WhereGradientOp<Context>::RunImpl() {
T *da = nullptr, *db = nullptr;
auto* dy = X(-1).template data<T, Context>();
auto* mask = X(2).template raw_data<Context>();
int rows, cols, type;
if (utils::IsRowwiseBroadcast(
X(0).dims(), X(1).dims(),
&rows, &cols)) {
type = 0;
} else if (utils::IsColwiseBroadcast(
X(0).dims(), X(1).dims(),
&rows, &cols)) {
type = 1;
}
vec32_t dims = { rows, cols };
vec32_t axes = { type };
if (X(0).count() < X(1).count()) {
da = ws()
->template data<T, Context>
({ X(1).count() })[0];
db = Y(1)->template mutable_data<T, Context>();
auto* ra = Y(0)->template mutable_data<T, Context>();
kernel::WhereGrad(
X(-1).count(),
(const uint8_t*)mask,
dy, da, db, ctx()
);
kernel::ReduceSum(
2, dims.data(),
1, axes.data(),
1.f, da,
ra, ctx()
);
} else if (X(0).count() > X(1).count()) {
db = ws()
->template data<T, Context>
({ X(0).count() })[0];
da = Y(0)->template mutable_data<T, Context>();
auto* rb = Y(1)->template mutable_data<T, Context>();
kernel::WhereGrad(
X(-1).count(),
(const uint8_t*)mask,
dy, da, db, ctx()
);
kernel::ReduceSum(
2, dims.data(),
1, axes.data(),
1.f, db,
rb, ctx()
);
} else {
da = Y(0)->template mutable_data<T, Context>();
db = Y(1)->template mutable_data<T, Context>();
kernel::WhereGrad(
Y(0)->count(),
(const uint8_t*)mask,
dy, da, db, ctx()
);
}
}
template <class Context>
void WhereGradientOp<Context>::RunOnDevice() {
CHECK_EQ(X(-1).count(), X(2).count())
<< "\nSize of mask and input should be equal.";
Y(0)->ReshapeLike(X(0));
Y(1)->ReshapeLike(X(1));
CHECK(XIsType(X(2), bool) || XIsType(X(2), uint8_t))
<< "\nExcepted bool or uint8 mask.";
DispatchHelper<TensorTypes
<int8_t, uint8_t, int, int64_t,
float16, float, double>
>::Call(this, X(0));
}
DEPLOY_CPU(Where);
#ifdef WITH_CUDA
DEPLOY_CUDA(Where);
#endif
DEPLOY_CPU(WhereGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(WhereGradient);
#endif
OPERATOR_SCHEMA(Where)
/* A, B, M */
.NumInputs(3)
/* Y */
.NumOutputs(1);
OPERATOR_SCHEMA(WhereGradient)
/* A, B, M, dY */
.NumInputs(4)
/* dA, dB */
.NumOutputs(2);
namespace {
class GradientMaker : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GradientMaker);
vector<OperatorDef> MakeDef() override {
return SingleDef(def.type() + "Gradient", "",
vector<string>({ I(0), I(1), I(2), GO(0) }),
vector<string>({ GI(0), GI(1) })
);
}
};
} // namespace
REGISTER_GRADIENT(Where, GradientMaker);
} // namespace dragon
\ No newline at end of file
Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!