Commit 5cd0761b by Ting PAN

Unlock CUDA Async Streams

1 parent 3b990761
Showing with 920 additions and 882 deletions
...@@ -52,9 +52,9 @@ using Set = std::unordered_set<Value> ; ...@@ -52,9 +52,9 @@ using Set = std::unordered_set<Value> ;
/* /*
* Define the Kernel version. * Define the Kernel version.
* *
* | Major(2) | Minor(2) | Patch(10) | * | Major(2) | Minor(2) | Patch(11) |
*/ */
#define DRAGON_VERSION 2210 #define DRAGON_VERSION 2211
/* /*
* Define the default random seed. * Define the default random seed.
......
...@@ -34,6 +34,8 @@ class CPUContext { ...@@ -34,6 +34,8 @@ class CPUContext {
virtual ~CPUContext() {} virtual ~CPUContext() {}
inline void SwitchToDevice() {} inline void SwitchToDevice() {}
inline void SwitchToDevice(int stream_id) {}
inline void FinishDeviceCompution() {} inline void FinishDeviceCompution() {}
inline static void* New(size_t nbytes) { inline static void* New(size_t nbytes) {
...@@ -47,7 +49,15 @@ class CPUContext { ...@@ -47,7 +49,15 @@ class CPUContext {
return data; return data;
} }
inline static void Memset(size_t nbytes, void* ptr) { inline static void Memset(
size_t nbytes,
void* ptr) {
memset(ptr, 0, nbytes);
}
inline void MemsetAsync(
size_t nbytes,
void* ptr) {
memset(ptr, 0, nbytes); memset(ptr, 0, nbytes);
} }
...@@ -59,18 +69,16 @@ class CPUContext { ...@@ -59,18 +69,16 @@ class CPUContext {
memcpy(dst, src, nbytes); memcpy(dst, src, nbytes);
} }
inline static void Delete(void* data) { free(data); }
template<class DstContext, class SrcContext> template<class DstContext, class SrcContext>
inline static void MemcpyAsync( inline void MemcpyAsync(
size_t nbytes, size_t nbytes,
void* dst, void* dst,
const void* src) { const void* src) {
NOT_IMPLEMENTED; memcpy(dst, src, nbytes);
} }
template<typename T, class DstContext, class SrcContext> template<typename T, class DstContext, class SrcContext>
inline static void Copy( inline void Copy(
int n, int n,
T* dst, T* dst,
const T* src) { const T* src) {
...@@ -82,7 +90,10 @@ class CPUContext { ...@@ -82,7 +90,10 @@ class CPUContext {
else for (int i = 0; i < n; i++) dst[i] = src[i]; else for (int i = 0; i < n; i++) dst[i] = src[i];
} }
inline static void Delete(void* data) { free(data); }
inline int device_id() const { return 0; } inline int device_id() const { return 0; }
inline void set_stream_id(int stream_id) {}
inline std::mt19937* rand_generator() { inline std::mt19937* rand_generator() {
if (!rand_generator_.get()) if (!rand_generator_.get())
......
...@@ -23,8 +23,7 @@ namespace dragon { ...@@ -23,8 +23,7 @@ namespace dragon {
class CUDAObject { class CUDAObject {
public: public:
CUDAObject(int default_stream = 1) CUDAObject() {
: default_stream(default_stream) {
for (int i = 0; i < CUDA_MAX_DEVICES; i++) { for (int i = 0; i < CUDA_MAX_DEVICES; i++) {
cuda_streams[i] = vector<cudaStream_t>(); cuda_streams[i] = vector<cudaStream_t>();
cublas_handles[i] = vector<cublasHandle_t>(); cublas_handles[i] = vector<cublasHandle_t>();
...@@ -38,7 +37,7 @@ class CUDAObject { ...@@ -38,7 +37,7 @@ class CUDAObject {
for (int i = 0; i < CUDA_MAX_DEVICES; i++) { for (int i = 0; i < CUDA_MAX_DEVICES; i++) {
for (int j = 0; j < cuda_streams[i].size(); j++) { for (int j = 0; j < cuda_streams[i].size(); j++) {
auto& stream = cuda_streams[i][j]; auto& stream = cuda_streams[i][j];
// follow caffe2, do not check the stream destroying // follow the caffe2, do not check the stream destroying
// Error code 29 (driver shutting down) is inevitable // Error code 29 (driver shutting down) is inevitable
// TODO(PhyscalX): Can someone solve this issue? // TODO(PhyscalX): Can someone solve this issue?
if (stream) cudaStreamDestroy(stream); if (stream) cudaStreamDestroy(stream);
...@@ -52,19 +51,21 @@ class CUDAObject { ...@@ -52,19 +51,21 @@ class CUDAObject {
} }
} }
/** // follow the caffe2,
* Each device takes a group of streams. // each device takes a group of non-bl0cking streams
* // the stream 0 is reserved for default stream,
* The stream 0 is reserved for default stream, // as some computations really require it,
* stream 1 or higher is created as ``cudaStreamNonBlocking``. // e.g. cublas.asum() and mixed cpu/cuda operations
*/ // besides, somes calls, such as cudnn.conv() and cudnn.rnn(),
// produce wrong results if running them on non-blocking streams
// note that caffe2 also use default streams (within CuDNNState)
cudaStream_t GetStream(int device_id, int stream_id) { cudaStream_t GetStream(int device_id, int stream_id) {
vector<cudaStream_t>& dev_streams = cuda_streams[device_id]; vector<cudaStream_t>& dev_streams = cuda_streams[device_id];
if (dev_streams.size() <= (unsigned)stream_id) if (dev_streams.size() <= (unsigned)stream_id)
dev_streams.resize(stream_id + 1, nullptr); dev_streams.resize(stream_id + 1, nullptr);
if (!dev_streams[stream_id]) { if (!dev_streams[stream_id]) {
DeviceGuard guard(device_id); DeviceGuard guard(device_id);
unsigned int flags = !stream_id && default_stream ? unsigned int flags = !stream_id ?
cudaStreamDefault : cudaStreamNonBlocking; cudaStreamDefault : cudaStreamNonBlocking;
CUDA_CHECK(cudaStreamCreateWithFlags( CUDA_CHECK(cudaStreamCreateWithFlags(
&dev_streams[stream_id], flags)); &dev_streams[stream_id], flags));
...@@ -102,8 +103,6 @@ class CUDAObject { ...@@ -102,8 +103,6 @@ class CUDAObject {
} }
#endif #endif
int default_stream;
vector<cudaStream_t> cuda_streams[CUDA_MAX_DEVICES]; vector<cudaStream_t> cuda_streams[CUDA_MAX_DEVICES];
vector<cublasHandle_t> cublas_handles[CUDA_MAX_DEVICES]; vector<cublasHandle_t> cublas_handles[CUDA_MAX_DEVICES];
#ifdef WITH_CUDNN #ifdef WITH_CUDNN
...@@ -129,11 +128,10 @@ class CUDAContext { ...@@ -129,11 +128,10 @@ class CUDAContext {
stream_id_ = stream_id; stream_id_ = stream_id;
} }
inline void SwitchToDevice() { SwitchToDevice(0); } inline void SwitchToDevice() { SwitchToDevice(1); }
inline void FinishDeviceCompution() { inline void FinishDeviceCompution() {
cudaStreamSynchronize(cuda_object_ cudaStreamSynchronize(cuda_stream());
.GetStream(device_id_, stream_id_));
cudaError_t error = cudaGetLastError(); cudaError_t error = cudaGetLastError();
CHECK_EQ(error, cudaSuccess) CHECK_EQ(error, cudaSuccess)
<< "\nCUDA Error: " << cudaGetErrorString(error); << "\nCUDA Error: " << cudaGetErrorString(error);
...@@ -147,8 +145,17 @@ class CUDAContext { ...@@ -147,8 +145,17 @@ class CUDAContext {
return data; return data;
} }
inline static void Memset(size_t nbytes, void* ptr) { inline static void Memset(
cudaMemset(ptr, 0, nbytes); size_t nbytes,
void* ptr) {
CUDA_CHECK(cudaMemset(ptr, 0, nbytes));
}
inline void MemsetAsync(
size_t nbytes,
void* ptr) {
CUDA_CHECK(cudaMemsetAsync(ptr, 0,
nbytes, cuda_stream()));
} }
template<class DstContext, class SrcContext> template<class DstContext, class SrcContext>
...@@ -169,20 +176,22 @@ class CUDAContext { ...@@ -169,20 +176,22 @@ class CUDAContext {
cudaMemcpyDefault, cuda_stream())); cudaMemcpyDefault, cuda_stream()));
} }
inline static void Delete(void* data) { cudaFree(data); }
template<typename T, class DstContext, class SrcContext> template<typename T, class DstContext, class SrcContext>
static void Copy( inline void Copy(
int n, int n,
T* dst, T* dst,
const T* src) { const T* src) {
if (dst == src) return; if (dst == src) return;
Memcpy<SrcContext, DstContext>( MemcpyAsync<SrcContext, DstContext>(
n * sizeof(T), (void*)dst, (const void*)src); n * sizeof(T), (void*)dst, (const void*)src);
} }
inline static void Delete(void* data) { cudaFree(data); }
inline int device_id() const { return device_id_; } inline int device_id() const { return device_id_; }
inline void set_stream_id(int stream_id) { stream_id_ = stream_id; }
inline cudaStream_t cuda_stream() { inline cudaStream_t cuda_stream() {
return cuda_stream(device_id_, stream_id_); return cuda_stream(device_id_, stream_id_);
} }
...@@ -227,7 +236,7 @@ class CUDAContext { ...@@ -227,7 +236,7 @@ class CUDAContext {
static thread_local CUDAObject cuda_object_; static thread_local CUDAObject cuda_object_;
private: private:
int device_id_, stream_id_ = 0, random_seed_; int device_id_, stream_id_ = 1, random_seed_;
unique_ptr<std::mt19937> rand_generator_; unique_ptr<std::mt19937> rand_generator_;
curandGenerator_t curand_generator_ = nullptr; curandGenerator_t curand_generator_ = nullptr;
}; };
...@@ -271,7 +280,7 @@ class CUDAClosure { ...@@ -271,7 +280,7 @@ class CUDAClosure {
protected: protected:
Context* ctx_; Context* ctx_;
CUDAObject cuda_object_ = 0; CUDAObject cuda_object_;
vector<int> active_streams_; vector<int> active_streams_;
}; };
...@@ -283,8 +292,22 @@ class CUDAContext { ...@@ -283,8 +292,22 @@ class CUDAContext {
CUDAContext(const int device_id = 0) { CUDA_NOT_COMPILED; } CUDAContext(const int device_id = 0) { CUDA_NOT_COMPILED; }
inline void SwitchToDevice() { CUDA_NOT_COMPILED; } inline void SwitchToDevice() { CUDA_NOT_COMPILED; }
inline void SwitchToDevice(int stream_id) { CUDA_NOT_COMPILED; }
inline void FinishDeviceCompution() { CUDA_NOT_COMPILED; } inline void FinishDeviceCompution() { CUDA_NOT_COMPILED; }
inline static void Memset(
size_t nbytes,
void* ptr) {
CUDA_NOT_COMPILED;
}
inline void MemsetAsync(
size_t nbytes,
void* ptr) {
CUDA_NOT_COMPILED;
}
template<class DstContext, class SrcContext> template<class DstContext, class SrcContext>
inline static void Memcpy( inline static void Memcpy(
size_t nbytes, size_t nbytes,
...@@ -302,6 +325,7 @@ class CUDAContext { ...@@ -302,6 +325,7 @@ class CUDAContext {
} }
inline int device_id() const { return 0; } inline int device_id() const { return 0; }
inline void set_stream_id(int stream_id) {}
}; };
#endif // WITH_CUDA #endif // WITH_CUDA
......
...@@ -37,7 +37,8 @@ class GraphBase { ...@@ -37,7 +37,8 @@ class GraphBase {
virtual bool Run( virtual bool Run(
const string& include, const string& include,
const string& exclude) = 0; const string& exclude,
const int stream_id = 1) = 0;
inline string name() const { return name_; } inline string name() const { return name_; }
...@@ -58,7 +59,8 @@ class Graph final : public GraphBase { ...@@ -58,7 +59,8 @@ class Graph final : public GraphBase {
bool Run( bool Run(
const string& include, const string& include,
const string& exclude) override; const string& exclude,
const int stream_id = 1) override;
GraphDef Prune(const GraphDef& meta_graph); GraphDef Prune(const GraphDef& meta_graph);
GraphDef MakeUpdate(const GraphDef& meta_graph); GraphDef MakeUpdate(const GraphDef& meta_graph);
......
...@@ -44,7 +44,7 @@ class OperatorBase { ...@@ -44,7 +44,7 @@ class OperatorBase {
const string& anchor); const string& anchor);
inline void SwitchToPhase(const string& phase) { phase_ = phase; } inline void SwitchToPhase(const string& phase) { phase_ = phase; }
virtual void Run() { NOT_IMPLEMENTED; } virtual void Run(int stream_id = 1) { NOT_IMPLEMENTED; }
inline const string& name() const { return def_.name(); } inline const string& name() const { return def_.name(); }
inline const string& type() const { return def_.type(); } inline const string& type() const { return def_.type(); }
...@@ -100,13 +100,13 @@ class Operator : public OperatorBase { ...@@ -100,13 +100,13 @@ class Operator : public OperatorBase {
Output(0)->name() == "ignore")); Output(0)->name() == "ignore"));
} }
virtual void Run() final { void Run(int stream_id = 1) final {
if (!allow_run_) return; if (!allow_run_) return;
if (allow_recompute_) MakeResource(); if (allow_recompute_) MakeResource();
ctx().SwitchToDevice(); ctx()->SwitchToDevice(stream_id);
MemorySwitch(); MemorySwitch();
RunOnDevice(); RunOnDevice();
if (do_sync_) ctx().FinishDeviceCompution(); if (do_sync_) ctx()->FinishDeviceCompution();
if (allow_recompute_) CleanResource(); if (allow_recompute_) CleanResource();
} }
...@@ -123,7 +123,7 @@ class Operator : public OperatorBase { ...@@ -123,7 +123,7 @@ class Operator : public OperatorBase {
virtual void RunOnDevice() = 0; virtual void RunOnDevice() = 0;
inline Context& ctx() { return ctx_; } inline Context* ctx() { return &ctx_; }
inline bool AllowRun() { return allow_run_; } inline bool AllowRun() { return allow_run_; }
protected: protected:
...@@ -192,6 +192,27 @@ DECLARE_REGISTRY( ...@@ -192,6 +192,27 @@ DECLARE_REGISTRY(
const OperatorDef&, const OperatorDef&,
Workspace*); Workspace*);
#define TENSOR_FILL_WITH_TYPE(tensor, shape, type) \
if (tensor.count() == 0) { \
CHECK(ws()->GetFiller(tensor.name())) \
<< "\nTensor(" << tensor.name() << ") is empty. \n" \
<< "may be specify a filler for it ?"; \
tensor.Reshape(shape); \
unique_ptr< Filler<type, Context> > filler( \
CreateFiller<type, Context>(*ws()->GetFiller(tensor.name()))); \
filler->Fill(&tensor, ctx()); \
ctx()->FinishDeviceCompution(); \
} else { \
TIndex count = 1; \
for(int i = 0; i < shape.size(); i++) count *= shape[i]; \
CHECK_EQ(count, tensor.count()) \
<< "\nModel request " << "Tensor(" << tensor.name() << ")'s " \
<< "size is " << count << ", \n" \
<< "but now is " << tensor.count() << ", " \
<< "did you feed the incorrect Tensor before ?"; \
tensor.Reshape(shape); \
}
#define TENSOR_FILL(tensor, shape) \ #define TENSOR_FILL(tensor, shape) \
if (tensor.count() == 0) { \ if (tensor.count() == 0) { \
CHECK(ws()->GetFiller(tensor.name())) \ CHECK(ws()->GetFiller(tensor.name())) \
...@@ -200,7 +221,8 @@ DECLARE_REGISTRY( ...@@ -200,7 +221,8 @@ DECLARE_REGISTRY(
tensor.Reshape(shape); \ tensor.Reshape(shape); \
unique_ptr< Filler<T, Context> > filler( \ unique_ptr< Filler<T, Context> > filler( \
CreateFiller<T, Context>(*ws()->GetFiller(tensor.name()))); \ CreateFiller<T, Context>(*ws()->GetFiller(tensor.name()))); \
filler->Fill(&tensor, &ctx()); \ filler->Fill(&tensor, ctx()); \
ctx()->FinishDeviceCompution(); \
} else { \ } else { \
TIndex count = 1; \ TIndex count = 1; \
for(int i = 0; i < shape.size(); i++) count *= shape[i]; \ for(int i = 0; i < shape.size(); i++) count *= shape[i]; \
...@@ -217,7 +239,7 @@ DECLARE_REGISTRY( ...@@ -217,7 +239,7 @@ DECLARE_REGISTRY(
if (size > ptr_tensor->count()) { \ if (size > ptr_tensor->count()) { \
ptr_tensor->Reshape({ size }); \ ptr_tensor->Reshape({ size }); \
math::Set<T, Context>(size, dragon_cast<T, float>(1.f), \ math::Set<T, Context>(size, dragon_cast<T, float>(1.f), \
ptr_tensor->template mutable_data<T, Context>()); \ ptr_tensor->template mutable_data<T, Context>(), ctx()); \
} \ } \
} }
......
...@@ -74,7 +74,9 @@ class Tensor { ...@@ -74,7 +74,9 @@ class Tensor {
for (TIndex i = start; i < end; i++) ret *= dim(i); for (TIndex i = start; i < end; i++) ret *= dim(i);
return ret; return ret;
} }
inline TIndex count() const { return size_; } inline TIndex count() const { return size_; }
inline TIndex count(const TIndex start) const { inline TIndex count(const TIndex start) const {
return count(start, ndim()); return count(start, ndim());
} }
...@@ -197,7 +199,7 @@ class Tensor { ...@@ -197,7 +199,7 @@ class Tensor {
mutable_data_ptr<Context>(&data_ptr); mutable_data_ptr<Context>(&data_ptr);
// call the constructors // call the constructors
if (meta.ctor()) meta_.ctor()(data_ptr, size_); if (meta.ctor()) meta_.ctor()(data_ptr, size_);
capacity_ = size_ * meta.itemsize(); capacity_ = size_ * meta.itemsize(), require_init_ = true;
return data_ptr; return data_ptr;
} }
...@@ -225,6 +227,15 @@ class Tensor { ...@@ -225,6 +227,15 @@ class Tensor {
} }
template <typename T, class Context> template <typename T, class Context>
T* mutable_data(Context* ctx) {
auto* data = mutable_data<T, Context>();
if (!require_init_) return data;
ctx->MemsetAsync(nbytes(), (void*)data);
require_init_ = false;
return data;
}
template <typename T, class Context>
const T* data() const { const T* data() const {
CHECK(meta_ == TypeMeta::Make<T>()) CHECK(meta_ == TypeMeta::Make<T>())
<< "\nThe DType of Tensor(" << name() << ") is " << "\nThe DType of Tensor(" << name() << ") is "
...@@ -234,27 +245,31 @@ class Tensor { ...@@ -234,27 +245,31 @@ class Tensor {
} }
template <class Context> template <class Context>
inline void CopyFrom(const Tensor& other) { inline void CopyFrom(const Tensor& other, Context* ctx) {
if ((void*)&other == (void*)this) return;
CHECK_EQ(size_, other.size_); CHECK_EQ(size_, other.size_);
auto* src = other.template raw_data<Context>(); auto* src = other.template raw_data<Context>();
auto* dst = raw_mutable_data<Context>(other.meta_); auto* dst = raw_mutable_data<Context>(other.meta_);
if (dst == src) return; ctx->template MemcpyAsync<Context, Context>(
if (TypeMeta::Id<Context>() == nbytes(), dst, src);
TypeMeta::Id<CPUContext>()) { require_init_ = false;
CPUContext::Memcpy<Context, Context>(nbytes(), dst, src);
} else if (TypeMeta::Id<Context>() ==
TypeMeta::Id<CUDAContext>()) {
CUDAContext::Memcpy<Context, Context>(nbytes(), dst, src);
}
} }
inline void Move(MixedMemory* mem) { inline void Move(MixedMemory* mem) {
if (mem != nullptr) ex_memory_ = mem; if (mem != nullptr) {
else ex_memory_ = new MixedMemory(TypeMeta::Make<float>(), 4); ex_memory_ = mem;
own_mem_ = false; require_init_ = false;
} else {
ex_memory_ = new MixedMemory(
TypeMeta::Make<float>(), 4);
require_init_ = true;
} own_mem_ = false;
} }
inline void Share(MixedMemory* mem) { Move(mem); is_shared_ = true; } inline void Share(MixedMemory* mem) {
Move(mem); is_shared_ = true;
require_init_ = false;
}
inline void Reset() { inline void Reset() {
size_ = capacity_ = 0; size_ = capacity_ = 0;
...@@ -275,7 +290,7 @@ class Tensor { ...@@ -275,7 +290,7 @@ class Tensor {
shared_ptr<MixedMemory> memory_; shared_ptr<MixedMemory> memory_;
MixedMemory* ex_memory_ = nullptr; MixedMemory* ex_memory_ = nullptr;
bool is_corrupted_ = false, is_shared_ = false; bool is_corrupted_ = false, is_shared_ = false;
bool own_mem_ = true; bool own_mem_ = true, require_init_ = true;
}; };
} // namespace dragon } // namespace dragon
......
...@@ -179,29 +179,28 @@ class Workspace { ...@@ -179,29 +179,28 @@ class Workspace {
template <class Context> template <class Context>
inline vector<void*> caches( inline vector<void*> caches(
const vector<size_t>& segments) { const vector<size_t>& segments) {
TIndex total_size = 0; TIndex nbytes = 0;
for (auto& segment : segments) total_size += (TIndex)segment; for (auto& segment : segments) nbytes += (TIndex)segment;
Tensor* cacheT = CreateTensor("/share/cache"); Tensor* cache_t = CreateTensor("/share/cache");
cacheT->Reshape({ total_size }); cache_t->Reshape({ nbytes });
vector<void*> caches(segments.size()); vector<void*> Bcaches(segments.size());
caches[0] = cacheT->template mutable_data<uint8_t, Context>(); Bcaches[0] = cache_t->template mutable_data<uint8_t, Context>();
for (int i = 1; i < segments.size(); i++) for (int i = 1; i < segments.size(); i++)
caches[i] = (uint8_t*)caches[i - 1] + segments[i - 1]; Bcaches[i] = (uint8_t*)Bcaches[i - 1] + segments[i - 1];
return caches; return Bcaches;
} }
template <typename T, class Context> template <typename T, class Context>
inline vector<T*> caches( inline vector<T*> caches(
const vector<TIndex>& segments) { const vector<TIndex>& segments) {
TIndex total_count = 0; vector<size_t> Tsegments;
for (auto& segment : segments) total_count += segment; for (auto& segment : segments)
Tensor* cacheT = CreateTensor("/share/cache"); Tsegments.emplace_back(segment * sizeof(T));
cacheT->Reshape({ total_count }); vector<void*> Bcaches = caches<Context>(Tsegments);
vector<T*> caches(segments.size()); vector<T*> Tcaches(segments.size());
caches[0] = cacheT->template mutable_data<T, Context>(); for (int i = 0; i < segments.size(); i++)
for (int i = 1; i < segments.size(); i++) Tcaches[i] = (T*)Bcaches[i];
caches[i] = caches[i - 1] + segments[i - 1]; return Tcaches;
return caches;
} }
/******************** Operator ********************/ /******************** Operator ********************/
...@@ -259,11 +258,12 @@ class Workspace { ...@@ -259,11 +258,12 @@ class Workspace {
void RunGraph( void RunGraph(
const string& graph_name, const string& graph_name,
const string& include, const string& include,
const string& exclude) { const string& exclude,
const int stream_id = 1) {
if (!graph_map_.count(graph_name)) if (!graph_map_.count(graph_name))
LOG(FATAL) << "Graph(" << graph_name LOG(FATAL) << "Graph(" << graph_name
<< ") does not exist."; << ") does not exist.";
graph_map_[graph_name]->Run(include, exclude); graph_map_[graph_name]->Run(include, exclude, stream_id);
} }
vector<string> GetGraphs() { vector<string> GetGraphs() {
......
...@@ -36,7 +36,6 @@ class SparseSoftmaxCrossEntropyOp : public Operator<Context> { ...@@ -36,7 +36,6 @@ class SparseSoftmaxCrossEntropyOp : public Operator<Context> {
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
void SoftmaxRun(); void SoftmaxRun();
void SoftmaxRunFP16();
void RunOnDevice() override; void RunOnDevice() override;
template <typename Tx, typename Ty> void RunWithType(); template <typename Tx, typename Ty> void RunWithType();
......
...@@ -42,7 +42,7 @@ public: ...@@ -42,7 +42,7 @@ public:
// simply copy the dY to dX // simply copy the dY to dX
Output(0)->ReshapeLike(Input(0)); Output(0)->ReshapeLike(Input(0));
if (Output(0)->name() != Input(-1).name()) if (Output(0)->name() != Input(-1).name())
Output(0)->template CopyFrom<Context>(Input(-1)); Output(0)->template CopyFrom<Context>(Input(-1), ctx());
} }
}; };
......
...@@ -34,7 +34,6 @@ class L2NormOp final : public Operator<Context> { ...@@ -34,7 +34,6 @@ class L2NormOp final : public Operator<Context> {
TIndex axis, num_axes, end_axis; TIndex axis, num_axes, end_axis;
float eps; float eps;
string mode; string mode;
bool across_inner;
Tensor* norm, buffer; Tensor* norm, buffer;
TIndex outer_dim, dim, inner_dim, spatial_dim; TIndex outer_dim, dim, inner_dim, spatial_dim;
}; };
...@@ -55,7 +54,6 @@ class L2NormGradientOp final : public Operator<Context> { ...@@ -55,7 +54,6 @@ class L2NormGradientOp final : public Operator<Context> {
protected: protected:
TIndex axis, num_axes, end_axis; TIndex axis, num_axes, end_axis;
string mode; string mode;
bool across_inner;
Tensor* norm, buffer, buffer_inner; Tensor* norm, buffer, buffer_inner;
TIndex outer_dim, dim, inner_dim; TIndex outer_dim, dim, inner_dim;
}; };
......
...@@ -24,7 +24,7 @@ class AdamUpdateOp final : public UpdateOpBase<Context> { ...@@ -24,7 +24,7 @@ class AdamUpdateOp final : public UpdateOpBase<Context> {
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
USE_UPDATER_FUNCTIONS(Context); USE_UPDATER_FUNCTIONS(Context);
void ComputeRunWithFloat() override; void ComputeRunWithFloat32() override;
void ComputeRunWithFloat16() override; void ComputeRunWithFloat16() override;
protected: protected:
......
...@@ -43,10 +43,26 @@ class CollectiveUpdateOp final : public Operator<Context> { ...@@ -43,10 +43,26 @@ class CollectiveUpdateOp final : public Operator<Context> {
void InitNCCL(); void InitNCCL();
void RunOnDevice() override; void RunOnDevice() override;
void MPIAllReduceWithFloat();
void NCCLAllReduceWithFloat(); template <typename T> void MPIAllReduce(
void MPIBcastWithFloat(); Tensor* tensor,
void NCCLBcastWithFloat(); MPI_Datatype dtype);
template <typename T> void MPIBcast(
Tensor* tensor,
MPI_Datatype dtype);
#ifdef WITH_MPI_NCCL
template <typename T> void NCCLAllReduce(
Tensor* tensor,
ncclDataType_t dtype,
cudaStream_t& stream);
template <typename T> void NCCLBcast(
Tensor* tensor,
ncclDataType_t dtype,
cudaStream_t& stream);
#endif
protected: protected:
int comm_size, comm_rank, comm_root; int comm_size, comm_rank, comm_root;
......
...@@ -24,7 +24,7 @@ class NesterovUpdateOp final : public UpdateOpBase<Context> { ...@@ -24,7 +24,7 @@ class NesterovUpdateOp final : public UpdateOpBase<Context> {
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
USE_UPDATER_FUNCTIONS(Context); USE_UPDATER_FUNCTIONS(Context);
void ComputeRunWithFloat() override; void ComputeRunWithFloat32() override;
void ComputeRunWithFloat16() override; void ComputeRunWithFloat16() override;
protected: protected:
......
...@@ -24,7 +24,7 @@ class RMSPropUpdateOp final : public UpdateOpBase<Context> { ...@@ -24,7 +24,7 @@ class RMSPropUpdateOp final : public UpdateOpBase<Context> {
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
USE_UPDATER_FUNCTIONS(Context); USE_UPDATER_FUNCTIONS(Context);
void ComputeRunWithFloat() override; void ComputeRunWithFloat32() override;
void ComputeRunWithFloat16() override; void ComputeRunWithFloat16() override;
protected: protected:
......
...@@ -25,7 +25,7 @@ class SGDUpdateOp final : public UpdateOpBase<Context> { ...@@ -25,7 +25,7 @@ class SGDUpdateOp final : public UpdateOpBase<Context> {
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
USE_UPDATER_FUNCTIONS(Context); USE_UPDATER_FUNCTIONS(Context);
void ComputeRunWithFloat() override; void ComputeRunWithFloat32() override;
void ComputeRunWithFloat16() override; void ComputeRunWithFloat16() override;
protected: protected:
......
...@@ -35,13 +35,11 @@ class UpdateOpBase : public Operator<Context> { ...@@ -35,13 +35,11 @@ class UpdateOpBase : public Operator<Context> {
void RunOnDevice() override; void RunOnDevice() override;
template <typename T> void PreprocessRunWithType(); template <typename T> void PreprocessRunWithType();
virtual void ComputeRunWithFloat() = 0; virtual void ComputeRunWithFloat32() = 0;
virtual void ComputeRunWithFloat16() = 0;
virtual void ComputeRunWithFloat16() { void UpdateRunWithFloat32();
LOG(FATAL) << "This Updater does not support FP16."; void UpdateRunWithFloat16();
}
template <typename T> void UpdateRunWithType();
protected: protected:
float lr_mult, decay_mult; float lr_mult, decay_mult;
......
...@@ -80,7 +80,8 @@ class ConvOpBase : public Operator<Context> { ...@@ -80,7 +80,8 @@ class ConvOpBase : public Operator<Context> {
dilation[0], dilation[1], dilation[0], dilation[1],
data_format, data_format,
im, im,
col); col,
ctx());
} else LOG(FATAL) << "ConvNd has not been implemented yet"; } else LOG(FATAL) << "ConvNd has not been implemented yet";
} }
template <typename T> void Col2Im(const T* col, T* im) { template <typename T> void Col2Im(const T* col, T* im) {
...@@ -94,7 +95,8 @@ class ConvOpBase : public Operator<Context> { ...@@ -94,7 +95,8 @@ class ConvOpBase : public Operator<Context> {
dilation[0], dilation[1], dilation[0], dilation[1],
data_format, data_format,
col, col,
im); im,
ctx());
} else LOG(FATAL) << "ConvNd has not been implemented yet"; } else LOG(FATAL) << "ConvNd has not been implemented yet";
} }
}; };
......
...@@ -19,6 +19,8 @@ ...@@ -19,6 +19,8 @@
namespace dragon { namespace dragon {
#define HFLT_MIN 6.10e-5F
template <typename DestType, typename SrcType> template <typename DestType, typename SrcType>
DestType dragon_cast(SrcType val); DestType dragon_cast(SrcType val);
......
...@@ -29,9 +29,17 @@ namespace dragon { ...@@ -29,9 +29,17 @@ namespace dragon {
#ifdef WITH_CUDA #ifdef WITH_CUDA
static const int CUDA_THREADS = 1024; // The number of cuda threads to use. We set it to
// We do have a server with 10 GPUs :-) // 1024 which would work for compute capability 2.x
#define CUDA_MAX_DEVICES 10 // Set it to 512 if using compute capability 1.x
const int CUDA_THREADS = 1024;
// The maximum number of blocks to use in the default kernel call. We set it to
// 65535 which would work for compute capability 2.x (where 65536 is the limit)
const int CUDA_MAX_BLOCKS = 65535;
// You really need a NVIDIA DGX-2 !!! :-)
#define CUDA_MAX_DEVICES 16
#define CUDA_VERSION_MIN(major, minor, patch) \ #define CUDA_VERSION_MIN(major, minor, patch) \
(CUDA_VERSION >= (major * 1000 + minor * 100 + patch)) (CUDA_VERSION >= (major * 1000 + minor * 100 + patch))
...@@ -67,12 +75,16 @@ static const int CUDA_THREADS = 1024; ...@@ -67,12 +75,16 @@ static const int CUDA_THREADS = 1024;
} while (0) } while (0)
#endif // WITH_MPI_NCCL #endif // WITH_MPI_NCCL
#define CUDA_KERNEL_LOOP(i, n) \ #define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; \
i < n; i += blockDim.x * gridDim.x) i < n; i += blockDim.x * gridDim.x)
inline int CUDA_BLOCKS(const int N) { inline int CUDA_BLOCKS(const int N) {
return (N + CUDA_THREADS - 1) / CUDA_THREADS; return std::max(
std::min(
(N + CUDA_THREADS - 1) / CUDA_THREADS,
CUDA_MAX_BLOCKS
), 1);
} }
#if CUDA_VERSION_MAX(9, 0, 0) #if CUDA_VERSION_MAX(9, 0, 0)
......
...@@ -44,6 +44,7 @@ template<> class CUDNNType<float> { ...@@ -44,6 +44,7 @@ template<> class CUDNNType<float> {
static const cudnnDataType_t type = CUDNN_DATA_FLOAT; static const cudnnDataType_t type = CUDNN_DATA_FLOAT;
static float oneval, zeroval; static float oneval, zeroval;
static const void *one, *zero; static const void *one, *zero;
typedef float BNParamType;
}; };
template<> class CUDNNType<double> { template<> class CUDNNType<double> {
...@@ -51,6 +52,7 @@ template<> class CUDNNType<double> { ...@@ -51,6 +52,7 @@ template<> class CUDNNType<double> {
static const cudnnDataType_t type = CUDNN_DATA_DOUBLE; static const cudnnDataType_t type = CUDNN_DATA_DOUBLE;
static double oneval, zeroval; static double oneval, zeroval;
static const void *one, *zero; static const void *one, *zero;
typedef double BNParamType;
}; };
#ifdef WITH_CUDA_FP16 #ifdef WITH_CUDA_FP16
...@@ -59,6 +61,7 @@ template<> class CUDNNType<float16> { ...@@ -59,6 +61,7 @@ template<> class CUDNNType<float16> {
static const cudnnDataType_t type = CUDNN_DATA_HALF; static const cudnnDataType_t type = CUDNN_DATA_HALF;
static float oneval, zeroval; static float oneval, zeroval;
static const void *one, *zero; static const void *one, *zero;
typedef float BNParamType;
}; };
#endif #endif
......
...@@ -40,7 +40,7 @@ class ConstantFiller final : public Filler<T, Context> { ...@@ -40,7 +40,7 @@ class ConstantFiller final : public Filler<T, Context> {
void Fill(Tensor* tensor, Context* ctx) override { void Fill(Tensor* tensor, Context* ctx) override {
math::Set<T, Context>(tensor->count(), math::Set<T, Context>(tensor->count(),
dragon_cast<T, float>(filler().value()), dragon_cast<T, float>(filler().value()),
tensor->mutable_data<T, Context>()); tensor->mutable_data<T, Context>(), ctx);
} }
protected: protected:
...@@ -71,11 +71,11 @@ class TruncatedNormalFiller final : public Filler<T, Context> { ...@@ -71,11 +71,11 @@ class TruncatedNormalFiller final : public Filler<T, Context> {
void Fill(Tensor* tensor, Context* ctx) override { void Fill(Tensor* tensor, Context* ctx) override {
// implement it on gpu is difficult // implement it on gpu is difficult
static CPUContext cpu_ctx; static CPUContext cctx;
math::RandomTruncatedNormal<T, CPUContext>(tensor->count(), math::RandomTruncatedNormal<T, CPUContext>(tensor->count(),
filler().mean(), filler().std(), filler().mean(), filler().std(),
filler().low(), filler().high(), filler().low(), filler().high(),
tensor->mutable_data<T, CPUContext>(), &cpu_ctx); tensor->mutable_data<T, CPUContext>(), &cctx);
} }
protected: protected:
......
...@@ -36,7 +36,8 @@ template <typename T, class Context> ...@@ -36,7 +36,8 @@ template <typename T, class Context>
void Set( void Set(
const int n, const int n,
const T alpha, const T alpha,
T* x); T* x,
Context* ctx);
template <typename T, class Context> template <typename T, class Context>
void RandomUniform( void RandomUniform(
...@@ -78,73 +79,84 @@ void Add( ...@@ -78,73 +79,84 @@ void Add(
const int n, const int n,
const T* a, const T* a,
const T* b, const T* b,
T* y); T* y,
Context* ctx);
template <typename T, class Context> template <typename T, class Context>
void Sub( void Sub(
const int n, const int n,
const T* a, const T* a,
const T* b, const T* b,
T* y); T* y,
Context* ctx);
template <typename T, class Context> template <typename T, class Context>
void Mul( void Mul(
const int n, const int n,
const T* a, const T* a,
const T* b, const T* b,
T* y); T* y,
Context* ctx);
template <typename T, class Context> template <typename T, class Context>
void Div( void Div(
const int n, const int n,
const T* a, const T* a,
const T* b, const T* b,
T* y); T* y,
Context* ctx);
template <typename T, class Context> template <typename T, class Context>
void Clip( void Clip(
const int n, const int n,
const float low, const float low,
const float high, const float high,
T* x); T* x,
Context* ctx);
template <typename T, class Context> template <typename T, class Context>
void Exp( void Exp(
const int n, const int n,
const T* x, const T* x,
T* y); T* y,
Context* ctx);
template <typename T, class Context> template <typename T, class Context>
void Log( void Log(
const int n, const int n,
const T* x, const T* x,
T* y); T* y,
Context* ctx);
template <typename T, class Context> template <typename T, class Context>
void Square( void Square(
const int n, const int n,
const T* x, const T* x,
T* y); T* y,
Context* ctx);
template <typename T, class Context> template <typename T, class Context>
void Sqrt( void Sqrt(
const int n, const int n,
const T* x, const T* x,
T* y); T* y,
Context* ctx);
template <typename T, class Context> template <typename T, class Context>
void Pow( void Pow(
const int n, const int n,
const float alpha, const float alpha,
const T* x, const T* x,
T* y); T* y,
Context* ctx);
template <typename T, class Context> template <typename T, class Context>
void Inv( void Inv(
const int n, const int n,
const float numerator, const float numerator,
const T* x, const T* x,
T* y); T* y,
Context* ctx);
/******************** Level-2 ********************/ /******************** Level-2 ********************/
...@@ -164,19 +176,21 @@ void Scale( ...@@ -164,19 +176,21 @@ void Scale(
Context* ctx); Context* ctx);
template <typename T, class Context> template <typename T, class Context>
T StridedDot( void StridedDot(
const int n, const int n,
const T* a, const T* a,
const int incx, const int incx,
const T* b, const T* b,
const int incy, const int incy,
T* y,
Context* ctx); Context* ctx);
template <typename T, class Context> template <typename T, class Context>
float Dot( void Dot(
const int n, const int n,
const T* a, const T* a,
const T* b, const T* b,
T* y,
Context* ctx); Context* ctx);
template<typename T, class Context> template<typename T, class Context>
...@@ -188,13 +202,15 @@ template<typename T, class Context> ...@@ -188,13 +202,15 @@ template<typename T, class Context>
void AddScalar( void AddScalar(
const int n, const int n,
const float alpha, const float alpha,
T* y); T* y,
Context* ctx);
template<typename T, class Context> template<typename T, class Context>
void MulScalar( void MulScalar(
const int n, const int n,
const float alpha, const float alpha,
T* y); T* y,
Context* ctx);
template<typename T, class Context> template<typename T, class Context>
void Axpy( void Axpy(
......
...@@ -80,7 +80,7 @@ T Dot( ...@@ -80,7 +80,7 @@ T Dot(
const T* b); const T* b);
template<typename T> template<typename T>
T ASum( T Sum(
const int n, const int n,
const T* x); const T* x);
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#ifdef WITH_SSE #ifdef WITH_SSE
#include <immintrin.h> #include <immintrin.h>
#include <tmmintrin.h>
#include <cstdint> #include <cstdint>
namespace dragon { namespace dragon {
......
...@@ -250,8 +250,9 @@ void LoadCaffemodel( ...@@ -250,8 +250,9 @@ void LoadCaffemodel(
void RunGraph( void RunGraph(
const std::string& graph_name, const std::string& graph_name,
Workspace* ws) { Workspace* ws,
ws->RunGraph(graph_name, "", ""); const int stream_id) {
ws->RunGraph(graph_name, "", "", stream_id);
} }
template <typename T> template <typename T>
......
...@@ -38,8 +38,7 @@ class Device { ...@@ -38,8 +38,7 @@ class Device {
EXPORT const int device_id() const { return device_id_; } EXPORT const int device_id() const { return device_id_; }
private: private:
int device_type_; int device_type_, device_id_;
int device_id_;
}; };
EXPORT Workspace* CreateWorkspace(const std::string& name); EXPORT Workspace* CreateWorkspace(const std::string& name);
...@@ -61,7 +60,8 @@ EXPORT std::string CreateGraph( ...@@ -61,7 +60,8 @@ EXPORT std::string CreateGraph(
EXPORT void RunGraph( EXPORT void RunGraph(
const std::string& graph_name, const std::string& graph_name,
Workspace* ws); Workspace* ws,
const int stream_id = 1);
EXPORT void CreateTensor( EXPORT void CreateTensor(
const std::string& name, const std::string& name,
......
...@@ -116,7 +116,7 @@ class NumpyFeeder : public TensorFeederBase { ...@@ -116,7 +116,7 @@ class NumpyFeeder : public TensorFeederBase {
#else #else
LOG(FATAL) << "CUDA was not compiled."; LOG(FATAL) << "CUDA was not compiled.";
#endif #endif
} else{ } else {
CPUContext::Memcpy<CPUContext, CPUContext>(tensor->nbytes(), CPUContext::Memcpy<CPUContext, CPUContext>(tensor->nbytes(),
tensor->raw_mutable_data<CPUContext>(), tensor->raw_mutable_data<CPUContext>(),
static_cast<void*>(PyArray_DATA(array))); static_cast<void*>(PyArray_DATA(array)));
......
...@@ -18,18 +18,22 @@ ...@@ -18,18 +18,22 @@
PyObject* CreateGradientDefsCC(PyObject* self, PyObject* args) { PyObject* CreateGradientDefsCC(PyObject* self, PyObject* args) {
PyObject* def_string = nullptr; PyObject* def_string = nullptr;
PyObject* py_g_outputs = nullptr; PyObject* py_g_outputs = nullptr;
if (!PyArg_ParseTuple(args, "SO!", &def_string, &PyList_Type, &py_g_outputs)) { if (!PyArg_ParseTuple(args, "SO!",
PyErr_SetString(PyExc_ValueError, "Excepted a serialized string of OperatorDef " &def_string, &PyList_Type, &py_g_outputs)) {
PyErr_SetString(PyExc_ValueError,
"Excepted a serialized string of OperatorDef "
"and a list containing outputs of this GradientOp."); "and a list containing outputs of this GradientOp.");
return nullptr; return nullptr;
} }
OperatorDef def; OperatorDef def;
if (!def.ParseFromString(PyBytes_AsStringEx(def_string))) { if (!def.ParseFromString(PyBytes_AsStringEx(def_string))) {
PyErr_SetString(PyExc_ValueError, "Failed to parse the OperatorDef."); PyErr_SetString(PyExc_ValueError,
"Failed to parse the OperatorDef.");
return nullptr; return nullptr;
} }
if (!GradientRegistry()->Has(def.type())) { if (!GradientRegistry()->Has(def.type())) {
PyErr_SetString(PyExc_KeyError, "This Operator does not register GradientOp."); PyErr_SetString(PyExc_KeyError,
"This Operator does not register GradientOp.");
return nullptr; return nullptr;
} }
vector<string> g_outputs; vector<string> g_outputs;
...@@ -61,7 +65,8 @@ PyObject* RunGradientFlowCC(PyObject* self, PyObject* args) { ...@@ -61,7 +65,8 @@ PyObject* RunGradientFlowCC(PyObject* self, PyObject* args) {
PyObject* py_fp_ops, *py_targets; PyObject* py_fp_ops, *py_targets;
PyObject* py_input_grads, *py_ignore_grads; PyObject* py_input_grads, *py_ignore_grads;
PyObject* py_share_grads, *py_export_graph; PyObject* py_share_grads, *py_export_graph;
if (!PyArg_ParseTuple(args, "OOOOOO", &py_fp_ops, &py_targets, if (!PyArg_ParseTuple(args, "OOOOOO",
&py_fp_ops, &py_targets,
&py_input_grads, &py_ignore_grads, &py_input_grads, &py_ignore_grads,
&py_share_grads, &py_export_graph)) { &py_share_grads, &py_export_graph)) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
...@@ -84,8 +89,8 @@ PyObject* RunGradientFlowCC(PyObject* self, PyObject* args) { ...@@ -84,8 +89,8 @@ PyObject* RunGradientFlowCC(PyObject* self, PyObject* args) {
for (auto& grad : input_grads) maker.AddExternalGrad(grad); for (auto& grad : input_grads) maker.AddExternalGrad(grad);
for (auto& grad : ignore_grads) maker.AddIgnoreGrad(grad); for (auto& grad : ignore_grads) maker.AddIgnoreGrad(grad);
maker.Make(fp_ops, targets, bp_ops); maker.Make(fp_ops, targets, bp_ops);
bool share_grads = (bool)PyObject_IsTrue(py_share_grads); bool share_grads = PyObject_IsTrue(py_share_grads) ? true : false;
bool export_graph = (bool)PyObject_IsTrue(py_export_graph); bool export_graph = PyObject_IsTrue(py_export_graph) ? true : false;
if (share_grads) maker.Share("/share/buffer/grads", bp_ops); if (share_grads) maker.Share("/share/buffer/grads", bp_ops);
if (export_graph) { if (export_graph) {
Tensor* t = ws()->CreateTensor("/export/dynamic_graph/gradient_flow"); Tensor* t = ws()->CreateTensor("/export/dynamic_graph/gradient_flow");
......
...@@ -17,7 +17,8 @@ ...@@ -17,7 +17,8 @@
inline PyObject* SetLogLevelCC(PyObject* self, PyObject* args) { inline PyObject* SetLogLevelCC(PyObject* self, PyObject* args) {
char* cname; char* cname;
if (!PyArg_ParseTuple(args, "s", &cname)) { if (!PyArg_ParseTuple(args, "s", &cname)) {
PyErr_SetString(PyExc_ValueError, "Excepted the logging level."); PyErr_SetString(PyExc_ValueError,
"Excepted the logging level.");
return nullptr; return nullptr;
} }
SetLogDestination(StrToLogSeverity(string(cname))); SetLogDestination(StrToLogSeverity(string(cname)));
......
...@@ -17,16 +17,19 @@ ...@@ -17,16 +17,19 @@
inline PyObject* CreateGraphCC(PyObject* self, PyObject* args) { inline PyObject* CreateGraphCC(PyObject* self, PyObject* args) {
PyObject* graph_str; PyObject* graph_str;
if (!PyArg_ParseTuple(args, "S", &graph_str)) { if (!PyArg_ParseTuple(args, "S", &graph_str)) {
PyErr_SetString(PyExc_ValueError, "Excepted a serialized string of GraphDef."); PyErr_SetString(PyExc_ValueError,
"Excepted a serialized string of GraphDef.");
return nullptr; return nullptr;
} }
GraphDef graph_def; GraphDef graph_def;
if (!graph_def.ParseFromString(PyBytes_AsStringEx(graph_str))) { if (!graph_def.ParseFromString(PyBytes_AsStringEx(graph_str))) {
PyErr_SetString(PyExc_RuntimeError, "Failed to parse the GraphDef."); PyErr_SetString(PyExc_RuntimeError,
"Failed to parse the GraphDef.");
return nullptr; return nullptr;
} }
if (!ws()->CreateGraph(graph_def)) { if (!ws()->CreateGraph(graph_def)) {
PyErr_SetString(PyExc_RuntimeError, "Failed to create the Graph."); PyErr_SetString(PyExc_RuntimeError,
"Failed to create the Graph.");
return nullptr; return nullptr;
} }
Py_RETURN_TRUE; Py_RETURN_TRUE;
...@@ -34,11 +37,17 @@ inline PyObject* CreateGraphCC(PyObject* self, PyObject* args) { ...@@ -34,11 +37,17 @@ inline PyObject* CreateGraphCC(PyObject* self, PyObject* args) {
inline PyObject* RunGraphCC(PyObject* self, PyObject* args) { inline PyObject* RunGraphCC(PyObject* self, PyObject* args) {
char* cname, *include, *exclude; char* cname, *include, *exclude;
if (!PyArg_ParseTuple(args, "sss", &cname, &include, &exclude)) { if (!PyArg_ParseTuple(args, "sss",
PyErr_SetString(PyExc_ValueError, "Excepted the graph name, include and exclude rules."); &cname, &include, &exclude)) {
PyErr_SetString(PyExc_ValueError,
"Excepted the graph name, include and exclude rules.");
return nullptr; return nullptr;
} }
ws()->RunGraph(string(cname), string(include), string(exclude)); ws()->RunGraph(
string(cname),
string(include),
string(exclude)
);
Py_RETURN_TRUE; Py_RETURN_TRUE;
} }
......
...@@ -46,7 +46,8 @@ inline PyObject* MPICreateGroupCC(PyObject* self, PyObject* args) { ...@@ -46,7 +46,8 @@ inline PyObject* MPICreateGroupCC(PyObject* self, PyObject* args) {
PyObject *incl, *excl, *ret; PyObject *incl, *excl, *ret;
int local_root, world_size; int local_root, world_size;
if (!PyArg_ParseTuple(args, "iOO", &local_root, &incl, &excl)) { if (!PyArg_ParseTuple(args, "iOO", &local_root, &incl, &excl)) {
PyErr_SetString(PyExc_ValueError, "Excepted the local root, include and exclued list."); PyErr_SetString(PyExc_ValueError,
"Excepted the local root, include and exclued list.");
return nullptr; return nullptr;
} }
MPI_Group world_group, local_group; MPI_Group world_group, local_group;
......
...@@ -37,12 +37,14 @@ inline PyObject* NoGradientOperatorsCC(PyObject* self, PyObject* args) { ...@@ -37,12 +37,14 @@ inline PyObject* NoGradientOperatorsCC(PyObject* self, PyObject* args) {
inline PyObject* RunOperatorCC(PyObject* self, PyObject* args) { inline PyObject* RunOperatorCC(PyObject* self, PyObject* args) {
PyObject* op_str; PyObject* op_str;
if (!PyArg_ParseTuple(args, "S", &op_str)) { if (!PyArg_ParseTuple(args, "S", &op_str)) {
PyErr_SetString(PyExc_ValueError, "Excepted a serialized string of OperatorDef."); PyErr_SetString(PyExc_ValueError,
"Excepted a serialized string of OperatorDef.");
return nullptr; return nullptr;
} }
OperatorDef op_def; OperatorDef op_def;
if (!op_def.ParseFromString(PyBytes_AsStringEx(op_str))) { if (!op_def.ParseFromString(PyBytes_AsStringEx(op_str))) {
PyErr_SetString(PyExc_RuntimeError, "Failed to parse the OperatorDef."); PyErr_SetString(PyExc_RuntimeError,
"Failed to parse the OperatorDef.");
return nullptr; return nullptr;
} }
ws()->RunOperator(op_def); ws()->RunOperator(op_def);
...@@ -52,7 +54,8 @@ inline PyObject* RunOperatorCC(PyObject* self, PyObject* args) { ...@@ -52,7 +54,8 @@ inline PyObject* RunOperatorCC(PyObject* self, PyObject* args) {
inline PyObject* RunOperatorsCC(PyObject* self, PyObject* args) { inline PyObject* RunOperatorsCC(PyObject* self, PyObject* args) {
PyObject* py_ops; PyObject* py_ops;
if (!PyArg_ParseTuple(args, "O", &py_ops)) { if (!PyArg_ParseTuple(args, "O", &py_ops)) {
PyErr_SetString(PyExc_ValueError, "Excepted a list of serialized string of OperatorDef."); PyErr_SetString(PyExc_ValueError,
"Excepted a list of serialized string of OperatorDef.");
return nullptr; return nullptr;
} }
OperatorDef op_def; OperatorDef op_def;
...@@ -67,12 +70,14 @@ inline PyObject* RunOperatorsCC(PyObject* self, PyObject* args) { ...@@ -67,12 +70,14 @@ inline PyObject* RunOperatorsCC(PyObject* self, PyObject* args) {
inline PyObject* CreatePersistentOpCC(PyObject* self, PyObject* args) { inline PyObject* CreatePersistentOpCC(PyObject* self, PyObject* args) {
PyObject* op_str; PyObject* op_str;
if (!PyArg_ParseTuple(args, "S", &op_str)) { if (!PyArg_ParseTuple(args, "S", &op_str)) {
PyErr_SetString(PyExc_ValueError, "Excepted a serialized string of OperatorDef."); PyErr_SetString(PyExc_ValueError,
"Excepted a serialized string of OperatorDef.");
return nullptr; return nullptr;
} }
OperatorDef op_def; OperatorDef op_def;
if (!op_def.ParseFromString(PyBytes_AsStringEx(op_str))) { if (!op_def.ParseFromString(PyBytes_AsStringEx(op_str))) {
PyErr_SetString(PyExc_RuntimeError, "Failed to parse the OperatorDef."); PyErr_SetString(PyExc_RuntimeError,
"Failed to parse the OperatorDef.");
return nullptr; return nullptr;
} }
ws()->CreatePersistentOp(op_def); ws()->CreatePersistentOp(op_def);
...@@ -82,8 +87,10 @@ inline PyObject* CreatePersistentOpCC(PyObject* self, PyObject* args) { ...@@ -82,8 +87,10 @@ inline PyObject* CreatePersistentOpCC(PyObject* self, PyObject* args) {
inline PyObject* RunPersistentOpCC(PyObject* self, PyObject* args) { inline PyObject* RunPersistentOpCC(PyObject* self, PyObject* args) {
char* key, *anchor; char* key, *anchor;
PyObject* py_inputs, *py_outputs; PyObject* py_inputs, *py_outputs;
if (!PyArg_ParseTuple(args, "ssOO", &key, &anchor, &py_inputs, &py_outputs)) { if (!PyArg_ParseTuple(args, "ssOO",
PyErr_SetString(PyExc_ValueError, "Excepted a persistent key, anchor, " &key, &anchor, &py_inputs, &py_outputs)) {
PyErr_SetString(PyExc_ValueError,
"Excepted a persistent key, anchor, "
"list of inputs and outputs."); "list of inputs and outputs.");
return nullptr; return nullptr;
} }
......
...@@ -39,12 +39,14 @@ inline PyObject* CreateTensorCC(PyObject* self, PyObject* args) { ...@@ -39,12 +39,14 @@ inline PyObject* CreateTensorCC(PyObject* self, PyObject* args) {
inline PyObject* CreateFillerCC(PyObject* self, PyObject* args) { inline PyObject* CreateFillerCC(PyObject* self, PyObject* args) {
PyObject* filler_string; PyObject* filler_string;
if (!PyArg_ParseTuple(args, "S", &filler_string)) { if (!PyArg_ParseTuple(args, "S", &filler_string)) {
PyErr_SetString(PyExc_ValueError, "Excepted a serialized string of TensorFiller."); PyErr_SetString(PyExc_ValueError,
"Excepted a serialized string of TensorFiller.");
return nullptr; return nullptr;
} }
TensorFiller filler_def; TensorFiller filler_def;
if (!filler_def.ParseFromString(PyBytes_AsStringEx(filler_string))) { if (!filler_def.ParseFromString(PyBytes_AsStringEx(filler_string))) {
PyErr_SetString(PyExc_RuntimeError, "Failed to parse the TensorFiller."); PyErr_SetString(PyExc_RuntimeError,
"Failed to parse the TensorFiller.");
return nullptr; return nullptr;
} }
ws()->CreateFiller(filler_def); ws()->CreateFiller(filler_def);
...@@ -60,7 +62,8 @@ inline PyObject* GetFillerTypeCC(PyObject* self, PyObject* args) { ...@@ -60,7 +62,8 @@ inline PyObject* GetFillerTypeCC(PyObject* self, PyObject* args) {
inline PyObject* RenameTensorCC(PyObject* self, PyObject* args) { inline PyObject* RenameTensorCC(PyObject* self, PyObject* args) {
char* ori_name, *tar_name; char* ori_name, *tar_name;
if (!PyArg_ParseTuple(args, "ss", &ori_name, &tar_name)) { if (!PyArg_ParseTuple(args, "ss", &ori_name, &tar_name)) {
PyErr_SetString(PyExc_ValueError, "Excepted the original and target name."); PyErr_SetString(PyExc_ValueError,
"Excepted the original and target name.");
return nullptr; return nullptr;
} }
if (!ws()->HasTensor(tar_name)) { if (!ws()->HasTensor(tar_name)) {
...@@ -77,7 +80,8 @@ PyObject* TensorFromShapeCC(PyObject* self, PyObject* args) { ...@@ -77,7 +80,8 @@ PyObject* TensorFromShapeCC(PyObject* self, PyObject* args) {
char* cname, *dtype; char* cname, *dtype;
PyObject* shape, *device_option = nullptr; PyObject* shape, *device_option = nullptr;
if (!PyArg_ParseTuple(args, "sOs|O", &cname, &shape, &dtype, &device_option)) { if (!PyArg_ParseTuple(args, "sOs|O", &cname, &shape, &dtype, &device_option)) {
PyErr_SetString(PyExc_ValueError, "Excepted the name, shape, dtype and optional device option."); PyErr_SetString(PyExc_ValueError,
"Excepted the name, shape, dtype and optional device option.");
return nullptr; return nullptr;
} }
const TypeMeta& meta = TypeStringToMeta(dtype); const TypeMeta& meta = TypeStringToMeta(dtype);
...@@ -119,7 +123,8 @@ PyObject* TensorFromPyArrayCC(PyObject* self, PyObject* args) { ...@@ -119,7 +123,8 @@ PyObject* TensorFromPyArrayCC(PyObject* self, PyObject* args) {
char* cname; char* cname;
PyArrayObject* original_array = nullptr; PyArrayObject* original_array = nullptr;
if (!PyArg_ParseTuple(args, "sO", &cname, &original_array)) { if (!PyArg_ParseTuple(args, "sO", &cname, &original_array)) {
PyErr_SetString(PyExc_ValueError, "Failed to create tensor from numpy.ndarray.\n" PyErr_SetString(PyExc_ValueError,
"Failed to create tensor from numpy.ndarray.\n"
"Excepted the name and numpy.ndarray both."); "Excepted the name and numpy.ndarray both.");
return nullptr; return nullptr;
} }
...@@ -214,7 +219,8 @@ inline PyObject* TensorToPyArrayCC(PyObject* self, PyObject* args) { ...@@ -214,7 +219,8 @@ inline PyObject* TensorToPyArrayCC(PyObject* self, PyObject* args) {
return nullptr; return nullptr;
} }
auto* data = tensor->raw_mutable_data<CPUContext>(); auto* data = tensor->raw_mutable_data<CPUContext>();
PyObject* array = PyArray_SimpleNewFromData(tensor->ndim(), dims.data(), npy_type, data); PyObject* array = PyArray_SimpleNewFromData(
(int)tensor->ndim(), dims.data(), npy_type, data);
Py_XINCREF(array); Py_XINCREF(array);
return array; return array;
} }
......
...@@ -30,6 +30,8 @@ class BlobFetcher(Process): ...@@ -30,6 +30,8 @@ class BlobFetcher(Process):
---------- ----------
batch_size : int batch_size : int
The size of a training batch. The size of a training batch.
dtype : str
The data type of batch. Default is ``float32``.
partition : boolean partition : boolean
Whether to partition batch. Default is ``False``. Whether to partition batch. Default is ``False``.
prefetch : int prefetch : int
...@@ -42,6 +44,7 @@ class BlobFetcher(Process): ...@@ -42,6 +44,7 @@ class BlobFetcher(Process):
""" """
super(BlobFetcher, self).__init__() super(BlobFetcher, self).__init__()
self._batch_size = kwargs.get('batch_size', 100) self._batch_size = kwargs.get('batch_size', 100)
self._dtype = kwargs.get('dtype', 'float32')
self._partition = kwargs.get('partition', False) self._partition = kwargs.get('partition', False)
self._mean_values = kwargs.get('mean_values', []) self._mean_values = kwargs.get('mean_values', [])
self._scale = kwargs.get('scale', 1.0) self._scale = kwargs.get('scale', 1.0)
...@@ -68,7 +71,7 @@ class BlobFetcher(Process): ...@@ -68,7 +71,7 @@ class BlobFetcher(Process):
if ix != self._batch_size - 1: im, labels = self.Q_in.get() if ix != self._batch_size - 1: im, labels = self.Q_in.get()
# mean subtraction & numerical scale # mean subtraction & numerical scale
im_blob = im_blob.astype(np.float32) im_blob = im_blob.astype(self._dtype)
if len(self._mean_values) > 0: if len(self._mean_values) > 0:
im_blob -= self._mean_values im_blob -= self._mean_values
if self._scale != 1.0: if self._scale != 1.0:
......
...@@ -70,6 +70,8 @@ class DataBatch(object): ...@@ -70,6 +70,8 @@ class DataBatch(object):
The phase of this operator, ``TRAIN`` or ``TEST``. Default is ``TRAIN``. The phase of this operator, ``TRAIN`` or ``TEST``. Default is ``TRAIN``.
batch_size : int batch_size : int
The size of a training batch. The size of a training batch.
dtype : str
The data type of batch. Default is ``float32``.
partition : boolean partition : boolean
Whether to partition batch. Default is ``False``. Whether to partition batch. Default is ``False``.
prefetch : int prefetch : int
......
...@@ -49,16 +49,14 @@ class DataReader(Process): ...@@ -49,16 +49,14 @@ class DataReader(Process):
self._source = kwargs.get('source', '') self._source = kwargs.get('source', '')
self._multiple_nodes = kwargs.get('multiple_nodes', False) self._multiple_nodes = kwargs.get('multiple_nodes', False)
self._use_shuffle = kwargs.get('shuffle', False) self._use_shuffle = kwargs.get('shuffle', False)
self._use_instance_chunk = kwargs.get('instance_chunk', False)
self._num_chunks = kwargs.get('num_chunks', 2048) self._num_chunks = kwargs.get('num_chunks', 2048)
self._chunk_size = kwargs.get('chunk_size', -1) self._chunk_size = kwargs.get('chunk_size', -1)
self._num_parts = 1 self._part_idx, self._num_parts = 0, 1
self._part_idx = 0 self._cur_idx, self._cur_chunk_idx = 0, 0
self._random_seed = config.GetRandomSeed() self._random_seed = config.GetRandomSeed()
self._cur_idx = 0
self._cur_chunk_idx = 0
self.Q_out = None self.Q_out = None
self.daemon = True self.daemon = True
...@@ -167,12 +165,13 @@ class DataReader(Process): ...@@ -167,12 +165,13 @@ class DataReader(Process):
self._db.open(self._source) self._db.open(self._source)
self._zfill = self._db.zfill() self._zfill = self._db.zfill()
self._num_entries = self._db.num_entries() self._num_entries = self._db.num_entries()
self._epoch_size = int(self._num_entries / self._num_parts + 1) self._epoch_size = int(self._num_entries/ self._num_parts + 1)
if self._use_shuffle: if self._use_shuffle:
if self._chunk_size == 1: if self._chunk_size == 1:
# each chunk has at most 1 record [For Fully Shuffle] # each chunk has at most 1 record [For Fully Shuffle]
self._num_shuffle_parts = int(self._num_entries / self._chunk_size / self._num_parts) + 1 self._chunk_size, self._num_shuffle_parts = \
1, int(self._num_entries / self._num_parts) + 1
else: else:
if self._use_shuffle and self._chunk_size == -1: if self._use_shuffle and self._chunk_size == -1:
# search a optimal chunk size by chunks [For Chunk Shuffle] # search a optimal chunk size by chunks [For Chunk Shuffle]
...@@ -183,6 +182,11 @@ class DataReader(Process): ...@@ -183,6 +182,11 @@ class DataReader(Process):
self._num_shuffle_parts = int(math.ceil(self._db._total_size * 1.1 / self._num_shuffle_parts = int(math.ceil(self._db._total_size * 1.1 /
(self._num_parts * self._chunk_size << 20))) (self._num_parts * self._chunk_size << 20)))
self._chunk_size = int(self._num_entries / self._num_shuffle_parts / self._num_parts + 1) self._chunk_size = int(self._num_entries / self._num_shuffle_parts / self._num_parts + 1)
limit = (self._num_parts - 0.5) * self._num_shuffle_parts * self._chunk_size
if self._num_entries <= limit:
# roll back to fully shuffle
self._chunk_size, self._num_shuffle_parts = \
1, int(self._num_entries / self._num_parts) + 1
else: else:
# each chunk has at most K records [For Multiple Nodes] # each chunk has at most K records [For Multiple Nodes]
# note that if ``shuffle`` and ``multiple_nodes`` are all ``False``, # note that if ``shuffle`` and ``multiple_nodes`` are all ``False``,
......
...@@ -14,7 +14,7 @@ from __future__ import division ...@@ -14,7 +14,7 @@ from __future__ import division
from __future__ import print_function from __future__ import print_function
version = '0.2.2' version = '0.2.2'
full_version = '0.2.2.10' full_version = '0.2.2.11'
release = False release = False
if not release: if not release:
......
...@@ -364,7 +364,7 @@ class BatchNormLayer(Layer): ...@@ -364,7 +364,7 @@ class BatchNormLayer(Layer):
var = Tensor(scope + '/param:1').Constant(value=0.0) var = Tensor(scope + '/param:1').Constant(value=0.0)
factor = Tensor(scope + '/param:2').Constant(value=0.0) factor = Tensor(scope + '/param:2').Constant(value=0.0)
# in dragon, set diff as None will ignore computing grad automatically # in dragon, set diff as None will ignore computing grad automatically
# but in bvlc-caffe1, you must set lr_mult = 0 manually # but in bvlc-caffe, you must set lr_mult = 0 manually
self._blobs.append({'data': mean, 'diff': None}) self._blobs.append({'data': mean, 'diff': None})
self._blobs.append({'data': var, 'diff': None}) self._blobs.append({'data': var, 'diff': None})
self._blobs.append({'data': factor, 'diff': None}) self._blobs.append({'data': factor, 'diff': None})
......
...@@ -20,7 +20,7 @@ from .arithmetic import ( ...@@ -20,7 +20,7 @@ from .arithmetic import (
from .ndarray import ( from .ndarray import (
squeeze, unsqueeze, squeeze, unsqueeze,
sum, mean, argmin, argmax, max, topk, sum, mean, argmin, argmax, max, min, topk,
cat, gather, cat, gather,
) )
......
...@@ -13,7 +13,6 @@ from __future__ import absolute_import ...@@ -13,7 +13,6 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
from dragon.vm.torch.tensor import Tensor from dragon.vm.torch.tensor import Tensor
from dragon.vm.torch.ops.primitive import MakeContext, WrapScalar from dragon.vm.torch.ops.primitive import MakeContext, WrapScalar
from dragon.vm.torch.ops.factory import get_module from dragon.vm.torch.ops.factory import get_module
...@@ -26,7 +25,6 @@ def _fundamental(input, value, op='Add', out=None): ...@@ -26,7 +25,6 @@ def _fundamental(input, value, op='Add', out=None):
raise TypeError('Type of value should be numerical, got {}.' raise TypeError('Type of value should be numerical, got {}.'
.format(type(value))) .format(type(value)))
value = WrapScalar(value, input._dtype, input._ctx) value = WrapScalar(value, input._dtype, input._ctx)
ctx = MakeContext(inputs=[input, value]) ctx = MakeContext(inputs=[input, value])
key = 'torch/ops/{}/{}:{}'.format(op.lower(), ctx[0].lower(), ctx[1]) key = 'torch/ops/{}/{}:{}'.format(op.lower(), ctx[0].lower(), ctx[1])
module = get_module(Fundamental, key, ctx, op_type=op) module = get_module(Fundamental, key, ctx, op_type=op)
......
...@@ -13,7 +13,7 @@ from __future__ import absolute_import ...@@ -13,7 +13,7 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
from dragon.vm.torch.utils.data.io.data_reader import DataReader from dragon.io.data_reader import DataReader
from dragon.vm.torch.utils.data.io.data_transformer import DataTransformer from dragon.vm.torch.utils.data.io.data_transformer import DataTransformer
......
...@@ -19,7 +19,7 @@ from multiprocessing import Queue ...@@ -19,7 +19,7 @@ from multiprocessing import Queue
import dragon.core.mpi as mpi import dragon.core.mpi as mpi
from .data_reader import DataReader from dragon.io.data_reader import DataReader
from .data_transformer import DataTransformer from .data_transformer import DataTransformer
from .blob_fetcher import BlobFetcher from .blob_fetcher import BlobFetcher
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import numpy as np
import numpy.random as npr
from multiprocessing import Process
import dragon.config as config
from dragon.tools.db import LMDB
class DataReader(Process):
"""DataReader is deployed to queue encoded str from `LMDB`_.
It is supported to adaptively partition and shuffle records over all distributed nodes.
"""
def __init__(self, **kwargs):
"""Construct a ``DataReader``.
Parameters
----------
source : str
The path of database.
multiple_nodes: boolean
Whether to split data for multiple parallel nodes. Default is ``False``.
shuffle : boolean
Whether to shuffle the data. Default is ``False``.
num_chunks : int
The number of chunks to split. Default is ``2048``.
chunk_size : int
The size(MB) of each chunk. Default is -1 (Refer ``num_chunks``).
"""
super(DataReader, self).__init__()
self._source = kwargs.get('source', '')
self._multiple_nodes = kwargs.get('multiple_nodes', False)
self._use_shuffle = kwargs.get('shuffle', False)
self._num_chunks = kwargs.get('num_chunks', 2048)
self._chunk_size = kwargs.get('chunk_size', -1)
self._num_parts = 1
self._part_idx = 0
self._random_seed = config.GetRandomSeed()
self._cur_idx = 0
self._cur_chunk_idx = 0
self.Q_out = None
self.daemon = True
def element(self):
"""Get the value of current record.
Returns
-------
str
The encoded str.
"""
return self._db.value()
def redirect(self, target_idx):
"""Redirect to the target position.
Parameters
----------
target_idx : int
The key of instance in ``LMDB``.
Returns
-------
None
Notes
-----
The redirection reopens the ``LMDB``.
You can drop caches by ``echo 3 > /proc/sys/vm/drop_caches``.
This will disturb getting stuck when ``Database Size`` >> ``RAM Size``.
"""
self._db.close()
self._db.open(self._source)
self._cur_idx = target_idx
self._db.set(str(self._cur_idx).zfill(self._zfill))
def reset(self):
"""Reset the cursor and environment.
Returns
-------
None
"""
if self._multiple_nodes or self._use_shuffle:
if self._use_shuffle: self._perm = npr.permutation(self._num_shuffle_parts)
self._cur_chunk_idx = 0
self._start_idx = int(self._part_idx * self._num_shuffle_parts + self._perm[self._cur_chunk_idx])
self._start_idx = int(self._start_idx * self._chunk_size)
if self._start_idx >= self._num_entries: self.next_chunk()
self._end_idx = self._start_idx + self._chunk_size
self._end_idx = min(self._num_entries, self._end_idx)
else:
self._start_idx = 0
self._end_idx = self._num_entries
self.redirect(self._start_idx)
def next_record(self):
"""Step the cursor of records.
Returns
-------
None
"""
self._cur_idx += 1
self._db.next()
def next_chunk(self):
"""Step the cursor of shuffling chunks.
Returns
-------
None
"""
self._cur_chunk_idx += 1
if self._cur_chunk_idx >= self._num_shuffle_parts: self.reset()
else:
self._start_idx = self._part_idx * self._num_shuffle_parts + self._perm[self._cur_chunk_idx]
self._start_idx = self._start_idx * self._chunk_size
if self._start_idx >= self._num_entries: self.next_chunk()
else:
self._end_idx = self._start_idx + self._chunk_size
self._end_idx = min(self._num_entries, self._end_idx)
self.redirect(self._start_idx)
def run(self):
"""Start the process.
Returns
-------
None
"""
# fix seed
npr.seed(self._random_seed)
# init db
self._db = LMDB()
self._db.open(self._source)
self._zfill = self._db.zfill()
self._num_entries = self._db.num_entries()
self._epoch_size = int(self._num_entries / self._num_parts + 1)
if self._use_shuffle:
if self._chunk_size == 1:
# each chunk has at most 1 record [For Fully Shuffle]
self._num_shuffle_parts = int(self._num_entries / self._chunk_size / self._num_parts) + 1
else:
if self._use_shuffle and self._chunk_size == -1:
# search a optimal chunk size by chunks [For Chunk Shuffle]
max_chunk_size = self._db._total_size / ((self._num_chunks * (1 << 20)))
min_chunk_size = 1
while min_chunk_size * 2 < max_chunk_size: min_chunk_size *= 2
self._chunk_size = min_chunk_size
self._num_shuffle_parts = int(math.ceil(self._db._total_size * 1.1 /
(self._num_parts * self._chunk_size << 20)))
self._chunk_size = int(self._num_entries / self._num_shuffle_parts / self._num_parts + 1)
else:
# each chunk has at most K records [For Multiple Nodes]
# note that if ``shuffle`` and ``multiple_nodes`` are all ``False``,
# ``chunk_size`` and ``num_shuffle_parts`` are meaningless
self._chunk_size = int(self._num_entries / self._num_parts) + 1
self._num_shuffle_parts = 1
self._perm = np.arange(self._num_shuffle_parts)
# init env
self.reset()
# run
while True:
self.Q_out.put(self.element())
self.next_record()
if self._cur_idx >= self._end_idx:
if self._multiple_nodes or \
self._use_shuffle: self.next_chunk()
else: self.reset()
\ No newline at end of file
...@@ -42,7 +42,7 @@ find_modules() ...@@ -42,7 +42,7 @@ find_modules()
setup(name = 'dragon', setup(name = 'dragon',
version='0.2.2.10', version='0.2.2.11',
description = 'Dragon: A Computation Graph Virtual Machine Based Deep Learning Framework', description = 'Dragon: A Computation Graph Virtual Machine Based Deep Learning Framework',
url='https://github.com/seetaresearch/Dragon', url='https://github.com/seetaresearch/Dragon',
author='Ting Pan', author='Ting Pan',
......
...@@ -19,7 +19,8 @@ template <> void GenerateProposals<float, CPUContext>( ...@@ -19,7 +19,8 @@ template <> void GenerateProposals<float, CPUContext>(
const float* scores, const float* scores,
const float* bbox_deltas, const float* bbox_deltas,
const float* anchors, const float* anchors,
float* proposals) { float* proposals,
CPUContext* ctx) {
float* proposal = proposals; float* proposal = proposals;
const int K = feat_h * feat_w; const int K = feat_h * feat_w;
for (int h = 0; h < feat_h; ++h) { for (int h = 0; h < feat_h; ++h) {
...@@ -57,7 +58,8 @@ template <> void GenerateProposals_v2<float, CPUContext>( ...@@ -57,7 +58,8 @@ template <> void GenerateProposals_v2<float, CPUContext>(
const float min_box_w, const float min_box_w,
const float* scores, const float* scores,
const float* bbox_deltas, const float* bbox_deltas,
float* proposals) { float* proposals,
CPUContext* ctx) {
float* proposal = proposals; float* proposal = proposals;
for (int i = 0; i < total_anchors; ++i) { for (int i = 0; i < total_anchors; ++i) {
// bbox_deltas: [1, 4, total_anchors] // bbox_deltas: [1, 4, total_anchors]
...@@ -98,7 +100,8 @@ template <> void ApplyNMS<float, CPUContext>( ...@@ -98,7 +100,8 @@ template <> void ApplyNMS<float, CPUContext>(
const float thresh, const float thresh,
const float* boxes, const float* boxes,
int* keep_indices, int* keep_indices,
int& num_keep) { int& num_keep,
CPUContext* ctx) {
int count = 0; int count = 0;
std::vector<char> is_dead(num_boxes); std::vector<char> is_dead(num_boxes);
for (int i = 0; i < num_boxes; ++i) is_dead[i] = 0; for (int i = 0; i < num_boxes; ++i) is_dead[i] = 0;
......
...@@ -62,7 +62,7 @@ __global__ void _GenerateProposals( ...@@ -62,7 +62,7 @@ __global__ void _GenerateProposals(
const T* bbox_deltas, const T* bbox_deltas,
const T* anchors, const T* anchors,
T* proposals) { T* proposals) {
CUDA_KERNEL_LOOP(idx, nthreads) { CUDA_1D_KERNEL_LOOP(idx, nthreads) {
const int h = idx / A / feat_w; const int h = idx / A / feat_w;
const int w = (idx / A) % feat_w; const int w = (idx / A) % feat_w;
const int a = idx % A; const int a = idx % A;
...@@ -99,11 +99,13 @@ template <> void GenerateProposals<float, CUDAContext>( ...@@ -99,11 +99,13 @@ template <> void GenerateProposals<float, CUDAContext>(
const float* scores, const float* scores,
const float* bbox_deltas, const float* bbox_deltas,
const float* anchors, const float* anchors,
float* proposals) { float* proposals,
CUDAContext* ctx) {
const int num_proposals = A * feat_h * feat_w; const int num_proposals = A * feat_h * feat_w;
_GenerateProposals<float> _GenerateProposals<float>
<< <CUDA_BLOCKS(num_proposals), CUDA_THREADS >> >( << < CUDA_BLOCKS(num_proposals), CUDA_THREADS,
num_proposals, A, feat_h, feat_w, stride, 0, ctx->cuda_stream() >> >(num_proposals,
A, feat_h, feat_w, stride,
im_h, im_w, min_box_h, min_box_w, im_h, im_w, min_box_h, min_box_w,
scores, bbox_deltas, anchors, proposals); scores, bbox_deltas, anchors, proposals);
} }
...@@ -118,7 +120,7 @@ __global__ void _GenerateProposals_v2( ...@@ -118,7 +120,7 @@ __global__ void _GenerateProposals_v2(
const T* scores, const T* scores,
const T* bbox_deltas, const T* bbox_deltas,
T* proposals) { T* proposals) {
CUDA_KERNEL_LOOP(idx, nthreads) { CUDA_1D_KERNEL_LOOP(idx, nthreads) {
const float dx = bbox_deltas[idx]; const float dx = bbox_deltas[idx];
const float dy = bbox_deltas[nthreads + idx]; const float dy = bbox_deltas[nthreads + idx];
const float d_log_w = bbox_deltas[2 * nthreads + idx]; const float d_log_w = bbox_deltas[2 * nthreads + idx];
...@@ -139,10 +141,12 @@ template <> void GenerateProposals_v2<float, CUDAContext>( ...@@ -139,10 +141,12 @@ template <> void GenerateProposals_v2<float, CUDAContext>(
const float min_box_w, const float min_box_w,
const float* scores, const float* scores,
const float* bbox_deltas, const float* bbox_deltas,
float* proposals) { float* proposals,
CUDAContext* ctx) {
_GenerateProposals_v2<float> _GenerateProposals_v2<float>
<< <CUDA_BLOCKS(total_anchors), CUDA_THREADS >> >( << < CUDA_BLOCKS(total_anchors), CUDA_THREADS,
total_anchors, im_h, im_w, min_box_h, min_box_w, 0, ctx->cuda_stream() >> >(total_anchors,
im_h, im_w, min_box_h, min_box_w,
scores, bbox_deltas, proposals); scores, bbox_deltas, proposals);
} }
...@@ -170,7 +174,7 @@ __global__ void nms_mask( ...@@ -170,7 +174,7 @@ __global__ void nms_mask(
const int num_boxes, const int num_boxes,
const T nms_thresh, const T nms_thresh,
const T* boxes, const T* boxes,
unsigned long long* mask) { uint64_t* mask) {
const int i_start = blockIdx.x * NMS_BLOCK_SIZE; const int i_start = blockIdx.x * NMS_BLOCK_SIZE;
const int di_end = min(num_boxes - i_start, NMS_BLOCK_SIZE); const int di_end = min(num_boxes - i_start, NMS_BLOCK_SIZE);
const int j_start = blockIdx.y * NMS_BLOCK_SIZE; const int j_start = blockIdx.y * NMS_BLOCK_SIZE;
...@@ -209,25 +213,30 @@ void _ApplyNMS( ...@@ -209,25 +213,30 @@ void _ApplyNMS(
const float thresh, const float thresh,
const T* boxes, const T* boxes,
int* keep_indices, int* keep_indices,
int& num_keep) { int& num_keep,
CUDAContext* ctx) {
const int num_blocks = DIV_UP(num_boxes, NMS_BLOCK_SIZE); const int num_blocks = DIV_UP(num_boxes, NMS_BLOCK_SIZE);
const dim3 blocks(num_blocks, num_blocks); const dim3 blocks(num_blocks, num_blocks);
size_t mask_nbytes = num_boxes * num_blocks * sizeof(unsigned long long); size_t mask_nbytes = num_boxes * num_blocks * sizeof(uint64_t);
size_t boxes_nbytes = num_boxes * 5 * sizeof(T); size_t boxes_nbytes = num_boxes * 5 * sizeof(T);
void* boxes_dev, *mask_dev; void* boxes_dev, *mask_dev;
CUDA_CHECK(cudaMalloc(&boxes_dev, boxes_nbytes)); CUDA_CHECK(cudaMalloc(&boxes_dev, boxes_nbytes));
CUDA_CHECK(cudaMalloc(&mask_dev, mask_nbytes)); CUDA_CHECK(cudaMalloc(&mask_dev, mask_nbytes));
CUDA_CHECK(cudaMemcpy(boxes_dev, boxes, boxes_nbytes, cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(boxes_dev, boxes,
nms_mask<T> << <blocks, NMS_BLOCK_SIZE >> > ( boxes_nbytes, cudaMemcpyHostToDevice));
num_boxes, thresh, (T*)boxes_dev, (unsigned long long*)mask_dev); nms_mask<T>
<< < blocks, NMS_BLOCK_SIZE,
0, ctx->cuda_stream() >> > (num_boxes,
thresh, (T*)boxes_dev, (uint64_t*)mask_dev);
CUDA_CHECK(cudaPeekAtLastError()); CUDA_CHECK(cudaPeekAtLastError());
std::vector<unsigned long long> mask_host(num_boxes * num_blocks); std::vector<uint64_t> mask_host(num_boxes * num_blocks);
CUDA_CHECK(cudaMemcpy(&mask_host[0], mask_dev, mask_nbytes, cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(&mask_host[0], mask_dev,
mask_nbytes, cudaMemcpyDeviceToHost));
std::vector<unsigned long long> dead_bit(num_blocks); std::vector<uint64_t> dead_bit(num_blocks);
memset(&dead_bit[0], 0, sizeof(unsigned long long) * num_blocks); memset(&dead_bit[0], 0, sizeof(uint64_t) * num_blocks);
int num_selected = 0; int num_selected = 0;
for (int i = 0; i < num_boxes; ++i) { for (int i = 0; i < num_boxes; ++i) {
...@@ -235,7 +244,7 @@ void _ApplyNMS( ...@@ -235,7 +244,7 @@ void _ApplyNMS(
const int inblock = i % NMS_BLOCK_SIZE; const int inblock = i % NMS_BLOCK_SIZE;
if (!(dead_bit[nblock] & (1ULL << inblock))) { if (!(dead_bit[nblock] & (1ULL << inblock))) {
keep_indices[num_selected++] = i; keep_indices[num_selected++] = i;
unsigned long long* mask_i = &mask_host[0] + i * num_blocks; uint64_t* mask_i = &mask_host[0] + i * num_blocks;
for (int j = nblock; j < num_blocks; ++j) dead_bit[j] |= mask_i[j]; for (int j = nblock; j < num_blocks; ++j) dead_bit[j] |= mask_i[j];
if (num_selected == max_keeps) break; if (num_selected == max_keeps) break;
} }
...@@ -251,9 +260,10 @@ template <> void ApplyNMS<float, CUDAContext>( ...@@ -251,9 +260,10 @@ template <> void ApplyNMS<float, CUDAContext>(
const float thresh, const float thresh,
const float* boxes, const float* boxes,
int* keep_indices, int* keep_indices,
int& num_keep) { int& num_keep,
CUDAContext* ctx) {
_ApplyNMS<float>(num_boxes, max_keeps, thresh, _ApplyNMS<float>(num_boxes, max_keeps, thresh,
boxes, keep_indices, num_keep); boxes, keep_indices, num_keep, ctx);
} }
} // namespace rcnn } // namespace rcnn
......
...@@ -126,7 +126,8 @@ void GenerateProposals( ...@@ -126,7 +126,8 @@ void GenerateProposals(
const T* scores, const T* scores,
const T* bbox_deltas, const T* bbox_deltas,
const T* anchors, const T* anchors,
T* proposals); T* proposals,
Context* ctx);
template <typename T, class Context> template <typename T, class Context>
void GenerateProposals_v2( void GenerateProposals_v2(
...@@ -137,7 +138,8 @@ void GenerateProposals_v2( ...@@ -137,7 +138,8 @@ void GenerateProposals_v2(
const float min_box_w, const float min_box_w,
const T* scores, const T* scores,
const T* bbox_deltas, const T* bbox_deltas,
T* proposals); T* proposals,
Context* ctx);
template <typename T> template <typename T>
inline void SortProposals( inline void SortProposals(
...@@ -246,7 +248,8 @@ void ApplyNMS( ...@@ -246,7 +248,8 @@ void ApplyNMS(
const T thresh, const T thresh,
const T* boxes, const T* boxes,
int* keep_indices, int* keep_indices,
int& num_keep); int& num_keep,
Context* ctx);
} // namespace rcnn } // namespace rcnn
......
...@@ -37,7 +37,7 @@ void ProposalOp<Context>::RunWithType() { ...@@ -37,7 +37,7 @@ void ProposalOp<Context>::RunWithType() {
Input(0).template data<T, Context>(), Input(0).template data<T, Context>(),
Input(1).template data<T, Context>(), Input(1).template data<T, Context>(),
anchors_.template mutable_data<T, Context>(), anchors_.template mutable_data<T, Context>(),
proposals_.template mutable_data<T, Context>()); proposals_.template mutable_data<T, Context>(), ctx());
rcnn::SortProposals(0, num_proposals - 1, pre_nms_top_n, rcnn::SortProposals(0, num_proposals - 1, pre_nms_top_n,
proposals_.template mutable_data<T, CPUContext>()); proposals_.template mutable_data<T, CPUContext>());
...@@ -45,7 +45,8 @@ void ProposalOp<Context>::RunWithType() { ...@@ -45,7 +45,8 @@ void ProposalOp<Context>::RunWithType() {
rcnn::ApplyNMS<T, Context>( rcnn::ApplyNMS<T, Context>(
pre_nms_topn, post_nms_top_n, nms_thresh, pre_nms_topn, post_nms_top_n, nms_thresh,
proposals_.template mutable_data<T, Context>(), proposals_.template mutable_data<T, Context>(),
roi_indices_.template mutable_data<int, CPUContext>(), num_rois); roi_indices_.template mutable_data<int, CPUContext>(),
num_rois, ctx());
rcnn::RetrieveRoIs<T>(num_rois, n, rcnn::RetrieveRoIs<T>(num_rois, n,
proposals_.template mutable_data<T, CPUContext>(), proposals_.template mutable_data<T, CPUContext>(),
...@@ -95,14 +96,15 @@ void ProposalOp<Context>::RunWithType() { ...@@ -95,14 +96,15 @@ void ProposalOp<Context>::RunWithType() {
im_height, im_width, min_box_h, min_box_w, im_height, im_width, min_box_h, min_box_w,
Input(-3).template data<T, Context>(), Input(-3).template data<T, Context>(),
Input(-2).template data<T, Context>(), Input(-2).template data<T, Context>(),
proposals_.template mutable_data<T, Context>()); proposals_.template mutable_data<T, Context>(), ctx());
rcnn::SortProposals(0, total_proposals - 1, pre_nms_top_n, rcnn::SortProposals(0, total_proposals - 1, pre_nms_top_n,
proposals_.template mutable_data<T, CPUContext>()); proposals_.template mutable_data<T, CPUContext>());
rcnn::ApplyNMS<T, Context>(pre_nms_topn, post_nms_top_n, nms_thresh, rcnn::ApplyNMS<T, Context>(pre_nms_topn, post_nms_top_n, nms_thresh,
proposals_.template mutable_data<T, Context>(), proposals_.template mutable_data<T, Context>(),
roi_indices_.template mutable_data<int, CPUContext>(), num_rois); roi_indices_.template mutable_data<int, CPUContext>(),
num_rois, ctx());
rcnn::RetrieveRoIs<T>(num_rois, n, rcnn::RetrieveRoIs<T>(num_rois, n,
proposals_.template mutable_data<T, CPUContext>(), proposals_.template mutable_data<T, CPUContext>(),
...@@ -128,7 +130,7 @@ void ProposalOp<Context>::RunWithType() { ...@@ -128,7 +130,7 @@ void ProposalOp<Context>::RunWithType() {
collective_rois.ReshapeLike(*Output(0)); collective_rois.ReshapeLike(*Output(0));
auto* rois = collective_rois.template mutable_data<T, CPUContext>(); auto* rois = collective_rois.template mutable_data<T, CPUContext>();
CPUContext::template Copy<T, CPUContext, CPUContext>( ctx()->template Copy<T, CPUContext, CPUContext>(
collective_rois.count(), rois, collective_rois.count(), rois,
Output(0)->template data<T, CPUContext>()); Output(0)->template data<T, CPUContext>());
...@@ -147,6 +149,8 @@ void ProposalOp<Context>::RunWithType() { ...@@ -147,6 +149,8 @@ void ProposalOp<Context>::RunWithType() {
template <class Context> template <class Context>
void ProposalOp<Context>::RunOnDevice() { void ProposalOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
num_images = Input(0).dim(0); num_images = Input(0).dim(0);
CHECK_EQ(Input(-1).dim(0), num_images) CHECK_EQ(Input(-1).dim(0), num_images)
<< "\nExcepted " << num_images << " groups image info, " << "\nExcepted " << num_images << " groups image info, "
......
...@@ -455,7 +455,10 @@ Graph::Graph(const GraphDef& meta_graph, Workspace* ws) ...@@ -455,7 +455,10 @@ Graph::Graph(const GraphDef& meta_graph, Workspace* ws)
RecomputingAware(optimized_graph, ws); RecomputingAware(optimized_graph, ws);
} }
bool Graph::Run(const string& include, const string& exclude) { bool Graph::Run(
const string& include,
const string& exclude,
const int stream_id) {
LOG(DEBUG) << "Run Graph: " << name(); LOG(DEBUG) << "Run Graph: " << name();
for (auto op : ops_) { for (auto op : ops_) {
if (!include.empty()) if (!include.empty())
...@@ -464,7 +467,7 @@ bool Graph::Run(const string& include, const string& exclude) { ...@@ -464,7 +467,7 @@ bool Graph::Run(const string& include, const string& exclude) {
if (op->type().find(exclude) != string::npos) continue; if (op->type().find(exclude) != string::npos) continue;
op->SwitchToPhase(this->args_["phase"].s()); op->SwitchToPhase(this->args_["phase"].s());
LOG(DEBUG) << "$ Before Operator: " << op->name(); LOG(DEBUG) << "$ Before Operator: " << op->name();
op->Run(); op->Run(stream_id);
LOG(DEBUG) << "$ After Operator: " << op->name(); LOG(DEBUG) << "$ After Operator: " << op->name();
} }
return true; return true;
......
...@@ -8,7 +8,6 @@ void MixedMemory::ToCPU() { ...@@ -8,7 +8,6 @@ void MixedMemory::ToCPU() {
switch (state_) { switch (state_) {
case UNINITIALIZED: case UNINITIALIZED:
cpu_ptr_ = CPUContext::New(nbytes_); cpu_ptr_ = CPUContext::New(nbytes_);
CPUContext::Memset(nbytes_, cpu_ptr_);
state_ = STATE_AT_CPU; state_ = STATE_AT_CPU;
break; break;
case STATE_AT_CUDA: case STATE_AT_CUDA:
...@@ -32,7 +31,6 @@ void MixedMemory::ToCUDA() { ...@@ -32,7 +31,6 @@ void MixedMemory::ToCUDA() {
switch (state_) { switch (state_) {
case UNINITIALIZED: case UNINITIALIZED:
cuda_ptr_ = CUDAContext::New(nbytes_); cuda_ptr_ = CUDAContext::New(nbytes_);
CUDAContext::Memset(nbytes_, cuda_ptr_);
state_ = STATE_AT_CUDA; state_ = STATE_AT_CUDA;
break; break;
case STATE_AT_CPU: case STATE_AT_CPU:
......
...@@ -15,33 +15,35 @@ void CuDNNDropoutOp<Context>::RunWithType() { ...@@ -15,33 +15,35 @@ void CuDNNDropoutOp<Context>::RunWithType() {
float scale = use_scale ? 1.0 / (1.0 - prob()) : 1.0; float scale = use_scale ? 1.0 / (1.0 - prob()) : 1.0;
if (phase() == "TEST") { if (phase() == "TEST") {
if (Output(0) != &Input(0)) { if (Output(0) != &Input(0)) {
ctx().template Copy<T, Context, Context>( ctx()->template Copy<T, Context, Context>(
Output(0)->count(), Ydata, Xdata); Output(0)->count(), Ydata, Xdata);
if (scale == 1.0) if (scale == 1.0)
math::Scal<T, Context>(Output(0)->count(), math::Scal<T, Context>(Output(0)->count(),
1.0 - prob(), Ydata, &ctx()); 1.0 - prob(), Ydata, ctx());
} }
} else if (phase() == "TRAIN") { } else if (phase() == "TRAIN") {
CHECK(use_scale) << "\nCuDNN only supports scale-dropout"; CHECK(use_scale) << "\nCuDNN only supports scale-dropout";
Tensor* mask = ws()->CreateTensor("/mnt/" + anchor() + "/dropout/mask"); Tensor* mask = ws()->CreateTensor(
"/mnt/" + anchor() + "/dropout/mask");
// determine the dropout states // determine the dropout states
if (!states_initialized) { if (!states_initialized) {
states_initialized = true; states_initialized = true;
CUDNN_CHECK(cudnnDropoutGetStatesSize( CUDNN_CHECK(cudnnDropoutGetStatesSize(
ctx().cudnn_handle(), &states_size)); ctx()->cudnn_handle(), &states_size));
std::lock_guard<std::mutex> lk(CUDAContext::mutex()); std::lock_guard<std::mutex> lk(CUDAContext::mutex());
Tensor* states = ws()->CreateTensor("/share/cudnn/dropout:" + Tensor* states = ws()->CreateTensor(
dragon_cast<string, unsigned long long>(random_seed) + "/states"); "/share/cudnn/dropout:" + dragon_cast<string,
unsigned long long>(random_seed) + "/states");
if (states->count() > 0) { if (states->count() > 0) {
auto* Sdata = states->template mutable_data<uint8_t, Context>(); auto* Sdata = states->template mutable_data<uint8_t, Context>();
CUDNN_CHECK(cudnnRestoreDropoutDescriptor( CUDNN_CHECK(cudnnRestoreDropoutDescriptor(
dropout_desc, ctx().cudnn_handle(), prob(), dropout_desc, ctx()->cudnn_handle(), prob(),
Sdata, states_size, random_seed)); Sdata, states_size, random_seed));
} else { } else {
states->Reshape({ (TIndex)states_size }); states->Reshape({ (TIndex)states_size });
auto* Sdata = states->template mutable_data<uint8_t, Context>(); auto* Sdata = states->template mutable_data<uint8_t, Context>();
CUDNN_CHECK(cudnnSetDropoutDescriptor( CUDNN_CHECK(cudnnSetDropoutDescriptor(
dropout_desc, ctx().cudnn_handle(), prob(), dropout_desc, ctx()->cudnn_handle(), prob(),
Sdata, states_size, random_seed)); Sdata, states_size, random_seed));
} }
} }
...@@ -53,7 +55,7 @@ void CuDNNDropoutOp<Context>::RunWithType() { ...@@ -53,7 +55,7 @@ void CuDNNDropoutOp<Context>::RunWithType() {
mask->Reshape({ (TIndex)reserve_space_size }); mask->Reshape({ (TIndex)reserve_space_size });
auto* Rdata = mask->template mutable_data<uint8_t, Context>(); auto* Rdata = mask->template mutable_data<uint8_t, Context>();
CUDNN_CHECK(cudnnDropoutForward( CUDNN_CHECK(cudnnDropoutForward(
ctx().cudnn_handle(), dropout_desc, ctx()->cudnn_handle(), dropout_desc,
input_desc, Xdata, input_desc, Xdata,
input_desc, Ydata, input_desc, Ydata,
Rdata, reserve_space_size)); Rdata, reserve_space_size));
...@@ -65,7 +67,9 @@ void CuDNNDropoutOp<Context>::RunOnDevice() { ...@@ -65,7 +67,9 @@ void CuDNNDropoutOp<Context>::RunOnDevice() {
Output(0)->ReshapeLike(Input(0)); Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) RunWithType<float>(); if (XIsType(Input(0), float)) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (XIsType(Input(0), float16)) RunWithType<float16>(); else if (XIsType(Input(0), float16)) RunWithType<float16>();
#endif
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" }); else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
} }
...@@ -76,19 +80,21 @@ void CuDNNDropoutGradientOp<Context>::RunWithType() { ...@@ -76,19 +80,21 @@ void CuDNNDropoutGradientOp<Context>::RunWithType() {
if (phase() == "TEST") { NOT_IMPLEMENTED; } if (phase() == "TEST") { NOT_IMPLEMENTED; }
else if (phase() == "TRAIN") { else if (phase() == "TRAIN") {
CHECK(use_scale) << "\nCuDNN only supports scale-dropout"; CHECK(use_scale) << "\nCuDNN only supports scale-dropout";
Tensor* mask = ws()->GetTensor("/mnt/" + anchor() + "/dropout/mask"); Tensor* mask = ws()->GetTensor(
"/mnt/" + anchor() + "/dropout/mask");
// determine the dropout states // determine the dropout states
if (!states_initialized) { if (!states_initialized) {
states_initialized = true; states_initialized = true;
CUDNN_CHECK(cudnnDropoutGetStatesSize( CUDNN_CHECK(cudnnDropoutGetStatesSize(
ctx().cudnn_handle(), &states_size)); ctx()->cudnn_handle(), &states_size));
std::lock_guard<std::mutex> lk(CUDAContext::mutex()); std::lock_guard<std::mutex> lk(CUDAContext::mutex());
Tensor* states = ws()->CreateTensor("/share/cudnn/dropout:" + Tensor* states = ws()->CreateTensor(
dragon_cast<string, unsigned long long>(random_seed) + "/states"); "/share/cudnn/dropout:" + dragon_cast<string,
unsigned long long>(random_seed) + "/states");
if (states->count() > 0) { if (states->count() > 0) {
auto* Sdata = states->template mutable_data<uint8_t, Context>(); auto* Sdata = states->template mutable_data<uint8_t, Context>();
CUDNN_CHECK(cudnnRestoreDropoutDescriptor( CUDNN_CHECK(cudnnRestoreDropoutDescriptor(
dropout_desc, ctx().cudnn_handle(), prob(), dropout_desc, ctx()->cudnn_handle(), prob(),
Sdata, states_size, random_seed)); Sdata, states_size, random_seed));
} else { LOG(FATAL) << "Missing states with seed: " << random_seed; } } else { LOG(FATAL) << "Missing states with seed: " << random_seed; }
} }
...@@ -101,7 +107,7 @@ void CuDNNDropoutGradientOp<Context>::RunWithType() { ...@@ -101,7 +107,7 @@ void CuDNNDropoutGradientOp<Context>::RunWithType() {
input_desc, &reserve_space_size)); input_desc, &reserve_space_size));
auto* Rdata = mask->template mutable_data<uint8_t, Context>(); auto* Rdata = mask->template mutable_data<uint8_t, Context>();
CUDNN_CHECK(cudnnDropoutBackward( CUDNN_CHECK(cudnnDropoutBackward(
ctx().cudnn_handle(), dropout_desc, ctx()->cudnn_handle(), dropout_desc,
input_desc, dYdata, input_desc, dYdata,
input_desc, dXdata, input_desc, dXdata,
Rdata, reserve_space_size)); Rdata, reserve_space_size));
...@@ -113,7 +119,9 @@ void CuDNNDropoutGradientOp<Context>::RunOnDevice() { ...@@ -113,7 +119,9 @@ void CuDNNDropoutGradientOp<Context>::RunOnDevice() {
Output(0)->ReshapeLike(Input(0)); Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) RunWithType<float>(); if (XIsType(Input(0), float)) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (XIsType(Input(0), float16)) RunWithType<float16>(); else if (XIsType(Input(0), float16)) RunWithType<float16>();
#endif
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" }); else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
} }
......
...@@ -14,7 +14,7 @@ void CuDNNEluOp<Context>::RunWithType() { ...@@ -14,7 +14,7 @@ void CuDNNEluOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnActivationForward( CUDNN_CHECK(cudnnActivationForward(
ctx().cudnn_handle(), act_desc, ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Xdata, CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::zero, output_desc, Ydata)); CUDNNType<T>::zero, output_desc, Ydata));
} }
...@@ -41,7 +41,7 @@ void CuDNNEluGradientOp<Context>::RunWithType() { ...@@ -41,7 +41,7 @@ void CuDNNEluGradientOp<Context>::RunWithType() {
auto* dXdata = Output(0)->template mutable_data<T, Context>(); auto* dXdata = Output(0)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnActivationBackward( CUDNN_CHECK(cudnnActivationBackward(
ctx().cudnn_handle(), act_desc, ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Ydata, CUDNNType<T>::one, input_desc, Ydata,
input_desc, dYdata, output_desc, Ydata, input_desc, dYdata, output_desc, Ydata,
CUDNNType<T>::zero, output_desc, dXdata)); CUDNNType<T>::zero, output_desc, dXdata));
......
...@@ -13,7 +13,7 @@ void CuDNNReluOp<Context>::RunWithType() { ...@@ -13,7 +13,7 @@ void CuDNNReluOp<Context>::RunWithType() {
#if CUDNN_VERSION_MIN(5, 0, 0) #if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationForward( CUDNN_CHECK(cudnnActivationForward(
ctx().cudnn_handle(), act_desc, ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Xdata, CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::zero, output_desc, Ydata)); CUDNNType<T>::zero, output_desc, Ydata));
#else #else
...@@ -49,7 +49,7 @@ void CuDNNReluGradientOp<Context>::RunWithType() { ...@@ -49,7 +49,7 @@ void CuDNNReluGradientOp<Context>::RunWithType() {
#if CUDNN_VERSION_MIN(5, 0, 0) #if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationBackward( CUDNN_CHECK(cudnnActivationBackward(
ctx().cudnn_handle(), act_desc, ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Ydata, CUDNNType<T>::one, input_desc, Ydata,
input_desc, dYdata, output_desc, Ydata, input_desc, dYdata, output_desc, Ydata,
CUDNNType<T>::zero, output_desc, dXdata)); CUDNNType<T>::zero, output_desc, dXdata));
......
...@@ -13,12 +13,12 @@ void CuDNNSigmoidOp<Context>::RunWithType() { ...@@ -13,12 +13,12 @@ void CuDNNSigmoidOp<Context>::RunWithType() {
#if CUDNN_VERSION_MIN(5, 0, 0) #if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationForward( CUDNN_CHECK(cudnnActivationForward(
ctx().cudnn_handle(), act_desc, ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Xdata, CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::zero, output_desc, Ydata)); CUDNNType<T>::zero, output_desc, Ydata));
#else #else
CUDNN_CHECK(cudnnActivationForward_v4( CUDNN_CHECK(cudnnActivationForward_v4(
ctx().cudnn_handle(), act_desc, ctx()->cudnn_handle(), act_desc,
CUDNNType<Dtype>::one, input_desc, Xdata, CUDNNType<Dtype>::one, input_desc, Xdata,
CUDNNType<Dtype>::zero, output_desc, Ydata)); CUDNNType<Dtype>::zero, output_desc, Ydata));
#endif #endif
...@@ -47,13 +47,13 @@ void CuDNNSigmoidGradientOp<Context>::RunWithType() { ...@@ -47,13 +47,13 @@ void CuDNNSigmoidGradientOp<Context>::RunWithType() {
#if CUDNN_VERSION_MIN(5, 0, 0) #if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationBackward( CUDNN_CHECK(cudnnActivationBackward(
ctx().cudnn_handle(), act_desc, ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Ydata, CUDNNType<T>::one, input_desc, Ydata,
input_desc, dYdata, output_desc, Ydata, input_desc, dYdata, output_desc, Ydata,
CUDNNType<T>::zero, output_desc, dXdata)); CUDNNType<T>::zero, output_desc, dXdata));
#else #else
CUDNN_CHECK(cudnnActivationBackward_v4( CUDNN_CHECK(cudnnActivationBackward_v4(
ctx().cudnn_handle(), act_desc, ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Ydata, CUDNNType<T>::one, input_desc, Ydata,
input_desc, dYdata, output_desc, Ydata, input_desc, dYdata, output_desc, Ydata,
CUDNNType<T>::zero, output_desc, dXdata)); CUDNNType<T>::zero, output_desc, dXdata));
......
...@@ -7,8 +7,7 @@ namespace dragon { ...@@ -7,8 +7,7 @@ namespace dragon {
template <class Context> template <typename T> template <class Context> template <typename T>
void CuDNNSoftmaxOp<Context>::RunWithType() { void CuDNNSoftmaxOp<Context>::RunWithType() {
Tensor fake_tensor(vector<TIndex>( Tensor fake_tensor(vector<TIndex>(
{ outer_dim, Input(0).dim(axis), inner_dim }) { outer_dim, Input(0).dim(axis), inner_dim }));
);
cudnnSetTensorDesc<T>(&input_desc, &fake_tensor); cudnnSetTensorDesc<T>(&input_desc, &fake_tensor);
cudnnSetTensorDesc<T>(&output_desc, &fake_tensor); cudnnSetTensorDesc<T>(&output_desc, &fake_tensor);
...@@ -16,7 +15,7 @@ void CuDNNSoftmaxOp<Context>::RunWithType() { ...@@ -16,7 +15,7 @@ void CuDNNSoftmaxOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnSoftmaxForward( CUDNN_CHECK(cudnnSoftmaxForward(
ctx().cudnn_handle(), ctx()->cudnn_handle(),
CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL,
CUDNNType<T>::one, input_desc, Xdata, CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::zero, output_desc, Ydata)); CUDNNType<T>::zero, output_desc, Ydata));
...@@ -41,8 +40,7 @@ DEPLOY_CUDNN(Softmax); ...@@ -41,8 +40,7 @@ DEPLOY_CUDNN(Softmax);
template <class Context> template <typename T> template <class Context> template <typename T>
void CuDNNSoftmaxGradientOp<Context>::RunWithType() { void CuDNNSoftmaxGradientOp<Context>::RunWithType() {
Tensor fake_tensor(vector<TIndex>( Tensor fake_tensor(vector<TIndex>(
{ outer_dim, Input(0).dim(axis), inner_dim }) { outer_dim, Input(0).dim(axis), inner_dim }));
);
cudnnSetTensorDesc<T>(&input_desc, &fake_tensor); cudnnSetTensorDesc<T>(&input_desc, &fake_tensor);
cudnnSetTensorDesc<T>(&output_desc, &fake_tensor); cudnnSetTensorDesc<T>(&output_desc, &fake_tensor);
...@@ -50,7 +48,7 @@ void CuDNNSoftmaxGradientOp<Context>::RunWithType() { ...@@ -50,7 +48,7 @@ void CuDNNSoftmaxGradientOp<Context>::RunWithType() {
auto* Ydata = Input(0).template data<T, Context>(); auto* Ydata = Input(0).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>(); auto* dXdata = Output(0)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnSoftmaxBackward( CUDNN_CHECK(cudnnSoftmaxBackward(
ctx().cudnn_handle(), ctx()->cudnn_handle(),
CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL,
CUDNNType<T>::one, input_desc, Ydata, input_desc, dYdata, CUDNNType<T>::one, input_desc, Ydata, input_desc, dYdata,
CUDNNType<T>::zero, output_desc, dXdata)); CUDNNType<T>::zero, output_desc, dXdata));
......
...@@ -13,12 +13,12 @@ void CuDNNTanhOp<Context>::RunWithType() { ...@@ -13,12 +13,12 @@ void CuDNNTanhOp<Context>::RunWithType() {
#if CUDNN_VERSION_MIN(5, 0, 0) #if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationForward( CUDNN_CHECK(cudnnActivationForward(
ctx().cudnn_handle(), act_desc, ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Xdata, CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::zero, output_desc, Ydata)); CUDNNType<T>::zero, output_desc, Ydata));
#else #else
CUDNN_CHECK(cudnnActivationForward_v4( CUDNN_CHECK(cudnnActivationForward_v4(
ctx().cudnn_handle(), act_desc, ctx()->cudnn_handle(), act_desc,
CUDNNType<Dtype>::one, input_desc, Xdata, CUDNNType<Dtype>::one, input_desc, Xdata,
CUDNNType<Dtype>::zero, output_desc, Ydata)); CUDNNType<Dtype>::zero, output_desc, Ydata));
#endif #endif
...@@ -47,13 +47,13 @@ void CuDNNTanhGradientOp<Context>::RunWithType() { ...@@ -47,13 +47,13 @@ void CuDNNTanhGradientOp<Context>::RunWithType() {
#if CUDNN_VERSION_MIN(5, 0, 0) #if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationBackward( CUDNN_CHECK(cudnnActivationBackward(
ctx().cudnn_handle(), act_desc, ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Ydata, CUDNNType<T>::one, input_desc, Ydata,
input_desc, dYdata, output_desc, Ydata, input_desc, dYdata, output_desc, Ydata,
CUDNNType<T>::zero, output_desc, dXdata)); CUDNNType<T>::zero, output_desc, dXdata));
#else #else
CUDNN_CHECK(cudnnActivationBackward_v4( CUDNN_CHECK(cudnnActivationBackward_v4(
ctx().cudnn_handle(), act_desc, ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Ydata, CUDNNType<T>::one, input_desc, Ydata,
input_desc, dYdata, output_desc, Ydata, input_desc, dYdata, output_desc, Ydata,
CUDNNType<T>::zero, output_desc, dXdata)); CUDNNType<T>::zero, output_desc, dXdata));
......
...@@ -11,10 +11,10 @@ void DropoutOp<Context>::RunWithType() { ...@@ -11,10 +11,10 @@ void DropoutOp<Context>::RunWithType() {
float scale = use_scale ? 1.0 / (1.0 - prob()) : 1.0; float scale = use_scale ? 1.0 / (1.0 - prob()) : 1.0;
if (phase() == "TEST") { if (phase() == "TEST") {
if (Output(0) != &Input(0)) { if (Output(0) != &Input(0)) {
ctx().template Copy<T, Context, Context>( ctx()->template Copy<T, Context, Context>(
Output(0)->count(), Ydata, Xdata); Output(0)->count(), Ydata, Xdata);
if (scale == 1.0) math::Scal<T, Context>( if (scale == 1.0) math::Scal<T, Context>(
Output(0)->count(), 1.0 - prob(), Ydata, &ctx()); Output(0)->count(), 1.0 - prob(), Ydata, ctx());
} }
} else if (phase() == "TRAIN") { } else if (phase() == "TRAIN") {
Tensor* mask = ws()->CreateTensor( Tensor* mask = ws()->CreateTensor(
...@@ -23,7 +23,7 @@ void DropoutOp<Context>::RunWithType() { ...@@ -23,7 +23,7 @@ void DropoutOp<Context>::RunWithType() {
uint32_t* Mdata = mask->template mutable_data<uint32_t, Context>(); uint32_t* Mdata = mask->template mutable_data<uint32_t, Context>();
kernel::Dropout<T, Context>( kernel::Dropout<T, Context>(
Output(0)->count(), prob(), scale, Output(0)->count(), prob(), scale,
Xdata, Mdata, Ydata, &ctx()); Xdata, Mdata, Ydata, ctx());
} else LOG(FATAL) << "Incorrect Op phase: " << phase(); } else LOG(FATAL) << "Incorrect Op phase: " << phase();
} }
...@@ -52,7 +52,8 @@ void DropoutGradientOp<Context>::RunWithType() { ...@@ -52,7 +52,8 @@ void DropoutGradientOp<Context>::RunWithType() {
else if (phase() == "TRAIN") { else if (phase() == "TRAIN") {
kernel::DropoutGrad<T, Context>( kernel::DropoutGrad<T, Context>(
Output(0)->count(), prob(), scale, Output(0)->count(), prob(), scale,
dYdata, Mdata, dXdata, &ctx()); dYdata, Mdata, dXdata, ctx());
ctx()->FinishDeviceCompution();
mask->Reset(); mask->Reset();
} else LOG(FATAL) << "Incorrect Op phase: " << phase(); } else LOG(FATAL) << "Incorrect Op phase: " << phase();
} }
......
...@@ -8,7 +8,8 @@ template <class Context> template <typename T> ...@@ -8,7 +8,8 @@ template <class Context> template <typename T>
void EluOp<Context>::RunWithType() { void EluOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>(); auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::Elu<T, Context>(Output(0)->count(), alpha, Xdata, Ydata); kernel::Elu<T, Context>(Output(0)->count(),
alpha, Xdata, Ydata, ctx());
} }
template <class Context> template <class Context>
...@@ -30,8 +31,8 @@ void EluGradientOp<Context>::RunWithType() { ...@@ -30,8 +31,8 @@ void EluGradientOp<Context>::RunWithType() {
auto* Ydata = Input(0).template data<T, Context>(); auto* Ydata = Input(0).template data<T, Context>();
auto* dYdata = Input(1).template data<T, Context>(); auto* dYdata = Input(1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>(); auto* dXdata = Output(0)->template mutable_data<T, Context>();
kernel::EluGrad<T, Context>( kernel::EluGrad<T, Context>(Output(0)->count(),
Output(0)->count(), alpha, dYdata, Ydata, dXdata); alpha, dYdata, Ydata, dXdata, ctx());
} }
template <class Context> template <class Context>
......
...@@ -18,7 +18,7 @@ void PReluOp<Context>::RunWithType() { ...@@ -18,7 +18,7 @@ void PReluOp<Context>::RunWithType() {
kernel::PRelu<T, Context>( kernel::PRelu<T, Context>(
Output(0)->count(), channels, dim, Output(0)->count(), channels, dim,
channel_shared ? true : false, data_format, channel_shared ? true : false, data_format,
Xdata, Wdata, Ydata); Xdata, Wdata, Ydata, ctx());
} }
template <class Context> template <class Context>
...@@ -49,12 +49,12 @@ void PReluGradientOp<Context>::RunWithType() { ...@@ -49,12 +49,12 @@ void PReluGradientOp<Context>::RunWithType() {
if (Output(1)->name() != "ignore") { if (Output(1)->name() != "ignore") {
DECLARE_MULTIPLIER(multiplier, channels * dim); DECLARE_MULTIPLIER(multiplier, channels * dim);
auto* dWdata = Output(1)->template mutable_data<T, Context>(); auto* dWdata = Output(1)->template mutable_data<T, Context>(ctx());
auto* dWBdata = ws()->template caches<T, Context>({ channels * dim })[0]; auto* dWBdata = ws()->template caches<T, Context>({ channels * dim })[0];
kernel::PReluWGrad<T, Context>( kernel::PReluWGrad<T, Context>(
Input(0).dim(0), Input(0).count(1), channels, dim, Input(0).dim(0), Input(0).count(1), channels, dim,
channel_shared ? true : false, data_format, channel_shared ? true : false, data_format,
dYdata, Xdata, multiplier, dWBdata, dWdata, &ctx()); dYdata, Xdata, multiplier, dWBdata, dWdata, ctx());
} }
if (Output(0)->name() != "ignore") { if (Output(0)->name() != "ignore") {
...@@ -63,7 +63,7 @@ void PReluGradientOp<Context>::RunWithType() { ...@@ -63,7 +63,7 @@ void PReluGradientOp<Context>::RunWithType() {
kernel::PReluGrad<T, Context>( kernel::PReluGrad<T, Context>(
Output(0)->count(), channels, dim, Output(0)->count(), channels, dim,
channel_shared ? true : false, data_format, channel_shared ? true : false, data_format,
dYdata, Xdata, Wdata, dXdata); dYdata, Xdata, Wdata, dXdata, ctx());
} }
} }
......
...@@ -8,7 +8,8 @@ template <class Context> template <typename T> ...@@ -8,7 +8,8 @@ template <class Context> template <typename T>
void ReluOp<Context>::RunWithType() { void ReluOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>(); auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::Relu<T, Context>(Output(0)->count(), slope, Xdata, Ydata); kernel::Relu<T, Context>(Output(0)->count(),
slope, Xdata, Ydata, ctx());
} }
template <class Context> template <class Context>
...@@ -24,15 +25,17 @@ DEPLOY_CPU(Relu); ...@@ -24,15 +25,17 @@ DEPLOY_CPU(Relu);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(Relu); DEPLOY_CUDA(Relu);
#endif #endif
OPERATOR_SCHEMA(Relu).NumInputs(1).NumOutputs(1).Inplace({ { 0, 0 } }); OPERATOR_SCHEMA(Relu)
.NumInputs(1).NumOutputs(1)
.Inplace({ { 0, 0 } });
template <class Context> template <typename T> template <class Context> template <typename T>
void ReluGradientOp<Context>::RunWithType() { void ReluGradientOp<Context>::RunWithType() {
auto* Ydata = Input(0).template data<T, Context>(); auto* Ydata = Input(0).template data<T, Context>();
auto* dYdata = Input(1).template data<T, Context>(); auto* dYdata = Input(1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>(); auto* dXdata = Output(0)->template mutable_data<T, Context>();
kernel::ReluGrad<T, Context>( kernel::ReluGrad<T, Context>(Output(0)->count(),
Output(0)->count(), slope, dYdata, Ydata, dXdata); slope, dYdata, Ydata, dXdata, ctx());
} }
template <class Context> template <class Context>
...@@ -47,7 +50,9 @@ DEPLOY_CPU(ReluGradient); ...@@ -47,7 +50,9 @@ DEPLOY_CPU(ReluGradient);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(ReluGradient); DEPLOY_CUDA(ReluGradient);
#endif #endif
OPERATOR_SCHEMA(ReluGradient).NumInputs(2).NumOutputs(1).Inplace({ { 1, 0 }}); OPERATOR_SCHEMA(ReluGradient)
.NumInputs(2).NumOutputs(1)
.Inplace({ { 1, 0 }});
class GetReluGradient final : public GradientMakerBase { class GetReluGradient final : public GradientMakerBase {
public: public:
......
...@@ -8,7 +8,7 @@ template <class Context> template <typename T> ...@@ -8,7 +8,7 @@ template <class Context> template <typename T>
void SEluOp<Context>::RunWithType() { void SEluOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>(); auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::SElu<T, Context>(Output(0)->count(), Xdata, Ydata); kernel::SElu<T, Context>(Output(0)->count(), Xdata, Ydata, ctx());
} }
template <class Context> template <class Context>
...@@ -23,15 +23,17 @@ DEPLOY_CPU(SElu); ...@@ -23,15 +23,17 @@ DEPLOY_CPU(SElu);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(SElu); DEPLOY_CUDA(SElu);
#endif #endif
OPERATOR_SCHEMA(SElu).NumInputs(1).NumOutputs(1).Inplace({ { 0, 0 } }); OPERATOR_SCHEMA(SElu)
.NumInputs(1).NumOutputs(1)
.Inplace({ { 0, 0 } });
template <class Context> template <typename T> template <class Context> template <typename T>
void SEluGradientOp<Context>::RunWithType() { void SEluGradientOp<Context>::RunWithType() {
auto* Ydata = Input(0).template data<T, Context>(); auto* Ydata = Input(0).template data<T, Context>();
auto* dYdata = Input(1).template data<T, Context>(); auto* dYdata = Input(1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>(); auto* dXdata = Output(0)->template mutable_data<T, Context>();
kernel::SEluGrad<T, Context>( kernel::SEluGrad<T, Context>(Output(0)->count(),
Output(0)->count(), dYdata, Ydata, dXdata); dYdata, Ydata, dXdata, ctx());
} }
template <class Context> template <class Context>
...@@ -46,7 +48,9 @@ DEPLOY_CPU(SEluGradient); ...@@ -46,7 +48,9 @@ DEPLOY_CPU(SEluGradient);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(SEluGradient); DEPLOY_CUDA(SEluGradient);
#endif #endif
OPERATOR_SCHEMA(SEluGradient).NumInputs(2).NumOutputs(1).Inplace({ { 1, 0 }}); OPERATOR_SCHEMA(SEluGradient)
.NumInputs(2).NumOutputs(1)
.Inplace({ { 1, 0 }});
class GetSEluGradient final : public GradientMakerBase { class GetSEluGradient final : public GradientMakerBase {
public: public:
......
...@@ -8,7 +8,7 @@ template <class Context> template <typename T> ...@@ -8,7 +8,7 @@ template <class Context> template <typename T>
void SigmoidOp<Context>::RunWithType() { void SigmoidOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>(); auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::Sigmoid<T, Context>(Output(0)->count(), Xdata, Ydata); kernel::Sigmoid<T, Context>(Output(0)->count(), Xdata, Ydata, ctx());
} }
template <class Context> template <class Context>
...@@ -30,8 +30,8 @@ void SigmoidGradientOp<Context>::RunWithType() { ...@@ -30,8 +30,8 @@ void SigmoidGradientOp<Context>::RunWithType() {
auto* Ydata = Input(0).template data<T, Context>(); auto* Ydata = Input(0).template data<T, Context>();
auto* dYdata = Input(1).template data<T, Context>(); auto* dYdata = Input(1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>(); auto* dXdata = Output(0)->template mutable_data<T, Context>();
kernel::SigmoidGrad<T, Context>( kernel::SigmoidGrad<T, Context>(Output(0)->count(),
Output(0)->count(), dYdata, Ydata, dXdata); dYdata, Ydata, dXdata, ctx());
} }
template <class Context> template <class Context>
......
...@@ -12,13 +12,13 @@ void SoftmaxOp<Context>::RunWithType() { ...@@ -12,13 +12,13 @@ void SoftmaxOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
auto* WSdata = ws()->template caches<T, Context>({ Input(0).count() })[0]; auto* WSdata = ws()->template caches<T, Context>({ Input(0).count() })[0];
ctx().template Copy<T, Context, Context>( ctx()->template Copy<T, Context, Context>(
Input(0).count(), Ydata, Xdata); Input(0).count(), Ydata, Xdata);
kernel::Softmax<T, Context>( kernel::Softmax<T, Context>(
Output(0)->count(), Input(0).dim(axis), Output(0)->count(), Input(0).dim(axis),
outer_dim, inner_dim, multiplier, outer_dim, inner_dim, multiplier,
Xdata, WSdata, Ydata, &ctx()); Xdata, WSdata, Ydata, ctx());
} }
template <class Context> template <class Context>
...@@ -36,7 +36,9 @@ DEPLOY_CPU(Softmax); ...@@ -36,7 +36,9 @@ DEPLOY_CPU(Softmax);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(Softmax); DEPLOY_CUDA(Softmax);
#endif #endif
OPERATOR_SCHEMA(Softmax).NumInputs(1).NumOutputs(1).Inplace({ { 0, 0 } }); OPERATOR_SCHEMA(Softmax)
.NumInputs(1).NumOutputs(1)
.Inplace({ { 0, 0 } });
template <class Context> template <typename T> template <class Context> template <typename T>
void SoftmaxGradientOp<Context>::RunWithType() { void SoftmaxGradientOp<Context>::RunWithType() {
...@@ -44,15 +46,16 @@ void SoftmaxGradientOp<Context>::RunWithType() { ...@@ -44,15 +46,16 @@ void SoftmaxGradientOp<Context>::RunWithType() {
auto* dYdata = Input(-1).template data<T, Context>(); auto* dYdata = Input(-1).template data<T, Context>();
auto* Ydata = Input(0).template data<T, Context>(); auto* Ydata = Input(0).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>(); auto* dXdata = Output(0)->template mutable_data<T, Context>();
auto* WSdata = ws()->template caches<T, Context>({ Input(0).count() })[0]; auto* WSdata = ws()->template caches<T, Context>(
{ Input(0).count() })[0];
ctx().template Copy<T, Context, Context>( ctx()->template Copy<T, Context, Context>(
Input(0).count(), dXdata, dYdata); Input(0).count(), dXdata, dYdata);
kernel::SoftmaxGrad<T, Context>( kernel::SoftmaxGrad<T, Context>(
Output(0)->count(), Input(0).dim(axis), Output(0)->count(), Input(0).dim(axis),
outer_dim, inner_dim, multiplier, outer_dim, inner_dim, multiplier,
dYdata, Ydata, WSdata, dXdata, &ctx()); dYdata, Ydata, WSdata, dXdata, ctx());
} }
template <class Context> template <class Context>
...@@ -70,7 +73,9 @@ DEPLOY_CPU(SoftmaxGradient); ...@@ -70,7 +73,9 @@ DEPLOY_CPU(SoftmaxGradient);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(SoftmaxGradient); DEPLOY_CUDA(SoftmaxGradient);
#endif #endif
OPERATOR_SCHEMA(SoftmaxGradient).NumInputs(2).NumOutputs(1).Inplace({ { 1, 0 } }); OPERATOR_SCHEMA(SoftmaxGradient)
.NumInputs(2).NumOutputs(1)
.Inplace({ { 1, 0 } });
class GetSoftmaxGradient final : public GradientMakerBase { class GetSoftmaxGradient final : public GradientMakerBase {
public: public:
......
...@@ -8,7 +8,7 @@ template <class Context> template <typename T> ...@@ -8,7 +8,7 @@ template <class Context> template <typename T>
void TanhOp<Context>::RunWithType() { void TanhOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>(); auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::Tanh<T, Context>(Output(0)->count(), Xdata, Ydata); kernel::Tanh<T, Context>(Output(0)->count(), Xdata, Ydata, ctx());
} }
template <class Context> template <class Context>
...@@ -30,8 +30,8 @@ void TanhGradientOp<Context>::RunWithType() { ...@@ -30,8 +30,8 @@ void TanhGradientOp<Context>::RunWithType() {
auto* Ydata = Input(0).template data<T, Context>(); auto* Ydata = Input(0).template data<T, Context>();
auto* dYdata = Input(1).template data<T, Context>(); auto* dYdata = Input(1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>(); auto* dXdata = Output(0)->template mutable_data<T, Context>();
kernel::TanhGrad<T, Context>( kernel::TanhGrad<T, Context>(Output(0)->count(),
Output(0)->count(), dYdata, Ydata, dXdata); dYdata, Ydata, dXdata, ctx());
} }
template <class Context> template <class Context>
......
...@@ -9,7 +9,7 @@ void AddOp<Context>::EltwiseRunWithType() { ...@@ -9,7 +9,7 @@ void AddOp<Context>::EltwiseRunWithType() {
auto* x1 = Input(0).template data<T, Context>(); auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>(); auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>(); auto* y = Output(0)->template mutable_data<T, Context>();
math::Add<T, Context>(Output(0)->count(), x1, x2, y); math::Add<T, Context>(Output(0)->count(), x1, x2, y, ctx());
} }
template <class Context> template <typename T> template <class Context> template <typename T>
...@@ -19,23 +19,24 @@ void AddOp<Context>::BroadcastRunWithType(int type) { ...@@ -19,23 +19,24 @@ void AddOp<Context>::BroadcastRunWithType(int type) {
auto* x2 = Input(1).template data<T, Context>(); auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>(); auto* y = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>( ctx()->template Copy<T, Context, Context>(
Output(0)->count(), y, x1); Output(0)->count(), y, x1);
if (type == 0 || type == 1) { if (type == 0 || type == 1) {
if (type == 0) { if (type == 0) {
outer_dim = Input(0).count(); x2 = Input(1).template data<T, CPUContext>();
inner_dim = 1; math::AddScalar<T, Context>(Output(0)->count(),
dragon_cast<float, T>(x2[0]), y, ctx());
} else { } else {
outer_dim = Input(0).count(0, Input(0).axis(-1)); outer_dim = Input(0).count(0, Input(0).axis(-1));
inner_dim = Input(0).dim(-1); inner_dim = Input(0).dim(-1);
}
DECLARE_MULTIPLIER(multiplier, outer_dim); DECLARE_MULTIPLIER(multiplier, outer_dim);
math::Gemm<T, Context>( math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1, outer_dim, inner_dim, 1,
1.0, multiplier, x2, 1.0, multiplier, x2,
1.0, y, &ctx()); 1.0, y, ctx());
}
} else if (type == 2) { } else if (type == 2) {
outer_dim = Input(0).dim(0); outer_dim = Input(0).dim(0);
inner_dim = Input(0).count(1); inner_dim = Input(0).count(1);
...@@ -44,7 +45,7 @@ void AddOp<Context>::BroadcastRunWithType(int type) { ...@@ -44,7 +45,7 @@ void AddOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1, outer_dim, inner_dim, 1,
1.0, x2, multiplier, 1.0, x2, multiplier,
1.0, y, &ctx()); 1.0, y, ctx());
} }
} }
...@@ -77,13 +78,13 @@ void AddGradientOp<Context>::EltwiseRunWithType() { ...@@ -77,13 +78,13 @@ void AddGradientOp<Context>::EltwiseRunWithType() {
if (Output(1)->name() != "ignore") { if (Output(1)->name() != "ignore") {
auto* dx2 = Output(1)->template mutable_data<T, Context>(); auto* dx2 = Output(1)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>( ctx()->template Copy<T, Context, Context>(
Output(1)->count(), dx2, dy); Output(1)->count(), dx2, dy);
} }
if (Output(0)->name() != "ignore") { if (Output(0)->name() != "ignore") {
auto* dx1 = Output(0)->template mutable_data<T, Context>(); auto* dx1 = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>( ctx()->template Copy<T, Context, Context>(
Output(0)->count(), dx1, dy); Output(0)->count(), dx1, dy);
} }
} }
...@@ -108,7 +109,7 @@ void AddGradientOp<Context>::BroadcastRunWithType(int type) { ...@@ -108,7 +109,7 @@ void AddGradientOp<Context>::BroadcastRunWithType(int type) {
math::Gemv<T, Context>( math::Gemv<T, Context>(
CblasTrans, outer_dim, inner_dim, CblasTrans, outer_dim, inner_dim,
1.0, dy, multiplier, 1.0, dy, multiplier,
0.0, dx2, &ctx()); 0.0, dx2, ctx());
} else if (type == 2) { } else if (type == 2) {
outer_dim = X1->dim(0); outer_dim = X1->dim(0);
inner_dim = X1->count(1); inner_dim = X1->count(1);
...@@ -116,13 +117,13 @@ void AddGradientOp<Context>::BroadcastRunWithType(int type) { ...@@ -116,13 +117,13 @@ void AddGradientOp<Context>::BroadcastRunWithType(int type) {
math::Gemv<T, Context>( math::Gemv<T, Context>(
CblasNoTrans, outer_dim, inner_dim, CblasNoTrans, outer_dim, inner_dim,
1.0, dy, multiplier, 1.0, dy, multiplier,
0.0, dx2, &ctx()); 0.0, dx2, ctx());
} }
} }
if (Output(0)->name() != "ignore") { if (Output(0)->name() != "ignore") {
auto* dx1 = Output(0)->template mutable_data<T, Context>(); auto* dx1 = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>( ctx()->template Copy<T, Context, Context>(
X1->count(), dx1, dy); X1->count(), dx1, dy);
} }
} }
......
...@@ -34,7 +34,7 @@ void AffineOp<Context>::RunWithType() { ...@@ -34,7 +34,7 @@ void AffineOp<Context>::RunWithType() {
kernel::Affine<T, Context>( kernel::Affine<T, Context>(
Output(0)->count(), outer_dim, scale_dim, inner_dim, Output(0)->count(), outer_dim, scale_dim, inner_dim,
Xdata, Adata, Bdata, bias_multiplier, Ydata, &ctx()); Xdata, Adata, Bdata, bias_multiplier, Ydata, ctx());
} }
template <class Context> template <class Context>
...@@ -58,13 +58,13 @@ void AffineGradientOp<Context>::BiasRunWithType() { ...@@ -58,13 +58,13 @@ void AffineGradientOp<Context>::BiasRunWithType() {
DECLARE_MULTIPLIER(multiplier, inner_dim); DECLARE_MULTIPLIER(multiplier, inner_dim);
auto* dYdata = Input(-1).template data<T, Context>(); auto* dYdata = Input(-1).template data<T, Context>();
auto* dBias = Output(2)->template mutable_data<T, Context>(); auto* dBias = Output(2)->template mutable_data<T, Context>(ctx());
for (int n = 0; n < outer_dim; n++) { for (int n = 0; n < outer_dim; n++) {
math::Gemv<T, Context>( math::Gemv<T, Context>(
CblasNoTrans, scale_dim, inner_dim, CblasNoTrans, scale_dim, inner_dim,
1.0, dYdata, multiplier, 1.0, dYdata, multiplier,
1.0, dBias, &ctx()); 1.0, dBias, ctx());
dYdata += dim; dYdata += dim;
} }
} }
...@@ -79,45 +79,36 @@ void AffineGradientOp<Context>::ScaleRunWithType() { ...@@ -79,45 +79,36 @@ void AffineGradientOp<Context>::ScaleRunWithType() {
bool is_eltwise = (Input(-1).count() == Input(1).count()); bool is_eltwise = (Input(-1).count() == Input(1).count());
auto* dYdata = Input(-1).template data<T, Context>(); auto* dYdata = Input(-1).template data<T, Context>();
auto* Xdata = Input(0).template data<T, Context>(); auto* Xdata = Input(0).template data<T, Context>();
auto* dScale = Output(1)->template mutable_data<T, Context>(); auto* dScale = Output(1)->template mutable_data<T, Context>(ctx());
auto* dXdata = Output(0)->template mutable_data<T, Context>(); auto* dXdata = Output(0)->template mutable_data<T, Context>();
auto* dYxX = dXdata; auto* dYxX = dXdata;
math::Mul<T, Context>(Output(0)->count(), dYdata, Xdata, dYxX); math::Mul<T, Context>(Output(0)->count(), dYdata, Xdata, dYxX, ctx());
if (!is_eltwise) { if (!is_eltwise) {
T* SRes_data = nullptr; T* SRes_data = nullptr;
// reduce inner dimensions
if (inner_dim == 1) { if (inner_dim == 1) {
SRes_data = dYxX; SRes_data = dYxX;
} else if (sum_result.count() == 1) { // handle inner only
dScale = Output(1)->template mutable_data<T, CPUContext>();
T result = math::Dot<T, Context>(
inner_dim, dYxX, multiplier, &ctx());
*dScale += result;
} else { } else {
SRes_data = (outer_dim == 1) ? // handle scale only SRes_data = (outer_dim == 1) ?
dScale : sum_result.template mutable_data<T, Context>(); dScale : sum_result.template mutable_data<T, Context>();
math::Gemv<T, Context>( math::Gemv<T, Context>(
CblasNoTrans, sum_result.count(), inner_dim, CblasNoTrans, sum_result.count(), inner_dim,
1.0, dYxX, multiplier, 1.0, dYxX, multiplier,
SRes_data == dScale ? 1.0 : 0.0, SRes_data, &ctx()); SRes_data == dScale ? 1.0 : 0.0,
SRes_data, ctx());
} }
// reduce outer dimensions
if (outer_dim != 1) { if (outer_dim != 1) {
if (scale_dim == 1) { // handle outer only
dScale = Output(1)->template mutable_data<T, CPUContext>();
T result = math::Dot<T, Context>(
outer_dim, multiplier, SRes_data, &ctx());
*dScale += result;
} else {
math::Gemv<T, Context>( math::Gemv<T, Context>(
CblasTrans, outer_dim, scale_dim, CblasTrans, outer_dim, scale_dim,
1.0, SRes_data, multiplier, 1.0, SRes_data, multiplier,
1.0, dScale, &ctx()); 1.0, dScale, ctx());
}
} }
} else { } else {
math::Axpy<T, Context>(Output(1)->count(), math::Axpy<T, Context>(Output(1)->count(),
1.f, dYxX, dScale, &ctx()); 1.f, dYxX, dScale, ctx());
} }
} }
...@@ -131,7 +122,7 @@ void AffineGradientOp<Context>::RunWithType() { ...@@ -131,7 +122,7 @@ void AffineGradientOp<Context>::RunWithType() {
kernel::AffineGrad<T, Context>( kernel::AffineGrad<T, Context>(
Output(0)->count(), outer_dim, scale_dim, inner_dim, Output(0)->count(), outer_dim, scale_dim, inner_dim,
dYdata, Adata, dXdata, &ctx()); dYdata, Adata, dXdata, ctx());
} }
template <class Context> template <class Context>
......
...@@ -15,7 +15,7 @@ void ClipOp<Context>::RunWithType() { ...@@ -15,7 +15,7 @@ void ClipOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
auto* Mdata = mask->template mutable_data<T, Context>(); auto* Mdata = mask->template mutable_data<T, Context>();
kernel::Clip<T, Context>(Output(0)->count(), kernel::Clip<T, Context>(Output(0)->count(),
low, high, Xdata, Mdata, Ydata); low, high, Xdata, Mdata, Ydata, ctx());
} }
template <class Context> template <class Context>
...@@ -30,7 +30,9 @@ DEPLOY_CPU(Clip); ...@@ -30,7 +30,9 @@ DEPLOY_CPU(Clip);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(Clip); DEPLOY_CUDA(Clip);
#endif #endif
OPERATOR_SCHEMA(Clip).NumInputs(1).NumOutputs(1).Inplace({ { 0, 0 } }); OPERATOR_SCHEMA(Clip)
.NumInputs(1).NumOutputs(1)
.Inplace({ { 0, 0 } });
template <class Context> template <typename T> template <class Context> template <typename T>
void ClipGradientOp<Context>::RunWithType() { void ClipGradientOp<Context>::RunWithType() {
...@@ -39,7 +41,8 @@ void ClipGradientOp<Context>::RunWithType() { ...@@ -39,7 +41,8 @@ void ClipGradientOp<Context>::RunWithType() {
auto* dXdata = Output(0)->template mutable_data<T, Context>(); auto* dXdata = Output(0)->template mutable_data<T, Context>();
auto* Mdata = mask->template data<T, Context>(); auto* Mdata = mask->template data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), dXdata, Mdata, dXdata); math::Mul<T, Context>(Output(0)->count(),
dXdata, Mdata, dXdata, ctx());
} }
template <class Context> template <class Context>
...@@ -54,7 +57,9 @@ DEPLOY_CPU(ClipGradient); ...@@ -54,7 +57,9 @@ DEPLOY_CPU(ClipGradient);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(ClipGradient); DEPLOY_CUDA(ClipGradient);
#endif #endif
OPERATOR_SCHEMA(ClipGradient).NumInputs(2).NumOutputs(1).Inplace({ { 1, 0 } }); OPERATOR_SCHEMA(ClipGradient)
.NumInputs(2).NumOutputs(1)
.Inplace({ { 1, 0 } });
class GetClipGradient final : public GradientMakerBase { class GetClipGradient final : public GradientMakerBase {
public: public:
......
...@@ -23,7 +23,7 @@ void CuDNNAffineOp<Context>::RunWithType() { ...@@ -23,7 +23,7 @@ void CuDNNAffineOp<Context>::RunWithType() {
mul_desc, CUDNN_OP_TENSOR_MUL, mul_desc, CUDNN_OP_TENSOR_MUL,
CUDNNType<T>::type, CUDNN_PROPAGATE_NAN)); CUDNNType<T>::type, CUDNN_PROPAGATE_NAN));
CUDNN_CHECK(cudnnOpTensor( CUDNN_CHECK(cudnnOpTensor(
ctx().cudnn_handle(), mul_desc, ctx()->cudnn_handle(), mul_desc,
CUDNNType<T>::one, input_desc, Xdata, CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::one, param_desc, Adata, CUDNNType<T>::one, param_desc, Adata,
CUDNNType<T>::zero, input_desc, Ydata)); CUDNNType<T>::zero, input_desc, Ydata));
...@@ -36,7 +36,7 @@ void CuDNNAffineOp<Context>::RunWithType() { ...@@ -36,7 +36,7 @@ void CuDNNAffineOp<Context>::RunWithType() {
add_desc, CUDNN_OP_TENSOR_ADD, add_desc, CUDNN_OP_TENSOR_ADD,
CUDNNType<T>::type, CUDNN_PROPAGATE_NAN)); CUDNNType<T>::type, CUDNN_PROPAGATE_NAN));
CUDNN_CHECK(cudnnOpTensor( CUDNN_CHECK(cudnnOpTensor(
ctx().cudnn_handle(), add_desc, ctx()->cudnn_handle(), add_desc,
CUDNNType<T>::one, input_desc, Ydata, CUDNNType<T>::one, input_desc, Ydata,
CUDNNType<T>::one, param_desc, Bdata, CUDNNType<T>::one, param_desc, Bdata,
CUDNNType<T>::zero, input_desc, Ydata)); CUDNNType<T>::zero, input_desc, Ydata));
...@@ -48,7 +48,9 @@ void CuDNNAffineOp<Context>::RunOnDevice() { ...@@ -48,7 +48,9 @@ void CuDNNAffineOp<Context>::RunOnDevice() {
Output(0)->ReshapeLike(Input(0)); Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) RunWithType<float>(); if (XIsType(Input(0), float)) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (XIsType(Input(0), float16)) RunWithType<float16>(); else if (XIsType(Input(0), float16)) RunWithType<float16>();
#endif
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" }); else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
} }
...@@ -76,17 +78,17 @@ void CuDNNAffineGradientOp<Context>::RunWithType() { ...@@ -76,17 +78,17 @@ void CuDNNAffineGradientOp<Context>::RunWithType() {
if (Output(1)->name() != "ignore") { if (Output(1)->name() != "ignore") {
Output(1)->ReshapeLike(Input(1)); Output(1)->ReshapeLike(Input(1));
auto* Xdata = Input(0).template data<T, Context>(); auto* Xdata = Input(0).template data<T, Context>();
auto* dAdata = Output(1)->template mutable_data<T, Context>(); auto* dAdata = Output(1)->template mutable_data<T, Context>(ctx());
// eltwise // eltwise
if (Input(0).count() == Input(1).count()) { if (Input(0).count() == Input(1).count()) {
CUDNN_CHECK(cudnnOpTensor( CUDNN_CHECK(cudnnOpTensor(
ctx().cudnn_handle(), mul_desc, ctx()->cudnn_handle(), mul_desc,
CUDNNType<T>::one, input_desc, Xdata, CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::one, input_desc, dYdata, CUDNNType<T>::one, input_desc, dYdata,
CUDNNType<T>::one, param_desc, dAdata)); CUDNNType<T>::one, param_desc, dAdata));
} else { } else {
CUDNN_CHECK(cudnnOpTensor( CUDNN_CHECK(cudnnOpTensor(
ctx().cudnn_handle(), mul_desc, ctx()->cudnn_handle(), mul_desc,
CUDNNType<T>::one, input_desc, Xdata, CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::one, input_desc, dYdata, CUDNNType<T>::one, input_desc, dYdata,
CUDNNType<T>::zero, input_desc, dXdata)); CUDNNType<T>::zero, input_desc, dXdata));
...@@ -97,11 +99,11 @@ void CuDNNAffineGradientOp<Context>::RunWithType() { ...@@ -97,11 +99,11 @@ void CuDNNAffineGradientOp<Context>::RunWithType() {
// db = dy // db = dy
if (Output(2)->name() != "ignore") { if (Output(2)->name() != "ignore") {
Output(2)->ReshapeLike(Input(1)); Output(2)->ReshapeLike(Input(1));
auto* dBdata = Output(2)->template mutable_data<T, Context>(); auto* dBdata = Output(2)->template mutable_data<T, Context>(ctx());
// eltwise // eltwise
if (Input(-1).count() == Input(1).count()) { if (Input(-1).count() == Input(1).count()) {
math::Axpy<T, Context>(Output(2)->count(), math::Axpy<T, Context>(Output(2)->count(),
1.f, dYdata, dBdata, &ctx()); 1.f, dYdata, dBdata, ctx());
} else { } else {
ComputeBiasGradient_v2<T>(dYdata, dBdata); ComputeBiasGradient_v2<T>(dYdata, dBdata);
} }
...@@ -109,7 +111,7 @@ void CuDNNAffineGradientOp<Context>::RunWithType() { ...@@ -109,7 +111,7 @@ void CuDNNAffineGradientOp<Context>::RunWithType() {
// dx = alpha * dy // dx = alpha * dy
CUDNN_CHECK(cudnnOpTensor( CUDNN_CHECK(cudnnOpTensor(
ctx().cudnn_handle(), mul_desc, ctx()->cudnn_handle(), mul_desc,
CUDNNType<T>::one, input_desc, dYdata, CUDNNType<T>::one, input_desc, dYdata,
CUDNNType<T>::one, param_desc, Adata, CUDNNType<T>::one, param_desc, Adata,
CUDNNType<T>::zero, input_desc, dXdata)); CUDNNType<T>::zero, input_desc, dXdata));
...@@ -126,11 +128,11 @@ void CuDNNAffineGradientOp<Context>::ComputeScaleGradient( ...@@ -126,11 +128,11 @@ void CuDNNAffineGradientOp<Context>::ComputeScaleGradient(
CUDNN_REDUCE_TENSOR_NO_INDICES, CUDNN_32BIT_INDICES)); CUDNN_REDUCE_TENSOR_NO_INDICES, CUDNN_32BIT_INDICES));
size_t workspace_size = 0; size_t workspace_size = 0;
CUDNN_CHECK(cudnnGetReductionWorkspaceSize( CUDNN_CHECK(cudnnGetReductionWorkspaceSize(
ctx().cudnn_handle(), reduce_desc, ctx()->cudnn_handle(), reduce_desc,
input_desc, param_desc, &workspace_size)); input_desc, param_desc, &workspace_size));
auto* WSdata = ws()->template caches<Context>({ workspace_size })[0];; auto* WSdata = ws()->template caches<Context>({ workspace_size })[0];;
CUDNN_CHECK(cudnnReduceTensor( CUDNN_CHECK(cudnnReduceTensor(
ctx().cudnn_handle(), reduce_desc, ctx()->cudnn_handle(), reduce_desc,
nullptr, 0, WSdata, workspace_size, nullptr, 0, WSdata, workspace_size,
CUDNNType<T>::one, input_desc, dYxX, CUDNNType<T>::one, input_desc, dYxX,
CUDNNType<T>::one, param_desc, dA)); CUDNNType<T>::one, param_desc, dA));
...@@ -145,32 +147,23 @@ void CuDNNAffineGradientOp<Context>::ComputeScaleGradient_v2( ...@@ -145,32 +147,23 @@ void CuDNNAffineGradientOp<Context>::ComputeScaleGradient_v2(
sum_result.Reshape({ outer_dim * scale_dim }); sum_result.Reshape({ outer_dim * scale_dim });
T* SRes_data = nullptr; T* SRes_data = nullptr;
if (inner_dim == 1) SRes_data = dYxX; // reduce inner dimensions
else if (sum_result.count() == 1) { if (inner_dim == 1) {
auto* dAC = Output(1)->template mutable_data<T, CPUContext>(); SRes_data = dYxX;
T result = math::Dot<T, Context>(
inner_dim, dYxX, multiplier, &ctx());
*dAC += result;
} else { } else {
SRes_data = (outer_dim == 1) ? SRes_data = (outer_dim == 1) ?
dA : sum_result.template mutable_data<T, Context>(); dA : sum_result.template mutable_data<T, Context>();
math::Gemv<T, Context>( math::Gemv<T, Context>(
CblasNoTrans, sum_result.count(), inner_dim, CblasNoTrans, sum_result.count(), inner_dim,
1.0, dYxX, multiplier, 1.0, dYxX, multiplier,
SRes_data == dA ? 1.0 : 0.0, SRes_data, &ctx()); SRes_data == dA ? 1.0 : 0.0, SRes_data, ctx());
} }
// reduce outer dimensions
if (outer_dim != 1) { if (outer_dim != 1) {
if (scale_dim == 1) {
auto* dAC = Output(1)->template mutable_data<T, CPUContext>();
T result = math::Dot<T, Context>(
outer_dim, multiplier, SRes_data, &ctx());
*dAC += result;
} else {
math::Gemv<T, Context>( math::Gemv<T, Context>(
CblasTrans, outer_dim, scale_dim, CblasTrans, outer_dim, scale_dim,
1.0, SRes_data, multiplier, 1.0, SRes_data, multiplier,
1.0, dA, &ctx()); 1.0, dA, ctx());
}
} }
} }
...@@ -185,11 +178,11 @@ void CuDNNAffineGradientOp<Context>::ComputeBiasGradient( ...@@ -185,11 +178,11 @@ void CuDNNAffineGradientOp<Context>::ComputeBiasGradient(
CUDNN_REDUCE_TENSOR_NO_INDICES, CUDNN_32BIT_INDICES)); CUDNN_REDUCE_TENSOR_NO_INDICES, CUDNN_32BIT_INDICES));
size_t workspace_size = 0; size_t workspace_size = 0;
CUDNN_CHECK(cudnnGetReductionWorkspaceSize( CUDNN_CHECK(cudnnGetReductionWorkspaceSize(
ctx().cudnn_handle(), reduce_desc, ctx()->cudnn_handle(), reduce_desc,
input_desc, param_desc, &workspace_size)); input_desc, param_desc, &workspace_size));
auto* WSdata = ws()->template caches<Context>({ workspace_size })[0]; auto* WSdata = ws()->template caches<Context>({ workspace_size })[0];
CUDNN_CHECK(cudnnReduceTensor( CUDNN_CHECK(cudnnReduceTensor(
ctx().cudnn_handle(), reduce_desc, ctx()->cudnn_handle(), reduce_desc,
nullptr, 0, WSdata, workspace_size, nullptr, 0, WSdata, workspace_size,
CUDNNType<T>::one, input_desc, dY, CUDNNType<T>::one, input_desc, dY,
CUDNNType<T>::one, param_desc, dB)); CUDNNType<T>::one, param_desc, dB));
...@@ -205,7 +198,7 @@ void CuDNNAffineGradientOp<Context>::ComputeBiasGradient_v2( ...@@ -205,7 +198,7 @@ void CuDNNAffineGradientOp<Context>::ComputeBiasGradient_v2(
math::Gemv<T, Context>( math::Gemv<T, Context>(
CblasNoTrans, scale_dim, inner_dim, CblasNoTrans, scale_dim, inner_dim,
1.0, dY, multiplier, 1.0, dY, multiplier,
1.0, dB, &ctx()); 1.0, dB, ctx());
dY += dim; dY += dim;
} }
} }
......
...@@ -9,7 +9,7 @@ void DivOp<Context>::EltwiseRunWithType() { ...@@ -9,7 +9,7 @@ void DivOp<Context>::EltwiseRunWithType() {
auto* x1 = Input(0).template data<T, Context>(); auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>(); auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>(); auto* y = Output(0)->template mutable_data<T, Context>();
math::Div<T, Context>(Output(0)->count(), x1, x2, y); math::Div<T, Context>(Output(0)->count(), x1, x2, y, ctx());
} }
template <class Context> template <typename T> template <class Context> template <typename T>
...@@ -18,34 +18,40 @@ void DivOp<Context>::BroadcastRunWithType(int type) { ...@@ -18,34 +18,40 @@ void DivOp<Context>::BroadcastRunWithType(int type) {
auto* x1 = Input(0).template data<T, Context>(); auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>(); auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>(); auto* y = Output(0)->template mutable_data<T, Context>();
auto* c = ws()->template caches<T, Context>({
Output(0)->count() })[0];
if (type == 0 || type == 1) {
if (type == 0) { if (type == 0) {
outer_dim = Input(0).count(); x2 = Input(1).template data<T, CPUContext>();
inner_dim = 1; float inverse_x2 = 1.f / dragon_cast<float, T>(x2[0]);
} else { ctx()->template Copy<T, Context, Context>(
Output(0)->count(), y, x1);
math::MulScalar<T, Context>(
Output(0)->count(), inverse_x2, y, ctx());
} else if (type == 1) {
outer_dim = Input(0).count(0, Input(0).axis(-1)); outer_dim = Input(0).count(0, Input(0).axis(-1));
inner_dim = Input(0).dim(-1); inner_dim = Input(0).dim(-1);
}
DECLARE_MULTIPLIER(multiplier, outer_dim); DECLARE_MULTIPLIER(multiplier, outer_dim);
auto* c = ws()->template caches<T, Context>(
{ Output(0)->count() })[0];
math::Gemm<T, Context>( math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1, outer_dim, inner_dim, 1,
1.0, multiplier, x2, 1.0, multiplier, x2,
0.0, c, &ctx()); 0.0, c, ctx());
math::Div<T, Context>(Output(0)->count(), x1, c, y); math::Div<T, Context>(
Output(0)->count(), x1, c, y, ctx());
} else if (type == 2) { } else if (type == 2) {
outer_dim = Input(0).dim(0); outer_dim = Input(0).dim(0);
inner_dim = Input(0).count(1); inner_dim = Input(0).count(1);
DECLARE_MULTIPLIER(multiplier, inner_dim); DECLARE_MULTIPLIER(multiplier, inner_dim);
auto* c = ws()->template caches<T, Context>(
{ Output(0)->count() })[0];
math::Gemm<T, Context>( math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1, outer_dim, inner_dim, 1,
1.0, x2, multiplier, 1.0, x2, multiplier,
0.0, c, &ctx()); 0.0, c, ctx());
math::Div<T, Context>(Output(0)->count(), x1, c, y); math::Div<T, Context>(
Output(0)->count(), x1, c, y, ctx());
} }
} }
...@@ -82,16 +88,16 @@ void DivGradientOp<Context>::EltwiseRunWithType() { ...@@ -82,16 +88,16 @@ void DivGradientOp<Context>::EltwiseRunWithType() {
auto* x2 = Input(1).template data<T, Context>(); auto* x2 = Input(1).template data<T, Context>();
auto* dx2 = Output(1)->template mutable_data<T, Context>(); auto* dx2 = Output(1)->template mutable_data<T, Context>();
auto* c = ws()->template caches<T, Context>({ X1->count() })[0]; auto* c = ws()->template caches<T, Context>({ X1->count() })[0];
math::Mul<T,Context>(X1->count(), dy, x1, c); // dY * X1 math::Mul<T,Context>(X1->count(), dy, x1, c, ctx()); // dY * X1
math::Square<T, Context>(X2->count(), x2, dx2); // X2^{2} math::Square<T, Context>(X2->count(), x2, dx2, ctx()); // X2^{2}
math::Inv<T, Context>(X2->count(), -1, dx2, dx2); // -1 / X2^{2} math::Inv<T, Context>(X2->count(), -1, dx2, dx2, ctx()); // -1 / X2^{2}
math::Mul<T, Context>(X2->count(), c, dx2, dx2); math::Mul<T, Context>(X2->count(), c, dx2, dx2, ctx());
} }
if (Output(0)->name() != "ignore") { if (Output(0)->name() != "ignore") {
auto* x2 = Input(1).template data<T, Context>(); auto* x2 = Input(1).template data<T, Context>();
auto* dx1 = Output(0)->template mutable_data<T, Context>(); auto* dx1 = Output(0)->template mutable_data<T, Context>();
math::Div<T, Context>(X1->count(), dy, x2, dx1); math::Div<T, Context>(X1->count(), dy, x2, dx1, ctx());
} }
} }
...@@ -118,23 +124,23 @@ void DivGradientOp<Context>::BroadcastRunWithType(int type) { ...@@ -118,23 +124,23 @@ void DivGradientOp<Context>::BroadcastRunWithType(int type) {
auto* dx2 = Output(1)->template mutable_data<T, Context>(); auto* dx2 = Output(1)->template mutable_data<T, Context>();
auto cs = ws()->template caches<T, Context>( auto cs = ws()->template caches<T, Context>(
{ X1->count(), X2->count() }); { X1->count(), X2->count() });
math::Mul<T, Context>(X1->count(), dy, x1, cs[0]); // dY * X1 math::Mul<T, Context>(X1->count(), dy, x1, cs[0], ctx()); // dY * X1
math::Square<T, Context>(X2->count(), x2, dx2); // X2^{2} math::Square<T, Context>(X2->count(), x2, dx2, ctx()); // X2^{2}
math::Inv<T, Context>(X2->count(), -1.0, dx2, dx2); // -1 / X2^{2} math::Inv<T, Context>(X2->count(), -1, dx2, dx2, ctx()); // -1 / X2^{2}
if (type == 0 || type == 1) { if (type == 0 || type == 1) {
DECLARE_MULTIPLIER(multiplier, outer_dim); DECLARE_MULTIPLIER(multiplier, outer_dim);
math::Gemv<T, Context>( math::Gemv<T, Context>(
CblasTrans, outer_dim, inner_dim, CblasTrans, outer_dim, inner_dim,
1.0, cs[0], multiplier, 1.0, cs[0], multiplier,
0.0, cs[1], &ctx()); 0.0, cs[1], ctx());
} else if (type == 2) { } else if (type == 2) {
DECLARE_MULTIPLIER(multiplier, inner_dim); DECLARE_MULTIPLIER(multiplier, inner_dim);
math::Gemv<T, Context>( math::Gemv<T, Context>(
CblasNoTrans, outer_dim, inner_dim, CblasNoTrans, outer_dim, inner_dim,
1.0, cs[0], multiplier, 1.0, cs[0], multiplier,
0.0, cs[1], &ctx()); 0.0, cs[1], ctx());
} }
math::Mul<T, Context>(X2->count(), cs[1], dx2, dx2); math::Mul<T, Context>(X2->count(), cs[1], dx2, dx2, ctx());
} }
if (Output(0)->name() != "ignore") { if (Output(0)->name() != "ignore") {
...@@ -146,16 +152,16 @@ void DivGradientOp<Context>::BroadcastRunWithType(int type) { ...@@ -146,16 +152,16 @@ void DivGradientOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1, outer_dim, inner_dim, 1,
1.0, multiplier, x2, 1.0, multiplier, x2,
0.0, dx1, &ctx()); 0.0, dx1, ctx());
} else if (type == 2) { } else if (type == 2) {
DECLARE_MULTIPLIER(multiplier, inner_dim); DECLARE_MULTIPLIER(multiplier, inner_dim);
math::Gemm<T, Context>( math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1, outer_dim, inner_dim, 1,
1.0, x2, multiplier, 1.0, x2, multiplier,
0.0, dx1, &ctx()); 0.0, dx1, ctx());
} }
math::Div<T, Context>(X1->count(), dy, dx1, dx1); math::Div<T, Context>(X1->count(), dy, dx1, dx1, ctx());
} }
} }
......
...@@ -7,9 +7,13 @@ template <class Context> template <typename T> ...@@ -7,9 +7,13 @@ template <class Context> template <typename T>
void DotOp<Context>::DotRunWithType() { void DotOp<Context>::DotRunWithType() {
auto* X1data = Input(0).template data<T, Context>(); auto* X1data = Input(0).template data<T, Context>();
auto* X2data = Input(1).template data<T, Context>(); auto* X2data = Input(1).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, CPUContext>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
Ydata[0] = math::Dot<T, Context>(
Input(0).count(), X1data, X2data, &ctx()); T result_host;
math::Dot<T, Context>(Input(0).count(),
X1data, X2data, &result_host, ctx());
ctx()->template Copy<T, Context, CPUContext>(
1, Ydata, &result_host);
} }
template <class Context> template <typename T> template <class Context> template <typename T>
...@@ -22,7 +26,7 @@ void DotOp<Context>::GemmRunWithType() { ...@@ -22,7 +26,7 @@ void DotOp<Context>::GemmRunWithType() {
TransB ? CblasTrans : CblasNoTrans, TransB ? CblasTrans : CblasNoTrans,
M, N1, K1, M, N1, K1,
1.0, X1data, X2data, 1.0, X1data, X2data,
0.0, Ydata, &ctx()); 0.0, Ydata, ctx());
} }
template <class Context> template <typename T> template <class Context> template <typename T>
...@@ -33,7 +37,7 @@ void DotOp<Context>::GemvRunWithType() { ...@@ -33,7 +37,7 @@ void DotOp<Context>::GemvRunWithType() {
math::Gemv<T, Context>( math::Gemv<T, Context>(
TransA ? CblasTrans : CblasNoTrans, M, N1, TransA ? CblasTrans : CblasNoTrans, M, N1,
1.0, X1data, X2data, 1.0, X1data, X2data,
0.0, Ydata, &ctx()); 0.0, Ydata, ctx());
} }
template <class Context> template <class Context>
...@@ -98,12 +102,14 @@ void DotGradientOp<Context>::DotRunWithType() { ...@@ -98,12 +102,14 @@ void DotGradientOp<Context>::DotRunWithType() {
auto* dYdata = Input(2).template data<T, CPUContext>(); auto* dYdata = Input(2).template data<T, CPUContext>();
auto* dX1data = Output(0)->template mutable_data<T, Context>(); auto* dX1data = Output(0)->template mutable_data<T, Context>();
auto* dX2data = Output(1)->template mutable_data<T, Context>(); auto* dX2data = Output(1)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>( ctx()->template Copy<T, Context, Context>(
Output(0)->count(), dX1data, X2data); Output(0)->count(), dX1data, X2data);
ctx().template Copy<T, Context, Context>( ctx()->template Copy<T, Context, Context>(
Output(1)->count(), dX2data, X1data); Output(1)->count(), dX2data, X1data);
math::MulScalar<T, Context>(Output(0)->count(), dYdata[0], dX1data); math::MulScalar<T, Context>(
math::MulScalar<T, Context>(Output(1)->count(), dYdata[0], dX2data); Output(0)->count(), dYdata[0], dX1data, ctx());
math::MulScalar<T, Context>(
Output(1)->count(), dYdata[0], dX2data, ctx());
} }
template <class Context> template <typename T> template <class Context> template <typename T>
...@@ -118,13 +124,13 @@ void DotGradientOp<Context>::GemmRunWithType() { ...@@ -118,13 +124,13 @@ void DotGradientOp<Context>::GemmRunWithType() {
TransB ? CblasNoTrans : CblasTrans, TransB ? CblasNoTrans : CblasTrans,
M, K1, N1, M, K1, N1,
1.0, dYdata, X2data, 1.0, dYdata, X2data,
0.0, dX1data, &ctx()); 0.0, dX1data, ctx());
math::Gemm<T, Context>( math::Gemm<T, Context>(
TransA ? CblasNoTrans : CblasTrans, TransA ? CblasNoTrans : CblasTrans,
CblasNoTrans, CblasNoTrans,
K1, N1, M, K1, N1, M,
1.0, X1data, dYdata, 1.0, X1data, dYdata,
0.0, dX2data, &ctx()); 0.0, dX2data, ctx());
} }
template <class Context> template <typename T> template <class Context> template <typename T>
...@@ -138,11 +144,11 @@ void DotGradientOp<Context>::GemvRunWithType() { ...@@ -138,11 +144,11 @@ void DotGradientOp<Context>::GemvRunWithType() {
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
M, N1, 1, M, N1, 1,
1.0, dYdata, X2data, 1.0, dYdata, X2data,
0.0, dX1data, &ctx()); 0.0, dX1data, ctx());
math::Gemv<T, Context>( math::Gemv<T, Context>(
TransA ? CblasNoTrans : CblasTrans, M, N1, TransA ? CblasNoTrans : CblasTrans, M, N1,
1.0, X1data, dYdata, 1.0, X1data, dYdata,
0.0, dX2data, &ctx()); 0.0, dX2data, ctx());
} }
template <class Context> template <class Context>
......
...@@ -7,10 +7,11 @@ template <class Context> template <typename T> ...@@ -7,10 +7,11 @@ template <class Context> template <typename T>
void EltwiseOp<Context>::SumRunWithType() { void EltwiseOp<Context>::SumRunWithType() {
TIndex count = Output(0)->count(); TIndex count = Output(0)->count();
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Set<T, Context>(count, dragon_cast<T, float>(0), Ydata); math::Set<T, Context>(count,
dragon_cast<T, float>(0), Ydata, ctx());
for (int i = 0; i < InputSize(); ++i) { for (int i = 0; i < InputSize(); ++i) {
math::Axpy<T, Context>(count, coeffs[i], math::Axpy<T, Context>(count, coeffs[i],
Input(i).template data<T, Context>(), Ydata, &ctx()); Input(i).template data<T, Context>(), Ydata, ctx());
} }
} }
...@@ -21,19 +22,24 @@ void EltwiseOp<Context>::ProdRunWithType() { ...@@ -21,19 +22,24 @@ void EltwiseOp<Context>::ProdRunWithType() {
math::Mul<T, Context>(count, math::Mul<T, Context>(count,
Input(0).template data<T, Context>(), Input(0).template data<T, Context>(),
Input(1).template data<T, Context>(), Input(1).template data<T, Context>(),
Ydata); Ydata, ctx());
for (int i = 2; i < InputSize(); i++) { for (int i = 2; i < InputSize(); i++) {
math::Mul<T, Context>(count, math::Mul<T, Context>(count,
Ydata, Ydata,
Input(i).template data<T, Context>(), Input(i).template data<T, Context>(),
Ydata); Ydata, ctx());
} }
} }
template <class Context> template <class Context>
void EltwiseOp<Context>::RunOnDevice() { void EltwiseOp<Context>::RunOnDevice() {
for (int i = 1; i < InputSize(); i++) for (int i = 1; i < InputSize(); i++) {
CHECK(Input(i).dims() == Input(0).dims()); CHECK(Input(i).dims() == Input(0).dims())
<< "\nExcepted Input(" << i << ")'s dims as "
<< Input(0).DimString() << ",\n but got "
<< Input(1).DimString() << ".";
}
Output(0)->ReshapeLike(Input(0)); Output(0)->ReshapeLike(Input(0));
if (operation == "SUM") { if (operation == "SUM") {
...@@ -65,12 +71,12 @@ void EltwiseGradientOp<Context>::SumRunWithType() { ...@@ -65,12 +71,12 @@ void EltwiseGradientOp<Context>::SumRunWithType() {
for (int i = 0; i < OutputSize(); i++) { for (int i = 0; i < OutputSize(); i++) {
if (Output(i)->name() == "ignore") continue; if (Output(i)->name() == "ignore") continue;
auto* dXdata = Output(i)->template mutable_data<T, Context>(); auto* dXdata = Output(i)->template mutable_data<T, Context>();
if (coeffs[i] == float(1)) { if (coeffs[i] == 1.f) {
ctx().template Copy<T, Context, Context>( ctx()->template Copy<T, Context, Context>(
count, dXdata, dYdata); count, dXdata, dYdata);
} else { } else {
math::Scale<T, Context>(count, math::Scale<T, Context>(count,
coeffs[i], dYdata, dXdata, &ctx()); coeffs[i], dYdata, dXdata, ctx());
} }
} }
} }
...@@ -88,11 +94,11 @@ void EltwiseGradientOp<Context>::ProdRunWithType() { ...@@ -88,11 +94,11 @@ void EltwiseGradientOp<Context>::ProdRunWithType() {
if (i == j) continue; if (i == j) continue;
auto* Xdata = Input(j).template data<T, Context>(); auto* Xdata = Input(j).template data<T, Context>();
if (!initialized) { if (!initialized) {
ctx().template Copy<T, Context, Context>(count, dXdata, Xdata); ctx()->template Copy<T, Context, Context>(count, dXdata, Xdata);
initialized = true; initialized = true;
} else math::Mul<T, Context>(count, Xdata, dXdata, dXdata); } else math::Mul<T, Context>(count, Xdata, dXdata, dXdata, ctx());
} }
math::Mul<T, Context>(count, dYdata, dXdata, dXdata); math::Mul<T, Context>(count, dYdata, dXdata, dXdata, ctx());
} }
} }
......
...@@ -8,7 +8,7 @@ template <class Context> template <typename T> ...@@ -8,7 +8,7 @@ template <class Context> template <typename T>
void ExpOp<Context>::RunWithType() { void ExpOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>(); auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Exp<T, Context>(Output(0)->count(), Xdata, Ydata); math::Exp<T, Context>(Output(0)->count(), Xdata, Ydata, ctx());
} }
template <class Context> template <class Context>
...@@ -30,7 +30,8 @@ void ExpGradientOp<Context>::RunWithType() { ...@@ -30,7 +30,8 @@ void ExpGradientOp<Context>::RunWithType() {
auto* Ydata = Input(0).template data<T, Context >(); auto* Ydata = Input(0).template data<T, Context >();
auto* dYdata = Input(-1).template data<T, Context>(); auto* dYdata = Input(-1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>(); auto* dXdata = Output(0)->template mutable_data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), dYdata, Ydata, dXdata); math::Mul<T, Context>(Output(0)->count(),
dYdata, Ydata, dXdata, ctx());
} }
template <class Context> template <class Context>
......
...@@ -12,7 +12,7 @@ void GramMatrixOp<Context>::RunWithType() { ...@@ -12,7 +12,7 @@ void GramMatrixOp<Context>::RunWithType() {
CblasNoTrans, CblasTrans, CblasNoTrans, CblasTrans,
dim, dim, inner_dim, dim, dim, inner_dim,
1.0, Xdata, Xdata, 1.0, Xdata, Xdata,
0.0, Ydata, &ctx()); 0.0, Ydata, ctx());
Xdata += x_offset; Xdata += x_offset;
Ydata += y_offset; Ydata += y_offset;
} }
...@@ -47,7 +47,7 @@ void GramMatrixGradientOp<Context>::RunWithType() { ...@@ -47,7 +47,7 @@ void GramMatrixGradientOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
dim, inner_dim, dim, dim, inner_dim, dim,
2.0, dYdata, Xdata, 2.0, dYdata, Xdata,
0.0, dXdata, &ctx()); 0.0, dXdata, ctx());
dYdata += y_offset; dYdata += y_offset;
dXdata += x_offset; dXdata += x_offset;
} }
......
...@@ -23,7 +23,7 @@ void InnerProductOp<Context>::TransRunWithType() { ...@@ -23,7 +23,7 @@ void InnerProductOp<Context>::TransRunWithType() {
CblasNoTrans, CblasTrans, CblasNoTrans, CblasTrans,
M, num_output, K, M, num_output, K,
1.0, Xdata, Wdata, 1.0, Xdata, Wdata,
0.0, Ydata, &ctx()); 0.0, Ydata, ctx());
if (InputSize() > 2) { if (InputSize() > 2) {
DECLARE_MULTIPLIER(multiplier, M); DECLARE_MULTIPLIER(multiplier, M);
...@@ -32,7 +32,7 @@ void InnerProductOp<Context>::TransRunWithType() { ...@@ -32,7 +32,7 @@ void InnerProductOp<Context>::TransRunWithType() {
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
M, num_output, 1, M, num_output, 1,
1.0, multiplier, Bdata, 1.0, multiplier, Bdata,
1.0, Ydata, &ctx()); 1.0, Ydata, ctx());
} }
} }
...@@ -55,7 +55,7 @@ void InnerProductOp<Context>::NoTransRunWithType() { ...@@ -55,7 +55,7 @@ void InnerProductOp<Context>::NoTransRunWithType() {
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
M, num_output, K, M, num_output, K,
1.0, Xdata, Wdata, 1.0, Xdata, Wdata,
0.0, Ydata, &ctx()); 0.0, Ydata, ctx());
if (InputSize() > 2) { if (InputSize() > 2) {
DECLARE_MULTIPLIER(multiplier, M); DECLARE_MULTIPLIER(multiplier, M);
...@@ -64,7 +64,7 @@ void InnerProductOp<Context>::NoTransRunWithType() { ...@@ -64,7 +64,7 @@ void InnerProductOp<Context>::NoTransRunWithType() {
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
M, num_output, 1, M, num_output, 1,
1.0, multiplier, Bdata, 1.0, multiplier, Bdata,
1.0, Ydata, &ctx()); 1.0, Ydata, ctx());
} }
} }
...@@ -102,30 +102,30 @@ void InnerProductGradientOp<Context>::RunWithType() { ...@@ -102,30 +102,30 @@ void InnerProductGradientOp<Context>::RunWithType() {
if (Output(1)->name() != "ignore") { if (Output(1)->name() != "ignore") {
Output(1)->ReshapeLike(Input(1)); Output(1)->ReshapeLike(Input(1));
auto* dWdata = Output(1)->template mutable_data<T, Context>(); auto* dWdata = Output(1)->template mutable_data<T, Context>(ctx());
if (TransW) { if (TransW) {
math::Gemm<T, Context>( math::Gemm<T, Context>(
CblasTrans, CblasNoTrans, CblasTrans, CblasNoTrans,
num_output, K, M, num_output, K, M,
1.0, dYdata, Xdata, 1.0, dYdata, Xdata,
1.0, dWdata, &ctx()); 1.0, dWdata, ctx());
} else { } else {
math::Gemm<T, Context>( math::Gemm<T, Context>(
CblasTrans, CblasNoTrans, CblasTrans, CblasNoTrans,
K, num_output, M, K, num_output, M,
1.0, Xdata, dYdata, 1.0, Xdata, dYdata,
1.0, dWdata, &ctx()); 1.0, dWdata, ctx());
} }
} }
if (Output(2)->name() != "ignore") { if (Output(2)->name() != "ignore") {
DECLARE_MULTIPLIER(multiplier, M); DECLARE_MULTIPLIER(multiplier, M);
Output(2)->Reshape({ num_output }); Output(2)->Reshape({ num_output });
auto* dBdata = Output(2)->template mutable_data<T, Context>(); auto* dBdata = Output(2)->template mutable_data<T, Context>(ctx());
math::Gemv<T, Context>( math::Gemv<T, Context>(
CblasTrans, M, num_output, CblasTrans, M, num_output,
1.0, dYdata, multiplier, 1.0, dYdata, multiplier,
1.0, dBdata, &ctx()); 1.0, dBdata, ctx());
} }
if (Output(0)->name() != "ignore") { if (Output(0)->name() != "ignore") {
...@@ -136,13 +136,13 @@ void InnerProductGradientOp<Context>::RunWithType() { ...@@ -136,13 +136,13 @@ void InnerProductGradientOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
M, K, num_output, M, K, num_output,
1.0, dYdata, Wdata, 1.0, dYdata, Wdata,
0.0, dXdata, &ctx()); 0.0, dXdata, ctx());
} else { } else {
math::Gemm<T, Context>( math::Gemm<T, Context>(
CblasNoTrans, CblasTrans, CblasNoTrans, CblasTrans,
M, K, num_output, M, K, num_output,
1.0, dYdata, Wdata, 1.0, dYdata, Wdata,
0.0, dXdata, &ctx()); 0.0, dXdata, ctx());
} }
} }
} }
......
...@@ -7,7 +7,7 @@ template <class Context> template <typename T> ...@@ -7,7 +7,7 @@ template <class Context> template <typename T>
void LogOp<Context>::RunWithType() { void LogOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>(); auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Log<T, Context>(Output(0)->count(), Xdata, Ydata); math::Log<T, Context>(Output(0)->count(), Xdata, Ydata, ctx());
} }
template <class Context> template <class Context>
...@@ -29,7 +29,7 @@ void LogGradientOp<Context>::RunWithType() { ...@@ -29,7 +29,7 @@ void LogGradientOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>(); auto* Xdata = Input(0).template data<T, Context>();
auto* dYdata = Input(-1).template data<T, Context>(); auto* dYdata = Input(-1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>(); auto* dXdata = Output(0)->template mutable_data<T, Context>();
math::Div<T, Context>(Output(0)->count(), dYdata, Xdata, dXdata); math::Div<T, Context>(Output(0)->count(), dYdata, Xdata, dXdata, ctx());
} }
template <class Context> template <class Context>
......
...@@ -16,7 +16,7 @@ void MatmulOp<Context>::RunWithType() { ...@@ -16,7 +16,7 @@ void MatmulOp<Context>::RunWithType() {
TransB ? CblasTrans : CblasNoTrans, TransB ? CblasTrans : CblasNoTrans,
M, N, K1, M, N, K1,
1.0, X1data, X2data, 1.0, X1data, X2data,
0.0, Ydata, &ctx()); 0.0, Ydata, ctx());
X1data += x1_offset; X1data += x1_offset;
X2data += x2_offset; X2data += x2_offset;
Ydata += y_offset; Ydata += y_offset;
...@@ -76,13 +76,13 @@ void MatmulGradientOp<Context>::RunWithType() { ...@@ -76,13 +76,13 @@ void MatmulGradientOp<Context>::RunWithType() {
TransB ? CblasNoTrans : CblasTrans, TransB ? CblasNoTrans : CblasTrans,
M, K1, N, M, K1, N,
1.0, dYdata, X2data, 1.0, dYdata, X2data,
0.0, dX1data, &ctx()); 0.0, dX1data, ctx());
math::Gemm<T, Context>( math::Gemm<T, Context>(
TransA ? CblasNoTrans : CblasTrans, TransA ? CblasNoTrans : CblasTrans,
CblasNoTrans, CblasNoTrans,
K1, N, M, K1, N, M,
1.0, X1data, dYdata, 1.0, X1data, dYdata,
0.0, dX2data, &ctx()); 0.0, dX2data, ctx());
X1data += x1_offset; X1data += x1_offset;
X2data += x2_offset; X2data += x2_offset;
dX1data += x1_offset; dX1data += x1_offset;
......
...@@ -9,7 +9,7 @@ void MulOp<Context>::EltwiseRunWithType() { ...@@ -9,7 +9,7 @@ void MulOp<Context>::EltwiseRunWithType() {
auto* x1 = Input(0).template data<T, Context>(); auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>(); auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>(); auto* y = Output(0)->template mutable_data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), x1, x2, y); math::Mul<T, Context>(Output(0)->count(), x1, x2, y, ctx());
} }
template <class Context> template <typename T> template <class Context> template <typename T>
...@@ -18,34 +18,39 @@ void MulOp<Context>::BroadcastRunWithType(int type) { ...@@ -18,34 +18,39 @@ void MulOp<Context>::BroadcastRunWithType(int type) {
auto* x1 = Input(0).template data<T, Context>(); auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>(); auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>(); auto* y = Output(0)->template mutable_data<T, Context>();
auto* c = ws()->template caches<T, Context>({
Output(0)->count() })[0];
if (type == 0 || type == 1) {
if (type == 0) { if (type == 0) {
outer_dim = Input(0).count(); x2 = Input(1).template data<T, CPUContext>();
inner_dim = 1; ctx()->template Copy<T, Context, Context>(
} else { Output(0)->count(), y, x1);
math::MulScalar<T, Context>(Output(0)->count(),
dragon_cast<float, T>(x2[0]), y, ctx());
} else if (type == 1) {
outer_dim = Input(0).count(0, Input(0).axis(-1)); outer_dim = Input(0).count(0, Input(0).axis(-1));
inner_dim = Input(0).dim(-1); inner_dim = Input(0).dim(-1);
}
DECLARE_MULTIPLIER(multiplier, outer_dim); DECLARE_MULTIPLIER(multiplier, outer_dim);
auto* c = ws()->template caches<T, Context>(
{ Output(0)->count() })[0];
math::Gemm<T, Context>( math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1, outer_dim, inner_dim, 1,
1.0, multiplier, x2, 1.0, multiplier, x2,
0.0, c, &ctx()); 0.0, c, ctx());
math::Mul<T, Context>(Output(0)->count(), x1, c, y); math::Mul<T, Context>(
Output(0)->count(), x1, c, y, ctx());
} else if (type == 2) { } else if (type == 2) {
outer_dim = Input(0).dim(0); outer_dim = Input(0).dim(0);
inner_dim = Input(0).count(1); inner_dim = Input(0).count(1);
DECLARE_MULTIPLIER(multiplier, inner_dim); DECLARE_MULTIPLIER(multiplier, inner_dim);
auto* c = ws()->template caches<T, Context>(
{ Output(0)->count() })[0];
math::Gemm<T, Context>( math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1, outer_dim, inner_dim, 1,
1.0, x2, multiplier, 1.0, x2, multiplier,
0.0, c, &ctx()); 0.0, c, ctx());
math::Mul<T, Context>(Output(0)->count(), x1, c, y); math::Mul<T, Context>(
Output(0)->count(), x1, c, y, ctx());
} }
} }
...@@ -79,13 +84,13 @@ void MulGradientOp<Context>::EltwiseRunWithType() { ...@@ -79,13 +84,13 @@ void MulGradientOp<Context>::EltwiseRunWithType() {
if (Output(1)->name() != "ignore") { if (Output(1)->name() != "ignore") {
auto* x1 = Input(0).template data<T, Context>(); auto* x1 = Input(0).template data<T, Context>();
auto* dx2 = Output(1)->template mutable_data<T, Context>(); auto* dx2 = Output(1)->template mutable_data<T, Context>();
math::Mul<T, Context>(Output(1)->count(), dy, x1, dx2); math::Mul<T, Context>(Output(1)->count(), dy, x1, dx2, ctx());
} }
if (Output(0)->name() != "ignore") { if (Output(0)->name() != "ignore") {
auto* x2 = Input(1).template data<T, Context>(); auto* x2 = Input(1).template data<T, Context>();
auto* dx1 = Output(0)->template mutable_data<T, Context>(); auto* dx1 = Output(0)->template mutable_data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), dy, x2, dx1); math::Mul<T, Context>(Output(0)->count(), dy, x2, dx1, ctx());
} }
} }
...@@ -110,19 +115,19 @@ void MulGradientOp<Context>::BroadcastRunWithType(int type) { ...@@ -110,19 +115,19 @@ void MulGradientOp<Context>::BroadcastRunWithType(int type) {
auto* x1 = Input(0).template data<T, Context>(); auto* x1 = Input(0).template data<T, Context>();
auto* dx2 = Output(1)->template mutable_data<T, Context>(); auto* dx2 = Output(1)->template mutable_data<T, Context>();
auto* c = ws()->template caches<T, Context>({ X1->count() })[0]; auto* c = ws()->template caches<T, Context>({ X1->count() })[0];
math::Mul<T, Context>(X1->count(), dy, x1, c); math::Mul<T, Context>(X1->count(), dy, x1, c, ctx());
if (type == 0 || type == 1) { if (type == 0 || type == 1) {
DECLARE_MULTIPLIER(multiplier, outer_dim); DECLARE_MULTIPLIER(multiplier, outer_dim);
math::Gemv<T, Context>( math::Gemv<T, Context>(
CblasTrans, outer_dim, inner_dim, CblasTrans, outer_dim, inner_dim,
1.0, c, multiplier, 1.0, c, multiplier,
0.0, dx2, &ctx()); 0.0, dx2, ctx());
} else if (type == 2) { } else if (type == 2) {
DECLARE_MULTIPLIER(multiplier, inner_dim); DECLARE_MULTIPLIER(multiplier, inner_dim);
math::Gemv<T, Context>( math::Gemv<T, Context>(
CblasNoTrans, outer_dim, inner_dim, CblasNoTrans, outer_dim, inner_dim,
1.0, c, multiplier, 1.0, c, multiplier,
0.0, dx2, &ctx()); 0.0, dx2, ctx());
} }
} }
...@@ -135,16 +140,16 @@ void MulGradientOp<Context>::BroadcastRunWithType(int type) { ...@@ -135,16 +140,16 @@ void MulGradientOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1, outer_dim, inner_dim, 1,
1.0, multiplier, x2, 1.0, multiplier, x2,
0.0, dx1, &ctx()); 0.0, dx1, ctx());
} else if (type == 2) { } else if (type == 2) {
DECLARE_MULTIPLIER(multiplier, inner_dim); DECLARE_MULTIPLIER(multiplier, inner_dim);
math::Gemm<T, Context>( math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1, outer_dim, inner_dim, 1,
1.0, x2, multiplier, 1.0, x2, multiplier,
0.0, dx1, &ctx()); 0.0, dx1, ctx());
} }
math::Mul<T, Context>(X1->count(), dy, dx1, dx1); math::Mul<T, Context>(X1->count(), dy, dx1, dx1, ctx());
} }
} }
......
...@@ -9,16 +9,17 @@ void PowOp<Context>::RunWithType() { ...@@ -9,16 +9,17 @@ void PowOp<Context>::RunWithType() {
TIndex count = Input(0).count(); TIndex count = Input(0).count();
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
if (power_scale == float(0)) { if (power_scale == 0.f) {
float value = (power == float(0)) ? float(1) : pow(shift, power); float value = (power == 0.f) ? 1.f : pow(shift, power);
math::Set<T, Context>(count, dragon_cast<T, float>(value), Ydata); math::Set<T, Context>(count,
dragon_cast<T, float>(value), Ydata, ctx());
return; return;
} }
auto* Xdata = Input(0).template data<T, Context>(); auto* Xdata = Input(0).template data<T, Context>();
ctx().template Copy<T, Context, Context>(count, Ydata, Xdata); ctx()->template Copy<T, Context, Context>(count, Ydata, Xdata);
if (scale != float(1)) math::Scal<T, Context>(count, scale, Ydata, &ctx()); if (scale != 1.f) math::Scal<T, Context>(count, scale, Ydata, ctx());
if (shift != float(0)) math::AddScalar<T, Context>(count, shift, Ydata); if (shift != 0.f) math::AddScalar<T, Context>(count, shift, Ydata, ctx());
if (power != float(1)) math::Pow<T, Context>(count, power, Ydata, Ydata); if (power != 1.f) math::Pow<T, Context>(count, power, Ydata, Ydata, ctx());
} }
template <class Context> template <class Context>
...@@ -42,35 +43,36 @@ void PowGradientOp<Context>::RunWithType() { ...@@ -42,35 +43,36 @@ void PowGradientOp<Context>::RunWithType() {
auto* dYdata = Input(-1).template data<T, Context>(); auto* dYdata = Input(-1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>(); auto* dXdata = Output(0)->template mutable_data<T, Context>();
if (power_scale == float(0) || power == float(1)) { if (power_scale == 0.f || power == 1.f) {
const T value = dragon_cast<T, float>(power_scale); const T value = dragon_cast<T, float>(power_scale);
math::Set<T, Context>(count, value, dXdata); math::Set<T, Context>(count, value, dXdata, ctx());
} else { } else {
auto* Xdata = Input(0).template data<T, Context>(); auto* Xdata = Input(0).template data<T, Context>();
if (power == float(2)) { if (power == 2.f) {
math::Axpby<T, Context>(count, math::Axpby<T, Context>(count,
power_scale * scale, Xdata, power_scale * scale, Xdata,
0, dXdata, &ctx()); 0, dXdata, ctx());
if (shift != float(0)) if (shift != 0.f)
math::AddScalar<T, Context>(count, power_scale * shift, dXdata); math::AddScalar<T, Context>(count,
} else if (shift == float(0)) { power_scale * shift, dXdata, ctx());
} else if (shift == 0.f) {
auto* Ydata = Input(1).template data<T, Context>(); auto* Ydata = Input(1).template data<T, Context>();
math::Div<T, Context>(count, Ydata, Xdata, dXdata); math::Div<T, Context>(count, Ydata, Xdata, dXdata, ctx());
math::Scal<T, Context>(count, power, dXdata, &ctx()); math::Scal<T, Context>(count, power, dXdata, ctx());
} else { } else {
auto* Ydata = Input(1).template data<T, Context>(); auto* Ydata = Input(1).template data<T, Context>();
ctx().template Copy<T, Context, Context>(count, dXdata, Xdata); ctx()->template Copy<T, Context, Context>(count, dXdata, Xdata);
if (scale != float(1)) if (scale != 1.f)
math::Scal<T, Context>(count, scale, dXdata, &ctx()); math::Scal<T, Context>(count, scale, dXdata, ctx());
if (shift != float(0)) if (shift != 0.f)
math::AddScalar<T, Context>(count, shift, dXdata); math::AddScalar<T, Context>(count, shift, dXdata, ctx());
math::Div<T, Context>(count, Ydata, dXdata, dXdata); math::Div<T, Context>(count, Ydata, dXdata, dXdata, ctx());
if (power_scale != float(1)) if (power_scale != 1.f)
math::Scal<T, Context>(count, power_scale, dXdata, &ctx()); math::Scal<T, Context>(count, power_scale, dXdata, ctx());
} }
} }
if (power_scale != float(0)) if (power_scale != 0.f)
math::Mul<T, Context>(count, dYdata, dXdata, dXdata); math::Mul<T, Context>(count, dYdata, dXdata, dXdata, ctx());
} }
template <class Context> template <class Context>
......
...@@ -9,7 +9,7 @@ void RAddOp<Context>::EltwiseRunWithType() { ...@@ -9,7 +9,7 @@ void RAddOp<Context>::EltwiseRunWithType() {
auto* x1 = Input(0).template data<T, Context>(); auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>(); auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>(); auto* y = Output(0)->template mutable_data<T, Context>();
math::Add<T, Context>(Output(0)->count(), x1, x2, y); math::Add<T, Context>(Output(0)->count(), x1, x2, y, ctx());
} }
template <class Context> template <typename T> template <class Context> template <typename T>
...@@ -19,23 +19,24 @@ void RAddOp<Context>::BroadcastRunWithType(int type) { ...@@ -19,23 +19,24 @@ void RAddOp<Context>::BroadcastRunWithType(int type) {
auto* x2 = Input(1).template data<T, Context>(); auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>(); auto* y = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>( ctx()->template Copy<T, Context, Context>(
Output(0)->count(), y, x2); Output(0)->count(), y, x2);
if (type == 0 || type == 1) { if (type == 0 || type == 1) {
if (type == 0) { if (type == 0) {
outer_dim = Input(1).count(); x1 = Input(0).template data<T, CPUContext>();
inner_dim = 1; math::AddScalar<T, Context>(Output(0)->count(),
dragon_cast<float, T>(x1[0]), y, ctx());
} else { } else {
outer_dim = Input(1).count(0, Input(1).axis(-1)); outer_dim = Input(1).count(0, Input(1).axis(-1));
inner_dim = Input(1).dim(-1); inner_dim = Input(1).dim(-1);
}
DECLARE_MULTIPLIER(multiplier, outer_dim); DECLARE_MULTIPLIER(multiplier, outer_dim);
math::Gemm<T, Context>( math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1, outer_dim, inner_dim, 1,
1.0, multiplier, x1, 1.0, multiplier, x1,
1.0, y, &ctx()); 1.0, y, ctx());
}
} else if (type == 2) { } else if (type == 2) {
outer_dim = Input(1).dim(0); outer_dim = Input(1).dim(0);
inner_dim = Input(1).count(1); inner_dim = Input(1).count(1);
...@@ -44,7 +45,7 @@ void RAddOp<Context>::BroadcastRunWithType(int type) { ...@@ -44,7 +45,7 @@ void RAddOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1, outer_dim, inner_dim, 1,
1.0, x1, multiplier, 1.0, x1, multiplier,
1.0, y, &ctx()); 1.0, y, ctx());
} }
} }
...@@ -77,13 +78,13 @@ void RAddGradientOp<Context>::EltwiseRunWithType() { ...@@ -77,13 +78,13 @@ void RAddGradientOp<Context>::EltwiseRunWithType() {
if (Output(1)->name() != "ignore") { if (Output(1)->name() != "ignore") {
auto* dx2 = Output(1)->template mutable_data<T, Context>(); auto* dx2 = Output(1)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>( ctx()->template Copy<T, Context, Context>(
Output(1)->count(), dx2, dy); Output(1)->count(), dx2, dy);
} }
if (Output(0)->name() != "ignore") { if (Output(0)->name() != "ignore") {
auto* dx1 = Output(0)->template mutable_data<T, Context>(); auto* dx1 = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>( ctx()->template Copy<T, Context, Context>(
Output(0)->count(), dx1, dy); Output(0)->count(), dx1, dy);
} }
} }
...@@ -108,7 +109,7 @@ void RAddGradientOp<Context>::BroadcastRunWithType(int type) { ...@@ -108,7 +109,7 @@ void RAddGradientOp<Context>::BroadcastRunWithType(int type) {
math::Gemv<T, Context>( math::Gemv<T, Context>(
CblasTrans, outer_dim, inner_dim, CblasTrans, outer_dim, inner_dim,
1.0, dy, multiplier, 1.0, dy, multiplier,
0.0, dx1, &ctx()); 0.0, dx1, ctx());
} else if (type == 2) { } else if (type == 2) {
outer_dim = X2->dim(0); outer_dim = X2->dim(0);
inner_dim = X2->count(1); inner_dim = X2->count(1);
...@@ -116,13 +117,13 @@ void RAddGradientOp<Context>::BroadcastRunWithType(int type) { ...@@ -116,13 +117,13 @@ void RAddGradientOp<Context>::BroadcastRunWithType(int type) {
math::Gemv<T, Context>( math::Gemv<T, Context>(
CblasNoTrans, outer_dim, inner_dim, CblasNoTrans, outer_dim, inner_dim,
1.0, dy, multiplier, 1.0, dy, multiplier,
0.0, dx1, &ctx()); 0.0, dx1, ctx());
} }
} }
if (Output(1)->name() != "ignore") { if (Output(1)->name() != "ignore") {
auto* dx2 = Output(1)->template mutable_data<T, Context>(); auto* dx2 = Output(1)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>( ctx()->template Copy<T, Context, Context>(
X2->count(), dx2, dy); X2->count(), dx2, dy);
} }
} }
......
...@@ -9,7 +9,7 @@ void RDivOp<Context>::EltwiseRunWithType() { ...@@ -9,7 +9,7 @@ void RDivOp<Context>::EltwiseRunWithType() {
auto* x1 = Input(0).template data<T, Context>(); auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>(); auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>(); auto* y = Output(0)->template mutable_data<T, Context>();
math::Div<T, Context>(Output(0)->count(), x1, x2, y); math::Div<T, Context>(Output(0)->count(), x1, x2, y, ctx());
} }
template <class Context> template <typename T> template <class Context> template <typename T>
...@@ -34,8 +34,8 @@ void RDivOp<Context>::BroadcastRunWithType(int type) { ...@@ -34,8 +34,8 @@ void RDivOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1, outer_dim, inner_dim, 1,
1.0, multiplier, x1, 1.0, multiplier, x1,
0.0, c, &ctx()); 0.0, c, ctx());
math::Div<T, Context>(Output(0)->count(), c, x2, y); math::Div<T, Context>(Output(0)->count(), c, x2, y, ctx());
} else if (type == 2) { } else if (type == 2) {
outer_dim = Input(1).dim(0); outer_dim = Input(1).dim(0);
inner_dim = Input(1).count(1); inner_dim = Input(1).count(1);
...@@ -44,8 +44,8 @@ void RDivOp<Context>::BroadcastRunWithType(int type) { ...@@ -44,8 +44,8 @@ void RDivOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1, outer_dim, inner_dim, 1,
1.0, x1, multiplier, 1.0, x1, multiplier,
0.0, c, &ctx()); 0.0, c, ctx());
math::Div<T, Context>(Output(0)->count(), c, x2, y); math::Div<T, Context>(Output(0)->count(), c, x2, y, ctx());
} }
} }
...@@ -82,16 +82,16 @@ void RDivGradientOp<Context>::EltwiseRunWithType() { ...@@ -82,16 +82,16 @@ void RDivGradientOp<Context>::EltwiseRunWithType() {
auto* x2 = Input(1).template data<T, Context>(); auto* x2 = Input(1).template data<T, Context>();
auto* dx2 = Output(1)->template mutable_data<T, Context>(); auto* dx2 = Output(1)->template mutable_data<T, Context>();
auto* c = ws()->template caches<T, Context>({ X1->count() })[0]; auto* c = ws()->template caches<T, Context>({ X1->count() })[0];
math::Mul<T, Context>(X1->count(), dy, x1, c); // dY * X1 math::Mul<T, Context>(X1->count(), dy, x1, c, ctx()); // dY * X1
math::Square<T, Context>(X2->count(), x2, dx2); // X2^{2} math::Square<T, Context>(X2->count(), x2, dx2, ctx()); // X2^{2}
math::Inv<T, Context>(X2->count(), -1, dx2, dx2); // -1 / X2^{2} math::Inv<T, Context>(X2->count(), -1, dx2, dx2, ctx()); // -1 / X2^{2}
math::Mul<T, Context>(X2->count(), c, dx2, dx2); math::Mul<T, Context>(X2->count(), c, dx2, dx2, ctx());
} }
if (Output(0)->name() != "ignore") { if (Output(0)->name() != "ignore") {
auto* x2 = Input(1).template data<T, Context>(); auto* x2 = Input(1).template data<T, Context>();
auto* dx1 = Output(0)->template mutable_data<T, Context>(); auto* dx1 = Output(0)->template mutable_data<T, Context>();
math::Div<T, Context>(X1->count(), dy, x2, dx1); math::Div<T, Context>(X1->count(), dy, x2, dx1, ctx());
} }
} }
...@@ -116,19 +116,19 @@ void RDivGradientOp<Context>::BroadcastRunWithType(int type) { ...@@ -116,19 +116,19 @@ void RDivGradientOp<Context>::BroadcastRunWithType(int type) {
auto* x2 = Input(1).template data<T, Context>(); auto* x2 = Input(1).template data<T, Context>();
auto* dx1 = Output(0)->template mutable_data<T, Context>(); auto* dx1 = Output(0)->template mutable_data<T, Context>();
auto* c = ws()->template caches<T, Context>({ X2->count() })[0]; auto* c = ws()->template caches<T, Context>({ X2->count() })[0];
math::Div<T, Context>(X2->count(), dy, x2, c); math::Div<T, Context>(X2->count(), dy, x2, c, ctx());
if (type == 0 || type == 1) { if (type == 0 || type == 1) {
DECLARE_MULTIPLIER(multiplier, outer_dim); DECLARE_MULTIPLIER(multiplier, outer_dim);
math::Gemv<T, Context>( math::Gemv<T, Context>(
CblasTrans, outer_dim, inner_dim, CblasTrans, outer_dim, inner_dim,
1.0, c, multiplier, 1.0, c, multiplier,
0.0, dx1, &ctx()); 0.0, dx1, ctx());
} else if (type == 2) { } else if (type == 2) {
DECLARE_MULTIPLIER(multiplier, inner_dim); DECLARE_MULTIPLIER(multiplier, inner_dim);
math::Gemv<T, Context>( math::Gemv<T, Context>(
CblasNoTrans, outer_dim, inner_dim, CblasNoTrans, outer_dim, inner_dim,
1.0, c, multiplier, 1.0, c, multiplier,
0.0, dx1, &ctx()); 0.0, dx1, ctx());
} }
} }
...@@ -142,18 +142,18 @@ void RDivGradientOp<Context>::BroadcastRunWithType(int type) { ...@@ -142,18 +142,18 @@ void RDivGradientOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1, outer_dim, inner_dim, 1,
-1.0, multiplier, x1, -1.0, multiplier, x1,
0.0, dx2, &ctx()); 0.0, dx2, ctx());
} else if (type == 2) { } else if (type == 2) {
DECLARE_MULTIPLIER(multiplier, inner_dim); DECLARE_MULTIPLIER(multiplier, inner_dim);
math::Gemm<T, Context>( math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1, outer_dim, inner_dim, 1,
-1.0, x1, multiplier, -1.0, x1, multiplier,
0.0, dx2, &ctx()); 0.0, dx2, ctx());
} }
math::Mul<T, Context>(X2->count(), dy, dx2, dx2); math::Mul<T, Context>(X2->count(), dy, dx2, dx2, ctx());
math::Div<T, Context>(X2->count(), dx2, x2, dx2); math::Div<T, Context>(X2->count(), dx2, x2, dx2, ctx());
math::Div<T, Context>(X2->count(), dx2, x2, dx2); math::Div<T, Context>(X2->count(), dx2, x2, dx2, ctx());
} }
} }
......
...@@ -9,7 +9,7 @@ void RMulOp<Context>::EltwiseRunWithType() { ...@@ -9,7 +9,7 @@ void RMulOp<Context>::EltwiseRunWithType() {
auto* x1 = Input(0).template data<T, Context>(); auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>(); auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>(); auto* y = Output(0)->template mutable_data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), x1, x2, y); math::Mul<T, Context>(Output(0)->count(), x1, x2, y, ctx());
} }
template <class Context> template <typename T> template <class Context> template <typename T>
...@@ -18,34 +18,39 @@ void RMulOp<Context>::BroadcastRunWithType(int type) { ...@@ -18,34 +18,39 @@ void RMulOp<Context>::BroadcastRunWithType(int type) {
auto* x1 = Input(0).template data<T, Context>(); auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>(); auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>(); auto* y = Output(0)->template mutable_data<T, Context>();
auto* c = ws()->template caches<T, Context>({
Output(0)->count() })[0];
if (type == 0 || type == 1) {
if (type == 0) { if (type == 0) {
outer_dim = Input(1).count(); x1 = Input(0).template data<T, CPUContext>();
inner_dim = 1; ctx()->template Copy<T, Context, Context>(
} else { Output(0)->count(), y, x2);
math::MulScalar<T, Context>(Output(0)->count(),
dragon_cast<float, T>(x1[0]), y, ctx());
} else if (type == 1) {
outer_dim = Input(1).count(0, Input(1).axis(-1)); outer_dim = Input(1).count(0, Input(1).axis(-1));
inner_dim = Input(1).dim(-1); inner_dim = Input(1).dim(-1);
}
DECLARE_MULTIPLIER(multiplier, outer_dim); DECLARE_MULTIPLIER(multiplier, outer_dim);
auto* c = ws()->template caches<T, Context>(
{ Output(0)->count() })[0];
math::Gemm<T, Context>( math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1, outer_dim, inner_dim, 1,
1.0, multiplier, x1, 1.0, multiplier, x1,
0.0, c, &ctx()); 0.0, c, ctx());
math::Mul<T, Context>(Output(0)->count(), c, x2, y); math::Mul<T, Context>(
Output(0)->count(), c, x2, y, ctx());
} else if (type == 2) { } else if (type == 2) {
outer_dim = Input(1).dim(0); outer_dim = Input(1).dim(0);
inner_dim = Input(1).count(1); inner_dim = Input(1).count(1);
DECLARE_MULTIPLIER(multiplier, inner_dim); DECLARE_MULTIPLIER(multiplier, inner_dim);
auto* c = ws()->template caches<T, Context>(
{ Output(0)->count() })[0];
math::Gemm<T, Context>( math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1, outer_dim, inner_dim, 1,
1.0, x1, multiplier, 1.0, x1, multiplier,
0.0, c, &ctx()); 0.0, c, ctx());
math::Mul<T, Context>(Output(0)->count(), c, x2, y); math::Mul<T, Context>(
Output(0)->count(), c, x2, y, ctx());
} }
} }
...@@ -79,13 +84,13 @@ void RMulGradientOp<Context>::EltwiseRunWithType() { ...@@ -79,13 +84,13 @@ void RMulGradientOp<Context>::EltwiseRunWithType() {
if (Output(1)->name() != "ignore") { if (Output(1)->name() != "ignore") {
auto* x1 = Input(0).template data<T, Context>(); auto* x1 = Input(0).template data<T, Context>();
auto* dx2 = Output(1)->template mutable_data<T, Context>(); auto* dx2 = Output(1)->template mutable_data<T, Context>();
math::Mul<T, Context>(Output(1)->count(), dy, x1, dx2); math::Mul<T, Context>(Output(1)->count(), dy, x1, dx2, ctx());
} }
if (Output(0)->name() != "ignore") { if (Output(0)->name() != "ignore") {
auto* x2 = Input(1).template data<T, Context>(); auto* x2 = Input(1).template data<T, Context>();
auto* dx1 = Output(0)->template mutable_data<T, Context>(); auto* dx1 = Output(0)->template mutable_data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), dy, x2, dx1); math::Mul<T, Context>(Output(0)->count(), dy, x2, dx1, ctx());
} }
} }
...@@ -110,19 +115,19 @@ void RMulGradientOp<Context>::BroadcastRunWithType(int type) { ...@@ -110,19 +115,19 @@ void RMulGradientOp<Context>::BroadcastRunWithType(int type) {
auto* x2 = Input(1).template data<T, Context>(); auto* x2 = Input(1).template data<T, Context>();
auto* dx1 = Output(0)->template mutable_data<T, Context>(); auto* dx1 = Output(0)->template mutable_data<T, Context>();
auto* c = ws()->template caches<T, Context>({ X2->count() })[0]; auto* c = ws()->template caches<T, Context>({ X2->count() })[0];
math::Mul<T, Context>(X2->count(), dy, x2, c); math::Mul<T, Context>(X2->count(), dy, x2, c, ctx());
if (type == 0 || type == 1) { if (type == 0 || type == 1) {
DECLARE_MULTIPLIER(multiplier, outer_dim); DECLARE_MULTIPLIER(multiplier, outer_dim);
math::Gemv<T, Context>( math::Gemv<T, Context>(
CblasTrans, outer_dim, inner_dim, CblasTrans, outer_dim, inner_dim,
1.0, c, multiplier, 1.0, c, multiplier,
0.0, dx1, &ctx()); 0.0, dx1, ctx());
} else if (type == 2) { } else if (type == 2) {
DECLARE_MULTIPLIER(multiplier, inner_dim); DECLARE_MULTIPLIER(multiplier, inner_dim);
math::Gemv<T, Context>( math::Gemv<T, Context>(
CblasNoTrans, outer_dim, inner_dim, CblasNoTrans, outer_dim, inner_dim,
1.0, c, multiplier, 1.0, c, multiplier,
0.0, dx1, &ctx()); 0.0, dx1, ctx());
} }
} }
...@@ -135,16 +140,16 @@ void RMulGradientOp<Context>::BroadcastRunWithType(int type) { ...@@ -135,16 +140,16 @@ void RMulGradientOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1, outer_dim, inner_dim, 1,
1.0, multiplier, x1, 1.0, multiplier, x1,
0.0, dx2, &ctx()); 0.0, dx2, ctx());
} else if (type == 2) { } else if (type == 2) {
DECLARE_MULTIPLIER(multiplier, inner_dim); DECLARE_MULTIPLIER(multiplier, inner_dim);
math::Gemm<T, Context>( math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1, outer_dim, inner_dim, 1,
1.0, x1, multiplier, 1.0, x1, multiplier,
0.0, dx2, &ctx()); 0.0, dx2, ctx());
} }
math::Mul<T, Context>(X2->count(), dy, dx2, dx2); math::Mul<T, Context>(X2->count(), dy, dx2, dx2, ctx());
} }
} }
......
...@@ -9,7 +9,7 @@ void RSubOp<Context>::EltwiseRunWithType() { ...@@ -9,7 +9,7 @@ void RSubOp<Context>::EltwiseRunWithType() {
auto* x1 = Input(0).template data<T, Context>(); auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>(); auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>(); auto* y = Output(0)->template mutable_data<T, Context>();
math::Sub<T, Context>(Output(0)->count(), x1, x2, y); math::Sub<T, Context>(Output(0)->count(), x1, x2, y, ctx());
} }
template <class Context> template <typename T> template <class Context> template <typename T>
...@@ -19,7 +19,7 @@ void RSubOp<Context>::BroadcastRunWithType(int type) { ...@@ -19,7 +19,7 @@ void RSubOp<Context>::BroadcastRunWithType(int type) {
auto* x2 = Input(1).template data<T, Context>(); auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>(); auto* y = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>( ctx()->template Copy<T, Context, Context>(
Output(0)->count(), y, x2); Output(0)->count(), y, x2);
if (type == 0 || type == 1) { if (type == 0 || type == 1) {
...@@ -35,7 +35,7 @@ void RSubOp<Context>::BroadcastRunWithType(int type) { ...@@ -35,7 +35,7 @@ void RSubOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1, outer_dim, inner_dim, 1,
1.0, multiplier, x1, 1.0, multiplier, x1,
-1.0, y, &ctx()); -1.0, y, ctx());
} else if (type == 2) { } else if (type == 2) {
outer_dim = Input(1).dim(0); outer_dim = Input(1).dim(0);
inner_dim = Input(1).count(1); inner_dim = Input(1).count(1);
...@@ -44,7 +44,7 @@ void RSubOp<Context>::BroadcastRunWithType(int type) { ...@@ -44,7 +44,7 @@ void RSubOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1, outer_dim, inner_dim, 1,
1.0, x1, multiplier, 1.0, x1, multiplier,
-1.0, y, &ctx()); -1.0, y, ctx());
} }
} }
...@@ -78,12 +78,12 @@ void RSubGradientOp<Context>::EltwiseRunWithType() { ...@@ -78,12 +78,12 @@ void RSubGradientOp<Context>::EltwiseRunWithType() {
if (Output(1)->name() != "ignore") { if (Output(1)->name() != "ignore") {
auto* dx2 = Output(1)->template mutable_data<T, Context>(); auto* dx2 = Output(1)->template mutable_data<T, Context>();
math::Scale<T, Context>( math::Scale<T, Context>(
Output(1)->count(), -1, dy, dx2, &ctx()); Output(1)->count(), -1, dy, dx2, ctx());
} }
if (Output(0)->name() != "ignore") { if (Output(0)->name() != "ignore") {
auto* dx1 = Output(0)->template mutable_data<T, Context>(); auto* dx1 = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>( ctx()->template Copy<T, Context, Context>(
Output(0)->count(), dx1, dy); Output(0)->count(), dx1, dy);
} }
} }
...@@ -108,7 +108,7 @@ void RSubGradientOp<Context>::BroadcastRunWithType(int type) { ...@@ -108,7 +108,7 @@ void RSubGradientOp<Context>::BroadcastRunWithType(int type) {
math::Gemv<T, Context>( math::Gemv<T, Context>(
CblasTrans, outer_dim, inner_dim, CblasTrans, outer_dim, inner_dim,
1.0, dy, multiplier, 1.0, dy, multiplier,
0.0, dx1, &ctx()); 0.0, dx1, ctx());
} else if (type == 2) { } else if (type == 2) {
outer_dim = X2->dim(0); outer_dim = X2->dim(0);
inner_dim = X2->count(1); inner_dim = X2->count(1);
...@@ -116,14 +116,14 @@ void RSubGradientOp<Context>::BroadcastRunWithType(int type) { ...@@ -116,14 +116,14 @@ void RSubGradientOp<Context>::BroadcastRunWithType(int type) {
math::Gemv<T, Context>( math::Gemv<T, Context>(
CblasNoTrans, outer_dim, inner_dim, CblasNoTrans, outer_dim, inner_dim,
1.0, dy, multiplier, 1.0, dy, multiplier,
0.0, dx1, &ctx()); 0.0, dx1, ctx());
} }
} }
if (Output(1)->name() != "ignore") { if (Output(1)->name() != "ignore") {
auto* dx2 = Output(1)->template mutable_data<T, Context>(); auto* dx2 = Output(1)->template mutable_data<T, Context>();
math::Scale<T, Context>( math::Scale<T, Context>(
X2->count(), -1, dy, dx2, &ctx()); X2->count(), -1, dy, dx2, ctx());
} }
} }
......
...@@ -7,7 +7,7 @@ template <class Context> template <typename T> ...@@ -7,7 +7,7 @@ template <class Context> template <typename T>
void SquareOp<Context>::RunWithType() { void SquareOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>(); auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Pow<T, Context>(Output(0)->count(), 2.0, Xdata, Ydata); math::Pow<T, Context>(Output(0)->count(), 2.0, Xdata, Ydata, ctx());
} }
template <class Context> template <class Context>
...@@ -29,8 +29,8 @@ void SquareGradientOp<Context>::RunWithType() { ...@@ -29,8 +29,8 @@ void SquareGradientOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>(); auto* Xdata = Input(0).template data<T, Context>();
auto* dYdata = Input(-1).template data<T, Context>(); auto* dYdata = Input(-1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>(); auto* dXdata = Output(0)->template mutable_data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), dYdata, Xdata, dXdata); math::Mul<T, Context>(Output(0)->count(), dYdata, Xdata, dXdata, ctx());
math::Scal<T, Context>(Output(0)->count(), 2.0, dXdata, &ctx()); math::Scal<T, Context>(Output(0)->count(), 2.0, dXdata, ctx());
} }
template <class Context> template <class Context>
......
...@@ -9,7 +9,8 @@ void SubOp<Context>::EltwiseRunWithType() { ...@@ -9,7 +9,8 @@ void SubOp<Context>::EltwiseRunWithType() {
auto* X1data = Input(0).template data<T, Context>(); auto* X1data = Input(0).template data<T, Context>();
auto* X2data = Input(1).template data<T, Context>(); auto* X2data = Input(1).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Sub<T, Context>(Output(0)->count(), X1data, X2data, Ydata); math::Sub<T, Context>(Output(0)->count(),
X1data, X2data, Ydata, ctx());
} }
template <class Context> template <typename T> template <class Context> template <typename T>
...@@ -19,23 +20,24 @@ void SubOp<Context>::BroadcastRunWithType(int type) { ...@@ -19,23 +20,24 @@ void SubOp<Context>::BroadcastRunWithType(int type) {
auto* x2 = Input(1).template data<T, Context>(); auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>(); auto* y = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>( ctx()->template Copy<T, Context, Context>(
Output(0)->count(), y, x1); Output(0)->count(), y, x1);
if (type == 0 || type == 1) { if (type == 0 || type == 1) {
if (type == 0) { if (type == 0) {
outer_dim = Input(0).count(); x2 = Input(1).template data<T, CPUContext>();
inner_dim = 1; math::AddScalar<T, Context>(Output(0)->count(),
-dragon_cast<float, T>(x2[0]), y, ctx());
} else { } else {
outer_dim = Input(0).count(0, Input(0).axis(-1)); outer_dim = Input(0).count(0, Input(0).axis(-1));
inner_dim = Input(0).dim(-1); inner_dim = Input(0).dim(-1);
}
DECLARE_MULTIPLIER(multiplier, outer_dim); DECLARE_MULTIPLIER(multiplier, outer_dim);
math::Gemm<T, Context>( math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1, outer_dim, inner_dim, 1,
-1.0, multiplier, x2, -1.0, multiplier, x2,
1.0, y, &ctx()); 1.0, y, ctx());
}
} }
else if (type == 2) { else if (type == 2) {
outer_dim = Input(0).dim(0); outer_dim = Input(0).dim(0);
...@@ -45,7 +47,7 @@ void SubOp<Context>::BroadcastRunWithType(int type) { ...@@ -45,7 +47,7 @@ void SubOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1, outer_dim, inner_dim, 1,
-1.0, x2, multiplier, -1.0, x2, multiplier,
1.0, y, &ctx()); 1.0, y, ctx());
} }
} }
...@@ -79,12 +81,12 @@ void SubGradientOp<Context>::EltwiseRunWithType() { ...@@ -79,12 +81,12 @@ void SubGradientOp<Context>::EltwiseRunWithType() {
if (Output(1)->name() != "ignore") { if (Output(1)->name() != "ignore") {
auto* dx2 = Output(1)->template mutable_data<T, Context>(); auto* dx2 = Output(1)->template mutable_data<T, Context>();
math::Scale<T, Context>(Output(1)->count(), math::Scale<T, Context>(Output(1)->count(),
-1.0, dy, dx2, &ctx()); -1.0, dy, dx2, ctx());
} }
if (Output(0)->name() != "ignore") { if (Output(0)->name() != "ignore") {
auto* dx1 = Output(0)->template mutable_data<T, Context>(); auto* dx1 = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>( ctx()->template Copy<T, Context, Context>(
Output(0)->count(), dx1, dy); Output(0)->count(), dx1, dy);
} }
} }
...@@ -109,7 +111,7 @@ void SubGradientOp<Context>::BroadcastRunWithType(int type) { ...@@ -109,7 +111,7 @@ void SubGradientOp<Context>::BroadcastRunWithType(int type) {
math::Gemv<T, Context>( math::Gemv<T, Context>(
CblasTrans, outer_dim, inner_dim, CblasTrans, outer_dim, inner_dim,
-1.0, dy, multiplier, -1.0, dy, multiplier,
0.0, dx2, &ctx()); 0.0, dx2, ctx());
} else if (type == 2) { } else if (type == 2) {
outer_dim = X1->dim(0); outer_dim = X1->dim(0);
inner_dim = X1->count(1); inner_dim = X1->count(1);
...@@ -117,13 +119,13 @@ void SubGradientOp<Context>::BroadcastRunWithType(int type) { ...@@ -117,13 +119,13 @@ void SubGradientOp<Context>::BroadcastRunWithType(int type) {
math::Gemv<T, Context>( math::Gemv<T, Context>(
CblasNoTrans, outer_dim, inner_dim, CblasNoTrans, outer_dim, inner_dim,
-1.0, dy, multiplier, -1.0, dy, multiplier,
0.0, dx2, &ctx()); 0.0, dx2, ctx());
} }
} }
if (Output(0)->name() != "ignore") { if (Output(0)->name() != "ignore") {
auto* dx1 = Output(0)->template mutable_data<T, Context>(); auto* dx1 = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>( ctx()->template Copy<T, Context, Context>(
X1->count(), dx1, dy); X1->count(), dx1, dy);
} }
} }
......
...@@ -8,7 +8,8 @@ void CompareOp<Context>::EqualRunWithType() { ...@@ -8,7 +8,8 @@ void CompareOp<Context>::EqualRunWithType() {
auto* X1data = Input(0).template data<T, Context>(); auto* X1data = Input(0).template data<T, Context>();
auto* X2data = Input(1).template data<T, Context>(); auto* X2data = Input(1).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::Equal<T, Context>(Output(0)->count(), X1data, X2data, Ydata); kernel::Equal<T, Context>(Output(0)->count(),
X1data, X2data, Ydata, ctx());
} }
template <class Context> template <class Context>
......
...@@ -7,7 +7,7 @@ void CopyOp<Context>::RunWithType() { ...@@ -7,7 +7,7 @@ void CopyOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>(); auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>( ctx()->template Copy<T, Context, Context>(
Output(0)->count(), Ydata, Xdata); Output(0)->count(), Ydata, Xdata);
} }
......
...@@ -20,10 +20,10 @@ void CTCLossGradientOp<Context>::RunWithType() { ...@@ -20,10 +20,10 @@ void CTCLossGradientOp<Context>::RunWithType() {
auto* dXdata = Output(0)->template mutable_data<T, Context>(); auto* dXdata = Output(0)->template mutable_data<T, Context>();
auto* dYdata = Input(-1).template data<T, Context>(); auto* dYdata = Input(-1).template data<T, Context>();
T dYdata_host; ctx().template Copy<T, CPUContext, Context>( T dYdata_host; ctx()->template Copy<T, CPUContext, Context>(
1, &dYdata_host, dYdata); 1, &dYdata_host, dYdata);
math::Scale<T, Context>(Output(0)->count(), math::Scale<T, Context>(Output(0)->count(),
dYdata_host, Gdata, dXdata, &ctx()); dYdata_host, Gdata, dXdata, ctx());
} }
template <class Context> template <class Context>
......
...@@ -45,7 +45,7 @@ void CuDNNCTCLossOp<Context>::RunWithType() { ...@@ -45,7 +45,7 @@ void CuDNNCTCLossOp<Context>::RunWithType() {
cudnnSetTensorDesc<T>(&grad_desc, Input(0).dims()); cudnnSetTensorDesc<T>(&grad_desc, Input(0).dims());
CUDNN_CHECK(cudnnGetCTCLossWorkspaceSize( CUDNN_CHECK(cudnnGetCTCLossWorkspaceSize(
ctx().cudnn_handle(), prob_desc, grad_desc, ctx()->cudnn_handle(), prob_desc, grad_desc,
packed_labels.data(), label_lengths.data(), packed_labels.data(), label_lengths.data(),
input_lengths.data(), input_lengths.data(),
ctc_algo, ctc_desc, &workspace_size)); ctc_algo, ctc_desc, &workspace_size));
...@@ -58,7 +58,7 @@ void CuDNNCTCLossOp<Context>::RunWithType() { ...@@ -58,7 +58,7 @@ void CuDNNCTCLossOp<Context>::RunWithType() {
auto* WSdata = (uint8_t*)ws()->template caches<Context>({ auto* WSdata = (uint8_t*)ws()->template caches<Context>({
workspace_size })[0]; workspace_size })[0];
CUDNN_CHECK(cudnnCTCLoss(ctx().cudnn_handle(), CUDNN_CHECK(cudnnCTCLoss(ctx()->cudnn_handle(),
prob_desc, Pdata, packed_labels.data(), prob_desc, Pdata, packed_labels.data(),
label_lengths.data(), input_lengths.data(), label_lengths.data(), input_lengths.data(),
Ydata, grad_desc, Gdata, Ydata, grad_desc, Gdata,
......
...@@ -12,11 +12,13 @@ void L1LossOp<Context>::RunWithType() { ...@@ -12,11 +12,13 @@ void L1LossOp<Context>::RunWithType() {
auto* diff_data = diff->template mutable_data<T, Context>(); auto* diff_data = diff->template mutable_data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Sub<T, Context>(Input(0).count(), X0data, X1data, diff_data); math::Sub<T, Context>(Input(0).count(),
X0data, X1data, diff_data, ctx());
if (InputSize() > 2) { if (InputSize() > 2) {
CHECK_EQ(Input(0).count(), Input(2).count()); CHECK_EQ(Input(0).count(), Input(2).count());
auto* Wdata = Input(2).template data<T, Context>(); auto* Wdata = Input(2).template data<T, Context>();
math::Mul<T, Context>(diff->count(), Wdata, diff_data, diff_data); math::Mul<T, Context>(diff->count(),
Wdata, diff_data, diff_data, ctx());
} }
T normalizer = 1; T normalizer = 1;
...@@ -27,11 +29,13 @@ void L1LossOp<Context>::RunWithType() { ...@@ -27,11 +29,13 @@ void L1LossOp<Context>::RunWithType() {
} }
T loss = math::ASum<T, Context>(diff->count(), diff_data); T loss = math::ASum<T, Context>(diff->count(), diff_data);
math::Set<T, Context>(1, loss / normalizer, Ydata); math::Set<T, Context>(1, loss / normalizer, Ydata, ctx());
} }
template <class Context> template <class Context>
void L1LossOp<Context>::RunOnDevice() { void L1LossOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
CHECK_EQ(Input(0).count(), Input(1).count()); CHECK_EQ(Input(0).count(), Input(1).count());
Output(0)->Reshape({ 1 }); Output(0)->Reshape({ 1 });
diff = ws()->CreateTensor("/mnt/" + anchor() + "/l1_loss/diff"); diff = ws()->CreateTensor("/mnt/" + anchor() + "/l1_loss/diff");
...@@ -51,9 +55,11 @@ template <class Context> template <typename T> ...@@ -51,9 +55,11 @@ template <class Context> template <typename T>
void L1LossGradientOp<Context>::RunWithType() { void L1LossGradientOp<Context>::RunWithType() {
auto* diff_data = diff->template mutable_data<T, Context>(); auto* diff_data = diff->template mutable_data<T, Context>();
auto* dYdata = Input(-1).template data<T, Context>(); auto* dYdata = Input(-1).template data<T, Context>();
T dYdata_host; ctx().template Copy<T, CPUContext, Context>( T dYdata_host; ctx()->template Copy<T, CPUContext, Context>(
1, &dYdata_host, dYdata); 1, &dYdata_host, dYdata);
kernel::AbsGrad<T, Context>(diff->count(), diff_data, diff_data); ctx()->FinishDeviceCompution();
kernel::AbsGrad<T, Context>(diff->count(),
diff_data, diff_data, ctx());
T alpha = dYdata_host, normalizer = 1; T alpha = dYdata_host, normalizer = 1;
if (normalization == "BATCH_SIZE") { if (normalization == "BATCH_SIZE") {
...@@ -69,7 +75,7 @@ void L1LossGradientOp<Context>::RunWithType() { ...@@ -69,7 +75,7 @@ void L1LossGradientOp<Context>::RunWithType() {
const T sign = (i == 0) ? 1 : -1; const T sign = (i == 0) ? 1 : -1;
alpha *= sign; alpha *= sign;
math::Axpby<T, Context>(Output(i)->count(), math::Axpby<T, Context>(Output(i)->count(),
alpha, diff_data, 0, dXdata, &ctx()); alpha, diff_data, 0, dXdata, ctx());
} }
} }
......
...@@ -9,12 +9,14 @@ void L2LossOp<Context>::RunWithType() { ...@@ -9,12 +9,14 @@ void L2LossOp<Context>::RunWithType() {
auto* X0data = Input(0).template data<T, Context>(); auto* X0data = Input(0).template data<T, Context>();
auto* X1data = Input(1).template data<T, Context>(); auto* X1data = Input(1).template data<T, Context>();
auto* diff_data = diff->template mutable_data<T, Context>(); auto* diff_data = diff->template mutable_data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<float, Context>();
math::Sub<T, Context>(diff->count(), X0data, X1data, diff_data); math::Sub<T, Context>(diff->count(),
X0data, X1data, diff_data, ctx());
if (InputSize() > 2) { if (InputSize() > 2) {
CHECK_EQ(Input(0).count(), Input(2).count()); CHECK_EQ(Input(0).count(), Input(2).count());
auto* Wdata = Input(2).template data<T, Context>(); auto* Wdata = Input(2).template data<T, Context>();
math::Mul<T, Context>(diff->count(), Wdata, diff_data, diff_data); math::Mul<T, Context>(diff->count(),
Wdata, diff_data, diff_data, ctx());
} }
T normalizer = 1; T normalizer = 1;
...@@ -23,10 +25,12 @@ void L2LossOp<Context>::RunWithType() { ...@@ -23,10 +25,12 @@ void L2LossOp<Context>::RunWithType() {
} else if (normalization == "FULL") { } else if (normalization == "FULL") {
normalizer = Input(0).count(); normalizer = Input(0).count();
} }
normalizer *= 2;
T loss = T(0.5) * math::Dot<T, Context>(diff->count(), T loss;
diff_data, diff_data, &ctx()); math::Dot<T, Context>(diff->count(),
math::Set<T, Context>(1, loss / normalizer, Ydata); diff_data, diff_data, &loss, ctx());
math::Set<T, Context>(1, loss / normalizer, Ydata, ctx());
} }
template <class Context> template <class Context>
...@@ -48,10 +52,11 @@ OPERATOR_SCHEMA(L2Loss).NumInputs(2, 3).NumOutputs(1); ...@@ -48,10 +52,11 @@ OPERATOR_SCHEMA(L2Loss).NumInputs(2, 3).NumOutputs(1);
template <class Context> template <typename T> template <class Context> template <typename T>
void L2LossGradientOp<Context>::RunWithType() { void L2LossGradientOp<Context>::RunWithType() {
auto* diff_data = diff->template mutable_data<T, Context>(); auto* diff_data = diff->template data<T, Context>();
auto* dYdata = Input(-1).template data<T, Context>(); auto* dYdata = Input(-1).template data<T, Context>();
T dYdata_host; ctx().template Copy<T, CPUContext, Context>( T dYdata_host; ctx()->template Copy<T, CPUContext, Context>(
1, &dYdata_host, dYdata); 1, &dYdata_host, dYdata);
ctx()->FinishDeviceCompution();
T alpha = dYdata_host, normalizer = 1; T alpha = dYdata_host, normalizer = 1;
if (normalization == "BATCH_SIZE") { if (normalization == "BATCH_SIZE") {
...@@ -67,7 +72,7 @@ void L2LossGradientOp<Context>::RunWithType() { ...@@ -67,7 +72,7 @@ void L2LossGradientOp<Context>::RunWithType() {
const T sign = (i == 0) ? 1 : -1; const T sign = (i == 0) ? 1 : -1;
alpha *= sign; alpha *= sign;
math::Axpby<T, Context>(Output(i)->count(), math::Axpby<T, Context>(Output(i)->count(),
alpha, diff_data, 0, dXdata, &ctx()); alpha, diff_data, 0, dXdata, ctx());
} }
} }
......
...@@ -13,11 +13,11 @@ void SigmoidCrossEntropyOp<Context>::RunWithType() { ...@@ -13,11 +13,11 @@ void SigmoidCrossEntropyOp<Context>::RunWithType() {
auto* Fdata = flags.template mutable_data<T, Context>(); auto* Fdata = flags.template mutable_data<T, Context>();
kernel::SigmoidCrossEntropy<T, Context>( kernel::SigmoidCrossEntropy<T, Context>(
Input(0).count(), Xdata, Tdata, Ldata, Fdata, &ctx()); Input(0).count(), Xdata, Tdata, Ldata, Fdata, ctx());
if (normalization == "UNIT") { if (normalization == "UNIT") {
Output(0)->ReshapeLike(losses); Output(0)->ReshapeLike(losses);
Output(0)->template CopyFrom<Context>(losses); Output(0)->template CopyFrom<Context>(losses, ctx());
return; return;
} }
...@@ -35,11 +35,13 @@ void SigmoidCrossEntropyOp<Context>::RunWithType() { ...@@ -35,11 +35,13 @@ void SigmoidCrossEntropyOp<Context>::RunWithType() {
T loss = math::ASum<T, Context>(losses.count(), Ldata); T loss = math::ASum<T, Context>(losses.count(), Ldata);
Output(0)->Reshape({ 1 }); Output(0)->Reshape({ 1 });
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Set<T, Context>(1, loss / normalizer, Ydata); math::Set<T, Context>(1, loss / normalizer, Ydata, ctx());
} }
template <class Context> template <class Context>
void SigmoidCrossEntropyOp<Context>::RunOnDevice() { void SigmoidCrossEntropyOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
CHECK_EQ(Input(0).count(), Input(1).count()) CHECK_EQ(Input(0).count(), Input(1).count())
<< "\nNumber of predictions must match the number of labels."; << "\nNumber of predictions must match the number of labels.";
losses.ReshapeLike(Input(0)); losses.ReshapeLike(Input(0));
...@@ -63,12 +65,12 @@ void SigmoidCrossEntropyGradientOp<Context>::RunWithType() { ...@@ -63,12 +65,12 @@ void SigmoidCrossEntropyGradientOp<Context>::RunWithType() {
auto* Fdata = flags.template mutable_data<T, Context>(); auto* Fdata = flags.template mutable_data<T, Context>();
kernel::SigmoidCrossEntropyGrad<T, Context>( kernel::SigmoidCrossEntropyGrad<T, Context>(
Input(0).count(), Xdata, Tdata, dXdata, Fdata, &ctx()); Input(0).count(), Xdata, Tdata, dXdata, Fdata, ctx());
if (normalization == "UNIT") { if (normalization == "UNIT") {
auto* dYdata = Input(-1).template data<T, Context>(); auto* dYdata = Input(-1).template data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), math::Mul<T, Context>(Output(0)->count(),
dYdata, dXdata, dXdata); return; dYdata, dXdata, dXdata, ctx()); return;
} }
T normalizer = 1; T normalizer = 1;
...@@ -83,14 +85,16 @@ void SigmoidCrossEntropyGradientOp<Context>::RunWithType() { ...@@ -83,14 +85,16 @@ void SigmoidCrossEntropyGradientOp<Context>::RunWithType() {
} }
auto* dYdata = Input(-1).template data<T, Context>(); auto* dYdata = Input(-1).template data<T, Context>();
T dYdata_host; ctx().template Copy<T, CPUContext, Context>( T dYdata_host; ctx()->template Copy<T, CPUContext, Context>(
1, &dYdata_host, dYdata); 1, &dYdata_host, dYdata);
math::Scal<T, Context>(Output(0)->count(), math::Scal<T, Context>(Output(0)->count(),
dYdata_host / normalizer, dXdata, &ctx()); dYdata_host / normalizer, dXdata, ctx());
} }
template <class Context> template <class Context>
void SigmoidCrossEntropyGradientOp<Context>::RunOnDevice() { void SigmoidCrossEntropyGradientOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
Output(0)->ReshapeLike(Input(0)); Output(0)->ReshapeLike(Input(0));
flags.ReshapeLike(Input(0)); flags.ReshapeLike(Input(0));
......
...@@ -15,11 +15,11 @@ void SigmoidFocalLossOp<Context>::RunWithType() { ...@@ -15,11 +15,11 @@ void SigmoidFocalLossOp<Context>::RunWithType() {
kernel::SigmoidFocalLoss<T, Context>( kernel::SigmoidFocalLoss<T, Context>(
outer_dim, axis_dim, inner_dim, outer_dim, axis_dim, inner_dim,
pos_alpha, neg_alpha, gamma, neg_id, pos_alpha, neg_alpha, gamma, neg_id,
Xdata, Tdata, Ldata, Fdata, &ctx()); Xdata, Tdata, Ldata, Fdata, ctx());
if (normalization == "UNIT") { if (normalization == "UNIT") {
Output(0)->ReshapeLike(losses); Output(0)->ReshapeLike(losses);
Output(0)->template CopyFrom<Context>(losses); Output(0)->template CopyFrom<Context>(losses, ctx());
return; return;
} }
...@@ -37,11 +37,13 @@ void SigmoidFocalLossOp<Context>::RunWithType() { ...@@ -37,11 +37,13 @@ void SigmoidFocalLossOp<Context>::RunWithType() {
T loss = math::ASum<T, Context>(losses.count(), Ldata); T loss = math::ASum<T, Context>(losses.count(), Ldata);
Output(0)->Reshape({ 1 }); Output(0)->Reshape({ 1 });
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Set<T, Context>(1, loss / normalizer, Ydata); math::Set<T, Context>(1, loss / normalizer, Ydata, ctx());
} }
template <class Context> template <class Context>
void SigmoidFocalLossOp<Context>::RunOnDevice() { void SigmoidFocalLossOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
outer_dim = Input(0).count(0, axis); outer_dim = Input(0).count(0, axis);
axis_dim = Input(0).dim(axis); axis_dim = Input(0).dim(axis);
inner_dim = Input(0).count(axis + 1); inner_dim = Input(0).count(axis + 1);
...@@ -71,12 +73,12 @@ void SigmoidFocalLossGradientOp<Context>::RunWithType() { ...@@ -71,12 +73,12 @@ void SigmoidFocalLossGradientOp<Context>::RunWithType() {
kernel::SigmoidFocalLossGradient<T, Context>( kernel::SigmoidFocalLossGradient<T, Context>(
outer_dim, axis_dim, inner_dim, outer_dim, axis_dim, inner_dim,
pos_alpha, neg_alpha, gamma, neg_id, pos_alpha, neg_alpha, gamma, neg_id,
Xdata, Tdata, dXdata, Fdata, &ctx()); Xdata, Tdata, dXdata, Fdata, ctx());
if (normalization == "UNIT") { if (normalization == "UNIT") {
auto* dYdata = Input(-1).template data<T, Context>(); auto* dYdata = Input(-1).template data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), math::Mul<T, Context>(Output(0)->count(),
dYdata, dXdata, dXdata); return; dYdata, dXdata, dXdata, ctx()); return;
} }
T normalizer = 1; T normalizer = 1;
...@@ -91,14 +93,16 @@ void SigmoidFocalLossGradientOp<Context>::RunWithType() { ...@@ -91,14 +93,16 @@ void SigmoidFocalLossGradientOp<Context>::RunWithType() {
} }
auto* dYdata = Input(-1).template data<T, Context>(); auto* dYdata = Input(-1).template data<T, Context>();
T dYdata_host; ctx().template Copy<T, CPUContext, Context>( T dYdata_host; ctx()->template Copy<T, CPUContext, Context>(
1, &dYdata_host, dYdata); 1, &dYdata_host, dYdata);
math::Scal<T, Context>(Output(0)->count(), math::Scal<T, Context>(Output(0)->count(),
dYdata_host / normalizer, dXdata, &ctx()); dYdata_host / normalizer, dXdata, ctx());
} }
template <class Context> template <class Context>
void SigmoidFocalLossGradientOp<Context>::RunOnDevice() { void SigmoidFocalLossGradientOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
outer_dim = Input(0).count(0, axis); outer_dim = Input(0).count(0, axis);
axis_dim = Input(0).dim(axis); axis_dim = Input(0).dim(axis);
inner_dim = Input(0).count(axis + 1); inner_dim = Input(0).count(axis + 1);
......
...@@ -11,20 +11,21 @@ void SmoothL1LossOp<Context>::RunWithType() { ...@@ -11,20 +11,21 @@ void SmoothL1LossOp<Context>::RunWithType() {
auto* X1data = Input(1).template data<T, Context>(); auto* X1data = Input(1).template data<T, Context>();
auto* diff_data = diff->template mutable_data<T, Context>(); auto* diff_data = diff->template mutable_data<T, Context>();
auto* error_data = error->template mutable_data<T, Context>(); auto* error_data = error->template mutable_data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<float, Context>();
math::Sub<T, Context>(diff->count(), X0data, X1data, diff_data); math::Sub<T, Context>(diff->count(),
X0data, X1data, diff_data, ctx());
if (InputSize() > 2) { if (InputSize() > 2) {
auto* inside_w_data = Input(2).template data<T, Context>(); auto* inside_w_data = Input(2).template data<T, Context>();
math::Mul<T, Context>(diff->count(), math::Mul<T, Context>(diff->count(),
inside_w_data, diff_data, diff_data); inside_w_data, diff_data, diff_data, ctx());
} }
kernel::SmoothL1<T, Context>( kernel::SmoothL1<T, Context>(diff->count(),
diff->count(), beta, diff_data, error_data); beta, diff_data, error_data, ctx());
if (InputSize() > 3) { if (InputSize() > 3) {
auto* outside_w_data = Input(3).template data<T, Context>(); auto* outside_w_data = Input(3).template data<T, Context>();
math::Mul<T, Context>(diff->count(), math::Mul<T, Context>(diff->count(),
outside_w_data, error_data, error_data); outside_w_data, error_data, error_data, ctx());
} }
T normalizer = 1; T normalizer = 1;
...@@ -34,12 +35,14 @@ void SmoothL1LossOp<Context>::RunWithType() { ...@@ -34,12 +35,14 @@ void SmoothL1LossOp<Context>::RunWithType() {
normalizer = Input(0).count(); normalizer = Input(0).count();
} }
T loss = math::ASum<T, Context>(error->count(), error_data); float loss = math::ASum<float, Context>(error->count(), error_data);
math::Set<T, Context>(1, loss / normalizer, Ydata); math::Set<float, Context>(1, loss / normalizer, Ydata, ctx());
} }
template <class Context> template <class Context>
void SmoothL1LossOp<Context>::RunOnDevice() { void SmoothL1LossOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
CHECK(Input(0).dims() == Input(1).dims()); CHECK(Input(0).dims() == Input(1).dims());
if (InputSize() > 2) CHECK(Input(0).dims() == Input(2).dims()); if (InputSize() > 2) CHECK(Input(0).dims() == Input(2).dims());
if (InputSize() > 3) CHECK(Input(0).dims() == Input(3).dims()); if (InputSize() > 3) CHECK(Input(0).dims() == Input(3).dims());
...@@ -64,10 +67,12 @@ template <class Context> template <typename T> ...@@ -64,10 +67,12 @@ template <class Context> template <typename T>
void SmoothL1LossGradientOp<Context>::RunWithType() { void SmoothL1LossGradientOp<Context>::RunWithType() {
auto* diff_data = diff->template mutable_data<T, Context>(); auto* diff_data = diff->template mutable_data<T, Context>();
auto* dYdata = Input(-1).template data<T, Context>(); auto* dYdata = Input(-1).template data<T, Context>();
T dYdata_host; ctx().template Copy<T, CPUContext, Context>( T dYdata_host; ctx()->template Copy<T, CPUContext, Context>(
1, &dYdata_host, dYdata); 1, &dYdata_host, dYdata);
kernel::SmoothL1Grad<T, Context>( ctx()->FinishDeviceCompution();
diff->count(), beta, diff_data, diff_data);
kernel::SmoothL1Grad<T, Context>(diff->count(),
beta, diff_data, diff_data, ctx());
T alpha = dYdata_host, normalizer = 1; T alpha = dYdata_host, normalizer = 1;
if (normalization == "BATCH_SIZE") { if (normalization == "BATCH_SIZE") {
...@@ -83,16 +88,16 @@ void SmoothL1LossGradientOp<Context>::RunWithType() { ...@@ -83,16 +88,16 @@ void SmoothL1LossGradientOp<Context>::RunWithType() {
const T sign = (i == 0) ? 1 : -1; const T sign = (i == 0) ? 1 : -1;
alpha *= sign; alpha *= sign;
math::Axpby<T, Context>(Output(i)->count(), math::Axpby<T, Context>(Output(i)->count(),
alpha, diff_data, 0, dXdata, &ctx()); alpha, diff_data, 0, dXdata, ctx());
if (InputSize() > 3) { if (InputSize() > 3) {
auto* inside_w_data = Input(2).template data<T, Context>(); auto* inside_w_data = Input(2).template data<T, Context>();
math::Mul<T, Context>(Output(i)->count(), math::Mul<T, Context>(Output(i)->count(),
inside_w_data, dXdata, dXdata); inside_w_data, dXdata, dXdata, ctx());
} }
if (InputSize() > 4) { if (InputSize() > 4) {
auto* outside_w_data = Input(3).template data<T, Context>(); auto* outside_w_data = Input(3).template data<T, Context>();
math::Mul<T, Context>(Output(i)->count(), math::Mul<T, Context>(Output(i)->count(),
outside_w_data, dXdata, dXdata); outside_w_data, dXdata, dXdata, ctx());
} }
} }
} }
......
...@@ -26,15 +26,15 @@ void SoftmaxCrossEntropyOp<Context>::RunWithType() { ...@@ -26,15 +26,15 @@ void SoftmaxCrossEntropyOp<Context>::RunWithType() {
auto* Pdata = prob->template data<T, Context>(); auto* Pdata = prob->template data<T, Context>();
auto* Tdata = Input(1).template data<T, Context>(); auto* Tdata = Input(1).template data<T, Context>();
auto* Ldata = losses.template mutable_data<T, Context>(); auto* Ldata = losses.template mutable_data<T, Context>();
kernel::SoftmaxCrossEntropy<T, Context>( kernel::SoftmaxCrossEntropy<T, Context>(Input(0).count(),
Input(0).count(), Pdata, Tdata, Ldata); Pdata, Tdata, Ldata, ctx());
if (normalization == "UNIT") { if (normalization == "UNIT") {
Output(0)->Reshape({ outer_dim * inner_dim }); Output(0)->Reshape({ outer_dim * inner_dim });
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::Sum<T, Context>(outer_dim * inner_dim, kernel::Sum<T, Context>(outer_dim * inner_dim,
Input(0).dim(axis), inner_dim, Input(0).dim(axis), inner_dim,
Ldata, Ydata); return; Ldata, Ydata, ctx()); return;
} }
T normalizer = 1; T normalizer = 1;
...@@ -47,11 +47,13 @@ void SoftmaxCrossEntropyOp<Context>::RunWithType() { ...@@ -47,11 +47,13 @@ void SoftmaxCrossEntropyOp<Context>::RunWithType() {
T loss = math::ASum<T, Context>(losses.count(), Ldata); T loss = math::ASum<T, Context>(losses.count(), Ldata);
Output(0)->Reshape({ 1 }); Output(0)->Reshape({ 1 });
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Set<T, Context>(1, loss / normalizer, Ydata); math::Set<T, Context>(1, loss / normalizer, Ydata, ctx());
} }
template <class Context> template <class Context>
void SoftmaxCrossEntropyOp<Context>::RunOnDevice() { void SoftmaxCrossEntropyOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
outer_dim = Input(0).count(0, axis); outer_dim = Input(0).count(0, axis);
inner_dim = Input(0).count(axis + 1); inner_dim = Input(0).count(axis + 1);
CHECK_EQ(Input(0).count(), Input(1).count()) CHECK_EQ(Input(0).count(), Input(1).count())
...@@ -76,16 +78,16 @@ void SoftmaxCrossEntropyGradientOp<Context>::RunWithType() { ...@@ -76,16 +78,16 @@ void SoftmaxCrossEntropyGradientOp<Context>::RunWithType() {
auto* Tdata = Input(1).template data<T, Context>(); auto* Tdata = Input(1).template data<T, Context>();
auto* Pdata = prob->template mutable_data<T, Context>(); auto* Pdata = prob->template mutable_data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>(); auto* dXdata = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(prob->count(), dXdata, Pdata); ctx()->template Copy<T, Context, Context>(prob->count(), dXdata, Pdata);
math::Axpy<T, Context>(Output(0)->count(), math::Axpy<T, Context>(Output(0)->count(),
-1.0, Tdata, dXdata, &ctx()); -1.0, Tdata, dXdata, ctx());
if (normalization == "UNIT") { if (normalization == "UNIT") {
auto* dYdata = Input(-1).template data<T, Context>(); auto* dYdata = Input(-1).template data<T, Context>();
kernel::SumGrad<T, Context>(outer_dim * inner_dim, kernel::SumGrad<T, Context>(outer_dim * inner_dim,
Input(0).dim(axis), inner_dim, 1.0, dYdata, Pdata); Input(0).dim(axis), inner_dim, 1.0, dYdata, Pdata, ctx());
math::Mul<T, Context>(Output(0)->count(), math::Mul<T, Context>(Output(0)->count(),
Pdata, dXdata, dXdata); return; Pdata, dXdata, dXdata, ctx()); return;
} }
T normalizer = 1; T normalizer = 1;
...@@ -96,10 +98,10 @@ void SoftmaxCrossEntropyGradientOp<Context>::RunWithType() { ...@@ -96,10 +98,10 @@ void SoftmaxCrossEntropyGradientOp<Context>::RunWithType() {
} }
auto* dYdata = Input(-1).template data<T, Context>(); auto* dYdata = Input(-1).template data<T, Context>();
T dYdata_host; ctx().template Copy<T, CPUContext, Context>( T dYdata_host; ctx()->template Copy<T, CPUContext, Context>(
1, &dYdata_host, dYdata); 1, &dYdata_host, dYdata);
math::Scal<T, Context>(Output(0)->count(), math::Scal<T, Context>(Output(0)->count(),
dYdata_host / normalizer, dXdata, &ctx()); dYdata_host / normalizer, dXdata, ctx());
} }
template <class Context> template <class Context>
......
...@@ -20,11 +20,11 @@ void SoftmaxFocalLossOp<Context>::RunWithType() { ...@@ -20,11 +20,11 @@ void SoftmaxFocalLossOp<Context>::RunWithType() {
outer_dim, Input(0).dim(axis), inner_dim, outer_dim, Input(0).dim(axis), inner_dim,
pos_alpha, neg_alpha, gamma, neg_id, pos_alpha, neg_alpha, gamma, neg_id,
Pdata, Tdata, Idata, this->ignores.count(), Pdata, Tdata, Idata, this->ignores.count(),
Ldata, Fdata, &ctx()); Ldata, Fdata, ctx());
if (normalization == "UNIT") { if (normalization == "UNIT") {
Output(0)->ReshapeLike(losses); Output(0)->ReshapeLike(losses);
Output(0)->template CopyFrom<Context>(losses); Output(0)->template CopyFrom<Context>(losses, ctx());
return; return;
} }
...@@ -42,11 +42,13 @@ void SoftmaxFocalLossOp<Context>::RunWithType() { ...@@ -42,11 +42,13 @@ void SoftmaxFocalLossOp<Context>::RunWithType() {
T loss = math::ASum<T, Context>(losses.count(), Ldata); T loss = math::ASum<T, Context>(losses.count(), Ldata);
Output(0)->Reshape({ 1 }); Output(0)->Reshape({ 1 });
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Set<T, Context>(1, loss / normalizer, Ydata); math::Set<T, Context>(1, loss / normalizer, Ydata, ctx());
} }
template <class Context> template <class Context>
void SoftmaxFocalLossOp<Context>::RunOnDevice() { void SoftmaxFocalLossOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
outer_dim = Input(0).count(0, axis); outer_dim = Input(0).count(0, axis);
inner_dim = Input(0).count(axis + 1); inner_dim = Input(0).count(axis + 1);
CHECK_EQ(outer_dim * inner_dim, Input(1).count()) CHECK_EQ(outer_dim * inner_dim, Input(1).count())
...@@ -80,16 +82,16 @@ void SoftmaxFocalLossGradientOp<Context>::RunWithType() { ...@@ -80,16 +82,16 @@ void SoftmaxFocalLossGradientOp<Context>::RunWithType() {
outer_dim, Output(0)->dim(axis), inner_dim, outer_dim, Output(0)->dim(axis), inner_dim,
pos_alpha, neg_alpha, gamma, neg_id, pos_alpha, neg_alpha, gamma, neg_id,
Pdata, Tdata, Idata, this->ignores.count(), Pdata, Tdata, Idata, this->ignores.count(),
dXdata, Fdata, &ctx()); dXdata, Fdata, ctx());
if (normalization == "UNIT") { if (normalization == "UNIT") {
auto* dYdata = Input(-1).template data<T, Context>(); auto* dYdata = Input(-1).template data<T, Context>();
kernel::SumGrad<T, Context>( kernel::SumGrad<T, Context>(
Input(0).count() / Input(0).dim(axis), Input(0).count() / Input(0).dim(axis),
Input(0).dim(axis), inner_dim, Input(0).dim(axis), inner_dim,
1.0, dYdata, Pdata); 1.0, dYdata, Pdata, ctx());
math::Mul<T, Context>(Output(0)->count(), math::Mul<T, Context>(Output(0)->count(),
Pdata, dXdata, dXdata); return; Pdata, dXdata, dXdata, ctx()); return;
} }
T normalizer = 1; T normalizer = 1;
...@@ -104,14 +106,16 @@ void SoftmaxFocalLossGradientOp<Context>::RunWithType() { ...@@ -104,14 +106,16 @@ void SoftmaxFocalLossGradientOp<Context>::RunWithType() {
} }
auto* dYdata = Input(-1).template data<T, Context>(); auto* dYdata = Input(-1).template data<T, Context>();
T dYdata_host; ctx().template Copy<T, CPUContext, Context>( T dYdata_host; ctx()->template Copy<T, CPUContext, Context>(
1, &dYdata_host, dYdata); 1, &dYdata_host, dYdata);
math::Scal<T, Context>(Output(0)->count(), math::Scal<T, Context>(Output(0)->count(),
dYdata_host / normalizer, dXdata, &ctx()); dYdata_host / normalizer, dXdata, ctx());
} }
template <class Context> template <class Context>
void SoftmaxFocalLossGradientOp<Context>::RunOnDevice() { void SoftmaxFocalLossGradientOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
this->prob = ws()->GetTensor("/mnt/" + anchor() + "/softmax/prob"); this->prob = ws()->GetTensor("/mnt/" + anchor() + "/softmax/prob");
outer_dim = this->prob->count(0, axis); outer_dim = this->prob->count(0, axis);
inner_dim = this->prob->count(axis + 1); inner_dim = this->prob->count(axis + 1);
......
...@@ -21,83 +21,66 @@ void SparseSoftmaxCrossEntropyOp<Context>::SoftmaxRun() { ...@@ -21,83 +21,66 @@ void SparseSoftmaxCrossEntropyOp<Context>::SoftmaxRun() {
softmax_op->Run(); softmax_op->Run();
} }
template <class Context>
void SparseSoftmaxCrossEntropyOp<Context>::SoftmaxRunFP16() {
Tensor* XF32 = ws()->CreateTensor(
"/mnt/" + anchor() + "/softmax/xf32");
XF32->ReshapeLike(Input(0));
auto* XdataF16 = Input(0).template data<float16, Context>();
auto* XdataF32 = XF32->template mutable_data<float, Context>();
kernel::TypeA2B<float16, float, Context>(
Input(0).count(), XdataF16, XdataF32);
OperatorDef softmax_def = MakeOperatorDef("Softmax", "",
vector<string>({ XF32->name() }),
vector<string>({ "/mnt/" + anchor() + "/softmax/prob" }));
softmax_def.add_arg()->CopyFrom(this->arg("axis"));
if (def().has_device_option())
softmax_def.mutable_device_option()
->CopyFrom(def().device_option());
if (!softmax_op) softmax_op.reset(
CreateOperator(softmax_def, ws()));
else softmax_op->MutableOp(softmax_def);
softmax_op->Run();
}
template <class Context> template <typename Tx, typename Ty> template <class Context> template <typename Tx, typename Ty>
void SparseSoftmaxCrossEntropyOp<Context>::RunWithType() { void SparseSoftmaxCrossEntropyOp<Context>::RunWithType() {
auto* Pdata = prob->template data<Tx, Context>(); auto* Pdata = prob->template data<Tx, Context>();
auto* Tdata = Input(1).template data<Ty, Context>(); auto* Tdata = Input(1).template data<Ty, Context>();
auto* Idata = !ignores.count() ? nullptr : auto* Idata = !ignores.count() ? nullptr :
ignores.template data<int, Context>(); ignores.template data<int, Context>();
auto* Ldata = losses.template mutable_data<Tx, Context>(); auto* Ldata = losses.template mutable_data<float, Context>();
auto* Fdata = flags.template mutable_data<Tx, Context>(); auto* Fdata = flags.template mutable_data<float, Context>();
kernel::SparseSoftmaxCrossEntropy<Tx, Ty, Context>( kernel::SparseSoftmaxCrossEntropy<Tx, Ty, Context>(
outer_dim, Input(0).dim(axis), inner_dim, outer_dim, Input(0).dim(axis), inner_dim,
Pdata, Tdata, Idata, ignores.count(), Pdata, Tdata, Idata, ignores.count(),
Ldata, Fdata, &ctx()); Ldata, Fdata, ctx());
if (normalization == "UNIT") { if (normalization == "UNIT") {
Output(0)->ReshapeLike(losses); Output(0)->ReshapeLike(losses);
Output(0)->template CopyFrom<Context>(losses); Output(0)->template CopyFrom<Context>(losses, ctx());
return; return;
} }
Tx normalizer = 1; float normalizer = 1;
if (normalization == "VALID") { if (normalization == "VALID") {
normalizer = std::max( normalizer = std::max(
math::ASum<Tx, Context>( math::ASum<float, Context>(
flags.count(), Fdata), (Tx)1.f); flags.count(), Fdata), 1.f);
} else if (normalization == "BATCH_SIZE") { } else if (normalization == "BATCH_SIZE") {
normalizer = Input(0).dim(0); normalizer = Input(0).dim(0);
} else if (normalization == "FULL") { } else if (normalization == "FULL") {
normalizer = outer_dim * inner_dim; normalizer = outer_dim * inner_dim;
} }
Tx loss = math::ASum<Tx, Context>(losses.count(), Ldata); float loss = math::ASum<float, Context>(losses.count(), Ldata);
Output(0)->Reshape({ 1 }); Output(0)->Reshape({ 1 });
auto* Ydata = Output(0)->template mutable_data<Tx, Context>(); auto* Ydata = Output(0)->template mutable_data<float, Context>();
math::Set<Tx, Context>(1, loss / normalizer, Ydata); math::Set<float, Context>(1, loss / normalizer, Ydata, ctx());
} }
template <class Context> template <class Context>
void SparseSoftmaxCrossEntropyOp<Context>::RunOnDevice() { void SparseSoftmaxCrossEntropyOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
outer_dim = Input(0).count(0, axis); outer_dim = Input(0).count(0, axis);
inner_dim = Input(0).count(axis + 1); inner_dim = Input(0).count(axis + 1);
CHECK_EQ(outer_dim * inner_dim, Input(1).count()) CHECK_EQ(outer_dim * inner_dim, Input(1).count())
<< "\nNumber of predictions must match the number of labels."; << "\nNumber of predictions must match the number of labels.";
losses.Reshape({ outer_dim * inner_dim }); losses.Reshape({ outer_dim * inner_dim });
flags.Reshape({ outer_dim * inner_dim }); flags.Reshape({ outer_dim * inner_dim });
prob = ws()->CreateTensor("/mnt/" + anchor() + "/softmax/prob"); prob = ws()->CreateTensor("/mnt/" + anchor() + "/softmax/prob");
SoftmaxRun();
if (XIsType(Input(0), float) || if (XIsType(Input(0), float)) {
XIsType(Input(0), float16)) {
if (XIsType(Input(0), float16)) SoftmaxRunFP16();
else SoftmaxRun();
if (XIsType(Input(1), float)) RunWithType<float, float>(); if (XIsType(Input(1), float)) RunWithType<float, float>();
else if (XIsType(Input(1), int64_t)) RunWithType<float, int64_t>(); else if (XIsType(Input(1), int64_t)) RunWithType<float, int64_t>();
else LOG(FATAL) << DTypeHelper(Input(1), { "float32", "int64" }); else LOG(FATAL) << DTypeHelper(Input(1), { "float32", "int64" });
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32" }); } else if (XIsType(Input(0), float16)) {
if (XIsType(Input(1), float)) RunWithType<float16, float>();
else if (XIsType(Input(1), int64_t)) RunWithType<float16, int64_t>();
else LOG(FATAL) << DTypeHelper(Input(1), { "float32", "int64" });
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
} }
DEPLOY_CPU(SparseSoftmaxCrossEntropy); DEPLOY_CPU(SparseSoftmaxCrossEntropy);
...@@ -113,62 +96,66 @@ void SparseSoftmaxCrossEntropyGradientOp<Context>::RunWithType() { ...@@ -113,62 +96,66 @@ void SparseSoftmaxCrossEntropyGradientOp<Context>::RunWithType() {
auto* Idata = !ignores.count() ? nullptr : auto* Idata = !ignores.count() ? nullptr :
ignores.template data<int, Context>(); ignores.template data<int, Context>();
auto* dXdata = Output(0)->template mutable_data<Tx, Context>(); auto* dXdata = Output(0)->template mutable_data<Tx, Context>();
auto* Fdata = flags.template mutable_data<Tx, Context>(); auto* Fdata = flags.template mutable_data<float, Context>();
ctx().template Copy<Tx, Context, Context>( ctx()->template Copy<Tx, Context, Context>(
prob->count(), dXdata, Pdata); prob->count(), dXdata, Pdata);
kernel::SparseSoftmaxCrossEntropyGrad<Tx, Ty, Context>( kernel::SparseSoftmaxCrossEntropyGrad<Tx, Ty, Context>(
outer_dim, Output(0)->dim(axis), inner_dim, outer_dim, Output(0)->dim(axis), inner_dim,
Pdata, Tdata, Idata, ignores.count(), Pdata, Tdata, Idata, ignores.count(),
dXdata, Fdata, &ctx()); dXdata, Fdata, ctx());
if (normalization == "UNIT") { if (normalization == "UNIT") {
auto* dYdata = Input(-1).template data<Tx, Context>(); auto* dYdata = Input(-1).template data<float, Context>();
kernel::SumGrad<Tx, Context>( auto* WSdata = ws()->template caches<float, Context>(
{ Input(0).count() })[0];
kernel::SumGrad<float, Context>(
Input(0).count() / Input(0).dim(axis), Input(0).count() / Input(0).dim(axis),
Input(0).dim(axis), inner_dim, Input(0).dim(axis), inner_dim,
1.0, dYdata, Pdata); 1.0, dYdata, WSdata, ctx());
math::Mul<Tx, Context>( kernel::TypeA2B<float, Tx, Context>(
Output(0)->count(), Pdata, dXdata, dXdata); Input(0).count(), WSdata, Pdata, ctx());
math::Mul<Tx, Context>(Output(0)->count(),
Pdata, dXdata, dXdata, ctx());
return; return;
} }
Tx normalizer = 1; float normalizer = 1;
if (normalization == "VALID") { if (normalization == "VALID") {
normalizer = std::max( normalizer = std::max(
math::ASum<Tx, Context>( math::ASum<float, Context>(
flags.count(), Fdata), (Tx)1.f); flags.count(), Fdata), 1.f);
} else if (normalization == "BATCH_SIZE") { } else if (normalization == "BATCH_SIZE") {
normalizer = Input(0).dim(0); normalizer = Input(0).dim(0);
} else if (normalization == "FULL") { } else if (normalization == "FULL") {
normalizer = outer_dim * inner_dim; normalizer = outer_dim * inner_dim;
} }
auto* dYdata = Input(-1).template data<Tx, Context>(); auto* dYdata = Input(-1).template data<float, Context>();
Tx dYdata_host; ctx().template Copy<Tx, CPUContext, Context>( float dYdata_host; ctx()->template Copy<float, CPUContext, Context>(
1, &dYdata_host, dYdata); 1, &dYdata_host, dYdata);
math::Scal<Tx, Context>(Output(0)->count(), math::Scal<Tx, Context>(Output(0)->count(),
dYdata_host / normalizer, dXdata, &ctx()); dYdata_host / normalizer, dXdata, ctx());
} }
template <class Context> template <class Context>
void SparseSoftmaxCrossEntropyGradientOp<Context>::RunOnDevice() { void SparseSoftmaxCrossEntropyGradientOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
prob = ws()->GetTensor("/mnt/" + anchor() + "/softmax/prob"); prob = ws()->GetTensor("/mnt/" + anchor() + "/softmax/prob");
outer_dim = prob->count(0, axis); outer_dim = prob->count(0, axis);
inner_dim = prob->count(axis + 1); inner_dim = prob->count(axis + 1);
Output(0)->ReshapeLike(Input(0)); Output(0)->ReshapeLike(Input(0));
flags.Reshape({ outer_dim * inner_dim }); flags.Reshape({ outer_dim * inner_dim });
if (XIsType(Input(0), float) || XIsType(Input(0), float16)) { if (XIsType(Input(0), float)) {
if (XIsType(Input(1), float)) RunWithType<float, float>(); if (XIsType(Input(1), float)) RunWithType<float, float>();
else if (XIsType(Input(1), int64_t)) RunWithType<float, int64_t>(); else if (XIsType(Input(1), int64_t)) RunWithType<float, int64_t>();
else LOG(FATAL) << DTypeHelper(Input(1), { "float32", "int64" }); else LOG(FATAL) << DTypeHelper(Input(1), { "float32", "int64" });
if (XIsType(Input(0), float16)) { } else if (XIsType(Input(0), float16)) {
auto* dXdataF32 = Output(0)->template data<float, Context>(); if (XIsType(Input(1), float)) RunWithType<float16, float>();
auto* dXdataF16 = prob->template mutable_data<float16, Context>(); else if (XIsType(Input(1), int64_t)) RunWithType<float16, int64_t>();
kernel::TypeA2B<float, float16, Context>(Output(0)->count(), dXdataF32, dXdataF16); else LOG(FATAL) << DTypeHelper(Input(1), { "float32", "int64" });
Output(0)->template CopyFrom<Context>(*prob);
}
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" }); } else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
} }
......
...@@ -9,23 +9,27 @@ namespace dragon { ...@@ -9,23 +9,27 @@ namespace dragon {
template <class Context> template <typename Tx, typename Ty> template <class Context> template <typename Tx, typename Ty>
void AccuracyOp<Context>::RunWithType() { void AccuracyOp<Context>::RunWithType() {
static CPUContext cctx;
float* Y1data, *Y2data = nullptr;
Y1data = Output(0)->template mutable_data<float, CPUContext>();
if (OutputSize() > 1) { if (OutputSize() > 1) {
math::Set<float, CPUContext>(num_classes, 0, Y2data = Output(1)->template mutable_data<float, CPUContext>();
Output(1)->template mutable_data<float, CPUContext>()); math::Set<float, CPUContext>(num_classes, 0, Y2data, &cctx);
} }
Map<int, TIndex> num_per_class;
Map<int, TIndex> num_per_class;
TIndex acc = 0, count = 0; TIndex acc = 0, count = 0;
const Tx* Xdata; const Tx* Xdata;
if (XIsType(Input(0), float16)) { if (XIsType(Input(0), float16)) {
Tensor* XF32 = ws()->CreateTensor("/mnt/" + anchor() + "/accuracy/xf32"); Tensor* X32T = ws()->CreateTensor(
XF32->ReshapeLike(Input(0)); "/mnt/" + anchor() + "/accuracy/f32");
auto* XdataF16 = Input(0).template data<float16, CPUContext>(); X32T->ReshapeLike(Input(0));
auto* XdataF32 = XF32->template mutable_data<float, CPUContext>(); auto* X16 = Input(0).template data<float16, CPUContext>();
auto* X32 = X32T->template mutable_data<float, CPUContext>();
kernel::TypeA2B<float16, float, CPUContext>( kernel::TypeA2B<float16, float, CPUContext>(
Input(0).count(), XdataF16, XdataF32); Input(0).count(), X16, X32, &cctx);
Xdata = XdataF32; Xdata = X32;
} else Xdata = Input(0).template data<Tx, CPUContext>(); } else Xdata = Input(0).template data<Tx, CPUContext>();
auto* labels = Input(1).template data<Ty, CPUContext>(); auto* labels = Input(1).template data<Ty, CPUContext>();
...@@ -41,15 +45,13 @@ void AccuracyOp<Context>::RunWithType() { ...@@ -41,15 +45,13 @@ void AccuracyOp<Context>::RunWithType() {
vector<pair<Tx, int> > vec; vector<pair<Tx, int> > vec;
for (int k = 0; k < num_classes; k++) for (int k = 0; k < num_classes; k++)
vec.push_back( vec.push_back(
std::make_pair(Xdata[i * dim + k * inner_dim + j], k) std::make_pair(Xdata[i * dim + k * inner_dim + j], k));
);
std::partial_sort( std::partial_sort(
vec.begin(), vec.begin() + top_k, vec.end(), vec.begin(), vec.begin() + top_k, vec.end(),
std::greater<pair<Tx, int> >()); std::greater<pair<Tx, int> >());
for (int k = 0; k < top_k; k++) { for (int k = 0; k < top_k; k++) {
if (vec[k].second == label) { if (vec[k].second == label) {
if (OutputSize() > 1) if (OutputSize() > 1) Y2data[label]++;
Output(1)->template mutable_data<float, CPUContext>()[label]++;
acc++; acc++;
break; break;
} }
...@@ -58,12 +60,11 @@ void AccuracyOp<Context>::RunWithType() { ...@@ -58,12 +60,11 @@ void AccuracyOp<Context>::RunWithType() {
} // end inner_dim } // end inner_dim
} // end outer_dim } // end outer_dim
Output(0)->template mutable_data<float, CPUContext>()[0] = (float)acc / count; Y1data[0] = (float)acc / count;
if (OutputSize() > 1) { if (Y2data) {
auto* acc_per_class = Output(1)->template mutable_data<float, CPUContext>();
for (int i = 0; i < num_classes; i++) for (int i = 0; i < num_classes; i++)
acc_per_class[i] = num_per_class[i] == 0 ? Y2data[i] = num_per_class[i] == 0 ?
0 : acc_per_class[i] / num_per_class[i]; 0 : Y2data[i] / num_per_class[i];
} }
} }
......
...@@ -14,14 +14,14 @@ namespace dragon { ...@@ -14,14 +14,14 @@ namespace dragon {
Output(0)->ReshapeLike(Input(0)); \ Output(0)->ReshapeLike(Input(0)); \
auto* Xdata = Input(0).template data<type_a, Context>(); \ auto* Xdata = Input(0).template data<type_a, Context>(); \
auto* Ydata = Output(0)->template mutable_data<type_b, Context>(); \ auto* Ydata = Output(0)->template mutable_data<type_b, Context>(); \
kernel::TypeA2B<type_a, type_b, Context>(Input(0).count(), Xdata, Ydata); \ kernel::TypeA2B<type_a, type_b, Context>(Input(0).count(), Xdata, Ydata, ctx()); \
} else { \ } else { \
TIndex count = Output(0)->count(); \ TIndex count = Output(0)->count(); \
auto* Xdata = Output(0)->template data<type_a, Context>(); \ auto* Xdata = Output(0)->template data<type_a, Context>(); \
auto* Cdata = ws()->template caches<type_b, Context>({ count })[0]; \ auto* Cdata = ws()->template caches<type_b, Context>({ count })[0]; \
kernel::TypeA2B<type_a, type_b, Context>(count, Xdata, Cdata); \ kernel::TypeA2B<type_a, type_b, Context>(count, Xdata, Cdata, ctx()); \
auto* Ydata = Output(0)->template mutable_data<type_b, Context>(); \ auto* Ydata = Output(0)->template mutable_data<type_b, Context>(); \
ctx().template Copy<type_b, Context, Context>(count, Ydata, Cdata); \ ctx()->template Copy<type_b, Context, Context>(count, Ydata, Cdata); \
} \ } \
return; \ return; \
} }
......
Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!