Commit 5cd0761b by Ting PAN

Unlock CUDA Async Streams

1 parent 3b990761
Showing with 920 additions and 882 deletions
......@@ -52,9 +52,9 @@ using Set = std::unordered_set<Value> ;
/*
* Define the Kernel version.
*
* | Major(2) | Minor(2) | Patch(10) |
* | Major(2) | Minor(2) | Patch(11) |
*/
#define DRAGON_VERSION 2210
#define DRAGON_VERSION 2211
/*
* Define the default random seed.
......
......@@ -34,6 +34,8 @@ class CPUContext {
virtual ~CPUContext() {}
inline void SwitchToDevice() {}
inline void SwitchToDevice(int stream_id) {}
inline void FinishDeviceCompution() {}
inline static void* New(size_t nbytes) {
......@@ -47,7 +49,15 @@ class CPUContext {
return data;
}
inline static void Memset(size_t nbytes, void* ptr) {
inline static void Memset(
size_t nbytes,
void* ptr) {
memset(ptr, 0, nbytes);
}
inline void MemsetAsync(
size_t nbytes,
void* ptr) {
memset(ptr, 0, nbytes);
}
......@@ -59,18 +69,16 @@ class CPUContext {
memcpy(dst, src, nbytes);
}
inline static void Delete(void* data) { free(data); }
template<class DstContext, class SrcContext>
inline static void MemcpyAsync(
inline void MemcpyAsync(
size_t nbytes,
void* dst,
const void* src) {
NOT_IMPLEMENTED;
memcpy(dst, src, nbytes);
}
template<typename T, class DstContext, class SrcContext>
inline static void Copy(
inline void Copy(
int n,
T* dst,
const T* src) {
......@@ -82,7 +90,10 @@ class CPUContext {
else for (int i = 0; i < n; i++) dst[i] = src[i];
}
inline static void Delete(void* data) { free(data); }
inline int device_id() const { return 0; }
inline void set_stream_id(int stream_id) {}
inline std::mt19937* rand_generator() {
if (!rand_generator_.get())
......
......@@ -23,8 +23,7 @@ namespace dragon {
class CUDAObject {
public:
CUDAObject(int default_stream = 1)
: default_stream(default_stream) {
CUDAObject() {
for (int i = 0; i < CUDA_MAX_DEVICES; i++) {
cuda_streams[i] = vector<cudaStream_t>();
cublas_handles[i] = vector<cublasHandle_t>();
......@@ -38,7 +37,7 @@ class CUDAObject {
for (int i = 0; i < CUDA_MAX_DEVICES; i++) {
for (int j = 0; j < cuda_streams[i].size(); j++) {
auto& stream = cuda_streams[i][j];
// follow caffe2, do not check the stream destroying
// following caffe2, do not check errors when destroying streams
// Error code 29 (driver shutting down) is inevitable
// TODO(PhyscalX): Can someone solve this issue?
if (stream) cudaStreamDestroy(stream);
......@@ -52,19 +51,21 @@ class CUDAObject {
}
}
/**
* Each device takes a group of streams.
*
* The stream 0 is reserved for default stream,
* stream 1 or higher is created as ``cudaStreamNonBlocking``.
*/
// following caffe2,
// each device takes a group of non-blocking streams;
// stream 0 is reserved for the default stream,
// as some computations really require it,
// e.g. cublas.asum() and mixed CPU/CUDA operations.
// besides, some calls, such as cudnn.conv() and cudnn.rnn(),
// produce wrong results if run on non-blocking streams.
// note that caffe2 also uses default streams (within CuDNNState)
cudaStream_t GetStream(int device_id, int stream_id) {
vector<cudaStream_t>& dev_streams = cuda_streams[device_id];
if (dev_streams.size() <= (unsigned)stream_id)
dev_streams.resize(stream_id + 1, nullptr);
if (!dev_streams[stream_id]) {
DeviceGuard guard(device_id);
unsigned int flags = !stream_id && default_stream ?
unsigned int flags = !stream_id ?
cudaStreamDefault : cudaStreamNonBlocking;
CUDA_CHECK(cudaStreamCreateWithFlags(
&dev_streams[stream_id], flags));
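A minimal sketch of this policy in use, assuming only the CUDA runtime API (`LaunchOn` is an illustrative helper, not part of Dragon):

// stream 0 maps to the legacy default stream (cudaStreamDefault), which
// implicitly synchronizes with other blocking streams; stream >= 1 maps
// to a cudaStreamNonBlocking stream that overlaps freely with stream 0.
void LaunchOn(CUDAObject& obj, int device_id, int stream_id) {
    cudaStream_t s = obj.GetStream(device_id, stream_id);
    // ... enqueue kernels on s ...
    CUDA_CHECK(cudaStreamSynchronize(s));  // only if results are needed now
}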
......@@ -102,8 +103,6 @@ class CUDAObject {
}
#endif
int default_stream;
vector<cudaStream_t> cuda_streams[CUDA_MAX_DEVICES];
vector<cublasHandle_t> cublas_handles[CUDA_MAX_DEVICES];
#ifdef WITH_CUDNN
......@@ -129,11 +128,10 @@ class CUDAContext {
stream_id_ = stream_id;
}
inline void SwitchToDevice() { SwitchToDevice(0); }
inline void SwitchToDevice() { SwitchToDevice(1); }
inline void FinishDeviceCompution() {
cudaStreamSynchronize(cuda_object_
.GetStream(device_id_, stream_id_));
cudaStreamSynchronize(cuda_stream());
cudaError_t error = cudaGetLastError();
CHECK_EQ(error, cudaSuccess)
<< "\nCUDA Error: " << cudaGetErrorString(error);
......@@ -147,8 +145,17 @@ class CUDAContext {
return data;
}
inline static void Memset(size_t nbytes, void* ptr) {
cudaMemset(ptr, 0, nbytes);
inline static void Memset(
size_t nbytes,
void* ptr) {
CUDA_CHECK(cudaMemset(ptr, 0, nbytes));
}
inline void MemsetAsync(
size_t nbytes,
void* ptr) {
CUDA_CHECK(cudaMemsetAsync(ptr, 0,
nbytes, cuda_stream()));
}
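Memset stays synchronous while MemsetAsync enqueues on the context's current stream, so a caller that reuses the buffer immediately must still synchronize; a hedged usage sketch:

ctx->SwitchToDevice(1);         // bind this context to stream 1
ctx->MemsetAsync(nbytes, ptr);  // cudaMemsetAsync on ctx->cuda_stream()
ctx->FinishDeviceCompution();   // cudaStreamSynchronize before touching ptr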
template<class DstContext, class SrcContext>
......@@ -169,20 +176,22 @@ class CUDAContext {
cudaMemcpyDefault, cuda_stream()));
}
inline static void Delete(void* data) { cudaFree(data); }
template<typename T, class DstContext, class SrcContext>
static void Copy(
inline void Copy(
int n,
T* dst,
const T* src) {
if (dst == src) return;
Memcpy<SrcContext, DstContext>(
MemcpyAsync<SrcContext, DstContext>(
n * sizeof(T), (void*)dst, (const void*)src);
}
inline static void Delete(void* data) { cudaFree(data); }
inline int device_id() const { return device_id_; }
inline void set_stream_id(int stream_id) { stream_id_ = stream_id; }
inline cudaStream_t cuda_stream() {
return cuda_stream(device_id_, stream_id_);
}
......@@ -227,7 +236,7 @@ class CUDAContext {
static thread_local CUDAObject cuda_object_;
private:
int device_id_, stream_id_ = 0, random_seed_;
int device_id_, stream_id_ = 1, random_seed_;
unique_ptr<std::mt19937> rand_generator_;
curandGenerator_t curand_generator_ = nullptr;
};
......@@ -271,7 +280,7 @@ class CUDAClosure {
protected:
Context* ctx_;
CUDAObject cuda_object_ = 0;
CUDAObject cuda_object_;
vector<int> active_streams_;
};
......@@ -283,8 +292,22 @@ class CUDAContext {
CUDAContext(const int device_id = 0) { CUDA_NOT_COMPILED; }
inline void SwitchToDevice() { CUDA_NOT_COMPILED; }
inline void SwitchToDevice(int stream_id) { CUDA_NOT_COMPILED; }
inline void FinishDeviceCompution() { CUDA_NOT_COMPILED; }
inline static void Memset(
size_t nbytes,
void* ptr) {
CUDA_NOT_COMPILED;
}
inline void MemsetAsync(
size_t nbytes,
void* ptr) {
CUDA_NOT_COMPILED;
}
template<class DstContext, class SrcContext>
inline static void Memcpy(
size_t nbytes,
......@@ -302,6 +325,7 @@ class CUDAContext {
}
inline int device_id() const { return 0; }
inline void set_stream_id(int stream_id) {}
};
#endif // WITH_CUDA
......
......@@ -37,7 +37,8 @@ class GraphBase {
virtual bool Run(
const string& include,
const string& exclude) = 0;
const string& exclude,
const int stream_id = 1) = 0;
inline string name() const { return name_; }
......@@ -58,7 +59,8 @@ class Graph final : public GraphBase {
bool Run(
const string& include,
const string& exclude) override;
const string& exclude,
const int stream_id = 1) override;
GraphDef Prune(const GraphDef& meta_graph);
GraphDef MakeUpdate(const GraphDef& meta_graph);
......
......@@ -44,7 +44,7 @@ class OperatorBase {
const string& anchor);
inline void SwitchToPhase(const string& phase) { phase_ = phase; }
virtual void Run() { NOT_IMPLEMENTED; }
virtual void Run(int stream_id = 1) { NOT_IMPLEMENTED; }
inline const string& name() const { return def_.name(); }
inline const string& type() const { return def_.type(); }
......@@ -100,13 +100,13 @@ class Operator : public OperatorBase {
Output(0)->name() == "ignore"));
}
virtual void Run() final {
void Run(int stream_id = 1) final {
if (!allow_run_) return;
if (allow_recompute_) MakeResource();
ctx().SwitchToDevice();
ctx()->SwitchToDevice(stream_id);
MemorySwitch();
RunOnDevice();
if (do_sync_) ctx().FinishDeviceCompution();
if (do_sync_) ctx()->FinishDeviceCompution();
if (allow_recompute_) CleanResource();
}
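In effect, every operator invocation now carries a stream choice; an illustrative call site:

op->Run(1);  // SwitchToDevice(1), MemorySwitch(), RunOnDevice(),
             // then FinishDeviceCompution() when do_sync_ is set
op->Run(0);  // the same operator, forced onto the default stream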
......@@ -123,7 +123,7 @@ class Operator : public OperatorBase {
virtual void RunOnDevice() = 0;
inline Context& ctx() { return ctx_; }
inline Context* ctx() { return &ctx_; }
inline bool AllowRun() { return allow_run_; }
protected:
......@@ -192,6 +192,27 @@ DECLARE_REGISTRY(
const OperatorDef&,
Workspace*);
#define TENSOR_FILL_WITH_TYPE(tensor, shape, type) \
if (tensor.count() == 0) { \
CHECK(ws()->GetFiller(tensor.name())) \
<< "\nTensor(" << tensor.name() << ") is empty. \n" \
<< "may be specify a filler for it ?"; \
tensor.Reshape(shape); \
unique_ptr< Filler<type, Context> > filler( \
CreateFiller<type, Context>(*ws()->GetFiller(tensor.name()))); \
filler->Fill(&tensor, ctx()); \
ctx()->FinishDeviceCompution(); \
} else { \
TIndex count = 1; \
for (int i = 0; i < shape.size(); i++) count *= shape[i]; \
CHECK_EQ(count, tensor.count()) \
<< "\nModel requests Tensor(" << tensor.name() << ")'s " \
<< "size to be " << count << ", \n" \
<< "but it is now " << tensor.count() << ", " \
<< "did you feed an incorrect Tensor before?"; \
tensor.Reshape(shape); \
}
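An illustrative expansion site for the macro (the shape names below are made up for the example):

// inside some Operator<Context>::RunWithType(): lazily fill the weights
vector<TIndex> weight_shape({ num_output, channels });
TENSOR_FILL_WITH_TYPE(Input(1), weight_shape, float);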
#define TENSOR_FILL(tensor, shape) \
if (tensor.count() == 0) { \
CHECK(ws()->GetFiller(tensor.name())) \
......@@ -200,7 +221,8 @@ DECLARE_REGISTRY(
tensor.Reshape(shape); \
unique_ptr< Filler<T, Context> > filler( \
CreateFiller<T, Context>(*ws()->GetFiller(tensor.name()))); \
filler->Fill(&tensor, &ctx()); \
filler->Fill(&tensor, ctx()); \
ctx()->FinishDeviceCompution(); \
} else { \
TIndex count = 1; \
for (int i = 0; i < shape.size(); i++) count *= shape[i]; \
......@@ -217,7 +239,7 @@ DECLARE_REGISTRY(
if (size > ptr_tensor->count()) { \
ptr_tensor->Reshape({ size }); \
math::Set<T, Context>(size, dragon_cast<T, float>(1.f), \
ptr_tensor->template mutable_data<T, Context>()); \
ptr_tensor->template mutable_data<T, Context>(), ctx()); \
} \
}
......
......@@ -74,7 +74,9 @@ class Tensor {
for (TIndex i = start; i < end; i++) ret *= dim(i);
return ret;
}
inline TIndex count() const { return size_; }
inline TIndex count(const TIndex start) const {
return count(start, ndim());
}
......@@ -197,7 +199,7 @@ class Tensor {
mutable_data_ptr<Context>(&data_ptr);
// call the constructors
if (meta.ctor()) meta_.ctor()(data_ptr, size_);
capacity_ = size_ * meta.itemsize();
capacity_ = size_ * meta.itemsize(); require_init_ = true;
return data_ptr;
}
......@@ -225,6 +227,15 @@ class Tensor {
}
template <typename T, class Context>
T* mutable_data(Context* ctx) {
auto* data = mutable_data<T, Context>();
if (!require_init_) return data;
ctx->MemsetAsync(nbytes(), (void*)data);
require_init_ = false;
return data;
}
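With this overload, zero-initialization is deferred to the first typed access and issued on the caller's stream; a sketch of the resulting semantics:

auto* Ydata = Y->mutable_data<float, CUDAContext>(ctx());   // MemsetAsync once
auto* Yagain = Y->mutable_data<float, CUDAContext>(ctx());  // require_init_ is
                                                            // cleared: no memset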
template <typename T, class Context>
const T* data() const {
CHECK(meta_ == TypeMeta::Make<T>())
<< "\nThe DType of Tensor(" << name() << ") is "
......@@ -234,27 +245,31 @@ class Tensor {
}
template <class Context>
inline void CopyFrom(const Tensor& other) {
inline void CopyFrom(const Tensor& other, Context* ctx) {
if ((void*)&other == (void*)this) return;
CHECK_EQ(size_, other.size_);
auto* src = other.template raw_data<Context>();
auto* dst = raw_mutable_data<Context>(other.meta_);
if (dst == src) return;
if (TypeMeta::Id<Context>() ==
TypeMeta::Id<CPUContext>()) {
CPUContext::Memcpy<Context, Context>(nbytes(), dst, src);
} else if (TypeMeta::Id<Context>() ==
TypeMeta::Id<CUDAContext>()) {
CUDAContext::Memcpy<Context, Context>(nbytes(), dst, src);
}
ctx->template MemcpyAsync<Context, Context>(
nbytes(), dst, src);
require_init_ = false;
}
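Since the copy is now enqueued on the context's stream, a caller that consumes dst right away must still synchronize; e.g., inside a CUDA operator:

dst.CopyFrom<CUDAContext>(src, ctx());  // async on ctx()->cuda_stream()
ctx()->FinishDeviceCompution();         // only if dst is read immediately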
inline void Move(MixedMemory* mem) {
if (mem != nullptr) ex_memory_ = mem;
else ex_memory_ = new MixedMemory(TypeMeta::Make<float>(), 4);
own_mem_ = false;
if (mem != nullptr) {
ex_memory_ = mem;
require_init_ = false;
} else {
ex_memory_ = new MixedMemory(
TypeMeta::Make<float>(), 4);
require_init_ = true;
}
own_mem_ = false;
}
inline void Share(MixedMemory* mem) { Move(mem); is_shared_ = true; }
inline void Share(MixedMemory* mem) {
Move(mem); is_shared_ = true;
require_init_ = false;
}
inline void Reset() {
size_ = capacity_ = 0;
......@@ -275,7 +290,7 @@ class Tensor {
shared_ptr<MixedMemory> memory_;
MixedMemory* ex_memory_ = nullptr;
bool is_corrupted_ = false, is_shared_ = false;
bool own_mem_ = true;
bool own_mem_ = true, require_init_ = true;
};
} // namespace dragon
......
......@@ -179,29 +179,28 @@ class Workspace {
template <class Context>
inline vector<void*> caches(
const vector<size_t>& segments) {
TIndex total_size = 0;
for (auto& segment : segments) total_size += (TIndex)segment;
Tensor* cacheT = CreateTensor("/share/cache");
cacheT->Reshape({ total_size });
vector<void*> caches(segments.size());
caches[0] = cacheT->template mutable_data<uint8_t, Context>();
TIndex nbytes = 0;
for (auto& segment : segments) nbytes += (TIndex)segment;
Tensor* cache_t = CreateTensor("/share/cache");
cache_t->Reshape({ nbytes });
vector<void*> Bcaches(segments.size());
Bcaches[0] = cache_t->template mutable_data<uint8_t, Context>();
for (int i = 1; i < segments.size(); i++)
caches[i] = (uint8_t*)caches[i - 1] + segments[i - 1];
return caches;
Bcaches[i] = (uint8_t*)Bcaches[i - 1] + segments[i - 1];
return Bcaches;
}
template <typename T, class Context>
inline vector<T*> caches(
const vector<TIndex>& segments) {
TIndex total_count = 0;
for (auto& segment : segments) total_count += segment;
Tensor* cacheT = CreateTensor("/share/cache");
cacheT->Reshape({ total_count });
vector<T*> caches(segments.size());
caches[0] = cacheT->template mutable_data<T, Context>();
for (int i = 1; i < segments.size(); i++)
caches[i] = caches[i - 1] + segments[i - 1];
return caches;
vector<size_t> Tsegments;
for (auto& segment : segments)
Tsegments.emplace_back(segment * sizeof(T));
vector<void*> Bcaches = caches<Context>(Tsegments);
vector<T*> Tcaches(segments.size());
for (int i = 0; i < segments.size(); i++)
Tcaches[i] = (T*)Bcaches[i];
return Tcaches;
}
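Both overloads carve the single "/share/cache" tensor into back-to-back segments; an illustrative call (segment sizes are made up):

// three float scratch buffers of n, n and 2n elements from one allocation
auto scratch = ws()->caches<float, CUDAContext>({ n, n, 2 * n });
float* a = scratch[0]; float* b = scratch[1]; float* c = scratch[2];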
/******************** Operator ********************/
......@@ -259,11 +258,12 @@ class Workspace {
void RunGraph(
const string& graph_name,
const string& include,
const string& exclude) {
const string& exclude,
const int stream_id = 1) {
if (!graph_map_.count(graph_name))
LOG(FATAL) << "Graph(" << graph_name
<< ") does not exist.";
graph_map_[graph_name]->Run(include, exclude);
graph_map_[graph_name]->Run(include, exclude, stream_id);
}
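With the new default of stream 1, callers can still pin a graph to the legacy default stream explicitly (the graph name here is illustrative):

ws->RunGraph("GraphDef_1", "", "");     // runs on stream 1, the new default
ws->RunGraph("GraphDef_1", "", "", 0);  // forces the default CUDA stream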
vector<string> GetGraphs() {
......
......@@ -36,7 +36,6 @@ class SparseSoftmaxCrossEntropyOp : public Operator<Context> {
USE_OPERATOR_FUNCTIONS;
void SoftmaxRun();
void SoftmaxRunFP16();
void RunOnDevice() override;
template <typename Tx, typename Ty> void RunWithType();
......
......@@ -42,7 +42,7 @@ public:
// simply copy the dY to dX
Output(0)->ReshapeLike(Input(0));
if (Output(0)->name() != Input(-1).name())
Output(0)->template CopyFrom<Context>(Input(-1));
Output(0)->template CopyFrom<Context>(Input(-1), ctx());
}
};
......
......@@ -34,7 +34,6 @@ class L2NormOp final : public Operator<Context> {
TIndex axis, num_axes, end_axis;
float eps;
string mode;
bool across_inner;
Tensor* norm, buffer;
TIndex outer_dim, dim, inner_dim, spatial_dim;
};
......@@ -55,7 +54,6 @@ class L2NormGradientOp final : public Operator<Context> {
protected:
TIndex axis, num_axes, end_axis;
string mode;
bool across_inner;
Tensor* norm, buffer, buffer_inner;
TIndex outer_dim, dim, inner_dim;
};
......
......@@ -24,7 +24,7 @@ class AdamUpdateOp final : public UpdateOpBase<Context> {
USE_OPERATOR_FUNCTIONS;
USE_UPDATER_FUNCTIONS(Context);
void ComputeRunWithFloat() override;
void ComputeRunWithFloat32() override;
void ComputeRunWithFloat16() override;
protected:
......
......@@ -43,10 +43,26 @@ class CollectiveUpdateOp final : public Operator<Context> {
void InitNCCL();
void RunOnDevice() override;
void MPIAllReduceWithFloat();
void NCCLAllReduceWithFloat();
void MPIBcastWithFloat();
void NCCLBcastWithFloat();
template <typename T> void MPIAllReduce(
Tensor* tensor,
MPI_Datatype dtype);
template <typename T> void MPIBcast(
Tensor* tensor,
MPI_Datatype dtype);
#ifdef WITH_MPI_NCCL
template <typename T> void NCCLAllReduce(
Tensor* tensor,
ncclDataType_t dtype,
cudaStream_t& stream);
template <typename T> void NCCLBcast(
Tensor* tensor,
ncclDataType_t dtype,
cudaStream_t& stream);
#endif
protected:
int comm_size, comm_rank, comm_root;
......
......@@ -24,7 +24,7 @@ class NesterovUpdateOp final : public UpdateOpBase<Context> {
USE_OPERATOR_FUNCTIONS;
USE_UPDATER_FUNCTIONS(Context);
void ComputeRunWithFloat() override;
void ComputeRunWithFloat32() override;
void ComputeRunWithFloat16() override;
protected:
......
......@@ -24,7 +24,7 @@ class RMSPropUpdateOp final : public UpdateOpBase<Context> {
USE_OPERATOR_FUNCTIONS;
USE_UPDATER_FUNCTIONS(Context);
void ComputeRunWithFloat() override;
void ComputeRunWithFloat32() override;
void ComputeRunWithFloat16() override;
protected:
......
......@@ -25,7 +25,7 @@ class SGDUpdateOp final : public UpdateOpBase<Context> {
USE_OPERATOR_FUNCTIONS;
USE_UPDATER_FUNCTIONS(Context);
void ComputeRunWithFloat() override;
void ComputeRunWithFloat32() override;
void ComputeRunWithFloat16() override;
protected:
......
......@@ -35,13 +35,11 @@ class UpdateOpBase : public Operator<Context> {
void RunOnDevice() override;
template <typename T> void PreprocessRunWithType();
virtual void ComputeRunWithFloat() = 0;
virtual void ComputeRunWithFloat32() = 0;
virtual void ComputeRunWithFloat16() = 0;
virtual void ComputeRunWithFloat16() {
LOG(FATAL) << "This Updater does not support FP16.";
}
template <typename T> void UpdateRunWithType();
void UpdateRunWithFloat32();
void UpdateRunWithFloat16();
protected:
float lr_mult, decay_mult;
......
......@@ -80,7 +80,8 @@ class ConvOpBase : public Operator<Context> {
dilation[0], dilation[1],
data_format,
im,
col);
col,
ctx());
} else LOG(FATAL) << "ConvNd has not been implemented yet";
}
template <typename T> void Col2Im(const T* col, T* im) {
......@@ -94,7 +95,8 @@ class ConvOpBase : public Operator<Context> {
dilation[0], dilation[1],
data_format,
col,
im);
im,
ctx());
} else LOG(FATAL) << "ConvNd has not been implemented yet";
}
};
......
......@@ -19,6 +19,8 @@
namespace dragon {
#define HFLT_MIN 6.10e-5F
template <typename DestType, typename SrcType>
DestType dragon_cast(SrcType val);
......
......@@ -29,9 +29,17 @@ namespace dragon {
#ifdef WITH_CUDA
static const int CUDA_THREADS = 1024;
// We do have a server with 10 GPUs :-)
#define CUDA_MAX_DEVICES 10
// The number of CUDA threads to use. We set it to
// 1024, which works for compute capability 2.x.
// Set it to 512 if using compute capability 1.x.
const int CUDA_THREADS = 1024;
// The maximum number of blocks to use in the default kernel call. We set it
// to 65535, which works for compute capability 2.x (where 65536 is the limit).
const int CUDA_MAX_BLOCKS = 65535;
// You really need an NVIDIA DGX-2 !!! :-)
#define CUDA_MAX_DEVICES 16
#define CUDA_VERSION_MIN(major, minor, patch) \
(CUDA_VERSION >= (major * 1000 + minor * 100 + patch))
......@@ -67,12 +75,16 @@ static const int CUDA_THREADS = 1024;
} while (0)
#endif // WITH_MPI_NCCL
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; \
i < n; i += blockDim.x * gridDim.x)
inline int CUDA_BLOCKS(const int N) {
return (N + CUDA_THREADS - 1) / CUDA_THREADS;
return std::max(
std::min(
(N + CUDA_THREADS - 1) / CUDA_THREADS,
CUDA_MAX_BLOCKS
), 1);
}
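A hedged example of the pair in use: the grid is clamped to CUDA_MAX_BLOCKS, and the grid-stride loop covers whatever the clamp cut off (`_Scale` is illustrative, not an existing kernel):

template <typename T>
__global__ void _Scale(const int n, const T alpha, const T* x, T* y) {
    CUDA_1D_KERNEL_LOOP(i, n) y[i] = alpha * x[i];
}
// launched on the context's stream, as elsewhere in this commit
_Scale<float>
    << < CUDA_BLOCKS(n), CUDA_THREADS,
    0, ctx->cuda_stream() >> > (n, 2.f, x, y);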
#if CUDA_VERSION_MAX(9, 0, 0)
......
......@@ -44,6 +44,7 @@ template<> class CUDNNType<float> {
static const cudnnDataType_t type = CUDNN_DATA_FLOAT;
static float oneval, zeroval;
static const void *one, *zero;
typedef float BNParamType;
};
template<> class CUDNNType<double> {
......@@ -51,6 +52,7 @@ template<> class CUDNNType<double> {
static const cudnnDataType_t type = CUDNN_DATA_DOUBLE;
static double oneval, zeroval;
static const void *one, *zero;
typedef double BNParamType;
};
#ifdef WITH_CUDA_FP16
......@@ -59,6 +61,7 @@ template<> class CUDNNType<float16> {
static const cudnnDataType_t type = CUDNN_DATA_HALF;
static float oneval, zeroval;
static const void *one, *zero;
typedef float BNParamType;
};
#endif
......
......@@ -40,7 +40,7 @@ class ConstantFiller final : public Filler<T, Context> {
void Fill(Tensor* tensor, Context* ctx) override {
math::Set<T, Context>(tensor->count(),
dragon_cast<T, float>(filler().value()),
tensor->mutable_data<T, Context>());
tensor->mutable_data<T, Context>(), ctx);
}
protected:
......@@ -71,11 +71,11 @@ class TruncatedNormalFiller final : public Filler<T, Context> {
void Fill(Tensor* tensor, Context* ctx) override {
// implementing it on the GPU is difficult
static CPUContext cpu_ctx;
static CPUContext cctx;
math::RandomTruncatedNormal<T, CPUContext>(tensor->count(),
filler().mean(), filler().std(),
filler().low(), filler().high(),
tensor->mutable_data<T, CPUContext>(), &cpu_ctx);
tensor->mutable_data<T, CPUContext>(), &cctx);
}
protected:
......
......@@ -36,7 +36,8 @@ template <typename T, class Context>
void Set(
const int n,
const T alpha,
T* x);
T* x,
Context* ctx);
template <typename T, class Context>
void RandomUniform(
......@@ -78,73 +79,84 @@ void Add(
const int n,
const T* a,
const T* b,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void Sub(
const int n,
const T* a,
const T* b,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void Mul(
const int n,
const T* a,
const T* b,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void Div(
const int n,
const T* a,
const T* b,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void Clip(
const int n,
const float low,
const float high,
T* x);
T* x,
Context* ctx);
template <typename T, class Context>
void Exp(
const int n,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void Log(
const int n,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void Square(
const int n,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void Sqrt(
const int n,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void Pow(
const int n,
const float alpha,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void Inv(
const int n,
const float numerator,
const T* x,
T* y);
T* y,
Context* ctx);
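All of these Level-1 routines now take the context last, so each kernel lands on the caller's stream; usage mirrors the TENSOR_FILL macro above:

math::Set<T, Context>(n, dragon_cast<T, float>(1.f), y, ctx());
math::Add<T, Context>(n, a, b, y, ctx());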
/******************** Level-2 ********************/
......@@ -164,19 +176,21 @@ void Scale(
Context* ctx);
template <typename T, class Context>
T StridedDot(
void StridedDot(
const int n,
const T* a,
const int incx,
const T* b,
const int incy,
T* y,
Context* ctx);
template <typename T, class Context>
float Dot(
void Dot(
const int n,
const T* a,
const T* b,
T* y,
Context* ctx);
template<typename T, class Context>
......@@ -188,13 +202,15 @@ template<typename T, class Context>
void AddScalar(
const int n,
const float alpha,
T* y);
T* y,
Context* ctx);
template<typename T, class Context>
void MulScalar(
const int n,
const float alpha,
T* y);
T* y,
Context* ctx);
template<typename T, class Context>
void Axpy(
......
......@@ -80,7 +80,7 @@ T Dot(
const T* b);
template<typename T>
T ASum(
T Sum(
const int n,
const T* x);
......
......@@ -15,6 +15,7 @@
#ifdef WITH_SSE
#include <immintrin.h>
#include <tmmintrin.h>
#include <cstdint>
namespace dragon {
......
......@@ -250,8 +250,9 @@ void LoadCaffemodel(
void RunGraph(
const std::string& graph_name,
Workspace* ws) {
ws->RunGraph(graph_name, "", "");
Workspace* ws,
const int stream_id) {
ws->RunGraph(graph_name, "", "", stream_id);
}
template <typename T>
......
......@@ -38,8 +38,7 @@ class Device {
EXPORT const int device_id() const { return device_id_; }
private:
int device_type_;
int device_id_;
int device_type_, device_id_;
};
EXPORT Workspace* CreateWorkspace(const std::string& name);
......@@ -61,7 +60,8 @@ EXPORT std::string CreateGraph(
EXPORT void RunGraph(
const std::string& graph_name,
Workspace* ws);
Workspace* ws,
const int stream_id = 1);
EXPORT void CreateTensor(
const std::string& name,
......
......@@ -116,7 +116,7 @@ class NumpyFeeder : public TensorFeederBase {
#else
LOG(FATAL) << "CUDA was not compiled.";
#endif
} else{
} else {
CPUContext::Memcpy<CPUContext, CPUContext>(tensor->nbytes(),
tensor->raw_mutable_data<CPUContext>(),
static_cast<void*>(PyArray_DATA(array)));
......
......@@ -18,18 +18,22 @@
PyObject* CreateGradientDefsCC(PyObject* self, PyObject* args) {
PyObject* def_string = nullptr;
PyObject* py_g_outputs = nullptr;
if (!PyArg_ParseTuple(args, "SO!", &def_string, &PyList_Type, &py_g_outputs)) {
PyErr_SetString(PyExc_ValueError, "Excepted a serialized string of OperatorDef "
if (!PyArg_ParseTuple(args, "SO!",
&def_string, &PyList_Type, &py_g_outputs)) {
PyErr_SetString(PyExc_ValueError,
"Excepted a serialized string of OperatorDef "
"and a list containing outputs of this GradientOp.");
return nullptr;
}
OperatorDef def;
if (!def.ParseFromString(PyBytes_AsStringEx(def_string))) {
PyErr_SetString(PyExc_ValueError, "Failed to parse the OperatorDef.");
PyErr_SetString(PyExc_ValueError,
"Failed to parse the OperatorDef.");
return nullptr;
}
if (!GradientRegistry()->Has(def.type())) {
PyErr_SetString(PyExc_KeyError, "This Operator does not register GradientOp.");
PyErr_SetString(PyExc_KeyError,
"This Operator does not register GradientOp.");
return nullptr;
}
vector<string> g_outputs;
......@@ -61,7 +65,8 @@ PyObject* RunGradientFlowCC(PyObject* self, PyObject* args) {
PyObject* py_fp_ops, *py_targets;
PyObject* py_input_grads, *py_ignore_grads;
PyObject* py_share_grads, *py_export_graph;
if (!PyArg_ParseTuple(args, "OOOOOO", &py_fp_ops, &py_targets,
if (!PyArg_ParseTuple(args, "OOOOOO",
&py_fp_ops, &py_targets,
&py_input_grads, &py_ignore_grads,
&py_share_grads, &py_export_graph)) {
PyErr_SetString(PyExc_ValueError,
......@@ -84,8 +89,8 @@ PyObject* RunGradientFlowCC(PyObject* self, PyObject* args) {
for (auto& grad : input_grads) maker.AddExternalGrad(grad);
for (auto& grad : ignore_grads) maker.AddIgnoreGrad(grad);
maker.Make(fp_ops, targets, bp_ops);
bool share_grads = (bool)PyObject_IsTrue(py_share_grads);
bool export_graph = (bool)PyObject_IsTrue(py_export_graph);
bool share_grads = PyObject_IsTrue(py_share_grads) ? true : false;
bool export_graph = PyObject_IsTrue(py_export_graph) ? true : false;
if (share_grads) maker.Share("/share/buffer/grads", bp_ops);
if (export_graph) {
Tensor* t = ws()->CreateTensor("/export/dynamic_graph/gradient_flow");
......
......@@ -17,7 +17,8 @@
inline PyObject* SetLogLevelCC(PyObject* self, PyObject* args) {
char* cname;
if (!PyArg_ParseTuple(args, "s", &cname)) {
PyErr_SetString(PyExc_ValueError, "Excepted the logging level.");
PyErr_SetString(PyExc_ValueError,
"Excepted the logging level.");
return nullptr;
}
SetLogDestination(StrToLogSeverity(string(cname)));
......
......@@ -17,16 +17,19 @@
inline PyObject* CreateGraphCC(PyObject* self, PyObject* args) {
PyObject* graph_str;
if (!PyArg_ParseTuple(args, "S", &graph_str)) {
PyErr_SetString(PyExc_ValueError, "Excepted a serialized string of GraphDef.");
PyErr_SetString(PyExc_ValueError,
"Excepted a serialized string of GraphDef.");
return nullptr;
}
GraphDef graph_def;
if (!graph_def.ParseFromString(PyBytes_AsStringEx(graph_str))) {
PyErr_SetString(PyExc_RuntimeError, "Failed to parse the GraphDef.");
PyErr_SetString(PyExc_RuntimeError,
"Failed to parse the GraphDef.");
return nullptr;
}
if (!ws()->CreateGraph(graph_def)) {
PyErr_SetString(PyExc_RuntimeError, "Failed to create the Graph.");
PyErr_SetString(PyExc_RuntimeError,
"Failed to create the Graph.");
return nullptr;
}
Py_RETURN_TRUE;
......@@ -34,11 +37,17 @@ inline PyObject* CreateGraphCC(PyObject* self, PyObject* args) {
inline PyObject* RunGraphCC(PyObject* self, PyObject* args) {
char* cname, *include, *exclude;
if (!PyArg_ParseTuple(args, "sss", &cname, &include, &exclude)) {
PyErr_SetString(PyExc_ValueError, "Excepted the graph name, include and exclude rules.");
if (!PyArg_ParseTuple(args, "sss",
&cname, &include, &exclude)) {
PyErr_SetString(PyExc_ValueError,
"Excepted the graph name, include and exclude rules.");
return nullptr;
}
ws()->RunGraph(string(cname), string(include), string(exclude));
ws()->RunGraph(
string(cname),
string(include),
string(exclude)
);
Py_RETURN_TRUE;
}
......
......@@ -46,7 +46,8 @@ inline PyObject* MPICreateGroupCC(PyObject* self, PyObject* args) {
PyObject *incl, *excl, *ret;
int local_root, world_size;
if (!PyArg_ParseTuple(args, "iOO", &local_root, &incl, &excl)) {
PyErr_SetString(PyExc_ValueError, "Excepted the local root, include and exclued list.");
PyErr_SetString(PyExc_ValueError,
"Excepted the local root, include and exclued list.");
return nullptr;
}
MPI_Group world_group, local_group;
......
......@@ -37,12 +37,14 @@ inline PyObject* NoGradientOperatorsCC(PyObject* self, PyObject* args) {
inline PyObject* RunOperatorCC(PyObject* self, PyObject* args) {
PyObject* op_str;
if (!PyArg_ParseTuple(args, "S", &op_str)) {
PyErr_SetString(PyExc_ValueError, "Excepted a serialized string of OperatorDef.");
PyErr_SetString(PyExc_ValueError,
"Excepted a serialized string of OperatorDef.");
return nullptr;
}
OperatorDef op_def;
if (!op_def.ParseFromString(PyBytes_AsStringEx(op_str))) {
PyErr_SetString(PyExc_RuntimeError, "Failed to parse the OperatorDef.");
PyErr_SetString(PyExc_RuntimeError,
"Failed to parse the OperatorDef.");
return nullptr;
}
ws()->RunOperator(op_def);
......@@ -52,7 +54,8 @@ inline PyObject* RunOperatorCC(PyObject* self, PyObject* args) {
inline PyObject* RunOperatorsCC(PyObject* self, PyObject* args) {
PyObject* py_ops;
if (!PyArg_ParseTuple(args, "O", &py_ops)) {
PyErr_SetString(PyExc_ValueError, "Excepted a list of serialized string of OperatorDef.");
PyErr_SetString(PyExc_ValueError,
"Excepted a list of serialized string of OperatorDef.");
return nullptr;
}
OperatorDef op_def;
......@@ -67,12 +70,14 @@ inline PyObject* RunOperatorsCC(PyObject* self, PyObject* args) {
inline PyObject* CreatePersistentOpCC(PyObject* self, PyObject* args) {
PyObject* op_str;
if (!PyArg_ParseTuple(args, "S", &op_str)) {
PyErr_SetString(PyExc_ValueError, "Excepted a serialized string of OperatorDef.");
PyErr_SetString(PyExc_ValueError,
"Excepted a serialized string of OperatorDef.");
return nullptr;
}
OperatorDef op_def;
if (!op_def.ParseFromString(PyBytes_AsStringEx(op_str))) {
PyErr_SetString(PyExc_RuntimeError, "Failed to parse the OperatorDef.");
PyErr_SetString(PyExc_RuntimeError,
"Failed to parse the OperatorDef.");
return nullptr;
}
ws()->CreatePersistentOp(op_def);
......@@ -82,8 +87,10 @@ inline PyObject* CreatePersistentOpCC(PyObject* self, PyObject* args) {
inline PyObject* RunPersistentOpCC(PyObject* self, PyObject* args) {
char* key, *anchor;
PyObject* py_inputs, *py_outputs;
if (!PyArg_ParseTuple(args, "ssOO", &key, &anchor, &py_inputs, &py_outputs)) {
PyErr_SetString(PyExc_ValueError, "Excepted a persistent key, anchor, "
if (!PyArg_ParseTuple(args, "ssOO",
&key, &anchor, &py_inputs, &py_outputs)) {
PyErr_SetString(PyExc_ValueError,
"Excepted a persistent key, anchor, "
"list of inputs and outputs.");
return nullptr;
}
......
......@@ -39,12 +39,14 @@ inline PyObject* CreateTensorCC(PyObject* self, PyObject* args) {
inline PyObject* CreateFillerCC(PyObject* self, PyObject* args) {
PyObject* filler_string;
if (!PyArg_ParseTuple(args, "S", &filler_string)) {
PyErr_SetString(PyExc_ValueError, "Excepted a serialized string of TensorFiller.");
PyErr_SetString(PyExc_ValueError,
"Excepted a serialized string of TensorFiller.");
return nullptr;
}
TensorFiller filler_def;
if (!filler_def.ParseFromString(PyBytes_AsStringEx(filler_string))) {
PyErr_SetString(PyExc_RuntimeError, "Failed to parse the TensorFiller.");
PyErr_SetString(PyExc_RuntimeError,
"Failed to parse the TensorFiller.");
return nullptr;
}
ws()->CreateFiller(filler_def);
......@@ -60,7 +62,8 @@ inline PyObject* GetFillerTypeCC(PyObject* self, PyObject* args) {
inline PyObject* RenameTensorCC(PyObject* self, PyObject* args) {
char* ori_name, *tar_name;
if (!PyArg_ParseTuple(args, "ss", &ori_name, &tar_name)) {
PyErr_SetString(PyExc_ValueError, "Excepted the original and target name.");
PyErr_SetString(PyExc_ValueError,
"Excepted the original and target name.");
return nullptr;
}
if (!ws()->HasTensor(tar_name)) {
......@@ -77,7 +80,8 @@ PyObject* TensorFromShapeCC(PyObject* self, PyObject* args) {
char* cname, *dtype;
PyObject* shape, *device_option = nullptr;
if (!PyArg_ParseTuple(args, "sOs|O", &cname, &shape, &dtype, &device_option)) {
PyErr_SetString(PyExc_ValueError, "Excepted the name, shape, dtype and optional device option.");
PyErr_SetString(PyExc_ValueError,
"Excepted the name, shape, dtype and optional device option.");
return nullptr;
}
const TypeMeta& meta = TypeStringToMeta(dtype);
......@@ -119,7 +123,8 @@ PyObject* TensorFromPyArrayCC(PyObject* self, PyObject* args) {
char* cname;
PyArrayObject* original_array = nullptr;
if (!PyArg_ParseTuple(args, "sO", &cname, &original_array)) {
PyErr_SetString(PyExc_ValueError, "Failed to create tensor from numpy.ndarray.\n"
PyErr_SetString(PyExc_ValueError,
"Failed to create tensor from numpy.ndarray.\n"
"Excepted the name and numpy.ndarray both.");
return nullptr;
}
......@@ -214,7 +219,8 @@ inline PyObject* TensorToPyArrayCC(PyObject* self, PyObject* args) {
return nullptr;
}
auto* data = tensor->raw_mutable_data<CPUContext>();
PyObject* array = PyArray_SimpleNewFromData(tensor->ndim(), dims.data(), npy_type, data);
PyObject* array = PyArray_SimpleNewFromData(
(int)tensor->ndim(), dims.data(), npy_type, data);
Py_XINCREF(array);
return array;
}
......
......@@ -30,6 +30,8 @@ class BlobFetcher(Process):
----------
batch_size : int
The size of a training batch.
dtype : str
The data type of batch. Default is ``float32``.
partition : boolean
Whether to partition batch. Default is ``False``.
prefetch : int
......@@ -42,6 +44,7 @@ class BlobFetcher(Process):
"""
super(BlobFetcher, self).__init__()
self._batch_size = kwargs.get('batch_size', 100)
self._dtype = kwargs.get('dtype', 'float32')
self._partition = kwargs.get('partition', False)
self._mean_values = kwargs.get('mean_values', [])
self._scale = kwargs.get('scale', 1.0)
......@@ -68,7 +71,7 @@ class BlobFetcher(Process):
if ix != self._batch_size - 1: im, labels = self.Q_in.get()
# mean subtraction & numerical scale
im_blob = im_blob.astype(np.float32)
im_blob = im_blob.astype(self._dtype)
if len(self._mean_values) > 0:
im_blob -= self._mean_values
if self._scale != 1.0:
......
......@@ -70,6 +70,8 @@ class DataBatch(object):
The phase of this operator, ``TRAIN`` or ``TEST``. Default is ``TRAIN``.
batch_size : int
The size of a training batch.
dtype : str
The data type of batch. Default is ``float32``.
partition : boolean
Whether to partition batch. Default is ``False``.
prefetch : int
......
......@@ -49,16 +49,14 @@ class DataReader(Process):
self._source = kwargs.get('source', '')
self._multiple_nodes = kwargs.get('multiple_nodes', False)
self._use_shuffle = kwargs.get('shuffle', False)
self._use_instance_chunk = kwargs.get('instance_chunk', False)
self._num_chunks = kwargs.get('num_chunks', 2048)
self._chunk_size = kwargs.get('chunk_size', -1)
self._num_parts = 1
self._part_idx = 0
self._part_idx, self._num_parts = 0, 1
self._cur_idx, self._cur_chunk_idx = 0, 0
self._random_seed = config.GetRandomSeed()
self._cur_idx = 0
self._cur_chunk_idx = 0
self.Q_out = None
self.daemon = True
......@@ -167,12 +165,13 @@ class DataReader(Process):
self._db.open(self._source)
self._zfill = self._db.zfill()
self._num_entries = self._db.num_entries()
self._epoch_size = int(self._num_entries / self._num_parts + 1)
self._epoch_size = int(self._num_entries / self._num_parts + 1)
if self._use_shuffle:
if self._chunk_size == 1:
# each chunk has at most 1 record [For Fully Shuffle]
self._num_shuffle_parts = int(self._num_entries / self._chunk_size / self._num_parts) + 1
self._chunk_size, self._num_shuffle_parts = \
1, int(self._num_entries / self._num_parts) + 1
else:
if self._use_shuffle and self._chunk_size == -1:
# search an optimal chunk size [For Chunk Shuffle]
......@@ -183,6 +182,11 @@ class DataReader(Process):
self._num_shuffle_parts = int(math.ceil(self._db._total_size * 1.1 /
(self._num_parts * self._chunk_size << 20)))
self._chunk_size = int(self._num_entries / self._num_shuffle_parts / self._num_parts + 1)
limit = (self._num_parts - 0.5) * self._num_shuffle_parts * self._chunk_size
if self._num_entries <= limit:
# roll back to fully shuffle
self._chunk_size, self._num_shuffle_parts = \
1, int(self._num_entries / self._num_parts) + 1
else:
# each chunk has at most K records [For Multiple Nodes]
# note that if ``shuffle`` and ``multiple_nodes`` are all ``False``,
......
......@@ -14,7 +14,7 @@ from __future__ import division
from __future__ import print_function
version = '0.2.2'
full_version = '0.2.2.10'
full_version = '0.2.2.11'
release = False
if not release:
......
......@@ -364,7 +364,7 @@ class BatchNormLayer(Layer):
var = Tensor(scope + '/param:1').Constant(value=0.0)
factor = Tensor(scope + '/param:2').Constant(value=0.0)
# in dragon, setting diff to None will skip computing its grad automatically
# but in bvlc-caffe1, you must set lr_mult = 0 manually
# but in bvlc-caffe, you must set lr_mult = 0 manually
self._blobs.append({'data': mean, 'diff': None})
self._blobs.append({'data': var, 'diff': None})
self._blobs.append({'data': factor, 'diff': None})
......
......@@ -20,7 +20,7 @@ from .arithmetic import (
from .ndarray import (
squeeze, unsqueeze,
sum, mean, argmin, argmax, max, topk,
sum, mean, argmin, argmax, max, min, topk,
cat, gather,
)
......
......@@ -13,7 +13,6 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.torch.tensor import Tensor
from dragon.vm.torch.ops.primitive import MakeContext, WrapScalar
from dragon.vm.torch.ops.factory import get_module
......@@ -26,7 +25,6 @@ def _fundamental(input, value, op='Add', out=None):
raise TypeError('Type of value should be numerical, got {}.'
.format(type(value)))
value = WrapScalar(value, input._dtype, input._ctx)
ctx = MakeContext(inputs=[input, value])
key = 'torch/ops/{}/{}:{}'.format(op.lower(), ctx[0].lower(), ctx[1])
module = get_module(Fundamental, key, ctx, op_type=op)
......
......@@ -13,7 +13,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.torch.utils.data.io.data_reader import DataReader
from dragon.io.data_reader import DataReader
from dragon.vm.torch.utils.data.io.data_transformer import DataTransformer
......
......@@ -19,7 +19,7 @@ from multiprocessing import Queue
import dragon.core.mpi as mpi
from .data_reader import DataReader
from dragon.io.data_reader import DataReader
from .data_transformer import DataTransformer
from .blob_fetcher import BlobFetcher
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import numpy as np
import numpy.random as npr
from multiprocessing import Process
import dragon.config as config
from dragon.tools.db import LMDB
class DataReader(Process):
"""DataReader is deployed to queue encoded str from `LMDB`_.
It is supported to adaptively partition and shuffle records over all distributed nodes.
"""
def __init__(self, **kwargs):
"""Construct a ``DataReader``.
Parameters
----------
source : str
The path of database.
multiple_nodes: boolean
Whether to split data for multiple parallel nodes. Default is ``False``.
shuffle : boolean
Whether to shuffle the data. Default is ``False``.
num_chunks : int
The number of chunks to split. Default is ``2048``.
chunk_size : int
The size (MB) of each chunk. Default is ``-1`` (refer to ``num_chunks``).
"""
super(DataReader, self).__init__()
self._source = kwargs.get('source', '')
self._multiple_nodes = kwargs.get('multiple_nodes', False)
self._use_shuffle = kwargs.get('shuffle', False)
self._num_chunks = kwargs.get('num_chunks', 2048)
self._chunk_size = kwargs.get('chunk_size', -1)
self._num_parts = 1
self._part_idx = 0
self._random_seed = config.GetRandomSeed()
self._cur_idx = 0
self._cur_chunk_idx = 0
self.Q_out = None
self.daemon = True
def element(self):
"""Get the value of current record.
Returns
-------
str
The encoded str.
"""
return self._db.value()
def redirect(self, target_idx):
"""Redirect to the target position.
Parameters
----------
target_idx : int
The key of instance in ``LMDB``.
Returns
-------
None
Notes
-----
The redirection reopens the ``LMDB``.
You can drop caches by ``echo 3 > /proc/sys/vm/drop_caches``.
This helps avoid getting stuck when ``Database Size`` >> ``RAM Size``.
"""
self._db.close()
self._db.open(self._source)
self._cur_idx = target_idx
self._db.set(str(self._cur_idx).zfill(self._zfill))
def reset(self):
"""Reset the cursor and environment.
Returns
-------
None
"""
if self._multiple_nodes or self._use_shuffle:
if self._use_shuffle: self._perm = npr.permutation(self._num_shuffle_parts)
self._cur_chunk_idx = 0
self._start_idx = int(self._part_idx * self._num_shuffle_parts + self._perm[self._cur_chunk_idx])
self._start_idx = int(self._start_idx * self._chunk_size)
if self._start_idx >= self._num_entries: self.next_chunk()
self._end_idx = self._start_idx + self._chunk_size
self._end_idx = min(self._num_entries, self._end_idx)
else:
self._start_idx = 0
self._end_idx = self._num_entries
self.redirect(self._start_idx)
def next_record(self):
"""Step the cursor of records.
Returns
-------
None
"""
self._cur_idx += 1
self._db.next()
def next_chunk(self):
"""Step the cursor of shuffling chunks.
Returns
-------
None
"""
self._cur_chunk_idx += 1
if self._cur_chunk_idx >= self._num_shuffle_parts: self.reset()
else:
self._start_idx = self._part_idx * self._num_shuffle_parts + self._perm[self._cur_chunk_idx]
self._start_idx = self._start_idx * self._chunk_size
if self._start_idx >= self._num_entries: self.next_chunk()
else:
self._end_idx = self._start_idx + self._chunk_size
self._end_idx = min(self._num_entries, self._end_idx)
self.redirect(self._start_idx)
def run(self):
"""Start the process.
Returns
-------
None
"""
# fix seed
npr.seed(self._random_seed)
# init db
self._db = LMDB()
self._db.open(self._source)
self._zfill = self._db.zfill()
self._num_entries = self._db.num_entries()
self._epoch_size = int(self._num_entries / self._num_parts + 1)
if self._use_shuffle:
if self._chunk_size == 1:
# each chunk has at most 1 record [For Fully Shuffle]
self._num_shuffle_parts = int(self._num_entries / self._chunk_size / self._num_parts) + 1
else:
if self._use_shuffle and self._chunk_size == -1:
# search an optimal chunk size [For Chunk Shuffle]
max_chunk_size = self._db._total_size / (self._num_chunks * (1 << 20))
min_chunk_size = 1
while min_chunk_size * 2 < max_chunk_size: min_chunk_size *= 2
self._chunk_size = min_chunk_size
self._num_shuffle_parts = int(math.ceil(self._db._total_size * 1.1 /
(self._num_parts * self._chunk_size << 20)))
self._chunk_size = int(self._num_entries / self._num_shuffle_parts / self._num_parts + 1)
else:
# each chunk has at most K records [For Multiple Nodes]
# note that if ``shuffle`` and ``multiple_nodes`` are all ``False``,
# ``chunk_size`` and ``num_shuffle_parts`` are meaningless
self._chunk_size = int(self._num_entries / self._num_parts) + 1
self._num_shuffle_parts = 1
self._perm = np.arange(self._num_shuffle_parts)
# init env
self.reset()
# run
while True:
self.Q_out.put(self.element())
self.next_record()
if self._cur_idx >= self._end_idx:
if self._multiple_nodes or \
self._use_shuffle: self.next_chunk()
else: self.reset()
\ No newline at end of file
......@@ -42,7 +42,7 @@ find_modules()
setup(name = 'dragon',
version='0.2.2.10',
version='0.2.2.11',
description = 'Dragon: A Computation Graph Virtual Machine Based Deep Learning Framework',
url='https://github.com/seetaresearch/Dragon',
author='Ting Pan',
......
......@@ -19,7 +19,8 @@ template <> void GenerateProposals<float, CPUContext>(
const float* scores,
const float* bbox_deltas,
const float* anchors,
float* proposals) {
float* proposals,
CPUContext* ctx) {
float* proposal = proposals;
const int K = feat_h * feat_w;
for (int h = 0; h < feat_h; ++h) {
......@@ -57,7 +58,8 @@ template <> void GenerateProposals_v2<float, CPUContext>(
const float min_box_w,
const float* scores,
const float* bbox_deltas,
float* proposals) {
float* proposals,
CPUContext* ctx) {
float* proposal = proposals;
for (int i = 0; i < total_anchors; ++i) {
// bbox_deltas: [1, 4, total_anchors]
......@@ -98,7 +100,8 @@ template <> void ApplyNMS<float, CPUContext>(
const float thresh,
const float* boxes,
int* keep_indices,
int& num_keep) {
int& num_keep,
CPUContext* ctx) {
int count = 0;
std::vector<char> is_dead(num_boxes);
for (int i = 0; i < num_boxes; ++i) is_dead[i] = 0;
......
......@@ -62,7 +62,7 @@ __global__ void _GenerateProposals(
const T* bbox_deltas,
const T* anchors,
T* proposals) {
CUDA_KERNEL_LOOP(idx, nthreads) {
CUDA_1D_KERNEL_LOOP(idx, nthreads) {
const int h = idx / A / feat_w;
const int w = (idx / A) % feat_w;
const int a = idx % A;
......@@ -99,11 +99,13 @@ template <> void GenerateProposals<float, CUDAContext>(
const float* scores,
const float* bbox_deltas,
const float* anchors,
float* proposals) {
float* proposals,
CUDAContext* ctx) {
const int num_proposals = A * feat_h * feat_w;
_GenerateProposals<float>
<< <CUDA_BLOCKS(num_proposals), CUDA_THREADS >> >(
num_proposals, A, feat_h, feat_w, stride,
<< < CUDA_BLOCKS(num_proposals), CUDA_THREADS,
0, ctx->cuda_stream() >> >(num_proposals,
A, feat_h, feat_w, stride,
im_h, im_w, min_box_h, min_box_w,
scores, bbox_deltas, anchors, proposals);
}
......@@ -118,7 +120,7 @@ __global__ void _GenerateProposals_v2(
const T* scores,
const T* bbox_deltas,
T* proposals) {
CUDA_KERNEL_LOOP(idx, nthreads) {
CUDA_1D_KERNEL_LOOP(idx, nthreads) {
const float dx = bbox_deltas[idx];
const float dy = bbox_deltas[nthreads + idx];
const float d_log_w = bbox_deltas[2 * nthreads + idx];
......@@ -139,10 +141,12 @@ template <> void GenerateProposals_v2<float, CUDAContext>(
const float min_box_w,
const float* scores,
const float* bbox_deltas,
float* proposals) {
float* proposals,
CUDAContext* ctx) {
_GenerateProposals_v2<float>
<< <CUDA_BLOCKS(total_anchors), CUDA_THREADS >> >(
total_anchors, im_h, im_w, min_box_h, min_box_w,
<< < CUDA_BLOCKS(total_anchors), CUDA_THREADS,
0, ctx->cuda_stream() >> >(total_anchors,
im_h, im_w, min_box_h, min_box_w,
scores, bbox_deltas, proposals);
}
......@@ -170,7 +174,7 @@ __global__ void nms_mask(
const int num_boxes,
const T nms_thresh,
const T* boxes,
unsigned long long* mask) {
uint64_t* mask) {
const int i_start = blockIdx.x * NMS_BLOCK_SIZE;
const int di_end = min(num_boxes - i_start, NMS_BLOCK_SIZE);
const int j_start = blockIdx.y * NMS_BLOCK_SIZE;
......@@ -209,25 +213,30 @@ void _ApplyNMS(
const float thresh,
const T* boxes,
int* keep_indices,
int& num_keep) {
int& num_keep,
CUDAContext* ctx) {
const int num_blocks = DIV_UP(num_boxes, NMS_BLOCK_SIZE);
const dim3 blocks(num_blocks, num_blocks);
size_t mask_nbytes = num_boxes * num_blocks * sizeof(unsigned long long);
size_t mask_nbytes = num_boxes * num_blocks * sizeof(uint64_t);
size_t boxes_nbytes = num_boxes * 5 * sizeof(T);
void* boxes_dev, *mask_dev;
CUDA_CHECK(cudaMalloc(&boxes_dev, boxes_nbytes));
CUDA_CHECK(cudaMalloc(&mask_dev, mask_nbytes));
CUDA_CHECK(cudaMemcpy(boxes_dev, boxes, boxes_nbytes, cudaMemcpyHostToDevice));
nms_mask<T> << <blocks, NMS_BLOCK_SIZE >> > (
num_boxes, thresh, (T*)boxes_dev, (unsigned long long*)mask_dev);
CUDA_CHECK(cudaMemcpy(boxes_dev, boxes,
boxes_nbytes, cudaMemcpyHostToDevice));
nms_mask<T>
<< < blocks, NMS_BLOCK_SIZE,
0, ctx->cuda_stream() >> > (num_boxes,
thresh, (T*)boxes_dev, (uint64_t*)mask_dev);
CUDA_CHECK(cudaPeekAtLastError());
std::vector<unsigned long long> mask_host(num_boxes * num_blocks);
CUDA_CHECK(cudaMemcpy(&mask_host[0], mask_dev, mask_nbytes, cudaMemcpyDeviceToHost));
std::vector<uint64_t> mask_host(num_boxes * num_blocks);
CUDA_CHECK(cudaMemcpy(&mask_host[0], mask_dev,
mask_nbytes, cudaMemcpyDeviceToHost));
std::vector<unsigned long long> dead_bit(num_blocks);
memset(&dead_bit[0], 0, sizeof(unsigned long long) * num_blocks);
std::vector<uint64_t> dead_bit(num_blocks);
memset(&dead_bit[0], 0, sizeof(uint64_t) * num_blocks);
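// mask layout: mask_host[i * num_blocks + j] holds one bit per box of
// block j; bit k set means box i overlaps box (j * NMS_BLOCK_SIZE + k)
// beyond thresh. dead_bit accumulates the rows of kept boxes, so a set
// bit marks a box as suppressed before it is visited.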
int num_selected = 0;
for (int i = 0; i < num_boxes; ++i) {
......@@ -235,7 +244,7 @@ void _ApplyNMS(
const int inblock = i % NMS_BLOCK_SIZE;
if (!(dead_bit[nblock] & (1ULL << inblock))) {
keep_indices[num_selected++] = i;
unsigned long long* mask_i = &mask_host[0] + i * num_blocks;
uint64_t* mask_i = &mask_host[0] + i * num_blocks;
for (int j = nblock; j < num_blocks; ++j) dead_bit[j] |= mask_i[j];
if (num_selected == max_keeps) break;
}
......@@ -251,9 +260,10 @@ template <> void ApplyNMS<float, CUDAContext>(
const float thresh,
const float* boxes,
int* keep_indices,
int& num_keep) {
int& num_keep,
CUDAContext* ctx) {
_ApplyNMS<float>(num_boxes, max_keeps, thresh,
boxes, keep_indices, num_keep);
boxes, keep_indices, num_keep, ctx);
}
} // namespace rcnn
......
......@@ -126,7 +126,8 @@ void GenerateProposals(
const T* scores,
const T* bbox_deltas,
const T* anchors,
T* proposals);
T* proposals,
Context* ctx);
template <typename T, class Context>
void GenerateProposals_v2(
......@@ -137,7 +138,8 @@ void GenerateProposals_v2(
const float min_box_w,
const T* scores,
const T* bbox_deltas,
T* proposals);
T* proposals,
Context* ctx);
template <typename T>
inline void SortProposals(
......@@ -246,7 +248,8 @@ void ApplyNMS(
const T thresh,
const T* boxes,
int* keep_indices,
int& num_keep);
int& num_keep,
Context* ctx);
} // namespace rcnn
......
......@@ -37,7 +37,7 @@ void ProposalOp<Context>::RunWithType() {
Input(0).template data<T, Context>(),
Input(1).template data<T, Context>(),
anchors_.template mutable_data<T, Context>(),
proposals_.template mutable_data<T, Context>());
proposals_.template mutable_data<T, Context>(), ctx());
rcnn::SortProposals(0, num_proposals - 1, pre_nms_top_n,
proposals_.template mutable_data<T, CPUContext>());
......@@ -45,7 +45,8 @@ void ProposalOp<Context>::RunWithType() {
rcnn::ApplyNMS<T, Context>(
pre_nms_topn, post_nms_top_n, nms_thresh,
proposals_.template mutable_data<T, Context>(),
roi_indices_.template mutable_data<int, CPUContext>(), num_rois);
roi_indices_.template mutable_data<int, CPUContext>(),
num_rois, ctx());
rcnn::RetrieveRoIs<T>(num_rois, n,
proposals_.template mutable_data<T, CPUContext>(),
......@@ -95,14 +96,15 @@ void ProposalOp<Context>::RunWithType() {
im_height, im_width, min_box_h, min_box_w,
Input(-3).template data<T, Context>(),
Input(-2).template data<T, Context>(),
proposals_.template mutable_data<T, Context>());
proposals_.template mutable_data<T, Context>(), ctx());
rcnn::SortProposals(0, total_proposals - 1, pre_nms_top_n,
proposals_.template mutable_data<T, CPUContext>());
rcnn::ApplyNMS<T, Context>(pre_nms_topn, post_nms_top_n, nms_thresh,
proposals_.template mutable_data<T, Context>(),
roi_indices_.template mutable_data<int, CPUContext>(), num_rois);
roi_indices_.template mutable_data<int, CPUContext>(),
num_rois, ctx());
rcnn::RetrieveRoIs<T>(num_rois, n,
proposals_.template mutable_data<T, CPUContext>(),
......@@ -128,7 +130,7 @@ void ProposalOp<Context>::RunWithType() {
collective_rois.ReshapeLike(*Output(0));
auto* rois = collective_rois.template mutable_data<T, CPUContext>();
CPUContext::template Copy<T, CPUContext, CPUContext>(
ctx()->template Copy<T, CPUContext, CPUContext>(
collective_rois.count(), rois,
Output(0)->template data<T, CPUContext>());
......@@ -147,6 +149,8 @@ void ProposalOp<Context>::RunWithType() {
template <class Context>
void ProposalOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
num_images = Input(0).dim(0);
CHECK_EQ(Input(-1).dim(0), num_images)
<< "\nExcepted " << num_images << " groups image info, "
......
......@@ -455,7 +455,10 @@ Graph::Graph(const GraphDef& meta_graph, Workspace* ws)
RecomputingAware(optimized_graph, ws);
}
bool Graph::Run(const string& include, const string& exclude) {
bool Graph::Run(
const string& include,
const string& exclude,
const int stream_id) {
LOG(DEBUG) << "Run Graph: " << name();
for (auto op : ops_) {
if (!include.empty())
......@@ -464,7 +467,7 @@ bool Graph::Run(const string& include, const string& exclude) {
if (op->type().find(exclude) != string::npos) continue;
op->SwitchToPhase(this->args_["phase"].s());
LOG(DEBUG) << "$ Before Operator: " << op->name();
op->Run();
op->Run(stream_id);
LOG(DEBUG) << "$ After Operator: " << op->name();
}
return true;
......
......@@ -8,7 +8,6 @@ void MixedMemory::ToCPU() {
switch (state_) {
case UNINITIALIZED:
cpu_ptr_ = CPUContext::New(nbytes_);
CPUContext::Memset(nbytes_, cpu_ptr_);
state_ = STATE_AT_CPU;
break;
case STATE_AT_CUDA:
......@@ -32,7 +31,6 @@ void MixedMemory::ToCUDA() {
switch (state_) {
case UNINITIALIZED:
cuda_ptr_ = CUDAContext::New(nbytes_);
CUDAContext::Memset(nbytes_, cuda_ptr_);
state_ = STATE_AT_CUDA;
break;
case STATE_AT_CPU:
......
......@@ -15,33 +15,35 @@ void CuDNNDropoutOp<Context>::RunWithType() {
float scale = use_scale ? 1.0 / (1.0 - prob()) : 1.0;
if (phase() == "TEST") {
if (Output(0) != &Input(0)) {
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), Ydata, Xdata);
if (scale == 1.0)
math::Scal<T, Context>(Output(0)->count(),
1.0 - prob(), Ydata, &ctx());
1.0 - prob(), Ydata, ctx());
}
} else if (phase() == "TRAIN") {
CHECK(use_scale) << "\nCuDNN only supports scale-dropout";
Tensor* mask = ws()->CreateTensor("/mnt/" + anchor() + "/dropout/mask");
Tensor* mask = ws()->CreateTensor(
"/mnt/" + anchor() + "/dropout/mask");
// determine the dropout states
if (!states_initialized) {
states_initialized = true;
CUDNN_CHECK(cudnnDropoutGetStatesSize(
ctx().cudnn_handle(), &states_size));
ctx()->cudnn_handle(), &states_size));
std::lock_guard<std::mutex> lk(CUDAContext::mutex());
Tensor* states = ws()->CreateTensor("/share/cudnn/dropout:" +
dragon_cast<string, unsigned long long>(random_seed) + "/states");
Tensor* states = ws()->CreateTensor(
"/share/cudnn/dropout:" + dragon_cast<string,
unsigned long long>(random_seed) + "/states");
if (states->count() > 0) {
auto* Sdata = states->template mutable_data<uint8_t, Context>();
CUDNN_CHECK(cudnnRestoreDropoutDescriptor(
dropout_desc, ctx().cudnn_handle(), prob(),
dropout_desc, ctx()->cudnn_handle(), prob(),
Sdata, states_size, random_seed));
} else {
states->Reshape({ (TIndex)states_size });
auto* Sdata = states->template mutable_data<uint8_t, Context>();
CUDNN_CHECK(cudnnSetDropoutDescriptor(
dropout_desc, ctx().cudnn_handle(), prob(),
dropout_desc, ctx()->cudnn_handle(), prob(),
Sdata, states_size, random_seed));
}
}
......@@ -53,7 +55,7 @@ void CuDNNDropoutOp<Context>::RunWithType() {
mask->Reshape({ (TIndex)reserve_space_size });
auto* Rdata = mask->template mutable_data<uint8_t, Context>();
CUDNN_CHECK(cudnnDropoutForward(
ctx().cudnn_handle(), dropout_desc,
ctx()->cudnn_handle(), dropout_desc,
input_desc, Xdata,
input_desc, Ydata,
Rdata, reserve_space_size));
......@@ -65,7 +67,9 @@ void CuDNNDropoutOp<Context>::RunOnDevice() {
Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (XIsType(Input(0), float16)) RunWithType<float16>();
#endif
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
}
......@@ -76,19 +80,21 @@ void CuDNNDropoutGradientOp<Context>::RunWithType() {
if (phase() == "TEST") { NOT_IMPLEMENTED; }
else if (phase() == "TRAIN") {
CHECK(use_scale) << "\nCuDNN only supports scale-dropout";
Tensor* mask = ws()->GetTensor("/mnt/" + anchor() + "/dropout/mask");
Tensor* mask = ws()->GetTensor(
"/mnt/" + anchor() + "/dropout/mask");
// determine the dropout states
if (!states_initialized) {
states_initialized = true;
CUDNN_CHECK(cudnnDropoutGetStatesSize(
ctx().cudnn_handle(), &states_size));
ctx()->cudnn_handle(), &states_size));
std::lock_guard<std::mutex> lk(CUDAContext::mutex());
Tensor* states = ws()->CreateTensor("/share/cudnn/dropout:" +
dragon_cast<string, unsigned long long>(random_seed) + "/states");
Tensor* states = ws()->CreateTensor(
"/share/cudnn/dropout:" + dragon_cast<string,
unsigned long long>(random_seed) + "/states");
if (states->count() > 0) {
auto* Sdata = states->template mutable_data<uint8_t, Context>();
CUDNN_CHECK(cudnnRestoreDropoutDescriptor(
dropout_desc, ctx().cudnn_handle(), prob(),
dropout_desc, ctx()->cudnn_handle(), prob(),
Sdata, states_size, random_seed));
} else { LOG(FATAL) << "Missing states with seed: " << random_seed; }
}
......@@ -101,7 +107,7 @@ void CuDNNDropoutGradientOp<Context>::RunWithType() {
input_desc, &reserve_space_size));
auto* Rdata = mask->template mutable_data<uint8_t, Context>();
CUDNN_CHECK(cudnnDropoutBackward(
ctx().cudnn_handle(), dropout_desc,
ctx()->cudnn_handle(), dropout_desc,
input_desc, dYdata,
input_desc, dXdata,
Rdata, reserve_space_size));
......@@ -113,7 +119,9 @@ void CuDNNDropoutGradientOp<Context>::RunOnDevice() {
Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (XIsType(Input(0), float16)) RunWithType<float16>();
#endif
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
}
......
......@@ -14,7 +14,7 @@ void CuDNNEluOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnActivationForward(
ctx().cudnn_handle(), act_desc,
ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::zero, output_desc, Ydata));
}
......@@ -41,7 +41,7 @@ void CuDNNEluGradientOp<Context>::RunWithType() {
auto* dXdata = Output(0)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnActivationBackward(
ctx().cudnn_handle(), act_desc,
ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Ydata,
input_desc, dYdata, output_desc, Ydata,
CUDNNType<T>::zero, output_desc, dXdata));
......
......@@ -13,7 +13,7 @@ void CuDNNReluOp<Context>::RunWithType() {
#if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationForward(
ctx().cudnn_handle(), act_desc,
ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::zero, output_desc, Ydata));
#else
......@@ -49,7 +49,7 @@ void CuDNNReluGradientOp<Context>::RunWithType() {
#if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationBackward(
ctx().cudnn_handle(), act_desc,
ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Ydata,
input_desc, dYdata, output_desc, Ydata,
CUDNNType<T>::zero, output_desc, dXdata));
......
......@@ -13,12 +13,12 @@ void CuDNNSigmoidOp<Context>::RunWithType() {
#if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationForward(
ctx().cudnn_handle(), act_desc,
ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::zero, output_desc, Ydata));
#else
CUDNN_CHECK(cudnnActivationForward_v4(
ctx().cudnn_handle(), act_desc,
ctx()->cudnn_handle(), act_desc,
CUDNNType<Dtype>::one, input_desc, Xdata,
CUDNNType<Dtype>::zero, output_desc, Ydata));
#endif
......@@ -47,13 +47,13 @@ void CuDNNSigmoidGradientOp<Context>::RunWithType() {
#if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationBackward(
ctx().cudnn_handle(), act_desc,
ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Ydata,
input_desc, dYdata, output_desc, Ydata,
CUDNNType<T>::zero, output_desc, dXdata));
#else
CUDNN_CHECK(cudnnActivationBackward_v4(
ctx().cudnn_handle(), act_desc,
ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Ydata,
input_desc, dYdata, output_desc, Ydata,
CUDNNType<T>::zero, output_desc, dXdata));
......
......@@ -7,8 +7,7 @@ namespace dragon {
template <class Context> template <typename T>
void CuDNNSoftmaxOp<Context>::RunWithType() {
Tensor fake_tensor(vector<TIndex>(
{ outer_dim, Input(0).dim(axis), inner_dim })
);
{ outer_dim, Input(0).dim(axis), inner_dim }));
cudnnSetTensorDesc<T>(&input_desc, &fake_tensor);
cudnnSetTensorDesc<T>(&output_desc, &fake_tensor);
......@@ -16,7 +15,7 @@ void CuDNNSoftmaxOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnSoftmaxForward(
ctx().cudnn_handle(),
ctx()->cudnn_handle(),
CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL,
CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::zero, output_desc, Ydata));
......@@ -41,8 +40,7 @@ DEPLOY_CUDNN(Softmax);
template <class Context> template <typename T>
void CuDNNSoftmaxGradientOp<Context>::RunWithType() {
Tensor fake_tensor(vector<TIndex>(
{ outer_dim, Input(0).dim(axis), inner_dim })
);
{ outer_dim, Input(0).dim(axis), inner_dim }));
cudnnSetTensorDesc<T>(&input_desc, &fake_tensor);
cudnnSetTensorDesc<T>(&output_desc, &fake_tensor);
......@@ -50,7 +48,7 @@ void CuDNNSoftmaxGradientOp<Context>::RunWithType() {
auto* Ydata = Input(0).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnSoftmaxBackward(
ctx().cudnn_handle(),
ctx()->cudnn_handle(),
CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL,
CUDNNType<T>::one, input_desc, Ydata, input_desc, dYdata,
CUDNNType<T>::zero, output_desc, dXdata));
......
......@@ -13,12 +13,12 @@ void CuDNNTanhOp<Context>::RunWithType() {
#if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationForward(
ctx().cudnn_handle(), act_desc,
ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::zero, output_desc, Ydata));
#else
CUDNN_CHECK(cudnnActivationForward_v4(
ctx().cudnn_handle(), act_desc,
ctx()->cudnn_handle(), act_desc,
CUDNNType<Dtype>::one, input_desc, Xdata,
CUDNNType<Dtype>::zero, output_desc, Ydata));
#endif
......@@ -47,13 +47,13 @@ void CuDNNTanhGradientOp<Context>::RunWithType() {
#if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationBackward(
ctx().cudnn_handle(), act_desc,
ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Ydata,
input_desc, dYdata, output_desc, Ydata,
CUDNNType<T>::zero, output_desc, dXdata));
#else
CUDNN_CHECK(cudnnActivationBackward_v4(
ctx().cudnn_handle(), act_desc,
ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Ydata,
input_desc, dYdata, output_desc, Ydata,
CUDNNType<T>::zero, output_desc, dXdata));
......
......@@ -11,10 +11,10 @@ void DropoutOp<Context>::RunWithType() {
float scale = use_scale ? 1.0 / (1.0 - prob()) : 1.0;
if (phase() == "TEST") {
if (Output(0) != &Input(0)) {
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), Ydata, Xdata);
if (scale == 1.0) math::Scal<T, Context>(
Output(0)->count(), 1.0 - prob(), Ydata, &ctx());
Output(0)->count(), 1.0 - prob(), Ydata, ctx());
}
} else if (phase() == "TRAIN") {
Tensor* mask = ws()->CreateTensor(
......@@ -23,7 +23,7 @@ void DropoutOp<Context>::RunWithType() {
uint32_t* Mdata = mask->template mutable_data<uint32_t, Context>();
kernel::Dropout<T, Context>(
Output(0)->count(), prob(), scale,
Xdata, Mdata, Ydata, &ctx());
Xdata, Mdata, Ydata, ctx());
} else LOG(FATAL) << "Incorrect Op phase: " << phase();
}
......@@ -52,7 +52,8 @@ void DropoutGradientOp<Context>::RunWithType() {
else if (phase() == "TRAIN") {
kernel::DropoutGrad<T, Context>(
Output(0)->count(), prob(), scale,
dYdata, Mdata, dXdata, &ctx());
dYdata, Mdata, dXdata, ctx());
ctx()->FinishDeviceCompution();
mask->Reset();
} else LOG(FATAL) << "Incorrect Op phase: " << phase();
}
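// The saved uint32 mask marks the surviving units, so the backward kernel
// presumably computes dx = dy * mask * scale, mirroring the forward
// rescaling; FinishDeviceCompution() then fences the async stream before
// the mask buffer is reset.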
......
......@@ -8,7 +8,8 @@ template <class Context> template <typename T>
void EluOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::Elu<T, Context>(Output(0)->count(), alpha, Xdata, Ydata);
kernel::Elu<T, Context>(Output(0)->count(),
alpha, Xdata, Ydata, ctx());
}
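// kernel::Elu follows the usual ELU definition (illustrative):
//   y = x > 0 ? x : alpha * (exp(x) - 1)
// and its gradient, used by EluGrad below, can reuse the saved output:
//   dx = dy * (x > 0 ? 1 : y + alpha)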
template <class Context>
......@@ -30,8 +31,8 @@ void EluGradientOp<Context>::RunWithType() {
auto* Ydata = Input(0).template data<T, Context>();
auto* dYdata = Input(1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
kernel::EluGrad<T, Context>(
Output(0)->count(), alpha, dYdata, Ydata, dXdata);
kernel::EluGrad<T, Context>(Output(0)->count(),
alpha, dYdata, Ydata, dXdata, ctx());
}
template <class Context>
......
......@@ -18,7 +18,7 @@ void PReluOp<Context>::RunWithType() {
kernel::PRelu<T, Context>(
Output(0)->count(), channels, dim,
channel_shared ? true : false, data_format,
Xdata, Wdata, Ydata);
Xdata, Wdata, Ydata, ctx());
}
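// PRelu learns the negative slope (illustrative):
//   y = x > 0 ? x : w[c] * x
// where w is a single shared scalar when channel_shared is set, or one
// weight per channel selected according to data_format.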
template <class Context>
......@@ -49,12 +49,12 @@ void PReluGradientOp<Context>::RunWithType() {
if (Output(1)->name() != "ignore") {
DECLARE_MULTIPLIER(multiplier, channels * dim);
auto* dWdata = Output(1)->template mutable_data<T, Context>();
auto* dWdata = Output(1)->template mutable_data<T, Context>(ctx());
auto* dWBdata = ws()->template caches<T, Context>({ channels * dim })[0];
kernel::PReluWGrad<T, Context>(
Input(0).dim(0), Input(0).count(1), channels, dim,
channel_shared ? true : false, data_format,
dYdata, Xdata, multiplier, dWBdata, dWdata, &ctx());
dYdata, Xdata, multiplier, dWBdata, dWdata, ctx());
}
if (Output(0)->name() != "ignore") {
......@@ -63,7 +63,7 @@ void PReluGradientOp<Context>::RunWithType() {
kernel::PReluGrad<T, Context>(
Output(0)->count(), channels, dim,
channel_shared ? true : false, data_format,
dYdata, Xdata, Wdata, dXdata);
dYdata, Xdata, Wdata, dXdata, ctx());
}
}
......
......@@ -8,7 +8,8 @@ template <class Context> template <typename T>
void ReluOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::Relu<T, Context>(Output(0)->count(), slope, Xdata, Ydata);
kernel::Relu<T, Context>(Output(0)->count(),
slope, Xdata, Ydata, ctx());
}
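// With a non-zero slope this is the leaky variant (illustrative):
//   y = x > 0 ? x : slope * x
// so ReluGrad below passes dy through where y > 0 and scales it by
// slope elsewhere.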
template <class Context>
......@@ -24,15 +25,17 @@ DEPLOY_CPU(Relu);
#ifdef WITH_CUDA
DEPLOY_CUDA(Relu);
#endif
OPERATOR_SCHEMA(Relu).NumInputs(1).NumOutputs(1).Inplace({ { 0, 0 } });
OPERATOR_SCHEMA(Relu)
.NumInputs(1).NumOutputs(1)
.Inplace({ { 0, 0 } });
template <class Context> template <typename T>
void ReluGradientOp<Context>::RunWithType() {
auto* Ydata = Input(0).template data<T, Context>();
auto* dYdata = Input(1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
kernel::ReluGrad<T, Context>(
Output(0)->count(), slope, dYdata, Ydata, dXdata);
kernel::ReluGrad<T, Context>(Output(0)->count(),
slope, dYdata, Ydata, dXdata, ctx());
}
template <class Context>
......@@ -47,7 +50,9 @@ DEPLOY_CPU(ReluGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(ReluGradient);
#endif
OPERATOR_SCHEMA(ReluGradient).NumInputs(2).NumOutputs(1).Inplace({ { 1, 0 }});
OPERATOR_SCHEMA(ReluGradient)
.NumInputs(2).NumOutputs(1)
.Inplace({ { 1, 0 } });
class GetReluGradient final : public GradientMakerBase {
public:
......
......@@ -8,7 +8,7 @@ template <class Context> template <typename T>
void SEluOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::SElu<T, Context>(Output(0)->count(), Xdata, Ydata);
kernel::SElu<T, Context>(Output(0)->count(), Xdata, Ydata, ctx());
}
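// SElu presumably uses the standard self-normalizing constants:
//   y = 1.0507 * (x > 0 ? x : 1.6733 * (exp(x) - 1))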
template <class Context>
......@@ -23,15 +23,17 @@ DEPLOY_CPU(SElu);
#ifdef WITH_CUDA
DEPLOY_CUDA(SElu);
#endif
OPERATOR_SCHEMA(SElu).NumInputs(1).NumOutputs(1).Inplace({ { 0, 0 } });
OPERATOR_SCHEMA(SElu)
.NumInputs(1).NumOutputs(1)
.Inplace({ { 0, 0 } });
template <class Context> template <typename T>
void SEluGradientOp<Context>::RunWithType() {
auto* Ydata = Input(0).template data<T, Context>();
auto* dYdata = Input(1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
kernel::SEluGrad<T, Context>(
Output(0)->count(), dYdata, Ydata, dXdata);
kernel::SEluGrad<T, Context>(Output(0)->count(),
dYdata, Ydata, dXdata, ctx());
}
template <class Context>
......@@ -46,7 +48,9 @@ DEPLOY_CPU(SEluGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(SEluGradient);
#endif
OPERATOR_SCHEMA(SEluGradient).NumInputs(2).NumOutputs(1).Inplace({ { 1, 0 }});
OPERATOR_SCHEMA(SEluGradient)
.NumInputs(2).NumOutputs(1)
.Inplace({ { 1, 0 } });
class GetSEluGradient final : public GradientMakerBase {
public:
......
......@@ -8,7 +8,7 @@ template <class Context> template <typename T>
void SigmoidOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::Sigmoid<T, Context>(Output(0)->count(), Xdata, Ydata);
kernel::Sigmoid<T, Context>(Output(0)->count(), Xdata, Ydata, ctx());
}
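// Sigmoid saves its output so the gradient below can be formed from it:
//   y = 1 / (1 + exp(-x)),   dx = dy * y * (1 - y)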
template <class Context>
......@@ -30,8 +30,8 @@ void SigmoidGradientOp<Context>::RunWithType() {
auto* Ydata = Input(0).template data<T, Context>();
auto* dYdata = Input(1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
kernel::SigmoidGrad<T, Context>(
Output(0)->count(), dYdata, Ydata, dXdata);
kernel::SigmoidGrad<T, Context>(Output(0)->count(),
dYdata, Ydata, dXdata, ctx());
}
template <class Context>
......
......@@ -12,13 +12,13 @@ void SoftmaxOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
auto* WSdata = ws()->template caches<T, Context>({ Input(0).count() })[0];
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Input(0).count(), Ydata, Xdata);
kernel::Softmax<T, Context>(
Output(0)->count(), Input(0).dim(axis),
outer_dim, inner_dim, multiplier,
Xdata, WSdata, Ydata, &ctx());
Xdata, WSdata, Ydata, ctx());
}
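// A single-slice reference of what kernel::Softmax computes along the
// class axis; illustrative only -- the real kernel handles the
// [outer_dim, axis, inner_dim] strides and uses the multiplier/WSdata
// buffers for the reductions.
#include <algorithm>
#include <cmath>

void softmax_1d_sketch(int classes, const float* x, float* y) {
    float m = *std::max_element(x, x + classes);  // subtract max for stability
    float sum = 0.f;
    for (int c = 0; c < classes; ++c) sum += (y[c] = std::exp(x[c] - m));
    for (int c = 0; c < classes; ++c) y[c] /= sum;
}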
template <class Context>
......@@ -36,7 +36,9 @@ DEPLOY_CPU(Softmax);
#ifdef WITH_CUDA
DEPLOY_CUDA(Softmax);
#endif
OPERATOR_SCHEMA(Softmax).NumInputs(1).NumOutputs(1).Inplace({ { 0, 0 } });
OPERATOR_SCHEMA(Softmax)
.NumInputs(1).NumOutputs(1)
.Inplace({ { 0, 0 } });
template <class Context> template <typename T>
void SoftmaxGradientOp<Context>::RunWithType() {
......@@ -44,15 +46,16 @@ void SoftmaxGradientOp<Context>::RunWithType() {
auto* dYdata = Input(-1).template data<T, Context>();
auto* Ydata = Input(0).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
auto* WSdata = ws()->template caches<T, Context>({ Input(0).count() })[0];
auto* WSdata = ws()->template caches<T, Context>(
{ Input(0).count() })[0];
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Input(0).count(), dXdata, dYdata);
kernel::SoftmaxGrad<T, Context>(
Output(0)->count(), Input(0).dim(axis),
outer_dim, inner_dim, multiplier,
dYdata, Ydata, WSdata, dXdata, &ctx());
dYdata, Ydata, WSdata, dXdata, ctx());
}
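// The Copy + SoftmaxGrad pair implements the softmax Jacobian-vector
// product per slice, with WSdata caching the inner product:
//   dx_c = y_c * (dy_c - sum_k dy_k * y_k)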
template <class Context>
......@@ -70,7 +73,9 @@ DEPLOY_CPU(SoftmaxGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(SoftmaxGradient);
#endif
OPERATOR_SCHEMA(SoftmaxGradient).NumInputs(2).NumOutputs(1).Inplace({ { 1, 0 } });
OPERATOR_SCHEMA(SoftmaxGradient)
.NumInputs(2).NumOutputs(1)
.Inplace({ { 1, 0 } });
class GetSoftmaxGradient final : public GradientMakerBase {
public:
......
......@@ -8,7 +8,7 @@ template <class Context> template <typename T>
void TanhOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::Tanh<T, Context>(Output(0)->count(), Xdata, Ydata);
kernel::Tanh<T, Context>(Output(0)->count(), Xdata, Ydata, ctx());
}
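// Tanh also differentiates through its saved output:
//   dx = dy * (1 - y * y)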
template <class Context>
......@@ -30,8 +30,8 @@ void TanhGradientOp<Context>::RunWithType() {
auto* Ydata = Input(0).template data<T, Context>();
auto* dYdata = Input(1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
kernel::TanhGrad<T, Context>(
Output(0)->count(), dYdata, Ydata, dXdata);
kernel::TanhGrad<T, Context>(Output(0)->count(),
dYdata, Ydata, dXdata, ctx());
}
template <class Context>
......
......@@ -9,7 +9,7 @@ void AddOp<Context>::EltwiseRunWithType() {
auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
math::Add<T, Context>(Output(0)->count(), x1, x2, y);
math::Add<T, Context>(Output(0)->count(), x1, x2, y, ctx());
}
template <class Context> template <typename T>
......@@ -19,23 +19,24 @@ void AddOp<Context>::BroadcastRunWithType(int type) {
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), y, x1);
if (type == 0 || type == 1) {
if (type == 0) {
outer_dim = Input(0).count();
inner_dim = 1;
x2 = Input(1).template data<T, CPUContext>();
math::AddScalar<T, Context>(Output(0)->count(),
dragon_cast<float, T>(x2[0]), y, ctx());
} else {
outer_dim = Input(0).count(0, Input(0).axis(-1));
inner_dim = Input(0).dim(-1);
}
DECLARE_MULTIPLIER(multiplier, outer_dim);
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, multiplier, x2,
1.0, y, &ctx());
1.0, y, ctx());
}
} else if (type == 2) {
outer_dim = Input(0).dim(0);
inner_dim = Input(0).count(1);
......@@ -44,7 +45,7 @@ void AddOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, x2, multiplier,
1.0, y, &ctx());
1.0, y, ctx());
}
}
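// In the broadcast path, type 0 treats x2 as a scalar, type 1 broadcasts
// it along the last axis, and type 2 along the leading axis. The Gemm
// against a ones "multiplier" is a rank-1 update equivalent to this
// illustrative loop:
void broadcast_row_add_sketch(int outer_dim, int inner_dim,
                              const float* x2, float* y) {
    for (int i = 0; i < outer_dim; ++i)
        for (int j = 0; j < inner_dim; ++j)
            y[i * inner_dim + j] += x2[j];  // y += ones * x2^T
}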
......@@ -77,13 +78,13 @@ void AddGradientOp<Context>::EltwiseRunWithType() {
if (Output(1)->name() != "ignore") {
auto* dx2 = Output(1)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(1)->count(), dx2, dy);
}
if (Output(0)->name() != "ignore") {
auto* dx1 = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), dx1, dy);
}
}
......@@ -108,7 +109,7 @@ void AddGradientOp<Context>::BroadcastRunWithType(int type) {
math::Gemv<T, Context>(
CblasTrans, outer_dim, inner_dim,
1.0, dy, multiplier,
0.0, dx2, &ctx());
0.0, dx2, ctx());
} else if (type == 2) {
outer_dim = X1->dim(0);
inner_dim = X1->count(1);
......@@ -116,13 +117,13 @@ void AddGradientOp<Context>::BroadcastRunWithType(int type) {
math::Gemv<T, Context>(
CblasNoTrans, outer_dim, inner_dim,
1.0, dy, multiplier,
0.0, dx2, &ctx());
0.0, dx2, ctx());
}
}
if (Output(0)->name() != "ignore") {
auto* dx1 = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
X1->count(), dx1, dy);
}
}
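// The gradient of a broadcast add sums dy over the replicated axes; the
// Gemv calls against a ones vector realize dx2[j] = sum_i dy[i, j]
// (type 0/1) or the symmetric per-row reduction (type 2).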
......
......@@ -34,7 +34,7 @@ void AffineOp<Context>::RunWithType() {
kernel::Affine<T, Context>(
Output(0)->count(), outer_dim, scale_dim, inner_dim,
Xdata, Adata, Bdata, bias_multiplier, Ydata, &ctx());
Xdata, Adata, Bdata, bias_multiplier, Ydata, ctx());
}
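// kernel::Affine broadcasts the per-channel parameters over the layout
// [outer_dim, scale_dim, inner_dim] (illustrative):
//   y[o, c, i] = A[c] * x[o, c, i] + B[c]
// with the bias term skipped when no third input is given.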
template <class Context>
......@@ -58,13 +58,13 @@ void AffineGradientOp<Context>::BiasRunWithType() {
DECLARE_MULTIPLIER(multiplier, inner_dim);
auto* dYdata = Input(-1).template data<T, Context>();
auto* dBias = Output(2)->template mutable_data<T, Context>();
auto* dBias = Output(2)->template mutable_data<T, Context>(ctx());
for (int n = 0; n < outer_dim; n++) {
math::Gemv<T, Context>(
CblasNoTrans, scale_dim, inner_dim,
1.0, dYdata, multiplier,
1.0, dBias, &ctx());
1.0, dBias, ctx());
dYdata += dim;
}
}
......@@ -79,45 +79,36 @@ void AffineGradientOp<Context>::ScaleRunWithType() {
bool is_eltwise = (Input(-1).count() == Input(1).count());
auto* dYdata = Input(-1).template data<T, Context>();
auto* Xdata = Input(0).template data<T, Context>();
auto* dScale = Output(1)->template mutable_data<T, Context>();
auto* dScale = Output(1)->template mutable_data<T, Context>(ctx());
auto* dXdata = Output(0)->template mutable_data<T, Context>();
auto* dYxX = dXdata;
math::Mul<T, Context>(Output(0)->count(), dYdata, Xdata, dYxX);
math::Mul<T, Context>(Output(0)->count(), dYdata, Xdata, dYxX, ctx());
if (!is_eltwise) {
T* SRes_data = nullptr;
// reduce inner dimensions
if (inner_dim == 1) {
SRes_data = dYxX;
} else if (sum_result.count() == 1) { // handle inner only
dScale = Output(1)->template mutable_data<T, CPUContext>();
T result = math::Dot<T, Context>(
inner_dim, dYxX, multiplier, &ctx());
*dScale += result;
} else {
SRes_data = (outer_dim == 1) ? // handle scale only
SRes_data = (outer_dim == 1) ?
dScale : sum_result.template mutable_data<T, Context>();
math::Gemv<T, Context>(
CblasNoTrans, sum_result.count(), inner_dim,
1.0, dYxX, multiplier,
SRes_data == dScale ? 1.0 : 0.0, SRes_data, &ctx());
SRes_data == dScale ? 1.0 : 0.0,
SRes_data, ctx());
}
// reduce outer dimensions
if (outer_dim != 1) {
if (scale_dim == 1) { // handle outer only
dScale = Output(1)->template mutable_data<T, CPUContext>();
T result = math::Dot<T, Context>(
outer_dim, multiplier, SRes_data, &ctx());
*dScale += result;
} else {
math::Gemv<T, Context>(
CblasTrans, outer_dim, scale_dim,
1.0, SRes_data, multiplier,
1.0, dScale, &ctx());
}
1.0, dScale, ctx());
}
} else {
math::Axpy<T, Context>(Output(1)->count(),
1.f, dYxX, dScale, &ctx());
1.f, dYxX, dScale, ctx());
}
}
......@@ -131,7 +122,7 @@ void AffineGradientOp<Context>::RunWithType() {
kernel::AffineGrad<T, Context>(
Output(0)->count(), outer_dim, scale_dim, inner_dim,
dYdata, Adata, dXdata, &ctx());
dYdata, Adata, dXdata, ctx());
}
template <class Context>
......
......@@ -15,7 +15,7 @@ void ClipOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
auto* Mdata = mask->template mutable_data<T, Context>();
kernel::Clip<T, Context>(Output(0)->count(),
low, high, Xdata, Mdata, Ydata);
low, high, Xdata, Mdata, Ydata, ctx());
}
template <class Context>
......@@ -30,7 +30,9 @@ DEPLOY_CPU(Clip);
#ifdef WITH_CUDA
DEPLOY_CUDA(Clip);
#endif
OPERATOR_SCHEMA(Clip).NumInputs(1).NumOutputs(1).Inplace({ { 0, 0 } });
OPERATOR_SCHEMA(Clip)
.NumInputs(1).NumOutputs(1)
.Inplace({ { 0, 0 } });
template <class Context> template <typename T>
void ClipGradientOp<Context>::RunWithType() {
......@@ -39,7 +41,8 @@ void ClipGradientOp<Context>::RunWithType() {
auto* dXdata = Output(0)->template mutable_data<T, Context>();
auto* Mdata = mask->template data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), dXdata, Mdata, dXdata);
math::Mul<T, Context>(Output(0)->count(),
dXdata, Mdata, dXdata, ctx());
}
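// Mdata presumably holds 1 where low < x < high and 0 elsewhere, so the
// product above zeroes the gradient outside the clipping range.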
template <class Context>
......@@ -54,7 +57,9 @@ DEPLOY_CPU(ClipGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(ClipGradient);
#endif
OPERATOR_SCHEMA(ClipGradient).NumInputs(2).NumOutputs(1).Inplace({ { 1, 0 } });
OPERATOR_SCHEMA(ClipGradient)
.NumInputs(2).NumOutputs(1)
.Inplace({ { 1, 0 } });
class GetClipGradient final : public GradientMakerBase {
public:
......
......@@ -23,7 +23,7 @@ void CuDNNAffineOp<Context>::RunWithType() {
mul_desc, CUDNN_OP_TENSOR_MUL,
CUDNNType<T>::type, CUDNN_PROPAGATE_NAN));
CUDNN_CHECK(cudnnOpTensor(
ctx().cudnn_handle(), mul_desc,
ctx()->cudnn_handle(), mul_desc,
CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::one, param_desc, Adata,
CUDNNType<T>::zero, input_desc, Ydata));
......@@ -36,7 +36,7 @@ void CuDNNAffineOp<Context>::RunWithType() {
add_desc, CUDNN_OP_TENSOR_ADD,
CUDNNType<T>::type, CUDNN_PROPAGATE_NAN));
CUDNN_CHECK(cudnnOpTensor(
ctx().cudnn_handle(), add_desc,
ctx()->cudnn_handle(), add_desc,
CUDNNType<T>::one, input_desc, Ydata,
CUDNNType<T>::one, param_desc, Bdata,
CUDNNType<T>::zero, input_desc, Ydata));
......@@ -48,7 +48,9 @@ void CuDNNAffineOp<Context>::RunOnDevice() {
Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (XIsType(Input(0), float16)) RunWithType<float16>();
#endif
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
}
......@@ -76,17 +78,17 @@ void CuDNNAffineGradientOp<Context>::RunWithType() {
if (Output(1)->name() != "ignore") {
Output(1)->ReshapeLike(Input(1));
auto* Xdata = Input(0).template data<T, Context>();
auto* dAdata = Output(1)->template mutable_data<T, Context>();
auto* dAdata = Output(1)->template mutable_data<T, Context>(ctx());
// eltwise
if (Input(0).count() == Input(1).count()) {
CUDNN_CHECK(cudnnOpTensor(
ctx().cudnn_handle(), mul_desc,
ctx()->cudnn_handle(), mul_desc,
CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::one, input_desc, dYdata,
CUDNNType<T>::one, param_desc, dAdata));
} else {
CUDNN_CHECK(cudnnOpTensor(
ctx().cudnn_handle(), mul_desc,
ctx()->cudnn_handle(), mul_desc,
CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::one, input_desc, dYdata,
CUDNNType<T>::zero, input_desc, dXdata));
......@@ -97,11 +99,11 @@ void CuDNNAffineGradientOp<Context>::RunWithType() {
// db = dy
if (Output(2)->name() != "ignore") {
Output(2)->ReshapeLike(Input(1));
auto* dBdata = Output(2)->template mutable_data<T, Context>();
auto* dBdata = Output(2)->template mutable_data<T, Context>(ctx());
// eltwise
if (Input(-1).count() == Input(1).count()) {
math::Axpy<T, Context>(Output(2)->count(),
1.f, dYdata, dBdata, &ctx());
1.f, dYdata, dBdata, ctx());
} else {
ComputeBiasGradient_v2<T>(dYdata, dBdata);
}
......@@ -109,7 +111,7 @@ void CuDNNAffineGradientOp<Context>::RunWithType() {
// dx = alpha * dy
CUDNN_CHECK(cudnnOpTensor(
ctx().cudnn_handle(), mul_desc,
ctx()->cudnn_handle(), mul_desc,
CUDNNType<T>::one, input_desc, dYdata,
CUDNNType<T>::one, param_desc, Adata,
CUDNNType<T>::zero, input_desc, dXdata));
......@@ -126,11 +128,11 @@ void CuDNNAffineGradientOp<Context>::ComputeScaleGradient(
CUDNN_REDUCE_TENSOR_NO_INDICES, CUDNN_32BIT_INDICES));
size_t workspace_size = 0;
CUDNN_CHECK(cudnnGetReductionWorkspaceSize(
ctx().cudnn_handle(), reduce_desc,
ctx()->cudnn_handle(), reduce_desc,
input_desc, param_desc, &workspace_size));
auto* WSdata = ws()->template caches<Context>({ workspace_size })[0];
CUDNN_CHECK(cudnnReduceTensor(
ctx().cudnn_handle(), reduce_desc,
ctx()->cudnn_handle(), reduce_desc,
nullptr, 0, WSdata, workspace_size,
CUDNNType<T>::one, input_desc, dYxX,
CUDNNType<T>::one, param_desc, dA));
......@@ -145,32 +147,23 @@ void CuDNNAffineGradientOp<Context>::ComputeScaleGradient_v2(
sum_result.Reshape({ outer_dim * scale_dim });
T* SRes_data = nullptr;
if (inner_dim == 1) SRes_data = dYxX;
else if (sum_result.count() == 1) {
auto* dAC = Output(1)->template mutable_data<T, CPUContext>();
T result = math::Dot<T, Context>(
inner_dim, dYxX, multiplier, &ctx());
*dAC += result;
// reduce inner dimensions
if (inner_dim == 1) {
SRes_data = dYxX;
} else {
SRes_data = (outer_dim == 1) ?
dA : sum_result.template mutable_data<T, Context>();
math::Gemv<T, Context>(
CblasNoTrans, sum_result.count(), inner_dim,
1.0, dYxX, multiplier,
SRes_data == dA ? 1.0 : 0.0, SRes_data, &ctx());
SRes_data == dA ? 1.0 : 0.0, SRes_data, ctx());
}
// reduce outer dimensions
if (outer_dim != 1) {
if (scale_dim == 1) {
auto* dAC = Output(1)->template mutable_data<T, CPUContext>();
T result = math::Dot<T, Context>(
outer_dim, multiplier, SRes_data, &ctx());
*dAC += result;
} else {
math::Gemv<T, Context>(
CblasTrans, outer_dim, scale_dim,
1.0, SRes_data, multiplier,
1.0, dA, &ctx());
}
1.0, dA, ctx());
}
}
......@@ -185,11 +178,11 @@ void CuDNNAffineGradientOp<Context>::ComputeBiasGradient(
CUDNN_REDUCE_TENSOR_NO_INDICES, CUDNN_32BIT_INDICES));
size_t workspace_size = 0;
CUDNN_CHECK(cudnnGetReductionWorkspaceSize(
ctx().cudnn_handle(), reduce_desc,
ctx()->cudnn_handle(), reduce_desc,
input_desc, param_desc, &workspace_size));
auto* WSdata = ws()->template caches<Context>({ workspace_size })[0];
CUDNN_CHECK(cudnnReduceTensor(
ctx().cudnn_handle(), reduce_desc,
ctx()->cudnn_handle(), reduce_desc,
nullptr, 0, WSdata, workspace_size,
CUDNNType<T>::one, input_desc, dY,
CUDNNType<T>::one, param_desc, dB));
......@@ -205,7 +198,7 @@ void CuDNNAffineGradientOp<Context>::ComputeBiasGradient_v2(
math::Gemv<T, Context>(
CblasNoTrans, scale_dim, inner_dim,
1.0, dY, multiplier,
1.0, dB, &ctx());
1.0, dB, ctx());
dY += dim;
}
}
......
......@@ -9,7 +9,7 @@ void DivOp<Context>::EltwiseRunWithType() {
auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
math::Div<T, Context>(Output(0)->count(), x1, x2, y);
math::Div<T, Context>(Output(0)->count(), x1, x2, y, ctx());
}
template <class Context> template <typename T>
......@@ -18,34 +18,40 @@ void DivOp<Context>::BroadcastRunWithType(int type) {
auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
auto* c = ws()->template caches<T, Context>({
Output(0)->count() })[0];
if (type == 0 || type == 1) {
if (type == 0) {
outer_dim = Input(0).count();
inner_dim = 1;
} else {
x2 = Input(1).template data<T, CPUContext>();
float inverse_x2 = 1.f / dragon_cast<float, T>(x2[0]);
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), y, x1);
math::MulScalar<T, Context>(
Output(0)->count(), inverse_x2, y, ctx());
} else if (type == 1) {
outer_dim = Input(0).count(0, Input(0).axis(-1));
inner_dim = Input(0).dim(-1);
}
DECLARE_MULTIPLIER(multiplier, outer_dim);
auto* c = ws()->template caches<T, Context>(
{ Output(0)->count() })[0];
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, multiplier, x2,
0.0, c, &ctx());
math::Div<T, Context>(Output(0)->count(), x1, c, y);
0.0, c, ctx());
math::Div<T, Context>(
Output(0)->count(), x1, c, y, ctx());
} else if (type == 2) {
outer_dim = Input(0).dim(0);
inner_dim = Input(0).count(1);
DECLARE_MULTIPLIER(multiplier, inner_dim);
auto* c = ws()->template caches<T, Context>(
{ Output(0)->count() })[0];
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, x2, multiplier,
0.0, c, &ctx());
math::Div<T, Context>(Output(0)->count(), x1, c, y);
0.0, c, ctx());
math::Div<T, Context>(
Output(0)->count(), x1, c, y, ctx());
}
}
......@@ -82,16 +88,16 @@ void DivGradientOp<Context>::EltwiseRunWithType() {
auto* x2 = Input(1).template data<T, Context>();
auto* dx2 = Output(1)->template mutable_data<T, Context>();
auto* c = ws()->template caches<T, Context>({ X1->count() })[0];
math::Mul<T,Context>(X1->count(), dy, x1, c); // dY * X1
math::Square<T, Context>(X2->count(), x2, dx2); // X2^{2}
math::Inv<T, Context>(X2->count(), -1, dx2, dx2); // -1 / X2^{2}
math::Mul<T, Context>(X2->count(), c, dx2, dx2);
math::Mul<T, Context>(X1->count(), dy, x1, c, ctx()); // dY * X1
math::Square<T, Context>(X2->count(), x2, dx2, ctx()); // X2^{2}
math::Inv<T, Context>(X2->count(), -1, dx2, dx2, ctx()); // -1 / X2^{2}
math::Mul<T, Context>(X2->count(), c, dx2, dx2, ctx());
}
if (Output(0)->name() != "ignore") {
auto* x2 = Input(1).template data<T, Context>();
auto* dx1 = Output(0)->template mutable_data<T, Context>();
math::Div<T, Context>(X1->count(), dy, x2, dx1);
math::Div<T, Context>(X1->count(), dy, x2, dx1, ctx());
}
}
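// The calls above assemble the quotient rule; an elementwise sketch
// (illustrative, float-only) of the same computation:
void div_grad_sketch(int n, const float* dy, const float* x1,
                     const float* x2, float* dx1, float* dx2) {
    for (int i = 0; i < n; ++i) {
        dx1[i] = dy[i] / x2[i];                     // d(x1/x2)/dx1 = 1/x2
        dx2[i] = -dy[i] * x1[i] / (x2[i] * x2[i]);  // d(x1/x2)/dx2 = -x1/x2^2
    }
}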
......@@ -118,23 +124,23 @@ void DivGradientOp<Context>::BroadcastRunWithType(int type) {
auto* dx2 = Output(1)->template mutable_data<T, Context>();
auto cs = ws()->template caches<T, Context>(
{ X1->count(), X2->count() });
math::Mul<T, Context>(X1->count(), dy, x1, cs[0]); // dY * X1
math::Square<T, Context>(X2->count(), x2, dx2); // X2^{2}
math::Inv<T, Context>(X2->count(), -1.0, dx2, dx2); // -1 / X2^{2}
math::Mul<T, Context>(X1->count(), dy, x1, cs[0], ctx()); // dY * X1
math::Square<T, Context>(X2->count(), x2, dx2, ctx()); // X2^{2}
math::Inv<T, Context>(X2->count(), -1, dx2, dx2, ctx()); // -1 / X2^{2}
if (type == 0 || type == 1) {
DECLARE_MULTIPLIER(multiplier, outer_dim);
math::Gemv<T, Context>(
CblasTrans, outer_dim, inner_dim,
1.0, cs[0], multiplier,
0.0, cs[1], &ctx());
0.0, cs[1], ctx());
} else if (type == 2) {
DECLARE_MULTIPLIER(multiplier, inner_dim);
math::Gemv<T, Context>(
CblasNoTrans, outer_dim, inner_dim,
1.0, cs[0], multiplier,
0.0, cs[1], &ctx());
0.0, cs[1], ctx());
}
math::Mul<T, Context>(X2->count(), cs[1], dx2, dx2);
math::Mul<T, Context>(X2->count(), cs[1], dx2, dx2, ctx());
}
if (Output(0)->name() != "ignore") {
......@@ -146,16 +152,16 @@ void DivGradientOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, multiplier, x2,
0.0, dx1, &ctx());
0.0, dx1, ctx());
} else if (type == 2) {
DECLARE_MULTIPLIER(multiplier, inner_dim);
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, x2, multiplier,
0.0, dx1, &ctx());
0.0, dx1, ctx());
}
math::Div<T, Context>(X1->count(), dy, dx1, dx1);
math::Div<T, Context>(X1->count(), dy, dx1, dx1, ctx());
}
}
......
......@@ -7,9 +7,13 @@ template <class Context> template <typename T>
void DotOp<Context>::DotRunWithType() {
auto* X1data = Input(0).template data<T, Context>();
auto* X2data = Input(1).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, CPUContext>();
Ydata[0] = math::Dot<T, Context>(
Input(0).count(), X1data, X2data, &ctx());
auto* Ydata = Output(0)->template mutable_data<T, Context>();
T result_host;
math::Dot<T, Context>(Input(0).count(),
X1data, X2data, &result_host, ctx());
ctx()->template Copy<T, Context, CPUContext>(
1, Ydata, &result_host);
}
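// math::Dot now writes its scalar result through a host pointer; the
// explicit Copy<T, Context, CPUContext> then stages it into the output
// tensor through the op's context, presumably so the result stays
// ordered with the now-asynchronous stream.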
template <class Context> template <typename T>
......@@ -22,7 +26,7 @@ void DotOp<Context>::GemmRunWithType() {
TransB ? CblasTrans : CblasNoTrans,
M, N1, K1,
1.0, X1data, X2data,
0.0, Ydata, &ctx());
0.0, Ydata, ctx());
}
template <class Context> template <typename T>
......@@ -33,7 +37,7 @@ void DotOp<Context>::GemvRunWithType() {
math::Gemv<T, Context>(
TransA ? CblasTrans : CblasNoTrans, M, N1,
1.0, X1data, X2data,
0.0, Ydata, &ctx());
0.0, Ydata, ctx());
}
template <class Context>
......@@ -98,12 +102,14 @@ void DotGradientOp<Context>::DotRunWithType() {
auto* dYdata = Input(2).template data<T, CPUContext>();
auto* dX1data = Output(0)->template mutable_data<T, Context>();
auto* dX2data = Output(1)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), dX1data, X2data);
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(1)->count(), dX2data, X1data);
math::MulScalar<T, Context>(Output(0)->count(), dYdata[0], dX1data);
math::MulScalar<T, Context>(Output(1)->count(), dYdata[0], dX2data);
math::MulScalar<T, Context>(
Output(0)->count(), dYdata[0], dX1data, ctx());
math::MulScalar<T, Context>(
Output(1)->count(), dYdata[0], dX2data, ctx());
}
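// For a vector dot product the upstream gradient is a scalar, so
// dX1 = dY[0] * X2 and dX2 = dY[0] * X1, realized as Copy + MulScalar.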
template <class Context> template <typename T>
......@@ -118,13 +124,13 @@ void DotGradientOp<Context>::GemmRunWithType() {
TransB ? CblasNoTrans : CblasTrans,
M, K1, N1,
1.0, dYdata, X2data,
0.0, dX1data, &ctx());
0.0, dX1data, ctx());
math::Gemm<T, Context>(
TransA ? CblasNoTrans : CblasTrans,
CblasNoTrans,
K1, N1, M,
1.0, X1data, dYdata,
0.0, dX2data, &ctx());
0.0, dX2data, ctx());
}
template <class Context> template <typename T>
......@@ -138,11 +144,11 @@ void DotGradientOp<Context>::GemvRunWithType() {
CblasNoTrans, CblasNoTrans,
M, N1, 1,
1.0, dYdata, X2data,
0.0, dX1data, &ctx());
0.0, dX1data, ctx());
math::Gemv<T, Context>(
TransA ? CblasNoTrans : CblasTrans, M, N1,
1.0, X1data, dYdata,
0.0, dX2data, &ctx());
0.0, dX2data, ctx());
}
template <class Context>
......
......@@ -7,10 +7,11 @@ template <class Context> template <typename T>
void EltwiseOp<Context>::SumRunWithType() {
TIndex count = Output(0)->count();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Set<T, Context>(count, dragon_cast<T, float>(0), Ydata);
math::Set<T, Context>(count,
dragon_cast<T, float>(0), Ydata, ctx());
for (int i = 0; i < InputSize(); ++i) {
math::Axpy<T, Context>(count, coeffs[i],
Input(i).template data<T, Context>(), Ydata, &ctx());
Input(i).template data<T, Context>(), Ydata, ctx());
}
}
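// SumRunWithType accumulates y = sum_i coeffs[i] * x_i by clearing y with
// Set and issuing one Axpy per input.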
......@@ -21,19 +22,24 @@ void EltwiseOp<Context>::ProdRunWithType() {
math::Mul<T, Context>(count,
Input(0).template data<T, Context>(),
Input(1).template data<T, Context>(),
Ydata);
Ydata, ctx());
for (int i = 2; i < InputSize(); i++) {
math::Mul<T, Context>(count,
Ydata,
Input(i).template data<T, Context>(),
Ydata);
Ydata, ctx());
}
}
template <class Context>
void EltwiseOp<Context>::RunOnDevice() {
for (int i = 1; i < InputSize(); i++)
CHECK(Input(i).dims() == Input(0).dims());
for (int i = 1; i < InputSize(); i++) {
CHECK(Input(i).dims() == Input(0).dims())
<< "\nExcepted Input(" << i << ")'s dims as "
<< Input(0).DimString() << ",\n but got "
<< Input(1).DimString() << ".";
}
Output(0)->ReshapeLike(Input(0));
if (operation == "SUM") {
......@@ -65,12 +71,12 @@ void EltwiseGradientOp<Context>::SumRunWithType() {
for (int i = 0; i < OutputSize(); i++) {
if (Output(i)->name() == "ignore") continue;
auto* dXdata = Output(i)->template mutable_data<T, Context>();
if (coeffs[i] == float(1)) {
ctx().template Copy<T, Context, Context>(
if (coeffs[i] == 1.f) {
ctx()->template Copy<T, Context, Context>(
count, dXdata, dYdata);
} else {
math::Scale<T, Context>(count,
coeffs[i], dYdata, dXdata, &ctx());
coeffs[i], dYdata, dXdata, ctx());
}
}
}
......@@ -88,11 +94,11 @@ void EltwiseGradientOp<Context>::ProdRunWithType() {
if (i == j) continue;
auto* Xdata = Input(j).template data<T, Context>();
if (!initialized) {
ctx().template Copy<T, Context, Context>(count, dXdata, Xdata);
ctx()->template Copy<T, Context, Context>(count, dXdata, Xdata);
initialized = true;
} else math::Mul<T, Context>(count, Xdata, dXdata, dXdata);
} else math::Mul<T, Context>(count, Xdata, dXdata, dXdata, ctx());
}
math::Mul<T, Context>(count, dYdata, dXdata, dXdata);
math::Mul<T, Context>(count, dYdata, dXdata, dXdata, ctx());
}
}
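// For a product, each input's gradient multiplies dY with every other
// input: dx_i = dy * prod_{j != i} x_j, built up pairwise above.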
......
......@@ -8,7 +8,7 @@ template <class Context> template <typename T>
void ExpOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Exp<T, Context>(Output(0)->count(), Xdata, Ydata);
math::Exp<T, Context>(Output(0)->count(), Xdata, Ydata, ctx());
}
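// Since d/dx exp(x) = exp(x), ExpGradient below reuses the saved output
// (dx = dy * y); LogGradient likewise divides by the input (dx = dy / x).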
template <class Context>
......@@ -30,7 +30,8 @@ void ExpGradientOp<Context>::RunWithType() {
auto* Ydata = Input(0).template data<T, Context>();
auto* dYdata = Input(-1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), dYdata, Ydata, dXdata);
math::Mul<T, Context>(Output(0)->count(),
dYdata, Ydata, dXdata, ctx());
}
template <class Context>
......
......@@ -12,7 +12,7 @@ void GramMatrixOp<Context>::RunWithType() {
CblasNoTrans, CblasTrans,
dim, dim, inner_dim,
1.0, Xdata, Xdata,
0.0, Ydata, &ctx());
0.0, Ydata, ctx());
Xdata += x_offset;
Ydata += y_offset;
}
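// Per batch item, Y = X * X^T over the flattened [dim, inner_dim] feature
// map; the backward pass below uses dX = 2 * dY * X, the derivative of
// the symmetric product.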
......@@ -47,7 +47,7 @@ void GramMatrixGradientOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans,
dim, inner_dim, dim,
2.0, dYdata, Xdata,
0.0, dXdata, &ctx());
0.0, dXdata, ctx());
dYdata += y_offset;
dXdata += x_offset;
}
......
......@@ -23,7 +23,7 @@ void InnerProductOp<Context>::TransRunWithType() {
CblasNoTrans, CblasTrans,
M, num_output, K,
1.0, Xdata, Wdata,
0.0, Ydata, &ctx());
0.0, Ydata, ctx());
if (InputSize() > 2) {
DECLARE_MULTIPLIER(multiplier, M);
......@@ -32,7 +32,7 @@ void InnerProductOp<Context>::TransRunWithType() {
CblasNoTrans, CblasNoTrans,
M, num_output, 1,
1.0, multiplier, Bdata,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
}
}
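// TransRunWithType computes the fully-connected y = x * W^T (+ b) with W
// stored as [num_output, K]; the bias is added via a rank-1 Gemm against
// a ones multiplier of length M.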
......@@ -55,7 +55,7 @@ void InnerProductOp<Context>::NoTransRunWithType() {
CblasNoTrans, CblasNoTrans,
M, num_output, K,
1.0, Xdata, Wdata,
0.0, Ydata, &ctx());
0.0, Ydata, ctx());
if (InputSize() > 2) {
DECLARE_MULTIPLIER(multiplier, M);
......@@ -64,7 +64,7 @@ void InnerProductOp<Context>::NoTransRunWithType() {
CblasNoTrans, CblasNoTrans,
M, num_output, 1,
1.0, multiplier, Bdata,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
}
}
......@@ -102,30 +102,30 @@ void InnerProductGradientOp<Context>::RunWithType() {
if (Output(1)->name() != "ignore") {
Output(1)->ReshapeLike(Input(1));
auto* dWdata = Output(1)->template mutable_data<T, Context>();
auto* dWdata = Output(1)->template mutable_data<T, Context>(ctx());
if (TransW) {
math::Gemm<T, Context>(
CblasTrans, CblasNoTrans,
num_output, K, M,
1.0, dYdata, Xdata,
1.0, dWdata, &ctx());
1.0, dWdata, ctx());
} else {
math::Gemm<T, Context>(
CblasTrans, CblasNoTrans,
K, num_output, M,
1.0, Xdata, dYdata,
1.0, dWdata, &ctx());
1.0, dWdata, ctx());
}
}
if (Output(2)->name() != "ignore") {
DECLARE_MULTIPLIER(multiplier, M);
Output(2)->Reshape({ num_output });
auto* dBdata = Output(2)->template mutable_data<T, Context>();
auto* dBdata = Output(2)->template mutable_data<T, Context>(ctx());
math::Gemv<T, Context>(
CblasTrans, M, num_output,
1.0, dYdata, multiplier,
1.0, dBdata, &ctx());
1.0, dBdata, ctx());
}
if (Output(0)->name() != "ignore") {
......@@ -136,13 +136,13 @@ void InnerProductGradientOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans,
M, K, num_output,
1.0, dYdata, Wdata,
0.0, dXdata, &ctx());
0.0, dXdata, ctx());
} else {
math::Gemm<T, Context>(
CblasNoTrans, CblasTrans,
M, K, num_output,
1.0, dYdata, Wdata,
0.0, dXdata, &ctx());
0.0, dXdata, ctx());
}
}
}
......
......@@ -7,7 +7,7 @@ template <class Context> template <typename T>
void LogOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Log<T, Context>(Output(0)->count(), Xdata, Ydata);
math::Log<T, Context>(Output(0)->count(), Xdata, Ydata, ctx());
}
template <class Context>
......@@ -29,7 +29,7 @@ void LogGradientOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* dYdata = Input(-1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
math::Div<T, Context>(Output(0)->count(), dYdata, Xdata, dXdata);
math::Div<T, Context>(Output(0)->count(), dYdata, Xdata, dXdata, ctx());
}
template <class Context>
......
......@@ -16,7 +16,7 @@ void MatmulOp<Context>::RunWithType() {
TransB ? CblasTrans : CblasNoTrans,
M, N, K1,
1.0, X1data, X2data,
0.0, Ydata, &ctx());
0.0, Ydata, ctx());
X1data += x1_offset;
X2data += x2_offset;
Ydata += y_offset;
......@@ -76,13 +76,13 @@ void MatmulGradientOp<Context>::RunWithType() {
TransB ? CblasNoTrans : CblasTrans,
M, K1, N,
1.0, dYdata, X2data,
0.0, dX1data, &ctx());
0.0, dX1data, ctx());
math::Gemm<T, Context>(
TransA ? CblasNoTrans : CblasTrans,
CblasNoTrans,
K1, N, M,
1.0, X1data, dYdata,
0.0, dX2data, &ctx());
0.0, dX2data, ctx());
X1data += x1_offset;
X2data += x2_offset;
dX1data += x1_offset;
......
......@@ -9,7 +9,7 @@ void MulOp<Context>::EltwiseRunWithType() {
auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), x1, x2, y);
math::Mul<T, Context>(Output(0)->count(), x1, x2, y, ctx());
}
template <class Context> template <typename T>
......@@ -18,34 +18,39 @@ void MulOp<Context>::BroadcastRunWithType(int type) {
auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
auto* c = ws()->template caches<T, Context>({
Output(0)->count() })[0];
if (type == 0 || type == 1) {
if (type == 0) {
outer_dim = Input(0).count();
inner_dim = 1;
} else {
x2 = Input(1).template data<T, CPUContext>();
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), y, x1);
math::MulScalar<T, Context>(Output(0)->count(),
dragon_cast<float, T>(x2[0]), y, ctx());
} else if (type == 1) {
outer_dim = Input(0).count(0, Input(0).axis(-1));
inner_dim = Input(0).dim(-1);
}
DECLARE_MULTIPLIER(multiplier, outer_dim);
auto* c = ws()->template caches<T, Context>(
{ Output(0)->count() })[0];
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, multiplier, x2,
0.0, c, &ctx());
math::Mul<T, Context>(Output(0)->count(), x1, c, y);
0.0, c, ctx());
math::Mul<T, Context>(
Output(0)->count(), x1, c, y, ctx());
} else if (type == 2) {
outer_dim = Input(0).dim(0);
inner_dim = Input(0).count(1);
DECLARE_MULTIPLIER(multiplier, inner_dim);
auto* c = ws()->template caches<T, Context>(
{ Output(0)->count() })[0];
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, x2, multiplier,
0.0, c, &ctx());
math::Mul<T, Context>(Output(0)->count(), x1, c, y);
0.0, c, ctx());
math::Mul<T, Context>(
Output(0)->count(), x1, c, y, ctx());
}
}
......@@ -79,13 +84,13 @@ void MulGradientOp<Context>::EltwiseRunWithType() {
if (Output(1)->name() != "ignore") {
auto* x1 = Input(0).template data<T, Context>();
auto* dx2 = Output(1)->template mutable_data<T, Context>();
math::Mul<T, Context>(Output(1)->count(), dy, x1, dx2);
math::Mul<T, Context>(Output(1)->count(), dy, x1, dx2, ctx());
}
if (Output(0)->name() != "ignore") {
auto* x2 = Input(1).template data<T, Context>();
auto* dx1 = Output(0)->template mutable_data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), dy, x2, dx1);
math::Mul<T, Context>(Output(0)->count(), dy, x2, dx1, ctx());
}
}
......@@ -110,19 +115,19 @@ void MulGradientOp<Context>::BroadcastRunWithType(int type) {
auto* x1 = Input(0).template data<T, Context>();
auto* dx2 = Output(1)->template mutable_data<T, Context>();
auto* c = ws()->template caches<T, Context>({ X1->count() })[0];
math::Mul<T, Context>(X1->count(), dy, x1, c);
math::Mul<T, Context>(X1->count(), dy, x1, c, ctx());
if (type == 0 || type == 1) {
DECLARE_MULTIPLIER(multiplier, outer_dim);
math::Gemv<T, Context>(
CblasTrans, outer_dim, inner_dim,
1.0, c, multiplier,
0.0, dx2, &ctx());
0.0, dx2, ctx());
} else if (type == 2) {
DECLARE_MULTIPLIER(multiplier, inner_dim);
math::Gemv<T, Context>(
CblasNoTrans, outer_dim, inner_dim,
1.0, c, multiplier,
0.0, dx2, &ctx());
0.0, dx2, ctx());
}
}
......@@ -135,16 +140,16 @@ void MulGradientOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, multiplier, x2,
0.0, dx1, &ctx());
0.0, dx1, ctx());
} else if (type == 2) {
DECLARE_MULTIPLIER(multiplier, inner_dim);
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, x2, multiplier,
0.0, dx1, &ctx());
0.0, dx1, ctx());
}
math::Mul<T, Context>(X1->count(), dy, dx1, dx1);
math::Mul<T, Context>(X1->count(), dy, dx1, dx1, ctx());
}
}
......
......@@ -9,16 +9,17 @@ void PowOp<Context>::RunWithType() {
TIndex count = Input(0).count();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
if (power_scale == float(0)) {
float value = (power == float(0)) ? float(1) : pow(shift, power);
math::Set<T, Context>(count, dragon_cast<T, float>(value), Ydata);
if (power_scale == 0.f) {
float value = (power == 0.f) ? 1.f : pow(shift, power);
math::Set<T, Context>(count,
dragon_cast<T, float>(value), Ydata, ctx());
return;
}
auto* Xdata = Input(0).template data<T, Context>();
ctx().template Copy<T, Context, Context>(count, Ydata, Xdata);
if (scale != float(1)) math::Scal<T, Context>(count, scale, Ydata, &ctx());
if (shift != float(0)) math::AddScalar<T, Context>(count, shift, Ydata);
if (power != float(1)) math::Pow<T, Context>(count, power, Ydata, Ydata);
ctx()->template Copy<T, Context, Context>(count, Ydata, Xdata);
if (scale != 1.f) math::Scal<T, Context>(count, scale, Ydata, ctx());
if (shift != 0.f) math::AddScalar<T, Context>(count, shift, Ydata, ctx());
if (power != 1.f) math::Pow<T, Context>(count, power, Ydata, Ydata, ctx());
}
template <class Context>
......@@ -42,35 +43,36 @@ void PowGradientOp<Context>::RunWithType() {
auto* dYdata = Input(-1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
if (power_scale == float(0) || power == float(1)) {
if (power_scale == 0.f || power == 1.f) {
const T value = dragon_cast<T, float>(power_scale);
math::Set<T, Context>(count, value, dXdata);
math::Set<T, Context>(count, value, dXdata, ctx());
} else {
auto* Xdata = Input(0).template data<T, Context>();
if (power == float(2)) {
if (power == 2.f) {
math::Axpby<T, Context>(count,
power_scale * scale, Xdata,
0, dXdata, &ctx());
if (shift != float(0))
math::AddScalar<T, Context>(count, power_scale * shift, dXdata);
} else if (shift == float(0)) {
0, dXdata, ctx());
if (shift != 0.f)
math::AddScalar<T, Context>(count,
power_scale * shift, dXdata, ctx());
} else if (shift == 0.f) {
auto* Ydata = Input(1).template data<T, Context>();
math::Div<T, Context>(count, Ydata, Xdata, dXdata);
math::Scal<T, Context>(count, power, dXdata, &ctx());
math::Div<T, Context>(count, Ydata, Xdata, dXdata, ctx());
math::Scal<T, Context>(count, power, dXdata, ctx());
} else {
auto* Ydata = Input(1).template data<T, Context>();
ctx().template Copy<T, Context, Context>(count, dXdata, Xdata);
if (scale != float(1))
math::Scal<T, Context>(count, scale, dXdata, &ctx());
if (shift != float(0))
math::AddScalar<T, Context>(count, shift, dXdata);
math::Div<T, Context>(count, Ydata, dXdata, dXdata);
if (power_scale != float(1))
math::Scal<T, Context>(count, power_scale, dXdata, &ctx());
ctx()->template Copy<T, Context, Context>(count, dXdata, Xdata);
if (scale != 1.f)
math::Scal<T, Context>(count, scale, dXdata, ctx());
if (shift != 0.f)
math::AddScalar<T, Context>(count, shift, dXdata, ctx());
math::Div<T, Context>(count, Ydata, dXdata, dXdata, ctx());
if (power_scale != 1.f)
math::Scal<T, Context>(count, power_scale, dXdata, ctx());
}
}
if (power_scale != float(0))
math::Mul<T, Context>(count, dYdata, dXdata, dXdata);
if (power_scale != 0.f)
math::Mul<T, Context>(count, dYdata, dXdata, dXdata, ctx());
}
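// A worked form of the chain rule the branches above implement: with
//   y = (scale * x + shift)^power  and  power_scale = power * scale,
//   dy/dx = power_scale * (scale * x + shift)^(power - 1)
//         = power_scale * y / (scale * x + shift),
// which is why the general branch rebuilds the inner term and divides y
// by it before the final multiplication with dY.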
template <class Context>
......
......@@ -9,7 +9,7 @@ void RAddOp<Context>::EltwiseRunWithType() {
auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
math::Add<T, Context>(Output(0)->count(), x1, x2, y);
math::Add<T, Context>(Output(0)->count(), x1, x2, y, ctx());
}
template <class Context> template <typename T>
......@@ -19,23 +19,24 @@ void RAddOp<Context>::BroadcastRunWithType(int type) {
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), y, x2);
if (type == 0 || type == 1) {
if (type == 0) {
outer_dim = Input(1).count();
inner_dim = 1;
x1 = Input(0).template data<T, CPUContext>();
math::AddScalar<T, Context>(Output(0)->count(),
dragon_cast<float, T>(x1[0]), y, ctx());
} else {
outer_dim = Input(1).count(0, Input(1).axis(-1));
inner_dim = Input(1).dim(-1);
}
DECLARE_MULTIPLIER(multiplier, outer_dim);
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, multiplier, x1,
1.0, y, &ctx());
1.0, y, ctx());
}
} else if (type == 2) {
outer_dim = Input(1).dim(0);
inner_dim = Input(1).count(1);
......@@ -44,7 +45,7 @@ void RAddOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, x1, multiplier,
1.0, y, &ctx());
1.0, y, ctx());
}
}
......@@ -77,13 +78,13 @@ void RAddGradientOp<Context>::EltwiseRunWithType() {
if (Output(1)->name() != "ignore") {
auto* dx2 = Output(1)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(1)->count(), dx2, dy);
}
if (Output(0)->name() != "ignore") {
auto* dx1 = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), dx1, dy);
}
}
......@@ -108,7 +109,7 @@ void RAddGradientOp<Context>::BroadcastRunWithType(int type) {
math::Gemv<T, Context>(
CblasTrans, outer_dim, inner_dim,
1.0, dy, multiplier,
0.0, dx1, &ctx());
0.0, dx1, ctx());
} else if (type == 2) {
outer_dim = X2->dim(0);
inner_dim = X2->count(1);
......@@ -116,13 +117,13 @@ void RAddGradientOp<Context>::BroadcastRunWithType(int type) {
math::Gemv<T, Context>(
CblasNoTrans, outer_dim, inner_dim,
1.0, dy, multiplier,
0.0, dx1, &ctx());
0.0, dx1, ctx());
}
}
if (Output(1)->name() != "ignore") {
auto* dx2 = Output(1)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
X2->count(), dx2, dy);
}
}
......
......@@ -9,7 +9,7 @@ void RDivOp<Context>::EltwiseRunWithType() {
auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
math::Div<T, Context>(Output(0)->count(), x1, x2, y);
math::Div<T, Context>(Output(0)->count(), x1, x2, y, ctx());
}
template <class Context> template <typename T>
......@@ -34,8 +34,8 @@ void RDivOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, multiplier, x1,
0.0, c, &ctx());
math::Div<T, Context>(Output(0)->count(), c, x2, y);
0.0, c, ctx());
math::Div<T, Context>(Output(0)->count(), c, x2, y, ctx());
} else if (type == 2) {
outer_dim = Input(1).dim(0);
inner_dim = Input(1).count(1);
......@@ -44,8 +44,8 @@ void RDivOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, x1, multiplier,
0.0, c, &ctx());
math::Div<T, Context>(Output(0)->count(), c, x2, y);
0.0, c, ctx());
math::Div<T, Context>(Output(0)->count(), c, x2, y, ctx());
}
}
......@@ -82,16 +82,16 @@ void RDivGradientOp<Context>::EltwiseRunWithType() {
auto* x2 = Input(1).template data<T, Context>();
auto* dx2 = Output(1)->template mutable_data<T, Context>();
auto* c = ws()->template caches<T, Context>({ X1->count() })[0];
math::Mul<T, Context>(X1->count(), dy, x1, c); // dY * X1
math::Square<T, Context>(X2->count(), x2, dx2); // X2^{2}
math::Inv<T, Context>(X2->count(), -1, dx2, dx2); // -1 / X2^{2}
math::Mul<T, Context>(X2->count(), c, dx2, dx2);
math::Mul<T, Context>(X1->count(), dy, x1, c, ctx()); // dY * X1
math::Square<T, Context>(X2->count(), x2, dx2, ctx()); // X2^{2}
math::Inv<T, Context>(X2->count(), -1, dx2, dx2, ctx()); // -1 / X2^{2}
math::Mul<T, Context>(X2->count(), c, dx2, dx2, ctx());
}
if (Output(0)->name() != "ignore") {
auto* x2 = Input(1).template data<T, Context>();
auto* dx1 = Output(0)->template mutable_data<T, Context>();
math::Div<T, Context>(X1->count(), dy, x2, dx1);
math::Div<T, Context>(X1->count(), dy, x2, dx1, ctx());
}
}
......@@ -116,19 +116,19 @@ void RDivGradientOp<Context>::BroadcastRunWithType(int type) {
auto* x2 = Input(1).template data<T, Context>();
auto* dx1 = Output(0)->template mutable_data<T, Context>();
auto* c = ws()->template caches<T, Context>({ X2->count() })[0];
math::Div<T, Context>(X2->count(), dy, x2, c);
math::Div<T, Context>(X2->count(), dy, x2, c, ctx());
if (type == 0 || type == 1) {
DECLARE_MULTIPLIER(multiplier, outer_dim);
math::Gemv<T, Context>(
CblasTrans, outer_dim, inner_dim,
1.0, c, multiplier,
0.0, dx1, &ctx());
0.0, dx1, ctx());
} else if (type == 2) {
DECLARE_MULTIPLIER(multiplier, inner_dim);
math::Gemv<T, Context>(
CblasNoTrans, outer_dim, inner_dim,
1.0, c, multiplier,
0.0, dx1, &ctx());
0.0, dx1, ctx());
}
}
......@@ -142,18 +142,18 @@ void RDivGradientOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
-1.0, multiplier, x1,
0.0, dx2, &ctx());
0.0, dx2, ctx());
} else if (type == 2) {
DECLARE_MULTIPLIER(multiplier, inner_dim);
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
-1.0, x1, multiplier,
0.0, dx2, &ctx());
0.0, dx2, ctx());
}
math::Mul<T, Context>(X2->count(), dy, dx2, dx2);
math::Div<T, Context>(X2->count(), dx2, x2, dx2);
math::Div<T, Context>(X2->count(), dx2, x2, dx2);
math::Mul<T, Context>(X2->count(), dy, dx2, dx2, ctx());
math::Div<T, Context>(X2->count(), dx2, x2, dx2, ctx());
math::Div<T, Context>(X2->count(), dx2, x2, dx2, ctx());
}
}
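// The two successive Div calls realize dx2 = -dy * x1 / x2^2 without a
// separate Square buffer: the Gemm broadcasts -x1, Mul folds in dy, and
// dividing by x2 twice supplies the squared denominator.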
......
......@@ -9,7 +9,7 @@ void RMulOp<Context>::EltwiseRunWithType() {
auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), x1, x2, y);
math::Mul<T, Context>(Output(0)->count(), x1, x2, y, ctx());
}
template <class Context> template <typename T>
......@@ -18,34 +18,39 @@ void RMulOp<Context>::BroadcastRunWithType(int type) {
auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
auto* c = ws()->template caches<T, Context>({
Output(0)->count() })[0];
if (type == 0 || type == 1) {
if (type == 0) {
outer_dim = Input(1).count();
inner_dim = 1;
} else {
x1 = Input(0).template data<T, CPUContext>();
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), y, x2);
math::MulScalar<T, Context>(Output(0)->count(),
dragon_cast<float, T>(x1[0]), y, ctx());
} else if (type == 1) {
outer_dim = Input(1).count(0, Input(1).axis(-1));
inner_dim = Input(1).dim(-1);
}
DECLARE_MULTIPLIER(multiplier, outer_dim);
auto* c = ws()->template caches<T, Context>(
{ Output(0)->count() })[0];
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, multiplier, x1,
0.0, c, &ctx());
math::Mul<T, Context>(Output(0)->count(), c, x2, y);
0.0, c, ctx());
math::Mul<T, Context>(
Output(0)->count(), c, x2, y, ctx());
} else if (type == 2) {
outer_dim = Input(1).dim(0);
inner_dim = Input(1).count(1);
DECLARE_MULTIPLIER(multiplier, inner_dim);
auto* c = ws()->template caches<T, Context>(
{ Output(0)->count() })[0];
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, x1, multiplier,
0.0, c, &ctx());
math::Mul<T, Context>(Output(0)->count(), c, x2, y);
0.0, c, ctx());
math::Mul<T, Context>(
Output(0)->count(), c, x2, y, ctx());
}
}
......@@ -79,13 +84,13 @@ void RMulGradientOp<Context>::EltwiseRunWithType() {
if (Output(1)->name() != "ignore") {
auto* x1 = Input(0).template data<T, Context>();
auto* dx2 = Output(1)->template mutable_data<T, Context>();
math::Mul<T, Context>(Output(1)->count(), dy, x1, dx2);
math::Mul<T, Context>(Output(1)->count(), dy, x1, dx2, ctx());
}
if (Output(0)->name() != "ignore") {
auto* x2 = Input(1).template data<T, Context>();
auto* dx1 = Output(0)->template mutable_data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), dy, x2, dx1);
math::Mul<T, Context>(Output(0)->count(), dy, x2, dx1, ctx());
}
}
......@@ -110,19 +115,19 @@ void RMulGradientOp<Context>::BroadcastRunWithType(int type) {
auto* x2 = Input(1).template data<T, Context>();
auto* dx1 = Output(0)->template mutable_data<T, Context>();
auto* c = ws()->template caches<T, Context>({ X2->count() })[0];
math::Mul<T, Context>(X2->count(), dy, x2, c);
math::Mul<T, Context>(X2->count(), dy, x2, c, ctx());
if (type == 0 || type == 1) {
DECLARE_MULTIPLIER(multiplier, outer_dim);
math::Gemv<T, Context>(
CblasTrans, outer_dim, inner_dim,
1.0, c, multiplier,
0.0, dx1, &ctx());
0.0, dx1, ctx());
} else if (type == 2) {
DECLARE_MULTIPLIER(multiplier, inner_dim);
math::Gemv<T, Context>(
CblasNoTrans, outer_dim, inner_dim,
1.0, c, multiplier,
0.0, dx1, &ctx());
0.0, dx1, ctx());
}
}
......@@ -135,16 +140,16 @@ void RMulGradientOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, multiplier, x1,
0.0, dx2, &ctx());
0.0, dx2, ctx());
} else if (type == 2) {
DECLARE_MULTIPLIER(multiplier, inner_dim);
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, x1, multiplier,
0.0, dx2, &ctx());
0.0, dx2, ctx());
}
math::Mul<T, Context>(X2->count(), dy, dx2, dx2);
math::Mul<T, Context>(X2->count(), dy, dx2, dx2, ctx());
}
}
......
......@@ -9,7 +9,7 @@ void RSubOp<Context>::EltwiseRunWithType() {
auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
math::Sub<T, Context>(Output(0)->count(), x1, x2, y);
math::Sub<T, Context>(Output(0)->count(), x1, x2, y, ctx());
}
template <class Context> template <typename T>
......@@ -19,7 +19,7 @@ void RSubOp<Context>::BroadcastRunWithType(int type) {
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), y, x2);
if (type == 0 || type == 1) {
......@@ -35,7 +35,7 @@ void RSubOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, multiplier, x1,
-1.0, y, &ctx());
-1.0, y, ctx());
} else if (type == 2) {
outer_dim = Input(1).dim(0);
inner_dim = Input(1).count(1);
......@@ -44,7 +44,7 @@ void RSubOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, x1, multiplier,
-1.0, y, &ctx());
-1.0, y, ctx());
}
}
......@@ -78,12 +78,12 @@ void RSubGradientOp<Context>::EltwiseRunWithType() {
if (Output(1)->name() != "ignore") {
auto* dx2 = Output(1)->template mutable_data<T, Context>();
math::Scale<T, Context>(
Output(1)->count(), -1, dy, dx2, &ctx());
Output(1)->count(), -1, dy, dx2, ctx());
}
if (Output(0)->name() != "ignore") {
auto* dx1 = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), dx1, dy);
}
}
......@@ -108,7 +108,7 @@ void RSubGradientOp<Context>::BroadcastRunWithType(int type) {
math::Gemv<T, Context>(
CblasTrans, outer_dim, inner_dim,
1.0, dy, multiplier,
0.0, dx1, &ctx());
0.0, dx1, ctx());
} else if (type == 2) {
outer_dim = X2->dim(0);
inner_dim = X2->count(1);
......@@ -116,14 +116,14 @@ void RSubGradientOp<Context>::BroadcastRunWithType(int type) {
math::Gemv<T, Context>(
CblasNoTrans, outer_dim, inner_dim,
1.0, dy, multiplier,
0.0, dx1, &ctx());
0.0, dx1, ctx());
}
}
if (Output(1)->name() != "ignore") {
auto* dx2 = Output(1)->template mutable_data<T, Context>();
math::Scale<T, Context>(
X2->count(), -1, dy, dx2, &ctx());
X2->count(), -1, dy, dx2, ctx());
}
}
......
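For y = x1 - x2, the gradient hunks above reduce to dx1 = dy and dx2 = -dy, i.e. a stream-aware Copy plus a Scale by -1; a one-element sanity check:

int main() {
    float dy = 2.5f;
    float dx1 = dy;           // ctx()->Copy(count, dx1, dy)
    float dx2 = -1.f * dy;    // math::Scale(count, -1, dy, dx2, ctx())
    return (dx1 == 2.5f && dx2 == -2.5f) ? 0 : 1;
}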
......@@ -7,7 +7,7 @@ template <class Context> template <typename T>
void SquareOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Pow<T, Context>(Output(0)->count(), 2.0, Xdata, Ydata);
math::Pow<T, Context>(Output(0)->count(), 2.0, Xdata, Ydata, ctx());
}
template <class Context>
......@@ -29,8 +29,8 @@ void SquareGradientOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* dYdata = Input(-1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), dYdata, Xdata, dXdata);
math::Scal<T, Context>(Output(0)->count(), 2.0, dXdata, &ctx());
math::Mul<T, Context>(Output(0)->count(), dYdata, Xdata, dXdata, ctx());
math::Scal<T, Context>(Output(0)->count(), 2.0, dXdata, ctx());
}
template <class Context>
......
......@@ -9,7 +9,8 @@ void SubOp<Context>::EltwiseRunWithType() {
auto* X1data = Input(0).template data<T, Context>();
auto* X2data = Input(1).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Sub<T, Context>(Output(0)->count(), X1data, X2data, Ydata);
math::Sub<T, Context>(Output(0)->count(),
X1data, X2data, Ydata, ctx());
}
template <class Context> template <typename T>
......@@ -19,23 +20,24 @@ void SubOp<Context>::BroadcastRunWithType(int type) {
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), y, x1);
if (type == 0 || type == 1) {
if (type == 0) {
outer_dim = Input(0).count();
inner_dim = 1;
x2 = Input(1).template data<T, CPUContext>();
math::AddScalar<T, Context>(Output(0)->count(),
-dragon_cast<float, T>(x2[0]), y, ctx());
} else {
outer_dim = Input(0).count(0, Input(0).axis(-1));
inner_dim = Input(0).dim(-1);
}
DECLARE_MULTIPLIER(multiplier, outer_dim);
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
-1.0, multiplier, x2,
1.0, y, &ctx());
1.0, y, ctx());
}
}
else if (type == 2) {
outer_dim = Input(0).dim(0);
......@@ -45,7 +47,7 @@ void SubOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
-1.0, x2, multiplier,
1.0, y, &ctx());
1.0, y, ctx());
}
}
......@@ -79,12 +81,12 @@ void SubGradientOp<Context>::EltwiseRunWithType() {
if (Output(1)->name() != "ignore") {
auto* dx2 = Output(1)->template mutable_data<T, Context>();
math::Scale<T, Context>(Output(1)->count(),
-1.0, dy, dx2, &ctx());
-1.0, dy, dx2, ctx());
}
if (Output(0)->name() != "ignore") {
auto* dx1 = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), dx1, dy);
}
}
......@@ -109,7 +111,7 @@ void SubGradientOp<Context>::BroadcastRunWithType(int type) {
math::Gemv<T, Context>(
CblasTrans, outer_dim, inner_dim,
-1.0, dy, multiplier,
0.0, dx2, &ctx());
0.0, dx2, ctx());
} else if (type == 2) {
outer_dim = X1->dim(0);
inner_dim = X1->count(1);
......@@ -117,13 +119,13 @@ void SubGradientOp<Context>::BroadcastRunWithType(int type) {
math::Gemv<T, Context>(
CblasNoTrans, outer_dim, inner_dim,
-1.0, dy, multiplier,
0.0, dx2, &ctx());
0.0, dx2, ctx());
}
}
if (Output(0)->name() != "ignore") {
auto* dx1 = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
X1->count(), dx1, dy);
}
}
......
......@@ -8,7 +8,8 @@ void CompareOp<Context>::EqualRunWithType() {
auto* X1data = Input(0).template data<T, Context>();
auto* X2data = Input(1).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::Equal<T, Context>(Output(0)->count(), X1data, X2data, Ydata);
kernel::Equal<T, Context>(Output(0)->count(),
X1data, X2data, Ydata, ctx());
}
template <class Context>
......
......@@ -7,7 +7,7 @@ void CopyOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), Ydata, Xdata);
}
......
......@@ -20,10 +20,10 @@ void CTCLossGradientOp<Context>::RunWithType() {
auto* dXdata = Output(0)->template mutable_data<T, Context>();
auto* dYdata = Input(-1).template data<T, Context>();
T dYdata_host; ctx().template Copy<T, CPUContext, Context>(
T dYdata_host; ctx()->template Copy<T, CPUContext, Context>(
1, &dYdata_host, dYdata);
math::Scale<T, Context>(Output(0)->count(),
dYdata_host, Gdata, dXdata, &ctx());
dYdata_host, Gdata, dXdata, ctx());
}
template <class Context>
......
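Fetching dYdata_host is a one-scalar device-to-host copy; once ops can land on non-default streams, the host must not read that scalar until the stream drains, which is what the FinishDeviceCompution() calls added below guard against. A bare CUDA runtime sketch of the hazard, using plain cudaMemcpyAsync rather than Dragon's Copy:

#include <cuda_runtime.h>
#include <cstdio>

int main() {
    cudaStream_t stream;
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    float host = 0.f, one = 1.f, *dev = nullptr;
    cudaMalloc(&dev, sizeof(float));
    cudaMemcpyAsync(dev, &one, sizeof(float), cudaMemcpyHostToDevice, stream);
    cudaMemcpyAsync(&host, dev, sizeof(float), cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);  // reading `host` earlier races with the copy
    printf("%f\n", host);           // 1.000000
    cudaFree(dev);
    cudaStreamDestroy(stream);
    return 0;
}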
......@@ -45,7 +45,7 @@ void CuDNNCTCLossOp<Context>::RunWithType() {
cudnnSetTensorDesc<T>(&grad_desc, Input(0).dims());
CUDNN_CHECK(cudnnGetCTCLossWorkspaceSize(
ctx().cudnn_handle(), prob_desc, grad_desc,
ctx()->cudnn_handle(), prob_desc, grad_desc,
packed_labels.data(), label_lengths.data(),
input_lengths.data(),
ctc_algo, ctc_desc, &workspace_size));
......@@ -58,7 +58,7 @@ void CuDNNCTCLossOp<Context>::RunWithType() {
auto* WSdata = (uint8_t*)ws()->template caches<Context>({
workspace_size })[0];
CUDNN_CHECK(cudnnCTCLoss(ctx().cudnn_handle(),
CUDNN_CHECK(cudnnCTCLoss(ctx()->cudnn_handle(),
prob_desc, Pdata, packed_labels.data(),
label_lengths.data(), input_lengths.data(),
Ydata, grad_desc, Gdata,
......
......@@ -12,11 +12,13 @@ void L1LossOp<Context>::RunWithType() {
auto* diff_data = diff->template mutable_data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Sub<T, Context>(Input(0).count(), X0data, X1data, diff_data);
math::Sub<T, Context>(Input(0).count(),
X0data, X1data, diff_data, ctx());
if (InputSize() > 2) {
CHECK_EQ(Input(0).count(), Input(2).count());
auto* Wdata = Input(2).template data<T, Context>();
math::Mul<T, Context>(diff->count(), Wdata, diff_data, diff_data);
math::Mul<T, Context>(diff->count(),
Wdata, diff_data, diff_data, ctx());
}
T normalizer = 1;
......@@ -27,11 +29,13 @@ void L1LossOp<Context>::RunWithType() {
}
T loss = math::ASum<T, Context>(diff->count(), diff_data);
math::Set<T, Context>(1, loss / normalizer, Ydata);
math::Set<T, Context>(1, loss / normalizer, Ydata, ctx());
}
template <class Context>
void L1LossOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
CHECK_EQ(Input(0).count(), Input(1).count());
Output(0)->Reshape({ 1 });
diff = ws()->CreateTensor("/mnt/" + anchor() + "/l1_loss/diff");
......@@ -51,9 +55,11 @@ template <class Context> template <typename T>
void L1LossGradientOp<Context>::RunWithType() {
auto* diff_data = diff->template mutable_data<T, Context>();
auto* dYdata = Input(-1).template data<T, Context>();
T dYdata_host; ctx().template Copy<T, CPUContext, Context>(
T dYdata_host; ctx()->template Copy<T, CPUContext, Context>(
1, &dYdata_host, dYdata);
kernel::AbsGrad<T, Context>(diff->count(), diff_data, diff_data);
ctx()->FinishDeviceCompution();
kernel::AbsGrad<T, Context>(diff->count(),
diff_data, diff_data, ctx());
T alpha = dYdata_host, normalizer = 1;
if (normalization == "BATCH_SIZE") {
......@@ -69,7 +75,7 @@ void L1LossGradientOp<Context>::RunWithType() {
const T sign = (i == 0) ? 1 : -1;
alpha *= sign;
math::Axpby<T, Context>(Output(i)->count(),
alpha, diff_data, 0, dXdata, &ctx());
alpha, diff_data, 0, dXdata, ctx());
}
}
......
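A plain CPU reference for the pieces above — an assumption about the semantics of kernel::AbsGrad and the Axpby scaling, not code lifted from them: the forward pass reduces |x0 - x1| by the normalizer, and the backward pass scales sign(diff) by dY / normalizer with opposite signs for the two inputs.

#include <cmath>

// Illustration-only helper; fills loss, dx0, dx1 for an n-element L1 loss.
void L1LossRef(const float* x0, const float* x1, int n, float normalizer,
               float dy, float* loss, float* dx0, float* dx1) {
    *loss = 0.f;
    for (int i = 0; i < n; ++i) {
        float d = x0[i] - x1[i];
        *loss += std::fabs(d);
        float s = (d > 0.f) - (d < 0.f);  // sign(d), 0 at d == 0
        dx0[i] =  s * dy / normalizer;
        dx1[i] = -s * dy / normalizer;
    }
    *loss /= normalizer;
}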
......@@ -9,12 +9,14 @@ void L2LossOp<Context>::RunWithType() {
auto* X0data = Input(0).template data<T, Context>();
auto* X1data = Input(1).template data<T, Context>();
auto* diff_data = diff->template mutable_data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Sub<T, Context>(diff->count(), X0data, X1data, diff_data);
auto* Ydata = Output(0)->template mutable_data<float, Context>();
math::Sub<T, Context>(diff->count(),
X0data, X1data, diff_data, ctx());
if (InputSize() > 2) {
CHECK_EQ(Input(0).count(), Input(2).count());
auto* Wdata = Input(2).template data<T, Context>();
math::Mul<T, Context>(diff->count(), Wdata, diff_data, diff_data);
math::Mul<T, Context>(diff->count(),
Wdata, diff_data, diff_data, ctx());
}
T normalizer = 1;
......@@ -23,10 +25,12 @@ void L2LossOp<Context>::RunWithType() {
} else if (normalization == "FULL") {
normalizer = Input(0).count();
}
normalizer *= 2;
T loss = T(0.5) * math::Dot<T, Context>(diff->count(),
diff_data, diff_data, &ctx());
math::Set<T, Context>(1, loss / normalizer, Ydata);
T loss;
math::Dot<T, Context>(diff->count(),
diff_data, diff_data, &loss, ctx());
math::Set<T, Context>(1, loss / normalizer, Ydata, ctx());
}
template <class Context>
......@@ -48,10 +52,11 @@ OPERATOR_SCHEMA(L2Loss).NumInputs(2, 3).NumOutputs(1);
template <class Context> template <typename T>
void L2LossGradientOp<Context>::RunWithType() {
auto* diff_data = diff->template mutable_data<T, Context>();
auto* diff_data = diff->template data<T, Context>();
auto* dYdata = Input(-1).template data<T, Context>();
T dYdata_host; ctx().template Copy<T, CPUContext, Context>(
T dYdata_host; ctx()->template Copy<T, CPUContext, Context>(
1, &dYdata_host, dYdata);
ctx()->FinishDeviceCompution();
T alpha = dYdata_host, normalizer = 1;
if (normalization == "BATCH_SIZE") {
......@@ -67,7 +72,7 @@ void L2LossGradientOp<Context>::RunWithType() {
const T sign = (i == 0) ? 1 : -1;
alpha *= sign;
math::Axpby<T, Context>(Output(i)->count(),
alpha, diff_data, 0, dXdata, &ctx());
alpha, diff_data, 0, dXdata, ctx());
}
}
......
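The forward hunk folds the old 0.5 factor into the divisor: 0.5 * dot(d, d) / N equals dot(d, d) / (2N), hence the new `normalizer *= 2` before the stream-aware math::Dot. A scalar check:

#include <cassert>

int main() {
    float d[2] = { 3.f, 4.f };
    float dot = d[0] * d[0] + d[1] * d[1];      // 25
    float n = 2.f;                              // e.g. BATCH_SIZE
    assert(0.5f * dot / n == dot / (2.f * n));  // 6.25 either way
    return 0;
}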
......@@ -13,11 +13,11 @@ void SigmoidCrossEntropyOp<Context>::RunWithType() {
auto* Fdata = flags.template mutable_data<T, Context>();
kernel::SigmoidCrossEntropy<T, Context>(
Input(0).count(), Xdata, Tdata, Ldata, Fdata, &ctx());
Input(0).count(), Xdata, Tdata, Ldata, Fdata, ctx());
if (normalization == "UNIT") {
Output(0)->ReshapeLike(losses);
Output(0)->template CopyFrom<Context>(losses);
Output(0)->template CopyFrom<Context>(losses, ctx());
return;
}
......@@ -35,11 +35,13 @@ void SigmoidCrossEntropyOp<Context>::RunWithType() {
T loss = math::ASum<T, Context>(losses.count(), Ldata);
Output(0)->Reshape({ 1 });
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Set<T, Context>(1, loss / normalizer, Ydata);
math::Set<T, Context>(1, loss / normalizer, Ydata, ctx());
}
template <class Context>
void SigmoidCrossEntropyOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
CHECK_EQ(Input(0).count(), Input(1).count())
<< "\nNumber of predictions must match the number of labels.";
losses.ReshapeLike(Input(0));
......@@ -63,12 +65,12 @@ void SigmoidCrossEntropyGradientOp<Context>::RunWithType() {
auto* Fdata = flags.template mutable_data<T, Context>();
kernel::SigmoidCrossEntropyGrad<T, Context>(
Input(0).count(), Xdata, Tdata, dXdata, Fdata, &ctx());
Input(0).count(), Xdata, Tdata, dXdata, Fdata, ctx());
if (normalization == "UNIT") {
auto* dYdata = Input(-1).template data<T, Context>();
math::Mul<T, Context>(Output(0)->count(),
dYdata, dXdata, dXdata); return;
dYdata, dXdata, dXdata, ctx()); return;
}
T normalizer = 1;
......@@ -83,14 +85,16 @@ void SigmoidCrossEntropyGradientOp<Context>::RunWithType() {
}
auto* dYdata = Input(-1).template data<T, Context>();
T dYdata_host; ctx().template Copy<T, CPUContext, Context>(
T dYdata_host; ctx()->template Copy<T, CPUContext, Context>(
1, &dYdata_host, dYdata);
math::Scal<T, Context>(Output(0)->count(),
dYdata_host / normalizer, dXdata, &ctx());
dYdata_host / normalizer, dXdata, ctx());
}
template <class Context>
void SigmoidCrossEntropyGradientOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
Output(0)->ReshapeLike(Input(0));
flags.ReshapeLike(Input(0));
......
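One reading of the ctx()->set_stream_id(0) lines added in these loss ops (an inference from the commit, not an author note): they all call math::ASum, and cuBLAS reductions that return their result to a host pointer block until the value is ready, so pinning the op to the default stream keeps that implicit sync well ordered. A bare cuBLAS sketch of such a blocking reduction:

#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cstdio>

int main() {
    cublasHandle_t handle;
    cublasCreate(&handle);  // uses the default stream until cublasSetStream()
    float h[3] = { 1.f, -2.f, 3.f }, *d = nullptr, asum = 0.f;
    cudaMalloc(&d, sizeof(h));
    cudaMemcpy(d, h, sizeof(h), cudaMemcpyHostToDevice);
    cublasSasum(handle, 3, d, 1, &asum);  // host result => blocks until done
    printf("asum = %f\n", asum);          // 6.000000
    cudaFree(d);
    cublasDestroy(handle);
    return 0;
}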
......@@ -15,11 +15,11 @@ void SigmoidFocalLossOp<Context>::RunWithType() {
kernel::SigmoidFocalLoss<T, Context>(
outer_dim, axis_dim, inner_dim,
pos_alpha, neg_alpha, gamma, neg_id,
Xdata, Tdata, Ldata, Fdata, &ctx());
Xdata, Tdata, Ldata, Fdata, ctx());
if (normalization == "UNIT") {
Output(0)->ReshapeLike(losses);
Output(0)->template CopyFrom<Context>(losses);
Output(0)->template CopyFrom<Context>(losses, ctx());
return;
}
......@@ -37,11 +37,13 @@ void SigmoidFocalLossOp<Context>::RunWithType() {
T loss = math::ASum<T, Context>(losses.count(), Ldata);
Output(0)->Reshape({ 1 });
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Set<T, Context>(1, loss / normalizer, Ydata);
math::Set<T, Context>(1, loss / normalizer, Ydata, ctx());
}
template <class Context>
void SigmoidFocalLossOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
outer_dim = Input(0).count(0, axis);
axis_dim = Input(0).dim(axis);
inner_dim = Input(0).count(axis + 1);
......@@ -71,12 +73,12 @@ void SigmoidFocalLossGradientOp<Context>::RunWithType() {
kernel::SigmoidFocalLossGradient<T, Context>(
outer_dim, axis_dim, inner_dim,
pos_alpha, neg_alpha, gamma, neg_id,
Xdata, Tdata, dXdata, Fdata, &ctx());
Xdata, Tdata, dXdata, Fdata, ctx());
if (normalization == "UNIT") {
auto* dYdata = Input(-1).template data<T, Context>();
math::Mul<T, Context>(Output(0)->count(),
dYdata, dXdata, dXdata); return;
dYdata, dXdata, dXdata, ctx()); return;
}
T normalizer = 1;
......@@ -91,14 +93,16 @@ void SigmoidFocalLossGradientOp<Context>::RunWithType() {
}
auto* dYdata = Input(-1).template data<T, Context>();
T dYdata_host; ctx().template Copy<T, CPUContext, Context>(
T dYdata_host; ctx()->template Copy<T, CPUContext, Context>(
1, &dYdata_host, dYdata);
math::Scal<T, Context>(Output(0)->count(),
dYdata_host / normalizer, dXdata, &ctx());
dYdata_host / normalizer, dXdata, ctx());
}
template <class Context>
void SigmoidFocalLossGradientOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
outer_dim = Input(0).count(0, axis);
axis_dim = Input(0).dim(axis);
inner_dim = Input(0).count(axis + 1);
......
......@@ -11,20 +11,21 @@ void SmoothL1LossOp<Context>::RunWithType() {
auto* X1data = Input(1).template data<T, Context>();
auto* diff_data = diff->template mutable_data<T, Context>();
auto* error_data = error->template mutable_data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<float, Context>();
math::Sub<T, Context>(diff->count(), X0data, X1data, diff_data);
math::Sub<T, Context>(diff->count(),
X0data, X1data, diff_data, ctx());
if (InputSize() > 2) {
auto* inside_w_data = Input(2).template data<T, Context>();
math::Mul<T, Context>(diff->count(),
inside_w_data, diff_data, diff_data);
inside_w_data, diff_data, diff_data, ctx());
}
kernel::SmoothL1<T, Context>(
diff->count(), beta, diff_data, error_data);
kernel::SmoothL1<T, Context>(diff->count(),
beta, diff_data, error_data, ctx());
if (InputSize() > 3) {
auto* outside_w_data = Input(3).template data<T, Context>();
math::Mul<T, Context>(diff->count(),
outside_w_data, error_data, error_data);
outside_w_data, error_data, error_data, ctx());
}
T normalizer = 1;
......@@ -34,12 +35,14 @@ void SmoothL1LossOp<Context>::RunWithType() {
normalizer = Input(0).count();
}
T loss = math::ASum<T, Context>(error->count(), error_data);
math::Set<T, Context>(1, loss / normalizer, Ydata);
float loss = math::ASum<float, Context>(error->count(), error_data);
math::Set<float, Context>(1, loss / normalizer, Ydata, ctx());
}
template <class Context>
void SmoothL1LossOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
CHECK(Input(0).dims() == Input(1).dims());
if (InputSize() > 2) CHECK(Input(0).dims() == Input(2).dims());
if (InputSize() > 3) CHECK(Input(0).dims() == Input(3).dims());
......@@ -64,10 +67,12 @@ template <class Context> template <typename T>
void SmoothL1LossGradientOp<Context>::RunWithType() {
auto* diff_data = diff->template mutable_data<T, Context>();
auto* dYdata = Input(-1).template data<T, Context>();
T dYdata_host; ctx().template Copy<T, CPUContext, Context>(
T dYdata_host; ctx()->template Copy<T, CPUContext, Context>(
1, &dYdata_host, dYdata);
kernel::SmoothL1Grad<T, Context>(
diff->count(), beta, diff_data, diff_data);
ctx()->FinishDeviceCompution();
kernel::SmoothL1Grad<T, Context>(diff->count(),
beta, diff_data, diff_data, ctx());
T alpha = dYdata_host, normalizer = 1;
if (normalization == "BATCH_SIZE") {
......@@ -83,16 +88,16 @@ void SmoothL1LossGradientOp<Context>::RunWithType() {
const T sign = (i == 0) ? 1 : -1;
alpha *= sign;
math::Axpby<T, Context>(Output(i)->count(),
alpha, diff_data, 0, dXdata, &ctx());
alpha, diff_data, 0, dXdata, ctx());
if (InputSize() > 3) {
auto* inside_w_data = Input(2).template data<T, Context>();
math::Mul<T, Context>(Output(i)->count(),
inside_w_data, dXdata, dXdata);
inside_w_data, dXdata, dXdata, ctx());
}
if (InputSize() > 4) {
auto* outside_w_data = Input(3).template data<T, Context>();
math::Mul<T, Context>(Output(i)->count(),
outside_w_data, dXdata, dXdata);
outside_w_data, dXdata, dXdata, ctx());
}
}
}
......
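For reference, the usual Huber-style definition that kernel::SmoothL1 / kernel::SmoothL1Grad appear to implement above — an assumption about their semantics, with illustration-only helper names:

#include <cmath>

float SmoothL1(float d, float beta) {      // 0.5*d^2/beta inside, |d|-0.5*beta outside
    return std::fabs(d) < beta ? 0.5f * d * d / beta
                               : std::fabs(d) - 0.5f * beta;
}

float SmoothL1Grad(float d, float beta) {  // d/beta inside, sign(d) outside
    return std::fabs(d) < beta ? d / beta
                               : float((d > 0.f) - (d < 0.f));
}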
......@@ -26,15 +26,15 @@ void SoftmaxCrossEntropyOp<Context>::RunWithType() {
auto* Pdata = prob->template data<T, Context>();
auto* Tdata = Input(1).template data<T, Context>();
auto* Ldata = losses.template mutable_data<T, Context>();
kernel::SoftmaxCrossEntropy<T, Context>(
Input(0).count(), Pdata, Tdata, Ldata);
kernel::SoftmaxCrossEntropy<T, Context>(Input(0).count(),
Pdata, Tdata, Ldata, ctx());
if (normalization == "UNIT") {
Output(0)->Reshape({ outer_dim * inner_dim });
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::Sum<T, Context>(outer_dim * inner_dim,
Input(0).dim(axis), inner_dim,
Ldata, Ydata); return;
Ldata, Ydata, ctx()); return;
}
T normalizer = 1;
......@@ -47,11 +47,13 @@ void SoftmaxCrossEntropyOp<Context>::RunWithType() {
T loss = math::ASum<T, Context>(losses.count(), Ldata);
Output(0)->Reshape({ 1 });
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Set<T, Context>(1, loss / normalizer, Ydata);
math::Set<T, Context>(1, loss / normalizer, Ydata, ctx());
}
template <class Context>
void SoftmaxCrossEntropyOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
outer_dim = Input(0).count(0, axis);
inner_dim = Input(0).count(axis + 1);
CHECK_EQ(Input(0).count(), Input(1).count())
......@@ -76,16 +78,16 @@ void SoftmaxCrossEntropyGradientOp<Context>::RunWithType() {
auto* Tdata = Input(1).template data<T, Context>();
auto* Pdata = prob->template mutable_data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(prob->count(), dXdata, Pdata);
ctx()->template Copy<T, Context, Context>(prob->count(), dXdata, Pdata);
math::Axpy<T, Context>(Output(0)->count(),
-1.0, Tdata, dXdata, &ctx());
-1.0, Tdata, dXdata, ctx());
if (normalization == "UNIT") {
auto* dYdata = Input(-1).template data<T, Context>();
kernel::SumGrad<T, Context>(outer_dim * inner_dim,
Input(0).dim(axis), inner_dim, 1.0, dYdata, Pdata);
Input(0).dim(axis), inner_dim, 1.0, dYdata, Pdata, ctx());
math::Mul<T, Context>(Output(0)->count(),
Pdata, dXdata, dXdata); return;
Pdata, dXdata, dXdata, ctx()); return;
}
T normalizer = 1;
......@@ -96,10 +98,10 @@ void SoftmaxCrossEntropyGradientOp<Context>::RunWithType() {
}
auto* dYdata = Input(-1).template data<T, Context>();
T dYdata_host; ctx().template Copy<T, CPUContext, Context>(
T dYdata_host; ctx()->template Copy<T, CPUContext, Context>(
1, &dYdata_host, dYdata);
math::Scal<T, Context>(Output(0)->count(),
dYdata_host / normalizer, dXdata, &ctx());
dYdata_host / normalizer, dXdata, ctx());
}
template <class Context>
......
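The gradient body above is the classic softmax cross-entropy identity dX = P - T, realized as Copy(dX <- P) followed by Axpy(-1, T, dX); a three-class check:

#include <cassert>

int main() {
    float P[3] = { 0.7f, 0.2f, 0.1f };  // softmax probabilities
    float T[3] = { 1.f, 0.f, 0.f };     // one-hot target
    float dX[3];
    for (int i = 0; i < 3; ++i) dX[i] = P[i];          // Copy
    for (int i = 0; i < 3; ++i) dX[i] += -1.f * T[i];  // Axpy(-1, T, dX)
    assert(dX[0] < 0.f && dX[1] > 0.f && dX[2] > 0.f); // negative at true class
    return 0;
}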
......@@ -20,11 +20,11 @@ void SoftmaxFocalLossOp<Context>::RunWithType() {
outer_dim, Input(0).dim(axis), inner_dim,
pos_alpha, neg_alpha, gamma, neg_id,
Pdata, Tdata, Idata, this->ignores.count(),
Ldata, Fdata, &ctx());
Ldata, Fdata, ctx());
if (normalization == "UNIT") {
Output(0)->ReshapeLike(losses);
Output(0)->template CopyFrom<Context>(losses);
Output(0)->template CopyFrom<Context>(losses, ctx());
return;
}
......@@ -42,11 +42,13 @@ void SoftmaxFocalLossOp<Context>::RunWithType() {
T loss = math::ASum<T, Context>(losses.count(), Ldata);
Output(0)->Reshape({ 1 });
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Set<T, Context>(1, loss / normalizer, Ydata);
math::Set<T, Context>(1, loss / normalizer, Ydata, ctx());
}
template <class Context>
void SoftmaxFocalLossOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
outer_dim = Input(0).count(0, axis);
inner_dim = Input(0).count(axis + 1);
CHECK_EQ(outer_dim * inner_dim, Input(1).count())
......@@ -80,16 +82,16 @@ void SoftmaxFocalLossGradientOp<Context>::RunWithType() {
outer_dim, Output(0)->dim(axis), inner_dim,
pos_alpha, neg_alpha, gamma, neg_id,
Pdata, Tdata, Idata, this->ignores.count(),
dXdata, Fdata, &ctx());
dXdata, Fdata, ctx());
if (normalization == "UNIT") {
auto* dYdata = Input(-1).template data<T, Context>();
kernel::SumGrad<T, Context>(
Input(0).count() / Input(0).dim(axis),
Input(0).dim(axis), inner_dim,
1.0, dYdata, Pdata);
1.0, dYdata, Pdata, ctx());
math::Mul<T, Context>(Output(0)->count(),
Pdata, dXdata, dXdata); return;
Pdata, dXdata, dXdata, ctx()); return;
}
T normalizer = 1;
......@@ -104,14 +106,16 @@ void SoftmaxFocalLossGradientOp<Context>::RunWithType() {
}
auto* dYdata = Input(-1).template data<T, Context>();
T dYdata_host; ctx().template Copy<T, CPUContext, Context>(
T dYdata_host; ctx()->template Copy<T, CPUContext, Context>(
1, &dYdata_host, dYdata);
math::Scal<T, Context>(Output(0)->count(),
dYdata_host / normalizer, dXdata, &ctx());
dYdata_host / normalizer, dXdata, ctx());
}
template <class Context>
void SoftmaxFocalLossGradientOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
this->prob = ws()->GetTensor("/mnt/" + anchor() + "/softmax/prob");
outer_dim = this->prob->count(0, axis);
inner_dim = this->prob->count(axis + 1);
......
......@@ -21,83 +21,66 @@ void SparseSoftmaxCrossEntropyOp<Context>::SoftmaxRun() {
softmax_op->Run();
}
template <class Context>
void SparseSoftmaxCrossEntropyOp<Context>::SoftmaxRunFP16() {
Tensor* XF32 = ws()->CreateTensor(
"/mnt/" + anchor() + "/softmax/xf32");
XF32->ReshapeLike(Input(0));
auto* XdataF16 = Input(0).template data<float16, Context>();
auto* XdataF32 = XF32->template mutable_data<float, Context>();
kernel::TypeA2B<float16, float, Context>(
Input(0).count(), XdataF16, XdataF32);
OperatorDef softmax_def = MakeOperatorDef("Softmax", "",
vector<string>({ XF32->name() }),
vector<string>({ "/mnt/" + anchor() + "/softmax/prob" }));
softmax_def.add_arg()->CopyFrom(this->arg("axis"));
if (def().has_device_option())
softmax_def.mutable_device_option()
->CopyFrom(def().device_option());
if (!softmax_op) softmax_op.reset(
CreateOperator(softmax_def, ws()));
else softmax_op->MutableOp(softmax_def);
softmax_op->Run();
}
template <class Context> template <typename Tx, typename Ty>
void SparseSoftmaxCrossEntropyOp<Context>::RunWithType() {
auto* Pdata = prob->template data<Tx, Context>();
auto* Tdata = Input(1).template data<Ty, Context>();
auto* Idata = !ignores.count() ? nullptr :
ignores.template data<int, Context>();
auto* Ldata = losses.template mutable_data<Tx, Context>();
auto* Fdata = flags.template mutable_data<Tx, Context>();
auto* Ldata = losses.template mutable_data<float, Context>();
auto* Fdata = flags.template mutable_data<float, Context>();
kernel::SparseSoftmaxCrossEntropy<Tx, Ty, Context>(
outer_dim, Input(0).dim(axis), inner_dim,
Pdata, Tdata, Idata, ignores.count(),
Ldata, Fdata, &ctx());
Ldata, Fdata, ctx());
if (normalization == "UNIT") {
Output(0)->ReshapeLike(losses);
Output(0)->template CopyFrom<Context>(losses);
Output(0)->template CopyFrom<Context>(losses, ctx());
return;
}
Tx normalizer = 1;
float normalizer = 1;
if (normalization == "VALID") {
normalizer = std::max(
math::ASum<Tx, Context>(
flags.count(), Fdata), (Tx)1.f);
math::ASum<float, Context>(
flags.count(), Fdata), 1.f);
} else if (normalization == "BATCH_SIZE") {
normalizer = Input(0).dim(0);
} else if (normalization == "FULL") {
normalizer = outer_dim * inner_dim;
}
Tx loss = math::ASum<Tx, Context>(losses.count(), Ldata);
float loss = math::ASum<float, Context>(losses.count(), Ldata);
Output(0)->Reshape({ 1 });
auto* Ydata = Output(0)->template mutable_data<Tx, Context>();
math::Set<Tx, Context>(1, loss / normalizer, Ydata);
auto* Ydata = Output(0)->template mutable_data<float, Context>();
math::Set<float, Context>(1, loss / normalizer, Ydata, ctx());
}
template <class Context>
void SparseSoftmaxCrossEntropyOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
outer_dim = Input(0).count(0, axis);
inner_dim = Input(0).count(axis + 1);
CHECK_EQ(outer_dim * inner_dim, Input(1).count())
<< "\nNumber of predictions must match the number of labels.";
losses.Reshape({ outer_dim * inner_dim });
flags.Reshape({ outer_dim * inner_dim });
prob = ws()->CreateTensor("/mnt/" + anchor() + "/softmax/prob");
SoftmaxRun();
if (XIsType(Input(0), float) ||
XIsType(Input(0), float16)) {
if (XIsType(Input(0), float16)) SoftmaxRunFP16();
else SoftmaxRun();
if (XIsType(Input(0), float)) {
if (XIsType(Input(1), float)) RunWithType<float, float>();
else if (XIsType(Input(1), int64_t)) RunWithType<float, int64_t>();
else LOG(FATAL) << DTypeHelper(Input(1), { "float32", "int64" });
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
} else if (XIsType(Input(0), float16)) {
if (XIsType(Input(1), float)) RunWithType<float16, float>();
else if (XIsType(Input(1), int64_t)) RunWithType<float16, int64_t>();
else LOG(FATAL) << DTypeHelper(Input(1), { "float32", "int64" });
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
}
DEPLOY_CPU(SparseSoftmaxCrossEntropy);
......@@ -113,62 +96,66 @@ void SparseSoftmaxCrossEntropyGradientOp<Context>::RunWithType() {
auto* Idata = !ignores.count() ? nullptr :
ignores.template data<int, Context>();
auto* dXdata = Output(0)->template mutable_data<Tx, Context>();
auto* Fdata = flags.template mutable_data<Tx, Context>();
ctx().template Copy<Tx, Context, Context>(
auto* Fdata = flags.template mutable_data<float, Context>();
ctx()->template Copy<Tx, Context, Context>(
prob->count(), dXdata, Pdata);
kernel::SparseSoftmaxCrossEntropyGrad<Tx, Ty, Context>(
outer_dim, Output(0)->dim(axis), inner_dim,
Pdata, Tdata, Idata, ignores.count(),
dXdata, Fdata, &ctx());
dXdata, Fdata, ctx());
if (normalization == "UNIT") {
auto* dYdata = Input(-1).template data<Tx, Context>();
kernel::SumGrad<Tx, Context>(
auto* dYdata = Input(-1).template data<float, Context>();
auto* WSdata = ws()->template caches<float, Context>(
{ Input(0).count() })[0];
kernel::SumGrad<float, Context>(
Input(0).count() / Input(0).dim(axis),
Input(0).dim(axis), inner_dim,
1.0, dYdata, Pdata);
math::Mul<Tx, Context>(
Output(0)->count(), Pdata, dXdata, dXdata);
1.0, dYdata, WSdata, ctx());
kernel::TypeA2B<float, Tx, Context>(
Input(0).count(), WSdata, Pdata, ctx());
math::Mul<Tx, Context>(Output(0)->count(),
Pdata, dXdata, dXdata, ctx());
return;
}
Tx normalizer = 1;
float normalizer = 1;
if (normalization == "VALID") {
normalizer = std::max(
math::ASum<Tx, Context>(
flags.count(), Fdata), (Tx)1.f);
math::ASum<float, Context>(
flags.count(), Fdata), 1.f);
} else if (normalization == "BATCH_SIZE") {
normalizer = Input(0).dim(0);
} else if (normalization == "FULL") {
normalizer = outer_dim * inner_dim;
}
auto* dYdata = Input(-1).template data<Tx, Context>();
Tx dYdata_host; ctx().template Copy<Tx, CPUContext, Context>(
auto* dYdata = Input(-1).template data<float, Context>();
float dYdata_host; ctx()->template Copy<float, CPUContext, Context>(
1, &dYdata_host, dYdata);
math::Scal<Tx, Context>(Output(0)->count(),
dYdata_host / normalizer, dXdata, &ctx());
dYdata_host / normalizer, dXdata, ctx());
}
template <class Context>
void SparseSoftmaxCrossEntropyGradientOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
prob = ws()->GetTensor("/mnt/" + anchor() + "/softmax/prob");
outer_dim = prob->count(0, axis);
inner_dim = prob->count(axis + 1);
Output(0)->ReshapeLike(Input(0));
flags.Reshape({ outer_dim * inner_dim });
if (XIsType(Input(0), float) || XIsType(Input(0), float16)) {
if (XIsType(Input(0), float)) {
if (XIsType(Input(1), float)) RunWithType<float, float>();
else if (XIsType(Input(1), int64_t)) RunWithType<float, int64_t>();
else LOG(FATAL) << DTypeHelper(Input(1), { "float32", "int64" });
if (XIsType(Input(0), float16)) {
auto* dXdataF32 = Output(0)->template data<float, Context>();
auto* dXdataF16 = prob->template mutable_data<float16, Context>();
kernel::TypeA2B<float, float16, Context>(Output(0)->count(), dXdataF32, dXdataF16);
Output(0)->template CopyFrom<Context>(*prob);
}
} else if (XIsType(Input(0), float16)) {
if (XIsType(Input(1), float)) RunWithType<float16, float>();
else if (XIsType(Input(1), int64_t)) RunWithType<float16, int64_t>();
else LOG(FATAL) << DTypeHelper(Input(1), { "float32", "int64" });
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
}
......
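The "VALID" branch above averages only over non-ignored units: the kernel writes a 1-flag per counted unit and a 0 for ignored ones, and the std::max keeps the divisor at least 1 even if every unit is ignored. A small sketch:

#include <algorithm>
#include <cassert>

int main() {
    float flags[4] = { 1.f, 0.f, 1.f, 1.f };  // one unit hit an ignored label
    float asum = 0.f;
    for (float f : flags) asum += f;
    float normalizer = std::max(asum, 1.f);   // 3 valid units
    assert(normalizer == 3.f);
    return 0;
}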
......@@ -9,23 +9,27 @@ namespace dragon {
template <class Context> template <typename Tx, typename Ty>
void AccuracyOp<Context>::RunWithType() {
static CPUContext cctx;
float* Y1data, *Y2data = nullptr;
Y1data = Output(0)->template mutable_data<float, CPUContext>();
if (OutputSize() > 1) {
math::Set<float, CPUContext>(num_classes, 0,
Output(1)->template mutable_data<float, CPUContext>());
Y2data = Output(1)->template mutable_data<float, CPUContext>();
math::Set<float, CPUContext>(num_classes, 0, Y2data, &cctx);
}
Map<int, TIndex> num_per_class;
Map<int, TIndex> num_per_class;
TIndex acc = 0, count = 0;
const Tx* Xdata;
if (XIsType(Input(0), float16)) {
Tensor* XF32 = ws()->CreateTensor("/mnt/" + anchor() + "/accuracy/xf32");
XF32->ReshapeLike(Input(0));
auto* XdataF16 = Input(0).template data<float16, CPUContext>();
auto* XdataF32 = XF32->template mutable_data<float, CPUContext>();
Tensor* X32T = ws()->CreateTensor(
"/mnt/" + anchor() + "/accuracy/f32");
X32T->ReshapeLike(Input(0));
auto* X16 = Input(0).template data<float16, CPUContext>();
auto* X32 = X32T->template mutable_data<float, CPUContext>();
kernel::TypeA2B<float16, float, CPUContext>(
Input(0).count(), XdataF16, XdataF32);
Xdata = XdataF32;
Input(0).count(), X16, X32, &cctx);
Xdata = X32;
} else Xdata = Input(0).template data<Tx, CPUContext>();
auto* labels = Input(1).template data<Ty, CPUContext>();
......@@ -41,15 +45,13 @@ void AccuracyOp<Context>::RunWithType() {
vector<pair<Tx, int> > vec;
for (int k = 0; k < num_classes; k++)
vec.push_back(
std::make_pair(Xdata[i * dim + k * inner_dim + j], k)
);
std::make_pair(Xdata[i * dim + k * inner_dim + j], k));
std::partial_sort(
vec.begin(), vec.begin() + top_k, vec.end(),
std::greater<pair<Tx, int> >());
for (int k = 0; k < top_k; k++) {
if (vec[k].second == label) {
if (OutputSize() > 1)
Output(1)->template mutable_data<float, CPUContext>()[label]++;
if (OutputSize() > 1) Y2data[label]++;
acc++;
break;
}
......@@ -58,12 +60,11 @@ void AccuracyOp<Context>::RunWithType() {
} // end inner_dim
} // end outer_dim
Output(0)->template mutable_data<float, CPUContext>()[0] = (float)acc / count;
if (OutputSize() > 1) {
auto* acc_per_class = Output(1)->template mutable_data<float, CPUContext>();
Y1data[0] = (float)acc / count;
if (Y2data) {
for (int i = 0; i < num_classes; i++)
acc_per_class[i] = num_per_class[i] == 0 ?
0 : acc_per_class[i] / num_per_class[i];
Y2data[i] = num_per_class[i] == 0 ?
0 : Y2data[i] / num_per_class[i];
}
}
......
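The accuracy loop keeps just the top_k largest (score, class) pairs per unit with std::partial_sort; a standalone version of that check, with InTopK as an illustration-only name:

#include <algorithm>
#include <cassert>
#include <utility>
#include <vector>

bool InTopK(const std::vector<float>& scores, int label, int top_k) {
    std::vector<std::pair<float, int>> vec;
    for (int k = 0; k < (int)scores.size(); ++k)
        vec.push_back(std::make_pair(scores[k], k));
    std::partial_sort(vec.begin(), vec.begin() + top_k, vec.end(),
                      std::greater<std::pair<float, int>>());
    for (int k = 0; k < top_k; ++k)
        if (vec[k].second == label) return true;
    return false;
}

int main() {
    assert(InTopK({ 0.1f, 0.5f, 0.4f }, 2, 2));   // class 2 is 2nd best
    assert(!InTopK({ 0.1f, 0.5f, 0.4f }, 0, 2));  // class 0 is last
    return 0;
}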
......@@ -14,14 +14,14 @@ namespace dragon {
Output(0)->ReshapeLike(Input(0)); \
auto* Xdata = Input(0).template data<type_a, Context>(); \
auto* Ydata = Output(0)->template mutable_data<type_b, Context>(); \
kernel::TypeA2B<type_a, type_b, Context>(Input(0).count(), Xdata, Ydata); \
kernel::TypeA2B<type_a, type_b, Context>(Input(0).count(), Xdata, Ydata, ctx()); \
} else { \
TIndex count = Output(0)->count(); \
auto* Xdata = Output(0)->template data<type_a, Context>(); \
auto* Cdata = ws()->template caches<type_b, Context>({ count })[0]; \
kernel::TypeA2B<type_a, type_b, Context>(count, Xdata, Cdata); \
kernel::TypeA2B<type_a, type_b, Context>(count, Xdata, Cdata, ctx()); \
auto* Ydata = Output(0)->template mutable_data<type_b, Context>(); \
ctx().template Copy<type_b, Context, Context>(count, Ydata, Cdata); \
ctx()->template Copy<type_b, Context, Context>(count, Ydata, Cdata); \
} \
return; \
}
......