Commit 5cd0761b by Ting PAN

Unlock CUDA Async Streams

1 parent 3b990761
Showing with 964 additions and 926 deletions
@@ -52,9 +52,9 @@ using Set = std::unordered_set<Value>;
 /*
  * Define the Kernel version.
  *
- * | Major(2) | Minor(2) | Patch(10) |
+ * | Major(2) | Minor(2) | Patch(11) |
  */
-#define DRAGON_VERSION 2210
+#define DRAGON_VERSION 2211
 /*
  * Define the default random seed.
...
@@ -34,6 +34,8 @@ class CPUContext {
     virtual ~CPUContext() {}

     inline void SwitchToDevice() {}
+    inline void SwitchToDevice(int stream_id) {}
     inline void FinishDeviceCompution() {}

     inline static void* New(size_t nbytes) {
@@ -47,7 +49,15 @@ class CPUContext {
         return data;
     }

-    inline static void Memset(size_t nbytes, void* ptr) {
+    inline static void Memset(
+        size_t nbytes,
+        void* ptr) {
+        memset(ptr, 0, nbytes);
+    }
+
+    inline void MemsetAsync(
+        size_t nbytes,
+        void* ptr) {
         memset(ptr, 0, nbytes);
     }
@@ -59,18 +69,16 @@ class CPUContext {
         memcpy(dst, src, nbytes);
     }

-    inline static void Delete(void* data) { free(data); }
-
     template<class DstContext, class SrcContext>
-    inline static void MemcpyAsync(
+    inline void MemcpyAsync(
         size_t nbytes,
         void* dst,
         const void* src) {
-        NOT_IMPLEMENTED;
+        memcpy(dst, src, nbytes);
     }

     template<typename T, class DstContext, class SrcContext>
-    inline static void Copy(
+    inline void Copy(
         int n,
         T* dst,
         const T* src) {
@@ -82,7 +90,10 @@ class CPUContext {
         else for (int i = 0; i < n; i++) dst[i] = src[i];
     }

+    inline static void Delete(void* data) { free(data); }
+
     inline int device_id() const { return 0; }
+    inline void set_stream_id(int stream_id) {}

     inline std::mt19937* rand_generator() {
         if (!rand_generator_.get())
...
@@ -23,8 +23,7 @@ namespace dragon {
 class CUDAObject {
  public:
-    CUDAObject(int default_stream = 1)
-        : default_stream(default_stream) {
+    CUDAObject() {
         for (int i = 0; i < CUDA_MAX_DEVICES; i++) {
             cuda_streams[i] = vector<cudaStream_t>();
             cublas_handles[i] = vector<cublasHandle_t>();
@@ -38,7 +37,7 @@ class CUDAObject {
         for (int i = 0; i < CUDA_MAX_DEVICES; i++) {
             for (int j = 0; j < cuda_streams[i].size(); j++) {
                 auto& stream = cuda_streams[i][j];
-                // follow caffe2, do not check the stream destroying
+                // following caffe2, do not check the stream destroying
                 // Error code 29 (driver shutting down) is inevitable
                 // TODO(PhyscalX): Can someone solve this issue?
                 if (stream) cudaStreamDestroy(stream);
@@ -52,19 +51,21 @@ class CUDAObject {
         }
     }

-    /**
-     * Each device takes a group of streams.
-     *
-     * The stream 0 is reserved for default stream,
-     * stream 1 or higher is created as ``cudaStreamNonBlocking``.
-     */
+    // following caffe2,
+    // each device takes a group of non-blocking streams
+    // the stream 0 is reserved for the default stream,
+    // as some computations really require it,
+    // e.g. cublas.asum() and mixed cpu/cuda operations
+    // besides, some calls, such as cudnn.conv() and cudnn.rnn(),
+    // produce wrong results if running them on non-blocking streams
+    // note that caffe2 also uses default streams (within CuDNNState)
     cudaStream_t GetStream(int device_id, int stream_id) {
         vector<cudaStream_t>& dev_streams = cuda_streams[device_id];
         if (dev_streams.size() <= (unsigned)stream_id)
             dev_streams.resize(stream_id + 1, nullptr);
         if (!dev_streams[stream_id]) {
             DeviceGuard guard(device_id);
-            unsigned int flags = !stream_id && default_stream ?
+            unsigned int flags = !stream_id ?
                 cudaStreamDefault : cudaStreamNonBlocking;
             CUDA_CHECK(cudaStreamCreateWithFlags(
                 &dev_streams[stream_id], flags));
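Note: the flag logic above is the crux of the commit. Stream 0 of every device keeps the legacy default-stream semantics, while any higher stream id is created as ``cudaStreamNonBlocking`` and therefore never implicitly synchronizes with stream 0. A minimal sketch of the same policy outside the Dragon classes (the helper name is ours, not the repo's):

    #include <cuda_runtime.h>

    // Slot 0 -> legacy default-stream behavior (implicit syncs preserved).
    // Slot >= 1 -> non-blocking stream, runs concurrently with stream 0.
    cudaStream_t CreateStreamForSlot(int stream_id) {
        cudaStream_t stream;
        unsigned int flags = (stream_id == 0) ?
            cudaStreamDefault : cudaStreamNonBlocking;
        cudaStreamCreateWithFlags(&stream, flags);
        return stream;
    }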
@@ -102,8 +103,6 @@ class CUDAObject {
     }
 #endif

-    int default_stream;
     vector<cudaStream_t> cuda_streams[CUDA_MAX_DEVICES];
     vector<cublasHandle_t> cublas_handles[CUDA_MAX_DEVICES];
 #ifdef WITH_CUDNN
@@ -129,11 +128,10 @@ class CUDAContext {
         stream_id_ = stream_id;
     }

-    inline void SwitchToDevice() { SwitchToDevice(0); }
+    inline void SwitchToDevice() { SwitchToDevice(1); }

     inline void FinishDeviceCompution() {
-        cudaStreamSynchronize(cuda_object_
-            .GetStream(device_id_, stream_id_));
+        cudaStreamSynchronize(cuda_stream());
         cudaError_t error = cudaGetLastError();
         CHECK_EQ(error, cudaSuccess)
             << "\nCUDA Error: " << cudaGetErrorString(error);
@@ -147,8 +145,17 @@ class CUDAContext {
         return data;
     }

-    inline static void Memset(size_t nbytes, void* ptr) {
-        cudaMemset(ptr, 0, nbytes);
+    inline static void Memset(
+        size_t nbytes,
+        void* ptr) {
+        CUDA_CHECK(cudaMemset(ptr, 0, nbytes));
+    }
+
+    inline void MemsetAsync(
+        size_t nbytes,
+        void* ptr) {
+        CUDA_CHECK(cudaMemsetAsync(ptr, 0,
+            nbytes, cuda_stream()));
     }
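Note: the static Memset stays stream-agnostic, while MemsetAsync is enqueued on the context's current stream, so it orders correctly with kernels already queued there. A hedged usage sketch (the buffer size is arbitrary; the constructor taking a device id matches the non-CUDA stub later in this file):

    CUDAContext ctx(0);                  // device 0
    ctx.SwitchToDevice(2);               // bind stream 2 of device 0
    void* buf = CUDAContext::New(4096);  // raw device allocation
    ctx.MemsetAsync(4096, buf);          // zero-fill queued on ctx.cuda_stream()
    ctx.FinishDeviceCompution();         // stream sync + error check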
     template<class DstContext, class SrcContext>
@@ -169,20 +176,22 @@ class CUDAContext {
             cudaMemcpyDefault, cuda_stream()));
     }

-    inline static void Delete(void* data) { cudaFree(data); }
-
     template<typename T, class DstContext, class SrcContext>
-    static void Copy(
+    inline void Copy(
         int n,
         T* dst,
         const T* src) {
         if (dst == src) return;
-        Memcpy<SrcContext, DstContext>(
+        MemcpyAsync<SrcContext, DstContext>(
             n * sizeof(T), (void*)dst, (const void*)src);
     }

+    inline static void Delete(void* data) { cudaFree(data); }
+
     inline int device_id() const { return device_id_; }
+    inline void set_stream_id(int stream_id) { stream_id_ = stream_id; }

     inline cudaStream_t cuda_stream() {
         return cuda_stream(device_id_, stream_id_);
     }
@@ -227,7 +236,7 @@ class CUDAContext {
     static thread_local CUDAObject cuda_object_;

  private:
-    int device_id_, stream_id_ = 0, random_seed_;
+    int device_id_, stream_id_ = 1, random_seed_;
     unique_ptr<std::mt19937> rand_generator_;
     curandGenerator_t curand_generator_ = nullptr;
 };
@@ -271,7 +280,7 @@ class CUDAClosure {
  protected:
     Context* ctx_;
-    CUDAObject cuda_object_ = 0;
+    CUDAObject cuda_object_;
     vector<int> active_streams_;
 };
@@ -283,8 +292,22 @@ class CUDAContext {
     CUDAContext(const int device_id = 0) { CUDA_NOT_COMPILED; }

     inline void SwitchToDevice() { CUDA_NOT_COMPILED; }
+    inline void SwitchToDevice(int stream_id) { CUDA_NOT_COMPILED; }
     inline void FinishDeviceCompution() { CUDA_NOT_COMPILED; }

+    inline static void Memset(
+        size_t nbytes,
+        void* ptr) {
+        CUDA_NOT_COMPILED;
+    }
+
+    inline void MemsetAsync(
+        size_t nbytes,
+        void* ptr) {
+        CUDA_NOT_COMPILED;
+    }
+
     template<class DstContext, class SrcContext>
     inline static void Memcpy(
         size_t nbytes,
@@ -302,6 +325,7 @@ class CUDAContext {
     }

     inline int device_id() const { return 0; }
+    inline void set_stream_id(int stream_id) {}
 };

 #endif // WITH_CUDA
...
@@ -37,7 +37,8 @@ class GraphBase {
     virtual bool Run(
         const string& include,
-        const string& exclude) = 0;
+        const string& exclude,
+        const int stream_id = 1) = 0;

     inline string name() const { return name_; }
@@ -58,7 +59,8 @@ class Graph final : public GraphBase {
     bool Run(
         const string& include,
-        const string& exclude) override;
+        const string& exclude,
+        const int stream_id = 1) override;

     GraphDef Prune(const GraphDef& meta_graph);
     GraphDef MakeUpdate(const GraphDef& meta_graph);
...
@@ -44,7 +44,7 @@ class OperatorBase {
         const string& anchor);

     inline void SwitchToPhase(const string& phase) { phase_ = phase; }
-    virtual void Run() { NOT_IMPLEMENTED; }
+    virtual void Run(int stream_id = 1) { NOT_IMPLEMENTED; }

     inline const string& name() const { return def_.name(); }
     inline const string& type() const { return def_.type(); }
@@ -100,13 +100,13 @@ class Operator : public OperatorBase {
             Output(0)->name() == "ignore"));
     }

-    virtual void Run() final {
+    void Run(int stream_id = 1) final {
         if (!allow_run_) return;
         if (allow_recompute_) MakeResource();
-        ctx().SwitchToDevice();
+        ctx()->SwitchToDevice(stream_id);
         MemorySwitch();
         RunOnDevice();
-        if (do_sync_) ctx().FinishDeviceCompution();
+        if (do_sync_) ctx()->FinishDeviceCompution();
         if (allow_recompute_) CleanResource();
     }
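Note: Run() is now the single place where a stream id enters an operator. It binds the context to that stream, launches the kernels there, and only blocks when do_sync_ demands it. A hedged sketch of the resulting call path (op is any constructed OperatorBase*):

    op->Run();     // defaults to stream_id = 1, i.e. a non-blocking stream
    op->Run(2);    // replay the same op on stream 2 of its device
    // each call performs, in order:
    //   ctx()->SwitchToDevice(stream_id);             // bind device + stream
    //   MemorySwitch(); RunOnDevice();                // kernels on that stream
    //   if (do_sync_) ctx()->FinishDeviceCompution(); // optional blocking sync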
@@ -123,7 +123,7 @@ class Operator : public OperatorBase {
     virtual void RunOnDevice() = 0;

-    inline Context& ctx() { return ctx_; }
+    inline Context* ctx() { return &ctx_; }
     inline bool AllowRun() { return allow_run_; }

  protected:
@@ -192,6 +192,27 @@ DECLARE_REGISTRY(
     const OperatorDef&,
     Workspace*);

+#define TENSOR_FILL_WITH_TYPE(tensor, shape, type) \
+    if (tensor.count() == 0) { \
+        CHECK(ws()->GetFiller(tensor.name())) \
+            << "\nTensor(" << tensor.name() << ") is empty. \n" \
+            << "may be specify a filler for it ?"; \
+        tensor.Reshape(shape); \
+        unique_ptr< Filler<type, Context> > filler( \
+            CreateFiller<type, Context>(*ws()->GetFiller(tensor.name()))); \
+        filler->Fill(&tensor, ctx()); \
+        ctx()->FinishDeviceCompution(); \
+    } else { \
+        TIndex count = 1; \
+        for (int i = 0; i < shape.size(); i++) count *= shape[i]; \
+        CHECK_EQ(count, tensor.count()) \
+            << "\nModel request " << "Tensor(" << tensor.name() << ")'s " \
+            << "size is " << count << ", \n" \
+            << "but now is " << tensor.count() << ", " \
+            << "did you feed the incorrect Tensor before ?"; \
+        tensor.Reshape(shape); \
+    }
+
 #define TENSOR_FILL(tensor, shape) \
     if (tensor.count() == 0) { \
         CHECK(ws()->GetFiller(tensor.name())) \
@@ -200,7 +221,8 @@ DECLARE_REGISTRY(
         tensor.Reshape(shape); \
         unique_ptr< Filler<T, Context> > filler( \
             CreateFiller<T, Context>(*ws()->GetFiller(tensor.name()))); \
-        filler->Fill(&tensor, &ctx()); \
+        filler->Fill(&tensor, ctx()); \
+        ctx()->FinishDeviceCompution(); \
     } else { \
         TIndex count = 1; \
         for (int i = 0; i < shape.size(); i++) count *= shape[i]; \
@@ -217,7 +239,7 @@ DECLARE_REGISTRY(
     if (size > ptr_tensor->count()) { \
         ptr_tensor->Reshape({ size }); \
         math::Set<T, Context>(size, dragon_cast<T, float>(1.f), \
-            ptr_tensor->template mutable_data<T, Context>()); \
+            ptr_tensor->template mutable_data<T, Context>(), ctx()); \
     } \
 }
...
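Note: the new TENSOR_FILL_WITH_TYPE macro above mirrors TENSOR_FILL but lets the caller pin the filler's storage type instead of inheriting the ambient T of RunWithType(), which matters for FP16 operators whose master weights stay float. A hedged usage sketch (the input indices and shapes are hypothetical):

    // inside some Operator<Context>::RunWithType() specialization:
    vector<TIndex> weight_shape({ num_output, channels });  // hypothetical dims
    TENSOR_FILL_WITH_TYPE(Input(1), weight_shape, float);   // always fill as float
    TENSOR_FILL(Input(2), bias_shape);                      // keeps the ambient T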
@@ -74,7 +74,9 @@ class Tensor {
         for (TIndex i = start; i < end; i++) ret *= dim(i);
         return ret;
     }

     inline TIndex count() const { return size_; }

     inline TIndex count(const TIndex start) const {
         return count(start, ndim());
     }
@@ -115,14 +117,14 @@ class Tensor {
     inline void Corrupt() { is_corrupted_ = true; }

     inline bool has_memory() const {
         return memory_ || ex_memory_ != nullptr;
     }

     MixedMemory* memory() const {
         return own_mem_ ? memory_.get() : ex_memory_;
     }

     void set_memory(MixedMemory* mem) {
         memory_.reset(mem); capacity_ = mem->nbytes();
     }
@@ -197,7 +199,7 @@ class Tensor {
         mutable_data_ptr<Context>(&data_ptr);
         // call the constructors
         if (meta.ctor()) meta_.ctor()(data_ptr, size_);
-        capacity_ = size_ * meta.itemsize();
+        capacity_ = size_ * meta.itemsize(), require_init_ = true;
         return data_ptr;
     }
@@ -225,6 +227,15 @@ class Tensor {
     }

     template <typename T, class Context>
+    T* mutable_data(Context* ctx) {
+        auto* data = mutable_data<T, Context>();
+        if (!require_init_) return data;
+        ctx->MemsetAsync(nbytes(), (void*)data);
+        require_init_ = false;
+        return data;
+    }
+
+    template <typename T, class Context>
     const T* data() const {
         CHECK(meta_ == TypeMeta::Make<T>())
             << "\nThe DType of Tensor(" << name() << ") is "
@@ -234,27 +245,31 @@ class Tensor {
     }

     template <class Context>
-    inline void CopyFrom(const Tensor& other) {
+    inline void CopyFrom(const Tensor& other, Context* ctx) {
+        if ((void*)&other == (void*)this) return;
         CHECK_EQ(size_, other.size_);
         auto* src = other.template raw_data<Context>();
         auto* dst = raw_mutable_data<Context>(other.meta_);
-        if (dst == src) return;
-        if (TypeMeta::Id<Context>() ==
-            TypeMeta::Id<CPUContext>()) {
-            CPUContext::Memcpy<Context, Context>(nbytes(), dst, src);
-        } else if (TypeMeta::Id<Context>() ==
-            TypeMeta::Id<CUDAContext>()) {
-            CUDAContext::Memcpy<Context, Context>(nbytes(), dst, src);
-        }
+        ctx->template MemcpyAsync<Context, Context>(
+            nbytes(), dst, src);
+        require_init_ = false;
     }

     inline void Move(MixedMemory* mem) {
-        if (mem != nullptr) ex_memory_ = mem;
-        else ex_memory_ = new MixedMemory(TypeMeta::Make<float>(), 4);
-        own_mem_ = false;
+        if (mem != nullptr) {
+            ex_memory_ = mem;
+            require_init_ = false;
+        } else {
+            ex_memory_ = new MixedMemory(
+                TypeMeta::Make<float>(), 4);
+            require_init_ = true;
+        } own_mem_ = false;
     }

-    inline void Share(MixedMemory* mem) { Move(mem); is_shared_ = true; }
+    inline void Share(MixedMemory* mem) {
+        Move(mem); is_shared_ = true;
+        require_init_ = false;
+    }

     inline void Reset() {
         size_ = capacity_ = 0;
@@ -275,7 +290,7 @@ class Tensor {
     shared_ptr<MixedMemory> memory_;
     MixedMemory* ex_memory_ = nullptr;
     bool is_corrupted_ = false, is_shared_ = false;
-    bool own_mem_ = true;
+    bool own_mem_ = true, require_init_ = true;
 };

 }    // namespace dragon
...
@@ -179,29 +179,28 @@ class Workspace {
     template <class Context>
     inline vector<void*> caches(
         const vector<size_t>& segments) {
-        TIndex total_size = 0;
-        for (auto& segment : segments) total_size += (TIndex)segment;
-        Tensor* cacheT = CreateTensor("/share/cache");
-        cacheT->Reshape({ total_size });
-        vector<void*> caches(segments.size());
-        caches[0] = cacheT->template mutable_data<uint8_t, Context>();
+        TIndex nbytes = 0;
+        for (auto& segment : segments) nbytes += (TIndex)segment;
+        Tensor* cache_t = CreateTensor("/share/cache");
+        cache_t->Reshape({ nbytes });
+        vector<void*> Bcaches(segments.size());
+        Bcaches[0] = cache_t->template mutable_data<uint8_t, Context>();
         for (int i = 1; i < segments.size(); i++)
-            caches[i] = (uint8_t*)caches[i - 1] + segments[i - 1];
-        return caches;
+            Bcaches[i] = (uint8_t*)Bcaches[i - 1] + segments[i - 1];
+        return Bcaches;
     }

     template <typename T, class Context>
     inline vector<T*> caches(
         const vector<TIndex>& segments) {
-        TIndex total_count = 0;
-        for (auto& segment : segments) total_count += segment;
-        Tensor* cacheT = CreateTensor("/share/cache");
-        cacheT->Reshape({ total_count });
-        vector<T*> caches(segments.size());
-        caches[0] = cacheT->template mutable_data<T, Context>();
-        for (int i = 1; i < segments.size(); i++)
-            caches[i] = caches[i - 1] + segments[i - 1];
-        return caches;
+        vector<size_t> Tsegments;
+        for (auto& segment : segments)
+            Tsegments.emplace_back(segment * sizeof(T));
+        vector<void*> Bcaches = caches<Context>(Tsegments);
+        vector<T*> Tcaches(segments.size());
+        for (int i = 0; i < segments.size(); i++)
+            Tcaches[i] = (T*)Bcaches[i];
+        return Tcaches;
     }
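Note: the typed overload now just scales element counts to byte sizes and defers to the byte version, so every request carves its segments out of the single "/share/cache" blob. A hedged usage sketch (segment sizes are arbitrary); note that a later caches() call may Reshape the blob and invalidate earlier pointers:

    // two float scratch buffers backed by one shared allocation
    auto scratch = ws()->caches<float, CUDAContext>({ 4096, 256 });
    float* buf0 = scratch[0];   // 4096 floats
    float* buf1 = scratch[1];   // 256 floats, laid out right after buf0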
     /******************** Operator ********************/
@@ -259,11 +258,12 @@ class Workspace {
     void RunGraph(
         const string& graph_name,
         const string& include,
-        const string& exclude) {
+        const string& exclude,
+        const int stream_id = 1) {
         if (!graph_map_.count(graph_name))
             LOG(FATAL) << "Graph(" << graph_name
                 << ") does not exist.";
-        graph_map_[graph_name]->Run(include, exclude);
+        graph_map_[graph_name]->Run(include, exclude, stream_id);
     }

     vector<string> GetGraphs() {
...
@@ -36,7 +36,6 @@ class SparseSoftmaxCrossEntropyOp : public Operator<Context> {
     USE_OPERATOR_FUNCTIONS;

     void SoftmaxRun();
-    void SoftmaxRunFP16();

     void RunOnDevice() override;
     template <typename Tx, typename Ty> void RunWithType();
...
@@ -42,7 +42,7 @@ public:
         // simply copy the dY to dX
         Output(0)->ReshapeLike(Input(0));
         if (Output(0)->name() != Input(-1).name())
-            Output(0)->template CopyFrom<Context>(Input(-1));
+            Output(0)->template CopyFrom<Context>(Input(-1), ctx());
     }
 };
...
@@ -34,7 +34,6 @@ class L2NormOp final : public Operator<Context> {
     TIndex axis, num_axes, end_axis;
     float eps;
     string mode;
-    bool across_inner;
     Tensor* norm, buffer;
     TIndex outer_dim, dim, inner_dim, spatial_dim;
 };
@@ -55,7 +54,6 @@ class L2NormGradientOp final : public Operator<Context> {
  protected:
     TIndex axis, num_axes, end_axis;
     string mode;
-    bool across_inner;
     Tensor* norm, buffer, buffer_inner;
     TIndex outer_dim, dim, inner_dim;
 };
...
@@ -24,7 +24,7 @@ class AdamUpdateOp final : public UpdateOpBase<Context> {
     USE_OPERATOR_FUNCTIONS;
     USE_UPDATER_FUNCTIONS(Context);

-    void ComputeRunWithFloat() override;
+    void ComputeRunWithFloat32() override;
     void ComputeRunWithFloat16() override;

  protected:
...
@@ -43,10 +43,26 @@ class CollectiveUpdateOp final : public Operator<Context> {
     void InitNCCL();

     void RunOnDevice() override;
-    void MPIAllReduceWithFloat();
-    void NCCLAllReduceWithFloat();
-    void MPIBcastWithFloat();
-    void NCCLBcastWithFloat();
+
+    template <typename T> void MPIAllReduce(
+        Tensor* tensor,
+        MPI_Datatype dtype);
+
+    template <typename T> void MPIBcast(
+        Tensor* tensor,
+        MPI_Datatype dtype);
+
+#ifdef WITH_MPI_NCCL
+    template <typename T> void NCCLAllReduce(
+        Tensor* tensor,
+        ncclDataType_t dtype,
+        cudaStream_t& stream);
+
+    template <typename T> void NCCLBcast(
+        Tensor* tensor,
+        ncclDataType_t dtype,
+        cudaStream_t& stream);
+#endif

  protected:
     int comm_size, comm_rank, comm_root;
...
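Note: the float-only collectives become templates keyed by both the element type and the wire datatype, which is what an FP16 all-reduce needs. The bodies are not shown in this hunk; a hedged sketch of what an MPI instantiation plausibly wraps (the helper and comm are assumptions, not the repo's code):

    #include <mpi.h>

    // sum-reduce a buffer in place across all ranks
    template <typename T>
    void AllReduceSketch(T* data, int count, MPI_Datatype dtype, MPI_Comm comm) {
        MPI_Allreduce(MPI_IN_PLACE, data, count, dtype, MPI_SUM, comm);
    }
    // e.g. AllReduceSketch(w->mutable_data<float, CPUContext>(),
    //                      (int)w->count(), MPI_FLOAT, MPI_COMM_WORLD);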
@@ -24,7 +24,7 @@ class NesterovUpdateOp final : public UpdateOpBase<Context> {
     USE_OPERATOR_FUNCTIONS;
     USE_UPDATER_FUNCTIONS(Context);

-    void ComputeRunWithFloat() override;
+    void ComputeRunWithFloat32() override;
     void ComputeRunWithFloat16() override;

  protected:
...
@@ -24,7 +24,7 @@ class RMSPropUpdateOp final : public UpdateOpBase<Context> {
     USE_OPERATOR_FUNCTIONS;
     USE_UPDATER_FUNCTIONS(Context);

-    void ComputeRunWithFloat() override;
+    void ComputeRunWithFloat32() override;
     void ComputeRunWithFloat16() override;

  protected:
...
@@ -25,7 +25,7 @@ class SGDUpdateOp final : public UpdateOpBase<Context> {
     USE_OPERATOR_FUNCTIONS;
     USE_UPDATER_FUNCTIONS(Context);

-    void ComputeRunWithFloat() override;
+    void ComputeRunWithFloat32() override;
     void ComputeRunWithFloat16() override;

  protected:
...
@@ -35,13 +35,11 @@ class UpdateOpBase : public Operator<Context> {
     void RunOnDevice() override;
     template <typename T> void PreprocessRunWithType();

-    virtual void ComputeRunWithFloat() = 0;
-
-    virtual void ComputeRunWithFloat16() {
-        LOG(FATAL) << "This Updater does not support FP16.";
-    }
-
-    template <typename T> void UpdateRunWithType();
+    virtual void ComputeRunWithFloat32() = 0;
+    virtual void ComputeRunWithFloat16() = 0;
+
+    void UpdateRunWithFloat32();
+    void UpdateRunWithFloat16();

  protected:
     float lr_mult, decay_mult;
...
@@ -80,7 +80,8 @@ class ConvOpBase : public Operator<Context> {
                 dilation[0], dilation[1],
                 data_format,
                 im,
-                col);
+                col,
+                ctx());
         } else LOG(FATAL) << "ConvNd has not been implemented yet";
     }

     template <typename T> void Col2Im(const T* col, T* im) {
@@ -94,7 +95,8 @@ class ConvOpBase : public Operator<Context> {
                 dilation[0], dilation[1],
                 data_format,
                 col,
-                im);
+                im,
+                ctx());
         } else LOG(FATAL) << "ConvNd has not been implemented yet";
     }
 };
...
@@ -19,6 +19,8 @@
 namespace dragon {

+#define HFLT_MIN 6.10e-5F
+
 template <typename DestType, typename SrcType>
 DestType dragon_cast(SrcType val);
...
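Note: HFLT_MIN matches the smallest positive normal value of an IEEE half float, which the cast helpers can use as an underflow guard. As a quick check: 2^-14 = 1/16384 ≈ 6.1035e-5, rounded here to 6.10e-5.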
@@ -29,9 +29,17 @@ namespace dragon {
 #ifdef WITH_CUDA

-static const int CUDA_THREADS = 1024;
-// We do have a server with 10 GPUs :-)
-#define CUDA_MAX_DEVICES 10
+// The number of cuda threads to use. We set it to
+// 1024 which would work for compute capability 2.x
+// Set it to 512 if using compute capability 1.x
+const int CUDA_THREADS = 1024;
+
+// The maximum number of blocks to use in the default kernel call. We set it to
+// 65535 which would work for compute capability 2.x (where 65536 is the limit)
+const int CUDA_MAX_BLOCKS = 65535;
+
+// You really need a NVIDIA DGX-2 !!! :-)
+#define CUDA_MAX_DEVICES 16

 #define CUDA_VERSION_MIN(major, minor, patch) \
     (CUDA_VERSION >= (major * 1000 + minor * 100 + patch))
@@ -67,12 +75,16 @@ static const int CUDA_THREADS = 1024;
 } while (0)
 #endif // WITH_MPI_NCCL

-#define CUDA_KERNEL_LOOP(i, n) \
-    for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
+#define CUDA_1D_KERNEL_LOOP(i, n) \
+    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; \
          i < n; i += blockDim.x * gridDim.x)

 inline int CUDA_BLOCKS(const int N) {
-    return (N + CUDA_THREADS - 1) / CUDA_THREADS;
+    return std::max(
+        std::min(
+            (N + CUDA_THREADS - 1) / CUDA_THREADS,
+            CUDA_MAX_BLOCKS
+        ), 1);
 }
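Note: clamping the grid keeps launches legal on devices whose gridDim.x tops out at 65535, and the renamed grid-stride macro makes the surplus elements wrap around. A hedged end-to-end sketch (the kernel name and stream are ours):

    template <typename T>
    __global__ void _ScaleKernel(const int n, const T alpha, T* x) {
        // each thread strides over the whole grid until the range is covered,
        // so the clamped block count still touches every element
        CUDA_1D_KERNEL_LOOP(i, n) x[i] *= alpha;
    }

    // launch on an explicit stream; the grid is clamped to CUDA_MAX_BLOCKS:
    // _ScaleKernel<float><<<CUDA_BLOCKS(n), CUDA_THREADS, 0, stream>>>(n, 2.f, x);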
 #if CUDA_VERSION_MAX(9, 0, 0)
...
@@ -44,6 +44,7 @@ template<> class CUDNNType<float> {
     static const cudnnDataType_t type = CUDNN_DATA_FLOAT;
     static float oneval, zeroval;
     static const void *one, *zero;
+    typedef float BNParamType;
 };

 template<> class CUDNNType<double> {
@@ -51,6 +52,7 @@ template<> class CUDNNType<double> {
     static const cudnnDataType_t type = CUDNN_DATA_DOUBLE;
     static double oneval, zeroval;
     static const void *one, *zero;
+    typedef double BNParamType;
 };

 #ifdef WITH_CUDA_FP16
@@ -59,6 +61,7 @@ template<> class CUDNNType<float16> {
     static const cudnnDataType_t type = CUDNN_DATA_HALF;
     static float oneval, zeroval;
     static const void *one, *zero;
+    typedef float BNParamType;
 };
 #endif
...
@@ -40,7 +40,7 @@ class ConstantFiller final : public Filler<T, Context> {
     void Fill(Tensor* tensor, Context* ctx) override {
         math::Set<T, Context>(tensor->count(),
             dragon_cast<T, float>(filler().value()),
-            tensor->mutable_data<T, Context>());
+            tensor->mutable_data<T, Context>(), ctx);
     }

  protected:
@@ -71,11 +71,11 @@ class TruncatedNormalFiller final : public Filler<T, Context> {
     void Fill(Tensor* tensor, Context* ctx) override {
         // implement it on gpu is difficult
-        static CPUContext cpu_ctx;
+        static CPUContext cctx;
         math::RandomTruncatedNormal<T, CPUContext>(tensor->count(),
             filler().mean(), filler().std(),
             filler().low(), filler().high(),
-            tensor->mutable_data<T, CPUContext>(), &cpu_ctx);
+            tensor->mutable_data<T, CPUContext>(), &cctx);
     }

  protected:
...
@@ -36,7 +36,8 @@ template <typename T, class Context>
 void Set(
     const int n,
     const T alpha,
-    T* x);
+    T* x,
+    Context* ctx);

 template <typename T, class Context>
 void RandomUniform(
@@ -78,73 +79,84 @@ void Add(
     const int n,
     const T* a,
     const T* b,
-    T* y);
+    T* y,
+    Context* ctx);

 template <typename T, class Context>
 void Sub(
     const int n,
     const T* a,
     const T* b,
-    T* y);
+    T* y,
+    Context* ctx);

 template <typename T, class Context>
 void Mul(
     const int n,
     const T* a,
     const T* b,
-    T* y);
+    T* y,
+    Context* ctx);

 template <typename T, class Context>
 void Div(
     const int n,
     const T* a,
     const T* b,
-    T* y);
+    T* y,
+    Context* ctx);

 template <typename T, class Context>
 void Clip(
     const int n,
     const float low,
     const float high,
-    T* x);
+    T* x,
+    Context* ctx);

 template <typename T, class Context>
 void Exp(
     const int n,
     const T* x,
-    T* y);
+    T* y,
+    Context* ctx);

 template <typename T, class Context>
 void Log(
     const int n,
     const T* x,
-    T* y);
+    T* y,
+    Context* ctx);

 template <typename T, class Context>
 void Square(
     const int n,
     const T* x,
-    T* y);
+    T* y,
+    Context* ctx);

 template <typename T, class Context>
 void Sqrt(
     const int n,
     const T* x,
-    T* y);
+    T* y,
+    Context* ctx);

 template <typename T, class Context>
 void Pow(
     const int n,
     const float alpha,
     const T* x,
-    T* y);
+    T* y,
+    Context* ctx);

 template <typename T, class Context>
 void Inv(
     const int n,
     const float numerator,
     const T* x,
-    T* y);
+    T* y,
+    Context* ctx);

 /******************** Level-2 ********************/
@@ -164,19 +176,21 @@ void Scale(
     Context* ctx);

 template <typename T, class Context>
-T StridedDot(
+void StridedDot(
     const int n,
     const T* a,
     const int incx,
     const T* b,
     const int incy,
+    T* y,
     Context* ctx);

 template <typename T, class Context>
-float Dot(
+void Dot(
     const int n,
     const T* a,
     const T* b,
+    T* y,
     Context* ctx);
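Note: returning the dot product through T* y instead of a host float lets the reduction stay on the context's stream; nothing forces an implicit device sync just to materialize a scalar on the CPU. A hedged call-site sketch (a, b, y are device pointers here):

    // before: float v = math::Dot<float, CUDAContext>(n, a, b, ctx);  // had to sync
    // now the scalar lands in y on ctx's stream, with no host round-trip:
    math::Dot<float, CUDAContext>(n, a, b, y, ctx);
    // read it back only where a host value is truly needed, after a stream sync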
 template<typename T, class Context>
@@ -188,13 +202,15 @@ template<typename T, class Context>
 void AddScalar(
     const int n,
     const float alpha,
-    T* y);
+    T* y,
+    Context* ctx);

 template<typename T, class Context>
 void MulScalar(
     const int n,
     const float alpha,
-    T* y);
+    T* y,
+    Context* ctx);

 template<typename T, class Context>
 void Axpy(
...
@@ -80,7 +80,7 @@ T Dot(
     const T* b);

 template<typename T>
-T ASum(
+T Sum(
     const int n,
     const T* x);
...
@@ -15,6 +15,7 @@
 #ifdef WITH_SSE

 #include <immintrin.h>
+#include <tmmintrin.h>
 #include <cstdint>

 namespace dragon {
...
@@ -250,8 +250,9 @@ void LoadCaffemodel(
 void RunGraph(
     const std::string& graph_name,
-    Workspace* ws) {
-    ws->RunGraph(graph_name, "", "");
+    Workspace* ws,
+    const int stream_id) {
+    ws->RunGraph(graph_name, "", "", stream_id);
 }
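Note: C++ embedders now pick the stream per call; the header below defaults stream_id to 1, so passing 0 explicitly falls back to the device's default (synchronizing) stream. A hedged usage sketch (the workspace and graph names are hypothetical):

    Workspace* ws = CreateWorkspace("default");
    // ... create a graph named "GraphDef_1" in ws ...
    RunGraph("GraphDef_1", ws);      // stream 1: a non-blocking stream
    RunGraph("GraphDef_1", ws, 0);   // stream 0: legacy default stream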
 template <typename T>
...
@@ -38,8 +38,7 @@ class Device {
     EXPORT const int device_id() const { return device_id_; }

  private:
-    int device_type_;
-    int device_id_;
+    int device_type_, device_id_;
 };

 EXPORT Workspace* CreateWorkspace(const std::string& name);
@@ -61,7 +60,8 @@ EXPORT std::string CreateGraph(
 EXPORT void RunGraph(
     const std::string& graph_name,
-    Workspace* ws);
+    Workspace* ws,
+    const int stream_id = 1);

 EXPORT void CreateTensor(
     const std::string& name,
...
@@ -116,7 +116,7 @@ class NumpyFeeder : public TensorFeederBase {
 #else
         LOG(FATAL) << "CUDA was not compiled.";
 #endif
-    } else{
+    } else {
         CPUContext::Memcpy<CPUContext, CPUContext>(tensor->nbytes(),
             tensor->raw_mutable_data<CPUContext>(),
             static_cast<void*>(PyArray_DATA(array)));
...
@@ -18,18 +18,22 @@
 PyObject* CreateGradientDefsCC(PyObject* self, PyObject* args) {
     PyObject* def_string = nullptr;
     PyObject* py_g_outputs = nullptr;
-    if (!PyArg_ParseTuple(args, "SO!", &def_string, &PyList_Type, &py_g_outputs)) {
-        PyErr_SetString(PyExc_ValueError, "Excepted a serialized string of OperatorDef "
-            "and a list containing outputs of this GradientOp.");
+    if (!PyArg_ParseTuple(args, "SO!",
+            &def_string, &PyList_Type, &py_g_outputs)) {
+        PyErr_SetString(PyExc_ValueError,
+            "Excepted a serialized string of OperatorDef "
+            "and a list containing outputs of this GradientOp.");
         return nullptr;
     }
     OperatorDef def;
     if (!def.ParseFromString(PyBytes_AsStringEx(def_string))) {
-        PyErr_SetString(PyExc_ValueError, "Failed to parse the OperatorDef.");
+        PyErr_SetString(PyExc_ValueError,
+            "Failed to parse the OperatorDef.");
         return nullptr;
     }
     if (!GradientRegistry()->Has(def.type())) {
-        PyErr_SetString(PyExc_KeyError, "This Operator does not register GradientOp.");
+        PyErr_SetString(PyExc_KeyError,
+            "This Operator does not register GradientOp.");
         return nullptr;
     }
     vector<string> g_outputs;
@@ -61,9 +65,10 @@ PyObject* RunGradientFlowCC(PyObject* self, PyObject* args) {
     PyObject* py_fp_ops, *py_targets;
     PyObject* py_input_grads, *py_ignore_grads;
     PyObject* py_share_grads, *py_export_graph;
-    if (!PyArg_ParseTuple(args, "OOOOOO", &py_fp_ops, &py_targets,
-            &py_input_grads, &py_ignore_grads,
-            &py_share_grads, &py_export_graph)) {
+    if (!PyArg_ParseTuple(args, "OOOOOO",
+            &py_fp_ops, &py_targets,
+            &py_input_grads, &py_ignore_grads,
+            &py_share_grads, &py_export_graph)) {
         PyErr_SetString(PyExc_ValueError,
             "Excepted a list of serialized input ops, targets, "
             "input grads, ignore grads and whehter to share grads or log graph.");
@@ -84,8 +89,8 @@ PyObject* RunGradientFlowCC(PyObject* self, PyObject* args) {
     for (auto& grad : input_grads) maker.AddExternalGrad(grad);
     for (auto& grad : ignore_grads) maker.AddIgnoreGrad(grad);
     maker.Make(fp_ops, targets, bp_ops);
-    bool share_grads = (bool)PyObject_IsTrue(py_share_grads);
-    bool export_graph = (bool)PyObject_IsTrue(py_export_graph);
+    bool share_grads = PyObject_IsTrue(py_share_grads) ? true : false;
+    bool export_graph = PyObject_IsTrue(py_export_graph) ? true : false;
     if (share_grads) maker.Share("/share/buffer/grads", bp_ops);
     if (export_graph) {
         Tensor* t = ws()->CreateTensor("/export/dynamic_graph/gradient_flow");
...
@@ -17,7 +17,8 @@
 inline PyObject* SetLogLevelCC(PyObject* self, PyObject* args) {
     char* cname;
     if (!PyArg_ParseTuple(args, "s", &cname)) {
-        PyErr_SetString(PyExc_ValueError, "Excepted the logging level.");
+        PyErr_SetString(PyExc_ValueError,
+            "Excepted the logging level.");
         return nullptr;
     }
     SetLogDestination(StrToLogSeverity(string(cname)));
...
@@ -17,16 +17,19 @@
 inline PyObject* CreateGraphCC(PyObject* self, PyObject* args) {
     PyObject* graph_str;
     if (!PyArg_ParseTuple(args, "S", &graph_str)) {
-        PyErr_SetString(PyExc_ValueError, "Excepted a serialized string of GraphDef.");
+        PyErr_SetString(PyExc_ValueError,
+            "Excepted a serialized string of GraphDef.");
         return nullptr;
     }
     GraphDef graph_def;
     if (!graph_def.ParseFromString(PyBytes_AsStringEx(graph_str))) {
-        PyErr_SetString(PyExc_RuntimeError, "Failed to parse the GraphDef.");
+        PyErr_SetString(PyExc_RuntimeError,
+            "Failed to parse the GraphDef.");
         return nullptr;
     }
     if (!ws()->CreateGraph(graph_def)) {
-        PyErr_SetString(PyExc_RuntimeError, "Failed to create the Graph.");
+        PyErr_SetString(PyExc_RuntimeError,
+            "Failed to create the Graph.");
         return nullptr;
     }
     Py_RETURN_TRUE;
@@ -34,11 +37,17 @@ inline PyObject* CreateGraphCC(PyObject* self, PyObject* args) {
 inline PyObject* RunGraphCC(PyObject* self, PyObject* args) {
     char* cname, *include, *exclude;
-    if (!PyArg_ParseTuple(args, "sss", &cname, &include, &exclude)) {
-        PyErr_SetString(PyExc_ValueError, "Excepted the graph name, include and exclude rules.");
+    if (!PyArg_ParseTuple(args, "sss",
+            &cname, &include, &exclude)) {
+        PyErr_SetString(PyExc_ValueError,
+            "Excepted the graph name, include and exclude rules.");
         return nullptr;
     }
-    ws()->RunGraph(string(cname), string(include), string(exclude));
+    ws()->RunGraph(
+        string(cname),
+        string(include),
+        string(exclude)
+    );
     Py_RETURN_TRUE;
 }
...
@@ -19,13 +19,13 @@ inline PyObject* SnapshotCC(PyObject* self, PyObject* args) {
     char* path; int format;
     PyObject* names; vector<Tensor*> tensors;
     if (!PyArg_ParseTuple(args, "sOi", &path, &names, &format)) {
         PyErr_SetString(PyExc_ValueError,
             "Excepted the model path, tensors, and data format.");
         return nullptr;
     }
     switch (format) {
         case 0:    // Pickle
             PyErr_SetString(PyExc_NotImplementedError,
                 "Format depends on Pickle. Can't be used in C++.");
             break;
         case 1:    // CaffeModel
@@ -42,13 +42,13 @@ inline PyObject* RestoreCC(PyObject* self, PyObject* args) {
     char* path; int format;
     if (!PyArg_ParseTuple(args, "si", &path, &format)) {
         PyErr_SetString(PyExc_ValueError,
             "Excepted the model path and data format.");
         return nullptr;
     }
     switch (format) {
         case 0:    // Pickle
             PyErr_SetString(PyExc_NotImplementedError,
                 "Format depends on Pickle. Can't be used in C++.");
             break;
         case 1:    // CaffeModel
...
@@ -46,7 +46,8 @@ inline PyObject* MPICreateGroupCC(PyObject* self, PyObject* args) {
     PyObject *incl, *excl, *ret;
     int local_root, world_size;
     if (!PyArg_ParseTuple(args, "iOO", &local_root, &incl, &excl)) {
-        PyErr_SetString(PyExc_ValueError, "Excepted the local root, include and exclued list.");
+        PyErr_SetString(PyExc_ValueError,
+            "Excepted the local root, include and exclued list.");
         return nullptr;
     }
     MPI_Group world_group, local_group;
...
@@ -37,12 +37,14 @@ inline PyObject* NoGradientOperatorsCC(PyObject* self, PyObject* args) {
 inline PyObject* RunOperatorCC(PyObject* self, PyObject* args) {
     PyObject* op_str;
     if (!PyArg_ParseTuple(args, "S", &op_str)) {
-        PyErr_SetString(PyExc_ValueError, "Excepted a serialized string of OperatorDef.");
+        PyErr_SetString(PyExc_ValueError,
+            "Excepted a serialized string of OperatorDef.");
         return nullptr;
     }
     OperatorDef op_def;
     if (!op_def.ParseFromString(PyBytes_AsStringEx(op_str))) {
-        PyErr_SetString(PyExc_RuntimeError, "Failed to parse the OperatorDef.");
+        PyErr_SetString(PyExc_RuntimeError,
+            "Failed to parse the OperatorDef.");
         return nullptr;
     }
     ws()->RunOperator(op_def);
@@ -52,7 +54,8 @@ inline PyObject* RunOperatorCC(PyObject* self, PyObject* args) {
 inline PyObject* RunOperatorsCC(PyObject* self, PyObject* args) {
     PyObject* py_ops;
     if (!PyArg_ParseTuple(args, "O", &py_ops)) {
-        PyErr_SetString(PyExc_ValueError, "Excepted a list of serialized string of OperatorDef.");
+        PyErr_SetString(PyExc_ValueError,
+            "Excepted a list of serialized string of OperatorDef.");
         return nullptr;
     }
     OperatorDef op_def;
@@ -67,12 +70,14 @@ inline PyObject* RunOperatorsCC(PyObject* self, PyObject* args) {
 inline PyObject* CreatePersistentOpCC(PyObject* self, PyObject* args) {
     PyObject* op_str;
     if (!PyArg_ParseTuple(args, "S", &op_str)) {
-        PyErr_SetString(PyExc_ValueError, "Excepted a serialized string of OperatorDef.");
+        PyErr_SetString(PyExc_ValueError,
+            "Excepted a serialized string of OperatorDef.");
         return nullptr;
     }
     OperatorDef op_def;
     if (!op_def.ParseFromString(PyBytes_AsStringEx(op_str))) {
-        PyErr_SetString(PyExc_RuntimeError, "Failed to parse the OperatorDef.");
+        PyErr_SetString(PyExc_RuntimeError,
+            "Failed to parse the OperatorDef.");
         return nullptr;
     }
     ws()->CreatePersistentOp(op_def);
@@ -82,9 +87,11 @@ inline PyObject* CreatePersistentOpCC(PyObject* self, PyObject* args) {
 inline PyObject* RunPersistentOpCC(PyObject* self, PyObject* args) {
     char* key, *anchor;
     PyObject* py_inputs, *py_outputs;
-    if (!PyArg_ParseTuple(args, "ssOO", &key, &anchor, &py_inputs, &py_outputs)) {
-        PyErr_SetString(PyExc_ValueError, "Excepted a persistent key, anchor, "
-            "list of inputs and outputs.");
+    if (!PyArg_ParseTuple(args, "ssOO",
+            &key, &anchor, &py_inputs, &py_outputs)) {
+        PyErr_SetString(PyExc_ValueError,
+            "Excepted a persistent key, anchor, "
+            "list of inputs and outputs.");
         return nullptr;
     }
     vector<string> inputs, outputs;
...
@@ -39,12 +39,14 @@ inline PyObject* CreateTensorCC(PyObject* self, PyObject* args) {
 inline PyObject* CreateFillerCC(PyObject* self, PyObject* args) {
     PyObject* filler_string;
     if (!PyArg_ParseTuple(args, "S", &filler_string)) {
-        PyErr_SetString(PyExc_ValueError, "Excepted a serialized string of TensorFiller.");
+        PyErr_SetString(PyExc_ValueError,
+            "Excepted a serialized string of TensorFiller.");
         return nullptr;
     }
     TensorFiller filler_def;
     if (!filler_def.ParseFromString(PyBytes_AsStringEx(filler_string))) {
-        PyErr_SetString(PyExc_RuntimeError, "Failed to parse the TensorFiller.");
+        PyErr_SetString(PyExc_RuntimeError,
+            "Failed to parse the TensorFiller.");
         return nullptr;
     }
     ws()->CreateFiller(filler_def);
@@ -60,7 +62,8 @@ inline PyObject* GetFillerTypeCC(PyObject* self, PyObject* args) {
 inline PyObject* RenameTensorCC(PyObject* self, PyObject* args) {
     char* ori_name, *tar_name;
     if (!PyArg_ParseTuple(args, "ss", &ori_name, &tar_name)) {
-        PyErr_SetString(PyExc_ValueError, "Excepted the original and target name.");
+        PyErr_SetString(PyExc_ValueError,
+            "Excepted the original and target name.");
         return nullptr;
     }
     if (!ws()->HasTensor(tar_name)) {
@@ -77,7 +80,8 @@ PyObject* TensorFromShapeCC(PyObject* self, PyObject* args) {
     char* cname, *dtype;
     PyObject* shape, *device_option = nullptr;
     if (!PyArg_ParseTuple(args, "sOs|O", &cname, &shape, &dtype, &device_option)) {
-        PyErr_SetString(PyExc_ValueError, "Excepted the name, shape, dtype and optional device option.");
+        PyErr_SetString(PyExc_ValueError,
+            "Excepted the name, shape, dtype and optional device option.");
         return nullptr;
     }
     const TypeMeta& meta = TypeStringToMeta(dtype);
@@ -119,7 +123,8 @@ PyObject* TensorFromPyArrayCC(PyObject* self, PyObject* args) {
     char* cname;
     PyArrayObject* original_array = nullptr;
     if (!PyArg_ParseTuple(args, "sO", &cname, &original_array)) {
-        PyErr_SetString(PyExc_ValueError, "Failed to create tensor from numpy.ndarray.\n"
+        PyErr_SetString(PyExc_ValueError,
+            "Failed to create tensor from numpy.ndarray.\n"
             "Excepted the name and numpy.ndarray both.");
         return nullptr;
     }
@@ -214,7 +219,8 @@ inline PyObject* TensorToPyArrayCC(PyObject* self, PyObject* args) {
         return nullptr;
     }
     auto* data = tensor->raw_mutable_data<CPUContext>();
-    PyObject* array = PyArray_SimpleNewFromData(tensor->ndim(), dims.data(), npy_type, data);
+    PyObject* array = PyArray_SimpleNewFromData(
+        (int)tensor->ndim(), dims.data(), npy_type, data);
     Py_XINCREF(array);
     return array;
 }
...
@@ -30,6 +30,8 @@ class BlobFetcher(Process):
     ----------
     batch_size : int
         The size of a training batch.
+    dtype : str
+        The data type of batch. Default is ``float32``.
     partition : boolean
         Whether to partition batch. Default is ``False``.
     prefetch : int
@@ -42,6 +44,7 @@ class BlobFetcher(Process):
         """
         super(BlobFetcher, self).__init__()
         self._batch_size = kwargs.get('batch_size', 100)
+        self._dtype = kwargs.get('dtype', 'float32')
         self._partition = kwargs.get('partition', False)
         self._mean_values = kwargs.get('mean_values', [])
         self._scale = kwargs.get('scale', 1.0)
@@ -68,7 +71,7 @@ class BlobFetcher(Process):
             if ix != self._batch_size - 1: im, labels = self.Q_in.get()
         # mean subtraction & numerical scale
-        im_blob = im_blob.astype(np.float32)
+        im_blob = im_blob.astype(self._dtype)
         if len(self._mean_values) > 0:
             im_blob -= self._mean_values
         if self._scale != 1.0:
...
@@ -70,6 +70,8 @@ class DataBatch(object):
         The phase of this operator, ``TRAIN`` or ``TEST``. Default is ``TRAIN``.
     batch_size : int
         The size of a training batch.
+    dtype : str
+        The data type of batch. Default is ``float32``.
     partition : boolean
         Whether to partition batch. Default is ``False``.
     prefetch : int
...
...@@ -49,16 +49,14 @@ class DataReader(Process): ...@@ -49,16 +49,14 @@ class DataReader(Process):
self._source = kwargs.get('source', '') self._source = kwargs.get('source', '')
self._multiple_nodes = kwargs.get('multiple_nodes', False) self._multiple_nodes = kwargs.get('multiple_nodes', False)
self._use_shuffle = kwargs.get('shuffle', False) self._use_shuffle = kwargs.get('shuffle', False)
self._use_instance_chunk = kwargs.get('instance_chunk', False)
self._num_chunks = kwargs.get('num_chunks', 2048) self._num_chunks = kwargs.get('num_chunks', 2048)
self._chunk_size = kwargs.get('chunk_size', -1) self._chunk_size = kwargs.get('chunk_size', -1)
self._num_parts = 1 self._part_idx, self._num_parts = 0, 1
self._part_idx = 0 self._cur_idx, self._cur_chunk_idx = 0, 0
self._random_seed = config.GetRandomSeed() self._random_seed = config.GetRandomSeed()
self._cur_idx = 0
self._cur_chunk_idx = 0
self.Q_out = None self.Q_out = None
self.daemon = True self.daemon = True
...@@ -167,12 +165,13 @@ class DataReader(Process): ...@@ -167,12 +165,13 @@ class DataReader(Process):
self._db.open(self._source) self._db.open(self._source)
self._zfill = self._db.zfill() self._zfill = self._db.zfill()
self._num_entries = self._db.num_entries() self._num_entries = self._db.num_entries()
self._epoch_size = int(self._num_entries / self._num_parts + 1) self._epoch_size = int(self._num_entries / self._num_parts + 1)
if self._use_shuffle: if self._use_shuffle:
if self._chunk_size == 1: if self._chunk_size == 1:
# each chunk has at most 1 record [For Fully Shuffle] # each chunk has at most 1 record [For Fully Shuffle]
self._num_shuffle_parts = int(self._num_entries / self._chunk_size / self._num_parts) + 1 self._chunk_size, self._num_shuffle_parts = \
1, int(self._num_entries / self._num_parts) + 1
else: else:
if self._use_shuffle and self._chunk_size == -1: if self._use_shuffle and self._chunk_size == -1:
# search an optimal chunk size by chunks [For Chunk Shuffle] # search an optimal chunk size by chunks [For Chunk Shuffle]
...@@ -183,6 +182,11 @@ class DataReader(Process): ...@@ -183,6 +182,11 @@ class DataReader(Process):
self._num_shuffle_parts = int(math.ceil(self._db._total_size * 1.1 / self._num_shuffle_parts = int(math.ceil(self._db._total_size * 1.1 /
(self._num_parts * self._chunk_size << 20))) (self._num_parts * self._chunk_size << 20)))
self._chunk_size = int(self._num_entries / self._num_shuffle_parts / self._num_parts + 1) self._chunk_size = int(self._num_entries / self._num_shuffle_parts / self._num_parts + 1)
limit = (self._num_parts - 0.5) * self._num_shuffle_parts * self._chunk_size
if self._num_entries <= limit:
# roll back to fully shuffle
self._chunk_size, self._num_shuffle_parts = \
1, int(self._num_entries / self._num_parts) + 1
else: else:
# each chunk has at most K records [For Multiple Nodes] # each chunk has at most K records [For Multiple Nodes]
# note that if ``shuffle`` and ``multiple_nodes`` are both ``False``, # note that if ``shuffle`` and ``multiple_nodes`` are both ``False``,
......
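The new ``limit`` test rolls back to fully shuffle when the database holds too few records for chunk shuffling to cover every partition. A back-of-the-envelope sketch of the planning arithmetic (simplified to take sizes in MB; names and numbers are hypothetical, not from this commit):

import math

def plan_chunks(num_entries, db_size_mb, num_parts=1, num_chunks=2048):
    # pick a power-of-two chunk size (MB) so ~num_chunks chunks cover the db
    max_chunk_size = db_size_mb / num_chunks
    chunk_size = 1
    while chunk_size * 2 < max_chunk_size: chunk_size *= 2
    num_shuffle_parts = int(math.ceil(db_size_mb * 1.1 / (num_parts * chunk_size)))
    chunk_size = int(num_entries / num_shuffle_parts / num_parts + 1)
    limit = (num_parts - 0.5) * num_shuffle_parts * chunk_size
    if num_entries <= limit:
        # too few records to fill every partition: roll back to fully shuffle
        chunk_size, num_shuffle_parts = 1, int(num_entries / num_parts) + 1
    return chunk_size, num_shuffle_parts

print(plan_chunks(num_entries=50000, db_size_mb=4096, num_parts=2))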
...@@ -14,7 +14,7 @@ from __future__ import division ...@@ -14,7 +14,7 @@ from __future__ import division
from __future__ import print_function from __future__ import print_function
version = '0.2.2' version = '0.2.2'
full_version = '0.2.2.10' full_version = '0.2.2.11'
release = False release = False
if not release: if not release:
......
...@@ -364,7 +364,7 @@ class BatchNormLayer(Layer): ...@@ -364,7 +364,7 @@ class BatchNormLayer(Layer):
var = Tensor(scope + '/param:1').Constant(value=0.0) var = Tensor(scope + '/param:1').Constant(value=0.0)
factor = Tensor(scope + '/param:2').Constant(value=0.0) factor = Tensor(scope + '/param:2').Constant(value=0.0)
# in dragon, setting diff to None will skip computing the grad automatically # in dragon, setting diff to None will skip computing the grad automatically
# but in bvlc-caffe1, you must set lr_mult = 0 manually # but in bvlc-caffe, you must set lr_mult = 0 manually
self._blobs.append({'data': mean, 'diff': None}) self._blobs.append({'data': mean, 'diff': None})
self._blobs.append({'data': var, 'diff': None}) self._blobs.append({'data': var, 'diff': None})
self._blobs.append({'data': factor, 'diff': None}) self._blobs.append({'data': factor, 'diff': None})
......
...@@ -20,7 +20,7 @@ from .arithmetic import ( ...@@ -20,7 +20,7 @@ from .arithmetic import (
from .ndarray import ( from .ndarray import (
squeeze, unsqueeze, squeeze, unsqueeze,
sum, mean, argmin, argmax, max, topk, sum, mean, argmin, argmax, max, min, topk,
cat, gather, cat, gather,
) )
......
...@@ -13,7 +13,6 @@ from __future__ import absolute_import ...@@ -13,7 +13,6 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
from dragon.vm.torch.tensor import Tensor from dragon.vm.torch.tensor import Tensor
from dragon.vm.torch.ops.primitive import MakeContext, WrapScalar from dragon.vm.torch.ops.primitive import MakeContext, WrapScalar
from dragon.vm.torch.ops.factory import get_module from dragon.vm.torch.ops.factory import get_module
...@@ -26,7 +25,6 @@ def _fundamental(input, value, op='Add', out=None): ...@@ -26,7 +25,6 @@ def _fundamental(input, value, op='Add', out=None):
raise TypeError('Type of value should be numerical, got {}.' raise TypeError('Type of value should be numerical, got {}.'
.format(type(value))) .format(type(value)))
value = WrapScalar(value, input._dtype, input._ctx) value = WrapScalar(value, input._dtype, input._ctx)
ctx = MakeContext(inputs=[input, value]) ctx = MakeContext(inputs=[input, value])
key = 'torch/ops/{}/{}:{}'.format(op.lower(), ctx[0].lower(), ctx[1]) key = 'torch/ops/{}/{}:{}'.format(op.lower(), ctx[0].lower(), ctx[1])
module = get_module(Fundamental, key, ctx, op_type=op) module = get_module(Fundamental, key, ctx, op_type=op)
......
...@@ -13,7 +13,7 @@ from __future__ import absolute_import ...@@ -13,7 +13,7 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
from dragon.vm.torch.utils.data.io.data_reader import DataReader from dragon.io.data_reader import DataReader
from dragon.vm.torch.utils.data.io.data_transformer import DataTransformer from dragon.vm.torch.utils.data.io.data_transformer import DataTransformer
......
...@@ -19,7 +19,7 @@ from multiprocessing import Queue ...@@ -19,7 +19,7 @@ from multiprocessing import Queue
import dragon.core.mpi as mpi import dragon.core.mpi as mpi
from .data_reader import DataReader from dragon.io.data_reader import DataReader
from .data_transformer import DataTransformer from .data_transformer import DataTransformer
from .blob_fetcher import BlobFetcher from .blob_fetcher import BlobFetcher
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math
import numpy as np
import numpy.random as npr
from multiprocessing import Process

import dragon.config as config
from dragon.tools.db import LMDB


class DataReader(Process):
    """DataReader is deployed to queue encoded strings from `LMDB`_.

    It adaptively partitions and shuffles records over all distributed nodes.

    """
    def __init__(self, **kwargs):
        """Construct a ``DataReader``.

        Parameters
        ----------
        source : str
            The path of database.
        multiple_nodes : boolean
            Whether to split data for multiple parallel nodes. Default is ``False``.
        shuffle : boolean
            Whether to shuffle the data. Default is ``False``.
        num_chunks : int
            The number of chunks to split. Default is ``2048``.
        chunk_size : int
            The size (MB) of each chunk. Default is -1 (refer to ``num_chunks``).

        """
        super(DataReader, self).__init__()
        self._source = kwargs.get('source', '')
        self._multiple_nodes = kwargs.get('multiple_nodes', False)
        self._use_shuffle = kwargs.get('shuffle', False)
        self._num_chunks = kwargs.get('num_chunks', 2048)
        self._chunk_size = kwargs.get('chunk_size', -1)
        self._num_parts = 1
        self._part_idx = 0
        self._random_seed = config.GetRandomSeed()
        self._cur_idx = 0
        self._cur_chunk_idx = 0
        self.Q_out = None
        self.daemon = True

    def element(self):
        """Get the value of the current record.

        Returns
        -------
        str
            The encoded str.

        """
        return self._db.value()

    def redirect(self, target_idx):
        """Redirect to the target position.

        Parameters
        ----------
        target_idx : int
            The key of the instance in ``LMDB``.

        Returns
        -------
        None

        Notes
        -----
        The redirection reopens the ``LMDB``.

        You can drop caches by ``echo 3 > /proc/sys/vm/drop_caches``,
        which helps avoid getting stuck when ``Database Size`` >> ``RAM Size``.

        """
        self._db.close()
        self._db.open(self._source)
        self._cur_idx = target_idx
        self._db.set(str(self._cur_idx).zfill(self._zfill))

    def reset(self):
        """Reset the cursor and environment.

        Returns
        -------
        None

        """
        if self._multiple_nodes or self._use_shuffle:
            if self._use_shuffle: self._perm = npr.permutation(self._num_shuffle_parts)
            self._cur_chunk_idx = 0
            self._start_idx = int(self._part_idx * self._num_shuffle_parts + self._perm[self._cur_chunk_idx])
            self._start_idx = int(self._start_idx * self._chunk_size)
            if self._start_idx >= self._num_entries: self.next_chunk()
            self._end_idx = self._start_idx + self._chunk_size
            self._end_idx = min(self._num_entries, self._end_idx)
        else:
            self._start_idx = 0
            self._end_idx = self._num_entries
        self.redirect(self._start_idx)

    def next_record(self):
        """Step the cursor of records.

        Returns
        -------
        None

        """
        self._cur_idx += 1
        self._db.next()

    def next_chunk(self):
        """Step the cursor of shuffling chunks.

        Returns
        -------
        None

        """
        self._cur_chunk_idx += 1
        if self._cur_chunk_idx >= self._num_shuffle_parts: self.reset()
        else:
            self._start_idx = self._part_idx * self._num_shuffle_parts + self._perm[self._cur_chunk_idx]
            self._start_idx = self._start_idx * self._chunk_size
            if self._start_idx >= self._num_entries: self.next_chunk()
            else:
                self._end_idx = self._start_idx + self._chunk_size
                self._end_idx = min(self._num_entries, self._end_idx)
                self.redirect(self._start_idx)

    def run(self):
        """Start the process.

        Returns
        -------
        None

        """
        # fix seed
        npr.seed(self._random_seed)

        # init db
        self._db = LMDB()
        self._db.open(self._source)
        self._zfill = self._db.zfill()
        self._num_entries = self._db.num_entries()
        self._epoch_size = int(self._num_entries / self._num_parts + 1)

        if self._use_shuffle:
            if self._chunk_size == 1:
                # each chunk has at most 1 record [For Fully Shuffle]
                self._num_shuffle_parts = int(self._num_entries / self._chunk_size / self._num_parts) + 1
            else:
                if self._use_shuffle and self._chunk_size == -1:
                    # search an optimal chunk size by chunks [For Chunk Shuffle]
                    max_chunk_size = self._db._total_size / ((self._num_chunks * (1 << 20)))
                    min_chunk_size = 1
                    while min_chunk_size * 2 < max_chunk_size: min_chunk_size *= 2
                    self._chunk_size = min_chunk_size
                self._num_shuffle_parts = int(math.ceil(self._db._total_size * 1.1 /
                    (self._num_parts * self._chunk_size << 20)))
                self._chunk_size = int(self._num_entries / self._num_shuffle_parts / self._num_parts + 1)
        else:
            # each chunk has at most K records [For Multiple Nodes]
            # note that if ``shuffle`` and ``multiple_nodes`` are both ``False``,
            # ``chunk_size`` and ``num_shuffle_parts`` are meaningless
            self._chunk_size = int(self._num_entries / self._num_parts) + 1
            self._num_shuffle_parts = 1

        self._perm = np.arange(self._num_shuffle_parts)

        # init env
        self.reset()

        # run
        while True:
            self.Q_out.put(self.element())
            self.next_record()
            if self._cur_idx >= self._end_idx:
                if self._multiple_nodes or \
                        self._use_shuffle: self.next_chunk()
                else: self.reset()
\ No newline at end of file
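A minimal usage sketch for the relocated ``dragon.io.data_reader.DataReader`` (the database path is hypothetical; the reader streams encoded records into whatever queue is assigned to ``Q_out``):

from multiprocessing import Queue
from dragon.io.data_reader import DataReader

reader = DataReader(source='/path/to/train_lmdb', shuffle=True)
reader.Q_out = Queue(maxsize=1024)  # run() puts encoded records here
reader.start()                      # Process.start() invokes run() above
serialized = reader.Q_out.get()     # consume one encoded record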
...@@ -42,7 +42,7 @@ find_modules() ...@@ -42,7 +42,7 @@ find_modules()
setup(name = 'dragon', setup(name = 'dragon',
version='0.2.2.10', version='0.2.2.11',
description = 'Dragon: A Computation Graph Virtual Machine Based Deep Learning Framework', description = 'Dragon: A Computation Graph Virtual Machine Based Deep Learning Framework',
url='https://github.com/seetaresearch/Dragon', url='https://github.com/seetaresearch/Dragon',
author='Ting Pan', author='Ting Pan',
......
...@@ -19,7 +19,8 @@ template <> void GenerateProposals<float, CPUContext>( ...@@ -19,7 +19,8 @@ template <> void GenerateProposals<float, CPUContext>(
const float* scores, const float* scores,
const float* bbox_deltas, const float* bbox_deltas,
const float* anchors, const float* anchors,
float* proposals) { float* proposals,
CPUContext* ctx) {
float* proposal = proposals; float* proposal = proposals;
const int K = feat_h * feat_w; const int K = feat_h * feat_w;
for (int h = 0; h < feat_h; ++h) { for (int h = 0; h < feat_h; ++h) {
...@@ -57,7 +58,8 @@ template <> void GenerateProposals_v2<float, CPUContext>( ...@@ -57,7 +58,8 @@ template <> void GenerateProposals_v2<float, CPUContext>(
const float min_box_w, const float min_box_w,
const float* scores, const float* scores,
const float* bbox_deltas, const float* bbox_deltas,
float* proposals) { float* proposals,
CPUContext* ctx) {
float* proposal = proposals; float* proposal = proposals;
for (int i = 0; i < total_anchors; ++i) { for (int i = 0; i < total_anchors; ++i) {
// bbox_deltas: [1, 4, total_anchors] // bbox_deltas: [1, 4, total_anchors]
...@@ -98,7 +100,8 @@ template <> void ApplyNMS<float, CPUContext>( ...@@ -98,7 +100,8 @@ template <> void ApplyNMS<float, CPUContext>(
const float thresh, const float thresh,
const float* boxes, const float* boxes,
int* keep_indices, int* keep_indices,
int& num_keep) { int& num_keep,
CPUContext* ctx) {
int count = 0; int count = 0;
std::vector<char> is_dead(num_boxes); std::vector<char> is_dead(num_boxes);
for (int i = 0; i < num_boxes; ++i) is_dead[i] = 0; for (int i = 0; i < num_boxes; ++i) is_dead[i] = 0;
......
...@@ -62,7 +62,7 @@ __global__ void _GenerateProposals( ...@@ -62,7 +62,7 @@ __global__ void _GenerateProposals(
const T* bbox_deltas, const T* bbox_deltas,
const T* anchors, const T* anchors,
T* proposals) { T* proposals) {
CUDA_KERNEL_LOOP(idx, nthreads) { CUDA_1D_KERNEL_LOOP(idx, nthreads) {
const int h = idx / A / feat_w; const int h = idx / A / feat_w;
const int w = (idx / A) % feat_w; const int w = (idx / A) % feat_w;
const int a = idx % A; const int a = idx % A;
...@@ -99,13 +99,15 @@ template <> void GenerateProposals<float, CUDAContext>( ...@@ -99,13 +99,15 @@ template <> void GenerateProposals<float, CUDAContext>(
const float* scores, const float* scores,
const float* bbox_deltas, const float* bbox_deltas,
const float* anchors, const float* anchors,
float* proposals) { float* proposals,
CUDAContext* ctx) {
const int num_proposals = A * feat_h * feat_w; const int num_proposals = A * feat_h * feat_w;
_GenerateProposals<float> _GenerateProposals<float>
<< <CUDA_BLOCKS(num_proposals), CUDA_THREADS >> >( << < CUDA_BLOCKS(num_proposals), CUDA_THREADS,
num_proposals, A, feat_h, feat_w, stride, 0, ctx->cuda_stream() >> >(num_proposals,
im_h, im_w, min_box_h, min_box_w, A, feat_h, feat_w, stride,
scores, bbox_deltas, anchors, proposals); im_h, im_w, min_box_h, min_box_w,
scores, bbox_deltas, anchors, proposals);
} }
template <typename T> template <typename T>
...@@ -118,7 +120,7 @@ __global__ void _GenerateProposals_v2( ...@@ -118,7 +120,7 @@ __global__ void _GenerateProposals_v2(
const T* scores, const T* scores,
const T* bbox_deltas, const T* bbox_deltas,
T* proposals) { T* proposals) {
CUDA_KERNEL_LOOP(idx, nthreads) { CUDA_1D_KERNEL_LOOP(idx, nthreads) {
const float dx = bbox_deltas[idx]; const float dx = bbox_deltas[idx];
const float dy = bbox_deltas[nthreads + idx]; const float dy = bbox_deltas[nthreads + idx];
const float d_log_w = bbox_deltas[2 * nthreads + idx]; const float d_log_w = bbox_deltas[2 * nthreads + idx];
...@@ -139,11 +141,13 @@ template <> void GenerateProposals_v2<float, CUDAContext>( ...@@ -139,11 +141,13 @@ template <> void GenerateProposals_v2<float, CUDAContext>(
const float min_box_w, const float min_box_w,
const float* scores, const float* scores,
const float* bbox_deltas, const float* bbox_deltas,
float* proposals) { float* proposals,
CUDAContext* ctx) {
_GenerateProposals_v2<float> _GenerateProposals_v2<float>
<< <CUDA_BLOCKS(total_anchors), CUDA_THREADS >> >( << < CUDA_BLOCKS(total_anchors), CUDA_THREADS,
total_anchors, im_h, im_w, min_box_h, min_box_w, 0, ctx->cuda_stream() >> >(total_anchors,
scores, bbox_deltas, proposals); im_h, im_w, min_box_h, min_box_w,
scores, bbox_deltas, proposals);
} }
/******************** NMS ********************/ /******************** NMS ********************/
...@@ -170,7 +174,7 @@ __global__ void nms_mask( ...@@ -170,7 +174,7 @@ __global__ void nms_mask(
const int num_boxes, const int num_boxes,
const T nms_thresh, const T nms_thresh,
const T* boxes, const T* boxes,
unsigned long long* mask) { uint64_t* mask) {
const int i_start = blockIdx.x * NMS_BLOCK_SIZE; const int i_start = blockIdx.x * NMS_BLOCK_SIZE;
const int di_end = min(num_boxes - i_start, NMS_BLOCK_SIZE); const int di_end = min(num_boxes - i_start, NMS_BLOCK_SIZE);
const int j_start = blockIdx.y * NMS_BLOCK_SIZE; const int j_start = blockIdx.y * NMS_BLOCK_SIZE;
...@@ -209,25 +213,30 @@ void _ApplyNMS( ...@@ -209,25 +213,30 @@ void _ApplyNMS(
const float thresh, const float thresh,
const T* boxes, const T* boxes,
int* keep_indices, int* keep_indices,
int& num_keep) { int& num_keep,
CUDAContext* ctx) {
const int num_blocks = DIV_UP(num_boxes, NMS_BLOCK_SIZE); const int num_blocks = DIV_UP(num_boxes, NMS_BLOCK_SIZE);
const dim3 blocks(num_blocks, num_blocks); const dim3 blocks(num_blocks, num_blocks);
size_t mask_nbytes = num_boxes * num_blocks * sizeof(unsigned long long); size_t mask_nbytes = num_boxes * num_blocks * sizeof(uint64_t);
size_t boxes_nbytes = num_boxes * 5 * sizeof(T); size_t boxes_nbytes = num_boxes * 5 * sizeof(T);
void* boxes_dev, *mask_dev; void* boxes_dev, *mask_dev;
CUDA_CHECK(cudaMalloc(&boxes_dev, boxes_nbytes)); CUDA_CHECK(cudaMalloc(&boxes_dev, boxes_nbytes));
CUDA_CHECK(cudaMalloc(&mask_dev, mask_nbytes)); CUDA_CHECK(cudaMalloc(&mask_dev, mask_nbytes));
CUDA_CHECK(cudaMemcpy(boxes_dev, boxes, boxes_nbytes, cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(boxes_dev, boxes,
nms_mask<T> << <blocks, NMS_BLOCK_SIZE >> > ( boxes_nbytes, cudaMemcpyHostToDevice));
num_boxes, thresh, (T*)boxes_dev, (unsigned long long*)mask_dev); nms_mask<T>
<< < blocks, NMS_BLOCK_SIZE,
0, ctx->cuda_stream() >> > (num_boxes,
thresh, (T*)boxes_dev, (uint64_t*)mask_dev);
CUDA_CHECK(cudaPeekAtLastError()); CUDA_CHECK(cudaPeekAtLastError());
std::vector<unsigned long long> mask_host(num_boxes * num_blocks); std::vector<uint64_t> mask_host(num_boxes * num_blocks);
CUDA_CHECK(cudaMemcpy(&mask_host[0], mask_dev, mask_nbytes, cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(&mask_host[0], mask_dev,
mask_nbytes, cudaMemcpyDeviceToHost));
std::vector<unsigned long long> dead_bit(num_blocks); std::vector<uint64_t> dead_bit(num_blocks);
memset(&dead_bit[0], 0, sizeof(unsigned long long) * num_blocks); memset(&dead_bit[0], 0, sizeof(uint64_t) * num_blocks);
int num_selected = 0; int num_selected = 0;
for (int i = 0; i < num_boxes; ++i) { for (int i = 0; i < num_boxes; ++i) {
...@@ -235,7 +244,7 @@ void _ApplyNMS( ...@@ -235,7 +244,7 @@ void _ApplyNMS(
const int inblock = i % NMS_BLOCK_SIZE; const int inblock = i % NMS_BLOCK_SIZE;
if (!(dead_bit[nblock] & (1ULL << inblock))) { if (!(dead_bit[nblock] & (1ULL << inblock))) {
keep_indices[num_selected++] = i; keep_indices[num_selected++] = i;
unsigned long long* mask_i = &mask_host[0] + i * num_blocks; uint64_t* mask_i = &mask_host[0] + i * num_blocks;
for (int j = nblock; j < num_blocks; ++j) dead_bit[j] |= mask_i[j]; for (int j = nblock; j < num_blocks; ++j) dead_bit[j] |= mask_i[j];
if (num_selected == max_keeps) break; if (num_selected == max_keeps) break;
} }
...@@ -251,9 +260,10 @@ template <> void ApplyNMS<float, CUDAContext>( ...@@ -251,9 +260,10 @@ template <> void ApplyNMS<float, CUDAContext>(
const float thresh, const float thresh,
const float* boxes, const float* boxes,
int* keep_indices, int* keep_indices,
int& num_keep) { int& num_keep,
CUDAContext* ctx) {
_ApplyNMS<float>(num_boxes, max_keeps, thresh, _ApplyNMS<float>(num_boxes, max_keeps, thresh,
boxes, keep_indices, num_keep); boxes, keep_indices, num_keep, ctx);
} }
} // namespace rcnn } // namespace rcnn
......
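For reference, the greedy suppression that the 64-bit ``mask``/``dead_bit`` machinery above parallelizes can be written directly in NumPy. This is a sketch, not code from this commit; it assumes the usual ``[x1, y1, x2, y2, score]`` layout with boxes pre-sorted by score (cf. ``SortProposals``) and the classic +1 integer-pixel area convention:

import numpy as np

def nms(boxes, thresh, max_keeps):
    # boxes: (N, 5) array of [x1, y1, x2, y2, score], pre-sorted by score
    x1, y1, x2, y2 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    keep, dead = [], np.zeros(len(boxes), dtype=bool)
    for i in range(len(boxes)):
        if dead[i]: continue
        keep.append(i)
        if len(keep) == max_keeps: break
        # suppress every later box whose IoU with box i exceeds thresh
        xx1, yy1 = np.maximum(x1[i], x1[i + 1:]), np.maximum(y1[i], y1[i + 1:])
        xx2, yy2 = np.minimum(x2[i], x2[i + 1:]), np.minimum(y2[i], y2[i + 1:])
        inter = np.maximum(0, xx2 - xx1 + 1) * np.maximum(0, yy2 - yy1 + 1)
        dead[i + 1:] |= inter / (areas[i] + areas[i + 1:] - inter) > thresh
    return keep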
...@@ -126,7 +126,8 @@ void GenerateProposals( ...@@ -126,7 +126,8 @@ void GenerateProposals(
const T* scores, const T* scores,
const T* bbox_deltas, const T* bbox_deltas,
const T* anchors, const T* anchors,
T* proposals); T* proposals,
Context* ctx);
template <typename T, class Context> template <typename T, class Context>
void GenerateProposals_v2( void GenerateProposals_v2(
...@@ -137,7 +138,8 @@ void GenerateProposals_v2( ...@@ -137,7 +138,8 @@ void GenerateProposals_v2(
const float min_box_w, const float min_box_w,
const T* scores, const T* scores,
const T* bbox_deltas, const T* bbox_deltas,
T* proposals); T* proposals,
Context* ctx);
template <typename T> template <typename T>
inline void SortProposals( inline void SortProposals(
...@@ -246,7 +248,8 @@ void ApplyNMS( ...@@ -246,7 +248,8 @@ void ApplyNMS(
const T thresh, const T thresh,
const T* boxes, const T* boxes,
int* keep_indices, int* keep_indices,
int& num_keep); int& num_keep,
Context* ctx);
} // namespace rcnn } // namespace rcnn
......
...@@ -37,7 +37,7 @@ void ProposalOp<Context>::RunWithType() { ...@@ -37,7 +37,7 @@ void ProposalOp<Context>::RunWithType() {
Input(0).template data<T, Context>(), Input(0).template data<T, Context>(),
Input(1).template data<T, Context>(), Input(1).template data<T, Context>(),
anchors_.template mutable_data<T, Context>(), anchors_.template mutable_data<T, Context>(),
proposals_.template mutable_data<T, Context>()); proposals_.template mutable_data<T, Context>(), ctx());
rcnn::SortProposals(0, num_proposals - 1, pre_nms_top_n, rcnn::SortProposals(0, num_proposals - 1, pre_nms_top_n,
proposals_.template mutable_data<T, CPUContext>()); proposals_.template mutable_data<T, CPUContext>());
...@@ -45,7 +45,8 @@ void ProposalOp<Context>::RunWithType() { ...@@ -45,7 +45,8 @@ void ProposalOp<Context>::RunWithType() {
rcnn::ApplyNMS<T, Context>( rcnn::ApplyNMS<T, Context>(
pre_nms_topn, post_nms_top_n, nms_thresh, pre_nms_topn, post_nms_top_n, nms_thresh,
proposals_.template mutable_data<T, Context>(), proposals_.template mutable_data<T, Context>(),
roi_indices_.template mutable_data<int, CPUContext>(), num_rois); roi_indices_.template mutable_data<int, CPUContext>(),
num_rois, ctx());
rcnn::RetrieveRoIs<T>(num_rois, n, rcnn::RetrieveRoIs<T>(num_rois, n,
proposals_.template mutable_data<T, CPUContext>(), proposals_.template mutable_data<T, CPUContext>(),
...@@ -95,14 +96,15 @@ void ProposalOp<Context>::RunWithType() { ...@@ -95,14 +96,15 @@ void ProposalOp<Context>::RunWithType() {
im_height, im_width, min_box_h, min_box_w, im_height, im_width, min_box_h, min_box_w,
Input(-3).template data<T, Context>(), Input(-3).template data<T, Context>(),
Input(-2).template data<T, Context>(), Input(-2).template data<T, Context>(),
proposals_.template mutable_data<T, Context>()); proposals_.template mutable_data<T, Context>(), ctx());
rcnn::SortProposals(0, total_proposals - 1, pre_nms_top_n, rcnn::SortProposals(0, total_proposals - 1, pre_nms_top_n,
proposals_.template mutable_data<T, CPUContext>()); proposals_.template mutable_data<T, CPUContext>());
rcnn::ApplyNMS<T, Context>(pre_nms_topn, post_nms_top_n, nms_thresh, rcnn::ApplyNMS<T, Context>(pre_nms_topn, post_nms_top_n, nms_thresh,
proposals_.template mutable_data<T, Context>(), proposals_.template mutable_data<T, Context>(),
roi_indices_.template mutable_data<int, CPUContext>(), num_rois); roi_indices_.template mutable_data<int, CPUContext>(),
num_rois, ctx());
rcnn::RetrieveRoIs<T>(num_rois, n, rcnn::RetrieveRoIs<T>(num_rois, n,
proposals_.template mutable_data<T, CPUContext>(), proposals_.template mutable_data<T, CPUContext>(),
...@@ -128,7 +130,7 @@ void ProposalOp<Context>::RunWithType() { ...@@ -128,7 +130,7 @@ void ProposalOp<Context>::RunWithType() {
collective_rois.ReshapeLike(*Output(0)); collective_rois.ReshapeLike(*Output(0));
auto* rois = collective_rois.template mutable_data<T, CPUContext>(); auto* rois = collective_rois.template mutable_data<T, CPUContext>();
CPUContext::template Copy<T, CPUContext, CPUContext>( ctx()->template Copy<T, CPUContext, CPUContext>(
collective_rois.count(), rois, collective_rois.count(), rois,
Output(0)->template data<T, CPUContext>()); Output(0)->template data<T, CPUContext>());
...@@ -147,6 +149,8 @@ void ProposalOp<Context>::RunWithType() { ...@@ -147,6 +149,8 @@ void ProposalOp<Context>::RunWithType() {
template <class Context> template <class Context>
void ProposalOp<Context>::RunOnDevice() { void ProposalOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
num_images = Input(0).dim(0); num_images = Input(0).dim(0);
CHECK_EQ(Input(-1).dim(0), num_images) CHECK_EQ(Input(-1).dim(0), num_images)
<< "\nExcepted " << num_images << " groups image info, " << "\nExcepted " << num_images << " groups image info, "
......
...@@ -455,7 +455,10 @@ Graph::Graph(const GraphDef& meta_graph, Workspace* ws) ...@@ -455,7 +455,10 @@ Graph::Graph(const GraphDef& meta_graph, Workspace* ws)
RecomputingAware(optimized_graph, ws); RecomputingAware(optimized_graph, ws);
} }
bool Graph::Run(const string& include, const string& exclude) { bool Graph::Run(
const string& include,
const string& exclude,
const int stream_id) {
LOG(DEBUG) << "Run Graph: " << name(); LOG(DEBUG) << "Run Graph: " << name();
for (auto op : ops_) { for (auto op : ops_) {
if (!include.empty()) if (!include.empty())
...@@ -464,7 +467,7 @@ bool Graph::Run(const string& include, const string& exclude) { ...@@ -464,7 +467,7 @@ bool Graph::Run(const string& include, const string& exclude) {
if (op->type().find(exclude) != string::npos) continue; if (op->type().find(exclude) != string::npos) continue;
op->SwitchToPhase(this->args_["phase"].s()); op->SwitchToPhase(this->args_["phase"].s());
LOG(DEBUG) << "$ Before Operator: " << op->name(); LOG(DEBUG) << "$ Before Operator: " << op->name();
op->Run(); op->Run(stream_id);
LOG(DEBUG) << "$ After Operator: " << op->name(); LOG(DEBUG) << "$ After Operator: " << op->name();
} }
return true; return true;
......
...@@ -8,7 +8,6 @@ void MixedMemory::ToCPU() { ...@@ -8,7 +8,6 @@ void MixedMemory::ToCPU() {
switch (state_) { switch (state_) {
case UNINITIALIZED: case UNINITIALIZED:
cpu_ptr_ = CPUContext::New(nbytes_); cpu_ptr_ = CPUContext::New(nbytes_);
CPUContext::Memset(nbytes_, cpu_ptr_);
state_ = STATE_AT_CPU; state_ = STATE_AT_CPU;
break; break;
case STATE_AT_CUDA: case STATE_AT_CUDA:
...@@ -32,7 +31,6 @@ void MixedMemory::ToCUDA() { ...@@ -32,7 +31,6 @@ void MixedMemory::ToCUDA() {
switch (state_) { switch (state_) {
case UNINITIALIZED: case UNINITIALIZED:
cuda_ptr_ = CUDAContext::New(nbytes_); cuda_ptr_ = CUDAContext::New(nbytes_);
CUDAContext::Memset(nbytes_, cuda_ptr_);
state_ = STATE_AT_CUDA; state_ = STATE_AT_CUDA;
break; break;
case STATE_AT_CPU: case STATE_AT_CPU:
......
...@@ -15,33 +15,35 @@ void CuDNNDropoutOp<Context>::RunWithType() { ...@@ -15,33 +15,35 @@ void CuDNNDropoutOp<Context>::RunWithType() {
float scale = use_scale ? 1.0 / (1.0 - prob()) : 1.0; float scale = use_scale ? 1.0 / (1.0 - prob()) : 1.0;
if (phase() == "TEST") { if (phase() == "TEST") {
if (Output(0) != &Input(0)) { if (Output(0) != &Input(0)) {
ctx().template Copy<T, Context, Context>( ctx()->template Copy<T, Context, Context>(
Output(0)->count(), Ydata, Xdata); Output(0)->count(), Ydata, Xdata);
if (scale == 1.0) if (scale == 1.0)
math::Scal<T, Context>(Output(0)->count(), math::Scal<T, Context>(Output(0)->count(),
1.0 - prob(), Ydata, &ctx()); 1.0 - prob(), Ydata, ctx());
} }
} else if (phase() == "TRAIN") { } else if (phase() == "TRAIN") {
CHECK(use_scale) << "\nCuDNN only supports scale-dropout"; CHECK(use_scale) << "\nCuDNN only supports scale-dropout";
Tensor* mask = ws()->CreateTensor("/mnt/" + anchor() + "/dropout/mask"); Tensor* mask = ws()->CreateTensor(
"/mnt/" + anchor() + "/dropout/mask");
// determine the dropout states // determine the dropout states
if (!states_initialized) { if (!states_initialized) {
states_initialized = true; states_initialized = true;
CUDNN_CHECK(cudnnDropoutGetStatesSize( CUDNN_CHECK(cudnnDropoutGetStatesSize(
ctx().cudnn_handle(), &states_size)); ctx()->cudnn_handle(), &states_size));
std::lock_guard<std::mutex> lk(CUDAContext::mutex()); std::lock_guard<std::mutex> lk(CUDAContext::mutex());
Tensor* states = ws()->CreateTensor("/share/cudnn/dropout:" + Tensor* states = ws()->CreateTensor(
dragon_cast<string, unsigned long long>(random_seed) + "/states"); "/share/cudnn/dropout:" + dragon_cast<string,
unsigned long long>(random_seed) + "/states");
if (states->count() > 0) { if (states->count() > 0) {
auto* Sdata = states->template mutable_data<uint8_t, Context>(); auto* Sdata = states->template mutable_data<uint8_t, Context>();
CUDNN_CHECK(cudnnRestoreDropoutDescriptor( CUDNN_CHECK(cudnnRestoreDropoutDescriptor(
dropout_desc, ctx().cudnn_handle(), prob(), dropout_desc, ctx()->cudnn_handle(), prob(),
Sdata, states_size, random_seed)); Sdata, states_size, random_seed));
} else { } else {
states->Reshape({ (TIndex)states_size }); states->Reshape({ (TIndex)states_size });
auto* Sdata = states->template mutable_data<uint8_t, Context>(); auto* Sdata = states->template mutable_data<uint8_t, Context>();
CUDNN_CHECK(cudnnSetDropoutDescriptor( CUDNN_CHECK(cudnnSetDropoutDescriptor(
dropout_desc, ctx().cudnn_handle(), prob(), dropout_desc, ctx()->cudnn_handle(), prob(),
Sdata, states_size, random_seed)); Sdata, states_size, random_seed));
} }
} }
...@@ -53,7 +55,7 @@ void CuDNNDropoutOp<Context>::RunWithType() { ...@@ -53,7 +55,7 @@ void CuDNNDropoutOp<Context>::RunWithType() {
mask->Reshape({ (TIndex)reserve_space_size }); mask->Reshape({ (TIndex)reserve_space_size });
auto* Rdata = mask->template mutable_data<uint8_t, Context>(); auto* Rdata = mask->template mutable_data<uint8_t, Context>();
CUDNN_CHECK(cudnnDropoutForward( CUDNN_CHECK(cudnnDropoutForward(
ctx().cudnn_handle(), dropout_desc, ctx()->cudnn_handle(), dropout_desc,
input_desc, Xdata, input_desc, Xdata,
input_desc, Ydata, input_desc, Ydata,
Rdata, reserve_space_size)); Rdata, reserve_space_size));
...@@ -65,7 +67,9 @@ void CuDNNDropoutOp<Context>::RunOnDevice() { ...@@ -65,7 +67,9 @@ void CuDNNDropoutOp<Context>::RunOnDevice() {
Output(0)->ReshapeLike(Input(0)); Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) RunWithType<float>(); if (XIsType(Input(0), float)) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (XIsType(Input(0), float16)) RunWithType<float16>(); else if (XIsType(Input(0), float16)) RunWithType<float16>();
#endif
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" }); else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
} }
...@@ -76,19 +80,21 @@ void CuDNNDropoutGradientOp<Context>::RunWithType() { ...@@ -76,19 +80,21 @@ void CuDNNDropoutGradientOp<Context>::RunWithType() {
if (phase() == "TEST") { NOT_IMPLEMENTED; } if (phase() == "TEST") { NOT_IMPLEMENTED; }
else if (phase() == "TRAIN") { else if (phase() == "TRAIN") {
CHECK(use_scale) << "\nCuDNN only supports scale-dropout"; CHECK(use_scale) << "\nCuDNN only supports scale-dropout";
Tensor* mask = ws()->GetTensor("/mnt/" + anchor() + "/dropout/mask"); Tensor* mask = ws()->GetTensor(
"/mnt/" + anchor() + "/dropout/mask");
// determine the dropout states // determine the dropout states
if (!states_initialized) { if (!states_initialized) {
states_initialized = true; states_initialized = true;
CUDNN_CHECK(cudnnDropoutGetStatesSize( CUDNN_CHECK(cudnnDropoutGetStatesSize(
ctx().cudnn_handle(), &states_size)); ctx()->cudnn_handle(), &states_size));
std::lock_guard<std::mutex> lk(CUDAContext::mutex()); std::lock_guard<std::mutex> lk(CUDAContext::mutex());
Tensor* states = ws()->CreateTensor("/share/cudnn/dropout:" + Tensor* states = ws()->CreateTensor(
dragon_cast<string, unsigned long long>(random_seed) + "/states"); "/share/cudnn/dropout:" + dragon_cast<string,
unsigned long long>(random_seed) + "/states");
if (states->count() > 0) { if (states->count() > 0) {
auto* Sdata = states->template mutable_data<uint8_t, Context>(); auto* Sdata = states->template mutable_data<uint8_t, Context>();
CUDNN_CHECK(cudnnRestoreDropoutDescriptor( CUDNN_CHECK(cudnnRestoreDropoutDescriptor(
dropout_desc, ctx().cudnn_handle(), prob(), dropout_desc, ctx()->cudnn_handle(), prob(),
Sdata, states_size, random_seed)); Sdata, states_size, random_seed));
} else { LOG(FATAL) << "Missing states with seed: " << random_seed; } } else { LOG(FATAL) << "Missing states with seed: " << random_seed; }
} }
...@@ -101,7 +107,7 @@ void CuDNNDropoutGradientOp<Context>::RunWithType() { ...@@ -101,7 +107,7 @@ void CuDNNDropoutGradientOp<Context>::RunWithType() {
input_desc, &reserve_space_size)); input_desc, &reserve_space_size));
auto* Rdata = mask->template mutable_data<uint8_t, Context>(); auto* Rdata = mask->template mutable_data<uint8_t, Context>();
CUDNN_CHECK(cudnnDropoutBackward( CUDNN_CHECK(cudnnDropoutBackward(
ctx().cudnn_handle(), dropout_desc, ctx()->cudnn_handle(), dropout_desc,
input_desc, dYdata, input_desc, dYdata,
input_desc, dXdata, input_desc, dXdata,
Rdata, reserve_space_size)); Rdata, reserve_space_size));
...@@ -113,7 +119,9 @@ void CuDNNDropoutGradientOp<Context>::RunOnDevice() { ...@@ -113,7 +119,9 @@ void CuDNNDropoutGradientOp<Context>::RunOnDevice() {
Output(0)->ReshapeLike(Input(0)); Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) RunWithType<float>(); if (XIsType(Input(0), float)) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (XIsType(Input(0), float16)) RunWithType<float16>(); else if (XIsType(Input(0), float16)) RunWithType<float16>();
#endif
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" }); else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
} }
......
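The ``use_scale`` branch above implements inverted dropout: training activations are scaled by ``1 / (1 - prob)`` so the TEST phase reduces to a plain copy, and ``scale == 1.0`` only when ``use_scale`` is off, which is why the TEST branch rescales by ``1 - prob()`` in that case. A NumPy sketch of the idea (illustrative, not the CuDNN path):

import numpy as np

def dropout_train(x, prob=0.5, rng=np.random):
    scale = 1.0 / (1.0 - prob)                # the same scale as above
    mask = rng.uniform(size=x.shape) >= prob  # keep with probability 1 - prob
    return x * mask * scale                   # E[output] == x, so TEST is a copy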
...@@ -14,7 +14,7 @@ void CuDNNEluOp<Context>::RunWithType() { ...@@ -14,7 +14,7 @@ void CuDNNEluOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnActivationForward( CUDNN_CHECK(cudnnActivationForward(
ctx().cudnn_handle(), act_desc, ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Xdata, CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::zero, output_desc, Ydata)); CUDNNType<T>::zero, output_desc, Ydata));
} }
...@@ -41,7 +41,7 @@ void CuDNNEluGradientOp<Context>::RunWithType() { ...@@ -41,7 +41,7 @@ void CuDNNEluGradientOp<Context>::RunWithType() {
auto* dXdata = Output(0)->template mutable_data<T, Context>(); auto* dXdata = Output(0)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnActivationBackward( CUDNN_CHECK(cudnnActivationBackward(
ctx().cudnn_handle(), act_desc, ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Ydata, CUDNNType<T>::one, input_desc, Ydata,
input_desc, dYdata, output_desc, Ydata, input_desc, dYdata, output_desc, Ydata,
CUDNNType<T>::zero, output_desc, dXdata)); CUDNNType<T>::zero, output_desc, dXdata));
......
...@@ -13,7 +13,7 @@ void CuDNNReluOp<Context>::RunWithType() { ...@@ -13,7 +13,7 @@ void CuDNNReluOp<Context>::RunWithType() {
#if CUDNN_VERSION_MIN(5, 0, 0) #if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationForward( CUDNN_CHECK(cudnnActivationForward(
ctx().cudnn_handle(), act_desc, ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Xdata, CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::zero, output_desc, Ydata)); CUDNNType<T>::zero, output_desc, Ydata));
#else #else
...@@ -49,7 +49,7 @@ void CuDNNReluGradientOp<Context>::RunWithType() { ...@@ -49,7 +49,7 @@ void CuDNNReluGradientOp<Context>::RunWithType() {
#if CUDNN_VERSION_MIN(5, 0, 0) #if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationBackward( CUDNN_CHECK(cudnnActivationBackward(
ctx().cudnn_handle(), act_desc, ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Ydata, CUDNNType<T>::one, input_desc, Ydata,
input_desc, dYdata, output_desc, Ydata, input_desc, dYdata, output_desc, Ydata,
CUDNNType<T>::zero, output_desc, dXdata)); CUDNNType<T>::zero, output_desc, dXdata));
......
...@@ -13,12 +13,12 @@ void CuDNNSigmoidOp<Context>::RunWithType() { ...@@ -13,12 +13,12 @@ void CuDNNSigmoidOp<Context>::RunWithType() {
#if CUDNN_VERSION_MIN(5, 0, 0) #if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationForward( CUDNN_CHECK(cudnnActivationForward(
ctx().cudnn_handle(), act_desc, ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Xdata, CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::zero, output_desc, Ydata)); CUDNNType<T>::zero, output_desc, Ydata));
#else #else
CUDNN_CHECK(cudnnActivationForward_v4( CUDNN_CHECK(cudnnActivationForward_v4(
ctx().cudnn_handle(), act_desc, ctx()->cudnn_handle(), act_desc,
CUDNNType<Dtype>::one, input_desc, Xdata, CUDNNType<Dtype>::one, input_desc, Xdata,
CUDNNType<Dtype>::zero, output_desc, Ydata)); CUDNNType<Dtype>::zero, output_desc, Ydata));
#endif #endif
...@@ -47,13 +47,13 @@ void CuDNNSigmoidGradientOp<Context>::RunWithType() { ...@@ -47,13 +47,13 @@ void CuDNNSigmoidGradientOp<Context>::RunWithType() {
#if CUDNN_VERSION_MIN(5, 0, 0) #if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationBackward( CUDNN_CHECK(cudnnActivationBackward(
ctx().cudnn_handle(), act_desc, ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Ydata, CUDNNType<T>::one, input_desc, Ydata,
input_desc, dYdata, output_desc, Ydata, input_desc, dYdata, output_desc, Ydata,
CUDNNType<T>::zero, output_desc, dXdata)); CUDNNType<T>::zero, output_desc, dXdata));
#else #else
CUDNN_CHECK(cudnnActivationBackward_v4( CUDNN_CHECK(cudnnActivationBackward_v4(
ctx().cudnn_handle(), act_desc, ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Ydata, CUDNNType<T>::one, input_desc, Ydata,
input_desc, dYdata, output_desc, Ydata, input_desc, dYdata, output_desc, Ydata,
CUDNNType<T>::zero, output_desc, dXdata)); CUDNNType<T>::zero, output_desc, dXdata));
......
...@@ -7,8 +7,7 @@ namespace dragon { ...@@ -7,8 +7,7 @@ namespace dragon {
template <class Context> template <typename T> template <class Context> template <typename T>
void CuDNNSoftmaxOp<Context>::RunWithType() { void CuDNNSoftmaxOp<Context>::RunWithType() {
Tensor fake_tensor(vector<TIndex>( Tensor fake_tensor(vector<TIndex>(
{ outer_dim, Input(0).dim(axis), inner_dim }) { outer_dim, Input(0).dim(axis), inner_dim }));
);
cudnnSetTensorDesc<T>(&input_desc, &fake_tensor); cudnnSetTensorDesc<T>(&input_desc, &fake_tensor);
cudnnSetTensorDesc<T>(&output_desc, &fake_tensor); cudnnSetTensorDesc<T>(&output_desc, &fake_tensor);
...@@ -16,7 +15,7 @@ void CuDNNSoftmaxOp<Context>::RunWithType() { ...@@ -16,7 +15,7 @@ void CuDNNSoftmaxOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnSoftmaxForward( CUDNN_CHECK(cudnnSoftmaxForward(
ctx().cudnn_handle(), ctx()->cudnn_handle(),
CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL,
CUDNNType<T>::one, input_desc, Xdata, CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::zero, output_desc, Ydata)); CUDNNType<T>::zero, output_desc, Ydata));
...@@ -41,8 +40,7 @@ DEPLOY_CUDNN(Softmax); ...@@ -41,8 +40,7 @@ DEPLOY_CUDNN(Softmax);
template <class Context> template <typename T> template <class Context> template <typename T>
void CuDNNSoftmaxGradientOp<Context>::RunWithType() { void CuDNNSoftmaxGradientOp<Context>::RunWithType() {
Tensor fake_tensor(vector<TIndex>( Tensor fake_tensor(vector<TIndex>(
{ outer_dim, Input(0).dim(axis), inner_dim }) { outer_dim, Input(0).dim(axis), inner_dim }));
);
cudnnSetTensorDesc<T>(&input_desc, &fake_tensor); cudnnSetTensorDesc<T>(&input_desc, &fake_tensor);
cudnnSetTensorDesc<T>(&output_desc, &fake_tensor); cudnnSetTensorDesc<T>(&output_desc, &fake_tensor);
...@@ -50,7 +48,7 @@ void CuDNNSoftmaxGradientOp<Context>::RunWithType() { ...@@ -50,7 +48,7 @@ void CuDNNSoftmaxGradientOp<Context>::RunWithType() {
auto* Ydata = Input(0).template data<T, Context>(); auto* Ydata = Input(0).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>(); auto* dXdata = Output(0)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnSoftmaxBackward( CUDNN_CHECK(cudnnSoftmaxBackward(
ctx().cudnn_handle(), ctx()->cudnn_handle(),
CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL,
CUDNNType<T>::one, input_desc, Ydata, input_desc, dYdata, CUDNNType<T>::one, input_desc, Ydata, input_desc, dYdata,
CUDNNType<T>::zero, output_desc, dXdata)); CUDNNType<T>::zero, output_desc, dXdata));
......
...@@ -13,12 +13,12 @@ void CuDNNTanhOp<Context>::RunWithType() { ...@@ -13,12 +13,12 @@ void CuDNNTanhOp<Context>::RunWithType() {
#if CUDNN_VERSION_MIN(5, 0, 0) #if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationForward( CUDNN_CHECK(cudnnActivationForward(
ctx().cudnn_handle(), act_desc, ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Xdata, CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::zero, output_desc, Ydata)); CUDNNType<T>::zero, output_desc, Ydata));
#else #else
CUDNN_CHECK(cudnnActivationForward_v4( CUDNN_CHECK(cudnnActivationForward_v4(
ctx().cudnn_handle(), act_desc, ctx()->cudnn_handle(), act_desc,
CUDNNType<Dtype>::one, input_desc, Xdata, CUDNNType<Dtype>::one, input_desc, Xdata,
CUDNNType<Dtype>::zero, output_desc, Ydata)); CUDNNType<Dtype>::zero, output_desc, Ydata));
#endif #endif
...@@ -47,13 +47,13 @@ void CuDNNTanhGradientOp<Context>::RunWithType() { ...@@ -47,13 +47,13 @@ void CuDNNTanhGradientOp<Context>::RunWithType() {
#if CUDNN_VERSION_MIN(5, 0, 0) #if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationBackward( CUDNN_CHECK(cudnnActivationBackward(
ctx().cudnn_handle(), act_desc, ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Ydata, CUDNNType<T>::one, input_desc, Ydata,
input_desc, dYdata, output_desc, Ydata, input_desc, dYdata, output_desc, Ydata,
CUDNNType<T>::zero, output_desc, dXdata)); CUDNNType<T>::zero, output_desc, dXdata));
#else #else
CUDNN_CHECK(cudnnActivationBackward_v4( CUDNN_CHECK(cudnnActivationBackward_v4(
ctx().cudnn_handle(), act_desc, ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Ydata, CUDNNType<T>::one, input_desc, Ydata,
input_desc, dYdata, output_desc, Ydata, input_desc, dYdata, output_desc, Ydata,
CUDNNType<T>::zero, output_desc, dXdata)); CUDNNType<T>::zero, output_desc, dXdata));
......
...@@ -11,10 +11,10 @@ void DropoutOp<Context>::RunWithType() { ...@@ -11,10 +11,10 @@ void DropoutOp<Context>::RunWithType() {
float scale = use_scale ? 1.0 / (1.0 - prob()) : 1.0; float scale = use_scale ? 1.0 / (1.0 - prob()) : 1.0;
if (phase() == "TEST") { if (phase() == "TEST") {
if (Output(0) != &Input(0)) { if (Output(0) != &Input(0)) {
ctx().template Copy<T, Context, Context>( ctx()->template Copy<T, Context, Context>(
Output(0)->count(), Ydata, Xdata); Output(0)->count(), Ydata, Xdata);
if (scale == 1.0) math::Scal<T, Context>( if (scale == 1.0) math::Scal<T, Context>(
Output(0)->count(), 1.0 - prob(), Ydata, &ctx()); Output(0)->count(), 1.0 - prob(), Ydata, ctx());
} }
} else if (phase() == "TRAIN") { } else if (phase() == "TRAIN") {
Tensor* mask = ws()->CreateTensor( Tensor* mask = ws()->CreateTensor(
...@@ -23,7 +23,7 @@ void DropoutOp<Context>::RunWithType() { ...@@ -23,7 +23,7 @@ void DropoutOp<Context>::RunWithType() {
uint32_t* Mdata = mask->template mutable_data<uint32_t, Context>(); uint32_t* Mdata = mask->template mutable_data<uint32_t, Context>();
kernel::Dropout<T, Context>( kernel::Dropout<T, Context>(
Output(0)->count(), prob(), scale, Output(0)->count(), prob(), scale,
Xdata, Mdata, Ydata, &ctx()); Xdata, Mdata, Ydata, ctx());
} else LOG(FATAL) << "Incorrect Op phase: " << phase(); } else LOG(FATAL) << "Incorrect Op phase: " << phase();
} }
...@@ -52,7 +52,8 @@ void DropoutGradientOp<Context>::RunWithType() { ...@@ -52,7 +52,8 @@ void DropoutGradientOp<Context>::RunWithType() {
else if (phase() == "TRAIN") { else if (phase() == "TRAIN") {
kernel::DropoutGrad<T, Context>( kernel::DropoutGrad<T, Context>(
Output(0)->count(), prob(), scale, Output(0)->count(), prob(), scale,
dYdata, Mdata, dXdata, &ctx()); dYdata, Mdata, dXdata, ctx());
ctx()->FinishDeviceCompution();
mask->Reset(); mask->Reset();
} else LOG(FATAL) << "Incorrect Op phase: " << phase(); } else LOG(FATAL) << "Incorrect Op phase: " << phase();
} }
......
...@@ -8,7 +8,8 @@ template <class Context> template <typename T> ...@@ -8,7 +8,8 @@ template <class Context> template <typename T>
void EluOp<Context>::RunWithType() { void EluOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>(); auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::Elu<T, Context>(Output(0)->count(), alpha, Xdata, Ydata); kernel::Elu<T, Context>(Output(0)->count(),
alpha, Xdata, Ydata, ctx());
} }
template <class Context> template <class Context>
...@@ -30,8 +31,8 @@ void EluGradientOp<Context>::RunWithType() { ...@@ -30,8 +31,8 @@ void EluGradientOp<Context>::RunWithType() {
auto* Ydata = Input(0).template data<T, Context>(); auto* Ydata = Input(0).template data<T, Context>();
auto* dYdata = Input(1).template data<T, Context>(); auto* dYdata = Input(1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>(); auto* dXdata = Output(0)->template mutable_data<T, Context>();
kernel::EluGrad<T, Context>( kernel::EluGrad<T, Context>(Output(0)->count(),
Output(0)->count(), alpha, dYdata, Ydata, dXdata); alpha, dYdata, Ydata, dXdata, ctx());
} }
template <class Context> template <class Context>
......
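Both ELU kernels above now take a context, but the math is unchanged. For reference, ELU and its gradient expressed against the forward output, matching the ``EluGrad(..., dYdata, Ydata, dXdata, ...)`` signature (a NumPy sketch, not the kernel itself):

import numpy as np

def elu(x, alpha=1.0):
    return np.where(x > 0, x, alpha * (np.exp(x) - 1))

def elu_grad(dy, y, alpha=1.0):
    # for x <= 0: y = alpha * (exp(x) - 1), so dy/dx = alpha * exp(x) = y + alpha
    return dy * np.where(y > 0, 1.0, y + alpha)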
...@@ -18,7 +18,7 @@ void PReluOp<Context>::RunWithType() { ...@@ -18,7 +18,7 @@ void PReluOp<Context>::RunWithType() {
kernel::PRelu<T, Context>( kernel::PRelu<T, Context>(
Output(0)->count(), channels, dim, Output(0)->count(), channels, dim,
channel_shared ? true : false, data_format, channel_shared ? true : false, data_format,
Xdata, Wdata, Ydata); Xdata, Wdata, Ydata, ctx());
} }
template <class Context> template <class Context>
...@@ -49,12 +49,12 @@ void PReluGradientOp<Context>::RunWithType() { ...@@ -49,12 +49,12 @@ void PReluGradientOp<Context>::RunWithType() {
if (Output(1)->name() != "ignore") { if (Output(1)->name() != "ignore") {
DECLARE_MULTIPLIER(multiplier, channels * dim); DECLARE_MULTIPLIER(multiplier, channels * dim);
auto* dWdata = Output(1)->template mutable_data<T, Context>(); auto* dWdata = Output(1)->template mutable_data<T, Context>(ctx());
auto* dWBdata = ws()->template caches<T, Context>({ channels * dim })[0]; auto* dWBdata = ws()->template caches<T, Context>({ channels * dim })[0];
kernel::PReluWGrad<T, Context>( kernel::PReluWGrad<T, Context>(
Input(0).dim(0), Input(0).count(1), channels, dim, Input(0).dim(0), Input(0).count(1), channels, dim,
channel_shared ? true : false, data_format, channel_shared ? true : false, data_format,
dYdata, Xdata, multiplier, dWBdata, dWdata, &ctx()); dYdata, Xdata, multiplier, dWBdata, dWdata, ctx());
} }
if (Output(0)->name() != "ignore") { if (Output(0)->name() != "ignore") {
...@@ -63,7 +63,7 @@ void PReluGradientOp<Context>::RunWithType() { ...@@ -63,7 +63,7 @@ void PReluGradientOp<Context>::RunWithType() {
kernel::PReluGrad<T, Context>( kernel::PReluGrad<T, Context>(
Output(0)->count(), channels, dim, Output(0)->count(), channels, dim,
channel_shared ? true : false, data_format, channel_shared ? true : false, data_format,
dYdata, Xdata, Wdata, dXdata); dYdata, Xdata, Wdata, dXdata, ctx());
} }
} }
......
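``PRelu`` applies a learned negative slope, either shared across channels or one per channel depending on ``channel_shared`` and ``data_format``. A NumPy sketch of the forward pass (illustrative, assuming 4-D inputs; not the kernel implementation):

import numpy as np

def prelu(x, w, channel_shared=False, data_format='NCHW'):
    # w: a scalar slope if channel_shared, otherwise one slope per channel
    if channel_shared:
        slope = w
    else:
        shape = [1, -1, 1, 1] if data_format == 'NCHW' else [1, 1, 1, -1]
        slope = np.reshape(w, shape)
    return np.maximum(x, 0) + slope * np.minimum(x, 0)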
...@@ -8,7 +8,8 @@ template <class Context> template <typename T> ...@@ -8,7 +8,8 @@ template <class Context> template <typename T>
void ReluOp<Context>::RunWithType() { void ReluOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>(); auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::Relu<T, Context>(Output(0)->count(), slope, Xdata, Ydata); kernel::Relu<T, Context>(Output(0)->count(),
slope, Xdata, Ydata, ctx());
} }
template <class Context> template <class Context>
...@@ -24,15 +25,17 @@ DEPLOY_CPU(Relu); ...@@ -24,15 +25,17 @@ DEPLOY_CPU(Relu);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(Relu); DEPLOY_CUDA(Relu);
#endif #endif
OPERATOR_SCHEMA(Relu).NumInputs(1).NumOutputs(1).Inplace({ { 0, 0 } }); OPERATOR_SCHEMA(Relu)
.NumInputs(1).NumOutputs(1)
.Inplace({ { 0, 0 } });
template <class Context> template <typename T> template <class Context> template <typename T>
void ReluGradientOp<Context>::RunWithType() { void ReluGradientOp<Context>::RunWithType() {
auto* Ydata = Input(0).template data<T, Context>(); auto* Ydata = Input(0).template data<T, Context>();
auto* dYdata = Input(1).template data<T, Context>(); auto* dYdata = Input(1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>(); auto* dXdata = Output(0)->template mutable_data<T, Context>();
kernel::ReluGrad<T, Context>( kernel::ReluGrad<T, Context>(Output(0)->count(),
Output(0)->count(), slope, dYdata, Ydata, dXdata); slope, dYdata, Ydata, dXdata, ctx());
} }
template <class Context> template <class Context>
...@@ -47,7 +50,9 @@ DEPLOY_CPU(ReluGradient); ...@@ -47,7 +50,9 @@ DEPLOY_CPU(ReluGradient);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(ReluGradient); DEPLOY_CUDA(ReluGradient);
#endif #endif
OPERATOR_SCHEMA(ReluGradient).NumInputs(2).NumOutputs(1).Inplace({ { 1, 0 }}); OPERATOR_SCHEMA(ReluGradient)
.NumInputs(2).NumOutputs(1)
.Inplace({ { 1, 0 }});
class GetReluGradient final : public GradientMakerBase { class GetReluGradient final : public GradientMakerBase {
public: public:
......
...@@ -8,7 +8,7 @@ template <class Context> template <typename T> ...@@ -8,7 +8,7 @@ template <class Context> template <typename T>
void SEluOp<Context>::RunWithType() { void SEluOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>(); auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::SElu<T, Context>(Output(0)->count(), Xdata, Ydata); kernel::SElu<T, Context>(Output(0)->count(), Xdata, Ydata, ctx());
} }
template <class Context> template <class Context>
...@@ -23,15 +23,17 @@ DEPLOY_CPU(SElu); ...@@ -23,15 +23,17 @@ DEPLOY_CPU(SElu);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(SElu); DEPLOY_CUDA(SElu);
#endif #endif
OPERATOR_SCHEMA(SElu).NumInputs(1).NumOutputs(1).Inplace({ { 0, 0 } }); OPERATOR_SCHEMA(SElu)
.NumInputs(1).NumOutputs(1)
.Inplace({ { 0, 0 } });
template <class Context> template <typename T> template <class Context> template <typename T>
void SEluGradientOp<Context>::RunWithType() { void SEluGradientOp<Context>::RunWithType() {
auto* Ydata = Input(0).template data<T, Context>(); auto* Ydata = Input(0).template data<T, Context>();
auto* dYdata = Input(1).template data<T, Context>(); auto* dYdata = Input(1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>(); auto* dXdata = Output(0)->template mutable_data<T, Context>();
kernel::SEluGrad<T, Context>( kernel::SEluGrad<T, Context>(Output(0)->count(),
Output(0)->count(), dYdata, Ydata, dXdata); dYdata, Ydata, dXdata, ctx());
} }
template <class Context> template <class Context>
...@@ -46,7 +48,9 @@ DEPLOY_CPU(SEluGradient); ...@@ -46,7 +48,9 @@ DEPLOY_CPU(SEluGradient);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(SEluGradient); DEPLOY_CUDA(SEluGradient);
#endif #endif
OPERATOR_SCHEMA(SEluGradient).NumInputs(2).NumOutputs(1).Inplace({ { 1, 0 }}); OPERATOR_SCHEMA(SEluGradient)
.NumInputs(2).NumOutputs(1)
.Inplace({ { 1, 0 }});
class GetSEluGradient final : public GradientMakerBase { class GetSEluGradient final : public GradientMakerBase {
public: public:
......
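For reference, SELU's forward pass with the standard constants from Klambauer et al.; the exact constants used by ``kernel::SElu`` are not visible in this diff, so treat these as assumptions:

import numpy as np

ALPHA, LAMBDA = 1.6732632423543772, 1.0507009873554805  # standard SELU constants

def selu(x):
    return LAMBDA * np.where(x > 0, x, ALPHA * (np.exp(x) - 1))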
...@@ -8,7 +8,7 @@ template <class Context> template <typename T> ...@@ -8,7 +8,7 @@ template <class Context> template <typename T>
void SigmoidOp<Context>::RunWithType() { void SigmoidOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>(); auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::Sigmoid<T, Context>(Output(0)->count(), Xdata, Ydata); kernel::Sigmoid<T, Context>(Output(0)->count(), Xdata, Ydata, ctx());
} }
template <class Context> template <class Context>
...@@ -30,8 +30,8 @@ void SigmoidGradientOp<Context>::RunWithType() { ...@@ -30,8 +30,8 @@ void SigmoidGradientOp<Context>::RunWithType() {
auto* Ydata = Input(0).template data<T, Context>(); auto* Ydata = Input(0).template data<T, Context>();
auto* dYdata = Input(1).template data<T, Context>(); auto* dYdata = Input(1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>(); auto* dXdata = Output(0)->template mutable_data<T, Context>();
kernel::SigmoidGrad<T, Context>( kernel::SigmoidGrad<T, Context>(Output(0)->count(),
Output(0)->count(), dYdata, Ydata, dXdata); dYdata, Ydata, dXdata, ctx());
} }
template <class Context> template <class Context>
......
...@@ -12,13 +12,13 @@ void SoftmaxOp<Context>::RunWithType() { ...@@ -12,13 +12,13 @@ void SoftmaxOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
auto* WSdata = ws()->template caches<T, Context>({ Input(0).count() })[0]; auto* WSdata = ws()->template caches<T, Context>({ Input(0).count() })[0];
ctx().template Copy<T, Context, Context>( ctx()->template Copy<T, Context, Context>(
Input(0).count(), Ydata, Xdata); Input(0).count(), Ydata, Xdata);
kernel::Softmax<T, Context>( kernel::Softmax<T, Context>(
Output(0)->count(), Input(0).dim(axis), Output(0)->count(), Input(0).dim(axis),
outer_dim, inner_dim, multiplier, outer_dim, inner_dim, multiplier,
Xdata, WSdata, Ydata, &ctx()); Xdata, WSdata, Ydata, ctx());
} }
template <class Context> template <class Context>
...@@ -36,7 +36,9 @@ DEPLOY_CPU(Softmax); ...@@ -36,7 +36,9 @@ DEPLOY_CPU(Softmax);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(Softmax); DEPLOY_CUDA(Softmax);
#endif #endif
OPERATOR_SCHEMA(Softmax).NumInputs(1).NumOutputs(1).Inplace({ { 0, 0 } }); OPERATOR_SCHEMA(Softmax)
.NumInputs(1).NumOutputs(1)
.Inplace({ { 0, 0 } });
template <class Context> template <typename T> template <class Context> template <typename T>
void SoftmaxGradientOp<Context>::RunWithType() { void SoftmaxGradientOp<Context>::RunWithType() {
...@@ -44,15 +46,16 @@ void SoftmaxGradientOp<Context>::RunWithType() { ...@@ -44,15 +46,16 @@ void SoftmaxGradientOp<Context>::RunWithType() {
auto* dYdata = Input(-1).template data<T, Context>(); auto* dYdata = Input(-1).template data<T, Context>();
auto* Ydata = Input(0).template data<T, Context>(); auto* Ydata = Input(0).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>(); auto* dXdata = Output(0)->template mutable_data<T, Context>();
auto* WSdata = ws()->template caches<T, Context>({ Input(0).count() })[0]; auto* WSdata = ws()->template caches<T, Context>(
{ Input(0).count() })[0];
ctx().template Copy<T, Context, Context>( ctx()->template Copy<T, Context, Context>(
Input(0).count(), dXdata, dYdata); Input(0).count(), dXdata, dYdata);
kernel::SoftmaxGrad<T, Context>( kernel::SoftmaxGrad<T, Context>(
Output(0)->count(), Input(0).dim(axis), Output(0)->count(), Input(0).dim(axis),
outer_dim, inner_dim, multiplier, outer_dim, inner_dim, multiplier,
dYdata, Ydata, WSdata, dXdata, &ctx()); dYdata, Ydata, WSdata, dXdata, ctx());
} }
template <class Context> template <class Context>
...@@ -70,7 +73,9 @@ DEPLOY_CPU(SoftmaxGradient); ...@@ -70,7 +73,9 @@ DEPLOY_CPU(SoftmaxGradient);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(SoftmaxGradient); DEPLOY_CUDA(SoftmaxGradient);
#endif #endif
OPERATOR_SCHEMA(SoftmaxGradient).NumInputs(2).NumOutputs(1).Inplace({ { 1, 0 } }); OPERATOR_SCHEMA(SoftmaxGradient)
.NumInputs(2).NumOutputs(1)
.Inplace({ { 1, 0 } });
class GetSoftmaxGradient final : public GradientMakerBase { class GetSoftmaxGradient final : public GradientMakerBase {
public: public:
......
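The softmax kernels reduce over ``Input(0).dim(axis)`` after factoring the tensor into ``(outer_dim, C, inner_dim)``. The same decomposition in NumPy (a sketch, not the kernel):

import numpy as np

def softmax_axis(x, axis=1):
    # view x as (outer_dim, C, inner_dim) and normalize over C
    C = x.shape[axis]
    outer = int(np.prod(x.shape[:axis]))
    inner = int(np.prod(x.shape[axis + 1:]))
    y = x.reshape(outer, C, inner).astype('float64')
    y = np.exp(y - y.max(axis=1, keepdims=True))  # subtract max for stability
    y /= y.sum(axis=1, keepdims=True)
    return y.reshape(x.shape)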
...@@ -8,7 +8,7 @@ template <class Context> template <typename T> ...@@ -8,7 +8,7 @@ template <class Context> template <typename T>
void TanhOp<Context>::RunWithType() { void TanhOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>(); auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::Tanh<T, Context>(Output(0)->count(), Xdata, Ydata); kernel::Tanh<T, Context>(Output(0)->count(), Xdata, Ydata, ctx());
} }
template <class Context> template <class Context>
...@@ -30,8 +30,8 @@ void TanhGradientOp<Context>::RunWithType() { ...@@ -30,8 +30,8 @@ void TanhGradientOp<Context>::RunWithType() {
auto* Ydata = Input(0).template data<T, Context>(); auto* Ydata = Input(0).template data<T, Context>();
auto* dYdata = Input(1).template data<T, Context>(); auto* dYdata = Input(1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>(); auto* dXdata = Output(0)->template mutable_data<T, Context>();
kernel::TanhGrad<T, Context>( kernel::TanhGrad<T, Context>(Output(0)->count(),
Output(0)->count(), dYdata, Ydata, dXdata); dYdata, Ydata, dXdata, ctx());
} }
template <class Context> template <class Context>
......
...@@ -9,7 +9,7 @@ void AddOp<Context>::EltwiseRunWithType() { ...@@ -9,7 +9,7 @@ void AddOp<Context>::EltwiseRunWithType() {
auto* x1 = Input(0).template data<T, Context>(); auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>(); auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>(); auto* y = Output(0)->template mutable_data<T, Context>();
math::Add<T, Context>(Output(0)->count(), x1, x2, y); math::Add<T, Context>(Output(0)->count(), x1, x2, y, ctx());
} }
template <class Context> template <typename T> template <class Context> template <typename T>
...@@ -19,23 +19,24 @@ void AddOp<Context>::BroadcastRunWithType(int type) { ...@@ -19,23 +19,24 @@ void AddOp<Context>::BroadcastRunWithType(int type) {
auto* x2 = Input(1).template data<T, Context>(); auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>(); auto* y = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>( ctx()->template Copy<T, Context, Context>(
Output(0)->count(), y, x1); Output(0)->count(), y, x1);
if (type == 0 || type == 1) { if (type == 0 || type == 1) {
if (type == 0) { if (type == 0) {
outer_dim = Input(0).count(); x2 = Input(1).template data<T, CPUContext>();
inner_dim = 1; math::AddScalar<T, Context>(Output(0)->count(),
dragon_cast<float, T>(x2[0]), y, ctx());
} else { } else {
outer_dim = Input(0).count(0, Input(0).axis(-1)); outer_dim = Input(0).count(0, Input(0).axis(-1));
inner_dim = Input(0).dim(-1); inner_dim = Input(0).dim(-1);
DECLARE_MULTIPLIER(multiplier, outer_dim);
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, multiplier, x2,
1.0, y, ctx());
} }
DECLARE_MULTIPLIER(multiplier, outer_dim);
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, multiplier, x2,
1.0, y, &ctx());
} else if (type == 2) { } else if (type == 2) {
outer_dim = Input(0).dim(0); outer_dim = Input(0).dim(0);
inner_dim = Input(0).count(1); inner_dim = Input(0).count(1);
...@@ -44,7 +45,7 @@ void AddOp<Context>::BroadcastRunWithType(int type) { ...@@ -44,7 +45,7 @@ void AddOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1, outer_dim, inner_dim, 1,
1.0, x2, multiplier, 1.0, x2, multiplier,
1.0, y, &ctx()); 1.0, y, ctx());
} }
} }
...@@ -77,13 +78,13 @@ void AddGradientOp<Context>::EltwiseRunWithType() { ...@@ -77,13 +78,13 @@ void AddGradientOp<Context>::EltwiseRunWithType() {
if (Output(1)->name() != "ignore") { if (Output(1)->name() != "ignore") {
auto* dx2 = Output(1)->template mutable_data<T, Context>(); auto* dx2 = Output(1)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>( ctx()->template Copy<T, Context, Context>(
Output(1)->count(), dx2, dy); Output(1)->count(), dx2, dy);
} }
if (Output(0)->name() != "ignore") { if (Output(0)->name() != "ignore") {
auto* dx1 = Output(0)->template mutable_data<T, Context>(); auto* dx1 = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>( ctx()->template Copy<T, Context, Context>(
Output(0)->count(), dx1, dy); Output(0)->count(), dx1, dy);
} }
} }
...@@ -108,7 +109,7 @@ void AddGradientOp<Context>::BroadcastRunWithType(int type) { ...@@ -108,7 +109,7 @@ void AddGradientOp<Context>::BroadcastRunWithType(int type) {
math::Gemv<T, Context>( math::Gemv<T, Context>(
CblasTrans, outer_dim, inner_dim, CblasTrans, outer_dim, inner_dim,
1.0, dy, multiplier, 1.0, dy, multiplier,
0.0, dx2, &ctx()); 0.0, dx2, ctx());
} else if (type == 2) { } else if (type == 2) {
outer_dim = X1->dim(0); outer_dim = X1->dim(0);
inner_dim = X1->count(1); inner_dim = X1->count(1);
...@@ -116,13 +117,13 @@ void AddGradientOp<Context>::BroadcastRunWithType(int type) { ...@@ -116,13 +117,13 @@ void AddGradientOp<Context>::BroadcastRunWithType(int type) {
math::Gemv<T, Context>( math::Gemv<T, Context>(
CblasNoTrans, outer_dim, inner_dim, CblasNoTrans, outer_dim, inner_dim,
1.0, dy, multiplier, 1.0, dy, multiplier,
0.0, dx2, &ctx()); 0.0, dx2, ctx());
} }
} }
if (Output(0)->name() != "ignore") { if (Output(0)->name() != "ignore") {
auto* dx1 = Output(0)->template mutable_data<T, Context>(); auto* dx1 = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>( ctx()->template Copy<T, Context, Context>(
X1->count(), dx1, dy); X1->count(), dx1, dy);
} }
} }
......
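In the broadcast path above, the `type == 0` case (the second operand is a single scalar) no longer expands the scalar through a multiplier Gemm: the one-element tensor is read back on the host and folded in with `AddScalar`. The host read of `x2[0]` costs a one-element device-to-host copy, presumably a good trade against allocating a multiplier buffer and issuing a Gemm. The two formulations compute the same thing:

    // old:  y = x1;  y += multiplier(outer_dim x 1) * x2(1 x 1)    (Gemm)
    // new:  y = x1;  y += x2[0]                                (AddScalar)
    const T* x2 = Input(1).template data<T, CPUContext>();   // 1 element
    ctx()->template Copy<T, Context, Context>(Output(0)->count(), y, x1);
    math::AddScalar<T, Context>(Output(0)->count(),
        dragon_cast<float, T>(x2[0]), y, ctx());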
...@@ -34,7 +34,7 @@ void AffineOp<Context>::RunWithType() { ...@@ -34,7 +34,7 @@ void AffineOp<Context>::RunWithType() {
kernel::Affine<T, Context>( kernel::Affine<T, Context>(
Output(0)->count(), outer_dim, scale_dim, inner_dim, Output(0)->count(), outer_dim, scale_dim, inner_dim,
Xdata, Adata, Bdata, bias_multiplier, Ydata, &ctx()); Xdata, Adata, Bdata, bias_multiplier, Ydata, ctx());
} }
template <class Context> template <class Context>
...@@ -58,13 +58,13 @@ void AffineGradientOp<Context>::BiasRunWithType() { ...@@ -58,13 +58,13 @@ void AffineGradientOp<Context>::BiasRunWithType() {
DECLARE_MULTIPLIER(multiplier, inner_dim); DECLARE_MULTIPLIER(multiplier, inner_dim);
auto* dYdata = Input(-1).template data<T, Context>(); auto* dYdata = Input(-1).template data<T, Context>();
auto* dBias = Output(2)->template mutable_data<T, Context>(); auto* dBias = Output(2)->template mutable_data<T, Context>(ctx());
for (int n = 0; n < outer_dim; n++) { for (int n = 0; n < outer_dim; n++) {
math::Gemv<T, Context>( math::Gemv<T, Context>(
CblasNoTrans, scale_dim, inner_dim, CblasNoTrans, scale_dim, inner_dim,
1.0, dYdata, multiplier, 1.0, dYdata, multiplier,
1.0, dBias, &ctx()); 1.0, dBias, ctx());
dYdata += dim; dYdata += dim;
} }
} }
...@@ -79,45 +79,36 @@ void AffineGradientOp<Context>::ScaleRunWithType() { ...@@ -79,45 +79,36 @@ void AffineGradientOp<Context>::ScaleRunWithType() {
bool is_eltwise = (Input(-1).count() == Input(1).count()); bool is_eltwise = (Input(-1).count() == Input(1).count());
auto* dYdata = Input(-1).template data<T, Context>(); auto* dYdata = Input(-1).template data<T, Context>();
auto* Xdata = Input(0).template data<T, Context>(); auto* Xdata = Input(0).template data<T, Context>();
auto* dScale = Output(1)->template mutable_data<T, Context>(); auto* dScale = Output(1)->template mutable_data<T, Context>(ctx());
auto* dXdata = Output(0)->template mutable_data<T, Context>(); auto* dXdata = Output(0)->template mutable_data<T, Context>();
auto* dYxX = dXdata; auto* dYxX = dXdata;
math::Mul<T, Context>(Output(0)->count(), dYdata, Xdata, dYxX); math::Mul<T, Context>(Output(0)->count(), dYdata, Xdata, dYxX, ctx());
if (!is_eltwise) { if (!is_eltwise) {
T* SRes_data = nullptr; T* SRes_data = nullptr;
// reduce inner dimensions
if (inner_dim == 1) { if (inner_dim == 1) {
SRes_data = dYxX; SRes_data = dYxX;
} else if (sum_result.count() == 1) { // handle inner only
dScale = Output(1)->template mutable_data<T, CPUContext>();
T result = math::Dot<T, Context>(
inner_dim, dYxX, multiplier, &ctx());
*dScale += result;
} else { } else {
SRes_data = (outer_dim == 1) ? // handle scale only SRes_data = (outer_dim == 1) ?
dScale : sum_result.template mutable_data<T, Context>(); dScale : sum_result.template mutable_data<T, Context>();
math::Gemv<T, Context>( math::Gemv<T, Context>(
CblasNoTrans, sum_result.count(), inner_dim, CblasNoTrans, sum_result.count(), inner_dim,
1.0, dYxX, multiplier, 1.0, dYxX, multiplier,
SRes_data == dScale ? 1.0 : 0.0, SRes_data, &ctx()); SRes_data == dScale ? 1.0 : 0.0,
SRes_data, ctx());
} }
// reduce outer dimensions
if (outer_dim != 1) { if (outer_dim != 1) {
if (scale_dim == 1) { // handle outer only math::Gemv<T, Context>(
dScale = Output(1)->template mutable_data<T, CPUContext>(); CblasTrans, outer_dim, scale_dim,
T result = math::Dot<T, Context>( 1.0, SRes_data, multiplier,
outer_dim, multiplier, SRes_data, &ctx()); 1.0, dScale, ctx());
*dScale += result;
} else {
math::Gemv<T, Context>(
CblasTrans, outer_dim, scale_dim,
1.0, SRes_data, multiplier,
1.0, dScale, &ctx());
}
} }
} else { } else {
math::Axpy<T, Context>(Output(1)->count(), math::Axpy<T, Context>(Output(1)->count(),
1.f, dYxX, dScale, &ctx()); 1.f, dYxX, dScale, ctx());
} }
} }
...@@ -131,7 +122,7 @@ void AffineGradientOp<Context>::RunWithType() { ...@@ -131,7 +122,7 @@ void AffineGradientOp<Context>::RunWithType() {
kernel::AffineGrad<T, Context>( kernel::AffineGrad<T, Context>(
Output(0)->count(), outer_dim, scale_dim, inner_dim, Output(0)->count(), outer_dim, scale_dim, inner_dim,
dYdata, Adata, dXdata, &ctx()); dYdata, Adata, dXdata, ctx());
} }
template <class Context> template <class Context>
......
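The bias and scale gradients above accumulate (the Gemv calls pass beta = 1.0), so their buffers must start from zero; correspondingly they are now fetched through a `mutable_data<T, Context>(ctx())` overload. That overload is not shown in this diff; a plausible sketch, under the assumption that it zero-fills fresh allocations asynchronously on the caller's stream (the member names below are hypothetical):

    template <typename T, class Context>
    T* Tensor::mutable_data(Context* ctx) {
        T* data = mutable_data<T, Context>();
        if (require_init_)                     // hypothetical: fresh alloc?
            ctx->MemsetAsync(nbytes(), data);  // async zero-fill on stream
        return data;
    }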
...@@ -15,7 +15,7 @@ void ClipOp<Context>::RunWithType() { ...@@ -15,7 +15,7 @@ void ClipOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
auto* Mdata = mask->template mutable_data<T, Context>(); auto* Mdata = mask->template mutable_data<T, Context>();
kernel::Clip<T, Context>(Output(0)->count(), kernel::Clip<T, Context>(Output(0)->count(),
low, high, Xdata, Mdata, Ydata); low, high, Xdata, Mdata, Ydata, ctx());
} }
template <class Context> template <class Context>
...@@ -30,7 +30,9 @@ DEPLOY_CPU(Clip); ...@@ -30,7 +30,9 @@ DEPLOY_CPU(Clip);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(Clip); DEPLOY_CUDA(Clip);
#endif #endif
OPERATOR_SCHEMA(Clip).NumInputs(1).NumOutputs(1).Inplace({ { 0, 0 } }); OPERATOR_SCHEMA(Clip)
.NumInputs(1).NumOutputs(1)
.Inplace({ { 0, 0 } });
template <class Context> template <typename T> template <class Context> template <typename T>
void ClipGradientOp<Context>::RunWithType() { void ClipGradientOp<Context>::RunWithType() {
...@@ -39,7 +41,8 @@ void ClipGradientOp<Context>::RunWithType() { ...@@ -39,7 +41,8 @@ void ClipGradientOp<Context>::RunWithType() {
auto* dXdata = Output(0)->template mutable_data<T, Context>(); auto* dXdata = Output(0)->template mutable_data<T, Context>();
auto* Mdata = mask->template data<T, Context>(); auto* Mdata = mask->template data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), dXdata, Mdata, dXdata); math::Mul<T, Context>(Output(0)->count(),
dXdata, Mdata, dXdata, ctx());
} }
template <class Context> template <class Context>
...@@ -54,7 +57,9 @@ DEPLOY_CPU(ClipGradient); ...@@ -54,7 +57,9 @@ DEPLOY_CPU(ClipGradient);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(ClipGradient); DEPLOY_CUDA(ClipGradient);
#endif #endif
OPERATOR_SCHEMA(ClipGradient).NumInputs(2).NumOutputs(1).Inplace({ { 1, 0 } }); OPERATOR_SCHEMA(ClipGradient)
.NumInputs(2).NumOutputs(1)
.Inplace({ { 1, 0 } });
class GetClipGradient final : public GradientMakerBase { class GetClipGradient final : public GradientMakerBase {
public: public:
......
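A note on why ClipGradient is a single multiply: the schema's `Inplace({ { 1, 0 } })` lets Output(0) share the incoming gradient's buffer, so `dXdata` already holds dY when the op runs, and the forward mask (presumably 1 inside `[low, high]`, 0 outside) finishes the job in place:

    // dX = dY * M elementwise, in place, since dXdata aliases dYdata:
    math::Mul<T, Context>(Output(0)->count(),
        dXdata, Mdata, dXdata, ctx());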
...@@ -23,7 +23,7 @@ void CuDNNAffineOp<Context>::RunWithType() { ...@@ -23,7 +23,7 @@ void CuDNNAffineOp<Context>::RunWithType() {
mul_desc, CUDNN_OP_TENSOR_MUL, mul_desc, CUDNN_OP_TENSOR_MUL,
CUDNNType<T>::type, CUDNN_PROPAGATE_NAN)); CUDNNType<T>::type, CUDNN_PROPAGATE_NAN));
CUDNN_CHECK(cudnnOpTensor( CUDNN_CHECK(cudnnOpTensor(
ctx().cudnn_handle(), mul_desc, ctx()->cudnn_handle(), mul_desc,
CUDNNType<T>::one, input_desc, Xdata, CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::one, param_desc, Adata, CUDNNType<T>::one, param_desc, Adata,
CUDNNType<T>::zero, input_desc, Ydata)); CUDNNType<T>::zero, input_desc, Ydata));
...@@ -36,7 +36,7 @@ void CuDNNAffineOp<Context>::RunWithType() { ...@@ -36,7 +36,7 @@ void CuDNNAffineOp<Context>::RunWithType() {
add_desc, CUDNN_OP_TENSOR_ADD, add_desc, CUDNN_OP_TENSOR_ADD,
CUDNNType<T>::type, CUDNN_PROPAGATE_NAN)); CUDNNType<T>::type, CUDNN_PROPAGATE_NAN));
CUDNN_CHECK(cudnnOpTensor( CUDNN_CHECK(cudnnOpTensor(
ctx().cudnn_handle(), add_desc, ctx()->cudnn_handle(), add_desc,
CUDNNType<T>::one, input_desc, Ydata, CUDNNType<T>::one, input_desc, Ydata,
CUDNNType<T>::one, param_desc, Bdata, CUDNNType<T>::one, param_desc, Bdata,
CUDNNType<T>::zero, input_desc, Ydata)); CUDNNType<T>::zero, input_desc, Ydata));
...@@ -48,7 +48,9 @@ void CuDNNAffineOp<Context>::RunOnDevice() { ...@@ -48,7 +48,9 @@ void CuDNNAffineOp<Context>::RunOnDevice() {
Output(0)->ReshapeLike(Input(0)); Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) RunWithType<float>(); if (XIsType(Input(0), float)) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (XIsType(Input(0), float16)) RunWithType<float16>(); else if (XIsType(Input(0), float16)) RunWithType<float16>();
#endif
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" }); else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
} }
...@@ -76,17 +78,17 @@ void CuDNNAffineGradientOp<Context>::RunWithType() { ...@@ -76,17 +78,17 @@ void CuDNNAffineGradientOp<Context>::RunWithType() {
if (Output(1)->name() != "ignore") { if (Output(1)->name() != "ignore") {
Output(1)->ReshapeLike(Input(1)); Output(1)->ReshapeLike(Input(1));
auto* Xdata = Input(0).template data<T, Context>(); auto* Xdata = Input(0).template data<T, Context>();
auto* dAdata = Output(1)->template mutable_data<T, Context>(); auto* dAdata = Output(1)->template mutable_data<T, Context>(ctx());
// eltwise // eltwise
if (Input(0).count() == Input(1).count()) { if (Input(0).count() == Input(1).count()) {
CUDNN_CHECK(cudnnOpTensor( CUDNN_CHECK(cudnnOpTensor(
ctx().cudnn_handle(), mul_desc, ctx()->cudnn_handle(), mul_desc,
CUDNNType<T>::one, input_desc, Xdata, CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::one, input_desc, dYdata, CUDNNType<T>::one, input_desc, dYdata,
CUDNNType<T>::one, param_desc, dAdata)); CUDNNType<T>::one, param_desc, dAdata));
} else { } else {
CUDNN_CHECK(cudnnOpTensor( CUDNN_CHECK(cudnnOpTensor(
ctx().cudnn_handle(), mul_desc, ctx()->cudnn_handle(), mul_desc,
CUDNNType<T>::one, input_desc, Xdata, CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::one, input_desc, dYdata, CUDNNType<T>::one, input_desc, dYdata,
CUDNNType<T>::zero, input_desc, dXdata)); CUDNNType<T>::zero, input_desc, dXdata));
...@@ -97,11 +99,11 @@ void CuDNNAffineGradientOp<Context>::RunWithType() { ...@@ -97,11 +99,11 @@ void CuDNNAffineGradientOp<Context>::RunWithType() {
// db = dy // db = dy
if (Output(2)->name() != "ignore") { if (Output(2)->name() != "ignore") {
Output(2)->ReshapeLike(Input(1)); Output(2)->ReshapeLike(Input(1));
auto* dBdata = Output(2)->template mutable_data<T, Context>(); auto* dBdata = Output(2)->template mutable_data<T, Context>(ctx());
// eltwise // eltwise
if (Input(-1).count() == Input(1).count()) { if (Input(-1).count() == Input(1).count()) {
math::Axpy<T, Context>(Output(2)->count(), math::Axpy<T, Context>(Output(2)->count(),
1.f, dYdata, dBdata, &ctx()); 1.f, dYdata, dBdata, ctx());
} else { } else {
ComputeBiasGradient_v2<T>(dYdata, dBdata); ComputeBiasGradient_v2<T>(dYdata, dBdata);
} }
...@@ -109,7 +111,7 @@ void CuDNNAffineGradientOp<Context>::RunWithType() { ...@@ -109,7 +111,7 @@ void CuDNNAffineGradientOp<Context>::RunWithType() {
// dx = alpha * dy // dx = alpha * dy
CUDNN_CHECK(cudnnOpTensor( CUDNN_CHECK(cudnnOpTensor(
ctx().cudnn_handle(), mul_desc, ctx()->cudnn_handle(), mul_desc,
CUDNNType<T>::one, input_desc, dYdata, CUDNNType<T>::one, input_desc, dYdata,
CUDNNType<T>::one, param_desc, Adata, CUDNNType<T>::one, param_desc, Adata,
CUDNNType<T>::zero, input_desc, dXdata)); CUDNNType<T>::zero, input_desc, dXdata));
...@@ -126,11 +128,11 @@ void CuDNNAffineGradientOp<Context>::ComputeScaleGradient( ...@@ -126,11 +128,11 @@ void CuDNNAffineGradientOp<Context>::ComputeScaleGradient(
CUDNN_REDUCE_TENSOR_NO_INDICES, CUDNN_32BIT_INDICES)); CUDNN_REDUCE_TENSOR_NO_INDICES, CUDNN_32BIT_INDICES));
size_t workspace_size = 0; size_t workspace_size = 0;
CUDNN_CHECK(cudnnGetReductionWorkspaceSize( CUDNN_CHECK(cudnnGetReductionWorkspaceSize(
ctx().cudnn_handle(), reduce_desc, ctx()->cudnn_handle(), reduce_desc,
input_desc, param_desc, &workspace_size)); input_desc, param_desc, &workspace_size));
auto* WSdata = ws()->template caches<Context>({ workspace_size })[0]; auto* WSdata = ws()->template caches<Context>({ workspace_size })[0];
CUDNN_CHECK(cudnnReduceTensor( CUDNN_CHECK(cudnnReduceTensor(
ctx().cudnn_handle(), reduce_desc, ctx()->cudnn_handle(), reduce_desc,
nullptr, 0, WSdata, workspace_size, nullptr, 0, WSdata, workspace_size,
CUDNNType<T>::one, input_desc, dYxX, CUDNNType<T>::one, input_desc, dYxX,
CUDNNType<T>::one, param_desc, dA)); CUDNNType<T>::one, param_desc, dA));
...@@ -145,32 +147,23 @@ void CuDNNAffineGradientOp<Context>::ComputeScaleGradient_v2( ...@@ -145,32 +147,23 @@ void CuDNNAffineGradientOp<Context>::ComputeScaleGradient_v2(
sum_result.Reshape({ outer_dim * scale_dim }); sum_result.Reshape({ outer_dim * scale_dim });
T* SRes_data = nullptr; T* SRes_data = nullptr;
if (inner_dim == 1) SRes_data = dYxX; // reduce inner dimensions
else if (sum_result.count() == 1) { if (inner_dim == 1) {
auto* dAC = Output(1)->template mutable_data<T, CPUContext>(); SRes_data = dYxX;
T result = math::Dot<T, Context>(
inner_dim, dYxX, multiplier, &ctx());
*dAC += result;
} else { } else {
SRes_data = (outer_dim == 1) ? SRes_data = (outer_dim == 1) ?
dA : sum_result.template mutable_data<T, Context>(); dA : sum_result.template mutable_data<T, Context>();
math::Gemv<T, Context>( math::Gemv<T, Context>(
CblasNoTrans, sum_result.count(), inner_dim, CblasNoTrans, sum_result.count(), inner_dim,
1.0, dYxX, multiplier, 1.0, dYxX, multiplier,
SRes_data == dA ? 1.0 : 0.0, SRes_data, &ctx()); SRes_data == dA ? 1.0 : 0.0, SRes_data, ctx());
} }
// reduce outer dimensions
if (outer_dim != 1) { if (outer_dim != 1) {
if (scale_dim == 1) { math::Gemv<T, Context>(
auto* dAC = Output(1)->template mutable_data<T, CPUContext>(); CblasTrans, outer_dim, scale_dim,
T result = math::Dot<T, Context>( 1.0, SRes_data, multiplier,
outer_dim, multiplier, SRes_data, &ctx()); 1.0, dA, ctx());
*dAC += result;
} else {
math::Gemv<T, Context>(
CblasTrans, outer_dim, scale_dim,
1.0, SRes_data, multiplier,
1.0, dA, &ctx());
}
} }
} }
...@@ -185,11 +178,11 @@ void CuDNNAffineGradientOp<Context>::ComputeBiasGradient( ...@@ -185,11 +178,11 @@ void CuDNNAffineGradientOp<Context>::ComputeBiasGradient(
CUDNN_REDUCE_TENSOR_NO_INDICES, CUDNN_32BIT_INDICES)); CUDNN_REDUCE_TENSOR_NO_INDICES, CUDNN_32BIT_INDICES));
size_t workspace_size = 0; size_t workspace_size = 0;
CUDNN_CHECK(cudnnGetReductionWorkspaceSize( CUDNN_CHECK(cudnnGetReductionWorkspaceSize(
ctx().cudnn_handle(), reduce_desc, ctx()->cudnn_handle(), reduce_desc,
input_desc, param_desc, &workspace_size)); input_desc, param_desc, &workspace_size));
auto* WSdata = ws()->template caches<Context>({ workspace_size })[0]; auto* WSdata = ws()->template caches<Context>({ workspace_size })[0];
CUDNN_CHECK(cudnnReduceTensor( CUDNN_CHECK(cudnnReduceTensor(
ctx().cudnn_handle(), reduce_desc, ctx()->cudnn_handle(), reduce_desc,
nullptr, 0, WSdata, workspace_size, nullptr, 0, WSdata, workspace_size,
CUDNNType<T>::one, input_desc, dY, CUDNNType<T>::one, input_desc, dY,
CUDNNType<T>::one, param_desc, dB)); CUDNNType<T>::one, param_desc, dB));
...@@ -205,7 +198,7 @@ void CuDNNAffineGradientOp<Context>::ComputeBiasGradient_v2( ...@@ -205,7 +198,7 @@ void CuDNNAffineGradientOp<Context>::ComputeBiasGradient_v2(
math::Gemv<T, Context>( math::Gemv<T, Context>(
CblasNoTrans, scale_dim, inner_dim, CblasNoTrans, scale_dim, inner_dim,
1.0, dY, multiplier, 1.0, dY, multiplier,
1.0, dB, &ctx()); 1.0, dB, ctx());
dY += dim; dY += dim;
} }
} }
......
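Every cuDNN call in this file now pulls its handle from `ctx()->cudnn_handle()`. For the async design to hold, each handle must be bound to its context's stream, otherwise the cuDNN work would quietly serialize on the legacy stream. The usual binding, sketched with the real cuDNN API (`cudnnCreate`/`cudnnSetStream`); the per-context caching around it is an assumption:

    cudnnHandle_t handle;
    CUDNN_CHECK(cudnnCreate(&handle));
    CUDNN_CHECK(cudnnSetStream(handle, stream));  // bind once per stream
    // cudnnOpTensor / cudnnReduceTensor issued through this handle are
    // now ordered on `stream`, alongside the kernel:: and math:: work.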
...@@ -9,7 +9,7 @@ void DivOp<Context>::EltwiseRunWithType() { ...@@ -9,7 +9,7 @@ void DivOp<Context>::EltwiseRunWithType() {
auto* x1 = Input(0).template data<T, Context>(); auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>(); auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>(); auto* y = Output(0)->template mutable_data<T, Context>();
math::Div<T, Context>(Output(0)->count(), x1, x2, y); math::Div<T, Context>(Output(0)->count(), x1, x2, y, ctx());
} }
template <class Context> template <typename T> template <class Context> template <typename T>
...@@ -18,34 +18,40 @@ void DivOp<Context>::BroadcastRunWithType(int type) { ...@@ -18,34 +18,40 @@ void DivOp<Context>::BroadcastRunWithType(int type) {
auto* x1 = Input(0).template data<T, Context>(); auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>(); auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>(); auto* y = Output(0)->template mutable_data<T, Context>();
auto* c = ws()->template caches<T, Context>({
Output(0)->count() })[0]; if (type == 0) {
x2 = Input(1).template data<T, CPUContext>();
if (type == 0 || type == 1) { float inverse_x2 = 1.f / dragon_cast<float, T>(x2[0]);
if (type == 0) { ctx()->template Copy<T, Context, Context>(
outer_dim = Input(0).count(); Output(0)->count(), y, x1);
inner_dim = 1; math::MulScalar<T, Context>(
} else { Output(0)->count(), inverse_x2, y, ctx());
outer_dim = Input(0).count(0, Input(0).axis(-1)); } else if (type == 1) {
inner_dim = Input(0).dim(-1); outer_dim = Input(0).count(0, Input(0).axis(-1));
} inner_dim = Input(0).dim(-1);
DECLARE_MULTIPLIER(multiplier, outer_dim); DECLARE_MULTIPLIER(multiplier, outer_dim);
auto* c = ws()->template caches<T, Context>(
{ Output(0)->count() })[0];
math::Gemm<T, Context>( math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1, outer_dim, inner_dim, 1,
1.0, multiplier, x2, 1.0, multiplier, x2,
0.0, c, &ctx()); 0.0, c, ctx());
math::Div<T, Context>(Output(0)->count(), x1, c, y); math::Div<T, Context>(
Output(0)->count(), x1, c, y, ctx());
} else if (type == 2) { } else if (type == 2) {
outer_dim = Input(0).dim(0); outer_dim = Input(0).dim(0);
inner_dim = Input(0).count(1); inner_dim = Input(0).count(1);
DECLARE_MULTIPLIER(multiplier, inner_dim); DECLARE_MULTIPLIER(multiplier, inner_dim);
auto* c = ws()->template caches<T, Context>(
{ Output(0)->count() })[0];
math::Gemm<T, Context>( math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1, outer_dim, inner_dim, 1,
1.0, x2, multiplier, 1.0, x2, multiplier,
0.0, c, &ctx()); 0.0, c, ctx());
math::Div<T, Context>(Output(0)->count(), x1, c, y); math::Div<T, Context>(
Output(0)->count(), x1, c, y, ctx());
} }
} }
...@@ -82,16 +88,16 @@ void DivGradientOp<Context>::EltwiseRunWithType() { ...@@ -82,16 +88,16 @@ void DivGradientOp<Context>::EltwiseRunWithType() {
auto* x2 = Input(1).template data<T, Context>(); auto* x2 = Input(1).template data<T, Context>();
auto* dx2 = Output(1)->template mutable_data<T, Context>(); auto* dx2 = Output(1)->template mutable_data<T, Context>();
auto* c = ws()->template caches<T, Context>({ X1->count() })[0]; auto* c = ws()->template caches<T, Context>({ X1->count() })[0];
math::Mul<T, Context>(X1->count(), dy, x1, c); // dY * X1 math::Mul<T, Context>(X1->count(), dy, x1, c, ctx()); // dY * X1
math::Square<T, Context>(X2->count(), x2, dx2); // X2^{2} math::Square<T, Context>(X2->count(), x2, dx2, ctx()); // X2^{2}
math::Inv<T, Context>(X2->count(), -1, dx2, dx2); // -1 / X2^{2} math::Inv<T, Context>(X2->count(), -1, dx2, dx2, ctx()); // -1 / X2^{2}
math::Mul<T, Context>(X2->count(), c, dx2, dx2); math::Mul<T, Context>(X2->count(), c, dx2, dx2, ctx());
} }
if (Output(0)->name() != "ignore") { if (Output(0)->name() != "ignore") {
auto* x2 = Input(1).template data<T, Context>(); auto* x2 = Input(1).template data<T, Context>();
auto* dx1 = Output(0)->template mutable_data<T, Context>(); auto* dx1 = Output(0)->template mutable_data<T, Context>();
math::Div<T, Context>(X1->count(), dy, x2, dx1); math::Div<T, Context>(X1->count(), dy, x2, dx1, ctx());
} }
} }
...@@ -118,23 +124,23 @@ void DivGradientOp<Context>::BroadcastRunWithType(int type) { ...@@ -118,23 +124,23 @@ void DivGradientOp<Context>::BroadcastRunWithType(int type) {
auto* dx2 = Output(1)->template mutable_data<T, Context>(); auto* dx2 = Output(1)->template mutable_data<T, Context>();
auto cs = ws()->template caches<T, Context>( auto cs = ws()->template caches<T, Context>(
{ X1->count(), X2->count() }); { X1->count(), X2->count() });
math::Mul<T, Context>(X1->count(), dy, x1, cs[0]); // dY * X1 math::Mul<T, Context>(X1->count(), dy, x1, cs[0], ctx()); // dY * X1
math::Square<T, Context>(X2->count(), x2, dx2); // X2^{2} math::Square<T, Context>(X2->count(), x2, dx2, ctx()); // X2^{2}
math::Inv<T, Context>(X2->count(), -1.0, dx2, dx2); // -1 / X2^{2} math::Inv<T, Context>(X2->count(), -1, dx2, dx2, ctx()); // -1 / X2^{2}
if (type == 0 || type == 1) { if (type == 0 || type == 1) {
DECLARE_MULTIPLIER(multiplier, outer_dim); DECLARE_MULTIPLIER(multiplier, outer_dim);
math::Gemv<T, Context>( math::Gemv<T, Context>(
CblasTrans, outer_dim, inner_dim, CblasTrans, outer_dim, inner_dim,
1.0, cs[0], multiplier, 1.0, cs[0], multiplier,
0.0, cs[1], &ctx()); 0.0, cs[1], ctx());
} else if (type == 2) { } else if (type == 2) {
DECLARE_MULTIPLIER(multiplier, inner_dim); DECLARE_MULTIPLIER(multiplier, inner_dim);
math::Gemv<T, Context>( math::Gemv<T, Context>(
CblasNoTrans, outer_dim, inner_dim, CblasNoTrans, outer_dim, inner_dim,
1.0, cs[0], multiplier, 1.0, cs[0], multiplier,
0.0, cs[1], &ctx()); 0.0, cs[1], ctx());
} }
math::Mul<T, Context>(X2->count(), cs[1], dx2, dx2); math::Mul<T, Context>(X2->count(), cs[1], dx2, dx2, ctx());
} }
if (Output(0)->name() != "ignore") { if (Output(0)->name() != "ignore") {
...@@ -146,16 +152,16 @@ void DivGradientOp<Context>::BroadcastRunWithType(int type) { ...@@ -146,16 +152,16 @@ void DivGradientOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1, outer_dim, inner_dim, 1,
1.0, multiplier, x2, 1.0, multiplier, x2,
0.0, dx1, &ctx()); 0.0, dx1, ctx());
} else if (type == 2) { } else if (type == 2) {
DECLARE_MULTIPLIER(multiplier, inner_dim); DECLARE_MULTIPLIER(multiplier, inner_dim);
math::Gemm<T, Context>( math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1, outer_dim, inner_dim, 1,
1.0, x2, multiplier, 1.0, x2, multiplier,
0.0, dx1, &ctx()); 0.0, dx1, ctx());
} }
math::Div<T, Context>(X1->count(), dy, dx1, dx1); math::Div<T, Context>(X1->count(), dy, dx1, dx1, ctx());
} }
} }
......
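The scalar branch of Div mirrors the Add/Mul fast paths but goes one step further: it forms the reciprocal once on the host, trading an elementwise device division for a single `MulScalar`. Note the reciprocal is computed in `float` even when `T` is wider, which reads as an accepted precision trade-off:

    // One host division replaces count device divisions:
    float inverse_x2 = 1.f / dragon_cast<float, T>(x2[0]);
    ctx()->template Copy<T, Context, Context>(Output(0)->count(), y, x1);
    math::MulScalar<T, Context>(Output(0)->count(), inverse_x2, y, ctx());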
...@@ -7,9 +7,13 @@ template <class Context> template <typename T> ...@@ -7,9 +7,13 @@ template <class Context> template <typename T>
void DotOp<Context>::DotRunWithType() { void DotOp<Context>::DotRunWithType() {
auto* X1data = Input(0).template data<T, Context>(); auto* X1data = Input(0).template data<T, Context>();
auto* X2data = Input(1).template data<T, Context>(); auto* X2data = Input(1).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, CPUContext>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
Ydata[0] = math::Dot<T, Context>(
Input(0).count(), X1data, X2data, &ctx()); T result_host;
math::Dot<T, Context>(Input(0).count(),
X1data, X2data, &result_host, ctx());
ctx()->template Copy<T, Context, CPUContext>(
1, Ydata, &result_host);
} }
template <class Context> template <typename T> template <class Context> template <typename T>
...@@ -22,7 +26,7 @@ void DotOp<Context>::GemmRunWithType() { ...@@ -22,7 +26,7 @@ void DotOp<Context>::GemmRunWithType() {
TransB ? CblasTrans : CblasNoTrans, TransB ? CblasTrans : CblasNoTrans,
M, N1, K1, M, N1, K1,
1.0, X1data, X2data, 1.0, X1data, X2data,
0.0, Ydata, &ctx()); 0.0, Ydata, ctx());
} }
template <class Context> template <typename T> template <class Context> template <typename T>
...@@ -33,7 +37,7 @@ void DotOp<Context>::GemvRunWithType() { ...@@ -33,7 +37,7 @@ void DotOp<Context>::GemvRunWithType() {
math::Gemv<T, Context>( math::Gemv<T, Context>(
TransA ? CblasTrans : CblasNoTrans, M, N1, TransA ? CblasTrans : CblasNoTrans, M, N1,
1.0, X1data, X2data, 1.0, X1data, X2data,
0.0, Ydata, &ctx()); 0.0, Ydata, ctx());
} }
template <class Context> template <class Context>
...@@ -98,12 +102,14 @@ void DotGradientOp<Context>::DotRunWithType() { ...@@ -98,12 +102,14 @@ void DotGradientOp<Context>::DotRunWithType() {
auto* dYdata = Input(2).template data<T, CPUContext>(); auto* dYdata = Input(2).template data<T, CPUContext>();
auto* dX1data = Output(0)->template mutable_data<T, Context>(); auto* dX1data = Output(0)->template mutable_data<T, Context>();
auto* dX2data = Output(1)->template mutable_data<T, Context>(); auto* dX2data = Output(1)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>( ctx()->template Copy<T, Context, Context>(
Output(0)->count(), dX1data, X2data); Output(0)->count(), dX1data, X2data);
ctx().template Copy<T, Context, Context>( ctx()->template Copy<T, Context, Context>(
Output(1)->count(), dX2data, X1data); Output(1)->count(), dX2data, X1data);
math::MulScalar<T, Context>(Output(0)->count(), dYdata[0], dX1data); math::MulScalar<T, Context>(
math::MulScalar<T, Context>(Output(1)->count(), dYdata[0], dX2data); Output(0)->count(), dYdata[0], dX1data, ctx());
math::MulScalar<T, Context>(
Output(1)->count(), dYdata[0], dX2data, ctx());
} }
template <class Context> template <typename T> template <class Context> template <typename T>
...@@ -118,13 +124,13 @@ void DotGradientOp<Context>::GemmRunWithType() { ...@@ -118,13 +124,13 @@ void DotGradientOp<Context>::GemmRunWithType() {
TransB ? CblasNoTrans : CblasTrans, TransB ? CblasNoTrans : CblasTrans,
M, K1, N1, M, K1, N1,
1.0, dYdata, X2data, 1.0, dYdata, X2data,
0.0, dX1data, &ctx()); 0.0, dX1data, ctx());
math::Gemm<T, Context>( math::Gemm<T, Context>(
TransA ? CblasNoTrans : CblasTrans, TransA ? CblasNoTrans : CblasTrans,
CblasNoTrans, CblasNoTrans,
K1, N1, M, K1, N1, M,
1.0, X1data, dYdata, 1.0, X1data, dYdata,
0.0, dX2data, &ctx()); 0.0, dX2data, ctx());
} }
template <class Context> template <typename T> template <class Context> template <typename T>
...@@ -138,11 +144,11 @@ void DotGradientOp<Context>::GemvRunWithType() { ...@@ -138,11 +144,11 @@ void DotGradientOp<Context>::GemvRunWithType() {
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
M, N1, 1, M, N1, 1,
1.0, dYdata, X2data, 1.0, dYdata, X2data,
0.0, dX1data, &ctx()); 0.0, dX1data, ctx());
math::Gemv<T, Context>( math::Gemv<T, Context>(
TransA ? CblasNoTrans : CblasTrans, M, N1, TransA ? CblasNoTrans : CblasTrans, M, N1,
1.0, X1data, dYdata, 1.0, X1data, dYdata,
0.0, dX2data, &ctx()); 0.0, dX2data, ctx());
} }
template <class Context> template <class Context>
......
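DotRunWithType used to write the scalar result straight into the output's CPU view; it now keeps the output on `Context`, receives the reduction through an out-parameter, and copies the value across via the context. This keeps the output's placement uniform with the other ops and gives the context a defined point to synchronize its stream before the host value is read (that synchronization detail is an assumption; only the call shape appears in the diff):

    T result_host;
    math::Dot<T, Context>(Input(0).count(),
        X1data, X2data, &result_host, ctx());   // device-side reduction
    ctx()->template Copy<T, Context, CPUContext>(
        1, Ydata, &result_host);                // stage host -> device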
...@@ -7,10 +7,11 @@ template <class Context> template <typename T> ...@@ -7,10 +7,11 @@ template <class Context> template <typename T>
void EltwiseOp<Context>::SumRunWithType() { void EltwiseOp<Context>::SumRunWithType() {
TIndex count = Output(0)->count(); TIndex count = Output(0)->count();
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Set<T, Context>(count, dragon_cast<T, float>(0), Ydata); math::Set<T, Context>(count,
dragon_cast<T, float>(0), Ydata, ctx());
for (int i = 0; i < InputSize(); ++i) { for (int i = 0; i < InputSize(); ++i) {
math::Axpy<T, Context>(count, coeffs[i], math::Axpy<T, Context>(count, coeffs[i],
Input(i).template data<T, Context>(), Ydata, &ctx()); Input(i).template data<T, Context>(), Ydata, ctx());
} }
} }
...@@ -21,19 +22,24 @@ void EltwiseOp<Context>::ProdRunWithType() { ...@@ -21,19 +22,24 @@ void EltwiseOp<Context>::ProdRunWithType() {
math::Mul<T, Context>(count, math::Mul<T, Context>(count,
Input(0).template data<T, Context>(), Input(0).template data<T, Context>(),
Input(1).template data<T, Context>(), Input(1).template data<T, Context>(),
Ydata); Ydata, ctx());
for (int i = 2; i < InputSize(); i++) { for (int i = 2; i < InputSize(); i++) {
math::Mul<T, Context>(count, math::Mul<T, Context>(count,
Ydata, Ydata,
Input(i).template data<T, Context>(), Input(i).template data<T, Context>(),
Ydata); Ydata, ctx());
} }
} }
template <class Context> template <class Context>
void EltwiseOp<Context>::RunOnDevice() { void EltwiseOp<Context>::RunOnDevice() {
for (int i = 1; i < InputSize(); i++) for (int i = 1; i < InputSize(); i++) {
CHECK(Input(i).dims() == Input(0).dims()); CHECK(Input(i).dims() == Input(0).dims())
<< "\nExcepted Input(" << i << ")'s dims as "
<< Input(0).DimString() << ",\n but got "
<< Input(1).DimString() << ".";
}
Output(0)->ReshapeLike(Input(0)); Output(0)->ReshapeLike(Input(0));
if (operation == "SUM") { if (operation == "SUM") {
...@@ -65,12 +71,12 @@ void EltwiseGradientOp<Context>::SumRunWithType() { ...@@ -65,12 +71,12 @@ void EltwiseGradientOp<Context>::SumRunWithType() {
for (int i = 0; i < OutputSize(); i++) { for (int i = 0; i < OutputSize(); i++) {
if (Output(i)->name() == "ignore") continue; if (Output(i)->name() == "ignore") continue;
auto* dXdata = Output(i)->template mutable_data<T, Context>(); auto* dXdata = Output(i)->template mutable_data<T, Context>();
if (coeffs[i] == float(1)) { if (coeffs[i] == 1.f) {
ctx().template Copy<T, Context, Context>( ctx()->template Copy<T, Context, Context>(
count, dXdata, dYdata); count, dXdata, dYdata);
} else { } else {
math::Scale<T, Context>(count, math::Scale<T, Context>(count,
coeffs[i], dYdata, dXdata, &ctx()); coeffs[i], dYdata, dXdata, ctx());
} }
} }
} }
...@@ -88,11 +94,11 @@ void EltwiseGradientOp<Context>::ProdRunWithType() { ...@@ -88,11 +94,11 @@ void EltwiseGradientOp<Context>::ProdRunWithType() {
if (i == j) continue; if (i == j) continue;
auto* Xdata = Input(j).template data<T, Context>(); auto* Xdata = Input(j).template data<T, Context>();
if (!initialized) { if (!initialized) {
ctx().template Copy<T, Context, Context>(count, dXdata, Xdata); ctx()->template Copy<T, Context, Context>(count, dXdata, Xdata);
initialized = true; initialized = true;
} else math::Mul<T, Context>(count, Xdata, dXdata, dXdata); } else math::Mul<T, Context>(count, Xdata, dXdata, dXdata, ctx());
} }
math::Mul<T, Context>(count, dYdata, dXdata, dXdata); math::Mul<T, Context>(count, dYdata, dXdata, dXdata, ctx());
} }
} }
......
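For reference, the identity behind EltwiseGradient's product branch, which builds each gradient by multiplying together every input except the current one and then the upstream gradient:

    // d/dX_i (X_0 * X_1 * ... * X_{n-1}) = prod over j != i of X_j
    // so, per output i:
    //   dX_i  = prod over j != i of X_j   (accumulated by repeated Mul)
    //   dX_i *= dY                        (final Mul with upstream grad)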
...@@ -8,7 +8,7 @@ template <class Context> template <typename T> ...@@ -8,7 +8,7 @@ template <class Context> template <typename T>
void ExpOp<Context>::RunWithType() { void ExpOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>(); auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Exp<T, Context>(Output(0)->count(), Xdata, Ydata); math::Exp<T, Context>(Output(0)->count(), Xdata, Ydata, ctx());
} }
template <class Context> template <class Context>
...@@ -30,7 +30,8 @@ void ExpGradientOp<Context>::RunWithType() { ...@@ -30,7 +30,8 @@ void ExpGradientOp<Context>::RunWithType() {
auto* Ydata = Input(0).template data<T, Context >(); auto* Ydata = Input(0).template data<T, Context >();
auto* dYdata = Input(-1).template data<T, Context>(); auto* dYdata = Input(-1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>(); auto* dXdata = Output(0)->template mutable_data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), dYdata, Ydata, dXdata); math::Mul<T, Context>(Output(0)->count(),
dYdata, Ydata, dXdata, ctx());
} }
template <class Context> template <class Context>
......
...@@ -12,7 +12,7 @@ void GramMatrixOp<Context>::RunWithType() { ...@@ -12,7 +12,7 @@ void GramMatrixOp<Context>::RunWithType() {
CblasNoTrans, CblasTrans, CblasNoTrans, CblasTrans,
dim, dim, inner_dim, dim, dim, inner_dim,
1.0, Xdata, Xdata, 1.0, Xdata, Xdata,
0.0, Ydata, &ctx()); 0.0, Ydata, ctx());
Xdata += x_offset; Xdata += x_offset;
Ydata += y_offset; Ydata += y_offset;
} }
...@@ -47,7 +47,7 @@ void GramMatrixGradientOp<Context>::RunWithType() { ...@@ -47,7 +47,7 @@ void GramMatrixGradientOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
dim, inner_dim, dim, dim, inner_dim, dim,
2.0, dYdata, Xdata, 2.0, dYdata, Xdata,
0.0, dXdata, &ctx()); 0.0, dXdata, ctx());
dYdata += y_offset; dYdata += y_offset;
dXdata += x_offset; dXdata += x_offset;
} }
......
...@@ -23,7 +23,7 @@ void InnerProductOp<Context>::TransRunWithType() { ...@@ -23,7 +23,7 @@ void InnerProductOp<Context>::TransRunWithType() {
CblasNoTrans, CblasTrans, CblasNoTrans, CblasTrans,
M, num_output, K, M, num_output, K,
1.0, Xdata, Wdata, 1.0, Xdata, Wdata,
0.0, Ydata, &ctx()); 0.0, Ydata, ctx());
if (InputSize() > 2) { if (InputSize() > 2) {
DECLARE_MULTIPLIER(multiplier, M); DECLARE_MULTIPLIER(multiplier, M);
...@@ -32,7 +32,7 @@ void InnerProductOp<Context>::TransRunWithType() { ...@@ -32,7 +32,7 @@ void InnerProductOp<Context>::TransRunWithType() {
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
M, num_output, 1, M, num_output, 1,
1.0, multiplier, Bdata, 1.0, multiplier, Bdata,
1.0, Ydata, &ctx()); 1.0, Ydata, ctx());
} }
} }
...@@ -55,7 +55,7 @@ void InnerProductOp<Context>::NoTransRunWithType() { ...@@ -55,7 +55,7 @@ void InnerProductOp<Context>::NoTransRunWithType() {
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
M, num_output, K, M, num_output, K,
1.0, Xdata, Wdata, 1.0, Xdata, Wdata,
0.0, Ydata, &ctx()); 0.0, Ydata, ctx());
if (InputSize() > 2) { if (InputSize() > 2) {
DECLARE_MULTIPLIER(multiplier, M); DECLARE_MULTIPLIER(multiplier, M);
...@@ -64,7 +64,7 @@ void InnerProductOp<Context>::NoTransRunWithType() { ...@@ -64,7 +64,7 @@ void InnerProductOp<Context>::NoTransRunWithType() {
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
M, num_output, 1, M, num_output, 1,
1.0, multiplier, Bdata, 1.0, multiplier, Bdata,
1.0, Ydata, &ctx()); 1.0, Ydata, ctx());
} }
} }
...@@ -102,30 +102,30 @@ void InnerProductGradientOp<Context>::RunWithType() { ...@@ -102,30 +102,30 @@ void InnerProductGradientOp<Context>::RunWithType() {
if (Output(1)->name() != "ignore") { if (Output(1)->name() != "ignore") {
Output(1)->ReshapeLike(Input(1)); Output(1)->ReshapeLike(Input(1));
auto* dWdata = Output(1)->template mutable_data<T, Context>(); auto* dWdata = Output(1)->template mutable_data<T, Context>(ctx());
if (TransW) { if (TransW) {
math::Gemm<T, Context>( math::Gemm<T, Context>(
CblasTrans, CblasNoTrans, CblasTrans, CblasNoTrans,
num_output, K, M, num_output, K, M,
1.0, dYdata, Xdata, 1.0, dYdata, Xdata,
1.0, dWdata, &ctx()); 1.0, dWdata, ctx());
} else { } else {
math::Gemm<T, Context>( math::Gemm<T, Context>(
CblasTrans, CblasNoTrans, CblasTrans, CblasNoTrans,
K, num_output, M, K, num_output, M,
1.0, Xdata, dYdata, 1.0, Xdata, dYdata,
1.0, dWdata, &ctx()); 1.0, dWdata, ctx());
} }
} }
if (Output(2)->name() != "ignore") { if (Output(2)->name() != "ignore") {
DECLARE_MULTIPLIER(multiplier, M); DECLARE_MULTIPLIER(multiplier, M);
Output(2)->Reshape({ num_output }); Output(2)->Reshape({ num_output });
auto* dBdata = Output(2)->template mutable_data<T, Context>(); auto* dBdata = Output(2)->template mutable_data<T, Context>(ctx());
math::Gemv<T, Context>( math::Gemv<T, Context>(
CblasTrans, M, num_output, CblasTrans, M, num_output,
1.0, dYdata, multiplier, 1.0, dYdata, multiplier,
1.0, dBdata, &ctx()); 1.0, dBdata, ctx());
} }
if (Output(0)->name() != "ignore") { if (Output(0)->name() != "ignore") {
...@@ -136,13 +136,13 @@ void InnerProductGradientOp<Context>::RunWithType() { ...@@ -136,13 +136,13 @@ void InnerProductGradientOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
M, K, num_output, M, K, num_output,
1.0, dYdata, Wdata, 1.0, dYdata, Wdata,
0.0, dXdata, &ctx()); 0.0, dXdata, ctx());
} else { } else {
math::Gemm<T, Context>( math::Gemm<T, Context>(
CblasNoTrans, CblasTrans, CblasNoTrans, CblasTrans,
M, K, num_output, M, K, num_output,
1.0, dYdata, Wdata, 1.0, dYdata, Wdata,
0.0, dXdata, &ctx()); 0.0, dXdata, ctx());
} }
} }
} }
......
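The weight and bias gradients above use the same assumed zero-filling `mutable_data<T, Context>(ctx())` accessor noted for Affine, paired with beta = 1.0 so the Gemm/Gemv accumulate. The shapes in the TransW branch, for reference:

    //   X : [M, K]      dY : [M, num_output]      W : [num_output, K]
    //   dW = dY^T * X   -> Gemm(CblasTrans, CblasNoTrans, num_output, K, M)
    //   dB = dY^T * 1_M -> Gemv(CblasTrans, M, num_output)
    // beta = 1.0 in both calls, i.e. "+=" into the zero-filled buffers.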
...@@ -7,7 +7,7 @@ template <class Context> template <typename T> ...@@ -7,7 +7,7 @@ template <class Context> template <typename T>
void LogOp<Context>::RunWithType() { void LogOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>(); auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Log<T, Context>(Output(0)->count(), Xdata, Ydata); math::Log<T, Context>(Output(0)->count(), Xdata, Ydata, ctx());
} }
template <class Context> template <class Context>
...@@ -29,7 +29,7 @@ void LogGradientOp<Context>::RunWithType() { ...@@ -29,7 +29,7 @@ void LogGradientOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>(); auto* Xdata = Input(0).template data<T, Context>();
auto* dYdata = Input(-1).template data<T, Context>(); auto* dYdata = Input(-1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>(); auto* dXdata = Output(0)->template mutable_data<T, Context>();
math::Div<T, Context>(Output(0)->count(), dYdata, Xdata, dXdata); math::Div<T, Context>(Output(0)->count(), dYdata, Xdata, dXdata, ctx());
} }
template <class Context> template <class Context>
......
...@@ -16,7 +16,7 @@ void MatmulOp<Context>::RunWithType() { ...@@ -16,7 +16,7 @@ void MatmulOp<Context>::RunWithType() {
TransB ? CblasTrans : CblasNoTrans, TransB ? CblasTrans : CblasNoTrans,
M, N, K1, M, N, K1,
1.0, X1data, X2data, 1.0, X1data, X2data,
0.0, Ydata, &ctx()); 0.0, Ydata, ctx());
X1data += x1_offset; X1data += x1_offset;
X2data += x2_offset; X2data += x2_offset;
Ydata += y_offset; Ydata += y_offset;
...@@ -76,13 +76,13 @@ void MatmulGradientOp<Context>::RunWithType() { ...@@ -76,13 +76,13 @@ void MatmulGradientOp<Context>::RunWithType() {
TransB ? CblasNoTrans : CblasTrans, TransB ? CblasNoTrans : CblasTrans,
M, K1, N, M, K1, N,
1.0, dYdata, X2data, 1.0, dYdata, X2data,
0.0, dX1data, &ctx()); 0.0, dX1data, ctx());
math::Gemm<T, Context>( math::Gemm<T, Context>(
TransA ? CblasNoTrans : CblasTrans, TransA ? CblasNoTrans : CblasTrans,
CblasNoTrans, CblasNoTrans,
K1, N, M, K1, N, M,
1.0, X1data, dYdata, 1.0, X1data, dYdata,
0.0, dX2data, &ctx()); 0.0, dX2data, ctx());
X1data += x1_offset; X1data += x1_offset;
X2data += x2_offset; X2data += x2_offset;
dX1data += x1_offset; dX1data += x1_offset;
......
...@@ -9,7 +9,7 @@ void MulOp<Context>::EltwiseRunWithType() { ...@@ -9,7 +9,7 @@ void MulOp<Context>::EltwiseRunWithType() {
auto* x1 = Input(0).template data<T, Context>(); auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>(); auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>(); auto* y = Output(0)->template mutable_data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), x1, x2, y); math::Mul<T, Context>(Output(0)->count(), x1, x2, y, ctx());
} }
template <class Context> template <typename T> template <class Context> template <typename T>
...@@ -18,34 +18,39 @@ void MulOp<Context>::BroadcastRunWithType(int type) { ...@@ -18,34 +18,39 @@ void MulOp<Context>::BroadcastRunWithType(int type) {
auto* x1 = Input(0).template data<T, Context>(); auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>(); auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>(); auto* y = Output(0)->template mutable_data<T, Context>();
auto* c = ws()->template caches<T, Context>({
Output(0)->count() })[0]; if (type == 0) {
x2 = Input(1).template data<T, CPUContext>();
if (type == 0 || type == 1) { ctx()->template Copy<T, Context, Context>(
if (type == 0) { Output(0)->count(), y, x1);
outer_dim = Input(0).count(); math::MulScalar<T, Context>(Output(0)->count(),
inner_dim = 1; dragon_cast<float, T>(x2[0]), y, ctx());
} else { } else if (type == 1) {
outer_dim = Input(0).count(0, Input(0).axis(-1)); outer_dim = Input(0).count(0, Input(0).axis(-1));
inner_dim = Input(0).dim(-1); inner_dim = Input(0).dim(-1);
}
DECLARE_MULTIPLIER(multiplier, outer_dim); DECLARE_MULTIPLIER(multiplier, outer_dim);
auto* c = ws()->template caches<T, Context>(
{ Output(0)->count() })[0];
math::Gemm<T, Context>( math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1, outer_dim, inner_dim, 1,
1.0, multiplier, x2, 1.0, multiplier, x2,
0.0, c, &ctx()); 0.0, c, ctx());
math::Mul<T, Context>(Output(0)->count(), x1, c, y); math::Mul<T, Context>(
Output(0)->count(), x1, c, y, ctx());
} else if (type == 2) { } else if (type == 2) {
outer_dim = Input(0).dim(0); outer_dim = Input(0).dim(0);
inner_dim = Input(0).count(1); inner_dim = Input(0).count(1);
DECLARE_MULTIPLIER(multiplier, inner_dim); DECLARE_MULTIPLIER(multiplier, inner_dim);
auto* c = ws()->template caches<T, Context>(
{ Output(0)->count() })[0];
math::Gemm<T, Context>( math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1, outer_dim, inner_dim, 1,
1.0, x2, multiplier, 1.0, x2, multiplier,
0.0, c, &ctx()); 0.0, c, ctx());
math::Mul<T, Context>(Output(0)->count(), x1, c, y); math::Mul<T, Context>(
Output(0)->count(), x1, c, y, ctx());
} }
} }
...@@ -79,13 +84,13 @@ void MulGradientOp<Context>::EltwiseRunWithType() { ...@@ -79,13 +84,13 @@ void MulGradientOp<Context>::EltwiseRunWithType() {
if (Output(1)->name() != "ignore") { if (Output(1)->name() != "ignore") {
auto* x1 = Input(0).template data<T, Context>(); auto* x1 = Input(0).template data<T, Context>();
auto* dx2 = Output(1)->template mutable_data<T, Context>(); auto* dx2 = Output(1)->template mutable_data<T, Context>();
math::Mul<T, Context>(Output(1)->count(), dy, x1, dx2); math::Mul<T, Context>(Output(1)->count(), dy, x1, dx2, ctx());
} }
if (Output(0)->name() != "ignore") { if (Output(0)->name() != "ignore") {
auto* x2 = Input(1).template data<T, Context>(); auto* x2 = Input(1).template data<T, Context>();
auto* dx1 = Output(0)->template mutable_data<T, Context>(); auto* dx1 = Output(0)->template mutable_data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), dy, x2, dx1); math::Mul<T, Context>(Output(0)->count(), dy, x2, dx1, ctx());
} }
} }
...@@ -110,19 +115,19 @@ void MulGradientOp<Context>::BroadcastRunWithType(int type) { ...@@ -110,19 +115,19 @@ void MulGradientOp<Context>::BroadcastRunWithType(int type) {
auto* x1 = Input(0).template data<T, Context>(); auto* x1 = Input(0).template data<T, Context>();
auto* dx2 = Output(1)->template mutable_data<T, Context>(); auto* dx2 = Output(1)->template mutable_data<T, Context>();
auto* c = ws()->template caches<T, Context>({ X1->count() })[0]; auto* c = ws()->template caches<T, Context>({ X1->count() })[0];
math::Mul<T, Context>(X1->count(), dy, x1, c); math::Mul<T, Context>(X1->count(), dy, x1, c, ctx());
if (type == 0 || type == 1) { if (type == 0 || type == 1) {
DECLARE_MULTIPLIER(multiplier, outer_dim); DECLARE_MULTIPLIER(multiplier, outer_dim);
math::Gemv<T, Context>( math::Gemv<T, Context>(
CblasTrans, outer_dim, inner_dim, CblasTrans, outer_dim, inner_dim,
1.0, c, multiplier, 1.0, c, multiplier,
0.0, dx2, &ctx()); 0.0, dx2, ctx());
} else if (type == 2) { } else if (type == 2) {
DECLARE_MULTIPLIER(multiplier, inner_dim); DECLARE_MULTIPLIER(multiplier, inner_dim);
math::Gemv<T, Context>( math::Gemv<T, Context>(
CblasNoTrans, outer_dim, inner_dim, CblasNoTrans, outer_dim, inner_dim,
1.0, c, multiplier, 1.0, c, multiplier,
0.0, dx2, &ctx()); 0.0, dx2, ctx());
} }
} }
...@@ -135,16 +140,16 @@ void MulGradientOp<Context>::BroadcastRunWithType(int type) { ...@@ -135,16 +140,16 @@ void MulGradientOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1, outer_dim, inner_dim, 1,
1.0, multiplier, x2, 1.0, multiplier, x2,
0.0, dx1, &ctx()); 0.0, dx1, ctx());
} else if (type == 2) { } else if (type == 2) {
DECLARE_MULTIPLIER(multiplier, inner_dim); DECLARE_MULTIPLIER(multiplier, inner_dim);
math::Gemm<T, Context>( math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans, CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1, outer_dim, inner_dim, 1,
1.0, x2, multiplier, 1.0, x2, multiplier,
0.0, dx1, &ctx()); 0.0, dx1, ctx());
} }
math::Mul<T, Context>(X1->count(), dy, dx1, dx1); math::Mul<T, Context>(X1->count(), dy, dx1, dx1, ctx());
} }
} }
......
...@@ -9,16 +9,17 @@ void PowOp<Context>::RunWithType() { ...@@ -9,16 +9,17 @@ void PowOp<Context>::RunWithType() {
TIndex count = Input(0).count(); TIndex count = Input(0).count();
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
if (power_scale == float(0)) { if (power_scale == 0.f) {
float value = (power == float(0)) ? float(1) : pow(shift, power); float value = (power == 0.f) ? 1.f : pow(shift, power);
math::Set<T, Context>(count, dragon_cast<T, float>(value), Ydata); math::Set<T, Context>(count,
dragon_cast<T, float>(value), Ydata, ctx());
return; return;
} }
auto* Xdata = Input(0).template data<T, Context>(); auto* Xdata = Input(0).template data<T, Context>();
ctx().template Copy<T, Context, Context>(count, Ydata, Xdata); ctx()->template Copy<T, Context, Context>(count, Ydata, Xdata);
if (scale != float(1)) math::Scal<T, Context>(count, scale, Ydata, &ctx()); if (scale != 1.f) math::Scal<T, Context>(count, scale, Ydata, ctx());
if (shift != float(0)) math::AddScalar<T, Context>(count, shift, Ydata); if (shift != 0.f) math::AddScalar<T, Context>(count, shift, Ydata, ctx());
if (power != float(1)) math::Pow<T, Context>(count, power, Ydata, Ydata); if (power != 1.f) math::Pow<T, Context>(count, power, Ydata, Ydata, ctx());
} }
template <class Context> template <class Context>
...@@ -42,35 +43,36 @@ void PowGradientOp<Context>::RunWithType() { ...@@ -42,35 +43,36 @@ void PowGradientOp<Context>::RunWithType() {
auto* dYdata = Input(-1).template data<T, Context>(); auto* dYdata = Input(-1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>(); auto* dXdata = Output(0)->template mutable_data<T, Context>();
if (power_scale == float(0) || power == float(1)) { if (power_scale == 0.f || power == 1.f) {
const T value = dragon_cast<T, float>(power_scale); const T value = dragon_cast<T, float>(power_scale);
math::Set<T, Context>(count, value, dXdata); math::Set<T, Context>(count, value, dXdata, ctx());
} else { } else {
auto* Xdata = Input(0).template data<T, Context>(); auto* Xdata = Input(0).template data<T, Context>();
if (power == float(2)) { if (power == 2.f) {
math::Axpby<T, Context>(count, math::Axpby<T, Context>(count,
power_scale * scale, Xdata, power_scale * scale, Xdata,
0, dXdata, &ctx()); 0, dXdata, ctx());
if (shift != float(0)) if (shift != 0.f)
math::AddScalar<T, Context>(count, power_scale * shift, dXdata); math::AddScalar<T, Context>(count,
} else if (shift == float(0)) { power_scale * shift, dXdata, ctx());
} else if (shift == 0.f) {
auto* Ydata = Input(1).template data<T, Context>(); auto* Ydata = Input(1).template data<T, Context>();
math::Div<T, Context>(count, Ydata, Xdata, dXdata); math::Div<T, Context>(count, Ydata, Xdata, dXdata, ctx());
math::Scal<T, Context>(count, power, dXdata, &ctx()); math::Scal<T, Context>(count, power, dXdata, ctx());
} else { } else {
auto* Ydata = Input(1).template data<T, Context>(); auto* Ydata = Input(1).template data<T, Context>();
ctx().template Copy<T, Context, Context>(count, dXdata, Xdata); ctx()->template Copy<T, Context, Context>(count, dXdata, Xdata);
if (scale != float(1)) if (scale != 1.f)
math::Scal<T, Context>(count, scale, dXdata, &ctx()); math::Scal<T, Context>(count, scale, dXdata, ctx());
if (shift != float(0)) if (shift != 0.f)
math::AddScalar<T, Context>(count, shift, dXdata); math::AddScalar<T, Context>(count, shift, dXdata, ctx());
math::Div<T, Context>(count, Ydata, dXdata, dXdata); math::Div<T, Context>(count, Ydata, dXdata, dXdata, ctx());
if (power_scale != float(1)) if (power_scale != 1.f)
math::Scal<T, Context>(count, power_scale, dXdata, &ctx()); math::Scal<T, Context>(count, power_scale, dXdata, ctx());
} }
} }
if (power_scale != float(0)) if (power_scale != 0.f)
math::Mul<T, Context>(count, dYdata, dXdata, dXdata); math::Mul<T, Context>(count, dYdata, dXdata, dXdata, ctx());
} }
template <class Context> template <class Context>
......
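All of PowGradient's branches fall out of one closed form; writing it down makes the special cases above easy to verify:

    //   y  = (scale * x + shift) ^ power,     power_scale := power * scale
    //   dx = dy * power * scale * (scale * x + shift) ^ (power - 1)
    //      = dy * power_scale * y / (scale * x + shift)
    //
    //   power_scale == 0 or power == 1 -> dx = dy * power_scale (constant)
    //   power == 2  -> dx = dy * (power_scale * scale * x
    //                             + power_scale * shift)
    //   shift == 0  -> y / x = scale * (scale * x) ^ (power - 1),
    //                  so dx = dy * power * (y / x)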
...@@ -9,7 +9,7 @@ void RAddOp<Context>::EltwiseRunWithType() { ...@@ -9,7 +9,7 @@ void RAddOp<Context>::EltwiseRunWithType() {
auto* x1 = Input(0).template data<T, Context>(); auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>(); auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>(); auto* y = Output(0)->template mutable_data<T, Context>();
math::Add<T, Context>(Output(0)->count(), x1, x2, y); math::Add<T, Context>(Output(0)->count(), x1, x2, y, ctx());
} }
template <class Context> template <typename T> template <class Context> template <typename T>
...@@ -19,23 +19,24 @@ void RAddOp<Context>::BroadcastRunWithType(int type) { ...@@ -19,23 +19,24 @@ void RAddOp<Context>::BroadcastRunWithType(int type) {
auto* x2 = Input(1).template data<T, Context>(); auto* x2 = Input(1).template data<T, Context>();
     auto* y = Output(0)->template mutable_data<T, Context>();
-    ctx().template Copy<T, Context, Context>(
+    ctx()->template Copy<T, Context, Context>(
         Output(0)->count(), y, x2);
     if (type == 0 || type == 1) {
         if (type == 0) {
-            outer_dim = Input(1).count();
-            inner_dim = 1;
+            x1 = Input(0).template data<T, CPUContext>();
+            math::AddScalar<T, Context>(Output(0)->count(),
+                dragon_cast<float, T>(x1[0]), y, ctx());
         } else {
             outer_dim = Input(1).count(0, Input(1).axis(-1));
             inner_dim = Input(1).dim(-1);
+            DECLARE_MULTIPLIER(multiplier, outer_dim);
+            math::Gemm<T, Context>(
+                CblasNoTrans, CblasNoTrans,
+                outer_dim, inner_dim, 1,
+                1.0, multiplier, x1,
+                1.0, y, ctx());
         }
-        DECLARE_MULTIPLIER(multiplier, outer_dim);
-        math::Gemm<T, Context>(
-            CblasNoTrans, CblasNoTrans,
-            outer_dim, inner_dim, 1,
-            1.0, multiplier, x1,
-            1.0, y, &ctx());
     } else if (type == 2) {
         outer_dim = Input(1).dim(0);
         inner_dim = Input(1).count(1);
@@ -44,7 +45,7 @@ void RAddOp<Context>::BroadcastRunWithType(int type) {
             CblasNoTrans, CblasNoTrans,
             outer_dim, inner_dim, 1,
             1.0, x1, multiplier,
-            1.0, y, &ctx());
+            1.0, y, ctx());
     }
 }
@@ -77,13 +78,13 @@ void RAddGradientOp<Context>::EltwiseRunWithType() {
     if (Output(1)->name() != "ignore") {
         auto* dx2 = Output(1)->template mutable_data<T, Context>();
-        ctx().template Copy<T, Context, Context>(
+        ctx()->template Copy<T, Context, Context>(
             Output(1)->count(), dx2, dy);
     }
     if (Output(0)->name() != "ignore") {
         auto* dx1 = Output(0)->template mutable_data<T, Context>();
-        ctx().template Copy<T, Context, Context>(
+        ctx()->template Copy<T, Context, Context>(
             Output(0)->count(), dx1, dy);
     }
 }
@@ -108,7 +109,7 @@ void RAddGradientOp<Context>::BroadcastRunWithType(int type) {
         math::Gemv<T, Context>(
             CblasTrans, outer_dim, inner_dim,
             1.0, dy, multiplier,
-            0.0, dx1, &ctx());
+            0.0, dx1, ctx());
     } else if (type == 2) {
         outer_dim = X2->dim(0);
         inner_dim = X2->count(1);
@@ -116,13 +117,13 @@ void RAddGradientOp<Context>::BroadcastRunWithType(int type) {
         math::Gemv<T, Context>(
             CblasNoTrans, outer_dim, inner_dim,
             1.0, dy, multiplier,
-            0.0, dx1, &ctx());
+            0.0, dx1, ctx());
         }
     }
     if (Output(1)->name() != "ignore") {
         auto* dx2 = Output(1)->template mutable_data<T, Context>();
-        ctx().template Copy<T, Context, Context>(
+        ctx()->template Copy<T, Context, Context>(
             X2->count(), dx2, dy);
     }
 }
......
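Note on the broadcast paths above: adding a vector to every row (or, for type 2, every column) of a matrix is expressed as a rank-1 GEMM against a constant ones "multiplier", with beta = 1 so the accumulation lands on top of the copied operand. A minimal standalone sketch of the same arithmetic, in plain C++ loops rather than Dragon's math::Gemm:

#include <cstdio>
#include <vector>

// y[outer x inner] += ones[outer x 1] * x1[1 x inner]
// i.e. the (alpha = 1, beta = 1) GEMM used for the row-broadcast case.
void broadcast_add(int outer_dim, int inner_dim,
                   const float* x1, float* y) {
    for (int i = 0; i < outer_dim; ++i)       // every "multiplier" entry is 1
        for (int j = 0; j < inner_dim; ++j)
            y[i * inner_dim + j] += x1[j];    // beta = 1 keeps y's contents

int main() {
    std::vector<float> y(2 * 3, 1.f), x1 = { 10.f, 20.f, 30.f };
    broadcast_add(2, 3, x1.data(), y.data());
    for (float v : y) std::printf("%.0f ", v);  // 11 21 31 11 21 31
}

The type 2 case is the transpose of this picture: x1 is an (outer x 1) column and the ones vector spans inner_dim.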
@@ -9,7 +9,7 @@ void RDivOp<Context>::EltwiseRunWithType() {
     auto* x1 = Input(0).template data<T, Context>();
     auto* x2 = Input(1).template data<T, Context>();
     auto* y = Output(0)->template mutable_data<T, Context>();
-    math::Div<T, Context>(Output(0)->count(), x1, x2, y);
+    math::Div<T, Context>(Output(0)->count(), x1, x2, y, ctx());
 }
 template <class Context> template <typename T>
@@ -34,8 +34,8 @@ void RDivOp<Context>::BroadcastRunWithType(int type) {
         CblasNoTrans, CblasNoTrans,
         outer_dim, inner_dim, 1,
         1.0, multiplier, x1,
-        0.0, c, &ctx());
-    math::Div<T, Context>(Output(0)->count(), c, x2, y);
+        0.0, c, ctx());
+    math::Div<T, Context>(Output(0)->count(), c, x2, y, ctx());
     } else if (type == 2) {
         outer_dim = Input(1).dim(0);
         inner_dim = Input(1).count(1);
@@ -44,8 +44,8 @@ void RDivOp<Context>::BroadcastRunWithType(int type) {
         CblasNoTrans, CblasNoTrans,
         outer_dim, inner_dim, 1,
         1.0, x1, multiplier,
-        0.0, c, &ctx());
-    math::Div<T, Context>(Output(0)->count(), c, x2, y);
+        0.0, c, ctx());
+    math::Div<T, Context>(Output(0)->count(), c, x2, y, ctx());
     }
 }
@@ -82,16 +82,16 @@ void RDivGradientOp<Context>::EltwiseRunWithType() {
     auto* x2 = Input(1).template data<T, Context>();
     auto* dx2 = Output(1)->template mutable_data<T, Context>();
     auto* c = ws()->template caches<T, Context>({ X1->count() })[0];
-    math::Mul<T, Context>(X1->count(), dy, x1, c);  // dY * X1
-    math::Square<T, Context>(X2->count(), x2, dx2);  // X2^{2}
-    math::Inv<T, Context>(X2->count(), -1, dx2, dx2);  // -1 / X2^{2}
-    math::Mul<T, Context>(X2->count(), c, dx2, dx2);
+    math::Mul<T, Context>(X1->count(), dy, x1, c, ctx());  // dY * X1
+    math::Square<T, Context>(X2->count(), x2, dx2, ctx());  // X2^{2}
+    math::Inv<T, Context>(X2->count(), -1, dx2, dx2, ctx());  // -1 / X2^{2}
+    math::Mul<T, Context>(X2->count(), c, dx2, dx2, ctx());
     }
     if (Output(0)->name() != "ignore") {
         auto* x2 = Input(1).template data<T, Context>();
         auto* dx1 = Output(0)->template mutable_data<T, Context>();
-        math::Div<T, Context>(X1->count(), dy, x2, dx1);
+        math::Div<T, Context>(X1->count(), dy, x2, dx1, ctx());
     }
 }
@@ -116,19 +116,19 @@ void RDivGradientOp<Context>::BroadcastRunWithType(int type) {
     auto* x2 = Input(1).template data<T, Context>();
     auto* dx1 = Output(0)->template mutable_data<T, Context>();
     auto* c = ws()->template caches<T, Context>({ X2->count() })[0];
-    math::Div<T, Context>(X2->count(), dy, x2, c);
+    math::Div<T, Context>(X2->count(), dy, x2, c, ctx());
     if (type == 0 || type == 1) {
         DECLARE_MULTIPLIER(multiplier, outer_dim);
         math::Gemv<T, Context>(
             CblasTrans, outer_dim, inner_dim,
             1.0, c, multiplier,
-            0.0, dx1, &ctx());
+            0.0, dx1, ctx());
     } else if (type == 2) {
         DECLARE_MULTIPLIER(multiplier, inner_dim);
         math::Gemv<T, Context>(
             CblasNoTrans, outer_dim, inner_dim,
             1.0, c, multiplier,
-            0.0, dx1, &ctx());
+            0.0, dx1, ctx());
     }
 }
@@ -142,18 +142,18 @@ void RDivGradientOp<Context>::BroadcastRunWithType(int type) {
         CblasNoTrans, CblasNoTrans,
         outer_dim, inner_dim, 1,
         -1.0, multiplier, x1,
-        0.0, dx2, &ctx());
+        0.0, dx2, ctx());
     } else if (type == 2) {
         DECLARE_MULTIPLIER(multiplier, inner_dim);
         math::Gemm<T, Context>(
             CblasNoTrans, CblasNoTrans,
             outer_dim, inner_dim, 1,
             -1.0, x1, multiplier,
-            0.0, dx2, &ctx());
+            0.0, dx2, ctx());
     }
-    math::Mul<T, Context>(X2->count(), dy, dx2, dx2);
-    math::Div<T, Context>(X2->count(), dx2, x2, dx2);
-    math::Div<T, Context>(X2->count(), dx2, x2, dx2);
+    math::Mul<T, Context>(X2->count(), dy, dx2, dx2, ctx());
+    math::Div<T, Context>(X2->count(), dx2, x2, dx2, ctx());
+    math::Div<T, Context>(X2->count(), dx2, x2, dx2, ctx());
 }
 }
......
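The chain in RDivGradientOp's eltwise path is the quotient rule: for y = x1 / x2, dx1 = dY / x2 and dx2 = -dY * x1 / x2^2, which is exactly the Mul/Square/Inv/Mul sequence above. A quick numeric check of the dx2 term (standalone C++, not part of the operator):

#include <cmath>
#include <cstdio>

int main() {
    float x1 = 3.f, x2 = 2.f, dy = 1.f, eps = 1e-3f;
    // analytic: -dY * x1 / x2^2, what the op assembles in its cache buffer
    float analytic = -dy * x1 / (x2 * x2);
    // central finite difference of x1 / x2 w.r.t. x2
    float numeric = ((x1 / (x2 + eps)) - (x1 / (x2 - eps))) / (2.f * eps);
    std::printf("analytic=%.4f numeric=%.4f\n", analytic, numeric);  // both ~ -0.75
}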
@@ -9,7 +9,7 @@ void RMulOp<Context>::EltwiseRunWithType() {
     auto* x1 = Input(0).template data<T, Context>();
     auto* x2 = Input(1).template data<T, Context>();
     auto* y = Output(0)->template mutable_data<T, Context>();
-    math::Mul<T, Context>(Output(0)->count(), x1, x2, y);
+    math::Mul<T, Context>(Output(0)->count(), x1, x2, y, ctx());
 }
 template <class Context> template <typename T>
@@ -18,34 +18,39 @@ void RMulOp<Context>::BroadcastRunWithType(int type) {
     auto* x1 = Input(0).template data<T, Context>();
     auto* x2 = Input(1).template data<T, Context>();
     auto* y = Output(0)->template mutable_data<T, Context>();
-    auto* c = ws()->template caches<T, Context>({
-        Output(0)->count() })[0];
-    if (type == 0 || type == 1) {
-        if (type == 0) {
-            outer_dim = Input(1).count();
-            inner_dim = 1;
-        } else {
-            outer_dim = Input(1).count(0, Input(1).axis(-1));
-            inner_dim = Input(1).dim(-1);
-        }
+    if (type == 0) {
+        x1 = Input(0).template data<T, CPUContext>();
+        ctx()->template Copy<T, Context, Context>(
+            Output(0)->count(), y, x2);
+        math::MulScalar<T, Context>(Output(0)->count(),
+            dragon_cast<float, T>(x1[0]), y, ctx());
+    } else if (type == 1) {
+        outer_dim = Input(1).count(0, Input(1).axis(-1));
+        inner_dim = Input(1).dim(-1);
         DECLARE_MULTIPLIER(multiplier, outer_dim);
+        auto* c = ws()->template caches<T, Context>(
+            { Output(0)->count() })[0];
         math::Gemm<T, Context>(
             CblasNoTrans, CblasNoTrans,
             outer_dim, inner_dim, 1,
             1.0, multiplier, x1,
-            0.0, c, &ctx());
-        math::Mul<T, Context>(Output(0)->count(), c, x2, y);
+            0.0, c, ctx());
+        math::Mul<T, Context>(
+            Output(0)->count(), c, x2, y, ctx());
     } else if (type == 2) {
         outer_dim = Input(1).dim(0);
         inner_dim = Input(1).count(1);
         DECLARE_MULTIPLIER(multiplier, inner_dim);
+        auto* c = ws()->template caches<T, Context>(
+            { Output(0)->count() })[0];
         math::Gemm<T, Context>(
             CblasNoTrans, CblasNoTrans,
             outer_dim, inner_dim, 1,
             1.0, x1, multiplier,
-            0.0, c, &ctx());
-        math::Mul<T, Context>(Output(0)->count(), c, x2, y);
+            0.0, c, ctx());
+        math::Mul<T, Context>(
+            Output(0)->count(), c, x2, y, ctx());
     }
 }
@@ -79,13 +84,13 @@ void RMulGradientOp<Context>::EltwiseRunWithType() {
     if (Output(1)->name() != "ignore") {
         auto* x1 = Input(0).template data<T, Context>();
         auto* dx2 = Output(1)->template mutable_data<T, Context>();
-        math::Mul<T, Context>(Output(1)->count(), dy, x1, dx2);
+        math::Mul<T, Context>(Output(1)->count(), dy, x1, dx2, ctx());
     }
     if (Output(0)->name() != "ignore") {
         auto* x2 = Input(1).template data<T, Context>();
         auto* dx1 = Output(0)->template mutable_data<T, Context>();
-        math::Mul<T, Context>(Output(0)->count(), dy, x2, dx1);
+        math::Mul<T, Context>(Output(0)->count(), dy, x2, dx1, ctx());
     }
 }
@@ -110,19 +115,19 @@ void RMulGradientOp<Context>::BroadcastRunWithType(int type) {
     auto* x2 = Input(1).template data<T, Context>();
     auto* dx1 = Output(0)->template mutable_data<T, Context>();
     auto* c = ws()->template caches<T, Context>({ X2->count() })[0];
-    math::Mul<T, Context>(X2->count(), dy, x2, c);
+    math::Mul<T, Context>(X2->count(), dy, x2, c, ctx());
     if (type == 0 || type == 1) {
         DECLARE_MULTIPLIER(multiplier, outer_dim);
         math::Gemv<T, Context>(
             CblasTrans, outer_dim, inner_dim,
             1.0, c, multiplier,
-            0.0, dx1, &ctx());
+            0.0, dx1, ctx());
     } else if (type == 2) {
         DECLARE_MULTIPLIER(multiplier, inner_dim);
         math::Gemv<T, Context>(
             CblasNoTrans, outer_dim, inner_dim,
             1.0, c, multiplier,
-            0.0, dx1, &ctx());
+            0.0, dx1, ctx());
     }
 }
@@ -135,16 +140,16 @@ void RMulGradientOp<Context>::BroadcastRunWithType(int type) {
         CblasNoTrans, CblasNoTrans,
         outer_dim, inner_dim, 1,
         1.0, multiplier, x1,
-        0.0, dx2, &ctx());
+        0.0, dx2, ctx());
     } else if (type == 2) {
         DECLARE_MULTIPLIER(multiplier, inner_dim);
         math::Gemm<T, Context>(
             CblasNoTrans, CblasNoTrans,
             outer_dim, inner_dim, 1,
             1.0, x1, multiplier,
-            0.0, dx2, &ctx());
+            0.0, dx2, ctx());
     }
-    math::Mul<T, Context>(X2->count(), dy, dx2, dx2);
+    math::Mul<T, Context>(X2->count(), dy, dx2, dx2, ctx());
 }
 }
......
@@ -9,7 +9,7 @@ void RSubOp<Context>::EltwiseRunWithType() {
     auto* x1 = Input(0).template data<T, Context>();
     auto* x2 = Input(1).template data<T, Context>();
     auto* y = Output(0)->template mutable_data<T, Context>();
-    math::Sub<T, Context>(Output(0)->count(), x1, x2, y);
+    math::Sub<T, Context>(Output(0)->count(), x1, x2, y, ctx());
 }
 template <class Context> template <typename T>
@@ -19,7 +19,7 @@ void RSubOp<Context>::BroadcastRunWithType(int type) {
     auto* x2 = Input(1).template data<T, Context>();
     auto* y = Output(0)->template mutable_data<T, Context>();
-    ctx().template Copy<T, Context, Context>(
+    ctx()->template Copy<T, Context, Context>(
         Output(0)->count(), y, x2);
     if (type == 0 || type == 1) {
@@ -35,7 +35,7 @@ void RSubOp<Context>::BroadcastRunWithType(int type) {
         CblasNoTrans, CblasNoTrans,
         outer_dim, inner_dim, 1,
         1.0, multiplier, x1,
-        -1.0, y, &ctx());
+        -1.0, y, ctx());
     } else if (type == 2) {
         outer_dim = Input(1).dim(0);
         inner_dim = Input(1).count(1);
@@ -44,7 +44,7 @@ void RSubOp<Context>::BroadcastRunWithType(int type) {
         CblasNoTrans, CblasNoTrans,
         outer_dim, inner_dim, 1,
         1.0, x1, multiplier,
-        -1.0, y, &ctx());
+        -1.0, y, ctx());
     }
 }
@@ -78,12 +78,12 @@ void RSubGradientOp<Context>::EltwiseRunWithType() {
     if (Output(1)->name() != "ignore") {
         auto* dx2 = Output(1)->template mutable_data<T, Context>();
         math::Scale<T, Context>(
-            Output(1)->count(), -1, dy, dx2, &ctx());
+            Output(1)->count(), -1, dy, dx2, ctx());
     }
     if (Output(0)->name() != "ignore") {
         auto* dx1 = Output(0)->template mutable_data<T, Context>();
-        ctx().template Copy<T, Context, Context>(
+        ctx()->template Copy<T, Context, Context>(
             Output(0)->count(), dx1, dy);
     }
 }
@@ -108,7 +108,7 @@ void RSubGradientOp<Context>::BroadcastRunWithType(int type) {
         math::Gemv<T, Context>(
             CblasTrans, outer_dim, inner_dim,
             1.0, dy, multiplier,
-            0.0, dx1, &ctx());
+            0.0, dx1, ctx());
     } else if (type == 2) {
         outer_dim = X2->dim(0);
         inner_dim = X2->count(1);
@@ -116,14 +116,14 @@ void RSubGradientOp<Context>::BroadcastRunWithType(int type) {
         math::Gemv<T, Context>(
             CblasNoTrans, outer_dim, inner_dim,
             1.0, dy, multiplier,
-            0.0, dx1, &ctx());
+            0.0, dx1, ctx());
         }
     }
     if (Output(1)->name() != "ignore") {
         auto* dx2 = Output(1)->template mutable_data<T, Context>();
         math::Scale<T, Context>(
-            X2->count(), -1, dy, dx2, &ctx());
+            X2->count(), -1, dy, dx2, ctx());
     }
 }
......
@@ -7,7 +7,7 @@ template <class Context> template <typename T>
 void SquareOp<Context>::RunWithType() {
     auto* Xdata = Input(0).template data<T, Context>();
     auto* Ydata = Output(0)->template mutable_data<T, Context>();
-    math::Pow<T, Context>(Output(0)->count(), 2.0, Xdata, Ydata);
+    math::Pow<T, Context>(Output(0)->count(), 2.0, Xdata, Ydata, ctx());
 }
 template <class Context>
@@ -29,8 +29,8 @@ void SquareGradientOp<Context>::RunWithType() {
     auto* Xdata = Input(0).template data<T, Context>();
     auto* dYdata = Input(-1).template data<T, Context>();
     auto* dXdata = Output(0)->template mutable_data<T, Context>();
-    math::Mul<T, Context>(Output(0)->count(), dYdata, Xdata, dXdata);
-    math::Scal<T, Context>(Output(0)->count(), 2.0, dXdata, &ctx());
+    math::Mul<T, Context>(Output(0)->count(), dYdata, Xdata, dXdata, ctx());
+    math::Scal<T, Context>(Output(0)->count(), 2.0, dXdata, ctx());
 }
 template <class Context>
......
@@ -9,7 +9,8 @@ void SubOp<Context>::EltwiseRunWithType() {
     auto* X1data = Input(0).template data<T, Context>();
     auto* X2data = Input(1).template data<T, Context>();
     auto* Ydata = Output(0)->template mutable_data<T, Context>();
-    math::Sub<T, Context>(Output(0)->count(), X1data, X2data, Ydata);
+    math::Sub<T, Context>(Output(0)->count(),
+        X1data, X2data, Ydata, ctx());
 }
 template <class Context> template <typename T>
@@ -19,23 +20,24 @@ void SubOp<Context>::BroadcastRunWithType(int type) {
     auto* x2 = Input(1).template data<T, Context>();
     auto* y = Output(0)->template mutable_data<T, Context>();
-    ctx().template Copy<T, Context, Context>(
+    ctx()->template Copy<T, Context, Context>(
         Output(0)->count(), y, x1);
     if (type == 0 || type == 1) {
         if (type == 0) {
-            outer_dim = Input(0).count();
-            inner_dim = 1;
+            x2 = Input(1).template data<T, CPUContext>();
+            math::AddScalar<T, Context>(Output(0)->count(),
+                -dragon_cast<float, T>(x2[0]), y, ctx());
         } else {
             outer_dim = Input(0).count(0, Input(0).axis(-1));
             inner_dim = Input(0).dim(-1);
+            DECLARE_MULTIPLIER(multiplier, outer_dim);
+            math::Gemm<T, Context>(
+                CblasNoTrans, CblasNoTrans,
+                outer_dim, inner_dim, 1,
+                -1.0, multiplier, x2,
+                1.0, y, ctx());
         }
-        DECLARE_MULTIPLIER(multiplier, outer_dim);
-        math::Gemm<T, Context>(
-            CblasNoTrans, CblasNoTrans,
-            outer_dim, inner_dim, 1,
-            -1.0, multiplier, x2,
-            1.0, y, &ctx());
     }
     else if (type == 2) {
         outer_dim = Input(0).dim(0);
@@ -45,7 +47,7 @@ void SubOp<Context>::BroadcastRunWithType(int type) {
         CblasNoTrans, CblasNoTrans,
         outer_dim, inner_dim, 1,
         -1.0, x2, multiplier,
-        1.0, y, &ctx());
+        1.0, y, ctx());
     }
 }
@@ -79,12 +81,12 @@ void SubGradientOp<Context>::EltwiseRunWithType() {
     if (Output(1)->name() != "ignore") {
         auto* dx2 = Output(1)->template mutable_data<T, Context>();
         math::Scale<T, Context>(Output(1)->count(),
-            -1.0, dy, dx2, &ctx());
+            -1.0, dy, dx2, ctx());
     }
     if (Output(0)->name() != "ignore") {
         auto* dx1 = Output(0)->template mutable_data<T, Context>();
-        ctx().template Copy<T, Context, Context>(
+        ctx()->template Copy<T, Context, Context>(
             Output(0)->count(), dx1, dy);
     }
 }
@@ -109,7 +111,7 @@ void SubGradientOp<Context>::BroadcastRunWithType(int type) {
         math::Gemv<T, Context>(
             CblasTrans, outer_dim, inner_dim,
             -1.0, dy, multiplier,
-            0.0, dx2, &ctx());
+            0.0, dx2, ctx());
     } else if (type == 2) {
         outer_dim = X1->dim(0);
         inner_dim = X1->count(1);
@@ -117,13 +119,13 @@ void SubGradientOp<Context>::BroadcastRunWithType(int type) {
         math::Gemv<T, Context>(
             CblasNoTrans, outer_dim, inner_dim,
             -1.0, dy, multiplier,
-            0.0, dx2, &ctx());
+            0.0, dx2, ctx());
         }
     }
     if (Output(0)->name() != "ignore") {
         auto* dx1 = Output(0)->template mutable_data<T, Context>();
-        ctx().template Copy<T, Context, Context>(
+        ctx()->template Copy<T, Context, Context>(
             X1->count(), dx1, dy);
     }
 }
......
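The type == 0 branches above are a fast path for broadcasting a 1-element tensor: instead of a degenerate GEMM against a ones vector, the scalar is read once on the host (hence the fetch through CPUContext, since x2[0] is dereferenced by host code) and folded into a single AddScalar/MulScalar kernel. A minimal sketch of the same arithmetic (standalone C++, names illustrative):

#include <cstdio>
#include <vector>

// One pass over y, no multiplier buffer and no GEMM dispatch.
void add_scalar(int n, float alpha, float* y) {
    for (int i = 0; i < n; ++i) y[i] += alpha;
}

int main() {
    std::vector<float> y = { 1.f, 2.f, 3.f };   // already holds the copied x1
    float x2_host = 5.f;                        // stands in for x2[0] read via CPUContext
    add_scalar(3, -x2_host, y.data());          // SubOp negates: y = x1 - x2[0]
    for (float v : y) std::printf("%.0f ", v);  // -4 -3 -2
}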
@@ -8,7 +8,8 @@ void CompareOp<Context>::EqualRunWithType() {
     auto* X1data = Input(0).template data<T, Context>();
     auto* X2data = Input(1).template data<T, Context>();
     auto* Ydata = Output(0)->template mutable_data<T, Context>();
-    kernel::Equal<T, Context>(Output(0)->count(), X1data, X2data, Ydata);
+    kernel::Equal<T, Context>(Output(0)->count(),
+        X1data, X2data, Ydata, ctx());
 }
 template <class Context>
......
@@ -7,7 +7,7 @@ void CopyOp<Context>::RunWithType() {
     auto* Xdata = Input(0).template data<T, Context>();
     auto* Ydata = Output(0)->template mutable_data<T, Context>();
-    ctx().template Copy<T, Context, Context>(
+    ctx()->template Copy<T, Context, Context>(
         Output(0)->count(), Ydata, Xdata);
 }
......
@@ -20,10 +20,10 @@ void CTCLossGradientOp<Context>::RunWithType() {
     auto* dXdata = Output(0)->template mutable_data<T, Context>();
     auto* dYdata = Input(-1).template data<T, Context>();
-    T dYdata_host; ctx().template Copy<T, CPUContext, Context>(
+    T dYdata_host; ctx()->template Copy<T, CPUContext, Context>(
         1, &dYdata_host, dYdata);
     math::Scale<T, Context>(Output(0)->count(),
-        dYdata_host, Gdata, dXdata, &ctx());
+        dYdata_host, Gdata, dXdata, ctx());
 }
 template <class Context>
......
@@ -45,7 +45,7 @@ void CuDNNCTCLossOp<Context>::RunWithType() {
     cudnnSetTensorDesc<T>(&grad_desc, Input(0).dims());
     CUDNN_CHECK(cudnnGetCTCLossWorkspaceSize(
-        ctx().cudnn_handle(), prob_desc, grad_desc,
+        ctx()->cudnn_handle(), prob_desc, grad_desc,
         packed_labels.data(), label_lengths.data(),
         input_lengths.data(),
         ctc_algo, ctc_desc, &workspace_size));
@@ -58,7 +58,7 @@ void CuDNNCTCLossOp<Context>::RunWithType() {
     auto* WSdata = (uint8_t*)ws()->template caches<Context>({
         workspace_size })[0];
-    CUDNN_CHECK(cudnnCTCLoss(ctx().cudnn_handle(),
+    CUDNN_CHECK(cudnnCTCLoss(ctx()->cudnn_handle(),
         prob_desc, Pdata, packed_labels.data(),
         label_lengths.data(), input_lengths.data(),
         Ydata, grad_desc, Gdata,
......
@@ -12,11 +12,13 @@ void L1LossOp<Context>::RunWithType() {
     auto* diff_data = diff->template mutable_data<T, Context>();
     auto* Ydata = Output(0)->template mutable_data<T, Context>();
-    math::Sub<T, Context>(Input(0).count(), X0data, X1data, diff_data);
+    math::Sub<T, Context>(Input(0).count(),
+        X0data, X1data, diff_data, ctx());
     if (InputSize() > 2) {
         CHECK_EQ(Input(0).count(), Input(2).count());
         auto* Wdata = Input(2).template data<T, Context>();
-        math::Mul<T, Context>(diff->count(), Wdata, diff_data, diff_data);
+        math::Mul<T, Context>(diff->count(),
+            Wdata, diff_data, diff_data, ctx());
     }
     T normalizer = 1;
@@ -27,11 +29,13 @@ void L1LossOp<Context>::RunWithType() {
     }
     T loss = math::ASum<T, Context>(diff->count(), diff_data);
-    math::Set<T, Context>(1, loss / normalizer, Ydata);
+    math::Set<T, Context>(1, loss / normalizer, Ydata, ctx());
 }
 template <class Context>
 void L1LossOp<Context>::RunOnDevice() {
+    ctx()->set_stream_id(0);  // enforce default stream
     CHECK_EQ(Input(0).count(), Input(1).count());
     Output(0)->Reshape({ 1 });
     diff = ws()->CreateTensor("/mnt/" + anchor() + "/l1_loss/diff");
@@ -51,9 +55,11 @@ template <class Context> template <typename T>
 void L1LossGradientOp<Context>::RunWithType() {
     auto* diff_data = diff->template mutable_data<T, Context>();
     auto* dYdata = Input(-1).template data<T, Context>();
-    T dYdata_host; ctx().template Copy<T, CPUContext, Context>(
+    T dYdata_host; ctx()->template Copy<T, CPUContext, Context>(
         1, &dYdata_host, dYdata);
-    kernel::AbsGrad<T, Context>(diff->count(), diff_data, diff_data);
+    ctx()->FinishDeviceCompution();
+    kernel::AbsGrad<T, Context>(diff->count(),
+        diff_data, diff_data, ctx());
     T alpha = dYdata_host, normalizer = 1;
     if (normalization == "BATCH_SIZE") {
@@ -69,7 +75,7 @@ void L1LossGradientOp<Context>::RunWithType() {
         const T sign = (i == 0) ? 1 : -1;
         alpha *= sign;
         math::Axpby<T, Context>(Output(i)->count(),
-            alpha, diff_data, 0, dXdata, &ctx());
+            alpha, diff_data, 0, dXdata, ctx());
     }
 }
......
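Why the gradient ops now call FinishDeviceCompution() right after copying the scalar dY to the host: once operators run on non-default streams, a device-to-host copy is asynchronous, so the host variable must not be read until the stream has drained. A hedged CUDA sketch of the hazard (illustrative names; Dragon wraps this inside its CUDAContext, and FinishDeviceCompution is the stream-synchronize analogue):

#include <cuda_runtime.h>
#include <cstdio>

int main() {
    float* d_dy; float h_dy = 0.f, one = 1.f;
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    cudaMalloc(&d_dy, sizeof(float));
    cudaMemcpyAsync(d_dy, &one, sizeof(float),
                    cudaMemcpyHostToDevice, stream);
    // Async D2H copy: h_dy is NOT valid yet when this call returns.
    cudaMemcpyAsync(&h_dy, d_dy, sizeof(float),
                    cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);   // the FinishDeviceCompution() step
    std::printf("%.1f\n", h_dy);     // safe to read only after the sync
    cudaFree(d_dy); cudaStreamDestroy(stream);
}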
@@ -9,12 +9,14 @@ void L2LossOp<Context>::RunWithType() {
     auto* X0data = Input(0).template data<T, Context>();
     auto* X1data = Input(1).template data<T, Context>();
     auto* diff_data = diff->template mutable_data<T, Context>();
-    auto* Ydata = Output(0)->template mutable_data<T, Context>();
-    math::Sub<T, Context>(diff->count(), X0data, X1data, diff_data);
+    auto* Ydata = Output(0)->template mutable_data<float, Context>();
+    math::Sub<T, Context>(diff->count(),
+        X0data, X1data, diff_data, ctx());
     if (InputSize() > 2) {
         CHECK_EQ(Input(0).count(), Input(2).count());
         auto* Wdata = Input(2).template data<T, Context>();
-        math::Mul<T, Context>(diff->count(), Wdata, diff_data, diff_data);
+        math::Mul<T, Context>(diff->count(),
+            Wdata, diff_data, diff_data, ctx());
     }
     T normalizer = 1;
@@ -23,10 +25,12 @@ void L2LossOp<Context>::RunWithType() {
     } else if (normalization == "FULL") {
         normalizer = Input(0).count();
     }
+    normalizer *= 2;
-    T loss = T(0.5) * math::Dot<T, Context>(diff->count(),
-        diff_data, diff_data, &ctx());
-    math::Set<T, Context>(1, loss / normalizer, Ydata);
+    T loss;
+    math::Dot<T, Context>(diff->count(),
+        diff_data, diff_data, &loss, ctx());
+    math::Set<T, Context>(1, loss / normalizer, Ydata, ctx());
 }
 template <class Context>
@@ -48,10 +52,11 @@ OPERATOR_SCHEMA(L2Loss).NumInputs(2, 3).NumOutputs(1);
 template <class Context> template <typename T>
 void L2LossGradientOp<Context>::RunWithType() {
-    auto* diff_data = diff->template mutable_data<T, Context>();
+    auto* diff_data = diff->template data<T, Context>();
     auto* dYdata = Input(-1).template data<T, Context>();
-    T dYdata_host; ctx().template Copy<T, CPUContext, Context>(
+    T dYdata_host; ctx()->template Copy<T, CPUContext, Context>(
         1, &dYdata_host, dYdata);
+    ctx()->FinishDeviceCompution();
     T alpha = dYdata_host, normalizer = 1;
     if (normalization == "BATCH_SIZE") {
@@ -67,7 +72,7 @@ void L2LossGradientOp<Context>::RunWithType() {
         const T sign = (i == 0) ? 1 : -1;
         alpha *= sign;
         math::Axpby<T, Context>(Output(i)->count(),
-            alpha, diff_data, 0, dXdata, &ctx());
+            alpha, diff_data, 0, dXdata, ctx());
     }
 }
......
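The L2 loss rewrite drops the T(0.5) prefactor and folds it into the normalizer instead: 0.5 * dot / n == dot / (2 * n), so the reported value is unchanged while Dot can now write its result through an output pointer. A trivial standalone check of the identity:

#include <cstdio>

int main() {
    float dot = 42.f, n = 8.f;
    std::printf("%.6f %.6f\n",
        0.5f * dot / n,      // old: scale the dot product
        dot / (2.f * n));    // new: scale the normalizer; identical value
}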
@@ -13,11 +13,11 @@ void SigmoidCrossEntropyOp<Context>::RunWithType() {
     auto* Fdata = flags.template mutable_data<T, Context>();
     kernel::SigmoidCrossEntropy<T, Context>(
-        Input(0).count(), Xdata, Tdata, Ldata, Fdata, &ctx());
+        Input(0).count(), Xdata, Tdata, Ldata, Fdata, ctx());
     if (normalization == "UNIT") {
         Output(0)->ReshapeLike(losses);
-        Output(0)->template CopyFrom<Context>(losses);
+        Output(0)->template CopyFrom<Context>(losses, ctx());
         return;
     }
@@ -35,11 +35,13 @@ void SigmoidCrossEntropyOp<Context>::RunWithType() {
     T loss = math::ASum<T, Context>(losses.count(), Ldata);
     Output(0)->Reshape({ 1 });
     auto* Ydata = Output(0)->template mutable_data<T, Context>();
-    math::Set<T, Context>(1, loss / normalizer, Ydata);
+    math::Set<T, Context>(1, loss / normalizer, Ydata, ctx());
 }
 template <class Context>
 void SigmoidCrossEntropyOp<Context>::RunOnDevice() {
+    ctx()->set_stream_id(0);  // enforce default stream
     CHECK_EQ(Input(0).count(), Input(1).count())
         << "\nNumber of predictions must match the number of labels.";
     losses.ReshapeLike(Input(0));
@@ -63,12 +65,12 @@ void SigmoidCrossEntropyGradientOp<Context>::RunWithType() {
     auto* Fdata = flags.template mutable_data<T, Context>();
     kernel::SigmoidCrossEntropyGrad<T, Context>(
-        Input(0).count(), Xdata, Tdata, dXdata, Fdata, &ctx());
+        Input(0).count(), Xdata, Tdata, dXdata, Fdata, ctx());
     if (normalization == "UNIT") {
         auto* dYdata = Input(-1).template data<T, Context>();
         math::Mul<T, Context>(Output(0)->count(),
-            dYdata, dXdata, dXdata); return;
+            dYdata, dXdata, dXdata, ctx()); return;
     }
     T normalizer = 1;
@@ -83,14 +85,16 @@ void SigmoidCrossEntropyGradientOp<Context>::RunWithType() {
     }
     auto* dYdata = Input(-1).template data<T, Context>();
-    T dYdata_host; ctx().template Copy<T, CPUContext, Context>(
+    T dYdata_host; ctx()->template Copy<T, CPUContext, Context>(
         1, &dYdata_host, dYdata);
     math::Scal<T, Context>(Output(0)->count(),
-        dYdata_host / normalizer, dXdata, &ctx());
+        dYdata_host / normalizer, dXdata, ctx());
 }
 template <class Context>
 void SigmoidCrossEntropyGradientOp<Context>::RunOnDevice() {
+    ctx()->set_stream_id(0);  // enforce default stream
     Output(0)->ReshapeLike(Input(0));
     flags.ReshapeLike(Input(0));
......
@@ -15,11 +15,11 @@ void SigmoidFocalLossOp<Context>::RunWithType() {
     kernel::SigmoidFocalLoss<T, Context>(
         outer_dim, axis_dim, inner_dim,
         pos_alpha, neg_alpha, gamma, neg_id,
-        Xdata, Tdata, Ldata, Fdata, &ctx());
+        Xdata, Tdata, Ldata, Fdata, ctx());
     if (normalization == "UNIT") {
         Output(0)->ReshapeLike(losses);
-        Output(0)->template CopyFrom<Context>(losses);
+        Output(0)->template CopyFrom<Context>(losses, ctx());
         return;
     }
@@ -37,11 +37,13 @@ void SigmoidFocalLossOp<Context>::RunWithType() {
     T loss = math::ASum<T, Context>(losses.count(), Ldata);
     Output(0)->Reshape({ 1 });
     auto* Ydata = Output(0)->template mutable_data<T, Context>();
-    math::Set<T, Context>(1, loss / normalizer, Ydata);
+    math::Set<T, Context>(1, loss / normalizer, Ydata, ctx());
 }
 template <class Context>
 void SigmoidFocalLossOp<Context>::RunOnDevice() {
+    ctx()->set_stream_id(0);  // enforce default stream
     outer_dim = Input(0).count(0, axis);
     axis_dim = Input(0).dim(axis);
     inner_dim = Input(0).count(axis + 1);
@@ -71,12 +73,12 @@ void SigmoidFocalLossGradientOp<Context>::RunWithType() {
     kernel::SigmoidFocalLossGradient<T, Context>(
         outer_dim, axis_dim, inner_dim,
         pos_alpha, neg_alpha, gamma, neg_id,
-        Xdata, Tdata, dXdata, Fdata, &ctx());
+        Xdata, Tdata, dXdata, Fdata, ctx());
     if (normalization == "UNIT") {
         auto* dYdata = Input(-1).template data<T, Context>();
         math::Mul<T, Context>(Output(0)->count(),
-            dYdata, dXdata, dXdata); return;
+            dYdata, dXdata, dXdata, ctx()); return;
     }
     T normalizer = 1;
@@ -91,14 +93,16 @@ void SigmoidFocalLossGradientOp<Context>::RunWithType() {
     }
     auto* dYdata = Input(-1).template data<T, Context>();
-    T dYdata_host; ctx().template Copy<T, CPUContext, Context>(
+    T dYdata_host; ctx()->template Copy<T, CPUContext, Context>(
         1, &dYdata_host, dYdata);
     math::Scal<T, Context>(Output(0)->count(),
-        dYdata_host / normalizer, dXdata, &ctx());
+        dYdata_host / normalizer, dXdata, ctx());
 }
 template <class Context>
 void SigmoidFocalLossGradientOp<Context>::RunOnDevice() {
+    ctx()->set_stream_id(0);  // enforce default stream
     outer_dim = Input(0).count(0, axis);
     axis_dim = Input(0).dim(axis);
     inner_dim = Input(0).count(axis + 1);
......
@@ -11,20 +11,21 @@ void SmoothL1LossOp<Context>::RunWithType() {
     auto* X1data = Input(1).template data<T, Context>();
     auto* diff_data = diff->template mutable_data<T, Context>();
     auto* error_data = error->template mutable_data<T, Context>();
-    auto* Ydata = Output(0)->template mutable_data<T, Context>();
-    math::Sub<T, Context>(diff->count(), X0data, X1data, diff_data);
+    auto* Ydata = Output(0)->template mutable_data<float, Context>();
+    math::Sub<T, Context>(diff->count(),
+        X0data, X1data, diff_data, ctx());
     if (InputSize() > 2) {
         auto* inside_w_data = Input(2).template data<T, Context>();
         math::Mul<T, Context>(diff->count(),
-            inside_w_data, diff_data, diff_data);
+            inside_w_data, diff_data, diff_data, ctx());
     }
-    kernel::SmoothL1<T, Context>(
-        diff->count(), beta, diff_data, error_data);
+    kernel::SmoothL1<T, Context>(diff->count(),
+        beta, diff_data, error_data, ctx());
     if (InputSize() > 3) {
         auto* outside_w_data = Input(3).template data<T, Context>();
         math::Mul<T, Context>(diff->count(),
-            outside_w_data, error_data, error_data);
+            outside_w_data, error_data, error_data, ctx());
     }
     T normalizer = 1;
@@ -34,12 +35,14 @@ void SmoothL1LossOp<Context>::RunWithType() {
         normalizer = Input(0).count();
     }
-    T loss = math::ASum<T, Context>(error->count(), error_data);
-    math::Set<T, Context>(1, loss / normalizer, Ydata);
+    float loss = math::ASum<float, Context>(error->count(), error_data);
+    math::Set<float, Context>(1, loss / normalizer, Ydata, ctx());
 }
 template <class Context>
 void SmoothL1LossOp<Context>::RunOnDevice() {
+    ctx()->set_stream_id(0);  // enforce default stream
     CHECK(Input(0).dims() == Input(1).dims());
     if (InputSize() > 2) CHECK(Input(0).dims() == Input(2).dims());
     if (InputSize() > 3) CHECK(Input(0).dims() == Input(3).dims());
@@ -64,10 +67,12 @@ template <class Context> template <typename T>
 void SmoothL1LossGradientOp<Context>::RunWithType() {
     auto* diff_data = diff->template mutable_data<T, Context>();
     auto* dYdata = Input(-1).template data<T, Context>();
-    T dYdata_host; ctx().template Copy<T, CPUContext, Context>(
+    T dYdata_host; ctx()->template Copy<T, CPUContext, Context>(
         1, &dYdata_host, dYdata);
-    kernel::SmoothL1Grad<T, Context>(
-        diff->count(), beta, diff_data, diff_data);
+    ctx()->FinishDeviceCompution();
+    kernel::SmoothL1Grad<T, Context>(diff->count(),
+        beta, diff_data, diff_data, ctx());
     T alpha = dYdata_host, normalizer = 1;
     if (normalization == "BATCH_SIZE") {
@@ -83,16 +88,16 @@ void SmoothL1LossGradientOp<Context>::RunWithType() {
         const T sign = (i == 0) ? 1 : -1;
         alpha *= sign;
         math::Axpby<T, Context>(Output(i)->count(),
-            alpha, diff_data, 0, dXdata, &ctx());
+            alpha, diff_data, 0, dXdata, ctx());
         if (InputSize() > 3) {
             auto* inside_w_data = Input(2).template data<T, Context>();
             math::Mul<T, Context>(Output(i)->count(),
-                inside_w_data, dXdata, dXdata);
+                inside_w_data, dXdata, dXdata, ctx());
         }
         if (InputSize() > 4) {
             auto* outside_w_data = Input(3).template data<T, Context>();
             math::Mul<T, Context>(Output(i)->count(),
-                outside_w_data, dXdata, dXdata);
+                outside_w_data, dXdata, dXdata, ctx());
         }
     }
 }
......
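For reference, kernel::SmoothL1 is assumed here to follow the standard Fast R-CNN definition of the Huber-style transition, with beta as the switch point. A minimal elementwise sketch fixing the math (not Dragon's kernel itself):

#include <cmath>
#include <cstdio>

// f(x) = 0.5 * x^2 / beta   if |x| < beta
//      = |x| - 0.5 * beta   otherwise  (quadratic near 0, linear in the tails)
float smooth_l1(float x, float beta) {
    float ax = std::fabs(x);
    return ax < beta ? 0.5f * x * x / beta : ax - 0.5f * beta;
}

int main() {
    std::printf("%.3f %.3f\n",
        smooth_l1(0.5f, 1.f),   // 0.125: inside the quadratic zone
        smooth_l1(2.0f, 1.f));  // 1.500: linear zone, gradient clipped to +/-1
}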
@@ -26,15 +26,15 @@ void SoftmaxCrossEntropyOp<Context>::RunWithType() {
     auto* Pdata = prob->template data<T, Context>();
     auto* Tdata = Input(1).template data<T, Context>();
     auto* Ldata = losses.template mutable_data<T, Context>();
-    kernel::SoftmaxCrossEntropy<T, Context>(
-        Input(0).count(), Pdata, Tdata, Ldata);
+    kernel::SoftmaxCrossEntropy<T, Context>(Input(0).count(),
+        Pdata, Tdata, Ldata, ctx());
     if (normalization == "UNIT") {
         Output(0)->Reshape({ outer_dim * inner_dim });
         auto* Ydata = Output(0)->template mutable_data<T, Context>();
         kernel::Sum<T, Context>(outer_dim * inner_dim,
             Input(0).dim(axis), inner_dim,
-            Ldata, Ydata); return;
+            Ldata, Ydata, ctx()); return;
     }
     T normalizer = 1;
@@ -47,11 +47,13 @@ void SoftmaxCrossEntropyOp<Context>::RunWithType() {
     T loss = math::ASum<T, Context>(losses.count(), Ldata);
     Output(0)->Reshape({ 1 });
     auto* Ydata = Output(0)->template mutable_data<T, Context>();
-    math::Set<T, Context>(1, loss / normalizer, Ydata);
+    math::Set<T, Context>(1, loss / normalizer, Ydata, ctx());
 }
 template <class Context>
 void SoftmaxCrossEntropyOp<Context>::RunOnDevice() {
+    ctx()->set_stream_id(0);  // enforce default stream
     outer_dim = Input(0).count(0, axis);
     inner_dim = Input(0).count(axis + 1);
     CHECK_EQ(Input(0).count(), Input(1).count())
@@ -76,16 +78,16 @@ void SoftmaxCrossEntropyGradientOp<Context>::RunWithType() {
     auto* Tdata = Input(1).template data<T, Context>();
     auto* Pdata = prob->template mutable_data<T, Context>();
     auto* dXdata = Output(0)->template mutable_data<T, Context>();
-    ctx().template Copy<T, Context, Context>(prob->count(), dXdata, Pdata);
+    ctx()->template Copy<T, Context, Context>(prob->count(), dXdata, Pdata);
     math::Axpy<T, Context>(Output(0)->count(),
-        -1.0, Tdata, dXdata, &ctx());
+        -1.0, Tdata, dXdata, ctx());
     if (normalization == "UNIT") {
         auto* dYdata = Input(-1).template data<T, Context>();
         kernel::SumGrad<T, Context>(outer_dim * inner_dim,
-            Input(0).dim(axis), inner_dim, 1.0, dYdata, Pdata);
+            Input(0).dim(axis), inner_dim, 1.0, dYdata, Pdata, ctx());
         math::Mul<T, Context>(Output(0)->count(),
-            Pdata, dXdata, dXdata); return;
+            Pdata, dXdata, dXdata, ctx()); return;
     }
     T normalizer = 1;
@@ -96,10 +98,10 @@ void SoftmaxCrossEntropyGradientOp<Context>::RunWithType() {
     }
     auto* dYdata = Input(-1).template data<T, Context>();
-    T dYdata_host; ctx().template Copy<T, CPUContext, Context>(
+    T dYdata_host; ctx()->template Copy<T, CPUContext, Context>(
         1, &dYdata_host, dYdata);
     math::Scal<T, Context>(Output(0)->count(),
-        dYdata_host / normalizer, dXdata, &ctx());
+        dYdata_host / normalizer, dXdata, ctx());
 }
 template <class Context>
......
@@ -20,11 +20,11 @@ void SoftmaxFocalLossOp<Context>::RunWithType() {
         outer_dim, Input(0).dim(axis), inner_dim,
         pos_alpha, neg_alpha, gamma, neg_id,
         Pdata, Tdata, Idata, this->ignores.count(),
-        Ldata, Fdata, &ctx());
+        Ldata, Fdata, ctx());
     if (normalization == "UNIT") {
         Output(0)->ReshapeLike(losses);
-        Output(0)->template CopyFrom<Context>(losses);
+        Output(0)->template CopyFrom<Context>(losses, ctx());
         return;
     }
@@ -42,11 +42,13 @@ void SoftmaxFocalLossOp<Context>::RunWithType() {
     T loss = math::ASum<T, Context>(losses.count(), Ldata);
     Output(0)->Reshape({ 1 });
     auto* Ydata = Output(0)->template mutable_data<T, Context>();
-    math::Set<T, Context>(1, loss / normalizer, Ydata);
+    math::Set<T, Context>(1, loss / normalizer, Ydata, ctx());
 }
 template <class Context>
 void SoftmaxFocalLossOp<Context>::RunOnDevice() {
+    ctx()->set_stream_id(0);  // enforce default stream
     outer_dim = Input(0).count(0, axis);
     inner_dim = Input(0).count(axis + 1);
     CHECK_EQ(outer_dim * inner_dim, Input(1).count())
@@ -80,16 +82,16 @@ void SoftmaxFocalLossGradientOp<Context>::RunWithType() {
         outer_dim, Output(0)->dim(axis), inner_dim,
         pos_alpha, neg_alpha, gamma, neg_id,
         Pdata, Tdata, Idata, this->ignores.count(),
-        dXdata, Fdata, &ctx());
+        dXdata, Fdata, ctx());
     if (normalization == "UNIT") {
         auto* dYdata = Input(-1).template data<T, Context>();
         kernel::SumGrad<T, Context>(
             Input(0).count() / Input(0).dim(axis),
             Input(0).dim(axis), inner_dim,
-            1.0, dYdata, Pdata);
+            1.0, dYdata, Pdata, ctx());
         math::Mul<T, Context>(Output(0)->count(),
-            Pdata, dXdata, dXdata); return;
+            Pdata, dXdata, dXdata, ctx()); return;
     }
     T normalizer = 1;
@@ -104,14 +106,16 @@ void SoftmaxFocalLossGradientOp<Context>::RunWithType() {
     }
     auto* dYdata = Input(-1).template data<T, Context>();
-    T dYdata_host; ctx().template Copy<T, CPUContext, Context>(
+    T dYdata_host; ctx()->template Copy<T, CPUContext, Context>(
         1, &dYdata_host, dYdata);
     math::Scal<T, Context>(Output(0)->count(),
-        dYdata_host / normalizer, dXdata, &ctx());
+        dYdata_host / normalizer, dXdata, ctx());
 }
 template <class Context>
 void SoftmaxFocalLossGradientOp<Context>::RunOnDevice() {
+    ctx()->set_stream_id(0);  // enforce default stream
     this->prob = ws()->GetTensor("/mnt/" + anchor() + "/softmax/prob");
     outer_dim = this->prob->count(0, axis);
     inner_dim = this->prob->count(axis + 1);
......
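The focal-loss kernels above are assumed to follow the standard formulation FL(p_t) = -alpha * (1 - p_t)^gamma * log(p_t), with pos_alpha / neg_alpha selecting alpha by whether the target is a positive class (neg_id marking the negative one). A sketch for a single probability, only to fix the formula:

#include <cmath>
#include <cstdio>

float focal_loss(float p_t, float alpha, float gamma) {
    // (1 - p_t)^gamma down-weights well-classified examples
    return -alpha * std::pow(1.f - p_t, gamma) * std::log(p_t);
}

int main() {
    // easy example (p_t = 0.9) vs hard example (p_t = 0.1), gamma = 2:
    // the hard example dominates the loss by orders of magnitude
    std::printf("%.4f %.4f\n",
        focal_loss(0.9f, 0.25f, 2.f),
        focal_loss(0.1f, 0.25f, 2.f));
}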
...@@ -21,83 +21,66 @@ void SparseSoftmaxCrossEntropyOp<Context>::SoftmaxRun() { ...@@ -21,83 +21,66 @@ void SparseSoftmaxCrossEntropyOp<Context>::SoftmaxRun() {
softmax_op->Run(); softmax_op->Run();
} }
template <class Context>
void SparseSoftmaxCrossEntropyOp<Context>::SoftmaxRunFP16() {
Tensor* XF32 = ws()->CreateTensor(
"/mnt/" + anchor() + "/softmax/xf32");
XF32->ReshapeLike(Input(0));
auto* XdataF16 = Input(0).template data<float16, Context>();
auto* XdataF32 = XF32->template mutable_data<float, Context>();
kernel::TypeA2B<float16, float, Context>(
Input(0).count(), XdataF16, XdataF32);
OperatorDef softmax_def = MakeOperatorDef("Softmax", "",
vector<string>({ XF32->name() }),
vector<string>({ "/mnt/" + anchor() + "/softmax/prob" }));
softmax_def.add_arg()->CopyFrom(this->arg("axis"));
if (def().has_device_option())
softmax_def.mutable_device_option()
->CopyFrom(def().device_option());
if (!softmax_op) softmax_op.reset(
CreateOperator(softmax_def, ws()));
else softmax_op->MutableOp(softmax_def);
softmax_op->Run();
}
template <class Context> template <typename Tx, typename Ty> template <class Context> template <typename Tx, typename Ty>
void SparseSoftmaxCrossEntropyOp<Context>::RunWithType() { void SparseSoftmaxCrossEntropyOp<Context>::RunWithType() {
auto* Pdata = prob->template data<Tx, Context>(); auto* Pdata = prob->template data<Tx, Context>();
auto* Tdata = Input(1).template data<Ty, Context>(); auto* Tdata = Input(1).template data<Ty, Context>();
auto* Idata = !ignores.count() ? nullptr : auto* Idata = !ignores.count() ? nullptr :
ignores.template data<int, Context>(); ignores.template data<int, Context>();
auto* Ldata = losses.template mutable_data<Tx, Context>(); auto* Ldata = losses.template mutable_data<float, Context>();
auto* Fdata = flags.template mutable_data<Tx, Context>(); auto* Fdata = flags.template mutable_data<float, Context>();
kernel::SparseSoftmaxCrossEntropy<Tx, Ty, Context>( kernel::SparseSoftmaxCrossEntropy<Tx, Ty, Context>(
outer_dim, Input(0).dim(axis), inner_dim, outer_dim, Input(0).dim(axis), inner_dim,
Pdata, Tdata, Idata, ignores.count(), Pdata, Tdata, Idata, ignores.count(),
Ldata, Fdata, &ctx()); Ldata, Fdata, ctx());
if (normalization == "UNIT") { if (normalization == "UNIT") {
Output(0)->ReshapeLike(losses); Output(0)->ReshapeLike(losses);
Output(0)->template CopyFrom<Context>(losses); Output(0)->template CopyFrom<Context>(losses, ctx());
return; return;
} }
Tx normalizer = 1; float normalizer = 1;
if (normalization == "VALID") { if (normalization == "VALID") {
normalizer = std::max( normalizer = std::max(
math::ASum<Tx, Context>( math::ASum<float, Context>(
flags.count(), Fdata), (Tx)1.f); flags.count(), Fdata), 1.f);
} else if (normalization == "BATCH_SIZE") { } else if (normalization == "BATCH_SIZE") {
normalizer = Input(0).dim(0); normalizer = Input(0).dim(0);
} else if (normalization == "FULL") { } else if (normalization == "FULL") {
normalizer = outer_dim * inner_dim; normalizer = outer_dim * inner_dim;
} }
Tx loss = math::ASum<Tx, Context>(losses.count(), Ldata); float loss = math::ASum<float, Context>(losses.count(), Ldata);
Output(0)->Reshape({ 1 }); Output(0)->Reshape({ 1 });
auto* Ydata = Output(0)->template mutable_data<Tx, Context>(); auto* Ydata = Output(0)->template mutable_data<float, Context>();
math::Set<Tx, Context>(1, loss / normalizer, Ydata); math::Set<float, Context>(1, loss / normalizer, Ydata, ctx());
} }
template <class Context> template <class Context>
void SparseSoftmaxCrossEntropyOp<Context>::RunOnDevice() { void SparseSoftmaxCrossEntropyOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
outer_dim = Input(0).count(0, axis); outer_dim = Input(0).count(0, axis);
inner_dim = Input(0).count(axis + 1); inner_dim = Input(0).count(axis + 1);
CHECK_EQ(outer_dim * inner_dim, Input(1).count()) CHECK_EQ(outer_dim * inner_dim, Input(1).count())
<< "\nNumber of predictions must match the number of labels."; << "\nNumber of predictions must match the number of labels.";
losses.Reshape({ outer_dim * inner_dim }); losses.Reshape({ outer_dim * inner_dim });
flags.Reshape({ outer_dim * inner_dim }); flags.Reshape({ outer_dim * inner_dim });
prob = ws()->CreateTensor("/mnt/" + anchor() + "/softmax/prob"); prob = ws()->CreateTensor("/mnt/" + anchor() + "/softmax/prob");
SoftmaxRun();
if (XIsType(Input(0), float) || if (XIsType(Input(0), float)) {
XIsType(Input(0), float16)) {
if (XIsType(Input(0), float16)) SoftmaxRunFP16();
else SoftmaxRun();
if (XIsType(Input(1), float)) RunWithType<float, float>(); if (XIsType(Input(1), float)) RunWithType<float, float>();
else if (XIsType(Input(1), int64_t)) RunWithType<float, int64_t>(); else if (XIsType(Input(1), int64_t)) RunWithType<float, int64_t>();
else LOG(FATAL) << DTypeHelper(Input(1), { "float32", "int64" }); else LOG(FATAL) << DTypeHelper(Input(1), { "float32", "int64" });
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32" }); } else if (XIsType(Input(0), float16)) {
if (XIsType(Input(1), float)) RunWithType<float16, float>();
else if (XIsType(Input(1), int64_t)) RunWithType<float16, int64_t>();
else LOG(FATAL) << DTypeHelper(Input(1), { "float32", "int64" });
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
} }
DEPLOY_CPU(SparseSoftmaxCrossEntropy); DEPLOY_CPU(SparseSoftmaxCrossEntropy);
...@@ -113,62 +96,66 @@ void SparseSoftmaxCrossEntropyGradientOp<Context>::RunWithType() { ...@@ -113,62 +96,66 @@ void SparseSoftmaxCrossEntropyGradientOp<Context>::RunWithType() {
auto* Idata = !ignores.count() ? nullptr : auto* Idata = !ignores.count() ? nullptr :
ignores.template data<int, Context>(); ignores.template data<int, Context>();
auto* dXdata = Output(0)->template mutable_data<Tx, Context>(); auto* dXdata = Output(0)->template mutable_data<Tx, Context>();
auto* Fdata = flags.template mutable_data<Tx, Context>(); auto* Fdata = flags.template mutable_data<float, Context>();
ctx().template Copy<Tx, Context, Context>( ctx()->template Copy<Tx, Context, Context>(
prob->count(), dXdata, Pdata); prob->count(), dXdata, Pdata);
kernel::SparseSoftmaxCrossEntropyGrad<Tx, Ty, Context>( kernel::SparseSoftmaxCrossEntropyGrad<Tx, Ty, Context>(
outer_dim, Output(0)->dim(axis), inner_dim, outer_dim, Output(0)->dim(axis), inner_dim,
Pdata, Tdata, Idata, ignores.count(), Pdata, Tdata, Idata, ignores.count(),
dXdata, Fdata, &ctx()); dXdata, Fdata, ctx());
if (normalization == "UNIT") { if (normalization == "UNIT") {
auto* dYdata = Input(-1).template data<Tx, Context>(); auto* dYdata = Input(-1).template data<float, Context>();
kernel::SumGrad<Tx, Context>( auto* WSdata = ws()->template caches<float, Context>(
{ Input(0).count() })[0];
kernel::SumGrad<float, Context>(
Input(0).count() / Input(0).dim(axis), Input(0).count() / Input(0).dim(axis),
Input(0).dim(axis), inner_dim, Input(0).dim(axis), inner_dim,
1.0, dYdata, Pdata); 1.0, dYdata, WSdata, ctx());
math::Mul<Tx, Context>( kernel::TypeA2B<float, Tx, Context>(
Output(0)->count(), Pdata, dXdata, dXdata); Input(0).count(), WSdata, Pdata, ctx());
math::Mul<Tx, Context>(Output(0)->count(),
Pdata, dXdata, dXdata, ctx());
return; return;
} }
Tx normalizer = 1; float normalizer = 1;
if (normalization == "VALID") { if (normalization == "VALID") {
normalizer = std::max( normalizer = std::max(
math::ASum<Tx, Context>( math::ASum<float, Context>(
flags.count(), Fdata), (Tx)1.f); flags.count(), Fdata), 1.f);
} else if (normalization == "BATCH_SIZE") { } else if (normalization == "BATCH_SIZE") {
normalizer = Input(0).dim(0); normalizer = Input(0).dim(0);
} else if (normalization == "FULL") { } else if (normalization == "FULL") {
normalizer = outer_dim * inner_dim; normalizer = outer_dim * inner_dim;
} }
auto* dYdata = Input(-1).template data<Tx, Context>(); auto* dYdata = Input(-1).template data<float, Context>();
Tx dYdata_host; ctx().template Copy<Tx, CPUContext, Context>( float dYdata_host; ctx()->template Copy<float, CPUContext, Context>(
1, &dYdata_host, dYdata); 1, &dYdata_host, dYdata);
math::Scal<Tx, Context>(Output(0)->count(), math::Scal<Tx, Context>(Output(0)->count(),
dYdata_host / normalizer, dXdata, &ctx()); dYdata_host / normalizer, dXdata, ctx());
} }
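The gradient path in this hunk copies the cached softmax probabilities into dX, subtracts 1 at each target index, and finally scales everything by the host-copied dY over the normalizer. A CPU-only sketch under the same assumptions (no ignore labels; illustrative names, not Dragon's kernels):

#include <algorithm>
#include <cstdint>
#include <vector>

// dX arrives preloaded with the softmax probabilities (the Copy above);
// subtracting 1 at each target index yields p - one_hot(label), and the
// final pass applies the incoming gradient over the normalizer.
void CrossEntropyGrad(std::vector<float>& dX,              // [N, C]
                      const std::vector<int64_t>& labels,  // [N]
                      int num_classes, float dY, float normalizer) {
  for (size_t i = 0; i < labels.size(); i++)
    dX[i * num_classes + labels[i]] -= 1.f;
  const float scale = dY / std::max(normalizer, 1.f);
  for (float& v : dX) v *= scale;
}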
template <class Context> template <class Context>
void SparseSoftmaxCrossEntropyGradientOp<Context>::RunOnDevice() { void SparseSoftmaxCrossEntropyGradientOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
prob = ws()->GetTensor("/mnt/" + anchor() + "/softmax/prob"); prob = ws()->GetTensor("/mnt/" + anchor() + "/softmax/prob");
outer_dim = prob->count(0, axis); outer_dim = prob->count(0, axis);
inner_dim = prob->count(axis + 1); inner_dim = prob->count(axis + 1);
Output(0)->ReshapeLike(Input(0)); Output(0)->ReshapeLike(Input(0));
flags.Reshape({ outer_dim * inner_dim }); flags.Reshape({ outer_dim * inner_dim });
if (XIsType(Input(0), float) || XIsType(Input(0), float16)) { if (XIsType(Input(0), float)) {
if (XIsType(Input(1), float)) RunWithType<float, float>(); if (XIsType(Input(1), float)) RunWithType<float, float>();
else if (XIsType(Input(1), int64_t)) RunWithType<float, int64_t>(); else if (XIsType(Input(1), int64_t)) RunWithType<float, int64_t>();
else LOG(FATAL) << DTypeHelper(Input(1), { "float32", "int64" }); else LOG(FATAL) << DTypeHelper(Input(1), { "float32", "int64" });
if (XIsType(Input(0), float16)) { } else if (XIsType(Input(0), float16)) {
auto* dXdataF32 = Output(0)->template data<float, Context>(); if (XIsType(Input(1), float)) RunWithType<float16, float>();
auto* dXdataF16 = prob->template mutable_data<float16, Context>(); else if (XIsType(Input(1), int64_t)) RunWithType<float16, int64_t>();
kernel::TypeA2B<float, float16, Context>(Output(0)->count(), dXdataF32, dXdataF16); else LOG(FATAL) << DTypeHelper(Input(1), { "float32", "int64" });
Output(0)->template CopyFrom<Context>(*prob);
}
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" }); } else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
} }
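The removed float16 tail above converted the float32 gradient buffer back to half precision via kernel::TypeA2B after the kernel ran; the new revision dispatches RunWithType<float16, ...> directly instead. For reference, a truncating float32-to-float16 bit conversion looks roughly like the sketch below (round-toward-zero for brevity; production kernels round to nearest and preserve NaN):

#include <cstdint>
#include <cstring>

// Truncating float32 -> float16 conversion. Subnormal results flush to
// zero and overflow saturates to infinity; NaN payloads are not kept.
uint16_t FloatToHalfBits(float f) {
  uint32_t x;
  std::memcpy(&x, &f, sizeof(x));
  const uint16_t sign = static_cast<uint16_t>((x >> 16) & 0x8000u);
  const int32_t exp = static_cast<int32_t>((x >> 23) & 0xFFu) - 127 + 15;
  const uint16_t mant = static_cast<uint16_t>((x >> 13) & 0x3FFu);
  if (exp <= 0) return sign;             // too small: flush to zero
  if (exp >= 31) return sign | 0x7C00u;  // too large: +/- infinity
  return sign | static_cast<uint16_t>(exp << 10) | mant;
}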
......
...@@ -9,23 +9,27 @@ namespace dragon { ...@@ -9,23 +9,27 @@ namespace dragon {
template <class Context> template <typename Tx, typename Ty> template <class Context> template <typename Tx, typename Ty>
void AccuracyOp<Context>::RunWithType() { void AccuracyOp<Context>::RunWithType() {
static CPUContext cctx;
float* Y1data, *Y2data = nullptr;
Y1data = Output(0)->template mutable_data<float, CPUContext>();
if (OutputSize() > 1) { if (OutputSize() > 1) {
math::Set<float, CPUContext>(num_classes, 0, Y2data = Output(1)->template mutable_data<float, CPUContext>();
Output(1)->template mutable_data<float, CPUContext>()); math::Set<float, CPUContext>(num_classes, 0, Y2data, &cctx);
} }
Map<int, TIndex> num_per_class;
Map<int, TIndex> num_per_class;
TIndex acc = 0, count = 0; TIndex acc = 0, count = 0;
const Tx* Xdata; const Tx* Xdata;
if (XIsType(Input(0), float16)) { if (XIsType(Input(0), float16)) {
Tensor* XF32 = ws()->CreateTensor("/mnt/" + anchor() + "/accuracy/xf32"); Tensor* X32T = ws()->CreateTensor(
XF32->ReshapeLike(Input(0)); "/mnt/" + anchor() + "/accuracy/f32");
auto* XdataF16 = Input(0).template data<float16, CPUContext>(); X32T->ReshapeLike(Input(0));
auto* XdataF32 = XF32->template mutable_data<float, CPUContext>(); auto* X16 = Input(0).template data<float16, CPUContext>();
auto* X32 = X32T->template mutable_data<float, CPUContext>();
kernel::TypeA2B<float16, float, CPUContext>( kernel::TypeA2B<float16, float, CPUContext>(
Input(0).count(), XdataF16, XdataF32); Input(0).count(), X16, X32, &cctx);
Xdata = XdataF32; Xdata = X32;
} else Xdata = Input(0).template data<Tx, CPUContext>(); } else Xdata = Input(0).template data<Tx, CPUContext>();
auto* labels = Input(1).template data<Ty, CPUContext>(); auto* labels = Input(1).template data<Ty, CPUContext>();
...@@ -41,15 +45,13 @@ void AccuracyOp<Context>::RunWithType() { ...@@ -41,15 +45,13 @@ void AccuracyOp<Context>::RunWithType() {
vector<pair<Tx, int> > vec; vector<pair<Tx, int> > vec;
for (int k = 0; k < num_classes; k++) for (int k = 0; k < num_classes; k++)
vec.push_back( vec.push_back(
std::make_pair(Xdata[i * dim + k * inner_dim + j], k) std::make_pair(Xdata[i * dim + k * inner_dim + j], k));
);
std::partial_sort( std::partial_sort(
vec.begin(), vec.begin() + top_k, vec.end(), vec.begin(), vec.begin() + top_k, vec.end(),
std::greater<pair<Tx, int> >()); std::greater<pair<Tx, int> >());
for (int k = 0; k < top_k; k++) { for (int k = 0; k < top_k; k++) {
if (vec[k].second == label) { if (vec[k].second == label) {
if (OutputSize() > 1) if (OutputSize() > 1) Y2data[label]++;
Output(1)->template mutable_data<float, CPUContext>()[label]++;
acc++; acc++;
break; break;
} }
...@@ -58,12 +60,11 @@ void AccuracyOp<Context>::RunWithType() { ...@@ -58,12 +60,11 @@ void AccuracyOp<Context>::RunWithType() {
} // end inner_dim } // end inner_dim
} // end outer_dim } // end outer_dim
Output(0)->template mutable_data<float, CPUContext>()[0] = (float)acc / count; Y1data[0] = (float)acc / count;
if (OutputSize() > 1) { if (Y2data) {
auto* acc_per_class = Output(1)->template mutable_data<float, CPUContext>();
for (int i = 0; i < num_classes; i++) for (int i = 0; i < num_classes; i++)
acc_per_class[i] = num_per_class[i] == 0 ? Y2data[i] = num_per_class[i] == 0 ?
0 : acc_per_class[i] / num_per_class[i]; 0 : Y2data[i] / num_per_class[i];
} }
} }
......
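The accuracy hunk keeps the top-k membership test unchanged: partially sort the (score, class) pairs so the k largest come first, then scan them for the ground-truth label. A self-contained sketch assuming an [N, C] row-major layout, top_k <= num_classes, and no ignored labels (TopKAccuracy is an illustrative name):

#include <algorithm>
#include <functional>
#include <utility>
#include <vector>

// For each sample, move the top_k highest-scoring classes to the front
// via partial_sort, then check whether the true label is among them.
float TopKAccuracy(const std::vector<float>& scores,  // [N, C], row-major
                   const std::vector<int>& labels,    // [N]
                   int num_classes, int top_k) {
  int acc = 0;
  for (size_t i = 0; i < labels.size(); i++) {
    std::vector<std::pair<float, int>> vec;
    for (int k = 0; k < num_classes; k++)
      vec.emplace_back(scores[i * num_classes + k], k);
    std::partial_sort(vec.begin(), vec.begin() + top_k, vec.end(),
                      std::greater<std::pair<float, int>>());
    for (int k = 0; k < top_k; k++)
      if (vec[k].second == labels[i]) { acc++; break; }
  }
  return labels.empty() ? 0.f : static_cast<float>(acc) / labels.size();
}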
...@@ -14,14 +14,14 @@ namespace dragon { ...@@ -14,14 +14,14 @@ namespace dragon {
Output(0)->ReshapeLike(Input(0)); \ Output(0)->ReshapeLike(Input(0)); \
auto* Xdata = Input(0).template data<type_a, Context>(); \ auto* Xdata = Input(0).template data<type_a, Context>(); \
auto* Ydata = Output(0)->template mutable_data<type_b, Context>(); \ auto* Ydata = Output(0)->template mutable_data<type_b, Context>(); \
kernel::TypeA2B<type_a, type_b, Context>(Input(0).count(), Xdata, Ydata); \ kernel::TypeA2B<type_a, type_b, Context>(Input(0).count(), Xdata, Ydata, ctx()); \
} else { \ } else { \
TIndex count = Output(0)->count(); \ TIndex count = Output(0)->count(); \
auto* Xdata = Output(0)->template data<type_a, Context>(); \ auto* Xdata = Output(0)->template data<type_a, Context>(); \
auto* Cdata = ws()->template caches<type_b, Context>({ count })[0]; \ auto* Cdata = ws()->template caches<type_b, Context>({ count })[0]; \
kernel::TypeA2B<type_a, type_b, Context>(count, Xdata, Cdata); \ kernel::TypeA2B<type_a, type_b, Context>(count, Xdata, Cdata, ctx()); \
auto* Ydata = Output(0)->template mutable_data<type_b, Context>(); \ auto* Ydata = Output(0)->template mutable_data<type_b, Context>(); \
ctx().template Copy<type_b, Context, Context>(count, Ydata, Cdata); \ ctx()->template Copy<type_b, Context, Context>(count, Ydata, Cdata); \
} \ } \
return; \ return; \
} }
......
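The cast macro's else-branch handles the in-place case: when the output aliases the input, the cast goes through a workspace cache and is copied back afterwards, so the read of the old-typed data finishes before its storage is overwritten. A minimal sketch of that idea with raw buffers (illustrative, not Dragon's workspace API; the storage must be large enough for count elements of the destination type):

#include <cstddef>
#include <cstring>
#include <vector>

// Casting directly would let the destination-typed view reuse the
// storage still being read; staging through a scratch cache avoids it.
template <typename A, typename B>
void CastInPlace(void* buffer, size_t count) {
  const A* x = static_cast<const A*>(buffer);
  std::vector<B> cache(count);          // ws()->caches<type_b>() analogue
  for (size_t i = 0; i < count; i++)
    cache[i] = static_cast<B>(x[i]);    // kernel::TypeA2B analogue
  std::memcpy(buffer, cache.data(), count * sizeof(B));  // final Copy
}

Callers spell both types explicitly, e.g. CastInPlace<float, int64_t>(buf, n), since neither template argument is deducible from the void* parameter.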