Commit a79a3bba by Ting PAN

Refactor the python distribution script

Summary:
This commit uses the standard setuptools distribution commands ('build_py' and
'install') to collect the Python package and the compiled dynamic libraries.
1 parent 494774d3
Showing with 404 additions and 449 deletions
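Editor's note: the refactor replaces the old configure()/clean() copy step with overridden setuptools commands, so the package tree and the pre-built libraries are collected by 'build_py' and the install stays old-style instead of an egg (see the setup.py hunks below). A minimal sketch of that command-override pattern follows; the package name, version, and data globs are illustrative, not the exact values used in setup.py.

import setuptools
import setuptools.command.build_py
import setuptools.command.install


class BuildPyCommand(setuptools.command.build_py.build_py):
    """Collect the Python sources and pre-built data at build time."""

    def build_packages(self):
        # Refresh the package list after staging, then let the base class copy it.
        self.packages = setuptools.find_packages(include=['example_package*'])
        super(BuildPyCommand, self).build_packages()

    def build_package_data(self):
        # Register headers/libraries as package data before they are copied.
        self.package_data = {'example_package': ['include/*', 'lib/*']}
        super(BuildPyCommand, self).build_package_data()


class InstallCommand(setuptools.command.install.install):
    """Old-style install instead of an egg."""

    def run(self):
        super(InstallCommand, self).run()


setuptools.setup(
    name='example-package',  # Hypothetical name for this sketch.
    version='0.0.1',
    packages=['example_package'],  # Non-empty so that 'build_py' runs.
    cmdclass={'build_py': BuildPyCommand, 'install': InstallCommand},
)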
......@@ -62,6 +62,7 @@ if (USE_MPI)
list(APPEND THIRD_PARTY_LIBRARY_DIRS ${THIRD_PARTY_DIR}/mpi/lib)
endif()
if (USE_CUDNN)
list(APPEND THIRD_PARTY_LIBRARY_DIRS ${THIRD_PARTY_DIR}/cudnn/lib)
list(APPEND THIRD_PARTY_LIBRARY_DIRS ${THIRD_PARTY_DIR}/cudnn/lib64)
list(APPEND THIRD_PARTY_LIBRARY_DIRS ${THIRD_PARTY_DIR}/cudnn/lib/x64)
endif()
......
......@@ -147,7 +147,7 @@ class DRAGON_API Tensor {
} else {
auto* new_memory = other->memory();
if (new_memory != nullptr) {
CHECK_LE(size_, new_memory->size())
CHECK_LE(size_ * meta_.itemsize(), new_memory->size())
<< "\nMap from a memory with smaller capacity.";
mapped_memory_ = new_memory;
capacity_ = new_memory->size();
......
......@@ -25,33 +25,33 @@ void _DropPath(
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void DropPath<T, CPUContext>( \
const int N, \
const int C, \
const float ratio, \
const float scale, \
const T* x, \
T* y, \
uint8_t* mask, \
uint32_t* /* r */, \
CPUContext* ctx) { \
math::RandomBernoulli(N, 1.f - ratio, mask, ctx); \
_DropPath(N, C, math::AccmulatorType<T>::type(scale), mask, x, y); \
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void DropPath<T, CPUContext>( \
const int N, \
const int C, \
const float ratio, \
const float scale, \
const T* x, \
T* y, \
uint8_t* mask, \
uint32_t* /* r */, \
CPUContext* ctx) { \
math::RandomBernoulli(N, 1.f - ratio, mask, ctx); \
_DropPath(N, C, math::AccumulatorType<T>::type(scale), mask, x, y); \
}
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void DropPathGrad<T, CPUContext>( \
const int N, \
const int C, \
const float scale, \
const uint8_t* mask, \
const T* dy, \
T* dx, \
CPUContext* ctx) { \
_DropPath(N, C, math::AccmulatorType<T>::type(scale), mask, dy, dx); \
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void DropPathGrad<T, CPUContext>( \
const int N, \
const int C, \
const float scale, \
const uint8_t* mask, \
const T* dy, \
T* dx, \
CPUContext* ctx) { \
_DropPath(N, C, math::AccumulatorType<T>::type(scale), mask, dy, dx); \
}
DEFINE_KERNEL_LAUNCHER(float16);
......
......@@ -62,7 +62,7 @@ __global__ void _DropPathGrad(
_DropPath<<<CUDA_BLOCKS(NxC), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
NxC, \
C, \
convert::To<math::AccmulatorType<T>::type>(scale), \
convert::To<math::AccumulatorType<T>::type>(scale), \
static_cast<uint32_t>(UINT_MAX * ratio), \
r, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
......@@ -84,7 +84,7 @@ __global__ void _DropPathGrad(
_DropPathGrad<<<CUDA_BLOCKS(NxC), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
NxC, \
C, \
convert::To<math::AccmulatorType<T>::type>(scale), \
convert::To<math::AccumulatorType<T>::type>(scale), \
mask, \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
reinterpret_cast<math::ScalarType<T>::type*>(dx)); \
......
......@@ -43,7 +43,7 @@ __global__ void _Dropout(
math::Random(N, r, ctx); \
_Dropout<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
convert::To<math::AccmulatorType<T>::type>(scale), \
convert::To<math::AccumulatorType<T>::type>(scale), \
static_cast<uint32_t>(UINT_MAX * ratio), \
r, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
......
......@@ -64,7 +64,7 @@ __global__ void _ApproxGeluGrad(const int N, const T* dy, const T* x, T* dx) {
#define DEFINE_KERNEL_LAUNCHER(name, T) \
template <> \
void name<T, CUDAContext>(const int N, const T* x, T* y, CUDAContext* ctx) { \
_##name<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \
_##name<math::ScalarType<T>::type, math::AccumulatorType<T>::type> \
<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
......@@ -75,7 +75,7 @@ __global__ void _ApproxGeluGrad(const int N, const T* dy, const T* x, T* dx) {
template <> \
void name<T, CUDAContext>( \
const int N, const T* dy, const T* x, T* dx, CUDAContext* ctx) { \
_##name<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \
_##name<math::ScalarType<T>::type, math::AccumulatorType<T>::type> \
<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
......
......@@ -49,8 +49,8 @@ __global__ void _HardSigmoidGrad(
CUDAContext* ctx) { \
_HardSigmoid<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
convert::To<math::AccmulatorType<T>::type>(alpha), \
convert::To<math::AccmulatorType<T>::type>(beta), \
convert::To<math::AccumulatorType<T>::type>(alpha), \
convert::To<math::AccumulatorType<T>::type>(beta), \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \
}
......@@ -66,7 +66,7 @@ __global__ void _HardSigmoidGrad(
CUDAContext* ctx) { \
_HardSigmoidGrad<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
convert::To<math::AccmulatorType<T>::type>(alpha), \
convert::To<math::AccumulatorType<T>::type>(alpha), \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
reinterpret_cast<const math::ScalarType<T>::type*>(y), \
reinterpret_cast<math::ScalarType<T>::type*>(dx)); \
......
......@@ -36,27 +36,27 @@ __global__ void _HardSwishGrad(const int N, const T* dy, const T* x, T* dx) {
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void HardSwish<T, CUDAContext>( \
const int N, const T* x, T* y, CUDAContext* ctx) { \
_HardSwish<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \
<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void HardSwish<T, CUDAContext>( \
const int N, const T* x, T* y, CUDAContext* ctx) { \
_HardSwish<math::ScalarType<T>::type, math::AccumulatorType<T>::type> \
<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \
}
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void HardSwishGrad<T, CUDAContext>( \
const int N, const T* dy, const T* x, T* dx, CUDAContext* ctx) { \
_HardSwishGrad<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \
<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<math::ScalarType<T>::type*>(dx)); \
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void HardSwishGrad<T, CUDAContext>( \
const int N, const T* dy, const T* x, T* dx, CUDAContext* ctx) { \
_HardSwishGrad<math::ScalarType<T>::type, math::AccumulatorType<T>::type> \
<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<math::ScalarType<T>::type*>(dx)); \
}
DEFINE_KERNEL_LAUNCHER(float16);
......
......@@ -38,40 +38,40 @@ void _SeluGrad(
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void Selu<T, CPUContext>( \
const int N, \
const float alpha, \
const float gamma, \
const T* x, \
T* y, \
CPUContext* ctx) { \
_Selu( \
N, \
convert::To<math::AccmulatorType<T>::type>(alpha), \
convert::To<math::AccmulatorType<T>::type>(gamma), \
x, \
y); \
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void Selu<T, CPUContext>( \
const int N, \
const float alpha, \
const float gamma, \
const T* x, \
T* y, \
CPUContext* ctx) { \
_Selu( \
N, \
convert::To<math::AccumulatorType<T>::type>(alpha), \
convert::To<math::AccumulatorType<T>::type>(gamma), \
x, \
y); \
}
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void SeluGrad<T, CPUContext>( \
const int N, \
const float alpha, \
const float gamma, \
const T* dy, \
const T* y, \
T* dx, \
CPUContext* ctx) { \
_SeluGrad( \
N, \
convert::To<math::AccmulatorType<T>::type>(alpha), \
convert::To<math::AccmulatorType<T>::type>(gamma), \
dy, \
y, \
dx); \
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void SeluGrad<T, CPUContext>( \
const int N, \
const float alpha, \
const float gamma, \
const T* dy, \
const T* y, \
T* dx, \
CPUContext* ctx) { \
_SeluGrad( \
N, \
convert::To<math::AccumulatorType<T>::type>(alpha), \
convert::To<math::AccumulatorType<T>::type>(gamma), \
dy, \
y, \
dx); \
}
DEFINE_KERNEL_LAUNCHER(float16);
......
......@@ -35,23 +35,23 @@ __global__ void _SiluGrad(const int N, const T* dy, const T* x, T* dx) {
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void Silu<T, CUDAContext>(const int N, const T* x, T* y, CUDAContext* ctx) { \
_Silu<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \
_Silu<math::ScalarType<T>::type, math::AccumulatorType<T>::type> \
<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \
}
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void SiluGrad<T, CUDAContext>( \
const int N, const T* dy, const T* x, T* dx, CUDAContext* ctx) { \
_SiluGrad<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \
<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<math::ScalarType<T>::type*>(dx)); \
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void SiluGrad<T, CUDAContext>( \
const int N, const T* dy, const T* x, T* dx, CUDAContext* ctx) { \
_SiluGrad<math::ScalarType<T>::type, math::AccumulatorType<T>::type> \
<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<math::ScalarType<T>::type*>(dx)); \
}
DEFINE_KERNEL_LAUNCHER(float16);
......
......@@ -285,7 +285,7 @@ __global__ void _LogSoftmaxGradViaBlockReduce(
const auto nblocks = math::utils::DivUp<int>(NxS, WARP_ITEMS); \
_##name##ViaWarpReduce< \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type> \
math::AccumulatorType<T>::type> \
<<<nblocks, \
dim3(CUDA_WARP_SIZE, WARP_ITEMS), \
0, \
......@@ -299,7 +299,7 @@ __global__ void _LogSoftmaxGradViaBlockReduce(
} \
_##name##ViaBlockReduce< \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type> \
math::AccumulatorType<T>::type> \
<<<NxS, BLOCK_THREADS, 0, ctx->cuda_stream()>>>( \
NxS, \
S, \
......@@ -323,7 +323,7 @@ __global__ void _LogSoftmaxGradViaBlockReduce(
const auto nblocks = math::utils::DivUp<int>(NxS, WARP_ITEMS); \
_##name##ViaWarpReduce< \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type> \
math::AccumulatorType<T>::type> \
<<<nblocks, \
dim3(CUDA_WARP_SIZE, WARP_ITEMS), \
0, \
......@@ -338,7 +338,7 @@ __global__ void _LogSoftmaxGradViaBlockReduce(
} \
_##name##ViaBlockReduce< \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type> \
math::AccumulatorType<T>::type> \
<<<NxS, BLOCK_THREADS, 0, ctx->cuda_stream()>>>( \
NxS, \
S, \
......
......@@ -69,26 +69,26 @@ __global__ void _RepeatGrad(
NxCxS2, C, S, S2, x, y); \
}
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void RepeatGrad<T, CUDAContext>( \
const int N, \
const int S, \
const int C, \
const int repeats, \
const T* dy, \
T* dx, \
CUDAContext* ctx) { \
const auto S2 = S * repeats; \
const auto NxCxS = N * C * S; \
_RepeatGrad<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \
<<<CUDA_BLOCKS(NxCxS), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
NxCxS, \
C, \
S, \
S2, \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
reinterpret_cast<math::ScalarType<T>::type*>(dx)); \
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void RepeatGrad<T, CUDAContext>( \
const int N, \
const int S, \
const int C, \
const int repeats, \
const T* dy, \
T* dx, \
CUDAContext* ctx) { \
const auto S2 = S * repeats; \
const auto NxCxS = N * C * S; \
_RepeatGrad<math::ScalarType<T>::type, math::AccumulatorType<T>::type> \
<<<CUDA_BLOCKS(NxCxS), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
NxCxS, \
C, \
S, \
S2, \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
reinterpret_cast<math::ScalarType<T>::type*>(dx)); \
}
DEFINE_KERNEL_LAUNCHER(bool);
......
......@@ -77,7 +77,7 @@ __global__ void _BroadcastLossGrad(
T* dx, \
CUDAContext* ctx) { \
using ScalarT = math::ScalarType<T>::type; \
using AccT = math::AccmulatorType<T>::type; \
using AccT = math::AccumulatorType<T>::type; \
if (num_masks > 0 && normalizer < 0.f) { \
auto* num_valid = const_cast<T*>(mask + num_masks); \
math::Sum(num_masks, 1.f, mask, num_valid, ctx); \
......@@ -105,7 +105,7 @@ __global__ void _BroadcastLossGrad(
T* dx, \
CUDAContext* ctx) { \
using ScalarT = math::ScalarType<T>::type; \
using AccT = math::AccmulatorType<T>::type; \
using AccT = math::AccumulatorType<T>::type; \
const auto CxS = C * S; \
const auto NxCxS = N * CxS; \
_BroadcastLossGrad<ScalarT, AccT> \
......
......@@ -42,7 +42,7 @@ __global__ void _SmoothL1Grad(const int N, const AccT beta, const T* x, T* y) {
const int N, const float beta, const T* x, T* y, CUDAContext* ctx) { \
_##name<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
convert::To<math::AccmulatorType<T>::type>(beta), \
convert::To<math::AccumulatorType<T>::type>(beta), \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \
}
......
......@@ -70,12 +70,12 @@ void _ReduceSumGradImpl(
DISPATCH_FUNC_BY_VALUE_WITH_TYPE_2( \
_ReduceSumGradImpl, \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \
math::AccumulatorType<T>::type, \
num_dims, \
x_dims, \
y_dims, \
y_strides, \
convert::To<math::AccmulatorType<T>::type>(scale), \
convert::To<math::AccumulatorType<T>::type>(scale), \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
reinterpret_cast<math::ScalarType<T>::type*>(dx), \
ctx); \
......
......@@ -440,7 +440,7 @@ void _AvgPool3dGradNHWC(
DISPATCH_POOL_KERNEL( \
_##name, \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \
math::AccumulatorType<T>::type, \
N, \
C, \
H, \
......@@ -492,7 +492,7 @@ DEFINE_KERNEL_LAUNCHER(AvgPool2dGrad, double); // AvgPool2dGrad
DISPATCH_POOL_KERNEL( \
_##name, \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \
math::AccumulatorType<T>::type, \
N, \
C, \
D, \
......
......@@ -457,7 +457,7 @@ __global__ void _AvgPool3dGradNHWC(
DISPATCH_POOL_KERNEL( \
_##name, \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \
math::AccumulatorType<T>::type, \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
nthreads, \
......@@ -512,7 +512,7 @@ DEFINE_KERNEL_LAUNCHER(AvgPool2dGrad, double, (H * W)); // AvgPool2dGrad
DISPATCH_POOL_KERNEL( \
_##name, \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \
math::AccumulatorType<T>::type, \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
nthreads, \
......
......@@ -37,35 +37,35 @@ __global__ void _BiasAdd(
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void BiasAdd<T, CUDAContext>( \
const int N, \
const int S, \
const int C, \
const T* x, \
const T* bias, \
T* y, \
CUDAContext* ctx) { \
const auto NxCxS = N * C * S; \
if (S == 1) { \
_BiasAdd<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \
<<<CUDA_BLOCKS(NxCxS), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
NxCxS, \
C, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<const math::ScalarType<T>::type*>(bias), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \
} else { \
_BiasAdd<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \
<<<CUDA_BLOCKS(NxCxS), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
NxCxS, \
S, \
C, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<const math::ScalarType<T>::type*>(bias), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \
} \
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void BiasAdd<T, CUDAContext>( \
const int N, \
const int S, \
const int C, \
const T* x, \
const T* bias, \
T* y, \
CUDAContext* ctx) { \
const auto NxCxS = N * C * S; \
if (S == 1) { \
_BiasAdd<math::ScalarType<T>::type, math::AccumulatorType<T>::type> \
<<<CUDA_BLOCKS(NxCxS), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
NxCxS, \
C, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<const math::ScalarType<T>::type*>(bias), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \
} else { \
_BiasAdd<math::ScalarType<T>::type, math::AccumulatorType<T>::type> \
<<<CUDA_BLOCKS(NxCxS), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
NxCxS, \
S, \
C, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<const math::ScalarType<T>::type*>(bias), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \
} \
}
DEFINE_KERNEL_LAUNCHER(uint8_t);
......
......@@ -393,7 +393,7 @@ void _MaxPool3dGradNHWC(
DISPATCH_POOL_KERNEL( \
_##name, \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \
math::AccumulatorType<T>::type, \
N, \
C, \
H, \
......@@ -447,7 +447,7 @@ DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, double); // MaxPool2dGrad
DISPATCH_POOL_KERNEL( \
_##name, \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \
math::AccumulatorType<T>::type, \
N, \
C, \
D, \
......
......@@ -472,7 +472,7 @@ __global__ void _MaxPool3dGradNHWC(
DISPATCH_POOL_KERNEL( \
_##name, \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \
math::AccumulatorType<T>::type, \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
nthreads, \
......@@ -529,7 +529,7 @@ DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, double, (H * W)); // MaxPool2dGrad
DISPATCH_POOL_KERNEL( \
_##name, \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \
math::AccumulatorType<T>::type, \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
nthreads, \
......
......@@ -49,7 +49,7 @@ __global__ void _ResizeLinear2dNCHW(
const int w_out = yi % out_w;
const int h_out = (yi / out_w) % out_h;
const int c = (yi / out_w / out_h) % C;
const int n = yi / out_w / out_w / C;
const int n = yi / out_w / out_h / C;
const float h = TransformCoordinate(h_out, scale_h, align_corners);
const float w = TransformCoordinate(w_out, scale_w, align_corners);
......@@ -129,7 +129,7 @@ __global__ void _ResizeLinear2dGradNCHW(
const int w_out = yi % out_w;
const int h_out = (yi / out_w) % out_h;
const int c = (yi / out_w / out_h) % C;
const int n = yi / out_w / out_w / C;
const int n = yi / out_w / out_h / C;
const float h = TransformCoordinate(h_out, scale_h, align_corners);
const float w = TransformCoordinate(w_out, scale_w, align_corners);
......
......@@ -54,23 +54,22 @@ class NumpyWrapper {
}
return py::reinterpret_steal<py::object>(array);
}
auto* array = PyArray_SimpleNewFromData(
return py::reinterpret_steal<py::object>(PyArray_SimpleNewFromData(
dims.size(),
dims.data(),
dtypes::to_npy(meta),
const_cast<void*>(tensor_->raw_data<CPUContext>()));
return py::reinterpret_steal<py::object>(array);
const_cast<void*>(tensor_->raw_data<CPUContext>())));
}
Tensor* From(py::object obj, bool copy) {
auto* array =
PyArray_GETCONTIGUOUS(reinterpret_cast<PyArrayObject*>(obj.ptr()));
auto* array = PyArray_GETCONTIGUOUS((PyArrayObject*)obj.ptr());
const auto& meta = dtypes::from_npy(PyArray_TYPE(array));
CHECK(meta.id() != 0) << "\nUnsupported numpy array type.";
auto* npy_dims = PyArray_DIMS(array);
auto* data = static_cast<void*>(PyArray_DATA(array));
vector<int64_t> dims(npy_dims, npy_dims + PyArray_NDIM(array));
auto* memory = tensor_->set_meta(meta)->Reshape(dims)->memory();
tensor_->set_meta(meta)->Reshape(dims);
auto* memory = tensor_->MapFrom(nullptr)->memory();
if (copy) {
auto device_type = memory ? memory->info()["device_type"] : "cpu";
if (device_type == "cuda") {
......
......@@ -5,64 +5,33 @@ namespace dragon {
template <class Context>
template <typename T>
pair<float, float> ClipOp<Context>::ComputeBoundsWithType() {
auto meta = TypeMeta::Make<T>();
if (meta.template Match<int8_t>()) {
return std::make_pair(std::max(low_, -128.f), std::min(high_, 127.f));
} else if (meta.template Match<uint8_t>()) {
return std::make_pair(std::max(low_, 0.f), std::min(high_, 255.f));
} else if (meta.template Match<int>()) {
return std::make_pair(
std::max(low_, -214748e4f), std::min(high_, 214748e4f));
} else if (meta.template Match<int64_t>()) {
return std::make_pair(
std::max(low_, -922337e13f), std::min(high_, 922337e13f));
} else if (meta.template Match<float16>()) {
return std::make_pair(std::max(low_, -65505.f), std::min(high_, 65504.f));
} else {
return std::make_pair(std::max(low_, -FLT_MAX), std::min(high_, FLT_MAX));
}
}
template <class Context>
template <typename T>
void ClipOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0);
auto bounds = ComputeBoundsWithType<T>();
auto limits = this->template GetLimits<T>();
kernels::Clip(
X.count(),
bounds.first,
bounds.second,
limits.first,
limits.second,
X.template data<T, Context>(),
Y->ReshapeLike(X)->template mutable_data<T, Context>(),
ctx());
}
template <class Context>
void ClipOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Numerical>::Call(this, Input(0));
}
template <class Context>
template <typename T>
void ClipGradientOp<Context>::DoRunWithType() {
auto &X = Input(0), &dY = Input(1), *dX = Output(0);
auto bounds = this->template ComputeBoundsWithType<T>();
auto limits = this->template GetLimits<T>();
kernels::ClipGrad(
X.count(),
bounds.first,
bounds.second,
limits.first,
limits.second,
dY.template data<T, Context>(),
X.template data<T, Context>(),
dX->ReshapeLike(X)->template mutable_data<T, Context>(),
ctx());
}
template <class Context>
void ClipGradientOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
DEPLOY_CPU_OPERATOR(Clip);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(Clip);
......@@ -82,7 +51,7 @@ OPERATOR_SCHEMA(Clip)
OPERATOR_SCHEMA(ClipGradient)
/* X, dY */
.NumInputs(2)
/* X, dX */
/* dX */
.NumOutputs(1);
REGISTER_GRADIENT(Clip, GenericGradientMaker);
......
......@@ -26,10 +26,31 @@ class ClipOp : public Operator<Context> {
high_(OP_SINGLE_ARG(float, "high", FLT_MAX)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
void RunOnDevice() override {
DispatchHelper<dtypes::Numerical>::Call(this, Input(0));
}
template <typename T>
pair<float, float> ComputeBoundsWithType();
std::pair<float, float> GetLimits() {
float type_min, type_max;
const auto meta = TypeMeta::Make<T>();
if (meta.template Match<uint8_t>()) {
type_min = float(std::numeric_limits<uint8_t>::min());
type_max = float(std::numeric_limits<uint8_t>::max());
} else if (meta.template Match<int8_t>()) {
type_min = float(std::numeric_limits<int8_t>::min());
type_max = float(std::numeric_limits<int8_t>::max());
} else if (meta.template Match<int>()) {
type_min = float(std::numeric_limits<int>::min());
type_max = float(std::numeric_limits<int>::max());
} else if (meta.template Match<float16>()) {
type_min = -65505.f, type_max = 65504.f;
} else {
type_min = std::numeric_limits<float>::lowest();
type_max = std::numeric_limits<float>::max();
}
return std::make_pair(std::max(low_, type_min), std::min(high_, type_max));
}
template <typename T>
void DoRunWithType();
......@@ -45,7 +66,9 @@ class ClipGradientOp final : public ClipOp<Context> {
: ClipOp<Context>(def, ws) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T>
void DoRunWithType();
......
......@@ -7,7 +7,7 @@ namespace dragon {
template <class Context>
template <typename T>
void MomentsOp<Context>::DoRunWithType() {
using OutputT = typename math::AccmulatorType<T>::type;
using OutputT = typename math::AccumulatorType<T>::type;
auto &X = Input(0), *Y1 = Output(0), *Y2 = Output(1);
// Compute reduce axes.
......
......@@ -8,7 +8,7 @@ namespace dragon {
template <class Context>
template <typename T>
void BatchNormOp<Context>::RunTraining() {
using ParamT = typename math::AccmulatorType<T>::type;
using ParamT = typename math::AccumulatorType<T>::type;
INITIALIZE_TENSOR_VIA_SPEC(Input(1), vec64_t({C_}), ParamT);
INITIALIZE_TENSOR_VIA_SPEC(Input(2), vec64_t({C_}), ParamT);
INITIALIZE_TENSOR_VIA_SPEC(Input(3), vec64_t({C_}), ParamT);
......@@ -99,7 +99,7 @@ void BatchNormOp<Context>::RunTraining() {
template <class Context>
template <typename T>
void BatchNormOp<Context>::RunInference() {
using ParamT = typename math::AccmulatorType<T>::type;
using ParamT = typename math::AccumulatorType<T>::type;
INITIALIZE_TENSOR_VIA_SPEC(Input(1), vec64_t({C_}), ParamT);
INITIALIZE_TENSOR_VIA_SPEC(Input(2), vec64_t({C_}), ParamT);
INITIALIZE_TENSOR_VIA_SPEC(Input(3), vec64_t({C_}), ParamT);
......@@ -132,7 +132,7 @@ void BatchNormOp<Context>::RunInference() {
template <class Context>
template <typename T>
void BatchNormGradientOp<Context>::RunTraining() {
using ParamT = typename math::AccmulatorType<T>::type;
using ParamT = typename math::AccumulatorType<T>::type;
auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
auto &X_mu = Input("X_mu"), &X_rsig = Input("X_rsig");
auto* X_params = Output("X_params")->Reshape({C_ * 2});
......@@ -213,7 +213,7 @@ void BatchNormGradientOp<Context>::RunTraining() {
template <class Context>
template <typename T>
void BatchNormGradientOp<Context>::RunInference() {
using ParamT = typename math::AccmulatorType<T>::type;
using ParamT = typename math::AccumulatorType<T>::type;
auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
auto* X_params = Output("X_params")->Reshape({C_});
......
......@@ -8,7 +8,7 @@ namespace dragon {
template <class Context>
template <typename T>
void GroupNormOp<Context>::DoRunWithType() {
using ParamT = typename math::AccmulatorType<T>::type;
using ParamT = typename math::AccumulatorType<T>::type;
auto &X = Input(0), *Y = Output(0);
auto &W = Input(1), &B = Input(2);
GetBaseArguments();
......@@ -55,7 +55,7 @@ void GroupNormOp<Context>::DoRunWithType() {
template <class Context>
template <typename T>
void GroupNormGradientOp<Context>::DoRunWithType() {
using ParamT = typename math::AccmulatorType<T>::type;
using ParamT = typename math::AccumulatorType<T>::type;
auto &X = Input(0), &W = Input(1), &dY = Input(2);
auto &X_mu = Input("X_mu"), &X_rsig = Input("X_rsig");
auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
......
......@@ -7,7 +7,7 @@ namespace dragon {
template <class Context>
template <typename T>
void LayerNormOp<Context>::DoRunWithType() {
using ParamT = typename math::AccmulatorType<T>::type;
using ParamT = typename math::AccumulatorType<T>::type;
auto &X = Input(0), *Y = Output(0);
auto &W = Input(1), &B = Input(2);
GET_OP_AXIS_ARG(axis, X.ndim(), -1);
......
......@@ -16,6 +16,7 @@ from __future__ import print_function as _print_function
from dragon.core.distributed.backend import is_initialized
from dragon.core.distributed.backend import is_mpi_available
from dragon.core.distributed.backend import is_nccl_available
from dragon.core.distributed.backend import finalize
from dragon.core.distributed.backend import get_backend
from dragon.core.distributed.backend import get_group
from dragon.core.distributed.backend import get_rank
......
......@@ -213,11 +213,6 @@ class OpLib(object):
for output in outputs:
output._requires_grad = False
# Ensure the named operator for the tracing graph.
if hasattr(graph_tape, '_tracing') and not op_name:
op_def.name = op_name = execute_ws.create_handle(op_def.type)
graph_tape.add_handle(op_name)
# Emit to dispatch this execution.
for feed_key, value_type in run_config['feed_dict'].items():
dest = execute_ws.create_tensor(op_name + '/' + feed_key)
......
......@@ -14,6 +14,7 @@ from __future__ import division
from __future__ import print_function
from dragon.core.distributed.backend import is_initialized
from dragon.core.distributed.backend import finalize
from dragon.core.distributed.backend import get_backend
from dragon.core.distributed.backend import get_group
from dragon.core.distributed.backend import get_rank
......
......@@ -169,6 +169,12 @@ def is_nccl_available():
return _b.ncclIsAvailable()
def finalize():
"""Finalize the distributed environment."""
global _GLOBAL_MPI_CONTEXT
_GLOBAL_MPI_CONTEXT = None
def get_backend(group):
"""Return the backend of given process group.
......
......@@ -58,12 +58,16 @@ class DataReader(multiprocessing.Process):
"""
class PartBoundaries(object):
"""Record the boundary of current part."""
class BufferBound(object):
"""Record the boundary of current buffer."""
def __init__(self, start, end):
self.start, self.end = start, end
@property
def is_depleted(self):
return self.start == self.end
def __init__(self, **kwargs):
"""Create a ``DataReader``.
......@@ -91,13 +95,14 @@ class DataReader(multiprocessing.Process):
self._part_idx = kwargs.get('part_idx', 0)
self._num_parts = kwargs.get('num_parts', 1)
self._shuffle = kwargs.get('shuffle', False)
self._initial_fill = kwargs.get('initial_fill', 1024) if self._shuffle else 1
self._initial_fill = kwargs.get('initial_fill', 1024)
self._seed = kwargs.get('seed', config.config().random_seed)
self._stick_to_part = kwargs.get('stick_to_part', True)
self._first, self._cursor, self._last = 0, 0, 0
self._part_size = 0
self._num_examples = 0
self._example_buffer = []
self._parts = []
self._buffer_seq = []
self._buffer_bounds = []
self._reader_queue = None
def before_first(self):
......@@ -110,43 +115,45 @@ class DataReader(multiprocessing.Process):
self._cursor += 1
return self._dataset.get()
def reset(self, stick_to_part=False):
def reset(self):
"""Reset the environment of dataset."""
# Redirect to the adjacent part if available.
if not stick_to_part:
if not self._stick_to_part:
self._part_idx = (self._part_idx + 1) % self._num_parts
self._first = self._part_idx * self._part_size
self._last = min(self._first + self._part_size, self._num_examples)
self.before_first()
# Use the new boundaries to avoid sampling duplicates
# Use new boundary to avoid sampling duplicates
# when buffer size is greater than dataset size.
counter = self._parts[-1].end
self._parts.append(DataReader.PartBoundaries(counter, counter))
counter = self._buffer_bounds[-1].end
self._buffer_bounds.append(self.BufferBound(counter, counter))
def run(self):
"""Start the process."""
self._init_dataset()
# Persist a loop to read examples.
while True:
# Pop the depleted part if necessary.
if self._parts[0].start == self._parts[0].end:
self._parts.pop(0)
offset = 0
# Pop the depleted buffer if necessary.
if self._buffer_bounds[0].is_depleted:
self._buffer_bounds.pop(0)
pop_bound = self._buffer_bounds[0]
push_bound = self._buffer_bounds[-1]
pop_offset = 0
if self._shuffle:
# Sample a random offset if shuffle required.
offset = self._parts[0].end - self._parts[0].start
offset = int(numpy.random.uniform(high=offset))
# Choose a loaded example from the buffer.
i = self._parts[0].start % len(self._example_buffer)
j = (self._parts[0].start + offset) % len(self._example_buffer)
self._reader_queue.put(self._example_buffer[j])
self._example_buffer[j] = self._example_buffer[i]
# Load and push back a new example into the buffer.
k = self._parts[-1].end % len(self._example_buffer)
self._example_buffer[k] = self.next_example()
# Increase the part boundaries.
self._parts[-1].end += 1
self._parts[0].start += 1
# Sample a random offset.
pop_range = pop_bound.end - pop_bound.start
pop_offset = numpy.random.randint(0, pop_range)
# Pop an example from the buffer.
i = pop_bound.start % len(self._buffer_seq)
j = (pop_bound.start + pop_offset) % len(self._buffer_seq)
self._reader_queue.put(self._buffer_seq[j])
self._buffer_seq[j] = self._buffer_seq[i]
# Push an example into the buffer.
k = push_bound.end % len(self._buffer_seq)
self._buffer_seq[k] = self.next_example()
# Increase the buffer boundary.
push_bound.end += 1
pop_bound.start += 1
# Reset the cursor if necessary.
if self._cursor >= self._last:
self.reset()
......@@ -162,12 +169,12 @@ class DataReader(multiprocessing.Process):
# Determine the part specification.
self._num_examples = self._dataset.size
self._part_size = (self._num_examples + self._num_parts - 1) // self._num_parts
self._parts.append(DataReader.PartBoundaries(0, 0))
# Fill the initial buffer to support random sampling.
self.reset(stick_to_part=True)
for i in range(self._initial_fill):
self._example_buffer.append(self.next_example())
self._parts[-1].end += 1
self._buffer_bounds.append(self.BufferBound(0, 0))
self.reset()
for _ in range(self._initial_fill):
self._buffer_bounds[-1].end += 1
self._buffer_seq.append(self.next_example())
if self._cursor >= self._last:
self.reset()
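Editor's note: the rewritten run() loop above implements a fixed-size shuffle buffer: it returns a random example from the window between the front and back boundaries, copies the window's front example into the vacated slot, overwrites the front slot with a freshly read example, and advances both boundaries. A minimal standalone sketch of that pop/push scheme, assuming a single buffer bound and a toy iterator as the example source (the class and names here are illustrative, not part of dragon):

import numpy


class ShuffleBuffer(object):
    """Fixed-size buffer: pop a random example, push a fresh one."""

    def __init__(self, source, initial_fill=8, shuffle=True):
        self.source, self.shuffle = source, shuffle
        self.seq = [next(source) for _ in range(initial_fill)]
        self.start, self.end = 0, initial_fill  # Invariant: end - start == len(seq).

    def get(self):
        offset = numpy.random.randint(0, self.end - self.start) if self.shuffle else 0
        i = self.start % len(self.seq)
        j = (self.start + offset) % len(self.seq)
        example = self.seq[j]
        self.seq[j] = self.seq[i]  # Keep the displaced front example.
        self.seq[self.end % len(self.seq)] = next(self.source)  # Refill the front slot.
        self.start, self.end = self.start + 1, self.end + 1
        return example


# e.g. reader = ShuffleBuffer(iter(range(100))); print([reader.get() for _ in range(5)])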
......@@ -117,20 +117,18 @@ def export(
if input_names is not None:
raise ValueError(
'Excepted the input names from <args>.\n'
'You should set the <input_names> to None.'
)
'You should set the <input_names> to None.')
inputs, input_names, args = \
list(args.values()), list(args.keys()), [args]
else:
inputs = args = nest.flatten(args)
# Run the model to get the outputs.
execute_ws = workspace.Workspace()
execute_ws.merge_from(workspace.get_workspace())
with execute_ws.as_default():
with tapes.Tape() as model_tape:
model_tape._exporting = True
outputs = model(*args)
graph_tape = tapes.Tape()
graph_tape._tracing = True # Enable tracing.
graph_tape._exporting = True # Enable exporting.
with graph_tape:
outputs = model(*args)
# Process the outputs
if isinstance(outputs, dict):
......@@ -159,7 +157,7 @@ def export(
graph_def.output.extend([output_names[i]])
# Add operators.
for op_def in model_tape.get_elements():
for op_def in graph_tape.get_elements():
ops_def.append(dragon_pb2.OperatorDef())
ops_def[-1].ParseFromString(op_def.SerializeAs())
graph_def.op.extend(ops_def)
......@@ -176,17 +174,16 @@ def export(
constants[k] = v
# Export.
with execute_ws.as_default():
model = graph_def_to_onnx_model(
graph_def=graph_def,
input_names=input_names,
output_names=output_names,
input_shapes=input_shapes,
constants=constants,
value_info=value_info,
opset_version=opset_version,
workspace=execute_ws,
verbose=verbose,
enable_onnx_checker=enable_onnx_checker,
)
serialization.save_bytes(serialization.serialize_proto(model), f)
model = graph_def_to_onnx_model(
graph_def=graph_def,
input_names=input_names,
output_names=output_names,
input_shapes=input_shapes,
constants=constants,
value_info=value_info,
opset_version=opset_version,
workspace=workspace.get_workspace(),
verbose=verbose,
enable_onnx_checker=enable_onnx_checker,
)
serialization.save_bytes(serialization.serialize_proto(model), f)
......@@ -14,90 +14,48 @@ from __future__ import division
from __future__ import print_function
import os
import setuptools
import setuptools.command.install
import shutil
import subprocess
import sys
import setuptools
import setuptools.command.build_py
import setuptools.command.install
try:
# Override a non-pure "wheel" for pybind distributions
# Override a non-pure "wheel" for pybind distributions.
from wheel.bdist_wheel import bdist_wheel as _bdist_wheel
class bdist_wheel(_bdist_wheel):
def finalize_options(self):
_bdist_wheel.finalize_options(self)
super(bdist_wheel, self).finalize_options()
self.root_is_pure = False
except ImportError:
bdist_wheel = None
# Read the current version info
version = git_version = None
with open('version.txt', 'r') as f:
version = f.read().strip()
try:
git_version = subprocess.check_output(
['git', 'rev-parse', 'HEAD'], cwd='../').decode('ascii').strip()
except (OSError, subprocess.CalledProcessError):
git_version = None
if os.path.exists('.git'):
try:
git_version = subprocess.check_output(
['git', 'rev-parse', 'HEAD'], cwd='../')
git_version = git_version.decode('ascii').strip()
except (OSError, subprocess.CalledProcessError):
pass
def clean():
"""Remove the work directories."""
def clean_builds():
"""Clean the builds."""
if os.path.exists('dragon/version.py'):
shutil.rmtree('dragon')
if os.path.exists('build/lib'):
shutil.rmtree('build/lib')
if os.path.exists('seeta_dragon.egg-info'):
shutil.rmtree('seeta_dragon.egg-info')
def configure():
"""Prepare the package files."""
clean()
# Create a temporary site-package directory.
shutil.copytree('python', 'dragon')
# Copy headers.
shutil.copytree('../targets/native/include', 'dragon/include')
# Copy "caffe" => "dragon.vm.caffe"
shutil.copytree('../caffe', 'dragon/vm/caffe')
# Copy "dali" => "dragon.vm.dali"
shutil.copytree('../dali', 'dragon/vm/dali')
# Copy "tensorflow" => "dragon.vm.tensorflow"
shutil.copytree('../tensorflow', 'dragon/vm/tensorflow')
# Copy "tensorlayer" => "dragon.vm.tensorlayer"
shutil.copytree('../tensorlayer', 'dragon/vm/tensorlayer')
# Copy "tensorrt/python" => "dragon.vm.tensorrt"
shutil.copytree('../tensorrt/python', 'dragon/vm/tensorrt')
# Copy "torch" => "dragon.vm.torch"
shutil.copytree('../torch', 'dragon/vm/torch')
# Copy "torchvision" => "dragon.vm.torchvision"
shutil.copytree('../torchvision', 'dragon/vm/torchvision')
# Copy the pre-built libraries.
if not os.path.exists('dragon/lib'):
os.makedirs('dragon/lib')
for src, dest in find_libraries().items():
if os.path.exists(src):
shutil.copy(src, dest)
else:
print('ERROR: Unable to find the library at <%s>.\n'
'Build it before installing to package.' % src)
shutil.rmtree('dragon')
sys.exit()
# Write the version file.
with open('dragon/version.py', 'w') as f:
f.write("from __future__ import absolute_import\n"
"from __future__ import division\n"
"from __future__ import print_function\n\n"
"version = '{}'\n"
"git_version = '{}'\n".format(version, git_version))
class install(setuptools.command.install.install):
"""Old-style command to prevent from installing egg."""
def run(self):
setuptools.command.install.install.run(self)
def find_libraries():
"""Return the pre-built libraries."""
in_prefix = '' if sys.platform == 'win32' else 'lib'
......@@ -110,7 +68,7 @@ def find_libraries():
'../targets/native/lib/{}dragon{}'.format(in_prefix, in_suffix):
'dragon/lib/{}dragon{}'.format(in_prefix, in_suffix),
'../targets/native/lib/{}dragon_python{}'.format(in_prefix, in_suffix):
'dragon/lib/libdragon_python{}'.format(out_suffix)
'dragon/lib/libdragon_python{}'.format(out_suffix),
}
if sys.platform == 'win32':
libraries['../targets/native/lib/dragon.lib'] = 'dragon/lib/dragon.lib'
......@@ -118,30 +76,74 @@ def find_libraries():
return libraries
def find_packages():
def find_packages(top):
"""Return the python sources installed to package."""
packages = []
for root, _, files in os.walk('dragon'):
for root, _, _ in os.walk(top):
if os.path.exists(os.path.join(root, '__init__.py')):
packages.append(root)
return packages
def find_package_data():
def find_package_data(top):
"""Return the external data installed to package."""
headers, libraries = [], []
for root, _, files in os.walk('dragon/include'):
root = root[len('dragon/'):]
for root, _, files in os.walk(top + '/include'):
root = root[len(top + '/'):]
for file in files:
headers.append(os.path.join(root, file))
for root, _, files in os.walk('dragon/lib'):
root = root[len('dragon/'):]
for root, _, files in os.walk(top + '/lib'):
root = root[len(top + '/'):]
for file in files:
libraries.append(os.path.join(root, file))
return headers + libraries
configure()
class BuildPyCommand(setuptools.command.build_py.build_py):
"""Enhanced 'build_py' command."""
def build_packages(self):
clean_builds()
shutil.copytree('python', 'dragon')
shutil.copytree('../caffe', 'dragon/vm/caffe')
shutil.copytree('../dali', 'dragon/vm/dali')
shutil.copytree('../tensorflow', 'dragon/vm/tensorflow')
shutil.copytree('../tensorlayer', 'dragon/vm/tensorlayer')
shutil.copytree('../tensorrt/python', 'dragon/vm/tensorrt')
shutil.copytree('../torch', 'dragon/vm/torch')
shutil.copytree('../torchvision', 'dragon/vm/torchvision')
with open('dragon/version.py', 'w') as f:
f.write("from __future__ import absolute_import\n"
"from __future__ import division\n"
"from __future__ import print_function\n\n"
"version = '{}'\n"
"git_version = '{}'\n".format(version, git_version))
self.packages = find_packages('dragon')
super(BuildPyCommand, self).build_packages()
def build_package_data(self):
shutil.copytree('../targets/native/include', 'dragon/include')
if not os.path.exists('dragon/lib'):
os.makedirs('dragon/lib')
for src, dest in find_libraries().items():
if os.path.exists(src):
shutil.copy(src, dest)
else:
print('ERROR: Unable to find the library at <%s>.\n'
'Build it before installing to package.' % src)
sys.exit()
self.package_data = {'dragon': find_package_data('dragon')}
super(BuildPyCommand, self).build_package_data()
class InstallCommand(setuptools.command.install.install):
"""Enhanced 'install' command."""
def run(self):
# Old-style install instead of egg.
super(InstallCommand, self).run()
setuptools.setup(
name='seeta-dragon',
version=version,
......@@ -150,31 +152,30 @@ setuptools.setup(
url='https://github.com/seetaresearch/dragon',
author='SeetaTech',
license='BSD 2-Clause',
packages=find_packages(),
package_data={'dragon': find_package_data()},
packages=find_packages('python'),
package_dir={'dragon': 'dragon'},
cmdclass={'bdist_wheel': bdist_wheel, 'install': install},
cmdclass={'bdist_wheel': bdist_wheel,
'build_py': BuildPyCommand,
'install': InstallCommand},
python_requires='>=3.6',
install_requires=['numpy', 'protobuf', 'kpl-dataset'],
classifiers=[
'Development Status :: 5 - Production/Stable',
'Intended Audience :: Developers',
'Intended Audience :: Education',
'Intended Audience :: Science/Research',
'License :: OSI Approved :: BSD License',
'Programming Language :: C++',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3 :: Only',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Topic :: Scientific/Engineering',
'Topic :: Scientific/Engineering :: Mathematics',
'Topic :: Scientific/Engineering :: Artificial Intelligence',
'Topic :: Software Development',
'Topic :: Software Development :: Libraries',
'Topic :: Software Development :: Libraries :: Python Modules',
],
classifiers=['Development Status :: 5 - Production/Stable',
'Intended Audience :: Developers',
'Intended Audience :: Education',
'Intended Audience :: Science/Research',
'License :: OSI Approved :: BSD License',
'Programming Language :: C++',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3 :: Only',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Topic :: Scientific/Engineering',
'Topic :: Scientific/Engineering :: Mathematics',
'Topic :: Scientific/Engineering :: Artificial Intelligence',
'Topic :: Software Development',
'Topic :: Software Development :: Libraries',
'Topic :: Software Development :: Libraries :: Python Modules'],
)
clean()
clean_builds()
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef DRAGON_UTILS_MATH_SORT_H_
#define DRAGON_UTILS_MATH_SORT_H_
#include "dragon/core/context.h"
namespace dragon {
namespace math {
/*
* Sort Utilities.
*/
namespace utils {
template <typename T>
inline void ArgPartition(
const int count,
const int kth,
const bool descend,
const T* v,
vec64_t& indices) {
indices.resize(count);
std::iota(indices.begin(), indices.end(), 0);
if (descend) {
std::nth_element(
indices.begin(),
indices.begin() + kth,
indices.end(),
[&v](int64_t lhs, int64_t rhs) { return v[lhs] > v[rhs]; });
} else {
std::nth_element(
indices.begin(),
indices.begin() + kth,
indices.end(),
[&v](int64_t lhs, int64_t rhs) { return v[lhs] < v[rhs]; });
}
}
} // namespace utils
} // namespace math
} // namespace dragon
#endif // DRAGON_UTILS_MATH_SORT_H_
......@@ -54,10 +54,14 @@ void _AffineImpl(
const T* scale,
const T* bias,
T* y) {
if (num_dims == 2 && num_axes == 1 && axes[0] == 1) {
_AffineChannel(dims[0], dims[1], x, scale, bias, y);
if (num_dims == 1 && num_axes == 1 && axes[0] == 0) {
_AffineChannel(1, dims[0], x, scale, bias, y); // [NxC]
} else if (num_dims == 2 && num_axes == 1 && axes[0] == 1) {
_AffineChannel(dims[0], dims[1], x, scale, bias, y); // [N, C]
} else if (num_dims == 2 && num_axes == 1 && axes[0] == 0) {
_AffineChannel(1, dims[0], dims[1], x, scale, bias, y); // [NxC, S]
} else if (num_dims == 3 && num_axes == 1 && axes[0] == 1) {
_AffineChannel(dims[0], dims[1], dims[2], x, scale, bias, y);
_AffineChannel(dims[0], dims[1], dims[2], x, scale, bias, y); // [N, C, S]
} else {
LOG(FATAL) << "Unsupported affine dimensions.";
}
......
......@@ -65,12 +65,18 @@ void _AffineImpl(
T* y,
CUDAContext* ctx) {
const auto N = math::utils::Prod(num_dims, dims);
if (num_dims == 2 && num_axes == 1 && axes[0] == 1) {
if (num_dims == 1 && num_axes == 1 && axes[0] == 0) {
_AffineChannel<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, dims[1], x, scale, bias, y);
N, dims[0], x, scale, bias, y); // [NxC]
} else if (num_dims == 2 && num_axes == 1 && axes[0] == 1) {
_AffineChannel<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, dims[1], x, scale, bias, y); // [N, C]
} else if (num_dims == 2 && num_axes == 1 && axes[0] == 0) {
_AffineChannel<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, dims[0], dims[1], x, scale, bias, y); // [NxC, S]
} else if (num_dims == 3 && num_axes == 1 && axes[0] == 1) {
_AffineChannel<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, dims[1], dims[2], x, scale, bias, y);
N, dims[1], dims[2], x, scale, bias, y); // [N, C, S]
} else {
LOG(FATAL) << "Unsupported affine dimensions.";
}
......
......@@ -40,19 +40,19 @@ class ScalarType<float16> {
#endif
template <typename T>
class AccmulatorType {
class AccumulatorType {
public:
typedef float type;
};
template <>
class AccmulatorType<int64_t> {
class AccumulatorType<int64_t> {
public:
typedef double type;
};
template <>
class AccmulatorType<double> {
class AccumulatorType<double> {
public:
typedef double type;
};
......
......@@ -20,7 +20,6 @@
#include "dragon/utils/math/functional.h"
#include "dragon/utils/math/random.h"
#include "dragon/utils/math/reduce.h"
#include "dragon/utils/math/sort.h"
#include "dragon/utils/math/transform.h"
#include "dragon/utils/math/transpose.h"
#include "dragon/utils/math/types.h"
......
......@@ -46,6 +46,10 @@ class TestBackend(unittest.TestCase):
with group.as_default():
self.assertEqual(dragon.distributed.get_rank(group), 0)
@unittest.skipIf(not TEST_MPI, 'MPI unavailable')
def test_finalize(self):
dragon.distributed.finalize()
if __name__ == '__main__':
run_tests()
......@@ -39,13 +39,14 @@ class FunctionGuard(function_lib.FunctionGuard):
if not isinstance(input, Tensor) and input_spec is None:
inputs.append(input)
continue
shape = getattr(input, 'shape', None)
dtype = getattr(input, 'dtype', None)
device = getattr(input, 'device', None)
if input_spec is not None:
device = input_spec['device']
shape, dtype = input_spec['shape'], input_spec['dtype']
inputs.append(Tensor(*shape, dtype=dtype, device=device))
input_spec = input_spec or {}
for k in ('shape', 'dtype', 'device'):
input_spec[k] = getattr(input, k, input_spec.get(k, None))
inputs.append(Tensor(*input_spec['shape'],
dtype=input_spec['dtype'],
device=input_spec['device']))
if isinstance(input, Tensor):
inputs[-1].copy_(input)
with tapes.Tape() as function_tape:
function_tape._tracing = True
attributes['inputs'] = inputs
......
......@@ -3299,7 +3299,6 @@ class Tensor(object):
def __del__(self):
if self._deleter:
# print(self._impl.name)
self._deleter.release(self._impl.name)
def __eq__(self, other):
......