Commit f4ecc7c7 by Ting PAN

Change the code structure

1 parent d3ed62db
Showing with 4157 additions and 3067 deletions
------------------------------------------------------------------------
The list of the most significant changes made over time in SeetaDet.
SeetaDet 0.4.0 (20200408)
Dragon Minimum Required (Version 0.3.0.dev20200408)
Changes:
Preview Features:
- Optimize the code structure.
- DALI support for SSD, RetinaNet, and Faster-RCNN.
- Use KPLRecord instead of SeetaRecord.
Bugs fixed:
- Fix the frozen Affine issue.
------------------------------------------------------------------------
SeetaDet 0.3.0 (20191121)
Dragon Minimum Required (Version 0.3.0.dev20191121)
......
......@@ -2,8 +2,8 @@
## WHAT's SeetaDet?
SeetaDet contains many useful object detectors, including R-CNN series, SSD,
and the recent RetinaNet.
SeetaDet is a platform implementing popular object detection algorithms,
including R-CNN series, SSD, and RetinaNet.
We have achieved the same or higher performance than the baselines reported in the original papers.
......@@ -14,22 +14,33 @@ The torch-style codes help us to simplify the hierarchical pipeline of modern de
## Requirements
seeta-dragon >= 0.3.0.dev20191121
seeta-dragon >= 0.3.0.dev20200408
## Installation
#### 1. Install the required python packages
#### Build From Source
If you prefer to develop modules as well as run experiments,
the following commands will build but not install to ***site-packages***:
```bash
pip install cython pyyaml matplotlib
pip install opencv-python Pillow
cd SeetaDet && python setup.py build
```
#### 2. Compile the C Extensions
#### Install From Source
Clone this repository to local disk and install:
```bash
cd SeetaDet && python setup.py install
```
#### Install From Git
You can also install it from the remote repository:
```bash
cd SeetaDet/compile
bash ./make.sh
pip install git+https://gitlab.seetatech.com/seetaresearch/SeetaDet.git@master
```
## Quick Start
......@@ -37,7 +48,7 @@ bash ./make.sh
#### Train a detection model
```bash
cd SeetaDet/tools
cd tools
python train.py --cfg <MODEL_YAML>
```
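Here ``<MODEL_YAML>`` is the path to one of the solver/model configurations, such as
those under ``SeetaDet/configs``. For example (the file name below is illustrative;
check your checkout for the actual ones):

```bash
cd tools
python train.py --cfg ../configs/faster_rcnn/voc_faster_rcnn_R-50.yml
```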
......@@ -46,20 +57,20 @@ We have provided the default YAML examples into ``SeetaDet/configs``.
#### Test a detection model
```bash
cd SeetaDet/tools
cd tools
python test.py --cfg <MODEL_YAML> --exp_dir <EXP_DIR> --iter <ITERATION>
```
Or
```bash
cd SeetaDet/tools
cd tools
python test_all.py --cfg <MODEL_YAML> --exp_dir <EXP_DIR>
```
#### Export a detection model to ONNX
```bash
cd SeetaDet/tools
cd tools
python export.py --cfg <MODEL_YAML> --exp_dir <EXP_DIR> --iter <ITERATION>
```
......
CMAKE_MINIMUM_REQUIRED(VERSION 3.0.2)
PROJECT(gpu_nms)
# ---------------- User Config ----------------
# Set your Python interpreter if necessary;
# otherwise, a default interpreter will be used.
# Several examples:
# set(PYTHON_EXECUTABLE /usr/bin/python) # Linux & OSX, Builtin Python
# set(PYTHON_EXECUTABLE /X/anaconda/bin/python) # Linux & OSX, Anaconda
# set(PYTHON_EXECUTABLE X:/Anaconda/python) # Win, Anaconda
# Set the CUDA compute architectures to compile for
# Remove "compute_70/sm_70" if using CUDA 8.0
set(CUDA_ARCH -gencode arch=compute_30,code=sm_30
-gencode arch=compute_35,code=sm_35
-gencode arch=compute_50,code=sm_50
-gencode arch=compute_60,code=sm_60
-gencode arch=compute_70,code=sm_70)
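# Note: each "-gencode arch=compute_XX,code=sm_XX" pair embeds device code
# for one GPU generation; e.g. sm_70 targets Volta and requires CUDA >= 9.0.
# For newer GPUs you may need to append entries, such as compute_75/sm_75
# for Turing (CUDA >= 10.0).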
# ---------------- User Config ----------------
# ---[ Dependencies
include(${PROJECT_SOURCE_DIR}/cmake/FindPythonLibs.cmake)
include(${PROJECT_SOURCE_DIR}/cmake/FindNumPy.cmake)
FIND_PACKAGE(CUDA REQUIRED)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
message(STATUS "C++11 support has been enabled by default.")
# ---[ Config types
set(CMAKE_BUILD_TYPE Release CACHE STRING "set build type to release")
set(CMAKE_CONFIGURATION_TYPES Release CACHE STRING "set build type to release" FORCE)
# ---[ Includes
set(INCLUDE_DIR ${PROJECT_SOURCE_DIR}/include)
include_directories(${INCLUDE_DIR})
include_directories(${PROJECT_SOURCE_DIR}/src)
include_directories(${PYTHON_INCLUDE_DIRS})
include_directories(${NUMPY_INCLUDE_DIR})
include_directories(${CUDA_INCLUDE_DIRS})
# ---[ libs
link_directories(${PYTHON_LIBRARIES})
# ---[ Install
set(CMAKE_INSTALL_PREFIX ${PROJECT_SOURCE_DIR} CACHE STRING "set install prefix" FORCE)
set(CMAKE_SHARED_LIBRARY_PREFIX "")
# ---[ Flags
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${CUDA_ARCH}")
if(WIN32)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP /O2 /Oi /GL /Ot /Gy")
endif()
if(UNIX)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -s -fPIC")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -s -w -fPIC -O3 -m64 -std=c++11")
endif()
# ---[ Files
set(HEADER_FILES gpu_nms.h)
set(SRC_FILES gpu_nms.cpp nms_kernel.cu)
# ---[ Add Target
CUDA_ADD_LIBRARY(${PROJECT_NAME} SHARED ${HEADER_FILES} ${SRC_FILES})
# ---[ Link Libs
TARGET_LINK_LIBRARIES(${PROJECT_NAME} ${CUDA_LIBRARIES} ${CUDA_cublas_LIBRARY} ${CUDA_curand_LIBRARY})
if(WIN32)
TARGET_LINK_LIBRARIES(${PROJECT_NAME} ${PYTHON_LIBRARIES})
endif()
# ---[ Install Target
set_target_properties(${PROJECT_NAME} PROPERTIES OUTPUT_NAME "gpu_nms")
install (TARGETS ${PROJECT_NAME} DESTINATION ${PROJECT_BINARY_DIR}/../install/lib/nms)
# - Find the NumPy libraries
# This module finds if NumPy is installed, and sets the following variables
# indicating where it is.
#
# TODO: Update to provide the libraries and paths for linking npymath lib.
#
# NUMPY_FOUND - was NumPy found
# NUMPY_VERSION - the version of NumPy found as a string
# NUMPY_VERSION_MAJOR - the major version number of NumPy
# NUMPY_VERSION_MINOR - the minor version number of NumPy
# NUMPY_VERSION_PATCH - the patch version number of NumPy
# NUMPY_VERSION_DECIMAL - e.g. version 1.6.1 is 10601
# NUMPY_INCLUDE_DIR - path to the NumPy include files
unset(NUMPY_VERSION)
unset(NUMPY_INCLUDE_DIR)
if(PYTHONINTERP_FOUND)
execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
"import numpy as n; print(n.__version__); print(n.get_include());"
RESULT_VARIABLE __result
OUTPUT_VARIABLE __output
OUTPUT_STRIP_TRAILING_WHITESPACE)
if(__result MATCHES 0)
string(REGEX REPLACE ";" "\\\\;" __values ${__output})
string(REGEX REPLACE "\r?\n" ";" __values ${__values})
list(GET __values 0 NUMPY_VERSION)
list(GET __values 1 NUMPY_INCLUDE_DIR)
string(REGEX MATCH "^([0-9])+\\.([0-9])+\\.([0-9])+" __ver_check "${NUMPY_VERSION}")
if(NOT "${__ver_check}" STREQUAL "")
set(NUMPY_VERSION_MAJOR ${CMAKE_MATCH_1})
set(NUMPY_VERSION_MINOR ${CMAKE_MATCH_2})
set(NUMPY_VERSION_PATCH ${CMAKE_MATCH_3})
math(EXPR NUMPY_VERSION_DECIMAL
"(${NUMPY_VERSION_MAJOR} * 10000) + (${NUMPY_VERSION_MINOR} * 100) + ${NUMPY_VERSION_PATCH}")
string(REGEX REPLACE "\\\\" "/" NUMPY_INCLUDE_DIR ${NUMPY_INCLUDE_DIR})
else()
unset(NUMPY_VERSION)
unset(NUMPY_INCLUDE_DIR)
message(STATUS "Requested NumPy version and include path, but got instead:\n${__output}\n")
endif()
endif()
else()
message("Can not find Python interpretator.")
message(FATAL_ERROR "Do you set PYTHON_EXECUTABLE correctly?")
endif()
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NumPy REQUIRED_VARS NUMPY_INCLUDE_DIR NUMPY_VERSION
VERSION_VAR NUMPY_VERSION)
if(NUMPY_FOUND)
message(STATUS "NumPy ver. ${NUMPY_VERSION} found (include: ${NUMPY_INCLUDE_DIR})")
endif()
\ No newline at end of file
# - Find python libraries
# This module finds the libraries corresponding to the Python interpreter
# FindPythonInterp provides.
# This code sets the following variables:
#
# PYTHONLIBS_FOUND - have the Python libs been found
# PYTHON_PREFIX - path to the Python installation
# PYTHON_LIBRARIES - path to the python library
# PYTHON_INCLUDE_DIRS - path to where Python.h is found
# PYTHON_MODULE_EXTENSION - lib extension, e.g. '.so' or '.pyd'
# PYTHON_MODULE_PREFIX - lib name prefix: usually an empty string
# PYTHON_SITE_PACKAGES - path to installation site-packages
# PYTHON_IS_DEBUG - whether the Python interpreter is a debug build
#
# Thanks to talljimbo for the patch adding the 'LDVERSION' config
# variable usage.
#=============================================================================
# Copyright 2001-2009 Kitware, Inc.
# Copyright 2012 Continuum Analytics, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# * Neither the names of Kitware, Inc., the Insight Software Consortium,
# nor the names of their contributors may be used to endorse or promote
# products derived from this software without specific prior written
# permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#=============================================================================
# Checking for the extension makes sure that `LibsNew` was found and not just `Libs`.
if(PYTHONLIBS_FOUND AND PYTHON_MODULE_EXTENSION)
return()
endif()
# Use the Python interpreter to find the libs.
if(PythonLibsNew_FIND_REQUIRED)
find_package(PythonInterp ${PythonLibsNew_FIND_VERSION} REQUIRED)
else()
find_package(PythonInterp ${PythonLibsNew_FIND_VERSION})
endif()
if(NOT PYTHONINTERP_FOUND)
set(PYTHONLIBS_FOUND FALSE)
return()
endif()
# According to http://stackoverflow.com/questions/646518/python-how-to-detect-debug-interpreter
# testing whether sys has the gettotalrefcount function is a reliable, cross-platform
# way to detect a CPython debug interpreter.
#
# The library suffix is from the config var LDVERSION sometimes, otherwise
# VERSION. VERSION will typically be like "2.7" on unix, and "27" on windows.
execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
"from distutils import sysconfig as s;import sys;import struct;
print('.'.join(str(v) for v in sys.version_info));
print(sys.prefix);
print(s.get_python_inc(plat_specific=True));
print(s.get_python_lib(plat_specific=True));
print(s.get_config_var('SO'));
print(hasattr(sys, 'gettotalrefcount')+0);
print(struct.calcsize('@P'));
print(s.get_config_var('LDVERSION') or s.get_config_var('VERSION'));
print(s.get_config_var('LIBDIR') or '');
print(s.get_config_var('MULTIARCH') or '');
"
RESULT_VARIABLE _PYTHON_SUCCESS
OUTPUT_VARIABLE _PYTHON_VALUES
ERROR_VARIABLE _PYTHON_ERROR_VALUE)
if(NOT _PYTHON_SUCCESS MATCHES 0)
if(PythonLibsNew_FIND_REQUIRED)
message(FATAL_ERROR
"Python config failure:\n${_PYTHON_ERROR_VALUE}")
endif()
set(PYTHONLIBS_FOUND FALSE)
return()
endif()
# Convert the process output into a list
string(REGEX REPLACE ";" "\\\\;" _PYTHON_VALUES ${_PYTHON_VALUES})
string(REGEX REPLACE "\n" ";" _PYTHON_VALUES ${_PYTHON_VALUES})
list(GET _PYTHON_VALUES 0 _PYTHON_VERSION_LIST)
list(GET _PYTHON_VALUES 1 PYTHON_PREFIX)
list(GET _PYTHON_VALUES 2 PYTHON_INCLUDE_DIR)
list(GET _PYTHON_VALUES 3 PYTHON_SITE_PACKAGES)
list(GET _PYTHON_VALUES 4 PYTHON_MODULE_EXTENSION)
list(GET _PYTHON_VALUES 5 PYTHON_IS_DEBUG)
list(GET _PYTHON_VALUES 6 PYTHON_SIZEOF_VOID_P)
list(GET _PYTHON_VALUES 7 PYTHON_LIBRARY_SUFFIX)
list(GET _PYTHON_VALUES 8 PYTHON_LIBDIR)
list(GET _PYTHON_VALUES 9 PYTHON_MULTIARCH)
# Make sure Python has the same pointer size as the chosen compiler
# Skip if CMAKE_SIZEOF_VOID_P is not defined
if(CMAKE_SIZEOF_VOID_P AND (NOT "${PYTHON_SIZEOF_VOID_P}" STREQUAL "${CMAKE_SIZEOF_VOID_P}"))
if(PythonLibsNew_FIND_REQUIRED)
math(EXPR _PYTHON_BITS "${PYTHON_SIZEOF_VOID_P} * 8")
math(EXPR _CMAKE_BITS "${CMAKE_SIZEOF_VOID_P} * 8")
message(FATAL_ERROR
"Python config failure: Python is ${_PYTHON_BITS}-bit, "
"chosen compiler is ${_CMAKE_BITS}-bit")
endif()
set(PYTHONLIBS_FOUND FALSE)
return()
endif()
# The built-in FindPython didn't always give the version numbers
string(REGEX REPLACE "\\." ";" _PYTHON_VERSION_LIST ${_PYTHON_VERSION_LIST})
list(GET _PYTHON_VERSION_LIST 0 PYTHON_VERSION_MAJOR)
list(GET _PYTHON_VERSION_LIST 1 PYTHON_VERSION_MINOR)
list(GET _PYTHON_VERSION_LIST 2 PYTHON_VERSION_PATCH)
# Make sure all directory separators are '/'
string(REGEX REPLACE "\\\\" "/" PYTHON_PREFIX ${PYTHON_PREFIX})
string(REGEX REPLACE "\\\\" "/" PYTHON_INCLUDE_DIR ${PYTHON_INCLUDE_DIR})
string(REGEX REPLACE "\\\\" "/" PYTHON_SITE_PACKAGES ${PYTHON_SITE_PACKAGES})
if(CMAKE_HOST_WIN32)
set(PYTHON_LIBRARY
"${PYTHON_PREFIX}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib")
# when run in a venv, PYTHON_PREFIX points to it. But the libraries remain in the
# original python installation. They may be found relative to PYTHON_INCLUDE_DIR.
if(NOT EXISTS "${PYTHON_LIBRARY}")
get_filename_component(_PYTHON_ROOT ${PYTHON_INCLUDE_DIR} DIRECTORY)
set(PYTHON_LIBRARY
"${_PYTHON_ROOT}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib")
endif()
# raise an error if the python libs are still not found.
if(NOT EXISTS "${PYTHON_LIBRARY}")
message(FATAL_ERROR "Python libraries not found")
endif()
else()
if(PYTHON_MULTIARCH)
set(_PYTHON_LIBS_SEARCH "${PYTHON_LIBDIR}/${PYTHON_MULTIARCH}" "${PYTHON_LIBDIR}")
else()
set(_PYTHON_LIBS_SEARCH "${PYTHON_LIBDIR}")
endif()
#message(STATUS "Searching for Python libs in ${_PYTHON_LIBS_SEARCH}")
# Probably this needs to be more involved. It would be nice if the config
# information the python interpreter itself gave us were more complete.
find_library(PYTHON_LIBRARY
NAMES "python${PYTHON_LIBRARY_SUFFIX}"
PATHS ${_PYTHON_LIBS_SEARCH}
NO_DEFAULT_PATH)
# If all else fails, just set the name/version and let the linker figure out the path.
if(NOT PYTHON_LIBRARY)
set(PYTHON_LIBRARY python${PYTHON_LIBRARY_SUFFIX})
endif()
endif()
MARK_AS_ADVANCED(
PYTHON_LIBRARY
PYTHON_INCLUDE_DIR
)
# We use PYTHON_INCLUDE_DIR, PYTHON_LIBRARY and PYTHON_DEBUG_LIBRARY for the
# cache entries because they are meant to specify the location of a single
# library. We now set the variables listed by the documentation for this
# module.
SET(PYTHON_INCLUDE_DIRS "${PYTHON_INCLUDE_DIR}")
SET(PYTHON_LIBRARIES "${PYTHON_LIBRARY}")
SET(PYTHON_DEBUG_LIBRARIES "${PYTHON_DEBUG_LIBRARY}")
find_package_message(PYTHON
"Found PythonLibs: ${PYTHON_LIBRARY}"
"${PYTHON_EXECUTABLE}${PYTHON_VERSION}")
set(PYTHONLIBS_FOUND TRUE)
void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num,
int boxes_dim, float nms_overlap_thresh, int device_id);
# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick
# --------------------------------------------------------
import numpy as np
cimport numpy as np
assert sizeof(int) == sizeof(np.int32_t)
cdef extern from "gpu_nms.h":
void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int)
def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, float thresh, int device_id=0):
cdef int boxes_num = dets.shape[0]
cdef int boxes_dim = dets.shape[1]
cdef int num_out
cdef np.ndarray[np.int32_t, ndim=1] \
keep = np.zeros(boxes_num, dtype=np.int32)
cdef np.ndarray[np.float32_t, ndim=1] \
scores = dets[:, 4]
cdef np.ndarray[np.intp_t, ndim=1] \
order = scores.argsort()[::-1]
cdef np.ndarray[np.float32_t, ndim=2] \
sorted_dets = dets[order, :]
_nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id)
keep = keep[:num_out]
return list(order[keep])
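# A minimal usage sketch (an illustration, not part of the module): "dets"
# is a float32 array of shape (N, 5) holding [x1, y1, x2, y2, score] rows;
# the call returns the indices of the kept boxes, highest score first.
#
#   import numpy as np
#   dets = np.array([[0., 0., 99., 99., 0.9],
#                    [5., 5., 104., 104., 0.8],
#                    [200., 200., 250., 250., 0.7]], dtype=np.float32)
#   keep = gpu_nms(dets, thresh=0.7, device_id=0)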
#!/bin/sh
# Delete cached build artifacts (-f: do not fail on a clean tree)
rm -rf build install *.c *.cpp
# Compile cpp modules
python setup.py build_ext --inplace
# Compile cuda modules
cd build && cmake .. && make install && cd ..
# Copy to the library root
cp -r install/lib ../
// ------------------------------------------------------------
// Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
//
// Licensed under the BSD 2-Clause License.
// You should have received a copy of the BSD 2-Clause License
// along with the software. If not, See,
//
// <https://opensource.org/licenses/BSD-2-Clause>
//
// ------------------------------------------------------------
#include <vector>
#include "gpu_nms.h"
#include <cstdio>
#include <cstdlib>
#define CUDA_CHECK(condition) \
/* Code block avoids redefinition of cudaError_t error */ \
do { \
cudaError_t error = condition; \
if (error != cudaSuccess) { \
fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(error)); \
exit(1); \
} \
} while (0)
void SetDevice(int device_id) {
int current_device;
CUDA_CHECK(cudaGetDevice(&current_device));
if (current_device == device_id) return;
CUDA_CHECK(cudaSetDevice(device_id));
}
#define DIV_UP(m,n) ((m) / (n) + ((m) % (n) > 0))
#define NMS_BLOCK_SIZE 64
template <typename T>
__device__ T iou(const T* A, const T* B) {
const T x1 = max(A[0], B[0]);
const T y1 = max(A[1], B[1]);
const T x2 = min(A[2], B[2]);
const T y2 = min(A[3], B[3]);
const T width = max((T)0, x2 - x1 + 1);
const T height = max((T)0, y2 - y1 + 1);
const T area = width * height;
const T A_area = (A[2] - A[0] + 1) * (A[3] - A[1] + 1);
const T B_area = (B[2] - B[0] + 1) * (B[3] - B[1] + 1);
return area / (A_area + B_area - area);
}
template <typename T>
__global__ void nms_mask(const int num_boxes, const T nms_thresh,
const T* boxes, unsigned long long* mask) {
const int i_start = blockIdx.x * NMS_BLOCK_SIZE;
const int di_end = min(num_boxes - i_start, NMS_BLOCK_SIZE);
const int j_start = blockIdx.y * NMS_BLOCK_SIZE;
const int dj_end = min(num_boxes - j_start, NMS_BLOCK_SIZE);
const int num_blocks = DIV_UP(num_boxes, NMS_BLOCK_SIZE);
const int bid = blockIdx.x;
const int tid = threadIdx.x;
__shared__ T boxes_i[NMS_BLOCK_SIZE * 4];
if (tid < di_end) {
boxes_i[tid * 4 + 0] = boxes[(i_start + tid) * 5 + 0];
boxes_i[tid * 4 + 1] = boxes[(i_start + tid) * 5 + 1];
boxes_i[tid * 4 + 2] = boxes[(i_start + tid) * 5 + 2];
boxes_i[tid * 4 + 3] = boxes[(i_start + tid) * 5 + 3];
}
__syncthreads();
if (tid < dj_end) {
const T* const box_j = boxes + (j_start + tid) * 5;
unsigned long long mask_j = 0;
const int di_start = (i_start == j_start) ? (tid + 1) : 0;
for (int di = di_start; di < di_end; ++di)
if (iou(box_j, boxes_i + di * 4) > nms_thresh)
mask_j |= 1ULL << di;
mask[(j_start + tid) * num_blocks + bid] = mask_j;
}
}
template <typename T>
void ApplyNMS(const int num_boxes, const int max_keeps, const float thresh,
const T* boxes, int* keep_indices, int& num_keep) {
const int num_blocks = DIV_UP(num_boxes, NMS_BLOCK_SIZE);
const dim3 blocks(num_blocks, num_blocks);
size_t mask_nbytes = num_boxes * num_blocks * sizeof(unsigned long long);
size_t boxes_nbytes = num_boxes * 5 * sizeof(T);
void* boxes_dev, *mask_dev;
CUDA_CHECK(cudaMalloc(&boxes_dev, boxes_nbytes));
CUDA_CHECK(cudaMalloc(&mask_dev, mask_nbytes));
CUDA_CHECK(cudaMemcpy(boxes_dev, boxes, boxes_nbytes, cudaMemcpyHostToDevice));
nms_mask<T> << <blocks, NMS_BLOCK_SIZE >> > (num_boxes, thresh,
(T*)boxes_dev,
(unsigned long long*)mask_dev);
CUDA_CHECK(cudaPeekAtLastError());
std::vector<unsigned long long> mask_host(num_boxes * num_blocks);
CUDA_CHECK(cudaMemcpy(&mask_host[0], mask_dev, mask_nbytes, cudaMemcpyDeviceToHost));
std::vector<unsigned long long> dead_bit(num_blocks);
memset(&dead_bit[0], 0, sizeof(unsigned long long) * num_blocks);
int num_selected = 0;
for (int i = 0; i < num_boxes; ++i) {
const int nblock = i / NMS_BLOCK_SIZE;
const int inblock = i % NMS_BLOCK_SIZE;
if (!(dead_bit[nblock] & (1ULL << inblock))) {
keep_indices[num_selected++] = i;
unsigned long long* mask_i = &mask_host[0] + i * num_blocks;
for (int j = nblock; j < num_blocks; ++j) dead_bit[j] |= mask_i[j];
if (num_selected == max_keeps) break;
}
}
num_keep = num_selected;
CUDA_CHECK(cudaFree(mask_dev));
CUDA_CHECK(cudaFree(boxes_dev));
}
void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num,
int boxes_dim, float nms_overlap_thresh, int device_id) {
// set the device to use
SetDevice(device_id);
// apply gpu nms
ApplyNMS<float>(boxes_num, boxes_num, nms_overlap_thresh,
boxes_host, keep_out, *num_out);
}
\ No newline at end of file
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from distutils.extension import Extension
from distutils.core import setup
from Cython.Distutils import build_ext
import numpy as np
numpy_include = np.get_include()
ext_modules = [
Extension(
"install.lib.utils.cython_bbox",
["bbox.pyx"],
extra_compile_args=["-Wno-cpp", "-Wno-unused-function"],
include_dirs=[numpy_include]),
Extension(
"install.lib.nms.cpu_nms",
["cpu_nms.pyx"],
extra_compile_args=["-Wno-cpp", "-Wno-unused-function"],
include_dirs=[numpy_include]),
Extension(
"install.deprecated.gpu_nms",
["gpu_nms.pyx"],
extra_compile_args=["-Wno-cpp", "-Wno-unused-function"],
language='c++',
include_dirs=[numpy_include]),
Extension(
'install.lib.pycocotools._mask',
['../lib/pycocotools/maskApi.c', '../lib/pycocotools/_mask.pyx'],
include_dirs=[numpy_include, 'pycocotools'],
extra_compile_args=['-Wno-cpp', '-Wno-unused-function', '-std=c99']),
]
setup(name='Detectron', ext_modules=ext_modules, cmdclass={'build_ext': build_ext})
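# This script is invoked by make.sh above via "python setup.py build_ext
# --inplace"; the resulting modules are then collected under install/.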
......@@ -31,14 +31,14 @@ FRCNN:
ROI_XFORM_RESOLUTION: 7
TRAIN:
WEIGHTS: '/model/R-101.Affine.pth'
DATABASE: '/data/coco_2014_trainval35k'
IMS_PER_BATCH: 2
DATASET: '/data/coco_2014_trainval35k'
USE_DIFF: False # Do not use crowd objects
IMS_PER_BATCH: 2
BATCH_SIZE: 512
SCALES: [800]
MAX_SIZE: 1333
TEST:
DATABASE: '/data/coco_2014_minival'
DATASET: '/data/coco_2014_minival'
JSON_FILE: '/data/instances_minival2014.json'
PROTOCOL: 'coco'
RPN_POST_NMS_TOP_N: 1000
......
......@@ -31,14 +31,14 @@ FRCNN:
ROI_XFORM_RESOLUTION: 7
TRAIN:
WEIGHTS: '/model/R-101.Affine.pth'
DATABASE: '/data/coco_2014_trainval35k'
IMS_PER_BATCH: 2
DATASET: '/data/coco_2014_trainval35k'
USE_DIFF: False # Do not use crowd objects
IMS_PER_BATCH: 2
BATCH_SIZE: 512
SCALES: [800]
MAX_SIZE: 1333
TEST:
DATABASE: '/data/coco_2014_minival'
DATASET: '/data/coco_2014_minival'
JSON_FILE: '/data/instances_minival2014.json'
PROTOCOL: 'coco'
RPN_POST_NMS_TOP_N: 1000
......
......@@ -22,13 +22,13 @@ FRCNN:
ROI_XFORM_RESOLUTION: 7
TRAIN:
WEIGHTS: '/model/R-50.Affine.pth'
DATABASE: '/data/voc_0712_trainval'
DATASET: '/data/voc_0712_trainval'
IMS_PER_BATCH: 2
BATCH_SIZE: 128
SCALES: [600]
MAX_SIZE: 1000
TEST:
DATABASE: '/data/voc_2007_test'
DATASET: '/data/voc_2007_test'
PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
RPN_POST_NMS_TOP_N: 1000
SCALES: [600]
......
......@@ -28,14 +28,14 @@ FRCNN:
MLP_HEAD_DIM: 4096
TRAIN:
WEIGHTS: '/model/VGG16.RCNN.pth'
DATABASE: '/data/voc_0712_trainval'
DATASET: '/data/voc_0712_trainval'
RPN_MIN_SIZE: 16
IMS_PER_BATCH: 2
BATCH_SIZE: 128
SCALES: [600]
MAX_SIZE: 1000
TEST:
DATABASE: '/data/voc_2007_test'
DATASET: '/data/voc_2007_test'
PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
RPN_MIN_SIZE: 16
RPN_POST_NMS_TOP_N: 300
......
......@@ -21,9 +21,9 @@ MODEL:
'teddy bear', 'hair drier', 'toothbrush']
NUM_CLASSES: 81
SOLVER:
BASE_LR: 0.02
DECAY_STEPS: [30000, 40000]
MAX_STEPS: 45000
BASE_LR: 0.01
DECAY_STEPS: [60000, 80000]
MAX_STEPS: 90000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: coco_retinanet_400
FPN:
......@@ -31,12 +31,15 @@ FPN:
RPN_MAX_LEVEL: 7
TRAIN:
WEIGHTS: '/model/R-50.Affine.pth'
DATABASE: '/data/coco_2014_trainval35k'
DATASET: '/data/coco_2014_trainval35k'
USE_DIFF: False # Do not use crowd objects
USE_COLOR_JITTER: True
IMS_PER_BATCH: 8
SCALES: [400]
MAX_SIZE: 666
RANDOM_SCALES: [0.75, 1.0]
TEST:
DATABASE: '/data/coco_2014_minival'
DATASET: '/data/coco_2014_minival'
JSON_FILE: '/data/instances_minival2014.json'
PROTOCOL: 'coco'
IMS_PER_BATCH: 1
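# Note (an assumption based on the Detectron-style conventions this config
# mirrors): SCALES is the target length of the shorter image side, MAX_SIZE
# caps the longer side, and RANDOM_SCALES samples a training-time jitter
# factor from the given range.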
......
NUM_GPUS: 4
VIS: False
ENABLE_TENSOR_BOARD: False
MODEL:
TYPE: retinanet
BACKBONE: resnet50.fpn
CLASSES: ['__background__',
'person', 'bicycle', 'car', 'motorcycle', 'airplane',
'bus', 'train', 'truck', 'boat', 'traffic light',
'fire hydrant', 'stop sign', 'parking meter', 'bench',
'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant',
'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife',
'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli',
'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop',
'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors',
'teddy bear', 'hair drier', 'toothbrush']
NUM_CLASSES: 81
SOLVER:
BASE_LR: 0.02
WARM_UP_STEPS: 2000 # default: 500
DECAY_STEPS: [120000, 160000]
MAX_STEPS: 180000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: coco_retinanet_400
FPN:
RPN_MIN_LEVEL: 3
RPN_MAX_LEVEL: 7
DROPBLOCK:
DROP_ON: True
DECREMENT: 0.000005 # * 20000 = 0.1
TRAIN:
WEIGHTS: '/model/R-50.Affine.pth'
DATABASE: '/data/coco_2014_trainval35k'
IMS_PER_BATCH: 8
SCALES: [400]
MAX_SIZE: 666
USE_SCALE_JITTER: True
USE_COLOR_JITTER: True
SCALE_JITTER_RANGE: [0.75, 1.33]
TEST:
DATABASE: '/data/coco_2014_minival'
JSON_FILE: '/data/instances_minival2014.json'
PROTOCOL: 'coco'
IMS_PER_BATCH: 1
SCALES: [400]
MAX_SIZE: 666
NMS: 0.5
\ No newline at end of file
NUM_GPUS: 1
VIS: False
VIS_ON_FILE: False
MODEL:
TYPE: retinanet
BACKBONE: resnet18.fpn
CLASSES: ['__background__',
'aeroplane', 'bicycle', 'bird', 'boat',
'bottle', 'bus', 'car', 'cat', 'chair',
'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'pottedplant',
'sheep', 'sofa', 'train', 'tvmonitor']
NUM_CLASSES: 21
SOLVER:
BASE_LR: 0.01
DECAY_STEPS: [40000, 50000, 60000]
WARM_UP_STEPS: 2000
MAX_STEPS: 60000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: voc_retinanet_300
FPN:
RPN_MIN_LEVEL: 3
RPN_MAX_LEVEL: 7
TRAIN:
WEIGHTS: '/model/R-18.Affine.pth'
DATABASE: '/data/voc_0712_trainval'
IMS_PER_BATCH: 32
SCALES: [300]
MAX_SIZE: 500
SCALE_JITTER_RANGE: [0.5, 2.0]
USE_SCALE_JITTER: True
USE_COLOR_JITTER: True
TEST:
DATABASE: '/data/voc_2007_test'
PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
IMS_PER_BATCH: 1
SCALES: [300]
MAX_SIZE: 500
NMS: 0.45
\ No newline at end of file
......@@ -12,27 +12,24 @@ MODEL:
'sheep', 'sofa', 'train', 'tvmonitor']
NUM_CLASSES: 21
SOLVER:
BASE_LR: 0.02
BASE_LR: 0.01
DECAY_STEPS: [40000, 50000, 60000]
MAX_STEPS: 60000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: voc_retinanet_300
SNAPSHOT_PREFIX: voc_retinanet_320
FPN:
RPN_MIN_LEVEL: 3
RPN_MAX_LEVEL: 7
TRAIN:
WEIGHTS: '/model/AirNet.Affine.pth'
DATABASE: '/data/voc_0712_trainval'
IMS_PER_BATCH: 32
SCALES: [300]
MAX_SIZE: 500
SCALE_JITTER_RANGE: [0.5, 2.0]
USE_SCALE_JITTER: True
DATASET: '/data/voc_0712_trainval'
USE_COLOR_JITTER: True
IMS_PER_BATCH: 32
SCALES: [320]
RANDOM_SCALES: [0.5, 1.0]
TEST:
DATABASE: '/data/voc_2007_test'
DATASET: '/data/voc_2007_test'
PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
IMS_PER_BATCH: 1
SCALES: [300]
MAX_SIZE: 500
SCALES: [320]
NMS: 0.45
\ No newline at end of file
......@@ -17,23 +17,20 @@ SOLVER:
WARM_UP_STEPS: 2000
MAX_STEPS: 60000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: voc_retinanet_300
SNAPSHOT_PREFIX: voc_retinanet_320
FPN:
RPN_MIN_LEVEL: 3
RPN_MAX_LEVEL: 7
TRAIN:
WEIGHTS: '/model/R-34.Affine.pth'
DATABASE: '/data/voc_0712_trainval'
IMS_PER_BATCH: 32
SCALES: [300]
MAX_SIZE: 500
SCALE_JITTER_RANGE: [0.5, 2.0]
USE_SCALE_JITTER: True
WEIGHTS: '/model/R-50.Affine.pth'
DATASET: '/data/voc_0712_trainval'
USE_COLOR_JITTER: True
IMS_PER_BATCH: 32
SCALES: [320]
RANDOM_SCALES: [0.5, 2.0]
TEST:
DATABASE: '/data/voc_2007_test'
DATASET: '/data/voc_2007_test'
PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
IMS_PER_BATCH: 1
SCALES: [300]
MAX_SIZE: 500
SCALES: [320]
NMS: 0.45
\ No newline at end of file
......@@ -16,24 +16,25 @@ SOLVER:
DECAY_STEPS: [80000, 100000, 120000]
MAX_STEPS: 120000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: voc_ssd_300
SNAPSHOT_PREFIX: voc_ssd_320
SSD:
RESIZE:
HEIGHT: 300
WIDTH: 300
NUM_CONVS: 2
MULTIBOX:
STRIDES: [8, 16, 32]
MIN_SIZES: [30, 90, 150]
MAX_SIZES: [90, 150, 210]
STRIDES: [8, 16, 32]
ASPECT_RATIOS: [[1, 2, 0.5], [1, 2, 0.5], [1, 2, 0.5]]
TRAIN:
WEIGHTS: '/model/AirNet.Affine.pth'
DATABASE: '/data/voc_0712_trainval'
DATASET: '/data/voc_0712_trainval'
SCALES: [320]
RANDOM_SCALES: [0.25, 1.00]
IMS_PER_BATCH: 32
TEST:
DATABASE: '/data/voc_2007_test'
DATASET: '/data/voc_2007_test'
PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
IMS_PER_BATCH: 8
SCALES: [320]
NMS_TOP_K: 400
NMS: 0.45
SCORE_THRESH: 0.01
......
......@@ -14,30 +14,35 @@ MODEL:
NUM_CLASSES: 21
SOLVER:
BASE_LR: 0.001
WARM_UP_FACTOR: 0.
WEIGHT_DECAY: 0.0005
DECAY_STEPS: [80000, 100000, 120000]
MAX_STEPS: 120000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: voc_ssd_300
SSD:
RESIZE:
HEIGHT: 300
WIDTH: 300
MULTIBOX:
STRIDES: [8, 16, 32, 64, 100, 300]
MIN_SIZES: [30, 60, 110, 162, 213, 264]
MAX_SIZES: [60, 110, 162, 213, 264, 315]
ASPECT_RATIOS: [[1, 2, 0.5], [1, 2, 0.5, 3, 0.33], [1, 2, 0.5, 3, 0.33],
[1, 2, 0.5, 3, 0.33], [1, 2, 0.5], [1, 2, 0.5]]
ASPECT_RATIOS: [
[1, 2, 0.5],
[1, 2, 0.5, 3, 0.33],
[1, 2, 0.5, 3, 0.33],
[1, 2, 0.5, 3, 0.33],
[1, 2, 0.5],
[1, 2, 0.5]
]
TRAIN:
WEIGHTS: '/model/VGG16.SSD.pth'
DATABASE: '/data/voc_0712_trainval'
DATASET: '/data/voc_0712_trainval'
IMS_PER_BATCH: 32
SCALES: [300]
RANDOM_SCALES: [0.25, 1.00]
TEST:
DATABASE: '/data/voc_2007_test'
DATASET: '/data/voc_2007_test'
PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
IMS_PER_BATCH: 8
SCALES: [300]
NMS_TOP_K: 400
NMS: 0.45
SCORE_THRESH: 0.01
......
......@@ -22,23 +22,29 @@ SOLVER:
SNAPSHOT_PREFIX: voc_ssd_320
SSD:
NUM_CONVS: 2
RESIZE:
HEIGHT: 320
WIDTH: 320
MULTIBOX:
STRIDES: [8, 16, 32, 64, 100, 300]
MIN_SIZES: [30, 60, 110, 162, 213, 264]
MAX_SIZES: [60, 110, 162, 213, 264, 315]
ASPECT_RATIOS: [[1, 2, 0.5], [1, 2, 0.5, 3, 0.33], [1, 2, 0.5, 3, 0.33],
[1, 2, 0.5, 3, 0.33], [1, 2, 0.5], [1, 2, 0.5]]
ASPECT_RATIOS: [
[1, 2, 0.5],
[1, 2, 0.5, 3, 0.33],
[1, 2, 0.5, 3, 0.33],
[1, 2, 0.5, 3, 0.33],
[1, 2, 0.5],
[1, 2, 0.5]
]
TRAIN:
WEIGHTS: '/model/R-50.Affine.pth'
DATABASE: '/data/voc_0712_trainval'
DATASET: '/data/voc_0712_trainval'
SCALES: [320]
RANDOM_SCALES: [0.25, 1.00]
IMS_PER_BATCH: 32
TEST:
DATABASE: '/data/voc_2007_test'
DATASET: '/data/voc_2007_test'
PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
IMS_PER_BATCH: 8
SCALES: [320]
NMS_TOP_K: 400
NMS: 0.45
SCORE_THRESH: 0.01
......
#include <dragon/core/workspace.h>
#include <dragon/utils/math_utils.h>
#include "../utils/detection_utils.h"
#include "nms_op.h"
namespace dragon {
template <class Context> template <typename T>
void NonMaxSuppressionOp<Context>::DoRunWithType() {
int num_selected;
utils::detection::ApplyNMS(
Output(0)->count(),
Output(0)->count(),
iou_threshold_,
Input(0).template mutable_data<T, Context>(),
Output(0)->template mutable_data<int64_t, CPUContext>(),
num_selected, ctx()
);
Output(0)->Reshape({ num_selected });
}
template <class Context>
void NonMaxSuppressionOp<Context>::RunOnDevice() {
CHECK(Input(0).ndim() == 2 && Input(0).dim(1) == 5)
<< "\nThe dimensions of boxes should be (num_boxes, 5).";
Output(0)->Reshape({ Input(0).dim(0) });
DispatchHelper<TensorTypes<float>>::Call(this, Input(0));
}
DEPLOY_CPU(NonMaxSuppression);
#ifdef USE_CUDA
DEPLOY_CUDA(NonMaxSuppression);
#endif
OPERATOR_SCHEMA(NonMaxSuppression).NumInputs(1).NumOutputs(1);
NO_GRADIENT(NonMaxSuppression);
} // namespace dragon
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef SEETADET_CXX_OPERATORS_NMS_OP_H_
#define SEETADET_CXX_OPERATORS_NMS_OP_H_
#include "dragon/core/operator.h"
namespace dragon {
template <class Context>
class NonMaxSuppressionOp final : public Operator<Context> {
public:
NonMaxSuppressionOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
iou_threshold_(OpArg<float>("iou_threshold", 0.5f)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T>
void DoRunWithType();
protected:
float iou_threshold_;
};
} // namespace dragon
#endif // SEETADET_CXX_OPERATORS_NMS_OP_H_
#include <dragon/core/workspace.h>
#include <dragon/utils/math_utils.h>
#include "../utils/detection_utils.h"
#include "retinanet_decoder_op.h"
namespace dragon {
template <class Context> template <typename T>
void RetinaNetDecoderOp<Context>::DoRunWithType() {
using BT = float; // DType of BBox
using BC = CPUContext; // Context of BBox
int feat_h, feat_w;
int C = Input(-3).dim(2), A, K;
int total_proposals = 0;
int num_candidates, num_boxes, num_proposals;
auto* batch_scores = Input(-3).template data<T, BC>();
auto* batch_deltas = Input(-2).template data<T, BC>();
auto* im_info = Input(-1).template data<BT, BC>();
auto* y = Output(0)->template mutable_data<BT, BC>();
for (int n = 0; n < num_images_; ++n) {
BT im_h = im_info[0];
BT im_w = im_info[1];
BT im_scale_h = im_info[2];
BT im_scale_w = im_info[2];
if (Input(-1).dim(1) == 4) im_scale_w = im_info[3];
auto* scores = batch_scores + n * Input(-3).stride(0);
auto* deltas = batch_deltas + n * Input(-2).stride(0);
CHECK_EQ(strides_.size(), InputSize() - 3)
<< "\nGiven " << strides_.size() << " strides "
<< "and " << InputSize() - 3 << " features";
// Select the top-k candidates as proposals
num_boxes = Input(-3).dim(1);
num_candidates = Input(-3).count(1);
roi_indices_.resize(num_candidates);
num_candidates = 0;
for (int i = 0; i < roi_indices_.size(); ++i)
if (scores[i] > score_thr_)
roi_indices_[num_candidates++] = i;
scores_.resize(num_candidates);
for (int i = 0; i < num_candidates; ++i)
scores_[i] = scores[roi_indices_[i]];
num_proposals = std::min(
num_candidates,
(int)pre_nms_topn_
);
utils::math::ArgPartition(
num_candidates,
num_proposals,
true,
scores_.data(),
indices_
);
for (int i = 0; i < num_proposals; ++i)
indices_[i] = roi_indices_[indices_[i]];
// Decode the candidates
int base_offset = 0;
for (int i = 0; i < strides_.size(); i++) {
feat_h = Input(i).dim(2);
feat_w = Input(i).dim(3);
K = feat_h * feat_w;
A = int(ratios_.size() * scales_.size());
anchors_.resize((size_t)(A * 4));
utils::detection::GenerateAnchors(
strides_[i],
(int)ratios_.size(),
(int)scales_.size(),
ratios_.data(),
scales_.data(),
anchors_.data()
);
utils::detection::GenerateGridAnchors(
num_proposals, C, A,
feat_h, feat_w,
strides_[i],
base_offset,
anchors_.data(),
indices_.data(),
y
);
base_offset += (A * K);
}
utils::detection::GenerateMCProposals(
num_proposals,
num_boxes, C,
n,
im_h,
im_w,
im_scale_h,
im_scale_w,
scores,
deltas,
indices_.data(),
y
);
total_proposals += num_proposals;
y += (num_proposals * 7);
im_info += Input(-1).dim(1);
}
Output(0)->Reshape({ total_proposals, 7 });
}
template <class Context>
void RetinaNetDecoderOp<Context>::RunOnDevice() {
num_images_ = Input(0).dim(0);
CHECK_EQ(Input(-1).dim(0), num_images_)
<< "\nExcepted " << num_images_
<< " groups info, got "
<< Input(-1).dim(0) << ".";
Output(0)->Reshape({ num_images_ * pre_nms_topn_, 7 });
DispatchHelper<TensorTypes<float>>::Call(this, Input(-3));
}
DEPLOY_CPU(RetinaNetDecoder);
#ifdef USE_CUDA
DEPLOY_CUDA(RetinaNetDecoder);
#endif
OPERATOR_SCHEMA(RetinaNetDecoder)
.NumInputs(3, INT_MAX)
.NumOutputs(1, INT_MAX);
} // namespace dragon
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef SEETADET_CXX_OPERATORS_RETINANET_DECODER_OP_H_
#define SEETADET_CXX_OPERATORS_RETINANET_DECODER_OP_H_
#include "dragon/core/operator.h"
namespace dragon {
template <class Context>
class RetinaNetDecoderOp final : public Operator<Context> {
public:
RetinaNetDecoderOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
strides_(OpArgs<int64_t>("strides")),
ratios_(OpArgs<float>("ratios")),
scales_(OpArgs<float>("scales")),
pre_nms_topn_(OpArg<int64_t>("pre_nms_top_n", 6000)),
score_thr_(OpArg<float>("score_thresh", 0.05f)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T>
void DoRunWithType();
protected:
float score_thr_;
vec64_t strides_, indices_, roi_indices_;
vector<float> ratios_, scales_, scores_, anchors_;
int64_t num_images_, pre_nms_topn_;
};
} // namespace dragon
#endif // SEETADET_CXX_OPERATORS_RETINANET_DECODER_OP_H_
#include <dragon/core/workspace.h>
#include <dragon/utils/math_utils.h>
#include "../utils/detection_utils.h"
#include "rpn_decoder_op.h"
namespace dragon {
template <class Context> template <typename T>
void RPNDecoderOp<Context>::DoRunWithType() {
using BT = float; // DType of BBox
using BC = CPUContext; // Context of BBox
int feat_h, feat_w, K, A;
int total_rois = 0, num_rois;
int num_candidates, num_proposals;
auto* batch_scores = Input(-3).template data<T, BC>();
auto* batch_deltas = Input(-2).template data<T, BC>();
auto* im_info = Input(-1).template data<BT, BC>();
auto* y = Output(0)->template mutable_data<BT, BC>();
for (int n = 0; n < num_images_; ++n) {
const BT im_h = im_info[0];
const BT im_w = im_info[1];
const BT scale = im_info[2];
const BT min_box_h = min_size_ * scale;
const BT min_box_w = min_size_ * scale;
auto* scores = batch_scores + n * Input(-3).stride(0);
auto* deltas = batch_deltas + n * Input(-2).stride(0);
if (strides_.size() == 1) {
// Case 1: single stride
feat_h = Input(0).dim(2);
feat_w = Input(0).dim(3);
K = feat_h * feat_w;
A = int(ratios_.size() * scales_.size());
// Select the Top-K candidates as proposals
num_candidates = A * K;
num_proposals = std::min(
num_candidates,
(int)pre_nms_topn_
);
utils::math::ArgPartition(
num_candidates,
num_proposals,
true, scores, indices_
);
// Decode the candidates
anchors_.resize((size_t)(A * 4));
proposals_.Reshape({ num_proposals, 5 });
utils::detection::GenerateAnchors(
strides_[0],
(int)ratios_.size(),
(int)scales_.size(),
ratios_.data(),
scales_.data(),
anchors_.data()
);
utils::detection::GenerateGridAnchors(
num_proposals, A,
feat_h, feat_w,
strides_[0],
0,
anchors_.data(),
indices_.data(),
proposals_.template mutable_data<BT, BC>()
);
utils::detection::GenerateSSProposals(
K, num_proposals,
im_h, im_w,
min_box_h, min_box_w,
scores,
deltas,
indices_.data(),
proposals_.template mutable_data<BT, BC>()
);
// Sort, NMS and Retrieve
utils::detection::SortProposals(
0,
num_proposals - 1,
num_proposals,
proposals_.template mutable_data<BT, BC>()
);
utils::detection::ApplyNMS(
num_proposals,
post_nms_topn_,
nms_thr_,
proposals_.template mutable_data<BT, Context>(),
roi_indices_.data(),
num_rois, ctx()
);
utils::detection::RetrieveRoIs(
num_rois,
n,
proposals_.template data<BT, BC>(),
roi_indices_.data(),
y
);
} else if (strides_.size() > 1) {
// Case 2: multiple strides
CHECK_EQ(strides_.size(), InputSize() - 3)
<< "\nGiven " << strides_.size() << " strides "
<< "and " << InputSize() - 3 << " feature inputs";
CHECK_EQ(strides_.size(), scales_.size())
<< "\nGiven " << strides_.size() << " strides "
<< "and " << scales_.size() << " scales";
// Select the top-k candidates as proposals
num_candidates = Input(-3).dim(1);
num_proposals = std::min(
num_candidates,
(int)pre_nms_topn_
);
utils::math::ArgPartition(
num_candidates,
num_proposals,
true, scores, indices_
);
// Decode the candidates
int base_offset = 0;
proposals_.Reshape({ num_proposals, 5 });
auto* proposals = proposals_
.template mutable_data<BT, BC>();
for (int i = 0; i < strides_.size(); i++) {
feat_h = Input(i).dim(2);
feat_w = Input(i).dim(3);
K = feat_h * feat_w;
A = (int)ratios_.size();
anchors_.resize((size_t)(A * 4));
utils::detection::GenerateAnchors(
strides_[i],
(int)ratios_.size(),
1,
ratios_.data(),
scales_.data(),
anchors_.data()
);
utils::detection::GenerateGridAnchors(
num_proposals, A,
feat_h, feat_w,
strides_[i],
base_offset,
anchors_.data(),
indices_.data(),
proposals
);
base_offset += (A * K);
}
utils::detection::GenerateMSProposals(
num_candidates,
num_proposals,
im_h, im_w,
min_box_h, min_box_w,
scores,
deltas,
&indices_[0],
proposals
);
// Sort, NMS and Retrieve
utils::detection::SortProposals(
0,
num_proposals - 1,
num_proposals,
proposals
);
utils::detection::ApplyNMS(
num_proposals,
post_nms_topn_,
nms_thr_,
proposals_.template mutable_data<BT, Context>(),
roi_indices_.data(),
num_rois, ctx()
);
utils::detection::RetrieveRoIs(
num_rois,
n,
proposals,
roi_indices_.data(),
y
);
} else {
LOG(FATAL) << "Expected at least one stride for proposals.";
}
total_rois += num_rois;
y += (num_rois * 5);
im_info += Input(-1).dim(1);
}
Output(0)->Reshape({ total_rois, 5 });
// Distribute rois into K bins
if (OutputSize() > 1) {
CHECK_EQ(max_level_ - min_level_ + 1, OutputSize())
<< "\nExcepted " << OutputSize() << " outputs for levels "
"between [" << min_level_ << ", " << max_level_ << "].";
vector<BT*> ys(OutputSize());
vector<vec64_t> bins(OutputSize());
Tensor RoIs; RoIs.ReshapeLike(*Output(0));
auto* rois = RoIs.template mutable_data<BT, BC>();
ctx()->template Copy<BT, BC, BC>(
Output(0)->count(),
rois, Output(0)->template data<BT, BC>()
);
utils::detection::CollectRoIs(
total_rois,
min_level_,
max_level_,
canonical_level_,
canonical_scale_,
rois, bins
);
for (int i = 0; i < OutputSize(); i++) {
Output(i)->Reshape({ std::max((int)bins[i].size(), 1), 5 });
ys[i] = Output(i)->template mutable_data<BT, BC>();
}
utils::detection::DistributeRoIs(bins, rois, ys);
}
}
template <class Context>
void RPNDecoderOp<Context>::RunOnDevice() {
num_images_ = Input(0).dim(0);
CHECK_EQ(Input(-1).dim(0), num_images_)
<< "\nExcepted " << num_images_
<< " groups info, got "
<< Input(-1).dim(0) << ".";
roi_indices_.resize(post_nms_topn_);
Output(0)->Reshape({ num_images_ * post_nms_topn_, 5 });
DispatchHelper<TensorTypes<float>>::Call(this, Input(-3));
}
DEPLOY_CPU(RPNDecoder);
#ifdef USE_CUDA
DEPLOY_CUDA(RPNDecoder);
#endif
OPERATOR_SCHEMA(RPNDecoder)
.NumInputs(3, INT_MAX)
.NumOutputs(1, INT_MAX);
} // namespace dragon
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef SEETADET_CXX_OPERATORS_RPN_DECODER_OP_H_
#define SEETADET_CXX_OPERATORS_RPN_DECODER_OP_H_
#include "dragon/core/operator.h"
namespace dragon {
template <class Context>
class RPNDecoderOp final : public Operator<Context> {
public:
RPNDecoderOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
strides_(OpArgs<int64_t>("strides")),
ratios_(OpArgs<float>("ratios")),
scales_(OpArgs<float>("scales")),
pre_nms_topn_(OpArg<int64_t>("pre_nms_top_n", 6000)),
post_nms_topn_(OpArg<int64_t>("post_nms_top_n", 300)),
nms_thr_(OpArg<float>("nms_thresh", 0.7f)),
min_size_(OpArg<int64_t>("min_size", 16)),
min_level_(OpArg<int64_t>("min_level", 2)),
max_level_(OpArg<int64_t>("max_level", 5)),
canonical_level_(OpArg<int64_t>("canonical_level", 4)),
canonical_scale_(OpArg<int64_t>("canonical_scale", 224)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T>
void DoRunWithType();
protected:
float nms_thr_;
vec64_t strides_, indices_, roi_indices_;
vector<float> ratios_, scales_, scores_, anchors_;
int64_t min_size_, pre_nms_topn_, post_nms_topn_;
int64_t num_images_, min_level_, max_level_;
int64_t canonical_level_, canonical_scale_;
Tensor proposals_;
};
} // namespace dragon
#endif // SEETADET_CXX_OPERATORS_RPN_DECODER_OP_H_
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Build cxx sources."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from distutils.core import setup
from dragon.tools import cpp_extension
if cpp_extension.CUDA_HOME is not None and \
cpp_extension._cuda.is_available():
Extension = cpp_extension.CUDAExtension
else:
Extension = cpp_extension.CppExtension
ext_modules = [
Extension(
name='install.lib.modules._C',
sources=[
'utils/detection_utils.cc',
'utils/detection_utils.cu',
'operators/nms_op.cc',
'operators/retinanet_decoder_op.cc',
'operators/rpn_decoder_op.cc',
],
),
]
setup(
name='SeetaDet',
ext_modules=ext_modules,
cmdclass={'build_ext': cpp_extension.BuildExtension}
)
#include <dragon/core/context.h>
#include "detection_utils.h"
namespace dragon {
namespace utils {
namespace detection {
template <typename T>
T IoU(const T A[], const T B[]) {
if (A[0] > B[2] || A[1] > B[3] ||
A[2] < B[0] || A[3] < B[1]) return 0;
const T x1 = std::max(A[0], B[0]);
const T y1 = std::max(A[1], B[1]);
const T x2 = std::min(A[2], B[2]);
const T y2 = std::min(A[3], B[3]);
const T width = std::max((T)0, x2 - x1 + 1);
const T height = std::max((T)0, y2 - y1 + 1);
const T area = width * height;
const T A_area = (A[2] - A[0] + 1) * (A[3] - A[1] + 1);
const T B_area = (B[2] - B[0] + 1) * (B[3] - B[1] + 1);
return area / (A_area + B_area - area);
}
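// Worked example (with the +1 "pixel" convention above): A = [0, 0, 9, 9]
// and B = [5, 5, 14, 14] are both 10x10 boxes of area 100; the intersection
// [5, 5, 9, 9] covers 5 * 5 = 25, so IoU = 25 / (100 + 100 - 25) ~= 0.143.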
template <> void ApplyNMS<float, CPUContext>(
const int num_boxes,
const int max_keeps,
const float thresh,
const float* boxes,
int64_t* keep_indices,
int& num_keep,
CPUContext* ctx) {
int count = 0;
std::vector<char> is_dead(num_boxes);
for (int i = 0; i < num_boxes; ++i) is_dead[i] = 0;
for (int i = 0; i < num_boxes; ++i) {
if (is_dead[i]) continue;
keep_indices[count++] = i;
if (count == max_keeps) break;
for (int j = i + 1; j < num_boxes; ++j)
if (!is_dead[j] && IoU(&boxes[i * 5],
&boxes[j * 5]) > thresh)
is_dead[j] = 1;
}
num_keep = count;
}
} // namespace detection
} // namespace utils
} // namespace dragon
#ifdef USE_CUDA
#include <dragon/core/context_cuda.h>
#include "detection_utils.h"
namespace dragon {
namespace utils {
namespace detection {
#define DIV_UP(m,n) ((m) / (n) + ((m) % (n) > 0))
#define NUM_THREADS 64
namespace {
template <typename T>
__device__ bool _CheckIoU(
const T* a,
const T* b,
const float thresh) {
const T x1 = max(a[0], b[0]);
const T y1 = max(a[1], b[1]);
const T x2 = min(a[2], b[2]);
const T y2 = min(a[3], b[3]);
const T width = max(T(0), x2 - x1 + 1);
const T height = max(T(0), y2 - y1 + 1);
const T inter = width * height;
const T Sa = (a[2] - a[0] + T(1)) * (a[3] - a[1] + T(1));
const T Sb = (b[2] - b[0] + T(1)) * (b[3] - b[1] + T(1));
return inter > thresh * (Sa + Sb - inter);
}
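// The kernel below fills a pairwise suppression bitmask: boxes are tiled
// into NUM_THREADS-sized groups, each thread owns one "row" box, and bit i
// of its 64-bit word is set if "column" box i overlaps it above the
// threshold. The greedy selection over this mask runs on the host below.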
template <typename T>
__global__ void _NonMaxSuppression(
const int num_blocks,
const int num_boxes,
const T thresh,
const T* dev_boxes,
uint64_t* dev_mask) {
const int row_start = blockIdx.y;
const int col_start = blockIdx.x;
if (row_start > col_start) return;
const int row_size = min(num_boxes - row_start * NUM_THREADS, NUM_THREADS);
const int col_size = min(num_boxes - col_start * NUM_THREADS, NUM_THREADS);
__shared__ T block_boxes[NUM_THREADS * 4];
if (threadIdx.x < col_size) {
const int c1 = threadIdx.x * 4;
const int c2 = (col_start * NUM_THREADS + threadIdx.x) * 5;
block_boxes[c1] = dev_boxes[c2];
block_boxes[c1 + 1] = dev_boxes[c2 + 1];
block_boxes[c1 + 2] = dev_boxes[c2 + 2];
block_boxes[c1 + 3] = dev_boxes[c2 + 3];
}
__syncthreads();
if (threadIdx.x < row_size) {
const int index = row_start * NUM_THREADS + threadIdx.x;
const T* dev_box = dev_boxes + index * 5;
unsigned long long val = 0;
const int start = (row_start == col_start) ? (threadIdx.x + 1) : 0;
for (int i = start; i < col_size; ++i) {
if (_CheckIoU(dev_box, block_boxes + i * 4, thresh)) {
val |= 1ULL << i;
}
}
dev_mask[index * num_blocks + col_start] = val;
}
}
} // namespace
template <> void ApplyNMS<float, CUDAContext>(
const int num_boxes,
const int max_keeps,
const float thresh,
const float* boxes,
int64_t* keep_indices,
int& num_keep,
CUDAContext* ctx) {
const int num_blocks = DIV_UP(num_boxes, NUM_THREADS);
vector<uint64_t> mask_host(num_boxes * num_blocks);
auto* mask_dev = (uint64_t*)ctx->New(mask_host.size() * sizeof(uint64_t));
_NonMaxSuppression
<<< dim3(num_blocks, num_blocks), NUM_THREADS,
0, ctx->cuda_stream() >>>(
num_blocks,
num_boxes,
thresh,
boxes,
mask_dev
);
CUDA_CHECK(cudaMemcpyAsync(
mask_host.data(),
mask_dev,
mask_host.size() * sizeof(uint64_t),
cudaMemcpyDeviceToHost,
ctx->cuda_stream()
));
ctx->FinishDeviceComputation();
vector<uint64_t> dead_bit(num_blocks);
memset(&dead_bit[0], 0, sizeof(uint64_t) * num_blocks);
int num_selected = 0;
for (int i = 0; i < num_boxes; ++i) {
const int nblock = i / NUM_THREADS;
const int inblock = i % NUM_THREADS;
if (!(dead_bit[nblock] & (1ULL << inblock))) {
keep_indices[num_selected++] = i;
auto* mask_i = &mask_host[0] + i * num_blocks;
for (int j = nblock; j < num_blocks; ++j) dead_bit[j] |= mask_i[j];
if (num_selected == max_keeps) break;
}
}
num_keep = num_selected;
ctx->Delete(mask_dev);
}
} // namespace detection
} // namespace utils
} // namespace dragon
#endif // USE_CUDA
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef SEETADET_CXX_UTILS_DETECTION_UTILS_H_
#define SEETADET_CXX_UTILS_DETECTION_UTILS_H_
#include "dragon/core/context.h"
#include "dragon/core/operator.h"
namespace dragon {
namespace utils {
namespace detection {
#define ROUND(x) ((int)((x) + (T)0.5))
/******************** BBox ********************/
template <typename T>
inline int FilterBoxes(
const T dx,
const T dy,
const T d_log_w,
const T d_log_h,
const T im_w,
const T im_h,
const T min_box_w,
const T min_box_h,
T* bbox) {
const T w = bbox[2] - bbox[0] + 1;
const T h = bbox[3] - bbox[1] + 1;
const T ctr_x = bbox[0] + (T)0.5 * w;
const T ctr_y = bbox[1] + (T)0.5 * h;
const T pred_ctr_x = dx * w + ctr_x;
const T pred_ctr_y = dy * h + ctr_y;
const T pred_w = exp(d_log_w) * w;
const T pred_h = exp(d_log_h) * h;
bbox[0] = pred_ctr_x - (T)0.5 * pred_w;
bbox[1] = pred_ctr_y - (T)0.5 * pred_h;
bbox[2] = pred_ctr_x + (T)0.5 * pred_w;
bbox[3] = pred_ctr_y + (T)0.5 * pred_h;
bbox[0] = std::max((T)0, std::min(bbox[0], im_w - 1));
bbox[1] = std::max((T)0, std::min(bbox[1], im_h - 1));
bbox[2] = std::max((T)0, std::min(bbox[2], im_w - 1));
bbox[3] = std::max((T)0, std::min(bbox[3], im_h - 1));
const T bbox_w = bbox[2] - bbox[0] + 1;
const T bbox_h = bbox[3] - bbox[1] + 1;
return (bbox_w >= min_box_w) * (bbox_h >= min_box_h);
}
template <typename T>
inline void BBoxTransform(
const T dx,
const T dy,
const T d_log_w,
const T d_log_h,
const T im_w,
const T im_h,
const T im_scale_h,
const T im_scale_w,
T* bbox) {
const T w = bbox[2] - bbox[0] + 1;
const T h = bbox[3] - bbox[1] + 1;
const T ctr_x = bbox[0] + (T)0.5 * w;
const T ctr_y = bbox[1] + (T)0.5 * h;
const T pred_ctr_x = dx * w + ctr_x;
const T pred_ctr_y = dy * h + ctr_y;
const T pred_w = exp(d_log_w) * w;
const T pred_h = exp(d_log_h) * h;
bbox[0] = pred_ctr_x - (T)0.5 * pred_w;
bbox[1] = pred_ctr_y - (T)0.5 * pred_h;
bbox[2] = pred_ctr_x + (T)0.5 * pred_w;
bbox[3] = pred_ctr_y + (T)0.5 * pred_h;
bbox[0] = std::max((T)0, std::min(bbox[0], im_w - 1)) / im_scale_w;
bbox[1] = std::max((T)0, std::min(bbox[1], im_h - 1)) / im_scale_h;
bbox[2] = std::max((T)0, std::min(bbox[2], im_w - 1)) / im_scale_w;
bbox[3] = std::max((T)0, std::min(bbox[3], im_h - 1)) / im_scale_h;
}
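// Worked example of the decoding above: a 10x10 box [0, 0, 9, 9] has
// ctr = (5, 5); deltas (dx, dy, d_log_w, d_log_h) = (0.1, 0, log(2), 0)
// move the center to (6, 5) and double the width to 20, giving
// [-4, 0, 16, 10] before clipping to the image and dividing by the scale.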
/******************** Anchor ********************/
template <typename T>
inline void GenerateAnchors(
int base_size,
const int num_ratios,
const int num_scales,
const T* ratios,
const T* scales,
T* anchors) {
const T base_area = (T)(base_size * base_size);
const T center = (T)0.5 * (base_size - (T)1);
T* offset_anchors = anchors;
for (int i = 0; i < num_ratios; ++i) {
const T ratio_w = (T)ROUND(sqrt(base_area / ratios[i]));
const T ratio_h = (T)ROUND(ratio_w * ratios[i]);
for (int j = 0; j < num_scales; ++j) {
const T scale_w = (T)0.5 * (ratio_w * scales[j] - (T)1);
const T scale_h = (T)0.5 * (ratio_h * scales[j] - (T)1);
offset_anchors[0] = center - scale_w;
offset_anchors[1] = center - scale_h;
offset_anchors[2] = center + scale_w;
offset_anchors[3] = center + scale_h;
offset_anchors += 4;
}
}
}
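// Example: base_size = 16, ratios = {1}, scales = {8} yields the classic
// Faster R-CNN 128x128 anchor [-56, -56, 71, 71]: center = 7.5 and
// half extent = 0.5 * (16 * 8 - 1) = 63.5.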
template <typename T>
inline void GenerateGridAnchors(
const int num_proposals,
const int num_anchors,
const int feat_h,
const int feat_w,
const int stride,
const int base_offset,
const T* anchors,
const int64_t* indices,
T* proposals) {
T x, y;
int idx_3d, a, h, w;
int idx_range = num_anchors * feat_h * feat_w;
for (int i = 0; i < num_proposals; ++i) {
idx_3d = (int)indices[i] - base_offset;
if (idx_3d >= 0 && idx_3d < idx_range) {
w = idx_3d % feat_w;
h = (idx_3d / feat_w) % feat_h;
a = idx_3d / feat_w / feat_h;
x = (T)w * stride, y = (T)h * stride;
auto* A = anchors + a * 4;
auto* P = proposals + i * 5;
P[0] = x + A[0], P[1] = y + A[1];
P[2] = x + A[2], P[3] = y + A[3];
}
}
}
template <typename T>
inline void GenerateGridAnchors(
const int num_proposals,
const int num_classes,
const int num_anchors,
const int feat_h,
const int feat_w,
const int stride,
const int base_offset,
const T* anchors,
const int64_t* indices,
T* proposals) {
T x, y;
int idx_4d, a, h, w;
int lr = num_classes * base_offset;
int rr = num_classes * (num_anchors * feat_h * feat_w);
for (int i = 0; i < num_proposals; ++i) {
idx_4d = (int)indices[i] - lr;
if (idx_4d >= 0 && idx_4d < rr) {
idx_4d /= num_classes;
w = idx_4d % feat_w;
h = (idx_4d / feat_w) % feat_h;
a = idx_4d / feat_w / feat_h;
x = (T)w * stride, y = (T)h * stride;
auto* A = anchors + a * 4;
auto* P = proposals + i * 7 + 1;
P[0] = x + A[0], P[1] = y + A[1];
P[2] = x + A[2], P[3] = y + A[3];
}
}
}
/******************** Proposal ********************/
template <typename T>
void GenerateSSProposals(
const int K,
const int num_proposals,
const float im_h,
const float im_w,
const float min_box_h,
const float min_box_w,
const T* scores,
const T* deltas,
const int64_t* indices,
T* proposals) {
int64_t index, a, k;
const float* delta;
float* proposal = proposals;
float dx, dy, d_log_w, d_log_h;
for (int i = 0; i < num_proposals; ++i) {
index = indices[i];
a = index / K, k = index % K;
delta = deltas + k;
dx = delta[(a * 4 + 0) * K];
dy = delta[(a * 4 + 1) * K];
d_log_w = delta[(a * 4 + 2) * K];
d_log_h = delta[(a * 4 + 3) * K];
proposal[4] = FilterBoxes(
dx, dy,
d_log_w, d_log_h,
im_w, im_h,
min_box_w, min_box_h,
proposal
) * scores[index];
proposal += 5;
}
}
template <typename T>
void GenerateMSProposals(
const int num_candidates,
const int num_proposals,
const float im_h,
const float im_w,
const float min_box_h,
const float min_box_w,
const T* scores,
const T* deltas,
const int64_t* indices,
T* proposals) {
int64_t index;
int64_t num_candidates_2x = 2 * num_candidates;
int64_t num_candidates_3x = 3 * num_candidates;
float* proposal = proposals;
float dx, dy, d_log_w, d_log_h;
for (int i = 0; i < num_proposals; ++i) {
index = indices[i];
dx = deltas[index];
dy = deltas[num_candidates + index];
d_log_w = deltas[num_candidates_2x + index];
d_log_h = deltas[num_candidates_3x + index];
proposal[4] = FilterBoxes(
dx, dy,
d_log_w, d_log_h,
im_w, im_h,
min_box_w, min_box_h,
proposal
) * scores[index];
proposal += 5;
}
}
template <typename T>
void GenerateMCProposals(
const int num_proposals,
const int num_boxes,
const int num_classes,
const int im_idx,
const float im_h,
const float im_w,
const float im_scale_h,
const float im_scale_w,
const T* scores,
const T* deltas,
const int64_t* indices,
T* proposals) {
int64_t index, cls;
int64_t num_boxes_2x = 2 * num_boxes;
int64_t num_boxes_3x = 3 * num_boxes;
float* proposal = proposals;
float dx, dy, d_log_w, d_log_h;
for (int i = 0; i < num_proposals; ++i) {
cls = indices[i] % num_classes;
index = indices[i] / num_classes;
dx = deltas[index];
dy = deltas[num_boxes + index];
d_log_w = deltas[num_boxes_2x + index];
d_log_h = deltas[num_boxes_3x + index];
proposal[0] = im_idx;
BBoxTransform(
dx, dy,
d_log_w, d_log_h,
im_w, im_h,
im_scale_h, im_scale_w,
proposal + 1
);
proposal[5] = scores[indices[i]];
proposal[6] = cls + 1;
proposal += 7;
}
}
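// Partially order proposals by descending score so that the first
// num_top entries are the top-scoring ones; a quickselect-style
// recursion that leaves the tail beyond num_top unsorted.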
template <typename T>
inline void SortProposals(
const int start,
const int end,
const int num_top,
T* proposals) {
const T pivot_score = proposals[start * 5 + 4];
int left = start + 1, right = end;
while (left <= right) {
while (left <= end && proposals[left * 5 + 4] >= pivot_score) ++left;
while (right > start && proposals[right * 5 + 4] <= pivot_score) --right;
if (left <= right) {
for (int i = 0; i < 5; ++i)
std::swap(proposals[left * 5 + i], proposals[right * 5 + i]);
++left;
--right;
}
}
if (right > start) {
for (int i = 0; i < 5; ++i)
std::swap(proposals[start * 5 + i], proposals[right * 5 + i]);
}
if (start < right - 1) SortProposals(start, right - 1, num_top, proposals);
if (right + 1 < num_top && right + 1 < end)
SortProposals(right + 1, end, num_top, proposals);
}
template <typename T>
inline void RetrieveRoIs(
const int num_rois,
const int roi_batch_ind,
const T* proposals,
const int64_t* roi_indices,
T* rois) {
for (int i = 0; i < num_rois; ++i) {
const T* proposal = proposals + roi_indices[i] * 5;
rois[i * 5 + 0] = (T)roi_batch_ind;
rois[i * 5 + 1] = proposal[0];
rois[i * 5 + 2] = proposal[1];
rois[i * 5 + 3] = proposal[2];
rois[i * 5 + 4] = proposal[3];
}
}
template <typename T>
inline int roi_level(
const int min_level,
const int max_level,
const int canonical_level,
const int canonical_scale,
T* roi) {
T w = roi[3] - roi[1] + 1;
T h = roi[4] - roi[2] + 1;
// Follow the level assignment rule of the FPN paper
int level = canonical_level + std::log2(
std::max(std::sqrt(w * h), (T)1) / (T)canonical_scale);
return std::min(max_level, std::max(min_level, level));
}
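For reference, `roi_level` follows the level-assignment rule of the FPN paper (Lin et al., 2017), with the cast to `int` truncating the logarithm; writing $k_0$ for `canonical_level` and $s_0$ for `canonical_scale` (4 and 224 in the paper):

$$k = \mathrm{clamp}\big(k_0 + \log_2(\max(\sqrt{wh},\,1)\,/\,s_0),\ k_{\min},\ k_{\max}\big)$$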
template <typename T>
inline void CollectRoIs(
const int num_rois,
const int min_level,
const int max_level,
const int canonical_level,
const int canonical_scale,
const T* rois,
vector<vec64_t>& roi_bins) {
const T* roi = rois;
for (int i = 0; i < num_rois; ++i) {
int bin_idx = roi_level(min_level, max_level,
canonical_level, canonical_scale, roi);
bin_idx = std::max(bin_idx - min_level, 0);
roi_bins[bin_idx].push_back(i);
roi += 5;
}
}
template <typename T>
inline void DistributeRoIs(
const vector<vec64_t>& roi_bins,
const T* rois,
vector<T*> outputs) {
for (size_t i = 0; i < roi_bins.size(); ++i) {
auto* y = outputs[i];
if (roi_bins[i].size() == 0) {
// Fake a tiny roi to avoid empty roi pooling
y[0] = 0, y[1] = 0, y[2] = 0, y[3] = 1, y[4] = 1;
} else {
for (size_t j = 0; j < roi_bins[i].size(); ++j) {
const T* roi = rois + roi_bins[i][j] * 5;
for (int k = 0; k < 5; ++k) y[k] = roi[k];
y += 5;
}
}
}
}
/******************** NMS ********************/
template <typename T, class Context>
void ApplyNMS(
const int num_boxes,
const int max_keeps,
const T thresh,
const T* boxes,
int64_t* keep_indices,
int& num_keep,
Context* ctx);
} // namespace detection
} // namespace utils
} // namespace dragon
#endif // SEETADET_CXX_UTILS_DETECTION_UTILS_H_
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Compile the cython extensions."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from distutils.extension import Extension
from distutils.core import setup
import os
from Cython.Distutils import build_ext
import numpy as np
ext_modules = [
Extension(
'install.lib.utils.cython_bbox',
['cython_bbox.pyx'],
extra_compile_args=['-w'],
include_dirs=[np.get_include()]
),
Extension(
'install.lib.utils.cython_nms',
['cython_nms.pyx'],
extra_compile_args=['-w'],
include_dirs=[np.get_include()]
),
Extension(
'install.lib.pycocotools._mask',
['maskApi.c', '_mask.pyx'],
include_dirs=[np.get_include(), os.path.dirname(os.path.abspath(__file__))],
extra_compile_args=['-w']
),
]
setup(
name='SeetaDet',
ext_modules=ext_modules,
cmdclass={'build_ext': build_ext},
)
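For reference, this script follows the standard distutils/Cython build flow; a hedged sketch of driving the same build programmatically, equivalent to running ``python setup.py build_ext --inplace`` from the directory holding the ``.pyx`` sources:

```python
# Standard distutils API: run setup.py with the given arguments;
# "--inplace" drops the built extensions next to their sources.
from distutils.core import run_setup

run_setup('setup.py', script_args=['build_ext', '--inplace'])
```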
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import multiprocessing as mp
import time
import dragon
import dragon.vm.torch as torch
import numpy as np
from lib.core.config import cfg
from lib.faster_rcnn.data_transformer import DataTransformer
from lib.datasets.factory import get_imdb
from lib.utils import logger
from lib.utils.blob import im_list_to_blob
class DataLoader(object):
"""Provide mini-batches of data."""
def __init__(self):
super(DataLoader, self).__init__()
database = get_imdb(cfg.TRAIN.DATABASE)
self.data_batch = DataBatch(**{
'dataset': lambda: dragon.io.SeetaRecordDataset(database.source),
'classes': database.classes,
'shuffle': cfg.TRAIN.USE_SHUFFLE,
'num_chunks': cfg.TRAIN.NUM_SHUFFLE_CHUNKS,
'batch_size': cfg.TRAIN.IMS_PER_BATCH * 2,
'num_transformers': cfg.TRAIN.NUM_WORKERS,
})
def __call__(self):
outputs = self.data_batch.get()
outputs['data'] = torch.from_numpy(outputs['data'])
return outputs
class DataBatch(mp.Process):
"""Prefetch the batch of data."""
def __init__(self, **kwargs):
"""Construct a ``DataBatch``.
Parameters
----------
dataset : lambda
The creator of a dataset.
classes : Sequence[str]
The class names.
shuffle : bool, optional, default=False
Whether to shuffle the data.
num_chunks : int, optional, default=0
The number of chunks to split.
batch_size : int, optional, default=2
The size of a mini-batch.
num_transformers : int, optional, default=3
The number of workers to transform data.
"""
super(DataBatch, self).__init__()
# Distributed settings
rank, group_size = 0, 1
process_group = dragon.distributed.get_group()
if process_group is not None and kwargs.get(
'phase', 'TRAIN') == 'TRAIN':
group_size = process_group.size
rank = dragon.distributed.get_rank(process_group)
kwargs['group_size'] = group_size
# Configuration
self._prefetch = kwargs.get('prefetch', 5)
self._batch_size = kwargs.get('batch_size', 2)
self._num_readers = kwargs.get('num_readers', 1)
self._num_transformers = kwargs.get('num_transformers', 3)
self._num_fetchers = kwargs.get('num_fetchers', 1)
self.daemon = True
# Initialize queues
num_batches = self._prefetch * self._num_readers
self.Q1 = mp.Queue(num_batches * self._batch_size)
self.Q21 = mp.Queue(num_batches * self._batch_size)
self.Q22 = mp.Queue(num_batches * self._batch_size)
self.Q3 = mp.Queue(num_batches)
# Initialize readers
self._readers = []
for i in range(self._num_readers):
part_idx, num_parts = i, self._num_readers
num_parts *= group_size
part_idx += rank * self._num_readers
self._readers.append(dragon.io.DataReader(
num_parts=num_parts, part_idx=part_idx, **kwargs))
self._readers[i]._seed += part_idx
self._readers[i].q_out = self.Q1
self._readers[i].start()
time.sleep(0.1)
# Initialize transformers
self._transformers = []
for i in range(self._num_transformers):
transformer = DataTransformer(**kwargs)
transformer._seed += (i + rank * self._num_transformers)
transformer.q_in = self.Q1
transformer.q1_out, transformer.q2_out = self.Q21, self.Q22
transformer.start()
self._transformers.append(transformer)
time.sleep(0.1)
# Initialize batch-producer
self.start()
# Register cleanup callbacks
def cleanup():
def terminate(processes):
for process in processes:
process.terminate()
process.join()
terminate([self])
logger.info('Terminate DataBatch.')
terminate(self._transformers)
logger.info('Terminate DataTransformer.')
terminate(self._readers)
logger.info('Terminate DataReader.')
import atexit
atexit.register(cleanup)
def get(self):
"""Get a batch.
Returns
-------
dict
The batch dict.
"""
return self.Q3.get()
def run(self):
"""Start the process to produce batches."""
def produce(q_in):
processed_ims, ims_info, all_boxes = [], [], []
for image_index in range(cfg.TRAIN.IMS_PER_BATCH):
im, im_scale, gt_boxes = q_in.get()
processed_ims.append(im)
ims_info.append(list(im.shape[:2]) + [im_scale])
im_boxes = np.zeros((gt_boxes.shape[0], gt_boxes.shape[1] + 1), 'float32')
im_boxes[:, :gt_boxes.shape[1]], im_boxes[:, -1] = gt_boxes, image_index
all_boxes.append(im_boxes)
return {
'data': im_list_to_blob(processed_ims),
'ims_info': np.array(ims_info, dtype=np.float32),
'gt_boxes': np.concatenate(all_boxes, axis=0),
}
# Two queues to implement aspect grouping.
# This is necessary to reduce the GPU memory wasted
# on fetching a huge square batch blob.
q1, q2 = self.Q21, self.Q22
# Main prefetch loop
while True:
if q1.qsize() >= cfg.TRAIN.IMS_PER_BATCH:
self.Q3.put(produce(q1))
elif q2.qsize() >= cfg.TRAIN.IMS_PER_BATCH:
self.Q3.put(produce(q2))
q1, q2 = q2, q1 # Swap so both aspect groups are drained uniformly
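The alternating-queue logic above is the whole aspect-grouping scheme: transformers route landscape and portrait images into separate queues, and a batch is assembled only from a single queue. A standalone sketch of the same idea, with illustrative names and ``mp.Queue``-like objects passed in:

```python
def route(sample, height, width, q_landscape, q_portrait):
    # Transformer side: send each sample to the queue of its orientation.
    (q_landscape if width >= height else q_portrait).put(sample)

def next_batch(q1, q2, ims_per_batch):
    # Producer side: drain whichever queue can fill a whole batch,
    # then swap priority so both orientations are consumed evenly.
    batch = None
    if q1.qsize() >= ims_per_batch:
        batch = [q1.get() for _ in range(ims_per_batch)]
    elif q2.qsize() >= ims_per_batch:
        batch = [q2.get() for _ in range(ims_per_batch)]
    return batch, (q2, q1)
```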
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import multiprocessing as mp
import time
import dragon
import dragon.vm.torch as torch
import numpy as np
from lib.core.config import cfg
from lib.mask_rcnn.data_transformer import DataTransformer
from lib.datasets.factory import get_imdb
from lib.utils import logger
from lib.utils.blob import im_list_to_blob
from lib.utils.blob import mask_list_to_blob
class DataLoader(object):
"""Provide mini-batches of data."""
def __init__(self):
super(DataLoader, self).__init__()
database = get_imdb(cfg.TRAIN.DATABASE)
self.data_batch = DataBatch(**{
'dataset': lambda: dragon.io.SeetaRecordDataset(database.source),
'classes': database.classes,
'shuffle': cfg.TRAIN.USE_SHUFFLE,
'num_chunks': cfg.TRAIN.NUM_SHUFFLE_CHUNKS,
'batch_size': cfg.TRAIN.IMS_PER_BATCH * 2,
'num_transformers': cfg.TRAIN.NUM_WORKERS,
})
def __call__(self):
outputs = self.data_batch.get()
outputs['data'] = torch.from_numpy(outputs['data'])
return outputs
class DataBatch(mp.Process):
"""Prefetch the batch of data."""
def __init__(self, **kwargs):
"""Construct a ``DataBatch``.
Parameters
----------
dataset : lambda
The creator of a dataset.
classes : Sequence[str]
The class names.
shuffle : bool, optional, default=False
Whether to shuffle the data.
num_chunks : int, optional, default=0
The number of chunks to split.
batch_size : int, optional, default=2
The size of a mini-batch.
num_transformers : int, optional, default=3
The number of workers to transform data.
"""
super(DataBatch, self).__init__()
# Distributed settings
rank, group_size = 0, 1
process_group = dragon.distributed.get_group()
if process_group is not None and kwargs.get(
'phase', 'TRAIN') == 'TRAIN':
group_size = process_group.size
rank = dragon.distributed.get_rank(process_group)
kwargs['group_size'] = group_size
# Configuration
self._prefetch = kwargs.get('prefetch', 5)
self._batch_size = kwargs.get('batch_size', 2)
self._num_readers = kwargs.get('num_readers', 1)
self._num_transformers = kwargs.get('num_transformers', 3)
self._num_fetchers = kwargs.get('num_fetchers', 1)
self.daemon = True
# Initialize queues
num_batches = self._prefetch * self._num_readers
self.Q1 = mp.Queue(num_batches * self._batch_size)
self.Q21 = mp.Queue(num_batches * self._batch_size)
self.Q22 = mp.Queue(num_batches * self._batch_size)
self.Q3 = mp.Queue(num_batches)
# Initialize readers
self._readers = []
for i in range(self._num_readers):
part_idx, num_parts = i, self._num_readers
num_parts *= group_size
part_idx += rank * self._num_readers
self._readers.append(dragon.io.DataReader(
num_parts=num_parts, part_idx=part_idx, **kwargs))
self._readers[i]._seed += part_idx
self._readers[i].q_out = self.Q1
self._readers[i].start()
time.sleep(0.1)
# Initialize transformers
self._transformers = []
for i in range(self._num_transformers):
transformer = DataTransformer(**kwargs)
transformer._seed += (i + rank * self._num_transformers)
transformer.q_in = self.Q1
transformer.q1_out, transformer.q2_out = self.Q21, self.Q22
transformer.start()
self._transformers.append(transformer)
time.sleep(0.1)
# Initialize batch-producer
self.start()
# Register cleanup callbacks
def cleanup():
def terminate(processes):
for process in processes:
process.terminate()
process.join()
terminate([self])
logger.info('Terminate DataBatch.')
terminate(self._transformers)
logger.info('Terminate DataTransformer.')
terminate(self._readers)
logger.info('Terminate DataReader.')
import atexit
atexit.register(cleanup)
def get(self):
"""Get a batch.
Returns
-------
dict
The batch dict.
"""
return self.Q3.get()
def run(self):
"""Start the process to produce batches."""
def produce(q_in):
processed_ims, ims_info = [], []
packed_boxes, packed_masks = [], []
for image_index in range(cfg.TRAIN.IMS_PER_BATCH):
im, im_scale, gt_boxes, gt_masks = q_in.get()
processed_ims.append(im)
ims_info.append(list(im.shape[:2]) + [im_scale])
im_boxes = np.zeros((gt_boxes.shape[0], gt_boxes.shape[1] + 1), 'float32')
im_boxes[:, :gt_boxes.shape[1]], im_boxes[:, -1] = gt_boxes, image_index
packed_boxes.append(im_boxes)
packed_masks.append(gt_masks)
return {
'data': im_list_to_blob(processed_ims),
'ims_info': np.array(ims_info, 'float32'),
'gt_boxes': np.concatenate(packed_boxes, 0),
'gt_masks': mask_list_to_blob(packed_masks),
}
# Two queues to implement aspect grouping.
# This is necessary to reduce the GPU memory wasted
# on fetching a huge square batch blob.
q1, q2 = self.Q21, self.Q22
# Main prefetch loop
while True:
if q1.qsize() >= cfg.TRAIN.IMS_PER_BATCH:
self.Q3.put(produce(q1))
elif q2.qsize() >= cfg.TRAIN.IMS_PER_BATCH:
self.Q3.put(produce(q2))
q1, q2 = q2, q1 # Swap so both aspect groups are drained uniformly
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import importlib
_STORE = collections.defaultdict(dict)
###########################################
# #
# Body #
# #
###########################################
# ResNet
for D in [18, 34, 50, 101, 152, 200, 269]:
_STORE['BODY']['resnet{}'.format(D)] = \
'lib.modeling.resnet.make_resnet_{}'.format(D)
# VGG
for D in [16, 19]:
for T in ['', '_reduced_300', '_reduced_512']:
_STORE['BODY']['vgg{}{}'.format(D, T)] = \
'lib.modeling.vgg.make_vgg_{}{}'.format(D, T)
# AirNet
for D in ['', '3b', '4b', '5b']:
_STORE['BODY']['airnet{}'.format(D)] = \
'lib.modeling.airnet.make_airnet_{}'.format(D)
# MobileNet
for D in ['a1', 'v2']:
_STORE['BODY']['mobilenet_{}'.format(D)] = \
'lib.modeling.mobilenet.make_mobilenet_{}'.format(D)
def get_template_func(name, sets, desc):
name = name.lower()
if name not in sets:
raise ValueError(
'The {} for {} was not registered.\n'
'Registered modules: [{}]'
.format(name, desc, ', '.join(sets.keys()))
)
module_name = '.'.join(sets[name].split('.')[0:-1])
func_name = sets[name].split('.')[-1]
try:
module = importlib.import_module(module_name)
return getattr(module, func_name)
except ImportError:
raise ValueError('Cannot import module: ' + module_name)
def get_body_func(name):
return get_template_func(
name, _STORE['BODY'], 'Body')
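A hedged usage sketch of this registry; the ``'resnet50'`` key is registered by the loop above, while the signature of the resolved builder lives in ``lib.modeling.resnet`` and is assumed here to take no arguments:

```python
# Resolve the body-building function registered under 'resnet50'.
make_body = get_body_func('resnet50')
body = make_body()  # illustrative call; see lib.modeling.resnet for the real signature
```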
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Define some basic structures."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.torch import nn
from lib.core.config import cfg
class Affine(object):
"""Affine transformation with weight and bias fixed."""
def __new__(cls, dim_in, bias=True, inplace=True):
return nn.Affine(
dim_in,
fix_weight=True,
fix_bias=True,
inplace=inplace,
)
class Conv1x1(object):
"""1x1 convolution."""
def __new__(cls, dim_in, dim_out, stride=1, bias=False):
return nn.Conv2d(
dim_in,
dim_out,
kernel_size=1,
stride=stride,
bias=bias,
)
class Conv3x3(object):
"""3x3 convolution."""
def __new__(cls, dim_in, dim_out, stride=1, dilation=1, bias=False):
return nn.Conv2d(
dim_in,
dim_out,
kernel_size=3,
stride=stride,
padding=1 * dilation,
dilation=dilation,
bias=bias,
)
class CrossEntropyLoss(object):
"""Cross entropy loss."""
def __new__(cls):
return nn.CrossEntropyLoss(ignore_index=-1)
class Identity(nn.Module):
"""Pass input to the output."""
def __init__(self, *args, **kwargs):
super(Identity, self).__init__()
_, _ = args, kwargs
def forward(self, x):
return x
class SigmoidFocalLoss(object):
"""Sigmoid focal loss."""
def __new__(cls):
return nn.SigmoidFocalLoss(
alpha=cfg.MODEL.FOCAL_LOSS_ALPHA,
gamma=cfg.MODEL.FOCAL_LOSS_GAMMA,
)
class SmoothL1Loss(object):
"""Smoothed l1 loss."""
def __new__(cls, beta=1.):
return nn.SmoothL1Loss(
beta=beta,
reduction='batch_size',
)
def is_conv2d(module):
"""Return a bool indicating the module is a Conv2d."""
return isinstance(module, (nn.Conv2d, nn.DepthwiseConv2d))
AvgPool2d = nn.AvgPool2d
BatchNorm2d = nn.BatchNorm2d
BCEWithLogitsLoss = nn.BCEWithLogitsLoss
Conv2d = nn.Conv2d
ConvTranspose2d = nn.ConvTranspose2d
DepthwiseConv2d = nn.DepthwiseConv2d
Linear = nn.Linear
MaxPool2d = nn.MaxPool2d
Module = nn.Module
ModuleList = nn.ModuleList
Sequential = nn.Sequential
ReLU = nn.ReLU
Sigmoid = nn.Sigmoid
Softmax = nn.Softmax
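Note the pattern used throughout this file: each wrapper's ``__new__`` returns a configured ``nn`` module directly, so the wrappers act as layer constructors rather than classes to instantiate. A short sketch:

```python
# Each call yields a plain nn.Conv2d configured by the wrapper.
conv1 = Conv1x1(64, 128)             # kernel 1, stride 1, no bias
conv2 = Conv3x3(128, 128, stride=2)  # kernel 3, padding 1
assert is_conv2d(conv1) and is_conv2d(conv2)
```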
Copyright (c) 2014, Piotr Dollar and Tsung-Yi Lin
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
The views and conclusions contained in the software and documentation are those
of the authors and should not be interpreted as representing official policies,
either expressed or implied, of the FreeBSD Project.
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import multiprocessing as mp
import time
import dragon
import dragon.vm.torch as torch
import numpy as np
from lib.core.config import cfg
from lib.datasets.factory import get_imdb
from lib.ssd.data_transformer import DataTransformer
from lib.utils import logger
class DataLoader(object):
"""Provide mini-batches of data."""
def __init__(self):
super(DataLoader, self).__init__()
database = get_imdb(cfg.TRAIN.DATABASE)
self.data_batch = DataBatch(**{
'dataset': lambda: dragon.io.SeetaRecordDataset(database.source),
'classes': database.classes,
'shuffle': cfg.TRAIN.USE_SHUFFLE,
'num_chunks': cfg.TRAIN.NUM_SHUFFLE_CHUNKS,
'batch_size': cfg.TRAIN.IMS_PER_BATCH * 2,
'num_transformers': cfg.TRAIN.NUM_WORKERS,
})
def __call__(self):
outputs = self.data_batch.get()
outputs['data'] = torch.from_numpy(outputs['data'])
return outputs
class DataBatch(mp.Process):
"""Prefetch the batch of data."""
def __init__(self, **kwargs):
"""Construct a ``DataBatch``.
Parameters
----------
dataset : lambda
The creator of a dataset.
classes : Sequence[str]
The class names.
shuffle : bool, optional, default=False
Whether to shuffle the data.
num_chunks : int, optional, default=0
The number of chunks to split.
batch_size : int, optional, default=2
The size of a mini-batch.
num_transformers : int, optional, default=3
The number of workers to transform data.
"""
super(DataBatch, self).__init__()
# Distributed settings
rank, group_size = 0, 1
process_group = dragon.distributed.get_group()
if process_group is not None and kwargs.get(
'phase', 'TRAIN') == 'TRAIN':
group_size = process_group.size
rank = dragon.distributed.get_rank(process_group)
kwargs['group_size'] = group_size
# Configuration
self._prefetch = kwargs.get('prefetch', 5)
self._batch_size = kwargs.get('batch_size', 32)
self._num_readers = kwargs.get('num_readers', 1)
self._num_transformers = kwargs.get('num_transformers', 3)
self._num_fetchers = kwargs.get('num_fetchers', 1)
# Initialize queues
num_batches = self._prefetch * self._num_readers
self.Q1 = mp.Queue(num_batches * self._batch_size)
self.Q2 = mp.Queue(num_batches * self._batch_size)
self.Q3 = mp.Queue(num_batches)
# Initialize readers
self._readers = []
for i in range(self._num_readers):
part_idx, num_parts = i, self._num_readers
num_parts *= group_size
part_idx += rank * self._num_readers
self._readers.append(dragon.io.DataReader(
num_parts=num_parts, part_idx=part_idx, **kwargs))
self._readers[i]._seed += part_idx
self._readers[i].q_out = self.Q1
self._readers[i].start()
time.sleep(0.1)
# Initialize transformers
self._transformers = []
for i in range(self._num_transformers):
transformer = DataTransformer(**kwargs)
transformer._seed += (i + rank * self._num_transformers)
transformer.q_in, transformer.q_out = self.Q1, self.Q2
transformer.start()
self._transformers.append(transformer)
time.sleep(0.1)
# Initialize batch-producer
self.start()
# Register cleanup callbacks
def cleanup():
def terminate(processes):
for process in processes:
process.terminate()
process.join()
terminate([self])
logger.info('Terminate DataBatch.')
terminate(self._transformers)
logger.info('Terminate DataTransformer.')
terminate(self._readers)
logger.info('Terminate DataReader.')
import atexit
atexit.register(cleanup)
def get(self):
"""Get a batch.
Returns
-------
dict
The batch dict.
"""
return self.Q3.get()
def run(self):
"""Start the process to produce batches."""
image_batch_shape = (
cfg.TRAIN.IMS_PER_BATCH,
cfg.SSD.RESIZE.HEIGHT,
cfg.SSD.RESIZE.WIDTH, 3,
)
# Main prefetch loop
while True:
boxes_to_pack = []
img, gt_boxes = self.Q2.get()
ims_blob = np.zeros(image_batch_shape, img.dtype)
for i in range(cfg.TRAIN.IMS_PER_BATCH):
ims_blob[i] = img
boxes = np.zeros((gt_boxes.shape[0], gt_boxes.shape[1] + 1), 'float32')
boxes[:, :gt_boxes.shape[1]], boxes[:, -1] = gt_boxes, i
boxes_to_pack.append(boxes)
if i != (cfg.TRAIN.IMS_PER_BATCH - 1):
img, gt_boxes = self.Q2.get()
self.Q3.put({
'data': ims_blob,
'gt_boxes': np.concatenate(boxes_to_pack),
})
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# Codes are based on:
#
# <https://github.com/ppwwyyxx/tensorpack/blob/master/examples/FasterRCNN/utils/np_box_ops.py>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from lib.utils import cython_bbox
def intersection(boxes1, boxes2):
"""Compute pairwise intersection areas between boxes.
Args:
boxes1: a numpy array with shape [N, 4] holding N boxes
boxes2: a numpy array with shape [M, 4] holding M boxes
Returns:
a numpy array with shape [N, M] representing pairwise intersection areas
"""
[y_min1, x_min1, y_max1, x_max1] = np.split(boxes1, 4, axis=1)
[y_min2, x_min2, y_max2, x_max2] = np.split(boxes2, 4, axis=1)
all_pairs_min_ymax = np.minimum(y_max1, np.transpose(y_max2))
all_pairs_max_ymin = np.maximum(y_min1, np.transpose(y_min2))
intersect_heights = np.maximum(
np.zeros(all_pairs_max_ymin.shape),
all_pairs_min_ymax - all_pairs_max_ymin)
all_pairs_min_xmax = np.minimum(x_max1, np.transpose(x_max2))
all_pairs_max_xmin = np.maximum(x_min1, np.transpose(x_min2))
intersect_widths = np.maximum(
np.zeros(all_pairs_max_xmin.shape),
all_pairs_min_xmax - all_pairs_max_xmin)
return intersect_heights * intersect_widths
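A worked example of ``intersection`` under the ``[y_min, x_min, y_max, x_max]`` layout documented above (widths here use the plain ``y2 - y1`` convention, without the ``+ 1`` used by ``boxes_area`` below):

```python
import numpy as np

b1 = np.array([[0., 0., 10., 10.]])  # a 10 x 10 box at the origin
b2 = np.array([[5., 5., 15., 15.]])  # the same box shifted by (5, 5)
print(intersection(b1, b2))          # [[25.]] -- the 5 x 5 overlap
```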
def iou(boxes1, boxes2):
"""Computes pairwise intersection-over-union between box collections.
Args:
boxes1: a numpy array with shape [N, 4] holding N boxes.
boxes2: a numpy array with shape [M, 4] holding M boxes.
Returns:
a numpy array with shape [N, M] representing pairwise iou scores.
"""
intersect = intersection(boxes1, boxes2)
area1 = boxes_area(boxes1)
area2 = boxes_area(boxes2)
union = \
np.expand_dims(area1, axis=1) + \
np.expand_dims(area2, axis=0) - intersect
return intersect / union
def ioa1(boxes1, boxes2):
"""Computes pairwise intersection-over-area between box collections.
Intersection-over-area (ioa) between two boxes box1 and box2 is defined
here as their intersection area over box1's area. Note that ioa is not symmetric,
that is, IOA(box1, box2) != IOA(box2, box1).
Args:
boxes1: a numpy array with shape [N, 4] holding N boxes.
boxes2: a numpy array with shape [M, 4] holding M boxes.
Returns:
a numpy array with shape [N, M] representing pairwise ioa scores.
"""
intersect = intersection(boxes1, boxes2)
areas = np.expand_dims(boxes_area(boxes1), axis=1)
return intersect / areas
def ioa2(boxes1, boxes2):
"""Computes pairwise intersection-over-area between box collections.
Intersection-over-area (ioa) between two boxes box1 and box2 is defined as
their intersection area over box2's area. Note that ioa is not symmetric,
that is, IOA(box1, box2) != IOA(box2, box1).
Args:
boxes1: a numpy array with shape [N, 4] holding N boxes.
boxes2: a numpy array with shape [M, 4] holding M boxes.
Returns:
a numpy array with shape [N, M] representing pairwise ioa scores.
"""
intersect = intersection(boxes1, boxes2)
areas = np.expand_dims(boxes_area(boxes2), axis=0)
return intersect / areas
def bbox_overlaps(boxes1, boxes2):
"""Compute the overlaps between two group of boxes."""
return cython_bbox.bbox_overlaps(
np.ascontiguousarray(boxes1, dtype=np.float),
np.ascontiguousarray(boxes2, dtype=np.float),
)
def bbox_transform(ex_rois, gt_rois, weights=(1., 1., 1., 1.)):
"""Transform the boxes to the regression targets."""
ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.
ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.
ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths
ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights
gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.
gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.
gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths
gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights
wx, wy, ww, wh = weights
targets = [wx * (gt_ctr_x - ex_ctr_x) / ex_widths]
targets += [wy * (gt_ctr_y - ex_ctr_y) / ex_heights]
targets += [ww * np.log(gt_widths / ex_widths)]
targets += [wh * np.log(gt_heights / ex_heights)]
return np.vstack(targets).transpose()
def bbox_transform_inv(boxes, deltas, weights=(1., 1., 1., 1.)):
"""Decode the final boxes according to the deltas."""
if boxes.shape[0] == 0:
return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype)
boxes = boxes.astype(deltas.dtype, copy=False)
widths = boxes[:, 2] - boxes[:, 0] + 1.
heights = boxes[:, 3] - boxes[:, 1] + 1.
ctr_x = boxes[:, 0] + 0.5 * widths
ctr_y = boxes[:, 1] + 0.5 * heights
wx, wy, ww, wh = weights
dx = deltas[:, 0::4] / wx
dy = deltas[:, 1::4] / wy
dw = deltas[:, 2::4] / ww
dh = deltas[:, 3::4] / wh
pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis]
pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis]
pred_w = np.exp(dw) * widths[:, np.newaxis]
pred_h = np.exp(dh) * heights[:, np.newaxis]
pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype)
pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w # x1
pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h # y1
pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w # x2
pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h # y2
return pred_boxes
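A round-trip sketch of the encode/decode pair above, with the inclusive ``+ 1`` width convention in mind:

```python
import numpy as np

anchors = np.array([[0., 0., 15., 15.]], 'float32')
gt = np.array([[2., 2., 12., 14.]], 'float32')
deltas = bbox_transform(anchors, gt)           # (1, 4): dx, dy, dw, dh
decoded = bbox_transform_inv(anchors, deltas)  # (1, 4): x1, y1, x2, y2
# x1/y1 invert exactly; x2/y2 come back one pixel larger because the
# decoder omits the "- 1" of the inclusive width used by the encoder
# (the reference py-faster-rcnn code behaves the same way).
assert np.allclose(decoded[0, :2], gt[0, :2], atol=1e-3)
assert np.allclose(decoded[0, 2:], gt[0, 2:] + 1, atol=1e-3)
```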
def boxes_area(boxes):
"""Compute the area of an array of boxes."""
w = (boxes[:, 2] - boxes[:, 0] + 1)
h = (boxes[:, 3] - boxes[:, 1] + 1)
areas = w * h
assert np.all(areas >= 0), 'Negative areas found'
return areas
def clip_boxes(boxes, im_shape):
# x1 >= 0
boxes[:, 0] = np.maximum(np.minimum(boxes[:, 0], im_shape[1] - 1), 0)
# y1 >= 0
boxes[:, 1] = np.maximum(np.minimum(boxes[:, 1], im_shape[0] - 1), 0)
# x2 < im_shape[1]
boxes[:, 2] = np.maximum(np.minimum(boxes[:, 2], im_shape[1] - 1), 0)
# y2 < im_shape[0]
boxes[:, 3] = np.maximum(np.minimum(boxes[:, 3], im_shape[0] - 1), 0)
return boxes
def clip_tiled_boxes(boxes, im_shape):
# x1 >= 0
boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0)
# y1 >= 0
boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0)
# x2 < im_shape[1]
boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0)
# y2 < im_shape[0]
boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0)
return boxes
def expand_boxes(boxes, scale):
"""Expand an array of boxes by a given scale."""
w_half = (boxes[:, 2] - boxes[:, 0]) * .5
h_half = (boxes[:, 3] - boxes[:, 1]) * .5
x_c = (boxes[:, 2] + boxes[:, 0]) * .5
y_c = (boxes[:, 3] + boxes[:, 1]) * .5
w_half *= scale
h_half *= scale
boxes_exp = np.zeros(boxes.shape)
boxes_exp[:, 0] = x_c - w_half
boxes_exp[:, 2] = x_c + w_half
boxes_exp[:, 1] = y_c - h_half
boxes_exp[:, 3] = y_c + h_half
return boxes_exp
def flip_boxes(boxes, width):
"""Flip the boxes horizontally."""
boxes_flipped = boxes.copy()
old_x1 = boxes[:, 0].copy()
old_x2 = boxes[:, 2].copy()
boxes_flipped[:, 0] = width - old_x2 - 1
boxes_flipped[:, 2] = width - old_x1 - 1
return boxes_flipped
def filter_boxes(boxes, min_size):
"""Remove all boxes with any side smaller than min size."""
ws = boxes[:, 2] - boxes[:, 0] + 1
hs = boxes[:, 3] - boxes[:, 1] + 1
keep = np.where((ws >= min_size) & (hs >= min_size))[0]
return keep
def dismantle_boxes(gt_boxes, num_images):
"""Dismantle the packed ground-truth boxes."""
return [
gt_boxes[
np.where(gt_boxes[:, -1].astype(np.int32) == i)[0]
][:, :-1] for i in range(num_images)
]
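A quick sketch of ``dismantle_boxes``: the packed array carries the image index in its last column, and the helper splits it back per image:

```python
import numpy as np

packed = np.array([
    [0., 0., 10., 10., 1., 0.],   # a box with class 1 in image 0
    [5., 5., 20., 20., 2., 1.],   # a box with class 2 in image 1
], 'float32')
per_image = dismantle_boxes(packed, num_images=2)
# per_image[0] -> [[0., 0., 10., 10., 1.]]
# per_image[1] -> [[5., 5., 20., 20., 2.]]
```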
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import dragon
from dragon.core.framework import tensor_util
from dragon.core.util import six
import dragon.vm.torch as torch
import numpy as np
from lib.core.config import cfg
def feed_tensor(tensor, array):
tensor_util.set_array(tensor, array)
def get_param_groups(module, bias_lr=1., bias_decay=0.):
"""Separate weight and bias into parameters groups.
Parameters
----------
module : dragon.vm.torch.nn.Module
The module to collect parameters.
bias_lr : float, optional, default=1.
The lr multiplier of bias.
bias_decay : float, optional, default=0.
The decay multiplier of bias.
Returns
-------
Sequence[ParamGroup]
The parameter groups.
"""
param_groups = [
{
'params': [],
'lr_mult': 1.,
'decay_mult': 1.,
},
{
'params': [],
'lr_mult': bias_lr,
'decay_mult': bias_decay,
}
]
for name, param in module.named_parameters():
gi = 0 if 'weight' in name and param.dim() > 1 else 1
param_groups[gi]['params'].append(param)
if len(param_groups[1]['params']) == 0:
param_groups.pop() # Remove empty group
return param_groups
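A hedged usage sketch of ``get_param_groups``; the SGD constructor below follows the usual torch-style signature, and whether dragon's optimizer consumes the ``lr_mult``/``decay_mult`` keys exactly this way is an assumption:

```python
model = torch.nn.Conv2d(3, 8, kernel_size=3, padding=1)  # any module works
groups = get_param_groups(model, bias_lr=2., bias_decay=0.)
optimizer = torch.optim.SGD(groups, lr=0.01, momentum=0.9, weight_decay=1e-4)
```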
def get_workspace():
"""Return the current default workspace.
Returns
-------
dragon.Workspace
The default workspace.
"""
return dragon.get_workspace()
def new_placeholder(device=None):
"""Create a new tensor to feed data.
Parameters
----------
device : int, optional
The device index.
Returns
-------
dragon.vm.torch.Tensor
The placeholder tensor.
"""
value = torch.zeros(1)
if device is not None:
return value.cuda(device)
return value
def new_tensor(data, enforce_cpu=False):
"""Create a new tensor from the data.
Parameters
----------
data : array_like
The data value.
enforce_cpu : bool, optional, default=False
**True** to enforce the cpu storage.
Returns
-------
dragon.vm.torch.Tensor
The tensor holding the data.
"""
if isinstance(data, np.ndarray):
tensor = torch.from_numpy(data)
elif isinstance(data, torch.Tensor):
tensor = data
else:
tensor = torch.tensor(data)
if not enforce_cpu:
tensor = tensor.cuda(cfg.GPU_ID)
return tensor
def new_workspace(merge_default=True):
"""Create a new workspace.
Parameters
----------
merge_default : bool, optional, default=True
**True** to merge tensors from default workspace.
Returns
-------
dragon.Workspace
The new workspace.
"""
workspace = dragon.Workspace()
if merge_default:
workspace.merge_from(get_workspace())
return workspace
def reset_workspace(workspace=None, merge_default=True):
"""Reset a workspace and return a new one.
Parameters
----------
workspace : dragon.Workspace, optional
The workspace to reset.
merge_default : bool, optional, default=True
**True** to merge tensors from default workspace.
Returns
-------
dragon.Workspace
The new workspace.
"""
if workspace is not None:
workspace.Clear() # Block the GIL
return new_workspace(merge_default)
class Graph(object):
"""Simple sequential graph to accelerate inference.
Graph reduces the overhead of calling python functions
under eager execution. That cost is at least 15ms for
common backbones, which limits inference to about 60 FPS.
For more details, see the eager mechanism of Dragon.
"""
def __init__(self, inputs, outputs, constants=None):
def canonicalize(input_dict):
if input_dict is None:
return {}
for k, v in input_dict.items():
input_dict[k] = v.name if hasattr(v, 'name') else v
return input_dict
self.placeholders = {}
self._inputs = canonicalize(inputs)
self._outputs = canonicalize(outputs)
self._constants = canonicalize(constants)
self._workspace = get_workspace()
self._tracer = torch.jit.get_tracer()
@property
def workspace(self):
return self._workspace
@workspace.setter
def workspace(self, value):
self._workspace = value
def forward(self, **kwargs):
# Assign inputs
for name, tensor in self._inputs.items():
value = kwargs.get(name, None)
tensor_util.set_array(tensor, value)
# Replay the traced expressions
self._tracer.replay()
# Collect outputs
# 1) Target results
# 2) Constant values
outputs = collections.OrderedDict()
for name, tensor in self._outputs.items():
outputs[name] = tensor_util.to_array(tensor, True)
for name, value in self._constants.items():
outputs[name] = value
return outputs
def __call__(self, **kwargs):
with self._workspace.as_default():
return self.forward(**kwargs)
# Aliases
pickle = six.moves.pickle
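A minimal sketch of the intended ``Graph`` workflow, pieced together from ``forward`` above: run the network once under the tracer, wrap its input/output tensors, then replay with new data. The module and tensor names here are illustrative assumptions:

```python
# Hypothetical: `model` is a torch-style module whose expressions the
# JIT tracer records on the first call; `im` is the traced input.
im = new_placeholder(device=cfg.GPU_ID)
outputs = model(im)  # executed once so the tracer captures the graph
graph = Graph(inputs={'data': im}, outputs={'cls_score': outputs})
results = graph(data=np.zeros((1, 3, 600, 800), 'float32'))
```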
......@@ -11,6 +11,10 @@
"""Make record file for COCO dataset."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import shutil
......@@ -37,8 +41,8 @@ if __name__ == '__main__':
record_file=os.path.join(COCO_ROOT, 'coco_2014_trainval35k'),
images_path=[os.path.join(COCO_ROOT, 'images/train2014'),
os.path.join(COCO_ROOT, 'images/val2014')],
splits_path=[os.path.join(COCO_ROOT, 'ImageSets'),
os.path.join(COCO_ROOT, 'ImageSets')],
splits_path=[os.path.join(COCO_ROOT, 'splits'),
os.path.join(COCO_ROOT, 'splits')],
mask_file='build/coco_2014_trainval35k_mask.pkl',
splits=['train', 'valminusminival'],
)
......@@ -48,7 +52,7 @@ if __name__ == '__main__':
record_file=os.path.join(COCO_ROOT, 'coco_2014_minival'),
images_path=os.path.join(COCO_ROOT, 'images/val2014'),
mask_file='build/coco_2014_minival_mask.pkl',
splits_path=os.path.join(COCO_ROOT, 'ImageSets'),
splits_path=os.path.join(COCO_ROOT, 'splits'),
splits=['minival'],
)
......
......@@ -86,7 +86,7 @@ def make_record(
print('Start Time:', time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime()))
writer = dragon.io.SeetaRecordWriter(
writer = dragon.io.KPLRecordWriter(
path=record_file,
protocol={
'id': 'string',
......@@ -133,6 +133,6 @@ def make_record(
writer.close()
end_time = time.time()
data_size = os.path.getsize(record_file + '/data.data') * 1e-6
data_size = os.path.getsize(record_file + '/root.data') * 1e-6
print('{} images take {:.2f} MB in {:.2f} sec.'
.format(total_line, data_size, end_time - start_time))
......@@ -20,11 +20,11 @@ except:
import pickle as cPickle
sys.path.insert(0, '../..')
from lib.pycocotools.coco import COCO
from lib.pycocotools import mask_utils
from seetadet.pycocotools.coco import COCO
from seetadet.pycocotools import mask_utils
class imdb(object):
class COCOWrapper(object):
def __init__(self, image_set, year, data_dir):
self._year = year
self._image_set = image_set
......@@ -120,8 +120,6 @@ class imdb(object):
# running out of the image bound
# Do not use them or decoding error is inevitable
mask_bytes = mask_utils.poly2bytes(obj['segmentation'], height, width)
if not isinstance(mask_bytes, bytes):
print(type(mask_bytes))
if obj['area'] > 0 and x2 > x1 and y2 > y1:
obj['clean_bbox'] = [x1, y1, x2, y2]
valid_objects.append({
......@@ -146,10 +144,11 @@ class imdb(object):
def make_mask(split, year, data_dir):
coco = imdb(split, year, data_dir)
print('Preparing to make split: {}, total {} images'.format(split, coco.num_images))
if not osp.exists(osp.join(coco._data_path, 'ImageSets')):
os.makedirs(osp.join(coco._data_path, 'ImageSets'))
coco = COCOWrapper(split, year, data_dir)
print('Preparing to make split: {}, total {} images'
.format(split, coco.num_images))
if not osp.exists(osp.join(coco._data_path, 'splits')):
os.makedirs(osp.join(coco._data_path, 'splits'))
gt_recs = OrderedDict()
for i in range(coco.num_images):
......@@ -157,14 +156,14 @@ def make_mask(split, year, data_dir):
h, w, objects = coco.annotation_at(i)
gt_recs[filename] = objects
with open(osp.join('build',
'coco_' + year + '_' + split + '_mask.pkl'), 'wb') as f:
with open(osp.join('build', 'coco_' + year + '_' + split + '_mask.pkl'), 'wb') as f:
cPickle.dump(gt_recs, f, cPickle.HIGHEST_PROTOCOL)
with open(osp.join(coco._data_path, 'ImageSets', split + '.txt'), 'w') as f:
with open(osp.join(coco._data_path, 'splits', split + '.txt'), 'w') as f:
for i in range(coco.num_images):
filename = (coco.image_path_at(i).split('/')[-1]).split('.')[0]
if i != coco.num_images - 1: filename += '\n'
if i != coco.num_images - 1:
filename += '\n'
f.write(filename)
......
......@@ -26,6 +26,6 @@ if __name__ == '__main__':
record_file=osp.join(data_root, 'rotated_train'),
images_path=[osp.join(data_root, 'JPEGImages')],
annotations_path=[osp.join(data_root, 'Annotations')],
imagesets_path=[osp.join(data_root, 'ImageSets')],
splits_path=[osp.join(data_root, 'ImageSets')],
splits=['train']
)
......@@ -57,7 +57,7 @@ def make_record(
record_file,
images_path,
annotations_path,
imagesets_path,
splits_path,
splits
):
if os.path.exists(record_file):
......@@ -68,15 +68,15 @@ def make_record(
images_path = [images_path]
if not isinstance(annotations_path, list):
annotations_path = [annotations_path]
if not isinstance(imagesets_path, list):
imagesets_path = [imagesets_path]
assert len(splits) == len(imagesets_path)
if not isinstance(splits_path, list):
splits_path = [splits_path]
assert len(splits) == len(splits_path)
assert len(splits) == len(images_path)
assert len(splits) == len(annotations_path)
print('Start Time:', time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime()))
writer = dragon.io.SeetaRecordWriter(
writer = dragon.io.KPLRecordWriter(
path=record_file,
protocol={
'id': 'string',
......@@ -99,31 +99,37 @@ def make_record(
}
)
count, total_line = 0, 0
start_time = time.time()
for db_idx, split in enumerate(splits):
split_file = os.path.join(imagesets_path[db_idx], split + '.txt')
assert os.path.exists(split_file)
# Scan all available entries
print('Scan entries...')
entries = []
for i, split in enumerate(splits):
split_file = os.path.join(splits_path[i], split + '.txt')
with open(split_file, 'r') as f:
lines = f.readlines()
total_line += len(lines)
for line in lines:
count += 1
if count % 2000 == 0:
filename = line.strip()
img_file = os.path.join(images_path[i], filename + '.jpg')
ann_file = os.path.join(annotations_path[i], filename + '.xml')
entries.append((img_file, ann_file))
# Parse and write into record file
print('Start Time:', time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime()))
start_time = time.time()
for i, (img_file, ann_file) in enumerate(entries):
if i > 0 and i % 2000 == 0:
now_time = time.time()
print('{} / {} in {:.2f} sec'.format(
count, total_line, now_time - start_time))
filename = line.strip()
image_file = os.path.join(images_path[db_idx], filename + '.jpg')
xml_file = os.path.join(annotations_path[db_idx], filename + '.xml')
writer.write(make_example(image_file, xml_file))
i, len(entries), now_time - start_time))
writer.write(make_example(img_file, ann_file))
now_time = time.time()
print('{} / {} in {:.2f} sec'.format(count, total_line, now_time - start_time))
print('{} / {} in {:.2f} sec'.format(
len(entries), len(entries), now_time - start_time))
writer.close()
end_time = time.time()
data_size = os.path.getsize(record_file + '/data.data') * 1e-6
data_size = os.path.getsize(record_file + '/root.data') * 1e-6
print('{} images take {:.2f} MB in {:.2f} sec.'
.format(total_line, data_size, end_time - start_time))
.format(len(entries), data_size, end_time - start_time))
......@@ -28,7 +28,7 @@ if __name__ == '__main__':
osp.join(voc_root, 'VOCdevkit2012/VOC2012/JPEGImages')],
annotations_path=[osp.join(voc_root, 'VOCdevkit2007/VOC2007/Annotations'),
osp.join(voc_root, 'VOCdevkit2012/VOC2012/Annotations')],
imagesets_path=[osp.join(voc_root, 'VOCdevkit2007/VOC2007/ImageSets/Main'),
splits_path=[osp.join(voc_root, 'VOCdevkit2007/VOC2007/ImageSets/Main'),
osp.join(voc_root, 'VOCdevkit2012/VOC2012/ImageSets/Main')],
splits=['trainval', 'trainval']
)
......@@ -37,6 +37,6 @@ if __name__ == '__main__':
record_file=osp.join(voc_root, 'voc_2007_test'),
images_path=osp.join(voc_root, 'VOCdevkit2007/VOC2007/JPEGImages'),
annotations_path=osp.join(voc_root, 'VOCdevkit2007/VOC2007/Annotations'),
imagesets_path=osp.join(voc_root, 'VOCdevkit2007/VOC2007/ImageSets/Main'),
splits_path=osp.join(voc_root, 'VOCdevkit2007/VOC2007/ImageSets/Main'),
splits=['test']
)
......@@ -26,10 +26,16 @@ def make_example(image_file, xml_file):
tree = ET.parse(xml_file)
filename = os.path.split(xml_file)[-1]
objs = tree.findall('object')
size = tree.find('size')
example = {'id': filename.split('.')[0], 'object': []}
with open(image_file, 'rb') as f:
img_bytes = bytes(f.read())
img = cv2.imdecode(np.frombuffer(img_bytes, 'uint8'), 1)
if size is not None:
example['height'] = int(size.find('height').text)
example['width'] = int(size.find('width').text)
example['depth'] = int(size.find('depth').text)
else:
img = cv2.imdecode(np.frombuffer(img_bytes, 'uint8'), 3)
example['height'], example['width'], example['depth'] = img.shape
example['content'] = img_bytes
for ix, obj in enumerate(objs):
......@@ -53,7 +59,7 @@ def make_record(
record_file,
images_path,
annotations_path,
imagesets_path,
splits_path,
splits
):
if os.path.exists(record_file):
......@@ -64,15 +70,13 @@ def make_record(
images_path = [images_path]
if not isinstance(annotations_path, list):
annotations_path = [annotations_path]
if not isinstance(imagesets_path, list):
imagesets_path = [imagesets_path]
assert len(splits) == len(imagesets_path)
if not isinstance(splits_path, list):
splits_path = [splits_path]
assert len(splits) == len(splits_path)
assert len(splits) == len(images_path)
assert len(splits) == len(annotations_path)
print('Start Time:', time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime()))
writer = dragon.io.SeetaRecordWriter(
writer = dragon.io.KPLRecordWriter(
path=record_file,
protocol={
'id': 'string',
......@@ -91,31 +95,36 @@ def make_record(
}
)
count, total_line = 0, 0
start_time = time.time()
for db_idx, split in enumerate(splits):
split_file = os.path.join(imagesets_path[db_idx], split + '.txt')
assert os.path.exists(split_file)
# Scan all available entries
print('Scan entries...')
entries = []
for i, split in enumerate(splits):
split_file = os.path.join(splits_path[i], split + '.txt')
with open(split_file, 'r') as f:
lines = f.readlines()
total_line += len(lines)
for line in lines:
count += 1
if count % 2000 == 0:
filename = line.strip()
img_file = os.path.join(images_path[i], filename + '.jpg')
ann_file = os.path.join(annotations_path[i], filename + '.xml')
entries.append((img_file, ann_file))
# Parse and write into record file
print('Start Time:', time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime()))
start_time = time.time()
for i, (img_file, ann_file) in enumerate(entries):
if i > 0 and i % 2000 == 0:
now_time = time.time()
print('{} / {} in {:.2f} sec'.format(
count, total_line, now_time - start_time))
filename = line.strip()
image_file = os.path.join(images_path[db_idx], filename + '.jpg')
xml_file = os.path.join(annotations_path[db_idx], filename + '.xml')
writer.write(make_example(image_file, xml_file))
i, len(entries), now_time - start_time))
writer.write(make_example(img_file, ann_file))
now_time = time.time()
print('{} / {} in {:.2f} sec'.format(count, total_line, now_time - start_time))
print('{} / {} in {:.2f} sec'.format(
len(entries), len(entries), now_time - start_time))
writer.close()
end_time = time.time()
data_size = os.path.getsize(record_file + '/data.data') * 1e-6
data_size = os.path.getsize(record_file + '/root.data') * 1e-6
print('{} images take {:.2f} MB in {:.2f} sec.'
.format(total_line, data_size, end_time - start_time))
.format(len(entries), data_size, end_time - start_time))
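Pieced together from the hunks above, the writer flow is: construct a ``KPLRecordWriter`` with a path and a field protocol, call ``write`` once per example dict, then ``close``. The protocol below keeps only the field visible in the diff; the remaining fields are elided there and stay elided here:

```python
import dragon

writer = dragon.io.KPLRecordWriter(
    path='/data/voc_0712_trainval',  # illustrative path
    protocol={'id': 'string'},       # ...remaining fields as in the source
)
writer.write(make_example(img_file, ann_file))  # one example dict per image
writer.close()
```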
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from seetadet.algo.faster_rcnn.anchor_target import AnchorTarget
from seetadet.algo.faster_rcnn.data_loader import DataLoader
from seetadet.algo.faster_rcnn.proposal import Proposal
from seetadet.algo.faster_rcnn.proposal_target import ProposalTarget
from seetadet.algo.faster_rcnn.utils import generate_grid_anchors
from seetadet.algo.faster_rcnn.utils import map_blobs_by_levels
from seetadet.algo.faster_rcnn.utils import map_rois_to_levels
from seetadet.algo.faster_rcnn.utils import map_returns_to_blobs
......@@ -16,11 +16,11 @@ from __future__ import print_function
import numpy as np
import numpy.random as npr
from lib.core.config import cfg
from lib.faster_rcnn.generate_anchors import generate_anchors
from lib.faster_rcnn.utils import generate_grid_anchors
from lib.utils import boxes as box_util
from lib.utils.framework import new_tensor
from seetadet.algo.faster_rcnn.generate_anchors import generate_anchors
from seetadet.algo.faster_rcnn.utils import generate_grid_anchors
from seetadet.core.config import cfg
from seetadet.utils import boxes as box_util
from seetadet.utils.env import new_tensor
class AnchorTarget(object):
......@@ -62,9 +62,7 @@ class AnchorTarget(object):
# Label: ``1`` is positive, ``0`` is negative, ``-1`` is don't care
labels_wide = -np.ones((num_images, num_anchors,), 'float32')
bbox_targets_wide = np.zeros((num_images, num_anchors, 4), 'float32')
bbox_inside_weights_wide = np.zeros_like(bbox_targets_wide, 'float32')
bbox_outside_weights_wide = np.zeros_like(bbox_targets_wide, 'float32')
bbox_indices_wide, bbox_anchors_wide, bbox_targets_wide = [], [], []
for ix in range(num_images):
# GT boxes (x1, y1, x2, y2, label, ...)
......@@ -95,13 +93,13 @@ class AnchorTarget(object):
np.arange(overlaps.shape[1])]
gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]
# fg label: for each gt, anchor with highest overlap
# Foreground: for each gt, anchor with highest overlap
labels[gt_argmax_overlaps] = 1
# fg label: above threshold IOU
# Foreground: above threshold IoU
labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1
# bg label: below threshold IOU
# Background: below threshold IoU
labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0
# Subsample positive labels if we have too many
......@@ -112,6 +110,11 @@ class AnchorTarget(object):
labels[disable_inds] = -1
fg_inds = np.where(labels == 1)[0]
# Restore the best-matching anchors if subsampling removed all foreground
if len(fg_inds) == 0:
labels[gt_argmax_overlaps] = 1
fg_inds = np.where(labels == 1)[0]
# Subsample negative labels if we have too many
num_bg = cfg.TRAIN.RPN_BATCHSIZE - np.sum(labels == 1)
bg_inds = np.where(labels == 0)[0]
......@@ -119,51 +122,27 @@ class AnchorTarget(object):
disable_inds = npr.choice(bg_inds, len(bg_inds) - num_bg, False)
labels[disable_inds] = -1
bbox_targets = np.zeros((num_inside, 4), 'float32')
bbox_targets[fg_inds, :] = \
labels_wide[ix, inds_inside] = labels
bbox_anchors_wide.append(anchors[fg_inds])
bbox_indices_wide.append(inds_inside[fg_inds] + (num_anchors * ix))
bbox_targets_wide.append(
box_util.bbox_transform(
anchors[fg_inds, :],
anchors[fg_inds],
gt_boxes[argmax_overlaps[fg_inds], :4],
)
bbox_inside_weights = np.zeros((num_inside, 4), 'float32')
bbox_inside_weights[labels == 1, :] = np.array((1., 1., 1., 1.))
bbox_outside_weights = np.zeros((num_inside, 4), 'float32')
bbox_outside_weights[labels == 1, :] = np.ones((1, 4)) / cfg.TRAIN.RPN_BATCHSIZE
bbox_outside_weights[labels == 0, :] = np.ones((1, 4)) / cfg.TRAIN.RPN_BATCHSIZE
labels_wide[ix, inds_inside] = labels # label
bbox_targets_wide[ix, inds_inside] = bbox_targets
bbox_inside_weights_wide[ix, inds_inside] = bbox_inside_weights
bbox_outside_weights_wide[ix, inds_inside] = bbox_outside_weights
if self.num_strides > 1:
labels = labels_wide.reshape((num_images, num_anchors))
bbox_targets = bbox_targets_wide.transpose((0, 2, 1))
bbox_inside_weights = bbox_inside_weights_wide.transpose((0, 2, 1))
bbox_outside_weights = bbox_outside_weights_wide.transpose((0, 2, 1))
else:
)
if self.num_strides == 1:
A = self.base_anchors[0].shape[0]
height, width = features[0].shape[-2:]
labels = labels_wide \
labels_wide = labels_wide \
.reshape((num_images, height, width, A)) \
.transpose(0, 3, 1, 2) \
.reshape((num_images, num_anchors))
bbox_targets = bbox_targets_wide \
.reshape((num_images, height, width, A * 4)) \
.transpose(0, 3, 1, 2)
bbox_inside_weights = bbox_inside_weights_wide \
.reshape((num_images, height, width, A * 4)) \
.transpose(0, 3, 1, 2)
bbox_outside_weights = bbox_outside_weights_wide \
.reshape((num_images, height, width, A * 4)) \
.transpose(0, 3, 1, 2)
return {
'labels': new_tensor(labels),
'bbox_targets': new_tensor(bbox_targets),
'bbox_inside_weights': new_tensor(bbox_inside_weights),
'bbox_outside_weights': new_tensor(bbox_outside_weights),
'labels': new_tensor(labels_wide),
'bbox_indices': new_tensor(np.concatenate(bbox_indices_wide)),
'bbox_targets': new_tensor(np.concatenate(bbox_targets_wide).astype('float32')),
'bbox_anchors': new_tensor(np.concatenate(bbox_anchors_wide).astype('float32')),
}
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import multiprocessing as mp
import time
import dragon
import dragon.vm.torch as torch
import numpy as np
from seetadet.algo.faster_rcnn import data_transformer
from seetadet.core.config import cfg
from seetadet.datasets.factory import get_dataset
from seetadet.utils import logger
from seetadet.utils.blob import im_list_to_blob
class DataLoader(object):
"""Load mini-batches of data."""
def __init__(self):
super(DataLoader, self).__init__()
dataset = get_dataset(cfg.TRAIN.DATASET)
if cfg.USE_DALI:
from seetadet.dali import rcnn_pipeline as pipe
self.iterator = pipe.new_iterator(dataset.source)
else:
self.iterator = Iterator(**{
'dataset': dataset.cls,
'source': dataset.source,
'classes': dataset.classes,
'shuffle': cfg.TRAIN.USE_SHUFFLE,
'num_chunks': cfg.TRAIN.SHUFFLE_CHUNKS,
'batch_size': cfg.TRAIN.IMS_PER_BATCH * 2,
'num_transformers': cfg.TRAIN.NUM_THREADS - 1,
})
def __call__(self):
outputs = self.iterator.next()
if isinstance(outputs['data'], np.ndarray):
outputs['data'] = torch.from_numpy(outputs['data'])
return outputs
class Iterator(mp.Process):
"""Iterator to return the batch of data."""
def __init__(self, **kwargs):
super(Iterator, self).__init__()
# Distributed settings
rank, group_size = 0, 1
process_group = dragon.distributed.get_group()
if process_group is not None and \
kwargs.get('phase', 'TRAIN') == 'TRAIN':
group_size = process_group.size
rank = dragon.distributed.get_rank(process_group)
# Configuration
self._prefetch = kwargs.get('prefetch', 5)
self._batch_size = kwargs.get('batch_size', 2)
self._num_readers = kwargs.get('num_readers', 1)
self._num_transformers = kwargs.get('num_transformers', 3)
self.daemon = True
# Initialize queues
num_batches = self._prefetch * self._num_readers
self.q_in = mp.Queue(num_batches * self._batch_size)
self.q1_out = mp.Queue(num_batches * self._batch_size)
self.q2_out = mp.Queue(num_batches * self._batch_size)
# Initialize readers
self._readers = []
for i in range(self._num_readers):
part_idx, num_parts = i, self._num_readers
num_parts *= group_size
part_idx += rank * self._num_readers
self._readers.append(dragon.io.DataReader(
part_idx=part_idx, num_parts=num_parts, **kwargs))
self._readers[i]._seed += part_idx
self._readers[i].q_out = self.q_in
self._readers[i].start()
time.sleep(0.1)
# Initialize transformers
self._transformers = []
for i in range(self._num_transformers):
p = data_transformer.DataTransformer(**kwargs)
p._seed += (i + rank * self._num_transformers)
p.q_in = self.q_in
p.q1_out, p.q2_out = self.q1_out, self.q2_out
p.start()
self._transformers.append(p)
time.sleep(0.1)
# Register cleanup callbacks
def cleanup():
def terminate(processes):
for p in processes:
p.terminate()
p.join()
terminate(self._transformers)
logger.info('Terminate DataTransformer.')
terminate(self._readers)
logger.info('Terminate DataReader.')
import atexit
atexit.register(cleanup)
def next(self):
"""Return the next batch of data."""
return self.__next__()
def __iter__(self):
"""Return the iterator self."""
return self
def __next__(self):
"""Return the next batch of data."""
q_out = None
# Two queues to implement aspect grouping.
# This is necessary to reduce the GPU memory wasted
# on fetching a huge square batch blob.
while q_out is None:
if self.q1_out.qsize() >= cfg.TRAIN.IMS_PER_BATCH:
q_out = self.q1_out
elif self.q2_out.qsize() >= cfg.TRAIN.IMS_PER_BATCH:
q_out = self.q2_out
self.q1_out, self.q2_out = self.q2_out, self.q1_out
images, images_info, boxes_to_pack = [], [], []
for i in range(cfg.TRAIN.IMS_PER_BATCH):
image, image_scale, boxes = q_out.get()
images.append(image)
images_info.append(list(image.shape[:2]) + [image_scale])
gt_boxes = np.zeros((boxes.shape[0], boxes.shape[1] + 1), 'float32')
gt_boxes[:, :boxes.shape[1]], gt_boxes[:, -1] = boxes, i
boxes_to_pack.append(gt_boxes)
return {
'data': im_list_to_blob(images),
'ims_info': np.array(images_info, dtype=np.float32),
'gt_boxes': np.concatenate(boxes_to_pack),
}
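The two-queue trick above is easiest to see in isolation; a self-contained toy (standard library only, no SeetaDet dependencies) of the same aspect-grouping policy:

```python
# Toy of the two-queue aspect-grouping strategy: portrait and landscape
# samples go to separate queues, and a batch is drawn only from a queue
# that can fill it entirely, so every batch shares one aspect group.
import queue
import random

IMS_PER_BATCH = 2
q1, q2 = queue.Queue(), queue.Queue()  # portrait / landscape

for _ in range(8):
    h, w = random.choice([(600, 800), (800, 600)])
    (q1 if h > w else q2).put((h, w))

def next_batch():
    q = q1 if q1.qsize() >= IMS_PER_BATCH else q2
    return [q.get() for _ in range(IMS_PER_BATCH)]

print(next_batch())  # all shapes in a batch share one aspect group
```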
......@@ -15,19 +15,19 @@ from __future__ import print_function
import multiprocessing
import cv2
import numpy as np
from lib.core.config import cfg
from lib.datasets.example import Example
from lib.utils import boxes as box_util
from lib.utils.blob import prep_im_for_blob
from lib.utils.image import get_image_with_target_size
from seetadet.core.config import cfg
from seetadet.datasets.example import Example
from seetadet.utils import boxes as box_util
from seetadet.utils.blob import prep_im_for_blob
class DataTransformer(multiprocessing.Process):
def __init__(self, **kwargs):
super(DataTransformer, self).__init__()
self._scales = cfg.TRAIN.SCALES
self._max_size = cfg.TRAIN.MAX_SIZE
self._seed = cfg.RNG_SEED
self._use_flipped = cfg.TRAIN.USE_FLIPPED
self._use_diff = cfg.TRAIN.USE_DIFF
......@@ -37,13 +37,7 @@ class DataTransformer(multiprocessing.Process):
self.q_in = self.q1_out = self.q2_out = None
self.daemon = True
def make_roi_dict(
self,
example,
im_scale,
apply_flip=False,
offsets=None,
):
def make_roi_dict(self, example, im_scale, apply_flip=False):
objects, n_objects = example.objects, 0
height, width = example.height, example.width
if not self._use_diff:
......@@ -86,15 +80,6 @@ class DataTransformer(multiprocessing.Process):
# Scale the boxes to the detecting scale
roi_dict['boxes'] *= im_scale
# Apply the offsets from scale jitter
if offsets is not None:
roi_dict['boxes'][:, 0::2] += offsets[0]
roi_dict['boxes'][:, 1::2] += offsets[1]
roi_dict['boxes'][:, :] = np.minimum(
np.maximum(roi_dict['boxes'][:, :], 0),
[offsets[2][1] - 1, offsets[2][0] - 1] * 2,
)
return roi_dict
def get(self, example):
......@@ -102,9 +87,8 @@ class DataTransformer(multiprocessing.Process):
img = example.image
# Scale
max_size = cfg.TRAIN.MAX_SIZE
target_size = cfg.TRAIN.SCALES[np.random.randint(len(cfg.TRAIN.SCALES))]
img, im_scale, jitter = prep_im_for_blob(img, target_size, max_size)
target_size = self._scales[np.random.randint(len(self._scales))]
img, im_scale = prep_im_for_blob(img, target_size, self._max_size)
# Flip
apply_flip = False
......@@ -113,19 +97,8 @@ class DataTransformer(multiprocessing.Process):
img = img[:, ::-1]
apply_flip = True
# Random Crop or RandomPad
offsets = None
if cfg.TRAIN.MAX_SIZE > 0:
if jitter != 1:
# To a rectangle (scale, max_size)
target_size = (np.array(img.shape[:2]) / jitter).astype(np.int32)
img, offsets = get_image_with_target_size(target_size, img)
else:
# To a square (target_size, target_size)
img, offsets = get_image_with_target_size([target_size] * 2, img)
# Example -> RoIDict
roi_dict = self.make_roi_dict(example, im_scale, apply_flip, offsets)
roi_dict = self.make_roi_dict(example, im_scale, apply_flip)
# Post-Process for gt boxes
# Shape like: [num_objects, {x1, y1, x2, y2, cls}]
......
......@@ -17,11 +17,11 @@ import collections
import numpy as np
from lib.core.config import cfg
from lib.faster_rcnn.generate_anchors import generate_anchors
from lib.faster_rcnn.utils import generate_grid_anchors
from lib.nms import nms_wrapper
from lib.utils import boxes as box_util
from seetadet.algo.faster_rcnn.generate_anchors import generate_anchors
from seetadet.algo.faster_rcnn.utils import generate_grid_anchors
from seetadet.core.config import cfg
from seetadet.utils import boxes as box_util
from seetadet.utils import nms
class Proposal(object):
......@@ -67,8 +67,8 @@ class Proposal(object):
# Prepare for the outputs
batch_rois = []
cls_prob = cls_prob.numpy(True)
bbox_pred = bbox_pred.numpy(True)
cls_prob = cls_prob.numpy()
bbox_pred = bbox_pred.numpy()
if self.num_strides > 1:
# (?, 4, A * K) -> (?, A * K, 4)
bbox_pred = bbox_pred.transpose((0, 2, 1))
......@@ -113,7 +113,7 @@ class Proposal(object):
# Apply nms (e.g. threshold = 0.7)
# Take after_nms_topN (e.g. 300)
# Return the top proposals (-> RoIs top)
keep = nms_wrapper.nms(np.hstack((proposals, scores)), nms_thresh)
keep = nms.gpu_nms(np.hstack((proposals, scores)), nms_thresh)
if post_nms_top_n > 0:
keep = keep[:post_nms_top_n]
proposals = proposals[keep, :]
......
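For reference, a self-contained NumPy sketch of the take-topk, NMS, keep-topk flow this hunk implements; the greedy IoU NMS below is a plain-CPU stand-in for `nms.gpu_nms`, not the project's kernel:

```python
import numpy as np

def iou_nms(dets, thresh):
    """Greedy IoU-based NMS over [x1, y1, x2, y2, score] rows."""
    x1, y1, x2, y2, scores = dets.T
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.argsort()[::-1]  # descending by score
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])
        w = np.maximum(0., xx2 - xx1 + 1)
        h = np.maximum(0., yy2 - yy1 + 1)
        ovr = w * h / (areas[i] + areas[order[1:]] - w * h)
        order = order[1:][ovr <= thresh]  # drop heavily-overlapping boxes
    return keep

proposals = np.array([[0, 0, 10, 10], [1, 1, 11, 11], [50, 50, 60, 60]], 'float32')
scores = np.array([[0.9], [0.8], [0.7]], 'float32')
keep = iou_nms(np.hstack((proposals, scores)), 0.7)
print(proposals[keep][:300])  # keep at most post_nms_top_n (e.g. 300)
```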
......@@ -18,12 +18,10 @@ import collections
import numpy as np
import numpy.random as npr
from lib.core.config import cfg
from lib.faster_rcnn.utils import map_blobs_to_outputs
from lib.faster_rcnn.utils import map_returns_to_blobs
from lib.faster_rcnn.utils import map_rois_to_levels
from lib.utils import boxes as box_util
from lib.utils.framework import new_tensor
from seetadet.algo.faster_rcnn import utils as rcnn_util
from seetadet.core.config import cfg
from seetadet.utils import boxes as box_util
from seetadet.utils.env import new_tensor
class ProposalTarget(object):
......@@ -35,10 +33,8 @@ class ProposalTarget(object):
self.num_classes = cfg.MODEL.NUM_CLASSES
self.defaults = collections.OrderedDict([
('rois', np.array([[-1, 0, 0, 1, 1]], 'float32')),
('labels', np.array([-1], 'float32')),
('bbox_targets', np.zeros((1, self.num_classes * 4), 'float32')),
('bbox_inside_weights', np.zeros((1, self.num_classes * 4), 'float32')),
('bbox_outside_weights', np.zeros((1, self.num_classes * 4), 'float32')),
('labels', np.array([-1], 'int64')),
('bbox_targets', np.zeros((1, 4), 'float32')),
])
def __call__(self, rpn_rois, gt_boxes):
......@@ -63,85 +59,64 @@ class ProposalTarget(object):
# Sample a batch of RoIs for training
rois_per_image = cfg.TRAIN.BATCH_SIZE
fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image)
map_returns_to_blobs(
rcnn_util.map_returns_to_blobs(
sample_rois(
rois,
gt_boxes,
rois_per_image,
fg_rois_per_image,
self.num_classes,
), blobs, keys,
)
# Stack into continuous blobs
for k, v in blobs.items():
blobs[k] = np.concatenate(blobs[k], 0)
blobs = dict((k, np.concatenate(blobs[k])) for k in blobs.keys())
if self.num_strides > 1:
# Distribute RoIs into pyramids
min_lvl = cfg.FPN.ROI_MIN_LEVEL
max_lvl = cfg.FPN.ROI_MAX_LEVEL
k = max_lvl - min_lvl + 1
levels = map_rois_to_levels(blobs['rois'], min_lvl, max_lvl)
outputs = map_blobs_to_outputs(
num_levels = max_lvl - min_lvl + 1
levels = rcnn_util.map_rois_to_levels(blobs['rois'], min_lvl, max_lvl)
lvl_blobs = rcnn_util.map_blobs_by_levels(
blobs,
self.defaults,
[np.where(levels == (i + min_lvl))[0] for i in range(k)],
[np.where(levels == (i + min_lvl))[0] for i in range(num_levels)],
)
return {
'rois': [new_tensor(outputs['rois'][i]) for i in range(k)],
'labels': new_tensor(np.concatenate(outputs['labels'], 0)),
'bbox_targets': new_tensor(np.vstack(outputs['bbox_targets'])),
'bbox_inside_weights': new_tensor(np.vstack(outputs['bbox_inside_weights'])),
'bbox_outside_weights': new_tensor(np.vstack(outputs['bbox_outside_weights'])),
}
blobs = dict((k, np.concatenate(lvl_blobs[k])) for k in blobs.keys())
rois_wide = [lvl_blobs['rois'][i] for i in range(num_levels)]
else:
# Return RoIs directly for CX-stride
# Return RoIs directly for the specified stride
rois_wide = [blobs['rois']]
# Select the foreground RoIs only for bbox branch
fg_inds = np.where(blobs['labels'] > 0)[0]
cls_inds = np.arange(len(blobs['rois'])) * self.num_classes
return {
'rois': [new_tensor(blobs['rois'])],
'rois': [new_tensor(rois) for rois in rois_wide],
'labels': new_tensor(blobs['labels']),
'bbox_targets': new_tensor(blobs['bbox_targets']),
'bbox_inside_weights': new_tensor(blobs['bbox_inside_weights']),
'bbox_outside_weights': new_tensor(blobs['bbox_outside_weights']),
'bbox_indices': new_tensor(cls_inds[fg_inds] + blobs['labels'][fg_inds]),
'bbox_targets': new_tensor(blobs['bbox_targets'][fg_inds].astype('float32')),
'bbox_anchors': new_tensor(blobs['rois'][fg_inds, 1:].astype('float32')),
}
def get_targets(ex_rois, gt_rois, gt_labels, num_classes):
"""Compute bounding-box regression targets for an image."""
assert ex_rois.shape[0] == gt_rois.shape[0]
assert ex_rois.shape[1] == 4
assert gt_rois.shape[1] == 4
# Compute bbox regression targets
fg_inds = np.where(gt_labels > 0)[0]
targets = box_util.bbox_transform(ex_rois, gt_rois, cfg.BBOX_REG_WEIGHTS)
bbox_targets = np.zeros((ex_rois.shape[0], 4 * num_classes), 'float32')
inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32)
for i in fg_inds:
start = int(4 * gt_labels[i])
bbox_targets[i, start:start + 4] = targets[i]
inside_weights[i, start:start + 4] = (1., 1., 1., 1.)
outside_weights = np.array(inside_weights > 0).astype('float32')
return bbox_targets, inside_weights, outside_weights
def sample_rois(
all_rois,
gt_boxes,
num_rois,
num_fg_rois,
num_classes,
):
def sample_rois(all_rois, gt_boxes, num_rois, num_fg_rois):
"""Sample a batch of RoIs comprising foreground and background examples."""
overlaps = box_util.bbox_overlaps(all_rois[:, 1:5], gt_boxes[:, :4])
gt_assignment = overlaps.argmax(axis=1)
max_overlaps = overlaps.max(axis=1)
labels = gt_boxes[gt_assignment, 4]
labels = gt_boxes[gt_assignment, 4].astype('int64')
# Select foreground RoIs as those with >= FG_THRESH overlap
fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0]
fg_rois_per_this_image = int(min(num_fg_rois, fg_inds.size))
fg_thresh = cfg.TRAIN.FG_THRESH
fg_inds = np.where(max_overlaps >= fg_thresh)[0]
while fg_inds.size == 0:
fg_thresh -= 0.01
fg_inds = np.where(max_overlaps >= fg_thresh)[0]
# Sample foreground regions without replacement
if fg_inds.size > 0:
fg_rois_per_this_image = int(min(num_fg_rois, fg_inds.size))
fg_inds = npr.choice(fg_inds, fg_rois_per_this_image, False)
# Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
......@@ -160,15 +135,14 @@ def sample_rois(
rois, labels = all_rois[keep_inds], labels[keep_inds]
# Clamp labels for the background RoIs to 0
labels[fg_rois_per_this_image:] = 0
# Clamp the image indices for the background RoIs to -1
rois[fg_rois_per_this_image:, 0] = -1
# Compute the target from RoIs
outputs = [rois, labels]
outputs += get_targets(
return [
rois,
labels,
box_util.bbox_transform(
rois[:, 1:5],
gt_boxes[gt_assignment[keep_inds], :4],
labels,
num_classes,
cfg.BBOX_REG_WEIGHTS,
)
return outputs
]
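The `box_util.bbox_transform` call presumably computes the standard Faster R-CNN box encoding; a minimal NumPy sketch under that assumption (the helper name and default weights here are illustrative, not the project's API):

```python
import numpy as np

def bbox_transform_sketch(ex_rois, gt_rois, weights=(1., 1., 1., 1.)):
    """Standard Faster R-CNN box encoding: (dx, dy, dw, dh) deltas."""
    ex_w = ex_rois[:, 2] - ex_rois[:, 0] + 1.
    ex_h = ex_rois[:, 3] - ex_rois[:, 1] + 1.
    ex_cx = ex_rois[:, 0] + 0.5 * ex_w
    ex_cy = ex_rois[:, 1] + 0.5 * ex_h
    gt_w = gt_rois[:, 2] - gt_rois[:, 0] + 1.
    gt_h = gt_rois[:, 3] - gt_rois[:, 1] + 1.
    gt_cx = gt_rois[:, 0] + 0.5 * gt_w
    gt_cy = gt_rois[:, 1] + 0.5 * gt_h
    wx, wy, ww, wh = weights
    return np.stack([wx * (gt_cx - ex_cx) / ex_w,
                     wy * (gt_cy - ex_cy) / ex_h,
                     ww * np.log(gt_w / ex_w),
                     wh * np.log(gt_h / ex_h)], axis=1)

ex = np.array([[0., 0., 9., 9.]])
gt = np.array([[1., 1., 12., 12.]])
print(bbox_transform_sketch(ex, gt))  # small translation + scale deltas
```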
......@@ -13,17 +13,18 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import types
import dragon.vm.torch as torch
import numpy as np
from lib.core.config import cfg
from lib.modeling.detector import new_detector
from lib.nms import nms_wrapper
from lib.utils import boxes as box_util
from lib.utils import framework
from lib.utils import time_util
from lib.utils.blob import im_list_to_blob
from lib.utils.image import scale_image
from seetadet.core.config import cfg
from seetadet.modeling.detector import new_detector
from seetadet.utils import boxes as box_util
from seetadet.utils import nms as nms_util
from seetadet.utils import time_util
from seetadet.utils.blob import im_list_to_blob
from seetadet.utils.image import scale_image
def im_detect(detector, raw_image):
......@@ -31,49 +32,41 @@ def im_detect(detector, raw_image):
ims, ims_scale = scale_image(raw_image)
# Prepare blobs
blobs = {'data': im_list_to_blob(ims)}
blobs['ims_info'] = np.array([
list(blobs['data'].shape[1:3]) + [im_scale]
for im_scale in ims_scale
], dtype=np.float32)
data = im_list_to_blob(ims)
ims_info = np.array([list(data.shape[1:3]) + [im_scale]
for im_scale in ims_scale], dtype=np.float32)
# Do Forward
if not hasattr(detector, 'graph'):
with framework.new_workspace().as_default():
data = torch.from_numpy(blobs['data'])
ims_info = torch.from_numpy(blobs['ims_info'])
with torch.no_grad():
with torch.jit.Tracer(retain_ops=True):
inputs = {'data': data, 'ims_info': ims_info}
outputs = detector.forward(inputs)
detector.graph = \
framework.Graph(inputs, {
'rois': outputs['rois'],
'cls_prob': outputs['cls_prob'],
'bbox_pred': outputs['bbox_pred']
})
outputs = detector.graph(**blobs)
data = torch.from_numpy(data)
ims_info = torch.from_numpy(ims_info)
if not hasattr(detector, 'script_forward'):
def script_forward(self, data, ims_info):
return self.forward({'data': data, 'ims_info': ims_info})
detector.script_forward = torch.jit.trace(
func=types.MethodType(script_forward, detector),
example_inputs=[data, ims_info],
)
outputs = detector.script_forward(data, ims_info)
outputs = dict((k, outputs[k].numpy()) for k in outputs.keys())
# Decode results
rois = outputs['rois']
scores, boxes, batch_inds = [], [], []
all_scores, all_boxes = [], []
pred_boxes = \
box_util.bbox_transform_inv(
rois[:, 1:5],
outputs['rois'][:, 1:5],
outputs['bbox_pred'],
cfg.BBOX_REG_WEIGHTS,
)
for i in range(len(ims)):
inds = np.where(rois[:, 0].astype(np.int32) == i)[0]
im_boxes = pred_boxes[inds] / ims_scale[i]
scores.append(outputs['cls_prob'][inds])
boxes.append(box_util.clip_tiled_boxes(im_boxes, raw_image.shape))
return (
np.vstack(scores) if len(ims) > 0 else scores[0],
np.vstack(boxes) if len(ims) > 0 else boxes[0],
)
inds = np.where(outputs['rois'][:, 0].astype(np.int32) == i)[0]
boxes = pred_boxes[inds] / ims_scale[i]
all_scores.append(outputs['cls_prob'][inds])
all_boxes.append(box_util.clip_tiled_boxes(boxes, raw_image.shape))
return np.vstack(all_scores), np.vstack(all_boxes)
def test_net(weights, num_classes, q_in, q_out, device):
......@@ -84,7 +77,7 @@ def test_net(weights, num_classes, q_in, q_out, device):
while True:
idx, raw_image = q_in.get()
if raw_image is None:
if idx < 0:
break
boxes_this_image = [[]]
......@@ -101,17 +94,16 @@ def test_net(weights, num_classes, q_in, q_out, device):
(cls_boxes, cls_scores[:, np.newaxis])
).astype(np.float32, copy=False)
if cfg.TEST.USE_SOFT_NMS:
keep = nms_wrapper.soft_nms(
keep = nms_util.soft_nms(
cls_detections,
thresh=cfg.TEST.NMS,
method=cfg.TEST.SOFT_NMS_METHOD,
sigma=cfg.TEST.SOFT_NMS_SIGMA,
)
else:
keep = nms_wrapper.nms(
keep = nms_util.nms(
cls_detections,
thresh=cfg.TEST.NMS,
force_cpu=True,
)
cls_detections = cls_detections[keep, :]
boxes_this_image.append(cls_detections)
......@@ -119,11 +111,8 @@ def test_net(weights, num_classes, q_in, q_out, device):
q_out.put((
idx,
{
'im_detect': _t['im_detect'].average_time,
'misc': _t['misc'].average_time,
},
{
'boxes': boxes_this_image,
},
dict([('im_detect', _t['im_detect'].average_time),
('misc', _t['misc'].average_time)]),
dict([('boxes', boxes_this_image)]),
))
......@@ -16,7 +16,7 @@ from __future__ import print_function
import collections
import numpy as np
from lib.core.config import cfg
from seetadet.core.config import cfg
def generate_grid_anchors(features, base_anchors, strides):
......@@ -75,7 +75,7 @@ def map_rois_to_levels(rois, k_min, k_max):
return np.clip(target_levels, k_min, k_max)
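The `target_levels` clipped above typically follow the FPN assignment heuristic k = floor(k0 + log2(sqrt(w * h) / 224)); a sketch under that assumption — the 224 canonical size and k0 = 4 come from the FPN paper, not from this diff:

```python
import numpy as np

def map_rois_to_levels_sketch(rois, k_min, k_max,
                              canonical_size=224, canonical_level=4):
    """FPN heuristic: k = floor(k0 + log2(sqrt(area) / 224))."""
    ws = rois[:, 3] - rois[:, 1] + 1  # rois are [batch_idx, x1, y1, x2, y2]
    hs = rois[:, 4] - rois[:, 2] + 1
    scales = np.sqrt(ws * hs)
    levels = np.floor(canonical_level + np.log2(scales / canonical_size + 1e-6))
    return np.clip(levels, k_min, k_max).astype('int64')

rois = np.array([[0, 0, 0, 63, 63], [0, 0, 0, 511, 511]], 'float32')
print(map_rois_to_levels_sketch(rois, 2, 5))  # small box -> low level: [2 5]
```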
def map_blobs_to_outputs(blobs, defaults, lvl_inds):
def map_blobs_by_levels(blobs, defaults, lvl_inds):
"""Map blobs to outputs according to fpn indices."""
outputs = collections.defaultdict(list)
for inds in lvl_inds:
......
......@@ -13,10 +13,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# Import custom modules
from lib.modeling.fast_rcnn import FastRCNN
from lib.modeling.fpn import FPN
from lib.modeling.mask_rcnn import MaskRCNN
from lib.modeling.retinanet import RetinaNet
from lib.modeling.rpn import RPN
from lib.modeling.ssd import SSD
from seetadet.algo.faster_rcnn.anchor_target import AnchorTarget
from seetadet.algo.faster_rcnn.proposal import Proposal
from seetadet.algo.mask_rcnn.data_loader import DataLoader
from seetadet.algo.mask_rcnn.proposal_target import ProposalTarget
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import multiprocessing as mp
import time
import dragon
import dragon.vm.torch as torch
import numpy as np
from seetadet.algo.mask_rcnn import data_transformer
from seetadet.core.config import cfg
from seetadet.datasets.factory import get_dataset
from seetadet.utils import logger
from seetadet.utils.blob import im_list_to_blob
from seetadet.utils.blob import mask_list_to_blob
class DataLoader(object):
"""Provide mini-batches of data."""
def __init__(self):
super(DataLoader, self).__init__()
dataset = get_dataset(cfg.TRAIN.DATASET)
self.iterator = Iterator(**{
'dataset': dataset.cls,
'source': dataset.source,
'classes': dataset.classes,
'shuffle': cfg.TRAIN.USE_SHUFFLE,
'num_chunks': cfg.TRAIN.SHUFFLE_CHUNKS,
'batch_size': cfg.TRAIN.IMS_PER_BATCH * 2,
'num_transformers': cfg.TRAIN.NUM_THREADS - 1,
})
def __call__(self):
outputs = self.iterator.next()
if isinstance(outputs['data'], np.ndarray):
outputs['data'] = torch.from_numpy(outputs['data'])
return outputs
class Iterator(mp.Process):
"""Iterator to return the batch of data."""
def __init__(self, **kwargs):
super(Iterator, self).__init__()
# Distributed settings
rank, group_size = 0, 1
process_group = dragon.distributed.get_group()
if process_group is not None and \
kwargs.get('phase', 'TRAIN') == 'TRAIN':
group_size = process_group.size
rank = dragon.distributed.get_rank(process_group)
# Configuration
self._prefetch = kwargs.get('prefetch', 5)
self._batch_size = kwargs.get('batch_size', 2)
self._num_readers = kwargs.get('num_readers', 1)
self._num_transformers = kwargs.get('num_transformers', 3)
self.daemon = True
# Initialize queues
num_batches = self._prefetch * self._num_readers
self.q_in = mp.Queue(num_batches * self._batch_size)
self.q1_out = mp.Queue(num_batches * self._batch_size)
self.q2_out = mp.Queue(num_batches * self._batch_size)
# Initialize readers
self._readers = []
for i in range(self._num_readers):
part_idx, num_parts = i, self._num_readers
num_parts *= group_size
part_idx += rank * self._num_readers
self._readers.append(dragon.io.DataReader(
part_idx=part_idx, num_parts=num_parts, **kwargs))
self._readers[i]._seed += part_idx
self._readers[i].q_out = self.q_in
self._readers[i].start()
time.sleep(0.1)
# Initialize transformers
self._transformers = []
for i in range(self._num_transformers):
p = data_transformer.DataTransformer(**kwargs)
p._seed += (i + rank * self._num_transformers)
p.q_in = self.q_in
p.q1_out, p.q2_out = self.q1_out, self.q2_out
p.start()
self._transformers.append(p)
time.sleep(0.1)
# Register cleanup callbacks
def cleanup():
def terminate(processes):
for p in processes:
p.terminate()
p.join()
terminate(self._transformers)
logger.info('Terminate DataTransformer.')
terminate(self._readers)
logger.info('Terminate DataReader.')
import atexit
atexit.register(cleanup)
def next(self):
"""Return the next batch of data."""
return self.__next__()
def __iter__(self):
"""Return the iterator self."""
return self
def __next__(self):
"""Return the next batch of data."""
q_out = None
# Two queues are used to implement aspect grouping,
# which reduces GPU memory by avoiding the fetch of
# a huge square batch blob
while q_out is None:
if self.q1_out.qsize() >= cfg.TRAIN.IMS_PER_BATCH:
q_out = self.q1_out
elif self.q2_out.qsize() >= cfg.TRAIN.IMS_PER_BATCH:
q_out = self.q2_out
self.q1_out, self.q2_out = self.q2_out, self.q1_out
images, images_info = [], []
boxes_to_pack, masks_to_pack = [], []
for i in range(cfg.TRAIN.IMS_PER_BATCH):
image, image_scale, boxes, masks = q_out.get()
images.append(image)
images_info.append(list(image.shape[:2]) + [image_scale])
gt_boxes = np.zeros((boxes.shape[0], boxes.shape[1] + 1), 'float32')
gt_boxes[:, :boxes.shape[1]], gt_boxes[:, -1] = boxes, i
boxes_to_pack.append(gt_boxes)
masks_to_pack.append(masks)
return {
'data': im_list_to_blob(images),
'ims_info': np.array(images_info, 'float32'),
'gt_boxes': np.concatenate(boxes_to_pack),
'gt_masks': mask_list_to_blob(masks_to_pack),
}
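The image-index packing used for `gt_boxes` can be checked in isolation; a toy showing that per-image boxes remain recoverable from the last column after concatenation:

```python
import numpy as np

# Toy of the gt_boxes packing above: the image index is appended as a
# last column so per-image boxes can be recovered after concatenation.
boxes_im0 = np.array([[0., 0., 10., 10., 1.]])   # x1, y1, x2, y2, cls
boxes_im1 = np.array([[5., 5., 20., 20., 2.]])
packed = []
for i, b in enumerate([boxes_im0, boxes_im1]):
    gt = np.zeros((b.shape[0], b.shape[1] + 1), 'float32')
    gt[:, :b.shape[1]], gt[:, -1] = b, i
    packed.append(gt)
gt_boxes = np.concatenate(packed)
print(gt_boxes[gt_boxes[:, -1] == 1])  # boxes belonging to image 1
```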
......@@ -17,17 +17,18 @@ import multiprocessing
import numpy as np
from lib.core.config import cfg
from lib.datasets.example import Example
from lib.pycocotools import mask_utils
from lib.utils import boxes as box_util
from lib.utils.blob import prep_im_for_blob
from lib.utils.image import get_image_with_target_size
from seetadet.core.config import cfg
from seetadet.datasets.example import Example
from seetadet.pycocotools import mask_utils
from seetadet.utils import boxes as box_util
from seetadet.utils.blob import prep_im_for_blob
class DataTransformer(multiprocessing.Process):
def __init__(self, **kwargs):
super(DataTransformer, self).__init__()
self._scales = cfg.TRAIN.SCALES
self._max_size = cfg.TRAIN.MAX_SIZE
self._seed = cfg.RNG_SEED
self._use_flipped = cfg.TRAIN.USE_FLIPPED
self._use_diff = cfg.TRAIN.USE_DIFF
......@@ -98,9 +99,8 @@ class DataTransformer(multiprocessing.Process):
img = example.image
# Scale
max_size = cfg.TRAIN.MAX_SIZE
target_size = cfg.TRAIN.SCALES[np.random.randint(len(cfg.TRAIN.SCALES))]
img, im_scale, jitter = prep_im_for_blob(img, target_size, max_size)
target_size = self._scales[np.random.randint(len(self._scales))]
img, im_scale = prep_im_for_blob(img, target_size, self._max_size)
# Flip
apply_flip = False
......
......@@ -18,13 +18,11 @@ import collections
import numpy as np
import numpy.random as npr
from lib.core.config import cfg
from lib.faster_rcnn.utils import map_blobs_to_outputs
from lib.faster_rcnn.utils import map_returns_to_blobs
from lib.faster_rcnn.utils import map_rois_to_levels
from lib.utils import boxes as box_util
from lib.utils import mask as mask_util
from lib.utils.framework import new_tensor
from seetadet.algo.faster_rcnn import utils as rcnn_util
from seetadet.core.config import cfg
from seetadet.utils import boxes as box_util
from seetadet.utils import mask as mask_util
from seetadet.utils.env import new_tensor
class ProposalTarget(object):
......@@ -36,10 +34,8 @@ class ProposalTarget(object):
self.num_classes = cfg.MODEL.NUM_CLASSES
self.defaults = collections.OrderedDict([
('rois', np.array([[-1, 0, 0, 1, 1]], 'float32')),
('labels', np.array([-1], 'float32')),
('bbox_targets', np.zeros((1, self.num_classes * 4), 'float32')),
('bbox_inside_weights', np.zeros((1, self.num_classes * 4), 'float32')),
('bbox_outside_weights', np.zeros((1, self.num_classes * 4), 'float32')),
('labels', np.array([-1], 'int64')),
('bbox_targets', np.zeros((1, 4), 'float32')),
('mask_targets', -np.ones((1, self.resolution, self.resolution), 'float32')),
])
......@@ -72,67 +68,75 @@ class ProposalTarget(object):
# Sample a batch of RoIs for training
rois_per_image = cfg.TRAIN.BATCH_SIZE
fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image)
map_returns_to_blobs(
rcnn_util.map_returns_to_blobs(
sample_rois(
rois,
gt_boxes,
gt_masks,
rois_per_image,
fg_rois_per_image,
self.num_classes,
ims_info[ix][2],
), blobs, keys,
)
# Stack into continuous blobs
for k, v in blobs.items():
blobs[k] = np.concatenate(blobs[k], 0)
blobs = dict((k, np.concatenate(blobs[k])) for k in blobs.keys())
# Distribute rois into pyramids
k_min = cfg.FPN.ROI_MIN_LEVEL
k_max = cfg.FPN.ROI_MAX_LEVEL
k = k_max - k_min + 1
levels = map_rois_to_levels(blobs['rois'], k_min, k_max)
outputs = \
map_blobs_to_outputs(
num_levels = k_max - k_min + 1
levels = rcnn_util.map_rois_to_levels(blobs['rois'], k_min, k_max)
lvl_blobs = rcnn_util.map_blobs_by_levels(
blobs,
self.defaults,
[np.where(levels == (i + k_min))[0] for i in range(k)],
[np.where(levels == (i + k_min))[0] for i in range(num_levels)],
)
# Select the foreground RoIs only for mask branch
for i in range(k):
inds = np.where(outputs['labels'][i] > 0)[0]
inds = inds if len(inds) > 0 else np.array([0], 'int64')
outputs['mask_rois'].append(outputs['rois'][i][inds])
outputs['mask_targets'][i] = outputs['mask_targets'][i][inds]
outputs['mask_labels'].append(outputs['labels'][i][inds].astype('int64') - 1)
# Use the sparse indices to select logits
# Reduce the overhead on feeding dense class-specific targets
mask_labels = np.concatenate(outputs['mask_labels'], 0)
mask_indices = np.arange(len(mask_labels)) * (self.num_classes - 1)
rois_wide = [lvl_blobs['rois'][i] for i in range(num_levels)]
mask_rois_wide, mask_labels_wide = [], []
# Select the foreground RoIs only for bbox/mask branch
for i in range(num_levels):
inds = np.where(lvl_blobs['labels'][i] > 0)[0]
if len(inds) > 0:
mask_rois_wide.append(lvl_blobs['rois'][i][inds])
mask_labels_wide.append(lvl_blobs['labels'][i][inds] - 1)
lvl_blobs['mask_targets'][i] = lvl_blobs['mask_targets'][i][inds]
else:
mask_rois_wide.append(self.defaults['rois'])
mask_labels_wide.append(np.array([0], 'int64'))
lvl_blobs['mask_targets'][i] = self.defaults['mask_targets']
blobs = dict((k, np.concatenate(lvl_blobs[k])) for k in blobs.keys())
mask_labels = np.concatenate(mask_labels_wide)
fg_inds = np.where(blobs['labels'] > 0)[0]
bbox_cls_inds = np.arange(len(blobs['rois'])) * self.num_classes
mask_cls_inds = np.arange(len(mask_labels)) * (self.num_classes - 1)
# Sample a proposal randomly to avoid memory issues
if len(fg_inds) == 0:
fg_inds = np.random.randint(len(blobs['labels']), size=[1])
return {
'rois': [new_tensor(outputs['rois'][i]) for i in range(k)],
'labels': new_tensor(np.concatenate(outputs['labels'], 0)),
'bbox_targets': new_tensor(np.vstack(outputs['bbox_targets'])),
'bbox_inside_weights': new_tensor(np.vstack(outputs['bbox_inside_weights'])),
'bbox_outside_weights': new_tensor(np.vstack(outputs['bbox_outside_weights'])),
'mask_rois': [new_tensor(outputs['mask_rois'][i]) for i in range(k)],
'mask_targets': new_tensor(np.vstack(outputs['mask_targets'])),
'mask_indices': new_tensor(mask_indices + mask_labels),
'rois': [new_tensor(rois_wide[i]) for i in range(num_levels)],
'mask_rois': [new_tensor(mask_rois_wide[i]) for i in range(num_levels)],
'labels': new_tensor(blobs['labels']),
'bbox_indices': new_tensor(bbox_cls_inds[fg_inds] + blobs['labels'][fg_inds]),
'bbox_targets': new_tensor(blobs['bbox_targets'][fg_inds].astype('float32')),
'bbox_anchors': new_tensor(blobs['rois'][fg_inds, 1:].astype('float32')),
'mask_indices': new_tensor(mask_cls_inds + mask_labels),
'mask_targets': new_tensor(blobs['mask_targets']),
}
def get_targets(
def compute_targets(
ex_rois,
gt_rois,
gt_labels,
gt_masks,
mask_flags,
mask_size,
num_classes,
im_scale,
):
"""Compute the bounding-box regression targets."""
......@@ -141,14 +145,8 @@ def get_targets(
assert gt_rois.shape[1] == 4
# Compute bbox regression targets
fg_inds = np.where(gt_labels > 0)[0]
targets = box_util.bbox_transform(ex_rois, gt_rois, cfg.BBOX_REG_WEIGHTS)
bbox_targets = np.zeros((ex_rois.shape[0], 4 * num_classes), 'float32')
inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32)
for i in fg_inds:
start = int(4 * gt_labels[i])
bbox_targets[i, start:start + 4] = targets[i]
inside_weights[i, start:start + 4] = (1., 1., 1., 1.)
outside_weights = np.array(inside_weights > 0).astype('float32')
bbox_targets = box_util.bbox_transform(
ex_rois, gt_rois, cfg.BBOX_REG_WEIGHTS)
# Compute mask classification targets
mask_shape = [mask_size] * 2
ex_rois_ori = np.round(ex_rois / im_scale).astype(int)
......@@ -168,7 +166,7 @@ def get_targets(
mask=box_mask,
size=mask_shape,
)
return bbox_targets, inside_weights, outside_weights, mask_targets
return bbox_targets, mask_targets
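Mask targets are conventionally built by cropping the ground-truth mask to the (original-scale) RoI and resizing it to the output resolution; a minimal cv2-based sketch of that step, standing in for the project's `mask_utils` helpers:

```python
import cv2
import numpy as np

def mask_target_sketch(gt_mask, roi, mask_size=28):
    """Crop a full-image binary mask to an RoI and resize to mask_size."""
    x1, y1, x2, y2 = [int(round(v)) for v in roi]
    crop = gt_mask[y1:y2 + 1, x1:x2 + 1].astype('float32')
    resized = cv2.resize(crop, (mask_size, mask_size))
    return (resized >= 0.5).astype('float32')  # re-binarize after resize

gt_mask = np.zeros((100, 100), 'uint8')
gt_mask[20:60, 20:60] = 1
print(mask_target_sketch(gt_mask, (10, 10, 70, 70)).mean())  # fg fraction
```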
def sample_rois(
......@@ -177,14 +175,13 @@ def sample_rois(
gt_masks,
num_rois,
num_fg_rois,
num_classes,
im_scale,
):
"""Sample a batch of RoIs comprising foreground and background examples."""
overlaps = box_util.bbox_overlaps(all_rois[:, 1:5], gt_boxes[:, :4])
gt_assignment = overlaps.argmax(axis=1)
max_overlaps = overlaps.max(axis=1)
labels = gt_boxes[gt_assignment, 4]
labels = gt_boxes[gt_assignment, 4].astype('int64')
# Select foreground RoIs as those with >= FG_THRESH overlap
fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0]
......@@ -209,19 +206,16 @@ def sample_rois(
rois, labels = all_rois[keep_inds], labels[keep_inds]
# Clamp labels for the background RoIs to 0
labels[fg_rois_per_this_image:] = 0
# Clamp the image indices for the background RoIs to -1
rois[fg_rois_per_this_image:, 0] = -1
# Compute the target from RoIs
outputs = [rois, labels]
outputs += get_targets(
outputs += compute_targets(
rois[:, 1:5],
gt_boxes[gt_assignment[keep_inds], :4],
labels,
gt_masks[gt_assignment[fg_inds]],
gt_boxes[gt_assignment[fg_inds], 5],
cfg.MRCNN.RESOLUTION,
num_classes,
im_scale,
)
return outputs
......@@ -13,19 +13,20 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import types
import dragon.vm.torch as torch
import numpy as np
from lib.core.config import cfg
from lib.faster_rcnn import map_rois_to_levels
from lib.faster_rcnn import map_blobs_to_outputs
from lib.modeling.detector import new_detector
from lib.nms import nms_wrapper
from lib.utils import framework
from lib.utils import time_util
from lib.utils import boxes as box_util
from lib.utils.blob import im_list_to_blob
from lib.utils.image import scale_image
from seetadet.algo.faster_rcnn import utils as rcnn_util
from seetadet.core.config import cfg
from seetadet.modeling.detector import new_detector
from seetadet.utils import env
from seetadet.utils import nms as nms_util
from seetadet.utils import time_util
from seetadet.utils import boxes as box_util
from seetadet.utils.blob import im_list_to_blob
from seetadet.utils.image import scale_image
def im_detect(detector, raw_image):
......@@ -33,50 +34,46 @@ def im_detect(detector, raw_image):
ims, ims_scale = scale_image(raw_image)
# Prepare blobs
blobs = {'data': im_list_to_blob(ims)}
blobs['ims_info'] = np.array([
list(blobs['data'].shape[1:3]) + [im_scale]
for im_scale in ims_scale
], dtype=np.float32)
data = im_list_to_blob(ims)
ims_info = np.array([list(data.shape[1:3]) + [im_scale]
for im_scale in ims_scale], dtype=np.float32)
# Do Forward
if not hasattr(detector, 'graph'):
with framework.new_workspace().as_default():
data = torch.from_numpy(blobs['data'])
ims_info = torch.from_numpy(blobs['ims_info'])
with torch.no_grad():
with torch.jit.Tracer(retain_ops=True):
inputs = {'data': data, 'ims_info': ims_info}
outputs = detector.forward(inputs)
detector.graph = \
framework.Graph(inputs, {
'rois': outputs['rois'],
'cls_prob': outputs['cls_prob'],
'bbox_pred': outputs['bbox_pred']
})
outputs = detector.graph(**blobs)
data = torch.from_numpy(data)
ims_info = torch.from_numpy(ims_info)
if not hasattr(detector, 'script_forward'):
def script_forward(self, data, ims_info):
return self.forward({'data': data, 'ims_info': ims_info})
detector.script_forward = torch.jit.trace(
func=types.MethodType(script_forward, detector),
example_inputs=[data, ims_info],
)
outputs = detector.script_forward(data, ims_info)
outputs = dict((k, outputs[k].numpy()) for k in outputs.keys())
# Decode results
rois = outputs['rois']
scores, boxes, batch_inds = [], [], []
all_scores, all_boxes, batch_inds = [], [], []
pred_boxes = \
box_util.bbox_transform_inv(
rois[:, 1:5],
outputs['rois'][:, 1:5],
outputs['bbox_pred'],
cfg.BBOX_REG_WEIGHTS,
)
for i in range(len(ims)):
inds = np.where(rois[:, 0].astype(np.int32) == i)[0]
im_boxes = pred_boxes[inds] / ims_scale[i]
scores.append(outputs['cls_prob'][inds])
boxes.append(box_util.clip_tiled_boxes(im_boxes, raw_image.shape))
inds = np.where(outputs['rois'][:, 0].astype(np.int32) == i)[0]
boxes = pred_boxes[inds] / ims_scale[i]
all_scores.append(outputs['cls_prob'][inds])
all_boxes.append(box_util.clip_tiled_boxes(boxes, raw_image.shape))
batch_inds.append(np.ones((len(inds), 1), 'int32') * i)
return (
np.vstack(scores) if len(ims) > 0 else scores[0],
np.vstack(boxes) if len(ims) > 0 else boxes[0],
np.vstack(batch_inds) if len(ims) > 0 else batch_inds[0],
np.vstack(all_scores),
np.vstack(all_boxes),
np.vstack(batch_inds),
np.array(ims_scale, 'float64'),
)
......@@ -85,25 +82,15 @@ def mask_detect(detector, rois):
k_min = cfg.FPN.ROI_MIN_LEVEL
k_max = cfg.FPN.ROI_MAX_LEVEL
k = k_max - k_min + 1
levels = map_rois_to_levels(rois, k_min, k_max)
levels = rcnn_util.map_rois_to_levels(rois, k_min, k_max)
level_inds = [np.where(levels == (i + k_min))[0] for i in range(k)]
fpn_rois = map_blobs_to_outputs(
fpn_rois = rcnn_util.map_blobs_by_levels(
{'rois': rois[:, :5]},
{'rois': np.array([[-1, 0, 0, 1, 1]], 'float32')},
level_inds)['rois']
workspace = detector.graph.workspace
placeholders = detector.graph.placeholders
score_fn = detector.rcnn.compute_mask_score
with workspace.as_default():
if 'rois' not in placeholders:
placeholders['rois'] = \
[framework.new_placeholder(cfg.GPU_ID) for _ in range(k)]
placeholders['mask_inds'] = \
framework.new_placeholder(cfg.GPU_ID)
for i, v in enumerate(fpn_rois):
framework.feed_tensor(placeholders['rois'][i], v.astype('float32'))
with torch.no_grad():
mask_score = score_fn(rois=placeholders['rois'])
mask_score = detector.rcnn.compute_mask_score(
rois=[env.new_tensor(r.astype('float32')) for r in fpn_rois])
nc, i = mask_score.shape[1], 0
mask_inds = {}
for inds in level_inds:
......@@ -114,14 +101,10 @@ def mask_detect(detector, rois):
if len(inds) == 0:
i += 1
mask_inds = list(map(mask_inds.get, sorted(mask_inds)))
framework.feed_tensor(
placeholders['mask_inds'],
np.array(mask_inds, 'int64'),
)
mask_inds = env.new_tensor(np.array(mask_inds, 'int64'))
with torch.no_grad():
mask_pred = mask_score.index_select(
(0, 1), placeholders['mask_inds'])
return detector.rcnn.sigmoid(mask_pred).numpy(True).copy()
mask_pred = mask_score.index_select((0, 1), mask_inds)
return detector.rcnn.sigmoid(mask_pred).numpy().copy()
def test_net(weights, num_classes, q_in, q_out, device):
......@@ -132,7 +115,7 @@ def test_net(weights, num_classes, q_in, q_out, device):
while True:
idx, raw_image = q_in.get()
if raw_image is None:
if idx < 0:
break
rois_this_image = []
......@@ -153,17 +136,16 @@ def test_net(weights, num_classes, q_in, q_out, device):
(cls_boxes, cls_scores[:, np.newaxis])
).astype(np.float32, copy=False)
if cfg.TEST.USE_SOFT_NMS:
keep = nms_wrapper.soft_nms(
keep = nms_util.soft_nms(
cls_detections,
thresh=cfg.TEST.NMS,
method=cfg.TEST.SOFT_NMS_METHOD,
sigma=cfg.TEST.SOFT_NMS_SIGMA,
)
else:
keep = nms_wrapper.nms(
keep = nms_util.nms(
cls_detections,
thresh=cfg.TEST.NMS,
force_cpu=True,
)
cls_detections = cls_detections[keep, :]
cls_batch_inds = cls_batch_inds[keep]
......@@ -190,13 +172,9 @@ def test_net(weights, num_classes, q_in, q_out, device):
q_out.put((
idx,
{
'im_detect': _t['im_detect'].average_time,
'mask_detect': _t['mask_detect'].average_time,
'misc': _t['misc'].average_time,
},
{
'boxes': boxes_this_image,
'masks': masks_this_image,
},
dict([('im_detect', _t['im_detect'].average_time),
('mask_detect', _t['mask_detect'].average_time),
('misc', _t['misc'].average_time)]),
dict([('boxes', boxes_this_image),
('masks', masks_this_image)]),
))
......@@ -13,7 +13,5 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from lib.faster_rcnn.anchor_target import AnchorTarget
from lib.faster_rcnn.proposal import Proposal
from lib.mask_rcnn.data_loader import DataLoader
from lib.mask_rcnn.proposal_target import ProposalTarget
from seetadet.algo.retinanet.anchor_target import AnchorTarget
from seetadet.algo.retinanet.data_loader import DataLoader
......@@ -15,12 +15,12 @@ from __future__ import print_function
import numpy as np
from lib.core.config import cfg
from lib.faster_rcnn.generate_anchors import generate_anchors_v2
from lib.faster_rcnn import generate_grid_anchors
from lib.utils import boxes as box_util
from lib.utils import logger
from lib.utils.framework import new_tensor
from seetadet.core.config import cfg
from seetadet.algo.faster_rcnn.generate_anchors import generate_anchors_v2
from seetadet.algo.faster_rcnn.utils import generate_grid_anchors
from seetadet.utils import boxes as box_util
from seetadet.utils import logger
from seetadet.utils.env import new_tensor
class AnchorTarget(object):
......@@ -47,7 +47,7 @@ class AnchorTarget(object):
sizes=sizes,
))
def __call__(self, features, gt_boxes, ims_info):
def __call__(self, features, gt_boxes):
num_images = cfg.TRAIN.IMS_PER_BATCH
gt_boxes_wide = box_util.dismantle_boxes(gt_boxes, num_images)
......@@ -67,10 +67,8 @@ class AnchorTarget(object):
num_anchors = all_anchors.shape[0]
# Label: ``1`` is positive, ``0`` is negative, ``-1`` is don't care
labels_wide = -np.ones((num_images, num_anchors,), 'float32')
bbox_targets_wide = np.zeros((num_images, num_anchors, 4), 'float32')
bbox_inside_weights_wide = np.zeros_like(bbox_targets_wide, 'float32')
bbox_outside_weights_wide = np.zeros_like(bbox_targets_wide, 'float32')
labels_wide = -np.ones((num_images, num_anchors,), 'int64')
bbox_indices_wide, bbox_anchors_wide, bbox_targets_wide = [], [], []
# Different from R-CNN, all anchors will be used
inds_inside, anchors = np.arange(num_anchors), all_anchors
......@@ -81,7 +79,7 @@ class AnchorTarget(object):
gt_boxes = gt_boxes_wide[ix]
# label: 1 is positive, 0 is negative, -1 is don't care
labels = np.empty((num_inside,), dtype=np.float32)
labels = np.empty((num_inside,), dtype='int64')
labels.fill(-1)
# Overlaps between the anchors and the gt boxes
......@@ -89,48 +87,41 @@ class AnchorTarget(object):
argmax_overlaps = overlaps.argmax(1)
max_overlaps = overlaps[np.arange(num_inside), argmax_overlaps]
# fg label: for each gt, anchor with highest overlap
# Foreground: for each gt, anchor with highest overlap
gt_argmax_overlaps = overlaps.argmax(0)
gt_max_overlaps = overlaps[gt_argmax_overlaps, np.arange(overlaps.shape[1])]
gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]
gt_inds = argmax_overlaps[gt_argmax_overlaps]
labels[gt_argmax_overlaps] = gt_boxes[gt_inds, 4]
# fg label: above threshold IOU
# Foreground: above threshold IoU
inds = max_overlaps >= cfg.RETINANET.POSITIVE_OVERLAP
gt_inds = argmax_overlaps[inds]
labels[inds] = gt_boxes[gt_inds, 4]
fg_inds = np.where(labels > 0)[0]
# bg label: below threshold IOU
# Background: below threshold IoU
labels[max_overlaps < cfg.RETINANET.NEGATIVE_OVERLAP] = 0
bbox_targets = np.zeros((num_inside, 4), dtype=np.float32)
bbox_targets[fg_inds, :] = \
# Undo the background clamping if no foreground anchor remains
if len(fg_inds) == 0:
gt_inds = argmax_overlaps[gt_argmax_overlaps]
labels[gt_argmax_overlaps] = gt_boxes[gt_inds, 4]
fg_inds = np.where(labels > 0)[0]
labels_wide[ix, inds_inside] = labels
bbox_anchors_wide.append(anchors[fg_inds])
bbox_indices_wide.append(fg_inds + (num_anchors * ix))
bbox_targets_wide.append(
box_util.bbox_transform(
anchors[fg_inds, :],
anchors[fg_inds],
gt_boxes[argmax_overlaps[fg_inds], :4],
)
bbox_inside_weights = np.zeros((num_inside, 4), dtype=np.float32)
bbox_inside_weights[fg_inds, :] = np.array((1., 1., 1., 1.))
bbox_reg_weight = float(cfg.RETINANET.BBOX_REG_WEIGHT)
bbox_outside_weights = np.zeros((num_inside, 4), dtype=np.float32)
bbox_outside_weights[fg_inds, :] = bbox_reg_weight / max(len(fg_inds), 1)
labels_wide[ix, inds_inside] = labels
bbox_targets_wide[ix, inds_inside] = bbox_targets
bbox_inside_weights_wide[ix, inds_inside] = bbox_inside_weights
bbox_outside_weights_wide[ix, inds_inside] = bbox_outside_weights
labels = labels_wide.reshape((num_images, num_anchors))
bbox_targets = bbox_targets_wide.transpose((0, 2, 1))
bbox_inside_weights = bbox_inside_weights_wide.transpose((0, 2, 1))
bbox_outside_weights = bbox_outside_weights_wide.transpose((0, 2, 1))
)
return {
'labels': new_tensor(labels),
'bbox_targets': new_tensor(bbox_targets),
'bbox_inside_weights': new_tensor(bbox_inside_weights),
'bbox_outside_weights': new_tensor(bbox_outside_weights),
'labels': new_tensor(labels_wide),
'bbox_indices': new_tensor(np.concatenate(bbox_indices_wide)),
'bbox_anchors': new_tensor(np.concatenate(bbox_anchors_wide).astype('float32')),
'bbox_targets': new_tensor(np.concatenate(bbox_targets_wide).astype('float32')),
}
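A toy of how the sparse `bbox_indices`/`bbox_targets` outputs above can be consumed; the flattened prediction tensor and the smooth-L1 reduction are illustrative assumptions, not code from this diff:

```python
import numpy as np

# Gather only the foreground anchors from a flattened prediction tensor
# using bbox_indices = fg_inds + num_anchors * image_index.
num_images, num_anchors = 2, 5
bbox_pred = np.random.randn(num_images * num_anchors, 4).astype('float32')
bbox_indices = np.array([1, 7])            # (image 0, anchor 1), (image 1, anchor 2)
bbox_targets = np.zeros((2, 4), 'float32')  # matching regression targets

fg_pred = bbox_pred[bbox_indices]           # sparse gather, no dense masks
diff = np.abs(fg_pred - bbox_targets)
loss = np.where(diff < 1., 0.5 * diff ** 2, diff - 0.5).sum() / len(bbox_indices)
print(loss)
```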
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from seetadet.algo import faster_rcnn
from seetadet.algo import ssd
from seetadet.core.config import cfg
class DataLoader(object):
"""Provide mini-batches of data."""
def __new__(cls):
if cfg.TRAIN.MAX_SIZE > 0:
return faster_rcnn.DataLoader()
else:
return ssd.DataLoader()
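The `__new__` dispatch above returns an instance of whichever loader matches the configuration; a tiny standalone illustration of that pattern:

```python
# When __new__ returns an object that is not an instance of the class,
# Python skips __init__ and hands back the foreign instance directly.
class _A:
    pass

class _B:
    pass

class Dispatch:
    def __new__(cls, use_a):
        return _A() if use_a else _B()

print(type(Dispatch(True)).__name__, type(Dispatch(False)).__name__)  # _A _B
```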
......@@ -13,66 +13,59 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import types
import dragon.vm.torch as torch
import numpy as np
from lib.core.config import cfg
from lib.modeling.detector import new_detector
from lib.nms import nms_wrapper
from lib.utils import framework
from lib.utils import time_util
from lib.utils.blob import im_list_to_blob
from lib.utils.image import scale_image
from seetadet.core.config import cfg
from seetadet.modeling.detector import new_detector
from seetadet.utils import nms as nms_util
from seetadet.utils import time_util
from seetadet.utils.blob import im_list_to_blob
from seetadet.utils.image import scale_image
def ims_detect(detector, raw_images):
"""Detect images, with single or multiple scales."""
ims, ims_scale = scale_image(raw_images[0])
num_scales = len(ims_scale)
ims_shape = [im.shape for im in raw_images]
for item_idx in range(1, len(raw_images)):
ims_ext, ims_scale_ext = scale_image(raw_images[item_idx])
ims += ims_ext
ims_scale += ims_scale_ext
ims, ims_scale = [], []
for i in range(len(raw_images)):
im, im_scale = scale_image(raw_images[i])
ims += im
ims_scale += im_scale
num_scales = len(ims_scale) // len(raw_images)
ims_shape = np.array([im.shape[:2] for im in ims])
ims_scale = np.array(ims_scale).reshape((len(ims), -1))
# Prepare blobs
blobs = {'data': im_list_to_blob(ims)}
blobs['ims_info'] = np.array([
list(blobs['data'].shape[1:3]) + [im_scale]
for im_scale in ims_scale
], dtype=np.float32)
data = im_list_to_blob(ims)
ims_info = np.hstack([ims_shape, ims_scale]).astype('float32')
# Do Forward
if not hasattr(detector, 'graph'):
with framework.new_workspace().as_default():
data = torch.from_numpy(blobs['data'])
ims_info = torch.from_numpy(blobs['ims_info'])
with torch.no_grad():
with torch.jit.Tracer(retain_ops=True):
inputs = {'data': data, 'ims_info': ims_info}
outputs = detector.forward(inputs)
detector.graph = \
framework.Graph({
'data': inputs['data'],
'ims_info': inputs['ims_info']
}, {'detections': outputs['detections']})
outputs = detector.graph(**blobs)
data = torch.from_numpy(data)
ims_info = torch.from_numpy(ims_info)
if not hasattr(detector, 'script_forward'):
def script_forward(self, data, ims_info):
return self.forward({'data': data, 'ims_info': ims_info})
detector.script_forward = torch.jit.trace(
func=types.MethodType(script_forward, detector),
example_inputs=[data, ims_info],
)
outputs = detector.script_forward(data, ims_info)
outputs = dict((k, outputs[k].numpy()) for k in outputs.keys())
# Unpack results
results = outputs['detections']
detections = [[] for _ in range(len(ims_shape))]
detections = [[] for _ in range(len(raw_images))]
for i in range(len(ims)):
inds = np.where(results[:, 0].astype(np.int32) == i)[0]
detections[i // num_scales].append(results[inds, 1:])
for i in range(len(ims_shape)):
detections[i] = \
np.vstack(detections[i]) \
if len(detections[i]) > 1 \
else detections[i][0]
return detections
return [np.vstack(detections[i]) for i in range(len(raw_images))]
def test_net(weights, num_classes, q_in, q_out, device):
......@@ -88,7 +81,7 @@ def test_net(weights, num_classes, q_in, q_out, device):
indices, raw_images = [], []
for i in range(cfg.TEST.IMS_PER_BATCH):
idx, raw_image = q_in.get()
if raw_image is None:
if idx < 0:
must_stop = True
break
indices.append(idx)
......@@ -115,17 +108,16 @@ def test_net(weights, num_classes, q_in, q_out, device):
cls_boxes, cls_scores[:, np.newaxis])) \
.astype(np.float32, copy=False)
if cfg.TEST.USE_SOFT_NMS:
keep = nms_wrapper.soft_nms(
keep = nms_util.soft_nms(
cls_detections,
thresh=cfg.TEST.NMS,
method=cfg.TEST.SOFT_NMS_METHOD,
sigma=cfg.TEST.SOFT_NMS_SIGMA,
)
else:
keep = nms_wrapper.nms(
keep = nms_util.nms(
cls_detections,
thresh=cfg.TEST.NMS,
force_cpu=True,
)
cls_detections = cls_detections[keep, :]
boxes_this_image.append(cls_detections)
......@@ -133,11 +125,7 @@ def test_net(weights, num_classes, q_in, q_out, device):
q_out.put((
indices[i],
{
'im_detect': _t['im_detect'].average_time,
'misc': _t['misc'].average_time,
},
{
'boxes': boxes_this_image,
},
dict([('im_detect', _t['im_detect'].average_time),
('misc', _t['misc'].average_time)]),
dict([('boxes', boxes_this_image)]),
))
......@@ -13,11 +13,8 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from lib.faster_rcnn.anchor_target import AnchorTarget
from lib.faster_rcnn.data_loader import DataLoader
from lib.faster_rcnn.proposal import Proposal
from lib.faster_rcnn.proposal_target import ProposalTarget
from lib.faster_rcnn.utils import generate_grid_anchors
from lib.faster_rcnn.utils import map_blobs_to_outputs
from lib.faster_rcnn.utils import map_rois_to_levels
from lib.faster_rcnn.utils import map_returns_to_blobs
from seetadet.algo.ssd.data_loader import DataLoader
from seetadet.algo.ssd.hard_mining import HardMining
from seetadet.algo.ssd.multibox import MultiBoxMatch
from seetadet.algo.ssd.multibox import MultiBoxTarget
from seetadet.algo.ssd.priorbox import PriorBox
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import multiprocessing as mp
import time
import dragon
import dragon.vm.torch as torch
import numpy as np
from seetadet.algo.ssd import data_transformer
from seetadet.core.config import cfg
from seetadet.datasets.factory import get_dataset
from seetadet.utils import logger
class DataLoader(object):
"""Provide mini-batches of data."""
def __init__(self):
super(DataLoader, self).__init__()
dataset = get_dataset(cfg.TRAIN.DATASET)
if cfg.USE_DALI:
from seetadet.dali import ssd_pipeline as pipe
self.iterator = pipe.new_iterator(dataset.source)
else:
self.iterator = Iterator(**{
'dataset': dataset.cls,
'source': dataset.source,
'classes': dataset.classes,
'shuffle': cfg.TRAIN.USE_SHUFFLE,
'num_chunks': cfg.TRAIN.SHUFFLE_CHUNKS,
'batch_size': cfg.TRAIN.IMS_PER_BATCH * 2,
'num_transformers': cfg.TRAIN.NUM_THREADS - 1,
})
def __call__(self):
outputs = self.iterator.next()
if isinstance(outputs['data'], np.ndarray):
outputs['data'] = torch.from_numpy(outputs['data'])
return outputs
class Iterator(object):
"""Iterator to return the batch of data."""
def __init__(self, **kwargs):
super(Iterator, self).__init__()
# Distributed settings
rank, group_size = 0, 1
process_group = dragon.distributed.get_group()
if process_group is not None and \
kwargs.get('phase', 'TRAIN') == 'TRAIN':
group_size = process_group.size
rank = dragon.distributed.get_rank(process_group)
# Configuration
self._prefetch = kwargs.get('prefetch', 5)
self._batch_size = kwargs.get('batch_size', 32)
self._num_readers = kwargs.get('num_readers', 1)
self._num_transformers = kwargs.get('num_transformers', 3)
# Initialize queues
num_batches = self._prefetch * self._num_readers
self.q_in = mp.Queue(num_batches * self._batch_size)
self.q_out = mp.Queue(num_batches * self._batch_size)
# Initialize readers
self._readers = []
for i in range(self._num_readers):
part_idx, num_parts = i, self._num_readers
num_parts *= group_size
part_idx += rank * self._num_readers
self._readers.append(dragon.io.DataReader(
part_idx=part_idx, num_parts=num_parts, **kwargs))
self._readers[i]._seed += part_idx
self._readers[i].q_out = self.q_in
self._readers[i].start()
time.sleep(0.1)
# Initialize transformers
self._transformers = []
for i in range(self._num_transformers):
p = data_transformer.DataTransformer(**kwargs)
p._seed += (i + rank * self._num_transformers)
p.q_in, p.q_out = self.q_in, self.q_out
p.start()
self._transformers.append(p)
time.sleep(0.1)
# Register cleanup callbacks
def cleanup():
def terminate(processes):
for p in processes:
p.terminate()
p.join()
terminate(self._transformers)
logger.info('Terminate DataTransformer.')
terminate(self._readers)
logger.info('Terminate DataReader.')
import atexit
atexit.register(cleanup)
def next(self):
"""Return the next batch of data."""
return self.__next__()
def __iter__(self):
"""Return the iterator self."""
return self
def __next__(self):
"""Return the next batch of data."""
n = cfg.TRAIN.IMS_PER_BATCH
h = w = cfg.TRAIN.SCALES[0]
boxes_to_pack = []
image, boxes = self.q_out.get()
images = np.zeros((n, h, w, 3), image.dtype)
for i in range(n):
images[i] = image
gt_boxes = np.zeros((boxes.shape[0], boxes.shape[1] + 1), 'float32')
gt_boxes[:, :boxes.shape[1]], gt_boxes[:, -1] = boxes, i
boxes_to_pack.append(gt_boxes)
if i != (cfg.TRAIN.IMS_PER_BATCH - 1):
image, boxes = self.q_out.get()
boxes_to_pack = np.concatenate(boxes_to_pack)
return {'data': images, 'gt_boxes': boxes_to_pack}
......@@ -14,19 +14,18 @@ from __future__ import division
from __future__ import print_function
import multiprocessing
import cv2
import numpy as np
from lib.core.config import cfg
from lib.datasets.example import Example
from lib.ssd import transforms
from lib.utils import boxes as box_util
from seetadet.algo.ssd import transforms
from seetadet.core.config import cfg
from seetadet.datasets.example import Example
from seetadet.utils import boxes as box_util
class DataTransformer(multiprocessing.Process):
def __init__(self, **kwargs):
super(DataTransformer, self).__init__()
self._scale = cfg.TRAIN.SCALES[0]
self._seed = cfg.RNG_SEED
self._mirror = cfg.TRAIN.USE_FLIPPED
self._use_diff = cfg.TRAIN.USE_DIFF
......@@ -107,14 +106,15 @@ class DataTransformer(multiprocessing.Process):
gt_boxes = np.empty((roi_dict['gt_classes'].size, 5), 'float32')
gt_boxes[:, :4], gt_boxes[:, 4] = roi_dict['boxes'], roi_dict['gt_classes']
if len(gt_boxes) == 0:
# Return early for images without objects
return img, gt_boxes
# Distort => Expand => Sample => Resize
img, gt_boxes = self.augment_image(img, gt_boxes)
# Restore to the blob scale
gt_boxes[:, 0] *= cfg.SSD.RESIZE.WIDTH
gt_boxes[:, 1] *= cfg.SSD.RESIZE.HEIGHT
gt_boxes[:, 2] *= cfg.SSD.RESIZE.WIDTH
gt_boxes[:, 3] *= cfg.SSD.RESIZE.HEIGHT
gt_boxes[:, :4] *= self._scale
# Post-Process for image
if img.dtype == 'uint16':
......
......@@ -15,29 +15,25 @@ from __future__ import print_function
import numpy as np
from lib.core.config import cfg
from lib.utils.framework import new_tensor
from seetadet.core.config import cfg
from seetadet.utils.env import new_tensor
class HardMining(object):
def __call__(self, prob_wide, labels_wide, overlaps_wide):
prob_wide = prob_wide.numpy(True)
def __call__(self, prob, labels, overlaps):
label_shape, label_size = labels.shape, labels.size
prob = prob.numpy().reshape((label_size, -1))
labels, overlaps = labels.flatten(), overlaps.flatten()
neg_ovr = cfg.SSD.OHEM.NEG_OVERLAP
neg_ratio = cfg.SSD.OHEM.NEG_POS_RATIO
# label ``-1`` will be ignored
new_labels_wide = -np.ones(labels_wide.shape, 'int64')
for ix in range(labels_wide.shape[0]):
labels = labels_wide[ix]
overlaps = overlaps_wide[ix]
prob = prob_wide[ix]
loss = np.zeros(labels.shape, 'float32')
inds = np.where(labels >= 0)[0]
loss[inds] = -np.log(
new_labels = -np.ones(labels.shape, 'int64')
cls_loss = -np.log(
np.maximum(
prob[inds, labels[inds]],
prob[np.arange(label_size), labels],
np.finfo(float).eps,
)
)
......@@ -50,12 +46,12 @@ class HardMining(object):
neg_inds = neg_inds[eligible_neg_inds]
# Apply mining on negatives
neg_loss = loss[neg_inds]
neg_cls_loss = cls_loss[neg_inds]
num_pos, num_neg = len(fg_inds), len(neg_inds)
num_bg = min(int(num_pos * neg_ratio), num_neg)
bg_inds = neg_inds[np.argsort(-neg_loss)][:num_bg]
new_labels_wide[ix][fg_inds] = labels[fg_inds] # Keep fg indices
new_labels_wide[ix][bg_inds] = 0 # Use hard negatives as bg indices
bg_inds = neg_inds[np.argsort(-neg_cls_loss)][:num_bg]
new_labels[fg_inds] = labels[fg_inds] # Keep fg indices
new_labels[bg_inds] = 0 # Use hard negatives as bg indices
# Feed labels to compute cls loss
return {'labels': new_tensor(new_labels_wide)}
return {'labels': new_tensor(new_labels.reshape(label_shape))}
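The mining policy reads more clearly on a toy batch; a self-contained NumPy sketch of the same keep-positives, top-loss-negatives selection:

```python
import numpy as np

# Toy of the OHEM selection above: keep all positives, plus the
# highest-loss negatives up to NEG_POS_RATIO times the positive count.
np.random.seed(3)
labels = np.array([2, 0, 0, 0, 0, 1])      # 2 positives, 4 negatives
cls_loss = np.random.rand(6)               # per-anchor classification loss
neg_ratio = 1.0

fg_inds = np.where(labels > 0)[0]
neg_inds = np.where(labels == 0)[0]
num_bg = min(int(len(fg_inds) * neg_ratio), len(neg_inds))
bg_inds = neg_inds[np.argsort(-cls_loss[neg_inds])][:num_bg]

new_labels = -np.ones_like(labels)         # -1 is ignored by the loss
new_labels[fg_inds] = labels[fg_inds]      # keep foreground labels
new_labels[bg_inds] = 0                    # hard negatives become background
print(new_labels)
```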
......@@ -15,9 +15,9 @@ from __future__ import print_function
import numpy as np
from lib.core.config import cfg
from lib.utils import boxes as box_util
from lib.utils.framework import new_tensor
from seetadet.core.config import cfg
from seetadet.utils import boxes as box_util
from seetadet.utils.env import new_tensor
class MultiBoxMatch(object):
......@@ -47,8 +47,8 @@ class MultiBoxMatch(object):
# Bipartite matching and assignments
bipartite_inds = overlaps.argmax(0)
class_assignment = gt_boxes[:, -1]
match_inds_wide[ix][bipartite_inds] = np.arange(num_gt, dtype='int32')
match_labels_wide[ix][bipartite_inds] = class_assignment
match_inds_wide[ix, bipartite_inds] = np.arange(num_gt, dtype='int32')
match_labels_wide[ix, bipartite_inds] = class_assignment
# Per prediction matching and assignments
# Note that SSD matches each prior box only once
......@@ -56,8 +56,8 @@ class MultiBoxMatch(object):
per_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0]
gt_assignment = argmax_overlaps[per_inds]
class_assignment = gt_boxes[gt_assignment, -1]
match_inds_wide[ix][per_inds] = gt_assignment
match_labels_wide[ix][per_inds] = class_assignment
match_inds_wide[ix, per_inds] = gt_assignment
match_labels_wide[ix, per_inds] = class_assignment
return {
'match_inds': match_inds_wide,
......@@ -82,15 +82,7 @@ class MultiBoxTarget(object):
num_priors, box_dim = prior_boxes.shape[:]
gt_boxes_wide = box_util.dismantle_boxes(gt_boxes, num_images)
bbox_targets_wide = np.zeros((num_images, num_priors, box_dim), 'float32')
bbox_inside_weights_wide = np.zeros(bbox_targets_wide.shape, 'float32')
bbox_outside_weights_wide = np.zeros(bbox_targets_wide.shape, 'float32')
# Number of matched boxes (#positive)
n_pos = float(max(len(np.where(match_labels_wide > 0)[0]), 1))
# Multiply by the number of images to compensate for the smooth L1 loss
bbox_reg_weight = cfg.SSD.BBOX_REG_WEIGHT * num_images / n_pos
bbox_indices_wide, bbox_anchors_wide, bbox_targets_wide = [], [], []
for ix in range(num_images):
gt_boxes = gt_boxes_wide[ix]
......@@ -106,17 +98,18 @@ class MultiBoxTarget(object):
gt_rois = gt_boxes[gt_assignment]
# Assign bbox targets
bbox_targets_wide[ix][ex_inds] = \
bbox_anchors_wide.append(ex_rois)
bbox_indices_wide.append(ex_inds + (num_priors * ix))
bbox_targets_wide.append(
box_util.bbox_transform(
ex_rois,
gt_rois,
cfg.BBOX_REG_WEIGHTS,
)
bbox_inside_weights_wide[ix, :] = 1.
bbox_outside_weights_wide[ix][ex_inds] = bbox_reg_weight
)
return {
'bbox_targets': new_tensor(bbox_targets_wide),
'bbox_inside_weights': new_tensor(bbox_inside_weights_wide),
'bbox_outside_weights': new_tensor(bbox_outside_weights_wide),
'bbox_indices': new_tensor(np.concatenate(bbox_indices_wide)),
'bbox_anchors': new_tensor(np.concatenate(bbox_anchors_wide).astype('float32')),
'bbox_targets': new_tensor(np.concatenate(bbox_targets_wide).astype('float32')),
}
......@@ -15,9 +15,8 @@ from __future__ import print_function
import numpy as np
from lib.core.config import cfg
from lib.ssd.generate_anchors import generate_anchors
from lib.utils import logger
from seetadet.algo.ssd.generate_anchors import generate_anchors
from seetadet.core.config import cfg
class PriorBox(object):
......@@ -29,8 +28,10 @@ class PriorBox(object):
max_sizes = cfg.SSD.MULTIBOX.MAX_SIZES
if len(max_sizes) > 0:
if len(min_sizes) != len(max_sizes):
logger.fatal('Got {} min sizes and {} max sizes.'.format(
len(min_sizes), len(max_sizes)))
raise ValueError(
'Got {} min sizes and {} max sizes.'
.format(len(min_sizes), len(max_sizes))
)
self.strides = cfg.SSD.MULTIBOX.STRIDES
aspect_ratios = cfg.SSD.MULTIBOX.ASPECT_RATIOS
self.base_anchors = []
......@@ -44,9 +45,14 @@ class PriorBox(object):
aspect_ratios[i],
)
)
self.grid_anchors = None
def __call__(self, features):
all_anchors = []
if self.grid_anchors is not None:
return self.grid_anchors
self.grid_anchors = []
for i in range(len(self.strides)):
# 1. Generate base grids
height, width = features[i].shape[-2:]
......@@ -61,26 +67,17 @@ class PriorBox(object):
# Reshape to (K * A, 4) shifted anchors
A = self.base_anchors[i].shape[0]
D = self.base_anchors[i].shape[1]
if D == 4:
shifts = np.vstack((
shift_x.ravel(),
shift_y.ravel(),
shift_x.ravel(),
shift_y.ravel())
).transpose()
elif D == 5:
shifts = np.vstack((
shift_x.ravel(),
shift_y.ravel(),
shift_x.ravel() * 0,
shift_y.ravel() * 0,
shift_y.ravel() * 0)
).transpose()
else:
raise ValueError('Expected anchor4d or anchor5d.')
K = shifts.shape[0] # K = map_h * map_w
anchors = (self.base_anchors[i].reshape((1, A, D)) +
shifts.reshape((1, K, D)).transpose((1, 0, 2)))
anchors = anchors.reshape((K * A, D)).astype(np.float32)
all_anchors.append(anchors)
return np.concatenate(all_anchors, axis=0)
self.grid_anchors.append(anchors)
self.grid_anchors = np.concatenate(self.grid_anchors)
return self.grid_anchors
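Since SSD always resizes to a fixed square input, the grid anchors are now computed once and cached in self.grid_anchors. A small numpy sketch of the shift trick above, assuming one level with stride 8, a 2x3 feature map, and a single 4-d base anchor:

```python
import numpy as np

stride, height, width = 8, 2, 3
base_anchors = np.array([[-8., -8., 8., 8.]], 'float32')  # A=1, D=4

shift_x = np.arange(width) * stride
shift_y = np.arange(height) * stride
shift_x, shift_y = np.meshgrid(shift_x, shift_y)
shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
                    shift_x.ravel(), shift_y.ravel())).transpose()

A, D = base_anchors.shape
K = shifts.shape[0]  # K = height * width grid positions
anchors = (base_anchors.reshape((1, A, D)) +
           shifts.reshape((1, K, D)).transpose((1, 0, 2)))
anchors = anchors.reshape((K * A, D)).astype('float32')
print(anchors.shape)  # (6, 4): one anchor per grid cell
```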
......@@ -13,26 +13,30 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import types
import cv2
import dragon.vm.torch as torch
import numpy as np
from lib.core.config import cfg
from lib.modeling.detector import new_detector
from lib.nms import nms_wrapper
from lib.utils import boxes as box_util
from lib.utils import framework
from lib.utils import time_util
from seetadet.core.config import cfg
from seetadet.modeling.detector import new_detector
from seetadet.utils import boxes as box_util
from seetadet.utils import nms as nms_util
from seetadet.utils import time_util
def get_images(ims):
target_h = cfg.SSD.RESIZE.HEIGHT
target_w = cfg.SSD.RESIZE.WIDTH
out_size = cfg.TEST.SCALES[0]
processed_ims, im_scales = [], []
for im in ims:
im_scales.append((float(target_h) / im.shape[0],
float(target_w) / im.shape[1]))
processed_ims.append(cv2.resize(im, (target_w, target_h)))
im_scales.append((float(out_size) / im.shape[0],
float(out_size) / im.shape[1]))
processed_ims.append(
cv2.resize(
im, (out_size, out_size),
interpolation=cv2.INTER_AREA,
))
if ims[0].dtype == 'uint16':
ims_blob = np.array(processed_ims, dtype='float32') / 256.
else:
......@@ -45,34 +49,33 @@ def ims_detect(detector, ims):
data, im_scales = get_images(ims)
# Do Forward
if not hasattr(detector, 'graph'):
with framework.new_workspace().as_default():
with torch.no_grad():
with torch.jit.Tracer(retain_ops=True):
inputs = {'data': torch.from_numpy(data)}
outputs = detector.forward(inputs)
detector.graph = \
framework.Graph(inputs, {
'cls_prob': outputs['cls_prob'],
'bbox_pred': outputs['bbox_pred']
}, {'prior_boxes': outputs['prior_boxes']})
outputs = detector.graph(data=data)
data = torch.from_numpy(data)
if not hasattr(detector, 'script_forward'):
def script_forward(self, data):
return self.forward({'data': data})
detector.script_forward = torch.jit.trace(
func=types.MethodType(script_forward, detector),
example_inputs=[data],
)
outputs = detector.script_forward(data)
cls_prob = outputs['cls_prob'].numpy()
bbox_pred = outputs['bbox_pred'].numpy()
# Decode results
batch_boxes = []
for i in range(len(im_scales)):
boxes = box_util.bbox_transform_inv(
outputs['prior_boxes'],
outputs['bbox_pred'][i],
bbox_pred[i],
cfg.BBOX_REG_WEIGHTS,
)
boxes[:, 0] /= im_scales[i][1]
boxes[:, 1] /= im_scales[i][0]
boxes[:, 2] /= im_scales[i][1]
boxes[:, 3] /= im_scales[i][0]
boxes[:, 0::2] /= im_scales[i][1]
boxes[:, 1::2] /= im_scales[i][0]
batch_boxes.append(box_util.clip_boxes(boxes, ims[i].shape))
return outputs['cls_prob'], batch_boxes
return cls_prob, batch_boxes
def test_net(weights, num_classes, q_in, q_out, device):
......@@ -88,7 +91,7 @@ def test_net(weights, num_classes, q_in, q_out, device):
indices, raw_images = [], []
for i in range(cfg.TEST.IMS_PER_BATCH):
idx, raw_image = q_in.get()
if raw_image is None:
if idx < 0:
must_stop = True
break
indices.append(idx)
......@@ -116,17 +119,16 @@ def test_net(weights, num_classes, q_in, q_out, device):
(cls_boxes, cls_scores[:, np.newaxis])) \
.astype(np.float32, copy=False)
if cfg.TEST.USE_SOFT_NMS:
keep = nms_wrapper.soft_nms(
keep = nms_util.soft_nms(
cls_detections,
thresh=cfg.TEST.NMS,
method=cfg.TEST.SOFT_NMS_METHOD,
sigma=cfg.TEST.SOFT_NMS_SIGMA,
)
else:
keep = nms_wrapper.nms(
keep = nms_util.nms(
cls_detections,
thresh=cfg.TEST.NMS,
force_cpu=True,
)
cls_detections = cls_detections[keep, :]
boxes_this_image.append(cls_detections)
......@@ -134,11 +136,7 @@ def test_net(weights, num_classes, q_in, q_out, device):
q_out.put((
indices[i],
{
'im_detect': _t['im_detect'].average_time,
'misc': _t['misc'].average_time,
},
{
'boxes': boxes_this_image,
},
dict([('im_detect', _t['im_detect'].average_time),
('misc', _t['misc'].average_time)]),
dict([('boxes', boxes_this_image)]),
))
......@@ -22,9 +22,10 @@ import PIL.ImageEnhance
import numpy as np
import numpy.random as npr
from lib.core.config import cfg
from lib.utils import boxes as box_util
from lib.utils import logger
from seetadet.core.config import cfg
from seetadet.utils import boxes as box_util
from seetadet.utils import boxes_v2 as box_util_v2
from seetadet.utils import logger
class Compose(object):
......@@ -40,43 +41,35 @@ class Compose(object):
class Distort(object):
def __init__(self):
self._brightness_prob = cfg.SSD.DISTORT.BRIGHTNESS_PROB
self._contrast_prob = cfg.SSD.DISTORT.CONTRAST_PROB
self._saturation_prob = cfg.SSD.DISTORT.SATURATION_PROB
self._prob = 0.5
self._transforms = [
(PIL.ImageEnhance.Brightness, self._prob),
(PIL.ImageEnhance.Contrast, self._prob),
(PIL.ImageEnhance.Color, self._prob),
]
def apply(self, img, boxes=None):
if self._prob > 0:
img = PIL.Image.fromarray(img)
transforms = [
(PIL.ImageEnhance.Brightness, self._brightness_prob),
(PIL.ImageEnhance.Contrast, self._contrast_prob),
(PIL.ImageEnhance.Color, self._saturation_prob),
]
np.random.shuffle(transforms)
for transform_fn, prob in transforms:
for transform_fn, prob in self._transforms:
if npr.uniform() < prob:
img = transform_fn(img)
img = img.enhance(1. + npr.uniform(-.4, .4))
return np.array(img), boxes
return img, boxes
class Expand(object):
def __init__(self):
self._expand_prob = cfg.SSD.EXPAND.PROB
self._max_ratio = cfg.SSD.EXPAND.MAX_RATIO
if self._max_ratio < 1.0:
logger.fatal(
'The max expand ratio must be >= 1, got {}'
.format(self._max_ratio)
)
self._max_ratio = 1. / cfg.TRAIN.RANDOM_SCALES[0]
self._expand_prob = 0.5 if self._max_ratio > 1 else 0
def apply(self, img, boxes=None):
prob = npr.uniform()
if prob > self._expand_prob:
return img, boxes
ratio = npr.uniform(1., self._max_ratio)
if ratio == 1:
return img, boxes
ratio = npr.uniform(1., self._max_ratio)
im_h, im_w = img.shape[:2]
expand_h, expand_w = int(im_h * ratio), int(im_w * ratio)
h_off = int(math.floor(npr.uniform(0., expand_h - im_h)))
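A runnable sketch of the expansion above, assuming boxes normalized to [0, 1] and a canvas filled with hypothetical mean-pixel values:

```python
import math
import numpy as np
import numpy.random as npr

img = np.zeros((300, 300, 3), 'uint8')
boxes = np.array([[0.2, 0.2, 0.6, 0.6]], 'float32')

ratio = npr.uniform(1., 4.)  # 4 = 1 / RANDOM_SCALES[0] for scales [0.25, 1]
im_h, im_w = img.shape[:2]
expand_h, expand_w = int(im_h * ratio), int(im_w * ratio)
h_off = int(math.floor(npr.uniform(0., expand_h - im_h)))
w_off = int(math.floor(npr.uniform(0., expand_w - im_w)))

canvas = np.empty((expand_h, expand_w, 3), 'uint8')
canvas[:] = (102, 115, 122)  # stand-in for cfg.PIXEL_MEANS
canvas[h_off:h_off + im_h, w_off:w_off + im_w] = img

# Re-normalize the boxes against the expanded canvas.
boxes[:, 0::2] = (boxes[:, 0::2] * im_w + w_off) / expand_w
boxes[:, 1::2] = (boxes[:, 1::2] * im_h + h_off) / expand_h
```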
......@@ -99,19 +92,14 @@ class Expand(object):
class Resize(object):
def __init__(self):
self._target_size = (
cfg.SSD.RESIZE.WIDTH,
cfg.SSD.RESIZE.HEIGHT,
)
interp_list = {
'LINEAR': cv2.INTER_LINEAR,
'AREA': cv2.INTER_AREA,
'NEAREST': cv2.INTER_NEAREST,
'CUBIC': cv2.INTER_CUBIC,
'LANCZOS4': cv2.INTER_LANCZOS4,
}
interp_mode = cfg.SSD.RESIZE.INTERP_MODE
self._interp_mode = [interp_list[key] for key in interp_mode]
self._target_size = (cfg.TRAIN.SCALES[0],) * 2
self._interp_mode = [
cv2.INTER_LINEAR,
cv2.INTER_AREA,
cv2.INTER_NEAREST,
cv2.INTER_CUBIC,
cv2.INTER_LANCZOS4,
]
def apply(self, img, boxes):
rand = npr.randint(len(self._interp_mode))
......@@ -144,7 +132,10 @@ class Sample(object):
@classmethod
def _compute_overlaps(cls, rand_box, gt_boxes):
return box_util.iou(np.expand_dims(rand_box, 0), gt_boxes[:, 0:4])
return box_util_v2.iou(
np.expand_dims(rand_box, 0),
gt_boxes[:, 0:4],
)
@classmethod
def _generate_sample(cls, sample_param):
......@@ -162,18 +153,27 @@ class Sample(object):
h_off = npr.uniform(0., 1. - bbox_h)
return np.array([w_off, h_off, w_off + bbox_w, h_off + bbox_h])
def _check_satisfy(self, sample_box, gt_boxes, constraint):
def _check_center(self, sample_box, gt_boxes):
ctr_x = (gt_boxes[:, 2] + gt_boxes[:, 0]) / 2.0
ctr_y = (gt_boxes[:, 3] + gt_boxes[:, 1]) / 2.0
# Keep the ground-truth box whose center is in the sample box
# Implement ``EmitConstraint.CENTER`` in the original SSD
keep_inds = np.where((ctr_x >= sample_box[0]) & (ctr_x <= sample_box[2]) &
(ctr_y >= sample_box[1]) & (ctr_y <= sample_box[3]))[0]
return len(keep_inds) > 0
def _check_overlap(self, sample_box, gt_boxes, constraint):
min_overlap = constraint.get('min_overlap', None)
max_overlap = constraint.get('max_overlap', None)
if min_overlap is None and \
max_overlap is None:
return True
max_overlap = self._compute_overlaps(sample_box, gt_boxes).max()
ovr = self._compute_overlaps(sample_box, gt_boxes).max()
if min_overlap is not None:
if max_overlap < min_overlap:
if ovr < min_overlap:
return False
if max_overlap is not None:
if max_overlap > max_overlap:
if ovr > max_overlap:
return False
return True
......@@ -187,8 +187,9 @@ class Sample(object):
sample_box = self._generate_sample(sampler)
if sampler['min_overlap'] != 0. or \
sampler['max_overlap'] != 1.:
ok = self._check_satisfy(sample_box, gt_boxes, sampler)
if not ok:
if not self._check_overlap(sample_box, gt_boxes, sampler):
continue
if not self._check_center(sample_box, gt_boxes):
continue
found += 1
sample_boxes.append(sample_box)
......@@ -206,8 +207,6 @@ class Sample(object):
if gt_boxes is not None:
ctr_x = (gt_boxes[:, 2] + gt_boxes[:, 0]) / 2.0
ctr_y = (gt_boxes[:, 3] + gt_boxes[:, 1]) / 2.0
# Keep the ground-truth box whose center is in the sample box
# Implement ``EmitConstraint.CENTER`` in the original SSD
keep_inds = np.where((ctr_x >= rand_box[0]) & (ctr_x <= rand_box[2]) &
(ctr_y >= rand_box[1]) & (ctr_y <= rand_box[3]))[0]
gt_boxes = gt_boxes[keep_inds]
......
......@@ -19,11 +19,14 @@ sys.path.append('../../')
import cv2
import numpy as np
from lib.ssd import transforms
from seetadet.algo.ssd import transforms
from seetadet.core.config import cfg
if __name__ == '__main__':
np.random.seed(3)
cfg.TRAIN.SCALES = [300]
cfg.TRAIN.RANDOM_SCALES = [0.25, 1.00]
augmentor = transforms.Compose(
transforms.Distort(),
......@@ -36,8 +39,6 @@ if __name__ == '__main__':
img = cv2.imread('cat.jpg')
boxes = np.array([[0.33, 0.04, 0.71, 0.98]], dtype=np.float32)
img, boxes = augmentor(img, boxes)
if len(boxes) < 1:
continue
for box in boxes:
x1 = int(box[0] * img.shape[1])
y1 = int(box[1] * img.shape[0])
......
......@@ -20,7 +20,7 @@ from __future__ import print_function
import os.path as osp
import numpy as np
from lib.utils.attrdict import AttrDict
from seetadet.utils.attrdict import AttrDict
cfg = __C = AttrDict()
......@@ -38,41 +38,27 @@ __C.TRAIN = AttrDict()
# Initialize network with weights from this file
__C.TRAIN.WEIGHTS = ''
# Database to train
__C.TRAIN.DATABASE = ''
# Dataset to train
__C.TRAIN.DATASET = ''
# The number of workers to transform data
__C.TRAIN.NUM_WORKERS = 3
# The number of threads to load train data
__C.TRAIN.NUM_THREADS = 4
# Scales to use during training (can list multiple scales)
# Each scale is the pixel size of an image's shortest side
__C.TRAIN.SCALES = (600,)
__C.TRAIN.SCALES = (300,)
# Max pixel size of the longest side of a scaled input image
# A square will be used if value < 1
__C.TRAIN.MAX_SIZE = 1000
__C.TRAIN.MAX_SIZE = 0
# Images to use per mini-batch
__C.TRAIN.IMS_PER_BATCH = 1
# Minibatch size (number of regions of interest [ROIs])
__C.TRAIN.BATCH_SIZE = 128
# Fraction of minibatch that is labeled foreground (i.e. class > 0)
__C.TRAIN.FG_FRACTION = 0.25
# Overlap threshold for a ROI to be considered foreground (if >= FG_THRESH)
__C.TRAIN.FG_THRESH = 0.5
# Overlap threshold for a ROI to be considered background (class = 0 if
# overlap in [LO, HI))
__C.TRAIN.BG_THRESH_HI = 0.5
__C.TRAIN.BG_THRESH_LO = 0.0
# Use shuffle after each epoch
# Use shuffled images during training?
__C.TRAIN.USE_SHUFFLE = True
# The number of chunks to shuffle
__C.TRAIN.NUM_SHUFFLE_CHUNKS = 0
# The number of shuffle chunks
__C.TRAIN.SHUFFLE_CHUNKS = 0
# Use horizontally-flipped images during training?
__C.TRAIN.USE_FLIPPED = True
......@@ -80,17 +66,25 @@ __C.TRAIN.USE_FLIPPED = True
# Use the difficult(under occlusion) objects
__C.TRAIN.USE_DIFF = True
# Overlap required between a ROI and ground-truth box in order for that ROI to
# be used as a bounding-box regression training example
__C.TRAIN.BBOX_THRESH = 0.5
# If True, randomly scale the image by scale range
__C.TRAIN.USE_SCALE_JITTER = False
__C.TRAIN.SCALE_JITTER_RANGE = [0.75, 1.0]
# Range to jitter the image scales
__C.TRAIN.RANDOM_SCALES = [1., 1.]
# If True, randomly distort the image by brightness, contrast, and saturation
__C.TRAIN.USE_COLOR_JITTER = False
# Mini-batch size (#RoIs) for two stage detector
__C.TRAIN.BATCH_SIZE = 128
# Overlap threshold for a ROI to be considered foreground (if >= FG_THRESH)
__C.TRAIN.FG_THRESH = 0.5
# Fraction of mini-batch that is labeled foreground (i.e. class > 0)
__C.TRAIN.FG_FRACTION = 0.25
# Overlap threshold for a ROI to be considered background (class = 0 if
# overlap in [LO, HI))
__C.TRAIN.BG_THRESH_HI = 0.5
__C.TRAIN.BG_THRESH_LO = 0.0
# IOU >= thresh: positive example
__C.TRAIN.RPN_POSITIVE_OVERLAP = 0.7
# IOU < thresh: negative example
......@@ -123,20 +117,19 @@ __C.TRAIN.RPN_STRADDLE_THRESH = 0
__C.TEST = AttrDict()
# Database to test
__C.TEST.DATABASE = ''
# Dataset to test
__C.TEST.DATASET = ''
# Original json ground-truth file to use
# If not set, records in the dataset file will be used instead
__C.TEST.JSON_FILE = ''
# Scales to use during testing (can list multiple scales)
# Each scale is the pixel size of an image's shortest side
__C.TEST.SCALES = (600,)
__C.TEST.SCALES = (300,)
# Max pixel size of the longest side of a scaled input image
# A square will be used if value < 1
__C.TEST.MAX_SIZE = 1000
__C.TEST.MAX_SIZE = 0
# Images to use per mini-batch
__C.TEST.IMS_PER_BATCH = 1
......@@ -217,10 +210,20 @@ __C.MODEL.CLASSES = ['__background__']
# The value of ``K`` is usually set to 2
__C.MODEL.FREEZE_AT = 2
# The variant of ReLU activation
# ('ReLU', 'ReLU6')
__C.MODEL.RELU_VARIANT = 'ReLU'
# Setting of focal loss
__C.MODEL.FOCAL_LOSS_ALPHA = 0.25
__C.MODEL.FOCAL_LOSS_GAMMA = 2.0
# The optional loss for bbox regression
# ('NORM', 'IOU')
__C.MODEL.REG_LOSS_TYPE = 'NORM'
# Weight for bbox regression loss
__C.MODEL.REG_LOSS_WEIGHT = 1.
# Stride of the coarsest Feature level
# This is needed so the input can be padded properly
__C.MODEL.COARSEST_STRIDE = 32
......@@ -268,9 +271,6 @@ __C.RETINANET.ANCHOR_SCALE = 4
# NOTE: this doesn't include the last conv for logits
__C.RETINANET.NUM_CONVS = 4
# Weight for bbox regression loss
__C.RETINANET.BBOX_REG_WEIGHT = 1.
# During inference, #locs to select based on cls score before NMS is performed
__C.RETINANET.PRE_NMS_TOP_N = 5000
......@@ -362,9 +362,6 @@ __C.SSD = AttrDict()
# NOTE: this doesn't include the last conv for logits
__C.SSD.NUM_CONVS = 0
# Weight for bbox regression loss
__C.SSD.BBOX_REG_WEIGHT = 1.
# MultiBox configs
__C.SSD.MULTIBOX = AttrDict()
__C.SSD.MULTIBOX.STRIDES = []
......@@ -379,23 +376,6 @@ __C.SSD.OHEM.NEG_OVERLAP = 0.5
# The ratio used in hard example mining
__C.SSD.OHEM.NEG_POS_RATIO = 3.0
# Distort the image?
__C.SSD.DISTORT = AttrDict()
__C.SSD.DISTORT.BRIGHTNESS_PROB = 0.5
__C.SSD.DISTORT.CONTRAST_PROB = 0.5
__C.SSD.DISTORT.SATURATION_PROB = 0.5
# Expand the image?
__C.SSD.EXPAND = AttrDict()
__C.SSD.EXPAND.PROB = 0.5
__C.SSD.EXPAND.MAX_RATIO = 4.0
# Resize the image?
__C.SSD.RESIZE = AttrDict()
__C.SSD.RESIZE.HEIGHT = 300
__C.SSD.RESIZE.WIDTH = 300
__C.SSD.RESIZE.INTERP_MODE = ['LINEAR', 'AREA', 'NEAREST', 'CUBIC', 'LANCZOS4']
# Samplers
# Format as (min_scale, max_scale,
# min_aspect_ratio, max_aspect_ratio,
......@@ -486,7 +466,7 @@ __C.SOLVER.LR_POLICY = 'steps_with_decay'
# Momentum to use with SGD
__C.SOLVER.MOMENTUM = 0.9
# L2 regularization hyper parameters
# L2 regularization for weight parameters
__C.SOLVER.WEIGHT_DECAY = 0.0001
# L2 norm factor for clipping gradients
__C.SOLVER.CLIP_NORM = -1.0
......@@ -505,6 +485,9 @@ __C.NUM_GPUS = 1
# Use NCCL for all reduce, otherwise use cuda-aware mpi
__C.USE_NCCL = True
# Use DALI to load the batch of data instead of the original pipeline
__C.USE_DALI = False
# Hosts for Inter-Machine communication
__C.HOSTS = []
......@@ -531,9 +514,6 @@ __C.DATA_DIR = osp.abspath(osp.join(__C.ROOT_DIR, 'data'))
# Place outputs under an experiments directory
__C.EXP_DIR = ''
# Use GPU implementation of non-maximum suppression
__C.USE_GPU_NMS = True
# Default GPU device id
__C.GPU_ID = 0
......
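A hedged sketch of wiring the renamed and newly-added keys from Python; real experiments would set them through a YAML file under SeetaDet/configs, and the dataset path below is hypothetical:

```python
from seetadet.core.config import cfg

cfg.TRAIN.DATASET = 'default:///data/voc_trainval'  # hypothetical record path
cfg.TRAIN.SCALES = (300,)              # square SSD input, with MAX_SIZE = 0
cfg.TRAIN.RANDOM_SCALES = [0.25, 1.0]  # replaces SCALE_JITTER_RANGE
cfg.MODEL.REG_LOSS_TYPE = 'IOU'        # or 'NORM' for smooth L1
cfg.USE_DALI = True                    # switch to the DALI data pipeline
```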
......@@ -18,8 +18,8 @@ import shutil
import time
import numpy as np
from lib.core.config import cfg
from lib.core.config import cfg_from_file
from seetadet.core.config import cfg
from seetadet.core.config import cfg_from_file
class Coordinator(object):
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import functools
class Registry(object):
"""The base registry class."""
def __init__(self, name):
self._name = name
self._registry = collections.OrderedDict()
def has(self, key):
return key in self._registry
def register(self, name, func=None, **kwargs):
def decorated(inner_function):
for key in (name if isinstance(
name, (tuple, list)) else [name]):
if self.has(key):
raise KeyError(
'`%s` has been registered in %s.'
% (key, self._name)
)
self._registry[key] = functools.partial(
inner_function, **kwargs)
# Return the function so decorator usage keeps the name bound
return inner_function
if func is not None:
return decorated(func)
return decorated
def get(self, name):
if not self.has(name):
raise KeyError(
"`%s` is not registered in <%s>."
% (name, self._name)
)
return self._registry[name]
def try_get(self, name):
if self.has(name):
return self.get(name)
return None
backbones = Registry('backbones')
models = Registry('models')
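A small usage sketch for the registry; the names below are illustrative, mirroring how backbones are registered elsewhere in this commit:

```python
@backbones.register('toy_net')
def toy_net():
    return 'built-toy-net'

# Aliases can share one builder via a list of keys.
backbones.register(['toy18', 'toy_18'], func=toy_net)

assert backbones.has('toy_net')
print(backbones.get('toy18')())  # -> 'built-toy-net'
```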
......@@ -20,9 +20,9 @@ import os
import cv2
import dragon
from lib.core.config import cfg
from lib.datasets.example import Example
from lib.datasets.factory import get_imdb
from seetadet.core.config import cfg
from seetadet.datasets.example import Example
from seetadet.datasets.factory import get_dataset
class _Server(object):
......@@ -50,13 +50,13 @@ class _Server(object):
class TestServer(_Server):
def __init__(self, output_dir):
super(TestServer, self).__init__(output_dir)
self.imdb = get_imdb(cfg.TEST.DATABASE)
self.imdb.competition_mode(cfg.TEST.COMPETITION_MODE)
self.classes = self.imdb.classes
self.num_images = self.imdb.num_images
self.num_classes = self.imdb.num_classes
self.dataset = get_dataset(cfg.TEST.DATASET)
self.dataset.competition_mode(cfg.TEST.COMPETITION_MODE)
self.classes = self.dataset.classes
self.num_images = self.dataset.num_images
self.num_classes = self.dataset.num_classes
self.data_reader = dragon.io.DataReader(
dataset=lambda: dragon.io.SeetaRecordDataset(self.imdb.source))
dataset=self.dataset.cls, source=self.dataset.source)
self.data_reader.q_out = mp.Queue(cfg.TEST.IMS_PER_BATCH * 5)
self.data_reader.start()
self.gt_recs = collections.OrderedDict()
......@@ -81,16 +81,16 @@ class TestServer(_Server):
def evaluate_detections(self, all_boxes):
if cfg.TEST.PROTOCOL == 'dump':
self.imdb.dump_detections(all_boxes, self.output_dir)
self.dataset.dump_detections(all_boxes, self.output_dir)
else:
self.imdb.evaluate_detections(
self.dataset.evaluate_detections(
all_boxes,
self.get_records(),
self.output_dir,
)
def evaluate_segmentations(self, all_boxes, all_masks):
self.imdb.evaluate_segmentations(
self.dataset.evaluate_segmentations(
all_boxes,
all_masks,
self.get_records(),
......@@ -101,7 +101,7 @@ class TestServer(_Server):
class InferServer(_Server):
def __init__(self, output_dir):
super(InferServer, self).__init__(output_dir)
self.images_dir = cfg.TEST.DATABASE
self.images_dir = cfg.TEST.DATASET
self.images = os.listdir(self.images_dir)
self.classes = cfg.MODEL.CLASSES
self.num_images = len(self.images)
......
......@@ -18,9 +18,9 @@ import multiprocessing
import numpy as np
from lib.core.config import cfg
from lib.utils import time_util
from lib.utils.vis import vis_one_image
from seetadet.core.config import cfg
from seetadet.utils import time_util
from seetadet.utils.vis import vis_one_image
def run_test_net(checkpoint, server, devices):
......@@ -30,8 +30,8 @@ def run_test_net(checkpoint, server, devices):
devices = devices if devices else [cfg.GPU_ID]
num_workers = len(devices)
test_fn = importlib.import_module(
'lib.%s.test' % cfg.MODEL.TYPE).test_net
test_module = 'seetadet.algo.%s.test' % cfg.MODEL.TYPE
test_fn = importlib.import_module(test_module).test_net
_t = time_util.new_timers('im_detect', 'mask_detect', 'misc')
......
......@@ -22,11 +22,11 @@ import os
import dragon.vm.torch as torch
from lib.core.config import cfg
from lib.solver.sgd import SGDSolver
from lib.utils import logger
from lib.utils import time_util
from lib.utils.stats import SmoothedValue
from seetadet.core.config import cfg
from seetadet.solver.sgd import SGDSolver
from seetadet.utils import logger
from seetadet.utils import time_util
from seetadet.utils.stats import SmoothedValue
class SolverWrapper(object):
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import dragon.vm.dali as dali
import numpy as np
from seetadet.core.config import cfg
class DataReader(dali.ops.KPLRecordReader):
def __init__(
self,
path,
features,
pipeline,
shard_id=0,
num_shards=1,
shuffle_after_epoch=False,
shuffle_chunks=0,
aspect_grouping=False,
):
super(DataReader, self).__init__(
path=path,
features=features,
pipeline=pipeline,
shard_id=shard_id,
num_shards=num_shards,
shuffle_after_epoch=shuffle_after_epoch,
shuffle_chunks=shuffle_chunks,
)
self._aspect_grouping = aspect_grouping
self._class_to_ind = dict(zip(
cfg.MODEL.CLASSES,
range(len(cfg.MODEL.CLASSES))
))
self._queue1, self._queue2 = [], []
def feed_inputs(self):
if not self._aspect_grouping:
feed_dict = collections.defaultdict(list)
for i in range(self._pipe.batch_size):
while True:
example = self._buffer.get()
if len(example['object']) > 0:
break
data = self.example_to_data(example)
for k, v in data.items():
feed_dict[k].append(v)
for k, v in self.features.items():
self._pipe.feed_input(self.features[k], feed_dict[k])
else:
batch_size = self._pipe.batch_size
while True:
batch_data = None
if len(self._queue1) >= batch_size:
batch_data = self._queue1[:batch_size]
self._queue1 = self._queue1[batch_size:]
elif len(self._queue2) >= batch_size:
batch_data = self._queue2[:batch_size]
self._queue2 = self._queue2[batch_size:]
if batch_data is not None:
feed_dict = collections.defaultdict(list)
for data in batch_data:
for k, v in data.items():
feed_dict[k].append(v)
for k, v in self.features.items():
self._pipe.feed_input(self.features[k], feed_dict[k])
break
while True:
example = self._buffer.get()
if len(example['object']) > 0:
break
data = self.example_to_data(example)
ratio = float(data['shape'][0]) / data['shape'][1]
if ratio > 1:
self._queue1.append(data)
else:
self._queue2.append(data)
def example_to_data(self, example):
bbox_data, bbox_ratio, bbox_label = [], [], []
h, w, c = example['height'], example['width'], example['depth']
for obj in example['object']:
x1 = float(max(obj['xmin'], 0))
y1 = float(max(obj['ymin'], 0))
x2 = float(min(obj['xmax'], w - 1))
y2 = float(min(obj['ymax'], h - 1))
bbox_data.append([x1, y1, x2, y2])
bbox_ratio.append([x1 / w, y1 / h, x2 / w, y2 / h])
bbox_label.append(self._class_to_ind[obj['name']])
return {
'image': example['content'],
'shape': np.array([h, w, c], 'int64'),
'bbox/data': np.array(bbox_data, 'float32'),
'bbox/ratio': np.array(bbox_ratio, 'float32'),
'bbox/label': np.array(bbox_label, 'int32')
}
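The aspect grouping above buffers portrait (h/w > 1) and landscape examples in separate queues and emits a batch only when one queue is full, so every batch shares a single orientation. A toy illustration of the queueing rule:

```python
queue1, queue2, batch_size = [], [], 2
for ratio in (1.4, 0.7, 1.2, 0.6):
    (queue1 if ratio > 1 else queue2).append(ratio)
    if len(queue1) >= batch_size:
        print('portrait batch:', queue1[:batch_size])
        queue1 = queue1[batch_size:]
    elif len(queue2) >= batch_size:
        print('landscape batch:', queue2[:batch_size])
        queue2 = queue2[batch_size:]
```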
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from dragon.vm import dali
from dragon.vm.dali.plugin.pytorch import DALIGenericIterator
from seetadet.core.config import cfg
from seetadet.dali.data_reader import DataReader
class Pipeline(dali.Pipeline):
def __init__(self, source):
super(Pipeline, self).__init__(
batch_size=cfg.TRAIN.IMS_PER_BATCH,
num_threads=cfg.TRAIN.NUM_THREADS,
)
random_scales = cfg.TRAIN.RANDOM_SCALES
if random_scales[1] > 1:
raise ValueError('The max scale range should be <= 1.')
mean_values = np.array(cfg.PIXEL_MEANS, 'int64').tolist()
self.max_size = cfg.TRAIN.MAX_SIZE
self.reader = DataReader(
path=source,
features=['image', 'shape', 'bbox/data', 'bbox/label'],
pipeline=self,
shard_id=dali.get_distributed_info()[0],
num_shards=dali.get_distributed_info()[1],
shuffle_after_epoch=cfg.TRAIN.USE_SHUFFLE,
shuffle_chunks=cfg.TRAIN.SHUFFLE_CHUNKS,
aspect_grouping=True,
)
self.decode = dali.ops.ImageDecoder()
self.resize = dali.ops.Resize(max_size=self.max_size)
self.brightness_contrast = dali.ops.BrightnessContrast()
self.hsv = dali.ops.Hsv()
self.cmn = dali.ops.CropMirrorNormalize(
mean=np.array(mean_values, 'int64').tolist(),
std=[1., 1., 1.],
)
self.pad = dali.ops.Pad(
axes=[1, 2],
align=cfg.MODEL.COARSEST_STRIDE
if cfg.MODEL.COARSEST_STRIDE > 0 else None,
)
with dali.device('cpu'):
self.resize_rng = dali.ops.Uniform([
cfg.TRAIN.SCALES[0] * random_scales[0],
cfg.TRAIN.SCALES[0] * random_scales[1],
])
self.twist_rng = dali.ops.Uniform([0.6, 1.4])
self.flip_rng = dali.ops.CoinFlip(0.5 if cfg.TRAIN.USE_FLIPPED else 0.)
def iter_setup(self):
self.reader.feed_inputs()
def define_graph(self):
# Read inputs from file
inputs = self.reader()
shape = inputs['shape']
bbox = inputs['bbox/data']
label = inputs['bbox/label']
# Decode image
image = self.decode(inputs['image'])
# Augment the color space
if cfg.TRAIN.USE_COLOR_JITTER:
image = self.hsv(
self.brightness_contrast(
image,
brightness=self.twist_rng(),
contrast=self.twist_rng(),
),
saturation=self.twist_rng()
)
# Resize to the target size
target_size = self.resize_rng()
image = self.resize(image, resize_shorter=target_size)
# Normalize and pad to blob shape
apply_flip = self.flip_rng()
image = self.cmn(image, mirror=apply_flip)
image = self.pad(image)
return image, bbox, label, target_size, shape, apply_flip
class Iterator(DALIGenericIterator):
def __init__(self, pipeline):
super(Iterator, self).__init__(pipeline)
@property
def handlers(self):
return ([0], self.copy_handler,), \
([1, 2, 3, 4, 5], self.gt_handler)
def next(self):
(images,), (gt_boxes, ims_info) = self.__next__()
return {'data': images, 'gt_boxes': gt_boxes, 'ims_info': ims_info}
def gt_handler(self, tensors):
def impl(box_list, labels, im_shape, target_size, max_size, flip):
num_images = len(box_list)
im_size_min = np.min(im_shape[:, :2], axis=1).astype('float32')
im_size_max = np.max(im_shape[:, :2], axis=1).astype('float32')
im_scales = target_size / im_size_min
inds = np.where(np.round(im_scales * im_size_max) > max_size)[0]
im_scales[inds] = max_size / im_size_max[inds]
box_list = [box_list[i] * im_scales[i] for i in range(num_images)]
for i in (np.where(flip > 0)[0]):
boxes = box_list[i]
boxes_flipped = box_list[i].copy()
width = im_shape[i, 1] * im_scales[i]
boxes_flipped[:, 0] = width - boxes[:, 2] - 1
boxes_flipped[:, 2] = width - boxes[:, 0] - 1
box_list[i] = boxes_flipped
im_scales = np.expand_dims(im_scales, 1)
batch_inds = [np.ones([e.size, 1]) * i for i, e in enumerate(labels)]
boxes = np.concatenate(box_list)
labels = np.expand_dims(np.concatenate(labels), axis=1)
batch_inds = np.concatenate(batch_inds)
gt_boxes = np.hstack([boxes, labels, batch_inds])
ims_info = np.hstack([im_shape[:, :2] * im_scales, im_scales])
return gt_boxes.astype('float32'), ims_info.astype('float32')
bbox, label, target_size, shape, flip = tensors
shape = shape.as_array()
return impl(
box_list=[bbox.at(i) for i in range(len(shape))],
labels=[label.at(i) for i in range(len(shape))],
im_shape=shape,
target_size=target_size.as_array().squeeze(),
max_size=self._pipe.max_size,
flip=flip.as_array()
)
def new_iterator(source):
with dali.device('cuda', cfg.GPU_ID):
return Iterator(Pipeline(source))
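Hypothetical usage of the iterator above; the record path is illustrative:

```python
iterator = new_iterator('/data/train_record')
blobs = iterator.next()  # {'data': ..., 'gt_boxes': ..., 'ims_info': ...}
```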
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from dragon.vm import dali
from dragon.vm.dali.plugin.pytorch import DALIGenericIterator
from seetadet.core.config import cfg
from seetadet.dali.data_reader import DataReader
class Pipeline(dali.Pipeline):
def __init__(self, source):
super(Pipeline, self).__init__(
batch_size=cfg.TRAIN.IMS_PER_BATCH,
num_threads=cfg.TRAIN.NUM_THREADS,
)
paste_ratio = 1. / cfg.TRAIN.RANDOM_SCALES[0]
mean_values = np.array(cfg.PIXEL_MEANS, 'int64').tolist()
self.target_size = cfg.TRAIN.SCALES[0]
self.reader = DataReader(
path=source,
features=['image', 'bbox/ratio', 'bbox/label'],
pipeline=self,
shard_id=dali.get_distributed_info()[0],
num_shards=dali.get_distributed_info()[1],
shuffle_after_epoch=cfg.TRAIN.USE_SHUFFLE,
shuffle_chunks=cfg.TRAIN.SHUFFLE_CHUNKS,
)
self.decode = dali.ops.ImageDecoder()
self.brightness_contrast = dali.ops.BrightnessContrast()
self.hsv = dali.ops.Hsv()
self.paste = dali.ops.Paste(fill_value=mean_values)
self.slice = dali.ops.Slice()
self.resize = dali.ops.Resize(self.target_size, self.target_size)
self.cmn = dali.ops.CropMirrorNormalize(mean=mean_values, std=[1., 1., 1.])
with dali.device('cpu'):
self.bbox_paste = dali.ops.BBoxPaste()
self.bbox_crop = dali.ops.RandomBBoxCrop()
self.bbox_flip = dali.ops.BbFlip()
self.twist_rng = dali.ops.Uniform([0.6, 1.4])
self.paste_pos = dali.ops.Uniform((0., 1.))
self.paste_ratio = dali.ops.Uniform((0., paste_ratio - 1))
self.flip_rng = dali.ops.CoinFlip(0.5 if cfg.TRAIN.USE_FLIPPED else 0.)
def iter_setup(self):
self.reader.feed_inputs()
def define_graph(self):
# Read inputs from file
inputs = self.reader()
bbox = inputs['bbox/ratio']
label = inputs['bbox/label']
# Decode image
image = self.decode(inputs['image'])
# Augment the color space
image = self.hsv(
self.brightness_contrast(
image,
brightness=self.twist_rng(),
contrast=self.twist_rng(),
), saturation=self.twist_rng()
)
# Expand randomly to get smaller objects
pr = self.paste_ratio() * self.flip_rng() + 1.
px, py = self.paste_pos(), self.paste_pos()
image = self.paste(image, paste_x=px, paste_y=py, ratio=pr)
bbox = self.bbox_paste(bbox, paste_x=px, paste_y=py, ratio=pr)
# Sample RoIs with IoU constraint
crop_begin, crop_size, bbox, label = self.bbox_crop(bbox, label)
image = self.slice(image, crop_begin, crop_size)
# Resize image to a fixed size
image = self.resize(image)
# Normalize
apply_flip = self.flip_rng()
image = self.cmn(image, mirror=apply_flip)
bbox = self.bbox_flip(bbox, horizontal=apply_flip)
return image, bbox, label
class Iterator(DALIGenericIterator):
def __init__(self, pipeline):
super(Iterator, self).__init__(pipeline)
@property
def handlers(self):
return ([0], self.copy_handler,), ([1, 2], self.gt_handler)
def next(self):
(images,), gt_boxes = self.__next__()
return {'data': images, 'gt_boxes': gt_boxes}
def gt_handler(self, tensors):
bbox, label = tensors
num_images = self._pipe.batch_size
boxes = np.concatenate([bbox.at(i) for i in range(num_images)])
boxes[:, 0::2] *= self._pipe.target_size
boxes[:, 1::2] *= self._pipe.target_size
labels = [label.at(i) for i in range(num_images)]
batch_inds = [np.ones_like(e) * i for i, e in enumerate(labels)]
labels, batch_inds = np.concatenate(labels), np.concatenate(batch_inds)
return np.hstack([boxes, labels, batch_inds])
def new_iterator(source):
with dali.device('cuda', cfg.GPU_ID):
return Iterator(Pipeline(source))
......@@ -19,11 +19,11 @@ import sys
import numpy as np
from lib.core.config import cfg
from lib.pycocotools import mask as mask_tools
from lib.pycocotools.coco import COCO
from lib.pycocotools.cocoeval import COCOeval
from lib.utils import mask as mask_util
from seetadet.core.config import cfg
from seetadet.pycocotools import mask as mask_tools
from seetadet.pycocotools.coco import COCO
from seetadet.pycocotools.cocoeval import COCOeval
from seetadet.utils import mask as mask_util
class COCOEvaluator(object):
......
......@@ -20,12 +20,14 @@ from __future__ import print_function
import os
import uuid
from lib.core.config import cfg
from lib.datasets.coco_evaluator import COCOEvaluator
from lib.datasets.voc_evaluator import VOCEvaluator
from seetadet.core.config import cfg
from seetadet.datasets.coco_evaluator import COCOEvaluator
from seetadet.datasets.voc_evaluator import VOCEvaluator
class imdb(object):
class Dataset(object):
"""The base dataset class."""
def __init__(self, source):
self._source = source
self._num_images = 0
......@@ -51,6 +53,10 @@ class imdb(object):
return self._class_to_ind
@property
def cls(self):
return type(self)
@property
def comp_id(self):
return '_' + self._salt if self.config['use_salt'] else ''
......
......@@ -16,7 +16,7 @@ from __future__ import print_function
import cv2
import numpy as np
from lib.pycocotools import mask_utils
from seetadet.pycocotools import mask_utils
class Example(object):
......
......@@ -18,27 +18,29 @@ from __future__ import division
from __future__ import print_function
import os
from lib.datasets.taas import TaaS
from seetadet.datasets import kpl_record
# TaaS DataSet
_GLOBAL_DATA_SETS = {'taas': lambda source: TaaS(source)}
def get_imdb(name):
"""Get an imdb (image database) by name."""
keys = name.split(':')
def get_dataset(name):
"""Get a dataset by name."""
keys = name.split('://')
if len(keys) >= 2:
cls, source = keys[0], ':'.join(keys[1:])
if cls not in _GLOBAL_DATA_SETS:
raise KeyError('Unknown DataSet: {}'.format(cls))
return _GLOBAL_DATA_SETS[cls](source)
cls, source = keys
if cls not in _GLOBAL_REGISTERED_DATASET:
raise KeyError('Unknown dataset: ' + cls)
return _GLOBAL_REGISTERED_DATASET[cls](source)
elif os.path.exists(name):
return _GLOBAL_DATA_SETS['taas'](name)
return _GLOBAL_REGISTERED_DATASET['default'](name)
else:
raise ValueError('Illegal Database: {}' + name)
raise ValueError('Illegal dataset: ' + name)
def list_dataset():
"""List all registered dataset."""
return _GLOBAL_REGISTERED_DATASET.keys()
def list_imdbs():
"""List all registered imdbs."""
return _GLOBAL_DATA_SETS.keys()
_GLOBAL_REGISTERED_DATASET = {
'default': lambda source:
kpl_record.KPLRecordDataset(source),
}
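A usage sketch for the factory above; the paths are hypothetical:

```python
from seetadet.datasets.factory import get_dataset, list_dataset

print(list_dataset())                      # -> dict_keys(['default'])
ds = get_dataset('default:///data/train')  # explicit '<cls>://<source>' form
ds = get_dataset('/data/train')            # falls back to 'default' when the path exists
```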
......@@ -21,23 +21,26 @@ import os
import dragon
from lib.core.config import cfg
from lib.datasets.imdb import imdb
from seetadet.core.config import cfg
from seetadet.datasets.dataset import Dataset
class TaaS(imdb):
class KPLRecordDataset(Dataset):
def __init__(self, source):
imdb.__init__(self, source)
self._dataset = dragon.io.SeetaRecordDataset
self._num_images = self._dataset(self.source).size
super(KPLRecordDataset, self).__init__(source)
self._num_images = self.cls(self.source).size
@property
def cls(self):
return dragon.io.KPLRecordDataset
def dump_detections(self, all_boxes, output_dir):
dataset = self._dataset(self.source)
dataset = self.cls(self.source)
for file in ('data.data', 'data.index', 'data.meta'):
file = os.path.join(output_dir, file)
if os.path.exists(file):
os.remove(file)
writer = dragon.io.SeetaRecordWriter(output_dir, dataset.protocol)
writer = dragon.io.KPLRecordWriter(output_dir, dataset.protocol)
for i in range(len(dataset)):
example = dataset.get()
example['object'] = []
......
......@@ -20,11 +20,11 @@ from __future__ import print_function
import cv2
import numpy as np
from lib.core.config import cfg
from lib.pycocotools import mask_utils
from lib.utils import boxes as box_util
from lib.utils.framework import pickle
from lib.utils.mask import mask_overlap
from seetadet.core.config import cfg
from seetadet.pycocotools import mask_utils
from seetadet.utils import boxes as box_util
from seetadet.utils.env import pickle
from seetadet.utils.mask import mask_overlap
def voc_ap(rec, prec, use_07_metric=False):
......
......@@ -16,8 +16,8 @@ from __future__ import print_function
import os
import numpy as np
from lib.datasets import voc_eval
from lib.utils.framework import pickle
from seetadet.datasets import voc_eval
from seetadet.utils.env import pickle
class VOCEvaluator(object):
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# Backbones
import seetadet.modeling.airnet
import seetadet.modeling.mobilenet
import seetadet.modeling.resnet
import seetadet.modeling.vgg
# Custom modules
from seetadet.modeling.fast_rcnn import FastRCNN
from seetadet.modeling.fpn import FPN
from seetadet.modeling.mask_rcnn import MaskRCNN
from seetadet.modeling.retinanet import RetinaNet
from seetadet.modeling.rpn import RPN
from seetadet.modeling.ssd import SSD
......@@ -15,17 +15,18 @@ from __future__ import print_function
import dragon.vm.torch as torch
from lib.modules import init
from lib.modules import nn
from seetadet.core.registry import backbones
from seetadet.modules import init
from seetadet.modules import nn
class WideResBlock(nn.Module):
def __init__(self, dim_in, dim_out, stride=1, downsample=None):
super(WideResBlock, self).__init__()
self.conv1 = nn.Conv3x3(dim_in, dim_out, stride)
self.bn1 = nn.Affine(dim_out)
self.bn1 = nn.FrozenAffine(dim_out)
self.conv2 = nn.Conv3x3(dim_out, dim_out)
self.bn2 = nn.Affine(dim_out)
self.bn2 = nn.FrozenAffine(dim_out)
self.downsample = downsample
self.relu = nn.ReLU(inplace=True)
......@@ -51,15 +52,15 @@ class InceptionBlock(nn.Module):
def __init__(self, dim_in, dim_out):
super(InceptionBlock, self).__init__()
self.conv1 = nn.Conv1x1(dim_in, dim_out)
self.bn1 = nn.Affine(dim_out)
self.bn1 = nn.FrozenAffine(dim_out)
self.conv2 = nn.Conv3x3(dim_out, dim_out // 2)
self.bn2 = nn.Affine(dim_out // 2)
self.bn2 = nn.FrozenAffine(dim_out // 2)
self.conv3a = nn.Conv3x3(dim_out // 2, dim_out)
self.bn3a = nn.Affine(dim_out)
self.bn3a = nn.FrozenAffine(dim_out)
self.conv3b = nn.Conv3x3(dim_out, dim_out)
self.bn3b = nn.Affine(dim_out)
self.bn3b = nn.FrozenAffine(dim_out)
self.conv4 = nn.Conv3x3(dim_out * 3, dim_out)
self.bn4 = nn.Affine(dim_out)
self.bn4 = nn.FrozenAffine(dim_out)
self.relu = nn.ReLU(inplace=True)
def forward(self, x):
......@@ -103,7 +104,7 @@ class AirNet(nn.Module):
padding=3,
bias=False,
)
self.bn1 = nn.Affine(self.dim_in)
self.bn1 = nn.FrozenAffine(self.dim_in)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(
kernel_size=2,
......@@ -127,7 +128,7 @@ class AirNet(nn.Module):
def make_blocks(self, dim_out, blocks, stride=1):
downsample = nn.Sequential(
nn.Conv1x1(self.dim_in, dim_out, stride=stride),
nn.Affine(dim_out),
nn.FrozenAffine(dim_out),
)
layers = [WideResBlock(self.dim_in, dim_out, stride, downsample)]
self.dim_in = dim_out
......@@ -164,13 +165,7 @@ def airnet(num_stages):
return AirNet(blocks, num_stages)
def make_airnet_(): return airnet(5)
def make_airnet_3b(): return airnet(3)
def make_airnet_4b(): return airnet(4)
def make_airnet_5b(): return airnet(5)
backbones.register('airnet', func=airnet, num_stages=5)
backbones.register('airnet_3b', func=airnet, num_stages=3)
backbones.register('airnet_4b', func=airnet, num_stages=4)
backbones.register('airnet_5b', func=airnet, num_stages=5)
......@@ -17,17 +17,12 @@ import collections
import importlib
import dragon.vm.torch as torch
from lib.core.config import cfg
from lib.modeling import FPN
from lib.modeling import RPN
from lib.modeling import FastRCNN
from lib.modeling import MaskRCNN
from lib.modeling import RetinaNet
from lib.modeling import SSD
from lib.modeling.factory import get_body_func
from lib.modules import nn
from lib.modules import vision
from lib.utils import logger
from seetadet import modeling as models
from seetadet.core.config import cfg
from seetadet.core.registry import backbones
from seetadet.modules import nn
from seetadet.modules import vision
from seetadet.utils import logger
class Detector(nn.Module):
......@@ -46,18 +41,17 @@ class Detector(nn.Module):
# + DataLoader
self.data_loader_cls = importlib.import_module(
'lib.{}'.format(model)).DataLoader
'seetadet.algo.{}'.format(model)).DataLoader
self.bootstrap = vision.Bootstrap()
# + FeatureExtractor
self.body = get_body_func(body)()
self.body = backbones.get(body)()
feature_dims = self.body.feature_dims
# + FeatureEnhancer
if 'fpn' in modules:
self.fpn = FPN(feature_dims)
self.fpn = models.FPN(feature_dims)
feature_dims = self.fpn.feature_dims
elif 'mbox' in modules:
pass # Placeholder
else:
......@@ -65,17 +59,17 @@ class Detector(nn.Module):
# + Detection Modules
if 'rcnn' in model:
self.rpn = RPN(feature_dims[0])
self.rpn = models.RPN(feature_dims[0])
if 'faster' in model:
self.rcnn = FastRCNN(feature_dims[0])
self.rcnn = models.FastRCNN(feature_dims[0])
elif 'mask' in model:
self.rcnn = MaskRCNN(feature_dims[0])
self.rcnn = models.MaskRCNN(feature_dims[0])
if 'retinanet' in model:
self.retinanet = RetinaNet(feature_dims[0])
self.retinanet = models.RetinaNet(feature_dims[0])
if 'ssd' in model:
self.ssd = SSD(feature_dims)
self.ssd = models.SSD(feature_dims)
def load_weights(self, weights):
"""Load the state dict of this detector.
......@@ -171,13 +165,11 @@ class Detector(nn.Module):
return outputs
def optimize_for_inference(self):
"""Optimize the graph for the inference.
"""Optimize the graph for the inference."""
It usually involves the removing of BN or Affine.
"""
##################################
###################################
# Merge Affine into Convolution #
##################################
###################################
last_module = None
for e in self.modules():
if isinstance(e, nn.Affine) and \
......@@ -195,7 +187,7 @@ class Detector(nn.Module):
last_module = None
for e in self.modules():
if isinstance(e, nn.BatchNorm2d) and \
nn.is_conv2d(last_module):
isinstance(last_module, nn.Conv2d):
if last_module.bias is None:
delattr(last_module, 'bias')
e.forward = lambda x: x
......
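A minimal numpy sketch of the folding performed above: a frozen per-channel affine y = alpha * x + beta after a convolution is absorbed by rescaling the conv weights and bias (shapes illustrative):

```python
import numpy as np

w = np.random.randn(8, 4, 3, 3).astype('float32')  # conv weight (out, in, kh, kw)
b = np.zeros(8, 'float32')                         # conv bias
alpha = np.random.randn(8).astype('float32')       # affine scale
beta = np.random.randn(8).astype('float32')        # affine shift

# conv(x) * alpha + beta == conv'(x) with the folded parameters:
w_folded = w * alpha.reshape(8, 1, 1, 1)
b_folded = b * alpha + beta
```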
......@@ -18,12 +18,12 @@ import functools
import dragon.vm.torch as torch
from lib import faster_rcnn
from lib.core.config import cfg
from lib.modules import det
from lib.modules import init
from lib.modules import nn
from lib.modules import vision
from seetadet.algo import faster_rcnn
from seetadet.core.config import cfg
from seetadet.modules import det
from seetadet.modules import init
from seetadet.modules import nn
from seetadet.modules import vision
class FastRCNN(nn.Module):
......@@ -54,7 +54,11 @@ class FastRCNN(nn.Module):
'RoIAlign': vision.roi_align
}[cfg.FRCNN.ROI_XFORM_METHOD], size=cfg.FRCNN.ROI_XFORM_RESOLUTION)
self.cls_loss = nn.CrossEntropyLoss()
self.bbox_loss = nn.SmoothL1Loss()
if 'IOU' in cfg.MODEL.REG_LOSS_TYPE.upper():
self.bbox_loss = nn.IoULoss(
delta_weights=cfg.BBOX_REG_WEIGHTS)
else:
self.bbox_loss = nn.SmoothL1Loss(reduction='sum')
# Compute spatial scales according to strides
self.spatial_scales = [
1. / (2 ** lvl)
......@@ -124,15 +128,22 @@ class FastRCNN(nn.Module):
if self.training:
# Compute rcnn losses
bbox_pred = outputs['bbox_pred'].view(0, -1, 4) \
.index_select((0, 1), self.data['bbox_indices'])
bbox_loss_weight = \
cfg.MODEL.REG_LOSS_WEIGHT / (
roi_features.shape[0] if isinstance(
self.bbox_loss, nn.SmoothL1Loss
) else 1.
)
outputs.update(collections.OrderedDict([
('cls_loss', self.cls_loss(
cls_score, self.data['labels'])),
('bbox_loss', self.bbox_loss(
outputs['bbox_pred'],
bbox_pred,
self.data['bbox_targets'],
self.data['bbox_inside_weights'],
self.data['bbox_outside_weights'],
)),
self.data['bbox_anchors'],
) * bbox_loss_weight),
]))
else:
# Return the rois to decode the refine boxes
......
......@@ -13,11 +13,11 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon.vm.torch as torch
from dragon.vm.torch.nn import functional as nn_funcs
from lib.core.config import cfg
from lib.modules import init
from lib.modules import nn
from seetadet.core.config import cfg
from seetadet.modules import init
from seetadet.modules import nn
HIGHEST_BACKBONE_LVL = 5 # E.g., "conv5"-like level
......@@ -36,7 +36,7 @@ class FPN(nn.Module):
self.P.append(nn.Conv3x3(dim, dim, bias=True))
if 'rcnn' in cfg.MODEL.TYPE:
self.apply_func = self.apply_on_rcnn
self.maxpool = nn.MaxPool2d(1, 2, ceil_mode=True)
self.maxpool = nn.MaxPool2d(kernel_size=1, stride=2)
else:
self.apply_func = self.apply_on_generic
self.relu = nn.ReLU(inplace=False)
......@@ -44,6 +44,7 @@ class FPN(nn.Module):
dim_in = feature_dims[-1] if lvl == HIGHEST_BACKBONE_LVL + 1 else dim
self.P.append(nn.Conv3x3(dim_in, dim, stride=2, bias=True))
self.feature_dims = [dim]
self.coarsest_stride = cfg.MODEL.COARSEST_STRIDE
self.reset_parameters()
def reset_parameters(self):
......@@ -56,14 +57,18 @@ class FPN(nn.Module):
fpn_input = self.C[-1](features[-1])
min_lvl, max_lvl = cfg.FPN.RPN_MIN_LEVEL, cfg.FPN.RPN_MAX_LEVEL
outputs = [self.P[HIGHEST_BACKBONE_LVL - min_lvl](fpn_input)]
# Apply MaxPool for higher features
# Apply max pool for higher features
for i in range(HIGHEST_BACKBONE_LVL + 1, max_lvl + 1):
outputs.append(self.maxpool(outputs[-1]))
# Build Pyramids between [MIN_LEVEL, HIGHEST_LEVEL]
# Build pyramids between [MIN_LEVEL, HIGHEST_LEVEL]
for i in range(HIGHEST_BACKBONE_LVL - 1, min_lvl - 1, -1):
lateral_output = self.C[i - min_lvl](features[i - 1])
upscale_output = torch.vision.ops.nn_resize(
fpn_input, dsize=None, fx=2., fy=2.)
if self.coarsest_stride > 0:
upscale_output = nn_funcs.upsample(
fpn_input, scale_factor=2)
else:
upscale_output = nn_funcs.upsample(
fpn_input, size=lateral_output.shape[2:])
fpn_input = lateral_output.__iadd__(upscale_output)
outputs.insert(0, self.P[i - min_lvl](fpn_input))
return outputs
......@@ -78,11 +83,15 @@ class FPN(nn.Module):
outputs.append(self.P[i - min_lvl](extra_input))
if i != max_lvl:
extra_input = self.relu(outputs[-1])
# Build Pyramids between [MIN_LEVEL, HIGHEST_LEVEL]
# Build pyramids between [MIN_LEVEL, HIGHEST_LEVEL]
for i in range(HIGHEST_BACKBONE_LVL - 1, min_lvl - 1, -1):
lateral_output = self.C[i - min_lvl](features[i - 1])
upscale_output = torch.vision.ops.nn_resize(
fpn_input, dsize=None, fx=2., fy=2.)
if self.coarsest_stride > 0:
upscale_output = nn_funcs.upsample(
fpn_input, scale_factor=2)
else:
upscale_output = nn_funcs.upsample(
fpn_input, size=lateral_output.shape[2:])
fpn_input = lateral_output.__iadd__(upscale_output)
outputs.insert(0, self.P[i - min_lvl](fpn_input))
return outputs
......
......@@ -18,12 +18,12 @@ import functools
import dragon.vm.torch as torch
from lib import mask_rcnn
from lib.core.config import cfg
from lib.modules import det
from lib.modules import init
from lib.modules import nn
from lib.modules import vision
from seetadet.algo import mask_rcnn
from seetadet.core.config import cfg
from seetadet.modules import det
from seetadet.modules import init
from seetadet.modules import nn
from seetadet.modules import vision
class MaskRCNN(nn.Module):
......@@ -65,7 +65,7 @@ class MaskRCNN(nn.Module):
'RoIAlign': vision.roi_align,
}[cfg.MRCNN.ROI_XFORM_METHOD], size=cfg.MRCNN.ROI_XFORM_RESOLUTION)
self.cls_loss = nn.CrossEntropyLoss()
self.bbox_loss = nn.SmoothL1Loss()
self.bbox_loss = nn.SmoothL1Loss(reduction='sum')
self.mask_loss = nn.BCEWithLogitsLoss()
# Compute spatial scales according to strides
self.spatial_scales = [
......@@ -146,15 +146,14 @@ class MaskRCNN(nn.Module):
if self.training:
# Compute the loss of bbox branch
bbox_pred = outputs['bbox_pred'].view(0, -1, 4) \
.index_select((0, 1), self.data['bbox_indices'])
outputs.update(collections.OrderedDict([
('cls_loss', self.cls_loss(
cls_score, self.data['labels'])),
('bbox_loss', self.bbox_loss(
outputs['bbox_pred'],
self.data['bbox_targets'],
self.data['bbox_inside_weights'],
self.data['bbox_outside_weights'],
)),
bbox_pred, self.data['bbox_targets'],
) / roi_features.shape[0]),
]))
# Compute the loss of mask branch
mask_score = self.get_mask_score(
......@@ -171,7 +170,7 @@ class MaskRCNN(nn.Module):
outputs['rois'] = self.data['rois'][0]
# Return the classification prob
outputs['cls_prob'] = self.softmax(cls_score)
# Set a callback to decode mask from refine RoIs
# Set a callback to decode mask from refined RoIs
self.compute_mask_score = \
functools.partial(
self.get_mask_score,
......
......@@ -17,17 +17,18 @@ import functools
import dragon.vm.torch as torch
from lib.core.config import cfg
from lib.modules import init
from lib.modules import nn
from lib.modules import vision
from seetadet.core.config import cfg
from seetadet.core.registry import backbones
from seetadet.modules import init
from seetadet.modules import nn
from seetadet.modules import vision
def conv_triplet(dim_in, dim_out):
"""1x1 convolution + BN + ReLU."""
return [
nn.Conv2d(dim_in, dim_out, 1, bias=False),
nn.Affine(dim_out),
nn.FrozenAffine(dim_out),
nn.ReLU(True),
]
......@@ -42,10 +43,10 @@ def conv_quintet(dim_in, dim_out, ks, stride):
padding=ks // 2,
bias=False,
),
nn.Affine(dim_in),
nn.FrozenAffine(dim_in),
nn.ReLU(True),
nn.Conv1x1(dim_in, dim_out),
nn.Affine(dim_out),
nn.FrozenAffine(dim_out),
]
......@@ -76,7 +77,7 @@ def Stem(dim_out, stride=1):
padding=1,
bias=False,
),
nn.Affine(dim_out),
nn.FrozenAffine(dim_out),
nn.ReLU(True),
)
......@@ -197,7 +198,8 @@ class NASMobileNet(nn.Module):
return outputs
def make_mobilenet_a1():
@backbones.register('mobilenet_a1')
def mobilenet_a1():
return NASMobileNet([
4, 6, 6, 6,
3, 3, 4, 6,
......@@ -207,7 +209,8 @@ def make_mobilenet_a1():
], Setting.PROXYLESS_MOBILE)
def make_mobilenet_v2():
@backbones.register('mobilenet_v2')
def mobilenet_v2():
return NASMobileNet([
1, 1,
1, 1, 1,
......
......@@ -19,9 +19,10 @@ from __future__ import print_function
import dragon.vm.torch as torch
from lib.core.config import cfg
from lib.modules import nn
from lib.modules import init
from seetadet.core.config import cfg
from seetadet.core.registry import backbones
from seetadet.modules import nn
from seetadet.modules import init
class BasicBlock(nn.Module):
......@@ -35,10 +36,10 @@ class BasicBlock(nn.Module):
):
super(BasicBlock, self).__init__()
self.conv1 = nn.Conv3x3(dim_in, dim_out, stride)
self.bn1 = nn.Affine(dim_out)
self.bn1 = nn.FrozenAffine(dim_out)
self.relu = torch.nn.ReLU(inplace=True)
self.conv2 = nn.Conv3x3(dim_out, dim_out)
self.bn2 = nn.Affine(dim_out)
self.bn2 = nn.FrozenAffine(dim_out)
self.downsample = downsample
self.dropblock = dropblock
......@@ -83,11 +84,11 @@ class Bottleneck(torch.nn.Module):
super(Bottleneck, self).__init__()
dim = int(dim_out * self.contraction)
self.conv1 = nn.Conv1x1(dim_in, dim)
self.bn1 = nn.Affine(dim)
self.bn1 = nn.FrozenAffine(dim)
self.conv2 = nn.Conv3x3(dim, dim, stride=stride)
self.bn2 = nn.Affine(dim)
self.bn2 = nn.FrozenAffine(dim)
self.conv3 = nn.Conv1x1(dim, dim_out)
self.bn3 = nn.Affine(dim_out)
self.bn3 = nn.FrozenAffine(dim_out)
self.relu = torch.nn.ReLU(inplace=True)
self.downsample = downsample
self.dropblock = dropblock
......@@ -132,7 +133,7 @@ class ResNet(torch.nn.Module):
padding=3,
bias=False,
)
self.bn1 = nn.Affine(self.dim_in)
self.bn1 = nn.FrozenAffine(self.dim_in)
self.relu = torch.nn.ReLU(inplace=True)
self.maxpool = torch.nn.MaxPool2d(
kernel_size=3,
......@@ -181,7 +182,7 @@ class ResNet(torch.nn.Module):
if stride != 1 or self.dim_in != dim_out:
downsample = nn.Sequential(
nn.Conv1x1(self.dim_in, dim_out, stride=stride),
nn.Affine(dim_out),
nn.FrozenAffine(dim_out),
)
layers = [block(self.dim_in, dim_out, stride, downsample, dropblock)]
self.dim_in = dim_out
......@@ -194,11 +195,17 @@ class ResNet(torch.nn.Module):
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
outputs = [x]
outputs += [self.layer1(outputs[-1])]
outputs += [self.layer2(outputs[-1])]
outputs += [self.layer3(outputs[-1])]
outputs += [self.layer4(outputs[-1])]
if self.training:
# Hold the frozen outputs if necessary
self.last_outputs = outputs
return outputs
......@@ -225,16 +232,8 @@ def resnet(depth):
return ResNet(block, units, filters)
def make_resnet_18(): return resnet(18)
def make_resnet_34(): return resnet(34)
def make_resnet_50(): return resnet(50)
def make_resnet_101(): return resnet(101)
def make_resnet_152(): return resnet(152)
backbones.register(['res18', 'resnet18', 'resnet_18'], func=resnet, depth=18)
backbones.register(['res34', 'resnet34', 'resnet_34'], func=resnet, depth=34)
backbones.register(['res50', 'resnet50', 'resnet_50'], func=resnet, depth=50)
backbones.register(['res101', 'resnet101', 'resnet_101'], func=resnet, depth=101)
backbones.register(['res152', 'resnet152', 'resnet_152'], func=resnet, depth=152)
......@@ -17,11 +17,11 @@ import collections
import math
import dragon.vm.torch as torch
from lib import retinanet
from lib.core.config import cfg
from lib.modules import det
from lib.modules import init
from lib.modules import nn
from seetadet.algo import retinanet
from seetadet.core.config import cfg
from seetadet.modules import det
from seetadet.modules import init
from seetadet.modules import nn
class RetinaNet(nn.Module):
......@@ -56,7 +56,11 @@ class RetinaNet(nn.Module):
self.anchor_target = retinanet.AnchorTarget()
self.cls_loss = nn.SigmoidFocalLoss()
if 'IOU' in cfg.MODEL.REG_LOSS_TYPE.upper():
self.bbox_loss = nn.IoULoss()
else:
self.bbox_loss = nn.SmoothL1Loss(0.1111)
self.centerness_loss = nn.BCEWithLogitsLoss(reduction='valid')
self.reset_parameters()
def reset_parameters(self):
......@@ -71,7 +75,8 @@ class RetinaNet(nn.Module):
# For details, See the official codes:
# https://github.com/facebookresearch/Detectron
self.cls_score.bias.fill_(
-math.log((1 - cfg.PRIOR_PROB) / cfg.PRIOR_PROB))
-math.log((1 - cfg.PRIOR_PROB) / cfg.PRIOR_PROB)
)
def compute_outputs(self, features):
"""Compute the RetinaNet logits.
......@@ -97,48 +102,44 @@ class RetinaNet(nn.Module):
return torch.cat(cls_score_wide, dim=2), \
torch.cat(bbox_pred_wide, dim=2)
else:
return cls_score_wide[0], bbox_pred_wide[0]
def compute_losses(
self,
features,
cls_score,
bbox_pred,
gt_boxes,
ims_info,
):
def compute_losses(self, features, cls_score, bbox_pred, gt_boxes):
"""Compute the RetinaNet classification loss and regression loss.
Parameters
----------
features : sequence of dragon.vm.torch.Tensor
features : Sequence[dragon.vm.torch.Tensor]
The features of specific conv layers.
cls_score : dragon.vm.torch.Tensor
The classification logits.
bbox_pred : dragon.vm.torch.Tensor
The bbox regression logits.
centerness : dragon.vm.torch.Tensor
The centerness logits.
gt_boxes : numpy.ndarray
The packed ground-truth boxes.
ims_info : numpy.ndarray
The information of input images.
"""
self.retinanet_data = \
self.data = \
self.anchor_target(
features=features,
gt_boxes=gt_boxes,
ims_info=ims_info,
)
return collections.OrderedDict([
bbox_pred = bbox_pred.permute(0, 2, 1) \
.index_select((0, 1), self.data['bbox_indices'])
outputs = collections.OrderedDict([
('cls_loss', self.cls_loss(
cls_score, self.retinanet_data['labels'])),
cls_score, self.data['labels'])),
('bbox_loss', self.bbox_loss(
bbox_pred,
self.retinanet_data['bbox_targets'],
self.retinanet_data['bbox_inside_weights'],
self.retinanet_data['bbox_outside_weights'],
)),
self.data['bbox_targets'],
self.data['bbox_anchors'],
))
])
return outputs
def forward(self, *args, **kwargs):
cls_score, bbox_pred = self.compute_outputs(kwargs['features'])
......@@ -149,19 +150,17 @@ class RetinaNet(nn.Module):
if self.training:
outputs.update(
self.compute_losses(
kwargs['features'],
cls_score,
bbox_pred,
kwargs['gt_boxes'],
kwargs['ims_info'],
features=kwargs['features'],
cls_score=cls_score,
bbox_pred=bbox_pred,
gt_boxes=kwargs['gt_boxes'],
)
)
else:
outputs['detections'] = \
self.decoder(
kwargs['features'],
self.cls_prob(cls_score)
.permute(0, 2, 1),
self.cls_prob(cls_score).permute(0, 2, 1),
bbox_pred,
kwargs['ims_info'],
)
......
......@@ -16,10 +16,10 @@ from __future__ import print_function
import collections
import dragon.vm.torch as torch
from lib import faster_rcnn
from lib.core.config import cfg
from lib.modules import init
from lib.modules import nn
from seetadet.algo import faster_rcnn
from seetadet.core.config import cfg
from seetadet.modules import init
from seetadet.modules import nn
class RPN(nn.Module):
......@@ -45,7 +45,8 @@ class RPN(nn.Module):
self.anchor_target = faster_rcnn.AnchorTarget()
self.cls_loss = nn.BCEWithLogitsLoss()
self.bbox_loss = nn.SmoothL1Loss(0.1111)
self.bbox_loss = nn.SmoothL1Loss(
beta=0.1111, reduction='sum')
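# beta = 0.1111 ~= 1/9, i.e. sigma = 3 in the smooth-L1
# formulation used by Faster R-CNN.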
self.reset_parameters()
def reset_parameters(self):
......@@ -108,21 +109,26 @@ class RPN(nn.Module):
The information of input images.
"""
self.rpn_data = \
self.data = \
self.anchor_target(
features=features,
gt_boxes=gt_boxes,
ims_info=ims_info,
)
bbox_pred = bbox_pred.permute(0, 2, 1) \
.index_select((0, 1), self.data['bbox_indices'])
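# With reduction='sum', normalize the regression loss by the
# total number of anchors sampled over the whole batch.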
bbox_loss_weight = 1. / (
cfg.TRAIN.RPN_BATCHSIZE *
cfg.TRAIN.IMS_PER_BATCH
)
return collections.OrderedDict([
('rpn_cls_loss', self.cls_loss(
cls_score, self.rpn_data['labels'])),
cls_score, self.data['labels'])),
('rpn_bbox_loss', self.bbox_loss(
bbox_pred,
self.rpn_data['bbox_targets'],
self.rpn_data['bbox_inside_weights'],
self.rpn_data['bbox_outside_weights'],
)),
self.data['bbox_targets'],
self.data['bbox_anchors'],
) * bbox_loss_weight),
])
def forward(self, *args, **kwargs):
......
......@@ -16,10 +16,10 @@ from __future__ import print_function
import collections
import dragon.vm.torch as torch
from lib import ssd
from lib.core.config import cfg
from lib.modules import init
from lib.modules import nn
from seetadet.algo import ssd
from seetadet.core.config import cfg
from seetadet.modules import init
from seetadet.modules import nn
class SSD(nn.Module):
......@@ -66,6 +66,10 @@ class SSD(nn.Module):
self.hard_mining = ssd.HardMining()
self.box_target = ssd.MultiBoxTarget()
self.cls_loss = nn.CrossEntropyLoss()
if 'IOU' in cfg.MODEL.REG_LOSS_TYPE.upper():
self.bbox_loss = nn.IoULoss(
delta_weights=cfg.BBOX_REG_WEIGHTS)
else:
self.bbox_loss = nn.SmoothL1Loss()
self.reset_parameters()
......@@ -110,8 +114,7 @@ class SSD(nn.Module):
# Concat them if necessary
return \
torch.cat(cls_score_wide, dim=1) \
.view(0, -1, cfg.MODEL.NUM_CLASSES), \
torch.cat(cls_score_wide, dim=1).view(0, -1, cfg.MODEL.NUM_CLASSES), \
torch.cat(bbox_pred_wide, dim=1).view(0, -1, self.box_dim)
def compute_losses(
......@@ -160,6 +163,8 @@ class SSD(nn.Module):
gt_boxes,
)
)
bbox_pred = bbox_pred.index_select(
(0, 1), self.data['bbox_indices'])
return collections.OrderedDict([
# A compensating factor of 4.0 is used,
# as we normalize over both the positive and negative samples.
......@@ -169,9 +174,8 @@ class SSD(nn.Module):
('bbox_loss', self.bbox_loss(
bbox_pred,
self.data['bbox_targets'],
self.data['bbox_inside_weights'],
self.data['bbox_outside_weights'],
)),
self.data['bbox_anchors'],
) * cfg.MODEL.REG_LOSS_WEIGHT)
])
def forward(self, *args, **kwargs):
......
......@@ -13,9 +13,10 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from lib.core.config import cfg
from lib.modules import init
from lib.modules import nn
from seetadet.core.config import cfg
from seetadet.core.registry import backbones
from seetadet.modules import init
from seetadet.modules import nn
class VGG(nn.Module):
......@@ -41,14 +42,14 @@ class VGG(nn.Module):
if j == 0:
dim_in = filter_list[i]
if reduced:
# L2Norm is redundant from the observation
# We just keep a trainable scale
self.conv4_3_norm = nn.Affine(filter_list[3], bias=False)
self.conv4_3_norm.weight.zero_() # Zero-Init
self.conv4_3_norm = nn.L2Normalize(filter_list[3], init=20.)
self.fc6 = nn.Conv2d(
filter_list[-1], 1024,
kernel_size=3, padding=6,
stride=1, dilation=6,
in_channels=filter_list[-1],
out_channels=1024,
kernel_size=3,
padding=6,
stride=1,
dilation=6,
)
self.fc7 = nn.Conv1x1(1024, 1024, bias=True)
self.feature_dims = [filter_list[-2], 1024]
......@@ -142,14 +143,18 @@ class VGG(nn.Module):
else:
outputs.append(x)
if self.training:
# Hold the frozen outputs if necessary
self.last_outputs = outputs
return outputs
def make_vgg_16():
return VGG(([2, 2, 3, 3, 3], [64, 128, 256, 512, 512]))
def vgg_16(**kwargs):
return VGG(([2, 2, 3, 3, 3], [64, 128, 256, 512, 512]), **kwargs)
def make_vgg_16_reduced(scale=300):
def vgg_16_reduced(scale=300):
if scale == 300:
extra_arch = (
[2, 2, 1, 1],
......@@ -164,11 +169,9 @@ def make_vgg_16_reduced(scale=300):
)
else:
raise ValueError('Unsupported scale: {}'.format(scale))
return VGG(([2, 2, 3, 3, 3], [64, 128, 256, 512, 512]),
extra_arch=extra_arch, reduced=True)
def make_vgg_16_reduced_300(): return make_vgg_16_reduced(300)
return vgg_16(extra_arch=extra_arch, reduced=True)
def make_vgg_16_reduced_512(): return make_vgg_16_reduced(512)
backbones.register('vgg16', func=vgg_16)
backbones.register('vgg16_reduced_300', func=vgg_16_reduced, scale=300)
backbones.register('vgg16_reduced_512', func=vgg_16_reduced, scale=512)
......@@ -13,8 +13,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from lib.ssd.data_loader import DataLoader
from lib.ssd.hard_mining import HardMining
from lib.ssd.multibox import MultiBoxMatch
from lib.ssd.multibox import MultiBoxTarget
from lib.ssd.priorbox import PriorBox
import os
from seetadet.utils import env
env.load_library(os.path.join(os.path.dirname(__file__), '_C'))
......@@ -14,21 +14,40 @@ from __future__ import division
from __future__ import print_function
from dragon.vm.torch import nn
from dragon.vm.torch.autograd import function
from dragon.vm.torch.autograd.function import Function
from lib.core.config import cfg
from seetadet.core.config import cfg
class _RetinaNetDecoder(function.Function):
class _NonMaxSuppression(Function):
"""Filter out boxes that have high IoU with selected ones."""
def __init__(self, key, dev, **kwargs):
super(_NonMaxSuppression, self).__init__(key, dev, **kwargs)
self.iou_threshold = kwargs.get('iou_threshold', 0.5)
def attributes(self):
return {
'op_type': 'NonMaxSuppression',
'arguments': {'iou_threshold': self.iou_threshold}
}
def forward(self, dets):
return self.dispatch([dets], [self.alloc()])
class _RetinaNetDecoder(Function):
"""Decode predictions from RetinaNet."""
def __init__(self, key, dev, **kwargs):
super(_RetinaNetDecoder, self).__init__(key, dev, **kwargs)
self.args = kwargs
def register_operator(self):
def attributes(self):
return {
'op_type': 'Proposal',
'op_type': 'RetinaNetDecoder',
'arguments': {
'det_type': 'RETINANET',
'strides': self.args['strides'],
'ratios': self.args['ratios'],
'scales': self.args['scales'],
......@@ -39,20 +58,21 @@ class _RetinaNetDecoder(function.Function):
def forward(self, features, cls_prob, bbox_pred, ims_info):
inputs = features + [cls_prob, bbox_pred, ims_info]
self._unify_devices(inputs[:-1]) # Skip <ims_info>
return self.run(inputs, [self.alloc()], unify_devices=False)
self._check_device(inputs[:-1]) # Skip <ims_info>
return self.dispatch(inputs, [self.alloc()], check_device=False)
class _RPNDecoder(function.Function):
class _RPNDecoder(Function):
"""Decode proposal regions from RPN."""
def __init__(self, key, dev, **kwargs):
super(_RPNDecoder, self).__init__(key, dev, **kwargs)
self.args = kwargs
def register_operator(self):
def attributes(self):
return {
'op_type': 'Proposal',
'op_type': 'RPNDecoder',
'arguments': {
'det_type': 'RCNN',
'strides': self.args['strides'],
'ratios': self.args['ratios'],
'scales': self.args['scales'],
......@@ -69,9 +89,9 @@ class _RPNDecoder(function.Function):
def forward(self, features, cls_prob, bbox_pred, ims_info):
inputs = features + [cls_prob, bbox_pred, ims_info]
self._unify_devices(inputs[:-1]) # Skip <ims_info>
self._check_device(inputs[:-1]) # Skip <ims_info>
outputs = [self.alloc() for _ in range(self.args['K'])]
return self.run(inputs, outputs, unify_devices=False)
return self.dispatch(inputs, outputs, check_device=False)
def decode_retinanet(
......@@ -85,8 +105,8 @@ def decode_retinanet(
pre_nms_top_n,
score_thresh,
):
return function.get(
_RetinaNetDecoder,
return _RetinaNetDecoder \
.instantiate(
cls_prob.device,
strides=strides,
ratios=ratios,
......@@ -114,8 +134,8 @@ def decode_rpn(
canonical_scale,
canonical_level,
):
return function.get(
_RPNDecoder,
return _RPNDecoder \
.instantiate(
cls_prob.device,
K=num_outputs,
strides=strides,
......@@ -132,8 +152,16 @@ def decode_rpn(
).apply(features, cls_prob, bbox_pred, ims_info)
def nms(dets, iou_threshold=0.5):
return _NonMaxSuppression \
.instantiate(
dets.device,
iou_threshold=iou_threshold,
).apply(dets)
class RetinaNetDecoder(nn.Module):
"""Generate pred regions from retinanet."""
"""Decode predictions from retinanet."""
def __init__(self):
super(RetinaNetDecoder, self).__init__()
......@@ -154,7 +182,7 @@ class RetinaNetDecoder(nn.Module):
ratios=[float(e) for e in cfg.RETINANET.ASPECT_RATIOS],
scales=self.scales,
pre_nms_top_n=cfg.RETINANET.PRE_NMS_TOP_N,
score_thresh=cfg.TEST.SCORE_THRESH,
score_thresh=float(cfg.TEST.SCORE_THRESH),
)
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, see
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Define some basic structures."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon
from dragon.vm import torch
from dragon.vm.torch import nn
from dragon.vm.torch.nn import functional
from seetadet.core.config import cfg
class FrozenAffine(object):
"""Affine transformation with weight and bias fixed."""
def __new__(cls, dim_in, bias=True, inplace=True):
return nn.Affine(
num_features=dim_in,
fix_weight=True,
fix_bias=True,
inplace=inplace,
)
class Conv1x1(object):
"""1x1 convolution."""
def __new__(cls, dim_in, dim_out, stride=1, bias=False):
return nn.Conv2d(
in_channels=dim_in,
out_channels=dim_out,
kernel_size=1,
stride=stride,
bias=bias,
)
class Conv3x3(object):
"""3x3 convolution."""
def __new__(cls, dim_in, dim_out, stride=1, dilation=1, bias=False):
return nn.Conv2d(
in_channels=dim_in,
out_channels=dim_out,
kernel_size=3,
stride=stride,
padding=1 * dilation,
bias=bias,
)
class CrossEntropyLoss(object):
"""Cross entropy loss."""
def __new__(cls):
return nn.CrossEntropyLoss(ignore_index=-1)
class IoULoss(nn.Module):
def __init__(self, reduction='mean', delta_weights=None):
super(IoULoss, self).__init__()
self.data = {} # Store the detached tensors
self.reduction = reduction
self.delta_weights = delta_weights
def transform_inv(self, boxes, deltas, name=None):
widths = boxes[:, 2] - boxes[:, 0]
heights = boxes[:, 3] - boxes[:, 1]
ctr_x = boxes[:, 0] + 0.5 * widths
ctr_y = boxes[:, 1] + 0.5 * heights
if name is not None:
self.data[name + '/widths'] = widths
self.data[name + '/heights'] = heights
dx, dy, dw, dh = torch.chunk(deltas, chunks=4, dim=1)
if self.delta_weights is not None:
wx, wy, ww, wh = self.delta_weights
dx, dy, dw, dh = dx / wx, dy / wy, dw / ww, dh / wh
pred_ctr_x = dx * widths + ctr_x
pred_ctr_y = dy * heights + ctr_y
pred_w = torch.exp(dw) * widths
pred_h = torch.exp(dh) * heights
x1 = pred_ctr_x - 0.5 * pred_w
y1 = pred_ctr_y - 0.5 * pred_h
x2 = pred_ctr_x + 0.5 * pred_w
y2 = pred_ctr_y + 0.5 * pred_h
return x1, y1, x2, y2
def forward_impl(self, input, target, anchor):
x1, y1, x2, y2 = self.transform_inv(
anchor, input, name='logits')
self.x1, self.y1, self.x2, self.y2 = \
self.transform_inv(anchor, target)
# Compute the independent area
pred_area = (x2 - x1) * (y2 - y1)
target_area = (self.x2 - self.x1) * (self.y2 - self.y1)
# Compute the intersecting area
x1_inter = torch.maximum(x1, self.x1)
y1_inter = torch.maximum(y1, self.y1)
x2_inter = torch.minimum(x2, self.x2)
y2_inter = torch.minimum(y2, self.y2)
w_inter = torch.clamp(x2_inter - x1_inter, min=0)
h_inter = torch.clamp(y2_inter - y1_inter, min=0)
area_inter = w_inter * h_inter
# Compute the enclosing area
x1_enc = torch.minimum(x1, self.x1)
y1_enc = torch.minimum(y1, self.y1)
x2_enc = torch.maximum(x2, self.x2)
y2_enc = torch.maximum(y2, self.y2)
area_enc = (x2_enc - x1_enc) * (y2_enc - y1_enc) + 1.
# Compute the differentiable IoU metric
area_union = pred_area + target_area - area_inter
iou = area_inter / (area_union + 1.)
iou_metric = iou - (area_enc - area_union) / area_enc # GIoU
# Compute the reduced loss
if self.reduction == 'sum':
return (1 - iou_metric).sum()
else:
return (1 - iou_metric).mean()
def forward(self, *inputs, **kwargs):
# Enter a new detaching scope
with dragon.eager_scope('${IOU}'):
return self.forward_impl(*inputs, **kwargs)
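For intuition, here is a minimal NumPy sketch of the same GIoU metric on a single pair of corner-format boxes; the `giou` helper is illustrative only and omits the `+ 1.` smoothing terms used above:

```python
import numpy as np

def giou(b1, b2):
    """GIoU between two [x1, y1, x2, y2] boxes."""
    area1 = (b1[2] - b1[0]) * (b1[3] - b1[1])
    area2 = (b2[2] - b2[0]) * (b2[3] - b2[1])
    # Intersecting area
    iw = max(0., min(b1[2], b2[2]) - max(b1[0], b2[0]))
    ih = max(0., min(b1[3], b2[3]) - max(b1[1], b2[1]))
    inter, union = iw * ih, area1 + area2 - iw * ih
    # Smallest enclosing area
    enc = ((max(b1[2], b2[2]) - min(b1[0], b2[0])) *
           (max(b1[3], b2[3]) - min(b1[1], b2[1])))
    return inter / union - (enc - union) / enc

b = np.array([0., 0., 2., 2.])
print(1. - giou(b, b))                           # perfect overlap -> loss 0.0
print(1. - giou(b, np.array([4., 0., 6., 2.])))  # disjoint boxes -> loss > 1
```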
class Identity(nn.Module):
"""Pass input to the output."""
def __init__(self, *args, **kwargs):
super(Identity, self).__init__()
_, _ = args, kwargs
def forward(self, x):
return x
class L2Normalize(nn.Module):
"""Normalize the input using L2 norm."""
def __init__(self, num_features, init=20.):
super(L2Normalize, self).__init__()
self.weight = nn.Parameter(torch.Tensor(num_features).fill_(init))
def forward(self, input):
out = functional.normalize(input, p=2, dim=1, eps=1e-5)
out = functional.affine(out, self.weight)
return out
class ReLU(object):
"""The generic ReLU activation."""
def __new__(cls, inplace=False):
return getattr(torch.nn, cfg.MODEL.RELU_VARIANT)(inplace)
class SigmoidFocalLoss(object):
"""Sigmoid focal loss."""
def __new__(cls):
return nn.SigmoidFocalLoss(
alpha=cfg.MODEL.FOCAL_LOSS_ALPHA,
gamma=cfg.MODEL.FOCAL_LOSS_GAMMA,
)
class SmoothL1Loss(nn.Module):
"""Smoothed l1 loss."""
def __init__(self, beta=1., reduction='batch_size'):
super(SmoothL1Loss, self).__init__()
self.beta = beta
self.reduction = reduction
def forward(self, input, target, *args):
return functional.smooth_l1_loss(
input, target,
beta=self.beta,
reduction=self.reduction,
)
Affine = nn.Affine
AvgPool2d = nn.AvgPool2d
BatchNorm2d = nn.BatchNorm2d
BCEWithLogitsLoss = nn.BCEWithLogitsLoss
Conv2d = nn.Conv2d
ConvTranspose2d = nn.ConvTranspose2d
DepthwiseConv2d = nn.DepthwiseConv2d
Linear = nn.Linear
MaxPool2d = nn.MaxPool2d
Module = nn.Module
ModuleList = nn.ModuleList
Sequential = nn.Sequential
Sigmoid = nn.Sigmoid
Softmax = nn.Softmax
......@@ -13,9 +13,11 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import dragon.vm.torch as torch
from lib.core.config import cfg
from seetadet.core.config import cfg
def roi_align(input, boxes, spatial_scale, size):
......@@ -35,12 +37,18 @@ def roi_pool(input, boxes, spatial_scale, size):
class Bootstrap(torch.nn.Module):
"""Extended operator to process the images."""
"""Process the input to match the computation."""
def __init__(self):
super(Bootstrap, self).__init__()
self.dtype = cfg.MODEL.PRECISION.lower()
self.mean_values = cfg.PIXEL_MEANS
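# Fuse the NHWC -> NCHW transpose, mean subtraction,
# and precision cast into a single normalization op.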
self.normalize_func = functools.partial(
torch.channel_normalize,
mean=cfg.PIXEL_MEANS,
std=[1., 1., 1.],
dim=1,
dims=(0, 3, 1, 2),
dtype=cfg.MODEL.PRECISION.lower(),
)
self.dummy_buffer = torch.ones(1)
def _apply(self, fn):
......@@ -57,12 +65,13 @@ class Bootstrap(torch.nn.Module):
return self.dummy_buffer.device
def forward(self, input):
if isinstance(input, torch.Tensor):
if input.size(1) <= 3:
return input
cur_device = self.device()
if input._device != cur_device:
if cur_device.type == 'cpu':
input = input.cpu()
else:
input = input.cuda(cur_device.index)
return torch.vision.ops.image_data(
input, self.dtype, self.mean_values,
)
return self.normalize_func(input)
......@@ -13,5 +13,4 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from lib.faster_rcnn.data_loader import DataLoader
from lib.retinanet.anchor_target import AnchorTarget
from seetadet.onnx import nodes as _
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, see
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.onnx import exporter
from dragon.vm.onnx import helper
@exporter.register('RetinaNetDecoder')
def retinanet_decoder_exporter(op_def, shape_dict, ws):
node, const_tensors = exporter.translate(**locals())
node.op_type = 'ATen' # Currently not supported in ai.onnx
helper.add_attribute(node, 'op_type', 'RetinaNetDecoder')
for arg in op_def.arg:
if arg.name == 'strides':
helper.add_attribute(node, 'strides', arg.ints)
elif arg.name == 'ratios':
helper.add_attribute(node, 'ratios', arg.floats)
elif arg.name == 'scales':
helper.add_attribute(node, 'scales', arg.floats)
elif arg.name == 'pre_nms_top_n':
helper.add_attribute(node, 'pre_nms_top_n', arg.i)
elif arg.name == 'score_thresh':
helper.add_attribute(node, 'score_thresh', arg.f)
return node, const_tensors
@exporter.register('RPNDecoder')
def rpn_decoder_exporter(op_def, shape_dict, ws):
node, const_tensors = exporter.translate(**locals())
node.op_type = 'ATen' # Currently not supported in ai.onnx
helper.add_attribute(node, 'op_type', 'RPNDecoder')
for arg in op_def.arg:
if arg.name == 'strides':
helper.add_attribute(node, 'strides', arg.ints)
elif arg.name == 'ratios':
helper.add_attribute(node, 'ratios', arg.floats)
elif arg.name == 'scales':
helper.add_attribute(node, 'scales', arg.floats)
elif arg.name == 'pre_nms_top_n':
helper.add_attribute(node, 'pre_nms_top_n', arg.i)
elif arg.name == 'post_nms_top_n':
helper.add_attribute(node, 'post_nms_top_n', arg.i)
elif arg.name == 'nms_thresh':
helper.add_attribute(node, 'nms_thresh', arg.f)
elif arg.name == 'min_size':
helper.add_attribute(node, 'min_size', arg.i)
elif arg.name == 'min_level':
helper.add_attribute(node, 'min_level', arg.i)
elif arg.name == 'max_level':
helper.add_attribute(node, 'max_level', arg.i)
elif arg.name == 'canonical_scale':
helper.add_attribute(node, 'canonical_scale', arg.i)
elif arg.name == 'canonical_level':
helper.add_attribute(node, 'canonical_level', arg.i)
return node, const_tensors
__author__ = 'tsungyi'
import lib.pycocotools._mask as _mask
import seetadet.pycocotools._mask as _mask
# Interface for manipulating masks stored in RLE format.
#
......
......@@ -15,8 +15,8 @@ from __future__ import print_function
import numpy as np
from lib.pycocotools import mask as mask_tools
from lib.pycocotools.mask import frPyObjects
from seetadet.pycocotools import mask as mask_tools
from seetadet.pycocotools.mask import frPyObjects
def poly2rle(poly, height, width):
......
......@@ -15,7 +15,7 @@ from __future__ import print_function
import math
from lib.core.config import cfg
from seetadet.core.config import cfg
class _LRScheduler(object):
......
......@@ -15,11 +15,11 @@ from __future__ import print_function
import dragon.vm.torch as torch
from lib.core.config import cfg
from lib.modeling.detector import Detector
from lib.solver import lr_scheduler
from lib.utils import framework
from lib.utils import time_util
from seetadet.core.config import cfg
from seetadet.modeling.detector import Detector
from seetadet.solver import lr_scheduler
from seetadet.utils import env
from seetadet.utils import time_util
class SGDSolver(object):
......@@ -28,7 +28,7 @@ class SGDSolver(object):
self.detector = Detector()
# Define the optimizer and its arguments
self.optimizer = torch.optim.SGD(
framework.get_param_groups(self.detector),
env.get_param_groups(self.detector),
lr=cfg.SOLVER.BASE_LR,
momentum=cfg.SOLVER.MOMENTUM,
weight_decay=cfg.SOLVER.WEIGHT_DECAY,
......
......@@ -20,9 +20,9 @@ from __future__ import print_function
import numpy as np
import dragon.vm.torch as torch
from lib.core.config import cfg
from lib.utils.image import distort_image
from lib.utils.image import resize_image
from seetadet.core.config import cfg
from seetadet.utils.image import distort_image
from seetadet.utils.image import resize_image
def im_list_to_blob(ims):
......@@ -56,7 +56,7 @@ def mask_list_to_blob(masks):
max_shape = np.array([mask.shape[1:] for mask in masks]).max(axis=0)
num_masks = np.array([mask.shape[0] for mask in masks]).sum()
blob_shape = ((num_masks, max_shape[0], max_shape[1]))
blob_shape = (num_masks, max_shape[0], max_shape[1])
blob = np.zeros(blob_shape, 'uint8')
count = 0
......@@ -89,9 +89,8 @@ def prep_im_for_blob(img, target_size, max_size):
im_size_max = np.max(im_shape[:2])
im_scale = float(target_size) / float(im_size_max)
if cfg.TRAIN.USE_SCALE_JITTER:
r = cfg.TRAIN.SCALE_JITTER_RANGE
r = cfg.TRAIN.RANDOM_SCALES
jitter = r[0] + np.random.rand() * (r[1] - r[0])
im_scale *= jitter
return resize_image(img, im_scale, im_scale), im_scale, jitter
return resize_image(img, im_scale, im_scale), im_scale
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, see
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# Codes are based on:
#
# ------------------------------------------------------------
"""Box utilities for original coordinates."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from seetadet.utils import cython_bbox
def bbox_overlaps(boxes1, boxes2):
"""Compute the overlaps between two group of boxes."""
return cython_bbox.bbox_overlaps(
np.ascontiguousarray(boxes1, dtype=np.float),
np.ascontiguousarray(boxes2, dtype=np.float),
)
def bbox_transform(ex_rois, gt_rois, weights=(1., 1., 1., 1.)):
"""Transform the boxes to the regression targets."""
ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.
ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.
ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths
ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights
gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.
gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.
gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths
gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights
wx, wy, ww, wh = weights
targets = [wx * (gt_ctr_x - ex_ctr_x) / ex_widths]
targets += [wy * (gt_ctr_y - ex_ctr_y) / ex_heights]
targets += [ww * np.log(gt_widths / ex_widths)]
targets += [wh * np.log(gt_heights / ex_heights)]
return np.vstack(targets).transpose()
def bbox_centerness(ex_rois, gt_rois):
"""Compute centerness of the boxes to ground-truth."""
ex_ctr_x = (ex_rois[:, 2] + ex_rois[:, 0]) / 2
ex_ctr_y = (ex_rois[:, 3] + ex_rois[:, 1]) / 2
l = ex_ctr_x - gt_rois[:, 0]
t = ex_ctr_y - gt_rois[:, 1]
r = gt_rois[:, 2] - ex_ctr_x
b = gt_rois[:, 3] - ex_ctr_y
centerness = \
(np.minimum(l, r) / np.maximum(l, r)) * \
(np.minimum(t, b) / np.maximum(t, b))
min_dist = np.stack([l, t, r, b], axis=1).min(axis=1)
keep_inds = np.where(min_dist > 0.01)[0]
discard_inds = np.where(min_dist <= 0.01)[0]
centerness[keep_inds] = np.sqrt(centerness[keep_inds])
centerness[discard_inds] = -1
return centerness, keep_inds, discard_inds
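This is the FCOS-style centerness target. In math form, with (l, t, r, b) the distances from an anchor center to the left, top, right, and bottom edges of its matched ground-truth box:

$$\mathrm{centerness} = \sqrt{\frac{\min(l, r)}{\max(l, r)} \cdot \frac{\min(t, b)}{\max(t, b)}}$$

Centers deep inside the box score near 1, points close to an edge approach 0, and anchors whose minimum side distance falls below 0.01 are marked -1 and discarded by the caller.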
def bbox_transform_inv(boxes, deltas, weights=(1., 1., 1., 1.)):
"""Decode the final boxes according to the deltas."""
if boxes.shape[0] == 0:
return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype)
boxes = boxes.astype(deltas.dtype, copy=False)
widths = boxes[:, 2] - boxes[:, 0] + 1.
heights = boxes[:, 3] - boxes[:, 1] + 1.
ctr_x = boxes[:, 0] + 0.5 * widths
ctr_y = boxes[:, 1] + 0.5 * heights
wx, wy, ww, wh = weights
dx = deltas[:, 0::4] / wx
dy = deltas[:, 1::4] / wy
dw = deltas[:, 2::4] / ww
dh = deltas[:, 3::4] / wh
pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis]
pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis]
pred_w = np.exp(dw) * widths[:, np.newaxis]
pred_h = np.exp(dh) * heights[:, np.newaxis]
pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype)
pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w # x1
pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h # y1
pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w - 1 # x2
pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h - 1 # y2
return pred_boxes
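As a quick sanity check, the two transforms above are exact inverses under the same weights; a minimal sketch with toy boxes and default weights:

```python
import numpy as np
from seetadet.utils.boxes import bbox_transform, bbox_transform_inv

anchors = np.array([[10., 10., 50., 30.]])
gt = np.array([[12., 8., 60., 36.]])

deltas = bbox_transform(anchors, gt)           # encode boxes as regression targets
decoded = bbox_transform_inv(anchors, deltas)  # decode the targets back into boxes
assert np.allclose(decoded, gt)                # exact round trip
```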
def clip_boxes(boxes, im_shape):
# x1 >= 0
boxes[:, 0] = np.maximum(np.minimum(boxes[:, 0], im_shape[1] - 1), 0)
# y1 >= 0
boxes[:, 1] = np.maximum(np.minimum(boxes[:, 1], im_shape[0] - 1), 0)
# x2 < im_shape[1]
boxes[:, 2] = np.maximum(np.minimum(boxes[:, 2], im_shape[1] - 1), 0)
# y2 < im_shape[0]
boxes[:, 3] = np.maximum(np.minimum(boxes[:, 3], im_shape[0] - 1), 0)
return boxes
def clip_tiled_boxes(boxes, im_shape):
# x1 >= 0
boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0)
# y1 >= 0
boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0)
# x2 < im_shape[1]
boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0)
# y2 < im_shape[0]
boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0)
return boxes
def dismantle_boxes(gt_boxes, num_images):
"""Dismantle the packed ground-truth boxes."""
return [
gt_boxes[
np.where(gt_boxes[:, -1].astype(np.int32) == i)[0]
][:, :-1] for i in range(num_images)
]
def expand_boxes(boxes, scale):
"""Expand an array of boxes by a given scale."""
w_half = (boxes[:, 2] - boxes[:, 0]) * .5
h_half = (boxes[:, 3] - boxes[:, 1]) * .5
x_c = (boxes[:, 2] + boxes[:, 0]) * .5
y_c = (boxes[:, 3] + boxes[:, 1]) * .5
w_half *= scale
h_half *= scale
boxes_exp = np.zeros(boxes.shape)
boxes_exp[:, 0] = x_c - w_half
boxes_exp[:, 2] = x_c + w_half
boxes_exp[:, 1] = y_c - h_half
boxes_exp[:, 3] = y_c + h_half
return boxes_exp
def flip_boxes(boxes, width):
"""Flip the boxes horizontally."""
boxes_flipped = boxes.copy()
boxes_flipped[:, 0] = width - boxes[:, 2] - 1
boxes_flipped[:, 2] = width - boxes[:, 0] - 1
return boxes_flipped
def filter_boxes(boxes, min_size):
"""Remove all boxes with any side smaller than min size."""
ws = boxes[:, 2] - boxes[:, 0] + 1
hs = boxes[:, 3] - boxes[:, 1] + 1
keep = np.where((ws >= min_size) & (hs >= min_size))[0]
return keep
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, see
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# Codes are based on:
#
# ------------------------------------------------------------
"""Box utilities for normalized coordinates."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
def boxes_area(boxes):
"""Compute the area of an array of boxes."""
w = (boxes[:, 2] - boxes[:, 0])
h = (boxes[:, 3] - boxes[:, 1])
area = w * h
assert np.all(area >= 0), 'Negative areas found'
return area
def intersection(boxes1, boxes2):
"""Compute pairwise intersection areas between boxes."""
[y_min1, x_min1, y_max1, x_max1] = np.split(boxes1, 4, axis=1)
[y_min2, x_min2, y_max2, x_max2] = np.split(boxes2, 4, axis=1)
all_pairs_min_ymax = np.minimum(y_max1, np.transpose(y_max2))
all_pairs_max_ymin = np.maximum(y_min1, np.transpose(y_min2))
all_pairs_min_xmax = np.minimum(x_max1, np.transpose(x_max2))
all_pairs_max_xmin = np.maximum(x_min1, np.transpose(x_min2))
inter_heights = np.maximum(
np.zeros(all_pairs_max_ymin.shape),
all_pairs_min_ymax - all_pairs_max_ymin
)
inter_widths = np.maximum(
np.zeros(all_pairs_max_xmin.shape),
all_pairs_min_xmax - all_pairs_max_xmin
)
return inter_heights * inter_widths
def ioa1(boxes1, boxes2):
"""Computes pairwise intersection-over-area between box collections."""
inter = intersection(boxes1, boxes2)
area = np.expand_dims(boxes_area(boxes1), axis=1)
return inter / area
def ioa2(boxes1, boxes2):
"""Computes pairwise intersection-over-area between box collections."""
inter = intersection(boxes1, boxes2)
area = np.expand_dims(boxes_area(boxes2), axis=0)
return inter / area
def iou(boxes1, boxes2):
"""Computes pairwise intersection-over-union between box collections."""
inter = intersection(boxes1, boxes2)
area1 = boxes_area(boxes1)
area2 = boxes_area(boxes2)
union = np.expand_dims(area1, axis=1) + \
np.expand_dims(area2, axis=0) - inter
return inter / union
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, see
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import importlib.machinery
import os
import dragon
from dragon.core.util import six
from dragon.vm import torch
import numpy as np
from seetadet.core.config import cfg
def get_param_groups(module):
"""Separate parameters according to weight decay.
Parameters
----------
module : dragon.vm.torch.nn.Module
The module to collect parameters.
Returns
-------
Sequence[ParamGroup]
The parameter groups.
"""
param_groups = [
{'params': []}, # Decayed always
{'params': [], 'weight_decay': -1.}
]
for name, param in module.named_parameters():
gi = 0 if 'weight' in name and param.dim() > 1 else 1
param_groups[gi]['params'].append(param)
if len(param_groups[1]['params']) == 0:
param_groups.pop() # Remove empty group
return param_groups
def load_library(library_prefix):
"""Load a shared library.
Parameters
----------
library_prefix : str
The prefix of library.
"""
loader_details = (
importlib.machinery.ExtensionFileLoader,
importlib.machinery.EXTENSION_SUFFIXES
)
library_prefix = os.path.abspath(library_prefix)
lib_dir, fullname = os.path.split(library_prefix)
finder = importlib.machinery.FileFinder(lib_dir, loader_details)
ext_specs = finder.find_spec(fullname)
if ext_specs is None:
raise ImportError(
'Could not find the pre-built library '
'for <%s>.' % library_prefix
)
dragon.load_library(ext_specs.origin)
def new_tensor(data, enforce_cpu=False):
"""Create a new tensor from the data.
Parameters
----------
data : array_like
The data value.
enforce_cpu : bool, optional, default=False
**True** to enforce the cpu storage.
Returns
-------
dragon.vm.torch.Tensor
The tensor taken with the data.
"""
if data is None:
return data
if isinstance(data, np.ndarray):
tensor = torch.from_numpy(data)
elif isinstance(data, torch.Tensor):
tensor = data
else:
tensor = torch.tensor(data)
if not enforce_cpu:
tensor = tensor.cuda(cfg.GPU_ID)
return tensor
# Aliases
pickle = six.moves.pickle
......@@ -18,7 +18,7 @@ import numpy as np
import PIL.Image
import PIL.ImageEnhance
from lib.core.config import cfg
from seetadet.core.config import cfg
def distort_image(img):
......@@ -28,7 +28,6 @@ def distort_image(img):
PIL.ImageEnhance.Contrast,
PIL.ImageEnhance.Color,
]
np.random.shuffle(transforms)
for transform in transforms:
if np.random.uniform() < 0.5:
img = transform(img)
......@@ -62,7 +61,7 @@ def get_image_with_target_size(target_size, img):
)
def resize_image(img, fx, fy):
def resize_image(img, fx=1, fy=1):
return cv2.resize(
img,
dsize=None,
......@@ -79,7 +78,6 @@ def scale_image(img):
im_size_max = np.max(img.shape[:2])
for target_size in cfg.TEST.SCALES:
im_scale = float(target_size) / float(im_size_min)
# Prevent the biggest axis from being more than MAX_SIZE
if np.round(im_scale * im_size_max) > cfg.TEST.MAX_SIZE:
im_scale = float(cfg.TEST.MAX_SIZE) / float(im_size_max)
processed_ims.append(
......@@ -91,17 +89,16 @@ def scale_image(img):
))
ims_scales.append(im_scale)
else:
# Scale image along the longest side
im_size_max = np.max(img.shape[:2])
# Scale image into a square
for target_size in cfg.TEST.SCALES:
im_scale = float(target_size) / float(im_size_max)
im_scale_h = float(target_size) / img.shape[0]
im_scale_w = float(target_size) / img.shape[1]
processed_ims.append(
cv2.resize(
img,
dsize=None,
fx=im_scale, fy=im_scale,
dsize=(target_size, target_size),
interpolation=cv2.INTER_LINEAR,
))
ims_scales.append(im_scale)
ims_scales.append([im_scale_h, im_scale_w])
return processed_ims, ims_scales
......@@ -42,7 +42,7 @@ def get_logger():
if _logger:
return _logger
logger = _logging.getLogger('detectron')
logger = _logging.getLogger('SeetaDet')
logger.setLevel('INFO')
logger.propagate = False
......
......@@ -21,7 +21,7 @@ import cv2
import numpy as np
import PIL.Image
from lib.utils import boxes as box_util
from seetadet.utils import boxes as box_util
def dismantle_masks(gt_boxes, gt_masks, num_images):
......
......@@ -17,44 +17,53 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from lib.core.config import cfg
from seetadet.modules import det
from seetadet.utils import env
try:
from lib.nms.cpu_nms import cpu_nms, cpu_soft_nms
except ImportError as e:
print('Failed to import cpu nms. Error: {0}'.format(str(e)))
from seetadet.utils.cython_nms import cpu_nms
from seetadet.utils.cython_nms import cpu_soft_nms
except ImportError:
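# Use ``print`` as a cheap sentinel; nms() and soft_nms()
# below detect it and raise a deferred ImportError.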
cpu_nms = cpu_soft_nms = print
try:
from lib.nms.gpu_nms import gpu_nms
except ImportError as e:
print('Failed to import gpu nms. Error: {0}'.format(str(e)))
def gpu_nms(dets, thresh):
"""Filter out the detections using GPU-NMS."""
if dets.shape[0] == 0:
return []
scores = dets[:, 4]
order = scores.argsort()[::-1]
sorted_dets = env.new_tensor(dets[order, :])
keep = det.nms(sorted_dets, iou_threshold=thresh).numpy()
return order[keep]
def nms(detections, thresh, force_cpu=False):
"""Perform either CPU or GPU Hard-NMS."""
if detections.shape[0] == 0:
def nms(dets, thresh):
"""Filter out the detections using NMS."""
if dets.shape[0] == 0:
return []
if cfg.USE_GPU_NMS and not force_cpu:
return gpu_nms(detections, thresh, device_id=cfg.GPU_ID)
else:
return cpu_nms(detections, thresh)
if cpu_nms is print:
raise ImportError('Failed to load <cython_nms> library.')
return cpu_nms(dets, thresh)
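For example, assuming the standard hard-NMS behavior of the compiled cython_nms routine:

```python
import numpy as np

# Each row is [x1, y1, x2, y2, score], in descending score order.
dets = np.array([[10., 10., 50., 50., 0.9],
                 [12., 12., 52., 52., 0.8],    # near-duplicate of the first box
                 [100., 100., 150., 150., 0.7]], dtype=np.float32)
keep = nms(dets, thresh=0.5)  # expected: [0, 2], the duplicate is suppressed
```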
def soft_nms(
detections,
dets,
thresh,
method='linear',
sigma=0.5,
score_thresh=0.001,
):
"""Perform CPU Soft-NMS."""
if detections.shape[0] == 0:
"""Filter out the detections using Soft-NMS."""
if dets.shape[0] == 0:
return []
if cpu_soft_nms is print:
raise ImportError('Failed to load <cython_nms> library.')
methods = {'hard': 0, 'linear': 1, 'gaussian': 2}
if method not in methods:
raise ValueError('Unknown soft nms method:', method)
return cpu_soft_nms(
detections,
dets,
thresh,
methods[method],
sigma,
......
......@@ -30,8 +30,8 @@ import matplotlib.pyplot as plt
from matplotlib.patches import Polygon
import numpy as np
from lib.utils.colormap import colormap
from lib.utils.boxes import expand_boxes
from seetadet.utils.colormap import colormap
from seetadet.utils.boxes import expand_boxes
plt.rcParams['pdf.fonttype'] = 42 # For editing in Adobe Illustrator
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, see
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import shutil
import setuptools
import setuptools.command.install
import sys
import subprocess
def clean():
"""Remove the work directories."""
if os.path.exists('build'):
shutil.rmtree('build')
if os.path.exists('seeta_det.egg-info'):
shutil.rmtree('seeta_det.egg-info')
def configure():
"""Prepare the package files."""
# Compile cxx sources
py_exec = sys.executable
if subprocess.call(
'cd csrc/cxx && '
'{} setup.py build_ext -b ../ --no-python-abi-suffix=0 -j 4 &&'
'{} setup.py clean'.format(py_exec, py_exec), shell=True
) > 0:
raise RuntimeError('Failed to build the cxx sources.')
# Compile pyx sources
if subprocess.call(
'cd csrc/pyx && '
'{} setup.py build_ext -b ../ --cython-c-in-temp -j 4 &&'
'{} setup.py clean'.format(py_exec, py_exec), shell=True,
) > 0:
raise RuntimeError('Failed to build the pyx sources.')
# Copy the pre-built libraries
for root, _, files in os.walk('csrc/install'):
root = root[len('csrc/install/'):]
for file in files:
src = os.path.join(root, file)
dest = src.replace('lib', 'seetadet')
if os.path.exists(dest):
os.remove(dest)
shutil.copy(os.path.join('csrc/install', src), dest)
shutil.rmtree('csrc/install')
class install(setuptools.command.install.install):
"""Old-style command to prevent from installing egg."""
def run(self):
setuptools.command.install.install.run(self)
def find_packages():
"""Return the python sources installed to package."""
packages = []
for root, _, files in os.walk('seetadet'):
if os.path.exists(os.path.join(root, '__init__.py')):
packages.append(root)
return packages
def find_package_data():
"""Return the external data installed to package."""
libraries = []
for root, _, files in os.walk('seetadet'):
root = root[len('seetadet/'):]
for file in files:
if file.endswith('.so') or file.endswith('.pyd'):
libraries.append(os.path.join(root, file))
return libraries
configure()
setuptools.setup(
name='seeta-det',
version='0.4.0',
description='SeetaDet: A platform implementing popular object detection algorithms.',
url='https://gitlab.seetatech.com/seetaresearch/SeetaDet',
author='SeetaTech',
license='BSD 2-Clause',
packages=find_packages(),
package_data={'seetadet': find_package_data()},
package_dir={'seetadet': 'seetadet'},
cmdclass={'install': install},
install_requires=['opencv-python', 'Pillow'],
classifiers=[
'Development Status :: 5 - Production/Stable',
'Intended Audience :: Developers',
'Intended Audience :: Education',
'Intended Audience :: Science/Research',
'License :: OSI Approved :: BSD License',
'Programming Language :: C++',
'Programming Language :: Python',
'Topic :: Scientific/Engineering',
'Topic :: Scientific/Engineering :: Mathematics',
'Topic :: Scientific/Engineering :: Artificial Intelligence',
'Topic :: Software Development',
'Topic :: Software Development :: Libraries',
'Topic :: Software Development :: Libraries :: Python Modules',
],
)
clean()
......@@ -21,10 +21,11 @@ import argparse
import dragon.vm.torch as torch
import pprint
from lib.core.config import cfg
from lib.core.coordinator import Coordinator
from lib.modeling.detector import new_detector
from lib.utils import logger
from seetadet import onnx as _
from seetadet.core.config import cfg
from seetadet.core.coordinator import Coordinator
from seetadet.modeling.detector import new_detector
from seetadet.utils import logger
def parse_args():
......@@ -71,8 +72,8 @@ if __name__ == '__main__':
.format(coordinator.exports_dir()))
detector = new_detector(cfg.GPU_ID, checkpoint)
data = torch.zeros(*args.input_shape).byte()
ims_info = torch.zeros(args.input_shape[0], 3).float()
data = torch.zeros(*args.input_shape, dtype='uint8')
ims_info = torch.zeros(args.input_shape[0], 3, dtype='float32')
torch.onnx.export(
model=detector,
......
......@@ -21,11 +21,11 @@ import argparse
import dragon
import numpy
from lib.core.config import cfg
from lib.core.coordinator import Coordinator
from lib.core.train import train_net
from lib.datasets.factory import get_imdb
from lib.utils import logger
from seetadet.core.config import cfg
from seetadet.core.coordinator import Coordinator
from seetadet.core.train import train_net
from seetadet.datasets.factory import get_dataset
from seetadet.utils import logger
def parse_args():
......@@ -79,12 +79,12 @@ if __name__ == '__main__':
# Fix the random seed for reproducibility
numpy.random.seed(cfg.RNG_SEED)
dragon.config.set_random_seed(cfg.RNG_SEED)
dragon.random.set_seed(cfg.RNG_SEED)
# Inspect the database
database = get_imdb(cfg.TRAIN.DATABASE)
logger.info('Database({}): {} images will be used to train.'
.format(cfg.TRAIN.DATABASE, database.num_images))
# Inspect the dataset
dataset = get_dataset(cfg.TRAIN.DATASET)
logger.info('Dataset({}): {} images will be used to train.'
.format(cfg.TRAIN.DATASET, dataset.num_images))
# Ready to train the network
logger.info('Output will be saved to `{:s}`'
......
......@@ -20,12 +20,12 @@ sys.path.insert(0, '..')
import argparse
import pprint
from lib.core import test_engine
from lib.core.config import cfg
from lib.core.coordinator import Coordinator
from lib.core.test import TestServer
from lib.datasets.factory import get_imdb
from lib.utils import logger
from seetadet.core import test_engine
from seetadet.core.config import cfg
from seetadet.core.coordinator import Coordinator
from seetadet.core.test import TestServer
from seetadet.datasets.factory import get_dataset
from seetadet.utils import logger
def parse_args():
......@@ -81,11 +81,11 @@ if __name__ == '__main__':
if checkpoint is None:
raise RuntimeError('The checkpoint of global step {} does not exist.'.format(args.iter))
# Inspect the database
database = get_imdb(cfg.TEST.DATABASE)
# Inspect the dataset
dataset = get_dataset(cfg.TEST.DATASET)
cfg.TEST.PROTOCOL = 'dump' if args.dump else cfg.TEST.PROTOCOL
logger.info('Database({}): {} images will be used to test.'
.format(cfg.TEST.DATABASE, database.num_images))
logger.info('Dataset({}): {} images will be used to test.'
.format(cfg.TEST.DATASET, dataset.num_images))
# Ready to test the network
output_dir = coordinator.results_dir(checkpoint, args.output_dir)
......
......@@ -20,8 +20,8 @@ sys.path.insert(0, '..')
import argparse
import numpy
from lib.core.coordinator import Coordinator
from lib.utils import logger
from seetadet.core.coordinator import Coordinator
from seetadet.utils import logger
def parse_args():
......
......@@ -22,11 +22,11 @@ import dragon
import numpy
import pprint
from lib.core.config import cfg
from lib.core.coordinator import Coordinator
from lib.core.train import train_net
from lib.datasets.factory import get_imdb
from lib.utils import logger
from seetadet.core.config import cfg
from seetadet.core.coordinator import Coordinator
from seetadet.core.train import train_net
from seetadet.datasets.factory import get_dataset
from seetadet.utils import logger
def parse_args():
......@@ -59,7 +59,7 @@ def mpi_train(cfg_file, exp_dir):
"""
import subprocess
args = 'mpirun --allow-run-as-root -n {} '.format(cfg.NUM_GPUS)
args = 'mpirun --allow-run-as-root -n {} --bind-to none '.format(cfg.NUM_GPUS)
args += '{} {} '.format(sys.executable, 'mpi_train.py')
args += '--cfg {} --exp_dir {} '.format(osp.abspath(cfg_file), exp_dir)
return subprocess.call(args, shell=True)
......@@ -84,12 +84,12 @@ if __name__ == '__main__':
# Fix the random seed for reproducibility
numpy.random.seed(cfg.RNG_SEED)
dragon.config.set_random_seed(cfg.RNG_SEED)
dragon.random.set_seed(cfg.RNG_SEED)
# Inspect the database
database = get_imdb(cfg.TRAIN.DATABASE)
logger.info('Database({}): {} images will be used to train.'
.format(cfg.TRAIN.DATABASE, database.num_images))
# Inspect the dataset
dataset = get_dataset(cfg.TRAIN.DATASET)
logger.info('Dataset({}): {} images will be used to train.'
.format(cfg.TRAIN.DATASET, dataset.num_images))
# Ready to train the network
logger.info('Output will be saved to `{:s}`'
......