Commit f4ecc7c7 by Ting PAN

Change the code structure

1 parent d3ed62db
Showing with 4844 additions and 3750 deletions
------------------------------------------------------------------------
The list of most significant changes made over time in SeetaDet.
SeetaDet 0.4.0 (20200408)
Dragon Minimum Required (Version 0.3.0.dev20200408)
Changes:
Preview Features:
- Optimize the code structure.
- DALI support for SSD, RetinaNet, and Faster-RCNN.
- Use KPLRecord instead of SeetaRecord.
Bugs fixed:
- Fix the frozen Affine issue.
------------------------------------------------------------------------
SeetaDet 0.3.0 (20191121)
Dragon Minimum Required (Version 0.3.0.dev20191121)
......
@@ -2,8 +2,8 @@
## WHAT's SeetaDet?
SeetaDet is a platform implementing popular object detection algorithms,
including R-CNN series, SSD, and RetinaNet.
We have achieved the same or higher performance than the baselines reported in the original papers.
@@ -14,22 +14,33 @@ The torch-style codes help us to simplify the hierarchical pipeline of modern de
## Requirements
seeta-dragon >= 0.3.0.dev20200408
## Installation
#### Build From Source
If you prefer to develop modules as well as run experiments,
the following command builds the package without installing it into ***site-packages***:
```bash
cd SeetaDet && python setup.py build
```
#### Install From Source
Clone this repository to local disk and install:
```bash
cd SeetaDet && python setup.py install
```
#### Install From Git
You can also install it from the remote repository:
```bash
pip install git+https://gitlab.seetatech.com/seetaresearch/SeetaDet.git@master
```
## Quick Start
@@ -37,7 +48,7 @@ bash ./make.sh
#### Train a detection model
```bash
cd tools
python train.py --cfg <MODEL_YAML>
```
@@ -46,20 +57,20 @@ We have provided the default YAML examples into ``SeetaDet/configs``.
#### Test a detection model
```bash
cd tools
python test.py --cfg <MODEL_YAML> --exp_dir <EXP_DIR> --iter <ITERATION>
```
Or
```bash
cd tools
python test_all.py --cfg <MODEL_YAML> --exp_dir <EXP_DIR>
```
#### Export a detection model to ONNX
```bash
cd tools
python export.py --cfg <MODEL_YAML> --exp_dir <EXP_DIR> --iter <ITERATION>
```
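The exported file can be sanity-checked with any ONNX runtime. A minimal sketch, assuming the `onnxruntime` package and an exported file named `detector.onnx` (both are placeholders, not part of this repository):
```python
import numpy as np
import onnxruntime as ort  # assumed installed; not a SeetaDet dependency

sess = ort.InferenceSession('detector.onnx')  # hypothetical export path
feed = {sess.get_inputs()[0].name:
        np.zeros((1, 3, 800, 1333), dtype='float32')}  # dummy image batch
outputs = sess.run(None, feed)
print([o.shape for o in outputs])  # inspect the exported output tensors
```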
......
PROJECT(gpu_nms)
CMAKE_MINIMUM_REQUIRED(VERSION 3.0.2)
# ---------------- User Config ----------------
# Set your python "interpreter" if necessary
# if not, a default interpreter will be used
# here, provide several examples:
# set(PYTHON_EXECUTABLE /usr/bin/python) # Linux & OSX, Builtin Python
# set(PYTHON_EXECUTABLE /X/anaconda/bin/python) # Linux & OSX, Anaconda
# set(PYTHON_EXECUTABLE X:/Anaconda/python) # Win, Anaconda
# Set CUDA compiling architecture
# Remove "compute_70/sm_70" if using CUDA 8.0
set(CUDA_ARCH -gencode arch=compute_30,code=sm_30
-gencode arch=compute_35,code=sm_35
-gencode arch=compute_50,code=sm_50
-gencode arch=compute_60,code=sm_60
-gencode arch=compute_70,code=sm_70)
# ---------------- User Config ----------------
# ---[ Dependencies
include(${PROJECT_SOURCE_DIR}/cmake/FindPythonLibs.cmake)
include(${PROJECT_SOURCE_DIR}/cmake/FindNumPy.cmake)
FIND_PACKAGE(CUDA REQUIRED)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
message(STATUS "C++11 support has been enabled by default.")
# ---[ Config types
set(CMAKE_BUILD_TYPE Release CACHE STRING "set build type to release")
set(CMAKE_CONFIGURATION_TYPES Release CACHE STRING "set build type to release" FORCE)
# ---[ Includes
set(INCLUDE_DIR ${PROJECT_SOURCE_DIR}/include)
include_directories(${INCLUDE_DIR})
include_directories(${PROJECT_SOURCE_DIR}/src)
include_directories(${PYTHON_INCLUDE_DIRS})
include_directories(${NUMPY_INCLUDE_DIR})
include_directories(${CUDA_INCLUDE_DIRS})
# ---[ libs
link_directories(${PYTHON_LIBRARIES})
# ---[ Install
set(CMAKE_INSTALL_PREFIX ${PROJECT_SOURCE_DIR} CACHE STRING "set install prefix" FORCE)
set(CMAKE_SHARED_LIBRARY_PREFIX "")
# ---[ Flags
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${CUDA_ARCH}")
if(WIN32)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP /O2 /Oi /GL /Ot /Gy")
endif()
if(UNIX)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -s -fPIC")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -s -w -fPIC -O3 -m64 -std=c++11")
endif()
# ---[ Files
set(HEADER_FILES gpu_nms.h)
set(SRC_FILES gpu_nms.cpp nms_kernel.cu)
# ---[ Add Target
CUDA_ADD_LIBRARY(${PROJECT_NAME} SHARED ${HEADER_FILES} ${SRC_FILES})
# ---[ Link Libs
TARGET_LINK_LIBRARIES(${PROJECT_NAME} ${CUDA_LIBRARIES} ${CUDA_cublas_LIBRARY} ${CUDA_curand_LIBRARY})
if(WIN32)
TARGET_LINK_LIBRARIES(${PROJECT_NAME} ${PYTHON_LIBRARIES})
endif()
# ---[ Install Target
set_target_properties(${PROJECT_NAME} PROPERTIES OUTPUT_NAME "gpu_nms")
install (TARGETS ${PROJECT_NAME} DESTINATION ${PROJECT_BINARY_DIR}/../install/lib/nms)
# - Find the NumPy libraries
# This module finds if NumPy is installed, and sets the following variables
# indicating where it is.
#
# TODO: Update to provide the libraries and paths for linking npymath lib.
#
# NUMPY_FOUND - was NumPy found
# NUMPY_VERSION - the version of NumPy found as a string
# NUMPY_VERSION_MAJOR - the major version number of NumPy
# NUMPY_VERSION_MINOR - the minor version number of NumPy
# NUMPY_VERSION_PATCH - the patch version number of NumPy
# NUMPY_VERSION_DECIMAL - e.g. version 1.6.1 is 10601
# NUMPY_INCLUDE_DIR - path to the NumPy include files
unset(NUMPY_VERSION)
unset(NUMPY_INCLUDE_DIR)
if(PYTHONINTERP_FOUND)
execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
"import numpy as n; print(n.__version__); print(n.get_include());"
RESULT_VARIABLE __result
OUTPUT_VARIABLE __output
OUTPUT_STRIP_TRAILING_WHITESPACE)
if(__result MATCHES 0)
string(REGEX REPLACE ";" "\\\\;" __values ${__output})
string(REGEX REPLACE "\r?\n" ";" __values ${__values})
list(GET __values 0 NUMPY_VERSION)
list(GET __values 1 NUMPY_INCLUDE_DIR)
string(REGEX MATCH "^([0-9])+\\.([0-9])+\\.([0-9])+" __ver_check "${NUMPY_VERSION}")
if(NOT "${__ver_check}" STREQUAL "")
set(NUMPY_VERSION_MAJOR ${CMAKE_MATCH_1})
set(NUMPY_VERSION_MINOR ${CMAKE_MATCH_2})
set(NUMPY_VERSION_PATCH ${CMAKE_MATCH_3})
math(EXPR NUMPY_VERSION_DECIMAL
"(${NUMPY_VERSION_MAJOR} * 10000) + (${NUMPY_VERSION_MINOR} * 100) + ${NUMPY_VERSION_PATCH}")
string(REGEX REPLACE "\\\\" "/" NUMPY_INCLUDE_DIR ${NUMPY_INCLUDE_DIR})
else()
unset(NUMPY_VERSION)
unset(NUMPY_INCLUDE_DIR)
message(STATUS "Requested NumPy version and include path, but got instead:\n${__output}\n")
endif()
endif()
else()
message("Can not find Python interpretator.")
message(FATAL_ERROR "Do you set PYTHON_EXECUTABLE correctly?")
endif()
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NumPy REQUIRED_VARS NUMPY_INCLUDE_DIR NUMPY_VERSION
VERSION_VAR NUMPY_VERSION)
if(NUMPY_FOUND)
message(STATUS "NumPy ver. ${NUMPY_VERSION} found (include: ${NUMPY_INCLUDE_DIR})")
endif()
\ No newline at end of file
# - Find python libraries
# This module finds the libraries corresponding to the Python interpreter
# FindPythonInterp provides.
# This code sets the following variables:
#
# PYTHONLIBS_FOUND - have the Python libs been found
# PYTHON_PREFIX - path to the Python installation
# PYTHON_LIBRARIES - path to the python library
# PYTHON_INCLUDE_DIRS - path to where Python.h is found
# PYTHON_MODULE_EXTENSION - lib extension, e.g. '.so' or '.pyd'
# PYTHON_MODULE_PREFIX - lib name prefix: usually an empty string
# PYTHON_SITE_PACKAGES - path to installation site-packages
# PYTHON_IS_DEBUG - whether the Python interpreter is a debug build
#
# Thanks to talljimbo for the patch adding the 'LDVERSION' config
# variable usage.
#=============================================================================
# Copyright 2001-2009 Kitware, Inc.
# Copyright 2012 Continuum Analytics, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# * Neither the names of Kitware, Inc., the Insight Software Consortium,
# nor the names of their contributors may be used to endorse or promote
# products derived from this software without specific prior written
# permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#=============================================================================
# Checking for the extension makes sure that `LibsNew` was found and not just `Libs`.
if(PYTHONLIBS_FOUND AND PYTHON_MODULE_EXTENSION)
return()
endif()
# Use the Python interpreter to find the libs.
if(PythonLibsNew_FIND_REQUIRED)
find_package(PythonInterp ${PythonLibsNew_FIND_VERSION} REQUIRED)
else()
find_package(PythonInterp ${PythonLibsNew_FIND_VERSION})
endif()
if(NOT PYTHONINTERP_FOUND)
set(PYTHONLIBS_FOUND FALSE)
return()
endif()
# According to http://stackoverflow.com/questions/646518/python-how-to-detect-debug-interpreter
# testing whether sys has the gettotalrefcount function is a reliable, cross-platform
# way to detect a CPython debug interpreter.
#
# The library suffix is from the config var LDVERSION sometimes, otherwise
# VERSION. VERSION will typically be like "2.7" on unix, and "27" on windows.
execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
"from distutils import sysconfig as s;import sys;import struct;
print('.'.join(str(v) for v in sys.version_info));
print(sys.prefix);
print(s.get_python_inc(plat_specific=True));
print(s.get_python_lib(plat_specific=True));
print(s.get_config_var('SO'));
print(hasattr(sys, 'gettotalrefcount')+0);
print(struct.calcsize('@P'));
print(s.get_config_var('LDVERSION') or s.get_config_var('VERSION'));
print(s.get_config_var('LIBDIR') or '');
print(s.get_config_var('MULTIARCH') or '');
"
RESULT_VARIABLE _PYTHON_SUCCESS
OUTPUT_VARIABLE _PYTHON_VALUES
ERROR_VARIABLE _PYTHON_ERROR_VALUE)
if(NOT _PYTHON_SUCCESS MATCHES 0)
if(PythonLibsNew_FIND_REQUIRED)
message(FATAL_ERROR
"Python config failure:\n${_PYTHON_ERROR_VALUE}")
endif()
set(PYTHONLIBS_FOUND FALSE)
return()
endif()
# Convert the process output into a list
string(REGEX REPLACE ";" "\\\\;" _PYTHON_VALUES ${_PYTHON_VALUES})
string(REGEX REPLACE "\n" ";" _PYTHON_VALUES ${_PYTHON_VALUES})
list(GET _PYTHON_VALUES 0 _PYTHON_VERSION_LIST)
list(GET _PYTHON_VALUES 1 PYTHON_PREFIX)
list(GET _PYTHON_VALUES 2 PYTHON_INCLUDE_DIR)
list(GET _PYTHON_VALUES 3 PYTHON_SITE_PACKAGES)
list(GET _PYTHON_VALUES 4 PYTHON_MODULE_EXTENSION)
list(GET _PYTHON_VALUES 5 PYTHON_IS_DEBUG)
list(GET _PYTHON_VALUES 6 PYTHON_SIZEOF_VOID_P)
list(GET _PYTHON_VALUES 7 PYTHON_LIBRARY_SUFFIX)
list(GET _PYTHON_VALUES 8 PYTHON_LIBDIR)
list(GET _PYTHON_VALUES 9 PYTHON_MULTIARCH)
# Make sure the Python has the same pointer-size as the chosen compiler
# Skip if CMAKE_SIZEOF_VOID_P is not defined
if(CMAKE_SIZEOF_VOID_P AND (NOT "${PYTHON_SIZEOF_VOID_P}" STREQUAL "${CMAKE_SIZEOF_VOID_P}"))
if(PythonLibsNew_FIND_REQUIRED)
math(EXPR _PYTHON_BITS "${PYTHON_SIZEOF_VOID_P} * 8")
math(EXPR _CMAKE_BITS "${CMAKE_SIZEOF_VOID_P} * 8")
message(FATAL_ERROR
"Python config failure: Python is ${_PYTHON_BITS}-bit, "
"chosen compiler is ${_CMAKE_BITS}-bit")
endif()
set(PYTHONLIBS_FOUND FALSE)
return()
endif()
# The built-in FindPython didn't always give the version numbers
string(REGEX REPLACE "\\." ";" _PYTHON_VERSION_LIST ${_PYTHON_VERSION_LIST})
list(GET _PYTHON_VERSION_LIST 0 PYTHON_VERSION_MAJOR)
list(GET _PYTHON_VERSION_LIST 1 PYTHON_VERSION_MINOR)
list(GET _PYTHON_VERSION_LIST 2 PYTHON_VERSION_PATCH)
# Make sure all directory separators are '/'
string(REGEX REPLACE "\\\\" "/" PYTHON_PREFIX ${PYTHON_PREFIX})
string(REGEX REPLACE "\\\\" "/" PYTHON_INCLUDE_DIR ${PYTHON_INCLUDE_DIR})
string(REGEX REPLACE "\\\\" "/" PYTHON_SITE_PACKAGES ${PYTHON_SITE_PACKAGES})
if(CMAKE_HOST_WIN32)
set(PYTHON_LIBRARY
"${PYTHON_PREFIX}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib")
# when run in a venv, PYTHON_PREFIX points to it. But the libraries remain in the
# original python installation. They may be found relative to PYTHON_INCLUDE_DIR.
if(NOT EXISTS "${PYTHON_LIBRARY}")
get_filename_component(_PYTHON_ROOT ${PYTHON_INCLUDE_DIR} DIRECTORY)
set(PYTHON_LIBRARY
"${_PYTHON_ROOT}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib")
endif()
# raise an error if the python libs are still not found.
if(NOT EXISTS "${PYTHON_LIBRARY}")
message(FATAL_ERROR "Python libraries not found")
endif()
else()
if(PYTHON_MULTIARCH)
set(_PYTHON_LIBS_SEARCH "${PYTHON_LIBDIR}/${PYTHON_MULTIARCH}" "${PYTHON_LIBDIR}")
else()
set(_PYTHON_LIBS_SEARCH "${PYTHON_LIBDIR}")
endif()
#message(STATUS "Searching for Python libs in ${_PYTHON_LIBS_SEARCH}")
# Probably this needs to be more involved. It would be nice if the config
# information the python interpreter itself gave us were more complete.
find_library(PYTHON_LIBRARY
NAMES "python${PYTHON_LIBRARY_SUFFIX}"
PATHS ${_PYTHON_LIBS_SEARCH}
NO_DEFAULT_PATH)
# If all else fails, just set the name/version and let the linker figure out the path.
if(NOT PYTHON_LIBRARY)
set(PYTHON_LIBRARY python${PYTHON_LIBRARY_SUFFIX})
endif()
endif()
MARK_AS_ADVANCED(
PYTHON_LIBRARY
PYTHON_INCLUDE_DIR
)
# We use PYTHON_INCLUDE_DIR, PYTHON_LIBRARY and PYTHON_DEBUG_LIBRARY for the
# cache entries because they are meant to specify the location of a single
# library. We now set the variables listed by the documentation for this
# module.
SET(PYTHON_INCLUDE_DIRS "${PYTHON_INCLUDE_DIR}")
SET(PYTHON_LIBRARIES "${PYTHON_LIBRARY}")
SET(PYTHON_DEBUG_LIBRARIES "${PYTHON_DEBUG_LIBRARY}")
find_package_message(PYTHON
"Found PythonLibs: ${PYTHON_LIBRARY}"
"${PYTHON_EXECUTABLE}${PYTHON_VERSION}")
set(PYTHONLIBS_FOUND TRUE)
void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num,
int boxes_dim, float nms_overlap_thresh, int device_id);
# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick
# --------------------------------------------------------
import numpy as np
cimport numpy as np
assert sizeof(int) == sizeof(np.int32_t)
cdef extern from "gpu_nms.h":
void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int)
def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, float thresh, int device_id=0):
cdef int boxes_num = dets.shape[0]
cdef int boxes_dim = dets.shape[1]
cdef int num_out
cdef np.ndarray[np.int32_t, ndim=1] \
keep = np.zeros(boxes_num, dtype=np.int32)
cdef np.ndarray[np.float32_t, ndim=1] \
scores = dets[:, 4]
cdef np.ndarray[np.intp_t, ndim=1] \
order = scores.argsort()[::-1]
cdef np.ndarray[np.float32_t, ndim=2] \
sorted_dets = dets[order, :]
_nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id)
keep = keep[:num_out]
return list(order[keep])
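For reference, a minimal call of the compiled extension. The exact module path depends on the install layout produced above, so the import below is an assumption:
```python
import numpy as np
from gpu_nms import gpu_nms  # hypothetical import path; depends on your build layout

# Each row is (x1, y1, x2, y2, score), float32 as the wrapper requires.
dets = np.array([
    [10, 10, 60, 60, 0.9],
    [12, 12, 62, 62, 0.8],      # heavily overlaps the first box
    [100, 100, 160, 160, 0.7],  # disjoint from both
], dtype=np.float32)

keep = gpu_nms(dets, 0.5, 0)  # IoU threshold 0.5 on GPU 0
print(keep)  # expected: [0, 2]; box 1 is suppressed by box 0
```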
#!/bin/sh
# Delete cache
rm -rf build install *.c *.cpp  # -f: ignore files missing on a clean checkout
# Compile cpp modules
python setup.py build_ext --inplace
# Compile cuda modules
cd build && cmake .. && make install && cd ..
# Copy to the library root
cp -r install/lib ../
// ------------------------------------------------------------
// Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
//
// Licensed under the BSD 2-Clause License.
// You should have received a copy of the BSD 2-Clause License
// along with the software. If not, See,
//
// <https://opensource.org/licenses/BSD-2-Clause>
//
// ------------------------------------------------------------
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <vector>
#include "gpu_nms.h"
#define CUDA_CHECK(condition) \
  /* Code block avoids redefinition of cudaError_t error */ \
  do { \
    cudaError_t error = condition; \
    if (error != cudaSuccess) { \
      /* Report and abort instead of silently swallowing the failure */ \
      fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(error)); \
      abort(); \
    } \
  } while (0)
void SetDevice(int device_id) {
int current_device;
CUDA_CHECK(cudaGetDevice(&current_device));
if (current_device == device_id) return;
CUDA_CHECK(cudaSetDevice(device_id));
}
#define DIV_UP(m,n) ((m) / (n) + ((m) % (n) > 0))
#define NMS_BLOCK_SIZE 64
template <typename T>
__device__ T iou(const T* A, const T* B) {
const T x1 = max(A[0], B[0]);
const T y1 = max(A[1], B[1]);
const T x2 = min(A[2], B[2]);
const T y2 = min(A[3], B[3]);
const T width = max((T)0, x2 - x1 + 1);
const T height = max((T)0, y2 - y1 + 1);
const T area = width * height;
const T A_area = (A[2] - A[0] + 1) * (A[3] - A[1] + 1);
const T B_area = (B[2] - B[0] + 1) * (B[3] - B[1] + 1);
return area / (A_area + B_area - area);
}
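For reference, the same overlap test in plain Python; note the `+ 1` terms, which follow the legacy integer-pixel box convention used throughout this file:
```python
def iou(a, b):
    """IoU of two (x1, y1, x2, y2) boxes, legacy +1 pixel convention."""
    x1, y1 = max(a[0], b[0]), max(a[1], b[1])
    x2, y2 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0, x2 - x1 + 1) * max(0, y2 - y1 + 1)
    area_a = (a[2] - a[0] + 1) * (a[3] - a[1] + 1)
    area_b = (b[2] - b[0] + 1) * (b[3] - b[1] + 1)
    return inter / (area_a + area_b - inter)

print(iou([0, 0, 9, 9], [5, 5, 14, 14]))  # 25 / 175 ≈ 0.143
```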
template <typename T>
__global__ void nms_mask(const int num_boxes, const T nms_thresh,
const T* boxes, unsigned long long* mask) {
const int i_start = blockIdx.x * NMS_BLOCK_SIZE;
const int di_end = min(num_boxes - i_start, NMS_BLOCK_SIZE);
const int j_start = blockIdx.y * NMS_BLOCK_SIZE;
const int dj_end = min(num_boxes - j_start, NMS_BLOCK_SIZE);
const int num_blocks = DIV_UP(num_boxes, NMS_BLOCK_SIZE);
const int bid = blockIdx.x;
const int tid = threadIdx.x;
__shared__ T boxes_i[NMS_BLOCK_SIZE * 4];
if (tid < di_end) {
boxes_i[tid * 4 + 0] = boxes[(i_start + tid) * 5 + 0];
boxes_i[tid * 4 + 1] = boxes[(i_start + tid) * 5 + 1];
boxes_i[tid * 4 + 2] = boxes[(i_start + tid) * 5 + 2];
boxes_i[tid * 4 + 3] = boxes[(i_start + tid) * 5 + 3];
}
__syncthreads();
if (tid < dj_end) {
const T* const box_j = boxes + (j_start + tid) * 5;
unsigned long long mask_j = 0;
const int di_start = (i_start == j_start) ? (tid + 1) : 0;
for (int di = di_start; di < di_end; ++di)
if (iou(box_j, boxes_i + di * 4) > nms_thresh)
mask_j |= 1ULL << di;
mask[(j_start + tid) * num_blocks + bid] = mask_j;
}
}
template <typename T>
void ApplyNMS(const int num_boxes, const int max_keeps, const float thresh,
const T* boxes, int* keep_indices, int& num_keep) {
const int num_blocks = DIV_UP(num_boxes, NMS_BLOCK_SIZE);
const dim3 blocks(num_blocks, num_blocks);
size_t mask_nbytes = num_boxes * num_blocks * sizeof(unsigned long long);
size_t boxes_nbytes = num_boxes * 5 * sizeof(T);
void* boxes_dev, *mask_dev;
CUDA_CHECK(cudaMalloc(&boxes_dev, boxes_nbytes));
CUDA_CHECK(cudaMalloc(&mask_dev, mask_nbytes));
CUDA_CHECK(cudaMemcpy(boxes_dev, boxes, boxes_nbytes, cudaMemcpyHostToDevice));
nms_mask<T><<<blocks, NMS_BLOCK_SIZE>>>(num_boxes, thresh,
(T*)boxes_dev,
(unsigned long long*)mask_dev);
CUDA_CHECK(cudaPeekAtLastError());
std::vector<unsigned long long> mask_host(num_boxes * num_blocks);
CUDA_CHECK(cudaMemcpy(&mask_host[0], mask_dev, mask_nbytes, cudaMemcpyDeviceToHost));
std::vector<unsigned long long> dead_bit(num_blocks);
memset(&dead_bit[0], 0, sizeof(unsigned long long) * num_blocks);
int num_selected = 0;
for (int i = 0; i < num_boxes; ++i) {
const int nblock = i / NMS_BLOCK_SIZE;
const int inblock = i % NMS_BLOCK_SIZE;
if (!(dead_bit[nblock] & (1ULL << inblock))) {
keep_indices[num_selected++] = i;
unsigned long long* mask_i = &mask_host[0] + i * num_blocks;
for (int j = nblock; j < num_blocks; ++j) dead_bit[j] |= mask_i[j];
if (num_selected == max_keeps) break;
}
}
num_keep = num_selected;
CUDA_CHECK(cudaFree(mask_dev));
CUDA_CHECK(cudaFree(boxes_dev));
}
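The host-side scan above is the usual greedy NMS over a precomputed suppression bitmask: walk boxes in score order and keep each one unless an already-kept box suppresses it. A self-contained plain-Python reference of the same selection rule (no bitmask, no GPU):
```python
def nms_reference(dets, thresh):
    """Greedy NMS over (x1, y1, x2, y2, score) rows; returns kept indices."""
    def iou(a, b):
        x1, y1 = max(a[0], b[0]), max(a[1], b[1])
        x2, y2 = min(a[2], b[2]), min(a[3], b[3])
        inter = max(0, x2 - x1 + 1) * max(0, y2 - y1 + 1)
        area = lambda r: (r[2] - r[0] + 1) * (r[3] - r[1] + 1)
        return inter / (area(a) + area(b) - inter)
    order = sorted(range(len(dets)), key=lambda i: -dets[i][4])
    keep = []
    for i in order:  # highest score first, as after SortProposals above
        if all(iou(dets[i], dets[j]) <= thresh for j in keep):
            keep.append(i)
    return keep
```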
void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num,
int boxes_dim, float nms_overlap_thresh, int device_id) {
// set the device to use
SetDevice(device_id);
// apply gpu nms
ApplyNMS<float>(boxes_num, boxes_num, nms_overlap_thresh,
boxes_host, keep_out, *num_out);
}
\ No newline at end of file
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from distutils.extension import Extension
from distutils.core import setup
from Cython.Distutils import build_ext
import numpy as np
numpy_include = np.get_include()
ext_modules = [
Extension(
"install.lib.utils.cython_bbox",
["bbox.pyx"],
extra_compile_args=["-Wno-cpp", "-Wno-unused-function"],
include_dirs = [numpy_include]),
Extension(
"install.lib.nms.cpu_nms",
["cpu_nms.pyx"],
extra_compile_args=["-Wno-cpp", "-Wno-unused-function"],
include_dirs = [numpy_include]),
Extension(
"install.deprecated.gpu_nms",
["gpu_nms.pyx"],
extra_compile_args=["-Wno-cpp", "-Wno-unused-function"],
language='c++',
include_dirs = [numpy_include]),
Extension(
'install.lib.pycocotools._mask',
['../lib/pycocotools/maskApi.c', '../lib/pycocotools/_mask.pyx'],
include_dirs=[numpy_include, 'pycocotools'],
extra_compile_args=['-Wno-cpp', '-Wno-unused-function', '-std=c99']),
]
setup(name='Detectron', ext_modules=ext_modules, cmdclass={'build_ext': build_ext})
NUM_GPUS: 8
VIS: False
ENABLE_TENSOR_BOARD: False
MODEL:
  TYPE: faster_rcnn
  BACKBONE: resnet101.fpn
  CLASSES: ['__background__',
            'person', 'bicycle', 'car', 'motorcycle', 'airplane',
            'bus', 'train', 'truck', 'boat', 'traffic light',
            'fire hydrant', 'stop sign', 'parking meter', 'bench',
            'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant',
            'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
            'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
            'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
            'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife',
            'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli',
            'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
            'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop',
            'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
            'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors',
            'teddy bear', 'hair drier', 'toothbrush']
  NUM_CLASSES: 81
SOLVER:
  BASE_LR: 0.02
  DECAY_STEPS: [60000, 80000]
  MAX_STEPS: 90000
  SNAPSHOT_EVERY: 5000
  SNAPSHOT_PREFIX: coco_faster_rcnn
FRCNN:
  ROI_XFORM_METHOD: RoIAlign
  ROI_XFORM_RESOLUTION: 7
TRAIN:
  WEIGHTS: '/model/R-101.Affine.pth'
  DATASET: '/data/coco_2014_trainval35k'
  USE_DIFF: False  # Do not use crowd objects
  IMS_PER_BATCH: 2
  BATCH_SIZE: 512
  SCALES: [800]
  MAX_SIZE: 1333
TEST:
  DATASET: '/data/coco_2014_minival'
  JSON_FILE: '/data/instances_minival2014.json'
  PROTOCOL: 'coco'
  RPN_POST_NMS_TOP_N: 1000
  SCALES: [800]
  MAX_SIZE: 1333
  NMS: 0.5
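Configs like the one above are plain YAML, so they can be inspected directly. A minimal sketch using PyYAML; the file name is a placeholder for whichever config under `SeetaDet/configs` you pass to `--cfg`:
```python
import yaml

with open('coco_faster_rcnn.yml') as f:  # hypothetical file name
    cfg = yaml.safe_load(f)

print(cfg['MODEL']['TYPE'])          # 'faster_rcnn'
print(cfg['SOLVER']['BASE_LR'])      # 0.02
print(len(cfg['MODEL']['CLASSES']))  # 81, matches NUM_CLASSES
```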
NUM_GPUS: 8
VIS: False
ENABLE_TENSOR_BOARD: False
MODEL:
  TYPE: faster_rcnn
  BACKBONE: resnet101.fpn
  CLASSES: ['__background__',
            'person', 'bicycle', 'car', 'motorcycle', 'airplane',
            'bus', 'train', 'truck', 'boat', 'traffic light',
            'fire hydrant', 'stop sign', 'parking meter', 'bench',
            'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant',
            'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
            'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
            'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
            'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife',
            'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli',
            'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
            'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop',
            'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
            'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors',
            'teddy bear', 'hair drier', 'toothbrush']
  NUM_CLASSES: 81
SOLVER:
  BASE_LR: 0.02
  DECAY_STEPS: [120000, 160000]
  MAX_STEPS: 180000
  SNAPSHOT_EVERY: 5000
  SNAPSHOT_PREFIX: coco_faster_rcnn
FRCNN:
  ROI_XFORM_METHOD: RoIAlign
  ROI_XFORM_RESOLUTION: 7
TRAIN:
  WEIGHTS: '/model/R-101.Affine.pth'
  DATASET: '/data/coco_2014_trainval35k'
  USE_DIFF: False  # Do not use crowd objects
  IMS_PER_BATCH: 2
  BATCH_SIZE: 512
  SCALES: [800]
  MAX_SIZE: 1333
TEST:
  DATASET: '/data/coco_2014_minival'
  JSON_FILE: '/data/instances_minival2014.json'
  PROTOCOL: 'coco'
  RPN_POST_NMS_TOP_N: 1000
  SCALES: [800]
  MAX_SIZE: 1333
  NMS: 0.5
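The two COCO schedules above differ only in step counts (a 1x versus 2x schedule); the global batch and base LR are identical. A quick sanity check of the arithmetic, written out rather than taken from the repository:
```python
num_gpus, ims_per_gpu = 8, 2
global_batch = num_gpus * ims_per_gpu    # 16 images per iteration
steps_1x, steps_2x = 90000, 180000       # the two configs above
print(global_batch * steps_1x)  # 1,440,000 images (~12 passes over ~118k trainval35k)
print(global_batch * steps_2x)  # 2,880,000 images (~24 passes)
```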
NUM_GPUS: 1
VIS: False
ENABLE_TENSOR_BOARD: False
MODEL:
  TYPE: faster_rcnn
  BACKBONE: resnet50.fpn
  CLASSES: ['__background__',
            'aeroplane', 'bicycle', 'bird', 'boat',
            'bottle', 'bus', 'car', 'cat', 'chair',
            'cow', 'diningtable', 'dog', 'horse',
            'motorbike', 'person', 'pottedplant',
            'sheep', 'sofa', 'train', 'tvmonitor']
  NUM_CLASSES: 21
SOLVER:
  BASE_LR: 0.002
  DECAY_STEPS: [100000, 140000]
  MAX_STEPS: 140000
  SNAPSHOT_EVERY: 5000
  SNAPSHOT_PREFIX: voc_faster_rcnn
FRCNN:
  ROI_XFORM_METHOD: RoIAlign
  ROI_XFORM_RESOLUTION: 7
TRAIN:
  WEIGHTS: '/model/R-50.Affine.pth'
  DATASET: '/data/voc_0712_trainval'
  IMS_PER_BATCH: 2
  BATCH_SIZE: 128
  SCALES: [600]
  MAX_SIZE: 1000
TEST:
  DATASET: '/data/voc_2007_test'
  PROTOCOL: 'voc2007'  # 'voc2007', 'voc2010', 'coco'
  RPN_POST_NMS_TOP_N: 1000
  SCALES: [600]
  MAX_SIZE: 1000
  NMS: 0.45
\ No newline at end of file
NUM_GPUS: 1
VIS: False
ENABLE_TENSOR_BOARD: False
MODEL:
  TYPE: faster_rcnn
  BACKBONE: vgg16.c4
  CLASSES: ['__background__',
            'aeroplane', 'bicycle', 'bird', 'boat',
            'bottle', 'bus', 'car', 'cat', 'chair',
            'cow', 'diningtable', 'dog', 'horse',
            'motorbike', 'person', 'pottedplant',
            'sheep', 'sofa', 'train', 'tvmonitor']
  NUM_CLASSES: 21
SOLVER:
  BASE_LR: 0.001
  WEIGHT_DECAY: 0.0005
  DECAY_STEPS: [100000, 140000]
  MAX_STEPS: 140000
  SNAPSHOT_EVERY: 5000
  SNAPSHOT_PREFIX: voc_faster_rcnn
RPN:
  STRIDES: [16]
  SCALES: [8, 16, 32]  # RField: [128, 256, 512]
  ASPECT_RATIOS: [0.5, 1.0, 2.0]
FRCNN:
  ROI_XFORM_METHOD: RoIPool
  ROI_XFORM_RESOLUTION: 7
  MLP_HEAD_DIM: 4096
TRAIN:
  WEIGHTS: '/model/VGG16.RCNN.pth'
  DATASET: '/data/voc_0712_trainval'
  RPN_MIN_SIZE: 16
  IMS_PER_BATCH: 2
  BATCH_SIZE: 128
  SCALES: [600]
  MAX_SIZE: 1000
TEST:
  DATASET: '/data/voc_2007_test'
  PROTOCOL: 'voc2007'  # 'voc2007', 'voc2010', 'coco'
  RPN_MIN_SIZE: 16
  RPN_POST_NMS_TOP_N: 300
  SCALES: [600]
  MAX_SIZE: 1000
  NMS: 0.45
\ No newline at end of file
NUM_GPUS: 4
VIS: False
ENABLE_TENSOR_BOARD: False
MODEL:
  TYPE: retinanet
  BACKBONE: resnet50.fpn
  CLASSES: ['__background__',
            'person', 'bicycle', 'car', 'motorcycle', 'airplane',
            'bus', 'train', 'truck', 'boat', 'traffic light',
            'fire hydrant', 'stop sign', 'parking meter', 'bench',
            'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant',
            'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
            'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
            'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
            'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife',
            'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli',
            'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
            'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop',
            'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
            'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors',
            'teddy bear', 'hair drier', 'toothbrush']
  NUM_CLASSES: 81
SOLVER:
  BASE_LR: 0.01
  DECAY_STEPS: [60000, 80000]
  MAX_STEPS: 90000
  SNAPSHOT_EVERY: 5000
  SNAPSHOT_PREFIX: coco_retinanet_400
FPN:
  RPN_MIN_LEVEL: 3
  RPN_MAX_LEVEL: 7
TRAIN:
  WEIGHTS: '/model/R-50.Affine.pth'
  DATASET: '/data/coco_2014_trainval35k'
  USE_DIFF: False  # Do not use crowd objects
  USE_COLOR_JITTER: True
  IMS_PER_BATCH: 8
  SCALES: [400]
  MAX_SIZE: 666
  RANDOM_SCALES: [0.75, 1.0]
TEST:
  DATASET: '/data/coco_2014_minival'
  JSON_FILE: '/data/instances_minival2014.json'
  PROTOCOL: 'coco'
  IMS_PER_BATCH: 1
  SCALES: [400]
  MAX_SIZE: 666
  NMS: 0.5
\ No newline at end of file
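The new `RANDOM_SCALES` key replaces the older `USE_SCALE_JITTER`/`SCALE_JITTER_RANGE` pair seen in the configs below. A sketch of the presumed semantics, inferred from the rename rather than read from this commit's loader code: a factor drawn from the range rescales the base training scale.
```python
import random

def sample_train_scale(base_scale=400, random_scales=(0.75, 1.0)):
    """Presumed scale jitter: uniform factor in [lo, hi] times the base scale."""
    lo, hi = random_scales
    return int(base_scale * random.uniform(lo, hi))

print([sample_train_scale() for _ in range(4)])  # e.g. [352, 390, 311, 397]
```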
NUM_GPUS: 4
VIS: False
ENABLE_TENSOR_BOARD: False
MODEL:
TYPE: retinanet
BACKBONE: resnet50.fpn
CLASSES: ['__background__',
'person', 'bicycle', 'car', 'motorcycle', 'airplane',
'bus', 'train', 'truck', 'boat', 'traffic light',
'fire hydrant', 'stop sign', 'parking meter', 'bench',
'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant',
'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife',
'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli',
'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop',
'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors',
'teddy bear', 'hair drier', 'toothbrush']
NUM_CLASSES: 81
SOLVER:
BASE_LR: 0.02
WARM_UP_STEPS: 2000 # default: 500
DECAY_STEPS: [120000, 160000]
MAX_STEPS: 180000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: coco_retinanet_400
FPN:
RPN_MIN_LEVEL: 3
RPN_MAX_LEVEL: 7
DROPBLOCK:
DROP_ON: True
DECREMENT: 0.000005 # * 20000 = 0.1
TRAIN:
WEIGHTS: '/model/R-50.Affine.pth'
DATABASE: '/data/coco_2014_trainval35k'
IMS_PER_BATCH: 8
SCALES: [400]
MAX_SIZE: 666
USE_SCALE_JITTER: True
USE_COLOR_JITTER: True
SCALE_JITTER_RANGE: [0.75, 1.33]
TEST:
DATABASE: '/data/coco_2014_minival'
JSON_FILE: '/data/instances_minival2014.json'
PROTOCOL: 'coco'
IMS_PER_BATCH: 1
SCALES: [400]
MAX_SIZE: 666
NMS: 0.5
\ No newline at end of file
NUM_GPUS: 1
VIS: False
VIS_ON_FILE: False
MODEL:
TYPE: retinanet
BACKBONE: resnet18.fpn
CLASSES: ['__background__',
'aeroplane', 'bicycle', 'bird', 'boat',
'bottle', 'bus', 'car', 'cat', 'chair',
'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'pottedplant',
'sheep', 'sofa', 'train', 'tvmonitor']
NUM_CLASSES: 21
SOLVER:
BASE_LR: 0.01
DECAY_STEPS: [40000, 50000, 60000]
WARM_UP_STEPS: 2000
MAX_STEPS: 60000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: voc_retinanet_300
FPN:
RPN_MIN_LEVEL: 3
RPN_MAX_LEVEL: 7
TRAIN:
WEIGHTS: '/model/R-18.Affine.pth'
DATABASE: '/data/voc_0712_trainval'
IMS_PER_BATCH: 32
SCALES: [300]
MAX_SIZE: 500
SCALE_JITTER_RANGE: [0.5, 2.0]
USE_SCALE_JITTER: True
USE_COLOR_JITTER: True
TEST:
DATABASE: '/data/voc_2007_test'
PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
IMS_PER_BATCH: 1
SCALES: [300]
MAX_SIZE: 500
NMS: 0.45
\ No newline at end of file
NUM_GPUS: 1
VIS: False
VIS_ON_FILE: False
MODEL:
  TYPE: retinanet
  BACKBONE: airnet.fpn
  CLASSES: ['__background__',
            'aeroplane', 'bicycle', 'bird', 'boat',
            'bottle', 'bus', 'car', 'cat', 'chair',
            'cow', 'diningtable', 'dog', 'horse',
            'motorbike', 'person', 'pottedplant',
            'sheep', 'sofa', 'train', 'tvmonitor']
  NUM_CLASSES: 21
SOLVER:
  BASE_LR: 0.01
  DECAY_STEPS: [40000, 50000, 60000]
  MAX_STEPS: 60000
  SNAPSHOT_EVERY: 5000
  SNAPSHOT_PREFIX: voc_retinanet_320
FPN:
  RPN_MIN_LEVEL: 3
  RPN_MAX_LEVEL: 7
TRAIN:
  WEIGHTS: '/model/AirNet.Affine.pth'
  DATASET: '/data/voc_0712_trainval'
  USE_COLOR_JITTER: True
  IMS_PER_BATCH: 32
  SCALES: [320]
  RANDOM_SCALES: [0.5, 1.0]
TEST:
  DATASET: '/data/voc_2007_test'
  PROTOCOL: 'voc2007'  # 'voc2007', 'voc2010', 'coco'
  IMS_PER_BATCH: 1
  SCALES: [320]
  NMS: 0.45
\ No newline at end of file
NUM_GPUS: 1
VIS: False
VIS_ON_FILE: False
MODEL:
  TYPE: retinanet
  BACKBONE: resnet34.fpn
  CLASSES: ['__background__',
            'aeroplane', 'bicycle', 'bird', 'boat',
            'bottle', 'bus', 'car', 'cat', 'chair',
            'cow', 'diningtable', 'dog', 'horse',
            'motorbike', 'person', 'pottedplant',
            'sheep', 'sofa', 'train', 'tvmonitor']
  NUM_CLASSES: 21
SOLVER:
  BASE_LR: 0.01
  DECAY_STEPS: [40000, 50000, 60000]
  WARM_UP_STEPS: 2000
  MAX_STEPS: 60000
  SNAPSHOT_EVERY: 5000
  SNAPSHOT_PREFIX: voc_retinanet_320
FPN:
  RPN_MIN_LEVEL: 3
  RPN_MAX_LEVEL: 7
TRAIN:
  WEIGHTS: '/model/R-50.Affine.pth'
  DATASET: '/data/voc_0712_trainval'
  USE_COLOR_JITTER: True
  IMS_PER_BATCH: 32
  SCALES: [320]
  RANDOM_SCALES: [0.5, 2.0]
TEST:
  DATASET: '/data/voc_2007_test'
  PROTOCOL: 'voc2007'  # 'voc2007', 'voc2010', 'coco'
  IMS_PER_BATCH: 1
  SCALES: [320]
  NMS: 0.45
\ No newline at end of file
NUM_GPUS: 1
VIS: False
ENABLE_TENSOR_BOARD: False
MODEL:
  TYPE: ssd
  BACKBONE: airnet5b.mbox
  CLASSES: ['__background__',
            'aeroplane', 'bicycle', 'bird', 'boat',
            'bottle', 'bus', 'car', 'cat', 'chair',
            'cow', 'diningtable', 'dog', 'horse',
            'motorbike', 'person', 'pottedplant',
            'sheep', 'sofa', 'train', 'tvmonitor']
  NUM_CLASSES: 21
SOLVER:
  BASE_LR: 0.001
  DECAY_STEPS: [80000, 100000, 120000]
  MAX_STEPS: 120000
  SNAPSHOT_EVERY: 5000
  SNAPSHOT_PREFIX: voc_ssd_320
SSD:
  NUM_CONVS: 2
  MULTIBOX:
    STRIDES: [8, 16, 32]
    MIN_SIZES: [30, 90, 150]
    MAX_SIZES: [90, 150, 210]
    ASPECT_RATIOS: [[1, 2, 0.5], [1, 2, 0.5], [1, 2, 0.5]]
TRAIN:
  WEIGHTS: '/model/AirNet.Affine.pth'
  DATASET: '/data/voc_0712_trainval'
  SCALES: [320]
  RANDOM_SCALES: [0.25, 1.00]
  IMS_PER_BATCH: 32
TEST:
  DATASET: '/data/voc_2007_test'
  PROTOCOL: 'voc2007'  # 'voc2007', 'voc2010', 'coco'
  IMS_PER_BATCH: 8
  SCALES: [320]
  NMS_TOP_K: 400
  NMS: 0.45
  SCORE_THRESH: 0.01
  DETECTIONS_PER_IM: 200
\ No newline at end of file
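In the multibox head above, each stride contributes default boxes whose sizes run between `MIN_SIZES[i]` and `MAX_SIZES[i]`; in the standard SSD formulation the extra square box uses the geometric mean of the two. A sketch of that convention, assumed from the SSD paper rather than read from this repo's prior-box code:
```python
import math

strides = [8, 16, 32]
min_sizes = [30, 90, 150]
max_sizes = [90, 150, 210]

for s, lo, hi in zip(strides, min_sizes, max_sizes):
    extra = math.sqrt(lo * hi)  # square box interpolating the two sizes
    print(f"stride {s}: default sizes {lo} and {extra:.1f}, "
          f"aspect ratios 1, 2, 0.5 around size {lo}")
```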
NUM_GPUS: 1
VIS: False
ENABLE_TENSOR_BOARD: False
MODEL:
  TYPE: ssd
  BACKBONE: vgg16_reduced_300.mbox
  FREEZE_AT: 0
  CLASSES: ['__background__',
            'aeroplane', 'bicycle', 'bird', 'boat',
            'bottle', 'bus', 'car', 'cat', 'chair',
            'cow', 'diningtable', 'dog', 'horse',
            'motorbike', 'person', 'pottedplant',
            'sheep', 'sofa', 'train', 'tvmonitor']
  NUM_CLASSES: 21
SOLVER:
  BASE_LR: 0.001
  WEIGHT_DECAY: 0.0005
  DECAY_STEPS: [80000, 100000, 120000]
  MAX_STEPS: 120000
  SNAPSHOT_EVERY: 5000
  SNAPSHOT_PREFIX: voc_ssd_300
SSD:
  MULTIBOX:
    STRIDES: [8, 16, 32, 64, 100, 300]
    MIN_SIZES: [30, 60, 110, 162, 213, 264]
    MAX_SIZES: [60, 110, 162, 213, 264, 315]
    ASPECT_RATIOS: [
      [1, 2, 0.5],
      [1, 2, 0.5, 3, 0.33],
      [1, 2, 0.5, 3, 0.33],
      [1, 2, 0.5, 3, 0.33],
      [1, 2, 0.5],
      [1, 2, 0.5]
    ]
TRAIN:
  WEIGHTS: '/model/VGG16.SSD.pth'
  DATASET: '/data/voc_0712_trainval'
  IMS_PER_BATCH: 32
  SCALES: [300]
  RANDOM_SCALES: [0.25, 1.00]
TEST:
  DATASET: '/data/voc_2007_test'
  PROTOCOL: 'voc2007'  # 'voc2007', 'voc2010', 'coco'
  IMS_PER_BATCH: 8
  SCALES: [300]
  NMS_TOP_K: 400
  NMS: 0.45
  SCORE_THRESH: 0.01
  DETECTIONS_PER_IM: 200
@@ -22,23 +22,29 @@ SOLVER:
  SNAPSHOT_PREFIX: voc_ssd_320
SSD:
  NUM_CONVS: 2
  MULTIBOX:
    STRIDES: [8, 16, 32, 64, 100, 300]
    MIN_SIZES: [30, 60, 110, 162, 213, 264]
    MAX_SIZES: [60, 110, 162, 213, 264, 315]
    ASPECT_RATIOS: [
      [1, 2, 0.5],
      [1, 2, 0.5, 3, 0.33],
      [1, 2, 0.5, 3, 0.33],
      [1, 2, 0.5, 3, 0.33],
      [1, 2, 0.5],
      [1, 2, 0.5]
    ]
TRAIN:
  WEIGHTS: '/model/R-50.Affine.pth'
  DATASET: '/data/voc_0712_trainval'
  SCALES: [320]
  RANDOM_SCALES: [0.25, 1.00]
  IMS_PER_BATCH: 32
TEST:
  DATASET: '/data/voc_2007_test'
  PROTOCOL: 'voc2007'  # 'voc2007', 'voc2010', 'coco'
  IMS_PER_BATCH: 8
  SCALES: [320]
  NMS_TOP_K: 400
  NMS: 0.45
  SCORE_THRESH: 0.01
......
#include <dragon/core/workspace.h>
#include <dragon/utils/math_utils.h>
#include "../utils/detection_utils.h"
#include "nms_op.h"
namespace dragon {
template <class Context> template <typename T>
void NonMaxSuppressionOp<Context>::DoRunWithType() {
int num_selected;
utils::detection::ApplyNMS(
Output(0)->count(),
Output(0)->count(),
iou_threshold_,
Input(0).template mutable_data<T, Context>(),
Output(0)->template mutable_data<int64_t, CPUContext>(),
num_selected, ctx()
);
Output(0)->Reshape({ num_selected });
}
template <class Context>
void NonMaxSuppressionOp<Context>::RunOnDevice() {
CHECK(Input(0).ndim() == 2 && Input(0).dim(1) == 5)
<< "\nThe dimensions of boxes should be (num_boxes, 5).";
Output(0)->Reshape({ Input(0).dim(0) });
DispatchHelper<TensorTypes<float>>::Call(this, Input(0));
}
DEPLOY_CPU(NonMaxSuppression);
#ifdef USE_CUDA
DEPLOY_CUDA(NonMaxSuppression);
#endif
OPERATOR_SCHEMA(NonMaxSuppression).NumInputs(1).NumOutputs(1);
NO_GRADIENT(NonMaxSuppression);
} // namespace dragon
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef SEETADET_CXX_OPERATORS_NMS_OP_H_
#define SEETADET_CXX_OPERATORS_NMS_OP_H_
#include "dragon/core/operator.h"
namespace dragon {
template <class Context>
class NonMaxSuppressionOp final : public Operator<Context> {
public:
NonMaxSuppressionOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
iou_threshold_(OpArg<float>("iou_threshold", 0.5f)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T>
void DoRunWithType();
protected:
float iou_threshold_;
};
} // namespace dragon
#endif // SEETADET_CXX_OPERATORS_NMS_OP_H_
#include <dragon/core/workspace.h>
#include <dragon/utils/math_utils.h>
#include "../utils/detection_utils.h"
#include "retinanet_decoder_op.h"
namespace dragon {
template <class Context> template <typename T>
void RetinaNetDecoderOp<Context>::DoRunWithType() {
using BT = float; // DType of BBox
using BC = CPUContext; // Context of BBox
int feat_h, feat_w;
int C = Input(-3).dim(2), A, K;
int total_proposals = 0;
int num_candidates, num_boxes, num_proposals;
auto* batch_scores = Input(-3).template data<T, BC>();
auto* batch_deltas = Input(-2).template data<T, BC>();
auto* im_info = Input(-1).template data<BT, BC>();
auto* y = Output(0)->template mutable_data<BT, BC>();
for (int n = 0; n < num_images_; ++n) {
BT im_h = im_info[0];
BT im_w = im_info[1];
BT im_scale_h = im_info[2];
BT im_scale_w = im_info[2];
if (Input(-1).dim(1) == 4) im_scale_w = im_info[3];
auto* scores = batch_scores + n * Input(-3).stride(0);
auto* deltas = batch_deltas + n * Input(-2).stride(0);
CHECK_EQ(strides_.size(), InputSize() - 3)
<< "\nGiven " << strides_.size() << " strides "
<< "and " << InputSize() - 3 << " features";
// Select the top-k candidates as proposals
num_boxes = Input(-3).dim(1);
num_candidates = Input(-3).count(1);
roi_indices_.resize(num_candidates);
num_candidates = 0;
for (int i = 0; i < roi_indices_.size(); ++i)
if (scores[i] > score_thr_)
roi_indices_[num_candidates++] = i;
scores_.resize(num_candidates);
for (int i = 0; i < num_candidates; ++i)
scores_[i] = scores[roi_indices_[i]];
num_proposals = std::min(
num_candidates,
(int)pre_nms_topn_
);
utils::math::ArgPartition(
num_candidates,
num_proposals,
true,
scores_.data(),
indices_
);
for (int i = 0; i < num_proposals; ++i)
indices_[i] = roi_indices_[indices_[i]];
// Decode the candidates
int base_offset = 0;
for (int i = 0; i < strides_.size(); i++) {
feat_h = Input(i).dim(2);
feat_w = Input(i).dim(3);
K = feat_h * feat_w;
A = int(ratios_.size() * scales_.size());
anchors_.resize((size_t)(A * 4));
utils::detection::GenerateAnchors(
strides_[i],
(int)ratios_.size(),
(int)scales_.size(),
ratios_.data(),
scales_.data(),
anchors_.data()
);
utils::detection::GenerateGridAnchors(
num_proposals, C, A,
feat_h, feat_w,
strides_[i],
base_offset,
anchors_.data(),
indices_.data(),
y
);
base_offset += (A * K);
}
utils::detection::GenerateMCProposals(
num_proposals,
num_boxes, C,
n,
im_h,
im_w,
im_scale_h,
im_scale_w,
scores,
deltas,
indices_.data(),
y
);
total_proposals += num_proposals;
y += (num_proposals * 7);
im_info += Input(-1).dim(1);
}
Output(0)->Reshape({ total_proposals, 7 });
}
template <class Context>
void RetinaNetDecoderOp<Context>::RunOnDevice() {
num_images_ = Input(0).dim(0);
CHECK_EQ(Input(-1).dim(0), num_images_)
<< "\nExcepted " << num_images_
<< " groups info, got "
<< Input(-1).dim(0) << ".";
Output(0)->Reshape({ num_images_ * pre_nms_topn_, 7 });
DispatchHelper<TensorTypes<float>>::Call(this, Input(-3));
}
DEPLOY_CPU(RetinaNetDecoder);
#ifdef USE_CUDA
DEPLOY_CUDA(RetinaNetDecoder);
#endif
OPERATOR_SCHEMA(RetinaNetDecoder)
.NumInputs(3, INT_MAX)
.NumOutputs(1, INT_MAX);
} // namespace dragon
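`GenerateMCProposals` turns each selected anchor plus its regression deltas into an image-space box. The transform is the standard Faster R-CNN box parameterization: `(dx, dy)` shift the anchor center, `(dw, dh)` rescale its width and height. A NumPy sketch of that math (the standard formulation; the operator's exact rounding and clipping may differ):
```python
import numpy as np

def decode_boxes(anchors, deltas):
    """anchors, deltas: (N, 4) arrays; boxes as (x1, y1, x2, y2)."""
    w = anchors[:, 2] - anchors[:, 0] + 1.0
    h = anchors[:, 3] - anchors[:, 1] + 1.0
    cx = anchors[:, 0] + 0.5 * w
    cy = anchors[:, 1] + 0.5 * h
    dx, dy, dw, dh = deltas.T
    pred_cx, pred_cy = dx * w + cx, dy * h + cy
    pred_w, pred_h = np.exp(dw) * w, np.exp(dh) * h
    return np.stack([pred_cx - 0.5 * pred_w, pred_cy - 0.5 * pred_h,
                     pred_cx + 0.5 * pred_w, pred_cy + 0.5 * pred_h], axis=1)

anchors = np.array([[0., 0., 31., 31.]])
deltas = np.array([[0.1, 0.0, 0.2, 0.0]])
print(decode_boxes(anchors, deltas))  # slightly shifted, ~22% wider box
```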
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef SEETADET_CXX_OPERATORS_RETINANET_DECODER_OP_H_
#define SEETADET_CXX_OPERATORS_RETINANET_DECODER_OP_H_
#include "dragon/core/operator.h"
namespace dragon {
template <class Context>
class RetinaNetDecoderOp final : public Operator<Context> {
public:
RetinaNetDecoderOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
strides_(OpArgs<int64_t>("strides")),
ratios_(OpArgs<float>("ratios")),
scales_(OpArgs<float>("scales")),
pre_nms_topn_(OpArg<int64_t>("pre_nms_top_n", 6000)),
score_thr_(OpArg<float>("score_thresh", 0.05f)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T>
void DoRunWithType();
protected:
float score_thr_;
vec64_t strides_, indices_, roi_indices_;
vector<float> ratios_, scales_, scores_, anchors_;
int64_t num_images_, pre_nms_topn_;
};
} // namespace dragon
#endif // SEETADET_CXX_OPERATORS_RETINANET_DECODER_OP_H_
#include <dragon/core/workspace.h>
#include <dragon/utils/math_utils.h>
#include "../utils/detection_utils.h"
#include "rpn_decoder_op.h"
namespace dragon {
template <class Context> template <typename T>
void RPNDecoderOp<Context>::DoRunWithType() {
using BT = float; // DType of BBox
using BC = CPUContext; // Context of BBox
int feat_h, feat_w, K, A;
int total_rois = 0, num_rois;
int num_candidates, num_proposals;
auto* batch_scores = Input(-3).template data<T, BC>();
auto* batch_deltas = Input(-2).template data<T, BC>();
auto* im_info = Input(-1).template data<BT, BC>();
auto* y = Output(0)->template mutable_data<BT, BC>();
for (int n = 0; n < num_images_; ++n) {
const BT im_h = im_info[0];
const BT im_w = im_info[1];
const BT scale = im_info[2];
const BT min_box_h = min_size_ * scale;
const BT min_box_w = min_size_ * scale;
auto* scores = batch_scores + n * Input(-3).stride(0);
auto* deltas = batch_deltas + n * Input(-2).stride(0);
if (strides_.size() == 1) {
// Case 1: single stride
feat_h = Input(0).dim(2);
feat_w = Input(0).dim(3);
K = feat_h * feat_w;
A = int(ratios_.size() * scales_.size());
// Select the Top-K candidates as proposals
num_candidates = A * K;
num_proposals = std::min(
num_candidates,
(int)pre_nms_topn_
);
utils::math::ArgPartition(
num_candidates,
num_proposals,
true, scores, indices_
);
// Decode the candidates
anchors_.resize((size_t)(A * 4));
proposals_.Reshape({ num_proposals, 5 });
utils::detection::GenerateAnchors(
strides_[0],
(int)ratios_.size(),
(int)scales_.size(),
ratios_.data(),
scales_.data(),
anchors_.data()
);
utils::detection::GenerateGridAnchors(
num_proposals, A,
feat_h, feat_w,
strides_[0],
0,
anchors_.data(),
indices_.data(),
proposals_.template mutable_data<BT, BC>()
);
utils::detection::GenerateSSProposals(
K, num_proposals,
im_h, im_w,
min_box_h, min_box_w,
scores,
deltas,
indices_.data(),
proposals_.template mutable_data<BT, BC>()
);
// Sort, NMS and Retrieve
utils::detection::SortProposals(
0,
num_proposals - 1,
num_proposals,
proposals_.template mutable_data<BT, BC>()
);
utils::detection::ApplyNMS(
num_proposals,
post_nms_topn_,
nms_thr_,
proposals_.template mutable_data<BT, Context>(),
roi_indices_.data(),
num_rois, ctx()
);
utils::detection::RetrieveRoIs(
num_rois,
n,
proposals_.template data<BT, BC>(),
roi_indices_.data(),
y
);
} else if (strides_.size() > 1) {
// Case 2: multiple strides
CHECK_EQ(strides_.size(), InputSize() - 3)
<< "\nGiven " << strides_.size() << " strides "
<< "and " << InputSize() - 3 << " feature inputs";
CHECK_EQ(strides_.size(), scales_.size())
<< "\nGiven " << strides_.size() << " strides "
<< "and " << scales_.size() << " scales";
// Select the top-k candidates as proposals
num_candidates = Input(-3).dim(1);
num_proposals = std::min(
num_candidates,
(int)pre_nms_topn_
);
utils::math::ArgPartition(
num_candidates,
num_proposals,
true, scores, indices_
);
// Decode the candidates
int base_offset = 0;
proposals_.Reshape({ num_proposals, 5 });
auto* proposals = proposals_
.template mutable_data<BT, BC>();
for (int i = 0; i < strides_.size(); i++) {
feat_h = Input(i).dim(2);
feat_w = Input(i).dim(3);
K = feat_h * feat_w;
A = (int)ratios_.size();
anchors_.resize((size_t)(A * 4));
utils::detection::GenerateAnchors(
strides_[i],
(int)ratios_.size(),
1,
ratios_.data(),
scales_.data(),
anchors_.data()
);
utils::detection::GenerateGridAnchors(
num_proposals, A,
feat_h, feat_w,
strides_[i],
base_offset,
anchors_.data(),
indices_.data(),
proposals
);
base_offset += (A * K);
}
utils::detection::GenerateMSProposals(
num_candidates,
num_proposals,
im_h, im_w,
min_box_h, min_box_w,
scores,
deltas,
&indices_[0],
proposals
);
// Sort, NMS and Retrieve
utils::detection::SortProposals(
0,
num_proposals - 1,
num_proposals,
proposals
);
utils::detection::ApplyNMS(
num_proposals,
post_nms_topn_,
nms_thr_,
proposals_.template mutable_data<BT, Context>(),
roi_indices_.data(),
num_rois, ctx()
);
utils::detection::RetrieveRoIs(
num_rois,
n,
proposals,
roi_indices_.data(),
y
);
} else {
LOG(FATAL) << "Expected at least one stride for proposals.";
}
total_rois += num_rois;
y += (num_rois * 5);
im_info += Input(-1).dim(1);
}
Output(0)->Reshape({ total_rois, 5 });
// Distribute rois into K bins
if (OutputSize() > 1) {
CHECK_EQ(max_level_ - min_level_ + 1, OutputSize())
<< "\nExpected " << OutputSize() << " outputs for levels "
"between [" << min_level_ << ", " << max_level_ << "].";
vector<BT*> ys(OutputSize());
vector<vec64_t> bins(OutputSize());
Tensor RoIs; RoIs.ReshapeLike(*Output(0));
auto* rois = RoIs.template mutable_data<BT, BC>();
ctx()->template Copy<BT, BC, BC>(
Output(0)->count(),
rois, Output(0)->template data<BT, BC>()
);
utils::detection::CollectRoIs(
total_rois,
min_level_,
max_level_,
canonical_level_,
canonical_scale_,
rois, bins
);
for (int i = 0; i < OutputSize(); i++) {
Output(i)->Reshape({ std::max((int)bins[i].size(), 1), 5 });
ys[i] = Output(i)->template mutable_data<BT, BC>();
}
utils::detection::DistributeRoIs(bins, rois, ys);
}
}
template <class Context>
void RPNDecoderOp<Context>::RunOnDevice() {
num_images_ = Input(0).dim(0);
CHECK_EQ(Input(-1).dim(0), num_images_)
<< "\nExpected " << num_images_
<< " groups of image info, got "
<< Input(-1).dim(0) << ".";
roi_indices_.resize(post_nms_topn_);
Output(0)->Reshape({ num_images_ * post_nms_topn_, 5 });
DispatchHelper<TensorTypes<float>>::Call(this, Input(-3));
}
DEPLOY_CPU(RPNDecoder);
#ifdef USE_CUDA
DEPLOY_CUDA(RPNDecoder);
#endif
OPERATOR_SCHEMA(RPNDecoder)
.NumInputs(3, INT_MAX)
.NumOutputs(1, INT_MAX);
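// Reader's note (not part of the op): inputs arrive as
// [feature_1, ..., feature_N, scores, deltas, im_info], and Output(0)
// holds the kept RoIs as rows of (batch_ind, x1, y1, x2, y2);
// extra outputs, when present, receive the RoIs of each FPN level.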
} // namespace dragon
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef SEETADET_CXX_OPERATORS_RPN_DECODER_OP_H_
#define SEETADET_CXX_OPERATORS_RPN_DECODER_OP_H_
#include "dragon/core/operator.h"
namespace dragon {
template <class Context>
class RPNDecoderOp final : public Operator<Context> {
public:
RPNDecoderOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
strides_(OpArgs<int64_t>("strides")),
ratios_(OpArgs<float>("ratios")),
scales_(OpArgs<float>("scales")),
pre_nms_topn_(OpArg<int64_t>("pre_nms_top_n", 6000)),
post_nms_topn_(OpArg<int64_t>("post_nms_top_n", 300)),
nms_thr_(OpArg<float>("nms_thresh", 0.7f)),
min_size_(OpArg<int64_t>("min_size", 16)),
min_level_(OpArg<int64_t>("min_level", 2)),
max_level_(OpArg<int64_t>("max_level", 5)),
canonical_level_(OpArg<int64_t>("canonical_level", 4)),
canonical_scale_(OpArg<int64_t>("canonical_scale", 224)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T>
void DoRunWithType();
protected:
float nms_thr_;
vec64_t strides_, indices_, roi_indices_;
vector<float> ratios_, scales_, scores_, anchors_;
int64_t min_size_, pre_nms_topn_, post_nms_topn_;
int64_t num_images_, min_level_, max_level_;
int64_t canonical_level_, canonical_scale_;
Tensor proposals_;
};
} // namespace dragon
#endif // SEETADET_CXX_OPERATORS_RPN_DECODER_OP_H_
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Build cxx sources."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from distutils.core import setup
from dragon.tools import cpp_extension
if cpp_extension.CUDA_HOME is not None and \
cpp_extension._cuda.is_available():
Extension = cpp_extension.CUDAExtension
else:
Extension = cpp_extension.CppExtension
ext_modules = [
Extension(
name='install.lib.modules._C',
sources=[
'utils/detection_utils.cc',
'utils/detection_utils.cu',
'operators/nms_op.cc',
'operators/retinanet_decoder_op.cc',
'operators/rpn_decoder_op.cc',
],
),
]
setup(
name='SeetaDet',
ext_modules=ext_modules,
cmdclass={'build_ext': cpp_extension.BuildExtension}
)
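# A typical build invocation (an assumption, mirroring standard
# distutils usage):
#   python setup.py build_ext --inplace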
#include <dragon/core/context.h>
#include "detection_utils.h"
namespace dragon {
namespace utils {
namespace detection {
template <typename T>
T IoU(const T A[], const T B[]) {
if (A[0] > B[2] || A[1] > B[3] ||
A[2] < B[0] || A[3] < B[1]) return 0;
const T x1 = std::max(A[0], B[0]);
const T y1 = std::max(A[1], B[1]);
const T x2 = std::min(A[2], B[2]);
const T y2 = std::min(A[3], B[3]);
const T width = std::max((T)0, x2 - x1 + 1);
const T height = std::max((T)0, y2 - y1 + 1);
const T area = width * height;
const T A_area = (A[2] - A[0] + 1) * (A[3] - A[1] + 1);
const T B_area = (B[2] - B[0] + 1) * (B[3] - B[1] + 1);
return area / (A_area + B_area - area);
}
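// A quick numeric check (comment only): for the 10x10 boxes
// A = {0, 0, 9, 9} and B = {5, 5, 14, 14}, the intersection is
// 5 * 5 = 25 and the union is 100 + 100 - 25 = 175, so
// IoU(A, B) = 25 / 175 ~= 0.143 under this +1 (integer pixel) convention.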
template <> void ApplyNMS<float, CPUContext>(
const int num_boxes,
const int max_keeps,
const float thresh,
const float* boxes,
int64_t* keep_indices,
int& num_keep,
CPUContext* ctx) {
int count = 0;
std::vector<char> is_dead(num_boxes);
for (int i = 0; i < num_boxes; ++i) is_dead[i] = 0;
for (int i = 0; i < num_boxes; ++i) {
if (is_dead[i]) continue;
keep_indices[count++] = i;
if (count == max_keeps) break;
for (int j = i + 1; j < num_boxes; ++j)
if (!is_dead[j] && IoU(&boxes[i * 5],
&boxes[j * 5]) > thresh)
is_dead[j] = 1;
}
num_keep = count;
}
} // namespace detection
} // namespace utils
} // namespace dragon
#ifdef USE_CUDA
#include <dragon/core/context_cuda.h>
#include "detection_utils.h"
namespace dragon {
namespace utils {
namespace detection {
#define DIV_UP(m,n) ((m) / (n) + ((m) % (n) > 0))
#define NUM_THREADS 64
namespace {
template <typename T>
__device__ bool _CheckIoU(
const T* a,
const T* b,
const float thresh) {
const T x1 = max(a[0], b[0]);
const T y1 = max(a[1], b[1]);
const T x2 = min(a[2], b[2]);
const T y2 = min(a[3], b[3]);
const T width = max(T(0), x2 - x1 + 1);
const T height = max(T(0), y2 - y1 + 1);
const T inter = width * height;
const T Sa = (a[2] - a[0] + T(1)) * (a[3] - a[1] + T(1));
const T Sb = (b[2] - b[0] + T(1)) * (b[3] - b[1] + T(1));
return inter > thresh * (Sa + Sb - inter);
}
template <typename T>
__global__ void _NonMaxSuppression(
const int num_blocks,
const int num_boxes,
const T thresh,
const T* dev_boxes,
uint64_t* dev_mask) {
const int row_start = blockIdx.y;
const int col_start = blockIdx.x;
if (row_start > col_start) return;
const int row_size = min(num_boxes - row_start * NUM_THREADS, NUM_THREADS);
const int col_size = min(num_boxes - col_start * NUM_THREADS, NUM_THREADS);
__shared__ T block_boxes[NUM_THREADS * 4];
if (threadIdx.x < col_size) {
const int c1 = threadIdx.x * 4;
const int c2 = (col_start * NUM_THREADS + threadIdx.x) * 5;
block_boxes[c1] = dev_boxes[c2];
block_boxes[c1 + 1] = dev_boxes[c2 + 1];
block_boxes[c1 + 2] = dev_boxes[c2 + 2];
block_boxes[c1 + 3] = dev_boxes[c2 + 3];
}
__syncthreads();
if (threadIdx.x < row_size) {
const int index = row_start * NUM_THREADS + threadIdx.x;
const T* dev_box = dev_boxes + index * 5;
unsigned long long val = 0;
const int start = (row_start == col_start) ? (threadIdx.x + 1) : 0;
for (int i = start; i < col_size; ++i) {
if (_CheckIoU(dev_box, block_boxes + i * 4, thresh)) {
val |= 1ULL << i;
}
}
dev_mask[index * num_blocks + col_start] = val;
}
}
} // namespace
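// How the pieces fit (comment only): the kernel above tiles boxes into
// 64-wide blocks; block (r, c) lets each thread of row-block r test its
// box against the 64 boxes of column-block c, packing the "suppressed"
// bits into one uint64 per (box, column-block) pair. The host loop below
// then walks boxes in score order and keeps a box only if no previously
// kept box has set its bit.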
template <> void ApplyNMS<float, CUDAContext>(
const int num_boxes,
const int max_keeps,
const float thresh,
const float* boxes,
int64_t* keep_indices,
int& num_keep,
CUDAContext* ctx) {
const int num_blocks = DIV_UP(num_boxes, NUM_THREADS);
vector<uint64_t> mask_host(num_boxes * num_blocks);
auto* mask_dev = (uint64_t*)ctx->New(mask_host.size() * sizeof(uint64_t));
_NonMaxSuppression
<<< dim3(num_blocks, num_blocks), NUM_THREADS,
0, ctx->cuda_stream() >>>(
num_blocks,
num_boxes,
thresh,
boxes,
mask_dev
);
CUDA_CHECK(cudaMemcpyAsync(
mask_host.data(),
mask_dev,
mask_host.size() * sizeof(uint64_t),
cudaMemcpyDeviceToHost,
ctx->cuda_stream()
));
ctx->FinishDeviceComputation();
vector<uint64_t> dead_bit(num_blocks);
memset(&dead_bit[0], 0, sizeof(uint64_t) * num_blocks);
int num_selected = 0;
for (int i = 0; i < num_boxes; ++i) {
const int nblock = i / NUM_THREADS;
const int inblock = i % NUM_THREADS;
if (!(dead_bit[nblock] & (1ULL << inblock))) {
keep_indices[num_selected++] = i;
auto* mask_i = &mask_host[0] + i * num_blocks;
for (int j = nblock; j < num_blocks; ++j) dead_bit[j] |= mask_i[j];
if (num_selected == max_keeps) break;
}
}
num_keep = num_selected;
ctx->Delete(mask_dev);
}
} // namespace detection
} // namespace utils
} // namespace dragon
#endif // USE_CUDA
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef SEETADET_CXX_UTILS_DETECTION_UTILS_H_
#define SEETADET_CXX_UTILS_DETECTION_UTILS_H_
#include "dragon/core/context.h"
#include "dragon/core/operator.h"
namespace dragon {
namespace utils {
namespace detection {
#define ROUND(x) ((int)((x) + (T)0.5))
/******************** BBox ********************/
template <typename T>
inline int FilterBoxes(
const T dx,
const T dy,
const T d_log_w,
const T d_log_h,
const T im_w,
const T im_h,
const T min_box_w,
const T min_box_h,
T* bbox) {
const T w = bbox[2] - bbox[0] + 1;
const T h = bbox[3] - bbox[1] + 1;
const T ctr_x = bbox[0] + (T)0.5 * w;
const T ctr_y = bbox[1] + (T)0.5 * h;
const T pred_ctr_x = dx * w + ctr_x;
const T pred_ctr_y = dy * h + ctr_y;
const T pred_w = exp(d_log_w) * w;
const T pred_h = exp(d_log_h) * h;
bbox[0] = pred_ctr_x - (T)0.5 * pred_w;
bbox[1] = pred_ctr_y - (T)0.5 * pred_h;
bbox[2] = pred_ctr_x + (T)0.5 * pred_w;
bbox[3] = pred_ctr_y + (T)0.5 * pred_h;
bbox[0] = std::max((T)0, std::min(bbox[0], im_w - 1));
bbox[1] = std::max((T)0, std::min(bbox[1], im_h - 1));
bbox[2] = std::max((T)0, std::min(bbox[2], im_w - 1));
bbox[3] = std::max((T)0, std::min(bbox[3], im_h - 1));
const T bbox_w = bbox[2] - bbox[0] + 1;
const T bbox_h = bbox[3] - bbox[1] + 1;
return (bbox_w >= min_box_w) * (bbox_h >= min_box_h);
}
template <typename T>
inline void BBoxTransform(
const T dx,
const T dy,
const T d_log_w,
const T d_log_h,
const T im_w,
const T im_h,
const T im_scale_h,
const T im_scale_w,
T* bbox) {
const T w = bbox[2] - bbox[0] + 1;
const T h = bbox[3] - bbox[1] + 1;
const T ctr_x = bbox[0] + (T)0.5 * w;
const T ctr_y = bbox[1] + (T)0.5 * h;
const T pred_ctr_x = dx * w + ctr_x;
const T pred_ctr_y = dy * h + ctr_y;
const T pred_w = exp(d_log_w) * w;
const T pred_h = exp(d_log_h) * h;
bbox[0] = pred_ctr_x - (T)0.5 * pred_w;
bbox[1] = pred_ctr_y - (T)0.5 * pred_h;
bbox[2] = pred_ctr_x + (T)0.5 * pred_w;
bbox[3] = pred_ctr_y + (T)0.5 * pred_h;
bbox[0] = std::max((T)0, std::min(bbox[0], im_w - 1)) / im_scale_w;
bbox[1] = std::max((T)0, std::min(bbox[1], im_h - 1)) / im_scale_h;
bbox[2] = std::max((T)0, std::min(bbox[2], im_w - 1)) / im_scale_w;
bbox[3] = std::max((T)0, std::min(bbox[3], im_h - 1)) / im_scale_h;
}
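// Both routines above decode deltas with the standard R-CNN
// parameterization:
//   cx' = dx * w + cx,  cy' = dy * h + cy,
//   w'  = exp(d_log_w) * w,  h' = exp(d_log_h) * h,
// then clip to the image. BBoxTransform additionally maps the clipped
// box back to the original resolution via im_scale_h / im_scale_w.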
/******************** Anchor ********************/
template <typename T>
inline void GenerateAnchors(
int base_size,
const int num_ratios,
const int num_scales,
const T* ratios,
const T* scales,
T* anchors) {
const T base_area = (T)(base_size * base_size);
const T center = (T)0.5 * (base_size - (T)1);
T* offset_anchors = anchors;
for (int i = 0; i < num_ratios; ++i) {
const T ratio_w = (T)ROUND(sqrt(base_area / ratios[i]));
const T ratio_h = (T)ROUND(ratio_w * ratios[i]);
for (int j = 0; j < num_scales; ++j) {
const T scale_w = (T)0.5 * (ratio_w * scales[j] - (T)1);
const T scale_h = (T)0.5 * (ratio_h * scales[j] - (T)1);
offset_anchors[0] = center - scale_w;
offset_anchors[1] = center - scale_h;
offset_anchors[2] = center + scale_w;
offset_anchors[3] = center + scale_h;
offset_anchors += 4;
}
}
}
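// Worked example (comment only): base_size=16, ratio=0.5, scale=8 gives
//   base_area = 256, center = 7.5,
//   ratio_w = ROUND(sqrt(256 / 0.5)) = 23, ratio_h = ROUND(23 * 0.5) = 12,
//   scale_w = 0.5 * (23 * 8 - 1) = 91.5, scale_h = 0.5 * (12 * 8 - 1) = 47.5,
// i.e. the familiar Faster R-CNN anchor (-84, -40, 99, 55).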
template <typename T>
inline void GenerateGridAnchors(
const int num_proposals,
const int num_anchors,
const int feat_h,
const int feat_w,
const int stride,
const int base_offset,
const T* anchors,
const int64_t* indices,
T* proposals) {
T x, y;
int idx_3d, a, h, w;
int idx_range = num_anchors * feat_h * feat_w;
for (int i = 0; i < num_proposals; ++i) {
idx_3d = (int)indices[i] - base_offset;
if (idx_3d >= 0 && idx_3d < idx_range) {
w = idx_3d % feat_w;
h = (idx_3d / feat_w) % feat_h;
a = idx_3d / feat_w / feat_h;
x = (T)w * stride, y = (T)h * stride;
auto* A = anchors + a * 4;
auto* P = proposals + i * 5;
P[0] = x + A[0], P[1] = y + A[1];
P[2] = x + A[2], P[3] = y + A[3];
}
}
}
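// Note: `indices` hold offsets flattened over (A, feat_h, feat_w), so an
// index decodes as a * (feat_h * feat_w) + h * feat_w + w. `base_offset`
// shifts the window when several strides share one global index space,
// as in the multi-stride branch of the RPN decoder.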
template <typename T>
inline void GenerateGridAnchors(
const int num_proposals,
const int num_classes,
const int num_anchors,
const int feat_h,
const int feat_w,
const int stride,
const int base_offset,
const T* anchors,
const int64_t* indices,
T* proposals) {
T x, y;
int idx_4d, a, h, w;
int lr = num_classes * base_offset;
int rr = num_classes * (num_anchors * feat_h * feat_w);
for (int i = 0; i < num_proposals; ++i) {
idx_4d = (int)indices[i] - lr;
if (idx_4d >= 0 && idx_4d < rr) {
idx_4d /= num_classes;
w = idx_4d % feat_w;
h = (idx_4d / feat_w) % feat_h;
a = idx_4d / feat_w / feat_h;
x = (T)w * stride, y = (T)h * stride;
auto* A = anchors + a * 4;
auto* P = proposals + i * 7 + 1;
P[0] = x + A[0], P[1] = y + A[1];
P[2] = x + A[2], P[3] = y + A[3];
}
}
}
/******************** Proposal ********************/
template <typename T>
void GenerateSSProposals(
const int K,
const int num_proposals,
const float im_h,
const float im_w,
const float min_box_h,
const float min_box_w,
const T* scores,
const T* deltas,
const int64_t* indices,
T* proposals) {
int64_t index, a, k;
const float* delta;
float* proposal = proposals;
float dx, dy, d_log_w, d_log_h;
for (int i = 0; i < num_proposals; ++i) {
index = indices[i];
a = index / K, k = index % K;
delta = deltas + k;
dx = delta[(a * 4 + 0) * K];
dy = delta[(a * 4 + 1) * K];
d_log_w = delta[(a * 4 + 2) * K];
d_log_h = delta[(a * 4 + 3) * K];
proposal[4] = FilterBoxes(
dx, dy,
d_log_w, d_log_h,
im_w, im_h,
min_box_w, min_box_h,
proposal
) * scores[index];
proposal += 5;
}
}
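// Layout assumed by GenerateSSProposals: scores are (A, K) and deltas are
// (A * 4, K) with K = feat_h * feat_w, so index / K selects the anchor
// and index % K the spatial cell. FilterBoxes returns 0 or 1, so the
// score of a too-small decoded box is zeroed out.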
template <typename T>
void GenerateMSProposals(
const int num_candidates,
const int num_proposals,
const float im_h,
const float im_w,
const float min_box_h,
const float min_box_w,
const T* scores,
const T* deltas,
const int64_t* indices,
T* proposals) {
int64_t index;
int64_t num_candidates_2x = 2 * num_candidates;
int64_t num_candidates_3x = 3 * num_candidates;
float* proposal = proposals;
float dx, dy, d_log_w, d_log_h;
for (int i = 0; i < num_proposals; ++i) {
index = indices[i];
dx = deltas[index];
dy = deltas[num_candidates + index];
d_log_w = deltas[num_candidates_2x + index];
d_log_h = deltas[num_candidates_3x + index];
proposal[4] = FilterBoxes(
dx, dy,
d_log_w, d_log_h,
im_w, im_h,
min_box_w, min_box_h,
proposal
) * scores[index];
proposal += 5;
}
}
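// Unlike the single-stride case, deltas here are laid out as
// (4, num_candidates): dx, dy, d_log_w and d_log_h occupy four contiguous
// planes indexed by the same flattened candidate index as the scores.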
template <typename T>
void GenerateMCProposals(
const int num_proposals,
const int num_boxes,
const int num_classes,
const int im_idx,
const float im_h,
const float im_w,
const float im_scale_h,
const float im_scale_w,
const T* scores,
const T* deltas,
const int64_t* indices,
T* proposals) {
int64_t index, cls;
int64_t num_boxes_2x = 2 * num_boxes;
int64_t num_boxes_3x = 3 * num_boxes;
float* proposal = proposals;
float dx, dy, d_log_w, d_log_h;
for (int i = 0; i < num_proposals; ++i) {
cls = indices[i] % num_classes;
index = indices[i] / num_classes;
dx = deltas[index];
dy = deltas[num_boxes + index];
d_log_w = deltas[num_boxes_2x + index];
d_log_h = deltas[num_boxes_3x + index];
proposal[0] = im_idx;
BBoxTransform(
dx, dy,
d_log_w, d_log_h,
im_w, im_h,
im_scale_h, im_scale_w,
proposal + 1
);
proposal[5] = scores[indices[i]];
proposal[6] = cls + 1;
proposal += 7;
}
}
template <typename T>
inline void SortProposals(
const int start,
const int end,
const int num_top,
T* proposals) {
const T pivot_score = proposals[start * 5 + 4];
int left = start + 1, right = end;
while (left <= right) {
while (left <= end && proposals[left * 5 + 4] >= pivot_score) ++left;
while (right > start && proposals[right * 5 + 4] <= pivot_score) --right;
if (left <= right) {
for (int i = 0; i < 5; ++i)
std::swap(proposals[left * 5 + i], proposals[right * 5 + i]);
++left;
--right;
}
}
if (right > start) {
for (int i = 0; i < 5; ++i)
std::swap(proposals[start * 5 + i], proposals[right * 5 + i]);
}
if (start < right - 1) SortProposals(start, right - 1, num_top, proposals);
if (right + 1 < num_top && right + 1 < end)
SortProposals(right + 1, end, num_top, proposals);
}
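// SortProposals is a partial quicksort on the score field (column 4):
// the right branch only recurses while it can still affect the first
// `num_top` rows, so just the top segment ends up fully ordered, which
// is all NMS needs.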
template <typename T>
inline void RetrieveRoIs(
const int num_rois,
const int roi_batch_ind,
const T* proposals,
const int64_t* roi_indices,
T* rois) {
for (int i = 0; i < num_rois; ++i) {
const T* proposal = proposals + roi_indices[i] * 5;
rois[i * 5 + 0] = (T)roi_batch_ind;
rois[i * 5 + 1] = proposal[0];
rois[i * 5 + 2] = proposal[1];
rois[i * 5 + 3] = proposal[2];
rois[i * 5 + 4] = proposal[3];
}
}
template <typename T>
inline int roi_level(
const int min_level,
const int max_level,
const int canonical_level,
const int canonical_scale,
T* roi) {
T w = roi[3] - roi[1] + 1;
T h = roi[4] - roi[2] + 1;
// Follow the level assignment of the FPN paper
int level = canonical_level + std::log2(
std::max(std::sqrt(w * h), (T)1) / (T)canonical_scale);
return std::min(max_level, std::max(min_level, level));
}
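// This is Eq. (1) of the FPN paper: level = k0 + log2(sqrt(w * h) / s0),
// truncated to int. With the defaults k0 = 4 and s0 = 224, a 224x224 RoI
// maps to level 4 and a 112x112 RoI to level 3, clamped to
// [min_level, max_level].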
template <typename T>
inline void CollectRoIs(
const int num_rois,
const int min_level,
const int max_level,
const int canonical_level,
const int canonical_scale,
const T* rois,
vector<vec64_t>& roi_bins) {
const T* roi = rois;
for (int i = 0; i < num_rois; ++i) {
int bin_idx = roi_level(min_level, max_level,
canonical_level, canonical_scale, roi);
bin_idx = std::max(bin_idx - min_level, 0);
roi_bins[bin_idx].push_back(i);
roi += 5;
}
}
template <typename T>
inline void DistributeRoIs(
const vector<vec64_t>& roi_bins,
const T* rois,
vector<T*> outputs) {
for (int i = 0; i < roi_bins.size(); i++) {
auto* y = outputs[i];
if (roi_bins[i].size() == 0) {
// Fake a tiny roi to avoid empty roi pooling
y[0] = 0, y[1] = 0, y[2] = 0, y[3] = 1, y[4] = 1;
} else {
for (int j = 0; j < roi_bins[i].size(); ++j) {
const T* roi = rois + roi_bins[i][j] * 5;
for (int k = 0; k < 5; ++k) y[k] = roi[k];
y += 5;
}
}
}
}
/******************** NMS ********************/
template <typename T, class Context>
void ApplyNMS(
const int num_boxes,
const int max_keeps,
const T thresh,
const T* boxes,
int64_t* keep_indices,
int& num_keep,
Context* ctx);
} // namespace detection
} // namespace utils
} // namespace dragon
#endif // SEETADET_CXX_UTILS_DETECTION_UTILS_H_
# --------------------------------------------------------
# Fast R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Sergey Karayev
# --------------------------------------------------------
cimport cython
import numpy as np
cimport numpy as np
DTYPE = np.float
ctypedef np.float_t DTYPE_t
@cython.boundscheck(False)
def bbox_overlaps(
        np.ndarray[DTYPE_t, ndim=2] boxes,
        np.ndarray[DTYPE_t, ndim=2] query_boxes):
    """
    Parameters
    ----------
    boxes: (N, 4) ndarray of float
    query_boxes: (K, 4) ndarray of float
    Returns
    -------
    overlaps: (N, K) ndarray of overlap between boxes and query_boxes
    """
    cdef unsigned int N = boxes.shape[0]
    cdef unsigned int K = query_boxes.shape[0]
    cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE)
    cdef DTYPE_t iw, ih, box_area
    cdef DTYPE_t ua
    cdef unsigned int k, n
    with nogil:
        for k in range(K):
            box_area = (
                (query_boxes[k, 2] - query_boxes[k, 0] + 1) *
                (query_boxes[k, 3] - query_boxes[k, 1] + 1)
            )
            for n in range(N):
                iw = (
                    min(boxes[n, 2], query_boxes[k, 2]) -
                    max(boxes[n, 0], query_boxes[k, 0]) + 1
                )
                if iw > 0:
                    ih = (
                        min(boxes[n, 3], query_boxes[k, 3]) -
                        max(boxes[n, 1], query_boxes[k, 1]) + 1
                    )
                    if ih > 0:
                        ua = float(
                            (boxes[n, 2] - boxes[n, 0] + 1) *
                            (boxes[n, 3] - boxes[n, 1] + 1) +
                            box_area - iw * ih
                        )
                        overlaps[n, k] = iw * ih / ua
    return overlaps
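# A usage sketch (assuming the extension is importable as
# lib.utils.cython_bbox after building):
#   import numpy as np
#   from lib.utils import cython_bbox
#   boxes = np.array([[0., 0., 9., 9.]])
#   query = np.array([[0., 0., 9., 9.], [5., 5., 14., 14.]])
#   cython_bbox.bbox_overlaps(boxes, query)  # -> [[1.0, 0.1428...]]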
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Compile the cython extensions."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from distutils.extension import Extension
from distutils.core import setup
import os
from Cython.Distutils import build_ext
import numpy as np
ext_modules = [
Extension(
'install.lib.utils.cython_bbox',
['cython_bbox.pyx'],
extra_compile_args=['-w'],
include_dirs=[np.get_include()]
),
Extension(
'install.lib.utils.cython_nms',
['cython_nms.pyx'],
extra_compile_args=['-w'],
include_dirs=[np.get_include()]
),
Extension(
'install.lib.pycocotools._mask',
['maskApi.c', '_mask.pyx'],
include_dirs=[np.get_include(), os.path.dirname(os.path.abspath(__file__))],
extra_compile_args=['-w']
),
]
setup(
name='SeetaDet',
ext_modules=ext_modules,
cmdclass={'build_ext': build_ext},
)
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import multiprocessing as mp
import time
import dragon
import dragon.vm.torch as torch
import numpy as np
from lib.core.config import cfg
from lib.faster_rcnn.data_transformer import DataTransformer
from lib.datasets.factory import get_imdb
from lib.utils import logger
from lib.utils.blob import im_list_to_blob
class DataLoader(object):
"""Provide mini-batches of data."""
def __init__(self):
super(DataLoader, self).__init__()
database = get_imdb(cfg.TRAIN.DATABASE)
self.data_batch = DataBatch(**{
'dataset': lambda: dragon.io.SeetaRecordDataset(database.source),
'classes': database.classes,
'shuffle': cfg.TRAIN.USE_SHUFFLE,
'num_chunks': cfg.TRAIN.NUM_SHUFFLE_CHUNKS,
'batch_size': cfg.TRAIN.IMS_PER_BATCH * 2,
'num_transformers': cfg.TRAIN.NUM_WORKERS,
})
def __call__(self):
outputs = self.data_batch.get()
outputs['data'] = torch.from_numpy(outputs['data'])
return outputs
class DataBatch(mp.Process):
"""Prefetch the batch of data."""
def __init__(self, **kwargs):
"""Construct a ``DataBatch``.
Parameters
----------
dataset : lambda
The creator of a dataset.
classes : Sequence[str]
The class names.
shuffle : bool, optional, default=False
Whether to shuffle the data.
num_chunks : int, optional, default=0
The number of chunks to split.
batch_size : int, optional, default=2
The size of a mini-batch.
num_transformers : int, optional, default=3
The number of workers to transform data.
"""
super(DataBatch, self).__init__()
# Distributed settings
rank, group_size = 0, 1
process_group = dragon.distributed.get_group()
if process_group is not None and kwargs.get(
'phase', 'TRAIN') == 'TRAIN':
group_size = process_group.size
rank = dragon.distributed.get_rank(process_group)
kwargs['group_size'] = group_size
# Configuration
self._prefetch = kwargs.get('prefetch', 5)
self._batch_size = kwargs.get('batch_size', 2)
self._num_readers = kwargs.get('num_readers', 1)
self._num_transformers = kwargs.get('num_transformers', 3)
self._num_fetchers = kwargs.get('num_fetchers', 1)
self.daemon = True
# Initialize queues
num_batches = self._prefetch * self._num_readers
self.Q1 = mp.Queue(num_batches * self._batch_size)
self.Q21 = mp.Queue(num_batches * self._batch_size)
self.Q22 = mp.Queue(num_batches * self._batch_size)
self.Q3 = mp.Queue(num_batches)
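        # Resulting topology (for orientation):
        #   readers -> Q1 -> transformers -> Q21 / Q22 -> this process -> Q3
        # Q21 and Q22 keep the two aspect groups apart (see run() below).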
# Initialize readers
self._readers = []
for i in range(self._num_readers):
part_idx, num_parts = i, self._num_readers
num_parts *= group_size
part_idx += rank * self._num_readers
self._readers.append(dragon.io.DataReader(
num_parts=num_parts, part_idx=part_idx, **kwargs))
self._readers[i]._seed += part_idx
self._readers[i].q_out = self.Q1
self._readers[i].start()
time.sleep(0.1)
# Initialize transformers
self._transformers = []
for i in range(self._num_transformers):
transformer = DataTransformer(**kwargs)
transformer._seed += (i + rank * self._num_transformers)
transformer.q_in = self.Q1
transformer.q1_out, transformer.q2_out = self.Q21, self.Q22
transformer.start()
self._transformers.append(transformer)
time.sleep(0.1)
# Initialize batch-producer
self.start()
# Register cleanup callbacks
def cleanup():
def terminate(processes):
for process in processes:
process.terminate()
process.join()
terminate([self])
logger.info('Terminate DataBatch.')
terminate(self._transformers)
logger.info('Terminate DataTransformer.')
terminate(self._readers)
logger.info('Terminate DataReader.')
import atexit
atexit.register(cleanup)
def get(self):
"""Get a batch.
Returns
-------
dict
The batch dict.
"""
return self.Q3.get()
def run(self):
"""Start the process to produce batches."""
def produce(q_in):
processed_ims, ims_info, all_boxes = [], [], []
for image_index in range(cfg.TRAIN.IMS_PER_BATCH):
im, im_scale, gt_boxes = q_in.get()
processed_ims.append(im)
ims_info.append(list(im.shape[:2]) + [im_scale])
im_boxes = np.zeros((gt_boxes.shape[0], gt_boxes.shape[1] + 1), 'float32')
im_boxes[:, :gt_boxes.shape[1]], im_boxes[:, -1] = gt_boxes, image_index
all_boxes.append(im_boxes)
return {
'data': im_list_to_blob(processed_ims),
'ims_info': np.array(ims_info, dtype=np.float32),
'gt_boxes': np.concatenate(all_boxes, axis=0),
}
# Two queues to implement aspect-grouping
# This is necessary to reduce the gpu memory
# from fetching a huge square batch blob
q1, q2 = self.Q21, self.Q22
# Main prefetch loop
while True:
if q1.qsize() >= cfg.TRAIN.IMS_PER_BATCH:
self.Q3.put(produce(q1))
elif q2.qsize() >= cfg.TRAIN.IMS_PER_BATCH:
self.Q3.put(produce(q2))
q1, q2 = q2, q1 # Uniform sampling trick
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import multiprocessing as mp
import time
import dragon
import dragon.vm.torch as torch
import numpy as np
from lib.core.config import cfg
from lib.mask_rcnn.data_transformer import DataTransformer
from lib.datasets.factory import get_imdb
from lib.utils import logger
from lib.utils.blob import im_list_to_blob
from lib.utils.blob import mask_list_to_blob
class DataLoader(object):
"""Provide mini-batches of data."""
def __init__(self):
super(DataLoader, self).__init__()
database = get_imdb(cfg.TRAIN.DATABASE)
self.data_batch = DataBatch(**{
'dataset': lambda: dragon.io.SeetaRecordDataset(database.source),
'classes': database.classes,
'shuffle': cfg.TRAIN.USE_SHUFFLE,
'num_chunks': cfg.TRAIN.NUM_SHUFFLE_CHUNKS,
'batch_size': cfg.TRAIN.IMS_PER_BATCH * 2,
'num_transformers': cfg.TRAIN.NUM_WORKERS,
})
def __call__(self):
outputs = self.data_batch.get()
outputs['data'] = torch.from_numpy(outputs['data'])
return outputs
class DataBatch(mp.Process):
"""Prefetch the batch of data."""
def __init__(self, **kwargs):
"""Construct a ``DataBatch``.
Parameters
----------
dataset : lambda
The creator of a dataset.
classes : Sequence[str]
The class names.
shuffle : bool, optional, default=False
Whether to shuffle the data.
num_chunks : int, optional, default=0
The number of chunks to split.
batch_size : int, optional, default=2
The size of a mini-batch.
num_transformers : int, optional, default=3
The number of workers to transform data.
"""
super(DataBatch, self).__init__()
# Distributed settings
rank, group_size = 0, 1
process_group = dragon.distributed.get_group()
if process_group is not None and kwargs.get(
'phase', 'TRAIN') == 'TRAIN':
group_size = process_group.size
rank = dragon.distributed.get_rank(process_group)
kwargs['group_size'] = group_size
# Configuration
self._prefetch = kwargs.get('prefetch', 5)
self._batch_size = kwargs.get('batch_size', 2)
self._num_readers = kwargs.get('num_readers', 1)
self._num_transformers = kwargs.get('num_transformers', 3)
self._num_fetchers = kwargs.get('num_fetchers', 1)
self.daemon = True
# Initialize queues
num_batches = self._prefetch * self._num_readers
self.Q1 = mp.Queue(num_batches * self._batch_size)
self.Q21 = mp.Queue(num_batches * self._batch_size)
self.Q22 = mp.Queue(num_batches * self._batch_size)
self.Q3 = mp.Queue(num_batches)
# Initialize readers
self._readers = []
for i in range(self._num_readers):
part_idx, num_parts = i, self._num_readers
num_parts *= group_size
part_idx += rank * self._num_readers
self._readers.append(dragon.io.DataReader(
num_parts=num_parts, part_idx=part_idx, **kwargs))
self._readers[i]._seed += part_idx
self._readers[i].q_out = self.Q1
self._readers[i].start()
time.sleep(0.1)
# Initialize transformers
self._transformers = []
for i in range(self._num_transformers):
transformer = DataTransformer(**kwargs)
transformer._seed += (i + rank * self._num_transformers)
transformer.q_in = self.Q1
transformer.q1_out, transformer.q2_out = self.Q21, self.Q22
transformer.start()
self._transformers.append(transformer)
time.sleep(0.1)
# Initialize batch-producer
self.start()
# Register cleanup callbacks
def cleanup():
def terminate(processes):
for process in processes:
process.terminate()
process.join()
terminate([self])
logger.info('Terminate DataBatch.')
terminate(self._transformers)
logger.info('Terminate DataTransformer.')
terminate(self._readers)
logger.info('Terminate DataReader.')
import atexit
atexit.register(cleanup)
def get(self):
"""Get a batch.
Returns
-------
dict
The batch dict.
"""
return self.Q3.get()
def run(self):
"""Start the process to produce batches."""
def produce(q_in):
processed_ims, ims_info = [], []
packed_boxes, packed_masks = [], []
for image_index in range(cfg.TRAIN.IMS_PER_BATCH):
im, im_scale, gt_boxes, gt_masks = q_in.get()
processed_ims.append(im)
ims_info.append(list(im.shape[:2]) + [im_scale])
im_boxes = np.zeros((gt_boxes.shape[0], gt_boxes.shape[1] + 1), 'float32')
im_boxes[:, :gt_boxes.shape[1]], im_boxes[:, -1] = gt_boxes, image_index
packed_boxes.append(im_boxes)
packed_masks.append(gt_masks)
return {
'data': im_list_to_blob(processed_ims),
'ims_info': np.array(ims_info, 'float32'),
'gt_boxes': np.concatenate(packed_boxes, 0),
'gt_masks': mask_list_to_blob(packed_masks),
}
# Two queues to implement aspect-grouping
# This is necessary to reduce the gpu memory
# from fetching a huge square batch blob
q1, q2 = self.Q21, self.Q22
# Main prefetch loop
while True:
if q1.qsize() >= cfg.TRAIN.IMS_PER_BATCH:
self.Q3.put(produce(q1))
elif q2.qsize() >= cfg.TRAIN.IMS_PER_BATCH:
self.Q3.put(produce(q2))
q1, q2 = q2, q1 # Uniform sampling trick
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import importlib
_STORE = collections.defaultdict(dict)
###########################################
# #
# Body #
# #
###########################################
# ResNet
for D in [18, 34, 50, 101, 152, 200, 269]:
_STORE['BODY']['resnet{}'.format(D)] = \
'lib.modeling.resnet.make_resnet_{}'.format(D)
# VGG
for D in [16, 19]:
for T in ['', '_reduced_300', '_reduced_512']:
_STORE['BODY']['vgg{}{}'.format(D, T)] = \
'lib.modeling.vgg.make_vgg_{}{}'.format(D, T)
# AirNet
for D in ['', '3b', '4b', '5b']:
_STORE['BODY']['airnet{}'.format(D)] = \
'lib.modeling.airnet.make_airnet_{}'.format(D)
# MobileNet
for D in ['a1', 'v2']:
_STORE['BODY']['mobilenet_{}'.format(D)] = \
'lib.modeling.mobilenet.make_mobilenet_{}'.format(D)
def get_template_func(name, sets, desc):
name = name.lower()
if name not in sets:
raise ValueError(
'The {} for {} was not registered.\n'
'Registered modules: [{}]'
.format(name, desc, ', '.join(sets.keys()))
)
module_name = '.'.join(sets[name].split('.')[0:-1])
func_name = sets[name].split('.')[-1]
try:
module = importlib.import_module(module_name)
return getattr(module, func_name)
    except ImportError:
        raise ValueError('Cannot import module: ' + module_name)
def get_body_func(name):
return get_template_func(
name, _STORE['BODY'], 'Body')
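# For example (comment only), get_body_func('resnet50') imports
# lib.modeling.resnet and returns its make_resnet_50 function, since the
# registry maps 'resnet50' to 'lib.modeling.resnet.make_resnet_50'.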
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Define some basic structures."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.torch import nn
from lib.core.config import cfg
class Affine(object):
"""Affine transformation with weight and bias fixed."""
def __new__(cls, dim_in, bias=True, inplace=True):
return nn.Affine(
dim_in,
fix_weight=True,
fix_bias=True,
inplace=inplace,
)
class Conv1x1(object):
"""1x1 convolution."""
def __new__(cls, dim_in, dim_out, stride=1, bias=False):
return nn.Conv2d(
dim_in,
dim_out,
kernel_size=1,
stride=stride,
bias=bias,
)
class Conv3x3(object):
"""3x3 convolution."""
def __new__(cls, dim_in, dim_out, stride=1, dilation=1, bias=False):
return nn.Conv2d(
dim_in,
dim_out,
kernel_size=3,
stride=stride,
padding=1 * dilation,
bias=bias,
)
class CrossEntropyLoss(object):
"""Cross entropy loss."""
def __new__(cls):
return nn.CrossEntropyLoss(ignore_index=-1)
class Identity(nn.Module):
"""Pass input to the output."""
def __init__(self, *args, **kwargs):
super(Identity, self).__init__()
_, _ = args, kwargs
def forward(self, x):
return x
class SigmoidFocalLoss(object):
"""Sigmoid focal loss."""
def __new__(cls):
return nn.SigmoidFocalLoss(
alpha=cfg.MODEL.FOCAL_LOSS_ALPHA,
gamma=cfg.MODEL.FOCAL_LOSS_GAMMA,
)
class SmoothL1Loss(object):
"""Smoothed l1 loss."""
def __new__(cls, beta=1.):
return nn.SmoothL1Loss(
beta=beta,
reduction='batch_size',
)
def is_conv2d(module):
"""Return a bool indicating the module is a Conv2d."""
return isinstance(module, nn.Conv2d) or \
isinstance(module, nn.DepthwiseConv2d)
AvgPool2d = nn.AvgPool2d
BatchNorm2d = nn.BatchNorm2d
BCEWithLogitsLoss = nn.BCEWithLogitsLoss
Conv2d = nn.Conv2d
ConvTranspose2d = nn.ConvTranspose2d
DepthwiseConv2d = nn.DepthwiseConv2d
Linear = nn.Linear
MaxPool2d = nn.MaxPool2d
Module = nn.Module
ModuleList = nn.ModuleList
Sequential = nn.Sequential
ReLU = nn.ReLU
Sigmoid = nn.Sigmoid
Softmax = nn.Softmax
Copyright (c) 2014, Piotr Dollar and Tsung-Yi Lin
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
The views and conclusions contained in the software and documentation are those
of the authors and should not be interpreted as representing official policies,
either expressed or implied, of the FreeBSD Project.
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import multiprocessing as mp
import time
import dragon
import dragon.vm.torch as torch
import numpy as np
from lib.core.config import cfg
from lib.datasets.factory import get_imdb
from lib.ssd.data_transformer import DataTransformer
from lib.utils import logger
class DataLoader(object):
"""Provide mini-batches of data."""
def __init__(self):
super(DataLoader, self).__init__()
database = get_imdb(cfg.TRAIN.DATABASE)
self.data_batch = DataBatch(**{
'dataset': lambda: dragon.io.SeetaRecordDataset(database.source),
'classes': database.classes,
'shuffle': cfg.TRAIN.USE_SHUFFLE,
'num_chunks': cfg.TRAIN.NUM_SHUFFLE_CHUNKS,
'batch_size': cfg.TRAIN.IMS_PER_BATCH * 2,
'num_transformers': cfg.TRAIN.NUM_WORKERS,
})
def __call__(self):
outputs = self.data_batch.get()
outputs['data'] = torch.from_numpy(outputs['data'])
return outputs
class DataBatch(mp.Process):
"""Prefetch the batch of data."""
def __init__(self, **kwargs):
"""Construct a ``DataBatch``.
Parameters
----------
dataset : lambda
The creator of a dataset.
classes : Sequence[str]
The class names.
shuffle : bool, optional, default=False
Whether to shuffle the data.
num_chunks : int, optional, default=0
The number of chunks to split.
batch_size : int, optional, default=2
The size of a mini-batch.
num_transformers : int, optional, default=3
The number of workers to transform data.
"""
super(DataBatch, self).__init__()
# Distributed settings
rank, group_size = 0, 1
process_group = dragon.distributed.get_group()
if process_group is not None and kwargs.get(
'phase', 'TRAIN') == 'TRAIN':
group_size = process_group.size
rank = dragon.distributed.get_rank(process_group)
kwargs['group_size'] = group_size
# Configuration
self._prefetch = kwargs.get('prefetch', 5)
self._batch_size = kwargs.get('batch_size', 32)
self._num_readers = kwargs.get('num_readers', 1)
self._num_transformers = kwargs.get('num_transformers', 3)
self._num_fetchers = kwargs.get('num_fetchers', 1)
# Initialize queues
num_batches = self._prefetch * self._num_readers
self.Q1 = mp.Queue(num_batches * self._batch_size)
self.Q2 = mp.Queue(num_batches * self._batch_size)
self.Q3 = mp.Queue(num_batches)
# Initialize readers
self._readers = []
for i in range(self._num_readers):
part_idx, num_parts = i, self._num_readers
num_parts *= group_size
part_idx += rank * self._num_readers
self._readers.append(dragon.io.DataReader(
num_parts=num_parts, part_idx=part_idx, **kwargs))
self._readers[i]._seed += part_idx
self._readers[i].q_out = self.Q1
self._readers[i].start()
time.sleep(0.1)
# Initialize transformers
self._transformers = []
for i in range(self._num_transformers):
transformer = DataTransformer(**kwargs)
transformer._seed += (i + rank * self._num_transformers)
transformer.q_in, transformer.q_out = self.Q1, self.Q2
transformer.start()
self._transformers.append(transformer)
time.sleep(0.1)
# Initialize batch-producer
self.start()
# Register cleanup callbacks
def cleanup():
def terminate(processes):
for process in processes:
process.terminate()
process.join()
terminate([self])
logger.info('Terminate DataBatch.')
terminate(self._transformers)
logger.info('Terminate DataTransformer.')
terminate(self._readers)
logger.info('Terminate DataReader.')
import atexit
atexit.register(cleanup)
def get(self):
"""Get a batch.
Returns
-------
dict
The batch dict.
"""
return self.Q3.get()
def run(self):
"""Start the process to produce batches."""
image_batch_shape = (
cfg.TRAIN.IMS_PER_BATCH,
cfg.SSD.RESIZE.HEIGHT,
cfg.SSD.RESIZE.WIDTH, 3,
)
# Main prefetch loop
while True:
boxes_to_pack = []
img, gt_boxes = self.Q2.get()
ims_blob = np.zeros(image_batch_shape, img.dtype)
for i in range(cfg.TRAIN.IMS_PER_BATCH):
ims_blob[i] = img
boxes = np.zeros((gt_boxes.shape[0], gt_boxes.shape[1] + 1), 'float32')
boxes[:, :gt_boxes.shape[1]], boxes[:, -1] = gt_boxes, i
boxes_to_pack.append(boxes)
if i != (cfg.TRAIN.IMS_PER_BATCH - 1):
img, gt_boxes = self.Q2.get()
self.Q3.put({
'data': ims_blob,
'gt_boxes': np.concatenate(boxes_to_pack),
})
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# Codes are based on:
#
# <https://github.com/ppwwyyxx/tensorpack/blob/master/examples/FasterRCNN/utils/np_box_ops.py>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from lib.utils import cython_bbox
def intersection(boxes1, boxes2):
"""Compute pairwise intersection areas between boxes.
Args:
boxes1: a numpy array with shape [N, 4] holding N boxes
boxes2: a numpy array with shape [M, 4] holding M boxes
Returns:
    a numpy array with shape [N, M] representing pairwise intersection areas
"""
[y_min1, x_min1, y_max1, x_max1] = np.split(boxes1, 4, axis=1)
[y_min2, x_min2, y_max2, x_max2] = np.split(boxes2, 4, axis=1)
all_pairs_min_ymax = np.minimum(y_max1, np.transpose(y_max2))
all_pairs_max_ymin = np.maximum(y_min1, np.transpose(y_min2))
intersect_heights = np.maximum(
np.zeros(all_pairs_max_ymin.shape),
all_pairs_min_ymax - all_pairs_max_ymin)
all_pairs_min_xmax = np.minimum(x_max1, np.transpose(x_max2))
all_pairs_max_xmin = np.maximum(x_min1, np.transpose(x_min2))
intersect_widths = np.maximum(
np.zeros(all_pairs_max_xmin.shape),
all_pairs_min_xmax - all_pairs_max_xmin)
return intersect_heights * intersect_widths
def iou(boxes1, boxes2):
"""Computes pairwise intersection-over-union between box collections.
Args:
boxes1: a numpy array with shape [N, 4] holding N boxes.
boxes2: a numpy array with shape [M, 4] holding M boxes.
Returns:
a numpy array with shape [N, M] representing pairwise iou scores.
"""
intersect = intersection(boxes1, boxes2)
area1 = boxes_area(boxes1)
area2 = boxes_area(boxes2)
union = \
np.expand_dims(area1, axis=1) + \
np.expand_dims(area2, axis=0) - intersect
return intersect / union
def ioa1(boxes1, boxes2):
"""Computes pairwise intersection-over-area between box collections.
Intersection-over-area (ioa) between two boxes box1 and box2 is defined as
their intersection area over box2's area. Note that ioa is not symmetric,
that is, IOA(box1, box2) != IOA(box2, box1).
Args:
boxes1: a numpy array with shape [N, 4] holding N boxes.
    boxes2: a numpy array with shape [M, 4] holding M boxes.
Returns:
a numpy array with shape [N, M] representing pairwise ioa scores.
"""
intersect = intersection(boxes1, boxes2)
areas = np.expand_dims(boxes_area(boxes1), axis=1)
return intersect / areas
def ioa2(boxes1, boxes2):
"""Computes pairwise intersection-over-area between box collections.
Intersection-over-area (ioa) between two boxes box1 and box2 is defined as
their intersection area over box2's area. Note that ioa is not symmetric,
that is, IOA(box1, box2) != IOA(box2, box1).
Args:
boxes1: a numpy array with shape [N, 4] holding N boxes.
    boxes2: a numpy array with shape [M, 4] holding M boxes.
Returns:
a numpy array with shape [N, M] representing pairwise ioa scores.
"""
intersect = intersection(boxes1, boxes2)
areas = np.expand_dims(boxes_area(boxes2), axis=0)
return intersect / areas
def bbox_overlaps(boxes1, boxes2):
"""Compute the overlaps between two group of boxes."""
return cython_bbox.bbox_overlaps(
np.ascontiguousarray(boxes1, dtype=np.float),
np.ascontiguousarray(boxes2, dtype=np.float),
)
def bbox_transform(ex_rois, gt_rois, weights=(1., 1., 1., 1.)):
"""Transform the boxes to the regression targets."""
ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.
ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.
ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths
ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights
gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.
gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.
gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths
gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights
wx, wy, ww, wh = weights
targets = [wx * (gt_ctr_x - ex_ctr_x) / ex_widths]
targets += [wy * (gt_ctr_y - ex_ctr_y) / ex_heights]
targets += [ww * np.log(gt_widths / ex_widths)]
targets += [wh * np.log(gt_heights / ex_heights)]
return np.vstack(targets).transpose()
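# Sanity check (comment only): when ex_rois == gt_rois, the center offsets
# and log size ratios all cancel, so bbox_transform returns zeros; and
# bbox_transform_inv below inverts it, i.e.
# bbox_transform_inv(b, bbox_transform(b, g)) recovers g up to float error.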
def bbox_transform_inv(boxes, deltas, weights=(1., 1., 1., 1.)):
"""Decode the final boxes according to the deltas."""
if boxes.shape[0] == 0:
return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype)
boxes = boxes.astype(deltas.dtype, copy=False)
widths = boxes[:, 2] - boxes[:, 0] + 1.
heights = boxes[:, 3] - boxes[:, 1] + 1.
ctr_x = boxes[:, 0] + 0.5 * widths
ctr_y = boxes[:, 1] + 0.5 * heights
wx, wy, ww, wh = weights
dx = deltas[:, 0::4] / wx
dy = deltas[:, 1::4] / wy
dw = deltas[:, 2::4] / ww
dh = deltas[:, 3::4] / wh
pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis]
pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis]
pred_w = np.exp(dw) * widths[:, np.newaxis]
pred_h = np.exp(dh) * heights[:, np.newaxis]
pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype)
pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w # x1
pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h # y1
pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w # x2
pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h # y2
return pred_boxes
def boxes_area(boxes):
"""Compute the area of an array of boxes."""
w = (boxes[:, 2] - boxes[:, 0] + 1)
h = (boxes[:, 3] - boxes[:, 1] + 1)
areas = w * h
    assert np.all(areas >= 0), 'Negative areas found'
return areas
def clip_boxes(boxes, im_shape):
# x1 >= 0
boxes[:, 0] = np.maximum(np.minimum(boxes[:, 0], im_shape[1] - 1), 0)
# y1 >= 0
boxes[:, 1] = np.maximum(np.minimum(boxes[:, 1], im_shape[0] - 1), 0)
# x2 < im_shape[1]
boxes[:, 2] = np.maximum(np.minimum(boxes[:, 2], im_shape[1] - 1), 0)
# y2 < im_shape[0]
boxes[:, 3] = np.maximum(np.minimum(boxes[:, 3], im_shape[0] - 1), 0)
return boxes
def clip_tiled_boxes(boxes, im_shape):
# x1 >= 0
boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0)
# y1 >= 0
boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0)
# x2 < im_shape[1]
boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0)
# y2 < im_shape[0]
boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0)
return boxes
def expand_boxes(boxes, scale):
"""Expand an array of boxes by a given scale."""
w_half = (boxes[:, 2] - boxes[:, 0]) * .5
h_half = (boxes[:, 3] - boxes[:, 1]) * .5
x_c = (boxes[:, 2] + boxes[:, 0]) * .5
y_c = (boxes[:, 3] + boxes[:, 1]) * .5
w_half *= scale
h_half *= scale
boxes_exp = np.zeros(boxes.shape)
boxes_exp[:, 0] = x_c - w_half
boxes_exp[:, 2] = x_c + w_half
boxes_exp[:, 1] = y_c - h_half
boxes_exp[:, 3] = y_c + h_half
return boxes_exp
def flip_boxes(boxes, width):
"""Flip the boxes horizontally."""
    flipped = boxes.copy()
    old_x1 = boxes[:, 0].copy()
    old_x2 = boxes[:, 2].copy()
    flipped[:, 0] = width - old_x2 - 1
    flipped[:, 2] = width - old_x1 - 1
    return flipped
def filter_boxes(boxes, min_size):
"""Remove all boxes with any side smaller than min size."""
ws = boxes[:, 2] - boxes[:, 0] + 1
hs = boxes[:, 3] - boxes[:, 1] + 1
keep = np.where((ws >= min_size) & (hs >= min_size))[0]
return keep
def dismantle_boxes(gt_boxes, num_images):
"""Dismantle the packed ground-truth boxes."""
return [
gt_boxes[
np.where(gt_boxes[:, -1].astype(np.int32) == i)[0]
][:, :-1] for i in range(num_images)
]
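# E.g. (comment only): with packed gt_boxes whose last column is the image
# index, dismantle_boxes(gt_boxes, 2) returns [boxes_of_image_0,
# boxes_of_image_1], each without the trailing index column.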
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import dragon
from dragon.core.framework import tensor_util
from dragon.core.util import six
import dragon.vm.torch as torch
import numpy as np
from lib.core.config import cfg
def feed_tensor(tensor, array):
tensor_util.set_array(tensor, array)
def get_param_groups(module, bias_lr=1., bias_decay=0.):
"""Separate weight and bias into parameters groups.
Parameters
----------
module : dragon.vm.torch.nn.Module
The module to collect parameters.
bias_lr : float, optional, default=1.
The lr multiplier of bias.
bias_decay : float, optional, default=0.
The decay multiplier of bias.
Returns
-------
Sequence[ParamGroup]
The parameter groups.
"""
param_groups = [
{
'params': [],
'lr_mult': 1.,
'decay_mult': 1.,
},
{
'params': [],
'lr_mult': bias_lr,
'decay_mult': bias_decay,
}
]
for name, param in module.named_parameters():
gi = 0 if 'weight' in name and param.dim() > 1 else 1
param_groups[gi]['params'].append(param)
if len(param_groups[1]['params']) == 0:
param_groups.pop() # Remove empty group
return param_groups
def get_workspace():
"""Return the current default workspace.
Returns
-------
dragon.Workspace
The default workspace.
"""
return dragon.get_workspace()
def new_placeholder(device=None):
"""Create a new tensor to feed data.
Parameters
----------
device : int, optional
The device index.
Returns
-------
dragon.vm.torch.Tensor
The placeholder tensor.
"""
value = torch.zeros(1)
if device is not None:
return value.cuda(device)
return value
def new_tensor(data, enforce_cpu=False):
"""Create a new tensor from the data.
Parameters
----------
data : array_like
The data value.
enforce_cpu : bool, optional, default=False
**True** to enforce the cpu storage.
Returns
-------
dragon.vm.torch.Tensor
The tensor taken with the data.
"""
if isinstance(data, np.ndarray):
tensor = torch.from_numpy(data)
elif isinstance(data, torch.Tensor):
tensor = data
else:
tensor = torch.tensor(data)
if not enforce_cpu:
tensor = tensor.cuda(cfg.GPU_ID)
return tensor
def new_workspace(merge_default=True):
"""Create a new workspace.
Parameters
----------
merge_default : bool, optional, default=True
**True** to merge tensors from default workspace.
Returns
-------
dragon.Workspace
The new workspace.
"""
workspace = dragon.Workspace()
if merge_default:
workspace.merge_from(get_workspace())
return workspace
def reset_workspace(workspace=None, merge_default=True):
"""Reset a workspace and return a new one.
Parameters
----------
workspace : dragon.Workspace, optional
The workspace to reset.
merge_default : bool, optional, default=True
**True** to merge tensors from default workspace.
Returns
-------
dragon.Workspace
The new workspace.
"""
if workspace is not None:
workspace.Clear() # Block the GIL
return new_workspace(merge_default)
class Graph(object):
"""Simple sequential graph to accelerate inference.
    Graph reduces the per-call overhead of python functions
    under eager execution. That overhead is at least 15ms for
    common backbones, which caps inference at about 60 FPS.
    For more details, see the eager mechanism of Dragon.
"""
def __init__(self, inputs, outputs, constants=None):
def canonicalize(input_dict):
if input_dict is None:
return {}
for k, v in input_dict.items():
input_dict[k] = v.name if hasattr(v, 'name') else v
return input_dict
self.placeholders = {}
self._inputs = canonicalize(inputs)
self._outputs = canonicalize(outputs)
self._constants = canonicalize(constants)
self._workspace = get_workspace()
self._tracer = torch.jit.get_tracer()
@property
def workspace(self):
return self._workspace
@workspace.setter
def workspace(self, value):
self._workspace = value
def forward(self, **kwargs):
# Assign inputs
for name, tensor in self._inputs.items():
value = kwargs.get(name, None)
tensor_util.set_array(tensor, value)
# Replay the traced expressions
self._tracer.replay()
# Collect outputs
# 1) Target results
# 2) Constant values
outputs = collections.OrderedDict()
for name, tensor in self._outputs.items():
outputs[name] = tensor_util.to_array(tensor, True)
for name, value in self._constants.items():
outputs[name] = value
return outputs
def __call__(self, **kwargs):
with self._workspace.as_default():
return self.forward(**kwargs)
# Aliases
pickle = six.moves.pickle
...@@ -11,6 +11,10 @@ ...@@ -11,6 +11,10 @@
"""Make record file for COCO dataset.""" """Make record file for COCO dataset."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os import os
import shutil import shutil
...@@ -37,8 +41,8 @@ if __name__ == '__main__': ...@@ -37,8 +41,8 @@ if __name__ == '__main__':
record_file=os.path.join(COCO_ROOT, 'coco_2014_trainval35k'), record_file=os.path.join(COCO_ROOT, 'coco_2014_trainval35k'),
images_path=[os.path.join(COCO_ROOT, 'images/train2014'), images_path=[os.path.join(COCO_ROOT, 'images/train2014'),
os.path.join(COCO_ROOT, 'images/val2014')], os.path.join(COCO_ROOT, 'images/val2014')],
splits_path=[os.path.join(COCO_ROOT, 'ImageSets'), splits_path=[os.path.join(COCO_ROOT, 'splits'),
os.path.join(COCO_ROOT, 'ImageSets')], os.path.join(COCO_ROOT, 'splits')],
mask_file='build/coco_2014_trainval35k_mask.pkl', mask_file='build/coco_2014_trainval35k_mask.pkl',
splits=['train', 'valminusminival'], splits=['train', 'valminusminival'],
) )
...@@ -48,7 +52,7 @@ if __name__ == '__main__': ...@@ -48,7 +52,7 @@ if __name__ == '__main__':
record_file=os.path.join(COCO_ROOT, 'coco_2014_minival'), record_file=os.path.join(COCO_ROOT, 'coco_2014_minival'),
images_path=os.path.join(COCO_ROOT, 'images/val2014'), images_path=os.path.join(COCO_ROOT, 'images/val2014'),
mask_file='build/coco_2014_minival_mask.pkl', mask_file='build/coco_2014_minival_mask.pkl',
splits_path=os.path.join(COCO_ROOT, 'ImageSets'), splits_path=os.path.join(COCO_ROOT, 'splits'),
splits=['minival'], splits=['minival'],
) )
......
...@@ -86,7 +86,7 @@ def make_record( ...@@ -86,7 +86,7 @@ def make_record(
print('Start Time:', time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime())) print('Start Time:', time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime()))
writer = dragon.io.SeetaRecordWriter( writer = dragon.io.KPLRecordWriter(
path=record_file, path=record_file,
protocol={ protocol={
'id': 'string', 'id': 'string',
...@@ -133,6 +133,6 @@ def make_record( ...@@ -133,6 +133,6 @@ def make_record(
writer.close() writer.close()
end_time = time.time() end_time = time.time()
data_size = os.path.getsize(record_file + '/data.data') * 1e-6 data_size = os.path.getsize(record_file + '/root.data') * 1e-6
print('{} images take {:.2f} MB in {:.2f} sec.' print('{} images take {:.2f} MB in {:.2f} sec.'
.format(total_line, data_size, end_time - start_time)) .format(total_line, data_size, end_time - start_time))
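For reference, the KPLRecord writing flow that `make_record` now builds on reduces to the following sketch; the path and fields below are abbreviated placeholders, not the full protocol used above:

```python
import dragon

# Abbreviated protocol: the real scripts also register the
# height/width/depth and per-object annotation fields shown above.
writer = dragon.io.KPLRecordWriter(
    path='/data/coco_2014_minival',        # placeholder output dir
    protocol={'id': 'string', 'content': 'bytes'},
)
writer.write({'id': '000001', 'content': b'<jpeg bytes>'})
writer.close()  # the payload lands in <path>/root.data
```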
...@@ -20,11 +20,11 @@ except: ...@@ -20,11 +20,11 @@ except:
import pickle as cPickle import pickle as cPickle
sys.path.insert(0, '../..') sys.path.insert(0, '../..')
from lib.pycocotools.coco import COCO from seetadet.pycocotools.coco import COCO
from lib.pycocotools import mask_utils from seetadet.pycocotools import mask_utils
class imdb(object): class COCOWrapper(object):
def __init__(self, image_set, year, data_dir): def __init__(self, image_set, year, data_dir):
self._year = year self._year = year
self._image_set = image_set self._image_set = image_set
...@@ -120,8 +120,6 @@ class imdb(object): ...@@ -120,8 +120,6 @@ class imdb(object):
# running out of the image bound # running out of the image bound
# Do not use them or decoding error is inevitable # Do not use them or decoding error is inevitable
mask_bytes = mask_utils.poly2bytes(obj['segmentation'], height, width) mask_bytes = mask_utils.poly2bytes(obj['segmentation'], height, width)
if not isinstance(mask_bytes, bytes):
print(type(mask_bytes))
if obj['area'] > 0 and x2 > x1 and y2 > y1: if obj['area'] > 0 and x2 > x1 and y2 > y1:
obj['clean_bbox'] = [x1, y1, x2, y2] obj['clean_bbox'] = [x1, y1, x2, y2]
valid_objects.append({ valid_objects.append({
...@@ -146,10 +144,11 @@ class imdb(object): ...@@ -146,10 +144,11 @@ class imdb(object):
def make_mask(split, year, data_dir): def make_mask(split, year, data_dir):
coco = imdb(split, year, data_dir) coco = COCOWrapper(split, year, data_dir)
print('Preparing to make split: {}, total {} images'.format(split, coco.num_images)) print('Preparing to make split: {}, total {} images'
if not osp.exists(osp.join(coco._data_path, 'ImageSets')): .format(split, coco.num_images))
os.makedirs(osp.join(coco._data_path, 'ImageSets')) if not osp.exists(osp.join(coco._data_path, 'splits')):
os.makedirs(osp.join(coco._data_path, 'splits'))
gt_recs = OrderedDict() gt_recs = OrderedDict()
for i in range(coco.num_images): for i in range(coco.num_images):
...@@ -157,14 +156,14 @@ def make_mask(split, year, data_dir): ...@@ -157,14 +156,14 @@ def make_mask(split, year, data_dir):
h, w, objects = coco.annotation_at(i) h, w, objects = coco.annotation_at(i)
gt_recs[filename] = objects gt_recs[filename] = objects
with open(osp.join('build', with open(osp.join('build', 'coco_' + year + '_' + split + '_mask.pkl'), 'wb') as f:
'coco_' + year + '_' + split + '_mask.pkl'), 'wb') as f: cPickle.dump(gt_recs, f, cPickle.HIGHEST_PROTOCOL)
cPickle.dump(gt_recs, f, cPickle.HIGHEST_PROTOCOL)
with open(osp.join(coco._data_path, 'ImageSets', split + '.txt'), 'w') as f: with open(osp.join(coco._data_path, 'splits', split + '.txt'), 'w') as f:
for i in range(coco.num_images): for i in range(coco.num_images):
filename = (coco.image_path_at(i).split('/')[-1]).split('.')[0] filename = (coco.image_path_at(i).split('/')[-1]).split('.')[0]
if i != coco.num_images - 1: filename += '\n' if i != coco.num_images - 1:
filename += '\n'
f.write(filename) f.write(filename)
......
...@@ -26,6 +26,6 @@ if __name__ == '__main__': ...@@ -26,6 +26,6 @@ if __name__ == '__main__':
record_file=osp.join(data_root, 'rotated_train'), record_file=osp.join(data_root, 'rotated_train'),
images_path=[osp.join(data_root, 'JPEGImages')], images_path=[osp.join(data_root, 'JPEGImages')],
annotations_path=[osp.join(data_root, 'Annotations')], annotations_path=[osp.join(data_root, 'Annotations')],
imagesets_path=[osp.join(data_root, 'ImageSets')], splits_path=[osp.join(data_root, 'ImageSets')],
splits=['train'] splits=['train']
) )
...@@ -57,7 +57,7 @@ def make_record( ...@@ -57,7 +57,7 @@ def make_record(
record_file, record_file,
images_path, images_path,
annotations_path, annotations_path,
imagesets_path, splits_path,
splits splits
): ):
if os.path.exists(record_file): if os.path.exists(record_file):
...@@ -68,15 +68,15 @@ def make_record( ...@@ -68,15 +68,15 @@ def make_record(
images_path = [images_path] images_path = [images_path]
if not isinstance(annotations_path, list): if not isinstance(annotations_path, list):
annotations_path = [annotations_path] annotations_path = [annotations_path]
if not isinstance(imagesets_path, list): if not isinstance(splits_path, list):
imagesets_path = [imagesets_path] splits_path = [splits_path]
assert len(splits) == len(imagesets_path) assert len(splits) == len(splits_path)
assert len(splits) == len(images_path) assert len(splits) == len(images_path)
assert len(splits) == len(annotations_path) assert len(splits) == len(annotations_path)
print('Start Time:', time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime())) print('Start Time:', time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime()))
writer = dragon.io.SeetaRecordWriter( writer = dragon.io.KPLRecordWriter(
path=record_file, path=record_file,
protocol={ protocol={
'id': 'string', 'id': 'string',
...@@ -99,31 +99,37 @@ def make_record( ...@@ -99,31 +99,37 @@ def make_record(
} }
) )
count, total_line = 0, 0 # Scan all available entries
start_time = time.time() print('Scan entries...')
entries = []
for db_idx, split in enumerate(splits): for i, split in enumerate(splits):
split_file = os.path.join(imagesets_path[db_idx], split + '.txt') split_file = os.path.join(splits_path[i], split + '.txt')
assert os.path.exists(split_file)
with open(split_file, 'r') as f: with open(split_file, 'r') as f:
lines = f.readlines() lines = f.readlines()
total_line += len(lines)
for line in lines: for line in lines:
count += 1
if count % 2000 == 0:
now_time = time.time()
print('{} / {} in {:.2f} sec'.format(
count, total_line, now_time - start_time))
filename = line.strip() filename = line.strip()
image_file = os.path.join(images_path[db_idx], filename + '.jpg') img_file = os.path.join(images_path[i], filename + '.jpg')
xml_file = os.path.join(annotations_path[db_idx], filename + '.xml') ann_file = os.path.join(annotations_path[i], filename + '.xml')
writer.write(make_example(image_file, xml_file)) entries.append((img_file, ann_file))
# Parse and write into record file
print('Start Time:', time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime()))
start_time = time.time()
for i, (img_file, ann_file) in enumerate(entries):
if i > 0 and i % 2000 == 0:
now_time = time.time()
print('{} / {} in {:.2f} sec'.format(
i, len(entries), now_time - start_time))
writer.write(make_example(img_file, ann_file))
now_time = time.time() now_time = time.time()
print('{} / {} in {:.2f} sec'.format(count, total_line, now_time - start_time)) print('{} / {} in {:.2f} sec'.format(
len(entries), len(entries), now_time - start_time))
writer.close() writer.close()
end_time = time.time() end_time = time.time()
data_size = os.path.getsize(record_file + '/data.data') * 1e-6 data_size = os.path.getsize(record_file + '/root.data') * 1e-6
print('{} images take {:.2f} MB in {:.2f} sec.' print('{} images take {:.2f} MB in {:.2f} sec.'
.format(total_line, data_size, end_time - start_time)) .format(len(entries), data_size, end_time - start_time))
...@@ -28,7 +28,7 @@ if __name__ == '__main__': ...@@ -28,7 +28,7 @@ if __name__ == '__main__':
osp.join(voc_root, 'VOCdevkit2012/VOC2012/JPEGImages')], osp.join(voc_root, 'VOCdevkit2012/VOC2012/JPEGImages')],
annotations_path=[osp.join(voc_root, 'VOCdevkit2007/VOC2007/Annotations'), annotations_path=[osp.join(voc_root, 'VOCdevkit2007/VOC2007/Annotations'),
osp.join(voc_root, 'VOCdevkit2012/VOC2012/Annotations')], osp.join(voc_root, 'VOCdevkit2012/VOC2012/Annotations')],
imagesets_path=[osp.join(voc_root, 'VOCdevkit2007/VOC2007/ImageSets/Main'), splits_path=[osp.join(voc_root, 'VOCdevkit2007/VOC2007/ImageSets/Main'),
osp.join(voc_root, 'VOCdevkit2012/VOC2012/ImageSets/Main')], osp.join(voc_root, 'VOCdevkit2012/VOC2012/ImageSets/Main')],
splits=['trainval', 'trainval'] splits=['trainval', 'trainval']
) )
...@@ -37,6 +37,6 @@ if __name__ == '__main__': ...@@ -37,6 +37,6 @@ if __name__ == '__main__':
record_file=osp.join(voc_root, 'voc_2007_test'), record_file=osp.join(voc_root, 'voc_2007_test'),
images_path=osp.join(voc_root, 'VOCdevkit2007/VOC2007/JPEGImages'), images_path=osp.join(voc_root, 'VOCdevkit2007/VOC2007/JPEGImages'),
annotations_path=osp.join(voc_root, 'VOCdevkit2007/VOC2007/Annotations'), annotations_path=osp.join(voc_root, 'VOCdevkit2007/VOC2007/Annotations'),
imagesets_path=osp.join(voc_root, 'VOCdevkit2007/VOC2007/ImageSets/Main'), splits_path=osp.join(voc_root, 'VOCdevkit2007/VOC2007/ImageSets/Main'),
splits=['test'] splits=['test']
) )
...@@ -26,11 +26,17 @@ def make_example(image_file, xml_file): ...@@ -26,11 +26,17 @@ def make_example(image_file, xml_file):
tree = ET.parse(xml_file) tree = ET.parse(xml_file)
filename = os.path.split(xml_file)[-1] filename = os.path.split(xml_file)[-1]
objs = tree.findall('object') objs = tree.findall('object')
size = tree.find('size')
example = {'id': filename.split('.')[0], 'object': []} example = {'id': filename.split('.')[0], 'object': []}
with open(image_file, 'rb') as f: with open(image_file, 'rb') as f:
img_bytes = bytes(f.read()) img_bytes = bytes(f.read())
img = cv2.imdecode(np.frombuffer(img_bytes, 'uint8'), 1) if size is not None:
example['height'], example['width'], example['depth'] = img.shape example['height'] = int(size.find('height').text)
example['width'] = int(size.find('width').text)
example['depth'] = int(size.find('depth').text)
else:
img = cv2.imdecode(np.frombuffer(img_bytes, 'uint8'), 3)
example['height'], example['width'], example['depth'] = img.shape
example['content'] = img_bytes example['content'] = img_bytes
for ix, obj in enumerate(objs): for ix, obj in enumerate(objs):
bbox = obj.find('bndbox') bbox = obj.find('bndbox')
...@@ -53,7 +59,7 @@ def make_record( ...@@ -53,7 +59,7 @@ def make_record(
record_file, record_file,
images_path, images_path,
annotations_path, annotations_path,
imagesets_path, splits_path,
splits splits
): ):
if os.path.exists(record_file): if os.path.exists(record_file):
...@@ -64,15 +70,13 @@ def make_record( ...@@ -64,15 +70,13 @@ def make_record(
images_path = [images_path] images_path = [images_path]
if not isinstance(annotations_path, list): if not isinstance(annotations_path, list):
annotations_path = [annotations_path] annotations_path = [annotations_path]
if not isinstance(imagesets_path, list): if not isinstance(splits_path, list):
imagesets_path = [imagesets_path] splits_path = [splits_path]
assert len(splits) == len(imagesets_path) assert len(splits) == len(splits_path)
assert len(splits) == len(images_path) assert len(splits) == len(images_path)
assert len(splits) == len(annotations_path) assert len(splits) == len(annotations_path)
print('Start Time:', time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime())) writer = dragon.io.KPLRecordWriter(
writer = dragon.io.SeetaRecordWriter(
path=record_file, path=record_file,
protocol={ protocol={
'id': 'string', 'id': 'string',
...@@ -91,31 +95,36 @@ def make_record( ...@@ -91,31 +95,36 @@ def make_record(
} }
) )
count, total_line = 0, 0 # Scan all available entries
start_time = time.time() print('Scan entries...')
entries = []
for db_idx, split in enumerate(splits): for i, split in enumerate(splits):
split_file = os.path.join(imagesets_path[db_idx], split + '.txt') split_file = os.path.join(splits_path[i], split + '.txt')
assert os.path.exists(split_file)
with open(split_file, 'r') as f: with open(split_file, 'r') as f:
lines = f.readlines() lines = f.readlines()
total_line += len(lines)
for line in lines: for line in lines:
count += 1
if count % 2000 == 0:
now_time = time.time()
print('{} / {} in {:.2f} sec'.format(
count, total_line, now_time - start_time))
filename = line.strip() filename = line.strip()
image_file = os.path.join(images_path[db_idx], filename + '.jpg') img_file = os.path.join(images_path[i], filename + '.jpg')
xml_file = os.path.join(annotations_path[db_idx], filename + '.xml') ann_file = os.path.join(annotations_path[i], filename + '.xml')
writer.write(make_example(image_file, xml_file)) entries.append((img_file, ann_file))
# Parse and write into record file
print('Start Time:', time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime()))
start_time = time.time()
for i, (img_file, ann_file) in enumerate(entries):
if i > 0 and i % 2000 == 0:
now_time = time.time()
print('{} / {} in {:.2f} sec'.format(
i, len(entries), now_time - start_time))
writer.write(make_example(img_file, ann_file))
now_time = time.time() now_time = time.time()
print('{} / {} in {:.2f} sec'.format(count, total_line, now_time - start_time)) print('{} / {} in {:.2f} sec'.format(
len(entries), len(entries), now_time - start_time))
writer.close() writer.close()
end_time = time.time() end_time = time.time()
data_size = os.path.getsize(record_file + '/data.data') * 1e-6 data_size = os.path.getsize(record_file + '/root.data') * 1e-6
print('{} images take {:.2f} MB in {:.2f} sec.' print('{} images take {:.2f} MB in {:.2f} sec.'
.format(total_line, data_size, end_time - start_time)) .format(len(entries), data_size, end_time - start_time))
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from seetadet.algo.faster_rcnn.anchor_target import AnchorTarget
from seetadet.algo.faster_rcnn.data_loader import DataLoader
from seetadet.algo.faster_rcnn.proposal import Proposal
from seetadet.algo.faster_rcnn.proposal_target import ProposalTarget
from seetadet.algo.faster_rcnn.utils import generate_grid_anchors
from seetadet.algo.faster_rcnn.utils import map_blobs_by_levels
from seetadet.algo.faster_rcnn.utils import map_rois_to_levels
from seetadet.algo.faster_rcnn.utils import map_returns_to_blobs
...@@ -16,11 +16,11 @@ from __future__ import print_function ...@@ -16,11 +16,11 @@ from __future__ import print_function
import numpy as np import numpy as np
import numpy.random as npr import numpy.random as npr
from lib.core.config import cfg from seetadet.algo.faster_rcnn.generate_anchors import generate_anchors
from lib.faster_rcnn.generate_anchors import generate_anchors from seetadet.algo.faster_rcnn.utils import generate_grid_anchors
from lib.faster_rcnn.utils import generate_grid_anchors from seetadet.core.config import cfg
from lib.utils import boxes as box_util from seetadet.utils import boxes as box_util
from lib.utils.framework import new_tensor from seetadet.utils.env import new_tensor
class AnchorTarget(object): class AnchorTarget(object):
...@@ -62,9 +62,7 @@ class AnchorTarget(object): ...@@ -62,9 +62,7 @@ class AnchorTarget(object):
# Label: ``1`` is positive, ``0`` is negative, ``-1`` is don't care # Label: ``1`` is positive, ``0`` is negative, ``-1`` is don't care
labels_wide = -np.ones((num_images, num_anchors,), 'float32') labels_wide = -np.ones((num_images, num_anchors,), 'float32')
bbox_targets_wide = np.zeros((num_images, num_anchors, 4), 'float32') bbox_indices_wide, bbox_anchors_wide, bbox_targets_wide = [], [], []
bbox_inside_weights_wide = np.zeros_like(bbox_targets_wide, 'float32')
bbox_outside_weights_wide = np.zeros_like(bbox_targets_wide, 'float32')
for ix in range(num_images): for ix in range(num_images):
# GT boxes (x1, y1, x2, y2, label, ...) # GT boxes (x1, y1, x2, y2, label, ...)
...@@ -95,13 +93,13 @@ class AnchorTarget(object): ...@@ -95,13 +93,13 @@ class AnchorTarget(object):
np.arange(overlaps.shape[1])] np.arange(overlaps.shape[1])]
gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0] gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]
# fg label: for each gt, anchor with highest overlap # Foreground: for each gt, anchor with highest overlap
labels[gt_argmax_overlaps] = 1 labels[gt_argmax_overlaps] = 1
# fg label: above threshold IOU # Foreground: above threshold IoU
labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1 labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1
# bg label: below threshold IOU # Background: below threshold IoU
labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0 labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0
# Subsample positive labels if we have too many # Subsample positive labels if we have too many
...@@ -112,6 +110,11 @@ class AnchorTarget(object): ...@@ -112,6 +110,11 @@ class AnchorTarget(object):
labels[disable_inds] = -1 labels[disable_inds] = -1
fg_inds = np.where(labels == 1)[0] fg_inds = np.where(labels == 1)[0]
# Re-enable the highest-overlap anchors if subsampling left no foreground
if len(fg_inds) == 0:
labels[gt_argmax_overlaps] = 1
fg_inds = np.where(labels == 1)[0]
# Subsample negative labels if we have too many # Subsample negative labels if we have too many
num_bg = cfg.TRAIN.RPN_BATCHSIZE - np.sum(labels == 1) num_bg = cfg.TRAIN.RPN_BATCHSIZE - np.sum(labels == 1)
bg_inds = np.where(labels == 0)[0] bg_inds = np.where(labels == 0)[0]
...@@ -119,51 +122,27 @@ class AnchorTarget(object): ...@@ -119,51 +122,27 @@ class AnchorTarget(object):
disable_inds = npr.choice(bg_inds, len(bg_inds) - num_bg, False) disable_inds = npr.choice(bg_inds, len(bg_inds) - num_bg, False)
labels[disable_inds] = -1 labels[disable_inds] = -1
bbox_targets = np.zeros((num_inside, 4), 'float32') labels_wide[ix, inds_inside] = labels
bbox_targets[fg_inds, :] = \ bbox_anchors_wide.append(anchors[fg_inds])
bbox_indices_wide.append(inds_inside[fg_inds] + (num_anchors * ix))
bbox_targets_wide.append(
box_util.bbox_transform( box_util.bbox_transform(
anchors[fg_inds, :], anchors[fg_inds],
gt_boxes[argmax_overlaps[fg_inds], :4], gt_boxes[argmax_overlaps[fg_inds], :4],
) )
bbox_inside_weights = np.zeros((num_inside, 4), 'float32') )
bbox_inside_weights[labels == 1, :] = np.array((1., 1., 1., 1.))
bbox_outside_weights = np.zeros((num_inside, 4), 'float32') if self.num_strides == 1:
bbox_outside_weights[labels == 1, :] = np.ones((1, 4)) / cfg.TRAIN.RPN_BATCHSIZE
bbox_outside_weights[labels == 0, :] = np.ones((1, 4)) / cfg.TRAIN.RPN_BATCHSIZE
labels_wide[ix, inds_inside] = labels # label
bbox_targets_wide[ix, inds_inside] = bbox_targets
bbox_inside_weights_wide[ix, inds_inside] = bbox_inside_weights
bbox_outside_weights_wide[ix, inds_inside] = bbox_outside_weights
if self.num_strides > 1:
labels = labels_wide.reshape((num_images, num_anchors))
bbox_targets = bbox_targets_wide.transpose((0, 2, 1))
bbox_inside_weights = bbox_inside_weights_wide.transpose((0, 2, 1))
bbox_outside_weights = bbox_outside_weights_wide.transpose((0, 2, 1))
else:
A = self.base_anchors[0].shape[0] A = self.base_anchors[0].shape[0]
height, width = features[0].shape[-2:] height, width = features[0].shape[-2:]
labels = labels_wide \ labels_wide = labels_wide \
.reshape((num_images, height, width, A)) \ .reshape((num_images, height, width, A)) \
.transpose(0, 3, 1, 2) \ .transpose(0, 3, 1, 2) \
.reshape((num_images, num_anchors)) .reshape((num_images, num_anchors))
bbox_targets = bbox_targets_wide \
.reshape((num_images, height, width, A * 4)) \
.transpose(0, 3, 1, 2)
bbox_inside_weights = bbox_inside_weights_wide \
.reshape((num_images, height, width, A * 4)) \
.transpose(0, 3, 1, 2)
bbox_outside_weights = bbox_outside_weights_wide \
.reshape((num_images, height, width, A * 4)) \
.transpose(0, 3, 1, 2)
return { return {
'labels': new_tensor(labels), 'labels': new_tensor(labels_wide),
'bbox_targets': new_tensor(bbox_targets), 'bbox_indices': new_tensor(np.concatenate(bbox_indices_wide)),
'bbox_inside_weights': new_tensor(bbox_inside_weights), 'bbox_targets': new_tensor(np.concatenate(bbox_targets_wide).astype('float32')),
'bbox_outside_weights': new_tensor(bbox_outside_weights), 'bbox_anchors': new_tensor(np.concatenate(bbox_anchors_wide).astype('float32')),
} }
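A small worked example of the sparse indexing adopted above: instead of dense per-anchor regression maps, only foreground positions are kept, addressed into the flattened (num_images, num_anchors) label map. The numbers are illustrative:

```python
import numpy as np

num_images, num_anchors = 2, 5
inds_inside = np.arange(num_anchors)    # all anchors inside the image
fg_inds = np.array([2, 4])              # foreground anchors of image 1
ix = 1                                  # image index

# Matches: inds_inside[fg_inds] + (num_anchors * ix)
bbox_indices = inds_inside[fg_inds] + num_anchors * ix
print(bbox_indices)  # [7 9] -> rows 7 and 9 of the flattened labels
```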
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import multiprocessing as mp
import time
import dragon
import dragon.vm.torch as torch
import numpy as np
from seetadet.algo.faster_rcnn import data_transformer
from seetadet.core.config import cfg
from seetadet.datasets.factory import get_dataset
from seetadet.utils import logger
from seetadet.utils.blob import im_list_to_blob
class DataLoader(object):
"""Load mini-batches of data."""
def __init__(self):
super(DataLoader, self).__init__()
dataset = get_dataset(cfg.TRAIN.DATASET)
if cfg.USE_DALI:
from seetadet.dali import rcnn_pipeline as pipe
self.iterator = pipe.new_iterator(dataset.source)
else:
self.iterator = Iterator(**{
'dataset': dataset.cls,
'source': dataset.source,
'classes': dataset.classes,
'shuffle': cfg.TRAIN.USE_SHUFFLE,
'num_chunks': cfg.TRAIN.SHUFFLE_CHUNKS,
'batch_size': cfg.TRAIN.IMS_PER_BATCH * 2,
'num_transformers': cfg.TRAIN.NUM_THREADS - 1,
})
def __call__(self):
outputs = self.iterator.next()
if isinstance(outputs['data'], np.ndarray):
outputs['data'] = torch.from_numpy(outputs['data'])
return outputs
class Iterator(mp.Process):
"""Iterator to return the batch of data."""
def __init__(self, **kwargs):
super(Iterator, self).__init__()
# Distributed settings
rank, group_size = 0, 1
process_group = dragon.distributed.get_group()
if process_group is not None and \
kwargs.get('phase', 'TRAIN') == 'TRAIN':
group_size = process_group.size
rank = dragon.distributed.get_rank(process_group)
# Configuration
self._prefetch = kwargs.get('prefetch', 5)
self._batch_size = kwargs.get('batch_size', 2)
self._num_readers = kwargs.get('num_readers', 1)
self._num_transformers = kwargs.get('num_transformers', 3)
self.daemon = True
# Initialize queues
num_batches = self._prefetch * self._num_readers
self.q_in = mp.Queue(num_batches * self._batch_size)
self.q1_out = mp.Queue(num_batches * self._batch_size)
self.q2_out = mp.Queue(num_batches * self._batch_size)
# Initialize readers
self._readers = []
for i in range(self._num_readers):
part_idx, num_parts = i, self._num_readers
num_parts *= group_size
part_idx += rank * self._num_readers
self._readers.append(dragon.io.DataReader(
part_idx=part_idx, num_parts=num_parts, **kwargs))
self._readers[i]._seed += part_idx
self._readers[i].q_out = self.q_in
self._readers[i].start()
time.sleep(0.1)
# Initialize transformers
self._transformers = []
for i in range(self._num_transformers):
p = data_transformer.DataTransformer(**kwargs)
p._seed += (i + rank * self._num_transformers)
p.q_in = self.q_in
p.q1_out, p.q2_out = self.q1_out, self.q2_out
p.start()
self._transformers.append(p)
time.sleep(0.1)
# Register cleanup callbacks
def cleanup():
def terminate(processes):
for p in processes:
p.terminate()
p.join()
terminate(self._transformers)
logger.info('Terminate DataTransformer.')
terminate(self._readers)
logger.info('Terminate DataReader.')
import atexit
atexit.register(cleanup)
def next(self):
"""Return the next batch of data."""
return self.__next__()
def __iter__(self):
"""Return the iterator self."""
return self
def __next__(self):
"""Return the next batch of data."""
q_out = None
# Two queues implement aspect grouping, which
# reduces GPU memory by avoiding the padding of
# a huge, nearly square batch blob
while q_out is None:
if self.q1_out.qsize() >= cfg.TRAIN.IMS_PER_BATCH:
q_out = self.q1_out
elif self.q2_out.qsize() >= cfg.TRAIN.IMS_PER_BATCH:
q_out = self.q2_out
self.q1_out, self.q2_out = self.q2_out, self.q1_out
images, images_info, boxes_to_pack = [], [], []
for i in range(cfg.TRAIN.IMS_PER_BATCH):
image, image_scale, boxes = q_out.get()
images.append(image)
images_info.append(list(image.shape[:2]) + [image_scale])
gt_boxes = np.zeros((boxes.shape[0], boxes.shape[1] + 1), 'float32')
gt_boxes[:, :boxes.shape[1]], gt_boxes[:, -1] = boxes, i
boxes_to_pack.append(gt_boxes)
return {
'data': im_list_to_blob(images),
'ims_info': np.array(images_info, dtype=np.float32),
'gt_boxes': np.concatenate(boxes_to_pack),
}
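To make the two-queue aspect grouping concrete: batching mixed orientations forces `im_list_to_blob` to pad every image to the per-axis maxima, while grouping keeps the pad tight. A rough sketch with illustrative shapes:

```python
import numpy as np

portrait = np.zeros((800, 600, 3), 'uint8')
landscape = np.zeros((600, 800, 3), 'uint8')

def blob_elements(images):
    # Pad every image to the max height and width across the batch.
    h = max(im.shape[0] for im in images)
    w = max(im.shape[1] for im in images)
    return len(images) * h * w * 3

print(blob_elements([portrait, landscape]))  # 3840000 (800x800 pad)
print(blob_elements([portrait, portrait]))   # 2880000 (800x600, no waste)
```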
...@@ -15,19 +15,19 @@ from __future__ import print_function ...@@ -15,19 +15,19 @@ from __future__ import print_function
import multiprocessing import multiprocessing
import cv2
import numpy as np import numpy as np
from lib.core.config import cfg from seetadet.core.config import cfg
from lib.datasets.example import Example from seetadet.datasets.example import Example
from lib.utils import boxes as box_util from seetadet.utils import boxes as box_util
from lib.utils.blob import prep_im_for_blob from seetadet.utils.blob import prep_im_for_blob
from lib.utils.image import get_image_with_target_size
class DataTransformer(multiprocessing.Process): class DataTransformer(multiprocessing.Process):
def __init__(self, **kwargs): def __init__(self, **kwargs):
super(DataTransformer, self).__init__() super(DataTransformer, self).__init__()
self._scales = cfg.TRAIN.SCALES
self._max_size = cfg.TRAIN.MAX_SIZE
self._seed = cfg.RNG_SEED self._seed = cfg.RNG_SEED
self._use_flipped = cfg.TRAIN.USE_FLIPPED self._use_flipped = cfg.TRAIN.USE_FLIPPED
self._use_diff = cfg.TRAIN.USE_DIFF self._use_diff = cfg.TRAIN.USE_DIFF
...@@ -37,13 +37,7 @@ class DataTransformer(multiprocessing.Process): ...@@ -37,13 +37,7 @@ class DataTransformer(multiprocessing.Process):
self.q_in = self.q1_out = self.q2_out = None self.q_in = self.q1_out = self.q2_out = None
self.daemon = True self.daemon = True
def make_roi_dict( def make_roi_dict(self, example, im_scale, apply_flip=False):
self,
example,
im_scale,
apply_flip=False,
offsets=None,
):
objects, n_objects = example.objects, 0 objects, n_objects = example.objects, 0
height, width = example.height, example.width height, width = example.height, example.width
if not self._use_diff: if not self._use_diff:
...@@ -86,15 +80,6 @@ class DataTransformer(multiprocessing.Process): ...@@ -86,15 +80,6 @@ class DataTransformer(multiprocessing.Process):
# Scale the boxes to the detecting scale # Scale the boxes to the detecting scale
roi_dict['boxes'] *= im_scale roi_dict['boxes'] *= im_scale
# Apply the offsets from scale jitter
if offsets is not None:
roi_dict['boxes'][:, 0::2] += offsets[0]
roi_dict['boxes'][:, 1::2] += offsets[1]
roi_dict['boxes'][:, :] = np.minimum(
np.maximum(roi_dict['boxes'][:, :], 0),
[offsets[2][1] - 1, offsets[2][0] - 1] * 2,
)
return roi_dict return roi_dict
def get(self, example): def get(self, example):
...@@ -102,9 +87,8 @@ class DataTransformer(multiprocessing.Process): ...@@ -102,9 +87,8 @@ class DataTransformer(multiprocessing.Process):
img = example.image img = example.image
# Scale # Scale
max_size = cfg.TRAIN.MAX_SIZE target_size = self._scales[np.random.randint(len(self._scales))]
target_size = cfg.TRAIN.SCALES[np.random.randint(len(cfg.TRAIN.SCALES))] img, im_scale = prep_im_for_blob(img, target_size, self._max_size)
img, im_scale, jitter = prep_im_for_blob(img, target_size, max_size)
# Flip # Flip
apply_flip = False apply_flip = False
...@@ -113,19 +97,8 @@ class DataTransformer(multiprocessing.Process): ...@@ -113,19 +97,8 @@ class DataTransformer(multiprocessing.Process):
img = img[:, ::-1] img = img[:, ::-1]
apply_flip = True apply_flip = True
# Random Crop or RandomPad
offsets = None
if cfg.TRAIN.MAX_SIZE > 0:
if jitter != 1:
# To a rectangle (scale, max_size)
target_size = (np.array(img.shape[:2]) / jitter).astype(np.int32)
img, offsets = get_image_with_target_size(target_size, img)
else:
# To a square (target_size, target_size)
img, offsets = get_image_with_target_size([target_size] * 2, img)
# Example -> RoIDict # Example -> RoIDict
roi_dict = self.make_roi_dict(example, im_scale, apply_flip, offsets) roi_dict = self.make_roi_dict(example, im_scale, apply_flip)
# Post-Process for gt boxes # Post-Process for gt boxes
# Shape like: [num_objects, {x1, y1, x2, y2, cls}] # Shape like: [num_objects, {x1, y1, x2, y2, cls}]
......
...@@ -17,11 +17,11 @@ import collections ...@@ -17,11 +17,11 @@ import collections
import numpy as np import numpy as np
from lib.core.config import cfg from seetadet.algo.faster_rcnn.generate_anchors import generate_anchors
from lib.faster_rcnn.generate_anchors import generate_anchors from seetadet.algo.faster_rcnn.utils import generate_grid_anchors
from lib.faster_rcnn.utils import generate_grid_anchors from seetadet.core.config import cfg
from lib.nms import nms_wrapper from seetadet.utils import boxes as box_util
from lib.utils import boxes as box_util from seetadet.utils import nms
class Proposal(object): class Proposal(object):
...@@ -67,8 +67,8 @@ class Proposal(object): ...@@ -67,8 +67,8 @@ class Proposal(object):
# Prepare for the outputs # Prepare for the outputs
batch_rois = [] batch_rois = []
cls_prob = cls_prob.numpy(True) cls_prob = cls_prob.numpy()
bbox_pred = bbox_pred.numpy(True) bbox_pred = bbox_pred.numpy()
if self.num_strides > 1: if self.num_strides > 1:
# (?, 4, A * K) -> (?, A * K, 4) # (?, 4, A * K) -> (?, A * K, 4)
bbox_pred = bbox_pred.transpose((0, 2, 1)) bbox_pred = bbox_pred.transpose((0, 2, 1))
...@@ -113,7 +113,7 @@ class Proposal(object): ...@@ -113,7 +113,7 @@ class Proposal(object):
# Apply nms (e.g. threshold = 0.7) # Apply nms (e.g. threshold = 0.7)
# Take after_nms_topN (e.g. 300) # Take after_nms_topN (e.g. 300)
# Return the top proposals (-> RoIs top) # Return the top proposals (-> RoIs top)
keep = nms_wrapper.nms(np.hstack((proposals, scores)), nms_thresh) keep = nms.gpu_nms(np.hstack((proposals, scores)), nms_thresh)
if post_nms_top_n > 0: if post_nms_top_n > 0:
keep = keep[:post_nms_top_n] keep = keep[:post_nms_top_n]
proposals = proposals[keep, :] proposals = proposals[keep, :]
......
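For reference, the detections handed to `nms.gpu_nms` above pack box coordinates and scores into a single array; a short sketch (the actual call needs the compiled extension, so it is left commented):

```python
import numpy as np

proposals = np.array([[0, 0, 10, 10],
                      [1, 1, 11, 11]], 'float32')
scores = np.array([[0.9], [0.8]], 'float32')
dets = np.hstack((proposals, scores))  # rows of [x1, y1, x2, y2, score]
# keep = nms.gpu_nms(dets, 0.7)        # indices of surviving rows
```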
...@@ -18,12 +18,10 @@ import collections ...@@ -18,12 +18,10 @@ import collections
import numpy as np import numpy as np
import numpy.random as npr import numpy.random as npr
from lib.core.config import cfg from seetadet.algo.faster_rcnn import utils as rcnn_util
from lib.faster_rcnn.utils import map_blobs_to_outputs from seetadet.core.config import cfg
from lib.faster_rcnn.utils import map_returns_to_blobs from seetadet.utils import boxes as box_util
from lib.faster_rcnn.utils import map_rois_to_levels from seetadet.utils.env import new_tensor
from lib.utils import boxes as box_util
from lib.utils.framework import new_tensor
class ProposalTarget(object): class ProposalTarget(object):
...@@ -35,10 +33,8 @@ class ProposalTarget(object): ...@@ -35,10 +33,8 @@ class ProposalTarget(object):
self.num_classes = cfg.MODEL.NUM_CLASSES self.num_classes = cfg.MODEL.NUM_CLASSES
self.defaults = collections.OrderedDict([ self.defaults = collections.OrderedDict([
('rois', np.array([[-1, 0, 0, 1, 1]], 'float32')), ('rois', np.array([[-1, 0, 0, 1, 1]], 'float32')),
('labels', np.array([-1], 'float32')), ('labels', np.array([-1], 'int64')),
('bbox_targets', np.zeros((1, self.num_classes * 4), 'float32')), ('bbox_targets', np.zeros((1, 4), 'float32')),
('bbox_inside_weights', np.zeros((1, self.num_classes * 4), 'float32')),
('bbox_outside_weights', np.zeros((1, self.num_classes * 4), 'float32')),
]) ])
def __call__(self, rpn_rois, gt_boxes): def __call__(self, rpn_rois, gt_boxes):
...@@ -63,86 +59,65 @@ class ProposalTarget(object): ...@@ -63,86 +59,65 @@ class ProposalTarget(object):
# Sample a batch of RoIs for training # Sample a batch of RoIs for training
rois_per_image = cfg.TRAIN.BATCH_SIZE rois_per_image = cfg.TRAIN.BATCH_SIZE
fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image) fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image)
map_returns_to_blobs( rcnn_util.map_returns_to_blobs(
sample_rois( sample_rois(
rois, rois,
gt_boxes, gt_boxes,
rois_per_image, rois_per_image,
fg_rois_per_image, fg_rois_per_image,
self.num_classes,
), blobs, keys, ), blobs, keys,
) )
# Stack into continuous blobs # Stack into continuous blobs
for k, v in blobs.items(): blobs = dict((k, np.concatenate(blobs[k])) for k in blobs.keys())
blobs[k] = np.concatenate(blobs[k], 0)
if self.num_strides > 1: if self.num_strides > 1:
# Distribute RoIs into pyramids # Distribute RoIs into pyramids
min_lvl = cfg.FPN.ROI_MIN_LEVEL min_lvl = cfg.FPN.ROI_MIN_LEVEL
max_lvl = cfg.FPN.ROI_MAX_LEVEL max_lvl = cfg.FPN.ROI_MAX_LEVEL
k = max_lvl - min_lvl + 1 num_levels = max_lvl - min_lvl + 1
levels = map_rois_to_levels(blobs['rois'], min_lvl, max_lvl) levels = rcnn_util.map_rois_to_levels(blobs['rois'], min_lvl, max_lvl)
outputs = map_blobs_to_outputs( lvl_blobs = rcnn_util.map_blobs_by_levels(
blobs, blobs,
self.defaults, self.defaults,
[np.where(levels == (i + min_lvl))[0] for i in range(k)], [np.where(levels == (i + min_lvl))[0] for i in range(num_levels)],
) )
return { blobs = dict((k, np.concatenate(lvl_blobs[k])) for k in blobs.keys())
'rois': [new_tensor(outputs['rois'][i]) for i in range(k)], rois_wide = [lvl_blobs['rois'][i] for i in range(num_levels)]
'labels': new_tensor(np.concatenate(outputs['labels'], 0)),
'bbox_targets': new_tensor(np.vstack(outputs['bbox_targets'])),
'bbox_inside_weights': new_tensor(np.vstack(outputs['bbox_inside_weights'])),
'bbox_outside_weights': new_tensor(np.vstack(outputs['bbox_outside_weights'])),
}
else: else:
# Return RoIs directly for CX-stride # Return RoIs directly for the single-stride case
return { rois_wide = [blobs['rois']]
'rois': [new_tensor(blobs['rois'])],
'labels': new_tensor(blobs['labels']), # Select the foreground RoIs only for bbox branch
'bbox_targets': new_tensor(blobs['bbox_targets']), fg_inds = np.where(blobs['labels'] > 0)[0]
'bbox_inside_weights': new_tensor(blobs['bbox_inside_weights']), cls_inds = np.arange(len(blobs['rois'])) * self.num_classes
'bbox_outside_weights': new_tensor(blobs['bbox_outside_weights']),
} return {
'rois': [new_tensor(rois) for rois in rois_wide],
'labels': new_tensor(blobs['labels']),
def get_targets(ex_rois, gt_rois, gt_labels, num_classes): 'bbox_indices': new_tensor(cls_inds[fg_inds] + blobs['labels'][fg_inds]),
"""Compute bounding-box regression targets for an image.""" 'bbox_targets': new_tensor(blobs['bbox_targets'][fg_inds].astype('float32')),
assert ex_rois.shape[0] == gt_rois.shape[0] 'bbox_anchors': new_tensor(blobs['rois'][fg_inds, 1:].astype('float32')),
assert ex_rois.shape[1] == 4 }
assert gt_rois.shape[1] == 4
# Compute bbox regression targets
fg_inds = np.where(gt_labels > 0)[0] def sample_rois(all_rois, gt_boxes, num_rois, num_fg_rois):
targets = box_util.bbox_transform(ex_rois, gt_rois, cfg.BBOX_REG_WEIGHTS)
bbox_targets = np.zeros((ex_rois.shape[0], 4 * num_classes), 'float32')
inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32)
for i in fg_inds:
start = int(4 * gt_labels[i])
bbox_targets[i, start:start + 4] = targets[i]
inside_weights[i, start:start + 4] = (1., 1., 1., 1.)
outside_weights = np.array(inside_weights > 0).astype('float32')
return bbox_targets, inside_weights, outside_weights
def sample_rois(
all_rois,
gt_boxes,
num_rois,
num_fg_rois,
num_classes,
):
"""Sample a batch of RoIs comprising foreground and background examples.""" """Sample a batch of RoIs comprising foreground and background examples."""
overlaps = box_util.bbox_overlaps(all_rois[:, 1:5], gt_boxes[:, :4]) overlaps = box_util.bbox_overlaps(all_rois[:, 1:5], gt_boxes[:, :4])
gt_assignment = overlaps.argmax(axis=1) gt_assignment = overlaps.argmax(axis=1)
max_overlaps = overlaps.max(axis=1) max_overlaps = overlaps.max(axis=1)
labels = gt_boxes[gt_assignment, 4] labels = gt_boxes[gt_assignment, 4].astype('int64')
# Select foreground RoIs as those with >= FG_THRESH overlap # Select foreground RoIs as those with >= FG_THRESH overlap
fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0] fg_thresh = cfg.TRAIN.FG_THRESH
fg_rois_per_this_image = int(min(num_fg_rois, fg_inds.size)) fg_inds = np.where(max_overlaps >= fg_thresh)[0]
while fg_inds.size == 0:
fg_thresh -= 0.01
fg_inds = np.where(max_overlaps >= fg_thresh)[0]
# Sample foreground regions without replacement # Sample foreground regions without replacement
if fg_inds.size > 0: fg_rois_per_this_image = int(min(num_fg_rois, fg_inds.size))
fg_inds = npr.choice(fg_inds, fg_rois_per_this_image, False) fg_inds = npr.choice(fg_inds, fg_rois_per_this_image, False)
# Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI) # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI) & bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI) &
...@@ -160,15 +135,14 @@ def sample_rois( ...@@ -160,15 +135,14 @@ def sample_rois(
rois, labels = all_rois[keep_inds], labels[keep_inds] rois, labels = all_rois[keep_inds], labels[keep_inds]
# Clamp labels for the background RoIs to 0 # Clamp labels for the background RoIs to 0
labels[fg_rois_per_this_image:] = 0 labels[fg_rois_per_this_image:] = 0
# Clamp the image indices for the background RoIs to -1
rois[fg_rois_per_this_image:][0] = -1
# Compute the target from RoIs # Compute the target from RoIs
outputs = [rois, labels] return [
outputs += get_targets( rois,
rois[:, 1:5],
gt_boxes[gt_assignment[keep_inds], :4],
labels, labels,
num_classes, box_util.bbox_transform(
) rois[:, 1:5],
return outputs gt_boxes[gt_assignment[keep_inds], :4],
cfg.BBOX_REG_WEIGHTS,
)
]
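The `bbox_indices` returned above address the class-specific slot of each foreground RoI in a flattened (num_rois, num_classes) prediction. A tiny worked example with illustrative numbers:

```python
import numpy as np

num_classes = 3
labels = np.array([2, 0, 1])           # per-RoI labels, 0 = background
fg_inds = np.where(labels > 0)[0]      # RoIs 0 and 2

cls_inds = np.arange(len(labels)) * num_classes
bbox_indices = cls_inds[fg_inds] + labels[fg_inds]
print(bbox_indices)  # [2 7]: class-2 slot of RoI 0, class-1 slot of RoI 2
```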
...@@ -13,17 +13,18 @@ from __future__ import absolute_import ...@@ -13,17 +13,18 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
import types
import dragon.vm.torch as torch import dragon.vm.torch as torch
import numpy as np import numpy as np
from lib.core.config import cfg from seetadet.core.config import cfg
from lib.modeling.detector import new_detector from seetadet.modeling.detector import new_detector
from lib.nms import nms_wrapper from seetadet.utils import boxes as box_util
from lib.utils import boxes as box_util from seetadet.utils import nms as nms_util
from lib.utils import framework from seetadet.utils import time_util
from lib.utils import time_util from seetadet.utils.blob import im_list_to_blob
from lib.utils.blob import im_list_to_blob from seetadet.utils.image import scale_image
from lib.utils.image import scale_image
def im_detect(detector, raw_image): def im_detect(detector, raw_image):
...@@ -31,49 +32,41 @@ def im_detect(detector, raw_image): ...@@ -31,49 +32,41 @@ def im_detect(detector, raw_image):
ims, ims_scale = scale_image(raw_image) ims, ims_scale = scale_image(raw_image)
# Prepare blobs # Prepare blobs
blobs = {'data': im_list_to_blob(ims)} data = im_list_to_blob(ims)
blobs['ims_info'] = np.array([ ims_info = np.array([list(data.shape[1:3]) + [im_scale]
list(blobs['data'].shape[1:3]) + [im_scale] for im_scale in ims_scale], dtype=np.float32)
for im_scale in ims_scale
], dtype=np.float32)
# Do Forward # Do Forward
if not hasattr(detector, 'graph'): data = torch.from_numpy(data)
with framework.new_workspace().as_default(): ims_info = torch.from_numpy(ims_info)
data = torch.from_numpy(blobs['data'])
ims_info = torch.from_numpy(blobs['ims_info']) if not hasattr(detector, 'script_forward'):
with torch.no_grad(): def script_forward(self, data, ims_info):
with torch.jit.Tracer(retain_ops=True): return self.forward({'data': data, 'ims_info': ims_info})
inputs = {'data': data, 'ims_info': ims_info} detector.script_forward = torch.jit.trace(
outputs = detector.forward(inputs) func=types.MethodType(script_forward, detector),
detector.graph = \ example_inputs=[data, ims_info],
framework.Graph(inputs, { )
'rois': outputs['rois'],
'cls_prob': outputs['cls_prob'], outputs = detector.script_forward(data, ims_info)
'bbox_pred': outputs['bbox_pred'] outputs = dict((k, outputs[k].numpy()) for k in outputs.keys())
})
outputs = detector.graph(**blobs)
# Decode results # Decode results
rois = outputs['rois'] all_scores, all_boxes = [], []
scores, boxes, batch_inds = [], [], []
pred_boxes = \ pred_boxes = \
box_util.bbox_transform_inv( box_util.bbox_transform_inv(
rois[:, 1:5], outputs['rois'][:, 1:5],
outputs['bbox_pred'], outputs['bbox_pred'],
cfg.BBOX_REG_WEIGHTS, cfg.BBOX_REG_WEIGHTS,
) )
for i in range(len(ims)): for i in range(len(ims)):
inds = np.where(rois[:, 0].astype(np.int32) == i)[0] inds = np.where(outputs['rois'][:, 0].astype(np.int32) == i)[0]
im_boxes = pred_boxes[inds] / ims_scale[i] boxes = pred_boxes[inds] / ims_scale[i]
scores.append(outputs['cls_prob'][inds]) all_scores.append(outputs['cls_prob'][inds])
boxes.append(box_util.clip_tiled_boxes(im_boxes, raw_image.shape)) all_boxes.append(box_util.clip_tiled_boxes(boxes, raw_image.shape))
return ( return np.vstack(all_scores), np.vstack(all_boxes)
np.vstack(scores) if len(ims) > 0 else scores[0],
np.vstack(boxes) if len(ims) > 0 else boxes[0],
)
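The lazy tracing above boils down to a cache-on-first-call pattern: bind a plain function to the detector as a method, trace it once with example inputs, and reuse the traced callable afterwards. A stripped-down sketch (the `torch.jit.trace` call is shown as used above; the toy `Detector` is an assumption):

```python
import types

class Detector(object):
    def forward(self, inputs):
        return {'out': inputs['data']}

def script_forward(self, data):
    # Stands in for the detector's dict-based forward above.
    return self.forward({'data': data})

detector = Detector()
if not hasattr(detector, 'script_forward'):
    # With dragon.vm.torch this becomes:
    #   detector.script_forward = torch.jit.trace(
    #       func=types.MethodType(script_forward, detector),
    #       example_inputs=[data])
    detector.script_forward = types.MethodType(script_forward, detector)

print(detector.script_forward('x'))  # {'out': 'x'}
```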
def test_net(weights, num_classes, q_in, q_out, device): def test_net(weights, num_classes, q_in, q_out, device):
...@@ -84,7 +77,7 @@ def test_net(weights, num_classes, q_in, q_out, device): ...@@ -84,7 +77,7 @@ def test_net(weights, num_classes, q_in, q_out, device):
while True: while True:
idx, raw_image = q_in.get() idx, raw_image = q_in.get()
if raw_image is None: if idx < 0:
break break
boxes_this_image = [[]] boxes_this_image = [[]]
...@@ -101,17 +94,16 @@ def test_net(weights, num_classes, q_in, q_out, device): ...@@ -101,17 +94,16 @@ def test_net(weights, num_classes, q_in, q_out, device):
(cls_boxes, cls_scores[:, np.newaxis]) (cls_boxes, cls_scores[:, np.newaxis])
).astype(np.float32, copy=False) ).astype(np.float32, copy=False)
if cfg.TEST.USE_SOFT_NMS: if cfg.TEST.USE_SOFT_NMS:
keep = nms_wrapper.soft_nms( keep = nms_util.soft_nms(
cls_detections, cls_detections,
thresh=cfg.TEST.NMS, thresh=cfg.TEST.NMS,
method=cfg.TEST.SOFT_NMS_METHOD, method=cfg.TEST.SOFT_NMS_METHOD,
sigma=cfg.TEST.SOFT_NMS_SIGMA, sigma=cfg.TEST.SOFT_NMS_SIGMA,
) )
else: else:
keep = nms_wrapper.nms( keep = nms_util.nms(
cls_detections, cls_detections,
thresh=cfg.TEST.NMS, thresh=cfg.TEST.NMS,
force_cpu=True,
) )
cls_detections = cls_detections[keep, :] cls_detections = cls_detections[keep, :]
boxes_this_image.append(cls_detections) boxes_this_image.append(cls_detections)
...@@ -119,11 +111,8 @@ def test_net(weights, num_classes, q_in, q_out, device): ...@@ -119,11 +111,8 @@ def test_net(weights, num_classes, q_in, q_out, device):
q_out.put(( q_out.put((
idx, idx,
{ dict([('im_detect', _t['im_detect'].average_time),
'im_detect': _t['im_detect'].average_time, ('misc', _t['misc'].average_time)]),
'misc': _t['misc'].average_time, dict([('boxes', boxes_this_image)]),
},
{
'boxes': boxes_this_image,
},
)) ))
...@@ -16,7 +16,7 @@ from __future__ import print_function ...@@ -16,7 +16,7 @@ from __future__ import print_function
import collections import collections
import numpy as np import numpy as np
from lib.core.config import cfg from seetadet.core.config import cfg
def generate_grid_anchors(features, base_anchors, strides): def generate_grid_anchors(features, base_anchors, strides):
...@@ -75,7 +75,7 @@ def map_rois_to_levels(rois, k_min, k_max): ...@@ -75,7 +75,7 @@ def map_rois_to_levels(rois, k_min, k_max):
return np.clip(target_levels, k_min, k_max) return np.clip(target_levels, k_min, k_max)
def map_blobs_to_outputs(blobs, defaults, lvl_inds): def map_blobs_by_levels(blobs, defaults, lvl_inds):
"""Map blobs to outputs according to fpn indices.""" """Map blobs to outputs according to fpn indices."""
outputs = collections.defaultdict(list) outputs = collections.defaultdict(list)
for inds in lvl_inds: for inds in lvl_inds:
......
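The level mapping clipped above typically follows the FPN paper's heuristic, k = floor(k0 + log2(sqrt(area) / 224)); a sketch under that assumption (k0 and the canonical size are the paper's defaults, not read from this diff):

```python
import numpy as np

def map_rois_to_levels(rois, k_min, k_max, k0=4, canonical=224.):
    # rois: rows of [batch_idx, x1, y1, x2, y2]
    ws = rois[:, 3] - rois[:, 1] + 1
    hs = rois[:, 4] - rois[:, 2] + 1
    target_levels = np.floor(k0 + np.log2(np.sqrt(ws * hs) / canonical))
    return np.clip(target_levels, k_min, k_max)

rois = np.array([[0, 0, 0, 111, 111],    # 112x112 -> level 3
                 [0, 0, 0, 447, 447]],   # 448x448 -> level 5
                'float32')
print(map_rois_to_levels(rois, 2, 5))    # [3. 5.]
```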
...@@ -13,10 +13,7 @@ from __future__ import absolute_import ...@@ -13,10 +13,7 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
# Import custom modules from seetadet.algo.faster_rcnn.anchor_target import AnchorTarget
from lib.modeling.fast_rcnn import FastRCNN from seetadet.algo.faster_rcnn.proposal import Proposal
from lib.modeling.fpn import FPN from seetadet.algo.mask_rcnn.data_loader import DataLoader
from lib.modeling.mask_rcnn import MaskRCNN from seetadet.algo.mask_rcnn.proposal_target import ProposalTarget
from lib.modeling.retinanet import RetinaNet
from lib.modeling.rpn import RPN
from lib.modeling.ssd import SSD
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import multiprocessing as mp
import time
import dragon
import dragon.vm.torch as torch
import numpy as np
from seetadet.algo.mask_rcnn import data_transformer
from seetadet.core.config import cfg
from seetadet.datasets.factory import get_dataset
from seetadet.utils import logger
from seetadet.utils.blob import im_list_to_blob
from seetadet.utils.blob import mask_list_to_blob
class DataLoader(object):
"""Provide mini-batches of data."""
def __init__(self):
super(DataLoader, self).__init__()
dataset = get_dataset(cfg.TRAIN.DATASET)
self.iterator = Iterator(**{
'dataset': dataset.cls,
'source': dataset.source,
'classes': dataset.classes,
'shuffle': cfg.TRAIN.USE_SHUFFLE,
'num_chunks': cfg.TRAIN.SHUFFLE_CHUNKS,
'batch_size': cfg.TRAIN.IMS_PER_BATCH * 2,
'num_transformers': cfg.TRAIN.NUM_THREADS - 1,
})
def __call__(self):
outputs = self.iterator.next()
if isinstance(outputs['data'], np.ndarray):
outputs['data'] = torch.from_numpy(outputs['data'])
return outputs
class Iterator(mp.Process):
"""Iterator to return the batch of data."""
def __init__(self, **kwargs):
super(Iterator, self).__init__()
# Distributed settings
rank, group_size = 0, 1
process_group = dragon.distributed.get_group()
if process_group is not None and \
kwargs.get('phase', 'TRAIN') == 'TRAIN':
group_size = process_group.size
rank = dragon.distributed.get_rank(process_group)
# Configuration
self._prefetch = kwargs.get('prefetch', 5)
self._batch_size = kwargs.get('batch_size', 2)
self._num_readers = kwargs.get('num_readers', 1)
self._num_transformers = kwargs.get('num_transformers', 3)
self.daemon = True
# Initialize queues
num_batches = self._prefetch * self._num_readers
self.q_in = mp.Queue(num_batches * self._batch_size)
self.q1_out = mp.Queue(num_batches * self._batch_size)
self.q2_out = mp.Queue(num_batches * self._batch_size)
# Initialize readers
self._readers = []
for i in range(self._num_readers):
part_idx, num_parts = i, self._num_readers
num_parts *= group_size
part_idx += rank * self._num_readers
self._readers.append(dragon.io.DataReader(
part_idx=part_idx, num_parts=num_parts, **kwargs))
self._readers[i]._seed += part_idx
self._readers[i].q_out = self.q_in
self._readers[i].start()
time.sleep(0.1)
# Initialize transformers
self._transformers = []
for i in range(self._num_transformers):
p = data_transformer.DataTransformer(**kwargs)
p._seed += (i + rank * self._num_transformers)
p.q_in = self.q_in
p.q1_out, p.q2_out = self.q1_out, self.q2_out
p.start()
self._transformers.append(p)
time.sleep(0.1)
# Register cleanup callbacks
def cleanup():
def terminate(processes):
for p in processes:
p.terminate()
p.join()
terminate(self._transformers)
logger.info('Terminate DataTransformer.')
terminate(self._readers)
logger.info('Terminate DataReader.')
import atexit
atexit.register(cleanup)
def next(self):
"""Return the next batch of data."""
return self.__next__()
def __iter__(self):
"""Return the iterator self."""
return self
def __next__(self):
"""Return the next batch of data."""
q_out = None
# Two queues implement aspect grouping, which
# reduces GPU memory by avoiding the padding of
# a huge, nearly square batch blob
while q_out is None:
if self.q1_out.qsize() >= cfg.TRAIN.IMS_PER_BATCH:
q_out = self.q1_out
elif self.q2_out.qsize() >= cfg.TRAIN.IMS_PER_BATCH:
q_out = self.q2_out
self.q1_out, self.q2_out = self.q2_out, self.q1_out
images, images_info = [], []
boxes_to_pack, masks_to_pack = [], []
for i in range(cfg.TRAIN.IMS_PER_BATCH):
image, image_scale, boxes, masks = q_out.get()
images.append(image)
images_info.append(list(image.shape[:2]) + [image_scale])
gt_boxes = np.zeros((boxes.shape[0], boxes.shape[1] + 1), 'float32')
gt_boxes[:, :boxes.shape[1]], gt_boxes[:, -1] = boxes, i
boxes_to_pack.append(gt_boxes)
masks_to_pack.append(masks)
return {
'data': im_list_to_blob(images),
'ims_info': np.array(images_info, 'float32'),
'gt_boxes': np.concatenate(boxes_to_pack),
'gt_masks': mask_list_to_blob(masks_to_pack),
}
...@@ -17,17 +17,18 @@ import multiprocessing ...@@ -17,17 +17,18 @@ import multiprocessing
import numpy as np import numpy as np
from lib.core.config import cfg from seetadet.core.config import cfg
from lib.datasets.example import Example from seetadet.datasets.example import Example
from lib.pycocotools import mask_utils from seetadet.pycocotools import mask_utils
from lib.utils import boxes as box_util from seetadet.utils import boxes as box_util
from lib.utils.blob import prep_im_for_blob from seetadet.utils.blob import prep_im_for_blob
from lib.utils.image import get_image_with_target_size
class DataTransformer(multiprocessing.Process): class DataTransformer(multiprocessing.Process):
def __init__(self, **kwargs): def __init__(self, **kwargs):
super(DataTransformer, self).__init__() super(DataTransformer, self).__init__()
self._scales = cfg.TRAIN.SCALES
self._max_size = cfg.TRAIN.MAX_SIZE
self._seed = cfg.RNG_SEED self._seed = cfg.RNG_SEED
self._use_flipped = cfg.TRAIN.USE_FLIPPED self._use_flipped = cfg.TRAIN.USE_FLIPPED
self._use_diff = cfg.TRAIN.USE_DIFF self._use_diff = cfg.TRAIN.USE_DIFF
...@@ -98,9 +99,8 @@ class DataTransformer(multiprocessing.Process): ...@@ -98,9 +99,8 @@ class DataTransformer(multiprocessing.Process):
img = example.image img = example.image
# Scale # Scale
max_size = cfg.TRAIN.MAX_SIZE target_size = self._scales[np.random.randint(len(self._scales))]
target_size = cfg.TRAIN.SCALES[np.random.randint(len(cfg.TRAIN.SCALES))] img, im_scale = prep_im_for_blob(img, target_size, self._max_size)
img, im_scale, jitter = prep_im_for_blob(img, target_size, max_size)
# Flip # Flip
apply_flip = False apply_flip = False
......
...@@ -18,13 +18,11 @@ import collections ...@@ -18,13 +18,11 @@ import collections
import numpy as np import numpy as np
import numpy.random as npr import numpy.random as npr
from lib.core.config import cfg from seetadet.algo.faster_rcnn import utils as rcnn_util
from lib.faster_rcnn.utils import map_blobs_to_outputs from seetadet.core.config import cfg
from lib.faster_rcnn.utils import map_returns_to_blobs from seetadet.utils import boxes as box_util
from lib.faster_rcnn.utils import map_rois_to_levels from seetadet.utils import mask as mask_util
from lib.utils import boxes as box_util from seetadet.utils.env import new_tensor
from lib.utils import mask as mask_util
from lib.utils.framework import new_tensor
class ProposalTarget(object): class ProposalTarget(object):
...@@ -36,10 +34,8 @@ class ProposalTarget(object): ...@@ -36,10 +34,8 @@ class ProposalTarget(object):
self.num_classes = cfg.MODEL.NUM_CLASSES self.num_classes = cfg.MODEL.NUM_CLASSES
self.defaults = collections.OrderedDict([ self.defaults = collections.OrderedDict([
('rois', np.array([[-1, 0, 0, 1, 1]], 'float32')), ('rois', np.array([[-1, 0, 0, 1, 1]], 'float32')),
('labels', np.array([-1], 'float32')), ('labels', np.array([-1], 'int64')),
('bbox_targets', np.zeros((1, self.num_classes * 4), 'float32')), ('bbox_targets', np.zeros((1, 4), 'float32')),
('bbox_inside_weights', np.zeros((1, self.num_classes * 4), 'float32')),
('bbox_outside_weights', np.zeros((1, self.num_classes * 4), 'float32')),
('mask_targets', -np.ones((1, self.resolution, self.resolution), 'float32')), ('mask_targets', -np.ones((1, self.resolution, self.resolution), 'float32')),
]) ])
...@@ -72,67 +68,75 @@ class ProposalTarget(object): ...@@ -72,67 +68,75 @@ class ProposalTarget(object):
# Sample a batch of RoIs for training # Sample a batch of RoIs for training
rois_per_image = cfg.TRAIN.BATCH_SIZE rois_per_image = cfg.TRAIN.BATCH_SIZE
fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image) fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image)
map_returns_to_blobs( rcnn_util.map_returns_to_blobs(
sample_rois( sample_rois(
rois, rois,
gt_boxes, gt_boxes,
gt_masks, gt_masks,
rois_per_image, rois_per_image,
fg_rois_per_image, fg_rois_per_image,
                    ims_info[ix][2],
                ), blobs, keys,
            )
        # Stack into continuous blobs
        blobs = dict((k, np.concatenate(blobs[k])) for k in blobs.keys())
        # Distribute rois into pyramids
        k_min = cfg.FPN.ROI_MIN_LEVEL
        k_max = cfg.FPN.ROI_MAX_LEVEL
        num_levels = k_max - k_min + 1
        levels = rcnn_util.map_rois_to_levels(blobs['rois'], k_min, k_max)
        lvl_blobs = rcnn_util.map_blobs_by_levels(
            blobs,
            self.defaults,
            [np.where(levels == (i + k_min))[0] for i in range(num_levels)],
        )
        rois_wide = [lvl_blobs['rois'][i] for i in range(num_levels)]
        mask_rois_wide, mask_labels_wide = [], []
        # Select the foreground RoIs only for bbox/mask branch
        for i in range(num_levels):
            inds = np.where(lvl_blobs['labels'][i] > 0)[0]
            if len(inds) > 0:
                mask_rois_wide.append(lvl_blobs['rois'][i][inds])
                mask_labels_wide.append(lvl_blobs['labels'][i][inds] - 1)
                lvl_blobs['mask_targets'][i] = lvl_blobs['mask_targets'][i][inds]
            else:
                mask_rois_wide.append(self.defaults['rois'])
                mask_labels_wide.append(np.array([0], 'int64'))
                lvl_blobs['mask_targets'][i] = self.defaults['mask_targets']
        blobs = dict((k, np.concatenate(lvl_blobs[k])) for k in blobs.keys())
        mask_labels = np.concatenate(mask_labels_wide)
        fg_inds = np.where(blobs['labels'] > 0)[0]
        bbox_cls_inds = np.arange(len(blobs['rois'])) * self.num_classes
        mask_cls_inds = np.arange(len(mask_labels)) * (self.num_classes - 1)
        # Sample a proposal randomly to avoid memory issue
        if len(fg_inds) == 0:
            fg_inds = np.random.randint(len(blobs['labels']), size=[1])
        return {
            'rois': [new_tensor(rois_wide[i]) for i in range(num_levels)],
            'mask_rois': [new_tensor(mask_rois_wide[i]) for i in range(num_levels)],
            'labels': new_tensor(blobs['labels']),
            'bbox_indices': new_tensor(bbox_cls_inds[fg_inds] + blobs['labels'][fg_inds]),
            'bbox_targets': new_tensor(blobs['bbox_targets'][fg_inds].astype('float32')),
            'bbox_anchors': new_tensor(blobs['rois'][fg_inds, 1:].astype('float32')),
            'mask_indices': new_tensor(mask_cls_inds + mask_labels),
            'mask_targets': new_tensor(blobs['mask_targets']),
        }
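For readers unfamiliar with the FPN routing above, `rcnn_util.map_rois_to_levels` assigns each RoI to a pyramid level by its scale. A minimal sketch, assuming the standard FPN heuristic (canonical level k0 = 4 at scale 224; the exact constants used by this repository are not shown in the diff):

```python
import numpy as np

def map_rois_to_levels(rois, k_min, k_max):
    """Assign each RoI to an FPN level by its scale."""
    w = rois[:, 3] - rois[:, 1] + 1  # rois: (N, 5) of (ind, x1, y1, x2, y2)
    h = rois[:, 4] - rois[:, 2] + 1
    # k = floor(k0 + log2(sqrt(w * h) / 224)), canonical level k0 = 4
    k = np.floor(4 + np.log2(np.sqrt(w * h) / 224 + 1e-8))
    return np.clip(k, k_min, k_max).astype('int64')
```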
def compute_targets(
    ex_rois,
    gt_rois,
    gt_labels,
    gt_masks,
    mask_flags,
    mask_size,
    im_scale,
):
    """Compute the bounding-box regression targets."""
...@@ -141,14 +145,8 @@ def compute_targets(
    assert gt_rois.shape[1] == 4
    # Compute bbox regression targets
    fg_inds = np.where(gt_labels > 0)[0]
    bbox_targets = box_util.bbox_transform(
        ex_rois, gt_rois, cfg.BBOX_REG_WEIGHTS)
    # Compute mask classification targets
    mask_shape = [mask_size] * 2
    ex_rois_ori = np.round(ex_rois / im_scale).astype(int)
...@@ -168,7 +166,7 @@ def compute_targets(
            mask=box_mask,
            size=mask_shape,
        )
    return bbox_targets, mask_targets
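The `box_util.bbox_transform` call encodes each RoI against its matched ground-truth box. A sketch of the standard R-CNN parameterization it is expected to follow (the `weights` default below is illustrative, not read from this diff):

```python
import numpy as np

def bbox_transform(ex_rois, gt_rois, weights=(1., 1., 1., 1.)):
    """Regression deltas from example boxes to matched ground-truth boxes."""
    ex_w = ex_rois[:, 2] - ex_rois[:, 0] + 1.
    ex_h = ex_rois[:, 3] - ex_rois[:, 1] + 1.
    ex_cx = ex_rois[:, 0] + 0.5 * ex_w
    ex_cy = ex_rois[:, 1] + 0.5 * ex_h
    gt_w = gt_rois[:, 2] - gt_rois[:, 0] + 1.
    gt_h = gt_rois[:, 3] - gt_rois[:, 1] + 1.
    gt_cx = gt_rois[:, 0] + 0.5 * gt_w
    gt_cy = gt_rois[:, 1] + 0.5 * gt_h
    wx, wy, ww, wh = weights
    dx = wx * (gt_cx - ex_cx) / ex_w  # normalized center shifts
    dy = wy * (gt_cy - ex_cy) / ex_h
    dw = ww * np.log(gt_w / ex_w)     # log-space size ratios
    dh = wh * np.log(gt_h / ex_h)
    return np.vstack((dx, dy, dw, dh)).transpose()
```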
def sample_rois(
...@@ -177,14 +175,13 @@ def sample_rois(
    gt_masks,
    num_rois,
    num_fg_rois,
    im_scale,
):
    """Sample a batch of RoIs comprising foreground and background examples."""
    overlaps = box_util.bbox_overlaps(all_rois[:, 1:5], gt_boxes[:, :4])
    gt_assignment = overlaps.argmax(axis=1)
    max_overlaps = overlaps.max(axis=1)
    labels = gt_boxes[gt_assignment, 4].astype('int64')

    # Select foreground RoIs as those with >= FG_THRESH overlap
    fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0]
...@@ -209,19 +206,16 @@ def sample_rois(
    rois, labels = all_rois[keep_inds], labels[keep_inds]
    # Clamp labels for the background RoIs to 0
    labels[fg_rois_per_this_image:] = 0
    # Compute the target from RoIs
    outputs = [rois, labels]
    outputs += compute_targets(
        rois[:, 1:5],
        gt_boxes[gt_assignment[keep_inds], :4],
        labels,
        gt_masks[gt_assignment[fg_inds]],
        gt_boxes[gt_assignment[fg_inds], 5],
        cfg.MRCNN.RESOLUTION,
        im_scale,
    )
    return outputs
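For context, `num_rois` and `num_fg_rois` are typically derived from the batch settings in the config; a worked example with the defaults appearing later in this commit:

```python
# illustrative arithmetic with the config defaults from this commit
batch_size = 128     # cfg.TRAIN.BATCH_SIZE, RoIs kept per image
fg_fraction = 0.25   # cfg.TRAIN.FG_FRACTION
num_fg_rois = int(batch_size * fg_fraction)  # at most 32 foreground RoIs
num_bg_rois = batch_size - num_fg_rois       # remaining 96 slots -> background
```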
...@@ -13,19 +13,20 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import types

import dragon.vm.torch as torch
import numpy as np

from seetadet.algo.faster_rcnn import utils as rcnn_util
from seetadet.core.config import cfg
from seetadet.modeling.detector import new_detector
from seetadet.utils import env
from seetadet.utils import nms as nms_util
from seetadet.utils import time_util
from seetadet.utils import boxes as box_util
from seetadet.utils.blob import im_list_to_blob
from seetadet.utils.image import scale_image
def im_detect(detector, raw_image):
...@@ -33,50 +34,46 @@ def im_detect(detector, raw_image):
    ims, ims_scale = scale_image(raw_image)

    # Prepare blobs
    data = im_list_to_blob(ims)
    ims_info = np.array([list(data.shape[1:3]) + [im_scale]
                         for im_scale in ims_scale], dtype=np.float32)

    # Do Forward
    data = torch.from_numpy(data)
    ims_info = torch.from_numpy(ims_info)

    if not hasattr(detector, 'script_forward'):
        def script_forward(self, data, ims_info):
            return self.forward({'data': data, 'ims_info': ims_info})
        detector.script_forward = torch.jit.trace(
            func=types.MethodType(script_forward, detector),
            example_inputs=[data, ims_info],
        )

    outputs = detector.script_forward(data, ims_info)
    outputs = dict((k, outputs[k].numpy()) for k in outputs.keys())

    # Decode results
    all_scores, all_boxes, batch_inds = [], [], []
    pred_boxes = \
        box_util.bbox_transform_inv(
            outputs['rois'][:, 1:5],
            outputs['bbox_pred'],
            cfg.BBOX_REG_WEIGHTS,
        )

    for i in range(len(ims)):
        inds = np.where(outputs['rois'][:, 0].astype(np.int32) == i)[0]
        boxes = pred_boxes[inds] / ims_scale[i]
        all_scores.append(outputs['cls_prob'][inds])
        all_boxes.append(box_util.clip_tiled_boxes(boxes, raw_image.shape))
        batch_inds.append(np.ones((len(inds), 1), 'int32') * i)

    return (
        np.vstack(all_scores),
        np.vstack(all_boxes),
        np.vstack(batch_inds),
        np.array(ims_scale, 'float64'),
    )
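A hypothetical single-image driver for `im_detect`, mirroring how `test_net` below consumes it; `device`, `weights` and `raw_image` are assumed to be provided by the surrounding harness:

```python
# a hypothetical driver; not part of this file
detector = new_detector(device, weights)
scores, boxes, batch_inds, ims_scale = im_detect(detector, raw_image)
# scores: (N, num_classes); boxes: (N, 4 * num_classes), tiled per class
# and already clipped to the raw image; batch_inds maps rows to scales
```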
...@@ -85,43 +82,29 @@ def mask_detect(detector, rois):
    k_min = cfg.FPN.ROI_MIN_LEVEL
    k_max = cfg.FPN.ROI_MAX_LEVEL
    k = k_max - k_min + 1
    levels = rcnn_util.map_rois_to_levels(rois, k_min, k_max)
    level_inds = [np.where(levels == (i + k_min))[0] for i in range(k)]
    fpn_rois = rcnn_util.map_blobs_by_levels(
        {'rois': rois[:, :5]},
        {'rois': np.array([[-1, 0, 0, 1, 1]], 'float32')},
        level_inds)['rois']
    with torch.no_grad():
        mask_score = detector.rcnn.compute_mask_score(
            rois=[env.new_tensor(r.astype('float32')) for r in fpn_rois])
    nc, i = mask_score.shape[1], 0
    mask_inds = {}
    for inds in level_inds:
        for idx in inds:
            cls = int(rois[idx, 5])
            mask_inds[idx] = (i * nc + cls)
            i += 1
        if len(inds) == 0:
            i += 1
    mask_inds = list(map(mask_inds.get, sorted(mask_inds)))
    mask_inds = env.new_tensor(np.array(mask_inds, 'int64'))
    with torch.no_grad():
        mask_pred = mask_score.index_select((0, 1), mask_inds)
    return detector.rcnn.sigmoid(mask_pred).numpy().copy()
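The `i * nc + cls` bookkeeping flattens each (RoI, class) pair into a single row index, so one gather selects the class-specific mask per RoI. A NumPy analogue of the `index_select((0, 1), ...)` call, with toy shapes:

```python
import numpy as np

# toy shapes: 4 RoIs, 3 mask classes, 28x28 masks
mask_score = np.random.rand(4, 3, 28, 28)
flat = mask_score.reshape(-1, 28, 28)  # row order is i * nc + cls
picked = flat[[0 * 3 + 2, 1 * 3 + 0]]  # class 2 of RoI 0, class 0 of RoI 1
```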
def test_net(weights, num_classes, q_in, q_out, device):
...@@ -132,7 +115,7 @@ def test_net(weights, num_classes, q_in, q_out, device):
    while True:
        idx, raw_image = q_in.get()
        if idx < 0:
            break

        rois_this_image = []
...@@ -153,17 +136,16 @@ def test_net(weights, num_classes, q_in, q_out, device):
                    (cls_boxes, cls_scores[:, np.newaxis])
                ).astype(np.float32, copy=False)
                if cfg.TEST.USE_SOFT_NMS:
                    keep = nms_util.soft_nms(
                        cls_detections,
                        thresh=cfg.TEST.NMS,
                        method=cfg.TEST.SOFT_NMS_METHOD,
                        sigma=cfg.TEST.SOFT_NMS_SIGMA,
                    )
                else:
                    keep = nms_util.nms(
                        cls_detections,
                        thresh=cfg.TEST.NMS,
                    )
                cls_detections = cls_detections[keep, :]
                cls_batch_inds = cls_batch_inds[keep]
...@@ -190,13 +172,9 @@ def test_net(weights, num_classes, q_in, q_out, device):
        q_out.put((
            idx,
            dict([('im_detect', _t['im_detect'].average_time),
                  ('mask_detect', _t['mask_detect'].average_time),
                  ('misc', _t['misc'].average_time)]),
            dict([('boxes', boxes_this_image),
                  ('masks', masks_this_image)]),
        ))
...@@ -13,7 +13,5 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from seetadet.algo.retinanet.anchor_target import AnchorTarget
from seetadet.algo.retinanet.data_loader import DataLoader
...@@ -15,12 +15,12 @@ from __future__ import print_function

import numpy as np

from seetadet.core.config import cfg
from seetadet.algo.faster_rcnn.generate_anchors import generate_anchors_v2
from seetadet.algo.faster_rcnn.utils import generate_grid_anchors
from seetadet.utils import boxes as box_util
from seetadet.utils import logger
from seetadet.utils.env import new_tensor
class AnchorTarget(object):
...@@ -47,7 +47,7 @@ class AnchorTarget(object):
                sizes=sizes,
            ))

    def __call__(self, features, gt_boxes):
        num_images = cfg.TRAIN.IMS_PER_BATCH
        gt_boxes_wide = box_util.dismantle_boxes(gt_boxes, num_images)
...@@ -67,10 +67,8 @@ class AnchorTarget(object):
        num_anchors = all_anchors.shape[0]

        # Label: ``1`` is positive, ``0`` is negative, ``-1`` is don't care
        labels_wide = -np.ones((num_images, num_anchors,), 'int64')
        bbox_indices_wide, bbox_anchors_wide, bbox_targets_wide = [], [], []

        # Different from R-CNN, all anchors will be used
        inds_inside, anchors = np.arange(num_anchors), all_anchors
...@@ -81,7 +79,7 @@ class AnchorTarget(object):
            gt_boxes = gt_boxes_wide[ix]

            # label: 1 is positive, 0 is negative, -1 is don't care
            labels = np.empty((num_inside,), dtype='int64')
            labels.fill(-1)

            # Overlaps between the anchors and the gt boxes
...@@ -89,48 +87,41 @@ class AnchorTarget(object):
            argmax_overlaps = overlaps.argmax(1)
            max_overlaps = overlaps[np.arange(num_inside), argmax_overlaps]

            # Foreground: for each gt, the anchor with highest overlap
            gt_argmax_overlaps = overlaps.argmax(0)
            gt_max_overlaps = overlaps[gt_argmax_overlaps, np.arange(overlaps.shape[1])]
            gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]
            gt_inds = argmax_overlaps[gt_argmax_overlaps]
            labels[gt_argmax_overlaps] = gt_boxes[gt_inds, 4]

            # Foreground: above threshold IoU
            inds = max_overlaps >= cfg.RETINANET.POSITIVE_OVERLAP
            gt_inds = argmax_overlaps[inds]
            labels[inds] = gt_boxes[gt_inds, 4]
            fg_inds = np.where(labels > 0)[0]

            # Background: below threshold IoU
            labels[max_overlaps < cfg.RETINANET.NEGATIVE_OVERLAP] = 0

            # Retract the clamping if no foreground anchor survived
            if len(fg_inds) == 0:
                gt_inds = argmax_overlaps[gt_argmax_overlaps]
                labels[gt_argmax_overlaps] = gt_boxes[gt_inds, 4]
                fg_inds = np.where(labels > 0)[0]

            labels_wide[ix, inds_inside] = labels
            bbox_anchors_wide.append(anchors[fg_inds])
            bbox_indices_wide.append(fg_inds + (num_anchors * ix))
            bbox_targets_wide.append(
                box_util.bbox_transform(
                    anchors[fg_inds],
                    gt_boxes[argmax_overlaps[fg_inds], :4],
                )
            )

        return {
            'labels': new_tensor(labels_wide),
            'bbox_indices': new_tensor(np.concatenate(bbox_indices_wide)),
            'bbox_anchors': new_tensor(np.concatenate(bbox_anchors_wide).astype('float32')),
            'bbox_targets': new_tensor(np.concatenate(bbox_targets_wide).astype('float32')),
        }
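The returned `bbox_indices` address the flattened `(num_images * num_anchors, 4)` regression output, so the loss can be computed on gathered foreground rows only. A sketch of a consumer, assuming a hypothetical `smooth_l1_loss` helper and a `(num_images, num_anchors, 4)` prediction:

```python
# sketch: gather foreground rows of the regression output
bbox_pred_flat = bbox_pred.reshape(-1, 4)   # row = ix * num_anchors + anchor
fg_pred = bbox_pred_flat[bbox_indices]      # rows built by AnchorTarget above
reg_loss = smooth_l1_loss(fg_pred, bbox_targets)  # hypothetical loss helper
```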
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from seetadet.algo import faster_rcnn
from seetadet.algo import ssd
from seetadet.core.config import cfg
class DataLoader(object):
"""Provide mini-batches of data."""
def __new__(cls):
if cfg.TRAIN.MAX_SIZE > 0:
return faster_rcnn.DataLoader()
else:
return ssd.DataLoader()
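A short illustration of the dispatch above; the branch keys off `cfg.TRAIN.MAX_SIZE` exactly as written, though the surrounding training code here is assumed:

```python
# illustrative only: the factory reads cfg at construction time
from seetadet.core.config import cfg

cfg.TRAIN.MAX_SIZE = 1000   # rectangular inputs -> faster_rcnn.DataLoader
loader = DataLoader()
cfg.TRAIN.MAX_SIZE = 0      # square, SSD-style inputs -> ssd.DataLoader
loader = DataLoader()
```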
...@@ -13,66 +13,59 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import types

import dragon.vm.torch as torch
import numpy as np

from seetadet.core.config import cfg
from seetadet.modeling.detector import new_detector
from seetadet.utils import nms as nms_util
from seetadet.utils import time_util
from seetadet.utils.blob import im_list_to_blob
from seetadet.utils.image import scale_image


def ims_detect(detector, raw_images):
    """Detect images, with single or multiple scales."""
    ims, ims_scale = [], []
    for i in range(len(raw_images)):
        im, im_scale = scale_image(raw_images[i])
        ims += im
        ims_scale += im_scale

    num_scales = len(ims_scale) // len(raw_images)
    ims_shape = np.array([im.shape[:2] for im in ims])
    ims_scale = np.array(ims_scale).reshape((len(ims), -1))

    # Prepare blobs
    data = im_list_to_blob(ims)
    ims_info = np.hstack([ims_shape, ims_scale]).astype('float32')

    # Do Forward
    data = torch.from_numpy(data)
    ims_info = torch.from_numpy(ims_info)

    if not hasattr(detector, 'script_forward'):
        def script_forward(self, data, ims_info):
            return self.forward({'data': data, 'ims_info': ims_info})
        detector.script_forward = torch.jit.trace(
            func=types.MethodType(script_forward, detector),
            example_inputs=[data, ims_info],
        )

    outputs = detector.script_forward(data, ims_info)
    outputs = dict((k, outputs[k].numpy()) for k in outputs.keys())

    # Unpack results
    results = outputs['detections']
    detections = [[] for _ in range(len(raw_images))]
    for i in range(len(ims)):
        inds = np.where(results[:, 0].astype(np.int32) == i)[0]
        detections[i // num_scales].append(results[inds, 1:])
    return [np.vstack(detections[i]) for i in range(len(raw_images))]
def test_net(weights, num_classes, q_in, q_out, device):
...@@ -88,7 +81,7 @@ def test_net(weights, num_classes, q_in, q_out, device):
        indices, raw_images = [], []
        for i in range(cfg.TEST.IMS_PER_BATCH):
            idx, raw_image = q_in.get()
            if idx < 0:
                must_stop = True
                break
            indices.append(idx)
...@@ -115,17 +108,16 @@ def test_net(weights, num_classes, q_in, q_out, device):
                    cls_boxes, cls_scores[:, np.newaxis])) \
                    .astype(np.float32, copy=False)
                if cfg.TEST.USE_SOFT_NMS:
                    keep = nms_util.soft_nms(
                        cls_detections,
                        thresh=cfg.TEST.NMS,
                        method=cfg.TEST.SOFT_NMS_METHOD,
                        sigma=cfg.TEST.SOFT_NMS_SIGMA,
                    )
                else:
                    keep = nms_util.nms(
                        cls_detections,
                        thresh=cfg.TEST.NMS,
                    )
                cls_detections = cls_detections[keep, :]
                boxes_this_image.append(cls_detections)
...@@ -133,11 +125,7 @@ def test_net(weights, num_classes, q_in, q_out, device):
            q_out.put((
                indices[i],
                dict([('im_detect', _t['im_detect'].average_time),
                      ('misc', _t['misc'].average_time)]),
                dict([('boxes', boxes_this_image)]),
            ))
...@@ -13,11 +13,8 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from seetadet.algo.ssd.data_loader import DataLoader
from seetadet.algo.ssd.hard_mining import HardMining
from seetadet.algo.ssd.multibox import MultiBoxMatch
from seetadet.algo.ssd.multibox import MultiBoxTarget
from seetadet.algo.ssd.priorbox import PriorBox
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import multiprocessing as mp
import time
import dragon
import dragon.vm.torch as torch
import numpy as np
from seetadet.algo.ssd import data_transformer
from seetadet.core.config import cfg
from seetadet.datasets.factory import get_dataset
from seetadet.utils import logger
class DataLoader(object):
"""Provide mini-batches of data."""
def __init__(self):
super(DataLoader, self).__init__()
dataset = get_dataset(cfg.TRAIN.DATASET)
if cfg.USE_DALI:
from seetadet.dali import ssd_pipeline as pipe
self.iterator = pipe.new_iterator(dataset.source)
else:
self.iterator = Iterator(**{
'dataset': dataset.cls,
'source': dataset.source,
'classes': dataset.classes,
'shuffle': cfg.TRAIN.USE_SHUFFLE,
'num_chunks': cfg.TRAIN.SHUFFLE_CHUNKS,
'batch_size': cfg.TRAIN.IMS_PER_BATCH * 2,
'num_transformers': cfg.TRAIN.NUM_THREADS - 1,
})
def __call__(self):
outputs = self.iterator.next()
if isinstance(outputs['data'], np.ndarray):
outputs['data'] = torch.from_numpy(outputs['data'])
return outputs
class Iterator(object):
"""Iterator to return the batch of data."""
def __init__(self, **kwargs):
super(Iterator, self).__init__()
# Distributed settings
rank, group_size = 0, 1
process_group = dragon.distributed.get_group()
if process_group is not None and \
kwargs.get('phase', 'TRAIN') == 'TRAIN':
group_size = process_group.size
rank = dragon.distributed.get_rank(process_group)
# Configuration
self._prefetch = kwargs.get('prefetch', 5)
self._batch_size = kwargs.get('batch_size', 32)
self._num_readers = kwargs.get('num_readers', 1)
self._num_transformers = kwargs.get('num_transformers', 3)
# Initialize queues
num_batches = self._prefetch * self._num_readers
self.q_in = mp.Queue(num_batches * self._batch_size)
self.q_out = mp.Queue(num_batches * self._batch_size)
# Initialize readers
self._readers = []
for i in range(self._num_readers):
part_idx, num_parts = i, self._num_readers
num_parts *= group_size
part_idx += rank * self._num_readers
self._readers.append(dragon.io.DataReader(
part_idx=part_idx, num_parts=num_parts, **kwargs))
self._readers[i]._seed += part_idx
self._readers[i].q_out = self.q_in
self._readers[i].start()
time.sleep(0.1)
# Initialize transformers
self._transformers = []
for i in range(self._num_transformers):
p = data_transformer.DataTransformer(**kwargs)
p._seed += (i + rank * self._num_transformers)
p.q_in, p.q_out = self.q_in, self.q_out
p.start()
self._transformers.append(p)
time.sleep(0.1)
# Register cleanup callbacks
def cleanup():
def terminate(processes):
for p in processes:
p.terminate()
p.join()
terminate(self._transformers)
logger.info('Terminate DataTransformer.')
terminate(self._readers)
logger.info('Terminate DataReader.')
import atexit
atexit.register(cleanup)
def next(self):
"""Return the next batch of data."""
return self.__next__()
def __iter__(self):
"""Return the iterator self."""
return self
def __next__(self):
"""Return the next batch of data."""
n = cfg.TRAIN.IMS_PER_BATCH
h = w = cfg.TRAIN.SCALES[0]
boxes_to_pack = []
image, boxes = self.q_out.get()
images = np.zeros((n, h, w, 3), image.dtype)
for i in range(n):
images[i] = image
gt_boxes = np.zeros((boxes.shape[0], boxes.shape[1] + 1), 'float32')
gt_boxes[:, :boxes.shape[1]], gt_boxes[:, -1] = boxes, i
boxes_to_pack.append(gt_boxes)
if i != (cfg.TRAIN.IMS_PER_BATCH - 1):
image, boxes = self.q_out.get()
boxes_to_pack = np.concatenate(boxes_to_pack)
return {'data': images, 'gt_boxes': boxes_to_pack}
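A minimal consumer of this loader, assuming the record layout gives boxes as (x1, y1, x2, y2, class) so the appended column is the image index:

```python
# a minimal consumer sketch under the layout assumption above
loader = DataLoader()
outputs = loader()
images = outputs['data']        # (IMS_PER_BATCH, H, W, 3)
gt_boxes = outputs['gt_boxes']  # (M, 6): box, class, then image index appended
```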
...@@ -14,19 +14,18 @@ from __future__ import division
from __future__ import print_function

import multiprocessing

import numpy as np

from seetadet.algo.ssd import transforms
from seetadet.core.config import cfg
from seetadet.datasets.example import Example
from seetadet.utils import boxes as box_util


class DataTransformer(multiprocessing.Process):
    def __init__(self, **kwargs):
        super(DataTransformer, self).__init__()
        self._scale = cfg.TRAIN.SCALES[0]
        self._seed = cfg.RNG_SEED
        self._mirror = cfg.TRAIN.USE_FLIPPED
        self._use_diff = cfg.TRAIN.USE_DIFF
...@@ -107,14 +106,15 @@ class DataTransformer(multiprocessing.Process):
        gt_boxes = np.empty((roi_dict['gt_classes'].size, 5), 'float32')
        gt_boxes[:, :4], gt_boxes[:, 4] = roi_dict['boxes'], roi_dict['gt_classes']

        if len(gt_boxes) == 0:
            # Ignore the non-object image
            return img, gt_boxes

        # Distort => Expand => Sample => Resize
        img, gt_boxes = self.augment_image(img, gt_boxes)

        # Restore to the blob scale
        gt_boxes[:, :4] *= self._scale

        # Post-Process for image
        if img.dtype == 'uint16':
......
...@@ -15,47 +15,43 @@ from __future__ import print_function

import numpy as np

from seetadet.core.config import cfg
from seetadet.utils.env import new_tensor


class HardMining(object):
    def __call__(self, prob, labels, overlaps):
        label_shape, label_size = labels.shape, labels.size
        prob = prob.numpy().reshape((label_size, -1))
        labels, overlaps = labels.flatten(), overlaps.flatten()
        neg_ovr = cfg.SSD.OHEM.NEG_OVERLAP
        neg_ratio = cfg.SSD.OHEM.NEG_POS_RATIO

        # label ``-1`` will be ignored
        new_labels = -np.ones(labels.shape, 'int64')

        cls_loss = -np.log(
            np.maximum(
                prob[np.arange(label_size), labels],
                np.finfo(float).eps,
            )
        )

        # Filter negatives
        fg_inds = np.where(labels > 0)[0]
        neg_inds = np.where(labels == 0)[0]
        neg_overlaps = overlaps[neg_inds]
        eligible_neg_inds = np.where(neg_overlaps < neg_ovr)[0]
        neg_inds = neg_inds[eligible_neg_inds]

        # Apply mining on negatives
        neg_cls_loss = cls_loss[neg_inds]
        num_pos, num_neg = len(fg_inds), len(neg_inds)
        num_bg = min(int(num_pos * neg_ratio), num_neg)
        bg_inds = neg_inds[np.argsort(-neg_cls_loss)][:num_bg]
        new_labels[fg_inds] = labels[fg_inds]  # Keep fg indices
        new_labels[bg_inds] = 0  # Use hard negatives as bg indices

        # Feed labels to compute cls loss
        return {'labels': new_tensor(new_labels.reshape(label_shape))}
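A toy demonstration of the negative:positive ratio rule above (2 positives with `NEG_POS_RATIO = 3` keep at most the 6 highest-loss negatives):

```python
import numpy as np

# toy demonstration of the ratio rule
neg_cls_loss = np.random.rand(50)  # per-negative classification loss
num_pos, neg_ratio = 2, 3
num_bg = min(num_pos * neg_ratio, len(neg_cls_loss))
bg_inds = np.argsort(-neg_cls_loss)[:num_bg]  # hardest negatives first
```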
...@@ -15,9 +15,9 @@ from __future__ import print_function

import numpy as np

from seetadet.core.config import cfg
from seetadet.utils import boxes as box_util
from seetadet.utils.env import new_tensor


class MultiBoxMatch(object):
...@@ -47,8 +47,8 @@ class MultiBoxMatch(object):
            # Bipartite matching and assignments
            bipartite_inds = overlaps.argmax(0)
            class_assignment = gt_boxes[:, -1]
            match_inds_wide[ix, bipartite_inds] = np.arange(num_gt, dtype='int32')
            match_labels_wide[ix, bipartite_inds] = class_assignment

            # Per-prediction matching and assignments
            # Note that SSD matches each prior box only once
...@@ -56,8 +56,8 @@ class MultiBoxMatch(object):
            per_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0]
            gt_assignment = argmax_overlaps[per_inds]
            class_assignment = gt_boxes[gt_assignment, -1]
            match_inds_wide[ix, per_inds] = gt_assignment
            match_labels_wide[ix, per_inds] = class_assignment

        return {
            'match_inds': match_inds_wide,
...@@ -82,15 +82,7 @@ class MultiBoxTarget(object):
        num_priors, box_dim = prior_boxes.shape[:]
        gt_boxes_wide = box_util.dismantle_boxes(gt_boxes, num_images)
        bbox_indices_wide, bbox_anchors_wide, bbox_targets_wide = [], [], []

        for ix in range(num_images):
            gt_boxes = gt_boxes_wide[ix]
...@@ -106,17 +98,18 @@ class MultiBoxTarget(object):
            gt_rois = gt_boxes[gt_assignment]

            # Assign bbox targets
            bbox_anchors_wide.append(ex_rois)
            bbox_indices_wide.append(ex_inds + (num_priors * ix))
            bbox_targets_wide.append(
                box_util.bbox_transform(
                    ex_rois,
                    gt_rois,
                    cfg.BBOX_REG_WEIGHTS,
                )
            )

        return {
            'bbox_indices': new_tensor(np.concatenate(bbox_indices_wide)),
            'bbox_anchors': new_tensor(np.concatenate(bbox_anchors_wide).astype('float32')),
            'bbox_targets': new_tensor(np.concatenate(bbox_targets_wide).astype('float32')),
        }
...@@ -15,9 +15,8 @@ from __future__ import print_function

import numpy as np

from seetadet.algo.ssd.generate_anchors import generate_anchors
from seetadet.core.config import cfg


class PriorBox(object):
...@@ -29,8 +28,10 @@ class PriorBox(object):
        max_sizes = cfg.SSD.MULTIBOX.MAX_SIZES
        if len(max_sizes) > 0:
            if len(min_sizes) != len(max_sizes):
                raise ValueError(
                    'Got {} min sizes and {} max sizes.'
                    .format(len(min_sizes), len(max_sizes))
                )
        self.strides = cfg.SSD.MULTIBOX.STRIDES
        aspect_ratios = cfg.SSD.MULTIBOX.ASPECT_RATIOS
        self.base_anchors = []
...@@ -44,9 +45,14 @@ class PriorBox(object):
                    aspect_ratios[i],
                )
            )
        self.grid_anchors = None

    def __call__(self, features):
        if self.grid_anchors is not None:
            return self.grid_anchors
        self.grid_anchors = []
        for i in range(len(self.strides)):
            # 1. Generate base grids
            height, width = features[i].shape[-2:]
...@@ -61,26 +67,17 @@ class PriorBox(object):
            # Reshape to (K * A, 4) shifted anchors
            A = self.base_anchors[i].shape[0]
            D = self.base_anchors[i].shape[1]
            shifts = np.vstack((
                shift_x.ravel(),
                shift_y.ravel(),
                shift_x.ravel(),
                shift_y.ravel())
            ).transpose()
            K = shifts.shape[0]  # K = map_h * map_w
            anchors = (self.base_anchors[i].reshape((1, A, D)) +
                       shifts.reshape((1, K, D)).transpose((1, 0, 2)))
            anchors = anchors.reshape((K * A, D)).astype(np.float32)
            self.grid_anchors.append(anchors)
        self.grid_anchors = np.concatenate(self.grid_anchors)
        return self.grid_anchors
...@@ -13,26 +13,30 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import types

import cv2
import dragon.vm.torch as torch
import numpy as np

from seetadet.core.config import cfg
from seetadet.modeling.detector import new_detector
from seetadet.utils import boxes as box_util
from seetadet.utils import nms as nms_util
from seetadet.utils import time_util


def get_images(ims):
    out_size = cfg.TEST.SCALES[0]
    processed_ims, im_scales = [], []
    for im in ims:
        im_scales.append((float(out_size) / im.shape[0],
                          float(out_size) / im.shape[1]))
        processed_ims.append(
            cv2.resize(
                im, (out_size, out_size),
                interpolation=cv2.INTER_AREA,
            ))
    if ims[0].dtype == 'uint16':
        ims_blob = np.array(processed_ims, dtype='float32') / 256.
    else:
...@@ -45,34 +49,33 @@ def ims_detect(detector, ims):
    data, im_scales = get_images(ims)

    # Do Forward
    data = torch.from_numpy(data)

    if not hasattr(detector, 'script_forward'):
        def script_forward(self, data):
            return self.forward({'data': data})
        detector.script_forward = torch.jit.trace(
            func=types.MethodType(script_forward, detector),
            example_inputs=[data],
        )

    outputs = detector.script_forward(data)
    cls_prob = outputs['cls_prob'].numpy()
    bbox_pred = outputs['bbox_pred'].numpy()

    # Decode results
    batch_boxes = []
    for i in range(len(im_scales)):
        boxes = box_util.bbox_transform_inv(
            outputs['prior_boxes'],
            bbox_pred[i],
            cfg.BBOX_REG_WEIGHTS,
        )
        boxes[:, 0::2] /= im_scales[i][1]
        boxes[:, 1::2] /= im_scales[i][0]
        batch_boxes.append(box_util.clip_boxes(boxes, ims[i].shape))

    return cls_prob, batch_boxes


def test_net(weights, num_classes, q_in, q_out, device):
...@@ -88,7 +91,7 @@ def test_net(weights, num_classes, q_in, q_out, device):
        indices, raw_images = [], []
        for i in range(cfg.TEST.IMS_PER_BATCH):
            idx, raw_image = q_in.get()
            if idx < 0:
                must_stop = True
                break
            indices.append(idx)
...@@ -116,17 +119,16 @@ def test_net(weights, num_classes, q_in, q_out, device):
                    (cls_boxes, cls_scores[:, np.newaxis])) \
                    .astype(np.float32, copy=False)
                if cfg.TEST.USE_SOFT_NMS:
                    keep = nms_util.soft_nms(
                        cls_detections,
                        thresh=cfg.TEST.NMS,
                        method=cfg.TEST.SOFT_NMS_METHOD,
                        sigma=cfg.TEST.SOFT_NMS_SIGMA,
                    )
                else:
                    keep = nms_util.nms(
                        cls_detections,
                        thresh=cfg.TEST.NMS,
                    )
                cls_detections = cls_detections[keep, :]
                boxes_this_image.append(cls_detections)
...@@ -134,11 +136,7 @@ def test_net(weights, num_classes, q_in, q_out, device):
            q_out.put((
                indices[i],
                dict([('im_detect', _t['im_detect'].average_time),
                      ('misc', _t['misc'].average_time)]),
                dict([('boxes', boxes_this_image)]),
            ))
...@@ -22,9 +22,10 @@ import PIL.ImageEnhance
import numpy as np
import numpy.random as npr

from seetadet.core.config import cfg
from seetadet.utils import boxes as box_util
from seetadet.utils import boxes_v2 as box_util_v2
from seetadet.utils import logger


class Compose(object):
...@@ -40,43 +41,35 @@ class Compose(object):


class Distort(object):
    def __init__(self):
        self._prob = 0.5
        self._transforms = [
            (PIL.ImageEnhance.Brightness, self._prob),
            (PIL.ImageEnhance.Contrast, self._prob),
            (PIL.ImageEnhance.Color, self._prob),
        ]

    def apply(self, img, boxes=None):
        if self._prob > 0:
            img = PIL.Image.fromarray(img)
            for transform_fn, prob in self._transforms:
                if npr.uniform() < prob:
                    img = transform_fn(img)
                    img = img.enhance(1. + npr.uniform(-.4, .4))
            return np.array(img), boxes
        return img, boxes


class Expand(object):
    def __init__(self):
        self._max_ratio = 1. / cfg.TRAIN.RANDOM_SCALES[0]
        self._expand_prob = 0.5 if self._max_ratio > 1 else 0

    def apply(self, img, boxes=None):
        prob = npr.uniform()
        if prob > self._expand_prob:
            return img, boxes
        ratio = npr.uniform(1., self._max_ratio)
        if ratio == 1:
            return img, boxes
        im_h, im_w = img.shape[:2]
        expand_h, expand_w = int(im_h * ratio), int(im_w * ratio)
        h_off = int(math.floor(npr.uniform(0., expand_h - im_h)))
...@@ -99,19 +92,14 @@ class Expand(object):


class Resize(object):
    def __init__(self):
        self._target_size = (cfg.TRAIN.SCALES[0],) * 2
        self._interp_mode = [
            cv2.INTER_LINEAR,
            cv2.INTER_AREA,
            cv2.INTER_NEAREST,
            cv2.INTER_CUBIC,
            cv2.INTER_LANCZOS4,
        ]

    def apply(self, img, boxes):
        rand = npr.randint(len(self._interp_mode))
...@@ -144,7 +132,10 @@ class Sample(object):

    @classmethod
    def _compute_overlaps(cls, rand_box, gt_boxes):
        return box_util_v2.iou(
            np.expand_dims(rand_box, 0),
            gt_boxes[:, 0:4],
        )

    @classmethod
    def _generate_sample(cls, sample_param):
...@@ -162,18 +153,27 @@ class Sample(object):
        h_off = npr.uniform(0., 1. - bbox_h)
        return np.array([w_off, h_off, w_off + bbox_w, h_off + bbox_h])

    def _check_center(self, sample_box, gt_boxes):
        ctr_x = (gt_boxes[:, 2] + gt_boxes[:, 0]) / 2.0
        ctr_y = (gt_boxes[:, 3] + gt_boxes[:, 1]) / 2.0
        # Keep the ground-truth box whose center is in the sample box
        # Implement ``EmitConstraint.CENTER`` in the original SSD
        keep_inds = np.where((ctr_x >= sample_box[0]) & (ctr_x <= sample_box[2]) &
                             (ctr_y >= sample_box[1]) & (ctr_y <= sample_box[3]))[0]
        return len(keep_inds) > 0

    def _check_overlap(self, sample_box, gt_boxes, constraint):
        min_overlap = constraint.get('min_overlap', None)
        max_overlap = constraint.get('max_overlap', None)
        if min_overlap is None and \
                max_overlap is None:
            return True
        ovr = self._compute_overlaps(sample_box, gt_boxes).max()
        if min_overlap is not None:
            if ovr < min_overlap:
                return False
        if max_overlap is not None:
            if ovr > max_overlap:
                return False
        return True
...@@ -187,9 +187,10 @@ class Sample(object):
            sample_box = self._generate_sample(sampler)
            if sampler['min_overlap'] != 0. or \
                    sampler['max_overlap'] != 1.:
                if not self._check_overlap(sample_box, gt_boxes, sampler):
                    continue
            if not self._check_center(sample_box, gt_boxes):
                continue
            found += 1
            sample_boxes.append(sample_box)
        return sample_boxes
...@@ -206,8 +207,6 @@ class Sample(object):
        if gt_boxes is not None:
            ctr_x = (gt_boxes[:, 2] + gt_boxes[:, 0]) / 2.0
            ctr_y = (gt_boxes[:, 3] + gt_boxes[:, 1]) / 2.0
            keep_inds = np.where((ctr_x >= rand_box[0]) & (ctr_x <= rand_box[2]) &
                                 (ctr_y >= rand_box[1]) & (ctr_y <= rand_box[3]))[0]
            gt_boxes = gt_boxes[keep_inds]
......
...@@ -19,11 +19,14 @@ sys.path.append('../../')

import cv2
import numpy as np

from seetadet.algo.ssd import transforms
from seetadet.core.config import cfg

if __name__ == '__main__':
    np.random.seed(3)
    cfg.TRAIN.SCALES = [300]
    cfg.TRAIN.RANDOM_SCALES = [0.25, 1.00]

    augmentor = transforms.Compose(
        transforms.Distort(),
...@@ -36,8 +39,6 @@ if __name__ == '__main__':
        img = cv2.imread('cat.jpg')
        boxes = np.array([[0.33, 0.04, 0.71, 0.98]], dtype=np.float32)
        img, boxes = augmentor(img, boxes)
        for box in boxes:
            x1 = int(box[0] * img.shape[1])
            y1 = int(box[1] * img.shape[0])
......
...@@ -20,7 +20,7 @@ from __future__ import print_function ...@@ -20,7 +20,7 @@ from __future__ import print_function
import os.path as osp import os.path as osp
import numpy as np import numpy as np
from lib.utils.attrdict import AttrDict from seetadet.utils.attrdict import AttrDict
cfg = __C = AttrDict() cfg = __C = AttrDict()
...@@ -38,41 +38,27 @@ __C.TRAIN = AttrDict() ...@@ -38,41 +38,27 @@ __C.TRAIN = AttrDict()
# Initialize network with weights from this file # Initialize network with weights from this file
__C.TRAIN.WEIGHTS = '' __C.TRAIN.WEIGHTS = ''
# Database to train # Dataset to train
__C.TRAIN.DATABASE = '' __C.TRAIN.DATASET = ''
# The number of workers to transform data # The number of threads to load train data
__C.TRAIN.NUM_WORKERS = 3 __C.TRAIN.NUM_THREADS = 4
# Scales to use during training (can list multiple scales) # Scales to use during training (can list multiple scales)
# Each scale is the pixel size of an image's shortest side # Each scale is the pixel size of an image's shortest side
__C.TRAIN.SCALES = (600,) __C.TRAIN.SCALES = (300,)
# Max pixel size of the longest side of a scaled input image # Max pixel size of the longest side of a scaled input image
# A square will be used if value < 1 # A square will be used if value < 1
__C.TRAIN.MAX_SIZE = 1000 __C.TRAIN.MAX_SIZE = 0
# Images to use per mini-batch # Images to use per mini-batch
__C.TRAIN.IMS_PER_BATCH = 1 __C.TRAIN.IMS_PER_BATCH = 1
# Minibatch size (number of regions of interest [ROIs]) # Use shuffled images during training?
__C.TRAIN.BATCH_SIZE = 128
# Fraction of minibatch that is labeled foreground (i.e. class > 0)
__C.TRAIN.FG_FRACTION = 0.25
# Overlap threshold for a ROI to be considered foreground (if >= FG_THRESH)
__C.TRAIN.FG_THRESH = 0.5
# Overlap threshold for a ROI to be considered background (class = 0 if
# overlap in [LO, HI))
__C.TRAIN.BG_THRESH_HI = 0.5
__C.TRAIN.BG_THRESH_LO = 0.0
# Use shuffle after each epoch
__C.TRAIN.USE_SHUFFLE = True __C.TRAIN.USE_SHUFFLE = True
# The number of chunks to shuffle # The number of shuffle chunks
__C.TRAIN.NUM_SHUFFLE_CHUNKS = 0 __C.TRAIN.SHUFFLE_CHUNKS = 0
# Use horizontally-flipped images during training? # Use horizontally-flipped images during training?
__C.TRAIN.USE_FLIPPED = True __C.TRAIN.USE_FLIPPED = True
...@@ -80,17 +66,25 @@ __C.TRAIN.USE_FLIPPED = True ...@@ -80,17 +66,25 @@ __C.TRAIN.USE_FLIPPED = True
# Use the difficult(under occlusion) objects # Use the difficult(under occlusion) objects
__C.TRAIN.USE_DIFF = True __C.TRAIN.USE_DIFF = True
# Overlap required between a ROI and ground-truth box in order for that ROI to # Range to jitter the image scales
# be used as a bounding-box regression training example __C.TRAIN.RANDOM_SCALES = [1., 1.]
__C.TRAIN.BBOX_THRESH = 0.5
# If True, randomly scale the image by scale range
__C.TRAIN.USE_SCALE_JITTER = False
__C.TRAIN.SCALE_JITTER_RANGE = [0.75, 1.0]
# If True, randomly distort the image by brightness, contrast, and saturation # If True, randomly distort the image by brightness, contrast, and saturation
__C.TRAIN.USE_COLOR_JITTER = False __C.TRAIN.USE_COLOR_JITTER = False
# Mini-batch size (#RoIs) for a two-stage detector
__C.TRAIN.BATCH_SIZE = 128
# Overlap threshold for a ROI to be considered foreground (if >= FG_THRESH)
__C.TRAIN.FG_THRESH = 0.5
# Fraction of mini-batch that is labeled foreground (i.e. class > 0)
__C.TRAIN.FG_FRACTION = 0.25
# Overlap threshold for a ROI to be considered background (class = 0 if
# overlap in [LO, HI))
__C.TRAIN.BG_THRESH_HI = 0.5
__C.TRAIN.BG_THRESH_LO = 0.0
# IOU >= thresh: positive example # IOU >= thresh: positive example
__C.TRAIN.RPN_POSITIVE_OVERLAP = 0.7 __C.TRAIN.RPN_POSITIVE_OVERLAP = 0.7
# IOU < thresh: negative example # IOU < thresh: negative example
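The RoI sampling knobs above interact: ``BATCH_SIZE`` RoIs are drawn per image, and ``FG_FRACTION`` caps how many of them may be foreground. A sketch of the arithmetic with the defaults shown in this diff:

```python
# Illustrative only: how the RoI sampling defaults above combine.
batch_size = 128    # __C.TRAIN.BATCH_SIZE
fg_fraction = 0.25  # __C.TRAIN.FG_FRACTION

max_fg = int(batch_size * fg_fraction)  # at most 32 foreground RoIs
num_bg = batch_size - max_fg            # 96 RoIs filled from the background
# Foreground: IoU >= 0.5 (FG_THRESH)
# Background: IoU in [0.0, 0.5) (BG_THRESH_LO, BG_THRESH_HI)
print(max_fg, num_bg)  # 32 96
```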
...@@ -123,20 +117,19 @@ __C.TRAIN.RPN_STRADDLE_THRESH = 0 ...@@ -123,20 +117,19 @@ __C.TRAIN.RPN_STRADDLE_THRESH = 0
__C.TEST = AttrDict() __C.TEST = AttrDict()
# Database to test # Dataset to test
__C.TEST.DATABASE = '' __C.TEST.DATASET = ''
# Original json ground-truth file to use # Original json ground-truth file to use
# Records in the Database file will be used instead
__C.TEST.JSON_FILE = '' __C.TEST.JSON_FILE = ''
# Scales to use during testing (can list multiple scales) # Scales to use during testing (can list multiple scales)
# Each scale is the pixel size of an image's shortest side # Each scale is the pixel size of an image's shortest side
__C.TEST.SCALES = (600,) __C.TEST.SCALES = (300,)
# Max pixel size of the longest side of a scaled input image # Max pixel size of the longest side of a scaled input image
# A square will be used if value < 1 # A square will be used if value < 1
__C.TEST.MAX_SIZE = 1000 __C.TEST.MAX_SIZE = 0
# Images to use per mini-batch # Images to use per mini-batch
__C.TEST.IMS_PER_BATCH = 1 __C.TEST.IMS_PER_BATCH = 1
...@@ -217,10 +210,20 @@ __C.MODEL.CLASSES = ['__background__'] ...@@ -217,10 +210,20 @@ __C.MODEL.CLASSES = ['__background__']
# The value of ``K`` is usually set to 2 # The value of ``K`` is usually set to 2
__C.MODEL.FREEZE_AT = 2 __C.MODEL.FREEZE_AT = 2
# The variant of ReLU activation
# ('ReLU', 'ReLU6')
__C.MODEL.RELU_VARIANT = 'ReLU'
# Setting of focal loss # Setting of focal loss
__C.MODEL.FOCAL_LOSS_ALPHA = 0.25 __C.MODEL.FOCAL_LOSS_ALPHA = 0.25
__C.MODEL.FOCAL_LOSS_GAMMA = 2.0 __C.MODEL.FOCAL_LOSS_GAMMA = 2.0
# The optional loss for bbox regression
# ('NORM', 'IOU')
__C.MODEL.REG_LOSS_TYPE = 'NORM'
# Weight for bbox regression loss
__C.MODEL.REG_LOSS_WEIGHT = 1.
# Stride of the coarsest feature level	# Stride of the coarsest feature level
# This is needed so the input can be padded properly # This is needed so the input can be padded properly
__C.MODEL.COARSEST_STRIDE = 32 __C.MODEL.COARSEST_STRIDE = 32
...@@ -268,9 +271,6 @@ __C.RETINANET.ANCHOR_SCALE = 4 ...@@ -268,9 +271,6 @@ __C.RETINANET.ANCHOR_SCALE = 4
# NOTE: this doesn't include the last conv for logits # NOTE: this doesn't include the last conv for logits
__C.RETINANET.NUM_CONVS = 4 __C.RETINANET.NUM_CONVS = 4
# Weight for bbox regression loss
__C.RETINANET.BBOX_REG_WEIGHT = 1.
# During inference, #locs to select based on cls score before NMS is performed # During inference, #locs to select based on cls score before NMS is performed
__C.RETINANET.PRE_NMS_TOP_N = 5000 __C.RETINANET.PRE_NMS_TOP_N = 5000
...@@ -362,9 +362,6 @@ __C.SSD = AttrDict() ...@@ -362,9 +362,6 @@ __C.SSD = AttrDict()
# NOTE: this doesn't include the last conv for logits # NOTE: this doesn't include the last conv for logits
__C.SSD.NUM_CONVS = 0 __C.SSD.NUM_CONVS = 0
# Weight for bbox regression loss
__C.SSD.BBOX_REG_WEIGHT = 1.
# MultiBox configs # MultiBox configs
__C.SSD.MULTIBOX = AttrDict() __C.SSD.MULTIBOX = AttrDict()
__C.SSD.MULTIBOX.STRIDES = [] __C.SSD.MULTIBOX.STRIDES = []
...@@ -379,23 +376,6 @@ __C.SSD.OHEM.NEG_OVERLAP = 0.5 ...@@ -379,23 +376,6 @@ __C.SSD.OHEM.NEG_OVERLAP = 0.5
# The ratio used in hard example mining # The ratio used in hard example mining
__C.SSD.OHEM.NEG_POS_RATIO = 3.0 __C.SSD.OHEM.NEG_POS_RATIO = 3.0
# Distort the image?
__C.SSD.DISTORT = AttrDict()
__C.SSD.DISTORT.BRIGHTNESS_PROB = 0.5
__C.SSD.DISTORT.CONTRAST_PROB = 0.5
__C.SSD.DISTORT.SATURATION_PROB = 0.5
# Expand the image?
__C.SSD.EXPAND = AttrDict()
__C.SSD.EXPAND.PROB = 0.5
__C.SSD.EXPAND.MAX_RATIO = 4.0
# Resize the image?
__C.SSD.RESIZE = AttrDict()
__C.SSD.RESIZE.HEIGHT = 300
__C.SSD.RESIZE.WIDTH = 300
__C.SSD.RESIZE.INTERP_MODE = ['LINEAR', 'AREA', 'NEAREST', 'CUBIC', 'LANCZOS4']
# Samplers # Samplers
# Format as (min_scale, max_scale, # Format as (min_scale, max_scale,
# min_aspect_ratio, max_aspect_ratio, # min_aspect_ratio, max_aspect_ratio,
...@@ -486,7 +466,7 @@ __C.SOLVER.LR_POLICY = 'steps_with_decay' ...@@ -486,7 +466,7 @@ __C.SOLVER.LR_POLICY = 'steps_with_decay'
# Momentum to use with SGD # Momentum to use with SGD
__C.SOLVER.MOMENTUM = 0.9 __C.SOLVER.MOMENTUM = 0.9
# L2 regularization hyper parameters # L2 regularization for weight parameters
__C.SOLVER.WEIGHT_DECAY = 0.0001 __C.SOLVER.WEIGHT_DECAY = 0.0001
# L2 norm factor for clipping gradients # L2 norm factor for clipping gradients
__C.SOLVER.CLIP_NORM = -1.0 __C.SOLVER.CLIP_NORM = -1.0
...@@ -505,6 +485,9 @@ __C.NUM_GPUS = 1 ...@@ -505,6 +485,9 @@ __C.NUM_GPUS = 1
# Use NCCL for all reduce, otherwise use cuda-aware mpi # Use NCCL for all reduce, otherwise use cuda-aware mpi
__C.USE_NCCL = True __C.USE_NCCL = True
# Use DALI to load the batches of data instead of the original pipeline
__C.USE_DALI = False
# Hosts for Inter-Machine communication # Hosts for Inter-Machine communication
__C.HOSTS = [] __C.HOSTS = []
...@@ -531,9 +514,6 @@ __C.DATA_DIR = osp.abspath(osp.join(__C.ROOT_DIR, 'data')) ...@@ -531,9 +514,6 @@ __C.DATA_DIR = osp.abspath(osp.join(__C.ROOT_DIR, 'data'))
# Place outputs under an experiments directory # Place outputs under an experiments directory
__C.EXP_DIR = '' __C.EXP_DIR = ''
# Use GPU implementation of non-maximum suppression
__C.USE_GPU_NMS = True
# Default GPU device id # Default GPU device id
__C.GPU_ID = 0 __C.GPU_ID = 0
......
...@@ -18,8 +18,8 @@ import shutil ...@@ -18,8 +18,8 @@ import shutil
import time import time
import numpy as np import numpy as np
from lib.core.config import cfg from seetadet.core.config import cfg
from lib.core.config import cfg_from_file from seetadet.core.config import cfg_from_file
class Coordinator(object): class Coordinator(object):
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import functools
class Registry(object):
"""The base registry class."""
def __init__(self, name):
self._name = name
self._registry = collections.OrderedDict()
def has(self, key):
return key in self._registry
def register(self, name, func=None, **kwargs):
def decorated(inner_function):
for key in (name if isinstance(
name, (tuple, list)) else [name]):
if self.has(key):
raise KeyError(
'`%s` has been registered in %s.'
% (key, self._name)
)
self._registry[key] = functools.partial(
inner_function, **kwargs)
return inner_function
if func is not None:
return decorated(func)
return decorated
def get(self, name):
if not self.has(name):
raise KeyError(
"`%s` is not registered in <%s>."
% (name, self._name)
)
return self._registry[name]
def try_get(self, name):
if self.has(name):
return self.get(name)
return None
backbones = Registry('backbones')
models = Registry('models')
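A minimal usage sketch for the registry above (the names here are invented for illustration); both the decorator form and the ``func=`` form used elsewhere in this commit are shown:

```python
demo = Registry('demo')

@demo.register('toy')  # decorator form
def build_toy():
    return 'toy-backbone'

def resnet(depth):
    return 'resnet-%d' % depth

# func= form: extra kwargs are frozen into the stored functools.partial
demo.register('toy_50', func=resnet, depth=50)

print(demo.get('toy')())        # 'toy-backbone'
print(demo.get('toy_50')())     # 'resnet-50'
print(demo.try_get('missing'))  # None
```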
...@@ -20,9 +20,9 @@ import os ...@@ -20,9 +20,9 @@ import os
import cv2 import cv2
import dragon import dragon
from lib.core.config import cfg from seetadet.core.config import cfg
from lib.datasets.example import Example from seetadet.datasets.example import Example
from lib.datasets.factory import get_imdb from seetadet.datasets.factory import get_dataset
class _Server(object): class _Server(object):
...@@ -50,13 +50,13 @@ class _Server(object): ...@@ -50,13 +50,13 @@ class _Server(object):
class TestServer(_Server): class TestServer(_Server):
def __init__(self, output_dir): def __init__(self, output_dir):
super(TestServer, self).__init__(output_dir) super(TestServer, self).__init__(output_dir)
self.imdb = get_imdb(cfg.TEST.DATABASE) self.dataset = get_dataset(cfg.TEST.DATASET)
self.imdb.competition_mode(cfg.TEST.COMPETITION_MODE) self.dataset.competition_mode(cfg.TEST.COMPETITION_MODE)
self.classes = self.imdb.classes self.classes = self.dataset.classes
self.num_images = self.imdb.num_images self.num_images = self.dataset.num_images
self.num_classes = self.imdb.num_classes self.num_classes = self.dataset.num_classes
self.data_reader = dragon.io.DataReader( self.data_reader = dragon.io.DataReader(
dataset=lambda: dragon.io.SeetaRecordDataset(self.imdb.source)) dataset=self.dataset.cls, source=self.dataset.source)
self.data_reader.q_out = mp.Queue(cfg.TEST.IMS_PER_BATCH * 5) self.data_reader.q_out = mp.Queue(cfg.TEST.IMS_PER_BATCH * 5)
self.data_reader.start() self.data_reader.start()
self.gt_recs = collections.OrderedDict() self.gt_recs = collections.OrderedDict()
...@@ -81,16 +81,16 @@ class TestServer(_Server): ...@@ -81,16 +81,16 @@ class TestServer(_Server):
def evaluate_detections(self, all_boxes): def evaluate_detections(self, all_boxes):
if cfg.TEST.PROTOCOL == 'dump': if cfg.TEST.PROTOCOL == 'dump':
self.imdb.dump_detections(all_boxes, self.output_dir) self.dataset.dump_detections(all_boxes, self.output_dir)
else: else:
self.imdb.evaluate_detections( self.dataset.evaluate_detections(
all_boxes, all_boxes,
self.get_records(), self.get_records(),
self.output_dir, self.output_dir,
) )
def evaluate_segmentations(self, all_boxes, all_masks): def evaluate_segmentations(self, all_boxes, all_masks):
self.imdb.evaluate_segmentations( self.dataset.evaluate_segmentations(
all_boxes, all_boxes,
all_masks, all_masks,
self.get_records(), self.get_records(),
...@@ -101,7 +101,7 @@ class TestServer(_Server): ...@@ -101,7 +101,7 @@ class TestServer(_Server):
class InferServer(_Server): class InferServer(_Server):
def __init__(self, output_dir): def __init__(self, output_dir):
super(InferServer, self).__init__(output_dir) super(InferServer, self).__init__(output_dir)
self.images_dir = cfg.TEST.DATABASE self.images_dir = cfg.TEST.DATASET
self.images = os.listdir(self.images_dir) self.images = os.listdir(self.images_dir)
self.classes = cfg.MODEL.CLASSES self.classes = cfg.MODEL.CLASSES
self.num_images = len(self.images) self.num_images = len(self.images)
......
...@@ -18,9 +18,9 @@ import multiprocessing ...@@ -18,9 +18,9 @@ import multiprocessing
import numpy as np import numpy as np
from lib.core.config import cfg from seetadet.core.config import cfg
from lib.utils import time_util from seetadet.utils import time_util
from lib.utils.vis import vis_one_image from seetadet.utils.vis import vis_one_image
def run_test_net(checkpoint, server, devices): def run_test_net(checkpoint, server, devices):
...@@ -30,8 +30,8 @@ def run_test_net(checkpoint, server, devices): ...@@ -30,8 +30,8 @@ def run_test_net(checkpoint, server, devices):
devices = devices if devices else [cfg.GPU_ID] devices = devices if devices else [cfg.GPU_ID]
num_workers = len(devices) num_workers = len(devices)
test_fn = importlib.import_module( test_module = 'seetadet.algo.%s.test' % cfg.MODEL.TYPE
'lib.%s.test' % cfg.MODEL.TYPE).test_net test_fn = importlib.import_module(test_module).test_net
_t = time_util.new_timers('im_detect', 'mask_detect', 'misc') _t = time_util.new_timers('im_detect', 'mask_detect', 'misc')
......
...@@ -22,11 +22,11 @@ import os ...@@ -22,11 +22,11 @@ import os
import dragon.vm.torch as torch import dragon.vm.torch as torch
from lib.core.config import cfg from seetadet.core.config import cfg
from lib.solver.sgd import SGDSolver from seetadet.solver.sgd import SGDSolver
from lib.utils import logger from seetadet.utils import logger
from lib.utils import time_util from seetadet.utils import time_util
from lib.utils.stats import SmoothedValue from seetadet.utils.stats import SmoothedValue
class SolverWrapper(object): class SolverWrapper(object):
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import dragon.vm.dali as dali
import numpy as np
from seetadet.core.config import cfg
class DataReader(dali.ops.KPLRecordReader):
def __init__(
self,
path,
features,
pipeline,
shard_id=0,
num_shards=1,
shuffle_after_epoch=False,
shuffle_chunks=0,
aspect_grouping=False,
):
super(DataReader, self).__init__(
path=path,
features=features,
pipeline=pipeline,
shard_id=shard_id,
num_shards=num_shards,
shuffle_after_epoch=shuffle_after_epoch,
shuffle_chunks=shuffle_chunks,
)
self._aspect_grouping = aspect_grouping
self._class_to_ind = dict(zip(
cfg.MODEL.CLASSES,
range(len(cfg.MODEL.CLASSES))
))
self._queue1, self._queue2 = [], []
def feed_inputs(self):
if not self._aspect_grouping:
feed_dict = collections.defaultdict(list)
for i in range(self._pipe.batch_size):
while True:
example = self._buffer.get()
if len(example['object']) > 0:
break
data = self.example_to_data(example)
for k, v in data.items():
feed_dict[k].append(v)
for k, v in self.features.items():
self._pipe.feed_input(self.features[k], feed_dict[k])
else:
batch_size = self._pipe.batch_size
while True:
batch_data = None
if len(self._queue1) >= batch_size:
batch_data = self._queue1[:batch_size]
self._queue1 = self._queue1[batch_size:]
elif len(self._queue2) >= batch_size:
batch_data = self._queue2[:batch_size]
self._queue2 = self._queue2[batch_size:]
if batch_data is not None:
feed_dict = collections.defaultdict(list)
for data in batch_data:
for k, v in data.items():
feed_dict[k].append(v)
for k, v in self.features.items():
self._pipe.feed_input(self.features[k], feed_dict[k])
break
while True:
example = self._buffer.get()
if len(example['object']) > 0:
break
data = self.example_to_data(example)
ratio = float(data['shape'][0]) / data['shape'][1]
if ratio > 1:
self._queue1.append(data)
else:
self._queue2.append(data)
def example_to_data(self, example):
bbox_data, bbox_ratio, bbox_label = [], [], []
h, w, c = example['height'], example['width'], example['depth']
for obj in example['object']:
x1 = float(max(obj['xmin'], 0))
y1 = float(max(obj['ymin'], 0))
x2 = float(min(obj['xmax'], w - 1))
y2 = float(min(obj['ymax'], h - 1))
bbox_data.append([x1, y1, x2, y2])
bbox_ratio.append([x1 / w, y1 / h, x2 / w, y2 / h])
bbox_label.append(self._class_to_ind[obj['name']])
return {
'image': example['content'],
'shape': np.array([h, w, c], 'int64'),
'bbox/data': np.array(bbox_data, 'float32'),
'bbox/ratio': np.array(bbox_ratio, 'float32'),
'bbox/label': np.array(bbox_label, 'int32')
}
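For reference, a record with the shape ``example_to_data()`` expects, reconstructed from the keys it reads (the field values are made up):

```python
example = {
    'content': b'<encoded image bytes>',  # consumed by the DALI decoder
    'height': 480, 'width': 640, 'depth': 3,
    'object': [
        {'name': 'cat', 'xmin': 211, 'ymin': 19, 'xmax': 454, 'ymax': 470},
    ],
}
# example_to_data(example) then yields:
#   'shape'      -> [480, 640, 3]
#   'bbox/data'  -> [[211., 19., 454., 470.]]           (clipped pixel coords)
#   'bbox/ratio' -> [[0.3297, 0.0396, 0.7094, 0.9792]]  (normalized by w/h)
#   'bbox/label' -> [cfg.MODEL.CLASSES.index('cat')]
```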
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from dragon.vm import dali
from dragon.vm.dali.plugin.pytorch import DALIGenericIterator
from seetadet.core.config import cfg
from seetadet.dali.data_reader import DataReader
class Pipeline(dali.Pipeline):
def __init__(self, source):
super(Pipeline, self).__init__(
batch_size=cfg.TRAIN.IMS_PER_BATCH,
num_threads=cfg.TRAIN.NUM_THREADS,
)
random_scales = cfg.TRAIN.RANDOM_SCALES
if random_scales[1] > 1:
raise ValueError('The maximum random scale should be <= 1.')
mean_values = np.array(cfg.PIXEL_MEANS, 'int64').tolist()
self.max_size = cfg.TRAIN.MAX_SIZE
self.reader = DataReader(
path=source,
features=['image', 'shape', 'bbox/data', 'bbox/label'],
pipeline=self,
shard_id=dali.get_distributed_info()[0],
num_shards=dali.get_distributed_info()[1],
shuffle_after_epoch=cfg.TRAIN.USE_SHUFFLE,
shuffle_chunks=cfg.TRAIN.SHUFFLE_CHUNKS,
aspect_grouping=True,
)
self.decode = dali.ops.ImageDecoder()
self.resize = dali.ops.Resize(max_size=self.max_size)
self.brightness_contrast = dali.ops.BrightnessContrast()
self.hsv = dali.ops.Hsv()
self.cmn = dali.ops.CropMirrorNormalize(
mean=mean_values,
std=[1., 1., 1.],
)
self.pad = dali.ops.Pad(
axes=[1, 2],
align=cfg.MODEL.COARSEST_STRIDE
if cfg.MODEL.COARSEST_STRIDE > 0 else None,
)
with dali.device('cpu'):
self.resize_rng = dali.ops.Uniform([
cfg.TRAIN.SCALES[0] * random_scales[0],
cfg.TRAIN.SCALES[0] * random_scales[1],
])
self.twist_rng = dali.ops.Uniform([0.6, 1.4])
self.flip_rng = dali.ops.CoinFlip(0.5 if cfg.TRAIN.USE_FLIPPED else 0.)
def iter_setup(self):
self.reader.feed_inputs()
def define_graph(self):
# Read inputs from file
inputs = self.reader()
shape = inputs['shape']
bbox = inputs['bbox/data']
label = inputs['bbox/label']
# Decode image
image = self.decode(inputs['image'])
# Augment the color space
if cfg.TRAIN.USE_COLOR_JITTER:
image = self.hsv(
self.brightness_contrast(
image,
brightness=self.twist_rng(),
contrast=self.twist_rng(),
),
saturation=self.twist_rng()
)
# Resize to the target size
target_size = self.resize_rng()
image = self.resize(image, resize_shorter=target_size)
# Normalize and pad to blob shape
apply_flip = self.flip_rng()
image = self.cmn(image, mirror=apply_flip)
image = self.pad(image)
return image, bbox, label, target_size, shape, apply_flip
class Iterator(DALIGenericIterator):
def __init__(self, pipeline):
super(Iterator, self).__init__(pipeline)
@property
def handlers(self):
return ([0], self.copy_handler,), \
([1, 2, 3, 4, 5], self.gt_handler)
def next(self):
(images,), (gt_boxes, ims_info) = self.__next__()
return {'data': images, 'gt_boxes': gt_boxes, 'ims_info': ims_info}
def gt_handler(self, tensors):
def impl(box_list, labels, im_shape, target_size, max_size, flip):
num_images = len(box_list)
im_size_min = np.min(im_shape[:, :2], axis=1).astype('float32')
im_size_max = np.max(im_shape[:, :2], axis=1).astype('float32')
im_scales = target_size / im_size_min
inds = np.where(np.round(im_scales * im_size_max) > max_size)[0]
im_scales[inds] = max_size / im_size_max[inds]
box_list = [box_list[i] * im_scales[i] for i in range(num_images)]
for i in (np.where(flip > 0)[0]):
boxes = box_list[i]
boxes_flipped = box_list[i].copy()
width = im_shape[i, 1] * im_scales[i]
boxes_flipped[:, 0] = width - boxes[:, 2] - 1
boxes_flipped[:, 2] = width - boxes[:, 0] - 1
box_list[i] = boxes_flipped
im_scales = np.expand_dims(im_scales, 1)
batch_inds = [np.ones([e.size, 1]) * i for i, e in enumerate(labels)]
boxes = np.concatenate(box_list)
labels = np.expand_dims(np.concatenate(labels), axis=1)
batch_inds = np.concatenate(batch_inds)
gt_boxes = np.hstack([boxes, labels, batch_inds])
ims_info = np.hstack([im_shape[:, :2] * im_scales, im_scales])
return gt_boxes.astype('float32'), ims_info.astype('float32')
bbox, label, target_size, shape, flip = tensors
shape = shape.as_array()
return impl(
box_list=[bbox.at(i) for i in range(len(shape))],
labels=[label.at(i) for i in range(len(shape))],
im_shape=shape,
target_size=target_size.as_array().squeeze(),
max_size=self._pipe.max_size,
flip=flip.as_array()
)
def new_iterator(source):
with dali.device('cuda', cfg.GPU_ID):
return Iterator(Pipeline(source))
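A hedged usage sketch for the helper above; the dataset path is hypothetical, and the batch keys follow ``Iterator.next()``:

```python
train_source = '/data/coco_train'      # hypothetical KPLRecord path
iterator = new_iterator(train_source)  # builds Pipeline + Iterator on the GPU
batch = iterator.next()
# batch['data']     : decoded, resized, padded NCHW image batch
# batch['gt_boxes'] : float32 rows of [x1, y1, x2, y2, label, batch_index]
# batch['ims_info'] : float32 rows of [scaled_h, scaled_w, im_scale]
```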
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from dragon.vm import dali
from dragon.vm.dali.plugin.pytorch import DALIGenericIterator
from seetadet.core.config import cfg
from seetadet.dali.data_reader import DataReader
class Pipeline(dali.Pipeline):
def __init__(self, source):
super(Pipeline, self).__init__(
batch_size=cfg.TRAIN.IMS_PER_BATCH,
num_threads=cfg.TRAIN.NUM_THREADS,
)
paste_ratio = 1. / cfg.TRAIN.RANDOM_SCALES[0]
mean_values = np.array(cfg.PIXEL_MEANS, 'int64').tolist()
self.target_size = cfg.TRAIN.SCALES[0]
self.reader = DataReader(
path=source,
features=['image', 'bbox/ratio', 'bbox/label'],
pipeline=self,
shard_id=dali.get_distributed_info()[0],
num_shards=dali.get_distributed_info()[1],
shuffle_after_epoch=cfg.TRAIN.USE_SHUFFLE,
shuffle_chunks=cfg.TRAIN.SHUFFLE_CHUNKS,
)
self.decode = dali.ops.ImageDecoder()
self.brightness_contrast = dali.ops.BrightnessContrast()
self.hsv = dali.ops.Hsv()
self.paste = dali.ops.Paste(fill_value=mean_values)
self.slice = dali.ops.Slice()
self.resize = dali.ops.Resize(self.target_size, self.target_size)
self.cmn = dali.ops.CropMirrorNormalize(mean=mean_values, std=[1., 1., 1.])
with dali.device('cpu'):
self.bbox_paste = dali.ops.BBoxPaste()
self.bbox_crop = dali.ops.RandomBBoxCrop()
self.bbox_flip = dali.ops.BbFlip()
self.twist_rng = dali.ops.Uniform([0.6, 1.4])
self.paste_pos = dali.ops.Uniform((0., 1.))
self.paste_ratio = dali.ops.Uniform((0., paste_ratio - 1))
self.flip_rng = dali.ops.CoinFlip(0.5 if cfg.TRAIN.USE_FLIPPED else 0.)
def iter_setup(self):
self.reader.feed_inputs()
def define_graph(self):
# Read inputs from file
inputs = self.reader()
bbox = inputs['bbox/ratio']
label = inputs['bbox/label']
# Decode image
image = self.decode(inputs['image'])
# Augment the color space
image = self.hsv(
self.brightness_contrast(
image,
brightness=self.twist_rng(),
contrast=self.twist_rng(),
), saturation=self.twist_rng()
)
# Expand randomly to get smaller objects
pr = self.paste_ratio() * self.flip_rng() + 1.
px, py = self.paste_pos(), self.paste_pos()
image = self.paste(image, paste_x=px, paste_y=py, ratio=pr)
bbox = self.bbox_paste(bbox, paste_x=px, paste_y=py, ratio=pr)
# Sample RoIs with IoU constraint
crop_begin, crop_size, bbox, label = self.bbox_crop(bbox, label)
image = self.slice(image, crop_begin, crop_size)
# Resize image to a fixed size
image = self.resize(image)
# Normalize
apply_flip = self.flip_rng()
image = self.cmn(image, mirror=apply_flip)
bbox = self.bbox_flip(bbox, horizontal=apply_flip)
return image, bbox, label
class Iterator(DALIGenericIterator):
def __init__(self, pipeline):
super(Iterator, self).__init__(pipeline)
@property
def handlers(self):
return ([0], self.copy_handler,), ([1, 2], self.gt_handler)
def next(self):
(images,), gt_boxes = self.__next__()
return {'data': images, 'gt_boxes': gt_boxes}
def gt_handler(self, tensors):
bbox, label = tensors
num_images = self._pipe.batch_size
boxes = np.concatenate([bbox.at(i) for i in range(num_images)])
boxes[:, 0::2] *= self._pipe.target_size
boxes[:, 1::2] *= self._pipe.target_size
labels = [label.at(i) for i in range(num_images)]
batch_inds = [np.ones_like(e) * i for i, e in enumerate(labels)]
labels, batch_inds = np.concatenate(labels), np.concatenate(batch_inds)
return np.hstack([boxes, labels, batch_inds])
def new_iterator(source):
with dali.device('cuda', cfg.GPU_ID):
return Iterator(Pipeline(source))
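The expand step in ``define_graph`` above reproduces the removed ``SSD.EXPAND`` config: with the new default ``cfg.TRAIN.RANDOM_SCALES = [0.25, 1.0]``, the sampled paste ratio works out as follows (a plain-Python sketch of the graph ops):

```python
import random

upper = 1. / 0.25 - 1.                     # Uniform((0., paste_ratio - 1)) -> [0, 3]
coin = random.random() < 0.5               # CoinFlip(0.5)
pr = random.uniform(0., upper) * coin + 1.
# coin == 0 -> pr == 1.0, image left as-is
# coin == 1 -> pr in [1.0, 4.0], canvas expanded up to 4x,
#              i.e. the old SSD.EXPAND.PROB = 0.5 / MAX_RATIO = 4.0 defaults
```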
...@@ -19,11 +19,11 @@ import sys ...@@ -19,11 +19,11 @@ import sys
import numpy as np import numpy as np
from lib.core.config import cfg from seetadet.core.config import cfg
from lib.pycocotools import mask as mask_tools from seetadet.pycocotools import mask as mask_tools
from lib.pycocotools.coco import COCO from seetadet.pycocotools.coco import COCO
from lib.pycocotools.cocoeval import COCOeval from seetadet.pycocotools.cocoeval import COCOeval
from lib.utils import mask as mask_util from seetadet.utils import mask as mask_util
class COCOEvaluator(object): class COCOEvaluator(object):
......
...@@ -20,12 +20,14 @@ from __future__ import print_function ...@@ -20,12 +20,14 @@ from __future__ import print_function
import os import os
import uuid import uuid
from lib.core.config import cfg from seetadet.core.config import cfg
from lib.datasets.coco_evaluator import COCOEvaluator from seetadet.datasets.coco_evaluator import COCOEvaluator
from lib.datasets.voc_evaluator import VOCEvaluator from seetadet.datasets.voc_evaluator import VOCEvaluator
class imdb(object): class Dataset(object):
"""The base dataset class."""
def __init__(self, source): def __init__(self, source):
self._source = source self._source = source
self._num_images = 0 self._num_images = 0
...@@ -51,6 +53,10 @@ class imdb(object): ...@@ -51,6 +53,10 @@ class imdb(object):
return self._class_to_ind return self._class_to_ind
@property @property
def cls(self):
return type(self)
@property
def comp_id(self): def comp_id(self):
return '_' + self._salt if self.config['use_salt'] else '' return '_' + self._salt if self.config['use_salt'] else ''
......
...@@ -16,7 +16,7 @@ from __future__ import print_function ...@@ -16,7 +16,7 @@ from __future__ import print_function
import cv2 import cv2
import numpy as np import numpy as np
from lib.pycocotools import mask_utils from seetadet.pycocotools import mask_utils
class Example(object): class Example(object):
......
...@@ -18,27 +18,29 @@ from __future__ import division ...@@ -18,27 +18,29 @@ from __future__ import division
from __future__ import print_function from __future__ import print_function
import os import os
from lib.datasets.taas import TaaS from seetadet.datasets import kpl_record
# TaaS DataSet def get_dataset(name):
_GLOBAL_DATA_SETS = {'taas': lambda source: TaaS(source)} """Get a dataset by name."""
keys = name.split('://')
def get_imdb(name):
"""Get an imdb (image database) by name."""
keys = name.split(':')
if len(keys) >= 2:	if len(keys) == 2:
cls, source = keys[0], ':'.join(keys[1:]) cls, source = keys
if cls not in _GLOBAL_DATA_SETS: if cls not in _GLOBAL_REGISTERED_DATASET:
raise KeyError('Unknown DataSet: {}'.format(cls)) raise KeyError('Unknown dataset:', cls)
return _GLOBAL_DATA_SETS[cls](source) return _GLOBAL_REGISTERED_DATASET[cls](source)
elif os.path.exists(name): elif os.path.exists(name):
return _GLOBAL_DATA_SETS['taas'](name) return _GLOBAL_REGISTERED_DATASET['default'](name)
else: else:
raise ValueError('Illegal Database: {}' + name) raise ValueError('Illegal dataset:', name)
def list_dataset():
"""List all registered dataset."""
return _GLOBAL_REGISTERED_DATASET.keys()
def list_imdbs(): _GLOBAL_REGISTERED_DATASET = {
"""List all registered imdbs.""" 'default': lambda source:
return _GLOBAL_DATA_SETS.keys() kpl_record.KPLRecordDataset(source),
}
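How the new factory resolves a dataset spec (the paths below are hypothetical):

```python
ds1 = get_dataset('default:///data/voc_trainval')  # explicit '<cls>://<source>'
ds2 = get_dataset('/data/voc_trainval')            # bare path; must exist on disk
print(list_dataset())                              # dict_keys(['default'])
```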
...@@ -21,23 +21,26 @@ import os ...@@ -21,23 +21,26 @@ import os
import dragon import dragon
from lib.core.config import cfg from seetadet.core.config import cfg
from lib.datasets.imdb import imdb from seetadet.datasets.dataset import Dataset
class TaaS(imdb): class KPLRecordDataset(Dataset):
def __init__(self, source): def __init__(self, source):
imdb.__init__(self, source) super(KPLRecordDataset, self).__init__(source)
self._dataset = dragon.io.SeetaRecordDataset self._num_images = self.cls(self.source).size
self._num_images = self._dataset(self.source).size
@property
def cls(self):
return dragon.io.KPLRecordDataset
def dump_detections(self, all_boxes, output_dir): def dump_detections(self, all_boxes, output_dir):
dataset = self._dataset(self.source) dataset = self.cls(self.source)
for file in ('data.data', 'data.index', 'data.meta'): for file in ('data.data', 'data.index', 'data.meta'):
file = os.path.join(output_dir, file) file = os.path.join(output_dir, file)
if os.path.exists(file): if os.path.exists(file):
os.remove(file) os.remove(file)
writer = dragon.io.SeetaRecordWriter(output_dir, dataset.protocol) writer = dragon.io.KPLRecordWriter(output_dir, dataset.protocol)
for i in range(len(dataset)): for i in range(len(dataset)):
example = dataset.get() example = dataset.get()
example['object'] = [] example['object'] = []
......
...@@ -20,11 +20,11 @@ from __future__ import print_function ...@@ -20,11 +20,11 @@ from __future__ import print_function
import cv2 import cv2
import numpy as np import numpy as np
from lib.core.config import cfg from seetadet.core.config import cfg
from lib.pycocotools import mask_utils from seetadet.pycocotools import mask_utils
from lib.utils import boxes as box_util from seetadet.utils import boxes as box_util
from lib.utils.framework import pickle from seetadet.utils.env import pickle
from lib.utils.mask import mask_overlap from seetadet.utils.mask import mask_overlap
def voc_ap(rec, prec, use_07_metric=False): def voc_ap(rec, prec, use_07_metric=False):
......
...@@ -16,8 +16,8 @@ from __future__ import print_function ...@@ -16,8 +16,8 @@ from __future__ import print_function
import os import os
import numpy as np import numpy as np
from lib.datasets import voc_eval from seetadet.datasets import voc_eval
from lib.utils.framework import pickle from seetadet.utils.env import pickle
class VOCEvaluator(object): class VOCEvaluator(object):
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# Backbones
import seetadet.modeling.airnet
import seetadet.modeling.mobilenet
import seetadet.modeling.resnet
import seetadet.modeling.vgg
# Custom modules
from seetadet.modeling.fast_rcnn import FastRCNN
from seetadet.modeling.fpn import FPN
from seetadet.modeling.mask_rcnn import MaskRCNN
from seetadet.modeling.retinanet import RetinaNet
from seetadet.modeling.rpn import RPN
from seetadet.modeling.ssd import SSD
...@@ -15,17 +15,18 @@ from __future__ import print_function ...@@ -15,17 +15,18 @@ from __future__ import print_function
import dragon.vm.torch as torch import dragon.vm.torch as torch
from lib.modules import init from seetadet.core.registry import backbones
from lib.modules import nn from seetadet.modules import init
from seetadet.modules import nn
class WideResBlock(nn.Module): class WideResBlock(nn.Module):
def __init__(self, dim_in, dim_out, stride=1, downsample=None): def __init__(self, dim_in, dim_out, stride=1, downsample=None):
super(WideResBlock, self).__init__() super(WideResBlock, self).__init__()
self.conv1 = nn.Conv3x3(dim_in, dim_out, stride) self.conv1 = nn.Conv3x3(dim_in, dim_out, stride)
self.bn1 = nn.Affine(dim_out) self.bn1 = nn.FrozenAffine(dim_out)
self.conv2 = nn.Conv3x3(dim_out, dim_out) self.conv2 = nn.Conv3x3(dim_out, dim_out)
self.bn2 = nn.Affine(dim_out) self.bn2 = nn.FrozenAffine(dim_out)
self.downsample = downsample self.downsample = downsample
self.relu = nn.ReLU(inplace=True) self.relu = nn.ReLU(inplace=True)
...@@ -51,15 +52,15 @@ class InceptionBlock(nn.Module): ...@@ -51,15 +52,15 @@ class InceptionBlock(nn.Module):
def __init__(self, dim_in, dim_out): def __init__(self, dim_in, dim_out):
super(InceptionBlock, self).__init__() super(InceptionBlock, self).__init__()
self.conv1 = nn.Conv1x1(dim_in, dim_out) self.conv1 = nn.Conv1x1(dim_in, dim_out)
self.bn1 = nn.Affine(dim_out) self.bn1 = nn.FrozenAffine(dim_out)
self.conv2 = nn.Conv3x3(dim_out, dim_out // 2) self.conv2 = nn.Conv3x3(dim_out, dim_out // 2)
self.bn2 = nn.Affine(dim_out // 2) self.bn2 = nn.FrozenAffine(dim_out // 2)
self.conv3a = nn.Conv3x3(dim_out // 2, dim_out) self.conv3a = nn.Conv3x3(dim_out // 2, dim_out)
self.bn3a = nn.Affine(dim_out) self.bn3a = nn.FrozenAffine(dim_out)
self.conv3b = nn.Conv3x3(dim_out, dim_out) self.conv3b = nn.Conv3x3(dim_out, dim_out)
self.bn3b = nn.Affine(dim_out) self.bn3b = nn.FrozenAffine(dim_out)
self.conv4 = nn.Conv3x3(dim_out * 3, dim_out) self.conv4 = nn.Conv3x3(dim_out * 3, dim_out)
self.bn4 = nn.Affine(dim_out) self.bn4 = nn.FrozenAffine(dim_out)
self.relu = nn.ReLU(inplace=True) self.relu = nn.ReLU(inplace=True)
def forward(self, x): def forward(self, x):
...@@ -103,7 +104,7 @@ class AirNet(nn.Module): ...@@ -103,7 +104,7 @@ class AirNet(nn.Module):
padding=3, padding=3,
bias=False, bias=False,
) )
self.bn1 = nn.Affine(self.dim_in) self.bn1 = nn.FrozenAffine(self.dim_in)
self.relu = nn.ReLU(inplace=True) self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d( self.maxpool = nn.MaxPool2d(
kernel_size=2, kernel_size=2,
...@@ -127,7 +128,7 @@ class AirNet(nn.Module): ...@@ -127,7 +128,7 @@ class AirNet(nn.Module):
def make_blocks(self, dim_out, blocks, stride=1): def make_blocks(self, dim_out, blocks, stride=1):
downsample = nn.Sequential( downsample = nn.Sequential(
nn.Conv1x1(self.dim_in, dim_out, stride=stride), nn.Conv1x1(self.dim_in, dim_out, stride=stride),
nn.Affine(dim_out), nn.FrozenAffine(dim_out),
) )
layers = [WideResBlock(self.dim_in, dim_out, stride, downsample)] layers = [WideResBlock(self.dim_in, dim_out, stride, downsample)]
self.dim_in = dim_out self.dim_in = dim_out
...@@ -164,13 +165,7 @@ def airnet(num_stages): ...@@ -164,13 +165,7 @@ def airnet(num_stages):
return AirNet(blocks, num_stages) return AirNet(blocks, num_stages)
def make_airnet_(): return airnet(5) backbones.register('airnet', func=airnet, num_stages=5)
backbones.register('airnet_3b', func=airnet, num_stages=3)
backbones.register('airnet_4b', func=airnet, num_stages=4)
def make_airnet_3b(): return airnet(3) backbones.register('airnet_5b', func=airnet, num_stages=5)
def make_airnet_4b(): return airnet(4)
def make_airnet_5b(): return airnet(5)
...@@ -17,17 +17,12 @@ import collections ...@@ -17,17 +17,12 @@ import collections
import importlib import importlib
import dragon.vm.torch as torch import dragon.vm.torch as torch
from lib.core.config import cfg from seetadet import modeling as models
from lib.modeling import FPN from seetadet.core.config import cfg
from lib.modeling import RPN from seetadet.core.registry import backbones
from lib.modeling import FastRCNN from seetadet.modules import nn
from lib.modeling import MaskRCNN from seetadet.modules import vision
from lib.modeling import RetinaNet from seetadet.utils import logger
from lib.modeling import SSD
from lib.modeling.factory import get_body_func
from lib.modules import nn
from lib.modules import vision
from lib.utils import logger
class Detector(nn.Module): class Detector(nn.Module):
...@@ -46,18 +41,17 @@ class Detector(nn.Module): ...@@ -46,18 +41,17 @@ class Detector(nn.Module):
# + DataLoader # + DataLoader
self.data_loader_cls = importlib.import_module( self.data_loader_cls = importlib.import_module(
'lib.{}'.format(model)).DataLoader 'seetadet.algo.{}'.format(model)).DataLoader
self.bootstrap = vision.Bootstrap() self.bootstrap = vision.Bootstrap()
# + FeatureExtractor # + FeatureExtractor
self.body = get_body_func(body)() self.body = backbones.get(body)()
feature_dims = self.body.feature_dims feature_dims = self.body.feature_dims
# + FeatureEnhancer # + FeatureEnhancer
if 'fpn' in modules: if 'fpn' in modules:
self.fpn = FPN(feature_dims) self.fpn = models.FPN(feature_dims)
feature_dims = self.fpn.feature_dims feature_dims = self.fpn.feature_dims
elif 'mbox' in modules: elif 'mbox' in modules:
pass # Placeholder pass # Placeholder
else: else:
...@@ -65,17 +59,17 @@ class Detector(nn.Module): ...@@ -65,17 +59,17 @@ class Detector(nn.Module):
# + Detection Modules # + Detection Modules
if 'rcnn' in model: if 'rcnn' in model:
self.rpn = RPN(feature_dims[0]) self.rpn = models.RPN(feature_dims[0])
if 'faster' in model: if 'faster' in model:
self.rcnn = FastRCNN(feature_dims[0]) self.rcnn = models.FastRCNN(feature_dims[0])
elif 'mask' in model: elif 'mask' in model:
self.rcnn = MaskRCNN(feature_dims[0]) self.rcnn = models.MaskRCNN(feature_dims[0])
if 'retinanet' in model: if 'retinanet' in model:
self.retinanet = RetinaNet(feature_dims[0]) self.retinanet = models.RetinaNet(feature_dims[0])
if 'ssd' in model: if 'ssd' in model:
self.ssd = SSD(feature_dims) self.ssd = models.SSD(feature_dims)
def load_weights(self, weights): def load_weights(self, weights):
"""Load the state dict of this detector. """Load the state dict of this detector.
...@@ -171,13 +165,11 @@ class Detector(nn.Module): ...@@ -171,13 +165,11 @@ class Detector(nn.Module):
return outputs return outputs
def optimize_for_inference(self): def optimize_for_inference(self):
"""Optimize the graph for the inference. """Optimize the graph for the inference."""
It usually involves the removing of BN or Affine. ###################################
""" # Merge Affine into Convolution #
################################## ###################################
# Merge Affine into Convolution #
##################################
last_module = None last_module = None
for e in self.modules(): for e in self.modules():
if isinstance(e, nn.Affine) and \ if isinstance(e, nn.Affine) and \
...@@ -195,7 +187,7 @@ class Detector(nn.Module): ...@@ -195,7 +187,7 @@ class Detector(nn.Module):
last_module = None last_module = None
for e in self.modules(): for e in self.modules():
if isinstance(e, nn.BatchNorm2d) and \ if isinstance(e, nn.BatchNorm2d) and \
nn.is_conv2d(last_module): isinstance(last_module, nn.Conv2d):
if last_module.bias is None: if last_module.bias is None:
delattr(last_module, 'bias') delattr(last_module, 'bias')
e.forward = lambda x: x e.forward = lambda x: x
......
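The ``optimize_for_inference`` pass above folds a frozen Affine or BatchNorm into the preceding convolution. A NumPy sketch of the standard fold it relies on (not the dragon implementation itself):

```python
import numpy as np

def fold_bn(weight, bias, gamma, beta, mean, var, eps=1e-5):
    """Fold conv -> BN into a single conv with adjusted weight/bias."""
    scale = gamma / np.sqrt(var + eps)            # per output channel
    weight = weight * scale[:, None, None, None]  # (C_out, C_in, kH, kW)
    bias = (bias - mean) * scale + beta
    return weight, bias

# A frozen Affine carries no running stats (mean = 0, var = 1), so the
# fold reduces to weight * gamma and bias * gamma + beta.
```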
...@@ -18,12 +18,12 @@ import functools ...@@ -18,12 +18,12 @@ import functools
import dragon.vm.torch as torch import dragon.vm.torch as torch
from lib import faster_rcnn from seetadet.algo import faster_rcnn
from lib.core.config import cfg from seetadet.core.config import cfg
from lib.modules import det from seetadet.modules import det
from lib.modules import init from seetadet.modules import init
from lib.modules import nn from seetadet.modules import nn
from lib.modules import vision from seetadet.modules import vision
class FastRCNN(nn.Module): class FastRCNN(nn.Module):
...@@ -54,7 +54,11 @@ class FastRCNN(nn.Module): ...@@ -54,7 +54,11 @@ class FastRCNN(nn.Module):
'RoIAlign': vision.roi_align 'RoIAlign': vision.roi_align
}[cfg.FRCNN.ROI_XFORM_METHOD], size=cfg.FRCNN.ROI_XFORM_RESOLUTION) }[cfg.FRCNN.ROI_XFORM_METHOD], size=cfg.FRCNN.ROI_XFORM_RESOLUTION)
self.cls_loss = nn.CrossEntropyLoss() self.cls_loss = nn.CrossEntropyLoss()
self.bbox_loss = nn.SmoothL1Loss() if 'IOU' in cfg.MODEL.REG_LOSS_TYPE.upper():
self.bbox_loss = nn.IoULoss(
delta_weights=cfg.BBOX_REG_WEIGHTS)
else:
self.bbox_loss = nn.SmoothL1Loss(reduction='sum')
# Compute spatial scales according to strides # Compute spatial scales according to strides
self.spatial_scales = [ self.spatial_scales = [
1. / (2 ** lvl) 1. / (2 ** lvl)
...@@ -124,15 +128,22 @@ class FastRCNN(nn.Module): ...@@ -124,15 +128,22 @@ class FastRCNN(nn.Module):
if self.training: if self.training:
# Compute rcnn losses # Compute rcnn losses
bbox_pred = outputs['bbox_pred'].view(0, -1, 4) \
.index_select((0, 1), self.data['bbox_indices'])
bbox_loss_weight = \
cfg.MODEL.REG_LOSS_WEIGHT / (
roi_features.shape[0] if isinstance(
self.bbox_loss, nn.SmoothL1Loss
) else 1.
)
outputs.update(collections.OrderedDict([ outputs.update(collections.OrderedDict([
('cls_loss', self.cls_loss( ('cls_loss', self.cls_loss(
cls_score, self.data['labels'])), cls_score, self.data['labels'])),
('bbox_loss', self.bbox_loss( ('bbox_loss', self.bbox_loss(
outputs['bbox_pred'], bbox_pred,
self.data['bbox_targets'], self.data['bbox_targets'],
self.data['bbox_inside_weights'], self.data['bbox_anchors'],
self.data['bbox_outside_weights'], ) * bbox_loss_weight),
)),
])) ]))
else: else:
# Return the rois to decode the refine boxes # Return the rois to decode the refine boxes
......
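Note the normalization in the loss block above: ``SmoothL1Loss(reduction='sum')`` is divided by the number of sampled RoIs, so ``REG_LOSS_WEIGHT`` scales a per-RoI mean, while the IoU loss is left with divisor 1 on the assumption that it normalizes internally. Illustrative numbers:

```python
num_rois = 128                  # roi_features.shape[0] (cfg.TRAIN.BATCH_SIZE)
reg_loss_weight = 1.            # cfg.MODEL.REG_LOSS_WEIGHT
smooth_l1_weight = reg_loss_weight / num_rois  # sum over RoIs -> per-RoI mean
iou_weight = reg_loss_weight / 1.              # IoU loss left unscaled
```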
...@@ -13,11 +13,11 @@ from __future__ import absolute_import ...@@ -13,11 +13,11 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
import dragon.vm.torch as torch from dragon.vm.torch.nn import functional as nn_funcs
from lib.core.config import cfg from seetadet.core.config import cfg
from lib.modules import init from seetadet.modules import init
from lib.modules import nn from seetadet.modules import nn
HIGHEST_BACKBONE_LVL = 5 # E.g., "conv5"-like level HIGHEST_BACKBONE_LVL = 5 # E.g., "conv5"-like level
...@@ -36,7 +36,7 @@ class FPN(nn.Module): ...@@ -36,7 +36,7 @@ class FPN(nn.Module):
self.P.append(nn.Conv3x3(dim, dim, bias=True)) self.P.append(nn.Conv3x3(dim, dim, bias=True))
if 'rcnn' in cfg.MODEL.TYPE: if 'rcnn' in cfg.MODEL.TYPE:
self.apply_func = self.apply_on_rcnn self.apply_func = self.apply_on_rcnn
self.maxpool = nn.MaxPool2d(1, 2, ceil_mode=True) self.maxpool = nn.MaxPool2d(kernel_size=1, stride=2)
else: else:
self.apply_func = self.apply_on_generic self.apply_func = self.apply_on_generic
self.relu = nn.ReLU(inplace=False) self.relu = nn.ReLU(inplace=False)
...@@ -44,6 +44,7 @@ class FPN(nn.Module): ...@@ -44,6 +44,7 @@ class FPN(nn.Module):
dim_in = feature_dims[-1] if lvl == HIGHEST_BACKBONE_LVL + 1 else dim dim_in = feature_dims[-1] if lvl == HIGHEST_BACKBONE_LVL + 1 else dim
self.P.append(nn.Conv3x3(dim_in, dim, stride=2, bias=True)) self.P.append(nn.Conv3x3(dim_in, dim, stride=2, bias=True))
self.feature_dims = [dim] self.feature_dims = [dim]
self.coarsest_stride = cfg.MODEL.COARSEST_STRIDE
self.reset_parameters() self.reset_parameters()
def reset_parameters(self): def reset_parameters(self):
...@@ -56,14 +57,18 @@ class FPN(nn.Module): ...@@ -56,14 +57,18 @@ class FPN(nn.Module):
fpn_input = self.C[-1](features[-1]) fpn_input = self.C[-1](features[-1])
min_lvl, max_lvl = cfg.FPN.RPN_MIN_LEVEL, cfg.FPN.RPN_MAX_LEVEL min_lvl, max_lvl = cfg.FPN.RPN_MIN_LEVEL, cfg.FPN.RPN_MAX_LEVEL
outputs = [self.P[HIGHEST_BACKBONE_LVL - min_lvl](fpn_input)] outputs = [self.P[HIGHEST_BACKBONE_LVL - min_lvl](fpn_input)]
# Apply MaxPool for higher features # Apply max pool for higher features
for i in range(HIGHEST_BACKBONE_LVL + 1, max_lvl + 1): for i in range(HIGHEST_BACKBONE_LVL + 1, max_lvl + 1):
outputs.append(self.maxpool(outputs[-1])) outputs.append(self.maxpool(outputs[-1]))
# Build Pyramids between [MIN_LEVEL, HIGHEST_LEVEL] # Build pyramids between [MIN_LEVEL, HIGHEST_LEVEL]
for i in range(HIGHEST_BACKBONE_LVL - 1, min_lvl - 1, -1): for i in range(HIGHEST_BACKBONE_LVL - 1, min_lvl - 1, -1):
lateral_output = self.C[i - min_lvl](features[i - 1]) lateral_output = self.C[i - min_lvl](features[i - 1])
upscale_output = torch.vision.ops.nn_resize( if self.coarsest_stride > 0:
fpn_input, dsize=None, fx=2., fy=2.) upscale_output = nn_funcs.upsample(
fpn_input, scale_factor=2)
else:
upscale_output = nn_funcs.upsample(
fpn_input, size=lateral_output.shape[2:])
fpn_input = lateral_output.__iadd__(upscale_output) fpn_input = lateral_output.__iadd__(upscale_output)
outputs.insert(0, self.P[i - min_lvl](fpn_input)) outputs.insert(0, self.P[i - min_lvl](fpn_input))
return outputs return outputs
...@@ -78,11 +83,15 @@ class FPN(nn.Module): ...@@ -78,11 +83,15 @@ class FPN(nn.Module):
outputs.append(self.P[i - min_lvl](extra_input)) outputs.append(self.P[i - min_lvl](extra_input))
if i != max_lvl: if i != max_lvl:
extra_input = self.relu(outputs[-1]) extra_input = self.relu(outputs[-1])
# Build Pyramids between [MIN_LEVEL, HIGHEST_LEVEL] # Build pyramids between [MIN_LEVEL, HIGHEST_LEVEL]
for i in range(HIGHEST_BACKBONE_LVL - 1, min_lvl - 1, -1): for i in range(HIGHEST_BACKBONE_LVL - 1, min_lvl - 1, -1):
lateral_output = self.C[i - min_lvl](features[i - 1]) lateral_output = self.C[i - min_lvl](features[i - 1])
upscale_output = torch.vision.ops.nn_resize( if self.coarsest_stride > 0:
fpn_input, dsize=None, fx=2., fy=2.) upscale_output = nn_funcs.upsample(
fpn_input, scale_factor=2)
else:
upscale_output = nn_funcs.upsample(
fpn_input, size=lateral_output.shape[2:])
fpn_input = lateral_output.__iadd__(upscale_output) fpn_input = lateral_output.__iadd__(upscale_output)
outputs.insert(0, self.P[i - min_lvl](fpn_input)) outputs.insert(0, self.P[i - min_lvl](fpn_input))
return outputs return outputs
......
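The branch on ``coarsest_stride`` in the FPN above exists because ``scale_factor=2`` only lines up when the input was padded so that every pyramid level halves exactly; otherwise the upsample must target the lateral feature's exact size. A shape-only illustration (stride-2 stages of a typical ResNet produce ceil(h/2) per level):

```python
# Unpadded 299-pixel input:
h = 299
print([(h + 2 ** k - 1) // 2 ** k for k in range(2, 6)])  # [75, 38, 19, 10]
# Upsampling 10 by 2x gives 20 != 19, so size= must be used.

# Padded to COARSEST_STRIDE = 32, every level halves exactly:
h = 320
print([h // 2 ** k for k in range(2, 6)])  # [80, 40, 20, 10]
# Here scale_factor=2 aligns with the lateral connection at every level.
```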
...@@ -18,12 +18,12 @@ import functools ...@@ -18,12 +18,12 @@ import functools
import dragon.vm.torch as torch import dragon.vm.torch as torch
from lib import mask_rcnn from seetadet.algo import mask_rcnn
from lib.core.config import cfg from seetadet.core.config import cfg
from lib.modules import det from seetadet.modules import det
from lib.modules import init from seetadet.modules import init
from lib.modules import nn from seetadet.modules import nn
from lib.modules import vision from seetadet.modules import vision
class MaskRCNN(nn.Module): class MaskRCNN(nn.Module):
...@@ -65,7 +65,7 @@ class MaskRCNN(nn.Module): ...@@ -65,7 +65,7 @@ class MaskRCNN(nn.Module):
'RoIAlign': vision.roi_align, 'RoIAlign': vision.roi_align,
}[cfg.MRCNN.ROI_XFORM_METHOD], size=cfg.MRCNN.ROI_XFORM_RESOLUTION) }[cfg.MRCNN.ROI_XFORM_METHOD], size=cfg.MRCNN.ROI_XFORM_RESOLUTION)
self.cls_loss = nn.CrossEntropyLoss() self.cls_loss = nn.CrossEntropyLoss()
self.bbox_loss = nn.SmoothL1Loss() self.bbox_loss = nn.SmoothL1Loss(reduction='sum')
self.mask_loss = nn.BCEWithLogitsLoss() self.mask_loss = nn.BCEWithLogitsLoss()
# Compute spatial scales according to strides # Compute spatial scales according to strides
self.spatial_scales = [ self.spatial_scales = [
...@@ -146,15 +146,14 @@ class MaskRCNN(nn.Module): ...@@ -146,15 +146,14 @@ class MaskRCNN(nn.Module):
if self.training: if self.training:
# Compute the loss of bbox branch # Compute the loss of bbox branch
bbox_pred = outputs['bbox_pred'].view(0, -1, 4) \
.index_select((0, 1), self.data['bbox_indices'])
outputs.update(collections.OrderedDict([ outputs.update(collections.OrderedDict([
('cls_loss', self.cls_loss( ('cls_loss', self.cls_loss(
cls_score, self.data['labels'])), cls_score, self.data['labels'])),
('bbox_loss', self.bbox_loss( ('bbox_loss', self.bbox_loss(
outputs['bbox_pred'], bbox_pred, self.data['bbox_targets'],
self.data['bbox_targets'], ) / roi_features.shape[0]),
self.data['bbox_inside_weights'],
self.data['bbox_outside_weights'],
)),
])) ]))
# Compute the loss of mask branch # Compute the loss of mask branch
mask_score = self.get_mask_score( mask_score = self.get_mask_score(
...@@ -171,7 +170,7 @@ class MaskRCNN(nn.Module): ...@@ -171,7 +170,7 @@ class MaskRCNN(nn.Module):
outputs['rois'] = self.data['rois'][0] outputs['rois'] = self.data['rois'][0]
# Return the classification prob # Return the classification prob
outputs['cls_prob'] = self.softmax(cls_score) outputs['cls_prob'] = self.softmax(cls_score)
# Set a callback to decode mask from refine RoIs # Set a callback to decode mask from refined RoIs
self.compute_mask_score = \ self.compute_mask_score = \
functools.partial( functools.partial(
self.get_mask_score, self.get_mask_score,
......
...@@ -17,17 +17,18 @@ import functools ...@@ -17,17 +17,18 @@ import functools
import dragon.vm.torch as torch import dragon.vm.torch as torch
from lib.core.config import cfg from seetadet.core.config import cfg
from lib.modules import init from seetadet.core.registry import backbones
from lib.modules import nn from seetadet.modules import init
from lib.modules import vision from seetadet.modules import nn
from seetadet.modules import vision
def conv_triplet(dim_in, dim_out): def conv_triplet(dim_in, dim_out):
"""1x1 convolution + BN + ReLU.""" """1x1 convolution + BN + ReLU."""
return [ return [
nn.Conv2d(dim_in, dim_out, 1, bias=False), nn.Conv2d(dim_in, dim_out, 1, bias=False),
nn.Affine(dim_out), nn.FrozenAffine(dim_out),
nn.ReLU(True), nn.ReLU(True),
] ]
...@@ -42,10 +43,10 @@ def conv_quintet(dim_in, dim_out, ks, stride): ...@@ -42,10 +43,10 @@ def conv_quintet(dim_in, dim_out, ks, stride):
padding=ks // 2, padding=ks // 2,
bias=False, bias=False,
), ),
nn.Affine(dim_in), nn.FrozenAffine(dim_in),
nn.ReLU(True), nn.ReLU(True),
nn.Conv1x1(dim_in, dim_out), nn.Conv1x1(dim_in, dim_out),
nn.Affine(dim_out), nn.FrozenAffine(dim_out),
] ]
...@@ -76,7 +77,7 @@ def Stem(dim_out, stride=1): ...@@ -76,7 +77,7 @@ def Stem(dim_out, stride=1):
padding=1, padding=1,
bias=False, bias=False,
), ),
nn.Affine(dim_out), nn.FrozenAffine(dim_out),
nn.ReLU(True), nn.ReLU(True),
) )
...@@ -197,7 +198,8 @@ class NASMobileNet(nn.Module): ...@@ -197,7 +198,8 @@ class NASMobileNet(nn.Module):
return outputs return outputs
def make_mobilenet_a1(): @backbones.register('mobilenet_a1')
def mobilenet_a1():
return NASMobileNet([ return NASMobileNet([
4, 6, 6, 6, 4, 6, 6, 6,
3, 3, 4, 6, 3, 3, 4, 6,
...@@ -207,7 +209,8 @@ def make_mobilenet_a1(): ...@@ -207,7 +209,8 @@ def make_mobilenet_a1():
], Setting.PROXYLESS_MOBILE) ], Setting.PROXYLESS_MOBILE)
def make_mobilenet_v2(): @backbones.register('mobilenet_v2')
def mobilenet_v2():
return NASMobileNet([ return NASMobileNet([
1, 1, 1, 1,
1, 1, 1, 1, 1, 1,
......
...@@ -19,9 +19,10 @@ from __future__ import print_function ...@@ -19,9 +19,10 @@ from __future__ import print_function
import dragon.vm.torch as torch import dragon.vm.torch as torch
from lib.core.config import cfg from seetadet.core.config import cfg
from lib.modules import nn from seetadet.core.registry import backbones
from lib.modules import init from seetadet.modules import nn
from seetadet.modules import init
class BasicBlock(nn.Module): class BasicBlock(nn.Module):
...@@ -35,10 +36,10 @@ class BasicBlock(nn.Module): ...@@ -35,10 +36,10 @@ class BasicBlock(nn.Module):
): ):
super(BasicBlock, self).__init__() super(BasicBlock, self).__init__()
self.conv1 = nn.Conv3x3(dim_in, dim_out, stride) self.conv1 = nn.Conv3x3(dim_in, dim_out, stride)
self.bn1 = nn.Affine(dim_out) self.bn1 = nn.FrozenAffine(dim_out)
self.relu = torch.nn.ReLU(inplace=True) self.relu = torch.nn.ReLU(inplace=True)
self.conv2 = nn.Conv3x3(dim_out, dim_out) self.conv2 = nn.Conv3x3(dim_out, dim_out)
self.bn2 = nn.Affine(dim_out) self.bn2 = nn.FrozenAffine(dim_out)
self.downsample = downsample self.downsample = downsample
self.dropblock = dropblock self.dropblock = dropblock
...@@ -83,11 +84,11 @@ class Bottleneck(torch.nn.Module): ...@@ -83,11 +84,11 @@ class Bottleneck(torch.nn.Module):
super(Bottleneck, self).__init__() super(Bottleneck, self).__init__()
dim = int(dim_out * self.contraction) dim = int(dim_out * self.contraction)
self.conv1 = nn.Conv1x1(dim_in, dim) self.conv1 = nn.Conv1x1(dim_in, dim)
self.bn1 = nn.Affine(dim) self.bn1 = nn.FrozenAffine(dim)
self.conv2 = nn.Conv3x3(dim, dim, stride=stride) self.conv2 = nn.Conv3x3(dim, dim, stride=stride)
self.bn2 = nn.Affine(dim) self.bn2 = nn.FrozenAffine(dim)
self.conv3 = nn.Conv1x1(dim, dim_out) self.conv3 = nn.Conv1x1(dim, dim_out)
self.bn3 = nn.Affine(dim_out) self.bn3 = nn.FrozenAffine(dim_out)
self.relu = torch.nn.ReLU(inplace=True) self.relu = torch.nn.ReLU(inplace=True)
self.downsample = downsample self.downsample = downsample
self.dropblock = dropblock self.dropblock = dropblock
...@@ -132,7 +133,7 @@ class ResNet(torch.nn.Module): ...@@ -132,7 +133,7 @@ class ResNet(torch.nn.Module):
padding=3, padding=3,
bias=False, bias=False,
) )
self.bn1 = nn.Affine(self.dim_in) self.bn1 = nn.FrozenAffine(self.dim_in)
self.relu = torch.nn.ReLU(inplace=True) self.relu = torch.nn.ReLU(inplace=True)
self.maxpool = torch.nn.MaxPool2d( self.maxpool = torch.nn.MaxPool2d(
kernel_size=3, kernel_size=3,
...@@ -181,7 +182,7 @@ class ResNet(torch.nn.Module): ...@@ -181,7 +182,7 @@ class ResNet(torch.nn.Module):
if stride != 1 or self.dim_in != dim_out: if stride != 1 or self.dim_in != dim_out:
downsample = nn.Sequential( downsample = nn.Sequential(
nn.Conv1x1(self.dim_in, dim_out, stride=stride), nn.Conv1x1(self.dim_in, dim_out, stride=stride),
nn.Affine(dim_out), nn.FrozenAffine(dim_out),
) )
layers = [block(self.dim_in, dim_out, stride, downsample, dropblock)] layers = [block(self.dim_in, dim_out, stride, downsample, dropblock)]
self.dim_in = dim_out self.dim_in = dim_out
...@@ -194,11 +195,17 @@ class ResNet(torch.nn.Module): ...@@ -194,11 +195,17 @@ class ResNet(torch.nn.Module):
x = self.bn1(x) x = self.bn1(x)
x = self.relu(x) x = self.relu(x)
x = self.maxpool(x) x = self.maxpool(x)
outputs = [x] outputs = [x]
outputs += [self.layer1(outputs[-1])] outputs += [self.layer1(outputs[-1])]
outputs += [self.layer2(outputs[-1])] outputs += [self.layer2(outputs[-1])]
outputs += [self.layer3(outputs[-1])] outputs += [self.layer3(outputs[-1])]
outputs += [self.layer4(outputs[-1])] outputs += [self.layer4(outputs[-1])]
if self.training:
# Hold the frozen outputs if necessary
self.last_outputs = outputs
return outputs return outputs
...@@ -225,16 +232,8 @@ def resnet(depth): ...@@ -225,16 +232,8 @@ def resnet(depth):
return ResNet(block, units, filters) return ResNet(block, units, filters)
def make_resnet_18(): return resnet(18) backbones.register(['res18', 'resnet18', 'resnet_18'], func=resnet, depth=18)
backbones.register(['res34', 'resnet34', 'resnet_34'], func=resnet, depth=34)
backbones.register(['res50', 'resnet50', 'resnet_50'], func=resnet, depth=50)
def make_resnet_34(): return resnet(34) backbones.register(['res101', 'resnet101', 'resnet_101'], func=resnet, depth=101)
backbones.register(['res152', 'resnet152', 'resnet_152'], func=resnet, depth=152)
def make_resnet_50(): return resnet(50)
def make_resnet_101(): return resnet(101)
def make_resnet_152(): return resnet(152)
...@@ -17,11 +17,11 @@ import collections ...@@ -17,11 +17,11 @@ import collections
import math import math
import dragon.vm.torch as torch import dragon.vm.torch as torch
from lib import retinanet from seetadet.algo import retinanet
from lib.core.config import cfg from seetadet.core.config import cfg
from lib.modules import det from seetadet.modules import det
from lib.modules import init from seetadet.modules import init
from lib.modules import nn from seetadet.modules import nn
class RetinaNet(nn.Module): class RetinaNet(nn.Module):
...@@ -56,7 +56,11 @@ class RetinaNet(nn.Module): ...@@ -56,7 +56,11 @@ class RetinaNet(nn.Module):
self.anchor_target = retinanet.AnchorTarget() self.anchor_target = retinanet.AnchorTarget()
self.cls_loss = nn.SigmoidFocalLoss() self.cls_loss = nn.SigmoidFocalLoss()
self.bbox_loss = nn.SmoothL1Loss(0.1111) if 'IOU' in cfg.MODEL.REG_LOSS_TYPE.upper():
self.bbox_loss = nn.IoULoss()
else:
self.bbox_loss = nn.SmoothL1Loss(0.1111)
self.centerness_loss = nn.BCEWithLogitsLoss(reduction='valid')
self.reset_parameters() self.reset_parameters()
def reset_parameters(self): def reset_parameters(self):
...@@ -71,7 +75,8 @@ class RetinaNet(nn.Module): ...@@ -71,7 +75,8 @@ class RetinaNet(nn.Module):
# For details, See the official codes: # For details, See the official codes:
# https://github.com/facebookresearch/Detectron # https://github.com/facebookresearch/Detectron
self.cls_score.bias.fill_( self.cls_score.bias.fill_(
-math.log((1 - cfg.PRIOR_PROB) / cfg.PRIOR_PROB)) -math.log((1 - cfg.PRIOR_PROB) / cfg.PRIOR_PROB)
)
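The bias value this initialization computes, assuming ``cfg.PRIOR_PROB = 0.01`` (the Detectron default; the actual value is outside this diff):

```python
import math
prior_prob = 0.01
bias = -math.log((1 - prior_prob) / prior_prob)
print(round(bias, 3))  # -4.595; sigmoid(-4.595) ~= 0.01 initial fg probability
```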
def compute_outputs(self, features): def compute_outputs(self, features):
"""Compute the RetinaNet logits. """Compute the RetinaNet logits.
...@@ -97,48 +102,44 @@ class RetinaNet(nn.Module): ...@@ -97,48 +102,44 @@ class RetinaNet(nn.Module):
return torch.cat(cls_score_wide, dim=2), \ return torch.cat(cls_score_wide, dim=2), \
torch.cat(bbox_pred_wide, dim=2) torch.cat(bbox_pred_wide, dim=2)
else: else:
return cls_score_wide[0], bbox_pred_wide[0]	return cls_score_wide[0], bbox_pred_wide[0]
def compute_losses( def compute_losses(self, features, cls_score, bbox_pred, gt_boxes):
self,
features,
cls_score,
bbox_pred,
gt_boxes,
ims_info,
):
"""Compute the RetinaNet classification loss and regression loss. """Compute the RetinaNet classification loss and regression loss.
Parameters Parameters
---------- ----------
features : sequence of dragon.vm.torch.Tensor features : Sequence[dragon.vm.torch.Tensor]
The features of specific conv layers. The features of specific conv layers.
cls_score : dragon.vm.torch.Tensor cls_score : dragon.vm.torch.Tensor
The classification logits. The classification logits.
bbox_pred : dragon.vm.torch.Tensor bbox_pred : dragon.vm.torch.Tensor
The bbox regression logits. The bbox regression logits.
gt_boxes : numpy.ndarray gt_boxes : numpy.ndarray
The packed ground-truth boxes. The packed ground-truth boxes.
ims_info : numpy.ndarray
The information of input images.
""" """
self.retinanet_data = \ self.data = \
self.anchor_target( self.anchor_target(
features=features, features=features,
gt_boxes=gt_boxes, gt_boxes=gt_boxes,
ims_info=ims_info,
) )
return collections.OrderedDict([ bbox_pred = bbox_pred.permute(0, 2, 1) \
.index_select((0, 1), self.data['bbox_indices'])
outputs = collections.OrderedDict([
('cls_loss', self.cls_loss( ('cls_loss', self.cls_loss(
cls_score, self.retinanet_data['labels'])), cls_score, self.data['labels'])),
('bbox_loss', self.bbox_loss( ('bbox_loss', self.bbox_loss(
bbox_pred, bbox_pred,
self.retinanet_data['bbox_targets'], self.data['bbox_targets'],
self.retinanet_data['bbox_inside_weights'], self.data['bbox_anchors'],
self.retinanet_data['bbox_outside_weights'], ))
)),
]) ])
return outputs
def forward(self, *args, **kwargs): def forward(self, *args, **kwargs):
cls_score, bbox_pred = self.compute_outputs(kwargs['features']) cls_score, bbox_pred = self.compute_outputs(kwargs['features'])
...@@ -149,19 +150,17 @@ class RetinaNet(nn.Module): ...@@ -149,19 +150,17 @@ class RetinaNet(nn.Module):
if self.training: if self.training:
outputs.update( outputs.update(
self.compute_losses( self.compute_losses(
kwargs['features'], features=kwargs['features'],
cls_score, cls_score=cls_score,
bbox_pred, bbox_pred=bbox_pred,
kwargs['gt_boxes'], gt_boxes=kwargs['gt_boxes'],
kwargs['ims_info'],
) )
) )
else: else:
outputs['detections'] = \ outputs['detections'] = \
self.decoder( self.decoder(
kwargs['features'], kwargs['features'],
self.cls_prob(cls_score) self.cls_prob(cls_score).permute(0, 2, 1),
.permute(0, 2, 1),
bbox_pred, bbox_pred,
kwargs['ims_info'], kwargs['ims_info'],
) )
......
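The new loss path above gathers only the sampled anchors via `bbox_pred.permute(0, 2, 1).index_select((0, 1), self.data['bbox_indices'])` before computing the box loss. A numpy sketch of what this is expected to select, assuming `bbox_indices` are flat indices over the first two axes (batch, anchor) of the permuted tensor; the exact semantics of Dragon's multi-axis `index_select` are an assumption here:

```python
import numpy as np

N, C, A = 2, 4, 6                        # batch, box dims, anchors
bbox_pred = np.random.randn(N, C, A)     # network layout: (N, 4, A)
pred_nac = bbox_pred.transpose(0, 2, 1)  # permute(0, 2, 1) -> (N, A, 4)
bbox_indices = np.array([1, 7])          # flat indices over N * A
flat = pred_nac.reshape(N * A, C)
sampled = flat[bbox_indices]             # (num_sampled, 4)
print(sampled.shape)  # (2, 4)
```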
...@@ -16,10 +16,10 @@ from __future__ import print_function ...@@ -16,10 +16,10 @@ from __future__ import print_function
import collections import collections
import dragon.vm.torch as torch import dragon.vm.torch as torch
from lib import faster_rcnn from seetadet.algo import faster_rcnn
from lib.core.config import cfg from seetadet.core.config import cfg
from lib.modules import init from seetadet.modules import init
from lib.modules import nn from seetadet.modules import nn
class RPN(nn.Module): class RPN(nn.Module):
...@@ -45,7 +45,8 @@ class RPN(nn.Module): ...@@ -45,7 +45,8 @@ class RPN(nn.Module):
self.anchor_target = faster_rcnn.AnchorTarget() self.anchor_target = faster_rcnn.AnchorTarget()
self.cls_loss = nn.BCEWithLogitsLoss() self.cls_loss = nn.BCEWithLogitsLoss()
self.bbox_loss = nn.SmoothL1Loss(0.1111) self.bbox_loss = nn.SmoothL1Loss(
beta=0.1111, reduction='sum')
self.reset_parameters() self.reset_parameters()
def reset_parameters(self): def reset_parameters(self):
...@@ -108,21 +109,26 @@ class RPN(nn.Module): ...@@ -108,21 +109,26 @@ class RPN(nn.Module):
The information of input images. The information of input images.
""" """
self.rpn_data = \ self.data = \
self.anchor_target( self.anchor_target(
features=features, features=features,
gt_boxes=gt_boxes, gt_boxes=gt_boxes,
ims_info=ims_info, ims_info=ims_info,
) )
bbox_pred = bbox_pred.permute(0, 2, 1) \
.index_select((0, 1), self.data['bbox_indices'])
bbox_loss_weight = 1. / (
cfg.TRAIN.RPN_BATCHSIZE *
cfg.TRAIN.IMS_PER_BATCH
)
return collections.OrderedDict([ return collections.OrderedDict([
('rpn_cls_loss', self.cls_loss( ('rpn_cls_loss', self.cls_loss(
cls_score, self.rpn_data['labels'])), cls_score, self.data['labels'])),
('rpn_bbox_loss', self.bbox_loss( ('rpn_bbox_loss', self.bbox_loss(
bbox_pred, bbox_pred,
self.rpn_data['bbox_targets'], self.data['bbox_targets'],
self.rpn_data['bbox_inside_weights'], self.data['bbox_anchors'],
self.rpn_data['bbox_outside_weights'], ) * bbox_loss_weight),
)),
]) ])
def forward(self, *args, **kwargs): def forward(self, *args, **kwargs):
......
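A note on the new `bbox_loss_weight` above: because the RPN samples a fixed number of anchors per image, `reduction='sum'` divided by `RPN_BATCHSIZE * IMS_PER_BATCH` is a mean over the sampled anchors that stays stable regardless of how many of them are positive. The values below are illustrative, not the project defaults:

```python
rpn_batchsize = 256   # sampled anchors per image (assumed)
ims_per_batch = 2     # images per training step (assumed)
bbox_loss_weight = 1. / (rpn_batchsize * ims_per_batch)
print(bbox_loss_weight)  # 0.001953125
```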
...@@ -16,10 +16,10 @@ from __future__ import print_function ...@@ -16,10 +16,10 @@ from __future__ import print_function
import collections import collections
import dragon.vm.torch as torch import dragon.vm.torch as torch
from lib import ssd from seetadet.algo import ssd
from lib.core.config import cfg from seetadet.core.config import cfg
from lib.modules import init from seetadet.modules import init
from lib.modules import nn from seetadet.modules import nn
class SSD(nn.Module): class SSD(nn.Module):
...@@ -66,7 +66,11 @@ class SSD(nn.Module): ...@@ -66,7 +66,11 @@ class SSD(nn.Module):
self.hard_mining = ssd.HardMining() self.hard_mining = ssd.HardMining()
self.box_target = ssd.MultiBoxTarget() self.box_target = ssd.MultiBoxTarget()
self.cls_loss = nn.CrossEntropyLoss() self.cls_loss = nn.CrossEntropyLoss()
self.bbox_loss = nn.SmoothL1Loss() if 'IOU' in cfg.MODEL.REG_LOSS_TYPE.upper():
self.bbox_loss = nn.IoULoss(
delta_weights=cfg.BBOX_REG_WEIGHTS)
else:
self.bbox_loss = nn.SmoothL1Loss()
self.reset_parameters() self.reset_parameters()
def reset_parameters(self): def reset_parameters(self):
...@@ -110,8 +114,7 @@ class SSD(nn.Module): ...@@ -110,8 +114,7 @@ class SSD(nn.Module):
# Concat them if necessary # Concat them if necessary
return \ return \
torch.cat(cls_score_wide, dim=1) \ torch.cat(cls_score_wide, dim=1).view(0, -1, cfg.MODEL.NUM_CLASSES), \
.view(0, -1, cfg.MODEL.NUM_CLASSES), \
torch.cat(bbox_pred_wide, dim=1).view(0, -1, self.box_dim) torch.cat(bbox_pred_wide, dim=1).view(0, -1, self.box_dim)
def compute_losses( def compute_losses(
...@@ -160,6 +163,8 @@ class SSD(nn.Module): ...@@ -160,6 +163,8 @@ class SSD(nn.Module):
gt_boxes, gt_boxes,
) )
) )
bbox_pred = bbox_pred.index_select(
(0, 1), self.data['bbox_indices'])
return collections.OrderedDict([ return collections.OrderedDict([
# A compensating factor of 4.0 is used # A compensating factor of 4.0 is used
# As we normalize both the pos and neg samples # As we normalize both the pos and neg samples
...@@ -169,9 +174,8 @@ class SSD(nn.Module): ...@@ -169,9 +174,8 @@ class SSD(nn.Module):
('bbox_loss', self.bbox_loss( ('bbox_loss', self.bbox_loss(
bbox_pred, bbox_pred,
self.data['bbox_targets'], self.data['bbox_targets'],
self.data['bbox_inside_weights'], self.data['bbox_anchors'],
self.data['bbox_outside_weights'], ) * cfg.MODEL.REG_LOSS_WEIGHT)
)),
]) ])
def forward(self, *args, **kwargs): def forward(self, *args, **kwargs):
......
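On the "compensating factor of 4.0" mentioned in the SSD loss above: hard mining keeps negatives at roughly 3:1 against positives (the standard SSD ratio, assumed here), so normalizing the classification loss over all sampled boxes divides by about 4x the positive count. Scaling by 4.0 restores a per-positive normalization:

```python
num_pos = 16
neg_pos_ratio = 3                      # assumed SSD default
num_sampled = num_pos * (1 + neg_pos_ratio)
loss_sum = 8.0                         # illustrative summed loss
per_sample = loss_sum / num_sampled    # normalized over pos + neg
per_positive = 4.0 * per_sample        # == loss_sum / num_pos
print(per_positive == loss_sum / num_pos)  # True
```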
...@@ -13,9 +13,10 @@ from __future__ import absolute_import ...@@ -13,9 +13,10 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
from lib.core.config import cfg from seetadet.core.config import cfg
from lib.modules import init from seetadet.core.registry import backbones
from lib.modules import nn from seetadet.modules import init
from seetadet.modules import nn
class VGG(nn.Module): class VGG(nn.Module):
...@@ -41,14 +42,14 @@ class VGG(nn.Module): ...@@ -41,14 +42,14 @@ class VGG(nn.Module):
if j == 0: if j == 0:
dim_in = filter_list[i] dim_in = filter_list[i]
if reduced: if reduced:
# L2Norm is redundant from the observation self.conv4_3_norm = nn.L2Normalize(filter_list[3], init=20.)
# We just keep a trainable scale
self.conv4_3_norm = nn.Affine(filter_list[3], bias=False)
self.conv4_3_norm.weight.zero_() # Zero-Init
self.fc6 = nn.Conv2d( self.fc6 = nn.Conv2d(
filter_list[-1], 1024, in_channels=filter_list[-1],
kernel_size=3, padding=6, out_channels=1024,
stride=1, dilation=6, kernel_size=3,
padding=6,
stride=1,
dilation=6,
) )
self.fc7 = nn.Conv1x1(1024, 1024, bias=True) self.fc7 = nn.Conv1x1(1024, 1024, bias=True)
self.feature_dims = [filter_list[-2], 1024] self.feature_dims = [filter_list[-2], 1024]
...@@ -142,14 +143,18 @@ class VGG(nn.Module): ...@@ -142,14 +143,18 @@ class VGG(nn.Module):
else: else:
outputs.append(x) outputs.append(x)
if self.training:
# Hold the frozen outputs if necessary
self.last_outputs = outputs
return outputs return outputs
def make_vgg_16(): def vgg_16(**kwargs):
return VGG(([2, 2, 3, 3, 3], [64, 128, 256, 512, 512])) return VGG(([2, 2, 3, 3, 3], [64, 128, 256, 512, 512]), **kwargs)
def make_vgg_16_reduced(scale=300): def vgg_16_reduced(scale=300):
if scale == 300: if scale == 300:
extra_arch = ( extra_arch = (
[2, 2, 1, 1], [2, 2, 1, 1],
...@@ -164,11 +169,9 @@ def make_vgg_16_reduced(scale=300): ...@@ -164,11 +169,9 @@ def make_vgg_16_reduced(scale=300):
) )
else: else:
raise ValueError('Unsupported scale: {}'.format(scale)) raise ValueError('Unsupported scale: {}'.format(scale))
return VGG(([2, 2, 3, 3, 3], [64, 128, 256, 512, 512]), return vgg_16(extra_arch=extra_arch, reduced=True)
extra_arch=extra_arch, reduced=True)
def make_vgg_16_reduced_300(): return make_vgg_16_reduced(300)
def make_vgg_16_reduced_512(): return make_vgg_16_reduced(512) backbones.register('vgg16', func=vgg_16)
backbones.register('vgg16_reduced_300', func=vgg_16_reduced, scale=300)
backbones.register('vgg16_reduced_512', func=vgg_16_reduced, scale=512)
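The reduced VGG-16 now normalizes conv4_3 features with an explicit `L2Normalize` layer (per-channel scale initialized to 20, as in ParseNet/SSD) instead of a zero-initialized Affine. A numpy sketch of that forward computation, assuming channel axis 1 and eps 1e-5 as in the module definition:

```python
import numpy as np

def l2_normalize(x, weight, eps=1e-5):
    # x: (N, C, H, W); weight: per-channel scale of shape (C,)
    norm = np.sqrt((x * x).sum(axis=1, keepdims=True)) + eps
    return (x / norm) * weight.reshape(1, -1, 1, 1)

x = np.random.randn(1, 512, 38, 38).astype('float32')
w = np.full(512, 20., dtype='float32')
y = l2_normalize(x, w)
# Each spatial location's channel vector now has L2 norm ~= 20
print(np.abs(np.linalg.norm(y[0, :, 0, 0]) - 20.) < 1e-2)  # True
```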
...@@ -13,8 +13,7 @@ from __future__ import absolute_import ...@@ -13,8 +13,7 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
from lib.ssd.data_loader import DataLoader import os
from lib.ssd.hard_mining import HardMining
from lib.ssd.multibox import MultiBoxMatch from seetadet.utils import env
from lib.ssd.multibox import MultiBoxTarget env.load_library(os.path.join(os.path.dirname(__file__), '_C'))
from lib.ssd.priorbox import PriorBox
...@@ -14,21 +14,40 @@ from __future__ import division ...@@ -14,21 +14,40 @@ from __future__ import division
from __future__ import print_function from __future__ import print_function
from dragon.vm.torch import nn from dragon.vm.torch import nn
from dragon.vm.torch.autograd import function from dragon.vm.torch.autograd.function import Function
from lib.core.config import cfg from seetadet.core.config import cfg
class _RetinaNetDecoder(function.Function): class _NonMaxSuppression(Function):
"""Filter out boxes that have high IoU with selected ones."""
def __init__(self, key, dev, **kwargs):
super(_NonMaxSuppression, self).__init__(key, dev, **kwargs)
self.iou_threshold = kwargs.get('iou_threshold', 0.5)
def attributes(self):
return {
'op_type': 'NonMaxSuppression',
'arguments': {'iou_threshold': self.iou_threshold}
}
def forward(self, dets):
return self.dispatch([dets], [self.alloc()])
class _RetinaNetDecoder(Function):
"""Decode predictions from RetinaNet."""
def __init__(self, key, dev, **kwargs): def __init__(self, key, dev, **kwargs):
super(_RetinaNetDecoder, self).__init__(key, dev, **kwargs) super(_RetinaNetDecoder, self).__init__(key, dev, **kwargs)
self.args = kwargs self.args = kwargs
def register_operator(self): def attributes(self):
return { return {
'op_type': 'Proposal', 'op_type': 'RetinaNetDecoder',
'arguments': { 'arguments': {
'det_type': 'RETINANET',
'strides': self.args['strides'], 'strides': self.args['strides'],
'ratios': self.args['ratios'], 'ratios': self.args['ratios'],
'scales': self.args['scales'], 'scales': self.args['scales'],
...@@ -39,20 +58,21 @@ class _RetinaNetDecoder(function.Function): ...@@ -39,20 +58,21 @@ class _RetinaNetDecoder(function.Function):
def forward(self, features, cls_prob, bbox_pred, ims_info): def forward(self, features, cls_prob, bbox_pred, ims_info):
inputs = features + [cls_prob, bbox_pred, ims_info] inputs = features + [cls_prob, bbox_pred, ims_info]
self._unify_devices(inputs[:-1]) # Skip <ims_info> self._check_device(inputs[:-1]) # Skip <ims_info>
return self.run(inputs, [self.alloc()], unify_devices=False) return self.dispatch(inputs, [self.alloc()], check_device=False)
class _RPNDecoder(Function):
"""Decode proposal regions from RPN."""
class _RPNDecoder(function.Function):
def __init__(self, key, dev, **kwargs): def __init__(self, key, dev, **kwargs):
super(_RPNDecoder, self).__init__(key, dev, **kwargs) super(_RPNDecoder, self).__init__(key, dev, **kwargs)
self.args = kwargs self.args = kwargs
def register_operator(self): def attributes(self):
return { return {
'op_type': 'Proposal', 'op_type': 'RPNDecoder',
'arguments': { 'arguments': {
'det_type': 'RCNN',
'strides': self.args['strides'], 'strides': self.args['strides'],
'ratios': self.args['ratios'], 'ratios': self.args['ratios'],
'scales': self.args['scales'], 'scales': self.args['scales'],
...@@ -69,9 +89,9 @@ class _RPNDecoder(function.Function): ...@@ -69,9 +89,9 @@ class _RPNDecoder(function.Function):
def forward(self, features, cls_prob, bbox_pred, ims_info): def forward(self, features, cls_prob, bbox_pred, ims_info):
inputs = features + [cls_prob, bbox_pred, ims_info] inputs = features + [cls_prob, bbox_pred, ims_info]
self._unify_devices(inputs[:-1]) # Skip <ims_info> self._check_device(inputs[:-1]) # Skip <ims_info>
outputs = [self.alloc() for _ in range(self.args['K'])] outputs = [self.alloc() for _ in range(self.args['K'])]
return self.run(inputs, outputs, unify_devices=False) return self.dispatch(inputs, outputs, check_device=False)
def decode_retinanet( def decode_retinanet(
...@@ -85,15 +105,15 @@ def decode_retinanet( ...@@ -85,15 +105,15 @@ def decode_retinanet(
pre_nms_top_n, pre_nms_top_n,
score_thresh, score_thresh,
): ):
return function.get( return _RetinaNetDecoder \
_RetinaNetDecoder, .instantiate(
cls_prob.device, cls_prob.device,
strides=strides, strides=strides,
ratios=ratios, ratios=ratios,
scales=scales, scales=scales,
pre_nms_top_n=pre_nms_top_n, pre_nms_top_n=pre_nms_top_n,
score_thresh=score_thresh, score_thresh=score_thresh,
).apply(features, cls_prob, bbox_pred, ims_info) ).apply(features, cls_prob, bbox_pred, ims_info)
def decode_rpn( def decode_rpn(
...@@ -114,26 +134,34 @@ def decode_rpn( ...@@ -114,26 +134,34 @@ def decode_rpn(
canonical_scale, canonical_scale,
canonical_level, canonical_level,
): ):
return function.get( return _RPNDecoder \
_RPNDecoder, .instantiate(
cls_prob.device, cls_prob.device,
K=num_outputs, K=num_outputs,
strides=strides, strides=strides,
ratios=ratios, ratios=ratios,
scales=scales, scales=scales,
pre_nms_top_n=pre_nms_top_n, pre_nms_top_n=pre_nms_top_n,
post_nms_top_n=post_nms_top_n, post_nms_top_n=post_nms_top_n,
nms_thresh=nms_thresh, nms_thresh=nms_thresh,
min_size=min_size, min_size=min_size,
min_level=min_level, min_level=min_level,
max_level=max_level, max_level=max_level,
canonical_scale=canonical_scale, canonical_scale=canonical_scale,
canonical_level=canonical_level, canonical_level=canonical_level,
).apply(features, cls_prob, bbox_pred, ims_info) ).apply(features, cls_prob, bbox_pred, ims_info)
def nms(dets, iou_threshold=0.5):
return _NonMaxSuppression \
.instantiate(
dets.device,
iou_threshold=iou_threshold,
).apply(dets)
class RetinaNetDecoder(nn.Module): class RetinaNetDecoder(nn.Module):
"""Generate pred regions from retinanet.""" """Decode predictions from retinanet."""
def __init__(self): def __init__(self):
super(RetinaNetDecoder, self).__init__() super(RetinaNetDecoder, self).__init__()
...@@ -154,7 +182,7 @@ class RetinaNetDecoder(nn.Module): ...@@ -154,7 +182,7 @@ class RetinaNetDecoder(nn.Module):
ratios=[float(e) for e in cfg.RETINANET.ASPECT_RATIOS], ratios=[float(e) for e in cfg.RETINANET.ASPECT_RATIOS],
scales=self.scales, scales=self.scales,
pre_nms_top_n=cfg.RETINANET.PRE_NMS_TOP_N, pre_nms_top_n=cfg.RETINANET.PRE_NMS_TOP_N,
score_thresh=cfg.TEST.SCORE_THRESH, score_thresh=float(cfg.TEST.SCORE_THRESH),
) )
......
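A usage sketch for the new dispatch-based NMS op defined above. The `instantiate`/`dispatch` API is Dragon's caching mechanism: a `Function` is keyed by its attributes and device, so repeated calls reuse one registered operator. Tensor creation and the returned index layout below follow the call sites in this commit and are otherwise assumptions:

```python
import numpy as np
import dragon.vm.torch as torch
from seetadet.modules import det

dets = np.array([
    [0, 0, 10, 10, 0.9],    # x1, y1, x2, y2, score
    [1, 1, 10, 10, 0.8],    # overlaps the first box heavily
    [20, 20, 30, 30, 0.7],  # disjoint
], dtype='float32')
keep = det.nms(torch.from_numpy(dets), iou_threshold=0.5)
print(keep.numpy())  # expected: indices of kept boxes, e.g. [0, 2]
```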
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Define some basic structures."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon
from dragon.vm import torch
from dragon.vm.torch import nn
from dragon.vm.torch.nn import functional
from seetadet.core.config import cfg
class FrozenAffine(object):
"""Affine transformation with weight and bias fixed."""
def __new__(cls, dim_in, bias=True, inplace=True):
return nn.Affine(
num_features=dim_in,
fix_weight=True,
fix_bias=True,
inplace=inplace,
)
class Conv1x1(object):
"""1x1 convolution."""
def __new__(cls, dim_in, dim_out, stride=1, bias=False):
return nn.Conv2d(
in_channels=dim_in,
out_channels=dim_out,
kernel_size=1,
stride=stride,
bias=bias,
)
class Conv3x3(object):
"""3x3 convolution."""
def __new__(cls, dim_in, dim_out, stride=1, dilation=1, bias=False):
return nn.Conv2d(
in_channels=dim_in,
out_channels=dim_out,
kernel_size=3,
stride=stride,
padding=1 * dilation,
bias=bias,
)
class CrossEntropyLoss(object):
"""Cross entropy loss."""
def __new__(cls):
return nn.CrossEntropyLoss(ignore_index=-1)
class IoULoss(nn.Module):
def __init__(self, reduction='mean', delta_weights=None):
super(IoULoss, self).__init__()
self.data = {} # Store the detached tensors
self.reduction = reduction
self.delta_weights = delta_weights
def transform_inv(self, boxes, deltas, name=None):
widths = boxes[:, 2] - boxes[:, 0]
heights = boxes[:, 3] - boxes[:, 1]
ctr_x = boxes[:, 0] + 0.5 * widths
ctr_y = boxes[:, 1] + 0.5 * heights
if name is not None:
self.data[name + '/widths'] = widths
self.data[name + '/heights'] = heights
dx, dy, dw, dh = torch.chunk(deltas, chunks=4, dim=1)
if self.delta_weights is not None:
wx, wy, ww, wh = self.delta_weights
dx, dy, dw, dh = dx / wx, dy / wy, dw / ww, dh / wh
pred_ctr_x = dx * widths + ctr_x
pred_ctr_y = dy * heights + ctr_y
pred_w = torch.exp(dw) * widths
pred_h = torch.exp(dh) * heights
x1 = pred_ctr_x - 0.5 * pred_w
y1 = pred_ctr_y - 0.5 * pred_h
x2 = pred_ctr_x + 0.5 * pred_w
y2 = pred_ctr_y + 0.5 * pred_h
return x1, y1, x2, y2
def forward_impl(self, input, target, anchor):
x1, y1, x2, y2 = self.transform_inv(
anchor, input, name='logits')
self.x1, self.y1, self.x2, self.y2 = \
self.transform_inv(anchor, target)
# Compute the independent area
pred_area = (x2 - x1) * (y2 - y1)
target_area = (self.x2 - self.x1) * (self.y2 - self.y1)
# Compute the intersecting area
x1_inter = torch.maximum(x1, self.x1)
y1_inter = torch.maximum(y1, self.y1)
x2_inter = torch.minimum(x2, self.x2)
y2_inter = torch.minimum(y2, self.y2)
w_inter = torch.clamp(x2_inter - x1_inter, min=0)
h_inter = torch.clamp(y2_inter - y1_inter, min=0)
area_inter = w_inter * h_inter
# Compute the enclosing area
x1_enc = torch.minimum(x1, self.x1)
y1_enc = torch.minimum(y1, self.y1)
x2_enc = torch.maximum(x2, self.x2)
y2_enc = torch.maximum(y2, self.y2)
area_enc = (x2_enc - x1_enc) * (y2_enc - y1_enc) + 1.
# Compute the differentiable IoU metric
area_union = pred_area + target_area - area_inter
iou = area_inter / (area_union + 1.)
iou_metric = iou - (area_enc - area_union) / area_enc # GIoU
# Compute the reduced loss
if self.reduction == 'sum':
return (1 - iou_metric).sum()
else:
return (1 - iou_metric).mean()
def forward(self, *inputs, **kwargs):
# Enter a new detaching scope
with dragon.eager_scope('${IOU}'):
return self.forward_impl(*inputs, **kwargs)
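A numpy cross-check of the GIoU metric computed in `forward_impl` above: giou = iou - (enclosing_area - union) / enclosing_area. The `+ 1.` stabilizers used in the module are omitted here to keep the arithmetic exact:

```python
import numpy as np

def giou(box1, box2):
    # Intersection
    x1, y1 = np.maximum(box1[:2], box2[:2])
    x2, y2 = np.minimum(box1[2:], box2[2:])
    inter = max(x2 - x1, 0) * max(y2 - y1, 0)
    # Union
    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
    union = area1 + area2 - inter
    # Smallest enclosing box
    ex1, ey1 = np.minimum(box1[:2], box2[:2])
    ex2, ey2 = np.maximum(box1[2:], box2[2:])
    enc = (ex2 - ex1) * (ey2 - ey1)
    return inter / union - (enc - union) / enc

print(giou(np.array([0., 0., 2., 2.]), np.array([1., 1., 3., 3.])))
# inter=1, union=7, enc=9 -> 1/7 - 2/9 ~= -0.0794
```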
class Identity(nn.Module):
"""Pass input to the output."""
def __init__(self, *args, **kwargs):
super(Identity, self).__init__()
_, _ = args, kwargs
def forward(self, x):
return x
class L2Normalize(nn.Module):
"""Normalize the input using L2 norm."""
def __init__(self, num_features, init=20.):
super(L2Normalize, self).__init__()
self.weight = nn.Parameter(torch.Tensor(num_features).fill_(init))
def forward(self, input):
out = functional.normalize(input, p=2, dim=1, eps=1e-5)
out = functional.affine(out, self.weight)
return out
class ReLU(object):
"""The generic ReLU activation."""
def __new__(cls, inplace=False):
return getattr(torch.nn, cfg.MODEL.RELU_VARIANT)(inplace)
class SigmoidFocalLoss(object):
"""Sigmoid focal loss."""
def __new__(cls):
return nn.SigmoidFocalLoss(
alpha=cfg.MODEL.FOCAL_LOSS_ALPHA,
gamma=cfg.MODEL.FOCAL_LOSS_GAMMA,
)
class SmoothL1Loss(nn.Module):
"""Smoothed l1 loss."""
def __init__(self, beta=1., reduction='batch_size'):
super(SmoothL1Loss, self).__init__()
self.beta = beta
self.reduction = reduction
def forward(self, input, target, *args):
return functional.smooth_l1_loss(
input, target,
beta=self.beta,
reduction=self.reduction,
)
Affine = nn.Affine
AvgPool2d = nn.AvgPool2d
BatchNorm2d = nn.BatchNorm2d
BCEWithLogitsLoss = nn.BCEWithLogitsLoss
Conv2d = nn.Conv2d
ConvTranspose2d = nn.ConvTranspose2d
DepthwiseConv2d = nn.DepthwiseConv2d
Linear = nn.Linear
MaxPool2d = nn.MaxPool2d
Module = nn.Module
ModuleList = nn.ModuleList
Sequential = nn.Sequential
Sigmoid = nn.Sigmoid
Softmax = nn.Softmax
...@@ -13,9 +13,11 @@ from __future__ import absolute_import ...@@ -13,9 +13,11 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
import functools
import dragon.vm.torch as torch import dragon.vm.torch as torch
from lib.core.config import cfg from seetadet.core.config import cfg
def roi_align(input, boxes, spatial_scale, size): def roi_align(input, boxes, spatial_scale, size):
...@@ -35,12 +37,18 @@ def roi_pool(input, boxes, spatial_scale, size): ...@@ -35,12 +37,18 @@ def roi_pool(input, boxes, spatial_scale, size):
class Bootstrap(torch.nn.Module): class Bootstrap(torch.nn.Module):
"""Extended operator to process the images.""" """Process the input to match the computation."""
def __init__(self): def __init__(self):
super(Bootstrap, self).__init__() super(Bootstrap, self).__init__()
self.dtype = cfg.MODEL.PRECISION.lower() self.normalize_func = functools.partial(
self.mean_values = cfg.PIXEL_MEANS torch.channel_normalize,
mean=cfg.PIXEL_MEANS,
std=[1., 1., 1.],
dim=1,
dims=(0, 3, 1, 2),
dtype=cfg.MODEL.PRECISION.lower(),
)
self.dummy_buffer = torch.ones(1) self.dummy_buffer = torch.ones(1)
def _apply(self, fn): def _apply(self, fn):
...@@ -57,12 +65,13 @@ class Bootstrap(torch.nn.Module): ...@@ -57,12 +65,13 @@ class Bootstrap(torch.nn.Module):
return self.dummy_buffer.device return self.dummy_buffer.device
def forward(self, input): def forward(self, input):
if isinstance(input, torch.Tensor):
if input.size(1) <= 3:
return input
cur_device = self.device() cur_device = self.device()
if input._device != cur_device: if input._device != cur_device:
if cur_device.type == 'cpu': if cur_device.type == 'cpu':
input = input.cpu() input = input.cpu()
else: else:
input = input.cuda(cur_device.index) input = input.cuda(cur_device.index)
return torch.vision.ops.image_data( return self.normalize_func(input)
input, self.dtype, self.mean_values,
)
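What the partially-bound `torch.channel_normalize` above is expected to do, sketched in numpy under the arguments shown at the call site: cast, subtract the per-channel means, and permute NHWC to NCHW. The exact Dragon signature is taken from the call site; the mean values below are illustrative, not the configured `PIXEL_MEANS`:

```python
import numpy as np

def channel_normalize(x, mean, std, dims=(0, 3, 1, 2), dtype='float32'):
    x = x.astype(dtype)
    x = (x - np.asarray(mean, dtype)) / np.asarray(std, dtype)
    return x.transpose(dims)  # NHWC -> NCHW

im = np.random.randint(0, 255, (2, 32, 32, 3)).astype('uint8')
out = channel_normalize(im, mean=[102.98, 115.95, 122.77], std=[1., 1., 1.])
print(out.shape, out.dtype)  # (2, 3, 32, 32) float32
```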
...@@ -13,5 +13,4 @@ from __future__ import absolute_import ...@@ -13,5 +13,4 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
from lib.faster_rcnn.data_loader import DataLoader from seetadet.onnx import nodes as _
from lib.retinanet.anchor_target import AnchorTarget
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.onnx import exporter
from dragon.vm.onnx import helper
@exporter.register('RetinaNetDecoder')
def retinanet_decoder_exporter(op_def, shape_dict, ws):
node, const_tensors = exporter.translate(**locals())
node.op_type = 'ATen' # Currently not supported in ai.onnx
helper.add_attribute(node, 'op_type', 'RetinaNetDecoder')
for arg in op_def.arg:
if arg.name == 'strides':
helper.add_attribute(node, 'strides', arg.ints)
elif arg.name == 'ratios':
helper.add_attribute(node, 'ratios', arg.floats)
elif arg.name == 'scales':
helper.add_attribute(node, 'scales', arg.floats)
elif arg.name == 'pre_nms_top_n':
helper.add_attribute(node, 'pre_nms_top_n', arg.i)
elif arg.name == 'score_thresh':
helper.add_attribute(node, 'score_thresh', arg.f)
return node, const_tensors
@exporter.register('RPNDecoder')
def rpn_decoder_exporter(op_def, shape_dict, ws):
node, const_tensors = exporter.translate(**locals())
node.op_type = 'ATen' # Currently not supported in ai.onnx
helper.add_attribute(node, 'op_type', 'RPNDecoder')
for arg in op_def.arg:
if arg.name == 'strides':
helper.add_attribute(node, 'strides', arg.ints)
elif arg.name == 'ratios':
helper.add_attribute(node, 'ratios', arg.floats)
elif arg.name == 'scales':
helper.add_attribute(node, 'scales', arg.floats)
elif arg.name == 'pre_nms_top_n':
helper.add_attribute(node, 'pre_nms_top_n', arg.i)
elif arg.name == 'post_nms_top_n':
helper.add_attribute(node, 'post_nms_top_n', arg.i)
elif arg.name == 'nms_thresh':
helper.add_attribute(node, 'nms_thresh', arg.f)
elif arg.name == 'min_size':
helper.add_attribute(node, 'min_size', arg.i)
elif arg.name == 'min_level':
helper.add_attribute(node, 'min_level', arg.i)
elif arg.name == 'max_level':
helper.add_attribute(node, 'max_level', arg.i)
elif arg.name == 'canonical_scale':
helper.add_attribute(node, 'canonical_scale', arg.i)
elif arg.name == 'canonical_level':
helper.add_attribute(node, 'canonical_level', arg.i)
return node, const_tensors
...@@ -430,4 +430,4 @@ class COCO: ...@@ -430,4 +430,4 @@ class COCO:
""" """
rle = self.annToRLE(ann) rle = self.annToRLE(ann)
m = maskUtils.decode(rle) m = maskUtils.decode(rle)
return m return m
\ No newline at end of file
__author__ = 'tsungyi' __author__ = 'tsungyi'
import lib.pycocotools._mask as _mask import seetadet.pycocotools._mask as _mask
# Interface for manipulating masks stored in RLE format. # Interface for manipulating masks stored in RLE format.
# #
......
...@@ -15,8 +15,8 @@ from __future__ import print_function ...@@ -15,8 +15,8 @@ from __future__ import print_function
import numpy as np import numpy as np
from lib.pycocotools import mask as mask_tools from seetadet.pycocotools import mask as mask_tools
from lib.pycocotools.mask import frPyObjects from seetadet.pycocotools.mask import frPyObjects
def poly2rle(poly, height, width): def poly2rle(poly, height, width):
......
...@@ -15,7 +15,7 @@ from __future__ import print_function ...@@ -15,7 +15,7 @@ from __future__ import print_function
import math import math
from lib.core.config import cfg from seetadet.core.config import cfg
class _LRScheduler(object): class _LRScheduler(object):
......
...@@ -15,11 +15,11 @@ from __future__ import print_function ...@@ -15,11 +15,11 @@ from __future__ import print_function
import dragon.vm.torch as torch import dragon.vm.torch as torch
from lib.core.config import cfg from seetadet.core.config import cfg
from lib.modeling.detector import Detector from seetadet.modeling.detector import Detector
from lib.solver import lr_scheduler from seetadet.solver import lr_scheduler
from lib.utils import framework from seetadet.utils import env
from lib.utils import time_util from seetadet.utils import time_util
class SGDSolver(object): class SGDSolver(object):
...@@ -28,7 +28,7 @@ class SGDSolver(object): ...@@ -28,7 +28,7 @@ class SGDSolver(object):
self.detector = Detector() self.detector = Detector()
# Define the optimizer and its arguments # Define the optimizer and its arguments
self.optimizer = torch.optim.SGD( self.optimizer = torch.optim.SGD(
framework.get_param_groups(self.detector), env.get_param_groups(self.detector),
lr=cfg.SOLVER.BASE_LR, lr=cfg.SOLVER.BASE_LR,
momentum=cfg.SOLVER.MOMENTUM, momentum=cfg.SOLVER.MOMENTUM,
weight_decay=cfg.SOLVER.WEIGHT_DECAY, weight_decay=cfg.SOLVER.WEIGHT_DECAY,
......
# ------------------------------------------------------------ # ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd. # Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
# #
# Licensed under the BSD 2-Clause License. # Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License # You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See, # along with the software. If not, See,
# #
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# Codes are based on: # Codes are based on:
# #
# <https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/utils/blob.py> # <https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/utils/blob.py>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
import numpy as np import numpy as np
import dragon.vm.torch as torch import dragon.vm.torch as torch
from lib.core.config import cfg from seetadet.core.config import cfg
from lib.utils.image import distort_image from seetadet.utils.image import distort_image
from lib.utils.image import resize_image from seetadet.utils.image import resize_image
def im_list_to_blob(ims): def im_list_to_blob(ims):
"""Convert a list of images into a network input. """Convert a list of images into a network input.
Assume that images are not means subtracted, and with BGR order. Assume that images are not means subtracted, and with BGR order.
""" """
blob_dtype = 'uint8' if ims[0].dtype == 'uint8' else 'float32' blob_dtype = 'uint8' if ims[0].dtype == 'uint8' else 'float32'
max_shape = np.array([im.shape for im in ims]).max(axis=0) max_shape = np.array([im.shape for im in ims]).max(axis=0)
if cfg.MODEL.COARSEST_STRIDE > 0: if cfg.MODEL.COARSEST_STRIDE > 0:
stride = float(cfg.MODEL.COARSEST_STRIDE) stride = float(cfg.MODEL.COARSEST_STRIDE)
max_shape[0] = int(np.ceil(max_shape[0] / stride) * stride) max_shape[0] = int(np.ceil(max_shape[0] / stride) * stride)
max_shape[1] = int(np.ceil(max_shape[1] / stride) * stride) max_shape[1] = int(np.ceil(max_shape[1] / stride) * stride)
blob_shape = (len(ims), max_shape[0], max_shape[1], 3) blob_shape = (len(ims), max_shape[0], max_shape[1], 3)
blob = np.empty(blob_shape, blob_dtype) blob = np.empty(blob_shape, blob_dtype)
blob[:] = cfg.PIXEL_MEANS blob[:] = cfg.PIXEL_MEANS
for i, im in enumerate(ims): for i, im in enumerate(ims):
if im.dtype == 'uint16': if im.dtype == 'uint16':
im = im.astype(blob_dtype) / 256. im = im.astype(blob_dtype) / 256.
blob[i, :im.shape[0], :im.shape[1], :] = im blob[i, :im.shape[0], :im.shape[1], :] = im
return blob return blob
def mask_list_to_blob(masks): def mask_list_to_blob(masks):
"""Convert a list of masks into a network input.""" """Convert a list of masks into a network input."""
max_shape = np.array([mask.shape[1:] for mask in masks]).max(axis=0) max_shape = np.array([mask.shape[1:] for mask in masks]).max(axis=0)
num_masks = np.array([mask.shape[0] for mask in masks]).sum() num_masks = np.array([mask.shape[0] for mask in masks]).sum()
blob_shape = ((num_masks, max_shape[0], max_shape[1])) blob_shape = (num_masks, max_shape[0], max_shape[1])
blob = np.zeros(blob_shape, 'uint8') blob = np.zeros(blob_shape, 'uint8')
count = 0 count = 0
for mask in masks: for mask in masks:
n, h, w = mask.shape n, h, w = mask.shape
blob[count:count + n, :h, :w] = mask blob[count:count + n, :h, :w] = mask
count += n count += n
return blob return blob
def prep_im_for_blob(img, target_size, max_size): def prep_im_for_blob(img, target_size, max_size):
"""Scale an image for use in a blob.""" """Scale an image for use in a blob."""
im_shape, jitter = img.shape, 1. im_shape, jitter = img.shape, 1.
if cfg.TRAIN.USE_COLOR_JITTER: if cfg.TRAIN.USE_COLOR_JITTER:
img = distort_image(img) img = distort_image(img)
if max_size > 0: if max_size > 0:
# Scale image along the shortest side # Scale image along the shortest side
im_size_min = np.min(im_shape[:2]) im_size_min = np.min(im_shape[:2])
im_size_max = np.max(im_shape[:2]) im_size_max = np.max(im_shape[:2])
im_scale = float(target_size) / float(im_size_min) im_scale = float(target_size) / float(im_size_min)
# Prevent the biggest axis from being more than MAX_SIZE # Prevent the biggest axis from being more than MAX_SIZE
if np.round(im_scale * im_size_max) > max_size: if np.round(im_scale * im_size_max) > max_size:
im_scale = float(max_size) / float(im_size_max) im_scale = float(max_size) / float(im_size_max)
else: else:
# Scale image along the longest side # Scale image along the longest side
im_size_max = np.max(im_shape[:2]) im_size_max = np.max(im_shape[:2])
im_scale = float(target_size) / float(im_size_max) im_scale = float(target_size) / float(im_size_max)
if cfg.TRAIN.USE_SCALE_JITTER: r = cfg.TRAIN.RANDOM_SCALES
r = cfg.TRAIN.SCALE_JITTER_RANGE jitter = r[0] + np.random.rand() * (r[1] - r[0])
jitter = r[0] + np.random.rand() * (r[1] - r[0]) im_scale *= jitter
im_scale *= jitter
return resize_image(img, im_scale, im_scale), im_scale
return resize_image(img, im_scale, im_scale), im_scale, jitter
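A quick illustration of `im_list_to_blob` above: images of different sizes are padded into one batch whose spatial dims are rounded up to `COARSEST_STRIDE`, with the padding filled by the pixel means. The shapes below are illustrative:

```python
import numpy as np

ims = [np.zeros((375, 500, 3), 'uint8'), np.zeros((480, 368, 3), 'uint8')]
stride = 32.
max_shape = np.array([im.shape for im in ims]).max(axis=0)  # (480, 500, 3)
padded_h = int(np.ceil(max_shape[0] / stride) * stride)     # 480
padded_w = int(np.ceil(max_shape[1] / stride) * stride)     # 512
print(padded_h, padded_w)  # 480 512
```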
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# Codes are based on:
#
# ------------------------------------------------------------
"""Box utilities for original coordinates."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from seetadet.utils import cython_bbox
def bbox_overlaps(boxes1, boxes2):
"""Compute the overlaps between two group of boxes."""
return cython_bbox.bbox_overlaps(
np.ascontiguousarray(boxes1, dtype=np.float64),
np.ascontiguousarray(boxes2, dtype=np.float64),
)
def bbox_transform(ex_rois, gt_rois, weights=(1., 1., 1., 1.)):
"""Transform the boxes to the regression targets."""
ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.
ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.
ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths
ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights
gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.
gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.
gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths
gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights
wx, wy, ww, wh = weights
targets = [wx * (gt_ctr_x - ex_ctr_x) / ex_widths]
targets += [wy * (gt_ctr_y - ex_ctr_y) / ex_heights]
targets += [ww * np.log(gt_widths / ex_widths)]
targets += [wh * np.log(gt_heights / ex_heights)]
return np.vstack(targets).transpose()
def bbox_centerness(ex_rois, gt_rois):
"""Compute centerness of the boxes to ground-truth."""
ex_ctr_x = (ex_rois[:, 2] + ex_rois[:, 0]) / 2
ex_ctr_y = (ex_rois[:, 3] + ex_rois[:, 1]) / 2
l = ex_ctr_x - gt_rois[:, 0]
t = ex_ctr_y - gt_rois[:, 1]
r = gt_rois[:, 2] - ex_ctr_x
b = gt_rois[:, 3] - ex_ctr_y
centerness = \
(np.minimum(l, r) / np.maximum(l, r)) * \
(np.minimum(t, b) / np.maximum(t, b))
min_dist = np.stack([l, t, r, b], axis=1).min(axis=1)
keep_inds = np.where(min_dist > 0.01)[0]
discard_inds = np.where(min_dist <= 0.01)[0]
centerness[keep_inds] = np.sqrt(centerness[keep_inds])
centerness[discard_inds] = -1
return centerness, keep_inds, discard_inds
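A worked example of `bbox_centerness` above: for a point at the center of its ground-truth box the distances l, t, r, b are equal, so centerness is sqrt(1 * 1) = 1; it decays toward 0 as the point approaches a box edge:

```python
import numpy as np

gt = np.array([0., 0., 10., 10.])
for cx in (5., 8., 9.9):
    l, r = cx - gt[0], gt[2] - cx
    t, b = 5. - gt[1], gt[3] - 5.        # keep y centered
    c = np.sqrt((min(l, r) / max(l, r)) * (min(t, b) / max(t, b)))
    print(round(c, 2))  # 1.0, 0.5, 0.1
```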
def bbox_transform_inv(boxes, deltas, weights=(1., 1., 1., 1.)):
"""Decode the final boxes according to the deltas."""
if boxes.shape[0] == 0:
return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype)
boxes = boxes.astype(deltas.dtype, copy=False)
widths = boxes[:, 2] - boxes[:, 0] + 1.
heights = boxes[:, 3] - boxes[:, 1] + 1.
ctr_x = boxes[:, 0] + 0.5 * widths
ctr_y = boxes[:, 1] + 0.5 * heights
wx, wy, ww, wh = weights
dx = deltas[:, 0::4] / wx
dy = deltas[:, 1::4] / wy
dw = deltas[:, 2::4] / ww
dh = deltas[:, 3::4] / wh
pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis]
pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis]
pred_w = np.exp(dw) * widths[:, np.newaxis]
pred_h = np.exp(dh) * heights[:, np.newaxis]
pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype)
pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w # x1
pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h # y1
pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w - 1 # x2
pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h - 1 # y2
return pred_boxes
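Round-trip check for the two functions above (assuming both are in scope in this module): encoding a ground-truth box against an anchor with `bbox_transform` and decoding with `bbox_transform_inv` recovers the ground-truth box, since both use the same legacy "+ 1" box widths:

```python
import numpy as np

ex = np.array([[10., 10., 50., 50.]])   # anchor
gt = np.array([[12., 8., 60., 44.]])    # target
deltas = bbox_transform(ex, gt)
rec = bbox_transform_inv(ex, deltas)
print(np.allclose(rec, gt))  # True
```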
def clip_boxes(boxes, im_shape):
# x1 >= 0
boxes[:, 0] = np.maximum(np.minimum(boxes[:, 0], im_shape[1] - 1), 0)
# y1 >= 0
boxes[:, 1] = np.maximum(np.minimum(boxes[:, 1], im_shape[0] - 1), 0)
# x2 < im_shape[1]
boxes[:, 2] = np.maximum(np.minimum(boxes[:, 2], im_shape[1] - 1), 0)
# y2 < im_shape[0]
boxes[:, 3] = np.maximum(np.minimum(boxes[:, 3], im_shape[0] - 1), 0)
return boxes
def clip_tiled_boxes(boxes, im_shape):
# x1 >= 0
boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0)
# y1 >= 0
boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0)
# x2 < im_shape[1]
boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0)
# y2 < im_shape[0]
boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0)
return boxes
def dismantle_boxes(gt_boxes, num_images):
"""Dismantle the packed ground-truth boxes."""
return [
gt_boxes[
np.where(gt_boxes[:, -1].astype(np.int32) == i)[0]
][:, :-1] for i in range(num_images)
]
def expand_boxes(boxes, scale):
"""Expand an array of boxes by a given scale."""
w_half = (boxes[:, 2] - boxes[:, 0]) * .5
h_half = (boxes[:, 3] - boxes[:, 1]) * .5
x_c = (boxes[:, 2] + boxes[:, 0]) * .5
y_c = (boxes[:, 3] + boxes[:, 1]) * .5
w_half *= scale
h_half *= scale
boxes_exp = np.zeros(boxes.shape)
boxes_exp[:, 0] = x_c - w_half
boxes_exp[:, 2] = x_c + w_half
boxes_exp[:, 1] = y_c - h_half
boxes_exp[:, 3] = y_c + h_half
return boxes_exp
def flip_boxes(boxes, width):
"""Flip the boxes horizontally."""
boxes_flipped = boxes.copy()
boxes_flipped[:, 0] = width - boxes[:, 2] - 1
boxes_flipped[:, 2] = width - boxes[:, 0] - 1
return boxes_flipped
def filter_boxes(boxes, min_size):
"""Remove all boxes with any side smaller than min size."""
ws = boxes[:, 2] - boxes[:, 0] + 1
hs = boxes[:, 3] - boxes[:, 1] + 1
keep = np.where((ws >= min_size) & (hs >= min_size))[0]
return keep
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# Codes are based on:
#
# ------------------------------------------------------------
"""Box utilities for normalized coordinates."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
def boxes_area(boxes):
"""Compute the area of an array of boxes."""
w = (boxes[:, 2] - boxes[:, 0])
h = (boxes[:, 3] - boxes[:, 1])
area = w * h
assert np.all(area >= 0), 'Negative areas found'
return area
def intersection(boxes1, boxes2):
"""Compute pairwise intersection areas between boxes."""
[y_min1, x_min1, y_max1, x_max1] = np.split(boxes1, 4, axis=1)
[y_min2, x_min2, y_max2, x_max2] = np.split(boxes2, 4, axis=1)
all_pairs_min_ymax = np.minimum(y_max1, np.transpose(y_max2))
all_pairs_max_ymin = np.maximum(y_min1, np.transpose(y_min2))
all_pairs_min_xmax = np.minimum(x_max1, np.transpose(x_max2))
all_pairs_max_xmin = np.maximum(x_min1, np.transpose(x_min2))
inter_heights = np.maximum(
np.zeros(all_pairs_max_ymin.shape),
all_pairs_min_ymax - all_pairs_max_ymin
)
inter_widths = np.maximum(
np.zeros(all_pairs_max_xmin.shape),
all_pairs_min_xmax - all_pairs_max_xmin
)
return inter_heights * inter_widths
def ioa1(boxes1, boxes2):
"""Computes pairwise intersection-over-area between box collections."""
inter = intersection(boxes1, boxes2)
area = np.expand_dims(boxes_area(boxes1), axis=1)
return inter / area
def ioa2(boxes1, boxes2):
"""Computes pairwise intersection-over-area between box collections."""
inter = intersection(boxes1, boxes2)
area = np.expand_dims(boxes_area(boxes2), axis=0)
return inter / area
def iou(boxes1, boxes2):
"""Computes pairwise intersection-over-union between box collections."""
inter = intersection(boxes1, boxes2)
area1 = boxes_area(boxes1)
area2 = boxes_area(boxes2)
union = np.expand_dims(area1, axis=1) + \
np.expand_dims(area2, axis=0) - inter
return inter / union
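Note the coordinate order in this file: unlike the pixel-space utilities above (x1, y1, x2, y2), these normalized-box helpers split boxes as [y_min, x_min, y_max, x_max]. A small sanity check of `iou` (assuming it is in scope in this module):

```python
import numpy as np

b1 = np.array([[0.0, 0.0, 0.5, 0.5]])    # y_min, x_min, y_max, x_max
b2 = np.array([[0.25, 0.25, 0.75, 0.75]])
# inter = 0.25 * 0.25; union = 0.25 + 0.25 - 0.0625
print(iou(b1, b2))  # ~0.1429
```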
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import importlib.machinery
import os
import dragon
from dragon.core.util import six
from dragon.vm import torch
import numpy as np
from seetadet.core.config import cfg
def get_param_groups(module):
"""Separate parameters according to weight decay.
Parameters
----------
module : dragon.vm.torch.nn.Module
The module to collect parameters.
Returns
-------
Sequence[ParamGroup]
The parameter groups.
"""
param_groups = [
{'params': []}, # Decayed always
{'params': [], 'weight_decay': -1.}
]
for name, param in module.named_parameters():
gi = 0 if 'weight' in name and param.dim() > 1 else 1
param_groups[gi]['params'].append(param)
if len(param_groups[1]['params']) == 0:
param_groups.pop() # Remove empty group
return param_groups
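A usage sketch for `get_param_groups` (run inside this module): weights with dim > 1 go to the decayed group, while biases and 1-D parameters (e.g. norm scales) get `weight_decay = -1.`, which the Dragon optimizer is assumed here to treat as "no decay":

```python
import dragon.vm.torch as torch

module = torch.nn.Conv2d(3, 8, kernel_size=3, bias=True)
groups = get_param_groups(module)
print(len(groups[0]['params']), len(groups[1]['params']))  # 1 1
optimizer = torch.optim.SGD(groups, lr=0.01, momentum=0.9,
                            weight_decay=0.0001)
```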
def load_library(library_prefix):
"""Load a shared library.
Parameters
----------
library_prefix : str
The prefix of the library.
"""
loader_details = (
importlib.machinery.ExtensionFileLoader,
importlib.machinery.EXTENSION_SUFFIXES
)
library_prefix = os.path.abspath(library_prefix)
lib_dir, fullname = os.path.split(library_prefix)
finder = importlib.machinery.FileFinder(lib_dir, loader_details)
ext_specs = finder.find_spec(fullname)
if ext_specs is None:
raise ImportError(
'Could not find the pre-built library '
'for <%s>.' % library_prefix
)
dragon.load_library(ext_specs.origin)
def new_tensor(data, enforce_cpu=False):
"""Create a new tensor from the data.
Parameters
----------
data : array_like
The data value.
enforce_cpu : bool, optional, default=False
**True** to enforce the cpu storage.
Returns
-------
dragon.vm.torch.Tensor
The tensor holding the data.
"""
if data is None:
return data
if isinstance(data, np.ndarray):
tensor = torch.from_numpy(data)
elif isinstance(data, torch.Tensor):
tensor = data
else:
tensor = torch.tensor(data)
if not enforce_cpu:
tensor = tensor.cuda(cfg.GPU_ID)
return tensor
# Aliases
pickle = six.moves.pickle
...@@ -18,7 +18,7 @@ import numpy as np ...@@ -18,7 +18,7 @@ import numpy as np
import PIL.Image import PIL.Image
import PIL.ImageEnhance import PIL.ImageEnhance
from lib.core.config import cfg from seetadet.core.config import cfg
def distort_image(img): def distort_image(img):
...@@ -28,7 +28,6 @@ def distort_image(img): ...@@ -28,7 +28,6 @@ def distort_image(img):
PIL.ImageEnhance.Contrast, PIL.ImageEnhance.Contrast,
PIL.ImageEnhance.Color, PIL.ImageEnhance.Color,
] ]
np.random.shuffle(transforms)
for transform in transforms: for transform in transforms:
if np.random.uniform() < 0.5: if np.random.uniform() < 0.5:
img = transform(img) img = transform(img)
...@@ -62,7 +61,7 @@ def get_image_with_target_size(target_size, img): ...@@ -62,7 +61,7 @@ def get_image_with_target_size(target_size, img):
) )
def resize_image(img, fx, fy): def resize_image(img, fx=1, fy=1):
return cv2.resize( return cv2.resize(
img, img,
dsize=None, dsize=None,
...@@ -79,7 +78,6 @@ def scale_image(img): ...@@ -79,7 +78,6 @@ def scale_image(img):
im_size_max = np.max(img.shape[:2]) im_size_max = np.max(img.shape[:2])
for target_size in cfg.TEST.SCALES: for target_size in cfg.TEST.SCALES:
im_scale = float(target_size) / float(im_size_min) im_scale = float(target_size) / float(im_size_min)
# Prevent the biggest axis from being more than MAX_SIZE
if np.round(im_scale * im_size_max) > cfg.TEST.MAX_SIZE: if np.round(im_scale * im_size_max) > cfg.TEST.MAX_SIZE:
im_scale = float(cfg.TEST.MAX_SIZE) / float(im_size_max) im_scale = float(cfg.TEST.MAX_SIZE) / float(im_size_max)
processed_ims.append( processed_ims.append(
...@@ -91,17 +89,16 @@ def scale_image(img): ...@@ -91,17 +89,16 @@ def scale_image(img):
)) ))
ims_scales.append(im_scale) ims_scales.append(im_scale)
else: else:
# Scale image along the longest side # Scale image into a square
im_size_max = np.max(img.shape[:2])
for target_size in cfg.TEST.SCALES: for target_size in cfg.TEST.SCALES:
im_scale = float(target_size) / float(im_size_max) im_scale_h = float(target_size) / img.shape[0]
im_scale_w = float(target_size) / img.shape[1]
processed_ims.append( processed_ims.append(
cv2.resize( cv2.resize(
img, img,
dsize=None, dsize=(target_size, target_size),
fx=im_scale, fy=im_scale,
interpolation=cv2.INTER_LINEAR, interpolation=cv2.INTER_LINEAR,
)) ))
ims_scales.append(im_scale) ims_scales.append([im_scale_h, im_scale_w])
return processed_ims, ims_scales return processed_ims, ims_scales
# ------------------------------------------------------------ # ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd. # Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
# #
# Licensed under the BSD 2-Clause License. # Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License # You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See, # along with the software. If not, See,
# #
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# Codes are based on: # Codes are based on:
# #
# <https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/platform/tf_logging.py> # <https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/platform/tf_logging.py>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
import inspect import inspect
import logging as _logging import logging as _logging
import os import os
import sys as _sys import sys as _sys
import threading import threading
_logger = None _logger = None
_is_root = True _is_root = True
_logger_lock = threading.Lock() _logger_lock = threading.Lock()
def get_logger(): def get_logger():
global _logger global _logger
# Use double-checked locking to avoid taking lock unnecessarily. # Use double-checked locking to avoid taking lock unnecessarily.
if _logger: if _logger:
return _logger return _logger
_logger_lock.acquire() _logger_lock.acquire()
try: try:
if _logger: if _logger:
return _logger return _logger
logger = _logging.getLogger('detectron') logger = _logging.getLogger('SeetaDet')
logger.setLevel('INFO') logger.setLevel('INFO')
logger.propagate = False logger.propagate = False
if True: if True:
# Determine whether we are in an interactive environment # Determine whether we are in an interactive environment
_interactive = False _interactive = False
try: try:
# This is only defined in interactive shells. # This is only defined in interactive shells.
if _sys.ps1: if _sys.ps1:
_interactive = True _interactive = True
except AttributeError: except AttributeError:
# Even now, we may be in an interactive shell with `python -i`. # Even now, we may be in an interactive shell with `python -i`.
_interactive = _sys.flags.interactive _interactive = _sys.flags.interactive
# If we are in an interactive environment (like Jupyter), set loglevel # If we are in an interactive environment (like Jupyter), set loglevel
# to INFO and pipe the output to stdout. # to INFO and pipe the output to stdout.
if _interactive: if _interactive:
logger.setLevel('INFO') logger.setLevel('INFO')
_logging_target = _sys.stdout _logging_target = _sys.stdout
else: else:
_logging_target = _sys.stderr _logging_target = _sys.stderr
# Add the output handler. # Add the output handler.
_handler = _logging.StreamHandler(_logging_target) _handler = _logging.StreamHandler(_logging_target)
_handler.setFormatter(_logging.Formatter('%(levelname)s %(message)s')) _handler.setFormatter(_logging.Formatter('%(levelname)s %(message)s'))
logger.addHandler(_handler) logger.addHandler(_handler)
_logger = logger _logger = logger
return _logger return _logger
finally: finally:
_logger_lock.release() _logger_lock.release()
def _detailed_msg(msg): def _detailed_msg(msg):
file, lineno = inspect.stack()[:3][2][1:3] file, lineno = inspect.stack()[:3][2][1:3]
return "{}:{}] {}".format(os.path.split(file)[-1], lineno, msg) return "{}:{}] {}".format(os.path.split(file)[-1], lineno, msg)
def log(level, msg, *args, **kwargs): def log(level, msg, *args, **kwargs):
get_logger().log(level, _detailed_msg(msg), *args, **kwargs) get_logger().log(level, _detailed_msg(msg), *args, **kwargs)
def debug(msg, *args, **kwargs): def debug(msg, *args, **kwargs):
if is_root(): if is_root():
get_logger().debug(_detailed_msg(msg), *args, **kwargs) get_logger().debug(_detailed_msg(msg), *args, **kwargs)
def error(msg, *args, **kwargs): def error(msg, *args, **kwargs):
get_logger().error(_detailed_msg(msg), *args, **kwargs) get_logger().error(_detailed_msg(msg), *args, **kwargs)
assert 0 assert 0
def fatal(msg, *args, **kwargs): def fatal(msg, *args, **kwargs):
get_logger().fatal(_detailed_msg(msg), *args, **kwargs) get_logger().fatal(_detailed_msg(msg), *args, **kwargs)
assert 0 assert 0
def info(msg, *args, **kwargs): def info(msg, *args, **kwargs):
if is_root(): if is_root():
get_logger().info(_detailed_msg(msg), *args, **kwargs) get_logger().info(_detailed_msg(msg), *args, **kwargs)
def warn(msg, *args, **kwargs): def warn(msg, *args, **kwargs):
if is_root(): if is_root():
get_logger().warn(_detailed_msg(msg), *args, **kwargs) get_logger().warn(_detailed_msg(msg), *args, **kwargs)
def warning(msg, *args, **kwargs): def warning(msg, *args, **kwargs):
if is_root(): if is_root():
get_logger().warning(_detailed_msg(msg), *args, **kwargs) get_logger().warning(_detailed_msg(msg), *args, **kwargs)
def get_verbosity(): def get_verbosity():
"""Return how much logging output will be produced.""" """Return how much logging output will be produced."""
return get_logger().getEffectiveLevel() return get_logger().getEffectiveLevel()
def set_verbosity(v): def set_verbosity(v):
"""Sets the threshold for what messages will be logged.""" """Sets the threshold for what messages will be logged."""
get_logger().setLevel(v) get_logger().setLevel(v)
def set_root_logger(is_root=True): def set_root_logger(is_root=True):
global _is_root global _is_root
_is_root = is_root _is_root = is_root
def is_root(): def is_root():
return _is_root return _is_root
...@@ -21,7 +21,7 @@ import cv2 ...@@ -21,7 +21,7 @@ import cv2
import numpy as np import numpy as np
import PIL.Image import PIL.Image
from lib.utils import boxes as box_util from seetadet.utils import boxes as box_util
def dismantle_masks(gt_boxes, gt_masks, num_images): def dismantle_masks(gt_boxes, gt_masks, num_images):
......
...@@ -17,44 +17,53 @@ from __future__ import absolute_import ...@@ -17,44 +17,53 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
from lib.core.config import cfg from seetadet.modules import det
from seetadet.utils import env
try: try:
from lib.nms.cpu_nms import cpu_nms, cpu_soft_nms from seetadet.utils.cython_nms import cpu_nms
except ImportError as e: from seetadet.utils.cython_nms import cpu_soft_nms
print('Failed to import cpu nms. Error: {0}'.format(str(e))) except ImportError:
cpu_nms = cpu_soft_nms = print
try:
from lib.nms.gpu_nms import gpu_nms def gpu_nms(dets, thresh):
except ImportError as e: """Filter out the detections using GPU-NMS."""
print('Failed to import gpu nms. Error: {0}'.format(str(e))) if dets.shape[0] == 0:
return []
scores = dets[:, 4]
order = scores.argsort()[::-1]
sorted_dets = env.new_tensor(dets[order, :])
keep = det.nms(sorted_dets, iou_threshold=thresh).numpy()
return order[keep]
def nms(detections, thresh, force_cpu=False): def nms(dets, thresh):
"""Perform either CPU or GPU Hard-NMS.""" """Filter out the detections using NMS."""
if detections.shape[0] == 0: if dets.shape[0] == 0:
return [] return []
if cfg.USE_GPU_NMS and not force_cpu: if cpu_nms is print:
return gpu_nms(detections, thresh, device_id=cfg.GPU_ID) raise ImportError('Failed to load <cython_nms> library.')
else: return cpu_nms(dets, thresh)
return cpu_nms(detections, thresh)
def soft_nms( def soft_nms(
detections, dets,
thresh, thresh,
method='linear', method='linear',
sigma=0.5, sigma=0.5,
score_thresh=0.001, score_thresh=0.001,
): ):
"""Perform CPU Soft-NMS.""" """Filter out the detections using Soft-NMS."""
if detections.shape[0] == 0: if dets.shape[0] == 0:
return [] return []
if cpu_soft_nms is print:
raise ImportError('Failed to load <cython_nms> library.')
methods = {'hard': 0, 'linear': 1, 'gaussian': 2} methods = {'hard': 0, 'linear': 1, 'gaussian': 2}
if method not in methods: if method not in methods:
raise ValueError('Unknown soft nms method:', method) raise ValueError('Unknown soft nms method:', method)
return cpu_soft_nms( return cpu_soft_nms(
detections, dets,
thresh, thresh,
methods[method], methods[method],
sigma, sigma,
......
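A usage sketch for the NMS wrappers above (assuming the `cython_nms` extension was built; otherwise `nms`/`soft_nms` raise ImportError as shown):

```python
import numpy as np

dets = np.array([
    [0, 0, 10, 10, 0.9],
    [1, 1, 10, 10, 0.8],
    [20, 20, 30, 30, 0.7],
], dtype=np.float32)
keep = nms(dets, thresh=0.5)                   # hard suppression
keep_soft = soft_nms(dets, thresh=0.5,         # decays scores instead
                     method='gaussian', sigma=0.5)
```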
...@@ -30,8 +30,8 @@ import matplotlib.pyplot as plt ...@@ -30,8 +30,8 @@ import matplotlib.pyplot as plt
from matplotlib.patches import Polygon from matplotlib.patches import Polygon
import numpy as np import numpy as np
from lib.utils.colormap import colormap from seetadet.utils.colormap import colormap
from lib.utils.boxes import expand_boxes from seetadet.utils.boxes import expand_boxes
plt.rcParams['pdf.fonttype'] = 42 # For editing in Adobe Illustrator plt.rcParams['pdf.fonttype'] = 42 # For editing in Adobe Illustrator
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import shutil
import setuptools
import setuptools.command.install
import sys
import subprocess
def clean():
"""Remove the work directories."""
if os.path.exists('build'):
shutil.rmtree('build')
if os.path.exists('seeta_det.egg-info'):
shutil.rmtree('seeta_det.egg-info')
def configure():
"""Prepare the package files."""
# Compile cxx sources
py_exec = sys.executable
if subprocess.call(
'cd csrc/cxx && '
'{} setup.py build_ext -b ../ --no-python-abi-suffix=0 -j 4 &&'
'{} setup.py clean'.format(py_exec, py_exec), shell=True
) > 0:
raise RuntimeError('Failed to build the cxx sources.')
# Compile pyx sources
if subprocess.call(
'cd csrc/pyx && '
'{} setup.py build_ext -b ../ --cython-c-in-temp -j 4 &&'
'{} setup.py clean'.format(py_exec, py_exec), shell=True,
) > 0:
raise RuntimeError('Failed to build the pyx sources.')
# Copy the pre-built libraries
for root, _, files in os.walk('csrc/install'):
root = root[len('csrc/install/'):]
for file in files:
src = os.path.join(root, file)
dest = src.replace('lib', 'seetadet')
if os.path.exists(dest):
os.remove(dest)
shutil.copy(os.path.join('csrc/install', src), dest)
shutil.rmtree('csrc/install')
class install(setuptools.command.install.install):
"""Old-style command to prevent from installing egg."""
def run(self):
setuptools.command.install.install.run(self)
def find_packages():
    """Return the Python packages to be installed."""
    packages = []
    for root, _, files in os.walk('seetadet'):
        if os.path.exists(os.path.join(root, '__init__.py')):
            packages.append(root)
    return packages
def find_package_data():
    """Return the external data (compiled extensions) to be installed."""
    libraries = []
    for root, _, files in os.walk('seetadet'):
        root = root[len('seetadet/'):]
        for file in files:
            if file.endswith('.so') or file.endswith('.pyd'):
                libraries.append(os.path.join(root, file))
    return libraries
configure()
setuptools.setup(
    name='seeta-det',
    version='0.4.0',
    description='SeetaDet: A platform implementing popular object detection algorithms.',
    url='https://gitlab.seetatech.com/seetaresearch/SeetaDet',
    author='SeetaTech',
    license='BSD 2-Clause',
    packages=find_packages(),
    package_data={'seetadet': find_package_data()},
    package_dir={'seetadet': 'seetadet'},
    cmdclass={'install': install},
    install_requires=['opencv-python', 'Pillow'],
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'Intended Audience :: Developers',
        'Intended Audience :: Education',
        'Intended Audience :: Science/Research',
        'License :: OSI Approved :: BSD License',
        'Programming Language :: C++',
        'Programming Language :: Python',
        'Topic :: Scientific/Engineering',
        'Topic :: Scientific/Engineering :: Mathematics',
        'Topic :: Scientific/Engineering :: Artificial Intelligence',
        'Topic :: Software Development',
        'Topic :: Software Development :: Libraries',
        'Topic :: Software Development :: Libraries :: Python Modules',
    ],
)
clean()
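As a sanity check on the two helpers above, a hedged sketch of their results once `configure()` has copied the compiled extensions into the source tree (file names are illustrative; real ones may carry platform and ABI suffixes):

```python
# Hypothetical tree after configure():
#   seetadet/__init__.py
#   seetadet/utils/__init__.py
#   seetadet/utils/cython_nms.so   <- copied out of csrc/install
find_packages()      # -> ['seetadet', 'seetadet/utils']
find_package_data()  # -> ['utils/cython_nms.so'], paths relative to the package root
```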
@@ -21,10 +21,11 @@ import argparse
import dragon.vm.torch as torch
import pprint

from seetadet import onnx as _
from seetadet.core.config import cfg
from seetadet.core.coordinator import Coordinator
from seetadet.modeling.detector import new_detector
from seetadet.utils import logger


def parse_args():
@@ -71,8 +72,8 @@ if __name__ == '__main__':
                .format(coordinator.exports_dir()))
    detector = new_detector(cfg.GPU_ID, checkpoint)

    data = torch.zeros(*args.input_shape, dtype='uint8')
    ims_info = torch.zeros(args.input_shape[0], 3, dtype='float32')

    torch.onnx.export(
        model=detector,
...
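The export inputs now pass Dragon's dtype strings directly instead of chaining `.byte()` and `.float()` casts. A minimal sketch with an illustrative `--input_shape` of `1 3 512 512` (reading `ims_info` as per-image `(height, width, scale)` follows the detectron-style convention and is an assumption here):

```python
import dragon.vm.torch as torch

# dtype is given as a string in Dragon's torch-style API.
data = torch.zeros(1, 3, 512, 512, dtype='uint8')  # dummy image batch
ims_info = torch.zeros(1, 3, dtype='float32')      # per-image (height, width, scale)
```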
@@ -21,11 +21,11 @@ import argparse
import dragon
import numpy

from seetadet.core.config import cfg
from seetadet.core.coordinator import Coordinator
from seetadet.core.train import train_net
from seetadet.datasets.factory import get_dataset
from seetadet.utils import logger


def parse_args():
@@ -79,12 +79,12 @@ if __name__ == '__main__':
    # Fix the random seed for reproducibility
    numpy.random.seed(cfg.RNG_SEED)
    dragon.random.set_seed(cfg.RNG_SEED)

    # Inspect the dataset
    dataset = get_dataset(cfg.TRAIN.DATASET)
    logger.info('Dataset({}): {} images will be used to train.'
                .format(cfg.TRAIN.DATASET, dataset.num_images))

    # Ready to train the network
    logger.info('Output will be saved to `{:s}`'
...
@@ -20,12 +20,12 @@ sys.path.insert(0, '..')
import argparse
import pprint

from seetadet.core import test_engine
from seetadet.core.config import cfg
from seetadet.core.coordinator import Coordinator
from seetadet.core.test import TestServer
from seetadet.datasets.factory import get_dataset
from seetadet.utils import logger


def parse_args():
@@ -81,11 +81,11 @@ if __name__ == '__main__':
    if checkpoint is None:
        raise RuntimeError('The checkpoint of global step {} does not exist.'.format(args.iter))

    # Inspect the dataset
    dataset = get_dataset(cfg.TEST.DATASET)
    cfg.TEST.PROTOCOL = 'dump' if args.dump else cfg.TEST.PROTOCOL
    logger.info('Dataset({}): {} images will be used to test.'
                .format(cfg.TEST.DATASET, dataset.num_images))

    # Ready to test the network
    output_dir = coordinator.results_dir(checkpoint, args.output_dir)
...
@@ -20,8 +20,8 @@ sys.path.insert(0, '..')
import argparse
import numpy

from seetadet.core.coordinator import Coordinator
from seetadet.utils import logger


def parse_args():
...
@@ -22,11 +22,11 @@ import dragon
import numpy
import pprint

from seetadet.core.config import cfg
from seetadet.core.coordinator import Coordinator
from seetadet.core.train import train_net
from seetadet.datasets.factory import get_dataset
from seetadet.utils import logger


def parse_args():
@@ -59,7 +59,7 @@ def mpi_train(cfg_file, exp_dir):
    """
    import subprocess
    args = 'mpirun --allow-run-as-root -n {} --bind-to none '.format(cfg.NUM_GPUS)
    args += '{} {} '.format(sys.executable, 'mpi_train.py')
    args += '--cfg {} --exp_dir {} '.format(osp.abspath(cfg_file), exp_dir)
    return subprocess.call(args, shell=True)
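The new `--bind-to none` flag stops Open MPI from pinning each rank to a single core, which would otherwise throttle the multi-threaded data loaders. An illustrative expansion of the command string built above, assuming `cfg.NUM_GPUS == 4` and made-up paths:

```python
import subprocess

args = 'mpirun --allow-run-as-root -n 4 --bind-to none '
args += '/usr/bin/python mpi_train.py '
args += '--cfg /abs/path/model.yml --exp_dir experiments/example '
subprocess.call(args, shell=True)  # launches one training process per GPU
```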
@@ -84,12 +84,12 @@ if __name__ == '__main__':
    # Fix the random seed for reproducibility
    numpy.random.seed(cfg.RNG_SEED)
    dragon.random.set_seed(cfg.RNG_SEED)

    # Inspect the dataset
    dataset = get_dataset(cfg.TRAIN.DATASET)
    logger.info('Dataset({}): {} images will be used to train.'
                .format(cfg.TRAIN.DATASET, dataset.num_images))

    # Ready to train the network
    logger.info('Output will be saved to `{:s}`'
...