Commit f4ecc7c7 by Ting PAN

Change the code structure

1 parent d3ed62db
Showing with 1750 additions and 2619 deletions
------------------------------------------------------------------------
The list of most significant changes made over time in SeetaDet.
SeetaDet 0.4.0 (20200408)
Dragon Minimum Required (Version 0.3.0.dev20200408)
Changes:
Preview Features:
- Optimize the code structure.
- DALI support for SSD, RetinaNet, and Faster-RCNN.
- Use KPLRecord instead of SeetaRecord.
Bugs fixed:
- Fix the frozen Affine issue.
------------------------------------------------------------------------
SeetaDet 0.3.0 (20191121)
Dragon Minimum Required (Version 0.3.0.dev20191121)
......
@@ -2,8 +2,8 @@
## WHAT's SeetaDet?

- SeetaDet contains many useful object detectors, including R-CNN series, SSD,
- and the recent RetinaNet.
+ SeetaDet is a platform implementing popular object detection algorithms,
+ including R-CNN series, SSD, and RetinaNet.

We have achieved the same or higher performance than the baselines reported in the original papers.
@@ -14,22 +14,33 @@ The torch-style codes help us to simplify the hierarchical pipeline of modern detection.
## Requirements

- seeta-dragon >= 0.3.0.dev20191121
+ seeta-dragon >= 0.3.0.dev20200408
## Installation

- #### 1. Install the required python packages
+ #### Build From Source
+ If you prefer to develop modules as well as run experiments, the
+ following commands will build but not install to ***site-packages***:
```bash
- pip install cython pyyaml matplotlib
- pip install opencv-python Pillow
+ cd SeetaDet && python setup.py build
```
- #### 2. Compile the C Extensions
+ #### Install From Source
+ Clone this repository to local disk and install:
+ ```bash
+ cd SeetaDet && python setup.py install
+ ```
+ #### Install From Git
+ You can also install it from the remote repository:
```bash
- cd SeetaDet/compile
- bash ./make.sh
+ pip install git+https://gitlab.seetatech.com/seetaresearch/SeetaDet.git@master
```
## Quick Start
@@ -37,7 +48,7 @@ bash ./make.sh
#### Train a detection model
```bash
- cd SeetaDet/tools
+ cd tools
python train.py --cfg <MODEL_YAML>
```
@@ -46,20 +57,20 @@ We have provided the default YAML examples into ``SeetaDet/configs``.
#### Test a detection model
```bash
- cd SeetaDet/tools
+ cd tools
python test.py --cfg <MODEL_YAML> --exp_dir <EXP_DIR> --iter <ITERATION>
```
Or
```bash
- cd SeetaDet/tools
+ cd tools
python test_all.py --cfg <MODEL_YAML> --exp_dir <EXP_DIR>
```
#### Export a detection model to ONNX
```bash
- cd SeetaDet/tools
+ cd tools
python export.py --cfg <MODEL_YAML> --exp_dir <EXP_DIR> --iter <ITERATION>
```
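After export, the model can be run with an ONNX runtime. Below is a minimal sketch using onnxruntime; the file name `detector.onnx`, the input shape, and the dummy preprocessing are assumptions and must be checked against the actual exported graph:
```python
# A sketch only: 'detector.onnx' is a hypothetical output of export.py, and
# real preprocessing must match what the model saw during training.
import numpy as np
import onnxruntime

sess = onnxruntime.InferenceSession('detector.onnx')
input_name = sess.get_inputs()[0].name
# Dummy 800x1333 input batch; shape/layout assumed from the COCO configs.
image = np.random.rand(1, 3, 800, 1333).astype(np.float32)
outputs = sess.run(None, {input_name: image})
print([o.shape for o in outputs])
```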
......
PROJECT(gpu_nms)
CMAKE_MINIMUM_REQUIRED(VERSION 3.0.2)
# ---------------- User Config ----------------
# Set your python "interpreter" if necessary;
# if not, a default interpreter will be used.
# Here are several examples:
# set(PYTHON_EXECUTABLE /usr/bin/python) # Linux & OSX, Builtin Python
# set(PYTHON_EXECUTABLE /X/anaconda/bin/python) # Linux & OSX, Anaconda
# set(PYTHON_EXECUTABLE X:/Anaconda/python) # Win, Anaconda
# Set CUDA compiling architecture
# Remove "compute_70/sm_70" if using CUDA 8.0
set(CUDA_ARCH -gencode arch=compute_30,code=sm_30
-gencode arch=compute_35,code=sm_35
-gencode arch=compute_50,code=sm_50
-gencode arch=compute_60,code=sm_60
-gencode arch=compute_70,code=sm_70)
# ---------------- User Config ----------------
# ---[ Dependencies
include(${PROJECT_SOURCE_DIR}/cmake/FindPythonLibs.cmake)
include(${PROJECT_SOURCE_DIR}/cmake/FindNumPy.cmake)
FIND_PACKAGE(CUDA REQUIRED)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
message(STATUS "C++11 support has been enabled by default.")
# ---[ Config types
set(CMAKE_BUILD_TYPE Release CACHE STRING "set build type to release")
set(CMAKE_CONFIGURATION_TYPES Release CACHE STRING "set build type to release" FORCE)
# ---[ Includes
set(INCLUDE_DIR ${PROJECT_SOURCE_DIR}/include)
include_directories(${INCLUDE_DIR})
include_directories(${PROJECT_SOURCE_DIR}/src)
include_directories(${PYTHON_INCLUDE_DIRS})
include_directories(${NUMPY_INCLUDE_DIR})
include_directories(${CUDA_INCLUDE_DIRS})
# ---[ libs
link_directories(${PYTHON_LIBRARIES})
# ---[ Install
set(CMAKE_INSTALL_PREFIX ${PROJECT_SOURCE_DIR} CACHE STRING "set install prefix" FORCE)
set(CMAKE_SHARED_LIBRARY_PREFIX "")
# ---[ Flags
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${CUDA_ARCH}")
if(WIN32)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP /O2 /Oi /GL /Ot /Gy")
endif()
if(UNIX)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -s -fPIC")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -s -w -fPIC -O3 -m64 -std=c++11")
endif()
# ---[ Files
set(HEADER_FILES gpu_nms.h)
set(SRC_FILES gpu_nms.cpp nms_kernel.cu)
# ---[ Add Target
CUDA_ADD_LIBRARY(${PROJECT_NAME} SHARED ${HEADER_FILES} ${SRC_FILES})
# ---[ Link Libs
TARGET_LINK_LIBRARIES(${PROJECT_NAME} ${CUDA_LIBRARIES} ${CUDA_cublas_LIBRARY} ${CUDA_curand_LIBRARY})
if(WIN32)
TARGET_LINK_LIBRARIES(${PROJECT_NAME} ${PYTHON_LIBRARIES})
endif()
# ---[ Install Target
set_target_properties(${PROJECT_NAME} PROPERTIES OUTPUT_NAME "gpu_nms")
install (TARGETS ${PROJECT_NAME} DESTINATION ${PROJECT_BINARY_DIR}/../install/lib/nms)
# - Find the NumPy libraries
# This module finds if NumPy is installed, and sets the following variables
# indicating where it is.
#
# TODO: Update to provide the libraries and paths for linking npymath lib.
#
# NUMPY_FOUND - was NumPy found
# NUMPY_VERSION - the version of NumPy found as a string
# NUMPY_VERSION_MAJOR - the major version number of NumPy
# NUMPY_VERSION_MINOR - the minor version number of NumPy
# NUMPY_VERSION_PATCH - the patch version number of NumPy
# NUMPY_VERSION_DECIMAL - e.g. version 1.6.1 is 10601
# NUMPY_INCLUDE_DIR - path to the NumPy include files
unset(NUMPY_VERSION)
unset(NUMPY_INCLUDE_DIR)
if(PYTHONINTERP_FOUND)
execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
"import numpy as n; print(n.__version__); print(n.get_include());"
RESULT_VARIABLE __result
OUTPUT_VARIABLE __output
OUTPUT_STRIP_TRAILING_WHITESPACE)
if(__result MATCHES 0)
string(REGEX REPLACE ";" "\\\\;" __values ${__output})
string(REGEX REPLACE "\r?\n" ";" __values ${__values})
list(GET __values 0 NUMPY_VERSION)
list(GET __values 1 NUMPY_INCLUDE_DIR)
string(REGEX MATCH "^([0-9])+\\.([0-9])+\\.([0-9])+" __ver_check "${NUMPY_VERSION}")
if(NOT "${__ver_check}" STREQUAL "")
set(NUMPY_VERSION_MAJOR ${CMAKE_MATCH_1})
set(NUMPY_VERSION_MINOR ${CMAKE_MATCH_2})
set(NUMPY_VERSION_PATCH ${CMAKE_MATCH_3})
math(EXPR NUMPY_VERSION_DECIMAL
"(${NUMPY_VERSION_MAJOR} * 10000) + (${NUMPY_VERSION_MINOR} * 100) + ${NUMPY_VERSION_PATCH}")
string(REGEX REPLACE "\\\\" "/" NUMPY_INCLUDE_DIR ${NUMPY_INCLUDE_DIR})
else()
unset(NUMPY_VERSION)
unset(NUMPY_INCLUDE_DIR)
message(STATUS "Requested NumPy version and include path, but got instead:\n${__output}\n")
endif()
endif()
else()
message("Can not find Python interpretator.")
message(FATAL_ERROR "Do you set PYTHON_EXECUTABLE correctly?")
endif()
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NumPy REQUIRED_VARS NUMPY_INCLUDE_DIR NUMPY_VERSION
VERSION_VAR NUMPY_VERSION)
if(NUMPY_FOUND)
message(STATUS "NumPy ver. ${NUMPY_VERSION} found (include: ${NUMPY_INCLUDE_DIR})")
endif()
\ No newline at end of file
# - Find python libraries
# This module finds the libraries corresponding to the Python interpreter
# FindPythonInterp provides.
# This code sets the following variables:
#
# PYTHONLIBS_FOUND - have the Python libs been found
# PYTHON_PREFIX - path to the Python installation
# PYTHON_LIBRARIES - path to the python library
# PYTHON_INCLUDE_DIRS - path to where Python.h is found
# PYTHON_MODULE_EXTENSION - lib extension, e.g. '.so' or '.pyd'
# PYTHON_MODULE_PREFIX - lib name prefix: usually an empty string
# PYTHON_SITE_PACKAGES - path to installation site-packages
# PYTHON_IS_DEBUG - whether the Python interpreter is a debug build
#
# Thanks to talljimbo for the patch adding the 'LDVERSION' config
# variable usage.
#=============================================================================
# Copyright 2001-2009 Kitware, Inc.
# Copyright 2012 Continuum Analytics, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# * Neither the names of Kitware, Inc., the Insight Software Consortium,
# nor the names of their contributors may be used to endorse or promote
# products derived from this software without specific prior written
# permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#=============================================================================
# Checking for the extension makes sure that `LibsNew` was found and not just `Libs`.
if(PYTHONLIBS_FOUND AND PYTHON_MODULE_EXTENSION)
return()
endif()
# Use the Python interpreter to find the libs.
if(PythonLibsNew_FIND_REQUIRED)
find_package(PythonInterp ${PythonLibsNew_FIND_VERSION} REQUIRED)
else()
find_package(PythonInterp ${PythonLibsNew_FIND_VERSION})
endif()
if(NOT PYTHONINTERP_FOUND)
set(PYTHONLIBS_FOUND FALSE)
return()
endif()
# According to http://stackoverflow.com/questions/646518/python-how-to-detect-debug-interpreter
# testing whether sys has the gettotalrefcount function is a reliable, cross-platform
# way to detect a CPython debug interpreter.
#
# The library suffix is from the config var LDVERSION sometimes, otherwise
# VERSION. VERSION will typically be like "2.7" on unix, and "27" on windows.
execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
"from distutils import sysconfig as s;import sys;import struct;
print('.'.join(str(v) for v in sys.version_info));
print(sys.prefix);
print(s.get_python_inc(plat_specific=True));
print(s.get_python_lib(plat_specific=True));
print(s.get_config_var('SO'));
print(hasattr(sys, 'gettotalrefcount')+0);
print(struct.calcsize('@P'));
print(s.get_config_var('LDVERSION') or s.get_config_var('VERSION'));
print(s.get_config_var('LIBDIR') or '');
print(s.get_config_var('MULTIARCH') or '');
"
RESULT_VARIABLE _PYTHON_SUCCESS
OUTPUT_VARIABLE _PYTHON_VALUES
ERROR_VARIABLE _PYTHON_ERROR_VALUE)
if(NOT _PYTHON_SUCCESS MATCHES 0)
if(PythonLibsNew_FIND_REQUIRED)
message(FATAL_ERROR
"Python config failure:\n${_PYTHON_ERROR_VALUE}")
endif()
set(PYTHONLIBS_FOUND FALSE)
return()
endif()
# Convert the process output into a list
string(REGEX REPLACE ";" "\\\\;" _PYTHON_VALUES ${_PYTHON_VALUES})
string(REGEX REPLACE "\n" ";" _PYTHON_VALUES ${_PYTHON_VALUES})
list(GET _PYTHON_VALUES 0 _PYTHON_VERSION_LIST)
list(GET _PYTHON_VALUES 1 PYTHON_PREFIX)
list(GET _PYTHON_VALUES 2 PYTHON_INCLUDE_DIR)
list(GET _PYTHON_VALUES 3 PYTHON_SITE_PACKAGES)
list(GET _PYTHON_VALUES 4 PYTHON_MODULE_EXTENSION)
list(GET _PYTHON_VALUES 5 PYTHON_IS_DEBUG)
list(GET _PYTHON_VALUES 6 PYTHON_SIZEOF_VOID_P)
list(GET _PYTHON_VALUES 7 PYTHON_LIBRARY_SUFFIX)
list(GET _PYTHON_VALUES 8 PYTHON_LIBDIR)
list(GET _PYTHON_VALUES 9 PYTHON_MULTIARCH)
# Make sure the Python has the same pointer-size as the chosen compiler
# Skip if CMAKE_SIZEOF_VOID_P is not defined
if(CMAKE_SIZEOF_VOID_P AND (NOT "${PYTHON_SIZEOF_VOID_P}" STREQUAL "${CMAKE_SIZEOF_VOID_P}"))
if(PythonLibsNew_FIND_REQUIRED)
math(EXPR _PYTHON_BITS "${PYTHON_SIZEOF_VOID_P} * 8")
math(EXPR _CMAKE_BITS "${CMAKE_SIZEOF_VOID_P} * 8")
message(FATAL_ERROR
"Python config failure: Python is ${_PYTHON_BITS}-bit, "
"chosen compiler is ${_CMAKE_BITS}-bit")
endif()
set(PYTHONLIBS_FOUND FALSE)
return()
endif()
# The built-in FindPython didn't always give the version numbers
string(REGEX REPLACE "\\." ";" _PYTHON_VERSION_LIST ${_PYTHON_VERSION_LIST})
list(GET _PYTHON_VERSION_LIST 0 PYTHON_VERSION_MAJOR)
list(GET _PYTHON_VERSION_LIST 1 PYTHON_VERSION_MINOR)
list(GET _PYTHON_VERSION_LIST 2 PYTHON_VERSION_PATCH)
# Make sure all directory separators are '/'
string(REGEX REPLACE "\\\\" "/" PYTHON_PREFIX ${PYTHON_PREFIX})
string(REGEX REPLACE "\\\\" "/" PYTHON_INCLUDE_DIR ${PYTHON_INCLUDE_DIR})
string(REGEX REPLACE "\\\\" "/" PYTHON_SITE_PACKAGES ${PYTHON_SITE_PACKAGES})
if(CMAKE_HOST_WIN32)
set(PYTHON_LIBRARY
"${PYTHON_PREFIX}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib")
# when run in a venv, PYTHON_PREFIX points to it. But the libraries remain in the
# original python installation. They may be found relative to PYTHON_INCLUDE_DIR.
if(NOT EXISTS "${PYTHON_LIBRARY}")
get_filename_component(_PYTHON_ROOT ${PYTHON_INCLUDE_DIR} DIRECTORY)
set(PYTHON_LIBRARY
"${_PYTHON_ROOT}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib")
endif()
# raise an error if the python libs are still not found.
if(NOT EXISTS "${PYTHON_LIBRARY}")
message(FATAL_ERROR "Python libraries not found")
endif()
else()
if(PYTHON_MULTIARCH)
set(_PYTHON_LIBS_SEARCH "${PYTHON_LIBDIR}/${PYTHON_MULTIARCH}" "${PYTHON_LIBDIR}")
else()
set(_PYTHON_LIBS_SEARCH "${PYTHON_LIBDIR}")
endif()
#message(STATUS "Searching for Python libs in ${_PYTHON_LIBS_SEARCH}")
# Probably this needs to be more involved. It would be nice if the config
# information the python interpreter itself gave us were more complete.
find_library(PYTHON_LIBRARY
NAMES "python${PYTHON_LIBRARY_SUFFIX}"
PATHS ${_PYTHON_LIBS_SEARCH}
NO_DEFAULT_PATH)
# If all else fails, just set the name/version and let the linker figure out the path.
if(NOT PYTHON_LIBRARY)
set(PYTHON_LIBRARY python${PYTHON_LIBRARY_SUFFIX})
endif()
endif()
MARK_AS_ADVANCED(
PYTHON_LIBRARY
PYTHON_INCLUDE_DIR
)
# We use PYTHON_INCLUDE_DIR, PYTHON_LIBRARY and PYTHON_DEBUG_LIBRARY for the
# cache entries because they are meant to specify the location of a single
# library. We now set the variables listed by the documentation for this
# module.
SET(PYTHON_INCLUDE_DIRS "${PYTHON_INCLUDE_DIR}")
SET(PYTHON_LIBRARIES "${PYTHON_LIBRARY}")
SET(PYTHON_DEBUG_LIBRARIES "${PYTHON_DEBUG_LIBRARY}")
find_package_message(PYTHON
"Found PythonLibs: ${PYTHON_LIBRARY}"
"${PYTHON_EXECUTABLE}${PYTHON_VERSION}")
set(PYTHONLIBS_FOUND TRUE)
void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num,
int boxes_dim, float nms_overlap_thresh, int device_id);
# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick
# --------------------------------------------------------
import numpy as np
cimport numpy as np
assert sizeof(int) == sizeof(np.int32_t)
cdef extern from "gpu_nms.h":
void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int)
def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, float thresh, int device_id=0):
cdef int boxes_num = dets.shape[0]
cdef int boxes_dim = dets.shape[1]
cdef int num_out
cdef np.ndarray[np.int32_t, ndim=1] \
keep = np.zeros(boxes_num, dtype=np.int32)
cdef np.ndarray[np.float32_t, ndim=1] \
scores = dets[:, 4]
cdef np.ndarray[np.intp_t, ndim=1] \
order = scores.argsort()[::-1]
cdef np.ndarray[np.float32_t, ndim=2] \
sorted_dets = dets[order, :]
_nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id)
keep = keep[:num_out]
return list(order[keep])
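For reference, a hypothetical call to the wrapper above. `dets` is an (N, 5) float32 array laid out as [x1, y1, x2, y2, score], as implied by `scores = dets[:, 4]`; the import path is assumed from the extension name in setup.py:
```python
import numpy as np
from install.deprecated.gpu_nms import gpu_nms  # module path assumed from setup.py

dets = np.array([[10, 10, 50, 50, 0.9],
                 [12, 12, 52, 52, 0.8],     # heavy overlap with box 0
                 [100, 100, 150, 150, 0.7]], dtype=np.float32)
keep = gpu_nms(dets, thresh=0.5, device_id=0)
print(keep)  # expected: [0, 2] -- box 1 is suppressed by box 0
```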
#!/bin/sh
# Delete cache
rm -r build install *.c *.cpp
# Compile cpp modules
python setup.py build_ext --inplace
# Compile cuda modules
cd build && cmake .. && make install && cd ..
# Copy to the library root
cp -r install/lib ../
// ------------------------------------------------------------
// Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
//
// Licensed under the BSD 2-Clause License.
// You should have received a copy of the BSD 2-Clause License
// along with the software. If not, See,
//
// <https://opensource.org/licenses/BSD-2-Clause>
//
// ------------------------------------------------------------
#include <cstdio>
#include <cstdlib>
#include <vector>
#include "gpu_nms.h"
#define CUDA_CHECK(condition) \
  /* Code block avoids redefinition of cudaError_t error */ \
  do { \
    cudaError_t error = condition; \
    if (error != cudaSuccess) { \
      fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(error)); \
      exit(EXIT_FAILURE); \
    } \
  } while (0)
void SetDevice(int device_id) {
int current_device;
CUDA_CHECK(cudaGetDevice(&current_device));
if (current_device == device_id) return;
CUDA_CHECK(cudaSetDevice(device_id));
}
#define DIV_UP(m,n) ((m) / (n) + ((m) % (n) > 0))
#define NMS_BLOCK_SIZE 64
template <typename T>
__device__ T iou(const T* A, const T* B) {
const T x1 = max(A[0], B[0]);
const T y1 = max(A[1], B[1]);
const T x2 = min(A[2], B[2]);
const T y2 = min(A[3], B[3]);
const T width = max((T)0, x2 - x1 + 1);
const T height = max((T)0, y2 - y1 + 1);
const T area = width * height;
const T A_area = (A[2] - A[0] + 1) * (A[3] - A[1] + 1);
const T B_area = (B[2] - B[0] + 1) * (B[3] - B[1] + 1);
return area / (A_area + B_area - area);
}
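As a sanity check on the device function above, the same IoU in plain Python (a sketch for verification, not part of the build); note the legacy "+1" integer-pixel width convention:
```python
def iou(a, b):
    """IoU of two [x1, y1, x2, y2] boxes with the legacy '+1' pixel widths."""
    w = max(0.0, min(a[2], b[2]) - max(a[0], b[0]) + 1)
    h = max(0.0, min(a[3], b[3]) - max(a[1], b[1]) + 1)
    inter = w * h
    area_a = (a[2] - a[0] + 1) * (a[3] - a[1] + 1)
    area_b = (b[2] - b[0] + 1) * (b[3] - b[1] + 1)
    return inter / (area_a + area_b - inter)

print(iou([0, 0, 9, 9], [5, 0, 14, 9]))  # 50 / 150 of two 10x10 boxes -> 0.333...
```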
template <typename T>
__global__ void nms_mask(const int num_boxes, const T nms_thresh,
const T* boxes, unsigned long long* mask) {
const int i_start = blockIdx.x * NMS_BLOCK_SIZE;
const int di_end = min(num_boxes - i_start, NMS_BLOCK_SIZE);
const int j_start = blockIdx.y * NMS_BLOCK_SIZE;
const int dj_end = min(num_boxes - j_start, NMS_BLOCK_SIZE);
const int num_blocks = DIV_UP(num_boxes, NMS_BLOCK_SIZE);
const int bid = blockIdx.x;
const int tid = threadIdx.x;
__shared__ T boxes_i[NMS_BLOCK_SIZE * 4];
if (tid < di_end) {
boxes_i[tid * 4 + 0] = boxes[(i_start + tid) * 5 + 0];
boxes_i[tid * 4 + 1] = boxes[(i_start + tid) * 5 + 1];
boxes_i[tid * 4 + 2] = boxes[(i_start + tid) * 5 + 2];
boxes_i[tid * 4 + 3] = boxes[(i_start + tid) * 5 + 3];
}
__syncthreads();
if (tid < dj_end) {
const T* const box_j = boxes + (j_start + tid) * 5;
unsigned long long mask_j = 0;
const int di_start = (i_start == j_start) ? (tid + 1) : 0;
for (int di = di_start; di < di_end; ++di)
if (iou(box_j, boxes_i + di * 4) > nms_thresh)
mask_j |= 1ULL << di;
mask[(j_start + tid) * num_blocks + bid] = mask_j;
}
}
template <typename T>
void ApplyNMS(const int num_boxes, const int max_keeps, const float thresh,
const T* boxes, int* keep_indices, int& num_keep) {
const int num_blocks = DIV_UP(num_boxes, NMS_BLOCK_SIZE);
const dim3 blocks(num_blocks, num_blocks);
size_t mask_nbytes = num_boxes * num_blocks * sizeof(unsigned long long);
size_t boxes_nbytes = num_boxes * 5 * sizeof(T);
void* boxes_dev, *mask_dev;
CUDA_CHECK(cudaMalloc(&boxes_dev, boxes_nbytes));
CUDA_CHECK(cudaMalloc(&mask_dev, mask_nbytes));
CUDA_CHECK(cudaMemcpy(boxes_dev, boxes, boxes_nbytes, cudaMemcpyHostToDevice));
nms_mask<T><<<blocks, NMS_BLOCK_SIZE>>>(num_boxes, thresh,
    (T*)boxes_dev,
    (unsigned long long*)mask_dev);
CUDA_CHECK(cudaPeekAtLastError());
std::vector<unsigned long long> mask_host(num_boxes * num_blocks);
CUDA_CHECK(cudaMemcpy(&mask_host[0], mask_dev, mask_nbytes, cudaMemcpyDeviceToHost));
std::vector<unsigned long long> dead_bit(num_blocks);
memset(&dead_bit[0], 0, sizeof(unsigned long long) * num_blocks);
int num_selected = 0;
for (int i = 0; i < num_boxes; ++i) {
const int nblock = i / NMS_BLOCK_SIZE;
const int inblock = i % NMS_BLOCK_SIZE;
if (!(dead_bit[nblock] & (1ULL << inblock))) {
keep_indices[num_selected++] = i;
unsigned long long* mask_i = &mask_host[0] + i * num_blocks;
for (int j = nblock; j < num_blocks; ++j) dead_bit[j] |= mask_i[j];
if (num_selected == max_keeps) break;
}
}
num_keep = num_selected;
CUDA_CHECK(cudaFree(mask_dev));
CUDA_CHECK(cudaFree(boxes_dev));
}
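The dead-bit loop above implements greedy NMS over score-sorted boxes: box i survives iff no previously kept box overlaps it beyond the threshold. A minimal Python reference (reusing the `iou` sketch above; for verification only, assuming boxes are pre-sorted by score):
```python
def nms_reference(boxes, thresh, max_keeps):
    """Greedy NMS over score-sorted boxes, mirroring the dead-bit loop."""
    keep = []
    for i in range(len(boxes)):
        # Keep box i only if every already-kept box overlaps it <= thresh.
        if all(iou(boxes[i], boxes[j]) <= thresh for j in keep):
            keep.append(i)
            if len(keep) == max_keeps:
                break
    return keep
```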
void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num,
int boxes_dim, float nms_overlap_thresh, int device_id) {
// set the device to use
SetDevice(device_id);
// apply gpu nms
ApplyNMS<float>(boxes_num, boxes_num, nms_overlap_thresh,
boxes_host, keep_out, *num_out);
}
\ No newline at end of file
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from distutils.extension import Extension
from distutils.core import setup
from Cython.Distutils import build_ext
import numpy as np
numpy_include = np.get_include()
ext_modules = [
Extension(
"install.lib.utils.cython_bbox",
["bbox.pyx"],
extra_compile_args=["-Wno-cpp", "-Wno-unused-function"],
include_dirs = [numpy_include]),
Extension(
"install.lib.nms.cpu_nms",
["cpu_nms.pyx"],
extra_compile_args=["-Wno-cpp", "-Wno-unused-function"],
include_dirs = [numpy_include]),
Extension(
"install.deprecated.gpu_nms",
["gpu_nms.pyx"],
extra_compile_args=["-Wno-cpp", "-Wno-unused-function"],
language='c++',
include_dirs = [numpy_include]),
Extension(
'install.lib.pycocotools._mask',
['../lib/pycocotools/maskApi.c', '../lib/pycocotools/_mask.pyx'],
include_dirs=[numpy_include, 'pycocotools'],
extra_compile_args=['-Wno-cpp', '-Wno-unused-function', '-std=c99']),
]
setup(name='Detectron', ext_modules=ext_modules, cmdclass={'build_ext': build_ext})
NUM_GPUS: 8
VIS: False
ENABLE_TENSOR_BOARD: False
MODEL:
TYPE: faster_rcnn
BACKBONE: resnet101.fpn
CLASSES: ['__background__',
'person', 'bicycle', 'car', 'motorcycle', 'airplane',
'bus', 'train', 'truck', 'boat', 'traffic light',
'fire hydrant', 'stop sign', 'parking meter', 'bench',
'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant',
'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife',
'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli',
'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop',
'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors',
'teddy bear', 'hair drier', 'toothbrush']
NUM_CLASSES: 81
SOLVER:
BASE_LR: 0.02
DECAY_STEPS: [60000, 80000]
MAX_STEPS: 90000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: coco_faster_rcnn
FRCNN:
ROI_XFORM_METHOD: RoIAlign
ROI_XFORM_RESOLUTION: 7
TRAIN:
WEIGHTS: '/model/R-101.Affine.pth'
DATABASE: '/data/coco_2014_trainval35k'
IMS_PER_BATCH: 2
USE_DIFF: False # Do not use crowd objects
BATCH_SIZE: 512
SCALES: [800]
MAX_SIZE: 1333
TEST:
DATABASE: '/data/coco_2014_minival'
JSON_FILE: '/data/instances_minival2014.json'
PROTOCOL: 'coco'
RPN_POST_NMS_TOP_N: 1000
SCALES: [800]
MAX_SIZE: 1333
NMS: 0.5
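For readers unfamiliar with these files, a quick way to inspect one is plain PyYAML; the file name below is an assumption, and SeetaDet's own cfg-merging machinery may differ:
```python
import yaml

# File name is hypothetical; the defaults live under SeetaDet/configs.
with open('coco_faster_rcnn_R-101-FPN_1x.yml') as f:
    cfg = yaml.safe_load(f)
print(cfg['SOLVER']['BASE_LR'])      # 0.02
print(len(cfg['MODEL']['CLASSES']))  # 81, matching NUM_CLASSES (incl. background)
```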
NUM_GPUS: 8
VIS: False
ENABLE_TENSOR_BOARD: False
MODEL:
TYPE: faster_rcnn
BACKBONE: resnet101.fpn
CLASSES: ['__background__',
'person', 'bicycle', 'car', 'motorcycle', 'airplane',
'bus', 'train', 'truck', 'boat', 'traffic light',
'fire hydrant', 'stop sign', 'parking meter', 'bench',
'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant',
'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife',
'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli',
'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop',
'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors',
'teddy bear', 'hair drier', 'toothbrush']
NUM_CLASSES: 81
SOLVER:
BASE_LR: 0.02
DECAY_STEPS: [60000, 80000]
MAX_STEPS: 90000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: coco_faster_rcnn
FRCNN:
ROI_XFORM_METHOD: RoIAlign
ROI_XFORM_RESOLUTION: 7
TRAIN:
WEIGHTS: '/model/R-101.Affine.pth'
DATASET: '/data/coco_2014_trainval35k'
USE_DIFF: False # Do not use crowd objects
IMS_PER_BATCH: 2
BATCH_SIZE: 512
SCALES: [800]
MAX_SIZE: 1333
TEST:
DATASET: '/data/coco_2014_minival'
JSON_FILE: '/data/instances_minival2014.json'
PROTOCOL: 'coco'
RPN_POST_NMS_TOP_N: 1000
SCALES: [800]
MAX_SIZE: 1333
NMS: 0.5
NUM_GPUS: 8
VIS: False
ENABLE_TENSOR_BOARD: False
MODEL:
TYPE: faster_rcnn
BACKBONE: resnet101.fpn
CLASSES: ['__background__',
'person', 'bicycle', 'car', 'motorcycle', 'airplane',
'bus', 'train', 'truck', 'boat', 'traffic light',
'fire hydrant', 'stop sign', 'parking meter', 'bench',
'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant',
'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife',
'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli',
'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop',
'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors',
'teddy bear', 'hair drier', 'toothbrush']
NUM_CLASSES: 81
SOLVER:
BASE_LR: 0.02
DECAY_STEPS: [120000, 160000]
MAX_STEPS: 180000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: coco_faster_rcnn
FRCNN:
ROI_XFORM_METHOD: RoIAlign
ROI_XFORM_RESOLUTION: 7
TRAIN:
WEIGHTS: '/model/R-101.Affine.pth'
DATABASE: '/data/coco_2014_trainval35k'
IMS_PER_BATCH: 2
USE_DIFF: False # Do not use crowd objects
BATCH_SIZE: 512
SCALES: [800]
MAX_SIZE: 1333
TEST:
DATABASE: '/data/coco_2014_minival'
JSON_FILE: '/data/instances_minival2014.json'
PROTOCOL: 'coco'
RPN_POST_NMS_TOP_N: 1000
SCALES: [800]
MAX_SIZE: 1333
NMS: 0.5
NUM_GPUS: 8
VIS: False
ENABLE_TENSOR_BOARD: False
MODEL:
TYPE: faster_rcnn
BACKBONE: resnet101.fpn
CLASSES: ['__background__',
'person', 'bicycle', 'car', 'motorcycle', 'airplane',
'bus', 'train', 'truck', 'boat', 'traffic light',
'fire hydrant', 'stop sign', 'parking meter', 'bench',
'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant',
'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife',
'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli',
'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop',
'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors',
'teddy bear', 'hair drier', 'toothbrush']
NUM_CLASSES: 81
SOLVER:
BASE_LR: 0.02
DECAY_STEPS: [120000, 160000]
MAX_STEPS: 180000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: coco_faster_rcnn
FRCNN:
ROI_XFORM_METHOD: RoIAlign
ROI_XFORM_RESOLUTION: 7
TRAIN:
WEIGHTS: '/model/R-101.Affine.pth'
DATASET: '/data/coco_2014_trainval35k'
USE_DIFF: False # Do not use crowd objects
IMS_PER_BATCH: 2
BATCH_SIZE: 512
SCALES: [800]
MAX_SIZE: 1333
TEST:
DATASET: '/data/coco_2014_minival'
JSON_FILE: '/data/instances_minival2014.json'
PROTOCOL: 'coco'
RPN_POST_NMS_TOP_N: 1000
SCALES: [800]
MAX_SIZE: 1333
NMS: 0.5
NUM_GPUS: 1
VIS: False
ENABLE_TENSOR_BOARD: False
MODEL:
TYPE: faster_rcnn
BACKBONE: resnet50.fpn
CLASSES: ['__background__',
'aeroplane', 'bicycle', 'bird', 'boat',
'bottle', 'bus', 'car', 'cat', 'chair',
'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'pottedplant',
'sheep', 'sofa', 'train', 'tvmonitor']
NUM_CLASSES: 21
SOLVER:
BASE_LR: 0.002
DECAY_STEPS: [100000, 140000]
MAX_STEPS: 140000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: voc_faster_rcnn
FRCNN:
ROI_XFORM_METHOD: RoIAlign
ROI_XFORM_RESOLUTION: 7
TRAIN:
WEIGHTS: '/model/R-50.Affine.pth'
DATABASE: '/data/voc_0712_trainval'
IMS_PER_BATCH: 2
BATCH_SIZE: 128
SCALES: [600]
MAX_SIZE: 1000
TEST:
DATABASE: '/data/voc_2007_test'
PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
RPN_POST_NMS_TOP_N: 1000
SCALES: [600]
MAX_SIZE: 1000
NUM_GPUS: 1
VIS: False
ENABLE_TENSOR_BOARD: False
MODEL:
TYPE: faster_rcnn
BACKBONE: resnet50.fpn
CLASSES: ['__background__',
'aeroplane', 'bicycle', 'bird', 'boat',
'bottle', 'bus', 'car', 'cat', 'chair',
'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'pottedplant',
'sheep', 'sofa', 'train', 'tvmonitor']
NUM_CLASSES: 21
SOLVER:
BASE_LR: 0.002
DECAY_STEPS: [100000, 140000]
MAX_STEPS: 140000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: voc_faster_rcnn
FRCNN:
ROI_XFORM_METHOD: RoIAlign
ROI_XFORM_RESOLUTION: 7
TRAIN:
WEIGHTS: '/model/R-50.Affine.pth'
DATASET: '/data/voc_0712_trainval'
IMS_PER_BATCH: 2
BATCH_SIZE: 128
SCALES: [600]
MAX_SIZE: 1000
TEST:
DATASET: '/data/voc_2007_test'
PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
RPN_POST_NMS_TOP_N: 1000
SCALES: [600]
MAX_SIZE: 1000
NMS: 0.45
\ No newline at end of file
NUM_GPUS: 1
VIS: False
ENABLE_TENSOR_BOARD: False
MODEL:
TYPE: faster_rcnn
BACKBONE: vgg16.c4
CLASSES: ['__background__',
'aeroplane', 'bicycle', 'bird', 'boat',
'bottle', 'bus', 'car', 'cat', 'chair',
'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'pottedplant',
'sheep', 'sofa', 'train', 'tvmonitor']
NUM_CLASSES: 21
SOLVER:
BASE_LR: 0.001
WEIGHT_DECAY: 0.0005
DECAY_STEPS: [100000, 140000]
MAX_STEPS: 140000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: voc_faster_rcnn
RPN:
STRIDES: [16]
SCALES: [8, 16, 32] # RField: [128, 256, 512]
ASPECT_RATIOS: [0.5, 1.0, 2.0]
FRCNN:
ROI_XFORM_METHOD: RoIPool
ROI_XFORM_RESOLUTION: 7
MLP_HEAD_DIM: 4096
TRAIN:
WEIGHTS: '/model/VGG16.RCNN.pth'
DATABASE: '/data/voc_0712_trainval'
RPN_MIN_SIZE: 16
IMS_PER_BATCH: 2
BATCH_SIZE: 128
SCALES: [600]
MAX_SIZE: 1000
TEST:
DATABASE: '/data/voc_2007_test'
PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
RPN_MIN_SIZE: 16
RPN_POST_NMS_TOP_N: 300
SCALES: [600]
MAX_SIZE: 1000
NUM_GPUS: 1
VIS: False
ENABLE_TENSOR_BOARD: False
MODEL:
TYPE: faster_rcnn
BACKBONE: vgg16.c4
CLASSES: ['__background__',
'aeroplane', 'bicycle', 'bird', 'boat',
'bottle', 'bus', 'car', 'cat', 'chair',
'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'pottedplant',
'sheep', 'sofa', 'train', 'tvmonitor']
NUM_CLASSES: 21
SOLVER:
BASE_LR: 0.001
WEIGHT_DECAY: 0.0005
DECAY_STEPS: [100000, 140000]
MAX_STEPS: 140000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: voc_faster_rcnn
RPN:
STRIDES: [16]
SCALES: [8, 16, 32] # RField: [128, 256, 512]
ASPECT_RATIOS: [0.5, 1.0, 2.0]
FRCNN:
ROI_XFORM_METHOD: RoIPool
ROI_XFORM_RESOLUTION: 7
MLP_HEAD_DIM: 4096
TRAIN:
WEIGHTS: '/model/VGG16.RCNN.pth'
DATASET: '/data/voc_0712_trainval'
RPN_MIN_SIZE: 16
IMS_PER_BATCH: 2
BATCH_SIZE: 128
SCALES: [600]
MAX_SIZE: 1000
TEST:
DATASET: '/data/voc_2007_test'
PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
RPN_MIN_SIZE: 16
RPN_POST_NMS_TOP_N: 300
SCALES: [600]
MAX_SIZE: 1000
NMS: 0.45
\ No newline at end of file
NUM_GPUS: 4
VIS: False
ENABLE_TENSOR_BOARD: False
MODEL:
TYPE: retinanet
BACKBONE: resnet50.fpn
CLASSES: ['__background__',
'person', 'bicycle', 'car', 'motorcycle', 'airplane',
'bus', 'train', 'truck', 'boat', 'traffic light',
'fire hydrant', 'stop sign', 'parking meter', 'bench',
'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant',
'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife',
'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli',
'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop',
'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors',
'teddy bear', 'hair drier', 'toothbrush']
NUM_CLASSES: 81
SOLVER:
BASE_LR: 0.02
DECAY_STEPS: [30000, 40000]
MAX_STEPS: 45000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: coco_retinanet_400
FPN:
RPN_MIN_LEVEL: 3
RPN_MAX_LEVEL: 7
TRAIN:
WEIGHTS: '/model/R-50.Affine.pth'
DATABASE: '/data/coco_2014_trainval35k'
IMS_PER_BATCH: 8
SCALES: [400]
MAX_SIZE: 666
TEST:
DATABASE: '/data/coco_2014_minival'
JSON_FILE: '/data/instances_minival2014.json'
PROTOCOL: 'coco'
IMS_PER_BATCH: 1
SCALES: [400]
MAX_SIZE: 666
NUM_GPUS: 4
VIS: False
ENABLE_TENSOR_BOARD: False
MODEL:
TYPE: retinanet
BACKBONE: resnet50.fpn
CLASSES: ['__background__',
'person', 'bicycle', 'car', 'motorcycle', 'airplane',
'bus', 'train', 'truck', 'boat', 'traffic light',
'fire hydrant', 'stop sign', 'parking meter', 'bench',
'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant',
'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife',
'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli',
'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop',
'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors',
'teddy bear', 'hair drier', 'toothbrush']
NUM_CLASSES: 81
SOLVER:
BASE_LR: 0.01
DECAY_STEPS: [60000, 80000]
MAX_STEPS: 90000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: coco_retinanet_400
FPN:
RPN_MIN_LEVEL: 3
RPN_MAX_LEVEL: 7
TRAIN:
WEIGHTS: '/model/R-50.Affine.pth'
DATASET: '/data/coco_2014_trainval35k'
USE_DIFF: False # Do not use crowd objects
USE_COLOR_JITTER: True
IMS_PER_BATCH: 8
SCALES: [400]
MAX_SIZE: 666
RANDOM_SCALES: [0.75, 1.0]
TEST:
DATASET: '/data/coco_2014_minival'
JSON_FILE: '/data/instances_minival2014.json'
PROTOCOL: 'coco'
IMS_PER_BATCH: 1
SCALES: [400]
MAX_SIZE: 666
NMS: 0.5
\ No newline at end of file
NUM_GPUS: 4
VIS: False
ENABLE_TENSOR_BOARD: False
MODEL:
TYPE: retinanet
BACKBONE: resnet50.fpn
CLASSES: ['__background__',
'person', 'bicycle', 'car', 'motorcycle', 'airplane',
'bus', 'train', 'truck', 'boat', 'traffic light',
'fire hydrant', 'stop sign', 'parking meter', 'bench',
'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant',
'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife',
'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli',
'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop',
'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors',
'teddy bear', 'hair drier', 'toothbrush']
NUM_CLASSES: 81
SOLVER:
BASE_LR: 0.02
WARM_UP_STEPS: 2000 # default: 500
DECAY_STEPS: [120000, 160000]
MAX_STEPS: 180000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: coco_retinanet_400
FPN:
RPN_MIN_LEVEL: 3
RPN_MAX_LEVEL: 7
DROPBLOCK:
DROP_ON: True
DECREMENT: 0.000005 # * 20000 = 0.1
TRAIN:
WEIGHTS: '/model/R-50.Affine.pth'
DATABASE: '/data/coco_2014_trainval35k'
IMS_PER_BATCH: 8
SCALES: [400]
MAX_SIZE: 666
USE_SCALE_JITTER: True
USE_COLOR_JITTER: True
SCALE_JITTER_RANGE: [0.75, 1.33]
TEST:
DATABASE: '/data/coco_2014_minival'
JSON_FILE: '/data/instances_minival2014.json'
PROTOCOL: 'coco'
IMS_PER_BATCH: 1
SCALES: [400]
MAX_SIZE: 666
NMS: 0.5
\ No newline at end of file
NUM_GPUS: 1
VIS: False
VIS_ON_FILE: False
MODEL:
TYPE: retinanet
BACKBONE: resnet18.fpn
CLASSES: ['__background__',
'aeroplane', 'bicycle', 'bird', 'boat',
'bottle', 'bus', 'car', 'cat', 'chair',
'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'pottedplant',
'sheep', 'sofa', 'train', 'tvmonitor']
NUM_CLASSES: 21
SOLVER:
BASE_LR: 0.01
DECAY_STEPS: [40000, 50000, 60000]
WARM_UP_STEPS: 2000
MAX_STEPS: 60000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: voc_retinanet_300
FPN:
RPN_MIN_LEVEL: 3
RPN_MAX_LEVEL: 7
TRAIN:
WEIGHTS: '/model/R-18.Affine.pth'
DATABASE: '/data/voc_0712_trainval'
IMS_PER_BATCH: 32
SCALES: [300]
MAX_SIZE: 500
SCALE_JITTER_RANGE: [0.5, 2.0]
USE_SCALE_JITTER: True
USE_COLOR_JITTER: True
TEST:
DATABASE: '/data/voc_2007_test'
PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
IMS_PER_BATCH: 1
SCALES: [300]
MAX_SIZE: 500
NMS: 0.45
\ No newline at end of file
NUM_GPUS: 1
VIS: False
VIS_ON_FILE: False
MODEL:
TYPE: retinanet
BACKBONE: airnet.fpn
CLASSES: ['__background__',
'aeroplane', 'bicycle', 'bird', 'boat',
'bottle', 'bus', 'car', 'cat', 'chair',
'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'pottedplant',
'sheep', 'sofa', 'train', 'tvmonitor']
NUM_CLASSES: 21
SOLVER:
BASE_LR: 0.02
DECAY_STEPS: [40000, 50000, 60000]
MAX_STEPS: 60000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: voc_retinanet_300
FPN:
RPN_MIN_LEVEL: 3
RPN_MAX_LEVEL: 7
TRAIN:
WEIGHTS: '/model/AirNet.Affine.pth'
DATABASE: '/data/voc_0712_trainval'
IMS_PER_BATCH: 32
SCALES: [300]
MAX_SIZE: 500
SCALE_JITTER_RANGE: [0.5, 2.0]
USE_SCALE_JITTER: True
USE_COLOR_JITTER: True
TEST:
DATABASE: '/data/voc_2007_test'
PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
IMS_PER_BATCH: 1
SCALES: [300]
MAX_SIZE: 500
NUM_GPUS: 1
VIS: False
VIS_ON_FILE: False
MODEL:
TYPE: retinanet
BACKBONE: airnet.fpn
CLASSES: ['__background__',
'aeroplane', 'bicycle', 'bird', 'boat',
'bottle', 'bus', 'car', 'cat', 'chair',
'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'pottedplant',
'sheep', 'sofa', 'train', 'tvmonitor']
NUM_CLASSES: 21
SOLVER:
BASE_LR: 0.01
DECAY_STEPS: [40000, 50000, 60000]
MAX_STEPS: 60000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: voc_retinanet_320
FPN:
RPN_MIN_LEVEL: 3
RPN_MAX_LEVEL: 7
TRAIN:
WEIGHTS: '/model/AirNet.Affine.pth'
DATASET: '/data/voc_0712_trainval'
USE_COLOR_JITTER: True
IMS_PER_BATCH: 32
SCALES: [320]
RANDOM_SCALES: [0.5, 1.0]
TEST:
DATASET: '/data/voc_2007_test'
PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
IMS_PER_BATCH: 1
SCALES: [320]
NMS: 0.45
\ No newline at end of file
NUM_GPUS: 1
VIS: False
VIS_ON_FILE: False
MODEL:
TYPE: retinanet
BACKBONE: resnet34.fpn
CLASSES: ['__background__',
'aeroplane', 'bicycle', 'bird', 'boat',
'bottle', 'bus', 'car', 'cat', 'chair',
'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'pottedplant',
'sheep', 'sofa', 'train', 'tvmonitor']
NUM_CLASSES: 21
SOLVER:
BASE_LR: 0.01
DECAY_STEPS: [40000, 50000, 60000]
WARM_UP_STEPS: 2000
MAX_STEPS: 60000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: voc_retinanet_300
FPN:
RPN_MIN_LEVEL: 3
RPN_MAX_LEVEL: 7
TRAIN:
WEIGHTS: '/model/R-34.Affine.pth'
DATABASE: '/data/voc_0712_trainval'
IMS_PER_BATCH: 32
SCALES: [300]
MAX_SIZE: 500
SCALE_JITTER_RANGE: [0.5, 2.0]
USE_SCALE_JITTER: True
USE_COLOR_JITTER: True
TEST:
DATABASE: '/data/voc_2007_test'
PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
IMS_PER_BATCH: 1
SCALES: [300]
MAX_SIZE: 500
NUM_GPUS: 1
VIS: False
VIS_ON_FILE: False
MODEL:
TYPE: retinanet
BACKBONE: resnet34.fpn
CLASSES: ['__background__',
'aeroplane', 'bicycle', 'bird', 'boat',
'bottle', 'bus', 'car', 'cat', 'chair',
'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'pottedplant',
'sheep', 'sofa', 'train', 'tvmonitor']
NUM_CLASSES: 21
SOLVER:
BASE_LR: 0.01
DECAY_STEPS: [40000, 50000, 60000]
WARM_UP_STEPS: 2000
MAX_STEPS: 60000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: voc_retinanet_320
FPN:
RPN_MIN_LEVEL: 3
RPN_MAX_LEVEL: 7
TRAIN:
WEIGHTS: '/model/R-50.Affine.pth'
DATASET: '/data/voc_0712_trainval'
USE_COLOR_JITTER: True
IMS_PER_BATCH: 32
SCALES: [320]
RANDOM_SCALES: [0.5, 2.0]
TEST:
DATASET: '/data/voc_2007_test'
PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
IMS_PER_BATCH: 1
SCALES: [320]
NMS: 0.45
\ No newline at end of file
NUM_GPUS: 1
VIS: False
ENABLE_TENSOR_BOARD: False
MODEL:
TYPE: ssd
BACKBONE: airnet5b.mbox
CLASSES: ['__background__',
'aeroplane', 'bicycle', 'bird', 'boat',
'bottle', 'bus', 'car', 'cat', 'chair',
'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'pottedplant',
'sheep', 'sofa', 'train', 'tvmonitor']
NUM_CLASSES: 21
SOLVER:
BASE_LR: 0.001
DECAY_STEPS: [80000, 100000, 120000]
MAX_STEPS: 120000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: voc_ssd_300
SSD:
RESIZE:
HEIGHT: 300
WIDTH: 300
MULTIBOX:
MIN_SIZES: [30, 90, 150]
MAX_SIZES: [90, 150, 210]
STRIDES: [8, 16, 32]
ASPECT_RATIOS: [[1, 2, 0.5], [1, 2, 0.5], [1, 2, 0.5]]
TRAIN:
WEIGHTS: '/model/AirNet.Affine.pth'
DATABASE: '/data/voc_0712_trainval'
IMS_PER_BATCH: 32
TEST:
DATABASE: '/data/voc_2007_test'
PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
IMS_PER_BATCH: 8
NMS_TOP_K: 400
NMS: 0.45
SCORE_THRESH: 0.01
NUM_GPUS: 1
VIS: False
ENABLE_TENSOR_BOARD: False
MODEL:
TYPE: ssd
BACKBONE: airnet5b.mbox
CLASSES: ['__background__',
'aeroplane', 'bicycle', 'bird', 'boat',
'bottle', 'bus', 'car', 'cat', 'chair',
'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'pottedplant',
'sheep', 'sofa', 'train', 'tvmonitor']
NUM_CLASSES: 21
SOLVER:
BASE_LR: 0.001
DECAY_STEPS: [80000, 100000, 120000]
MAX_STEPS: 120000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: voc_ssd_320
SSD:
NUM_CONVS: 2
MULTIBOX:
STRIDES: [8, 16, 32]
MIN_SIZES: [30, 90, 150]
MAX_SIZES: [90, 150, 210]
ASPECT_RATIOS: [[1, 2, 0.5], [1, 2, 0.5], [1, 2, 0.5]]
TRAIN:
WEIGHTS: '/model/AirNet.Affine.pth'
DATASET: '/data/voc_0712_trainval'
SCALES: [320]
RANDOM_SCALES: [0.25, 1.00]
IMS_PER_BATCH: 32
TEST:
DATASET: '/data/voc_2007_test'
PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
IMS_PER_BATCH: 8
SCALES: [320]
NMS_TOP_K: 400
NMS: 0.45
SCORE_THRESH: 0.01
DETECTIONS_PER_IM: 200
\ No newline at end of file
NUM_GPUS: 1
VIS: False
ENABLE_TENSOR_BOARD: False
MODEL:
TYPE: ssd
BACKBONE: vgg16_reduced_300.mbox
FREEZE_AT: 0
CLASSES: ['__background__',
'aeroplane', 'bicycle', 'bird', 'boat',
'bottle', 'bus', 'car', 'cat', 'chair',
'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'pottedplant',
'sheep', 'sofa', 'train', 'tvmonitor']
NUM_CLASSES: 21
SOLVER:
BASE_LR: 0.001
WARM_UP_FACTOR: 0.
WEIGHT_DECAY: 0.0005
DECAY_STEPS: [80000, 100000, 120000]
MAX_STEPS: 120000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: voc_ssd_300
SSD:
RESIZE:
HEIGHT: 300
WIDTH: 300
MULTIBOX:
STRIDES: [8, 16, 32, 64, 100, 300]
MIN_SIZES: [30, 60, 110, 162, 213, 264]
MAX_SIZES: [60, 110, 162, 213, 264, 315]
ASPECT_RATIOS: [[1, 2, 0.5], [1, 2, 0.5, 3, 0.33], [1, 2, 0.5, 3, 0.33],
[1, 2, 0.5, 3, 0.33], [1, 2, 0.5], [1, 2, 0.5]]
TRAIN:
WEIGHTS: '/model/VGG16.SSD.pth'
DATABASE: '/data/voc_0712_trainval'
IMS_PER_BATCH: 32
TEST:
DATABASE: '/data/voc_2007_test'
PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
IMS_PER_BATCH: 8
NMS_TOP_K: 400
NMS: 0.45
SCORE_THRESH: 0.01
DETECTIONS_PER_IM: 200
NUM_GPUS: 1
VIS: False
ENABLE_TENSOR_BOARD: False
MODEL:
TYPE: ssd
BACKBONE: vgg16_reduced_300.mbox
FREEZE_AT: 0
CLASSES: ['__background__',
'aeroplane', 'bicycle', 'bird', 'boat',
'bottle', 'bus', 'car', 'cat', 'chair',
'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'pottedplant',
'sheep', 'sofa', 'train', 'tvmonitor']
NUM_CLASSES: 21
SOLVER:
BASE_LR: 0.001
WEIGHT_DECAY: 0.0005
DECAY_STEPS: [80000, 100000, 120000]
MAX_STEPS: 120000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: voc_ssd_300
SSD:
MULTIBOX:
STRIDES: [8, 16, 32, 64, 100, 300]
MIN_SIZES: [30, 60, 110, 162, 213, 264]
MAX_SIZES: [60, 110, 162, 213, 264, 315]
ASPECT_RATIOS: [
[1, 2, 0.5],
[1, 2, 0.5, 3, 0.33],
[1, 2, 0.5, 3, 0.33],
[1, 2, 0.5, 3, 0.33],
[1, 2, 0.5],
[1, 2, 0.5]
]
TRAIN:
WEIGHTS: '/model/VGG16.SSD.pth'
DATASET: '/data/voc_0712_trainval'
IMS_PER_BATCH: 32
SCALES: [300]
RANDOM_SCALES: [0.25, 1.00]
TEST:
DATASET: '/data/voc_2007_test'
PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
IMS_PER_BATCH: 8
SCALES: [300]
NMS_TOP_K: 400
NMS: 0.45
SCORE_THRESH: 0.01
DETECTIONS_PER_IM: 200
@@ -22,23 +22,29 @@ SOLVER:
SNAPSHOT_PREFIX: voc_ssd_320
SSD:
NUM_CONVS: 2
- RESIZE:
- HEIGHT: 320
- WIDTH: 320
MULTIBOX:
STRIDES: [8, 16, 32, 64, 100, 300]
MIN_SIZES: [30, 60, 110, 162, 213, 264]
MAX_SIZES: [60, 110, 162, 213, 264, 315]
- ASPECT_RATIOS: [[1, 2, 0.5], [1, 2, 0.5, 3, 0.33], [1, 2, 0.5, 3, 0.33],
- [1, 2, 0.5, 3, 0.33], [1, 2, 0.5], [1, 2, 0.5]]
+ ASPECT_RATIOS: [
+ [1, 2, 0.5],
+ [1, 2, 0.5, 3, 0.33],
+ [1, 2, 0.5, 3, 0.33],
+ [1, 2, 0.5, 3, 0.33],
+ [1, 2, 0.5],
+ [1, 2, 0.5]
+ ]
TRAIN:
WEIGHTS: '/model/R-50.Affine.pth'
- DATABASE: '/data/voc_0712_trainval'
+ DATASET: '/data/voc_0712_trainval'
+ SCALES: [320]
+ RANDOM_SCALES: [0.25, 1.00]
IMS_PER_BATCH: 32
TEST:
- DATABASE: '/data/voc_2007_test'
+ DATASET: '/data/voc_2007_test'
PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
IMS_PER_BATCH: 8
+ SCALES: [320]
NMS_TOP_K: 400
NMS: 0.45
SCORE_THRESH: 0.01
......
#include <dragon/core/workspace.h>
#include <dragon/utils/math_utils.h>
#include "../utils/detection_utils.h"
#include "nms_op.h"
namespace dragon {
template <class Context> template <typename T>
void NonMaxSuppressionOp<Context>::DoRunWithType() {
int num_selected;
utils::detection::ApplyNMS(
Output(0)->count(),
Output(0)->count(),
iou_threshold_,
Input(0).template mutable_data<T, Context>(),
Output(0)->template mutable_data<int64_t, CPUContext>(),
num_selected, ctx()
);
Output(0)->Reshape({ num_selected });
}
template <class Context>
void NonMaxSuppressionOp<Context>::RunOnDevice() {
CHECK(Input(0).ndim() == 2 && Input(0).dim(1) == 5)
<< "\nThe dimensions of boxes should be (num_boxes, 5).";
Output(0)->Reshape({ Input(0).dim(0) });
DispatchHelper<TensorTypes<float>>::Call(this, Input(0));
}
DEPLOY_CPU(NonMaxSuppression);
#ifdef USE_CUDA
DEPLOY_CUDA(NonMaxSuppression);
#endif
OPERATOR_SCHEMA(NonMaxSuppression).NumInputs(1).NumOutputs(1);
NO_GRADIENT(NonMaxSuppression);
} // namespace dragon
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef SEETADET_CXX_OPERATORS_NMS_OP_H_
#define SEETADET_CXX_OPERATORS_NMS_OP_H_
#include "dragon/core/operator.h"
namespace dragon {
template <class Context>
class NonMaxSuppressionOp final : public Operator<Context> {
public:
NonMaxSuppressionOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
iou_threshold_(OpArg<float>("iou_threshold", 0.5f)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T>
void DoRunWithType();
protected:
float iou_threshold_;
};
} // namespace dragon
#endif // SEETADET_CXX_OPERATORS_NMS_OP_H_
#include <dragon/core/workspace.h>
#include <dragon/utils/math_utils.h>
#include "../utils/detection_utils.h"
#include "retinanet_decoder_op.h"
namespace dragon {
template <class Context> template <typename T>
void RetinaNetDecoderOp<Context>::DoRunWithType() {
using BT = float; // DType of BBox
using BC = CPUContext; // Context of BBox
int feat_h, feat_w;
int C = Input(-3).dim(2), A, K;
int total_proposals = 0;
int num_candidates, num_boxes, num_proposals;
auto* batch_scores = Input(-3).template data<T, BC>();
auto* batch_deltas = Input(-2).template data<T, BC>();
auto* im_info = Input(-1).template data<BT, BC>();
auto* y = Output(0)->template mutable_data<BT, BC>();
for (int n = 0; n < num_images_; ++n) {
BT im_h = im_info[0];
BT im_w = im_info[1];
BT im_scale_h = im_info[2];
BT im_scale_w = im_info[2];
if (Input(-1).dim(1) == 4) im_scale_w = im_info[3];
auto* scores = batch_scores + n * Input(-3).stride(0);
auto* deltas = batch_deltas + n * Input(-2).stride(0);
CHECK_EQ(strides_.size(), InputSize() - 3)
<< "\nGiven " << strides_.size() << " strides "
<< "and " << InputSize() - 3 << " features";
// Select the top-k candidates as proposals
num_boxes = Input(-3).dim(1);
num_candidates = Input(-3).count(1);
roi_indices_.resize(num_candidates);
num_candidates = 0;
for (int i = 0; i < roi_indices_.size(); ++i)
if (scores[i] > score_thr_)
roi_indices_[num_candidates++] = i;
scores_.resize(num_candidates);
for (int i = 0; i < num_candidates; ++i)
scores_[i] = scores[roi_indices_[i]];
num_proposals = std::min(
num_candidates,
(int)pre_nms_topn_
);
utils::math::ArgPartition(
num_candidates,
num_proposals,
true,
scores_.data(),
indices_
);
for (int i = 0; i < num_proposals; ++i)
indices_[i] = roi_indices_[indices_[i]];
// Decode the candidates
int base_offset = 0;
for (int i = 0; i < strides_.size(); i++) {
feat_h = Input(i).dim(2);
feat_w = Input(i).dim(3);
K = feat_h * feat_w;
A = int(ratios_.size() * scales_.size());
anchors_.resize((size_t)(A * 4));
utils::detection::GenerateAnchors(
strides_[i],
(int)ratios_.size(),
(int)scales_.size(),
ratios_.data(),
scales_.data(),
anchors_.data()
);
utils::detection::GenerateGridAnchors(
num_proposals, C, A,
feat_h, feat_w,
strides_[i],
base_offset,
anchors_.data(),
indices_.data(),
y
);
base_offset += (A * K);
}
utils::detection::GenerateMCProposals(
num_proposals,
num_boxes, C,
n,
im_h,
im_w,
im_scale_h,
im_scale_w,
scores,
deltas,
indices_.data(),
y
);
total_proposals += num_proposals;
y += (num_proposals * 7);
im_info += Input(-1).dim(1);
}
Output(0)->Reshape({ total_proposals, 7 });
}
template <class Context>
void RetinaNetDecoderOp<Context>::RunOnDevice() {
num_images_ = Input(0).dim(0);
CHECK_EQ(Input(-1).dim(0), num_images_)
<< "\nExcepted " << num_images_
<< " groups info, got "
<< Input(-1).dim(0) << ".";
Output(0)->Reshape({ num_images_ * pre_nms_topn_, 7 });
DispatchHelper<TensorTypes<float>>::Call(this, Input(-3));
}
DEPLOY_CPU(RetinaNetDecoder);
#ifdef USE_CUDA
DEPLOY_CUDA(RetinaNetDecoder);
#endif
OPERATOR_SCHEMA(RetinaNetDecoder)
.NumInputs(3, INT_MAX)
.NumOutputs(1, INT_MAX);
} // namespace dragon
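The decoder above calls `utils::detection::GenerateAnchors(stride, num_ratios, num_scales, ...)` once per FPN level. A NumPy sketch of the classic Faster R-CNN anchor recipe it plausibly follows (the exact ordering and rounding inside the C++ helper are assumptions):
```python
import numpy as np

def generate_anchors(stride, ratios, scales):
    anchors = []
    for scale in scales:
        area = (stride * scale) ** 2        # anchor area for this scale
        for ratio in ratios:
            w = np.sqrt(area / ratio)       # w * h = area, h / w = ratio
            h = w * ratio
            c = 0.5 * (stride - 1)          # center of the first grid cell
            anchors.append([c - 0.5 * (w - 1), c - 0.5 * (h - 1),
                            c + 0.5 * (w - 1), c + 0.5 * (h - 1)])
    return np.array(anchors, dtype=np.float32)

# Stride 16 with scales [8, 16, 32] gives the classic 128/256/512 anchors,
# matching the "RField: [128, 256, 512]" comment in the VOC RPN config.
print(generate_anchors(16, ratios=[0.5, 1.0, 2.0], scales=[8, 16, 32]))
```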
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef SEETADET_CXX_OPERATORS_RETINANET_DECODER_OP_H_
#define SEETADET_CXX_OPERATORS_RETINANET_DECODER_OP_H_
#include "dragon/core/operator.h"
namespace dragon {
template <class Context>
class RetinaNetDecoderOp final : public Operator<Context> {
public:
RetinaNetDecoderOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
strides_(OpArgs<int64_t>("strides")),
ratios_(OpArgs<float>("ratios")),
scales_(OpArgs<float>("scales")),
pre_nms_topn_(OpArg<int64_t>("pre_nms_top_n", 6000)),
score_thr_(OpArg<float>("score_thresh", 0.05f)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T>
void DoRunWithType();
protected:
float score_thr_;
vec64_t strides_, indices_, roi_indices_;
vector<float> ratios_, scales_, scores_, anchors_;
int64_t num_images_, pre_nms_topn_;
};
} // namespace dragon
#endif // SEETADET_CXX_OPERATORS_RETINANET_DECODER_OP_H_
#include <dragon/core/workspace.h>
#include <dragon/utils/math_utils.h>
#include "../utils/detection_utils.h"
#include "rpn_decoder_op.h"
namespace dragon {
template <class Context> template <typename T>
void RPNDecoderOp<Context>::DoRunWithType() {
using BT = float; // DType of BBox
using BC = CPUContext; // Context of BBox
int feat_h, feat_w, K, A;
int total_rois = 0, num_rois;
int num_candidates, num_proposals;
auto* batch_scores = Input(-3).template data<T, BC>();
auto* batch_deltas = Input(-2).template data<T, BC>();
auto* im_info = Input(-1).template data<BT, BC>();
auto* y = Output(0)->template mutable_data<BT, BC>();
for (int n = 0; n < num_images_; ++n) {
const BT im_h = im_info[0];
const BT im_w = im_info[1];
const BT scale = im_info[2];
const BT min_box_h = min_size_ * scale;
const BT min_box_w = min_size_ * scale;
auto* scores = batch_scores + n * Input(-3).stride(0);
auto* deltas = batch_deltas + n * Input(-2).stride(0);
if (strides_.size() == 1) {
// Case 1: single stride
feat_h = Input(0).dim(2);
feat_w = Input(0).dim(3);
K = feat_h * feat_w;
A = int(ratios_.size() * scales_.size());
// Select the Top-K candidates as proposals
num_candidates = A * K;
num_proposals = std::min(
num_candidates,
(int)pre_nms_topn_
);
utils::math::ArgPartition(
num_candidates,
num_proposals,
true, scores, indices_
);
// Decode the candidates
anchors_.resize((size_t)(A * 4));
proposals_.Reshape({ num_proposals, 5 });
utils::detection::GenerateAnchors(
strides_[0],
(int)ratios_.size(),
(int)scales_.size(),
ratios_.data(),
scales_.data(),
anchors_.data()
);
utils::detection::GenerateGridAnchors(
num_proposals, A,
feat_h, feat_w,
strides_[0],
0,
anchors_.data(),
indices_.data(),
proposals_.template mutable_data<BT, BC>()
);
utils::detection::GenerateSSProposals(
K, num_proposals,
im_h, im_w,
min_box_h, min_box_w,
scores,
deltas,
indices_.data(),
proposals_.template mutable_data<BT, BC>()
);
// Sort, NMS and Retrieve
utils::detection::SortProposals(
0,
num_proposals - 1,
num_proposals,
proposals_.template mutable_data<BT, BC>()
);
utils::detection::ApplyNMS(
num_proposals,
post_nms_topn_,
nms_thr_,
proposals_.template mutable_data<BT, Context>(),
roi_indices_.data(),
num_rois, ctx()
);
utils::detection::RetrieveRoIs(
num_rois,
n,
proposals_.template data<BT, BC>(),
roi_indices_.data(),
y
);
} else if (strides_.size() > 1) {
// Case 2: multiple strides
CHECK_EQ(strides_.size(), InputSize() - 3)
<< "\nGiven " << strides_.size() << " strides "
<< "and " << InputSize() - 3 << " feature inputs";
CHECK_EQ(strides_.size(), scales_.size())
<< "\nGiven " << strides_.size() << " strides "
<< "and " << scales_.size() << " scales";
// Select the top-k candidates as proposals
num_candidates = Input(-3).dim(1);
num_proposals = std::min(
num_candidates,
(int)pre_nms_topn_
);
utils::math::ArgPartition(
num_candidates,
num_proposals,
true, scores, indices_
);
// Decode the candidates
int base_offset = 0;
proposals_.Reshape({ num_proposals, 5 });
auto* proposals = proposals_
.template mutable_data<BT, BC>();
for (int i = 0; i < strides_.size(); i++) {
feat_h = Input(i).dim(2);
feat_w = Input(i).dim(3);
K = feat_h * feat_w;
A = (int)ratios_.size();
anchors_.resize((size_t)(A * 4));
utils::detection::GenerateAnchors(
strides_[i],
(int)ratios_.size(),
1,
ratios_.data(),
scales_.data(),
anchors_.data()
);
utils::detection::GenerateGridAnchors(
num_proposals, A,
feat_h, feat_w,
strides_[i],
base_offset,
anchors_.data(),
indices_.data(),
proposals
);
base_offset += (A * K);
}
utils::detection::GenerateMSProposals(
num_candidates,
num_proposals,
im_h, im_w,
min_box_h, min_box_w,
scores,
deltas,
&indices_[0],
proposals
);
// Sort, NMS and Retrieve
utils::detection::SortProposals(
0,
num_proposals - 1,
num_proposals,
proposals
);
utils::detection::ApplyNMS(
num_proposals,
post_nms_topn_,
nms_thr_,
proposals_.template mutable_data<BT, Context>(),
roi_indices_.data(),
num_rois, ctx()
);
utils::detection::RetrieveRoIs(
num_rois,
n,
proposals,
roi_indices_.data(),
y
);
} else {
LOG(FATAL) << "Excepted at least one stride for proposals.";
}
total_rois += num_rois;
y += (num_rois * 5);
im_info += Input(-1).dim(1);
}
Output(0)->Reshape({ total_rois, 5 });
// Distribute rois into K bins
if (OutputSize() > 1) {
CHECK_EQ(max_level_ - min_level_ + 1, OutputSize())
<< "\nExcepted " << OutputSize() << " outputs for levels "
"between [" << min_level_ << ", " << max_level_ << "].";
vector<BT*> ys(OutputSize());
vector<vec64_t> bins(OutputSize());
Tensor RoIs; RoIs.ReshapeLike(*Output(0));
auto* rois = RoIs.template mutable_data<BT, BC>();
ctx()->template Copy<BT, BC, BC>(
Output(0)->count(),
rois, Output(0)->template data<BT, BC>()
);
utils::detection::CollectRoIs(
total_rois,
min_level_,
max_level_,
canonical_level_,
canonical_scale_,
rois, bins
);
for (int i = 0; i < OutputSize(); i++) {
Output(i)->Reshape({ std::max((int)bins[i].size(), 1), 5 });
ys[i] = Output(i)->template mutable_data<BT, BC>();
}
utils::detection::DistributeRoIs(bins, rois, ys);
}
}
template <class Context>
void RPNDecoderOp<Context>::RunOnDevice() {
num_images_ = Input(0).dim(0);
CHECK_EQ(Input(-1).dim(0), num_images_)
<< "\nExcepted " << num_images_
<< " groups info, got "
<< Input(-1).dim(0) << ".";
roi_indices_.resize(post_nms_topn_);
Output(0)->Reshape({ num_images_ * post_nms_topn_, 5 });
DispatchHelper<TensorTypes<float>>::Call(this, Input(-3));
}
DEPLOY_CPU(RPNDecoder);
#ifdef USE_CUDA
DEPLOY_CUDA(RPNDecoder);
#endif
OPERATOR_SCHEMA(RPNDecoder)
.NumInputs(3, INT_MAX)
.NumOutputs(1, INT_MAX);
} // namespace dragon
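For reference, the RoI-to-level routing performed by `CollectRoIs` above follows the FPN heuristic. Below is a minimal NumPy sketch of that assignment, assuming the standard formula from the FPN paper, `k = floor(k0 + log2(sqrt(area) / s0))`; the actual C++ kernel may differ in clamping or epsilon details:

```python
import numpy as np

def assign_fpn_levels(boxes, min_level=2, max_level=5,
                      canonical_level=4, canonical_scale=224):
    """Map (x1, y1, x2, y2) boxes to pyramid levels (a sketch)."""
    ws = boxes[:, 2] - boxes[:, 0] + 1
    hs = boxes[:, 3] - boxes[:, 1] + 1
    # k = floor(k0 + log2(sqrt(area) / s0)), then clamp to [k_min, k_max]
    levels = np.floor(canonical_level +
                      np.log2(np.sqrt(ws * hs) / canonical_scale + 1e-8))
    return np.clip(levels, min_level, max_level).astype('int64')
```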
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef SEETADET_CXX_OPERATORS_RPN_DECODER_OP_H_
#define SEETADET_CXX_OPERATORS_RPN_DECODER_OP_H_
#include "dragon/core/operator.h"
namespace dragon {
template <class Context>
class RPNDecoderOp final : public Operator<Context> {
public:
RPNDecoderOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
strides_(OpArgs<int64_t>("strides")),
ratios_(OpArgs<float>("ratios")),
scales_(OpArgs<float>("scales")),
pre_nms_topn_(OpArg<int64_t>("pre_nms_top_n", 6000)),
post_nms_topn_(OpArg<int64_t>("post_nms_top_n", 300)),
nms_thr_(OpArg<float>("nms_thresh", 0.7f)),
min_size_(OpArg<int64_t>("min_size", 16)),
min_level_(OpArg<int64_t>("min_level", 2)),
max_level_(OpArg<int64_t>("max_level", 5)),
canonical_level_(OpArg<int64_t>("canonical_level", 4)),
canonical_scale_(OpArg<int64_t>("canonical_scale", 224)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T>
void DoRunWithType();
protected:
float nms_thr_;
vec64_t strides_, indices_, roi_indices_;
vector<float> ratios_, scales_, scores_, anchors_;
int64_t min_size_, pre_nms_topn_, post_nms_topn_;
int64_t num_images_, min_level_, max_level_;
int64_t canonical_level_, canonical_scale_;
Tensor proposals_;
};
} // namespace dragon
#endif // SEETADET_CXX_OPERATORS_RPN_DECODER_OP_H_
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Build cxx sources."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from distutils.core import setup
from dragon.tools import cpp_extension
if cpp_extension.CUDA_HOME is not None and \
cpp_extension._cuda.is_available():
Extension = cpp_extension.CUDAExtension
else:
Extension = cpp_extension.CppExtension
ext_modules = [
Extension(
name='install.lib.modules._C',
sources=[
'utils/detection_utils.cc',
'utils/detection_utils.cu',
'operators/nms_op.cc',
'operators/retinanet_decoder_op.cc',
'operators/rpn_decoder_op.cc',
],
),
]
setup(
name='SeetaDet',
ext_modules=ext_modules,
cmdclass={'build_ext': cpp_extension.BuildExtension}
)
#include <dragon/core/context.h>
#include "detection_utils.h"
namespace dragon {
namespace utils {
namespace detection {
template <typename T>
T IoU(const T A[], const T B[]) {
if (A[0] > B[2] || A[1] > B[3] ||
A[2] < B[0] || A[3] < B[1]) return 0;
const T x1 = std::max(A[0], B[0]);
const T y1 = std::max(A[1], B[1]);
const T x2 = std::min(A[2], B[2]);
const T y2 = std::min(A[3], B[3]);
const T width = std::max((T)0, x2 - x1 + 1);
const T height = std::max((T)0, y2 - y1 + 1);
const T area = width * height;
const T A_area = (A[2] - A[0] + 1) * (A[3] - A[1] + 1);
const T B_area = (B[2] - B[0] + 1) * (B[3] - B[1] + 1);
return area / (A_area + B_area - area);
}
template <> void ApplyNMS<float, CPUContext>(
const int num_boxes,
const int max_keeps,
const float thresh,
const float* boxes,
int64_t* keep_indices,
int& num_keep,
CPUContext* ctx) {
int count = 0;
std::vector<char> is_dead(num_boxes);
for (int i = 0; i < num_boxes; ++i) is_dead[i] = 0;
for (int i = 0; i < num_boxes; ++i) {
if (is_dead[i]) continue;
keep_indices[count++] = i;
if (count == max_keeps) break;
for (int j = i + 1; j < num_boxes; ++j)
if (!is_dead[j] && IoU(&boxes[i * 5],
&boxes[j * 5]) > thresh)
is_dead[j] = 1;
}
num_keep = count;
}
} // namespace detection
} // namespace utils
} // namespace dragon
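The CPU specialization above is the classic greedy suppression over score-sorted boxes. A minimal Python sketch of the same loop, assuming `(x1, y1, x2, y2, score)` rows already sorted by descending score:

```python
import numpy as np

def iou(a, b):
    # Pixel-inclusive (+1) convention, matching the C++ IoU above.
    x1, y1 = max(a[0], b[0]), max(a[1], b[1])
    x2, y2 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0., x2 - x1 + 1) * max(0., y2 - y1 + 1)
    area_a = (a[2] - a[0] + 1) * (a[3] - a[1] + 1)
    area_b = (b[2] - b[0] + 1) * (b[3] - b[1] + 1)
    return inter / (area_a + area_b - inter)

def nms_cpu(boxes, thresh, max_keeps):
    """Greedy NMS over (x1, y1, x2, y2, score) rows sorted by score."""
    keep, dead = [], np.zeros(len(boxes), dtype=bool)
    for i in range(len(boxes)):
        if dead[i]:
            continue
        keep.append(i)
        if len(keep) == max_keeps:
            break
        for j in range(i + 1, len(boxes)):
            if not dead[j] and iou(boxes[i], boxes[j]) > thresh:
                dead[j] = True
    return keep
```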
#ifdef USE_CUDA
#include <dragon/core/context_cuda.h>
#include "detection_utils.h"
namespace dragon {
namespace utils {
namespace detection {
#define DIV_UP(m,n) ((m) / (n) + ((m) % (n) > 0))
#define NUM_THREADS 64
namespace {
template <typename T>
__device__ bool _CheckIoU(
const T* a,
const T* b,
const float thresh) {
const T x1 = max(a[0], b[0]);
const T y1 = max(a[1], b[1]);
const T x2 = min(a[2], b[2]);
const T y2 = min(a[3], b[3]);
const T width = max(T(0), x2 - x1 + 1);
const T height = max(T(0), y2 - y1 + 1);
const T inter = width * height;
const T Sa = (a[2] - a[0] + T(1)) * (a[3] - a[1] + T(1));
const T Sb = (b[2] - b[0] + T(1)) * (b[3] - b[1] + T(1));
return inter > thresh * (Sa + Sb - inter);
}
template <typename T>
__global__ void _NonMaxSuppression(
const int num_blocks,
const int num_boxes,
const T thresh,
const T* dev_boxes,
uint64_t* dev_mask) {
const int row_start = blockIdx.y;
const int col_start = blockIdx.x;
if (row_start > col_start) return;
const int row_size = min(num_boxes - row_start * NUM_THREADS, NUM_THREADS);
const int col_size = min(num_boxes - col_start * NUM_THREADS, NUM_THREADS);
__shared__ T block_boxes[NUM_THREADS * 4];
if (threadIdx.x < col_size) {
const int c1 = threadIdx.x * 4;
const int c2 = (col_start * NUM_THREADS + threadIdx.x) * 5;
block_boxes[c1] = dev_boxes[c2];
block_boxes[c1 + 1] = dev_boxes[c2 + 1];
block_boxes[c1 + 2] = dev_boxes[c2 + 2];
block_boxes[c1 + 3] = dev_boxes[c2 + 3];
}
__syncthreads();
if (threadIdx.x < row_size) {
const int index = row_start * NUM_THREADS + threadIdx.x;
const T* dev_box = dev_boxes + index * 5;
unsigned long long val = 0;
const int start = (row_start == col_start) ? (threadIdx.x + 1) : 0;
for (int i = start; i < col_size; ++i) {
if (_CheckIoU(dev_box, block_boxes + i * 4, thresh)) {
val |= 1ULL << i;
}
}
dev_mask[index * num_blocks + col_start] = val;
}
}
} // namespace
template <> void ApplyNMS<float, CUDAContext>(
const int num_boxes,
const int max_keeps,
const float thresh,
const float* boxes,
int64_t* keep_indices,
int& num_keep,
CUDAContext* ctx) {
const int num_blocks = DIV_UP(num_boxes, NUM_THREADS);
vector<uint64_t> mask_host(num_boxes * num_blocks);
auto* mask_dev = (uint64_t*)ctx->New(mask_host.size() * sizeof(uint64_t));
_NonMaxSuppression
<<< dim3(num_blocks, num_blocks), NUM_THREADS,
0, ctx->cuda_stream() >>>(
num_blocks,
num_boxes,
thresh,
boxes,
mask_dev
);
CUDA_CHECK(cudaMemcpyAsync(
mask_host.data(),
mask_dev,
mask_host.size() * sizeof(uint64_t),
cudaMemcpyDeviceToHost,
ctx->cuda_stream()
));
ctx->FinishDeviceComputation();
vector<uint64_t> dead_bit(num_blocks);
memset(&dead_bit[0], 0, sizeof(uint64_t) * num_blocks);
int num_selected = 0;
for (int i = 0; i < num_boxes; ++i) {
const int nblock = i / NUM_THREADS;
const int inblock = i % NUM_THREADS;
if (!(dead_bit[nblock] & (1ULL << inblock))) {
keep_indices[num_selected++] = i;
auto* mask_i = &mask_host[0] + i * num_blocks;
for (int j = nblock; j < num_blocks; ++j) dead_bit[j] |= mask_i[j];
if (num_selected == max_keeps) break;
}
}
num_keep = num_selected;
ctx->Delete(mask_dev);
}
} // namespace detection
} // namespace utils
} // namespace dragon
#endif // USE_CUDA
# --------------------------------------------------------
# Fast R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Sergey Karayev
# --------------------------------------------------------
cimport cython
import numpy as np
cimport numpy as np
DTYPE = np.float
ctypedef np.float_t DTYPE_t
@cython.boundscheck(False)
def bbox_overlaps(
np.ndarray[DTYPE_t, ndim=2] boxes,
np.ndarray[DTYPE_t, ndim=2] query_boxes):
"""
Parameters
----------
boxes: (N, 4) ndarray of float
query_boxes: (K, 4) ndarray of float
Returns
-------
overlaps: (N, K) ndarray of overlap between boxes and query_boxes
"""
cdef unsigned int N = boxes.shape[0]
cdef unsigned int K = query_boxes.shape[0]
cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE)
cdef DTYPE_t iw, ih, box_area
cdef DTYPE_t ua
cdef unsigned int k, n
with nogil:
for k in range(K):
box_area = (
(query_boxes[k, 2] - query_boxes[k, 0] + 1) *
(query_boxes[k, 3] - query_boxes[k, 1] + 1)
)
for n in range(N):
iw = (
min(boxes[n, 2], query_boxes[k, 2]) -
max(boxes[n, 0], query_boxes[k, 0]) + 1
)
if iw > 0:
ih = (
min(boxes[n, 3], query_boxes[k, 3]) -
max(boxes[n, 1], query_boxes[k, 1]) + 1
)
if ih > 0:
ua = float(
(boxes[n, 2] - boxes[n, 0] + 1) *
(boxes[n, 3] - boxes[n, 1] + 1) +
box_area - iw * ih
)
                        overlaps[n, k] = iw * ih / ua
    return overlaps
# --------------------------------------------------------
# Fast R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Sergey Karayev
# --------------------------------------------------------
cimport cython
import numpy as np
cimport numpy as np
DTYPE = np.float
ctypedef np.float_t DTYPE_t
@cython.boundscheck(False)
def bbox_overlaps(
np.ndarray[DTYPE_t, ndim=2] boxes,
np.ndarray[DTYPE_t, ndim=2] query_boxes):
"""
Parameters
----------
boxes: (N, 4) ndarray of float
query_boxes: (K, 4) ndarray of float
Returns
-------
overlaps: (N, K) ndarray of overlap between boxes and query_boxes
"""
cdef unsigned int N = boxes.shape[0]
cdef unsigned int K = query_boxes.shape[0]
cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE)
cdef DTYPE_t iw, ih, box_area
cdef DTYPE_t ua
cdef unsigned int k, n
with nogil:
for k in range(K):
box_area = (
(query_boxes[k, 2] - query_boxes[k, 0] + 1) *
(query_boxes[k, 3] - query_boxes[k, 1] + 1)
)
for n in range(N):
iw = (
min(boxes[n, 2], query_boxes[k, 2]) -
max(boxes[n, 0], query_boxes[k, 0]) + 1
)
if iw > 0:
ih = (
min(boxes[n, 3], query_boxes[k, 3]) -
max(boxes[n, 1], query_boxes[k, 1]) + 1
)
if ih > 0:
ua = float(
(boxes[n, 2] - boxes[n, 0] + 1) *
(boxes[n, 3] - boxes[n, 1] + 1) +
box_area - iw * ih
)
overlaps[n, k] = iw * ih / ua
return overlaps
\ No newline at end of file
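A quick usage sketch for the extension above, assuming it has been built and is importable under the pre-refactor `lib.utils` layout; the expected overlap follows the same pixel-inclusive (+1) convention:

```python
import numpy as np
from lib.utils import cython_bbox  # pre-refactor import path (assumed)

boxes = np.array([[0., 0., 9., 9.]])                      # one 10x10 box
query = np.array([[0., 0., 9., 9.], [5., 5., 14., 14.]])  # two query boxes
print(cython_bbox.bbox_overlaps(boxes, query))
# [[1.0, 0.1429]]: 25 / (100 + 100 - 25) for the second query
```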
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Compile the cython extensions."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from distutils.extension import Extension
from distutils.core import setup
import os
from Cython.Distutils import build_ext
import numpy as np
ext_modules = [
Extension(
'install.lib.utils.cython_bbox',
['cython_bbox.pyx'],
extra_compile_args=['-w'],
include_dirs=[np.get_include()]
),
Extension(
'install.lib.utils.cython_nms',
['cython_nms.pyx'],
extra_compile_args=['-w'],
include_dirs=[np.get_include()]
),
Extension(
'install.lib.pycocotools._mask',
['maskApi.c', '_mask.pyx'],
include_dirs=[np.get_include(), os.path.dirname(os.path.abspath(__file__))],
extra_compile_args=['-w']
),
]
setup(
name='SeetaDet',
ext_modules=ext_modules,
cmdclass={'build_ext': build_ext},
)
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import multiprocessing as mp
import time
import dragon
import dragon.vm.torch as torch
import numpy as np
from lib.core.config import cfg
from lib.faster_rcnn.data_transformer import DataTransformer
from lib.datasets.factory import get_imdb
from lib.utils import logger
from lib.utils.blob import im_list_to_blob
class DataLoader(object):
"""Provide mini-batches of data."""
def __init__(self):
super(DataLoader, self).__init__()
database = get_imdb(cfg.TRAIN.DATABASE)
self.data_batch = DataBatch(**{
'dataset': lambda: dragon.io.SeetaRecordDataset(database.source),
'classes': database.classes,
'shuffle': cfg.TRAIN.USE_SHUFFLE,
'num_chunks': cfg.TRAIN.NUM_SHUFFLE_CHUNKS,
'batch_size': cfg.TRAIN.IMS_PER_BATCH * 2,
'num_transformers': cfg.TRAIN.NUM_WORKERS,
})
def __call__(self):
outputs = self.data_batch.get()
outputs['data'] = torch.from_numpy(outputs['data'])
return outputs
class DataBatch(mp.Process):
"""Prefetch the batch of data."""
def __init__(self, **kwargs):
"""Construct a ``DataBatch``.
Parameters
----------
dataset : lambda
The creator of a dataset.
classes : Sequence[str]
The class names.
shuffle : bool, optional, default=False
Whether to shuffle the data.
num_chunks : int, optional, default=0
The number of chunks to split.
batch_size : int, optional, default=2
The size of a mini-batch.
num_transformers : int, optional, default=3
The number of workers to transform data.
"""
super(DataBatch, self).__init__()
# Distributed settings
rank, group_size = 0, 1
process_group = dragon.distributed.get_group()
if process_group is not None and kwargs.get(
'phase', 'TRAIN') == 'TRAIN':
group_size = process_group.size
rank = dragon.distributed.get_rank(process_group)
kwargs['group_size'] = group_size
# Configuration
self._prefetch = kwargs.get('prefetch', 5)
self._batch_size = kwargs.get('batch_size', 2)
self._num_readers = kwargs.get('num_readers', 1)
self._num_transformers = kwargs.get('num_transformers', 3)
self._num_fetchers = kwargs.get('num_fetchers', 1)
self.daemon = True
# Initialize queues
num_batches = self._prefetch * self._num_readers
self.Q1 = mp.Queue(num_batches * self._batch_size)
self.Q21 = mp.Queue(num_batches * self._batch_size)
self.Q22 = mp.Queue(num_batches * self._batch_size)
self.Q3 = mp.Queue(num_batches)
# Initialize readers
self._readers = []
for i in range(self._num_readers):
part_idx, num_parts = i, self._num_readers
num_parts *= group_size
part_idx += rank * self._num_readers
self._readers.append(dragon.io.DataReader(
num_parts=num_parts, part_idx=part_idx, **kwargs))
self._readers[i]._seed += part_idx
self._readers[i].q_out = self.Q1
self._readers[i].start()
time.sleep(0.1)
# Initialize transformers
self._transformers = []
for i in range(self._num_transformers):
transformer = DataTransformer(**kwargs)
transformer._seed += (i + rank * self._num_transformers)
transformer.q_in = self.Q1
transformer.q1_out, transformer.q2_out = self.Q21, self.Q22
transformer.start()
self._transformers.append(transformer)
time.sleep(0.1)
# Initialize batch-producer
self.start()
# Register cleanup callbacks
def cleanup():
def terminate(processes):
for process in processes:
process.terminate()
process.join()
terminate([self])
logger.info('Terminate DataBatch.')
terminate(self._transformers)
logger.info('Terminate DataTransformer.')
terminate(self._readers)
logger.info('Terminate DataReader.')
import atexit
atexit.register(cleanup)
def get(self):
"""Get a batch.
Returns
-------
dict
The batch dict.
"""
return self.Q3.get()
def run(self):
"""Start the process to produce batches."""
def produce(q_in):
processed_ims, ims_info, all_boxes = [], [], []
for image_index in range(cfg.TRAIN.IMS_PER_BATCH):
im, im_scale, gt_boxes = q_in.get()
processed_ims.append(im)
ims_info.append(list(im.shape[:2]) + [im_scale])
im_boxes = np.zeros((gt_boxes.shape[0], gt_boxes.shape[1] + 1), 'float32')
im_boxes[:, :gt_boxes.shape[1]], im_boxes[:, -1] = gt_boxes, image_index
all_boxes.append(im_boxes)
return {
'data': im_list_to_blob(processed_ims),
'ims_info': np.array(ims_info, dtype=np.float32),
'gt_boxes': np.concatenate(all_boxes, axis=0),
}
# Two queues to implement aspect-grouping
        # This is necessary to reduce the GPU memory
        # cost of fetching a huge square batch blob
q1, q2 = self.Q21, self.Q22
# Main prefetch loop
while True:
if q1.qsize() >= cfg.TRAIN.IMS_PER_BATCH:
self.Q3.put(produce(q1))
elif q2.qsize() >= cfg.TRAIN.IMS_PER_BATCH:
self.Q3.put(produce(q2))
q1, q2 = q2, q1 # Uniform sampling trick
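The two-queue loop above implements aspect-grouping: portrait and landscape images are batched separately so a batch blob never pads out to a huge square. A standalone sketch of the same idea, assuming each sample routes by its height/width ratio:

```python
from collections import deque

def aspect_grouped_batches(samples, batch_size):
    """Yield batches whose images share an orientation.

    `samples` is an iterable of (h, w, payload) tuples; portrait and
    landscape images go to separate queues, mirroring Q21/Q22 above.
    """
    q1, q2 = deque(), deque()
    for h, w, payload in samples:
        (q1 if h >= w else q2).append(payload)
        for q in (q1, q2):
            if len(q) >= batch_size:
                yield [q.popleft() for _ in range(batch_size)]
```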
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import multiprocessing as mp
import time
import dragon
import dragon.vm.torch as torch
import numpy as np
from lib.core.config import cfg
from lib.mask_rcnn.data_transformer import DataTransformer
from lib.datasets.factory import get_imdb
from lib.utils import logger
from lib.utils.blob import im_list_to_blob
from lib.utils.blob import mask_list_to_blob
class DataLoader(object):
"""Provide mini-batches of data."""
def __init__(self):
super(DataLoader, self).__init__()
database = get_imdb(cfg.TRAIN.DATABASE)
self.data_batch = DataBatch(**{
'dataset': lambda: dragon.io.SeetaRecordDataset(database.source),
'classes': database.classes,
'shuffle': cfg.TRAIN.USE_SHUFFLE,
'num_chunks': cfg.TRAIN.NUM_SHUFFLE_CHUNKS,
'batch_size': cfg.TRAIN.IMS_PER_BATCH * 2,
'num_transformers': cfg.TRAIN.NUM_WORKERS,
})
def __call__(self):
outputs = self.data_batch.get()
outputs['data'] = torch.from_numpy(outputs['data'])
return outputs
class DataBatch(mp.Process):
"""Prefetch the batch of data."""
def __init__(self, **kwargs):
"""Construct a ``DataBatch``.
Parameters
----------
dataset : lambda
The creator of a dataset.
classes : Sequence[str]
The class names.
shuffle : bool, optional, default=False
Whether to shuffle the data.
num_chunks : int, optional, default=0
The number of chunks to split.
batch_size : int, optional, default=2
The size of a mini-batch.
num_transformers : int, optional, default=3
The number of workers to transform data.
"""
super(DataBatch, self).__init__()
# Distributed settings
rank, group_size = 0, 1
process_group = dragon.distributed.get_group()
if process_group is not None and kwargs.get(
'phase', 'TRAIN') == 'TRAIN':
group_size = process_group.size
rank = dragon.distributed.get_rank(process_group)
kwargs['group_size'] = group_size
# Configuration
self._prefetch = kwargs.get('prefetch', 5)
self._batch_size = kwargs.get('batch_size', 2)
self._num_readers = kwargs.get('num_readers', 1)
self._num_transformers = kwargs.get('num_transformers', 3)
self._num_fetchers = kwargs.get('num_fetchers', 1)
self.daemon = True
# Initialize queues
num_batches = self._prefetch * self._num_readers
self.Q1 = mp.Queue(num_batches * self._batch_size)
self.Q21 = mp.Queue(num_batches * self._batch_size)
self.Q22 = mp.Queue(num_batches * self._batch_size)
self.Q3 = mp.Queue(num_batches)
# Initialize readers
self._readers = []
for i in range(self._num_readers):
part_idx, num_parts = i, self._num_readers
num_parts *= group_size
part_idx += rank * self._num_readers
self._readers.append(dragon.io.DataReader(
num_parts=num_parts, part_idx=part_idx, **kwargs))
self._readers[i]._seed += part_idx
self._readers[i].q_out = self.Q1
self._readers[i].start()
time.sleep(0.1)
# Initialize transformers
self._transformers = []
for i in range(self._num_transformers):
transformer = DataTransformer(**kwargs)
transformer._seed += (i + rank * self._num_transformers)
transformer.q_in = self.Q1
transformer.q1_out, transformer.q2_out = self.Q21, self.Q22
transformer.start()
self._transformers.append(transformer)
time.sleep(0.1)
# Initialize batch-producer
self.start()
# Register cleanup callbacks
def cleanup():
def terminate(processes):
for process in processes:
process.terminate()
process.join()
terminate([self])
logger.info('Terminate DataBatch.')
terminate(self._transformers)
logger.info('Terminate DataTransformer.')
terminate(self._readers)
logger.info('Terminate DataReader.')
import atexit
atexit.register(cleanup)
def get(self):
"""Get a batch.
Returns
-------
dict
The batch dict.
"""
return self.Q3.get()
def run(self):
"""Start the process to produce batches."""
def produce(q_in):
processed_ims, ims_info = [], []
packed_boxes, packed_masks = [], []
for image_index in range(cfg.TRAIN.IMS_PER_BATCH):
im, im_scale, gt_boxes, gt_masks = q_in.get()
processed_ims.append(im)
ims_info.append(list(im.shape[:2]) + [im_scale])
im_boxes = np.zeros((gt_boxes.shape[0], gt_boxes.shape[1] + 1), 'float32')
im_boxes[:, :gt_boxes.shape[1]], im_boxes[:, -1] = gt_boxes, image_index
packed_boxes.append(im_boxes)
packed_masks.append(gt_masks)
return {
'data': im_list_to_blob(processed_ims),
'ims_info': np.array(ims_info, 'float32'),
'gt_boxes': np.concatenate(packed_boxes, 0),
'gt_masks': mask_list_to_blob(packed_masks),
}
# Two queues to implement aspect-grouping
        # This is necessary to reduce the GPU memory
        # cost of fetching a huge square batch blob
q1, q2 = self.Q21, self.Q22
# Main prefetch loop
while True:
if q1.qsize() >= cfg.TRAIN.IMS_PER_BATCH:
self.Q3.put(produce(q1))
elif q2.qsize() >= cfg.TRAIN.IMS_PER_BATCH:
self.Q3.put(produce(q2))
q1, q2 = q2, q1 # Uniform sampling trick
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import importlib
_STORE = collections.defaultdict(dict)
###########################################
# #
# Body #
# #
###########################################
# ResNet
for D in [18, 34, 50, 101, 152, 200, 269]:
_STORE['BODY']['resnet{}'.format(D)] = \
'lib.modeling.resnet.make_resnet_{}'.format(D)
# VGG
for D in [16, 19]:
for T in ['', '_reduced_300', '_reduced_512']:
_STORE['BODY']['vgg{}{}'.format(D, T)] = \
'lib.modeling.vgg.make_vgg_{}{}'.format(D, T)
# AirNet
for D in ['', '3b', '4b', '5b']:
_STORE['BODY']['airnet{}'.format(D)] = \
'lib.modeling.airnet.make_airnet_{}'.format(D)
# MobileNet
for D in ['a1', 'v2']:
_STORE['BODY']['mobilenet_{}'.format(D)] = \
'lib.modeling.mobilenet.make_mobilenet_{}'.format(D)
def get_template_func(name, sets, desc):
name = name.lower()
if name not in sets:
raise ValueError(
'The {} for {} was not registered.\n'
'Registered modules: [{}]'
.format(name, desc, ', '.join(sets.keys()))
)
module_name = '.'.join(sets[name].split('.')[0:-1])
func_name = sets[name].split('.')[-1]
try:
module = importlib.import_module(module_name)
return getattr(module, func_name)
except ImportError as e:
        raise ValueError('Cannot import module from: ' + module_name)
def get_body_func(name):
return get_template_func(
name, _STORE['BODY'], 'Body')
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Define some basic structures."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.torch import nn
from lib.core.config import cfg
class Affine(object):
"""Affine transformation with weight and bias fixed."""
def __new__(cls, dim_in, bias=True, inplace=True):
return nn.Affine(
dim_in,
fix_weight=True,
fix_bias=True,
inplace=inplace,
)
class Conv1x1(object):
"""1x1 convolution."""
def __new__(cls, dim_in, dim_out, stride=1, bias=False):
return nn.Conv2d(
dim_in,
dim_out,
kernel_size=1,
stride=stride,
bias=bias,
)
class Conv3x3(object):
"""3x3 convolution."""
def __new__(cls, dim_in, dim_out, stride=1, dilation=1, bias=False):
return nn.Conv2d(
dim_in,
dim_out,
kernel_size=3,
stride=stride,
padding=1 * dilation,
bias=bias,
)
class CrossEntropyLoss(object):
"""Cross entropy loss."""
def __new__(cls):
return nn.CrossEntropyLoss(ignore_index=-1)
class Identity(nn.Module):
"""Pass input to the output."""
def __init__(self, *args, **kwargs):
super(Identity, self).__init__()
_, _ = args, kwargs
def forward(self, x):
return x
class SigmoidFocalLoss(object):
"""Sigmoid focal loss."""
def __new__(cls):
return nn.SigmoidFocalLoss(
alpha=cfg.MODEL.FOCAL_LOSS_ALPHA,
gamma=cfg.MODEL.FOCAL_LOSS_GAMMA,
)
class SmoothL1Loss(object):
"""Smoothed l1 loss."""
def __new__(cls, beta=1.):
return nn.SmoothL1Loss(
beta=beta,
reduction='batch_size',
)
def is_conv2d(module):
"""Return a bool indicating the module is a Conv2d."""
return isinstance(module, nn.Conv2d) or \
isinstance(module, nn.DepthwiseConv2d)
AvgPool2d = nn.AvgPool2d
BatchNorm2d = nn.BatchNorm2d
BCEWithLogitsLoss = nn.BCEWithLogitsLoss
Conv2d = nn.Conv2d
ConvTranspose2d = nn.ConvTranspose2d
DepthwiseConv2d = nn.DepthwiseConv2d
Linear = nn.Linear
MaxPool2d = nn.MaxPool2d
Module = nn.Module
ModuleList = nn.ModuleList
Sequential = nn.Sequential
ReLU = nn.ReLU
Sigmoid = nn.Sigmoid
Softmax = nn.Softmax
Copyright (c) 2014, Piotr Dollar and Tsung-Yi Lin
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
The views and conclusions contained in the software and documentation are those
of the authors and should not be interpreted as representing official policies,
either expressed or implied, of the FreeBSD Project.
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import multiprocessing as mp
import time
import dragon
import dragon.vm.torch as torch
import numpy as np
from lib.core.config import cfg
from lib.datasets.factory import get_imdb
from lib.ssd.data_transformer import DataTransformer
from lib.utils import logger
class DataLoader(object):
"""Provide mini-batches of data."""
def __init__(self):
super(DataLoader, self).__init__()
database = get_imdb(cfg.TRAIN.DATABASE)
self.data_batch = DataBatch(**{
'dataset': lambda: dragon.io.SeetaRecordDataset(database.source),
'classes': database.classes,
'shuffle': cfg.TRAIN.USE_SHUFFLE,
'num_chunks': cfg.TRAIN.NUM_SHUFFLE_CHUNKS,
'batch_size': cfg.TRAIN.IMS_PER_BATCH * 2,
'num_transformers': cfg.TRAIN.NUM_WORKERS,
})
def __call__(self):
outputs = self.data_batch.get()
outputs['data'] = torch.from_numpy(outputs['data'])
return outputs
class DataBatch(mp.Process):
"""Prefetch the batch of data."""
def __init__(self, **kwargs):
"""Construct a ``DataBatch``.
Parameters
----------
dataset : lambda
The creator of a dataset.
classes : Sequence[str]
The class names.
shuffle : bool, optional, default=False
Whether to shuffle the data.
num_chunks : int, optional, default=0
The number of chunks to split.
batch_size : int, optional, default=2
The size of a mini-batch.
num_transformers : int, optional, default=3
The number of workers to transform data.
"""
super(DataBatch, self).__init__()
# Distributed settings
rank, group_size = 0, 1
process_group = dragon.distributed.get_group()
if process_group is not None and kwargs.get(
'phase', 'TRAIN') == 'TRAIN':
group_size = process_group.size
rank = dragon.distributed.get_rank(process_group)
kwargs['group_size'] = group_size
# Configuration
self._prefetch = kwargs.get('prefetch', 5)
self._batch_size = kwargs.get('batch_size', 32)
self._num_readers = kwargs.get('num_readers', 1)
self._num_transformers = kwargs.get('num_transformers', 3)
self._num_fetchers = kwargs.get('num_fetchers', 1)
# Initialize queues
num_batches = self._prefetch * self._num_readers
self.Q1 = mp.Queue(num_batches * self._batch_size)
self.Q2 = mp.Queue(num_batches * self._batch_size)
self.Q3 = mp.Queue(num_batches)
# Initialize readers
self._readers = []
for i in range(self._num_readers):
part_idx, num_parts = i, self._num_readers
num_parts *= group_size
part_idx += rank * self._num_readers
self._readers.append(dragon.io.DataReader(
num_parts=num_parts, part_idx=part_idx, **kwargs))
self._readers[i]._seed += part_idx
self._readers[i].q_out = self.Q1
self._readers[i].start()
time.sleep(0.1)
# Initialize transformers
self._transformers = []
for i in range(self._num_transformers):
transformer = DataTransformer(**kwargs)
transformer._seed += (i + rank * self._num_transformers)
transformer.q_in, transformer.q_out = self.Q1, self.Q2
transformer.start()
self._transformers.append(transformer)
time.sleep(0.1)
# Initialize batch-producer
self.start()
# Register cleanup callbacks
def cleanup():
def terminate(processes):
for process in processes:
process.terminate()
process.join()
terminate([self])
logger.info('Terminate DataBatch.')
terminate(self._transformers)
logger.info('Terminate DataTransformer.')
terminate(self._readers)
logger.info('Terminate DataReader.')
import atexit
atexit.register(cleanup)
def get(self):
"""Get a batch.
Returns
-------
dict
The batch dict.
"""
return self.Q3.get()
def run(self):
"""Start the process to produce batches."""
image_batch_shape = (
cfg.TRAIN.IMS_PER_BATCH,
cfg.SSD.RESIZE.HEIGHT,
cfg.SSD.RESIZE.WIDTH, 3,
)
# Main prefetch loop
while True:
boxes_to_pack = []
img, gt_boxes = self.Q2.get()
ims_blob = np.zeros(image_batch_shape, img.dtype)
for i in range(cfg.TRAIN.IMS_PER_BATCH):
ims_blob[i] = img
boxes = np.zeros((gt_boxes.shape[0], gt_boxes.shape[1] + 1), 'float32')
boxes[:, :gt_boxes.shape[1]], boxes[:, -1] = gt_boxes, i
boxes_to_pack.append(boxes)
if i != (cfg.TRAIN.IMS_PER_BATCH - 1):
img, gt_boxes = self.Q2.get()
self.Q3.put({
'data': ims_blob,
'gt_boxes': np.concatenate(boxes_to_pack),
})
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# Code is based on:
#
# <https://github.com/ppwwyyxx/tensorpack/blob/master/examples/FasterRCNN/utils/np_box_ops.py>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from lib.utils import cython_bbox
def intersection(boxes1, boxes2):
"""Compute pairwise intersection areas between boxes.
Args:
boxes1: a numpy array with shape [N, 4] holding N boxes
boxes2: a numpy array with shape [M, 4] holding M boxes
Returns:
        a numpy array with shape [N, M] representing pairwise intersection areas
"""
[y_min1, x_min1, y_max1, x_max1] = np.split(boxes1, 4, axis=1)
[y_min2, x_min2, y_max2, x_max2] = np.split(boxes2, 4, axis=1)
all_pairs_min_ymax = np.minimum(y_max1, np.transpose(y_max2))
all_pairs_max_ymin = np.maximum(y_min1, np.transpose(y_min2))
intersect_heights = np.maximum(
np.zeros(all_pairs_max_ymin.shape),
all_pairs_min_ymax - all_pairs_max_ymin)
all_pairs_min_xmax = np.minimum(x_max1, np.transpose(x_max2))
all_pairs_max_xmin = np.maximum(x_min1, np.transpose(x_min2))
intersect_widths = np.maximum(
np.zeros(all_pairs_max_xmin.shape),
all_pairs_min_xmax - all_pairs_max_xmin)
return intersect_heights * intersect_widths
def iou(boxes1, boxes2):
"""Computes pairwise intersection-over-union between box collections.
Args:
boxes1: a numpy array with shape [N, 4] holding N boxes.
boxes2: a numpy array with shape [M, 4] holding M boxes.
Returns:
a numpy array with shape [N, M] representing pairwise iou scores.
"""
intersect = intersection(boxes1, boxes2)
area1 = boxes_area(boxes1)
area2 = boxes_area(boxes2)
union = \
np.expand_dims(area1, axis=1) + \
np.expand_dims(area2, axis=0) - intersect
return intersect / union
def ioa1(boxes1, boxes2):
"""Computes pairwise intersection-over-area between box collections.
    Intersection-over-area (ioa) between two boxes box1 and box2 is defined as
    their intersection area over box1's area. Note that ioa is not symmetric,
    that is, IOA(box1, box2) != IOA(box2, box1).
    Args:
        boxes1: a numpy array with shape [N, 4] holding N boxes.
        boxes2: a numpy array with shape [M, 4] holding M boxes.
Returns:
a numpy array with shape [N, M] representing pairwise ioa scores.
"""
intersect = intersection(boxes1, boxes2)
areas = np.expand_dims(boxes_area(boxes1), axis=1)
return intersect / areas
def ioa2(boxes1, boxes2):
"""Computes pairwise intersection-over-area between box collections.
Intersection-over-area (ioa) between two boxes box1 and box2 is defined as
their intersection area over box2's area. Note that ioa is not symmetric,
that is, IOA(box1, box2) != IOA(box2, box1).
Args:
boxes1: a numpy array with shape [N, 4] holding N boxes.
        boxes2: a numpy array with shape [M, 4] holding M boxes.
Returns:
a numpy array with shape [N, M] representing pairwise ioa scores.
"""
intersect = intersection(boxes1, boxes2)
areas = np.expand_dims(boxes_area(boxes2), axis=0)
return intersect / areas
def bbox_overlaps(boxes1, boxes2):
"""Compute the overlaps between two group of boxes."""
return cython_bbox.bbox_overlaps(
np.ascontiguousarray(boxes1, dtype=np.float),
np.ascontiguousarray(boxes2, dtype=np.float),
)
def bbox_transform(ex_rois, gt_rois, weights=(1., 1., 1., 1.)):
"""Transform the boxes to the regression targets."""
ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.
ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.
ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths
ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights
gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.
gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.
gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths
gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights
wx, wy, ww, wh = weights
targets = [wx * (gt_ctr_x - ex_ctr_x) / ex_widths]
targets += [wy * (gt_ctr_y - ex_ctr_y) / ex_heights]
targets += [ww * np.log(gt_widths / ex_widths)]
targets += [wh * np.log(gt_heights / ex_heights)]
return np.vstack(targets).transpose()
def bbox_transform_inv(boxes, deltas, weights=(1., 1., 1., 1.)):
"""Decode the final boxes according to the deltas."""
if boxes.shape[0] == 0:
return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype)
boxes = boxes.astype(deltas.dtype, copy=False)
widths = boxes[:, 2] - boxes[:, 0] + 1.
heights = boxes[:, 3] - boxes[:, 1] + 1.
ctr_x = boxes[:, 0] + 0.5 * widths
ctr_y = boxes[:, 1] + 0.5 * heights
wx, wy, ww, wh = weights
dx = deltas[:, 0::4] / wx
dy = deltas[:, 1::4] / wy
dw = deltas[:, 2::4] / ww
dh = deltas[:, 3::4] / wh
pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis]
pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis]
pred_w = np.exp(dw) * widths[:, np.newaxis]
pred_h = np.exp(dh) * heights[:, np.newaxis]
pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype)
pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w # x1
pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h # y1
pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w # x2
pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h # y2
return pred_boxes
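A small worked example for the encode/decode pair above. Note that under the pixel-inclusive (+1) width convention, the decoded x2/y2 land one pixel beyond the encoded ground truth, a known quirk of this classic formulation:

```python
import numpy as np

ex = np.array([[0., 0., 99., 99.]])      # a 100x100 anchor
gt = np.array([[10., 10., 119., 129.]])  # a 110x120 ground-truth box
deltas = bbox_transform(ex, gt)          # [[0.15, 0.2, log(1.1), log(1.2)]]
rec = bbox_transform_inv(ex, deltas)
# rec -> [[10., 10., 120., 130.]]: x1/y1 exact, x2/y2 one pixel over.
```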
def boxes_area(boxes):
"""Compute the area of an array of boxes."""
w = (boxes[:, 2] - boxes[:, 0] + 1)
h = (boxes[:, 3] - boxes[:, 1] + 1)
areas = w * h
    assert np.all(areas >= 0), 'Negative areas found'
return areas
def clip_boxes(boxes, im_shape):
# x1 >= 0
boxes[:, 0] = np.maximum(np.minimum(boxes[:, 0], im_shape[1] - 1), 0)
# y1 >= 0
boxes[:, 1] = np.maximum(np.minimum(boxes[:, 1], im_shape[0] - 1), 0)
# x2 < im_shape[1]
boxes[:, 2] = np.maximum(np.minimum(boxes[:, 2], im_shape[1] - 1), 0)
# y2 < im_shape[0]
boxes[:, 3] = np.maximum(np.minimum(boxes[:, 3], im_shape[0] - 1), 0)
return boxes
def clip_tiled_boxes(boxes, im_shape):
# x1 >= 0
boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0)
# y1 >= 0
boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0)
# x2 < im_shape[1]
boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0)
# y2 < im_shape[0]
boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0)
return boxes
def expand_boxes(boxes, scale):
"""Expand an array of boxes by a given scale."""
w_half = (boxes[:, 2] - boxes[:, 0]) * .5
h_half = (boxes[:, 3] - boxes[:, 1]) * .5
x_c = (boxes[:, 2] + boxes[:, 0]) * .5
y_c = (boxes[:, 3] + boxes[:, 1]) * .5
w_half *= scale
h_half *= scale
boxes_exp = np.zeros(boxes.shape)
boxes_exp[:, 0] = x_c - w_half
boxes_exp[:, 2] = x_c + w_half
boxes_exp[:, 1] = y_c - h_half
boxes_exp[:, 3] = y_c + h_half
return boxes_exp
def flip_boxes(boxes, width):
"""Flip the boxes horizontally."""
flip_boxes = boxes.copy()
old_x1 = boxes[:, 0].copy()
old_x2 = boxes[:, 2].copy()
flip_boxes[:, 0] = width - old_x2 - 1
flip_boxes[:, 2] = width - old_x1 - 1
return flip_boxes
def filter_boxes(boxes, min_size):
"""Remove all boxes with any side smaller than min size."""
ws = boxes[:, 2] - boxes[:, 0] + 1
hs = boxes[:, 3] - boxes[:, 1] + 1
keep = np.where((ws >= min_size) & (hs >= min_size))[0]
return keep
def dismantle_boxes(gt_boxes, num_images):
"""Dismantle the packed ground-truth boxes."""
return [
gt_boxes[
np.where(gt_boxes[:, -1].astype(np.int32) == i)[0]
][:, :-1] for i in range(num_images)
]
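`dismantle_boxes` undoes the packing convention used by the data loaders, where each ground-truth row carries its image index in the last column; a tiny illustration:

```python
import numpy as np

packed = np.array([
    [0., 0., 9., 9., 1., 0.],    # image 0: box, class 1
    [5., 5., 19., 19., 2., 1.],  # image 1: box, class 2
])
per_image = dismantle_boxes(packed, num_images=2)
# per_image[0] -> [[0., 0., 9., 9., 1.]]
# per_image[1] -> [[5., 5., 19., 19., 2.]]
```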
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import dragon
from dragon.core.framework import tensor_util
from dragon.core.util import six
import dragon.vm.torch as torch
import numpy as np
from lib.core.config import cfg
def feed_tensor(tensor, array):
tensor_util.set_array(tensor, array)
def get_param_groups(module, bias_lr=1., bias_decay=0.):
"""Separate weight and bias into parameters groups.
Parameters
----------
module : dragon.vm.torch.nn.Module
The module to collect parameters.
bias_lr : float, optional, default=1.
The lr multiplier of bias.
bias_decay : float, optional, default=0.
The decay multiplier of bias.
Returns
-------
Sequence[ParamGroup]
The parameter groups.
"""
param_groups = [
{
'params': [],
'lr_mult': 1.,
'decay_mult': 1.,
},
{
'params': [],
'lr_mult': bias_lr,
'decay_mult': bias_decay,
}
]
for name, param in module.named_parameters():
gi = 0 if 'weight' in name and param.dim() > 1 else 1
param_groups[gi]['params'].append(param)
if len(param_groups[1]['params']) == 0:
param_groups.pop() # Remove empty group
return param_groups
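A quick sketch of what `get_param_groups` produces, assuming Dragon's torch-style `Conv2d` mirrors PyTorch's parameter naming ('weight', 'bias'):

```python
from dragon.vm.torch import nn

m = nn.Conv2d(3, 8, kernel_size=3, bias=True)
groups = get_param_groups(m, bias_lr=2., bias_decay=0.)
# groups[0]['params']: the 4-D conv weight (lr_mult=1, decay_mult=1)
# groups[1]['params']: the 1-D bias        (lr_mult=2, decay_mult=0)
```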
def get_workspace():
"""Return the current default workspace.
Returns
-------
dragon.Workspace
The default workspace.
"""
return dragon.get_workspace()
def new_placeholder(device=None):
"""Create a new tensor to feed data.
Parameters
----------
device : int, optional
The device index.
Returns
-------
dragon.vm.torch.Tensor
The placeholder tensor.
"""
value = torch.zeros(1)
if device is not None:
return value.cuda(device)
return value
def new_tensor(data, enforce_cpu=False):
"""Create a new tensor from the data.
Parameters
----------
data : array_like
The data value.
enforce_cpu : bool, optional, default=False
**True** to enforce the cpu storage.
Returns
-------
dragon.vm.torch.Tensor
        The tensor holding the data.
"""
if isinstance(data, np.ndarray):
tensor = torch.from_numpy(data)
elif isinstance(data, torch.Tensor):
tensor = data
else:
tensor = torch.tensor(data)
if not enforce_cpu:
tensor = tensor.cuda(cfg.GPU_ID)
return tensor
def new_workspace(merge_default=True):
"""Create a new workspace.
Parameters
----------
merge_default : bool, optional, default=True
**True** to merge tensors from default workspace.
Returns
-------
dragon.Workspace
The new workspace.
"""
workspace = dragon.Workspace()
if merge_default:
workspace.merge_from(get_workspace())
return workspace
def reset_workspace(workspace=None, merge_default=True):
"""Reset a workspace and return a new one.
Parameters
----------
workspace : dragon.Workspace, optional
The workspace to reset.
merge_default : bool, optional, default=True
**True** to merge tensors from default workspace.
Returns
-------
dragon.Workspace
The new workspace.
"""
if workspace is not None:
workspace.Clear() # Block the GIL
return new_workspace(merge_default)
class Graph(object):
"""Simple sequential graph to accelerate inference.
    Graph reduces the overhead of Python functions
    under eager execution. This cost is at least 15ms
    for common backbones, which limits inference to about 60 FPS.
    For more details, see the eager mechanism of Dragon.
"""
def __init__(self, inputs, outputs, constants=None):
def canonicalize(input_dict):
if input_dict is None:
return {}
for k, v in input_dict.items():
input_dict[k] = v.name if hasattr(v, 'name') else v
return input_dict
self.placeholders = {}
self._inputs = canonicalize(inputs)
self._outputs = canonicalize(outputs)
self._constants = canonicalize(constants)
self._workspace = get_workspace()
self._tracer = torch.jit.get_tracer()
@property
def workspace(self):
return self._workspace
@workspace.setter
def workspace(self, value):
self._workspace = value
def forward(self, **kwargs):
# Assign inputs
for name, tensor in self._inputs.items():
value = kwargs.get(name, None)
tensor_util.set_array(tensor, value)
# Replay the traced expressions
self._tracer.replay()
# Collect outputs
# 1) Target results
# 2) Constant values
outputs = collections.OrderedDict()
for name, tensor in self._outputs.items():
outputs[name] = tensor_util.to_array(tensor, True)
for name, value in self._constants.items():
outputs[name] = value
return outputs
def __call__(self, **kwargs):
with self._workspace.as_default():
return self.forward(**kwargs)
# Aliases
pickle = six.moves.pickle
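The `Graph` wrapper amortizes the per-call Python overhead by replaying a recorded trace instead of re-dispatching every op. A toy sketch of this record-once/replay-many pattern, independent of Dragon's tracer API:

```python
class Replayer(object):
    """Record a list of callables once, then replay them cheaply."""

    def __init__(self, builder):
        self._builder = builder
        self._ops = None

    def __call__(self, x):
        if self._ops is None:
            self._ops = self._builder()  # trace once
        for op in self._ops:             # replay on every later call
            x = op(x)
        return x

replay = Replayer(lambda: [lambda x: x + 1, lambda x: x * 2])
assert replay(3) == 8  # (3 + 1) * 2
```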
......@@ -11,6 +11,10 @@
"""Make record file for COCO dataset."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import shutil
......@@ -37,8 +41,8 @@ if __name__ == '__main__':
record_file=os.path.join(COCO_ROOT, 'coco_2014_trainval35k'),
images_path=[os.path.join(COCO_ROOT, 'images/train2014'),
os.path.join(COCO_ROOT, 'images/val2014')],
splits_path=[os.path.join(COCO_ROOT, 'ImageSets'),
os.path.join(COCO_ROOT, 'ImageSets')],
splits_path=[os.path.join(COCO_ROOT, 'splits'),
os.path.join(COCO_ROOT, 'splits')],
mask_file='build/coco_2014_trainval35k_mask.pkl',
splits=['train', 'valminusminival'],
)
......@@ -48,7 +52,7 @@ if __name__ == '__main__':
record_file=os.path.join(COCO_ROOT, 'coco_2014_minival'),
images_path=os.path.join(COCO_ROOT, 'images/val2014'),
mask_file='build/coco_2014_minival_mask.pkl',
splits_path=os.path.join(COCO_ROOT, 'ImageSets'),
splits_path=os.path.join(COCO_ROOT, 'splits'),
splits=['minival'],
)
......
......@@ -86,7 +86,7 @@ def make_record(
print('Start Time:', time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime()))
writer = dragon.io.SeetaRecordWriter(
writer = dragon.io.KPLRecordWriter(
path=record_file,
protocol={
'id': 'string',
......@@ -133,6 +133,6 @@ def make_record(
writer.close()
end_time = time.time()
data_size = os.path.getsize(record_file + '/data.data') * 1e-6
data_size = os.path.getsize(record_file + '/root.data') * 1e-6
print('{} images take {:.2f} MB in {:.2f} sec.'
.format(total_line, data_size, end_time - start_time))
......@@ -20,11 +20,11 @@ except:
import pickle as cPickle
sys.path.insert(0, '../..')
from lib.pycocotools.coco import COCO
from lib.pycocotools import mask_utils
from seetadet.pycocotools.coco import COCO
from seetadet.pycocotools import mask_utils
class imdb(object):
class COCOWrapper(object):
def __init__(self, image_set, year, data_dir):
self._year = year
self._image_set = image_set
......@@ -120,8 +120,6 @@ class imdb(object):
# running out of the image bound
# Do not use them or decoding error is inevitable
mask_bytes = mask_utils.poly2bytes(obj['segmentation'], height, width)
if not isinstance(mask_bytes, bytes):
print(type(mask_bytes))
if obj['area'] > 0 and x2 > x1 and y2 > y1:
obj['clean_bbox'] = [x1, y1, x2, y2]
valid_objects.append({
......@@ -146,10 +144,11 @@ class imdb(object):
def make_mask(split, year, data_dir):
coco = imdb(split, year, data_dir)
print('Preparing to make split: {}, total {} images'.format(split, coco.num_images))
if not osp.exists(osp.join(coco._data_path, 'ImageSets')):
os.makedirs(osp.join(coco._data_path, 'ImageSets'))
coco = COCOWrapper(split, year, data_dir)
print('Preparing to make split: {}, total {} images'
.format(split, coco.num_images))
if not osp.exists(osp.join(coco._data_path, 'splits')):
os.makedirs(osp.join(coco._data_path, 'splits'))
gt_recs = OrderedDict()
for i in range(coco.num_images):
......@@ -157,14 +156,14 @@ def make_mask(split, year, data_dir):
h, w, objects = coco.annotation_at(i)
gt_recs[filename] = objects
with open(osp.join('build',
'coco_' + year + '_' + split + '_mask.pkl'), 'wb') as f:
cPickle.dump(gt_recs, f, cPickle.HIGHEST_PROTOCOL)
with open(osp.join('build', 'coco_' + year + '_' + split + '_mask.pkl'), 'wb') as f:
cPickle.dump(gt_recs, f, cPickle.HIGHEST_PROTOCOL)
with open(osp.join(coco._data_path, 'ImageSets', split + '.txt'), 'w') as f:
with open(osp.join(coco._data_path, 'splits', split + '.txt'), 'w') as f:
for i in range(coco.num_images):
filename = (coco.image_path_at(i).split('/')[-1]).split('.')[0]
if i != coco.num_images - 1: filename += '\n'
if i != coco.num_images - 1:
filename += '\n'
f.write(filename)
......
......@@ -26,6 +26,6 @@ if __name__ == '__main__':
record_file=osp.join(data_root, 'rotated_train'),
images_path=[osp.join(data_root, 'JPEGImages')],
annotations_path=[osp.join(data_root, 'Annotations')],
imagesets_path=[osp.join(data_root, 'ImageSets')],
splits_path=[osp.join(data_root, 'ImageSets')],
splits=['train']
)
......@@ -57,7 +57,7 @@ def make_record(
record_file,
images_path,
annotations_path,
imagesets_path,
splits_path,
splits
):
if os.path.exists(record_file):
......@@ -68,15 +68,15 @@ def make_record(
images_path = [images_path]
if not isinstance(annotations_path, list):
annotations_path = [annotations_path]
if not isinstance(imagesets_path, list):
imagesets_path = [imagesets_path]
assert len(splits) == len(imagesets_path)
if not isinstance(splits_path, list):
splits_path = [splits_path]
assert len(splits) == len(splits_path)
assert len(splits) == len(images_path)
assert len(splits) == len(annotations_path)
print('Start Time:', time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime()))
writer = dragon.io.SeetaRecordWriter(
writer = dragon.io.KPLRecordWriter(
path=record_file,
protocol={
'id': 'string',
......@@ -99,31 +99,37 @@ def make_record(
}
)
count, total_line = 0, 0
start_time = time.time()
for db_idx, split in enumerate(splits):
split_file = os.path.join(imagesets_path[db_idx], split + '.txt')
assert os.path.exists(split_file)
# Scan all available entries
print('Scan entries...')
entries = []
for i, split in enumerate(splits):
split_file = os.path.join(splits_path[i], split + '.txt')
with open(split_file, 'r') as f:
lines = f.readlines()
total_line += len(lines)
for line in lines:
count += 1
if count % 2000 == 0:
now_time = time.time()
print('{} / {} in {:.2f} sec'.format(
count, total_line, now_time - start_time))
filename = line.strip()
image_file = os.path.join(images_path[db_idx], filename + '.jpg')
xml_file = os.path.join(annotations_path[db_idx], filename + '.xml')
writer.write(make_example(image_file, xml_file))
img_file = os.path.join(images_path[i], filename + '.jpg')
ann_file = os.path.join(annotations_path[i], filename + '.xml')
entries.append((img_file, ann_file))
# Parse and write into record file
print('Start Time:', time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime()))
start_time = time.time()
for i, (img_file, ann_file) in enumerate(entries):
if i > 0 and i % 2000 == 0:
now_time = time.time()
print('{} / {} in {:.2f} sec'.format(
i, len(entries), now_time - start_time))
writer.write(make_example(img_file, ann_file))
now_time = time.time()
print('{} / {} in {:.2f} sec'.format(count, total_line, now_time - start_time))
print('{} / {} in {:.2f} sec'.format(
len(entries), len(entries), now_time - start_time))
writer.close()
end_time = time.time()
data_size = os.path.getsize(record_file + '/data.data') * 1e-6
data_size = os.path.getsize(record_file + '/root.data') * 1e-6
print('{} images take {:.2f} MB in {:.2f} sec.'
.format(total_line, data_size, end_time - start_time))
.format(len(entries), data_size, end_time - start_time))
......@@ -28,7 +28,7 @@ if __name__ == '__main__':
osp.join(voc_root, 'VOCdevkit2012/VOC2012/JPEGImages')],
annotations_path=[osp.join(voc_root, 'VOCdevkit2007/VOC2007/Annotations'),
osp.join(voc_root, 'VOCdevkit2012/VOC2012/Annotations')],
imagesets_path=[osp.join(voc_root, 'VOCdevkit2007/VOC2007/ImageSets/Main'),
splits_path=[osp.join(voc_root, 'VOCdevkit2007/VOC2007/ImageSets/Main'),
osp.join(voc_root, 'VOCdevkit2012/VOC2012/ImageSets/Main')],
splits=['trainval', 'trainval']
)
......@@ -37,6 +37,6 @@ if __name__ == '__main__':
record_file=osp.join(voc_root, 'voc_2007_test'),
images_path=osp.join(voc_root, 'VOCdevkit2007/VOC2007/JPEGImages'),
annotations_path=osp.join(voc_root, 'VOCdevkit2007/VOC2007/Annotations'),
imagesets_path=osp.join(voc_root, 'VOCdevkit2007/VOC2007/ImageSets/Main'),
splits_path=osp.join(voc_root, 'VOCdevkit2007/VOC2007/ImageSets/Main'),
splits=['test']
)
)
......@@ -26,11 +26,17 @@ def make_example(image_file, xml_file):
tree = ET.parse(xml_file)
filename = os.path.split(xml_file)[-1]
objs = tree.findall('object')
size = tree.find('size')
example = {'id': filename.split('.')[0], 'object': []}
with open(image_file, 'rb') as f:
img_bytes = bytes(f.read())
img = cv2.imdecode(np.frombuffer(img_bytes, 'uint8'), 1)
example['height'], example['width'], example['depth'] = img.shape
if size is not None:
example['height'] = int(size.find('height').text)
example['width'] = int(size.find('width').text)
example['depth'] = int(size.find('depth').text)
else:
img = cv2.imdecode(np.frombuffer(img_bytes, 'uint8'), 3)
example['height'], example['width'], example['depth'] = img.shape
example['content'] = img_bytes
for ix, obj in enumerate(objs):
bbox = obj.find('bndbox')
......@@ -53,7 +59,7 @@ def make_record(
record_file,
images_path,
annotations_path,
imagesets_path,
splits_path,
splits
):
if os.path.exists(record_file):
......@@ -64,15 +70,13 @@ def make_record(
images_path = [images_path]
if not isinstance(annotations_path, list):
annotations_path = [annotations_path]
if not isinstance(imagesets_path, list):
imagesets_path = [imagesets_path]
assert len(splits) == len(imagesets_path)
if not isinstance(splits_path, list):
splits_path = [splits_path]
assert len(splits) == len(splits_path)
assert len(splits) == len(images_path)
assert len(splits) == len(annotations_path)
print('Start Time:', time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime()))
writer = dragon.io.SeetaRecordWriter(
writer = dragon.io.KPLRecordWriter(
path=record_file,
protocol={
'id': 'string',
......@@ -91,31 +95,36 @@ def make_record(
}
)
count, total_line = 0, 0
start_time = time.time()
for db_idx, split in enumerate(splits):
split_file = os.path.join(imagesets_path[db_idx], split + '.txt')
assert os.path.exists(split_file)
# Scan all available entries
print('Scan entries...')
entries = []
for i, split in enumerate(splits):
split_file = os.path.join(splits_path[i], split + '.txt')
with open(split_file, 'r') as f:
lines = f.readlines()
total_line += len(lines)
for line in lines:
count += 1
if count % 2000 == 0:
now_time = time.time()
print('{} / {} in {:.2f} sec'.format(
count, total_line, now_time - start_time))
filename = line.strip()
image_file = os.path.join(images_path[db_idx], filename + '.jpg')
xml_file = os.path.join(annotations_path[db_idx], filename + '.xml')
writer.write(make_example(image_file, xml_file))
img_file = os.path.join(images_path[i], filename + '.jpg')
ann_file = os.path.join(annotations_path[i], filename + '.xml')
entries.append((img_file, ann_file))
# Parse and write into record file
print('Start Time:', time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime()))
start_time = time.time()
for i, (img_file, ann_file) in enumerate(entries):
if i > 0 and i % 2000 == 0:
now_time = time.time()
print('{} / {} in {:.2f} sec'.format(
i, len(entries), now_time - start_time))
writer.write(make_example(img_file, ann_file))
now_time = time.time()
print('{} / {} in {:.2f} sec'.format(count, total_line, now_time - start_time))
print('{} / {} in {:.2f} sec'.format(
len(entries), len(entries), now_time - start_time))
writer.close()
end_time = time.time()
data_size = os.path.getsize(record_file + '/data.data') * 1e-6
data_size = os.path.getsize(record_file + '/root.data') * 1e-6
print('{} images take {:.2f} MB in {:.2f} sec.'
.format(total_line, data_size, end_time - start_time))
.format(len(entries), data_size, end_time - start_time))
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from seetadet.algo.faster_rcnn.anchor_target import AnchorTarget
from seetadet.algo.faster_rcnn.data_loader import DataLoader
from seetadet.algo.faster_rcnn.proposal import Proposal
from seetadet.algo.faster_rcnn.proposal_target import ProposalTarget
from seetadet.algo.faster_rcnn.utils import generate_grid_anchors
from seetadet.algo.faster_rcnn.utils import map_blobs_by_levels
from seetadet.algo.faster_rcnn.utils import map_rois_to_levels
from seetadet.algo.faster_rcnn.utils import map_returns_to_blobs
......@@ -16,11 +16,11 @@ from __future__ import print_function
import numpy as np
import numpy.random as npr
from lib.core.config import cfg
from lib.faster_rcnn.generate_anchors import generate_anchors
from lib.faster_rcnn.utils import generate_grid_anchors
from lib.utils import boxes as box_util
from lib.utils.framework import new_tensor
from seetadet.algo.faster_rcnn.generate_anchors import generate_anchors
from seetadet.algo.faster_rcnn.utils import generate_grid_anchors
from seetadet.core.config import cfg
from seetadet.utils import boxes as box_util
from seetadet.utils.env import new_tensor
class AnchorTarget(object):
......@@ -62,9 +62,7 @@ class AnchorTarget(object):
# Label: ``1`` is positive, ``0`` is negative, ``-1`` is don't care
labels_wide = -np.ones((num_images, num_anchors,), 'float32')
bbox_targets_wide = np.zeros((num_images, num_anchors, 4), 'float32')
bbox_inside_weights_wide = np.zeros_like(bbox_targets_wide, 'float32')
bbox_outside_weights_wide = np.zeros_like(bbox_targets_wide, 'float32')
bbox_indices_wide, bbox_anchors_wide, bbox_targets_wide = [], [], []
for ix in range(num_images):
# GT boxes (x1, y1, x2, y2, label, ...)
......@@ -95,13 +93,13 @@ class AnchorTarget(object):
np.arange(overlaps.shape[1])]
gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]
# fg label: for each gt, anchor with highest overlap
# Foreground: for each gt, anchor with highest overlap
labels[gt_argmax_overlaps] = 1
# fg label: above threshold IOU
# Foreground: above threshold IoU
labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1
# bg label: below threshold IOU
# Background: below threshold IoU
labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0
# Subsample positive labels if we have too many
@@ -112,6 +110,11 @@ class AnchorTarget(object):
labels[disable_inds] = -1
fg_inds = np.where(labels == 1)[0]
# Restore the per-gt best anchors if no foreground is left
if len(fg_inds) == 0:
labels[gt_argmax_overlaps] = 1
fg_inds = np.where(labels == 1)[0]
# Subsample negative labels if we have too many
num_bg = cfg.TRAIN.RPN_BATCHSIZE - np.sum(labels == 1)
bg_inds = np.where(labels == 0)[0]
@@ -119,51 +122,27 @@ class AnchorTarget(object):
disable_inds = npr.choice(bg_inds, len(bg_inds) - num_bg, False)
labels[disable_inds] = -1
bbox_targets = np.zeros((num_inside, 4), 'float32')
bbox_targets[fg_inds, :] = \
labels_wide[ix, inds_inside] = labels
bbox_anchors_wide.append(anchors[fg_inds])
bbox_indices_wide.append(inds_inside[fg_inds] + (num_anchors * ix))
bbox_targets_wide.append(
box_util.bbox_transform(
anchors[fg_inds, :],
anchors[fg_inds],
gt_boxes[argmax_overlaps[fg_inds], :4],
)
bbox_inside_weights = np.zeros((num_inside, 4), 'float32')
bbox_inside_weights[labels == 1, :] = np.array((1., 1., 1., 1.))
bbox_outside_weights = np.zeros((num_inside, 4), 'float32')
bbox_outside_weights[labels == 1, :] = np.ones((1, 4)) / cfg.TRAIN.RPN_BATCHSIZE
bbox_outside_weights[labels == 0, :] = np.ones((1, 4)) / cfg.TRAIN.RPN_BATCHSIZE
labels_wide[ix, inds_inside] = labels # label
bbox_targets_wide[ix, inds_inside] = bbox_targets
bbox_inside_weights_wide[ix, inds_inside] = bbox_inside_weights
bbox_outside_weights_wide[ix, inds_inside] = bbox_outside_weights
if self.num_strides > 1:
labels = labels_wide.reshape((num_images, num_anchors))
bbox_targets = bbox_targets_wide.transpose((0, 2, 1))
bbox_inside_weights = bbox_inside_weights_wide.transpose((0, 2, 1))
bbox_outside_weights = bbox_outside_weights_wide.transpose((0, 2, 1))
else:
)
if self.num_strides == 1:
A = self.base_anchors[0].shape[0]
height, width = features[0].shape[-2:]
labels = labels_wide \
labels_wide = labels_wide \
.reshape((num_images, height, width, A)) \
.transpose(0, 3, 1, 2) \
.reshape((num_images, num_anchors))
bbox_targets = bbox_targets_wide \
.reshape((num_images, height, width, A * 4)) \
.transpose(0, 3, 1, 2)
bbox_inside_weights = bbox_inside_weights_wide \
.reshape((num_images, height, width, A * 4)) \
.transpose(0, 3, 1, 2)
bbox_outside_weights = bbox_outside_weights_wide \
.reshape((num_images, height, width, A * 4)) \
.transpose(0, 3, 1, 2)
return {
'labels': new_tensor(labels),
'bbox_targets': new_tensor(bbox_targets),
'bbox_inside_weights': new_tensor(bbox_inside_weights),
'bbox_outside_weights': new_tensor(bbox_outside_weights),
'labels': new_tensor(labels_wide),
'bbox_indices': new_tensor(np.concatenate(bbox_indices_wide)),
'bbox_targets': new_tensor(np.concatenate(bbox_targets_wide).astype('float32')),
'bbox_anchors': new_tensor(np.concatenate(bbox_anchors_wide).astype('float32')),
}
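The rewrite above replaces the dense `bbox_inside_weights`/`bbox_outside_weights` blobs with a sparse encoding: flat `bbox_indices` into the (num_images * num_anchors) grid, plus the matching `bbox_anchors` and `bbox_targets`. A minimal numpy sketch of how a box loss can consume the sparse form, assuming predictions are flattened to shape (num_images * num_anchors, 4):

```python
import numpy as np

def gather_fg_predictions(bbox_pred, bbox_indices):
    # ``bbox_pred``: (num_images * num_anchors, 4); the indices
    # were built above as ``inds_inside[fg_inds] + num_anchors * ix``.
    return bbox_pred[bbox_indices]

# Toy case: 2 images, 5 anchors each, 3 foreground anchors in total.
pred = np.zeros((2 * 5, 4), 'float32')
fg = gather_fg_predictions(pred, np.array([1, 3, 7]))
assert fg.shape == (3, 4)  # only foreground rows enter the loss
```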
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import multiprocessing as mp
import time
import dragon
import dragon.vm.torch as torch
import numpy as np
from seetadet.algo.faster_rcnn import data_transformer
from seetadet.core.config import cfg
from seetadet.datasets.factory import get_dataset
from seetadet.utils import logger
from seetadet.utils.blob import im_list_to_blob
class DataLoader(object):
"""Load mini-batches of data."""
def __init__(self):
super(DataLoader, self).__init__()
dataset = get_dataset(cfg.TRAIN.DATASET)
if cfg.USE_DALI:
from seetadet.dali import rcnn_pipeline as pipe
self.iterator = pipe.new_iterator(dataset.source)
else:
self.iterator = Iterator(**{
'dataset': dataset.cls,
'source': dataset.source,
'classes': dataset.classes,
'shuffle': cfg.TRAIN.USE_SHUFFLE,
'num_chunks': cfg.TRAIN.SHUFFLE_CHUNKS,
'batch_size': cfg.TRAIN.IMS_PER_BATCH * 2,
'num_transformers': cfg.TRAIN.NUM_THREADS - 1,
})
def __call__(self):
outputs = self.iterator.next()
if isinstance(outputs['data'], np.ndarray):
outputs['data'] = torch.from_numpy(outputs['data'])
return outputs
class Iterator(mp.Process):
"""Iterator to return the batch of data."""
def __init__(self, **kwargs):
super(Iterator, self).__init__()
# Distributed settings
rank, group_size = 0, 1
process_group = dragon.distributed.get_group()
if process_group is not None and \
kwargs.get('phase', 'TRAIN') == 'TRAIN':
group_size = process_group.size
rank = dragon.distributed.get_rank(process_group)
# Configuration
self._prefetch = kwargs.get('prefetch', 5)
self._batch_size = kwargs.get('batch_size', 2)
self._num_readers = kwargs.get('num_readers', 1)
self._num_transformers = kwargs.get('num_transformers', 3)
self.daemon = True
# Initialize queues
num_batches = self._prefetch * self._num_readers
self.q_in = mp.Queue(num_batches * self._batch_size)
self.q1_out = mp.Queue(num_batches * self._batch_size)
self.q2_out = mp.Queue(num_batches * self._batch_size)
# Initialize readers
self._readers = []
for i in range(self._num_readers):
part_idx, num_parts = i, self._num_readers
num_parts *= group_size
part_idx += rank * self._num_readers
self._readers.append(dragon.io.DataReader(
part_idx=part_idx, num_parts=num_parts, **kwargs))
self._readers[i]._seed += part_idx
self._readers[i].q_out = self.q_in
self._readers[i].start()
time.sleep(0.1)
# Initialize transformers
self._transformers = []
for i in range(self._num_transformers):
p = data_transformer.DataTransformer(**kwargs)
p._seed += (i + rank * self._num_transformers)
p.q_in = self.q_in
p.q1_out, p.q2_out = self.q1_out, self.q2_out
p.start()
self._transformers.append(p)
time.sleep(0.1)
# Register cleanup callbacks
def cleanup():
def terminate(processes):
for p in processes:
p.terminate()
p.join()
terminate(self._transformers)
logger.info('Terminate DataTransformer.')
terminate(self._readers)
logger.info('Terminate DataReader.')
import atexit
atexit.register(cleanup)
def next(self):
"""Return the next batch of data."""
return self.__next__()
def __iter__(self):
"""Return the iterator self."""
return self
def __next__(self):
"""Return the next batch of data."""
q_out = None
# Two queues are used to implement aspect grouping,
# which reduces GPU memory by avoiding the padding
# of a huge square batch blob
while q_out is None:
if self.q1_out.qsize() >= cfg.TRAIN.IMS_PER_BATCH:
q_out = self.q1_out
elif self.q2_out.qsize() >= cfg.TRAIN.IMS_PER_BATCH:
q_out = self.q2_out
self.q1_out, self.q2_out = self.q2_out, self.q1_out
images, images_info, boxes_to_pack = [], [], []
for i in range(cfg.TRAIN.IMS_PER_BATCH):
image, image_scale, boxes = q_out.get()
images.append(image)
images_info.append(list(image.shape[:2]) + [image_scale])
gt_boxes = np.zeros((boxes.shape[0], boxes.shape[1] + 1), 'float32')
gt_boxes[:, :boxes.shape[1]], gt_boxes[:, -1] = boxes, i
boxes_to_pack.append(gt_boxes)
return {
'data': im_list_to_blob(images),
'ims_info': np.array(images_info, dtype=np.float32),
'gt_boxes': np.concatenate(boxes_to_pack),
}
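The two output queues above implement aspect grouping: portrait and landscape images never share a batch, so `im_list_to_blob` pads to a tight rectangle instead of a large square. A minimal sketch of the routing and draining logic, assuming each sample is an (image, scale, boxes) tuple:

```python
import queue
import numpy as np

def route_by_aspect(sample, q_portrait, q_landscape):
    # Send the sample to the queue matching its orientation.
    height, width = sample[0].shape[:2]
    (q_portrait if height > width else q_landscape).put(sample)

def pop_batch(q1, q2, batch_size):
    # Drain whichever queue already holds a full batch, swapping
    # the two on every miss to avoid starving one orientation.
    while True:
        if q1.qsize() >= batch_size:
            return [q1.get() for _ in range(batch_size)]
        q1, q2 = q2, q1

q1, q2 = queue.Queue(), queue.Queue()
for shape in [(480, 640, 3), (640, 480, 3)]:
    route_by_aspect((np.zeros(shape, 'uint8'), 1.0, None), q1, q2)
batch = pop_batch(q1, q2, 1)  # a single-orientation batch
```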
@@ -15,19 +15,19 @@ from __future__ import print_function
import multiprocessing
import cv2
import numpy as np
from lib.core.config import cfg
from lib.datasets.example import Example
from lib.utils import boxes as box_util
from lib.utils.blob import prep_im_for_blob
from lib.utils.image import get_image_with_target_size
from seetadet.core.config import cfg
from seetadet.datasets.example import Example
from seetadet.utils import boxes as box_util
from seetadet.utils.blob import prep_im_for_blob
class DataTransformer(multiprocessing.Process):
def __init__(self, **kwargs):
super(DataTransformer, self).__init__()
self._scales = cfg.TRAIN.SCALES
self._max_size = cfg.TRAIN.MAX_SIZE
self._seed = cfg.RNG_SEED
self._use_flipped = cfg.TRAIN.USE_FLIPPED
self._use_diff = cfg.TRAIN.USE_DIFF
@@ -37,13 +37,7 @@ class DataTransformer(multiprocessing.Process):
self.q_in = self.q1_out = self.q2_out = None
self.daemon = True
def make_roi_dict(
self,
example,
im_scale,
apply_flip=False,
offsets=None,
):
def make_roi_dict(self, example, im_scale, apply_flip=False):
objects, n_objects = example.objects, 0
height, width = example.height, example.width
if not self._use_diff:
@@ -86,15 +80,6 @@ class DataTransformer(multiprocessing.Process):
# Scale the boxes to the detecting scale
roi_dict['boxes'] *= im_scale
# Apply the offsets from scale jitter
if offsets is not None:
roi_dict['boxes'][:, 0::2] += offsets[0]
roi_dict['boxes'][:, 1::2] += offsets[1]
roi_dict['boxes'][:, :] = np.minimum(
np.maximum(roi_dict['boxes'][:, :], 0),
[offsets[2][1] - 1, offsets[2][0] - 1] * 2,
)
return roi_dict
def get(self, example):
@@ -102,9 +87,8 @@ class DataTransformer(multiprocessing.Process):
img = example.image
# Scale
max_size = cfg.TRAIN.MAX_SIZE
target_size = cfg.TRAIN.SCALES[np.random.randint(len(cfg.TRAIN.SCALES))]
img, im_scale, jitter = prep_im_for_blob(img, target_size, max_size)
target_size = self._scales[np.random.randint(len(self._scales))]
img, im_scale = prep_im_for_blob(img, target_size, self._max_size)
# Flip
apply_flip = False
@@ -113,19 +97,8 @@ class DataTransformer(multiprocessing.Process):
img = img[:, ::-1]
apply_flip = True
# Random Crop or RandomPad
offsets = None
if cfg.TRAIN.MAX_SIZE > 0:
if jitter != 1:
# To a rectangle (scale, max_size)
target_size = (np.array(img.shape[:2]) / jitter).astype(np.int32)
img, offsets = get_image_with_target_size(target_size, img)
else:
# To a square (target_size, target_size)
img, offsets = get_image_with_target_size([target_size] * 2, img)
# Example -> RoIDict
roi_dict = self.make_roi_dict(example, im_scale, apply_flip, offsets)
roi_dict = self.make_roi_dict(example, im_scale, apply_flip)
# Post-process the gt boxes
# Shaped as: [num_objects, {x1, y1, x2, y2, cls}]
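With scale jitter and the crop/pad branch removed, `get()` reduces to a random-scale resize plus an optional horizontal flip. A self-contained sketch of that logic, where `prep_im_for_blob_sketch` is an assumed simplification of `seetadet.utils.blob.prep_im_for_blob`:

```python
import cv2
import numpy as np

def prep_im_for_blob_sketch(img, target_size, max_size):
    # Scale the short side to ``target_size`` while capping the
    # long side at ``max_size`` (assumption: mirrors the real
    # helper once jitter support is dropped).
    short_side = min(img.shape[:2])
    long_side = max(img.shape[:2])
    im_scale = float(target_size) / short_side
    if round(im_scale * long_side) > max_size > 0:
        im_scale = float(max_size) / long_side
    img = cv2.resize(img, None, fx=im_scale, fy=im_scale)
    return img, im_scale

def sample_scale_and_flip(img, scales, max_size, use_flipped=True):
    target_size = scales[np.random.randint(len(scales))]
    img, im_scale = prep_im_for_blob_sketch(img, target_size, max_size)
    apply_flip = False
    if use_flipped and np.random.rand() < 0.5:
        img, apply_flip = img[:, ::-1], True
    return img, im_scale, apply_flip

out, scale, flipped = sample_scale_and_flip(
    np.zeros((375, 500, 3), 'uint8'), scales=[600], max_size=1000)
```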
@@ -17,11 +17,11 @@ import collections
import numpy as np
from lib.core.config import cfg
from lib.faster_rcnn.generate_anchors import generate_anchors
from lib.faster_rcnn.utils import generate_grid_anchors
from lib.nms import nms_wrapper
from lib.utils import boxes as box_util
from seetadet.algo.faster_rcnn.generate_anchors import generate_anchors
from seetadet.algo.faster_rcnn.utils import generate_grid_anchors
from seetadet.core.config import cfg
from seetadet.utils import boxes as box_util
from seetadet.utils import nms
class Proposal(object):
@@ -67,8 +67,8 @@ class Proposal(object):
# Prepare for the outputs
batch_rois = []
cls_prob = cls_prob.numpy(True)
bbox_pred = bbox_pred.numpy(True)
cls_prob = cls_prob.numpy()
bbox_pred = bbox_pred.numpy()
if self.num_strides > 1:
# (?, 4, A * K) -> (?, A * K, 4)
bbox_pred = bbox_pred.transpose((0, 2, 1))
@@ -113,7 +113,7 @@ class Proposal(object):
# Apply NMS (e.g. threshold = 0.7)
# Take post_nms_top_n (e.g. 300)
# Return the top proposals as the output RoIs
keep = nms_wrapper.nms(np.hstack((proposals, scores)), nms_thresh)
keep = nms.gpu_nms(np.hstack((proposals, scores)), nms_thresh)
if post_nms_top_n > 0:
keep = keep[:post_nms_top_n]
proposals = proposals[keep, :]
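The three comments above summarize the decode pipeline: score-sort, NMS at ~0.7, then keep the post-NMS top proposals. `nms_sketch` below is a plain-numpy stand-in for `seetadet.utils.nms.gpu_nms` (assumed semantics: greedy hard NMS returning kept indices in score order):

```python
import numpy as np

def nms_sketch(dets, thresh):
    # Plain-numpy greedy hard NMS over (x1, y1, x2, y2, score) rows.
    x1, y1, x2, y2, scores = dets.T
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.argsort()[::-1]
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])
        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        iou = inter / (areas[i] + areas[order[1:]] - inter)
        order = order[np.where(iou <= thresh)[0] + 1]
    return np.array(keep, 'int64')

# Usage mirroring Proposal.__call__: NMS at 0.7, keep the top 300.
dets = np.array([[0, 0, 10, 10, 0.9],
                 [1, 1, 11, 11, 0.8],
                 [50, 50, 60, 60, 0.7]], 'float32')
keep = nms_sketch(dets, thresh=0.7)[:300]  # -> [0, 2]
```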
@@ -18,12 +18,10 @@ import collections
import numpy as np
import numpy.random as npr
from lib.core.config import cfg
from lib.faster_rcnn.utils import map_blobs_to_outputs
from lib.faster_rcnn.utils import map_returns_to_blobs
from lib.faster_rcnn.utils import map_rois_to_levels
from lib.utils import boxes as box_util
from lib.utils.framework import new_tensor
from seetadet.algo.faster_rcnn import utils as rcnn_util
from seetadet.core.config import cfg
from seetadet.utils import boxes as box_util
from seetadet.utils.env import new_tensor
class ProposalTarget(object):
@@ -35,10 +33,8 @@ class ProposalTarget(object):
self.num_classes = cfg.MODEL.NUM_CLASSES
self.defaults = collections.OrderedDict([
('rois', np.array([[-1, 0, 0, 1, 1]], 'float32')),
('labels', np.array([-1], 'float32')),
('bbox_targets', np.zeros((1, self.num_classes * 4), 'float32')),
('bbox_inside_weights', np.zeros((1, self.num_classes * 4), 'float32')),
('bbox_outside_weights', np.zeros((1, self.num_classes * 4), 'float32')),
('labels', np.array([-1], 'int64')),
('bbox_targets', np.zeros((1, 4), 'float32')),
])
def __call__(self, rpn_rois, gt_boxes):
@@ -63,86 +59,65 @@ class ProposalTarget(object):
# Sample a batch of RoIs for training
rois_per_image = cfg.TRAIN.BATCH_SIZE
fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image)
map_returns_to_blobs(
rcnn_util.map_returns_to_blobs(
sample_rois(
rois,
gt_boxes,
rois_per_image,
fg_rois_per_image,
self.num_classes,
), blobs, keys,
)
# Stack into continuous blobs
for k, v in blobs.items():
blobs[k] = np.concatenate(blobs[k], 0)
blobs = dict((k, np.concatenate(blobs[k])) for k in blobs.keys())
if self.num_strides > 1:
# Distribute RoIs into pyramids
min_lvl = cfg.FPN.ROI_MIN_LEVEL
max_lvl = cfg.FPN.ROI_MAX_LEVEL
k = max_lvl - min_lvl + 1
levels = map_rois_to_levels(blobs['rois'], min_lvl, max_lvl)
outputs = map_blobs_to_outputs(
num_levels = max_lvl - min_lvl + 1
levels = rcnn_util.map_rois_to_levels(blobs['rois'], min_lvl, max_lvl)
lvl_blobs = rcnn_util.map_blobs_by_levels(
blobs,
self.defaults,
[np.where(levels == (i + min_lvl))[0] for i in range(k)],
[np.where(levels == (i + min_lvl))[0] for i in range(num_levels)],
)
return {
'rois': [new_tensor(outputs['rois'][i]) for i in range(k)],
'labels': new_tensor(np.concatenate(outputs['labels'], 0)),
'bbox_targets': new_tensor(np.vstack(outputs['bbox_targets'])),
'bbox_inside_weights': new_tensor(np.vstack(outputs['bbox_inside_weights'])),
'bbox_outside_weights': new_tensor(np.vstack(outputs['bbox_outside_weights'])),
}
blobs = dict((k, np.concatenate(lvl_blobs[k])) for k in blobs.keys())
rois_wide = [lvl_blobs['rois'][i] for i in range(num_levels)]
else:
# Return RoIs directly for CX-stride
return {
'rois': [new_tensor(blobs['rois'])],
'labels': new_tensor(blobs['labels']),
'bbox_targets': new_tensor(blobs['bbox_targets']),
'bbox_inside_weights': new_tensor(blobs['bbox_inside_weights']),
'bbox_outside_weights': new_tensor(blobs['bbox_outside_weights']),
}
def get_targets(ex_rois, gt_rois, gt_labels, num_classes):
"""Compute bounding-box regression targets for an image."""
assert ex_rois.shape[0] == gt_rois.shape[0]
assert ex_rois.shape[1] == 4
assert gt_rois.shape[1] == 4
# Compute bbox regression targets
fg_inds = np.where(gt_labels > 0)[0]
targets = box_util.bbox_transform(ex_rois, gt_rois, cfg.BBOX_REG_WEIGHTS)
bbox_targets = np.zeros((ex_rois.shape[0], 4 * num_classes), 'float32')
inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32)
for i in fg_inds:
start = int(4 * gt_labels[i])
bbox_targets[i, start:start + 4] = targets[i]
inside_weights[i, start:start + 4] = (1., 1., 1., 1.)
outside_weights = np.array(inside_weights > 0).astype('float32')
return bbox_targets, inside_weights, outside_weights
def sample_rois(
all_rois,
gt_boxes,
num_rois,
num_fg_rois,
num_classes,
):
# Return RoIs directly for specified stride
rois_wide = [blobs['rois']]
# Select the foreground RoIs only for bbox branch
fg_inds = np.where(blobs['labels'] > 0)[0]
cls_inds = np.arange(len(blobs['rois'])) * self.num_classes
return {
'rois': [new_tensor(rois) for rois in rois_wide],
'labels': new_tensor(blobs['labels']),
'bbox_indices': new_tensor(cls_inds[fg_inds] + blobs['labels'][fg_inds]),
'bbox_targets': new_tensor(blobs['bbox_targets'][fg_inds].astype('float32')),
'bbox_anchors': new_tensor(blobs['rois'][fg_inds, 1:].astype('float32')),
}
def sample_rois(all_rois, gt_boxes, num_rois, num_fg_rois):
"""Sample a batch of RoIs comprising foreground and background examples."""
overlaps = box_util.bbox_overlaps(all_rois[:, 1:5], gt_boxes[:, :4])
gt_assignment = overlaps.argmax(axis=1)
max_overlaps = overlaps.max(axis=1)
labels = gt_boxes[gt_assignment, 4]
labels = gt_boxes[gt_assignment, 4].astype('int64')
# Select foreground RoIs as those with >= FG_THRESH overlap
fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0]
fg_rois_per_this_image = int(min(num_fg_rois, fg_inds.size))
fg_thresh = cfg.TRAIN.FG_THRESH
fg_inds = np.where(max_overlaps >= fg_thresh)[0]
while fg_inds.size == 0:
fg_thresh -= 0.01
fg_inds = np.where(max_overlaps >= fg_thresh)[0]
# Sample foreground regions without replacement
if fg_inds.size > 0:
fg_inds = npr.choice(fg_inds, fg_rois_per_this_image, False)
fg_rois_per_this_image = int(min(num_fg_rois, fg_inds.size))
fg_inds = npr.choice(fg_inds, fg_rois_per_this_image, False)
# Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI) &
@@ -160,15 +135,14 @@ def sample_rois(
rois, labels = all_rois[keep_inds], labels[keep_inds]
# Clamp labels for the background RoIs to 0
labels[fg_rois_per_this_image:] = 0
# Clamp the image indices for the background RoIs to -1
rois[fg_rois_per_this_image:, 0] = -1
# Compute the target from RoIs
outputs = [rois, labels]
outputs += get_targets(
rois[:, 1:5],
gt_boxes[gt_assignment[keep_inds], :4],
return [
rois,
labels,
num_classes,
)
return outputs
box_util.bbox_transform(
rois[:, 1:5],
gt_boxes[gt_assignment[keep_inds], :4],
cfg.BBOX_REG_WEIGHTS,
)
]
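A notable behavioral change in `sample_rois`: instead of tolerating an empty foreground set, the threshold now backs off in 0.01 steps until at least one RoI qualifies. The backoff in isolation, assuming `max_overlaps` holds each RoI's best IoU with a gt box:

```python
import numpy as np

def pick_foreground(max_overlaps, fg_thresh):
    # Lower the threshold in 0.01 steps until at least one RoI
    # qualifies, mirroring the loop in sample_rois above.
    fg_inds = np.where(max_overlaps >= fg_thresh)[0]
    while fg_inds.size == 0:
        fg_thresh -= 0.01
        fg_inds = np.where(max_overlaps >= fg_thresh)[0]
    return fg_inds, fg_thresh

overlaps = np.array([0.12, 0.33, 0.49], 'float32')
inds, used_thresh = pick_foreground(overlaps, 0.5)
assert inds.size > 0 and used_thresh <= 0.5
```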
@@ -13,17 +13,18 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import types
import dragon.vm.torch as torch
import numpy as np
from lib.core.config import cfg
from lib.modeling.detector import new_detector
from lib.nms import nms_wrapper
from lib.utils import boxes as box_util
from lib.utils import framework
from lib.utils import time_util
from lib.utils.blob import im_list_to_blob
from lib.utils.image import scale_image
from seetadet.core.config import cfg
from seetadet.modeling.detector import new_detector
from seetadet.utils import boxes as box_util
from seetadet.utils import nms as nms_util
from seetadet.utils import time_util
from seetadet.utils.blob import im_list_to_blob
from seetadet.utils.image import scale_image
def im_detect(detector, raw_image):
@@ -31,49 +32,41 @@ def im_detect(detector, raw_image):
ims, ims_scale = scale_image(raw_image)
# Prepare blobs
blobs = {'data': im_list_to_blob(ims)}
blobs['ims_info'] = np.array([
list(blobs['data'].shape[1:3]) + [im_scale]
for im_scale in ims_scale
], dtype=np.float32)
data = im_list_to_blob(ims)
ims_info = np.array([list(data.shape[1:3]) + [im_scale]
for im_scale in ims_scale], dtype=np.float32)
# Do Forward
if not hasattr(detector, 'graph'):
with framework.new_workspace().as_default():
data = torch.from_numpy(blobs['data'])
ims_info = torch.from_numpy(blobs['ims_info'])
with torch.no_grad():
with torch.jit.Tracer(retain_ops=True):
inputs = {'data': data, 'ims_info': ims_info}
outputs = detector.forward(inputs)
detector.graph = \
framework.Graph(inputs, {
'rois': outputs['rois'],
'cls_prob': outputs['cls_prob'],
'bbox_pred': outputs['bbox_pred']
})
outputs = detector.graph(**blobs)
data = torch.from_numpy(data)
ims_info = torch.from_numpy(ims_info)
if not hasattr(detector, 'script_forward'):
def script_forward(self, data, ims_info):
return self.forward({'data': data, 'ims_info': ims_info})
detector.script_forward = torch.jit.trace(
func=types.MethodType(script_forward, detector),
example_inputs=[data, ims_info],
)
outputs = detector.script_forward(data, ims_info)
outputs = dict((k, outputs[k].numpy()) for k in outputs.keys())
# Decode results
rois = outputs['rois']
scores, boxes, batch_inds = [], [], []
all_scores, all_boxes = [], []
pred_boxes = \
box_util.bbox_transform_inv(
rois[:, 1:5],
outputs['rois'][:, 1:5],
outputs['bbox_pred'],
cfg.BBOX_REG_WEIGHTS,
)
for i in range(len(ims)):
inds = np.where(rois[:, 0].astype(np.int32) == i)[0]
im_boxes = pred_boxes[inds] / ims_scale[i]
scores.append(outputs['cls_prob'][inds])
boxes.append(box_util.clip_tiled_boxes(im_boxes, raw_image.shape))
inds = np.where(outputs['rois'][:, 0].astype(np.int32) == i)[0]
boxes = pred_boxes[inds] / ims_scale[i]
all_scores.append(outputs['cls_prob'][inds])
all_boxes.append(box_util.clip_tiled_boxes(boxes, raw_image.shape))
return (
np.vstack(scores) if len(ims) > 0 else scores[0],
np.vstack(boxes) if len(ims) > 0 else boxes[0],
)
return np.vstack(all_scores), np.vstack(all_boxes)
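The forward pass is now traced once and cached on the detector via the `hasattr` guard, replacing the old workspace/Graph machinery. The same trace-once pattern, sketched generically so it does not depend on dragon's exact `torch.jit.trace` signature (`trace_fn` is a stand-in for it):

```python
import types

def get_traced_forward(detector, trace_fn, example_inputs):
    # Build and cache the traced callable the first time only;
    # later calls reuse the captured graph.
    if not hasattr(detector, 'script_forward'):
        def script_forward(self, data, ims_info):
            return self.forward({'data': data, 'ims_info': ims_info})
        detector.script_forward = trace_fn(
            types.MethodType(script_forward, detector),
            example_inputs,
        )
    return detector.script_forward
```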
def test_net(weights, num_classes, q_in, q_out, device):
@@ -84,7 +77,7 @@ def test_net(weights, num_classes, q_in, q_out, device):
while True:
idx, raw_image = q_in.get()
if raw_image is None:
if idx < 0:
break
boxes_this_image = [[]]
@@ -101,17 +94,16 @@ def test_net(weights, num_classes, q_in, q_out, device):
(cls_boxes, cls_scores[:, np.newaxis])
).astype(np.float32, copy=False)
if cfg.TEST.USE_SOFT_NMS:
keep = nms_wrapper.soft_nms(
keep = nms_util.soft_nms(
cls_detections,
thresh=cfg.TEST.NMS,
method=cfg.TEST.SOFT_NMS_METHOD,
sigma=cfg.TEST.SOFT_NMS_SIGMA,
)
else:
keep = nms_wrapper.nms(
keep = nms_util.nms(
cls_detections,
thresh=cfg.TEST.NMS,
force_cpu=True,
)
cls_detections = cls_detections[keep, :]
boxes_this_image.append(cls_detections)
@@ -119,11 +111,8 @@ def test_net(weights, num_classes, q_in, q_out, device):
q_out.put((
idx,
{
'im_detect': _t['im_detect'].average_time,
'misc': _t['misc'].average_time,
},
{
'boxes': boxes_this_image,
},
dict([('im_detect', _t['im_detect'].average_time),
('misc', _t['misc'].average_time)]),
dict([('boxes', boxes_this_image)]),
))
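Per class, the test loop picks soft-NMS or hard NMS from the config. Below is a plain-numpy sketch of the Gaussian variant only (a stand-in for `seetadet.utils.nms.soft_nms`; the real helper also supports a linear method via `SOFT_NMS_METHOD`). Instead of discarding overlapping boxes, it decays their scores by exp(-IoU² / sigma):

```python
import numpy as np

def soft_nms_sketch(dets, sigma=0.5, score_thresh=0.001):
    # Gaussian soft-NMS over (x1, y1, x2, y2, score) rows:
    # overlapping boxes are kept with decayed scores.
    boxes = dets[:, :4].copy()
    scores = dets[:, 4].copy()
    areas = ((boxes[:, 2] - boxes[:, 0] + 1) *
             (boxes[:, 3] - boxes[:, 1] + 1))
    keep = []
    while True:
        i = int(scores.argmax())
        if scores[i] < score_thresh:
            break
        keep.append(i)
        xx1 = np.maximum(boxes[i, 0], boxes[:, 0])
        yy1 = np.maximum(boxes[i, 1], boxes[:, 1])
        xx2 = np.minimum(boxes[i, 2], boxes[:, 2])
        yy2 = np.minimum(boxes[i, 3], boxes[:, 3])
        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        iou = w * h / (areas[i] + areas - w * h)
        scores *= np.exp(-(iou ** 2) / sigma)
        scores[i] = -1.0  # never pick this box again
    return np.array(keep, 'int64')

keep = soft_nms_sketch(np.array([[0, 0, 10, 10, 0.9],
                                 [1, 1, 11, 11, 0.8]], 'float32'))
```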
@@ -16,7 +16,7 @@ from __future__ import print_function
import collections
import numpy as np
from lib.core.config import cfg
from seetadet.core.config import cfg
def generate_grid_anchors(features, base_anchors, strides):
@@ -75,7 +75,7 @@ def map_rois_to_levels(rois, k_min, k_max):
return np.clip(target_levels, k_min, k_max)
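`map_rois_to_levels` clips the FPN assignment rule k = ⌊k0 + log2(√(wh)/224)⌋ into [k_min, k_max]. A self-contained sketch; the canonical scale 224 and canonical level k0 = 4 are the FPN paper's defaults and an assumption about this implementation:

```python
import numpy as np

def map_rois_to_levels_sketch(rois, k_min, k_max, k0=4, s0=224.):
    # rois: (N, 5) as (batch_idx, x1, y1, x2, y2).
    ws = rois[:, 3] - rois[:, 1] + 1
    hs = rois[:, 4] - rois[:, 2] + 1
    scales = np.sqrt(ws * hs)
    levels = np.floor(k0 + np.log2(scales / s0 + 1e-6))
    return np.clip(levels, k_min, k_max).astype('int64')

# A 224x224 RoI maps to the canonical level 4 before clipping.
roi = np.array([[0, 0, 0, 223, 223]], 'float32')
assert map_rois_to_levels_sketch(roi, 2, 5)[0] == 4
```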
def map_blobs_to_outputs(blobs, defaults, lvl_inds):
def map_blobs_by_levels(blobs, defaults, lvl_inds):
"""Map blobs to outputs according to fpn indices."""
outputs = collections.defaultdict(list)
for inds in lvl_inds:
@@ -13,10 +13,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# Import custom modules
from lib.modeling.fast_rcnn import FastRCNN
from lib.modeling.fpn import FPN
from lib.modeling.mask_rcnn import MaskRCNN
from lib.modeling.retinanet import RetinaNet
from lib.modeling.rpn import RPN
from lib.modeling.ssd import SSD
from seetadet.algo.faster_rcnn.anchor_target import AnchorTarget
from seetadet.algo.faster_rcnn.proposal import Proposal
from seetadet.algo.mask_rcnn.data_loader import DataLoader
from seetadet.algo.mask_rcnn.proposal_target import ProposalTarget