Commit d3ed62db by Ting PAN

Support Mask R-CNN

1 parent 41b3932b
Showing with 4534 additions and 3796 deletions
------------------------------------------------------------------------
The most significant changes made to SeetaDet over time.
SeetaDet 0.3.0 (20191121)
Dragon Minimum Required (Version 0.3.0.dev20191121)
Changes:
Preview Features:
- New algorithm: Mask R-CNN.
- Add MobileNet (V2 and NAS) backbones.
- Refactor the testing module; multi-GPU testing is now supported.
Bugs fixed:
- Remove rotated boxes; use Mask R-CNN instead.
------------------------------------------------------------------------
SeetaDet 0.2.3 (20191101)
Dragon Minimum Required (Version 0.3.0.dev20191021)
......
......@@ -12,6 +12,10 @@ while the style of codes is PyTorch.
The torch-style code helps simplify the hierarchical pipeline of modern detection.
## Requirements
seeta-dragon >= 0.3.0.dev20191121
## Installation
#### 1. Install the required python packages
......
......@@ -5,7 +5,6 @@ rm -r build install *.c *.cpp
# Compile cpp modules
python setup.py build_ext --inplace
g++ -o ../lib/utils/ctypes_rbox.so -shared -fPIC -O2 rbox.cc -std=c++11 -fopenmp
# Compile cuda modules
cd build && cmake .. && make install && cd ..
......
// ------------------------------------------------------------
// Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
//
// Licensed under the BSD 2-Clause License.
// You should have received a copy of the BSD 2-Clause License
// along with the software. If not, See,
//
// <https://opensource.org/licenses/BSD-2-Clause>
//
// Codes are based on:
//
// <https://github.com/facebookresearch/detectron2/blob/master/detectron2
// /layers/csrc/box_iou_rotated/box_iou_rotated_utils.h>
//
// ------------------------------------------------------------
#include <cmath>
#include <algorithm>
#include <omp.h>
template <typename T>
struct RotatedBox {
T x_ctr, y_ctr, w, h, a;
};
template <typename T>
struct Point {
T x, y;
Point(const T& px = 0, const T& py = 0) : x(px), y(py) {}
Point operator+(const Point& p) const {
return Point(x + p.x, y + p.y);
}
Point& operator+=(const Point& p) {
x += p.x;
y += p.y;
return *this;
}
Point operator-(const Point& p) const {
return Point(x - p.x, y - p.y);
}
Point operator*(const T coeff) const {
return Point(x * coeff, y * coeff);
}
};
template <typename T>
T dot_2d(const Point<T>& A, const Point<T>& B) {
return A.x * B.x + A.y * B.y;
}
template <typename T>
T cross_2d(const Point<T>& A, const Point<T>& B) {
return A.x * B.y - B.x * A.y;
}
template <typename T>
void get_rotated_vertices(
const RotatedBox<T>& box,
Point<T> (&pts)[4]) {
// M_PI / 180. == 0.01745329251
double theta = box.a * 0.01745329251;
T cosTheta2 = (T)cos(theta) * 0.5f;
T sinTheta2 = (T)sin(theta) * 0.5f;
// y: top --> down; x: left --> right
pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w;
pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w;
pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w;
pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w;
pts[2].x = 2 * box.x_ctr - pts[0].x;
pts[2].y = 2 * box.y_ctr - pts[0].y;
pts[3].x = 2 * box.x_ctr - pts[1].x;
pts[3].y = 2 * box.y_ctr - pts[1].y;
}
template <typename T>
int get_intersection_points(
const Point<T> (&pts1)[4],
const Point<T> (&pts2)[4],
Point<T> (&intersections)[24]) {
// Line vector
// A line from p1 to p2 is: p1 + (p2-p1)*t, t in [0, 1]
Point<T> vec1[4], vec2[4];
for (int i = 0; i < 4; i++) {
vec1[i] = pts1[(i + 1) % 4] - pts1[i];
vec2[i] = pts2[(i + 1) % 4] - pts2[i];
}
// Line test - test all line combos for intersection
int num = 0; // number of intersections
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 4; j++) {
// Solve for 2x2 Ax=b
T det = cross_2d(vec2[j], vec1[i]);
// This takes care of parallel lines
if (fabs(det) <= 1e-14) {
continue;
}
auto vec12 = pts2[j] - pts1[i];
T t1 = cross_2d(vec2[j], vec12) / det;
T t2 = cross_2d(vec1[i], vec12) / det;
if (t1 >= 0.0f && t1 <= 1.0f && t2 >= 0.0f && t2 <= 1.0f) {
intersections[num++] = pts1[i] + vec1[i] * t1;
}
}
}
// Check for vertices of rect1 inside rect2
{
const auto& AB = vec2[0];
const auto& DA = vec2[3];
auto ABdotAB = dot_2d(AB, AB);
auto ADdotAD = dot_2d(DA, DA);
for (int i = 0; i < 4; i++) {
// assume ABCD is the rectangle, and P is the point to be judged
// P is inside ABCD iff. P's projection on AB lies within AB
// and P's projection on AD lies within AD
auto AP = pts1[i] - pts2[0];
auto APdotAB = dot_2d<T>(AP, AB);
auto APdotAD = -dot_2d<T>(AP, DA);
if ((APdotAB >= 0) &&
(APdotAD >= 0) &&
(APdotAB <= ABdotAB) &&
(APdotAD <= ADdotAD)) {
intersections[num++] = pts1[i];
}
}
}
// Reverse the check - check for vertices of rect2 inside rect1
{
const auto& AB = vec1[0];
const auto& DA = vec1[3];
auto ABdotAB = dot_2d<T>(AB, AB);
auto ADdotAD = dot_2d<T>(DA, DA);
for (int i = 0; i < 4; i++) {
auto AP = pts2[i] - pts1[0];
auto APdotAB = dot_2d<T>(AP, AB);
auto APdotAD = -dot_2d<T>(AP, DA);
if ((APdotAB >= 0) &&
(APdotAD >= 0) &&
(APdotAB <= ABdotAB) &&
(APdotAD <= ADdotAD)) {
intersections[num++] = pts2[i];
}
}
}
return num;
}
template <typename T>
int convex_hull_graham(
const Point<T> (&p)[24],
const int& num_in,
Point<T> (&q)[24],
bool shift_to_zero = false) {
// Step 1:
// Find point with minimum y
// if more than 1 points have the same minimum y,
// pick the one with the minimum x.
int t = 0;
for (int i = 1; i < num_in; i++) {
if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) {
t = i;
}
}
auto& start = p[t]; // starting point
// Step 2:
// Subtract the starting point from every point (for sorting in the next step)
for (int i = 0; i < num_in; i++) {
q[i] = p[i] - start;
}
// Swap the starting point to position 0
auto tmp = q[0];
q[0] = q[t];
q[t] = tmp;
// Step 3:
// Sort points 1 ~ num_in according to their relative cross-product values
// (essentially sorting according to angles)
// If the angles are the same, sort according to their distance to origin
T dist[24];
for (int i = 0; i < num_in; i++) {
dist[i] = dot_2d(q[i], q[i]);
}
std::sort(
q + 1, q + num_in, [](const Point<T>& A, const Point<T>& B) -> bool {
T temp = cross_2d<T>(A, B);
if (fabs(temp) < 1e-6) {
return dot_2d(A, A) < dot_2d(B, B);
} else {
return temp > 0;
}
});
// Step 4:
// Make sure there are at least 2 points (that don't overlap with each other)
// in the stack
int k; // index of the non-overlapped second point
for (k = 1; k < num_in; k++) {
if (dist[k] > 1e-8) {
break;
}
}
if (k == num_in) {
// We reach the end, which means the convex hull is just one point
q[0] = p[t];
return 1;
}
q[1] = q[k];
int m = 2; // 2 points in the stack
// Step 5:
// Finally we can start the scanning process.
// When a non-convex relationship between the 3 points is found
// (either concave shape or duplicated points),
// we pop the previous point from the stack
// until the 3-point relationship is convex again, or
// until the stack only contains two points
for (int i = k + 1; i < num_in; i++) {
while (m > 1 && cross_2d(q[i] - q[m - 2], q[m - 1] - q[m - 2]) >= 0) {
m--;
}
q[m++] = q[i];
}
// Step 6 (Optional):
// In general sense we need the original coordinates, so we
// need to shift the points back (reverting Step 2)
// But if we're only interested in getting the area/perimeter of the shape
// We can simply return.
if (!shift_to_zero) {
for (int i = 0; i < m; i++) {
q[i] += start;
}
}
return m;
}
template <typename T>
T polygon_area(const Point<T> (&q)[24], const int& m) {
if (m <= 2) {
return 0;
}
T area = 0;
for (int i = 1; i < m - 1; i++) {
area += fabs(cross_2d(q[i] - q[0], q[i + 1] - q[0]));
}
return area / 2.0;
}
template <typename T>
T rotated_boxes_intersection(
const RotatedBox<T>& box1,
const RotatedBox<T>& box2) {
// There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned
// from rotated_rect_intersection_pts
Point<T> intersectPts[24], orderedPts[24];
Point<T> pts1[4];
Point<T> pts2[4];
get_rotated_vertices(box1, pts1);
get_rotated_vertices(box2, pts2);
int num = get_intersection_points(pts1, pts2, intersectPts);
if (num <= 2) {
return 0.0;
}
// Convex Hull to order the intersection points in clockwise order and find
// the contour area.
int num_convex = convex_hull_graham(intersectPts, num, orderedPts, true);
return polygon_area(orderedPts, num_convex);
}
template <typename T>
T single_box_iou_rotated(
T const* const box1_raw,
T const* const box2_raw) {
// Shift both centers by their midpoint to achieve higher precision in the result
RotatedBox<T> box1, box2;
auto center_shift_x = (box1_raw[0] + box2_raw[0]) / 2.0;
auto center_shift_y = (box1_raw[1] + box2_raw[1]) / 2.0;
box1.x_ctr = box1_raw[0] - center_shift_x;
box1.y_ctr = box1_raw[1] - center_shift_y;
box1.w = box1_raw[2];
box1.h = box1_raw[3];
box1.a = box1_raw[4];
box2.x_ctr = box2_raw[0] - center_shift_x;
box2.y_ctr = box2_raw[1] - center_shift_y;
box2.w = box2_raw[2];
box2.h = box2_raw[3];
box2.a = box2_raw[4];
const T area1 = box1.w * box1.h;
const T area2 = box2.w * box2.h;
if (area1 < 1e-14 || area2 < 1e-14) {
return 0.f;
}
const T inter = rotated_boxes_intersection(box1, box2);
const T iou = inter / (area1 + area2 - inter);
return iou;
}
extern "C" {
void apply_cpu_nms(
double* dets,
int* indices,
int& n,
double threshold) {
int count = 0;
for(int i = 0; i < n; i++) {
bool keep = true;
auto* box1 = dets + i * 6;
for(int j = 0; j < count; j++) {
auto* box2 = dets + indices[j] * 6;
auto ovr = single_box_iou_rotated(box1, box2);
if (ovr > threshold) {
keep = false;
break;
}
}
if (keep) {
indices[count] = i;
count++;
}
}
n = count;
}
void bbox_overlaps(
double* boxes1,
double* boxes2,
int* shape,
double* overlaps) {
int N = shape[0], K = shape[1];
#pragma omp parallel for num_threads(std::min(omp_get_num_procs(), 4))
for (int i = 0; i < N; i++) {
auto* box1 = boxes1 + i * 5;
for (int j = 0; j < K; j++) {
auto* box2 = boxes2 + j * 5;
overlaps[i * K + j] = single_box_iou_rotated(box1, box2);
}
}
}
}
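The two exported C symbols above, apply_cpu_nms and bbox_overlaps, are compiled into lib/utils/ctypes_rbox.so by the build script shown earlier. A minimal sketch of calling bbox_overlaps from Python through ctypes follows; the library path and the rbox_overlaps helper name are assumptions for illustration, not part of this commit.

import ctypes
import numpy as np

# Assumed location of the shared library produced by the g++ command above.
_lib = ctypes.cdll.LoadLibrary('lib/utils/ctypes_rbox.so')

def rbox_overlaps(boxes1, boxes2):
    # Boxes are (x_ctr, y_ctr, w, h, angle) rows, matching the stride-5
    # layout that bbox_overlaps() reads on the C++ side.
    boxes1 = np.ascontiguousarray(boxes1, dtype=np.float64)
    boxes2 = np.ascontiguousarray(boxes2, dtype=np.float64)
    shape = np.array([boxes1.shape[0], boxes2.shape[0]], dtype=np.int32)
    overlaps = np.zeros(int(shape[0] * shape[1]), dtype=np.float64)
    _lib.bbox_overlaps(
        boxes1.ctypes.data_as(ctypes.POINTER(ctypes.c_double)),
        boxes2.ctypes.data_as(ctypes.POINTER(ctypes.c_double)),
        shape.ctypes.data_as(ctypes.POINTER(ctypes.c_int)),
        overlaps.ctypes.data_as(ctypes.POINTER(ctypes.c_double)),
    )
    return overlaps.reshape((shape[0], shape[1]))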
......@@ -41,6 +41,9 @@ __C.TRAIN.WEIGHTS = ''
# Database to train
__C.TRAIN.DATABASE = ''
# The number of workers to transform data
__C.TRAIN.NUM_WORKERS = 3
# Scales to use during training (can list multiple scales)
# Each scale is the pixel size of an image's shortest side
__C.TRAIN.SCALES = (600,)
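For reference, TRAIN.SCALES follows the usual shortest-side convention: an image is resized so that its shortest side equals the chosen scale, usually with a cap on the longest side. A minimal sketch, assuming cv2 and an illustrative max_size cap (the real resize lives in the data transformer, not in this config file):

import cv2

def resize_shortest_side(image, scale=600, max_size=1000):
    # Resize so the shortest side equals `scale`, without letting the
    # longest side exceed `max_size` (both values are examples).
    h, w = image.shape[:2]
    im_scale = float(scale) / min(h, w)
    if round(im_scale * max(h, w)) > max_size:
        im_scale = float(max_size) / max(h, w)
    resized = cv2.resize(image, None, fx=im_scale, fy=im_scale)
    return resized, im_scale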
......@@ -151,10 +154,10 @@ __C.TEST.SOFT_NMS_SIGMA = 0.5
# The top-k prior boxes kept before NMS.
__C.TEST.NMS_TOP_K = 400
# The threshold for prAttrDicting boxes
# The threshold for predicting boxes
__C.TEST.SCORE_THRESH = 0.05
# The threshold for prAttrDicting masks
# The threshold for predicting masks
__C.TEST.BINARY_THRESH = 0.5
# NMS threshold used on RPN proposals
......@@ -192,8 +195,9 @@ __C.MODEL = AttrDict()
# The type of the model
# ('faster_rcnn',
# 'ssd',
# 'mask_rcnn',
# 'retinanet',
# 'ssd',
# )
__C.MODEL.TYPE = ''
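As an illustration of the new option, a Mask R-CNN run selects the type above; the values below are examples, not defaults shipped with this commit:

from lib.core.config import cfg

cfg.MODEL.TYPE = 'mask_rcnn'                      # one of the types listed above
cfg.MODEL.CLASSES = ['__background__', 'person']  # shortened example class list
cfg.MODEL.NUM_CLASSES = len(cfg.MODEL.CLASSES)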
......@@ -361,14 +365,14 @@ __C.SSD.NUM_CONVS = 0
# Weight for bbox regression loss
__C.SSD.BBOX_REG_WEIGHT = 1.
__C.SSD.MULTIBOX = AttrDict()
# MultiBox configs
__C.SSD.MULTIBOX = AttrDict()
__C.SSD.MULTIBOX.STRIDES = []
__C.SSD.MULTIBOX.MIN_SIZES = []
__C.SSD.MULTIBOX.MAX_SIZES = []
__C.SSD.MULTIBOX.ASPECT_RATIOS = []
__C.SSD.MULTIBOX.ASPECT_ANGLES = []
# OHEM configs
__C.SSD.OHEM = AttrDict()
# The threshold for selecting negative bbox in hard example mining
__C.SSD.OHEM.NEG_OVERLAP = 0.5
......
......@@ -21,46 +21,56 @@ import cv2
import dragon
from lib.core.config import cfg
from lib.datasets.example import Example
from lib.datasets.factory import get_imdb
from lib.faster_rcnn.data_transformer import DataTransformer
class TestServer(object):
class _Server(object):
def __init__(self, output_dir):
self.imdb = get_imdb(cfg.TEST.DATABASE)
self.imdb.competition_mode(cfg.TEST.COMPETITION_MODE)
self.num_images, self.num_classes, self.classes = \
self.imdb.num_images, self.imdb.num_classes, self.imdb.classes
self.data_reader = dragon.io.DataReader(
dataset=lambda: dragon.io.SeetaRecordDataset(self.imdb.source))
self.data_transformer = DataTransformer()
self.data_reader.q_out = mp.Queue(cfg.TEST.IMS_PER_BATCH * 5)
self.data_reader.start()
self.gt_recs = collections.OrderedDict()
self.output_dir = output_dir
if cfg.VIS_ON_FILE:
self.vis_dir = os.path.join(self.output_dir, 'vis')
if not os.path.exists(self.vis_dir):
os.makedirs(self.vis_dir)
def set_transformer(self, transformer_cls):
self.data_transformer = transformer_cls()
def evaluate_detections(self, all_boxes):
pass
def evaluate_segmentations(self, all_boxes, all_masks):
pass
def get_image(self):
example = self.data_reader.q_out.get()
image = self.data_transformer.get_image(example)
image_id, objects = self.data_transformer.get_annotations(example)
self.gt_recs[image_id] = {
'objects': objects,
'width': image.shape[1],
'height': image.shape[0],
}
return image_id, image
pass
def get_save_filename(self, image_id, ext='.jpg'):
return os.path.join(self.vis_dir, image_id + ext) \
if cfg.VIS_ON_FILE else None
class TestServer(_Server):
def __init__(self, output_dir):
super(TestServer, self).__init__(output_dir)
self.imdb = get_imdb(cfg.TEST.DATABASE)
self.imdb.competition_mode(cfg.TEST.COMPETITION_MODE)
self.classes = self.imdb.classes
self.num_images = self.imdb.num_images
self.num_classes = self.imdb.num_classes
self.data_reader = dragon.io.DataReader(
dataset=lambda: dragon.io.SeetaRecordDataset(self.imdb.source))
self.data_reader.q_out = mp.Queue(cfg.TEST.IMS_PER_BATCH * 5)
self.data_reader.start()
self.gt_recs = collections.OrderedDict()
def get_image(self):
example = Example(self.data_reader.q_out.get())
image, image_id = example.image, example.id
self.gt_recs[image_id] = {
'height': example.height,
'width': example.width,
'objects': example.objects,
}
return image_id, image
def get_records(self):
if len(self.gt_recs) != self.num_images:
raise RuntimeError(
......@@ -70,7 +80,7 @@ class TestServer(object):
return self.gt_recs
def evaluate_detections(self, all_boxes):
if cfg.TEST.PROTOCOL == 'null':
if cfg.TEST.PROTOCOL == 'dump':
self.imdb.dump_detections(all_boxes, self.output_dir)
else:
self.imdb.evaluate_detections(
......@@ -88,56 +98,20 @@ class TestServer(object):
)
class InferServer(object):
class InferServer(_Server):
def __init__(self, output_dir):
super(InferServer, self).__init__(output_dir)
self.images_dir = cfg.TEST.DATABASE
self.imdb = get_imdb('taas:/empty')
self.images = os.listdir(self.images_dir)
self.num_images, self.num_classes, self.classes = \
len(self.images), cfg.MODEL.NUM_CLASSES, cfg.MODEL.CLASSES
self.data_transformer = DataTransformer()
self.gt_recs = collections.OrderedDict()
self.classes = cfg.MODEL.CLASSES
self.num_images = len(self.images)
self.num_classes = cfg.MODEL.NUM_CLASSES
self.output_dir = output_dir
self.image_idx = 0
if cfg.VIS_ON_FILE:
self.vis_dir = os.path.join(self.output_dir, 'vis')
if not os.path.exists(self.vis_dir):
os.makedirs(self.vis_dir)
def set_transformer(self, transformer_cls):
self.data_transformer = transformer_cls()
def get_image(self):
image_name = self.images[self.image_idx]
image_id = image_name.split('.')[0]
image = cv2.imread(os.path.join(self.images_dir, image_name))
self.image_idx = (self.image_idx + 1) % self.num_images
self.gt_recs[image_id] = {'width': image.shape[1], 'height': image.shape[0]}
return image_id, image
def get_save_filename(self, image_id, ext='.jpg'):
return os.path.join(self.vis_dir, image_id + ext) \
if cfg.VIS_ON_FILE else None
def get_records(self):
if len(self.gt_recs) != self.num_images:
raise RuntimeError(
'Loading {} records, while {} required.'
.format(len(self.gt_recs), self.num_images),
)
return self.gt_recs
def evaluate_detections(self, all_boxes):
self.imdb.evaluate_detections(
all_boxes,
self.get_records(),
self.output_dir,
)
def evaluate_segmentations(self, all_boxes, all_masks):
self.imdb.evaluate_segmentations(
all_boxes,
all_masks,
self.get_records(),
self.output_dir,
)
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import importlib
import multiprocessing
import numpy as np
from lib.core.config import cfg
from lib.utils import time_util
from lib.utils.vis import vis_one_image
def run_test_net(checkpoint, server, devices):
classes = server.classes
num_images = server.num_images
num_classes = server.num_classes
devices = devices if devices else [cfg.GPU_ID]
num_workers = len(devices)
test_fn = importlib.import_module(
'lib.%s.test' % cfg.MODEL.TYPE).test_net
_t = time_util.new_timers('im_detect', 'mask_detect', 'misc')
vis_image_dict = {}
all_boxes = [[[] for _ in range(num_images)] for _ in range(num_classes)]
all_masks = [[[] for _ in range(num_images)] for _ in range(num_classes)]
queues = [
multiprocessing.Queue()
for _ in range(num_workers + 1)
]
workers = [
multiprocessing.Process(
target=test_fn,
kwargs={
'weights': checkpoint,
'num_classes': server.num_classes,
'q_in': queues[i],
'q_out': queues[-1],
'device': devices[i],
}
) for i in range(num_workers)
]
for process in workers:
process.start()
for i in range(num_images):
image_id, raw_image = server.get_image()
queues[i % num_workers].put((i, raw_image))
# Hold the image until the visualization
if cfg.VIS or cfg.VIS_ON_FILE:
vis_image_dict[i] = (image_id, raw_image)
for i in range(num_workers):
queues[i].put((-1, None))
for count in range(num_images):
i, time_diffs, results = queues[-1].get()
# Unpack the diverse results
boxes_this_image = results['boxes']
masks_this_image = results.get('masks', None)
# Disable some collections
if masks_this_image is None:
all_masks = None
# Update time difference
for name, diff in time_diffs.items():
_t[name].add_diff(diff)
# Visualize the results if necessary
if cfg.VIS or cfg.VIS_ON_FILE:
image_id, raw_image = vis_image_dict[i]
vis_one_image(
raw_image,
classes,
boxes_this_image,
masks_this_image,
thresh=cfg.VIS_TH,
box_alpha=1.,
show_class=True,
filename=server.get_save_filename(image_id),
)
del vis_image_dict[i]
_t['misc'].tic()
# Pack the results in the class-major order
for j in range(1, num_classes):
all_boxes[j][i] = boxes_this_image[j]
if all_masks is not None:
if j < len(masks_this_image):
all_masks[j][i] = masks_this_image[j]
# Limit to max_per_image detections *over all classes*
max_detections = cfg.TEST.DETECTIONS_PER_IM
if max_detections > 0:
scores = []
for j in range(1, num_classes):
if len(all_boxes[j][i]) < 1:
continue
scores.append(all_boxes[j][i][:, -1])
if len(scores) > 0:
scores = np.hstack(scores)
if len(scores) > max_detections:
thr = np.sort(scores)[-max_detections]
for j in range(1, num_classes):
keep = np.where(all_boxes[j][i][:, -1] >= thr)[0]
all_boxes[j][i] = all_boxes[j][i][keep, :]
if all_masks is not None:
all_masks[j][i] = all_masks[j][i][keep]
_t['misc'].toc()
print('\rim_detect: {:d}/{:d} {:.3f}s|{:.3f}s {:.3f}s'
.format(count + 1, num_images,
_t['im_detect'].average_time,
_t['mask_detect'].average_time,
_t['misc'].average_time),
end='')
print('\n\n>>> Evaluating detections\n')
server.evaluate_detections(all_boxes)
if all_masks is not None:
print('>>> Evaluating segmentations\n')
server.evaluate_segmentations(all_boxes, all_masks)
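The per-device workers spawned above follow a simple queue contract: read (index, image) tuples from their input queue, stop on the (-1, None) sentinel, and put (index, time_diffs, results) on the shared output queue, where results holds per-class 'boxes' and optionally 'masks'. A hedged sketch of that contract, illustrative only and not the real lib.<type>.test.test_net:

def dummy_test_net(weights, num_classes, q_in, q_out, device):
    # Minimal stand-in that satisfies the protocol run_test_net relies on.
    while True:
        i, raw_image = q_in.get()
        if i < 0:  # sentinel pushed after the last image
            break
        results = {'boxes': [[] for _ in range(num_classes)]}  # no detections
        time_diffs = {'im_detect': 0., 'mask_detect': 0.}
        q_out.put((i, time_diffs, results))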
......@@ -31,9 +31,9 @@ from lib.utils.stats import SmoothedValue
class SolverWrapper(object):
def __init__(self, coordinator):
self.output_dir = coordinator.checkpoints_dir()
self.solver = SGDSolver()
self.detector = self.solver.detector
self.output_dir = coordinator.checkpoints_dir()
# Setup the detector
self.detector.load_weights(cfg.TRAIN.WEIGHTS)
......@@ -89,7 +89,6 @@ class SolverWrapper(object):
display = self.solver.iter % cfg.SOLVER.DISPLAY == 0
stats = self.solver.one_step()
self.add_metrics(stats)
self.send_metrics(stats)
if display:
logger.info(
......@@ -104,6 +103,7 @@ class SolverWrapper(object):
continue
logger.info(' ' * 10 + 'Train net output({}): {}'
.format(k, v.GetMedianValue()))
self.send_metrics(stats)
def train_model(self):
"""Network training loop."""
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import json
import os
import sys
import numpy as np
from lib.core.config import cfg
from lib.pycocotools import mask as mask_tools
from lib.pycocotools.coco import COCO
from lib.pycocotools.cocoeval import COCOeval
from lib.utils import mask as mask_util
class COCOEvaluator(object):
def __init__(self, imdb, ann_file=None):
self.imdb = imdb
if ann_file is not None and \
os.path.exists(ann_file):
self.coco = COCO(ann_file)
cats = self.coco.loadCats(self.coco.getCatIds())
self.class_to_cat_id = dict(
zip([c['name'] for c in cats],
self.coco.getCatIds()))
else:
self.coco = None
self.class_to_cat_id = None
def bbox_results_one_category(self, boxes, cat_id, gt_recs):
ix, results = 0, []
for image_name, rec in gt_recs.items():
dets = boxes[ix]
ix += 1
if isinstance(dets, list) and len(dets) == 0:
continue
dets = dets.astype('float64')
scores = dets[:, -1]
xs = dets[:, 0]
ys = dets[:, 1]
ws = dets[:, 2] - xs + 1
hs = dets[:, 3] - ys + 1
results.extend([{
'image_id': self.get_image_id(image_name),
'category_id': cat_id,
'bbox': [xs[k], ys[k], ws[k], hs[k]],
'score': scores[k],
} for k in range(dets.shape[0])])
return results
def do_bbox_eval(self, res_file):
coco_dt = self.coco.loadRes(res_file)
coco_eval = COCOeval(self.coco, coco_dt, 'bbox')
coco_eval.evaluate()
coco_eval.accumulate()
self.print_coco_eval_results(coco_eval)
def do_segm_eval(self, res_file):
coco_dt = self.coco.loadRes(res_file)
coco_eval = COCOeval(self.coco, coco_dt, 'segm')
coco_eval.evaluate()
coco_eval.accumulate()
self.print_coco_eval_results(coco_eval)
@staticmethod
def encode_masks(masks, boxes, im_h, im_w):
mask_image = mask_util.project_masks(
masks, boxes, im_h, im_w,
cfg.TEST.BINARY_THRESH)
return mask_tools.encode(mask_image)
@staticmethod
def get_prefix(type='bbox'):
if type == 'bbox':
return 'detections'
elif type == 'segm':
return 'segmentations'
elif type == 'kpt':
return 'keypoints'
return ''
@staticmethod
def get_annotations_file(results_folder, type='bbox'):
# experiments/model_id/annotations/[GT]detections.json
filename = '[GT]' + COCOEvaluator.get_prefix(type) + '.json'
if not os.path.exists(results_folder):
os.makedirs(results_folder)
return os.path.join(results_folder, filename)
@staticmethod
def get_image_id(image_name):
image_id = image_name.split('_')[-1].split('.')[0]
try:
return int(image_id)
except ValueError:
return image_name
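# Illustrative behaviour (made-up names), assuming COCO-style file names:
#   get_image_id('COCO_val2014_000000000139')  -> 139
#   get_image_id('000000000139.jpg')           -> 139
#   get_image_id('street_sceneA')              -> 'street_sceneA' (non-numeric suffix)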
def get_results_file(self, results_folder, type='bbox'):
# experiments/model_id/results/detections_taas_<comp_id>.json
filename = self.get_prefix(type) + self.imdb.comp_id + '.json'
if not os.path.exists(results_folder):
os.makedirs(results_folder)
return os.path.join(results_folder, filename)
def print_coco_eval_results(self, coco_eval, iou_thr=(0.5, 0.95)):
def get_thr_ind(coco_eval, thr):
ind = np.where((coco_eval.params.iouThrs > thr - 1e-5) &
(coco_eval.params.iouThrs < thr + 1e-5))[0][0]
iou_thr = coco_eval.params.iouThrs[ind]
assert np.isclose(iou_thr, thr)
return ind
ind_lo = get_thr_ind(coco_eval, iou_thr[0])
ind_hi = get_thr_ind(coco_eval, iou_thr[1])
# Precision has dims (iou, recall, cls, area range, max dets)
# Area range index 0: all area ranges
# Max dets index 2: 100 per image
precision_res = coco_eval.eval['precision']
precision = precision_res[ind_lo:(ind_hi + 1), :, :, 0, 2]
ap_default = np.mean(precision[precision > -1])
print('~~~~ Mean and per-category AP @ IoU=[{:.2f},{:.2f}] '
'~~~~'.format(iou_thr[0], iou_thr[1]))
print('{:.1f}'.format(100 * ap_default))
for cls_ind, cls in enumerate(self.imdb.classes):
if cls == '__background__':
continue
precision = precision_res[ind_lo:(ind_hi + 1), :, cls_ind - 1, 0, 2]
ap = np.mean(precision[precision > -1])
print('{:.1f}'.format(100 * ap))
print('~~~~ Summary metrics ~~~~')
coco_eval.summarize()
def segm_results_one_category(self, boxes, masks, cat_id, gt_recs):
def filter_boxes(dets):
boxes = dets[:, :4]
ws = boxes[:, 2] - boxes[:, 0]
hs = boxes[:, 3] - boxes[:, 1]
keep = np.where((ws >= 1) & (hs >= 1))[0]
return keep
results = []
ix = 0
for image_name, rec in gt_recs.items():
dets = boxes[ix].astype(np.float)
msks = masks[ix]
ix += 1
keep = filter_boxes(dets)
im_h, im_w = rec['height'], rec['width']
if len(keep) == 0:
continue
scores = dets[:, -1]
mask_encode = self.encode_masks(
msks[keep], dets[keep, :4], im_h, im_w)
for k in range(dets[keep].shape[0]):
rle = mask_encode[k]
if sys.version_info >= (3, 0):
rle['counts'] = rle['counts'].decode()
results.append({
'image_id': self.get_image_id(image_name),
'category_id': cat_id,
'segmentation': rle,
'score': scores[k],
})
return results
def write_bbox_annotations(self, gt_recs, output_dir):
# Build images
dataset = {'images': []}
for image_name, rec in gt_recs.items():
dataset['images'].append({
'file_name': image_name + '.jpg',
'id': self.get_image_id(image_name),
'height': rec['height'], 'width': rec['width'],
})
# Build categories
dataset['categories'] = []
for cls in self.imdb.classes:
if cls == '__background__':
continue
dataset['categories'].append({
'name': cls,
'id': self.imdb.class_to_ind[cls],
})
# Build annotations
dataset['annotations'] = []
ann_id = 0
for image_name, rec in gt_recs.items():
for obj in rec['objects']:
x, y = obj['bbox'][0], obj['bbox'][1]
w, h = obj['bbox'][2] - x + 1, obj['bbox'][3] - y + 1
dataset['annotations'].append({
'id': str(ann_id),
'bbox': [x, y, w, h],
'area': w * h,
'iscrowd': obj['difficult'],
'image_id': self.get_image_id(image_name),
'category_id': self.imdb.class_to_ind[obj['name']],
})
ann_id += 1
ann_file = self.get_annotations_file(output_dir, 'bbox')
with open(ann_file, 'w') as f:
json.dump(dataset, f)
return ann_file
def write_bbox_results(self, all_boxes, gt_recs, output_dir):
filename = self.get_results_file(output_dir)
results = []
for cls_ind, cls in enumerate(self.imdb.classes):
if cls == '__background__':
continue
print('Collecting {} results ({:d}/{:d})'
.format(cls, cls_ind, self.imdb.num_classes - 1))
cat_id = self.class_to_cat_id[cls]
results.extend(self.bbox_results_one_category(
all_boxes[cls_ind], cat_id, gt_recs))
print('Writing results json to {}'.format(filename))
with open(filename, 'w') as fid:
json.dump(results, fid)
return filename
def write_segm_annotations(self, gt_recs, output_dir):
# Build images
dataset = {'images': []}
for image_name, rec in gt_recs.items():
dataset['images'].append({
'file_name': image_name + '.jpg',
'id': self.get_image_id(image_name),
'height': rec['height'], 'width': rec['width'],
})
# Build categories
dataset['categories'] = []
for cls in self.imdb._classes:
if cls == '__background__':
continue
dataset['categories'].append({
'name': cls,
'id': self.imdb.class_to_ind[cls],
})
# Build annotations
dataset['annotations'] = []
ann_id = 0
for image_name, rec in gt_recs.items():
mask_size = (rec['height'], rec['width'])
for obj in rec['objects']:
x, y = obj['bbox'][0], obj['bbox'][1]
w, h = obj['bbox'][2] - x + 1, obj['bbox'][3] - y + 1
mask = obj['mask']
if sys.version_info >= (3, 0):
mask = mask.decode()
dataset['annotations'].append({
'id': str(ann_id),
'bbox': [x, y, w, h],
'area': w * h,
'segmentation': {'size': mask_size, 'counts': mask},
'iscrowd': obj['difficult'],
'image_id': self.get_image_id(image_name),
'category_id': self.imdb.class_to_ind[obj['name']],
})
ann_id += 1
ann_file = self.get_annotations_file(output_dir, 'segm')
with open(ann_file, 'w') as f:
json.dump(dataset, f)
return ann_file
def write_segm_results(self, all_boxes, all_masks, gt_recs, output_dir):
filename = self.get_results_file(output_dir, 'segm')
results = []
for cls_ind, cls in enumerate(self.imdb.classes):
if cls == '__background__':
continue
print('Collecting {} results ({:d}/{:d})'
.format(cls, cls_ind, self.imdb.num_classes - 1))
cat_id = self.class_to_cat_id[cls]
results.extend(self.segm_results_one_category(
all_boxes[cls_ind], all_masks[cls_ind], cat_id, gt_recs))
print('Writing results json to {}'.format(filename))
with open(filename, 'w') as fid:
json.dump(results, fid)
return filename
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import cv2
import numpy as np
from lib.pycocotools import mask_utils
class Example(object):
"""Wrapper for annotated example."""
def __init__(self, datum):
"""Create a ``Example``.
Parameters
----------
datum : Dict
The data loaded from the dataset.
"""
self._datum = datum
@property
def id(self):
"""Return the example id.
Returns
-------
str
The unique id.
"""
return self._datum['id']
@property
def image(self):
"""Return the image data.
Returns
-------
numpy.ndarray
The image.
"""
img = np.frombuffer(self._datum['content'], 'uint8')
return cv2.imdecode(img, 3)
@property
def height(self):
"""Return the image height.
Returns
-------
int
The height of the image.
"""
return self._datum['height']
@property
def objects(self):
"""Return the annotated objects.
Returns
-------
Sequence[Dict]
The objects.
"""
objects = []
for ix, obj in enumerate(self._datum['object']):
mask = obj.get('mask', None)
if 'x3' in obj:
poly = np.array([
obj['x1'], obj['y1'],
obj['x2'], obj['y2'],
obj['x3'], obj['y3'],
obj['x4'], obj['y4']
], 'float32')
x, y, w, h = cv2.boundingRect(
poly.reshape((-1, 2)))
bbox = [x, y, x + w, y + h]
mask = mask_utils.poly2bytes(
[poly],
self._datum['height'],
self._datum['width'],
)
elif 'x2' in obj:
bbox = [obj['x1'], obj['y1'], obj['x2'], obj['y2']]
elif 'xmin' in obj:
bbox = [obj['xmin'], obj['ymin'], obj['xmax'], obj['ymax']]
else:
bbox = obj['bbox']
objects.append({
'name': obj['name'],
'bbox': bbox,
'mask': mask,
'difficult': obj.get('difficult', 0),
})
return objects
@property
def width(self):
"""Return the image width.
Returns
-------
int
The width of the image.
"""
return self._datum['width']
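For context, a hedged sketch of the datum layout the Example wrapper expects; the field values are invented for illustration and only the keys used by the properties above are shown:

import cv2
import numpy as np

# Encode a dummy image so the sketch is self-contained.
ok, buf = cv2.imencode('.jpg', np.zeros((480, 640, 3), dtype=np.uint8))
datum = {
    'id': '000001',
    'content': buf.tobytes(),  # encoded image bytes, as stored in the record
    'height': 480,
    'width': 640,
    'object': [
        {'name': 'person', 'xmin': 10, 'ymin': 20, 'xmax': 110, 'ymax': 220},
    ],
}
example = Example(datum)
image = example.image                               # decoded back via cv2.imdecode
boxes = [obj['bbox'] for obj in example.objects]    # [[10, 20, 110, 220]]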
......@@ -34,7 +34,7 @@ def get_imdb(name):
raise KeyError('Unknown DataSet: {}'.format(cls))
return _GLOBAL_DATA_SETS[cls](source)
elif os.path.exists(name):
return _GLOBAL_DATA_SETS['taas'](name)
return _GLOBAL_DATA_SETS['taas'](name)
else:
raise ValueError('Illegal Database: {}'.format(name))
......
......@@ -13,84 +13,118 @@
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import shutil
import dragon
import uuid
from lib.core.config import cfg
from lib.datasets.coco_evaluator import COCOEvaluator
from lib.datasets.voc_evaluator import VOCEvaluator
class imdb(object):
def __init__(self, name):
self._name = name
self._num_classes = 0
self._classes = []
@property
def name(self):
return self._name
def __init__(self, source):
self._source = source
self._num_images = 0
self._classes = cfg.MODEL.CLASSES
self._class_to_ind = self._class_to_cat_id = \
dict(zip(self.classes, range(self.num_classes)))
self._salt = str(uuid.uuid4())
self.config = {'cleanup': True, 'use_salt': True}
@property
def num_classes(self):
return len(self._classes)
def cache_path(self):
cache_path = os.path.abspath(os.path.join(cfg.DATA_DIR, 'cache'))
if not os.path.exists(cache_path):
os.makedirs(cache_path)
return cache_path
@property
def classes(self):
return self._classes
@property
def cache_path(self):
cache_path = os.path.abspath(os.path.join(cfg.DATA_DIR, 'cache'))
if not os.path.exists(cache_path):
os.makedirs(cache_path)
return cache_path
def class_to_ind(self):
return self._class_to_ind
@property
def source(self):
excepted_source = os.path.join(self.cache_path, self.name)
if not os.path.exists(excepted_source):
raise RuntimeError(
'Expected source: {}, '
'but it does not exist.'
.format(excepted_source)
)
return excepted_source
def comp_id(self):
return '_' + self._salt if self.config['use_salt'] else ''
@property
def num_classes(self):
return len(self._classes)
@property
def num_images(self):
return dragon.io.SeetaRecordDataset(self.source).size
return self._num_images
@property
def source(self):
return self._source
def competition_mode(self, on):
if on:
self.config['use_salt'] = False
self.config['cleanup'] = False
else:
self.config['use_salt'] = True
self.config['cleanup'] = True
def dump_detections(self, all_boxes, output_dir):
dataset = dragon.io.SeetaRecordDataset(self.source)
for file in ('data.data', 'data.index', 'data.meta'):
file = os.path.join(output_dir, file)
if os.path.exists(file):
os.remove(file)
writer = dragon.io.SeetaRecordWriter(output_dir, dataset.protocol)
for i in range(len(dataset)):
example = dataset.get()
example['object'] = []
for cls_ind, cls in enumerate(self.classes):
if cls == '__background__':
continue
detections = all_boxes[cls_ind][i]
if len(detections) == 0:
continue
for k in range(detections.shape[0]):
if detections[k, -1] < cfg.VIS_TH:
continue
example['object'].append({
'name': cls,
'xmin': float(detections[k][0]),
'ymin': float(detections[k][1]),
'xmax': float(detections[k][2]),
'ymax': float(detections[k][3]),
'difficult': 0,
})
writer.write(example)
pass
def evaluate_detections(self, all_boxes, gt_recs, output_dir):
pass
protocol = cfg.TEST.PROTOCOL
if 'voc' in protocol:
evaluator = VOCEvaluator(self)
evaluator.write_bbox_results(all_boxes, gt_recs, output_dir)
if '!' not in protocol:
for ovr in (0.5, 0.7):
evaluator.do_bbox_eval(
gt_recs,
output_dir,
iou=ovr,
use_07_metric='2007' in protocol,
)
elif 'coco' in protocol:
ann_file = cfg.TEST.JSON_FILE
evaluator = COCOEvaluator(self, ann_file)
if evaluator.coco is None:
ann_file = evaluator \
.write_bbox_annotations(
gt_recs, output_dir)
evaluator = COCOEvaluator(self, ann_file)
res_file = evaluator.write_bbox_results(
all_boxes, gt_recs, output_dir)
if '!' not in protocol:
evaluator.do_bbox_eval(res_file)
def evaluate_masks(self, all_boxes, all_masks, output_dir):
pass
def evaluate_segmentations(self, all_boxes, all_masks, gt_recs, output_dir):
protocol = cfg.TEST.PROTOCOL
if 'voc' in protocol:
evaluator = VOCEvaluator(self)
evaluator.write_segm_results(all_boxes, all_masks, output_dir)
if '!' not in protocol:
for ovr in (0.5, 0.7):
evaluator.do_segm_eval(
gt_recs,
output_dir,
iou=ovr,
use_07_metric='2007' in protocol,
)
elif 'coco' in protocol:
ann_file = cfg.TEST.JSON_FILE
evaluator = COCOEvaluator(self, ann_file)
if evaluator.coco is None:
ann_file = evaluator \
.write_segm_annotations(
gt_recs, output_dir)
evaluator = COCOEvaluator(self, ann_file)
res_file = evaluator.write_segm_results(
all_boxes, all_masks, gt_recs, output_dir)
if '!' not in protocol:
evaluator.do_segm_eval(res_file)
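The evaluation entry points above branch on substrings of TEST.PROTOCOL. A hedged summary of the conventions implied by those checks (the exact strings used in shipped configs may differ):

# cfg.TEST.PROTOCOL = 'voc2007'   # VOC-style eval with the 11-point 2007 metric ('2007' in protocol)
# cfg.TEST.PROTOCOL = 'voc2012'   # VOC-style eval with the all-point metric (assumed name)
# cfg.TEST.PROTOCOL = 'coco'      # COCO-style eval via COCOEvaluator
# cfg.TEST.PROTOCOL = 'coco!'     # write COCO-style result files only, skip evaluation ('!' in protocol)
# cfg.TEST.PROTOCOL = 'dump'      # TestServer dumps detections back into a SeetaRecord dataset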
......@@ -17,496 +17,45 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import json
import os
import sys
import uuid
import cv2
import numpy as np
try:
import cPickle
except:
import pickle as cPickle
import dragon
from lib.core.config import cfg
from lib.datasets.imdb import imdb
from lib.datasets.voc_eval import voc_bbox_eval
from lib.datasets.voc_eval import voc_segm_eval
from lib.pycocotools.mask import encode as encode_masks
from lib.utils import boxes as box_utils
class TaaS(imdb):
def __init__(self, source):
imdb.__init__(self, 'taas')
self._classes = cfg.MODEL.CLASSES
self._source = source
self._class_to_ind = dict(zip(self.classes, range(self.num_classes)))
self._class_to_cat_id = self._class_to_ind
self._salt = str(uuid.uuid4())
self.config = {'cleanup': True, 'use_salt': True}
@property
def source(self):
excepted_source = self._source
if not os.path.exists(excepted_source):
raise RuntimeError(
'Expected source: {}, '
'but it does not exist.'
.format(excepted_source)
)
return excepted_source
##############################################
# #
# UTILS #
# #
##############################################
def _get_comp_id(self):
return '_' + self._salt if self.config['use_salt'] else ''
@classmethod
def _get_prefix(cls, type='bbox'):
if type == 'bbox':
return 'detections_'
elif type == 'segm':
return 'segmentations_'
elif type == 'kpt':
return 'keypoints_'
return ''
def _get_voc_results_T(self, results_folder, type='bbox'):
# experiments/model_id/results/detections_taas_<comp_id>_aeroplane.txt
if type == 'bbox':
filename = self._get_prefix(type) + self._name + self._get_comp_id() + '_{:s}.txt'
elif type == 'segm':
filename = self._get_prefix(type) + self._name + self._get_comp_id() + '_{:s}.pkl'
else:
raise ValueError('Type of results can be either bbox or segm.')
if not os.path.exists(results_folder):
os.makedirs(results_folder)
return os.path.join(results_folder, filename)
def _get_coco_annotations_T(self, results_folder, type='bbox'):
# experiments/model_id/annotations/[GT]detections_taas_<comp_id>.json
filename = '[GT]_' + self._get_prefix(type) + self._name + '.json'
if not os.path.exists(results_folder):
os.makedirs(results_folder)
return os.path.join(results_folder, filename)
def _get_coco_results_T(self, results_folder, type='bbox'):
# experiments/model_id/results/detections_taas_<comp_id>.json
filename = self._get_prefix(type) + self._name + self._get_comp_id() + '.json'
if not os.path.exists(results_folder):
os.makedirs(results_folder)
return os.path.join(results_folder, filename)
##############################################
# #
# VOC #
# #
##############################################
def _write_voc_bbox_results(self, all_boxes, gt_recs, output_dir):
for cls_ind, cls in enumerate(self.classes):
if cls == '__background__':
continue
print('Writing {} VOC format bbox results'.format(cls))
filename = self._get_voc_results_T(output_dir).format(cls)
with open(filename, 'wt') as f:
ix = 0
for image_id, rec in gt_recs.items():
dets = all_boxes[cls_ind][ix]
ix += 1
if len(dets) == 0:
imdb.__init__(self, source)
self._dataset = dragon.io.SeetaRecordDataset
self._num_images = self._dataset(self.source).size
def dump_detections(self, all_boxes, output_dir):
dataset = self._dataset(self.source)
for file in ('data.data', 'data.index', 'data.meta'):
file = os.path.join(output_dir, file)
if os.path.exists(file):
os.remove(file)
writer = dragon.io.SeetaRecordWriter(output_dir, dataset.protocol)
for i in range(len(dataset)):
example = dataset.get()
example['object'] = []
for cls_ind, cls in enumerate(self.classes):
if cls == '__background__':
continue
detections = all_boxes[cls_ind][i]
if len(detections) == 0:
continue
for k in range(detections.shape[0]):
if detections[k, -1] < cfg.VIS_TH:
continue
for k in range(dets.shape[0]):
content = '{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}' \
.format(image_id, dets[k, -1],
dets[k, 0] + 1, dets[k, 1] + 1,
dets[k, 2] + 1, dets[k, 3] + 1)
if dets.shape[1] == 6:
content += ' {:.2f}'.format(dets[k, 4])
f.write(content + '\n')
def _write_voc_segm_results(self, all_boxes, all_masks, output_dir):
for cls_inds, cls in enumerate(self.classes):
if cls == '__background__':
continue
print('Writing {} VOC format segm results'.format(cls))
segm_filename = self._get_voc_results_T(output_dir, type='segm').format(cls)
bbox_filename = segm_filename.replace('segmentations', 'detections')
with open(bbox_filename, 'wb') as f:
cPickle.dump(all_boxes[cls_inds], f, cPickle.HIGHEST_PROTOCOL)
with open(segm_filename, 'wb') as f:
cPickle.dump(all_masks[cls_inds], f, cPickle.HIGHEST_PROTOCOL)
def _do_voc_bbox_eval(self, gt_recs, output_dir, IoU=0.5, use_07_metric=True):
aps = []
print('VOC07 metric? ' + ('Yes' if use_07_metric else 'No'))
for i, cls in enumerate(self._classes):
if cls == '__background__':
continue
det_file = self._get_voc_results_T(output_dir).format(cls)
rec, prec, ap = voc_bbox_eval(
det_file, gt_recs, cls,
IoU=IoU, use_07_metric=use_07_metric,
)
if ap > 0:
aps += [ap]
print('AP for {} = {:.4f}'.format(cls, ap))
print('Mean AP = {:.4f}\n'.format(np.mean(aps)))
def _do_voc_segm_eval(self, gt_recs, output_dir, IoU=0.5, use_07_metric=True):
aps = []
print('VOC07 metric? ' + ('Yes' if use_07_metric else 'No'))
for i, cls in enumerate(self.classes):
if cls == '__background__':
continue
segm_filename = self._get_voc_results_T(output_dir, type='segm').format(cls)
bbox_filename = segm_filename.replace('segmentations', 'detections')
ap = voc_segm_eval(
bbox_filename, segm_filename, gt_recs, cls,
IoU=IoU, use_07_metric=use_07_metric,
)
if ap > 0:
aps += [ap]
print('AP for {} = {:.4f}'.format(cls, ap))
print('Mean AP = {:.4f}\n'.format(np.mean(aps)))
##############################################
# #
# COCO #
# #
##############################################
@classmethod
def _get_coco_image_id(cls, image_name):
image_id = image_name.split('_')[-1].split('.')[0]
try:
return int(image_id)
except:
return image_name
@classmethod
def _encode_coco_masks(cls, masks, boxes, im_h, im_w):
num_pred = len(boxes)
assert len(masks) == num_pred
mask_image = np.zeros((im_h, im_w, num_pred), dtype=np.uint8, order='F')
M = masks[0].shape[0]
scale = (M + 2.0) / M
ref_boxes = box_utils.expand_boxes(boxes, scale)
ref_boxes = ref_boxes.astype(np.int32)
padded_mask = np.zeros((M + 2, M + 2), dtype=np.float32)
for i in range(num_pred):
ref_box = ref_boxes[i, :4]
mask = masks[i]
padded_mask[1:-1, 1:-1] = mask[:, :]
w = ref_box[2] - ref_box[0] + 1
h = ref_box[3] - ref_box[1] + 1
w = np.maximum(w, 1)
h = np.maximum(h, 1)
mask = cv2.resize(padded_mask, (w, h))
mask = np.array(mask > cfg.TEST.BINARY_THRESH, dtype=np.uint8)
x1 = max(ref_box[0], 0)
y1 = max(ref_box[1], 0)
x2 = min(ref_box[2] + 1, im_w)
y2 = min(ref_box[3] + 1, im_h)
mask_image[y1:y2, x1:x2, i] = \
mask[(y1 - ref_box[1]):(y2 - ref_box[1]),
(x1 - ref_box[0]):(x2 - ref_box[0])]
return encode_masks(mask_image)
def _write_coco_bbox_annotations(self, gt_recs, output_dir):
# Build images
dataset = {'images': []}
for image_name, rec in gt_recs.items():
dataset['images'].append({
'file_name': image_name + '.jpg',
'id': self._get_coco_image_id(image_name),
'height': rec['height'], 'width': rec['width'],
})
# Build categories
dataset['categories'] = []
for cls in self._classes:
if cls == '__background__':
continue
dataset['categories'].append({
'name': cls,
'id': self._class_to_ind[cls],
})
# Build annotations
dataset['annotations'] = []
ann_id = 0
for image_name, rec in gt_recs.items():
for obj in rec['objects']:
x, y = obj['bbox'][0], obj['bbox'][1]
w, h = obj['bbox'][2] - x + 1, obj['bbox'][3] - y + 1
dataset['annotations'].append({
'id': str(ann_id),
'bbox': [x, y, w, h],
'area': w * h,
'iscrowd': obj['difficult'],
'image_id': self._get_coco_image_id(image_name),
'category_id': self._class_to_ind[obj['name']],
})
ann_id += 1
ann_file = self._get_coco_annotations_T(output_dir, type='bbox')
with open(ann_file, 'w') as f:
json.dump(dataset, f)
return ann_file
def _write_coco_segm_annotations(self, gt_recs, output_dir):
# Build images
dataset = {'images': []}
for image_name, rec in gt_recs.items():
dataset['images'].append({
'file_name': image_name + '.jpg',
'id': self._get_coco_image_id(image_name),
'height': rec['height'], 'width': rec['width'],
})
# Build categories
dataset['categories'] = []
for cls in self._classes:
if cls == '__background__':
continue
dataset['categories'].append({
'name': cls,
'id': self._class_to_ind[cls],
})
# Build annotations
dataset['annotations'] = []
ann_id = 0
for image_name, rec in gt_recs.items():
for obj in rec['objects']:
x, y = obj['bbox'][0], obj['bbox'][1]
w, h = obj['bbox'][2] - x + 1, obj['bbox'][3] - y + 1
dataset['annotations'].append({
'id': str(ann_id),
'bbox': [x, y, w, h],
'area': w * h,
'segmentation': {
'size': [rec['height'], rec['width']],
'counts': obj['mask'],
},
'iscrowd': obj['difficult'],
'image_id': self._get_coco_image_id(image_name),
'category_id': self._class_to_ind[obj['name']],
})
ann_id += 1
ann_file = self._get_coco_annotations_T(output_dir, type='segm')
with open(ann_file, 'w') as f:
json.dump(dataset, f)
return ann_file
def _coco_bbox_results_one_category(self, boxes, cat_id, gt_recs):
ix, results = 0, []
for image_name, rec in gt_recs.items():
dets = boxes[ix]
ix += 1
if isinstance(dets, list) and len(dets) == 0:
continue
dets = dets.astype(np.float)
scores = dets[:, -1]
xs = dets[:, 0]
ys = dets[:, 1]
ws = dets[:, 2] - xs + 1
hs = dets[:, 3] - ys + 1
results.extend(
[{'image_id': self._get_coco_image_id(image_name),
'category_id': cat_id,
'bbox': [xs[k], ys[k], ws[k], hs[k]],
'score': scores[k],
} for k in range(dets.shape[0])]
)
return results
def _coco_segm_results_one_category(self, boxes, masks, cat_id, gt_recs):
def filter_boxes(dets):
boxes = dets[:, :4]
ws = boxes[:, 2] - boxes[:, 0]
hs = boxes[:, 3] - boxes[:, 1]
keep = np.where((ws >= 1) & (hs >= 1))[0]
return keep
results = []
ix = 0
for image_name, rec in gt_recs.items():
dets = boxes[ix].astype(np.float)
msks = masks[ix]
ix += 1
keep = filter_boxes(dets)
im_h, im_w = rec['height'], rec['width']
if len(keep) == 0:
continue
scores = dets[:, -1]
mask_encode = self._encode_coco_masks(
msks[keep], dets[keep, :4], im_h, im_w)
for k in range(dets[keep].shape[0]):
rle = mask_encode[k]
if sys.version_info >= (3, 0):
rle['counts'] = rle['counts'].decode()
results.append({
'image_id': self._get_coco_image_id(image_name),
'category_id': cat_id,
'segmentation': rle,
'score': scores[k],
})
return results
def _write_coco_bbox_results(self, all_boxes, gt_recs, output_dir):
filename = self._get_coco_results_T(output_dir)
results = []
for cls_ind, cls in enumerate(self.classes):
if cls == '__background__':
continue
print('Collecting {} results ({:d}/{:d})'
.format(cls, cls_ind, self.num_classes - 1))
cat_id = self._class_to_cat_id[cls]
results.extend(self._coco_bbox_results_one_category(
all_boxes[cls_ind], cat_id, gt_recs))
print('Writing results json to {}'.format(filename))
with open(filename, 'w') as fid:
json.dump(results, fid)
return filename
def _write_coco_segm_results(self, all_boxes, all_masks, gt_recs, output_dir):
filename = self._get_coco_results_T(output_dir, type='segm')
results = []
for cls_ind, cls in enumerate(self.classes):
if cls == '__background__':
continue
print('Collecting {} results ({:d}/{:d})'
.format(cls, cls_ind, self.num_classes - 1))
cat_id = self._class_to_cat_id[cls]
results.extend(self._coco_segm_results_one_category(
all_boxes[cls_ind], all_masks[cls_ind], cat_id, gt_recs))
print('Writing results json to {}'.format(filename))
with open(filename, 'w') as fid:
json.dump(results, fid)
return filename
def _do_coco_bbox_eval(self, coco, res_file):
from lib.pycocotools.cocoeval import COCOeval
coco_dt = coco.loadRes(res_file)
coco_eval = COCOeval(coco, coco_dt, 'bbox')
coco_eval.evaluate()
coco_eval.accumulate()
self._print_coco_eval_results(coco_eval)
def _do_coco_segm_eval(self, coco, res_file):
from lib.pycocotools.cocoeval import COCOeval
coco_dt = coco.loadRes(res_file)
coco_eval = COCOeval(coco, coco_dt, 'segm')
coco_eval.evaluate()
coco_eval.accumulate()
self._print_coco_eval_results(coco_eval)
def _print_coco_eval_results(self, coco_eval):
IoU_lo_thresh = 0.5
IoU_hi_thresh = 0.95
def _get_thr_ind(coco_eval, thr):
ind = np.where((coco_eval.params.iouThrs > thr - 1e-5) &
(coco_eval.params.iouThrs < thr + 1e-5))[0][0]
iou_thr = coco_eval.params.iouThrs[ind]
assert np.isclose(iou_thr, thr)
return ind
ind_lo = _get_thr_ind(coco_eval, IoU_lo_thresh)
ind_hi = _get_thr_ind(coco_eval, IoU_hi_thresh)
# Precision has dims (iou, recall, cls, area range, max dets)
# Area range index 0: all area ranges
# Max dets index 2: 100 per image
precision = \
coco_eval.eval['precision'][ind_lo:(ind_hi + 1), :, :, 0, 2]
ap_default = np.mean(precision[precision > -1])
print('~~~~ Mean and per-category AP @ IoU=[{:.2f},{:.2f}] '
'~~~~'.format(IoU_lo_thresh, IoU_hi_thresh))
print('{:.1f}'.format(100 * ap_default))
for cls_ind, cls in enumerate(self.classes):
if cls == '__background__':
continue
# Minus 1 because of __background__
precision = coco_eval.eval['precision'][ind_lo:(ind_hi + 1), :, cls_ind - 1, 0, 2]
ap = np.mean(precision[precision > -1])
print('{:.1f}'.format(100 * ap))
print('~~~~ Summary metrics ~~~~')
coco_eval.summarize()
##############################################
# #
# EVAL-API #
# #
##############################################
def evaluate_detections(self, all_boxes, gt_recs, output_dir):
protocol = cfg.TEST.PROTOCOL
if 'voc' in protocol:
self._write_voc_bbox_results(all_boxes, gt_recs, output_dir)
if 'wo' not in protocol:
print('\n~~~~~~ Evaluation IoU@0.5 ~~~~~~')
self._do_voc_bbox_eval(
gt_recs, output_dir, IoU=0.5,
use_07_metric='2007' in protocol)
print('~~~~~~ Evaluation IoU@0.7 ~~~~~~')
self._do_voc_bbox_eval(
gt_recs, output_dir, IoU=0.7,
use_07_metric='2007' in protocol)
elif 'coco' in protocol:
from lib.pycocotools.coco import COCO
if os.path.exists(cfg.TEST.JSON_FILE):
coco = COCO(cfg.TEST.JSON_FILE)
# We should override category id before writing results
cats = coco.loadCats(coco.getCatIds())
self._class_to_cat_id = dict(zip(
[c['name'] for c in cats], coco.getCatIds()))
else:
coco = None
res_file = self._write_coco_bbox_results(
all_boxes, gt_recs, output_dir)
if 'wo' not in protocol:
if coco is None:
ann_file = self._write_coco_bbox_annotations(gt_recs, output_dir)
coco = COCO(ann_file)
self._do_coco_bbox_eval(coco, res_file)
def evaluate_segmentations(self, all_boxes, all_masks, gt_recs, output_dir):
protocol = cfg.TEST.PROTOCOL
if 'voc' in protocol:
self._write_voc_segm_results(all_boxes, all_masks, output_dir)
if 'wo' not in protocol:
print('\n~~~~~~ Evaluation IoU@0.5 ~~~~~~')
self._do_voc_segm_eval(
gt_recs, output_dir, IoU=0.5,
use_07_metric='2007' in protocol)
print('~~~~~~ Evaluation IoU@0.7 ~~~~~~')
self._do_voc_segm_eval(
gt_recs, output_dir, IoU=0.7,
use_07_metric='2007' in protocol)
elif 'coco' in protocol:
from lib.pycocotools.coco import COCO
if os.path.exists(cfg.TEST.JSON_FILE):
coco = COCO(cfg.TEST.JSON_FILE)
# We should override category id before writing results
cats = coco.loadCats(coco.getCatIds())
self._class_to_cat_id = dict(
zip([c['name'] for c in cats], coco.getCatIds()))
else:
coco = None
res_file = self._write_coco_segm_results(all_boxes, all_masks, gt_recs, output_dir)
if 'wo' not in protocol:
if coco is None:
coco = COCO(self._write_coco_segm_annotations(gt_recs, output_dir))
self._do_coco_segm_eval(coco, res_file)
def competition_mode(self, on):
if on:
self.config['use_salt'] = False
self.config['cleanup'] = False
else:
self.config['use_salt'] = True
self.config['cleanup'] = True
example['object'].append({
'name': cls,
'xmin': float(detections[k][0]),
'ymin': float(detections[k][1]),
'xmax': float(detections[k][2]),
'ymax': float(detections[k][3]),
'difficult': 0,
})
writer.write(example)
......@@ -20,15 +20,10 @@ from __future__ import print_function
import cv2
import numpy as np
try:
import cPickle
except:
import pickle as cPickle
from lib.core.config import cfg
from lib.pycocotools.mask_utils import mask_rle2im
from lib.utils import rotated_boxes
from lib.utils.boxes import expand_boxes
from lib.pycocotools import mask_utils
from lib.utils import boxes as box_util
from lib.utils.framework import pickle
from lib.utils.mask import mask_overlap
......@@ -66,15 +61,15 @@ def voc_bbox_eval(
det_file,
gt_recs,
cls_name,
IoU=0.5,
iou=0.5,
use_07_metric=False,
):
class_recs, n_pos = {}, 0
for image_name, rec in gt_recs.items():
R = [obj for obj in rec['objects'] if obj['name'] == cls_name]
bbox = np.array([x['bbox'] for x in R])
diff = np.array([x['difficult'] for x in R]).astype(np.bool)
det = [False] * len(R)
objects = [obj for obj in rec['objects'] if obj['name'] == cls_name]
bbox = np.array([x['bbox'] for x in objects])
diff = np.array([x['difficult'] for x in objects]).astype(np.bool)
det = [False] * len(objects)
n_pos = n_pos + sum(~diff)
class_recs[image_name] = {'bbox': bbox, 'difficult': diff, 'det': det}
......@@ -100,7 +95,7 @@ def voc_bbox_eval(
nd = len(image_ids)
tp, fp = np.zeros(nd), np.zeros(nd)
def overlaps4(bb, BBGT):
def compute_overlaps(bb, BBGT):
ixmin = np.maximum(BBGT[:, 0], bb[0])
iymin = np.maximum(BBGT[:, 1], bb[1])
ixmax = np.minimum(BBGT[:, 2], bb[2])
......@@ -114,9 +109,6 @@ def voc_bbox_eval(
(BBGT[:, 3] - BBGT[:, 1] + 1.) - inters)
return inters / uni
def overlaps5(bb, BBGT):
return rotated_boxes.bbox_overlaps(bb.reshape((1, 5)), BBGT)[0]
for d in range(nd):
R = class_recs[image_ids[d]]
bb = BB[d, :].astype(float)
......@@ -124,12 +116,11 @@ def voc_bbox_eval(
BBGT = R['bbox'].astype(float)
if BBGT.size > 0:
overlaps = overlaps4(bb, BBGT) \
if len(bb) == 4 else overlaps5(bb, BBGT)
overlaps = compute_overlaps(bb, BBGT)
ov_max = np.max(overlaps)
j_max = np.argmax(overlaps)
if ov_max > IoU:
if ov_max > iou:
if not R['difficult'][j_max]:
if not R['det'][j_max]:
tp[d] = 1.
......@@ -154,23 +145,29 @@ def voc_segm_eval(
seg_file,
gt_recs,
cls_name,
IoU=0.5,
iou=0.5,
use_07_metric=False,
):
# 0. Constants
M = cfg.MRCNN.RESOLUTION
binary_thresh = cfg.TEST.BINARY_THRESH
scale = (M + 2.0) / M
scale = (M + 2.) / M
padded_mask = np.zeros((M + 2, M + 2), dtype=np.float32)
# 1. Get bbox & mask ground truths
image_names, class_recs, n_pos = [], {}, 0
for image_name, rec in gt_recs.items():
R = [obj for obj in rec['objects'] if obj['name'] == cls_name]
bbox = np.array([x['bbox'] for x in R])
mask = np.array([mask_rle2im([x['mask']], rec['height'], rec['width'])[0] for x in R])
difficult = np.array([x['difficult'] for x in R]).astype(np.bool)
det = [False] * len(R)
objects = [obj for obj in rec['objects'] if obj['name'] == cls_name]
bbox = np.array([x['bbox'] for x in objects])
mask = np.array([
mask_utils.bytes2img(
x['mask'],
rec['height'],
rec['width']
) for x in objects]
)
difficult = np.array([x['difficult'] for x in objects]).astype(np.bool)
det = [False] * len(objects)
n_pos = n_pos + sum(~difficult)
class_recs[image_name] = {
'bbox': bbox,
......@@ -182,9 +179,9 @@ def voc_segm_eval(
# 2. Get predict pickle file for this class
with open(det_file, 'rb') as f:
boxes_pkl = cPickle.load(f)
boxes_pkl = pickle.load(f)
with open(seg_file, 'rb') as f:
masks_pkl = cPickle.load(f)
masks_pkl = pickle.load(f)
# 3. Pre-compute number of total instances to allocate memory
num_images = len(gt_recs)
......@@ -222,7 +219,7 @@ def voc_segm_eval(
fp = np.zeros((num_pred, 1))
tp = np.zeros((num_pred, 1))
ref_boxes = expand_boxes(new_boxes, scale)
ref_boxes = box_util.expand_boxes(new_boxes, scale)
ref_boxes = ref_boxes.astype(np.int32)
for i in range(num_pred):
......@@ -261,13 +258,19 @@ def voc_segm_eval(
crop_mask = R['mask'][j][gt_mask_bound[1]:gt_mask_bound[3] + 1,
gt_mask_bound[0]:gt_mask_bound[2] + 1]
ov = mask_overlap(gt_mask_bound, pred_mask_bound, crop_mask, pred_mask)
ov = \
mask_overlap(
gt_mask_bound,
pred_mask_bound,
crop_mask,
pred_mask,
)
if ov > ovmax:
ovmax = ov
jmax = j
if ovmax > IoU:
if ovmax > iou:
if not R['difficult'][jmax]:
if not R['det'][jmax]:
tp[i] = 1.
......@@ -281,7 +284,7 @@ def voc_segm_eval(
fp = np.cumsum(fp)
tp = np.cumsum(tp)
rec = tp / float(n_pos)
# avoid divide by zero in case the first matches a difficult gt
# Avoid divide by zero in case the first matches a difficult gt
prec = tp / np.maximum(fp + tp, np.finfo(np.float64).eps)
ap = voc_ap(rec, prec, use_07_metric=use_07_metric)
return ap
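voc_ap itself is defined elsewhere in voc_eval.py; for completeness, a sketch of the standard VOC recipe it presumably follows (11-point interpolation for the 2007 metric, precision-envelope integration otherwise), written against numpy arrays of recall and precision:

import numpy as np

def voc_ap(rec, prec, use_07_metric=False):
    if use_07_metric:
        # VOC2007 devkit: average the max precision at 11 evenly spaced recall levels
        ap = 0.
        for t in np.arange(0., 1.1, 0.1):
            p = np.max(prec[rec >= t]) if np.sum(rec >= t) > 0 else 0.
            ap += p / 11.
        return ap
    # Otherwise integrate the monotone precision envelope over recall
    mrec = np.concatenate(([0.], rec, [1.]))
    mpre = np.concatenate(([0.], prec, [0.]))
    for i in range(mpre.size - 1, 0, -1):
        mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
    idx = np.where(mrec[1:] != mrec[:-1])[0]
    return np.sum((mrec[idx + 1] - mrec[idx]) * mpre[idx + 1])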
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import numpy as np
from lib.datasets import voc_eval
from lib.utils.framework import pickle
class VOCEvaluator(object):
def __init__(self, imdb):
self.imdb = imdb
def do_bbox_eval(
self,
gt_recs,
output_dir,
iou=0.5,
use_07_metric=True,
):
aps = []
print('~~~~~~ Evaluation IoU@%s ~~~~~~' % str(iou))
print('VOC07 metric? ' + ('Yes' if use_07_metric else 'No'))
for i, cls in enumerate(self.imdb.classes):
if cls == '__background__':
continue
det_file = self.get_results_file(output_dir).format(cls)
rec, prec, ap = \
voc_eval.voc_bbox_eval(
det_file,
gt_recs, cls,
iou=iou,
use_07_metric=use_07_metric,
)
if ap > 0:
aps += [ap]
print('AP for {} = {:.4f}'.format(cls, ap))
print('Mean AP = {:.4f}\n'.format(np.mean(aps)))
def do_segm_eval(
self,
gt_recs,
output_dir,
iou=0.5,
use_07_metric=True,
):
aps = []
print('~~~~~~ Evaluation IoU@%s ~~~~~~' % str(iou))
print('VOC07 metric? ' + ('Yes' if use_07_metric else 'No'))
for i, cls in enumerate(self.imdb.classes):
if cls == '__background__':
continue
segm_filename = self.get_results_file(output_dir, 'segm').format(cls)
bbox_filename = segm_filename.replace('segmentations', 'detections')
ap = voc_eval.voc_segm_eval(
bbox_filename,
segm_filename,
gt_recs, cls,
iou=iou,
use_07_metric=use_07_metric,
)
if ap > 0:
aps += [ap]
print('AP for {} = {:.4f}'.format(cls, ap))
print('Mean AP = {:.4f}\n'.format(np.mean(aps)))
@staticmethod
def get_prefix(type='bbox'):
if type == 'bbox':
return 'detections'
elif type == 'segm':
return 'segmentations'
elif type == 'kpt':
return 'keypoints'
return ''
def get_results_file(self, results_folder, type='bbox'):
# experiments/model_id/results/detections_<comp_id>_<class_name>.txt
if type == 'bbox':
filename = self.get_prefix(type) + self.imdb.comp_id + '_{:s}.txt'
elif type == 'segm':
filename = self.get_prefix(type) + self.imdb.comp_id + '_{:s}.pkl'
else:
raise ValueError('Type of results can be either bbox or segm.')
if not os.path.exists(results_folder):
os.makedirs(results_folder)
return os.path.join(results_folder, filename)
def write_bbox_results(self, all_boxes, gt_recs, output_dir):
for cls_ind, cls in enumerate(self.imdb.classes):
if cls == '__background__':
continue
print('Writing {} VOC format bbox results'.format(cls))
filename = self.get_results_file(output_dir).format(cls)
with open(filename, 'wt') as f:
ix = 0
for image_id, rec in gt_recs.items():
dets = all_boxes[cls_ind][ix]
ix += 1
if len(dets) == 0:
continue
for k in range(dets.shape[0]):
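# PASCAL VOC results use 1-based pixel coordinates, hence the +1 offsets below.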
content = '{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}' \
.format(image_id, dets[k, -1],
dets[k, 0] + 1, dets[k, 1] + 1,
dets[k, 2] + 1, dets[k, 3] + 1)
if dets.shape[1] == 6:
content += ' {:.2f}'.format(dets[k, 4])
f.write(content + '\n')
def write_segm_results(self, all_boxes, all_masks, output_dir):
for cls_inds, cls in enumerate(self.imdb.classes):
if cls == '__background__':
continue
print('Writing {} VOC format segm results'.format(cls))
segm_filename = self.get_results_file(output_dir, 'segm').format(cls)
bbox_filename = segm_filename.replace('segmentations', 'detections')
with open(bbox_filename, 'wb') as f:
pickle.dump(all_boxes[cls_inds], f, pickle.HIGHEST_PROTOCOL)
with open(segm_filename, 'wb') as f:
pickle.dump(all_masks[cls_inds], f, pickle.HIGHEST_PROTOCOL)
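# A minimal usage sketch (assuming an ``imdb`` exposing ``classes`` and
# ``comp_id``, plus ``gt_recs``/``all_boxes`` produced by the test pipeline):
#
#   evaluator = VOCEvaluator(imdb)
#   evaluator.write_bbox_results(all_boxes, gt_recs, output_dir)
#   evaluator.do_bbox_eval(gt_recs, output_dir, iou=0.5, use_07_metric=True)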
......@@ -13,7 +13,11 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from lib.faster_rcnn.anchor_target_layer import AnchorTargetLayer
from lib.faster_rcnn.data_layer import DataLayer
from lib.faster_rcnn.proposal_layer import ProposalLayer
from lib.faster_rcnn.proposal_target_layer import ProposalTargetLayer
from lib.faster_rcnn.anchor_target import AnchorTarget
from lib.faster_rcnn.data_loader import DataLoader
from lib.faster_rcnn.proposal import Proposal
from lib.faster_rcnn.proposal_target import ProposalTarget
from lib.faster_rcnn.utils import generate_grid_anchors
from lib.faster_rcnn.utils import map_blobs_to_outputs
from lib.faster_rcnn.utils import map_rois_to_levels
from lib.faster_rcnn.utils import map_returns_to_blobs
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import numpy.random as npr
from lib.core.config import cfg
from lib.faster_rcnn.generate_anchors import generate_anchors
from lib.faster_rcnn.utils import generate_grid_anchors
from lib.utils import boxes as box_util
from lib.utils.framework import new_tensor
class AnchorTarget(object):
"""Assign ground-truth targets to anchors."""
def __init__(self):
super(AnchorTarget, self).__init__()
# Load the basic configs
self.scales = cfg.RPN.SCALES
self.strides = cfg.RPN.STRIDES
self.ratios = cfg.RPN.ASPECT_RATIOS
self.num_strides = len(self.strides)
self.allowed_border = cfg.TRAIN.RPN_STRADDLE_THRESH
# Generate base anchors
self.base_anchors = []
for i in range(self.num_strides):
self.base_anchors.append(
generate_anchors(
self.strides[i],
self.ratios,
np.array([self.scales[i]])
if self.num_strides > 1
else np.array(self.scales)
)
)
def __call__(self, features, gt_boxes, ims_info):
num_images = cfg.TRAIN.IMS_PER_BATCH
gt_boxes_wide = box_util.dismantle_boxes(gt_boxes, num_images)
# Generate grid anchors from base
all_anchors = \
generate_grid_anchors(
features,
self.base_anchors,
self.strides,
)
num_anchors = all_anchors.shape[0]
# Label: ``1`` is positive, ``0`` is negative, ``-1`` is don't care
labels_wide = -np.ones((num_images, num_anchors,), 'float32')
bbox_targets_wide = np.zeros((num_images, num_anchors, 4), 'float32')
bbox_inside_weights_wide = np.zeros_like(bbox_targets_wide, 'float32')
bbox_outside_weights_wide = np.zeros_like(bbox_targets_wide, 'float32')
for ix in range(num_images):
# GT boxes (x1, y1, x2, y2, label, ...)
gt_boxes = gt_boxes_wide[ix]
im_info = ims_info[ix]
if self.allowed_border >= 0:
# Only keep anchors inside the image
inds_inside = np.where(
(all_anchors[:, 0] >= -self.allowed_border) &
(all_anchors[:, 1] >= -self.allowed_border) &
(all_anchors[:, 2] < im_info[1] + self.allowed_border) &
(all_anchors[:, 3] < im_info[0] + self.allowed_border))[0]
anchors = all_anchors[inds_inside, :]
else:
inds_inside, anchors = np.arange(num_anchors), all_anchors
num_inside = len(inds_inside)
labels = np.empty((num_inside,), 'float32')
labels.fill(-1)
# Overlaps between the anchors and the gt boxes
overlaps = box_util.bbox_overlaps(anchors, gt_boxes)
argmax_overlaps = overlaps.argmax(axis=1)
max_overlaps = overlaps[np.arange(num_inside), argmax_overlaps]
gt_argmax_overlaps = overlaps.argmax(axis=0)
gt_max_overlaps = overlaps[gt_argmax_overlaps,
np.arange(overlaps.shape[1])]
gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]
# fg label: for each gt, anchor with highest overlap
labels[gt_argmax_overlaps] = 1
# fg label: above threshold IOU
labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1
# bg label: below threshold IOU
labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0
# Subsample positive labels if we have too many
num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCHSIZE)
fg_inds = np.where(labels == 1)[0]
if len(fg_inds) > num_fg:
disable_inds = npr.choice(fg_inds, len(fg_inds) - num_fg, False)
labels[disable_inds] = -1
fg_inds = np.where(labels == 1)[0]
# Subsample negative labels if we have too many
num_bg = cfg.TRAIN.RPN_BATCHSIZE - np.sum(labels == 1)
bg_inds = np.where(labels == 0)[0]
if len(bg_inds) > num_bg:
disable_inds = npr.choice(bg_inds, len(bg_inds) - num_bg, False)
labels[disable_inds] = -1
bbox_targets = np.zeros((num_inside, 4), 'float32')
bbox_targets[fg_inds, :] = \
box_util.bbox_transform(
anchors[fg_inds, :],
gt_boxes[argmax_overlaps[fg_inds], :4],
)
bbox_inside_weights = np.zeros((num_inside, 4), 'float32')
bbox_inside_weights[labels == 1, :] = np.array((1., 1., 1., 1.))
bbox_outside_weights = np.zeros((num_inside, 4), 'float32')
bbox_outside_weights[labels == 1, :] = np.ones((1, 4)) / cfg.TRAIN.RPN_BATCHSIZE
bbox_outside_weights[labels == 0, :] = np.ones((1, 4)) / cfg.TRAIN.RPN_BATCHSIZE
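# Note: this is the uniform weighting of the original RPN, i.e. the box
# regression loss is normalized by the total number of sampled anchors
# (RPN_BATCHSIZE) rather than by the number of positives alone.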
labels_wide[ix, inds_inside] = labels # label
bbox_targets_wide[ix, inds_inside] = bbox_targets
bbox_inside_weights_wide[ix, inds_inside] = bbox_inside_weights
bbox_outside_weights_wide[ix, inds_inside] = bbox_outside_weights
if self.num_strides > 1:
labels = labels_wide.reshape((num_images, num_anchors))
bbox_targets = bbox_targets_wide.transpose((0, 2, 1))
bbox_inside_weights = bbox_inside_weights_wide.transpose((0, 2, 1))
bbox_outside_weights = bbox_outside_weights_wide.transpose((0, 2, 1))
else:
A = self.base_anchors[0].shape[0]
height, width = features[0].shape[-2:]
labels = labels_wide \
.reshape((num_images, height, width, A)) \
.transpose(0, 3, 1, 2) \
.reshape((num_images, num_anchors))
bbox_targets = bbox_targets_wide \
.reshape((num_images, height, width, A * 4)) \
.transpose(0, 3, 1, 2)
bbox_inside_weights = bbox_inside_weights_wide \
.reshape((num_images, height, width, A * 4)) \
.transpose(0, 3, 1, 2)
bbox_outside_weights = bbox_outside_weights_wide \
.reshape((num_images, height, width, A * 4)) \
.transpose(0, 3, 1, 2)
return {
'labels': new_tensor(labels),
'bbox_targets': new_tensor(bbox_targets),
'bbox_inside_weights': new_tensor(bbox_inside_weights),
'bbox_outside_weights': new_tensor(bbox_outside_weights),
}
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import numpy.random as npr
import dragon.vm.torch as torch
from lib.core.config import cfg
from lib.faster_rcnn.generate_anchors import generate_anchors
from lib.utils import logger
from lib.utils.blob import array2tensor
from lib.utils.boxes import bbox_overlaps
from lib.utils.boxes import bbox_transform
from lib.utils.boxes import dismantle_gt_boxes
class AnchorTargetLayer(torch.nn.Module):
"""Assign anchors to ground-truth targets."""
def __init__(self):
super(AnchorTargetLayer, self).__init__()
# Load the basic configs
# C4 backbone takes the first stride
self.scales = cfg.RPN.SCALES
self.stride = cfg.RPN.STRIDES[0]
self.ratios = cfg.RPN.ASPECT_RATIOS
# Allow boxes to sit over the edge by a small amount
self._allowed_border = cfg.TRAIN.RPN_STRADDLE_THRESH
# Generate base anchors
self.base_anchors = generate_anchors(
base_size=self.stride,
ratios=self.ratios,
scales=np.array(self.scales),
)
def forward(self, features, gt_boxes, ims_info):
"""Produces anchor classification labels and bounding-box regression targets.
Parameters
----------
features : sequence of dragon.vm.torch.Tensor
The features of specific conv layers.
gt_boxes : numpy.ndarray
The packed ground-truth boxes.
ims_info : numpy.ndarray
The information of input images.
"""
num_images = cfg.TRAIN.IMS_PER_BATCH
gt_boxes_wide = dismantle_gt_boxes(gt_boxes, num_images)
if len(gt_boxes_wide) != num_images:
logger.fatal(
'Input {} images, got {} slices of gt boxes.'
.format(num_images, len(gt_boxes_wide))
)
# Generate proposals from shifted anchors
height, width = features[0].shape[-2:]
shift_x = np.arange(0, width) * self.stride
shift_y = np.arange(0, height) * self.stride
shift_x, shift_y = np.meshgrid(shift_x, shift_y)
shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
shift_x.ravel(), shift_y.ravel())).transpose()
# Add A anchors (1, A, 4) to
# cell K shifts (K, 1, 4) to get
# shift anchors (K, A, 4)
# Reshape to (K * A, 4) shifted anchors
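# e.g., with 3 ratios x 3 scales (A = 9) and a 38 x 50 feature map
# (K = 1900), this yields an all_anchors array of shape (17100, 4).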
A = self.base_anchors.shape[0]
K = shifts.shape[0]
all_anchors = (self.base_anchors.reshape((1, A, 4)) +
shifts.reshape((1, K, 4)).transpose((1, 0, 2)))
all_anchors = all_anchors.reshape((K * A, 4))
total_anchors = int(K * A)
# label: 1 is positive, 0 is negative, -1 is don't care
all_labels = -np.ones((num_images, total_anchors,), dtype=np.float32)
all_bbox_targets = np.zeros((num_images, total_anchors, 4), dtype=np.float32)
all_bbox_inside_weights = np.zeros_like(all_bbox_targets, dtype=np.float32)
all_bbox_outside_weights = np.zeros_like(all_bbox_targets, dtype=np.float32)
for ix in range(num_images):
# GT boxes (x1, y1, x2, y2, label)
gt_boxes = gt_boxes_wide[ix]
im_info = ims_info[ix]
if self._allowed_border >= 0:
# Only keep anchors inside the image
inds_inside = np.where(
(all_anchors[:, 0] >= -self._allowed_border) &
(all_anchors[:, 1] >= -self._allowed_border) &
(all_anchors[:, 2] < im_info[1] + self._allowed_border) &
(all_anchors[:, 3] < im_info[0] + self._allowed_border))[0]
anchors = all_anchors[inds_inside, :]
else:
inds_inside = np.arange(all_anchors.shape[0])
anchors = all_anchors
num_inside = len(inds_inside)
# label: 1 is positive, 0 is negative, -1 is don't care
labels = np.empty((num_inside,), dtype=np.float32)
labels.fill(-1)
# Overlaps between the anchors and the gt boxes
overlaps = bbox_overlaps(anchors, gt_boxes)
argmax_overlaps = overlaps.argmax(axis=1)
max_overlaps = overlaps[np.arange(num_inside), argmax_overlaps]
gt_argmax_overlaps = overlaps.argmax(axis=0)
gt_max_overlaps = overlaps[gt_argmax_overlaps, np.arange(overlaps.shape[1])]
gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]
if not cfg.TRAIN.RPN_CLOBBER_POSITIVES:
# Assign bg labels first so that positive labels can clobber them
labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0
# fg label: for each gt, anchor with highest overlap
labels[gt_argmax_overlaps] = 1
# fg label: above threshold IOU
labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1
if cfg.TRAIN.RPN_CLOBBER_POSITIVES:
# Assign bg labels last so that negative labels can clobber positives
labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0
# Subsample positive labels if we have too many
num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCHSIZE)
fg_inds = np.where(labels == 1)[0]
if len(fg_inds) > num_fg:
disable_inds = npr.choice(
fg_inds,
size=len(fg_inds) - num_fg,
replace=False,
)
labels[disable_inds] = -1
fg_inds = np.where(labels == 1)[0]
# Subsample negative labels if we have too many
num_bg = cfg.TRAIN.RPN_BATCHSIZE - np.sum(labels == 1)
bg_inds = np.where(labels == 0)[0]
if len(bg_inds) > num_bg:
disable_inds = npr.choice(
bg_inds,
size=len(bg_inds) - num_bg,
replace=False,
)
labels[disable_inds] = -1
bbox_targets = np.zeros((num_inside, 4), dtype=np.float32)
bbox_targets[fg_inds, :] = bbox_transform(
ex_rois=anchors[fg_inds, :],
gt_rois=gt_boxes[argmax_overlaps[fg_inds], :4],
)
bbox_inside_weights = np.zeros((num_inside, 4), dtype=np.float32)
bbox_inside_weights[labels == 1, :] = np.array((1., 1., 1., 1.))
bbox_outside_weights = np.zeros((num_inside, 4), dtype=np.float32)
bbox_outside_weights[labels == 1, :] = np.ones((1, 4)) / cfg.TRAIN.RPN_BATCHSIZE
bbox_outside_weights[labels == 0, :] = np.ones((1, 4)) / cfg.TRAIN.RPN_BATCHSIZE
all_labels[ix, inds_inside] = labels # label
all_bbox_targets[ix, inds_inside] = bbox_targets
all_bbox_inside_weights[ix, inds_inside] = bbox_inside_weights
all_bbox_outside_weights[ix, inds_inside] = bbox_outside_weights
labels = all_labels \
.reshape((num_images, height, width, A)) \
.transpose(0, 3, 1, 2) \
.reshape((num_images, total_anchors))
bbox_targets = all_bbox_targets \
.reshape((num_images, height, width, A * 4)) \
.transpose(0, 3, 1, 2)
bbox_inside_weights = all_bbox_inside_weights \
.reshape((num_images, height, width, A * 4)) \
.transpose(0, 3, 1, 2)
bbox_outside_weights = all_bbox_outside_weights \
.reshape((num_images, height, width, A * 4)) \
.transpose(0, 3, 1, 2)
return {
'labels': array2tensor(labels),
'bbox_targets': array2tensor(bbox_targets),
'bbox_inside_weights': array2tensor(bbox_inside_weights),
'bbox_outside_weights': array2tensor(bbox_outside_weights),
}
......@@ -27,11 +27,11 @@ from lib.utils import logger
from lib.utils.blob import im_list_to_blob
class DataLayer(torch.nn.Module):
"""Generate a mini-batch of data."""
class DataLoader(object):
"""Provide mini-batches of data."""
def __init__(self):
super(DataLayer, self).__init__()
super(DataLoader, self).__init__()
database = get_imdb(cfg.TRAIN.DATABASE)
self.data_batch = DataBatch(**{
'dataset': lambda: dragon.io.SeetaRecordDataset(database.source),
......@@ -39,12 +39,11 @@ class DataLayer(torch.nn.Module):
'shuffle': cfg.TRAIN.USE_SHUFFLE,
'num_chunks': cfg.TRAIN.NUM_SHUFFLE_CHUNKS,
'batch_size': cfg.TRAIN.IMS_PER_BATCH * 2,
'num_transformers': cfg.TRAIN.NUM_WORKERS,
})
def forward(self):
# Get an array blob from the Queue
def __call__(self):
outputs = self.data_batch.get()
# Zero-Copy the array to tensor
outputs['data'] = torch.from_numpy(outputs['data'])
return outputs
......@@ -59,14 +58,16 @@ class DataBatch(mp.Process):
----------
dataset : lambda
The creator of a dataset.
classes : Sequence[str]
The class names.
shuffle : bool, optional, default=False
Whether to shuffle the data.
num_chunks : int, optional, default=0
The number of chunks to split.
batch_size : int, optional, default=2
The size of a mini-batch.
prefetch : int, optional, default=5
The prefetch count.
num_transformers : int, optional, default=3
The number of workers to transform data.
"""
super(DataBatch, self).__init__()
......@@ -83,20 +84,10 @@ class DataBatch(mp.Process):
self._prefetch = kwargs.get('prefetch', 5)
self._batch_size = kwargs.get('batch_size', 2)
self._num_readers = kwargs.get('num_readers', 1)
self._num_transformers = kwargs.get('num_transformers', -1)
self._max_transformers = kwargs.get('max_transformers', 3)
self._num_transformers = kwargs.get('num_transformers', 3)
self._num_fetchers = kwargs.get('num_fetchers', 1)
self.daemon = True
# Io-Aware Policy
if self._num_transformers == -1:
self._num_transformers = 2
# Add 1 transformer for color augmentation
if cfg.TRAIN.USE_COLOR_JITTER:
self._num_transformers += 1
self._num_transformers = min(
self._num_transformers, self._max_transformers)
# Initialize queues
num_batches = self._prefetch * self._num_readers
self.Q1 = mp.Queue(num_batches * self._batch_size)
......
......@@ -19,9 +19,9 @@ import cv2
import numpy as np
from lib.core.config import cfg
from lib.utils import rotated_boxes
from lib.datasets.example import Example
from lib.utils import boxes as box_util
from lib.utils.blob import prep_im_for_blob
from lib.utils.boxes import flip_boxes
from lib.utils.image import get_image_with_target_size
......@@ -44,32 +44,32 @@ class DataTransformer(multiprocessing.Process):
apply_flip=False,
offsets=None,
):
n_objects = 0
objects, n_objects = example.objects, 0
height, width = example.height, example.width
if not self._use_diff:
for obj in example['object']:
for obj in objects:
if obj.get('difficult', 0) == 0:
n_objects += 1
else:
n_objects = len(example['object'])
n_objects = len(objects)
roi_dict = {
'width': example['width'],
'height': example['height'],
'gt_classes': np.zeros((n_objects,), 'int32'),
'boxes': np.zeros((n_objects, 4), 'float32'),
'gt_classes': np.zeros((n_objects,), 'int32'),
}
# Filter the difficult instances
object_idx = 0
for obj in example['object']:
for obj in objects:
if not self._use_diff and \
obj.get('difficult', 0) > 0:
continue
bbox = obj['bbox']
roi_dict['boxes'][object_idx, :] = [
max(0, obj['xmin']),
max(0, obj['ymin']),
min(obj['xmax'], example['width'] - 1),
min(obj['ymax'], example['height'] - 1),
max(0, bbox[0]),
max(0, bbox[1]),
min(bbox[2], width - 1),
min(bbox[3], height - 1),
]
roi_dict['gt_classes'][object_idx] = \
self._class_to_ind[obj['name']]
......@@ -77,8 +77,11 @@ class DataTransformer(multiprocessing.Process):
# Flip the boxes if necessary
if apply_flip:
roi_dict['boxes'] = flip_boxes(
roi_dict['boxes'], roi_dict['width'])
roi_dict['boxes'] = \
box_util.flip_boxes(
roi_dict['boxes'],
width,
)
# Scale the boxes to the detecting scale
roi_dict['boxes'] *= im_scale
......@@ -94,61 +97,32 @@ class DataTransformer(multiprocessing.Process):
return roi_dict
@classmethod
def get_image(cls, example):
img = np.frombuffer(example['content'], np.uint8)
return cv2.imdecode(img, -1)
@classmethod
def get_annotations(cls, example):
objects = []
for ix, obj in enumerate(example['object']):
if 'x3' in obj:
bbox = rotated_boxes.vertices2box(
[obj['x1'], obj['y1'],
obj['x2'], obj['y2'],
obj['x3'], obj['y3'],
obj['x4'], obj['y4']]
)
elif 'x2' in obj:
bbox = [obj['x1'], obj['y1'], obj['x2'], obj['y2']]
elif 'xmin' in obj:
bbox = [obj['xmin'], obj['ymin'], obj['xmax'], obj['ymax']]
else:
bbox = obj['bbox']
objects.append({
'name': obj['name'],
'difficult': obj.get('difficult', 0),
'bbox': bbox,
})
return example['id'], objects
def get(self, example):
img = np.frombuffer(example['content'], np.uint8)
img = cv2.imdecode(img, 1)
example = Example(example)
img = example.image
# Scale
scale_indices = np.random.randint(len(cfg.TRAIN.SCALES))
target_size = cfg.TRAIN.SCALES[scale_indices]
im, im_scale, jitter = prep_im_for_blob(img, target_size, cfg.TRAIN.MAX_SIZE)
max_size = cfg.TRAIN.MAX_SIZE
target_size = cfg.TRAIN.SCALES[np.random.randint(len(cfg.TRAIN.SCALES))]
img, im_scale, jitter = prep_im_for_blob(img, target_size, max_size)
# Flip
apply_flip = False
if self._use_flipped:
if np.random.randint(2) > 0:
im = im[:, ::-1, :]
img = img[:, ::-1]
apply_flip = True
# Random Crop or RandomPad
offsets = None
if cfg.TRAIN.MAX_SIZE > 0:
if jitter != 1.0:
if jitter != 1:
# To a rectangle (scale, max_size)
target_size = (np.array(im.shape[0:2]) / jitter).astype(np.int)
im, offsets = get_image_with_target_size(target_size, im)
target_size = (np.array(img.shape[:2]) / jitter).astype(np.int32)
img, offsets = get_image_with_target_size(target_size, img)
else:
# To a square (target_size, target_size)
im, offsets = get_image_with_target_size([target_size] * 2, im)
img, offsets = get_image_with_target_size([target_size] * 2, img)
# Example -> RoIDict
roi_dict = self.make_roi_dict(example, im_scale, apply_flip, offsets)
......@@ -158,7 +132,7 @@ class DataTransformer(multiprocessing.Process):
gt_boxes = np.empty((len(roi_dict['gt_classes']), 5), dtype=np.float32)
gt_boxes[:, :4], gt_boxes[:, 4] = roi_dict['boxes'], roi_dict['gt_classes']
return im, im_scale, gt_boxes
return img, im_scale, gt_boxes
def run(self):
# Fix the process-local random seed
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import numpy as np
from lib.core.config import cfg
from lib.faster_rcnn.generate_anchors import generate_anchors
from lib.faster_rcnn.utils import generate_grid_anchors
from lib.nms import nms_wrapper
from lib.utils import boxes as box_util
class Proposal(object):
"""Compute proposals by applying transformations anchors."""
def __init__(self):
super(Proposal, self).__init__()
# Load the basic configs
self.scales = cfg.RPN.SCALES
self.strides = cfg.RPN.STRIDES
self.ratios = cfg.RPN.ASPECT_RATIOS
self.num_strides = len(self.strides)
self.defaults = collections.OrderedDict([
('rois', np.array([[-1, 0, 0, 1, 1]], 'float32')),
])
# Generate base anchors
self.base_anchors = []
for i in range(self.num_strides):
self.base_anchors.append(
generate_anchors(
self.strides[i],
self.ratios,
np.array([self.scales[i]])
if self.num_strides > 1
else np.array(self.scales)
)
)
def __call__(self, features, cls_prob, bbox_pred, ims_info):
pre_nms_top_n = cfg.TRAIN.RPN_PRE_NMS_TOP_N
post_nms_top_n = cfg.TRAIN.RPN_POST_NMS_TOP_N
nms_thresh = cfg.TRAIN.RPN_NMS_THRESH
min_size = cfg.TRAIN.RPN_MIN_SIZE
# Get resources
num_images = ims_info.shape[0]
all_anchors = \
generate_grid_anchors(
features,
self.base_anchors,
self.strides,
)
# Prepare for the outputs
batch_rois = []
cls_prob = cls_prob.numpy(True)
bbox_pred = bbox_pred.numpy(True)
if self.num_strides > 1:
# (?, 4, A * K) -> (?, A * K, 4)
bbox_pred = bbox_pred.transpose((0, 2, 1))
else:
# (?, A * 4, H, W) -> (?, H, W, A * 4)
cls_prob = cls_prob.transpose((0, 2, 3, 1))
bbox_pred = bbox_pred.transpose((0, 2, 3, 1))
# Extract RoIs separately
for ix in range(num_images):
# [?, N] -> [? * N, 1]
scores = cls_prob[ix].reshape((-1, 1))
if self.num_strides > 1:
deltas = bbox_pred[ix]
else:
deltas = bbox_pred[ix].reshape((-1, 4))
if pre_nms_top_n <= 0 or pre_nms_top_n >= len(scores):
order = np.argsort(-scores.squeeze())
else:
# Avoid sorting possibly large arrays; First partition to get top K
# unsorted and then sort just those (~20x faster for 200k scores)
inds = np.argpartition(-scores.squeeze(), pre_nms_top_n)[:pre_nms_top_n]
order = np.argsort(-scores[inds].squeeze())
order = inds[order]
deltas = deltas[order]
anchors = all_anchors[order]
scores = scores[order]
# Convert anchors into proposals via bbox transformations
proposals = box_util.bbox_transform_inv(anchors, deltas)
# Clip predicted boxes to image
proposals = box_util.clip_tiled_boxes(proposals, ims_info[ix, :2])
# Remove predicted boxes with either height or width < threshold
keep = box_util.filter_boxes(proposals, min_size * ims_info[ix, 2])
proposals = proposals[keep, :]
scores = scores[keep]
# Apply nms (e.g. threshold = 0.7)
# Take after_nms_topN (e.g. 300)
# Return the top proposals (-> RoIs top)
keep = nms_wrapper.nms(np.hstack((proposals, scores)), nms_thresh)
if post_nms_top_n > 0:
keep = keep[:post_nms_top_n]
proposals = proposals[keep, :]
# Attach RoIs with batch indices
batch_inds = np.empty((proposals.shape[0], 1), 'float32')
batch_inds.fill(ix)
rpn_rois = np.hstack((batch_inds, proposals.astype('float32', copy=False)))
batch_rois.append(rpn_rois)
# Merge RoIs into a blob
return np.concatenate(batch_rois, 0)
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# --------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon.vm.torch as torch
import numpy as np
from lib.core.config import cfg
from lib.faster_rcnn.generate_anchors import generate_anchors
from lib.nms import nms_wrapper
from lib.utils.blob import array2tensor
from lib.utils.boxes import bbox_transform_inv
from lib.utils.boxes import clip_tiled_boxes
from lib.utils.boxes import filter_boxes
class ProposalLayer(torch.nn.Module):
"""Compute proposals by applying transformations to anchors."""
def __init__(self):
super(ProposalLayer, self).__init__()
# Load the basic configs
self.scales = cfg.RPN.SCALES
self.stride = cfg.RPN.STRIDES[0]
self.ratios = cfg.RPN.ASPECT_RATIOS
# Generate base anchors
self.base_anchors = generate_anchors(
base_size=self.stride,
ratios=self.ratios,
scales=np.array(self.scales),
)
def forward(self, features, cls_prob, bbox_pred, ims_info):
cfg_key = 'TRAIN' if self.training else 'TEST'
pre_nms_top_n = cfg[cfg_key].RPN_PRE_NMS_TOP_N
post_nms_top_n = cfg[cfg_key].RPN_POST_NMS_TOP_N
nms_thresh = cfg[cfg_key].RPN_NMS_THRESH
min_size = cfg[cfg_key].RPN_MIN_SIZE
# Get resources
num_images = ims_info.shape[0]
# Generate proposals from shifted anchors
height, width = cls_prob.shape[-2:]
shift_x = np.arange(0, width) * self.stride
shift_y = np.arange(0, height) * self.stride
shift_x, shift_y = np.meshgrid(shift_x, shift_y)
shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
shift_x.ravel(), shift_y.ravel())).transpose()
# Add A anchors (1, A, 4) to
# cell K shifts (K, 1, 4) to get
# shift anchors (K, A, 4)
# Reshape to (K * A, 4) shifted anchors
A = self.base_anchors.shape[0]
K = shifts.shape[0]
anchors = \
self.base_anchors.reshape((1, A, 4)) + \
shifts.reshape((1, K, 4)).transpose((1, 0, 2))
all_anchors = anchors.reshape((K * A, 4))
# Prepare for the outputs
batch_rois = []
# scores & deltas are (1, A, H, W) format
# Transpose to (1, H, W, A)
batch_scores = cls_prob.numpy(True).transpose((0, 2, 3, 1))
batch_deltas = bbox_pred.numpy(True).transpose((0, 2, 3, 1))
# Extract RoIs separately
for ix in range(num_images):
scores = batch_scores[ix].reshape((-1, 1)) # [1, n] -> [n, 1]
deltas = batch_deltas[ix].reshape((-1, 4))
if pre_nms_top_n <= 0 or pre_nms_top_n >= len(scores):
order = np.argsort(-scores.squeeze())
else:
# Avoid sorting possibly large arrays; First partition to get top K
# unsorted and then sort just those (~20x faster for 200k scores)
inds = np.argpartition(-scores.squeeze(), pre_nms_top_n)[:pre_nms_top_n]
order = np.argsort(-scores[inds].squeeze())
order = inds[order]
deltas = deltas[order]
anchors = all_anchors[order]
scores = scores[order]
# 1. Convert anchors into proposals via bbox transformations
proposals = bbox_transform_inv(anchors, deltas)
# 2. Clip predicted boxes to image
proposals = clip_tiled_boxes(proposals, ims_info[ix, :2])
# 3. remove predicted boxes with either height or width < threshold
# (NOTE: convert min_size to input image scale stored in im_info[2])
keep = filter_boxes(proposals, min_size * ims_info[ix, 2])
proposals = proposals[keep, :]
scores = scores[keep]
# 6. Apply nms (e.g. threshold = 0.7)
# 7. Take after_nms_top_n (e.g. 300)
# 8. Return the top proposals (-> RoIs top)
keep = nms_wrapper.nms(np.hstack((proposals, scores)), nms_thresh)
if post_nms_top_n > 0:
keep = keep[:post_nms_top_n]
proposals = proposals[keep, :]
# Output rois blob
batch_inds = np.empty((proposals.shape[0], 1), dtype=np.float32)
batch_inds.fill(ix)
rpn_rois = np.hstack((batch_inds, proposals.astype(np.float32, copy=False)))
batch_rois.append(rpn_rois)
# Merge RoIs into a blob
rpn_rois = np.concatenate(batch_rois, axis=0)
if cfg_key == 'TRAIN':
return rpn_rois
else:
return [array2tensor(rpn_rois)]
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import numpy as np
import numpy.random as npr
from lib.core.config import cfg
from lib.faster_rcnn.utils import map_blobs_to_outputs
from lib.faster_rcnn.utils import map_returns_to_blobs
from lib.faster_rcnn.utils import map_rois_to_levels
from lib.utils import boxes as box_util
from lib.utils.framework import new_tensor
class ProposalTarget(object):
"""Assign ground-truth targets to proposals."""
def __init__(self):
super(ProposalTarget, self).__init__()
self.num_strides = len(cfg.RPN.STRIDES)
self.num_classes = cfg.MODEL.NUM_CLASSES
self.defaults = collections.OrderedDict([
('rois', np.array([[-1, 0, 0, 1, 1]], 'float32')),
('labels', np.array([-1], 'float32')),
('bbox_targets', np.zeros((1, self.num_classes * 4), 'float32')),
('bbox_inside_weights', np.zeros((1, self.num_classes * 4), 'float32')),
('bbox_outside_weights', np.zeros((1, self.num_classes * 4), 'float32')),
])
def __call__(self, rpn_rois, gt_boxes):
num_images = cfg.TRAIN.IMS_PER_BATCH
# Proposal ROIs (0, x1, y1, x2, y2) coming from RPN
all_rois = rpn_rois
# GT boxes (x1, y1, x2, y2, label)
gt_boxes_wide = box_util.dismantle_boxes(gt_boxes, num_images)
# Prepare for the outputs
keys = self.defaults.keys()
blobs = dict(map(lambda a, b: (a, b), keys, [[] for _ in keys]))
# Generate targets separately
for ix in range(num_images):
gt_boxes = gt_boxes_wide[ix]
# Extract proposals for this image
rois = all_rois[np.where(all_rois[:, 0].astype('int32') == ix)[0]]
# Include ground-truth boxes in the set of candidate rois
inds = np.ones((gt_boxes.shape[0], 1), gt_boxes.dtype) * ix
rois = np.vstack((rois, np.hstack((inds, gt_boxes[:, :4]))))
# Sample a batch of RoIs for training
rois_per_image = cfg.TRAIN.BATCH_SIZE
fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image)
map_returns_to_blobs(
sample_rois(
rois,
gt_boxes,
rois_per_image,
fg_rois_per_image,
self.num_classes,
), blobs, keys,
)
# Stack into continuous blobs
for k, v in blobs.items():
blobs[k] = np.concatenate(blobs[k], 0)
if self.num_strides > 1:
# Distribute RoIs into pyramids
min_lvl = cfg.FPN.ROI_MIN_LEVEL
max_lvl = cfg.FPN.ROI_MAX_LEVEL
k = max_lvl - min_lvl + 1
levels = map_rois_to_levels(blobs['rois'], min_lvl, max_lvl)
outputs = map_blobs_to_outputs(
blobs,
self.defaults,
[np.where(levels == (i + min_lvl))[0] for i in range(k)],
)
return {
'rois': [new_tensor(outputs['rois'][i]) for i in range(k)],
'labels': new_tensor(np.concatenate(outputs['labels'], 0)),
'bbox_targets': new_tensor(np.vstack(outputs['bbox_targets'])),
'bbox_inside_weights': new_tensor(np.vstack(outputs['bbox_inside_weights'])),
'bbox_outside_weights': new_tensor(np.vstack(outputs['bbox_outside_weights'])),
}
else:
# Return RoIs directly for CX-stride
return {
'rois': [new_tensor(blobs['rois'])],
'labels': new_tensor(blobs['labels']),
'bbox_targets': new_tensor(blobs['bbox_targets']),
'bbox_inside_weights': new_tensor(blobs['bbox_inside_weights']),
'bbox_outside_weights': new_tensor(blobs['bbox_outside_weights']),
}
def get_targets(ex_rois, gt_rois, gt_labels, num_classes):
"""Compute bounding-box regression targets for an image."""
assert ex_rois.shape[0] == gt_rois.shape[0]
assert ex_rois.shape[1] == 4
assert gt_rois.shape[1] == 4
# Compute bbox regression targets
fg_inds = np.where(gt_labels > 0)[0]
targets = box_util.bbox_transform(ex_rois, gt_rois, cfg.BBOX_REG_WEIGHTS)
bbox_targets = np.zeros((ex_rois.shape[0], 4 * num_classes), 'float32')
inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32)
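# e.g., with 21 VOC classes (background + 20), the expanded targets have
# shape (N, 84); the loop below fills only the 4 columns belonging to each
# foreground RoI's assigned class.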
for i in fg_inds:
start = int(4 * gt_labels[i])
bbox_targets[i, start:start + 4] = targets[i]
inside_weights[i, start:start + 4] = (1., 1., 1., 1.)
outside_weights = np.array(inside_weights > 0).astype('float32')
return bbox_targets, inside_weights, outside_weights
def sample_rois(
all_rois,
gt_boxes,
num_rois,
num_fg_rois,
num_classes,
):
"""Sample a batch of RoIs comprising foreground and background examples."""
overlaps = box_util.bbox_overlaps(all_rois[:, 1:5], gt_boxes[:, :4])
gt_assignment = overlaps.argmax(axis=1)
max_overlaps = overlaps.max(axis=1)
labels = gt_boxes[gt_assignment, 4]
# Select foreground RoIs as those with >= FG_THRESH overlap
fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0]
fg_rois_per_this_image = int(min(num_fg_rois, fg_inds.size))
# Sample foreground regions without replacement
if fg_inds.size > 0:
fg_inds = npr.choice(fg_inds, fg_rois_per_this_image, False)
# Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI) &
(max_overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]
# Compute number of background RoIs to take from this image
bg_rois_per_this_image = num_rois - fg_rois_per_this_image
bg_rois_per_this_image = min(bg_rois_per_this_image, bg_inds.size)
# Sample background regions without replacement
if bg_inds.size > 0:
bg_inds = npr.choice(bg_inds, bg_rois_per_this_image, False)
# The indices that we're selecting (both fg and bg)
keep_inds = np.append(fg_inds, bg_inds)
# Select sampled values from various arrays
rois, labels = all_rois[keep_inds], labels[keep_inds]
# Clamp labels for the background RoIs to 0
labels[fg_rois_per_this_image:] = 0
# Clamp the image indices for the background RoIs to -1
rois[fg_rois_per_this_image:, 0] = -1
# Compute the target from RoIs
outputs = [rois, labels]
outputs += get_targets(
rois[:, 1:5],
gt_boxes[gt_assignment[keep_inds], :4],
labels,
num_classes,
)
return outputs
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# --------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon.vm.torch as torch
import numpy as np
import numpy.random as npr
from lib.core.config import cfg
from lib.utils.blob import array2tensor
from lib.utils.boxes import bbox_overlaps
from lib.utils.boxes import bbox_transform
from lib.utils.boxes import dismantle_gt_boxes
class ProposalTargetLayer(torch.nn.Module):
"""Assign object detection proposals to ground-truth targets."""
def __init__(self):
super(ProposalTargetLayer, self).__init__()
self.num_classes = cfg.MODEL.NUM_CLASSES
def forward(self, rpn_rois, gt_boxes):
num_images = cfg.TRAIN.IMS_PER_BATCH
# Proposal ROIs (0, x1, y1, x2, y2) coming from RPN
# (i.e., rpn.proposal_layer.ProposalLayer), or any other source
all_rois = rpn_rois
# GT boxes (x1, y1, x2, y2, label)
gt_boxes_wide = dismantle_gt_boxes(gt_boxes, num_images)
# Prepare for the outputs
keys = ['labels', 'rois', 'bbox_targets',
'bbox_inside_weights', 'bbox_outside_weights']
batch_outputs = dict(map(lambda a, b: (a, b), keys, [[] for _ in keys]))
# Generate targets separately
for ix in range(num_images):
gt_boxes = gt_boxes_wide[ix]
# Extract proposals for this image
rois = all_rois[np.where(all_rois[:, 0].astype(np.int32) == ix)[0]]
# Include ground-truth boxes in the set of candidate rois
inds = np.ones((gt_boxes.shape[0], 1), dtype=gt_boxes.dtype) * ix
rois = np.vstack((rois, np.hstack((inds, gt_boxes[:, 0:4]))))
# Sample a batch of rois for training
rois_per_image = cfg.TRAIN.BATCH_SIZE
fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image)
labels, rois, bbox_targets, bbox_inside_weights = _sample_rois(
rois, gt_boxes, fg_rois_per_image, rois_per_image, self.num_classes)
bbox_outside_weights = np.array(bbox_inside_weights > 0).astype(np.float32)
_fmap_batch([
labels,
rois,
bbox_targets,
bbox_inside_weights,
bbox_outside_weights],
batch_outputs,
keys,
)
# Merge targets into blobs
for k, v in batch_outputs.items():
batch_outputs[k] = np.concatenate(batch_outputs[k], axis=0)
return {
'rois': [array2tensor(batch_outputs['rois'])],
'labels': array2tensor(batch_outputs['labels']),
'bbox_targets': array2tensor(batch_outputs['bbox_targets']),
'bbox_inside_weights': array2tensor(batch_outputs['bbox_inside_weights']),
'bbox_outside_weights': array2tensor(batch_outputs['bbox_outside_weights']),
}
def _get_bbox_regression_labels(bbox_target_data, num_classes):
"""Bounding-box regression targets (bbox_target_data) are stored in a
compact form N x (class, tx, ty, tw, th)
This function expands those targets into the 4-of-4*K representation used
by the network (i.e. only one class has non-zero targets).
Returns:
bbox_target (ndarray): N x 4K blob of regression targets
bbox_inside_weights (ndarray): N x 4K blob of loss weights
"""
clss = bbox_target_data[:, 0]
bbox_targets = np.zeros((clss.size, 4 * num_classes), dtype=np.float32)
bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32)
inds = np.where(clss > 0)[0]
for ind in inds:
cls = clss[ind]
start = 4 * cls
end = start + 4
bbox_targets[ind, int(start):int(end)] = bbox_target_data[ind, 1:]
bbox_inside_weights[ind, int(start):int(end)] = (1.0, 1.0, 1.0, 1.0)
return bbox_targets, bbox_inside_weights
def _compute_targets(ex_rois, gt_rois, labels):
"""Compute bounding-box regression targets for an image."""
assert ex_rois.shape[0] == gt_rois.shape[0]
assert ex_rois.shape[1] == 4
assert gt_rois.shape[1] == 4
targets = bbox_transform(ex_rois, gt_rois, cfg.BBOX_REG_WEIGHTS)
return np.hstack((labels[:, np.newaxis], targets)).astype(np.float32, copy=False)
def _sample_rois(
all_rois,
gt_boxes,
fg_rois_per_image,
rois_per_image,
num_classes,
):
"""Generate a random sample of RoIs."""
overlaps = bbox_overlaps(all_rois[:, 1:5], gt_boxes[:, :4])
gt_assignment = overlaps.argmax(axis=1)
max_overlaps = overlaps.max(axis=1)
labels = gt_boxes[gt_assignment, 4]
# Select foreground RoIs as those with >= FG_THRESH overlap
fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0]
# Guard against the case when an image has fewer than fg_rois_per_image
# foreground RoIs
fg_rois_per_this_image = int(min(fg_rois_per_image, fg_inds.size))
# Sample foreground regions without replacement
if fg_inds.size > 0:
fg_inds = npr.choice(fg_inds, size=fg_rois_per_this_image, replace=False)
# Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI) &
(max_overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]
# Compute number of background RoIs to take from this image (guarding
# against there being fewer than desired)
bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image
bg_rois_per_this_image = min(bg_rois_per_this_image, bg_inds.size)
# Sample background regions without replacement
if bg_inds.size > 0:
bg_inds = npr.choice(bg_inds, size=bg_rois_per_this_image, replace=False)
# The indices that we're selecting (both fg and bg)
keep_inds = np.append(fg_inds, bg_inds)
# Select sampled values from various arrays:
labels = labels[keep_inds]
# Clamp labels for the background RoIs to 0
labels[fg_rois_per_this_image:] = 0
rois = all_rois[keep_inds]
bbox_target_data = _compute_targets(
rois[:, 1:5], gt_boxes[gt_assignment[keep_inds], :4], labels)
bbox_targets, bbox_inside_weights = \
_get_bbox_regression_labels(bbox_target_data, num_classes)
return labels, rois, bbox_targets, bbox_inside_weights
def _fmap_batch(inputs, outputs, keys):
for i, key in enumerate(keys):
outputs[key].append(inputs[i])
......@@ -17,14 +17,13 @@ import dragon.vm.torch as torch
import numpy as np
from lib.core.config import cfg
from lib.modeling.detector import new_detector
from lib.nms import nms_wrapper
from lib.utils import boxes as box_util
from lib.utils import framework
from lib.utils import time_util
from lib.utils.blob import im_list_to_blob
from lib.utils.boxes import bbox_transform_inv
from lib.utils.boxes import clip_tiled_boxes
from lib.utils.image import scale_image
from lib.utils.vis import vis_one_image
def im_detect(detector, raw_image):
......@@ -39,69 +38,65 @@ def im_detect(detector, raw_image):
], dtype=np.float32)
# Do Forward
if not hasattr(detector, 'frozen_graph'):
inputs = {
'data': torch.from_numpy(blobs['data']),
'ims_info': torch.from_numpy(blobs['ims_info']),
}
with torch.no_grad():
with torch.jit.Recorder(retain_ops=True):
outputs = detector.forward(inputs)
detector.frozen_graph = \
framework.FrozenGraph(
{'data': inputs['data'],
'ims_info': inputs['ims_info']},
{'rois': outputs['rois'],
'cls_prob': outputs['cls_prob'],
'bbox_pred': outputs['bbox_pred']},
)
outputs = detector.frozen_graph(**blobs)
if not hasattr(detector, 'graph'):
with framework.new_workspace().as_default():
data = torch.from_numpy(blobs['data'])
ims_info = torch.from_numpy(blobs['ims_info'])
with torch.no_grad():
with torch.jit.Tracer(retain_ops=True):
inputs = {'data': data, 'ims_info': ims_info}
outputs = detector.forward(inputs)
detector.graph = \
framework.Graph(inputs, {
'rois': outputs['rois'],
'cls_prob': outputs['cls_prob'],
'bbox_pred': outputs['bbox_pred']
})
outputs = detector.graph(**blobs)
# Decode results
batch_rois = outputs['rois']
batch_scores = outputs['cls_prob']
batch_deltas = outputs['bbox_pred']
batch_boxes = bbox_transform_inv(
batch_rois[:, 1:5],
batch_deltas,
cfg.BBOX_REG_WEIGHTS,
rois = outputs['rois']
scores, boxes, batch_inds = [], [], []
pred_boxes = \
box_util.bbox_transform_inv(
rois[:, 1:5],
outputs['bbox_pred'],
cfg.BBOX_REG_WEIGHTS,
)
for i in range(len(ims)):
inds = np.where(rois[:, 0].astype(np.int32) == i)[0]
im_boxes = pred_boxes[inds] / ims_scale[i]
scores.append(outputs['cls_prob'][inds])
boxes.append(box_util.clip_tiled_boxes(im_boxes, raw_image.shape))
return (
np.vstack(scores) if len(ims) > 0 else scores[0],
np.vstack(boxes) if len(ims) > 0 else boxes[0],
)
scores_wide, boxes_wide = [], []
for im_idx in range(len(ims)):
indices = np.where(batch_rois[:, 0].astype(np.int32) == im_idx)[0]
boxes = batch_boxes[indices]
boxes /= ims_scale[im_idx]
clip_tiled_boxes(boxes, raw_image.shape)
scores_wide.append(batch_scores[indices])
boxes_wide.append(boxes)
def test_net(weights, num_classes, q_in, q_out, device):
num_classes, cfg.GPU_ID = num_classes, device
detector = new_detector(device, weights)
return (np.vstack(scores_wide), np.vstack(boxes_wide)) \
if len(scores_wide) > 1 else (scores_wide[0], boxes_wide[0])
_t = time_util.new_timers('im_detect', 'misc')
while True:
idx, raw_image = q_in.get()
if raw_image is None:
break
def test_net(detector, server):
# Load settings
classes = server.classes
num_images = server.num_images
num_classes = server.num_classes
all_boxes = [[[] for _ in range(num_images)] for _ in range(num_classes)]
_t = {'im_detect': time_util.Timer(), 'misc': time_util.Timer()}
for i in range(num_images):
image_id, raw_image = server.get_image()
boxes_this_image = [[]]
with _t['im_detect'].tic_and_toc():
scores, boxes = im_detect(detector, raw_image)
_t['misc'].tic()
boxes_this_image = [[]]
for j in range(1, num_classes):
inds = np.where(scores[:, j] > cfg.TEST.SCORE_THRESH)[0]
cls_scores = scores[inds, j]
cls_boxes = boxes[inds, j*4:(j+1)*4]
cls_boxes = boxes[inds, j * 4:(j + 1) * 4]
cls_detections = np.hstack(
(cls_boxes, cls_scores[:, np.newaxis])
).astype(np.float32, copy=False)
......@@ -119,43 +114,16 @@ def test_net(detector, server):
force_cpu=True,
)
cls_detections = cls_detections[keep, :]
all_boxes[j][i] = cls_detections
boxes_this_image.append(cls_detections)
if cfg.VIS or cfg.VIS_ON_FILE:
vis_one_image(
raw_image,
classes,
boxes_this_image,
thresh=cfg.VIS_TH,
box_alpha=1.,
show_class=True,
filename=server.get_save_filename(image_id),
)
# Limit to max_per_image detections *over all classes*
if cfg.TEST.DETECTIONS_PER_IM > 0:
image_scores = []
for j in range(1, num_classes):
if len(all_boxes[j][i]) < 1:
continue
image_scores.append(all_boxes[j][i][:, -1])
if len(image_scores) > 0:
image_scores = np.hstack(image_scores)
if len(image_scores) > cfg.TEST.DETECTIONS_PER_IM:
image_thresh = np.sort(image_scores)[-cfg.TEST.DETECTIONS_PER_IM]
for j in range(1, num_classes):
keep = np.where(all_boxes[j][i][:, -1] >= image_thresh)[0]
all_boxes[j][i] = all_boxes[j][i][keep, :]
_t['misc'].toc()
print('\rim_detect: {:d}/{:d} {:.3f}s {:.3f}s'
.format(i + 1, num_images,
_t['im_detect'].average_time,
_t['misc'].average_time),
end='')
print('\n>>>>>>>>>>>>>>>>>>> Evaluating <<<<<<<<<<<<<<<<<<<<')
print('Evaluating detections')
server.evaluate_detections(all_boxes)
q_out.put((
idx,
{
'im_detect': _t['im_detect'].average_time,
'misc': _t['misc'].average_time,
},
{
'boxes': boxes_this_image,
},
))
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import numpy as np
from lib.core.config import cfg
def generate_grid_anchors(features, base_anchors, strides):
num_strides = len(strides)
if len(features) != num_strides:
raise ValueError(
'Given %d features for %d strides.'
% (len(features), num_strides)
)
# Generate proposals from shifted anchors
anchors_to_pack = []
for i in range(len(features)):
height, width = features[i].shape[-2:]
shift_x = np.arange(0, width) * strides[i]
shift_y = np.arange(0, height) * strides[i]
shift_x, shift_y = np.meshgrid(shift_x, shift_y)
shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
shift_x.ravel(), shift_y.ravel())).transpose()
# Add A anchors (1, A, 4) to
# cell K shifts (K, 1, 4) to get
# shift anchors (K, A, 4)
# Reshape to (K * A, 4) shifted anchors
A = base_anchors[i].shape[0]
K = shifts.shape[0]
anchors = (base_anchors[i].reshape((1, A, 4)) +
shifts.reshape((1, K, 4)).transpose((1, 0, 2)))
if num_strides > 1:
# Transpose from (K, A, 4) to (A, K, 4)
# We will pack it with other strides to
# match the data format of (N, C, H, W)
anchors = anchors.transpose((1, 0, 2))
anchors = anchors.reshape((A * K, 4))
anchors_to_pack.append(anchors)
else:
# Original order of Faster R-CNN
return anchors.reshape((K * A, 4))
return np.vstack(anchors_to_pack)
def map_returns_to_blobs(returns, blobs, keys):
"""Map returns of image to blobs."""
for i, key in enumerate(keys):
blobs[key].append(returns[i])
def map_rois_to_levels(rois, k_min, k_max):
"""Map rois to fpn levels."""
if len(rois) == 0:
return []
ws = rois[:, 3] - rois[:, 1] + 1
hs = rois[:, 4] - rois[:, 2] + 1
s = np.sqrt(ws * hs)
s0 = cfg.FPN.ROI_CANONICAL_SCALE # default: 224
lvl0 = cfg.FPN.ROI_CANONICAL_LEVEL # default: 4
target_levels = np.floor(lvl0 + np.log2(s / s0 + 1e-6))
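# e.g., with the default canonical settings (s0=224, lvl0=4), a 112x112 RoI
# maps to level 4 + log2(112 / 224) = 3 and a 448x448 RoI maps to level 5.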
return np.clip(target_levels, k_min, k_max)
def map_blobs_to_outputs(blobs, defaults, lvl_inds):
"""Map blobs to outputs according to fpn indices."""
outputs = collections.defaultdict(list)
for inds in lvl_inds:
for key, blob in blobs.items():
outputs[key].append(
blob[inds]
if len(inds) > 0
else defaults[key]
)
return outputs
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import dragon.vm.torch as torch
import numpy as np
import numpy.random as npr
from lib.core.config import cfg
from lib.faster_rcnn.generate_anchors import generate_anchors
from lib.utils import logger
from lib.utils.blob import array2tensor
from lib.utils.boxes import bbox_overlaps
from lib.utils.boxes import bbox_transform
from lib.utils.boxes import dismantle_gt_boxes
class AnchorTargetLayer(torch.nn.Module):
"""Assign anchors to ground-truth targets."""
def __init__(self):
super(AnchorTargetLayer, self).__init__()
# Load the basic configs
self.scales = cfg.RPN.SCALES
self.strides = cfg.RPN.STRIDES
self.ratios = cfg.RPN.ASPECT_RATIOS
if len(self.scales) != len(self.strides):
logger.fatal(
'Given {} scales and {} strides.'
.format(len(self.scales), len(self.strides))
)
# Allow boxes to sit over the edge by a small amount
self._allowed_border = cfg.TRAIN.RPN_STRADDLE_THRESH
# Generate base anchors
self.base_anchors = []
for i in range(len(self.strides)):
base_size, scale = self.strides[i], self.scales[i]
if not isinstance(scale, collections.Iterable):
scale = [scale]
self.base_anchors.append(
generate_anchors(
base_size=base_size,
ratios=self.ratios,
scales=np.array(scale),
)
)
def forward(self, features, gt_boxes, ims_info):
"""Produces anchor classification labels and bounding-box regression targets."""
num_images = cfg.TRAIN.IMS_PER_BATCH
gt_boxes_wide = dismantle_gt_boxes(gt_boxes, num_images)
if len(gt_boxes_wide) != num_images:
logger.fatal(
'Input {} images, got {} slices of gt boxes.'
.format(num_images, len(gt_boxes_wide))
)
# Generate proposals from shifted anchors
all_anchors, total_anchors = [], 0
for i in range(len(self.strides)):
height, width = features[i].shape[-2:]
shift_x = np.arange(0, width) * self.strides[i]
shift_y = np.arange(0, height) * self.strides[i]
shift_x, shift_y = np.meshgrid(shift_x, shift_y)
shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
shift_x.ravel(), shift_y.ravel())).transpose()
# Add A anchors (1, A, 4) to
# cell K shifts (K, 1, 4) to get
# shift anchors (K, A, 4)
# Reshape to (K * A, 4) shifted anchors
A = self.base_anchors[i].shape[0]
K = shifts.shape[0]
anchors = (self.base_anchors[i].reshape((1, A, 4)) +
shifts.reshape((1, K, 4)).transpose((1, 0, 2)))
# [K, A, 4] -> [A, K, 4]
anchors = anchors.transpose((1, 0, 2))
anchors = anchors.reshape((A * K, 4))
all_anchors.append(anchors)
total_anchors += anchors.shape[0]
all_anchors = np.vstack(all_anchors)
# label: 1 is positive, 0 is negative, -1 is don't care
labels_wide = -np.ones((num_images, total_anchors,), dtype=np.float32)
bbox_targets_wide = np.zeros((num_images, total_anchors, 4), dtype=np.float32)
bbox_inside_weights_wide = np.zeros_like(bbox_targets_wide, dtype=np.float32)
bbox_outside_weights_wide = np.zeros_like(bbox_targets_wide, dtype=np.float32)
for ix in range(num_images):
# GT boxes (x1, y1, x2, y2, label, has_mask)
gt_boxes = gt_boxes_wide[ix]
im_info = ims_info[ix]
if self._allowed_border >= 0:
# Only keep anchors inside the image
inds_inside = np.where(
(all_anchors[:, 0] >= -self._allowed_border) &
(all_anchors[:, 1] >= -self._allowed_border) &
(all_anchors[:, 2] < im_info[1] + self._allowed_border) &
(all_anchors[:, 3] < im_info[0] + self._allowed_border))[0]
anchors = all_anchors[inds_inside, :]
else:
inds_inside = np.arange(all_anchors.shape[0])
anchors = all_anchors
num_inside = len(inds_inside)
# label: 1 is positive, 0 is negative, -1 is don't care
labels = np.empty((num_inside,), dtype=np.float32)
labels.fill(-1)
# Overlaps between the anchors and the gt boxes
overlaps = bbox_overlaps(anchors, gt_boxes)
argmax_overlaps = overlaps.argmax(axis=1)
max_overlaps = overlaps[np.arange(num_inside), argmax_overlaps]
gt_argmax_overlaps = overlaps.argmax(axis=0)
gt_max_overlaps = overlaps[gt_argmax_overlaps,
np.arange(overlaps.shape[1])]
gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]
# fg label: for each gt, anchor with highest overlap
labels[gt_argmax_overlaps] = 1
# fg label: above threshold IOU
labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1
# bg label: below threshold IOU
labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0
# Subsample positive labels if we have too many
num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCHSIZE)
fg_inds = np.where(labels == 1)[0]
if len(fg_inds) > num_fg:
disable_inds = npr.choice(
fg_inds, size=(len(fg_inds) - num_fg), replace=False)
labels[disable_inds] = -1
fg_inds = np.where(labels == 1)[0]
# Subsample negative labels if we have too many
num_bg = cfg.TRAIN.RPN_BATCHSIZE - np.sum(labels == 1)
bg_inds = np.where(labels == 0)[0]
if len(bg_inds) > num_bg:
disable_inds = npr.choice(
bg_inds, size=(len(bg_inds) - num_bg), replace=False)
labels[disable_inds] = -1
bbox_targets = np.zeros((num_inside, 4), dtype=np.float32)
bbox_targets[fg_inds, :] = bbox_transform(
anchors[fg_inds, :],
gt_boxes[argmax_overlaps[fg_inds], :4],
)
bbox_inside_weights = np.zeros((num_inside, 4), dtype=np.float32)
bbox_inside_weights[labels == 1, :] = np.array((1., 1., 1., 1.))
bbox_outside_weights = np.zeros((num_inside, 4), dtype=np.float32)
bbox_outside_weights[labels == 1, :] = np.ones((1, 4)) / cfg.TRAIN.RPN_BATCHSIZE
bbox_outside_weights[labels == 0, :] = np.ones((1, 4)) / cfg.TRAIN.RPN_BATCHSIZE
labels_wide[ix, inds_inside] = labels # label
bbox_targets_wide[ix, inds_inside] = bbox_targets
bbox_inside_weights_wide[ix, inds_inside] = bbox_inside_weights
bbox_outside_weights_wide[ix, inds_inside] = bbox_outside_weights
labels = labels_wide.reshape((num_images, total_anchors))
bbox_targets = bbox_targets_wide.transpose((0, 2, 1))
bbox_inside_weights = bbox_inside_weights_wide.transpose((0, 2, 1))
bbox_outside_weights = bbox_outside_weights_wide.transpose((0, 2, 1))
return {
'labels': array2tensor(labels),
'bbox_targets': array2tensor(bbox_targets),
'bbox_inside_weights': array2tensor(bbox_inside_weights),
'bbox_outside_weights': array2tensor(bbox_outside_weights),
}
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import dragon.vm.torch as torch
import numpy as np
from lib.core.config import cfg
from lib.faster_rcnn.generate_anchors import generate_anchors
from lib.nms import nms_wrapper
from lib.utils import logger
from lib.utils.blob import array2tensor
from lib.utils.boxes import bbox_transform_inv
from lib.utils.boxes import clip_tiled_boxes
from lib.utils.boxes import filter_boxes
class ProposalLayer(torch.nn.Module):
"""Compute proposals by applying transformations anchors."""
def __init__(self):
super(ProposalLayer, self).__init__()
# Load the basic configs
self.scales = cfg.RPN.SCALES
self.strides = cfg.RPN.STRIDES
self.ratios = cfg.RPN.ASPECT_RATIOS
if len(self.scales) != len(self.strides):
logger.fatal(
'Given {} scales and {} strides.'
.format(len(self.scales), len(self.strides))
)
# Generate base anchors
self.base_anchors = []
for i in range(len(self.strides)):
base_size, scale = self.strides[i], self.scales[i]
if not isinstance(scale, collections.Iterable):
scale = [scale]
self.base_anchors.append(
generate_anchors(
base_size=base_size,
ratios=self.ratios,
scales=np.array(scale),
)
)
def generate_grid_anchors(self, features):
# Generate proposals from shifted anchors
anchors_wide = []
for i in range(len(self.strides)):
height, width = features[i].shape[-2:]
shift_x = np.arange(0, width) * self.strides[i]
shift_y = np.arange(0, height) * self.strides[i]
shift_x, shift_y = np.meshgrid(shift_x, shift_y)
shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
shift_x.ravel(), shift_y.ravel())).transpose()
# Add A anchors (1, A, 4) to
# cell K shifts (K, 1, 4) to get
# shift anchors (K, A, 4)
# Reshape to (K * A, 4) shifted anchors
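# Illustrative example: a stride-16 level on a 38 x 50 feature map yields
# K = 38 * 50 = 1900 shifts; with, say, A = 3 base anchors this level
# contributes A * K = 5700 anchors after the reshape below.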
A = self.base_anchors[i].shape[0]
K = shifts.shape[0]
anchors = (self.base_anchors[i].reshape((1, A, 4)) +
shifts.reshape((1, K, 4)).transpose((1, 0, 2)))
# [K, A, 4] -> [A, K, 4]
anchors = anchors.transpose((1, 0, 2))
anchors = anchors.reshape((A * K, 4))
anchors_wide.append(anchors)
return np.vstack(anchors_wide)
def forward(self, features, cls_prob, bbox_pred, ims_info):
cfg_key = 'TRAIN' if self.training else 'TEST'
pre_nms_top_n = cfg[cfg_key].RPN_PRE_NMS_TOP_N
post_nms_top_n = cfg[cfg_key].RPN_POST_NMS_TOP_N
nms_thresh = cfg[cfg_key].RPN_NMS_THRESH
min_size = cfg[cfg_key].RPN_MIN_SIZE
# Get resources
num_images = ims_info.shape[0]
all_anchors = self.generate_grid_anchors(features) # [n, 4]
if cls_prob.shape[0] != num_images or \
bbox_pred.shape[0] != num_images:
logger.fatal('Incorrect num of images: {}'.format(num_images))
# Prepare for the outputs
batch_rois = []
batch_scores = cls_prob.numpy(True)
batch_deltas = bbox_pred.numpy(True) \
.transpose((0, 2, 1)) # [?, 4, n] -> [?, n, 4]
# Extract RoIs separately
for ix in range(num_images):
scores = batch_scores[ix].reshape((-1, 1)) # [1, n] -> [n, 1]
deltas = batch_deltas[ix] # [n, 4]
if pre_nms_top_n <= 0 or pre_nms_top_n >= len(scores):
order = np.argsort(-scores.squeeze())
else:
# Avoid sorting possibly large arrays: first partition to get the top K
# unsorted, then sort just those (~20x faster for 200k scores)
inds = np.argpartition(-scores.squeeze(), pre_nms_top_n)[:pre_nms_top_n]
order = np.argsort(-scores[inds].squeeze())
order = inds[order]
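# Illustrative note: np.argpartition(-s, k)[:k] returns the indices of the
# k largest scores in arbitrary order; only those k entries are then fully
# sorted, instead of the whole score array.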
deltas = deltas[order]
anchors = all_anchors[order]
scores = scores[order]
# 1. Convert anchors into proposals via bbox transformations
proposals = bbox_transform_inv(anchors, deltas)
# 2. Clip predicted boxes to image
proposals = clip_tiled_boxes(proposals, ims_info[ix, :2])
# 3. remove predicted boxes with either height or width < threshold
keep = filter_boxes(proposals, min_size * ims_info[ix, 2])
proposals = proposals[keep, :]
scores = scores[keep]
# 4. Apply NMS (e.g. threshold = 0.7)
# 5. Take the top post_nms_top_n proposals after NMS (e.g. 300)
# 6. Return these top proposals as the RoIs
keep = nms_wrapper.nms(np.hstack((proposals, scores)), nms_thresh)
if post_nms_top_n > 0:
keep = keep[:post_nms_top_n]
proposals = proposals[keep, :]
# Output rois blob
batch_inds = np.empty((proposals.shape[0], 1), dtype=np.float32)
batch_inds.fill(ix)
rpn_rois = np.hstack((batch_inds, proposals.astype(np.float32, copy=False)))
batch_rois.append(rpn_rois)
# Merge RoIs into a blob
rpn_rois = np.concatenate(batch_rois, axis=0)
if cfg_key == 'TRAIN':
return rpn_rois
else:
# Distribute rois into K levels
min_level = cfg.FPN.ROI_MIN_LEVEL
max_level = cfg.FPN.ROI_MAX_LEVEL
k = max_level - min_level + 1
fpn_levels = _map_rois_to_fpn_levels(rpn_rois, min_level, max_level)
all_rois = []
for i in range(k):
lv_indices = np.where(fpn_levels == (i + min_level))[0]
if len(lv_indices) == 0:
# Fake a tiny roi to avoid empty roi pooling
all_rois.append(array2tensor(np.array([[-1, 0, 0, 1, 1]], dtype=np.float32)))
else:
all_rois.append(array2tensor(rpn_rois[lv_indices]))
return all_rois
def _map_rois_to_fpn_levels(rois, k_min, k_max):
"""
Determine which FPN level each RoI in a set of RoIs
should map to based on the heuristic in the FPN paper.
"""
if len(rois) == 0:
return []
ws = rois[:, 3] - rois[:, 1] + 1
hs = rois[:, 4] - rois[:, 2] + 1
s = np.sqrt(ws * hs)
s0 = cfg.FPN.ROI_CANONICAL_SCALE # default: 224
lvl0 = cfg.FPN.ROI_CANONICAL_LEVEL # default: 4
target_levels = np.floor(lvl0 + np.log2(s / s0 + 1e-6))
return np.clip(target_levels, k_min, k_max)
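# Illustrative example (assuming the canonical settings s0 = 224, lvl0 = 4):
# an RoI of roughly 224 x 224 pixels maps to level 4, 112 x 112 to level 3,
# and 448 x 448 to level 5, before clipping to [k_min, k_max], e.g.
#   >>> _map_rois_to_fpn_levels(np.array([[0., 0., 0., 223., 223.]]), 2, 5)
#   array([4.])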
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon.vm.torch as torch
import numpy as np
import numpy.random as npr
from lib.core.config import cfg
from lib.utils.blob import array2tensor
from lib.utils.boxes import bbox_overlaps
from lib.utils.boxes import bbox_transform
from lib.utils.boxes import dismantle_gt_boxes
class ProposalTargetLayer(torch.nn.Module):
"""Assign object detection proposals to ground-truth targets.
Produces proposal classification labels and bounding-box regression targets.
"""
def __init__(self):
super(ProposalTargetLayer, self).__init__()
self.num_classes = cfg.MODEL.NUM_CLASSES
self.fake_outputs = {
'rois': np.array([[0, 0, 0, 1, 1]], dtype=np.float32),
'labels': np.array([-1], dtype=np.float32),
'bbox_targets': np.zeros((1, self.num_classes * 4), dtype=np.float32),
'bbox_inside_weights': np.zeros((1, self.num_classes * 4), dtype=np.float32),
'bbox_outside_weights': np.zeros((1, self.num_classes * 4), dtype=np.float32),
}
def forward(self, rpn_rois, gt_boxes):
num_images = cfg.TRAIN.IMS_PER_BATCH
# Proposal ROIs (0, x1, y1, x2, y2) coming from RPN
# (i.e., rpn.proposal_layer.ProposalLayer), or any other source
all_rois = rpn_rois
# GT boxes (x1, y1, x2, y2, label)
gt_boxes_wide = dismantle_gt_boxes(gt_boxes, num_images)
# Prepare for the outputs
keys = ['labels', 'rois', 'bbox_targets',
'bbox_inside_weights', 'bbox_outside_weights']
outputs = {key: [] for key in keys}
batch_outputs = {key: [] for key in keys}
# Generate targets separately
for ix in range(num_images):
gt_boxes = gt_boxes_wide[ix]
# Extract proposals for this image
rois = all_rois[np.where(all_rois[:, 0].astype(np.int32) == ix)[0]]
# Include ground-truth boxes in the set of candidate rois
inds = np.ones((gt_boxes.shape[0], 1), dtype=gt_boxes.dtype) * ix
rois = np.vstack((rois, np.hstack((inds, gt_boxes[:, 0:4]))))
# Sample a batch of rois for training
rois_per_image = cfg.TRAIN.BATCH_SIZE
fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image)
labels, rois, bbox_targets, bbox_inside_weights = \
_sample_rois(rois, gt_boxes, fg_rois_per_image, rois_per_image, self.num_classes)
bbox_outside_weights = np.array(bbox_inside_weights > 0).astype(np.float32)
_fmap_batch([
labels,
rois,
bbox_targets,
bbox_inside_weights,
bbox_outside_weights],
batch_outputs,
keys,
)
# Merge targets into blobs
for k, v in batch_outputs.items():
batch_outputs[k] = np.concatenate(batch_outputs[k], axis=0)
# Distribute rois into K levels
min_level = cfg.FPN.ROI_MIN_LEVEL
max_level = cfg.FPN.ROI_MAX_LEVEL
k = max_level - min_level + 1
fpn_levels = _map_rois_to_fpn_levels(batch_outputs['rois'], min_level, max_level)
lvs_indices = [np.where(fpn_levels == (i + min_level))[0] for i in range(k)]
_fmap_rois(
inputs=[batch_outputs[key] for key in keys],
fake_outputs=self.fake_outputs,
outputs=outputs,
keys=keys,
levels=lvs_indices,
)
return {
'rois': [array2tensor(outputs['rois'][i]) for i in range(k)],
'labels': array2tensor(np.concatenate(outputs['labels'], axis=0)),
'bbox_targets': array2tensor(np.vstack(outputs['bbox_targets'])),
'bbox_inside_weights': array2tensor(np.vstack(outputs['bbox_inside_weights'])),
'bbox_outside_weights': array2tensor(np.vstack(outputs['bbox_outside_weights'])),
}
def _get_bbox_regression_labels(bbox_target_data, num_classes):
"""Bounding-box regression targets (bbox_target_data) are stored in a
compact form N x (class, tx, ty, tw, th)
This function expands those targets into the 4-of-4*K representation used
by the network (i.e. only one class has non-zero targets).
Returns:
bbox_target (ndarray): N x 4K blob of regression targets
bbox_inside_weights (ndarray): N x 4K blob of loss weights
"""
clss = bbox_target_data[:, 0]
bbox_targets = np.zeros((clss.size, 4 * num_classes), dtype=np.float32)
bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32)
inds = np.where(clss > 0)[0]
for ind in inds:
cls = clss[ind]
start = 4 * cls
end = start + 4
bbox_targets[ind, int(start):int(end)] = bbox_target_data[ind, 1:]
bbox_inside_weights[ind, int(start):int(end)] = (1.0, 1.0, 1.0, 1.0)
return bbox_targets, bbox_inside_weights
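# Illustrative example: with num_classes = 3 and an RoI labeled as class 2,
# its 4 regression targets are written into columns 8..11 of the N x 12
# blob and the matching inside weights are set to 1; every other column
# stays zero, so only the ground-truth class contributes to the bbox loss.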
def _compute_targets(ex_rois, gt_rois, labels):
"""Compute bounding-box regression targets for an image."""
assert ex_rois.shape[0] == gt_rois.shape[0]
assert ex_rois.shape[1] == 4
assert gt_rois.shape[1] == 4
targets = bbox_transform(ex_rois, gt_rois, cfg.BBOX_REG_WEIGHTS)
return np.hstack((labels[:, np.newaxis], targets)).astype(np.float32, copy=False)
def _map_rois_to_fpn_levels(rois, k_min, k_max):
"""
Determine which FPN level each RoI in a set of RoIs
should map to based on the heuristic in the FPN paper.
"""
if len(rois) == 0:
return []
ws = rois[:, 3] - rois[:, 1] + 1
hs = rois[:, 4] - rois[:, 2] + 1
s = np.sqrt(ws * hs)
s0 = cfg.FPN.ROI_CANONICAL_SCALE # default: 224
lvl0 = cfg.FPN.ROI_CANONICAL_LEVEL # default: 4
target_levels = np.floor(lvl0 + np.log2(s / s0 + 1e-6))
return np.clip(target_levels, k_min, k_max)
def _sample_rois(all_rois, gt_boxes, fg_rois_per_image, rois_per_image, num_classes):
"""Sample a batch of RoIs comprising foreground and background examples."""
# overlaps: (rois x gt_boxes)
overlaps = bbox_overlaps(all_rois[:, 1:5], gt_boxes[:, :4])
gt_assignment = overlaps.argmax(axis=1)
max_overlaps = overlaps.max(axis=1)
labels = gt_boxes[gt_assignment, 4]
# Select foreground RoIs as those with >= FG_THRESH overlap
fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0]
# Guard against the case when an image has fewer than fg_rois_per_image
# foreground RoIs
fg_rois_per_this_image = int(min(fg_rois_per_image, fg_inds.size))
# Sample foreground regions without replacement
if fg_inds.size > 0:
fg_inds = npr.choice(fg_inds, size=fg_rois_per_this_image, replace=False)
# Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI) &
(max_overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]
# Compute number of background RoIs to take from this image (guarding
# against there being fewer than desired)
bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image
bg_rois_per_this_image = min(bg_rois_per_this_image, bg_inds.size)
# Sample background regions without replacement
if bg_inds.size > 0:
bg_inds = npr.choice(bg_inds, size=bg_rois_per_this_image, replace=False)
# The indices that we're selecting (both fg and bg)
keep_inds = np.append(fg_inds, bg_inds)
# Select sampled values from various arrays:
labels = labels[keep_inds]
# Clamp labels for the background RoIs to 0
labels[fg_rois_per_this_image:] = 0
rois = all_rois[keep_inds]
bbox_target_data = _compute_targets(
rois[:, 1:5], gt_boxes[gt_assignment[keep_inds], :4], labels)
bbox_targets, bbox_inside_weights = \
_get_bbox_regression_labels(bbox_target_data, num_classes)
return labels, rois, bbox_targets, bbox_inside_weights
def _fmap_batch(inputs, outputs, keys):
for i, key in enumerate(keys):
outputs[key].append(inputs[i])
def _fmap_rois(inputs, fake_outputs, outputs, keys, levels):
def impl(a, b, indices):
return a[indices] if len(indices) > 0 else b
for k in range(len(levels)):
inds = levels[k]
for i, key in enumerate(keys):
outputs[key].append(impl(inputs[i], fake_outputs[key], inds))
......@@ -13,6 +13,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from lib.fpn.anchor_target_layer import AnchorTargetLayer
from lib.fpn.proposal_layer import ProposalLayer
from lib.fpn.proposal_target_layer import ProposalTargetLayer
from lib.faster_rcnn.anchor_target import AnchorTarget
from lib.faster_rcnn.proposal import Proposal
from lib.mask_rcnn.data_loader import DataLoader
from lib.mask_rcnn.proposal_target import ProposalTarget
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import multiprocessing as mp
import time
import dragon
import dragon.vm.torch as torch
import numpy as np
from lib.core.config import cfg
from lib.mask_rcnn.data_transformer import DataTransformer
from lib.datasets.factory import get_imdb
from lib.utils import logger
from lib.utils.blob import im_list_to_blob
from lib.utils.blob import mask_list_to_blob
class DataLoader(object):
"""Provide mini-batches of data."""
def __init__(self):
super(DataLoader, self).__init__()
database = get_imdb(cfg.TRAIN.DATABASE)
self.data_batch = DataBatch(**{
'dataset': lambda: dragon.io.SeetaRecordDataset(database.source),
'classes': database.classes,
'shuffle': cfg.TRAIN.USE_SHUFFLE,
'num_chunks': cfg.TRAIN.NUM_SHUFFLE_CHUNKS,
'batch_size': cfg.TRAIN.IMS_PER_BATCH * 2,
'num_transformers': cfg.TRAIN.NUM_WORKERS,
})
def __call__(self):
outputs = self.data_batch.get()
outputs['data'] = torch.from_numpy(outputs['data'])
return outputs
class DataBatch(mp.Process):
"""Prefetch the batch of data."""
def __init__(self, **kwargs):
"""Construct a ``DataBatch``.
Parameters
----------
dataset : lambda
The creator of a dataset.
classes : Sequence[str]
The class names.
shuffle : bool, optional, default=False
Whether to shuffle the data.
num_chunks : int, optional, default=0
The number of chunks to split.
batch_size : int, optional, default=2
The size of a mini-batch.
num_transformers : int, optional, default=3
The number of workers to transform data.
"""
super(DataBatch, self).__init__()
# Distributed settings
rank, group_size = 0, 1
process_group = dragon.distributed.get_group()
if process_group is not None and kwargs.get(
'phase', 'TRAIN') == 'TRAIN':
group_size = process_group.size
rank = dragon.distributed.get_rank(process_group)
kwargs['group_size'] = group_size
# Configuration
self._prefetch = kwargs.get('prefetch', 5)
self._batch_size = kwargs.get('batch_size', 2)
self._num_readers = kwargs.get('num_readers', 1)
self._num_transformers = kwargs.get('num_transformers', 3)
self._num_fetchers = kwargs.get('num_fetchers', 1)
self.daemon = True
# Initialize queues
num_batches = self._prefetch * self._num_readers
self.Q1 = mp.Queue(num_batches * self._batch_size)
self.Q21 = mp.Queue(num_batches * self._batch_size)
self.Q22 = mp.Queue(num_batches * self._batch_size)
self.Q3 = mp.Queue(num_batches)
# Initialize readers
self._readers = []
for i in range(self._num_readers):
part_idx, num_parts = i, self._num_readers
num_parts *= group_size
part_idx += rank * self._num_readers
self._readers.append(dragon.io.DataReader(
num_parts=num_parts, part_idx=part_idx, **kwargs))
self._readers[i]._seed += part_idx
self._readers[i].q_out = self.Q1
self._readers[i].start()
time.sleep(0.1)
# Initialize transformers
self._transformers = []
for i in range(self._num_transformers):
transformer = DataTransformer(**kwargs)
transformer._seed += (i + rank * self._num_transformers)
transformer.q_in = self.Q1
transformer.q1_out, transformer.q2_out = self.Q21, self.Q22
transformer.start()
self._transformers.append(transformer)
time.sleep(0.1)
# Initialize batch-producer
self.start()
# Register cleanup callbacks
def cleanup():
def terminate(processes):
for process in processes:
process.terminate()
process.join()
terminate([self])
logger.info('Terminate DataBatch.')
terminate(self._transformers)
logger.info('Terminate DataTransformer.')
terminate(self._readers)
logger.info('Terminate DataReader.')
import atexit
atexit.register(cleanup)
def get(self):
"""Get a batch.
Returns
-------
dict
The batch dict.
"""
return self.Q3.get()
def run(self):
"""Start the process to produce batches."""
def produce(q_in):
processed_ims, ims_info = [], []
packed_boxes, packed_masks = [], []
for image_index in range(cfg.TRAIN.IMS_PER_BATCH):
im, im_scale, gt_boxes, gt_masks = q_in.get()
processed_ims.append(im)
ims_info.append(list(im.shape[:2]) + [im_scale])
im_boxes = np.zeros((gt_boxes.shape[0], gt_boxes.shape[1] + 1), 'float32')
im_boxes[:, :gt_boxes.shape[1]], im_boxes[:, -1] = gt_boxes, image_index
packed_boxes.append(im_boxes)
packed_masks.append(gt_masks)
return {
'data': im_list_to_blob(processed_ims),
'ims_info': np.array(ims_info, 'float32'),
'gt_boxes': np.concatenate(packed_boxes, 0),
'gt_masks': mask_list_to_blob(packed_masks),
}
# Two queues are used to implement aspect grouping:
# batching images of similar aspect ratio reduces the GPU memory
# wasted on padding the batch blob into a huge square
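# Illustrative example: pairing a 600 x 1000 landscape image with a
# 1000 x 600 portrait image would pad the batch blob to roughly
# 1000 x 1000; grouping images of similar aspect ratio keeps the padded
# blob close to 600 x 1000 instead.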
q1, q2 = self.Q21, self.Q22
# Main prefetch loop
while True:
if q1.qsize() >= cfg.TRAIN.IMS_PER_BATCH:
self.Q3.put(produce(q1))
elif q2.qsize() >= cfg.TRAIN.IMS_PER_BATCH:
self.Q3.put(produce(q2))
q1, q2 = q2, q1 # Uniform sampling trick
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import multiprocessing
import numpy as np
from lib.core.config import cfg
from lib.datasets.example import Example
from lib.pycocotools import mask_utils
from lib.utils import boxes as box_util
from lib.utils.blob import prep_im_for_blob
from lib.utils.image import get_image_with_target_size
class DataTransformer(multiprocessing.Process):
def __init__(self, **kwargs):
super(DataTransformer, self).__init__()
self._seed = cfg.RNG_SEED
self._use_flipped = cfg.TRAIN.USE_FLIPPED
self._use_diff = cfg.TRAIN.USE_DIFF
self._classes = kwargs.get('classes', ('__background__',))
self._num_classes = len(self._classes)
self._class_to_ind = dict(zip(self._classes, range(self._num_classes)))
self.q_in = self.q1_out = self.q2_out = None
self.daemon = True
def make_roi_dict(self, example, im_scale, apply_flip=False):
objects, n_objects = example.objects, 0
height, width = example.height, example.width
if not self._use_diff:
for obj in objects:
if obj.get('difficult', 0) == 0:
n_objects += 1
else:
n_objects = len(objects)
roi_dict = {
'boxes': np.zeros((n_objects, 4), 'float32'),
'masks': np.empty((n_objects, height, width), 'uint8'),
'gt_classes': np.zeros((n_objects, 1), 'int32'),
'mask_flags': np.ones((n_objects, 1), 'float32'),
}
# Filter the difficult instances
object_idx = 0
for obj in objects:
if not self._use_diff and \
obj.get('difficult', 0) > 0:
continue
bbox, mask = obj['bbox'], obj['mask']
roi_dict['boxes'][object_idx, :] = [
max(0, bbox[0]),
max(0, bbox[1]),
min(bbox[2], width - 1),
min(bbox[3], height - 1),
]
if mask is not None:
roi_dict['masks'][object_idx] = (
mask_utils.bytes2img(
obj['mask'],
height,
width,
))
else:
roi_dict['mask_flags'][object_idx] = 0.
roi_dict['gt_classes'][object_idx] = \
self._class_to_ind[obj['name']]
object_idx += 1
# Flip the boxes if necessary
if apply_flip:
roi_dict['boxes'] = \
box_util.flip_boxes(
roi_dict['boxes'],
width,
)
# Scale the boxes to the detecting scale
roi_dict['boxes'] *= im_scale
return roi_dict
def get(self, example):
example = Example(example)
img = example.image
# Scale
max_size = cfg.TRAIN.MAX_SIZE
target_size = cfg.TRAIN.SCALES[np.random.randint(len(cfg.TRAIN.SCALES))]
img, im_scale, jitter = prep_im_for_blob(img, target_size, max_size)
# Flip
apply_flip = False
if self._use_flipped:
if np.random.randint(2) > 0:
img = img[:, ::-1]
apply_flip = True
# Example -> RoIDict
roi_dict = self.make_roi_dict(example, im_scale, apply_flip)
# Post-Process for gt boxes
# Shape like: [num_objects, {x1, y1, x2, y2, cls, flag}]
gt_boxes = \
np.concatenate([
roi_dict['boxes'],
roi_dict['gt_classes'],
roi_dict['mask_flags']
], axis=1)
# Post-Process for gt masks
# Shape like: [num_objects, im_h, im_w]
if gt_boxes.shape[0] > 0:
gt_masks = roi_dict['masks']
if apply_flip:
gt_masks = gt_masks[:, :, ::-1]
else:
gt_masks = None
return img, im_scale, gt_boxes, gt_masks
def run(self):
# Fix the process-local random seed
np.random.seed(self._seed)
# Main prefetch loop
while True:
outputs = self.get(self.q_in.get())
if len(outputs[2]) < 1:
continue # Ignore the non-object image
aspect_ratio = float(outputs[0].shape[0]) / outputs[0].shape[1]
if aspect_ratio > 1.:
self.q1_out.put(outputs)
else:
self.q2_out.put(outputs)
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import numpy as np
import numpy.random as npr
from lib.core.config import cfg
from lib.faster_rcnn.utils import map_blobs_to_outputs
from lib.faster_rcnn.utils import map_returns_to_blobs
from lib.faster_rcnn.utils import map_rois_to_levels
from lib.utils import boxes as box_util
from lib.utils import mask as mask_util
from lib.utils.framework import new_tensor
class ProposalTarget(object):
"""Assign proposals to ground-truth targets."""
def __init__(self):
super(ProposalTarget, self).__init__()
self.resolution = cfg.MRCNN.RESOLUTION
self.num_classes = cfg.MODEL.NUM_CLASSES
self.defaults = collections.OrderedDict([
('rois', np.array([[-1, 0, 0, 1, 1]], 'float32')),
('labels', np.array([-1], 'float32')),
('bbox_targets', np.zeros((1, self.num_classes * 4), 'float32')),
('bbox_inside_weights', np.zeros((1, self.num_classes * 4), 'float32')),
('bbox_outside_weights', np.zeros((1, self.num_classes * 4), 'float32')),
('mask_targets', -np.ones((1, self.resolution, self.resolution), 'float32')),
])
def __call__(self, rpn_rois, gt_boxes, gt_masks, ims_info):
num_images = cfg.TRAIN.IMS_PER_BATCH
# Proposal ROIs (0, x1, y1, x2, y2) coming from RPN
all_rois = rpn_rois
# GT boxes (x1, y1, x2, y2, label)
# GT masks (num_objects, im_h, im_w)
gt_boxes_wide, gt_masks_wide = \
mask_util.dismantle_masks(
gt_boxes,
gt_masks,
num_images,
)
# Prepare for the outputs
keys = self.defaults.keys()
blobs = {key: [] for key in keys}
# Generate targets separately
for ix in range(num_images):
gt_boxes = gt_boxes_wide[ix]
gt_masks = gt_masks_wide[ix]
# Extract proposals for this image
rois = all_rois[np.where(all_rois[:, 0].astype('int32') == ix)[0]]
# Include ground-truth boxes in the set of candidate rois
inds = np.ones((gt_boxes.shape[0], 1), gt_boxes.dtype) * ix
rois = np.vstack((rois, np.hstack((inds, gt_boxes[:, :4]))))
# Sample a batch of RoIs for training
rois_per_image = cfg.TRAIN.BATCH_SIZE
fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image)
map_returns_to_blobs(
sample_rois(
rois,
gt_boxes,
gt_masks,
rois_per_image,
fg_rois_per_image,
self.num_classes,
ims_info[ix][2],
), blobs, keys,
)
# Stack into continuous blobs
for k, v in blobs.items():
blobs[k] = np.concatenate(blobs[k], 0)
# Distribute rois into pyramids
k_min = cfg.FPN.ROI_MIN_LEVEL
k_max = cfg.FPN.ROI_MAX_LEVEL
k = k_max - k_min + 1
levels = map_rois_to_levels(blobs['rois'], k_min, k_max)
outputs = \
map_blobs_to_outputs(
blobs,
self.defaults,
[np.where(levels == (i + k_min))[0] for i in range(k)],
)
# Select the foreground RoIs only for mask branch
for i in range(k):
inds = np.where(outputs['labels'][i] > 0)[0]
inds = inds if len(inds) > 0 else np.array([0], 'int64')
outputs['mask_rois'].append(outputs['rois'][i][inds])
outputs['mask_targets'][i] = outputs['mask_targets'][i][inds]
outputs['mask_labels'].append(outputs['labels'][i][inds].astype('int64') - 1)
# Use the sparse indices to select logits
# Reduce the overhead on feeding dense class-specific targets
mask_labels = np.concatenate(outputs['mask_labels'], 0)
mask_indices = np.arange(len(mask_labels)) * (self.num_classes - 1)
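# Illustrative note: the mask head predicts (num_classes - 1) score maps per
# foreground RoI; flattening the first two axes, the map for RoI i and
# 0-based foreground class c sits at flat index i * (num_classes - 1) + c,
# which is exactly what mask_indices + mask_labels encodes below.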
return {
'rois': [new_tensor(outputs['rois'][i]) for i in range(k)],
'labels': new_tensor(np.concatenate(outputs['labels'], 0)),
'bbox_targets': new_tensor(np.vstack(outputs['bbox_targets'])),
'bbox_inside_weights': new_tensor(np.vstack(outputs['bbox_inside_weights'])),
'bbox_outside_weights': new_tensor(np.vstack(outputs['bbox_outside_weights'])),
'mask_rois': [new_tensor(outputs['mask_rois'][i]) for i in range(k)],
'mask_targets': new_tensor(np.vstack(outputs['mask_targets'])),
'mask_indices': new_tensor(mask_indices + mask_labels),
}
def get_targets(
ex_rois,
gt_rois,
gt_labels,
gt_masks,
mask_flags,
mask_size,
num_classes,
im_scale,
):
"""Compute the bounding-box regression targets."""
assert ex_rois.shape[0] == gt_rois.shape[0]
assert ex_rois.shape[1] == 4
assert gt_rois.shape[1] == 4
# Compute bbox regression targets
fg_inds = np.where(gt_labels > 0)[0]
targets = box_util.bbox_transform(ex_rois, gt_rois, cfg.BBOX_REG_WEIGHTS)
bbox_targets = np.zeros((ex_rois.shape[0], 4 * num_classes), 'float32')
inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32)
for i in fg_inds:
start = int(4 * gt_labels[i])
bbox_targets[i, start:start + 4] = targets[i]
inside_weights[i, start:start + 4] = (1., 1., 1., 1.)
outside_weights = np.array(inside_weights > 0).astype('float32')
# Compute mask classification targets
mask_shape = [mask_size] * 2
ex_rois_ori = np.round(ex_rois / im_scale).astype(int)
gt_rois_ori = np.round(gt_rois / im_scale).astype(int)
mask_targets = -np.ones([len(gt_labels)] + mask_shape, 'float32')
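# Illustrative note: ex_rois / gt_rois were mapped back to original-image
# coordinates above (divided by im_scale), so the crop below is taken from
# the full-resolution ground-truth mask and then resized to
# mask_size x mask_size; rows that never receive a mask keep the -1 fill.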
for i in fg_inds:
if mask_flags[i] > 0:
box_mask = \
mask_util.intersect_box_mask(
ex_rois_ori[i],
gt_rois_ori[i],
gt_masks[i],
)
if box_mask is not None:
mask_targets[i] = \
mask_util.resize_mask(
mask=box_mask,
size=mask_shape,
)
return bbox_targets, inside_weights, outside_weights, mask_targets
def sample_rois(
all_rois,
gt_boxes,
gt_masks,
num_rois,
num_fg_rois,
num_classes,
im_scale,
):
"""Sample a batch of RoIs comprising foreground and background examples."""
overlaps = box_util.bbox_overlaps(all_rois[:, 1:5], gt_boxes[:, :4])
gt_assignment = overlaps.argmax(axis=1)
max_overlaps = overlaps.max(axis=1)
labels = gt_boxes[gt_assignment, 4]
# Select foreground RoIs as those with >= FG_THRESH overlap
fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0]
fg_rois_per_this_image = int(min(num_fg_rois, fg_inds.size))
# Sample foreground regions without replacement
if fg_inds.size > 0:
fg_inds = npr.choice(fg_inds, fg_rois_per_this_image, False)
# Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI) &
(max_overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]
# Compute number of background RoIs to take from this image
bg_rois_per_this_image = num_rois - fg_rois_per_this_image
bg_rois_per_this_image = min(bg_rois_per_this_image, bg_inds.size)
# Sample background regions without replacement
if bg_inds.size > 0:
bg_inds = npr.choice(bg_inds, bg_rois_per_this_image, False)
# The indices that we're selecting (both fg and bg)
keep_inds = np.append(fg_inds, bg_inds)
# Select sampled values from various arrays
rois, labels = all_rois[keep_inds], labels[keep_inds]
# Clamp labels for the background RoIs to 0
labels[fg_rois_per_this_image:] = 0
# Clamp the image indices for the background RoIs to -1
rois[fg_rois_per_this_image:, 0] = -1
# Compute the target from RoIs
outputs = [rois, labels]
outputs += get_targets(
rois[:, 1:5],
gt_boxes[gt_assignment[keep_inds], :4],
labels,
gt_masks[gt_assignment[fg_inds]],
gt_boxes[gt_assignment[fg_inds], 5],
cfg.MRCNN.RESOLUTION,
num_classes,
im_scale,
)
return outputs
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon.vm.torch as torch
import numpy as np
from lib.core.config import cfg
from lib.faster_rcnn import map_rois_to_levels
from lib.faster_rcnn import map_blobs_to_outputs
from lib.modeling.detector import new_detector
from lib.nms import nms_wrapper
from lib.utils import framework
from lib.utils import time_util
from lib.utils import boxes as box_util
from lib.utils.blob import im_list_to_blob
from lib.utils.image import scale_image
def im_detect(detector, raw_image):
"""Detect a image, with single or multiple scales."""
ims, ims_scale = scale_image(raw_image)
# Prepare blobs
blobs = {'data': im_list_to_blob(ims)}
blobs['ims_info'] = np.array([
list(blobs['data'].shape[1:3]) + [im_scale]
for im_scale in ims_scale
], dtype=np.float32)
# Do Forward
if not hasattr(detector, 'graph'):
with framework.new_workspace().as_default():
data = torch.from_numpy(blobs['data'])
ims_info = torch.from_numpy(blobs['ims_info'])
with torch.no_grad():
with torch.jit.Tracer(retain_ops=True):
inputs = {'data': data, 'ims_info': ims_info}
outputs = detector.forward(inputs)
detector.graph = \
framework.Graph(inputs, {
'rois': outputs['rois'],
'cls_prob': outputs['cls_prob'],
'bbox_pred': outputs['bbox_pred']
})
outputs = detector.graph(**blobs)
# Decode results
rois = outputs['rois']
scores, boxes, batch_inds = [], [], []
pred_boxes = \
box_util.bbox_transform_inv(
rois[:, 1:5],
outputs['bbox_pred'],
cfg.BBOX_REG_WEIGHTS,
)
for i in range(len(ims)):
inds = np.where(rois[:, 0].astype(np.int32) == i)[0]
im_boxes = pred_boxes[inds] / ims_scale[i]
scores.append(outputs['cls_prob'][inds])
boxes.append(box_util.clip_tiled_boxes(im_boxes, raw_image.shape))
batch_inds.append(np.ones((len(inds), 1), 'int32') * i)
return (
np.vstack(scores) if len(ims) > 1 else scores[0],
np.vstack(boxes) if len(ims) > 1 else boxes[0],
np.vstack(batch_inds) if len(ims) > 1 else batch_inds[0],
np.array(ims_scale, 'float64'),
)
def mask_detect(detector, rois):
k_min = cfg.FPN.ROI_MIN_LEVEL
k_max = cfg.FPN.ROI_MAX_LEVEL
k = k_max - k_min + 1
levels = map_rois_to_levels(rois, k_min, k_max)
level_inds = [np.where(levels == (i + k_min))[0] for i in range(k)]
fpn_rois = map_blobs_to_outputs(
{'rois': rois[:, :5]},
{'rois': np.array([[-1, 0, 0, 1, 1]], 'float32')},
level_inds)['rois']
workspace = detector.graph.workspace
placeholders = detector.graph.placeholders
score_fn = detector.rcnn.compute_mask_score
with workspace.as_default():
if 'rois' not in placeholders:
placeholders['rois'] = \
[framework.new_placeholder(cfg.GPU_ID) for _ in range(k)]
placeholders['mask_inds'] = \
framework.new_placeholder(cfg.GPU_ID)
for i, v in enumerate(fpn_rois):
framework.feed_tensor(placeholders['rois'][i], v.astype('float32'))
with torch.no_grad():
mask_score = score_fn(rois=placeholders['rois'])
nc, i = mask_score.shape[1], 0
mask_inds = {}
for inds in level_inds:
for idx in inds:
cls = int(rois[idx, 5])
mask_inds[idx] = (i * nc + cls)
i += 1
if len(inds) == 0:
i += 1
mask_inds = list(map(mask_inds.get, sorted(mask_inds)))
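# Illustrative note: the RoIs were fed to the mask head level by level in
# the same order used to build fpn_rois, so row i of mask_score corresponds
# to the i-th RoI in that traversal; empty levels were padded with a dummy
# RoI, hence the extra "i += 1" that skips the dummy's score row.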
framework.feed_tensor(
placeholders['mask_inds'],
np.array(mask_inds, 'int64'),
)
with torch.no_grad():
mask_pred = mask_score.index_select(
(0, 1), placeholders['mask_inds'])
return detector.rcnn.sigmoid(mask_pred).numpy(True).copy()
def test_net(weights, num_classes, q_in, q_out, device):
cfg.GPU_ID = device
detector = new_detector(device, weights)
_t = time_util.new_timers('im_detect', 'mask_detect', 'misc')
while True:
idx, raw_image = q_in.get()
if raw_image is None:
break
rois_this_image = []
boxes_this_image = [[]]
masks_this_image = [[]]
with _t['im_detect'].tic_and_toc():
scores, boxes, batch_inds, ims_scale = \
im_detect(detector, raw_image)
_t['misc'].tic()
for j in range(1, num_classes):
inds = np.where(scores[:, j] > cfg.TEST.SCORE_THRESH)[0]
cls_scores = scores[inds, j]
cls_boxes = boxes[inds, j * 4:(j + 1) * 4]
cls_batch_inds = batch_inds[inds]
cls_detections = np.hstack(
(cls_boxes, cls_scores[:, np.newaxis])
).astype(np.float32, copy=False)
if cfg.TEST.USE_SOFT_NMS:
keep = nms_wrapper.soft_nms(
cls_detections,
thresh=cfg.TEST.NMS,
method=cfg.TEST.SOFT_NMS_METHOD,
sigma=cfg.TEST.SOFT_NMS_SIGMA,
)
else:
keep = nms_wrapper.nms(
cls_detections,
thresh=cfg.TEST.NMS,
force_cpu=True,
)
cls_detections = cls_detections[keep, :]
cls_batch_inds = cls_batch_inds[keep]
boxes_this_image.append(cls_detections)
rois_this_image.append(
np.hstack((
cls_batch_inds,
cls_detections[:, :4] * ims_scale[cls_batch_inds],
np.ones((len(keep), 1)) * (j - 1),
)))
mask_rois = np.concatenate(rois_this_image)
_t['misc'].toc()
if len(mask_rois) > 0:
k = 0
_t['mask_detect'].tic()
mask_pred = mask_detect(detector, mask_rois)
for j in range(1, num_classes):
num_pred = len(boxes_this_image[j])
cls_masks = mask_pred[k:k + num_pred]
masks_this_image.append(cls_masks)
k += num_pred
_t['mask_detect'].toc()
q_out.put((
idx,
{
'im_detect': _t['im_detect'].average_time,
'mask_detect': _t['mask_detect'].average_time,
'misc': _t['misc'].average_time,
},
{
'boxes': boxes_this_image,
'masks': masks_this_image,
},
))
......@@ -14,12 +14,9 @@ from __future__ import division
from __future__ import print_function
# Import custom modules
from lib.modeling.base import affine
from lib.modeling.base import bn
from lib.modeling.base import conv1x1
from lib.modeling.base import conv3x3
from lib.modeling.fast_rcnn import FastRCNN
from lib.modeling.fpn import FPN
from lib.modeling.mask_rcnn import MaskRCNN
from lib.modeling.retinanet import RetinaNet
from lib.modeling.rpn import RPN
from lib.modeling.ssd import SSD
......@@ -15,20 +15,19 @@ from __future__ import print_function
import dragon.vm.torch as torch
from lib.modeling import affine
from lib.modeling import conv1x1
from lib.modeling import conv3x3
from lib.modules import init
from lib.modules import nn
class WideResBlock(torch.nn.Module):
class WideResBlock(nn.Module):
def __init__(self, dim_in, dim_out, stride=1, downsample=None):
super(WideResBlock, self).__init__()
self.conv1 = conv3x3(dim_in, dim_out, stride)
self.bn1 = affine(dim_out)
self.conv2 = conv3x3(dim_out, dim_out)
self.bn2 = affine(dim_out)
self.conv1 = nn.Conv3x3(dim_in, dim_out, stride)
self.bn1 = nn.Affine(dim_out)
self.conv2 = nn.Conv3x3(dim_out, dim_out)
self.bn2 = nn.Affine(dim_out)
self.downsample = downsample
self.relu = torch.nn.ReLU(inplace=True)
self.relu = nn.ReLU(inplace=True)
def forward(self, x):
residual = x
......@@ -48,20 +47,20 @@ class WideResBlock(torch.nn.Module):
return out
class InceptionBlock(torch.nn.Module):
class InceptionBlock(nn.Module):
def __init__(self, dim_in, dim_out):
super(InceptionBlock, self).__init__()
self.conv1 = conv1x1(dim_in, dim_out)
self.bn1 = affine(dim_out)
self.conv2 = conv3x3(dim_out, dim_out // 2)
self.bn2 = affine(dim_out // 2)
self.conv3a = conv3x3(dim_out // 2, dim_out)
self.bn3a = affine(dim_out)
self.conv3b = conv3x3(dim_out, dim_out)
self.bn3b = affine(dim_out)
self.conv4 = conv3x3(dim_out * 3, dim_out)
self.bn4 = affine(dim_out)
self.relu = torch.nn.ReLU(inplace=True)
self.conv1 = nn.Conv1x1(dim_in, dim_out)
self.bn1 = nn.Affine(dim_out)
self.conv2 = nn.Conv3x3(dim_out, dim_out // 2)
self.bn2 = nn.Affine(dim_out // 2)
self.conv3a = nn.Conv3x3(dim_out // 2, dim_out)
self.bn3a = nn.Affine(dim_out)
self.conv3b = nn.Conv3x3(dim_out, dim_out)
self.bn3b = nn.Affine(dim_out)
self.conv4 = nn.Conv3x3(dim_out * 3, dim_out)
self.bn4 = nn.Affine(dim_out)
self.relu = nn.ReLU(inplace=True)
def forward(self, x):
residual = x
......@@ -82,7 +81,7 @@ class InceptionBlock(torch.nn.Module):
out_3x3_b = self.bn3b(out)
out_3x3_b = self.relu(out_3x3_b)
out = torch.cat([out_1x1, out_3x3_a, out_3x3_b], dim=1)
out = torch.cat([out_1x1, out_3x3_a, out_3x3_b], 1)
out = self.conv4(out)
out = self.bn4(out)
......@@ -91,22 +90,22 @@ class InceptionBlock(torch.nn.Module):
return out
class AirNet(torch.nn.Module):
class AirNet(nn.Module):
def __init__(self, blocks, num_stages):
super(AirNet, self).__init__()
self.dim_in, filters = 64, [64, 128, 256, 384]
self.feature_dims = [None, None] + \
filters[1:num_stages - 1]
self.conv1 = torch.nn.Conv2d(
self.conv1 = nn.Conv2d(
3, 64,
kernel_size=7,
stride=2,
padding=3,
bias=False,
)
self.bn1 = affine(self.dim_in)
self.relu = torch.nn.ReLU(inplace=True)
self.maxpool = torch.nn.MaxPool2d(
self.bn1 = nn.Affine(self.dim_in)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(
kernel_size=2,
stride=2,
padding=0,
......@@ -121,19 +120,14 @@ class AirNet(torch.nn.Module):
self.reset_parameters()
def reset_parameters(self):
# The Kaiming Initialization
for m in self.modules():
if isinstance(m, torch.nn.Conv2d):
torch.nn.init.kaiming_uniform_(
m.weight,
# Fix the gain for [-127, 127]
a=1,
) # Xavier Initialization
if isinstance(m, nn.Conv2d):
init.xaiver(m.weight)
def make_blocks(self, dim_out, blocks, stride=1):
downsample = torch.nn.Sequential(
conv1x1(self.dim_in, dim_out, stride=stride),
affine(dim_out),
downsample = nn.Sequential(
nn.Conv1x1(self.dim_in, dim_out, stride=stride),
nn.Affine(dim_out),
)
layers = [WideResBlock(self.dim_in, dim_out, stride, downsample)]
self.dim_in = dim_out
......@@ -144,7 +138,7 @@ class AirNet(torch.nn.Module):
layers.append(InceptionBlock(dim_out, dim_out))
else:
raise ValueError('Unknown block flag: ' + blocks[i])
return torch.nn.Sequential(*layers)
return nn.Sequential(*layers)
def forward(self, x):
x = self.conv1(x)
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Define some basic structures."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon.vm.torch as torch
def affine(dim_in, inplace=True):
"""AffineBN, weight and bias are fixed."""
return torch.nn.Affine(
dim_in,
fix_weight=True,
fix_bias=True,
inplace=inplace,
)
def bn(dim_in, eps=1e-5):
"""The BatchNorm."""
return torch.nn.BatchNorm2d(dim_in, eps=eps)
def conv1x1(dim_in, dim_out, stride=1, bias=False):
"""1x1 convolution."""
return torch.nn.Conv2d(
dim_in,
dim_out,
kernel_size=1,
stride=stride,
bias=bias,
)
def conv3x3(dim_in, dim_out, stride=1, bias=False):
"""3x3 convolution with padding."""
return torch.nn.Conv2d(
dim_in,
dim_out,
kernel_size=3,
stride=stride,
padding=1,
bias=bias,
)
......@@ -21,14 +21,16 @@ from lib.core.config import cfg
from lib.modeling import FPN
from lib.modeling import RPN
from lib.modeling import FastRCNN
from lib.modeling import MaskRCNN
from lib.modeling import RetinaNet
from lib.modeling import SSD
from lib.modeling.factory import get_body_func
from lib.ops.modules import Bootstrap
from lib.utils.logger import is_root
from lib.modules import nn
from lib.modules import vision
from lib.utils import logger
class Detector(torch.nn.Module):
class Detector(nn.Module):
"""Organize the detection pipelines.
A bunch of classic algorithms are integrated, see the
......@@ -42,19 +44,20 @@ class Detector(torch.nn.Module):
backbone = cfg.MODEL.BACKBONE.lower().split('.')
body, modules = backbone[0], backbone[1:]
# + Data Loader
self.data_layer = importlib.import_module(
'lib.{}'.format(model)).DataLayer
self.bootstrap = Bootstrap()
# + DataLoader
self.data_loader_cls = importlib.import_module(
'lib.{}'.format(model)).DataLoader
self.bootstrap = vision.Bootstrap()
# + Feature Extractor
# + FeatureExtractor
self.body = get_body_func(body)()
feature_dims = self.body.feature_dims
# + Feature Enhancer
# + FeatureEnhancer
if 'fpn' in modules:
self.fpn = FPN(feature_dims)
feature_dims = self.fpn.feature_dims
elif 'mbox' in modules:
pass # Placeholder
else:
......@@ -63,7 +66,10 @@ class Detector(torch.nn.Module):
# + Detection Modules
if 'rcnn' in model:
self.rpn = RPN(feature_dims[0])
self.fast_rcnn = FastRCNN(feature_dims[0])
if 'faster' in model:
self.rcnn = FastRCNN(feature_dims[0])
elif 'mask' in model:
self.rcnn = MaskRCNN(feature_dims[0])
if 'retinanet' in model:
self.retinanet = RetinaNet(feature_dims[0])
......@@ -85,7 +91,7 @@ class Detector(torch.nn.Module):
self.load_state_dict(
torch.load(weights),
strict=False,
verbose=is_root(),
verbose=logger.is_root(),
)
def forward(self, inputs=None):
......@@ -107,7 +113,7 @@ class Detector(torch.nn.Module):
# 1) Training: <= DataLayer
# 2) Inference: <= Given
if not hasattr(self, 'data_loader'):
self.data_loader = self.data_layer()
self.data_loader = self.data_loader_cls()
inputs = self.data_loader()
# 1. Extract features
......@@ -126,7 +132,7 @@ class Detector(torch.nn.Module):
# 3. Collect detection outputs
outputs = collections.OrderedDict()
# 3.1 Feature -> RPN -> Fast R-CNN
# 3.1 Feature -> RPN -> R-CNN
if hasattr(self, 'rpn'):
outputs.update(
self.rpn(
......@@ -135,7 +141,7 @@ class Detector(torch.nn.Module):
)
)
outputs.update(
self.fast_rcnn(
self.rcnn(
features=features,
rpn_cls_score=outputs['rpn_cls_score'],
rpn_bbox_pred=outputs['rpn_bbox_pred'],
......@@ -174,8 +180,8 @@ class Detector(torch.nn.Module):
##################################
last_module = None
for e in self.modules():
if isinstance(e, torch.nn.Affine) and \
isinstance(last_module, torch.nn.Conv2d):
if isinstance(e, nn.Affine) and \
isinstance(last_module, nn.Conv2d):
if last_module.bias is None:
delattr(last_module, 'bias')
e.forward = lambda x: x
......@@ -188,8 +194,8 @@ class Detector(torch.nn.Module):
######################################
last_module = None
for e in self.modules():
if isinstance(e, torch.nn.BatchNorm2d) and \
isinstance(last_module, torch.nn.Conv2d):
if isinstance(e, nn.BatchNorm2d) and \
nn.is_conv2d(last_module):
if last_module.bias is None:
delattr(last_module, 'bias')
e.forward = lambda x: x
......@@ -204,3 +210,17 @@ class Detector(torch.nn.Module):
else:
last_module.weight.data.mul_(term)
last_module = e
def new_detector(device, weights=None, training=False):
detector = Detector().cuda(device)
if weights is not None:
detector.load_weights(weights)
if not training:
detector.eval()
detector.optimize_for_inference()
# Enable the fp16 inference support if necessary
# Boost a little if TensorCore is available
if cfg.MODEL.PRECISION.lower() == 'float16':
detector.half()
return detector
......@@ -43,14 +43,20 @@ for D in ['', '3b', '4b', '5b']:
_STORE['BODY']['airnet{}'.format(D)] = \
'lib.modeling.airnet.make_airnet_{}'.format(D)
# MobileNet
for D in ['a1', 'v2']:
_STORE['BODY']['mobilenet_{}'.format(D)] = \
'lib.modeling.mobilenet.make_mobilenet_{}'.format(D)
def get_template_func(name, sets, desc):
name = name.lower()
if name not in sets:
raise ValueError(
'The {} for {} was not registered.\n'
'Registered modules: [{}]'.format(
name, desc, ', '.join(sets.keys())))
'Registered modules: [{}]'
.format(name, desc, ', '.join(sets.keys()))
)
module_name = '.'.join(sets[name].split('.')[0:-1])
func_name = sets[name].split('.')[-1]
try:
......
......@@ -14,13 +14,19 @@ from __future__ import division
from __future__ import print_function
import collections
import functools
import dragon.vm.torch as torch
from lib import faster_rcnn
from lib.core.config import cfg
from lib.ops.modules import RPNDecoder
from lib.modules import det
from lib.modules import init
from lib.modules import nn
from lib.modules import vision
class FastRCNN(torch.nn.Module):
class FastRCNN(nn.Module):
"""Generate proposal regions for R-CNN series.
The pipeline is as follows:
......@@ -32,59 +38,45 @@ class FastRCNN(torch.nn.Module):
"""
def __init__(self, dim_in=256):
super(FastRCNN, self).__init__()
if len(cfg.RPN.STRIDES) > 1:
# RPN with multiple strides(i.e. FPN)
from lib.fpn import ProposalLayer, ProposalTargetLayer
else:
# RPN with single stride(i.e. C4)
from lib.faster_rcnn import ProposalLayer, ProposalTargetLayer
self.roi_head_dim = dim_in * (cfg.FRCNN.ROI_XFORM_RESOLUTION ** 2)
self.fc6 = torch.nn.Linear(self.roi_head_dim, cfg.FRCNN.MLP_HEAD_DIM)
self.fc7 = torch.nn.Linear(cfg.FRCNN.MLP_HEAD_DIM, cfg.FRCNN.MLP_HEAD_DIM)
self.cls_score = torch.nn.Linear(cfg.FRCNN.MLP_HEAD_DIM, cfg.MODEL.NUM_CLASSES)
self.bbox_pred = torch.nn.Linear(cfg.FRCNN.MLP_HEAD_DIM, cfg.MODEL.NUM_CLASSES * 4)
self.rpn_decoder = RPNDecoder()
self.proposal_layer = ProposalLayer()
self.proposal_target_layer = ProposalTargetLayer()
self.softmax = torch.nn.Softmax(dim=1)
self.relu = torch.nn.ReLU(inplace=True)
self.sigmoid = torch.nn.Sigmoid(inplace=False)
self.roi_func = {
'RoIPool': torch.vision.ops.roi_pool,
'RoIAlign': torch.vision.ops.roi_align,
}[cfg.FRCNN.ROI_XFORM_METHOD]
self.cls_loss = torch.nn.CrossEntropyLoss(ignore_index=-1)
self.bbox_loss = torch.nn.SmoothL1Loss(reduction='batch_size')
# Compute spatial scales for multiple strides
roi_levels = [level for level in range(
cfg.FPN.ROI_MIN_LEVEL, cfg.FPN.ROI_MAX_LEVEL + 1)]
self.spatial_scales = [1.0 / (2 ** level) for level in roi_levels]
self.fc6 = nn.Linear(self.roi_head_dim, cfg.FRCNN.MLP_HEAD_DIM)
self.fc7 = nn.Linear(cfg.FRCNN.MLP_HEAD_DIM, cfg.FRCNN.MLP_HEAD_DIM)
self.cls_score = nn.Linear(cfg.FRCNN.MLP_HEAD_DIM, cfg.MODEL.NUM_CLASSES)
self.bbox_pred = nn.Linear(cfg.FRCNN.MLP_HEAD_DIM, cfg.MODEL.NUM_CLASSES * 4)
self.rpn_decoder = det.RPNDecoder()
self.proposal = faster_rcnn.Proposal()
self.proposal_target = faster_rcnn.ProposalTarget()
self.softmax = nn.Softmax(dim=1)
self.relu = nn.ReLU(inplace=True)
self.sigmoid = nn.Sigmoid()
self.box_roi_feature = functools.partial({
'RoIPool': vision.roi_pool,
'RoIAlign': vision.roi_align
}[cfg.FRCNN.ROI_XFORM_METHOD], size=cfg.FRCNN.ROI_XFORM_RESOLUTION)
self.cls_loss = nn.CrossEntropyLoss()
self.bbox_loss = nn.SmoothL1Loss()
# Compute spatial scales according to strides
self.spatial_scales = [
1. / (2 ** lvl)
for lvl in range(
cfg.FPN.ROI_MIN_LEVEL,
cfg.FPN.ROI_MAX_LEVEL + 1
)]
self.reset_parameters()
def reset_parameters(self):
# Careful initialization for Fast R-CNN
torch.nn.init.normal_(self.cls_score.weight, std=0.01)
torch.nn.init.normal_(self.bbox_pred.weight, std=0.001)
init.normal(self.cls_score.weight, std=0.01)
init.normal(self.bbox_pred.weight, std=0.001)
for name, p in self.named_parameters():
if 'bias' in name:
torch.nn.init.constant_(p, 0)
def RoIFeatureTransform(self, feature, rois, spatial_scale):
return self.roi_func(
feature, rois,
output_size=(
cfg.FRCNN.ROI_XFORM_RESOLUTION,
cfg.FRCNN.ROI_XFORM_RESOLUTION,
),
spatial_scale=spatial_scale,
)
init.constant(p, 0)
def forward(self, **kwargs):
# Generate Proposals
# Apply the CXX implementation during inference
proposal_func = self.proposal_layer \
# Generate proposals
proposal_func = self.proposal \
if self.training else self.rpn_decoder
self.rcnn_data = {
self.data = {
'rois': proposal_func(
kwargs['features'],
self.sigmoid(kwargs['rpn_cls_score'].data),
......@@ -93,66 +85,61 @@ class FastRCNN(torch.nn.Module):
)
}
# Generate Targets from Proposals
# Generate targets from proposals
if self.training:
self.rcnn_data.update(
self.proposal_target_layer(
rpn_rois=self.rcnn_data['rois'],
self.data.update(
self.proposal_target(
rpn_rois=self.data['rois'],
gt_boxes=kwargs['gt_boxes'],
)
)
# Transform RoI Feature
roi_features = []
if len(self.rcnn_data['rois']) > 1:
for i, spatial_scale in enumerate(self.spatial_scales):
roi_features.append(
self.RoIFeatureTransform(
# Transform RoI features
if len(self.data['rois']) > 1:
roi_features = \
torch.cat([
self.box_roi_feature(
kwargs['features'][i],
self.rcnn_data['rois'][i],
self.data['rois'][i],
spatial_scale,
)
)
roi_features = torch.cat(roi_features, dim=0)
) for i, spatial_scale in enumerate(self.spatial_scales)
], dim=0)
else:
spatial_scale = 1.0 / cfg.RPN.STRIDES[0]
roi_features = \
self.RoIFeatureTransform(
self.box_roi_feature(
kwargs['features'][0],
self.rcnn_data['rois'][0],
spatial_scale,
self.data['rois'][0],
1. / cfg.RPN.STRIDES[0],
)
# Apply a simple MLP
roi_features = roi_features.view(-1, self.roi_head_dim)
rcnn_output = self.relu(self.fc6(roi_features))
rcnn_output = self.relu(self.fc7(rcnn_output))
roi_features = self.relu(self.fc6(roi_features))
roi_features = self.relu(self.fc7(roi_features))
# Compute rcnn logits
cls_score = self.cls_score(rcnn_output).float()
outputs = collections.OrderedDict([
('bbox_pred', self.bbox_pred(rcnn_output).float()),
])
# Compute logits and losses
outputs = collections.OrderedDict()
cls_score = self.cls_score(roi_features).float()
outputs['bbox_pred'] = self.bbox_pred(roi_features).float()
if self.training:
# Compute rcnn losses
outputs.update(collections.OrderedDict([
('cls_loss', self.cls_loss(
cls_score, self.rcnn_data['labels'])),
cls_score, self.data['labels'])),
('bbox_loss', self.bbox_loss(
outputs['bbox_pred'],
self.rcnn_data['bbox_targets'],
self.rcnn_data['bbox_inside_weights'],
self.rcnn_data['bbox_outside_weights'],
self.data['bbox_targets'],
self.data['bbox_inside_weights'],
self.data['bbox_outside_weights'],
)),
]))
else:
# Return the rois to decode the refine boxes
if len(self.rcnn_data['rois']) > 1:
outputs['rois'] = torch.cat(
self.rcnn_data['rois'], dim=0)
if len(self.data['rois']) > 1:
outputs['rois'] = torch.cat(self.data['rois'], 0)
else:
outputs['rois'] = self.rcnn_data['rois'][0]
outputs['rois'] = self.data['rois'][0]
# Return the classification prob
outputs['cls_prob'] = self.softmax(cls_score)
......
......@@ -16,43 +16,41 @@ from __future__ import print_function
import dragon.vm.torch as torch
from lib.core.config import cfg
from lib.modeling import conv1x1
from lib.modeling import conv3x3
from lib.modules import init
from lib.modules import nn
HIGHEST_BACKBONE_LVL = 5 # E.g., "conv5"-like level
class FPN(torch.nn.Module):
class FPN(nn.Module):
"""Feature Pyramid Networks for R-CNN and RetinaNet."""
def __init__(self, feature_dims):
super(FPN, self).__init__()
self.C = torch.nn.ModuleList()
self.P = torch.nn.ModuleList()
dim = cfg.FPN.DIM
self.C = nn.ModuleList()
self.P = nn.ModuleList()
for lvl in range(cfg.FPN.RPN_MIN_LEVEL, HIGHEST_BACKBONE_LVL + 1):
self.C.append(conv1x1(feature_dims[lvl - 1], cfg.FPN.DIM, bias=True))
self.P.append(conv3x3(cfg.FPN.DIM, cfg.FPN.DIM, bias=True))
self.C.append(nn.Conv1x1(feature_dims[lvl - 1], dim, bias=True))
self.P.append(nn.Conv3x3(dim, dim, bias=True))
if 'rcnn' in cfg.MODEL.TYPE:
self.apply_func = self.apply_on_rcnn
self.maxpool = torch.nn.MaxPool2d(1, 2, ceil_mode=True)
self.maxpool = nn.MaxPool2d(1, 2, ceil_mode=True)
else:
self.apply_func = self.apply_on_generic
self.relu = torch.nn.ReLU(inplace=False)
self.relu = nn.ReLU(inplace=False)
for lvl in range(HIGHEST_BACKBONE_LVL + 1, cfg.FPN.RPN_MAX_LEVEL + 1):
dim_in = feature_dims[-1] if lvl == HIGHEST_BACKBONE_LVL + 1 else cfg.FPN.DIM
self.P.append(conv3x3(dim_in, cfg.FPN.DIM, stride=2, bias=True))
dim_in = feature_dims[-1] if lvl == HIGHEST_BACKBONE_LVL + 1 else dim
self.P.append(nn.Conv3x3(dim_in, dim, stride=2, bias=True))
self.feature_dims = [dim]
self.reset_parameters()
self.feature_dims = [cfg.FPN.DIM]
def reset_parameters(self):
for m in self.modules():
if isinstance(m, torch.nn.Conv2d):
torch.nn.init.kaiming_uniform_(
m.weight,
a=1, # Fix the gain for [-127, 127]
) # Xavier Initialization
torch.nn.init.constant_(m.bias, 0)
if isinstance(m, nn.Conv2d):
init.xaiver(m.weight)
init.constant(m.bias, 0)
def apply_on_rcnn(self, features):
fpn_input = self.C[-1](features[-1])
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import functools
import dragon.vm.torch as torch
from lib import mask_rcnn
from lib.core.config import cfg
from lib.modules import det
from lib.modules import init
from lib.modules import nn
from lib.modules import vision
class MaskRCNN(nn.Module):
def __init__(self, dim_in=256):
"""Generate mask regions for R-CNN series.
The pipeline is as follows:
... -> BoxRoIs \ /-> cls_score -> cls_loss
-> RoIFeatureXform -> MLP
... -> Features / \-> bbox_pred -> bbox_loss
... -> MaskRoIs \
-> RoIFeatureXform -> FCN -> mask_score -> mask_loss
... -> Features /
"""
super(MaskRCNN, self).__init__()
self.roi_head_dim = dim_in * (cfg.FRCNN.ROI_XFORM_RESOLUTION ** 2)
self.fc6 = nn.Linear(self.roi_head_dim, cfg.FRCNN.MLP_HEAD_DIM)
self.fc7 = nn.Linear(cfg.FRCNN.MLP_HEAD_DIM, cfg.FRCNN.MLP_HEAD_DIM)
self.fcn = nn.ModuleList([nn.Conv3x3(dim_in, dim_in, bias=True) for _ in range(4)])
self.fcn += [nn.ConvTranspose2d(dim_in, dim_in, 2, 2, 0)]
self.cls_score = nn.Linear(cfg.FRCNN.MLP_HEAD_DIM, cfg.MODEL.NUM_CLASSES)
self.bbox_pred = nn.Linear(cfg.FRCNN.MLP_HEAD_DIM, cfg.MODEL.NUM_CLASSES * 4)
self.mask_score = nn.Conv1x1(dim_in, cfg.MODEL.NUM_CLASSES - 1, bias=True)
self.rpn_decoder = det.RPNDecoder()
self.proposal = mask_rcnn.Proposal()
self.proposal_target = mask_rcnn.ProposalTarget()
self.sigmoid = nn.Sigmoid()
self.softmax = nn.Softmax(dim=1)
self.relu = nn.ReLU(True)
self.box_roi_feature = functools.partial({
'RoIPool': vision.roi_pool,
'RoIAlign': vision.roi_align,
}[cfg.FRCNN.ROI_XFORM_METHOD], size=cfg.FRCNN.ROI_XFORM_RESOLUTION)
self.mask_roi_feature = functools.partial({
'RoIPool': vision.roi_pool,
'RoIAlign': vision.roi_align,
}[cfg.MRCNN.ROI_XFORM_METHOD], size=cfg.MRCNN.ROI_XFORM_RESOLUTION)
self.cls_loss = nn.CrossEntropyLoss()
self.bbox_loss = nn.SmoothL1Loss()
self.mask_loss = nn.BCEWithLogitsLoss()
# Compute spatial scales according to strides
self.spatial_scales = [
1. / (2 ** lvl)
for lvl in range(
cfg.FPN.ROI_MIN_LEVEL,
cfg.FPN.ROI_MAX_LEVEL + 1
)]
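# Illustrative example: with ROI_MIN_LEVEL = 2 and ROI_MAX_LEVEL = 5,
# spatial_scales == [1/4, 1/8, 1/16, 1/32], i.e. FPN levels P2..P5.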
self.reset_parameters()
def reset_parameters(self):
# Careful initialization for Fast R-CNN
init.normal(self.cls_score.weight, std=0.01)
init.normal(self.bbox_pred.weight, std=0.001)
# Careful initialization for Mask R-CNN
init.normal(self.mask_score.weight, std=0.001)
for m in self.fcn.modules():
if hasattr(m, 'weight'):
init.kaiming_normal(m.weight)
for name, p in self.named_parameters():
if 'bias' in name:
init.constant(p, 0)
def get_mask_score(self, features, rois):
roi_features = \
torch.cat([
self.mask_roi_feature(
features[i], rois[i], spatial_scale,
) for i, spatial_scale in enumerate(self.spatial_scales)
], dim=0)
for i in range(len(self.fcn)):
roi_features = self.relu(self.fcn[i](roi_features))
return self.mask_score(roi_features).float()
def forward(self, **kwargs):
# Generate proposals
proposal_func = self.proposal \
if self.training else self.rpn_decoder
self.data = {
'rois': proposal_func(
kwargs['features'],
self.sigmoid(kwargs['rpn_cls_score'].data),
kwargs['rpn_bbox_pred'],
kwargs['ims_info'],
)
}
# Generate targets from proposals
if self.training:
self.data.update(
self.proposal_target(
rpn_rois=self.data['rois'],
gt_boxes=kwargs['gt_boxes'],
gt_masks=kwargs['gt_masks'],
ims_info=kwargs['ims_info'],
)
)
# Transform RoI features
roi_features = \
torch.cat([
self.box_roi_feature(
kwargs['features'][i],
self.data['rois'][i],
spatial_scale,
) for i, spatial_scale in enumerate(self.spatial_scales)
], dim=0)
# Apply a simple MLP
roi_features = roi_features.view(-1, self.roi_head_dim)
roi_features = self.relu(self.fc6(roi_features))
roi_features = self.relu(self.fc7(roi_features))
# Compute logits and losses
outputs = collections.OrderedDict()
cls_score = self.cls_score(roi_features).float()
outputs['bbox_pred'] = self.bbox_pred(roi_features).float()
if self.training:
# Compute the loss of bbox branch
outputs.update(collections.OrderedDict([
('cls_loss', self.cls_loss(
cls_score, self.data['labels'])),
('bbox_loss', self.bbox_loss(
outputs['bbox_pred'],
self.data['bbox_targets'],
self.data['bbox_inside_weights'],
self.data['bbox_outside_weights'],
)),
]))
# Compute the loss of mask branch
mask_score = self.get_mask_score(
kwargs['features'], self.data['mask_rois'])
mask_score = mask_score.index_select(
(0, 1), self.data['mask_indices'])
outputs['mask_loss'] = self.mask_loss(
mask_score, self.data['mask_targets'])
else:
# Return the RoIs to decode the refined boxes
if len(self.data['rois']) > 1:
outputs['rois'] = torch.cat(self.data['rois'], 0)
else:
outputs['rois'] = self.data['rois'][0]
# Return the classification prob
outputs['cls_prob'] = self.softmax(cls_score)
# Set a callback to decode masks from the refined RoIs
self.compute_mask_score = \
functools.partial(
self.get_mask_score,
features=kwargs['features'],
)
return outputs
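# Illustrative usage sketch (added for clarity, not part of the original
# module). The kwargs below mirror what forward() reads above; the variable
# names are hypothetical placeholders for tensors produced by the backbone,
# FPN and RPN heads elsewhere in the pipeline.
#
#   head = MaskRCNN(dim_in=256)
#   outputs = head(
#       features=fpn_features,        # list of FPN maps, one per RoI level
#       rpn_cls_score=rpn_cls_score,  # RPN objectness logits (sigmoid applied inside)
#       rpn_bbox_pred=rpn_bbox_pred,  # RPN box deltas
#       ims_info=ims_info,            # per-image (height, width, scale)
#       gt_boxes=gt_boxes,            # training only
#       gt_masks=gt_masks,            # training only
#   )
#   # Training: outputs contains cls_loss, bbox_loss and mask_loss.
#   # Inference: outputs contains rois, cls_prob and bbox_pred, and
#   # head.compute_mask_score(rois=...) decodes masks for the refined boxes.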
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import dragon.vm.torch as torch
from lib.core.config import cfg
from lib.modules import init
from lib.modules import nn
from lib.modules import vision
def conv_triplet(dim_in, dim_out):
"""1x1 convolution + BN + ReLU."""
return [
nn.Conv2d(dim_in, dim_out, 1, bias=False),
nn.Affine(dim_out),
nn.ReLU(True),
]
def conv_quintet(dim_in, dim_out, ks, stride):
"""KxK convolution + BN + ReLU."""
return [
nn.DepthwiseConv2d(
dim_in, dim_in,
kernel_size=ks,
stride=stride,
padding=ks // 2,
bias=False,
),
nn.Affine(dim_in),
nn.ReLU(True),
nn.Conv1x1(dim_in, dim_out),
nn.Affine(dim_out),
]
class Setting(object):
V2 = (
[2, 3, 4, 3, 3, 1],
[2, 2, 2, 1, 2, 1],
[32, 16, 24, 32, 64, 96, 160, 320, 1280],
)
PROXYLESS_MOBILE = (
[4, 4, 4, 4, 4, 1],
[2, 2, 2, 1, 2, 1],
[32, 16, 32, 40, 80, 96, 192, 320, 1280],
)
PROXYLESS_GPU = (
[4, 4, 4, 4, 4, 1],
[2, 2, 2, 1, 2, 1],
[40, 24, 32, 56, 112, 128, 256, 432, 1280],
)
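# Note added for clarity: each preset above is a tuple of
# (repeats, strides, out_channels), where repeats[i] and strides[i] describe
# the i-th body stage and out_channels lists the
# [stem, stage1, body stages..., conv6] dimensions. For example,
# Setting.V2 stacks 2 + 3 + 4 + 3 + 3 + 1 = 16 choice blocks in total, which
# must match the length of the `choices` list given to NASMobileNet below.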
def Stem(dim_out, stride=1):
return torch.nn.Sequential(
torch.nn.Conv2d(
3, dim_out,
kernel_size=3,
stride=stride,
padding=1,
bias=False,
),
nn.Affine(dim_out),
nn.ReLU(True),
)
class Choice(nn.Module):
def __init__(self, dim_in, dim_out, mb=3, ks=3, stride=1):
super(Choice, self).__init__()
self.mb = mb
dim_hidden = int(round(dim_in * mb))
seq = conv_triplet(dim_in, dim_hidden) if mb != 1 else []
seq += conv_quintet(dim_hidden, dim_out, ks, stride)
self.conv = nn.ModuleList(seq)
self.stride = stride
self.apply_residual = stride == 1 and dim_in == dim_out
def forward(self, x):
residual = x if self.apply_residual else None
for i in range(3):
x = self.conv[i](x)
y = x if self.stride == 2 else None
for i in range(3, len(self.conv)):
x = self.conv[i](x)
if self.apply_residual:
return residual + x, y
else:
return x, y
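# Illustrative note (hypothetical numbers): Choice builds an inverted
# residual block, i.e. a 1x1 expansion (skipped when mb == 1), a ks x ks
# depthwise convolution and a 1x1 projection. For example,
#
#   block = Choice(32, 64, mb=6, ks=5, stride=2)
#
# expands 32 -> 192 channels, applies a strided 5x5 depthwise convolution,
# and projects down to 64 channels; forward() also returns the pre-stride
# feature so the FPN can tap stages that reduce resolution.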
class NASMobileNet(nn.Module):
def __init__(self, choices, preset=Setting.PROXYLESS_MOBILE):
super(NASMobileNet, self).__init__()
# Pre-defined blocks
def select_block(choice):
return {
0: functools.partial(Choice, mb=3, ks=3),
1: functools.partial(Choice, mb=6, ks=3),
2: functools.partial(Choice, mb=3, ks=5),
3: functools.partial(Choice, mb=6, ks=5),
4: functools.partial(Choice, mb=3, ks=7),
5: functools.partial(Choice, mb=6, ks=7),
6: nn.Identity,
}[choice]
# Hand-craft configurations
repeats, strides, out_channels = preset
names = ['2!', '3!', '4', '4!', '5', '5!']
self.num_layers = len(choices)
assert sum(repeats) == self.num_layers
# + Stem
self.bootstrap = vision.Bootstrap()
self.conv1 = Stem(out_channels[0], stride=2)
self.stage1 = Choice(out_channels[0], out_channels[1], mb=1, ks=3)
dim_in = out_channels[1]
self.feature_dims = [out_channels[-1]]
# + Body
self.layers = []
for name, rep, dim_out, stride in zip(
names, repeats, out_channels[2:], strides):
self.layers.append(select_block(
choices[len(self.layers)]
)(dim_in, dim_out, stride=stride))
if stride == 2:
self.feature_dims.insert(
-1, dim_in * self.layers[-1].mb)
for i in range(rep - 1):
self.layers.append(select_block(
choices[len(self.layers)]
)(dim_out, dim_out, stride=1))
fullname = 'stage%s' % name.split('!')[0]
seq = getattr(self, fullname, [])
seq += self.layers[-rep:]
seq = nn.Sequential(*seq) if '!' in name else seq
setattr(self, fullname, seq)
dim_in = dim_out
self.conv6 = nn.Sequential(*conv_triplet(dim_in, out_channels[-1]))
self.reset_parameters()
def reset_parameters(self):
for m in self.modules():
if nn.is_conv2d(m):
init.kaiming_normal(m.weight, 'fan_out')
if m.bias is not None:
init.constant(m.bias, 0)
elif isinstance(m, nn.BatchNorm2d):
init.constant(m.weight, 1)
elif isinstance(m, nn.Linear):
if m.bias is not None:
init.constant(m.bias, 0)
# Stop the gradients if necessary
def freeze_func(m):
if nn.is_conv2d(m):
m.weight.requires_grad = False
m._buffers['weight'] = m.weight
del m._parameters['weight']
if cfg.MODEL.FREEZE_AT > 0:
self.conv1.apply(freeze_func)
self.stage1.apply(freeze_func)
for i in range(cfg.MODEL.FREEZE_AT, 1, -1):
getattr(self, 'stage{}'.format(i)).apply(freeze_func)
def forward(self, x):
x = self.conv1(x)
x, _ = self.stage1(x)
outputs = []
for layer in self.layers:
x = layer(x)
x, y = x if isinstance(x, tuple) else (x, None)
if y is not None:
outputs.append(y)
outputs.append(self.conv6(x))
return outputs
def make_mobilenet_a1():
return NASMobileNet([
4, 6, 6, 6,
3, 3, 4, 6,
2, 4, 0, 4, 1, 5, 3, 5,
2, 4, 2, 4,
1,
], Setting.PROXYLESS_MOBILE)
def make_mobilenet_v2():
return NASMobileNet([
1, 1,
1, 1, 1,
1, 1, 1, 1, 1, 1, 1,
1, 1, 1,
1,
], Setting.V2)
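# Minimal sanity-check sketch (added, illustrative only): the choice list
# passed to NASMobileNet must cover every block defined by the preset.
def _check_choices(choices, preset=Setting.PROXYLESS_MOBILE):
    repeats, _, _ = preset
    assert sum(repeats) == len(choices), \
        'Expected {} choices, got {}.'.format(sum(repeats), len(choices))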
......@@ -20,12 +20,11 @@ from __future__ import print_function
import dragon.vm.torch as torch
from lib.core.config import cfg
from lib.modeling import affine
from lib.modeling import conv1x1
from lib.modeling import conv3x3
from lib.modules import nn
from lib.modules import init
class BasicBlock(torch.nn.Module):
class BasicBlock(nn.Module):
def __init__(
self,
dim_in,
......@@ -35,11 +34,11 @@ class BasicBlock(torch.nn.Module):
dropblock=None,
):
super(BasicBlock, self).__init__()
self.conv1 = conv3x3(dim_in, dim_out, stride)
self.bn1 = affine(dim_out)
self.conv1 = nn.Conv3x3(dim_in, dim_out, stride)
self.bn1 = nn.Affine(dim_out)
self.relu = torch.nn.ReLU(inplace=True)
self.conv2 = conv3x3(dim_out, dim_out)
self.bn2 = affine(dim_out)
self.conv2 = nn.Conv3x3(dim_out, dim_out)
self.bn2 = nn.Affine(dim_out)
self.downsample = downsample
self.dropblock = dropblock
......@@ -83,12 +82,12 @@ class Bottleneck(torch.nn.Module):
):
super(Bottleneck, self).__init__()
dim = int(dim_out * self.contraction)
self.conv1 = conv1x1(dim_in, dim)
self.bn1 = affine(dim)
self.conv2 = conv3x3(dim, dim, stride=stride)
self.bn2 = affine(dim)
self.conv3 = conv1x1(dim, dim_out)
self.bn3 = affine(dim_out)
self.conv1 = nn.Conv1x1(dim_in, dim)
self.bn1 = nn.Affine(dim)
self.conv2 = nn.Conv3x3(dim, dim, stride=stride)
self.bn2 = nn.Affine(dim)
self.conv3 = nn.Conv1x1(dim, dim_out)
self.bn3 = nn.Affine(dim_out)
self.relu = torch.nn.ReLU(inplace=True)
self.downsample = downsample
self.dropblock = dropblock
......@@ -133,7 +132,7 @@ class ResNet(torch.nn.Module):
padding=3,
bias=False,
)
self.bn1 = affine(self.dim_in)
self.bn1 = nn.Affine(self.dim_in)
self.relu = torch.nn.ReLU(inplace=True)
self.maxpool = torch.nn.MaxPool2d(
kernel_size=3,
......@@ -160,13 +159,9 @@ class ResNet(torch.nn.Module):
self.reset_parameters()
def reset_parameters(self):
# The Kaiming Initialization
for m in self.modules():
if isinstance(m, torch.nn.Conv2d):
torch.nn.init.kaiming_normal_(
m.weight,
nonlinearity='relu',
)
if isinstance(m, nn.Conv2d):
init.kaiming_normal(m.weight)
# Stop the gradients if necessary
def freeze_func(m):
......@@ -184,15 +179,15 @@ class ResNet(torch.nn.Module):
def make_blocks(self, block, dim_out, blocks, stride=1, dropblock=None):
downsample = None
if stride != 1 or self.dim_in != dim_out:
downsample = torch.nn.Sequential(
conv1x1(self.dim_in, dim_out, stride=stride),
affine(dim_out),
downsample = nn.Sequential(
nn.Conv1x1(self.dim_in, dim_out, stride=stride),
nn.Affine(dim_out),
)
layers = [block(self.dim_in, dim_out, stride, downsample, dropblock)]
self.dim_in = dim_out
for i in range(1, blocks):
layers.append(block(dim_out, dim_out, dropblock=dropblock))
return torch.nn.Sequential(*layers)
return nn.Sequential(*layers)
def forward(self, x):
x = self.conv1(x)
......
......@@ -17,13 +17,14 @@ import collections
import math
import dragon.vm.torch as torch
from lib import retinanet
from lib.core.config import cfg
from lib.modeling import conv3x3
from lib.ops.modules import RetinaNetDecoder
from lib.retinanet import AnchorTargetLayer
from lib.modules import det
from lib.modules import init
from lib.modules import nn
class RetinaNet(torch.nn.Module):
class RetinaNet(nn.Module):
def __init__(self, dim_in=256):
super(RetinaNet, self).__init__()
......@@ -32,34 +33,30 @@ class RetinaNet(torch.nn.Module):
########################################
self.cls_conv = torch.nn.ModuleList(
conv3x3(dim_in, dim_in, bias=True)
nn.Conv3x3(dim_in, dim_in, bias=True)
for _ in range(cfg.RETINANET.NUM_CONVS)
)
self.bbox_conv = torch.nn.ModuleList(
conv3x3(dim_in, dim_in, bias=True)
nn.Conv3x3(dim_in, dim_in, bias=True)
for _ in range(cfg.RETINANET.NUM_CONVS)
)
# Packed as [C, A] not [A, C]
self.C = cfg.MODEL.NUM_CLASSES - 1
A = len(cfg.RETINANET.ASPECT_RATIOS) * \
cfg.RETINANET.SCALES_PER_OCTAVE
self.cls_score = conv3x3(dim_in, self.C * A, bias=True)
self.bbox_pred = conv3x3(dim_in, 4 * A, bias=True)
self.cls_prob = torch.nn.Sigmoid(inplace=True)
self.relu = torch.nn.ReLU(inplace=True)
self.decoder = RetinaNetDecoder()
self.cls_score = nn.Conv3x3(dim_in, self.C * A, bias=True)
self.bbox_pred = nn.Conv3x3(dim_in, 4 * A, bias=True)
self.cls_prob = nn.Sigmoid(inplace=True)
self.relu = nn.ReLU(inplace=True)
self.decoder = det.RetinaNetDecoder()
########################################
# RetinaNet losses #
########################################
self.anchor_target_layer = AnchorTargetLayer()
self.cls_loss = torch.nn.SigmoidFocalLoss(
alpha=cfg.MODEL.FOCAL_LOSS_ALPHA,
gamma=cfg.MODEL.FOCAL_LOSS_GAMMA,
)
self.bbox_loss = torch.nn.SmoothL1Loss(
beta=.11, reduction='batch_size')
self.anchor_target = retinanet.AnchorTarget()
self.cls_loss = nn.SigmoidFocalLoss()
self.bbox_loss = nn.SmoothL1Loss(0.1111)
self.reset_parameters()
def reset_parameters(self):
......@@ -67,8 +64,8 @@ class RetinaNet(torch.nn.Module):
# Weight ~ Normal(0, 0.01)
for m in self.modules():
if isinstance(m, torch.nn.Conv2d):
torch.nn.init.normal_(m.weight, std=0.01)
torch.nn.init.constant_(m.bias, 0)
init.normal(m.weight, std=0.01)
init.constant(m.bias, 0)
# Bias prior initialization for Focal Loss
# For details, See the official codes:
......@@ -127,7 +124,7 @@ class RetinaNet(torch.nn.Module):
"""
self.retinanet_data = \
self.anchor_target_layer(
self.anchor_target(
features=features,
gt_boxes=gt_boxes,
ims_info=ims_info,
......
......@@ -16,12 +16,13 @@ from __future__ import print_function
import collections
import dragon.vm.torch as torch
from lib import faster_rcnn
from lib.core.config import cfg
from lib.modeling import conv1x1
from lib.modeling import conv3x3
from lib.modules import init
from lib.modules import nn
class RPN(torch.nn.Module):
class RPN(nn.Module):
"""Region Proposal Networks for R-CNN series."""
def __init__(self, dim_in=256):
......@@ -33,34 +34,26 @@ class RPN(torch.nn.Module):
num_anchors = len(cfg.RPN.ASPECT_RATIOS) * (
len(cfg.RPN.SCALES) if len(cfg.RPN.STRIDES) == 1 else 1)
self.output = conv3x3(dim_in, dim_in, bias=True)
self.cls_score = conv1x1(dim_in, num_anchors, bias=True)
self.bbox_pred = conv1x1(dim_in, num_anchors * 4, bias=True)
self.relu = torch.nn.ReLU(inplace=True)
self.output = nn.Conv3x3(dim_in, dim_in, bias=True)
self.cls_score = nn.Conv1x1(dim_in, num_anchors, bias=True)
self.bbox_pred = nn.Conv1x1(dim_in, num_anchors * 4, bias=True)
self.relu = nn.ReLU(inplace=True)
##################################
# RPN losses #
##################################
if len(cfg.RPN.STRIDES) > 1:
# RPN with multiple strides(i.e. FPN)
from lib.fpn.anchor_target_layer import AnchorTargetLayer
else:
# RPN with single stride(i.e. C4)
from lib.faster_rcnn.anchor_target_layer import AnchorTargetLayer
self.anchor_target_layer = AnchorTargetLayer()
self.cls_loss = torch.nn.BCEWithLogitsLoss()
self.bbox_loss = torch.nn.SmoothL1Loss(
beta=.11, reduction='batch_size')
self.anchor_target = faster_rcnn.AnchorTarget()
self.cls_loss = nn.BCEWithLogitsLoss()
self.bbox_loss = nn.SmoothL1Loss(0.1111)
self.reset_parameters()
def reset_parameters(self):
# Initialization for the RPN
# Weight ~ Normal(0, 0.01)
for m in self.modules():
if isinstance(m, torch.nn.Conv2d):
torch.nn.init.normal_(m.weight, std=0.01)
if isinstance(m, nn.Conv2d):
init.normal(m.weight, std=0.01)
def compute_outputs(self, features):
"""Compute the RPN logits.
......@@ -116,7 +109,7 @@ class RPN(torch.nn.Module):
"""
self.rpn_data = \
self.anchor_target_layer(
self.anchor_target(
features=features,
gt_boxes=gt_boxes,
ims_info=ims_info,
......
......@@ -16,15 +16,13 @@ from __future__ import print_function
import collections
import dragon.vm.torch as torch
from lib import ssd
from lib.core.config import cfg
from lib.modeling import conv3x3
from lib.ssd import HardMiningLayer
from lib.ssd import MultiBoxMatchLayer
from lib.ssd import MultiBoxTargetLayer
from lib.ssd import PriorBoxLayer
from lib.modules import init
from lib.modules import nn
class SSD(torch.nn.Module):
class SSD(nn.Module):
def __init__(self, feature_dims):
super(SSD, self).__init__()
......@@ -33,20 +31,19 @@ class SSD(torch.nn.Module):
########################################
self.cls_conv = torch.nn.ModuleList(
conv3x3(feature_dims[0], feature_dims[0], bias=True)
nn.Conv3x3(feature_dims[0], feature_dims[0], bias=True)
for _ in range(cfg.SSD.NUM_CONVS)
)
self.bbox_conv = torch.nn.ModuleList(
conv3x3(feature_dims[0], feature_dims[0], bias=True)
nn.Conv3x3(feature_dims[0], feature_dims[0], bias=True)
for _ in range(cfg.SSD.NUM_CONVS)
)
self.cls_score = torch.nn.ModuleList()
self.bbox_pred = torch.nn.ModuleList()
self.softmax = torch.nn.Softmax(dim=2)
self.relu = torch.nn.ReLU(inplace=True)
self.cls_score = nn.ModuleList()
self.bbox_pred = nn.ModuleList()
self.softmax = nn.Softmax(dim=2)
self.relu = nn.ReLU(inplace=True)
C = cfg.MODEL.NUM_CLASSES
self.box_dim = len(cfg.BBOX_REG_WEIGHTS)
if len(feature_dims) == 1 and \
len(feature_dims) != len(cfg.SSD.MULTIBOX.STRIDES):
......@@ -54,24 +51,22 @@ class SSD(torch.nn.Module):
feature_dims = list(filter(None, feature_dims))
for i, dim_in in enumerate(feature_dims):
A = len(cfg.SSD.MULTIBOX.ASPECT_RATIOS[i]) + 1
if self.box_dim == 5 and \
len(cfg.SSD.MULTIBOX.ASPECT_ANGLES) > 0:
A *= len(cfg.SSD.MULTIBOX.ASPECT_ANGLES)
self.cls_score.append(conv3x3(dim_in, A * C, bias=True))
self.bbox_pred.append(conv3x3(dim_in, A * self.box_dim, bias=True))
nc = cfg.MODEL.NUM_CLASSES
na = len(cfg.SSD.MULTIBOX.ASPECT_RATIOS[i]) + 1
self.cls_score.append(nn.Conv3x3(dim_in, na * nc, bias=True))
self.bbox_pred.append(nn.Conv3x3(dim_in, na * self.box_dim, bias=True))
self.prior_box_layer = PriorBoxLayer()
self.prior_box = ssd.PriorBox()
########################################
# SSD losses #
########################################
self.box_match_layer = MultiBoxMatchLayer()
self.hard_mining_layer = HardMiningLayer()
self.box_target_layer = MultiBoxTargetLayer()
self.cls_loss = torch.nn.CrossEntropyLoss(ignore_index=-1)
self.bbox_loss = torch.nn.SmoothL1Loss(reduction='batch_size')
self.box_match = ssd.MultiBoxMatch()
self.hard_mining = ssd.HardMining()
self.box_target = ssd.MultiBoxTarget()
self.cls_loss = nn.CrossEntropyLoss()
self.bbox_loss = nn.SmoothL1Loss()
self.reset_parameters()
def reset_parameters(self):
......@@ -79,16 +74,16 @@ class SSD(torch.nn.Module):
# Initialization following the RPN
# Weight ~ Normal(0, 0.01)
for m in self.modules():
if isinstance(m, torch.nn.Conv2d):
torch.nn.init.normal_(m.weight, std=0.01)
torch.nn.init.constant_(m.bias, 0)
if isinstance(m, nn.Conv2d):
init.normal(m.weight, std=0.01)
init.constant(m.bias, 0)
else:
# Careful Initialization
# Weight ~ Normal(0, 0.001)
for m in self.modules():
if isinstance(m, torch.nn.Conv2d):
torch.nn.init.normal_(m.weight, std=0.001)
torch.nn.init.constant_(m.bias, 0)
if isinstance(m, nn.Conv2d):
init.normal(m.weight, std=0.001)
init.constant(m.bias, 0)
def compute_outputs(self, features):
"""Compute the SSD logits.
......@@ -145,24 +140,24 @@ class SSD(torch.nn.Module):
"""
# Collect the SSD training data
# See the paper(Liu et al. 2016) for details
self.ssd_data = \
self.box_match_layer(
prior_boxes=prior_boxes,
gt_boxes=gt_boxes,
self.data = \
self.box_match(
prior_boxes,
gt_boxes,
)
self.ssd_data.update(
self.hard_mining_layer(
conf_prob=cls_prob,
match_labels=self.ssd_data['match_labels'],
max_overlaps=self.ssd_data['max_overlaps'],
self.data.update(
self.hard_mining(
cls_prob,
self.data['match_labels'],
self.data['max_overlaps'],
)
)
self.ssd_data.update(
self.box_target_layer(
match_inds=self.ssd_data['match_inds'],
match_labels=self.ssd_data['match_labels'],
prior_boxes=prior_boxes,
gt_boxes=gt_boxes,
self.data.update(
self.box_target(
self.data['match_inds'],
self.data['match_labels'],
prior_boxes,
gt_boxes,
)
)
return collections.OrderedDict([
......@@ -170,17 +165,17 @@ class SSD(torch.nn.Module):
# As we normalize both the pos and neg samples
('cls_loss', self.cls_loss(
cls_score.view(-1, cfg.MODEL.NUM_CLASSES),
self.ssd_data['labels']) * 4.),
self.data['labels']) * 4.),
('bbox_loss', self.bbox_loss(
bbox_pred,
self.ssd_data['bbox_targets'],
self.ssd_data['bbox_inside_weights'],
self.ssd_data['bbox_outside_weights'],
self.data['bbox_targets'],
self.data['bbox_inside_weights'],
self.data['bbox_outside_weights'],
)),
])
def forward(self, *args, **kwargs):
prior_boxes = self.prior_box_layer(kwargs['features'])
prior_boxes = self.prior_box(kwargs['features'])
cls_score, bbox_pred = self.compute_outputs(kwargs['features'])
cls_score, bbox_pred = cls_score.float(), bbox_pred.float()
......
......@@ -13,24 +13,22 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon.vm.torch as torch
from lib.core.config import cfg
from lib.modeling import conv1x1
from lib.modeling import conv3x3
from lib.modules import init
from lib.modules import nn
class VGG(torch.nn.Module):
class VGG(nn.Module):
def __init__(self, arch, extra_arch=None, reduced=False):
super(VGG, self).__init__()
self.reduced = reduced
self.units, filter_list = arch
self.feature_dims = filter_list[:]
self.maxpool = torch.nn.MaxPool2d(
self.maxpool = nn.MaxPool2d(
kernel_size=2, stride=2, ceil_mode=True)
self.s1pool = torch.nn.MaxPool2d(
self.s1pool = nn.MaxPool2d(
kernel_size=3, stride=1, padding=1, ceil_mode=True)
self.relu = torch.nn.ReLU(inplace=True)
self.relu = nn.ReLU(inplace=True)
for i in range(len(self.units)):
conv_name = 'conv{}'.format(i + 1)
dim_in = 3 if i == 0 else filter_list[i - 1]
......@@ -38,21 +36,21 @@ class VGG(torch.nn.Module):
self.__setattr__(
'{}_{}'
.format(conv_name, j + 1),
conv3x3(dim_in, filter_list[i], bias=True),
nn.Conv3x3(dim_in, filter_list[i], bias=True),
)
if j == 0:
dim_in = filter_list[i]
if reduced:
# L2Norm is redundant from the observation of
# empirical experiments. We just keep a trainable scale
self.conv4_3_norm = torch.nn.Affine(filter_list[3], bias=False)
# L2Norm is redundant according to empirical observations;
# we just keep a trainable scale
self.conv4_3_norm = nn.Affine(filter_list[3], bias=False)
self.conv4_3_norm.weight.zero_() # Zero-Init
self.fc6 = torch.nn.Conv2d(
self.fc6 = nn.Conv2d(
filter_list[-1], 1024,
kernel_size=3, padding=6,
stride=1, dilation=6,
)
self.fc7 = conv1x1(1024, 1024, bias=True)
self.fc7 = nn.Conv1x1(1024, 1024, bias=True)
self.feature_dims = [filter_list[-2], 1024]
if extra_arch is not None:
strides, filter_list, kps = extra_arch
......@@ -63,36 +61,44 @@ class VGG(torch.nn.Module):
dim_in = 1024 if i == 0 else filter_list[i - 1] * 2
self.__setattr__(
'{}_1'.format(conv_name),
conv1x1(dim_in, filter_list[i], bias=True),
nn.Conv1x1(
dim_in,
filter_list[i],
bias=True,
),
)
if strides[i] == 2:
self.__setattr__(
'{}_2'.format(conv_name),
conv3x3(filter_list[i], filter_list[i] * 2, 2, bias=True),
nn.Conv3x3(
filter_list[i],
filter_list[i] * 2,
stride=2,
bias=True,
),
)
else:
self.__setattr__(
'{}_2'.format(conv_name),
torch.nn.Conv2d(
filter_list[i], filter_list[i] * 2,
kernel_size=kps[0], padding=kps[1], stride=kps[2]
nn.Conv2d(
filter_list[i],
filter_list[i] * 2,
kernel_size=kps[0],
padding=kps[1],
stride=kps[2]
),
)
self.reset_parameters()
def reset_parameters(self):
for m in self.modules():
if isinstance(m, torch.nn.Conv2d):
torch.nn.init.kaiming_uniform_(
m.weight,
# Fix the gain for [-127, 127]
a=1,
) # Xavier Initialization
torch.nn.init.constant_(m.bias, 0)
if isinstance(m, nn.Conv2d):
init.xavier(m.weight)
init.constant(m.bias, 0)
# Stop the gradients if necessary
def freeze_func(m):
if isinstance(m, torch.nn.Conv2d):
if isinstance(m, nn.Conv2d):
m.weight.requires_grad = False
m._buffers['weight'] = m.weight
del m._parameters['weight']
......
......@@ -13,8 +13,65 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.torch import nn
from dragon.vm.torch.autograd import function
from lib.ops import functions
from lib.core.config import cfg
class _RetinaNetDecoder(function.Function):
def __init__(self, key, dev, **kwargs):
super(_RetinaNetDecoder, self).__init__(key, dev, **kwargs)
self.args = kwargs
def register_operator(self):
return {
'op_type': 'Proposal',
'arguments': {
'det_type': 'RETINANET',
'strides': self.args['strides'],
'ratios': self.args['ratios'],
'scales': self.args['scales'],
'pre_nms_top_n': self.args['pre_nms_top_n'],
'score_thresh': self.args['score_thresh'],
}
}
def forward(self, features, cls_prob, bbox_pred, ims_info):
inputs = features + [cls_prob, bbox_pred, ims_info]
self._unify_devices(inputs[:-1]) # Skip <ims_info>
return self.run(inputs, [self.alloc()], unify_devices=False)
class _RPNDecoder(function.Function):
def __init__(self, key, dev, **kwargs):
super(_RPNDecoder, self).__init__(key, dev, **kwargs)
self.args = kwargs
def register_operator(self):
return {
'op_type': 'Proposal',
'arguments': {
'det_type': 'RCNN',
'strides': self.args['strides'],
'ratios': self.args['ratios'],
'scales': self.args['scales'],
'pre_nms_top_n': self.args['pre_nms_top_n'],
'post_nms_top_n': self.args['post_nms_top_n'],
'nms_thresh': self.args['nms_thresh'],
'min_size': self.args['min_size'],
'min_level': self.args['min_level'],
'max_level': self.args['max_level'],
'canonical_scale': self.args['canonical_scale'],
'canonical_level': self.args['canonical_level'],
}
}
def forward(self, features, cls_prob, bbox_pred, ims_info):
inputs = features + [cls_prob, bbox_pred, ims_info]
self._unify_devices(inputs[:-1]) # Skip <ims_info>
outputs = [self.alloc() for _ in range(self.args['K'])]
return self.run(inputs, outputs, unify_devices=False)
def decode_retinanet(
......@@ -29,7 +86,7 @@ def decode_retinanet(
score_thresh,
):
return function.get(
functions.RetinaNetDecoder,
_RetinaNetDecoder,
cls_prob.device,
strides=strides,
ratios=ratios,
......@@ -58,7 +115,7 @@ def decode_rpn(
canonical_level,
):
return function.get(
functions.RPNDecoder,
_RPNDecoder,
cls_prob.device,
K=num_outputs,
strides=strides,
......@@ -74,3 +131,59 @@ def decode_rpn(
canonical_level=canonical_level,
).apply(features, cls_prob, bbox_pred, ims_info)
class RetinaNetDecoder(nn.Module):
"""Generate pred regions from retinanet."""
def __init__(self):
super(RetinaNetDecoder, self).__init__()
k_max, k_min = cfg.FPN.RPN_MAX_LEVEL, cfg.FPN.RPN_MIN_LEVEL
scales_per_octave = cfg.RETINANET.SCALES_PER_OCTAVE
self.strides = [int(2. ** lvl) for lvl in range(k_min, k_max + 1)]
self.scales = [cfg.RETINANET.ANCHOR_SCALE *
(2 ** (octave / float(scales_per_octave)))
for octave in range(scales_per_octave)]
def forward(self, features, cls_prob, bbox_pred, ims_info):
return decode_retinanet(
features=features,
cls_prob=cls_prob,
bbox_pred=bbox_pred,
ims_info=ims_info,
strides=self.strides,
ratios=[float(e) for e in cfg.RETINANET.ASPECT_RATIOS],
scales=self.scales,
pre_nms_top_n=cfg.RETINANET.PRE_NMS_TOP_N,
score_thresh=cfg.TEST.SCORE_THRESH,
)
class RPNDecoder(nn.Module):
"""Generate proposal regions from RPN."""
def __init__(self):
super(RPNDecoder, self).__init__()
self.K = (cfg.FPN.ROI_MAX_LEVEL -
cfg.FPN.ROI_MIN_LEVEL + 1) \
if len(cfg.RPN.STRIDES) > 1 else 1
def forward(self, features, cls_prob, bbox_pred, ims_info):
outputs = decode_rpn(
features=features,
cls_prob=cls_prob,
bbox_pred=bbox_pred,
ims_info=ims_info,
num_outputs=self.K,
strides=cfg.RPN.STRIDES,
ratios=[float(e) for e in cfg.RPN.ASPECT_RATIOS],
scales=[float(e) for e in cfg.RPN.SCALES],
pre_nms_top_n=cfg.TEST.RPN_PRE_NMS_TOP_N,
post_nms_top_n=cfg.TEST.RPN_POST_NMS_TOP_N,
nms_thresh=cfg.TEST.RPN_NMS_THRESH,
min_size=cfg.TEST.RPN_MIN_SIZE,
min_level=cfg.FPN.ROI_MIN_LEVEL,
max_level=cfg.FPN.ROI_MAX_LEVEL,
canonical_scale=cfg.FPN.ROI_CANONICAL_SCALE,
canonical_level=cfg.FPN.ROI_CANONICAL_LEVEL,
)
return [outputs] if self.K == 1 else outputs
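# Usage sketch (illustrative): decoding RPN proposals at test time, assuming
# `features` is the FPN feature list and `cls_prob`, `bbox_pred`, `ims_info`
# come from the RPN head elsewhere in the pipeline.
#
#   decoder = RPNDecoder()
#   rois = decoder(features, cls_prob, bbox_pred, ims_info)
#   # `rois` is a list with one tensor per RoI level,
#   # or a single-element list when RPN uses only one stride.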
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.torch import nn
def xavier(weight, mode='fan_in'):
"""The initializer of the Xavier uniform distribution."""
nn.init.kaiming_uniform_(
weight,
a=1, # Fix the gain for [-127, 127]
mode=mode,
)
def kaiming_normal(weight, mode='fan_in'):
"""The initializer of kaiming normal distribution."""
nn.init.kaiming_normal_(
weight,
mode=mode,
nonlinearity='relu',
)
# Aliases
constant = nn.init.constant_
normal = nn.init.normal_
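# Usage sketch (illustrative): these helpers wrap dragon's torch-style
# initializers so the modules above can write, e.g.,
#
#   init.kaiming_normal(conv.weight)   # ReLU-friendly init for conv weights
#   init.normal(fc.weight, std=0.01)   # Gaussian init for prediction heads
#   init.constant(conv.bias, 0)        # zero the biases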
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Define some basic structures."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.torch import nn
from lib.core.config import cfg
class Affine(object):
"""Affine transformation with weight and bias fixed."""
def __new__(cls, dim_in, bias=True, inplace=True):
return nn.Affine(
dim_in,
fix_weight=True,
fix_bias=True,
inplace=inplace,
)
class Conv1x1(object):
"""1x1 convolution."""
def __new__(cls, dim_in, dim_out, stride=1, bias=False):
return nn.Conv2d(
dim_in,
dim_out,
kernel_size=1,
stride=stride,
bias=bias,
)
class Conv3x3(object):
"""3x3 convolution."""
def __new__(cls, dim_in, dim_out, stride=1, dilation=1, bias=False):
return nn.Conv2d(
dim_in,
dim_out,
kernel_size=3,
stride=stride,
padding=1 * dilation,
bias=bias,
)
class CrossEntropyLoss(object):
"""Cross entropy loss."""
def __new__(cls):
return nn.CrossEntropyLoss(ignore_index=-1)
class Identity(nn.Module):
"""Pass input to the output."""
def __init__(self, *args, **kwargs):
super(Identity, self).__init__()
_, _ = args, kwargs
def forward(self, x):
return x
class SigmoidFocalLoss(object):
"""Sigmoid focal loss."""
def __new__(cls):
return nn.SigmoidFocalLoss(
alpha=cfg.MODEL.FOCAL_LOSS_ALPHA,
gamma=cfg.MODEL.FOCAL_LOSS_GAMMA,
)
class SmoothL1Loss(object):
"""Smoothed l1 loss."""
def __new__(cls, beta=1.):
return nn.SmoothL1Loss(
beta=beta,
reduction='batch_size',
)
def is_conv2d(module):
"""Return a bool indicating the module is a Conv2d."""
return isinstance(module, nn.Conv2d) or \
isinstance(module, nn.DepthwiseConv2d)
AvgPool2d = nn.AvgPool2d
BatchNorm2d = nn.BatchNorm2d
BCEWithLogitsLoss = nn.BCEWithLogitsLoss
Conv2d = nn.Conv2d
ConvTranspose2d = nn.ConvTranspose2d
DepthwiseConv2d = nn.DepthwiseConv2d
Linear = nn.Linear
MaxPool2d = nn.MaxPool2d
Module = nn.Module
ModuleList = nn.ModuleList
Sequential = nn.Sequential
ReLU = nn.ReLU
Sigmoid = nn.Sigmoid
Softmax = nn.Softmax
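# Illustrative composition (not part of the original file): the wrappers
# above are typically stacked into a conv block with a frozen scale/shift
# in place of BatchNorm, e.g.,
#
#   block = Sequential(
#       Conv3x3(64, 64),   # 3x3 conv, no bias
#       Affine(64),        # frozen affine instead of BN
#       ReLU(True),
#   )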
......@@ -16,7 +16,22 @@ from __future__ import print_function
import dragon.vm.torch as torch
from lib.core.config import cfg
from lib.ops import functional as F
def roi_align(input, boxes, spatial_scale, size):
return torch.vision.ops.roi_align(
input, boxes,
output_size=(size, size),
spatial_scale=spatial_scale,
)
def roi_pool(input, boxes, spatial_scale, size):
return torch.vision.ops.roi_pool(
input, boxes,
output_size=(size, size),
spatial_scale=spatial_scale,
)
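# Example (illustrative, hypothetical numbers): an FPN level with stride 16
# has spatial_scale = 1 / 16, so pooling 7x7 RoI features from it reads:
#
#   pooled = roi_align(feature_p4, rois_p4, spatial_scale=1. / 16, size=7)
#   # pooled: (num_rois, channels, 7, 7)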
class Bootstrap(torch.nn.Module):
......@@ -51,60 +66,3 @@ class Bootstrap(torch.nn.Module):
return torch.vision.ops.image_data(
input, self.dtype, self.mean_values,
)
class RetinaNetDecoder(torch.nn.Module):
"""Generate proposal regions from retinanet."""
def __init__(self):
super(RetinaNetDecoder, self).__init__()
k_max, k_min = cfg.FPN.RPN_MAX_LEVEL, cfg.FPN.RPN_MIN_LEVEL
scales_per_octave = cfg.RETINANET.SCALES_PER_OCTAVE
self.strides = [int(2. ** lvl) for lvl in range(k_min, k_max + 1)]
self.scales = [cfg.RETINANET.ANCHOR_SCALE *
(2 ** (octave / float(scales_per_octave)))
for octave in range(scales_per_octave)]
def forward(self, features, cls_prob, bbox_pred, ims_info):
return F.decode_retinanet(
features=features,
cls_prob=cls_prob,
bbox_pred=bbox_pred,
ims_info=ims_info,
strides=self.strides,
ratios=[float(e) for e in cfg.RETINANET.ASPECT_RATIOS],
scales=self.scales,
pre_nms_top_n=cfg.RETINANET.PRE_NMS_TOP_N,
score_thresh=cfg.TEST.SCORE_THRESH,
)
class RPNDecoder(torch.nn.Module):
"""Generate proposal regions from RPN."""
def __init__(self):
super(RPNDecoder, self).__init__()
self.K = (cfg.FPN.ROI_MAX_LEVEL -
cfg.FPN.ROI_MIN_LEVEL + 1) \
if len(cfg.RPN.STRIDES) > 1 else 1
def forward(self, features, cls_prob, bbox_pred, ims_info):
outputs = F.decode_rpn(
features=features,
cls_prob=cls_prob,
bbox_pred=bbox_pred,
ims_info=ims_info,
num_outputs=self.K,
strides=cfg.RPN.STRIDES,
ratios=[float(e) for e in cfg.RPN.ASPECT_RATIOS],
scales=[float(e) for e in cfg.RPN.SCALES],
pre_nms_top_n=cfg.TEST.RPN_PRE_NMS_TOP_N,
post_nms_top_n=cfg.TEST.RPN_POST_NMS_TOP_N,
nms_thresh=cfg.TEST.RPN_NMS_THRESH,
min_size=cfg.TEST.RPN_MIN_SIZE,
min_level=cfg.FPN.ROI_MIN_LEVEL,
max_level=cfg.FPN.ROI_MAX_LEVEL,
canonical_scale=cfg.FPN.ROI_CANONICAL_SCALE,
canonical_level=cfg.FPN.ROI_CANONICAL_LEVEL,
)
return [outputs] if self.K == 1 else outputs
......@@ -18,8 +18,6 @@ from __future__ import division
from __future__ import print_function
from lib.core.config import cfg
from lib.utils import logger
from lib.utils import rotated_boxes
try:
from lib.nms.cpu_nms import cpu_nms, cpu_soft_nms
......@@ -36,8 +34,6 @@ def nms(detections, thresh, force_cpu=False):
"""Perform either CPU or GPU Hard-NMS."""
if detections.shape[0] == 0:
return []
if detections.shape[1] == 6:
return rotated_boxes.cpu_nms(detections, thresh)
if cfg.USE_GPU_NMS and not force_cpu:
return gpu_nms(detections, thresh, device_id=cfg.GPU_ID)
else:
......@@ -56,7 +52,7 @@ def soft_nms(
return []
methods = {'hard': 0, 'linear': 1, 'gaussian': 2}
if method not in methods:
logger.fatal('Unknown soft nms method: {}'.format(method))
raise ValueError('Unknown soft nms method: {}'.format(method))
return cpu_soft_nms(
detections,
thresh,
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.torch.autograd import function
class RetinaNetDecoder(function.Function):
def __init__(self, key, dev, **kwargs):
super(RetinaNetDecoder, self).__init__(key, dev, **kwargs)
self.args = kwargs
def register_operator(self):
return {
'op_type': 'Proposal',
'arguments': {
'det_type': 'RETINANET',
'strides': self.args['strides'],
'ratios': self.args['ratios'],
'scales': self.args['scales'],
'pre_nms_top_n': self.args['pre_nms_top_n'],
'score_thresh': self.args['score_thresh'],
}
}
def forward(self, features, cls_prob, bbox_pred, ims_info):
inputs = features + [cls_prob, bbox_pred, ims_info]
self._unify_devices(inputs[:-1]) # Skip <ims_info>
return self.run(inputs, [self.alloc()], unify_devices=False)
class RPNDecoder(function.Function):
def __init__(self, key, dev, **kwargs):
super(RPNDecoder, self).__init__(key, dev, **kwargs)
self.args = kwargs
def register_operator(self):
return {
'op_type': 'Proposal',
'arguments': {
'det_type': 'RCNN',
'strides': self.args['strides'],
'ratios': self.args['ratios'],
'scales': self.args['scales'],
'pre_nms_top_n': self.args['pre_nms_top_n'],
'post_nms_top_n': self.args['post_nms_top_n'],
'nms_thresh': self.args['nms_thresh'],
'min_size': self.args['min_size'],
'min_level': self.args['min_level'],
'max_level': self.args['max_level'],
'canonical_scale': self.args['canonical_scale'],
'canonical_level': self.args['canonical_level'],
}
}
def forward(self, features, cls_prob, bbox_pred, ims_info):
inputs = features + [cls_prob, bbox_pred, ims_info]
self._unify_devices(inputs[:-1]) # Skip <ims_info>
outputs = [self.alloc() for _ in range(self.args['K'])]
return self.run(inputs, outputs, unify_devices=False)
......@@ -14,62 +14,146 @@ from __future__ import division
from __future__ import print_function
import numpy as np
from lib.pycocotools.mask import encode as encode_masks, \
decode as decode_masks, frPyObjects
def decode_rle(R):
N = len(R['counts'])
M = np.zeros( (R['size'][0]*R['size'][1], ), dtype=np.uint8)
n = 0
val = 1
for pos in range(N):
val = not val
for c in range(R['counts'][pos]):
R['counts'][pos]
M[n] = val
n += 1
return M.reshape((R['size']), order='F')
def mask_poly2im(polys, im_height, im_width):
return frPyObjects(polys, im_height, im_width)
def mask_coco2im(coco_masks, im_height, im_width):
im_masks = []
for i, ann in enumerate(coco_masks):
if isinstance(ann, list):
m = mask_poly2im(ann, im_height, im_width)
elif isinstance(ann, np.ndarray):
m = ann.astype(np.uint8)
else:
raise TypeError('Unknown type of mask: {}'.format(type(ann)))
im_masks.append(m)
return im_masks
def mask_rle2im(rle_masks, im_height, im_width):
coco_masks = [{'counts': rle, 'size': [im_height, im_width]} for rle in rle_masks]
coco_masks = decode_masks(coco_masks)
coco_masks = coco_masks.transpose((2, 0, 1))
return mask_coco2im(coco_masks, im_height, im_width)
def mask_bin2rle(bin_masks):
rle_masks = []
for bin_mask in bin_masks:
if bin_mask is None:
rle_mask = ''
else:
rle_mask = encode_masks(np.array(np.stack([bin_mask], axis=2), order='F'))[0]['counts']
rle_masks.append(rle_mask)
return rle_masks
def mask_poly2rle(segmentations, im_height, im_width):
masks = []
for polys in segmentations:
mask = mask_poly2im(polys, im_height, im_width)
masks.append(mask[0]['counts'])
return masks
\ No newline at end of file
from lib.pycocotools import mask as mask_tools
from lib.pycocotools.mask import frPyObjects
def poly2rle(poly, height, width):
"""Convert polygon(s) into encoded rle.
The polygon(s) may be stored in the following formats:
1. Polygon with uncompressed RLE:
{'size': (h, w), 'counts': [1, 2, ...]}
2. Polygons with number of coordinates > 4:
[[x1, y1, x2, y2, x3, y3, ...], [x1, y1, x2, y2, x3, y3, ...]]
3. Polygons with uncompressed RLE:
[{'size': (h, w), 'counts': [1, 2, ...]}]
COCO uses **2** and **1** to annotate instances and crowd objects.
The output rle(s) will be:
{'size': (h, w), 'counts': 'abc...'} or [{'size': (h, w), 'counts': 'abc...'}]
Parameters
----------
poly : Union[List, Dict]
The input polygons.
height : int
The height of image.
width : int
The width of image.
Returns
-------
Union[List, Dict]
The RLE object or a list of RLE objects.
Notes
-----
COCODataset uses **2** and **1** to annotate instances and crowd objects.
"""
return frPyObjects(poly, height, width)
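# Usage sketch (illustrative, made-up coordinates): converting one polygon
# of a 100x100 image into its RLE object.
#
#   poly = [[10., 10., 60., 10., 60., 40., 10., 40.]]  # a single rectangle
#   rle = poly2rle(poly, height=100, width=100)
#   # rle -> [{'size': [100, 100], 'counts': ...}]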
def poly2bytes(poly, height, width):
"""Convert polygon(s) into encoded mask bytes.
The polygon(s) may be stored in the following formats:
1. Polygon with uncompressed RLE:
{'size': (h, w), 'counts': [1, 2, ...]}
2. Polygons with number of coordinates > 4:
[[x1, y1, x2, y2, x3, y3, ...], [x1, y1, x2, y2, x3, y3, ...]]
3. Polygons with uncompressed RLE:
[{'size': (h, w), 'counts': [1, 2, ...]}]
If the number of polygons >= 2, we will merge them into a single mask.
Parameters
----------
poly : Union[List, Dict]
The input polygons.
height : int
The height of image.
width : int
The width of image.
Returns
-------
bytes
The mask bytes.
Notes
-----
COCODataset uses **2** and **1** to annotate instances and crowd objects.
"""
rle_objects = poly2rle(poly, height, width)
if isinstance(rle_objects, list):
if len(rle_objects) == 1:
return rle_objects[0]['counts']
rle_objects = mask_tools.merge(rle_objects)
return rle_objects['counts']
def bytes2img(data, height, width):
"""Decode the RLE mask bytes to a 2d image.
Parameters
----------
data : bytes
The encoded bytes.
height : int
The height of image.
width : int
The width of image.
Returns
-------
numpy.ndarray
The mask image.
"""
rle_objects = [{'counts': data, 'size': [height, width]}]
mask_image = mask_tools.decode(rle_objects)
if mask_image.shape[2] != 1:
raise ValueError(
'{} instances are found in data.\n'
'Merge them before compressing.'
.format(mask_image.shape[2])
)
return mask_image[:, :, 0]
def img2bytes(data):
"""Compress a 2d mask image to RLE bytes.
Parameters
----------
data : numpy.ndarray
The image to compress.
Returns
-------
bytes
The encoded bytes.
"""
if len(data.shape) == 3:
raise ValueError(
'{} instances are found in data.\n'
'Merge them before compressing.'
.format(data.shape[2])
)
elif len(data.shape) != 2:
raise ValueError('Expected a 2d mask.')
rle_objects = mask_tools.encode(
np.array(np.stack([data], 2), order='F'))
return rle_objects[0]['counts']
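# Round-trip sketch (illustrative): encoding a binary mask to RLE bytes and
# decoding it back should reproduce the mask.
#
#   mask = np.zeros((100, 100), 'uint8')
#   mask[20:40, 30:70] = 1
#   data = img2bytes(mask)                 # compress to RLE counts
#   restored = bytes2img(data, 100, 100)   # decompress to a 2d image
#   assert (restored == mask).all()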
......@@ -13,5 +13,5 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from lib.faster_rcnn.data_layer import DataLayer
from lib.retinanet.anchor_target_layer import AnchorTargetLayer
from lib.faster_rcnn.data_loader import DataLoader
from lib.retinanet.anchor_target import AnchorTarget
......@@ -13,23 +13,21 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon.vm.torch as torch
import numpy as np
from lib.core.config import cfg
from lib.faster_rcnn.generate_anchors import generate_anchors_v2
from lib.faster_rcnn import generate_grid_anchors
from lib.utils import boxes as box_util
from lib.utils import logger
from lib.utils.blob import array2tensor
from lib.utils.boxes import bbox_overlaps
from lib.utils.boxes import bbox_transform
from lib.utils.boxes import dismantle_gt_boxes
from lib.utils.framework import new_tensor
class AnchorTargetLayer(torch.nn.Module):
"""Assign anchors to ground-truth targets."""
class AnchorTarget(object):
"""Assign ground-truth targets to anchors."""
def __init__(self):
super(AnchorTargetLayer, self).__init__()
super(AnchorTarget, self).__init__()
# Load the basic configs
k_max, k_min = cfg.FPN.RPN_MAX_LEVEL, cfg.FPN.RPN_MIN_LEVEL
scales_per_octave = cfg.RETINANET.SCALES_PER_OCTAVE
......@@ -49,10 +47,9 @@ class AnchorTargetLayer(torch.nn.Module):
sizes=sizes,
))
def forward(self, features, gt_boxes, ims_info):
"""Produces anchor classification labels and bounding-box regression targets."""
def __call__(self, features, gt_boxes, ims_info):
num_images = cfg.TRAIN.IMS_PER_BATCH
gt_boxes_wide = dismantle_gt_boxes(gt_boxes, num_images)
gt_boxes_wide = box_util.dismantle_boxes(gt_boxes, num_images)
if len(gt_boxes_wide) != num_images:
logger.fatal(
......@@ -60,39 +57,23 @@ class AnchorTargetLayer(torch.nn.Module):
.format(num_images, len(gt_boxes_wide))
)
# Generate proposals from shifted anchors
all_anchors, total_anchors = [], 0
for i in range(len(self.strides)):
height, width = features[i].shape[-2:]
shift_x = np.arange(0, width) * self.strides[i]
shift_y = np.arange(0, height) * self.strides[i]
shift_x, shift_y = np.meshgrid(shift_x, shift_y)
shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
shift_x.ravel(), shift_y.ravel())).transpose()
# Add A anchors (1, A, 4) to
# cell K shifts (K, 1, 4) to get
# shift anchors (K, A, 4)
# Reshape to (K * A, 4) shifted anchors
A = self.base_anchors[i].shape[0]
K = shifts.shape[0]
anchors = (self.base_anchors[i].reshape((1, A, 4)) +
shifts.reshape((1, K, 4)).transpose((1, 0, 2)))
# [K, A, 4] -> [A, K, 4]
anchors = anchors.transpose((1, 0, 2))
anchors = anchors.reshape((A * K, 4))
all_anchors.append(anchors)
total_anchors += anchors.shape[0]
all_anchors = np.concatenate(all_anchors, axis=0)
# label: 1 is positive, 0 is negative, -1 is don't care
labels_wide = -np.ones((num_images, total_anchors,), dtype=np.float32)
bbox_targets_wide = np.zeros((num_images, total_anchors, 4), dtype=np.float32)
bbox_inside_weights_wide = np.zeros_like(bbox_targets_wide, dtype=np.float32)
bbox_outside_weights_wide = np.zeros_like(bbox_targets_wide, dtype=np.float32)
anchors = all_anchors
inds_inside = np.arange(all_anchors.shape[0])
# Generate grid anchors from base
all_anchors = \
generate_grid_anchors(
features,
self.base_anchors,
self.strides,
)
num_anchors = all_anchors.shape[0]
# Label: ``1`` is positive, ``0`` is negative, ``-1`` is don't care
labels_wide = -np.ones((num_images, num_anchors,), 'float32')
bbox_targets_wide = np.zeros((num_images, num_anchors, 4), 'float32')
bbox_inside_weights_wide = np.zeros_like(bbox_targets_wide, 'float32')
bbox_outside_weights_wide = np.zeros_like(bbox_targets_wide, 'float32')
# Different from R-CNN, all anchors will be used
inds_inside, anchors = np.arange(num_anchors), all_anchors
num_inside = len(inds_inside)
for ix in range(num_images):
......@@ -104,12 +85,12 @@ class AnchorTargetLayer(torch.nn.Module):
labels.fill(-1)
# Overlaps between the anchors and the gt boxes
overlaps = bbox_overlaps(anchors, gt_boxes)
argmax_overlaps = overlaps.argmax(axis=1)
overlaps = box_util.bbox_overlaps(anchors, gt_boxes)
argmax_overlaps = overlaps.argmax(1)
max_overlaps = overlaps[np.arange(num_inside), argmax_overlaps]
# fg label: for each gt, anchor with highest overlap
gt_argmax_overlaps = overlaps.argmax(axis=0)
gt_argmax_overlaps = overlaps.argmax(0)
gt_max_overlaps = overlaps[gt_argmax_overlaps, np.arange(overlaps.shape[1])]
gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]
gt_inds = argmax_overlaps[gt_argmax_overlaps]
......@@ -125,8 +106,11 @@ class AnchorTargetLayer(torch.nn.Module):
labels[max_overlaps < cfg.RETINANET.NEGATIVE_OVERLAP] = 0
bbox_targets = np.zeros((num_inside, 4), dtype=np.float32)
bbox_targets[fg_inds, :] = bbox_transform(
anchors[fg_inds, :], gt_boxes[argmax_overlaps[fg_inds], :4])
bbox_targets[fg_inds, :] = \
box_util.bbox_transform(
anchors[fg_inds, :],
gt_boxes[argmax_overlaps[fg_inds], :4],
)
bbox_inside_weights = np.zeros((num_inside, 4), dtype=np.float32)
bbox_inside_weights[fg_inds, :] = np.array((1., 1., 1., 1.))
......@@ -139,14 +123,14 @@ class AnchorTargetLayer(torch.nn.Module):
bbox_inside_weights_wide[ix, inds_inside] = bbox_inside_weights
bbox_outside_weights_wide[ix, inds_inside] = bbox_outside_weights
labels = labels_wide.reshape((num_images, total_anchors))
labels = labels_wide.reshape((num_images, num_anchors))
bbox_targets = bbox_targets_wide.transpose((0, 2, 1))
bbox_inside_weights = bbox_inside_weights_wide.transpose((0, 2, 1))
bbox_outside_weights = bbox_outside_weights_wide.transpose((0, 2, 1))
return {
'labels': array2tensor(labels),
'bbox_targets': array2tensor(bbox_targets),
'bbox_inside_weights': array2tensor(bbox_inside_weights),
'bbox_outside_weights': array2tensor(bbox_outside_weights),
'labels': new_tensor(labels),
'bbox_targets': new_tensor(bbox_targets),
'bbox_inside_weights': new_tensor(bbox_inside_weights),
'bbox_outside_weights': new_tensor(bbox_outside_weights),
}
......@@ -17,12 +17,12 @@ import dragon.vm.torch as torch
import numpy as np
from lib.core.config import cfg
from lib.modeling.detector import new_detector
from lib.nms import nms_wrapper
from lib.utils import framework
from lib.utils import time_util
from lib.utils.blob import im_list_to_blob
from lib.utils.image import scale_image
from lib.utils.vis import vis_one_image
def ims_detect(detector, raw_images):
......@@ -43,65 +43,67 @@ def ims_detect(detector, raw_images):
], dtype=np.float32)
# Do Forward
if not hasattr(detector, 'frozen_graph'):
inputs = {
'data': torch.from_numpy(blobs['data']),
'ims_info': torch.from_numpy(blobs['ims_info']),
}
with torch.no_grad():
with torch.jit.Recorder(retain_ops=True):
outputs = detector.forward(inputs)
detector.frozen_graph = \
framework.FrozenGraph(
{'data': inputs['data'],
'ims_info': inputs['ims_info']},
{'detections': outputs['detections']},
)
outputs = detector.frozen_graph(**blobs)
if not hasattr(detector, 'graph'):
with framework.new_workspace().as_default():
data = torch.from_numpy(blobs['data'])
ims_info = torch.from_numpy(blobs['ims_info'])
with torch.no_grad():
with torch.jit.Tracer(retain_ops=True):
inputs = {'data': data, 'ims_info': ims_info}
outputs = detector.forward(inputs)
detector.graph = \
framework.Graph({
'data': inputs['data'],
'ims_info': inputs['ims_info']
}, {'detections': outputs['detections']})
outputs = detector.graph(**blobs)
# Unpack results
results = outputs['detections']
detections_wide = [[] for _ in range(len(ims_shape))]
detections = [[] for _ in range(len(ims_shape))]
for i in range(len(ims)):
indices = np.where(results[:, 0].astype(np.int32) == i)[0]
detections = results[indices, 1:]
detections_wide[i // num_scales].append(detections)
inds = np.where(results[:, 0].astype(np.int32) == i)[0]
detections[i // num_scales].append(results[inds, 1:])
for i in range(len(ims_shape)):
detections_wide[i] = np.vstack(detections_wide[i]) \
if len(detections_wide[i]) > 1 else detections_wide[i][0]
return detections_wide
def test_net(detector, server):
# Load settings
classes = server.classes
num_images = server.num_images
num_classes = server.num_classes
all_boxes = [[[] for _ in range(num_images)] for _ in range(num_classes)]
_t = {'im_detect': time_util.Timer(), 'misc': time_util.Timer()}
for batch_idx in range(0, num_images, cfg.TEST.IMS_PER_BATCH):
# Collect raw images and ground-truths
image_ids, raw_images = [], []
for item_idx in range(cfg.TEST.IMS_PER_BATCH):
if batch_idx + item_idx >= num_images:
continue
image_id, raw_image = server.get_image()
image_ids.append(image_id)
detections[i] = \
np.vstack(detections[i]) \
if len(detections[i]) > 1 \
else detections[i][0]
return detections
def test_net(weights, num_classes, q_in, q_out, device):
num_classes, cfg.GPU_ID = num_classes, device
detector = new_detector(device, weights)
must_stop = False
_t = time_util.new_timers('im_detect', 'misc')
while True:
if must_stop:
break
indices, raw_images = [], []
for i in range(cfg.TEST.IMS_PER_BATCH):
idx, raw_image = q_in.get()
if raw_image is None:
must_stop = True
break
indices.append(idx)
raw_images.append(raw_image)
if len(raw_images) == 0:
continue
# Run detecting on specific scales
with _t['im_detect'].tic_and_toc():
results = ims_detect(detector, raw_images)
# Post-Processing
_t['misc'].tic()
for item_idx, detections in enumerate(results):
i = batch_idx + item_idx
for i, detections in enumerate(results):
_t['misc'].tic()
boxes_this_image = [[]]
# {x1, y1, x2, y2, score, cls}
detections = np.array(detections)
......@@ -126,44 +128,16 @@ def test_net(detector, server):
force_cpu=True,
)
cls_detections = cls_detections[keep, :]
all_boxes[j][i] = cls_detections
boxes_this_image.append(cls_detections)
if cfg.VIS or cfg.VIS_ON_FILE:
vis_one_image(
raw_images[item_idx],
classes,
boxes_this_image,
thresh=cfg.VIS_TH,
box_alpha=1.,
show_class=True,
filename=server.get_save_filename(image_ids[item_idx]),
)
# Limit to max_per_image detections *over all classes*
if cfg.TEST.DETECTIONS_PER_IM > 0:
image_scores = []
for j in range(1, num_classes):
if len(all_boxes[j][i]) < 1:
continue
image_scores.append(all_boxes[j][i][:, -1])
if len(image_scores) > 0:
image_scores = np.hstack(image_scores)
if len(image_scores) > cfg.TEST.DETECTIONS_PER_IM:
image_thresh = np.sort(image_scores)[-cfg.TEST.DETECTIONS_PER_IM]
for j in range(1, num_classes):
keep = np.where(all_boxes[j][i][:, -1] >= image_thresh)[0]
all_boxes[j][i] = all_boxes[j][i][keep, :]
_t['misc'].toc()
print('\rim_detect: {:d}/{:d} {:.3f}s {:.3f}s'
.format(batch_idx + cfg.TEST.IMS_PER_BATCH,
num_images,
_t['im_detect'].average_time,
_t['misc'].average_time),
end='')
print('\n>>>>>>>>>>>>>>>>>>> Evaluating <<<<<<<<<<<<<<<<<<<<')
print('Evaluating detections')
server.evaluate_detections(all_boxes)
_t['misc'].toc()
q_out.put((
indices[i],
{
'im_detect': _t['im_detect'].average_time,
'misc': _t['misc'].average_time,
},
{
'boxes': boxes_this_image,
},
))
......@@ -48,29 +48,33 @@ class _LRScheduler(object):
raise NotImplementedError
class StepLR(_LRScheduler):
class CosineLR(_LRScheduler):
def __init__(
self,
lr_max,
lr_min,
decay_step,
decay_gamma,
max_steps,
warmup_steps=0,
warmup_factor=0.,
):
super(StepLR, self).__init__(
super(CosineLR, self).__init__(
lr_max=lr_max,
lr_min=lr_min,
warmup_steps=warmup_steps,
warmup_factor=warmup_factor,
)
self._decay_step = decay_step
self._decay_gamma = decay_gamma
self._max_steps = max_steps - warmup_steps
def schedule_impl(self):
step_count = self._step_count - self._last_steps
if step_count % self._decay_step == 0:
decay_factor = step_count // self._decay_step
self._last_lr = self._lr_max * (
self._decay_gamma ** decay_factor)
decay_factor = 0.5 * (1. + math.cos(
math.pi * step_count / self._max_steps))
self._last_lr = self._lr_min + (
self._lr_max - self._lr_min
) * decay_factor
return self._last_lr
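# Worked example (added for clarity, hypothetical values): with lr_max = 0.02,
# lr_min = 0 and max_steps = 100, the cosine schedule above gives
#   step 0   -> 0.5 * (1 + cos(0))      * 0.02 = 0.02
#   step 50  -> 0.5 * (1 + cos(pi / 2)) * 0.02 = 0.01
#   step 100 -> 0.5 * (1 + cos(pi))     * 0.02 = 0.00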
......@@ -105,18 +109,19 @@ class MultiStepLR(_LRScheduler):
return self._last_lr
class LinearLR(_LRScheduler):
class LinearCosineLR(_LRScheduler):
def __init__(
self,
lr_max,
lr_min,
decay_step,
max_steps,
warmup_steps=0,
warmup_factor=0.,
):
super(LinearLR, self).__init__(
super(LinearCosineLR, self).__init__(
lr_max=lr_max,
lr_min=0.,
lr_min=lr_min,
warmup_steps=warmup_steps,
warmup_factor=warmup_factor,
)
......@@ -126,44 +131,63 @@ class LinearLR(_LRScheduler):
def schedule_impl(self):
step_count = self._step_count - self._last_steps
if step_count % self._decay_step == 0:
decay_factor = 1. - float(step_count) / self._max_steps
self._last_lr = self._lr_max * decay_factor
linear_decay = 1. - float(step_count) / self._max_steps
cosine_decay = 0.5 * (1. + math.cos(
math.pi * step_count / self._max_steps))
decay_factor = linear_decay * cosine_decay
self._last_lr = self._lr_min + (
self._lr_max - self._lr_min
) * decay_factor
return self._last_lr
class CosineLR(_LRScheduler):
class StepLR(_LRScheduler):
def __init__(
self,
lr_max,
lr_min,
decay_step,
max_steps,
decay_gamma,
warmup_steps=0,
warmup_factor=0.,
):
super(CosineLR, self).__init__(
super(StepLR, self).__init__(
lr_max=lr_max,
lr_min=lr_min,
warmup_steps=warmup_steps,
warmup_factor=warmup_factor,
)
self._decay_step = decay_step
self._max_steps = max_steps - warmup_steps
self._decay_gamma = decay_gamma
def schedule_impl(self):
step_count = self._step_count - self._last_steps
if step_count % self._decay_step == 0:
decay_factor = 0.5 * (1. + math.cos(
math.pi * step_count / self._max_steps))
self._last_lr = self._lr_min + (
self._lr_max - self._lr_min
) * decay_factor
decay_factor = step_count // self._decay_step
self._last_lr = self._lr_max * (
self._decay_gamma ** decay_factor)
return self._last_lr
def get_scheduler():
lr_policy = cfg.SOLVER.LR_POLICY
if lr_policy == 'step':
if lr_policy == 'cosine_decay':
return CosineLR(
lr_max=cfg.SOLVER.BASE_LR,
lr_min=0.,
decay_step=cfg.SOLVER.DECAY_STEP,
max_steps=cfg.SOLVER.MAX_STEPS,
warmup_steps=cfg.SOLVER.WARM_UP_STEPS,
warmup_factor=cfg.SOLVER.WARM_UP_FACTOR,
)
elif lr_policy == 'linear_cosine_decay':
return LinearCosineLR(
lr_max=cfg.SOLVER.BASE_LR,
lr_min=0.,
decay_step=cfg.SOLVER.DECAY_STEP,
max_steps=cfg.SOLVER.MAX_STEPS,
warmup_steps=cfg.SOLVER.WARM_UP_STEPS,
warmup_factor=cfg.SOLVER.WARM_UP_FACTOR,
)
elif lr_policy == 'step':
return StepLR(
lr_max=cfg.SOLVER.BASE_LR,
decay_step=cfg.SOLVER.DECAY_STEP,
......@@ -179,15 +203,7 @@ def get_scheduler():
warmup_steps=cfg.SOLVER.WARM_UP_STEPS,
warmup_factor=cfg.SOLVER.WARM_UP_FACTOR,
)
elif lr_policy == 'cosine_decay':
return CosineLR(
lr_max=cfg.SOLVER.BASE_LR,
lr_min=0.,
decay_step=cfg.SOLVER.DECAY_STEP,
max_steps=cfg.SOLVER.MAX_STEPS,
warmup_steps=cfg.SOLVER.WARM_UP_STEPS,
warmup_factor=cfg.SOLVER.WARM_UP_FACTOR,
)
else:
raise ValueError('Unknown lr policy: ' + lr_policy)
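# Usage sketch (illustrative, hypothetical config values): with
#   cfg.SOLVER.LR_POLICY = 'cosine_decay'
#   cfg.SOLVER.BASE_LR = 0.02
#   cfg.SOLVER.MAX_STEPS = 90000
#   cfg.SOLVER.WARM_UP_STEPS = 500
# get_scheduler() returns a CosineLR that warms up over the first 500 steps
# and then decays the learning rate from 0.02 towards 0.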
......@@ -196,14 +212,16 @@ if __name__ == '__main__':
def extract_label(scheduler):
class_name = scheduler.__class__.__name__
label = class_name + '('
if class_name == 'StepLR':
label += 'α=' + str(scheduler._decay_step) + ', '
label += 'γ=' + str(scheduler._decay_gamma)
if class_name == 'CosineLR':
label += 'α=' + str(scheduler._decay_step)
elif class_name == 'LinearCosineLR':
label += 'α=' + str(scheduler._decay_step)
elif class_name == 'MultiStepLR':
label += 'α=' + str(scheduler._decay_steps) + ', '
label += 'γ=' + str(scheduler._decay_gamma)
elif class_name == 'CosineLR':
label += 'α=' + str(scheduler._decay_step)
elif class_name == 'StepLR':
label += 'α=' + str(scheduler._decay_step) + ', '
label += 'γ=' + str(scheduler._decay_gamma)
label += ')'
return label
......@@ -218,7 +236,7 @@ if __name__ == '__main__':
StepLR(decay_step=1, decay_gamma=0.97, **shared_args),
MultiStepLR(decay_steps=[60, 120, 180], decay_gamma=0.1, **shared_args),
CosineLR(lr_min=0., decay_step=1, max_steps=max_steps, **shared_args),
LinearLR(decay_step=1, max_steps=max_steps, **shared_args),
LinearCosineLR(lr_min=0., decay_step=1, max_steps=max_steps, **shared_args),
]
for i in range(max_steps):
......@@ -240,7 +258,7 @@ if __name__ == '__main__':
plt.title('Visualization of different LR Schedulers')
plt.xlabel('Step')
plt.ylabel('Learning Rate')
line = '--'
line = '-'
colors = ['r', 'g', 'b', 'c', 'm', 'y', 'k']
for i, scheduler in enumerate(schedulers):
plt.plot(
......@@ -251,4 +269,5 @@ if __name__ == '__main__':
label=extract_label(scheduler),
)
plt.legend()
plt.grid(linestyle='--')
plt.show()
......@@ -13,8 +13,8 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from lib.ssd.data_layer import DataLayer
from lib.ssd.hard_mining_layer import HardMiningLayer
from lib.ssd.multibox_layer import MultiBoxMatchLayer
from lib.ssd.multibox_layer import MultiBoxTargetLayer
from lib.ssd.priorbox_layer import PriorBoxLayer
from lib.ssd.data_loader import DataLoader
from lib.ssd.hard_mining import HardMining
from lib.ssd.multibox import MultiBoxMatch
from lib.ssd.multibox import MultiBoxTarget
from lib.ssd.priorbox import PriorBox
......@@ -26,11 +26,11 @@ from lib.ssd.data_transformer import DataTransformer
from lib.utils import logger
class DataLayer(torch.nn.Module):
"""Generate a mini-batch of data."""
class DataLoader(object):
"""Provide mini-batches of data."""
def __init__(self):
super(DataLayer, self).__init__()
super(DataLoader, self).__init__()
database = get_imdb(cfg.TRAIN.DATABASE)
self.data_batch = DataBatch(**{
'dataset': lambda: dragon.io.SeetaRecordDataset(database.source),
......@@ -38,12 +38,11 @@ class DataLayer(torch.nn.Module):
'shuffle': cfg.TRAIN.USE_SHUFFLE,
'num_chunks': cfg.TRAIN.NUM_SHUFFLE_CHUNKS,
'batch_size': cfg.TRAIN.IMS_PER_BATCH * 2,
'num_transformers': cfg.TRAIN.NUM_WORKERS,
})
def forward(self):
# Get an array blob from the Queue
def __call__(self):
outputs = self.data_batch.get()
# Zero-Copy the array to tensor
outputs['data'] = torch.from_numpy(outputs['data'])
return outputs
......@@ -58,14 +57,16 @@ class DataBatch(mp.Process):
----------
dataset : lambda
The creator of a dataset.
classes : Sequence[str]
The class names.
shuffle : bool, optional, default=False
Whether to shuffle the data.
num_chunks : int, optional, default=0
The number of chunks to split.
batch_size : int, optional, default=32
batch_size : int, optional, default=2
The size of a mini-batch.
prefetch : int, optional, default=5
The prefetch count.
num_transformers : int, optional, default=3
The number of workers to transform data.
"""
super(DataBatch, self).__init__()
......@@ -82,16 +83,9 @@ class DataBatch(mp.Process):
self._prefetch = kwargs.get('prefetch', 5)
self._batch_size = kwargs.get('batch_size', 32)
self._num_readers = kwargs.get('num_readers', 1)
self._num_transformers = kwargs.get('num_transformers', -1)
self._max_transformers = kwargs.get('max_transformers', 3)
self._num_transformers = kwargs.get('num_transformers', 3)
self._num_fetchers = kwargs.get('num_fetchers', 1)
# Io-Aware Policy
if self._num_transformers == -1:
self._num_transformers = 3
self._num_transformers = min(
self._num_transformers, self._max_transformers)
# Initialize queues
num_batches = self._prefetch * self._num_readers
self.Q1 = mp.Queue(num_batches * self._batch_size)
......@@ -162,14 +156,17 @@ class DataBatch(mp.Process):
# Main prefetch loop
while True:
boxes_to_pack = []
image_batch = np.zeros(image_batch_shape, 'uint8')
for image_index in range(cfg.TRAIN.IMS_PER_BATCH):
image_batch[image_index], gt_boxes = self.Q2.get()
img, gt_boxes = self.Q2.get()
ims_blob = np.zeros(image_batch_shape, img.dtype)
for i in range(cfg.TRAIN.IMS_PER_BATCH):
ims_blob[i] = img
boxes = np.zeros((gt_boxes.shape[0], gt_boxes.shape[1] + 1), 'float32')
boxes[:, :gt_boxes.shape[1]], boxes[:, -1] = gt_boxes, image_index
boxes[:, :gt_boxes.shape[1]], boxes[:, -1] = gt_boxes, i
boxes_to_pack.append(boxes)
if i != (cfg.TRAIN.IMS_PER_BATCH - 1):
img, gt_boxes = self.Q2.get()
self.Q3.put({
'data': image_batch,
'data': ims_blob,
'gt_boxes': np.concatenate(boxes_to_pack),
})
......@@ -19,9 +19,9 @@ import cv2
import numpy as np
from lib.core.config import cfg
from lib.datasets.example import Example
from lib.ssd import transforms
from lib.utils import rotated_boxes
from lib.utils.boxes import flip_boxes
from lib.utils import boxes as box_util
class DataTransformer(multiprocessing.Process):
......@@ -33,102 +33,82 @@ class DataTransformer(multiprocessing.Process):
self._classes = kwargs.get('classes', ('__background__',))
self._num_classes = len(self._classes)
self._class_to_ind = dict(zip(self._classes, range(self._num_classes)))
self._image_aug = transforms.Compose(
transforms.Distort(), # Color augmentation
transforms.Expand(), # Expand and padding
transforms.Sample(), # Sample a patch randomly
transforms.Resize(), # Resize to a fixed scale
)
self.augment_image = \
transforms.Compose(
transforms.Distort(), # Color augmentation
transforms.Expand(), # Expand and padding
transforms.Sample(), # Sample a patch randomly
transforms.Resize(), # Resize to a fixed scale
)
self.q_in = self.q_out = None
self.daemon = True
def make_roi_dict(self, example, flip=False):
n_objects, box_dim = 0, len(cfg.BBOX_REG_WEIGHTS)
def make_roi_dict(self, example, apply_flip=False):
objects, n_objects = example.objects, 0
height, width = example.height, example.width
if not self._use_diff:
for obj in example['object']:
for obj in objects:
if obj.get('difficult', 0) == 0:
n_objects += 1
else:
n_objects = len(example['object'])
n_objects = len(objects)
roi_dict = {
'width': example['width'],
'height': example['height'],
'boxes': np.zeros((n_objects, 4), 'float32'),
'gt_classes': np.zeros((n_objects,), 'int32'),
'boxes': np.zeros((n_objects, box_dim), 'float32'),
'normalized_boxes': np.zeros((n_objects, box_dim), 'float32'),
}
# Filter the difficult instances
object_idx = 0
for obj in example['object']:
for obj in objects:
if not self._use_diff and \
obj.get('difficult', 0) > 0:
continue
if box_dim == 4:
roi_dict['boxes'][object_idx, :] = [
max(0, obj['xmin']),
max(0, obj['ymin']),
min(obj['xmax'], example['width'] - 1),
min(obj['ymax'], example['height'] - 1),
]
elif box_dim == 5:
if 'bbox' in obj:
roi_dict['boxes'][object_idx, :] = [
max(0, obj['bbox'][0]),
max(0, obj['bbox'][1]),
min(obj['bbox'][2], example['width'] - 1),
min(obj['bbox'][3], example['height'] - 1),
rotated_boxes.clip_angle(obj['bbox'][4]),
]
else:
roi_dict['boxes'][object_idx, :] = \
rotated_boxes.vertices2box(
[obj['x1'], obj['y1'],
obj['x2'], obj['y2'],
obj['x3'], obj['y3'],
obj['x4'], obj['y4']]
)
else:
raise ValueError('Excepted box4d or box5d.')
bbox = obj['bbox']
roi_dict['boxes'][object_idx, :] = [
max(0, bbox[0]),
max(0, bbox[1]),
min(bbox[2], width - 1),
min(bbox[3], height - 1),
]
roi_dict['gt_classes'][object_idx] = \
self._class_to_ind[obj['name']]
object_idx += 1
if flip:
roi_dict['boxes'] = flip_boxes(
roi_dict['boxes'], roi_dict['width'])
if apply_flip:
roi_dict['boxes'] = \
box_util.flip_boxes(
roi_dict['boxes'],
width,
)
roi_dict['boxes'][:, 0] /= roi_dict['width']
roi_dict['boxes'][:, 1] /= roi_dict['height']
roi_dict['boxes'][:, 2] /= roi_dict['width']
roi_dict['boxes'][:, 3] /= roi_dict['height']
# Normalize to unit sizes
roi_dict['boxes'][:, 0::2] /= width
roi_dict['boxes'][:, 1::2] /= height
return roi_dict
def get(self, example):
img = np.frombuffer(example['content'], np.uint8)
img = cv2.imdecode(img, 1)
example = Example(example)
img = example.image
# Flip
flip = False
apply_flip = False
if self._mirror:
if np.random.randint(2) > 0:
img = img[:, ::-1, :]
flip = True
img = img[:, ::-1]
apply_flip = True
# Example -> RoIDict
roi_dict = self.make_roi_dict(example, flip)
roi_dict = self.make_roi_dict(example, apply_flip)
# Post-Process for gt boxes
# Shape like: [num_objects, {x1, y1, x2, y2, cls}]
box_dim = roi_dict['boxes'].shape[1]
gt_boxes = np.empty((roi_dict['gt_classes'].size, box_dim + 1), 'float32')
gt_boxes[:, :box_dim], gt_boxes[:, box_dim] = \
roi_dict['boxes'], roi_dict['gt_classes']
gt_boxes = np.empty((roi_dict['gt_classes'].size, 5), 'float32')
gt_boxes[:, :4], gt_boxes[:, 4] = roi_dict['boxes'], roi_dict['gt_classes']
# Distort => Expand => Sample => Resize
img, gt_boxes = self._image_aug(img, gt_boxes)
img, gt_boxes = self.augment_image(img, gt_boxes)
# Restore to the blob scale
gt_boxes[:, 0] *= cfg.SSD.RESIZE.WIDTH
......@@ -136,6 +116,10 @@ class DataTransformer(multiprocessing.Process):
gt_boxes[:, 2] *= cfg.SSD.RESIZE.WIDTH
gt_boxes[:, 3] *= cfg.SSD.RESIZE.HEIGHT
# Post-Process for image
if img.dtype == 'uint16':
img = img.astype('float32') / 256.
return img, gt_boxes
def run(self):
......
......@@ -16,15 +16,8 @@ from __future__ import print_function
import numpy as np
def generate_anchors(min_sizes, max_sizes, ratios, angles=()):
"""
Generate anchor (reference) windows by enumerating
aspect ratios, min_sizes, max_sizes wrt a reference ctr (x, y, w, h).
"""
if len(angles) > 0:
return generate_rotated_anchors(
min_sizes, max_sizes, ratios, angles)
def generate_anchors(min_sizes, max_sizes, ratios):
"""Generate anchors by enumerating aspect ratios and sizes."""
total_anchors = []
for idx, min_size in enumerate(min_sizes):
......@@ -47,37 +40,6 @@ def generate_anchors(min_sizes, max_sizes, ratios, angles=()):
return np.vstack(total_anchors)
def generate_rotated_anchors(min_sizes, max_sizes, ratios, angles):
"""
Generate anchor (reference) windows by enumerating
aspect ratios, min_sizes, max_sizes wrt a reference ctr (x, y, w, h).
"""
total_anchors = []
for angle in angles:
for idx, min_size in enumerate(min_sizes):
angle_array = np.ones((len(ratios), 1)) * angle
# Note that SSD assume it is a ctr-anchor
base_anchor = np.array([0, 0, min_size, min_size])
anchors = _ratio_enum(base_anchor, ratios, _mkanchors_v2)
if len(max_sizes) > 0:
max_size = max_sizes[idx]
_anchors = anchors[0].reshape((1, 4))
_anchors = np.vstack([
_anchors,
_max_size_enum(
base_anchor,
min_size,
max_size,
_mkanchors_v2,
)])
anchors = np.vstack([_anchors, anchors[1:]])
angle_array = np.vstack((angle_array, angle))
anchors = np.hstack((anchors, angle_array))
total_anchors.append(anchors)
return np.vstack(total_anchors)
def _whctrs(anchor):
"""Return width, height, x center, and y center for an anchor (window)."""
w, h = anchor[2], anchor[3]
......@@ -125,4 +87,3 @@ def _max_size_enum(base_anchor, min_size, max_size, make_fn):
if __name__ == '__main__':
print(generate_anchors(min_sizes=[30], max_sizes=[60], ratios=[1]))
print(generate_rotated_anchors(min_sizes=[30], max_sizes=[60], ratios=[1], angles=[1]))
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from lib.core.config import cfg
from lib.utils.framework import new_tensor
class HardMining(object):
def __call__(self, prob_wide, labels_wide, overlaps_wide):
prob_wide = prob_wide.numpy(True)
neg_ovr = cfg.SSD.OHEM.NEG_OVERLAP
neg_ratio = cfg.SSD.OHEM.NEG_POS_RATIO
# label ``-1`` will be ignored
new_labels_wide = -np.ones(labels_wide.shape, 'int64')
for ix in range(labels_wide.shape[0]):
labels = labels_wide[ix]
overlaps = overlaps_wide[ix]
prob = prob_wide[ix]
loss = np.zeros(labels.shape, 'float32')
inds = np.where(labels >= 0)[0]
loss[inds] = -np.log(
np.maximum(
prob[inds, labels[inds]],
np.finfo(float).eps,
)
)
# Filter negatives
fg_inds = np.where(labels > 0)[0]
neg_inds = np.where(labels == 0)[0]
neg_overlaps = overlaps[neg_inds]
eligible_neg_inds = np.where(neg_overlaps < neg_ovr)[0]
neg_inds = neg_inds[eligible_neg_inds]
# Apply mining on negatives
neg_loss = loss[neg_inds]
num_pos, num_neg = len(fg_inds), len(neg_inds)
num_bg = min(int(num_pos * neg_ratio), num_neg)
bg_inds = neg_inds[np.argsort(-neg_loss)][:num_bg]
new_labels_wide[ix][fg_inds] = labels[fg_inds] # Keep fg indices
new_labels_wide[ix][bg_inds] = 0 # Use hard negatives as bg indices
# Feed labels to compute cls loss
return {'labels': new_tensor(new_labels_wide)}
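# For reference, a minimal numpy-only sketch of the hard negative selection above on
# hypothetical toy shapes (1 image, 6 priors, 2 classes); the real module reads the
# ratios from cfg and wraps the result with new_tensor.
import numpy as np
prob = np.array([[[0.9, 0.1], [0.2, 0.8], [0.6, 0.4],
                  [0.3, 0.7], [0.8, 0.2], [0.4, 0.6]]], 'float32')
labels = np.array([[1, 0, 0, 0, -1, 0]], 'int64')        # -1 is ignored
overlaps = np.array([[0.9, 0.1, 0.2, 0.4, 0.0, 0.3]], 'float32')
neg_ovr, neg_ratio = 0.5, 3.0                            # assumed OHEM settings
new_labels = -np.ones(labels.shape, 'int64')
for ix in range(labels.shape[0]):
    loss = np.zeros(labels.shape[1], 'float32')
    inds = np.where(labels[ix] >= 0)[0]
    loss[inds] = -np.log(np.maximum(
        prob[ix][inds, labels[ix][inds]], np.finfo(float).eps))
    fg_inds = np.where(labels[ix] > 0)[0]
    neg_inds = np.where(labels[ix] == 0)[0]
    neg_inds = neg_inds[overlaps[ix][neg_inds] < neg_ovr]
    num_bg = min(int(len(fg_inds) * neg_ratio), len(neg_inds))
    bg_inds = neg_inds[np.argsort(-loss[neg_inds])][:num_bg]
    new_labels[ix][fg_inds] = labels[ix][fg_inds]   # keep the foreground
    new_labels[ix][bg_inds] = 0                     # keep only the hardest negatives
print(new_labels)  # [[ 1  0 -1  0 -1  0]]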
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon.vm.torch as torch
import numpy as np
from lib.core.config import cfg
from lib.utils.blob import array2tensor
class HardMiningLayer(torch.nn.Module):
def __init__(self):
super(HardMiningLayer, self).__init__()
def forward(self, conf_prob, match_labels, max_overlaps):
# Confidence of each matched box
conf_prob_wide = conf_prob.numpy(True)
# Label of each matched box
match_labels_wide = match_labels
# Max overlaps between default boxes and gt boxes
max_overlaps_wide = max_overlaps
# label ``-1`` will be ignored
labels_wide = -np.ones(match_labels_wide.shape, dtype=np.int64)
for ix in range(match_labels_wide.shape[0]):
match_labels = match_labels_wide[ix]
max_overlaps = max_overlaps_wide[ix]
conf_prob = conf_prob_wide[ix]
conf_loss = np.zeros(match_labels.shape, dtype=np.float32)
inds = np.where(match_labels >= 0)[0]
flt_min = np.finfo(float).eps
# Softmax cross-entropy
conf_loss[inds] = -np.log(np.maximum(
conf_prob[inds, match_labels[inds]], flt_min))
# Filter negatives
fg_inds = np.where(match_labels > 0)[0]
neg_inds = np.where(match_labels == 0)[0]
neg_overlaps = max_overlaps[neg_inds]
eligible_neg_inds = np.where(neg_overlaps < cfg.SSD.OHEM.NEG_OVERLAP)[0]
sel_inds = neg_inds[eligible_neg_inds]
# Do Mining
sel_loss = conf_loss[sel_inds]
num_pos = len(fg_inds)
num_sel = min(int(num_pos * cfg.SSD.OHEM.NEG_POS_RATIO), len(sel_inds))
sorted_sel_inds = sel_inds[np.argsort(-sel_loss)]
bg_inds = sorted_sel_inds[:num_sel]
labels_wide[ix][fg_inds] = match_labels[fg_inds] # Keep fg indices
labels_wide[ix][bg_inds] = 0 # Use hard negatives as bg indices
# Feed labels to compute cls loss
return {'labels': array2tensor(labels_wide)}
......@@ -14,22 +14,16 @@ from __future__ import division
from __future__ import print_function
import numpy as np
import dragon.vm.torch as torch
from lib.core.config import cfg
from lib.utils.blob import array2tensor
from lib.utils.boxes import bbox_overlaps
from lib.utils.boxes import bbox_transform
from lib.utils.boxes import dismantle_gt_boxes
from lib.utils import boxes as box_util
from lib.utils.framework import new_tensor
class MultiBoxMatchLayer(torch.nn.Module):
def __init__(self):
super(MultiBoxMatchLayer, self).__init__()
def forward(self, prior_boxes, gt_boxes):
class MultiBoxMatch(object):
def __call__(self, prior_boxes, gt_boxes):
num_images = cfg.TRAIN.IMS_PER_BATCH
gt_boxes_wide = dismantle_gt_boxes(gt_boxes, num_images)
gt_boxes_wide = box_util.dismantle_boxes(gt_boxes, num_images)
num_priors, box_dim = prior_boxes.shape[:]
# Do matching between prior boxes and gt boxes
......@@ -40,20 +34,20 @@ class MultiBoxMatchLayer(torch.nn.Module):
for ix in range(num_images):
# GT boxes (x1, y1, x2, y2, label)
gt_boxes = gt_boxes_wide[ix]
if gt_boxes.shape[0] == 0:
num_gt = gt_boxes.shape[0]
if num_gt == 0:
continue
# Compute the overlaps between prior boxes and gt boxes
overlaps = bbox_overlaps(prior_boxes, gt_boxes)
argmax_overlaps = overlaps.argmax(axis=1)
overlaps = box_util.bbox_overlaps(prior_boxes, gt_boxes)
argmax_overlaps = overlaps.argmax(1)
max_overlaps = overlaps[np.arange(num_priors), argmax_overlaps]
max_overlaps_wide[ix] = max_overlaps
# Bipartite matching and assignments
bipartite_inds = overlaps.argmax(axis=0)
bipartite_inds = overlaps.argmax(0)
class_assignment = gt_boxes[:, -1]
match_inds_wide[ix][bipartite_inds] = np.arange(
gt_boxes.shape[0], dtype=np.int32)
match_inds_wide[ix][bipartite_inds] = np.arange(num_gt, dtype='int32')
match_labels_wide[ix][bipartite_inds] = class_assignment
# Per prediction matching and assignments
......@@ -72,11 +66,8 @@ class MultiBoxMatchLayer(torch.nn.Module):
}
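# A small numpy illustration of the bipartite step above, using a hypothetical overlap
# matrix of 3 prior boxes (rows) against 2 ground-truth boxes (columns).
import numpy as np
overlaps = np.array([[0.1, 0.7],
                     [0.6, 0.2],
                     [0.3, 0.4]], 'float32')
gt_labels = np.array([5, 8], 'int64')            # class of each gt box
match_inds = -np.ones((3,), 'int32')
match_labels = np.zeros((3,), 'int64')
bipartite_inds = overlaps.argmax(0)              # each gt claims its best prior -> [1, 0]
match_inds[bipartite_inds] = np.arange(2, dtype='int32')
match_labels[bipartite_inds] = gt_labels
print(match_inds)    # [ 1  0 -1]
print(match_labels)  # [8 5 0]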
class MultiBoxTargetLayer(torch.nn.Module):
def __init__(self):
super(MultiBoxTargetLayer, self).__init__()
def forward(
class MultiBoxTarget(object):
def __call__(
self,
match_inds,
match_labels,
......@@ -90,15 +81,15 @@ class MultiBoxTargetLayer(torch.nn.Module):
match_labels_wide = match_labels
num_priors, box_dim = prior_boxes.shape[:]
gt_boxes_wide = dismantle_gt_boxes(gt_boxes, num_images)
gt_boxes_wide = box_util.dismantle_boxes(gt_boxes, num_images)
bbox_targets_wide = np.zeros((num_images, num_priors, box_dim), 'float32')
bbox_inside_weights_wide = np.zeros(bbox_targets_wide.shape, 'float32')
bbox_outside_weights_wide = np.zeros(bbox_targets_wide.shape, 'float32')
# Number of matched boxes(#positive)
# We divide it by num of images, as SmoothLLLoss will divide it also
n_pos = float(max(len(np.where(match_labels_wide > 0)[0]), 1))
# Multiply by the number of images to compensate for the smooth L1 loss
bbox_reg_weight = cfg.SSD.BBOX_REG_WEIGHT * num_images / n_pos
for ix in range(num_images):
......@@ -106,7 +97,7 @@ class MultiBoxTargetLayer(torch.nn.Module):
if gt_boxes.shape[0] == 0:
continue
# Sample fg-rois(default boxes) & gt-rois(gt boxes)
# Select ground-truth
match_inds = match_inds_wide[ix]
match_labels = match_labels_wide[ix]
ex_inds = np.where(match_labels > 0)[0]
......@@ -114,14 +105,18 @@ class MultiBoxTargetLayer(torch.nn.Module):
gt_assignment = match_inds[ex_inds]
gt_rois = gt_boxes[gt_assignment]
# Assign targets & inside weights & outside weights
bbox_targets_wide[ix][ex_inds] = bbox_transform(
ex_rois, gt_rois, cfg.BBOX_REG_WEIGHTS)
# Assign bbox targets
bbox_targets_wide[ix][ex_inds] = \
box_util.bbox_transform(
ex_rois,
gt_rois,
cfg.BBOX_REG_WEIGHTS,
)
bbox_inside_weights_wide[ix, :] = 1.
bbox_outside_weights_wide[ix][ex_inds] = bbox_reg_weight
return {
'bbox_targets': array2tensor(bbox_targets_wide),
'bbox_inside_weights': array2tensor(bbox_inside_weights_wide),
'bbox_outside_weights': array2tensor(bbox_outside_weights_wide),
'bbox_targets': new_tensor(bbox_targets_wide),
'bbox_inside_weights': new_tensor(bbox_inside_weights_wide),
'bbox_outside_weights': new_tensor(bbox_outside_weights_wide),
}
......@@ -14,18 +14,17 @@ from __future__ import division
from __future__ import print_function
import numpy as np
import dragon.vm.torch as torch
from lib.core.config import cfg
from lib.ssd.generate_anchors import generate_anchors
from lib.utils import logger
class PriorBoxLayer(torch.nn.Module):
class PriorBox(object):
"""Generate default boxes(anchors)."""
def __init__(self):
super(PriorBoxLayer, self).__init__()
super(PriorBox, self).__init__()
min_sizes = cfg.SSD.MULTIBOX.MIN_SIZES
max_sizes = cfg.SSD.MULTIBOX.MAX_SIZES
if len(max_sizes) > 0:
......@@ -34,7 +33,6 @@ class PriorBoxLayer(torch.nn.Module):
len(min_sizes), len(max_sizes)))
self.strides = cfg.SSD.MULTIBOX.STRIDES
aspect_ratios = cfg.SSD.MULTIBOX.ASPECT_RATIOS
aspect_angles = cfg.SSD.MULTIBOX.ASPECT_ANGLES
self.base_anchors = []
for i in range(len(min_sizes)):
self.base_anchors.append(
......@@ -44,11 +42,10 @@ class PriorBoxLayer(torch.nn.Module):
max_sizes[i] if isinstance(
max_sizes[i], (list, tuple)) else [max_sizes[i]],
aspect_ratios[i],
aspect_angles,
)
)
def forward(self, features):
def __call__(self, features):
all_anchors = []
for i in range(len(self.strides)):
# 1. Generate base grids
......
......@@ -18,12 +18,11 @@ import dragon.vm.torch as torch
import numpy as np
from lib.core.config import cfg
from lib.modeling.detector import new_detector
from lib.nms import nms_wrapper
from lib.utils import boxes as box_util
from lib.utils import framework
from lib.utils import time_util
from lib.utils.boxes import bbox_transform_inv
from lib.utils.boxes import clip_boxes
from lib.utils.vis import vis_one_image
def get_images(ims):
......@@ -34,7 +33,10 @@ def get_images(ims):
im_scales.append((float(target_h) / im.shape[0],
float(target_w) / im.shape[1]))
processed_ims.append(cv2.resize(im, (target_w, target_h)))
ims_blob = np.array(processed_ims, dtype=np.uint8)
if ims[0].dtype == 'uint16':
ims_blob = np.array(processed_ims, dtype='float32') / 256.
else:
ims_blob = np.array(processed_ims, dtype='uint8')
return ims_blob, im_scales
......@@ -43,24 +45,23 @@ def ims_detect(detector, ims):
data, im_scales = get_images(ims)
# Do Forward
if not hasattr(detector, 'frozen_graph'):
image = torch.from_numpy(data)
with torch.no_grad():
with torch.jit.Recorder(retain_ops=True):
outputs = detector.forward(inputs={'data': image})
detector.frozen_graph = \
framework.FrozenGraph(
{'data': image},
{'cls_prob': outputs['cls_prob'],
'bbox_pred': outputs['bbox_pred']},
{'prior_boxes': outputs['prior_boxes']},
)
outputs = detector.frozen_graph(data=data)
if not hasattr(detector, 'graph'):
with framework.new_workspace().as_default():
with torch.no_grad():
with torch.jit.Tracer(retain_ops=True):
inputs = {'data': torch.from_numpy(data)}
outputs = detector.forward(inputs)
detector.graph = \
framework.Graph(inputs, {
'cls_prob': outputs['cls_prob'],
'bbox_pred': outputs['bbox_pred']
}, {'prior_boxes': outputs['prior_boxes']})
outputs = detector.graph(data=data)
# Decode results
batch_boxes = []
for i in range(len(im_scales)):
boxes = bbox_transform_inv(
boxes = box_util.bbox_transform_inv(
outputs['prior_boxes'],
outputs['bbox_pred'][i],
cfg.BBOX_REG_WEIGHTS,
......@@ -69,39 +70,40 @@ def ims_detect(detector, ims):
boxes[:, 1] /= im_scales[i][0]
boxes[:, 2] /= im_scales[i][1]
boxes[:, 3] /= im_scales[i][0]
batch_boxes.append(clip_boxes(boxes, ims[i].shape))
batch_boxes.append(box_util.clip_boxes(boxes, ims[i].shape))
return outputs['cls_prob'], batch_boxes
def test_net(detector, server):
# Load settings
classes = server.classes
num_images = server.num_images
num_classes = server.num_classes
all_boxes = [[[] for _ in range(num_images)] for _ in range(num_classes)]
_t = {'im_detect': time_util.Timer(), 'misc': time_util.Timer()}
def test_net(weights, num_classes, q_in, q_out, device):
num_classes, cfg.GPU_ID = num_classes, device
detector = new_detector(device, weights)
for batch_idx in range(0, num_images, cfg.TEST.IMS_PER_BATCH):
# Collect raw images and ground-truths
image_ids, raw_images = [], []
must_stop = False
_t = time_util.new_timers('im_detect', 'misc')
for item_idx in range(cfg.TEST.IMS_PER_BATCH):
if batch_idx + item_idx >= num_images:
continue
image_id, raw_image = server.get_image()
image_ids.append(image_id)
while True:
if must_stop:
break
indices, raw_images = [], []
for i in range(cfg.TEST.IMS_PER_BATCH):
idx, raw_image = q_in.get()
if raw_image is None:
must_stop = True
break
indices.append(idx)
raw_images.append(raw_image)
if len(raw_images) == 0:
continue
with _t['im_detect'].tic_and_toc():
batch_scores, batch_boxes = ims_detect(detector, raw_images)
batch_scores, batch_boxes = \
ims_detect(detector, raw_images)
_t['misc'].tic()
for item_idx in range(len(batch_scores)):
i = batch_idx + item_idx
scores = batch_scores[item_idx]
boxes = batch_boxes[item_idx]
for i in range(len(batch_scores)):
_t['misc'].tic()
scores, boxes = batch_scores[i], batch_boxes[i]
boxes_this_image = [[]]
for j in range(1, num_classes):
inds = np.where(scores[:, j] > cfg.TEST.SCORE_THRESH)[0]
......@@ -127,44 +129,16 @@ def test_net(detector, server):
force_cpu=True,
)
cls_detections = cls_detections[keep, :]
all_boxes[j][i] = cls_detections
boxes_this_image.append(cls_detections)
if cfg.VIS or cfg.VIS_ON_FILE:
vis_one_image(
raw_images[item_idx],
classes,
boxes_this_image,
thresh=cfg.VIS_TH,
box_alpha=1.,
show_class=True,
filename=server.get_save_filename(image_ids[item_idx]),
)
# Limit to max_per_image detections *over all classes*
if cfg.TEST.DETECTIONS_PER_IM > 0:
image_scores = []
for j in range(1, num_classes):
if len(all_boxes[j][i]) < 1:
continue
image_scores.append(all_boxes[j][i][:, -1])
if len(image_scores) > 0:
image_scores = np.hstack(image_scores)
if len(image_scores) > cfg.TEST.DETECTIONS_PER_IM:
image_thresh = np.sort(image_scores)[-cfg.TEST.DETECTIONS_PER_IM]
for j in range(1, num_classes):
keep = np.where(all_boxes[j][i][:, -1] >= image_thresh)[0]
all_boxes[j][i] = all_boxes[j][i][keep, :]
_t['misc'].toc()
print('\rim_detect: {:d}/{:d} {:.3f}s {:.3f}s'
.format(batch_idx + cfg.TEST.IMS_PER_BATCH,
num_images,
_t['im_detect'].average_time,
_t['misc'].average_time),
end='')
print('\n>>>>>>>>>>>>>>>>>>> Evaluating <<<<<<<<<<<<<<<<<<<<')
print('Evaluating detections')
server.evaluate_detections(all_boxes)
_t['misc'].toc()
q_out.put((
indices[i],
{
'im_detect': _t['im_detect'].average_time,
'misc': _t['misc'].average_time,
},
{
'boxes': boxes_this_image,
},
))
......@@ -23,9 +23,8 @@ import numpy as np
import numpy.random as npr
from lib.core.config import cfg
from lib.utils import boxes as box_util
from lib.utils import logger
from lib.utils.boxes import clip_tiled_boxes
from lib.utils.boxes import iou
class Compose(object):
......@@ -52,7 +51,7 @@ class Distort(object):
(PIL.ImageEnhance.Contrast, self._contrast_prob),
(PIL.ImageEnhance.Color, self._saturation_prob),
]
npr.shuffle(transforms)
np.random.shuffle(transforms)
for transform_fn, prob in transforms:
if npr.uniform() < prob:
img = transform_fn(img)
......@@ -145,7 +144,7 @@ class Sample(object):
@classmethod
def _compute_overlaps(cls, rand_box, gt_boxes):
return iou(np.expand_dims(rand_box, 0), gt_boxes[:, 0:4])
return box_util.iou(np.expand_dims(rand_box, 0), gt_boxes[:, 0:4])
@classmethod
def _generate_sample(cls, sample_param):
......@@ -217,7 +216,7 @@ class Sample(object):
new_gt_boxes[:, 1] = (gt_boxes[:, 1] * im_h - h_off)
new_gt_boxes[:, 2] = (gt_boxes[:, 2] * im_w - w_off)
new_gt_boxes[:, 3] = (gt_boxes[:, 3] * im_h - h_off)
new_gt_boxes = clip_tiled_boxes(new_gt_boxes, (crop_h, crop_w))
new_gt_boxes = box_util.clip_boxes(new_gt_boxes, (crop_h, crop_w))
new_gt_boxes[:, 0] = new_gt_boxes[:, 0] / crop_w
new_gt_boxes[:, 1] = new_gt_boxes[:, 1] / crop_h
new_gt_boxes[:, 2] = new_gt_boxes[:, 2] / crop_w
......
......@@ -31,6 +31,7 @@ def im_list_to_blob(ims):
Assume that the images are not mean-subtracted and are in BGR order.
"""
blob_dtype = 'uint8' if ims[0].dtype == 'uint8' else 'float32'
max_shape = np.array([im.shape for im in ims]).max(axis=0)
if cfg.MODEL.COARSEST_STRIDE > 0:
......@@ -38,11 +39,13 @@ def im_list_to_blob(ims):
max_shape[0] = int(np.ceil(max_shape[0] / stride) * stride)
max_shape[1] = int(np.ceil(max_shape[1] / stride) * stride)
unify_shape = (len(ims), max_shape[0], max_shape[1], 3)
blob = np.empty(unify_shape, dtype=np.uint8)
blob_shape = (len(ims), max_shape[0], max_shape[1], 3)
blob = np.empty(blob_shape, blob_dtype)
blob[:] = cfg.PIXEL_MEANS
for i, im in enumerate(ims):
if im.dtype == 'uint16':
im = im.astype(blob_dtype) / 256.
blob[i, :im.shape[0], :im.shape[1], :] = im
return blob
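# How the padded blob shape above is chosen, assuming a hypothetical batch of two
# images and MODEL.COARSEST_STRIDE = 32.
import numpy as np
shapes = np.array([(600, 800, 3), (480, 640, 3)])
max_shape = shapes.max(axis=0)                                 # [600, 800, 3]
stride = 32.
max_shape[0] = int(np.ceil(max_shape[0] / stride) * stride)    # 608
max_shape[1] = int(np.ceil(max_shape[1] / stride) * stride)    # 800
blob_shape = (2, int(max_shape[0]), int(max_shape[1]), 3)
print(blob_shape)                                              # (2, 608, 800, 3)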
......@@ -52,12 +55,16 @@ def mask_list_to_blob(masks):
"""Convert a list of masks into a network input."""
max_shape = np.array([mask.shape[1:] for mask in masks]).max(axis=0)
num_masks = np.array([mask.shape[0] for mask in masks]).sum()
blob = np.zeros((num_masks, max_shape[0], max_shape[1]), dtype=np.uint8)
pos = 0
blob_shape = (num_masks, max_shape[0], max_shape[1])
blob = np.zeros(blob_shape, 'uint8')
count = 0
for mask in masks:
blob[pos : pos + mask.shape[0],
0 : mask.shape[1], 0 : mask.shape[2]] = mask
pos += mask.shape[0]
n, h, w = mask.shape
blob[count:count + n, :h, :w] = mask
count += n
return blob
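# A toy call pattern for the packing above: two hypothetical instance-mask arrays of
# different spatial sizes are padded into one zero-filled blob.
import numpy as np
masks = [np.ones((2, 14, 14), 'uint8'), np.ones((3, 28, 20), 'uint8')]
max_h, max_w = np.array([m.shape[1:] for m in masks]).max(axis=0)
blob = np.zeros((sum(m.shape[0] for m in masks), max_h, max_w), 'uint8')
count = 0
for m in masks:
    n, h, w = m.shape
    blob[count:count + n, :h, :w] = m
    count += n
print(blob.shape)  # (5, 28, 20)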
......@@ -88,22 +95,3 @@ def prep_im_for_blob(img, target_size, max_size):
im_scale *= jitter
return resize_image(img, im_scale, im_scale), im_scale, jitter
def array2tensor(array, enforce_cpu=False):
if isinstance(array, np.ndarray):
# Zero-Copy from numpy
cpu_tensor = torch.from_numpy(array)
else:
cpu_tensor = array
return cpu_tensor if enforce_cpu else \
cpu_tensor.cuda(cfg.GPU_ID)
def tensor2array(tensor, copy=False):
if isinstance(tensor, torch.Tensor):
# Zero-Copy from numpy
array = tensor.numpy(True)
else:
array = tensor
return array.copy() if copy else array
......@@ -20,7 +20,6 @@ from __future__ import print_function
import numpy as np
from lib.utils import cython_bbox
from lib.utils import rotated_boxes
def intersection(boxes1, boxes2):
......@@ -109,8 +108,6 @@ def ioa2(boxes1, boxes2):
def bbox_overlaps(boxes1, boxes2):
"""Compute the overlaps between two group of boxes."""
if boxes1.shape[1] == 5:
return rotated_boxes.bbox_overlaps(boxes1, boxes2)
return cython_bbox.bbox_overlaps(
np.ascontiguousarray(boxes1, dtype=np.float),
np.ascontiguousarray(boxes2, dtype=np.float),
......@@ -119,10 +116,6 @@ def bbox_overlaps(boxes1, boxes2):
def bbox_transform(ex_rois, gt_rois, weights=(1., 1., 1., 1.)):
"""Transform the boxes to the regression targets."""
if len(weights) == 5:
# Transform the rotated boxes
return rotated_boxes.bbox_transform(ex_rois, gt_rois, weights)
ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.
ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.
ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths
......@@ -134,20 +127,16 @@ def bbox_transform(ex_rois, gt_rois, weights=(1., 1., 1., 1.)):
gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights
wx, wy, ww, wh = weights
targets_dx = wx * (gt_ctr_x - ex_ctr_x) / ex_widths
targets_dy = wy * (gt_ctr_y - ex_ctr_y) / ex_heights
targets_dw = ww * np.log(gt_widths / ex_widths)
targets_dh = wh * np.log(gt_heights / ex_heights)
targets = [wx * (gt_ctr_x - ex_ctr_x) / ex_widths]
targets += [wy * (gt_ctr_y - ex_ctr_y) / ex_heights]
targets += [ww * np.log(gt_widths / ex_widths)]
targets += [wh * np.log(gt_heights / ex_heights)]
return np.vstack((targets_dx, targets_dy, targets_dw, targets_dh)).transpose()
return np.vstack(targets).transpose()
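# A worked instance of the targets above with unit weights: a 10 x 10 example box
# regressed to a 20 x 20 ground-truth box shifted by 10 pixels.
import numpy as np
ex_roi = np.array([0., 0., 9., 9.])      # w = h = 10
gt_roi = np.array([5., 5., 24., 24.])    # w = h = 20
ex_w, ex_h = ex_roi[2] - ex_roi[0] + 1., ex_roi[3] - ex_roi[1] + 1.
gt_w, gt_h = gt_roi[2] - gt_roi[0] + 1., gt_roi[3] - gt_roi[1] + 1.
dx = ((gt_roi[0] + 0.5 * gt_w) - (ex_roi[0] + 0.5 * ex_w)) / ex_w   # 1.0
dy = ((gt_roi[1] + 0.5 * gt_h) - (ex_roi[1] + 0.5 * ex_h)) / ex_h   # 1.0
dw, dh = np.log(gt_w / ex_w), np.log(gt_h / ex_h)                   # ~0.693 each
print(dx, dy, dw, dh)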
def bbox_transform_inv(boxes, deltas, weights=(1., 1., 1., 1.)):
"""Decode the final boxes according to the deltas."""
if len(weights) == 5:
# Decode the rotated boxes
return rotated_boxes.bbox_transform_inv(boxes, deltas, weights)
if boxes.shape[0] == 0:
return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype)
......@@ -188,8 +177,6 @@ def boxes_area(boxes):
def clip_boxes(boxes, im_shape):
if boxes.shape[1] == 5:
return rotated_boxes.clip_boxes(boxes, im_shape)
# x1 >= 0
boxes[:, 0] = np.maximum(np.minimum(boxes[:, 0], im_shape[1] - 1), 0)
# y1 >= 0
......@@ -234,8 +221,6 @@ def expand_boxes(boxes, scale):
def flip_boxes(boxes, width):
"""Flip the boxes horizontally."""
if boxes.shape[1] == 5:
return rotated_boxes.flip_boxes(boxes, width)
flip_boxes = boxes.copy()
old_x1 = boxes[:, 0].copy()
old_x2 = boxes[:, 2].copy()
......@@ -252,10 +237,10 @@ def filter_boxes(boxes, min_size):
return keep
def dismantle_gt_boxes(gt_boxes, num_images):
def dismantle_boxes(gt_boxes, num_images):
"""Dismantle the packed ground-truth boxes."""
return [
gt_boxes[
np.where(gt_boxes[:, -1].astype(np.int32) == ix)[0]
][:, :-1] for ix in range(num_images)
np.where(gt_boxes[:, -1].astype(np.int32) == i)[0]
][:, :-1] for i in range(num_images)
]
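# A small illustration of the unpacking above: the last column of the packed gt boxes
# is the batch index appended by the data loader.
import numpy as np
gt_boxes = np.array([[0., 0., 10., 10., 1., 0.],
                     [5., 5., 20., 20., 2., 0.],
                     [3., 3., 12., 12., 1., 1.]], 'float32')
num_images = 2
per_image = [
    gt_boxes[np.where(gt_boxes[:, -1].astype(np.int32) == i)[0]][:, :-1]
    for i in range(num_images)
]
print(per_image[0].shape, per_image[1].shape)  # (2, 5) (1, 5)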
......@@ -16,9 +16,16 @@ from __future__ import print_function
import collections
import dragon
import dragon.vm.torch as torch
from dragon.core.framework import tensor_util
from dragon.core.util import six
import dragon.vm.torch as torch
import numpy as np
from lib.core.config import cfg
def feed_tensor(tensor, array):
tensor_util.set_array(tensor, array)
def get_param_groups(module, bias_lr=1., bias_decay=0.):
......@@ -52,7 +59,7 @@ def get_param_groups(module, bias_lr=1., bias_decay=0.):
}
]
for name, param in module.named_parameters():
gi = 1 if 'bias' in name else 0
gi = 0 if 'weight' in name and param.dim() > 1 else 1
param_groups[gi]['params'].append(param)
if len(param_groups[1]['params']) == 0:
param_groups.pop() # Remove empty group
......@@ -68,7 +75,54 @@ def get_workspace():
The default workspace.
"""
return dragon.workspace.get_default()
return dragon.get_workspace()
def new_placeholder(device=None):
"""Create a new tensor to feed data.
Parameters
----------
device : int, optional
The device index.
Returns
-------
dragon.vm.torch.Tensor
The placeholder tensor.
"""
value = torch.zeros(1)
if device is not None:
return value.cuda(device)
return value
def new_tensor(data, enforce_cpu=False):
"""Create a new tensor from the data.
Parameters
----------
data : array_like
The data value.
enforce_cpu : bool, optional, default=False
**True** to enforce the cpu storage.
Returns
-------
dragon.vm.torch.Tensor
The tensor holding the data.
"""
if isinstance(data, np.ndarray):
tensor = torch.from_numpy(data)
elif isinstance(data, torch.Tensor):
tensor = data
else:
tensor = torch.tensor(data)
if not enforce_cpu:
tensor = tensor.cuda(cfg.GPU_ID)
return tensor
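# Hypothetical usage of the helper above: wrap a numpy array on the CPU, or let it
# be copied to the device configured by cfg.GPU_ID.
import numpy as np
cpu_value = new_tensor(np.zeros((2, 3), 'float32'), enforce_cpu=True)
gpu_value = new_tensor([1., 2., 3.])   # lives on the configured GPU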
def new_workspace(merge_default=True):
......@@ -112,10 +166,10 @@ def reset_workspace(workspace=None, merge_default=True):
return new_workspace(merge_default)
class FrozenGraph(object):
class Graph(object):
"""Simple sequential graph to accelerate inference.
The frozen graph reduces the overhead of python functions
The graph reduces the overhead of python functions
under eager execution. Such cost is at least 15ms
for common backbones, which limits inference to about 60 FPS.
......@@ -130,11 +184,20 @@ class FrozenGraph(object):
for k, v in input_dict.items():
input_dict[k] = v.name if hasattr(v, 'name') else v
return input_dict
self.placeholders = {}
self._inputs = canonicalize(inputs)
self._outputs = canonicalize(outputs)
self._constants = canonicalize(constants)
self._graph = new_workspace()
self._tape = torch.jit.get_default_recorder()
self._workspace = get_workspace()
self._tracer = torch.jit.get_tracer()
@property
def workspace(self):
return self._workspace
@workspace.setter
def workspace(self, value):
self._workspace = value
def forward(self, **kwargs):
# Assign inputs
......@@ -142,8 +205,8 @@ class FrozenGraph(object):
value = kwargs.get(name, None)
tensor_util.set_array(tensor, value)
# Replay the tape
self._tape.replay()
# Replay the traced expressions
self._tracer.replay()
# Collect outputs
# 1) Target results
......@@ -159,7 +222,7 @@ class FrozenGraph(object):
return outputs
def __call__(self, **kwargs):
with self._graph.as_default():
with self._workspace.as_default():
return self.forward(**kwargs)
......
......@@ -30,7 +30,7 @@ def distort_image(img):
]
np.random.shuffle(transforms)
for transform in transforms:
if np.random.uniform() < .5:
if np.random.uniform() < 0.5:
img = transform(img)
img = img.enhance(1. + np.random.uniform(-.4, .4))
return np.array(img)
......@@ -71,12 +71,6 @@ def resize_image(img, fx, fy):
)
# Faster and robust resizing than OpenCV methods
def resize_mask(mask, size):
mask = PIL.Image.fromarray(mask)
return np.array(mask.resize(size, PIL.Image.NEAREST))
def scale_image(img):
processed_ims, ims_scales = [], []
......
......@@ -17,7 +17,24 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import cv2
import numpy as np
import PIL.Image
from lib.utils import boxes as box_util
def dismantle_masks(gt_boxes, gt_masks, num_images):
"""Dismantle the packed ground-truth boxes."""
return [
gt_boxes[
np.where(gt_boxes[:, -1].astype(np.int32) == i)[0]
][:, :-1] for i in range(num_images)
], [
gt_masks[
np.where(gt_boxes[:, -1].astype(np.int32) == i)[0]
] for i in range(num_images)
]
def intersect_box_mask(ex_box, gt_box, gt_mask):
......@@ -66,3 +83,100 @@ def mask_overlap(box1, box2, mask1, mask2):
if union < 1.:
return 0.
return float(inter) / float(union)
def project_masks(
masks,
boxes,
height,
width,
thresh=0.5,
data_format='HWC',
data_order='F',
):
"""Project the predicting masks to a image.
Parameters
----------
masks : numpy.ndarray
The masks packed in (C, H, W) format.
boxes : numpy.ndarray
The predicting bounding boxes.
height : int
The height of image.
width : int
The width of image.
thresh : float, optional, default=0.5
The threshold to binarize floating mask.
data_format : {'HWC', 'CHW'}, optional
The data format of output image.
data_order : {'F', 'C'}, optional
The fortran-style or c-style order.
Returns
-------
numpy.ndarray
The output image.
"""
num_pred = boxes.shape[0]
assert masks.shape[0] == num_pred
mask_shape = [height, width]
if data_format == 'HWC':
mask_shape += [num_pred]
elif data_format == 'CHW':
mask_shape = [num_pred] + mask_shape
else:
raise ValueError('Unknown data format', data_format)
mask_image = np.zeros(mask_shape, 'uint8', data_order)
M = masks[0].shape[0]
scale = (M + 2.) / M
ref_boxes = box_util.expand_boxes(boxes, scale)
ref_boxes = ref_boxes.astype(np.int32)
padded_mask = np.zeros((M + 2, M + 2), 'float32')
for i in range(num_pred):
ref_box = ref_boxes[i, :4]
mask = masks[i]
padded_mask[1:-1, 1:-1] = mask[:, :]
w = ref_box[2] - ref_box[0] + 1
h = ref_box[3] - ref_box[1] + 1
w = np.maximum(w, 1)
h = np.maximum(h, 1)
mask = cv2.resize(padded_mask, (w, h))
mask = np.array(mask > thresh, 'uint8')
x1 = max(ref_box[0], 0)
y1 = max(ref_box[1], 0)
x2 = min(ref_box[2] + 1, width)
y2 = min(ref_box[3] + 1, height)
if data_format == 'HWC':
mask_image[y1:y2, x1:x2, i] = \
mask[(y1 - ref_box[1]):(y2 - ref_box[1]),
(x1 - ref_box[0]):(x2 - ref_box[0])]
elif data_format == 'CHW':
mask_image[i, y1:y2, x1:x2] = \
mask[(y1 - ref_box[1]):(y2 - ref_box[1]),
(x1 - ref_box[0]):(x2 - ref_box[0])]
return mask_image
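# A toy call to project_masks (assuming the function above is in scope): one 28 x 28
# floating mask is pasted into a 100 x 120 canvas at its predicted box.
import numpy as np
masks = np.random.rand(1, 28, 28).astype('float32')
boxes = np.array([[20., 30., 60., 80.]], 'float32')
mask_image = project_masks(masks, boxes, height=100, width=120, data_format='CHW')
print(mask_image.shape)  # (1, 100, 120)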
def resize_mask(mask, size):
"""Resize the mask with nearest neighbor method.
PIL implementation while not OpenCV is used,
as we found the former will provide higher mask AP.
Parameters
----------
mask : numpy.ndarray
The 2d mask array.
size : Sequence[int]
The output width and height.
Returns
-------
numpy.ndarray
The resized mask.
"""
mask = PIL.Image.fromarray(mask)
return np.array(mask.resize(size, PIL.Image.NEAREST))
\ No newline at end of file
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import ctypes
import math
import os
import numpy as np
class _CppExtension(object):
dtype_mapping = {
'int32': ctypes.c_int32,
'float64': ctypes.c_double,
}
def __init__(self, library_name):
libc = ctypes.cdll.LoadLibrary(
os.path.join(os.path.split(
os.path.abspath(__file__))[0],
library_name,
)
)
def load_func(name, arg_types):
func = getattr(libc, name)
func.argtypes = self.get_arg_types(*arg_types)
return func
self._apply_cpu_nms = load_func(
'apply_cpu_nms', (
('float64', 1), # dets
('int32', 1), # indices
('int32', 1), # n
('float64', 0), # thresh
)
)
self._bbox_overlaps = load_func(
'bbox_overlaps', (
('float64', 1), # boxes1
('float64', 1), # boxes2
('int32', 1), # n, k
('float64', 1) # overlaps
)
)
@staticmethod
def array2ptr(array):
return array.ctypes.data_as(
_CppExtension.get_ptr(str(array.dtype)))
@staticmethod
def contiguous(array, dtype='float64'):
return np.ascontiguousarray(array.flatten(), dtype)
@staticmethod
def get_arg_types(*args):
arg_types = []
for (dtype, is_pointer) in args:
arg_types.append(
_CppExtension.get_ptr(dtype) if is_pointer
else _CppExtension.dtype_mapping[dtype]
)
return arg_types
@staticmethod
def get_ptr(dtype):
return ctypes.POINTER(_CppExtension.dtype_mapping[dtype])
@staticmethod
def ptr2array(ptr, shape):
return np.ctypeslib.as_array(
shape.from_address(
ctypes.addressof(ptr.contents)
))
def bbox_overlaps(self, boxes1, boxes2):
"""Computer overlaps between boxes and query boxes."""
def canonicalize(boxes):
box_dim = boxes.shape[1]
if box_dim > 5:
boxes = boxes[:, :5]
elif box_dim < 5:
raise ValueError('Expected box5d.')
return self.contiguous(boxes, 'float64')
n, k = boxes1.shape[0], boxes2.shape[0]
boxes1 = canonicalize(boxes1)
boxes2 = canonicalize(boxes2)
overlaps_shape = (ctypes.c_int32 * 2)()
overlaps_shape[:] = (n, k)
overlaps = np.zeros((n * k,), 'float64')
overlaps_ptr = self.array2ptr(overlaps)
self._bbox_overlaps(
self.array2ptr(boxes1),
self.array2ptr(boxes2),
ctypes.cast(overlaps_shape, self.get_ptr('int32')),
overlaps_ptr,
)
return self.ptr2array(overlaps_ptr, ctypes.c_double * k * n)
def cpu_nms(self, dets, thresh):
"""Apply Hard-NMS."""
if dets.shape[1] != 6:
raise ValueError('Expected det6d.')
order = dets[:, 5].argsort()[::-1]
sorted_dets = dets[order, :]
num_keep = sorted_dets.shape[0]
num_keep_ins = ctypes.c_int32(num_keep)
indices = np.zeros((num_keep,), np.int32)
indices_ptr = self.array2ptr(indices)
self._apply_cpu_nms(
self.array2ptr(self.contiguous(dets, 'float64')),
indices_ptr,
ctypes.byref(num_keep_ins),
ctypes.c_double(thresh),
)
keep_indices = self.ptr2array(
indices_ptr, (ctypes.c_int32 * num_keep_ins.value))
return list(order[keep_indices])
def bbox_transform(ex_rois, gt_rois, weights=(1., 1., 1., 1., 1.)):
"""Transform the boxes to the regression targets."""
ex_ctr_x = ex_rois[:, 0]
ex_ctr_y = ex_rois[:, 1]
ex_widths = ex_rois[:, 2]
ex_heights = ex_rois[:, 3]
ex_angles = ex_rois[:, 4]
gt_ctr_x = gt_rois[:, 0]
gt_ctr_y = gt_rois[:, 1]
gt_widths = gt_rois[:, 2]
gt_heights = gt_rois[:, 3]
gt_angles = gt_rois[:, 4]
wx, wy, ww, wh, wa = weights
targets_dx = wx * (gt_ctr_x - ex_ctr_x) / ex_widths
targets_dy = wy * (gt_ctr_y - ex_ctr_y) / ex_heights
targets_dw = ww * np.log(gt_widths / ex_widths)
targets_dh = wh * np.log(gt_heights / ex_heights)
targets_da = wa * np.sin(np.radians(gt_angles - ex_angles))
return np.vstack((
targets_dx,
targets_dy,
targets_dw,
targets_dh,
targets_da,
)).transpose()
def bbox_transform_inv(boxes, deltas, weights=(1., 1., 1., 1., 1.)):
"""Decode the final boxes according to the deltas."""
if boxes.shape[0] == 0:
return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype)
boxes = boxes.astype(deltas.dtype, copy=False)
ctr_x = boxes[:, 0]
ctr_y = boxes[:, 1]
widths = boxes[:, 2]
heights = boxes[:, 3]
angles = boxes[:, 4:5]
wx, wy, ww, wh, wa = weights
dx = deltas[:, 0::5] / wx
dy = deltas[:, 1::5] / wy
dw = deltas[:, 2::5] / ww
dh = deltas[:, 3::5] / wh
da = deltas[:, 4::5] / wa
pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis]
pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis]
pred_w = np.exp(dw) * widths[:, np.newaxis]
pred_h = np.exp(dh) * heights[:, np.newaxis]
da = np.minimum(np.maximum(da, -1), 1)
pred_a = np.rad2deg(np.arcsin(da)) + angles
pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype)
pred_boxes[:, 0::5] = pred_ctr_x # x_ctr
pred_boxes[:, 1::5] = pred_ctr_y # y_ctr
pred_boxes[:, 2::5] = pred_w # w
pred_boxes[:, 3::5] = pred_h # h
pred_boxes[:, 4::5] = pred_a # angle
return pred_boxes
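# A quick check of the decoding above: zero deltas with unit weights return the prior
# boxes unchanged as (x_ctr, y_ctr, w, h, angle).
import numpy as np
boxes = np.array([[10., 10., 8., 4., 30.]])
deltas = np.zeros((1, 5), 'float32')
pred = np.zeros_like(deltas)
pred[:, 0::5] = deltas[:, 0::5] * boxes[:, 2:3] + boxes[:, 0:1]
pred[:, 1::5] = deltas[:, 1::5] * boxes[:, 3:4] + boxes[:, 1:2]
pred[:, 2::5] = np.exp(deltas[:, 2::5]) * boxes[:, 2:3]
pred[:, 3::5] = np.exp(deltas[:, 3::5]) * boxes[:, 3:4]
pred[:, 4::5] = np.rad2deg(np.arcsin(np.clip(deltas[:, 4::5], -1, 1))) + boxes[:, 4:5]
print(pred)  # [[10. 10.  8.  4. 30.]]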
def box2vertices(values):
x_ctr, y_ctr, w, h, a = values
theta = a * 0.01745329251
cos_theta2 = math.cos(theta) * 0.5
sin_theta2 = math.sin(theta) * 0.5
vertices = [
x_ctr - sin_theta2 * h - cos_theta2 * w,
y_ctr + cos_theta2 * h - sin_theta2 * w,
x_ctr + sin_theta2 * h - cos_theta2 * w,
y_ctr - cos_theta2 * h - sin_theta2 * w,
]
vertices.extend([
2 * x_ctr - vertices[0],
2 * y_ctr - vertices[1],
2 * x_ctr - vertices[2],
2 * y_ctr - vertices[3],
])
return vertices
def vertices2box(vertices):
def sort(vertices):
poly = np.array(vertices).reshape((4, 2))
# lt, rt, rb, lb
edge = [
(poly[1][0] - poly[0][0]) * (poly[1][1] + poly[0][1]),
(poly[2][0] - poly[1][0]) * (poly[2][1] + poly[1][1]),
(poly[3][0] - poly[2][0]) * (poly[3][1] + poly[2][1]),
(poly[0][0] - poly[3][0]) * (poly[0][1] + poly[3][1])
]
p_area = np.sum(edge) / 2.
_poly = poly.copy()
if abs(p_area) < 1:
raise ValueError
if p_area > 0:
_poly = _poly[(0, 3, 2, 1), :] # clockwise
anchor = np.array([np.min(poly[:, 0]), np.min(poly[:, 1])])
line0 = np.linalg.norm(anchor - _poly[0])
line1 = np.linalg.norm(anchor - _poly[1])
line2 = np.linalg.norm(anchor - _poly[2])
line3 = np.linalg.norm(anchor - _poly[3])
argmin = np.argmin([line0, line1, line2, line3])
lt = _poly[argmin]
rt = _poly[(argmin + 1) % 4]
rb = _poly[(argmin + 2) % 4]
lb = _poly[(argmin + 3) % 4]
return np.array([lt, rt, rb, lb]).flatten()
values = sort(vertices)
y4my3 = values[7] - values[5]
if y4my3 != 0:
x2mx1 = values[2] - values[0]
theta = math.atan(x2mx1 / y4my3)
cos_theta = math.cos(theta)
sin_theta = math.sin(theta)
h = x2mx1 / sin_theta
x2px1 = values[2] + values[0]
x4px3 = values[6] + values[4]
w = (x4px3 - x2px1) / (2. * cos_theta)
a = theta / 0.01745329251
else:
w = values[2] - values[0]
h = values[5] - values[1]
a = 0.
x_ctr = 0.5 * (values[0] + values[4])
y_ctr = 0.5 * (values[1] + values[5])
return x_ctr, y_ctr, w, h, a
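# A sanity check of the two helpers above (assumed to be in scope) on an axis-aligned
# box; for rotated inputs the recovered width/height/angle depend on the vertex
# ordering, so only the degenerate a = 0 case is shown here.
box = (10., 10., 8., 4., 0.)
vertices = box2vertices(box)      # -> [6.0, 12.0, 6.0, 8.0, 14.0, 8.0, 14.0, 12.0]
print(vertices2box(vertices))     # (10.0, 10.0, 8.0, 4.0, 0.0)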
def clip_angle(d):
while d < 0:
d += 360
while d >= 360:
d -= 360
return d
def clip_boxes(boxes, im_shape):
# ctr_x >= 0
boxes[:, 0] = np.maximum(np.minimum(boxes[:, 0], im_shape[1] - 1), 0)
# ctr_y >= 0
boxes[:, 1] = np.maximum(np.minimum(boxes[:, 1], im_shape[0] - 1), 0)
# w < im_shape[1]
boxes[:, 2] = np.maximum(np.minimum(boxes[:, 2], im_shape[1] - 1), 0)
# h < im_shape[0]
boxes[:, 3] = np.maximum(np.minimum(boxes[:, 3], im_shape[0] - 1), 0)
# 0 < a < 360
boxes[:, 4] = np.maximum(np.minimum(boxes[:, 4], 359), 0)
return boxes
def clip_tiled_boxes(boxes, im_shape):
# ctr_x >= 0
boxes[:, 0::5] = np.maximum(np.minimum(boxes[:, 0::5], im_shape[1] - 1), 0)
# ctr_y >= 0
boxes[:, 1::5] = np.maximum(np.minimum(boxes[:, 1::5], im_shape[0] - 1), 0)
# w < im_shape[1]
boxes[:, 2::5] = np.maximum(np.minimum(boxes[:, 2::5], im_shape[1] - 1), 0)
# h < im_shape[0]
boxes[:, 3::5] = np.maximum(np.minimum(boxes[:, 3::5], im_shape[0] - 1), 0)
# 0 < a < 360
boxes[:, 4::5] = np.maximum(np.minimum(boxes[:, 4::5], 359), 0)
return boxes
def flip_boxes(boxes, width):
ca = np.vectorize(clip_angle)
flip_boxes = boxes.copy()
old_cx = boxes[:, 0].copy()
old_a = boxes[:, 4].copy()
flip_boxes[:, 0] = width - old_cx - 1
flip_boxes[:, 4] = ca(180 - old_a)
return flip_boxes
# Aliases
libc = _CppExtension('ctypes_rbox.so')
bbox_overlaps = libc.bbox_overlaps
cpu_nms = libc.cpu_nms
if __name__ == "__main__":
prior_boxes = np.array([[4, 4, 15, 15, 150], [4, 4, 15, 15, 45]], dtype='float64')
gt_boxes = np.array([[4, 4, 15, 15, 45, 1.]], dtype='float64')
ov = bbox_overlaps(prior_boxes, gt_boxes)
indices = cpu_nms(gt_boxes, 0.45)
print(ov)
print(indices)
......@@ -22,11 +22,7 @@ import numpy as np
class SmoothedValue(object):
"""
Track a series of values and provide access to smoothed values
over a window or the global series average.
"""
"""Track a series of values and provide smoothed report."""
def __init__(self, window_size):
self.deque = collections.deque(maxlen=window_size)
......
......@@ -24,6 +24,7 @@ import time
class Timer(object):
"""A simple timer."""
def __init__(self):
self.total_time = 0.
self.calls = 0
......@@ -31,6 +32,15 @@ class Timer(object):
self.diff = 0.
self.average_time = 0.
def add_diff(self, diff, average=True):
self.total_time += diff
self.calls += 1
self.average_time = self.total_time / self.calls
if average:
return self.average_time
else:
return diff
@contextlib.contextmanager
def tic_and_toc(self):
try:
......@@ -78,3 +88,20 @@ def get_progress_info(timer, curr_step, max_steps):
progress = (curr_step + 1.) / max_steps
return '< PROGRESS: {:.2%} | SPEED: {:.3f}s / iter | ETA: {} >' \
.format(progress, timer.average_time, eta)
def new_timers(*args):
"""Return a dict that contains specified timers.
Parameters
----------
args : str...
The key(s) to create timers.
Returns
-------
Dict[str, Timer]
The timer dict.
"""
return dict([(k, Timer()) for k in args])
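# Hypothetical usage of the timer helpers above, mirroring how the testing loop
# measures detection and post-processing time.
import time
timers = new_timers('im_detect', 'misc')
with timers['im_detect'].tic_and_toc():
    time.sleep(0.01)                      # stand-in for the network forward pass
timers['misc'].add_diff(0.002)            # fold in an externally measured duration
print(timers['im_detect'].average_time, timers['misc'].average_time)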
......@@ -120,21 +120,21 @@ def get_bbox_contours(rotated_box):
return quad, main_direction
def get_mask(boxes, segms, im_shape, mask_thresh=0.4):
i, masks = 0, np.zeros(list(im_shape) + [len(boxes)], dtype=np.uint8)
def get_mask(boxes, segms, im_shape, mask_thresh=0.5):
i, masks = 0, np.zeros(list(im_shape) + [len(boxes)], 'uint8')
for det, msk in zip(boxes, segms):
M = msk.shape[0]
scale = (M + 2.0) / M
ref_box = expand_boxes(np.array([det[0:4]]), scale)[0]
scale = (M + 2.) / M
ref_box = expand_boxes(np.array([det[:4]]), scale)[0]
ref_box = ref_box.astype(np.int32)
padded_mask = np.zeros((M + 2, M + 2), dtype=np.float32)
padded_mask = np.zeros((M + 2, M + 2), 'float32')
padded_mask[1:-1, 1:-1] = msk[:, :]
w = ref_box[2] - ref_box[0] + 1
h = ref_box[3] - ref_box[1] + 1
w = np.maximum(w, 1)
h = np.maximum(h, 1)
mask = cv2.resize(padded_mask, (w, h))
mask = np.array(mask > mask_thresh, dtype=np.uint8)
mask = np.array(mask > mask_thresh, 'uint8')
x1 = max(ref_box[0], 0)
y1 = max(ref_box[1], 0)
x2 = min(ref_box[2] + 1, im_shape[1])
......@@ -157,6 +157,7 @@ def vis_one_image(
dpi=100,
box_alpha=0.,
show_class=True,
show_rotated=False,
filename=None,
):
"""Visual debugging of detections."""
......@@ -199,7 +200,7 @@ def vis_one_image(
continue
# Show box
if bbox.size == 4:
if bbox.size == 4 and not show_rotated:
ax.add_patch(
plt.Rectangle(
(bbox[0], bbox[1]),
......@@ -211,28 +212,6 @@ def vis_one_image(
alpha=box_alpha,
)
)
elif bbox.size == 5:
quad, md = get_bbox_contours(bbox)
ax.add_patch(
Polygon(
quad,
fill=False,
edgecolor='g',
linewidth=1.,
alpha=box_alpha,
)
)
ax.add_patch(
plt.arrow(
md[0, 0],
md[0, 1],
md[1, 0] - md[0, 0],
md[1, 1] - md[0, 1],
width=2,
color='g',
alpha=box_alpha,
)
)
# Show class
if show_class:
......@@ -258,10 +237,28 @@ def vis_one_image(
img[:, :, c] = color_mask[c]
e = masks[:, :, i]
_, contour, hier = cv2.findContours(
e.copy(), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE)
for c in contour:
results = cv2.findContours(
e.copy(),
cv2.RETR_CCOMP,
cv2.CHAIN_APPROX_NONE,
)
contours = results[0] if len(results) == 2 else results[1]
if show_rotated and len(contours) > 1:
contours = [max(contours, key=cv2.contourArea)]
for c in contours:
if show_rotated:
rect = cv2.minAreaRect(c)
ax.add_patch(
Polygon(
cv2.boxPoints(rect),
fill=False,
edgecolor='g',
linewidth=1.,
alpha=box_alpha,
)
)
ax.add_patch(Polygon(
c.reshape((-1, 2)),
fill=True,
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Make record file for COCO dataset."""
import os
import shutil
from maker import make_record
from maskgen import make_mask, merge_mask
if __name__ == '__main__':
COCO_ROOT = '/data'
# Encode masks to RLE bytes
if not os.path.exists('build'):
os.makedirs('build')
make_mask('train', '2014', COCO_ROOT)
make_mask('valminusminival', '2014', COCO_ROOT)
make_mask('minival', '2014', COCO_ROOT)
merge_mask('trainval35k', '2014', [
'build/coco_2014_train_mask.pkl',
'build/coco_2014_valminusminival_mask.pkl']
)
# coco_2014_trainval35k
make_record(
record_file=os.path.join(COCO_ROOT, 'coco_2014_trainval35k'),
images_path=[os.path.join(COCO_ROOT, 'images/train2014'),
os.path.join(COCO_ROOT, 'images/val2014')],
splits_path=[os.path.join(COCO_ROOT, 'ImageSets'),
os.path.join(COCO_ROOT, 'ImageSets')],
mask_file='build/coco_2014_trainval35k_mask.pkl',
splits=['train', 'valminusminival'],
)
# coco_2014_minival
make_record(
record_file=os.path.join(COCO_ROOT, 'coco_2014_minival'),
images_path=os.path.join(COCO_ROOT, 'images/val2014'),
mask_file='build/coco_2014_minival_mask.pkl',
splits_path=os.path.join(COCO_ROOT, 'ImageSets'),
splits=['minival'],
)
shutil.rmtree('build')
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
import os
import time
import cv2
import dragon
import numpy as np
try:
import cPickle
except:
import pickle as cPickle
def make_example(image_file, mask_objects, im_scale=None):
filename = os.path.split(image_file)[-1]
example = {'id': filename.split('.')[0], 'object': []}
if im_scale:
img = cv2.imread(image_file)
img = cv2.resize(
img, None,
fx=im_scale, fy=im_scale,
interpolation=cv2.INTER_LINEAR,
)
example['height'], example['width'], example['depth'] = img.shape
_, img = cv2.imencode('.jpg', img, [int(cv2.IMWRITE_JPEG_QUALITY), 95])
example['content'] = img.tostring()
else:
with open(image_file, 'rb') as f:
img_bytes = bytes(f.read())
img = cv2.imdecode(np.frombuffer(img_bytes, 'uint8'), 3)
example['height'], example['width'], example['depth'] = img.shape
example['content'] = img_bytes
for ix, obj in enumerate(mask_objects):
x1, y1, x2, y2 = obj['bbox']
example['object'].append({
'name': obj['name'],
'xmin': x1,
'ymin': y1,
'xmax': x2,
'ymax': y2,
'mask': obj['mask'],
'difficult': obj.get('crowd', 0),
})
return example
def make_record(
record_file,
images_path,
mask_file,
splits_path,
splits,
ext='.jpg',
im_scale=None,
):
if os.path.exists(record_file):
raise ValueError('The record file already exists.')
os.makedirs(record_file)
if not isinstance(images_path, list):
images_path = [images_path]
if not isinstance(splits_path, list):
splits_path = [splits_path]
assert len(splits) == len(splits_path)
assert len(splits) == len(images_path)
if mask_file is not None:
with open(mask_file, 'rb') as f:
all_masks = cPickle.load(f)
else:
all_masks = {}
print('Start Time:', time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime()))
writer = dragon.io.SeetaRecordWriter(
path=record_file,
protocol={
'id': 'string',
'content': 'bytes',
'height': 'int64',
'width': 'int64',
'depth': 'int64',
'object': [{
'name': 'string',
'xmin': 'float64',
'ymin': 'float64',
'xmax': 'float64',
'ymax': 'float64',
'mask': 'bytes',
'difficult': 'int64',
}]
}
)
count, total_line = 0, 0
start_time = time.time()
for db_idx, split in enumerate(splits):
split_file = os.path.join(splits_path[db_idx], split + '.txt')
assert os.path.exists(split_file)
with open(split_file, 'r') as f:
lines = f.readlines()
total_line += len(lines)
for line in lines:
count += 1
if count % 2000 == 0:
now_time = time.time()
print('{} / {} in {:.2f} sec'.format(
count, total_line, now_time - start_time))
filename = line.strip()
image_file = os.path.join(images_path[db_idx], filename + ext)
mask_objects = all_masks[filename] if filename in all_masks else None
if mask_objects is None:
raise ValueError('The image ({}) has invalid mask settings.'.format(filename))
writer.write(make_example(image_file, mask_objects, im_scale))
now_time = time.time()
print('{} / {} in {:.2f} sec'.format(count, total_line, now_time - start_time))
writer.close()
end_time = time.time()
data_size = os.path.getsize(record_file + '/data.data') * 1e-6
print('{} images take {:.2f} MB in {:.2f} sec.'
.format(total_line, data_size, end_time - start_time))
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
import os
import sys
import os.path as osp
from collections import OrderedDict
try:
import cPickle
except:
import pickle as cPickle
sys.path.insert(0, '../..')
from lib.pycocotools.coco import COCO
from lib.pycocotools import mask_utils
class imdb(object):
def __init__(self, image_set, year, data_dir):
self._year = year
self._image_set = image_set
self._data_path = osp.join(data_dir)
self.invalid_cnt = 0
self.ignore_cnt = 0
# Load COCO API, classes, class <-> id mappings
self._COCO = COCO(self._get_ann_file())
cats = self._COCO.loadCats(self._COCO.getCatIds())
self._classes = tuple(['__background__'] + [c['name'] for c in cats])
self._class_to_ind = dict(zip(self._classes, range(self.num_classes)))
self._ind_to_class = dict(zip(range(self.num_classes), self._classes))
self._class_to_cat_id = dict(zip([c['name'] for c in cats], self._COCO.getCatIds()))
self._cat_id_to_class_id = dict([(self._class_to_cat_id[cls],
self._class_to_ind[cls])
for cls in self._classes[1:]])
self._data_name = {
# 5k ``val2014`` subset
'minival2014': 'val2014',
# ``val2014`` minus ``minival2014``
'valminusminival2014': 'val2014',
}.get(image_set + year, image_set + year)
self._image_index = self._load_image_set_index()
self._annotations = self._load_annotations()
def _get_ann_file(self):
prefix = 'instances' \
if self._image_set.find('test') == -1 \
else 'image_info'
return osp.join(
self._data_path,
'annotations',
prefix + '_' +
self._image_set +
self._year + '.json'
)
def _load_image_set_index(self):
"""Load image ids."""
image_ids = self._COCO.getImgIds()
return image_ids
def _load_annotations(self):
"""Load annotations."""
annotations = [self._load_coco_annotation(index)
for index in self._image_index]
return annotations
def image_path_from_index(self, index):
"""Construct an image path from the image's "index" identifier."""
# Example image path for index=119993:
# images/train2014/COCO_train2014_000000119993.jpg
file_name = ('COCO_' + self._data_name + '_' +
str(index).zfill(12) + '.jpg')
image_path = osp.join(self._data_path, 'images',
self._data_name, file_name)
assert osp.exists(image_path), \
'Path does not exist: {}'.format(image_path)
return image_path
def image_path_at(self, i):
"""Return the absolute path to image i in the image sequence."""
return self.image_path_from_index(self._image_index[i])
def annotation_at(self, i):
"""Return the absolute path to image i in the image sequence."""
return self._annotations[i]
def _load_coco_annotation(self, index):
"""Loads COCO bounding-box instance annotations."""
im_ann = self._COCO.loadImgs(index)[0]
width, height = im_ann['width'], im_ann['height']
ann_ids = self._COCO.getAnnIds(imgIds=index, iscrowd=None)
objects = self._COCO.loadAnns(ann_ids)
# Sanitize boxes -- some are invalid
valid_objects = []
for obj in objects:
x1 = float(max(0, obj['bbox'][0]))
y1 = float(max(0, obj['bbox'][1]))
x2 = float(min(width - 1, x1 + max(0, obj['bbox'][2] - 1)))
y2 = float(min(height - 1, y1 + max(0, obj['bbox'][3] - 1)))
if isinstance(obj['segmentation'], list):
for p in obj['segmentation']:
if len(p) < 6:
print('Remove Invalid segm.')
# Valid polygons have >= 3 points, so require >= 6 coordinates
poly = [p for p in obj['segmentation'] if len(p) >= 6]
mask_bytes = mask_utils.poly2bytes(poly, height, width)
else:
# Crowd masks
# Some are encoded with height or width
# running out of the image bound
# Do not use them or decoding error is inevitable
mask_bytes = mask_utils.poly2bytes(obj['segmentation'], height, width)
if not isinstance(mask_bytes, bytes):
    print('Unexpected mask type: {}'.format(type(mask_bytes)))
if obj['area'] > 0 and x2 > x1 and y2 > y1:
    obj['clean_bbox'] = [x1, y1, x2, y2]
    class_id = self._cat_id_to_class_id[obj['category_id']]
    valid_objects.append({
        'bbox': [x1, y1, x2, y2],
        'mask': mask_bytes,
        'category_id': obj['category_id'],
        'class_id': class_id,
        'name': self._ind_to_class[class_id],
        'crowd': obj['iscrowd'],
    })
return height, width, valid_objects
@property
def num_images(self):
return len(self._image_index)
@property
def num_classes(self):
return len(self._classes)
def make_mask(split, year, data_dir):
coco = imdb(split, year, data_dir)
print('Preparing to make split: {}, total {} images'.format(split, coco.num_images))
if not osp.exists(osp.join(coco._data_path, 'ImageSets')):
os.makedirs(osp.join(coco._data_path, 'ImageSets'))
gt_recs = OrderedDict()
for i in range(coco.num_images):
filename = (coco.image_path_at(i).split('/')[-1]).split('.')[0]
h, w, objects = coco.annotation_at(i)
gt_recs[filename] = objects
with open(osp.join('build',
'coco_' + year + '_' + split + '_mask.pkl'), 'wb') as f:
cPickle.dump(gt_recs, f, cPickle.HIGHEST_PROTOCOL)
with open(osp.join(coco._data_path, 'ImageSets', split + '.txt'), 'w') as f:
for i in range(coco.num_images):
filename = (coco.image_path_at(i).split('/')[-1]).split('.')[0]
if i != coco.num_images - 1: filename += '\n'
f.write(filename)
def merge_mask(split, year, mask_files):
gt_recs = OrderedDict()
data_path = os.path.dirname(mask_files[0])
for mask_file in mask_files:
with open(mask_file, 'rb') as f:
recs = cPickle.load(f)
gt_recs.update(recs)
with open(osp.join(data_path, 'coco_' + year + '_' + split + '_mask.pkl'), 'wb') as f:
cPickle.dump(gt_recs, f, cPickle.HIGHEST_PROTOCOL)
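# A minimal usage sketch, assuming a COCO root at '/data/coco', the 2014
# splits, and an existing './build' directory; adjust these assumptions to
# the local dataset layout before running.
if __name__ == '__main__':
    coco_root = '/data/coco'  # hypothetical dataset root
    # Build the per-split mask records under ./build
    make_mask('train', '2014', coco_root)
    make_mask('valminusminival', '2014', coco_root)
    # Merge them into a single 'trainval35k' record
    merge_mask('trainval35k', '2014',
               [osp.join('build', 'coco_2014_train_mask.pkl'),
                osp.join('build', 'coco_2014_valminusminival_mask.pkl')])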
......@@ -9,6 +9,8 @@
#
# ------------------------------------------------------------
"""Make record file for Rotated dataset."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
......@@ -18,23 +20,12 @@ from maker import make_record
if __name__ == '__main__':
voc_root = '/data/VOC'
data_root = '/data'
make_record(
record_file=osp.join(voc_root, 'voc_0712_trainval'),
images_path=[osp.join(voc_root, 'VOCdevkit2007/VOC2007/JPEGImages'),
osp.join(voc_root, 'VOCdevkit2012/VOC2012/JPEGImages')],
annotations_path=[osp.join(voc_root, 'VOCdevkit2007/VOC2007/Annotations'),
osp.join(voc_root, 'VOCdevkit2012/VOC2012/Annotations')],
imagesets_path=[osp.join(voc_root, 'VOCdevkit2007/VOC2007/ImageSets/Main'),
osp.join(voc_root, 'VOCdevkit2012/VOC2012/ImageSets/Main')],
splits=['trainval', 'trainval']
record_file=osp.join(data_root, 'rotated_train'),
images_path=[osp.join(data_root, 'JPEGImages')],
annotations_path=[osp.join(data_root, 'Annotations')],
imagesets_path=[osp.join(data_root, 'ImageSets')],
splits=['train']
)
make_record(
record_file=osp.join(voc_root, 'voc_2007_test'),
images_path=osp.join(voc_root, 'VOCdevkit2007/VOC2007/JPEGImages'),
annotations_path=osp.join(voc_root, 'VOCdevkit2007/VOC2007/Annotations'),
imagesets_path=osp.join(voc_root, 'VOCdevkit2007/VOC2007/ImageSets/Main'),
splits=['test']
)
......@@ -9,6 +9,8 @@
#
# ------------------------------------------------------------
"""Make record file for VOC dataset."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
......@@ -18,7 +20,7 @@ from maker import make_record
if __name__ == '__main__':
voc_root = '/data/VOC'
voc_root = '/data'
make_record(
record_file=osp.join(voc_root, 'voc_0712_trainval'),
......
......@@ -23,7 +23,7 @@ import pprint
from lib.core.config import cfg
from lib.core.coordinator import Coordinator
from lib.modeling.detector import Detector
from lib.modeling.detector import new_detector
from lib.utils import logger
......@@ -69,13 +69,7 @@ if __name__ == '__main__':
# Ready to export the network
logger.info('Exported model will be saved to `{:s}`'
            .format(coordinator.exports_dir()))
detector = Detector().eval().cuda(cfg.GPU_ID)
detector.load_weights(checkpoint)
detector.optimize_for_inference()
# Mixed precision training?
if cfg.MODEL.PRECISION.lower() == 'float16':
detector.half() # Powerful FP16 Support
detector = new_detector(cfg.GPU_ID, checkpoint)
data = torch.zeros(*args.input_shape).byte()
ims_info = torch.zeros(args.input_shape[0], 3).float()
......
......@@ -13,7 +13,6 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import importlib
import os
import sys
sys.path.insert(0, '..')
......@@ -21,19 +20,23 @@ sys.path.insert(0, '..')
import argparse
import pprint
from lib.core import test_engine
from lib.core.config import cfg
from lib.core.coordinator import Coordinator
from lib.core.test import TestServer
from lib.datasets.factory import get_imdb
from lib.modeling.detector import Detector
from lib.utils import logger
def parse_args():
"""Parse input arguments"""
parser = argparse.ArgumentParser(description='Test a Detection Network')
parser.add_argument('--gpus', dest='gpus',
help='index of GPUs to use',
default=None, nargs='+', type=int)
parser.add_argument('--cfg', dest='cfg_file',
help='optional config file', default=None, type=str)
help='optional config file',
default=None, type=str)
parser.add_argument('--exp_dir', dest='exp_dir',
help='experiment dir',
default=None, type=str)
......@@ -70,30 +73,24 @@ if __name__ == '__main__':
logger.info('Called with args:')
logger.info(args)
coordinator = Coordinator(args.cfg_file, exp_dir=args.exp_dir)
coordinator = Coordinator(args.cfg_file, args.exp_dir)
logger.info('Using config:\n' + pprint.pformat(cfg))
# Load the checkpoint and test engine
checkpoint, _ = coordinator.checkpoint(global_step=args.iter, wait=args.wait)
checkpoint, _ = coordinator.checkpoint(args.iter, wait=args.wait)
if checkpoint is None:
raise RuntimeError('The checkpoint of global step {} does not exist.'.format(args.iter))
test_engine = importlib.import_module('lib.{}.test'.format(cfg.MODEL.TYPE))
# Inspect the database
database = get_imdb(cfg.TEST.DATABASE)
cfg.TEST.PROTOCOL = 'null' if args.dump else cfg.TEST.PROTOCOL
cfg.TEST.PROTOCOL = 'dump' if args.dump else cfg.TEST.PROTOCOL
logger.info('Database({}): {} images will be used for testing.'
.format(cfg.TEST.DATABASE, database.num_images))
# Ready to test the network
output_dir = coordinator.results_dir(checkpoint, args.output_dir)
logger.info('Results will be saved to `{:s}`'.format(output_dir))
detector = Detector().eval().cuda(cfg.GPU_ID)
detector.load_weights(checkpoint)
detector.optimize_for_inference()
# Mixed precision training?
if cfg.MODEL.PRECISION.lower() == 'float16':
detector.half() # Powerful FP16 Support
test_engine.test_net(detector, TestServer(output_dir))
# Bind the server and run the test
server = TestServer(coordinator.results_dir(checkpoint))
test_engine.run_test_net(checkpoint, server, args.gpus)