Commit 41b3932b by Ting PAN

Refactor the API of rotated boxes

1 parent c020594c
Showing with 1838 additions and 1189 deletions
------------------------------------------------------------------------
A list of the most significant changes made to SeetaDet over time.
SeetaDet 0.2.3 (20191101)
Dragon Minimum Required (Version 0.3.0.dev20191021)
Changes:
Preview Features:
- Refactor the API of rotated boxes.
- Simplify the solver by adding LRScheduler.
- Change the ``ITER`` naming to ``STEP`` (see the renaming example below).
Bugs fixed:
- None
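For example, a solver section that previously read ``STEPS: [60000, 80000]``, ``MAX_ITERS: 90000`` and ``SNAPSHOT_ITERS: 5000`` is now written as ``DECAY_STEPS: [60000, 80000]``, ``MAX_STEPS: 90000`` and ``SNAPSHOT_EVERY: 5000``; likewise ``WARM_UP_ITERS`` becomes ``WARM_UP_STEPS`` (see the config diffs below).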
------------------------------------------------------------------------
SeetaDet 0.2.2 (20191021)
Dragon Minimum Required (Version 0.3.0.dev20191021)
Changes:
Preview Features:
- Add the dumping of detection results.
Bugs fixed:
- None
------------------------------------------------------------------------
SeetaDet 0.2.1 (20191017)
Dragon Minimum Required (Version 0.3.0.dev20191017)
......
#!/bin/sh
# Delete cache
rm -r build install *.c *.cpp
# Compile cpp modules
python setup.py build_ext --inplace
g++ -o ../lib/utils/ctypes_rbox.so -shared -fPIC -O2 rbox.cc -std=c++11 -fopenmp
# Compile cuda modules
cd build && cmake .. && make install && cd ..
# Copy to the library root
cp -r install/lib ../
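A minimal loading sketch for the compiled module (assuming the ``ctypes_rbox.so`` path above and the ``bbox_overlaps`` / ``apply_cpu_nms`` exports introduced by this commit; the actual binding module shipped in ``lib/utils`` may differ):

import ctypes

rbox_lib = ctypes.CDLL('../lib/utils/ctypes_rbox.so')
# void bbox_overlaps(double* boxes1, double* boxes2, int* shape, double* overlaps)
rbox_lib.bbox_overlaps.restype = None
# void apply_cpu_nms(double* dets, int* indices, int& n, double threshold)
rbox_lib.apply_cpu_nms.restype = None
rbox_lib.apply_cpu_nms.argtypes = [
    ctypes.POINTER(ctypes.c_double),
    ctypes.POINTER(ctypes.c_int),
    ctypes.POINTER(ctypes.c_int),  # the C++ "int&" has pointer ABI
    ctypes.c_double,
]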
......@@ -9,363 +9,355 @@
//
// Codes are based on:
//
// <https://github.com/liulei01/DRBox/blob/master/examples/rbox/deploy/librbox.cpp.code>
// <https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h>
//
// ------------------------------------------------------------
#include <cmath>
#include <algorithm>
#include <omp.h>
using namespace std;
// ------------------------------------------------------------
// Previous implementation (removed in this commit)
// ------------------------------------------------------------

struct Line {
  int crossnum;   // 0: ignore; -1: all inner points; 2: two crossing points; 1: one crossing point
  int p1;         // index of the start point
  int p2;         // index of the end point
  int d[2][2];    // the index of the start point after division
  double length;  // the length after division
};

void _Overlap(double* rbox1, double* rbox2, double* area) {
  double xcenter1 = rbox1[0];
  double ycenter1 = rbox1[1];
  double width1 = rbox1[2];
  double height1 = rbox1[3];
  double angle1 = rbox1[4];
  double xcenter2 = rbox2[0];
  double ycenter2 = rbox2[1];
  double width2 = rbox2[2];
  double height2 = rbox2[3];
  double angle2 = rbox2[4];
  angle1 = -angle1;
  angle2 = -angle2;
  double angled = angle2 - angle1;
  angled *= (double)3.14159265 / 180;
  angle1 *= (double)3.14159265 / 180;
  area[0] = 0;
  double hw1 = width1 / 2;
  double hh1 = height1 / 2;
  double hw2 = width2 / 2;
  double hh2 = height2 / 2;
  // Express the center of box2 in the frame of box1
  double xcenterd = xcenter2 - xcenter1;
  double ycenterd = ycenter2 - ycenter1;
  double tmp = xcenterd * cos(angle1) + ycenterd * sin(angle1);
  ycenterd = -xcenterd * sin(angle1) + ycenterd * cos(angle1);
  xcenterd = tmp;
  // Quick reject via the circumscribed circles
  double max_width_height1 = width1 > height1 ? width1 : height1;
  double max_width_height2 = width2 > height2 ? width2 : height2;
  if (sqrt(xcenterd * xcenterd + ycenterd * ycenterd) >
      (max_width_height1 + max_width_height2) * 1.414214 / 2) {
    area[0] = 0;
    return;
  }
  // Nearly parallel boxes: reduce to an axis-aligned overlap
  if (fabs(sin(angled)) < 1e-3) {
    if (fabs(xcenterd) > (hw1 + hw2) || fabs(ycenterd) > (hh1 + hh2)) {
      area[0] = 0;
      return;
    } else {
      double x_min_inter = -hw1 > (xcenterd - hw2) ? -hw1 : (xcenterd - hw2);
      double x_max_inter = hw1 < (xcenterd + hw2) ? hw1 : (xcenterd + hw2);
      double y_min_inter = -hh1 > (ycenterd - hh2) ? -hh1 : (ycenterd - hh2);
      double y_max_inter = hh1 < (ycenterd + hh2) ? hh1 : (ycenterd + hh2);
      const double inter_width = x_max_inter - x_min_inter;
      const double inter_height = y_max_inter - y_min_inter;
      const double inter_size = inter_width * inter_height;
      area[0] = inter_size;
      area[0] = area[0] / (width1 * height1 + width2 * height2 - area[0]);
      return;
    }
  }
  // Nearly perpendicular boxes: swap the roles of w2/h2
  if (fabs(cos(angled)) < 1e-3) {
    double x_min_inter = -hw1 > (xcenterd - hh2) ? -hw1 : (xcenterd - hh2);
    double x_max_inter = hw1 < (xcenterd + hh2) ? hw1 : (xcenterd + hh2);
    double y_min_inter = -hh1 > (ycenterd - hw2) ? -hh1 : (ycenterd - hw2);
    double y_max_inter = hh1 < (ycenterd + hw2) ? hh1 : (ycenterd + hw2);
    const double inter_width = x_max_inter - x_min_inter;
    const double inter_height = y_max_inter - y_min_inter;
    const double inter_size = inter_width * inter_height;
    area[0] = inter_size;
    area[0] = area[0] / (width1 * height1 + width2 * height2 - area[0]);
    return;
  }
  double cos_angled = cos(angled);
  double sin_angled = sin(angled);
  double cos_angled_hw1 = cos_angled * hw1;
  double sin_angled_hw1 = sin_angled * hw1;
  double cos_angled_hh1 = cos_angled * hh1;
  double sin_angled_hh1 = sin_angled * hh1;
  double cos_angled_hw2 = cos_angled * hw2;
  double sin_angled_hw2 = sin_angled * hw2;
  double cos_angled_hh2 = cos_angled * hh2;
  double sin_angled_hh2 = sin_angled * hh2;
  // Corners of box2 in the frame of box1
  double point2x[4], point2y[4];
  // point20: (w/2, h/2)
  point2x[0] = xcenterd + cos_angled_hw2 - sin_angled_hh2;
  point2y[0] = ycenterd + sin_angled_hw2 + cos_angled_hh2;
  // point21: (-w/2, h/2)
  point2x[1] = xcenterd - cos_angled_hw2 - sin_angled_hh2;
  point2y[1] = ycenterd - sin_angled_hw2 + cos_angled_hh2;
  // point22: (-w/2, -h/2)
  point2x[2] = xcenterd - cos_angled_hw2 + sin_angled_hh2;
  point2y[2] = ycenterd - sin_angled_hw2 - cos_angled_hh2;
  // point23: (w/2, -h/2)
  point2x[3] = xcenterd + cos_angled_hw2 + sin_angled_hh2;
  point2y[3] = ycenterd + sin_angled_hw2 - cos_angled_hh2;
  double pcenter_x = 0, pcenter_y = 0;
  int count = 0;
  // Determine the corners of box2 lying inside box1
  bool inner_side2[4][4], inner2[4];
  for (int i = 0; i < 4; i++) {
    inner_side2[i][0] = point2y[i] < hh1;
    inner_side2[i][1] = point2x[i] > -hw1;
    inner_side2[i][2] = point2y[i] > -hh1;
    inner_side2[i][3] = point2x[i] < hw1;
    inner2[i] = inner_side2[i][0] & inner_side2[i][1] & inner_side2[i][2] & inner_side2[i][3];
    if (inner2[i]) { pcenter_x += point2x[i]; pcenter_y += point2y[i]; count++; }
  }
  // Similar operation for rbox1: angled -> -angled, xcenterd -> -xcenterd, ycenterd -> -ycenterd
  double xcenterd_hat = -xcenterd * cos_angled - ycenterd * sin_angled;
  double ycenterd_hat = xcenterd * sin_angled - ycenterd * cos_angled;
  // Corners of box1 in the frame of box2
  double point1x[4], point1y[4];
  // point10: (w/2, h/2)
  point1x[0] = xcenterd_hat + cos_angled_hw1 + sin_angled_hh1;
  point1y[0] = ycenterd_hat - sin_angled_hw1 + cos_angled_hh1;
  // point11: (-w/2, h/2)
  point1x[1] = xcenterd_hat - cos_angled_hw1 + sin_angled_hh1;
  point1y[1] = ycenterd_hat + sin_angled_hw1 + cos_angled_hh1;
  // point12: (-w/2, -h/2)
  point1x[2] = xcenterd_hat - cos_angled_hw1 - sin_angled_hh1;
  point1y[2] = ycenterd_hat + sin_angled_hw1 - cos_angled_hh1;
  // point13: (w/2, -h/2)
  point1x[3] = xcenterd_hat + cos_angled_hw1 - sin_angled_hh1;
  point1y[3] = ycenterd_hat - sin_angled_hw1 - cos_angled_hh1;
  // Determine the corners of box1 lying inside box2
  bool inner_side1[4][4], inner1[4];
  for (int i = 0; i < 4; i++) {
    inner_side1[i][0] = point1y[i] < hh2;
    inner_side1[i][1] = point1x[i] > -hw2;
    inner_side1[i][2] = point1y[i] > -hh2;
    inner_side1[i][3] = point1x[i] < hw2;
    inner1[i] = inner_side1[i][0] & inner_side1[i][1] & inner_side1[i][2] & inner_side1[i][3];
  }
  // Reset the corners of box1 to its own frame
  point1x[0] = hw1;  point1y[0] = hh1;   // point10: (w/2, h/2)
  point1x[1] = -hw1; point1y[1] = hh1;   // point11: (-w/2, h/2)
  point1x[2] = -hw1; point1y[2] = -hh1;  // point12: (-w/2, -h/2)
  point1x[3] = hw1;  point1y[3] = -hh1;  // point13: (w/2, -h/2)
  if (inner1[0]) { pcenter_x += hw1; pcenter_y += hh1; count++; }
  if (inner1[1]) { pcenter_x -= hw1; pcenter_y += hh1; count++; }
  if (inner1[2]) { pcenter_x -= hw1; pcenter_y -= hh1; count++; }
  if (inner1[3]) { pcenter_x += hw1; pcenter_y -= hh1; count++; }
  // Find cross points
  Line line1[4], line2[4];
  line1[0].p1 = 0; line1[0].p2 = 1;
  line1[1].p1 = 1; line1[1].p2 = 2;
  line1[2].p1 = 2; line1[2].p2 = 3;
  line1[3].p1 = 3; line1[3].p2 = 0;
  line2[0].p1 = 0; line2[0].p2 = 1;
  line2[1].p1 = 1; line2[1].p2 = 2;
  line2[2].p1 = 2; line2[2].p2 = 3;
  line2[3].p1 = 3; line2[3].p2 = 0;
  double pointc_x[4][4], pointc_y[4][4];
  for (int i = 0; i < 4; i++) {
    int index1 = line1[i].p1;
    int index2 = line1[i].p2;
    line1[i].crossnum = 0;
    if (inner1[index1] && inner1[index2]) {
      if (i == 0 || i == 2) line1[i].length = width1;
      else line1[i].length = height1;
      line1[i].crossnum = -1;
      continue;
    }
    if (inner1[index1]) {
      line1[i].crossnum++;
      line1[i].d[0][0] = index1;
      line1[i].d[0][1] = -1;
      continue;
    }
    if (inner1[index2]) {
      line1[i].crossnum++;
      line1[i].d[0][0] = index2;
      line1[i].d[0][1] = -1;
      continue;
    }
  }
  for (int i = 0; i < 4; i++) {
    int index1 = line2[i].p1;
    double x1 = point2x[index1];
    double y1 = point2y[index1];
    int index2 = line2[i].p2;
    double x2 = point2x[index2];
    double y2 = point2y[index2];
    line2[i].crossnum = 0;
    if (inner2[index1] && inner2[index2]) {
      if (i == 0 || i == 2) line2[i].length = width2;
      else line2[i].length = height2;
      line2[i].crossnum = -1;
      continue;
    }
    if (inner2[index1]) {
      line2[i].crossnum++;
      line2[i].d[0][0] = index1;
      line2[i].d[0][1] = -1;
    } else if (inner2[index2]) {
      line2[i].crossnum++;
      line2[i].d[0][0] = index2;
      line2[i].d[0][1] = -1;
    }
    double tmp1 = (y1 * x2 - y2 * x1) / (y1 - y2);
    double tmp2 = (x1 - x2) / (y1 - y2);
    double tmp3 = (x1 * y2 - x2 * y1) / (x1 - x2);
    double tmp4 = 1 / tmp2 * hw1;
    tmp2 *= hh1;
    for (int j = 0; j < 4; j++) {
      int index3 = line1[j].p1;
      int index4 = line1[j].p2;
      if ((inner_side2[index1][j] != inner_side2[index2][j])
          && (inner_side1[index3][i] != inner_side1[index4][i])) {
        switch (j) {
          case 0:
            pointc_x[i][j] = tmp1 + tmp2;
            pointc_y[i][j] = hh1;
            break;
          case 1:
            pointc_y[i][j] = tmp3 - tmp4;
            pointc_x[i][j] = -hw1;
            break;
          case 2:
            pointc_x[i][j] = tmp1 - tmp2;
            pointc_y[i][j] = -hh1;
            break;
          case 3:
            pointc_y[i][j] = tmp3 + tmp4;
            pointc_x[i][j] = hw1;
            break;
          default:
            break;
        }
        line1[j].d[line1[j].crossnum][0] = i;
        line1[j].d[line1[j].crossnum++][1] = j;
        line2[i].d[line2[i].crossnum][0] = i;
        line2[i].d[line2[i].crossnum++][1] = j;
        pcenter_x += pointc_x[i][j];
        pcenter_y += pointc_y[i][j];
        count++;
      }
    }
  }
  pcenter_x /= (double)count;
  pcenter_y /= (double)count;
  double pcenter_x_hat, pcenter_y_hat;
  pcenter_x_hat = pcenter_x - xcenterd;
  pcenter_y_hat = pcenter_y - ycenterd;
  tmp = cos_angled * pcenter_x_hat + sin_angled * pcenter_y_hat;
  pcenter_y_hat = -sin_angled * pcenter_x_hat + cos_angled * pcenter_y_hat;
  pcenter_x_hat = tmp;
  for (int i = 0; i < 4; i++) {
    if (line1[i].crossnum > 0) {
      if (line1[i].d[0][1] == -1) {
        if (i == 0 || i == 2)
          line1[i].length = fabs(point1x[line1[i].d[0][0]] - pointc_x[line1[i].d[1][0]][line1[i].d[1][1]]);
        else
          line1[i].length = fabs(point1y[line1[i].d[0][0]] - pointc_y[line1[i].d[1][0]][line1[i].d[1][1]]);
      } else {
        if (i == 0 || i == 2)
          line1[i].length = fabs(pointc_x[line1[i].d[0][0]][line1[i].d[0][1]] - pointc_x[line1[i].d[1][0]][line1[i].d[1][1]]);
        else
          line1[i].length = fabs(pointc_y[line1[i].d[0][0]][line1[i].d[0][1]] - pointc_y[line1[i].d[1][0]][line1[i].d[1][1]]);
      }
    }
    if (line2[i].crossnum > 0) {
      if (line2[i].d[0][1] == -1)
        line2[i].length = fabs(point2x[line2[i].d[0][0]] - pointc_x[line2[i].d[1][0]][line2[i].d[1][1]]);
      else
        line2[i].length = fabs(pointc_x[line2[i].d[0][0]][line2[i].d[0][1]] - pointc_x[line2[i].d[1][0]][line2[i].d[1][1]]);
      if (i == 0 || i == 2) line2[i].length *= width2 / fabs(point2x[line2[i].p1] - point2x[line2[i].p2]);
      else line2[i].length *= height2 / fabs(point2x[line2[i].p1] - point2x[line2[i].p2]);
    }
  }
  // Sum the triangle areas spanned by the polygon center and each clipped edge
  double dis1[4], dis2[4];
  dis1[0] = fabs(pcenter_y - hh1);
  dis1[1] = fabs(pcenter_x + hw1);
  dis1[2] = fabs(pcenter_y + hh1);
  dis1[3] = fabs(pcenter_x - hw1);
  dis2[0] = fabs(pcenter_y_hat - hh2);
  dis2[1] = fabs(pcenter_x_hat + hw2);
  dis2[2] = fabs(pcenter_y_hat + hh2);
  dis2[3] = fabs(pcenter_x_hat - hw2);
  for (int i = 0; i < 4; i++) {
    if (line1[i].crossnum != 0)
      area[0] += dis1[i] * line1[i].length;
    if (line2[i].crossnum != 0)
      area[0] += dis2[i] * line2[i].length;
  }
  area[0] /= 2;
  // Intersection over union
  area[0] = area[0] / (width1 * height1 + width2 * height2 - area[0]);
}

void _Overlaps(double* boxes, double* query_boxes, int* n, double* area) {
  int p = n[0];
  int k = n[1];
  const int nthreads = std::min(omp_get_num_procs(), 4);
  #pragma omp parallel for num_threads(nthreads)
  for (int i = 0; i < p; i++) {
    double box1[5] = {boxes[5 * i], boxes[5 * i + 1], boxes[5 * i + 2], boxes[5 * i + 3], boxes[5 * i + 4]};
    for (int j = 0; j < k; j++) {
      double box2[5] = {query_boxes[5 * j], query_boxes[5 * j + 1], query_boxes[5 * j + 2], query_boxes[5 * j + 3], query_boxes[5 * j + 4]};
      double area_tmp[1];
      _Overlap(box1, box2, area_tmp);
      area[i * k + j] = area_tmp[0];
    }
  }
}

void _NMS(double* preds, int* indices, double* scores, int& n, double threshold) {
  int count = 0;
  for (int i = 0; i < n; i++) {
    int ind_n = i;
    bool keep = true;
    for (int j = 0; j < count; j++) {
      int ind_p = indices[j];
      double area[1];
      _Overlap(preds + ind_p * 5, preds + ind_n * 5, &area[0]);
      if (area[0] > threshold) {
        keep = false;
        break;
      }
    }
    if (keep) {
      indices[count] = ind_n;
      count++;
    }
  }
  n = count;
}

extern "C" {
void NMS(double* preds, int* indices, double* scores, int& n, double threshold) {
  _NMS(preds, indices, scores, n, threshold);
}
void Overlaps(double* boxes, double* query_boxes, int* n, double* area) {
  _Overlaps(boxes, query_boxes, n, area);
}
}

// ------------------------------------------------------------
// New implementation (added in this commit)
// ------------------------------------------------------------

template <typename T>
struct RotatedBox {
  T x_ctr, y_ctr, w, h, a;
};

template <typename T>
struct Point {
  T x, y;
  Point(const T& px = 0, const T& py = 0) : x(px), y(py) {}
  Point operator+(const Point& p) const {
    return Point(x + p.x, y + p.y);
  }
  Point& operator+=(const Point& p) {
    x += p.x;
    y += p.y;
    return *this;
  }
  Point operator-(const Point& p) const {
    return Point(x - p.x, y - p.y);
  }
  Point operator*(const T coeff) const {
    return Point(x * coeff, y * coeff);
  }
};

template <typename T>
T dot_2d(const Point<T>& A, const Point<T>& B) {
  return A.x * B.x + A.y * B.y;
}

template <typename T>
T cross_2d(const Point<T>& A, const Point<T>& B) {
  return A.x * B.y - B.x * A.y;
}

template <typename T>
void get_rotated_vertices(
    const RotatedBox<T>& box,
    Point<T> (&pts)[4]) {
  // M_PI / 180. == 0.01745329251
  double theta = box.a * 0.01745329251;
  T cosTheta2 = (T)cos(theta) * 0.5f;
  T sinTheta2 = (T)sin(theta) * 0.5f;
  // y: top --> down; x: left --> right
  pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w;
  pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w;
  pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w;
  pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w;
  pts[2].x = 2 * box.x_ctr - pts[0].x;
  pts[2].y = 2 * box.y_ctr - pts[0].y;
  pts[3].x = 2 * box.x_ctr - pts[1].x;
  pts[3].y = 2 * box.y_ctr - pts[1].y;
}

template <typename T>
int get_intersection_points(
    const Point<T> (&pts1)[4],
    const Point<T> (&pts2)[4],
    Point<T> (&intersections)[24]) {
  // Line vector
  // A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1]
  Point<T> vec1[4], vec2[4];
  for (int i = 0; i < 4; i++) {
    vec1[i] = pts1[(i + 1) % 4] - pts1[i];
    vec2[i] = pts2[(i + 1) % 4] - pts2[i];
  }
  // Line test - test all line combos for intersection
  int num = 0;  // number of intersections
  for (int i = 0; i < 4; i++) {
    for (int j = 0; j < 4; j++) {
      // Solve for 2x2 Ax=b
      T det = cross_2d(vec2[j], vec1[i]);
      // This takes care of parallel lines
      if (fabs(det) <= 1e-14) {
        continue;
      }
      auto vec12 = pts2[j] - pts1[i];
      T t1 = cross_2d(vec2[j], vec12) / det;
      T t2 = cross_2d(vec1[i], vec12) / det;
      if (t1 >= 0.0f && t1 <= 1.0f && t2 >= 0.0f && t2 <= 1.0f) {
        intersections[num++] = pts1[i] + vec1[i] * t1;
      }
    }
  }
  // Check for vertices of rect1 inside rect2
  {
    const auto& AB = vec2[0];
    const auto& DA = vec2[3];
    auto ABdotAB = dot_2d(AB, AB);
    auto ADdotAD = dot_2d(DA, DA);
    for (int i = 0; i < 4; i++) {
      // assume ABCD is the rectangle, and P is the point to be judged
      // P is inside ABCD iff. P's projection on AB lies within AB
      // and P's projection on AD lies within AD
      auto AP = pts1[i] - pts2[0];
      auto APdotAB = dot_2d<T>(AP, AB);
      auto APdotAD = -dot_2d<T>(AP, DA);
      if ((APdotAB >= 0) &&
          (APdotAD >= 0) &&
          (APdotAB <= ABdotAB) &&
          (APdotAD <= ADdotAD)) {
        intersections[num++] = pts1[i];
      }
    }
  }
  // Reverse the check - check for vertices of rect2 inside rect1
  {
    const auto& AB = vec1[0];
    const auto& DA = vec1[3];
    auto ABdotAB = dot_2d<T>(AB, AB);
    auto ADdotAD = dot_2d<T>(DA, DA);
    for (int i = 0; i < 4; i++) {
      auto AP = pts2[i] - pts1[0];
      auto APdotAB = dot_2d<T>(AP, AB);
      auto APdotAD = -dot_2d<T>(AP, DA);
      if ((APdotAB >= 0) &&
          (APdotAD >= 0) &&
          (APdotAB <= ABdotAB) &&
          (APdotAD <= ADdotAD)) {
        intersections[num++] = pts2[i];
      }
    }
  }
  return num;
}

template <typename T>
int convex_hull_graham(
    const Point<T> (&p)[24],
    const int& num_in,
    Point<T> (&q)[24],
    bool shift_to_zero = false) {
  // Step 1:
  // Find point with minimum y
  // if more than 1 points have the same minimum y,
  // pick the one with the minimum x.
  int t = 0;
  for (int i = 1; i < num_in; i++) {
    if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) {
      t = i;
    }
  }
  auto& start = p[t];  // starting point
  // Step 2:
  // Subtract starting point from every points (for sorting in the next step)
  for (int i = 0; i < num_in; i++) {
    q[i] = p[i] - start;
  }
  // Swap the starting point to position 0
  auto tmp = q[0];
  q[0] = q[t];
  q[t] = tmp;
  // Step 3:
  // Sort point 1 ~ num_in according to their relative cross-product values
  // (essentially sorting according to angles)
  // If the angles are the same, sort according to their distance to origin
  T dist[24];
  for (int i = 0; i < num_in; i++) {
    dist[i] = dot_2d(q[i], q[i]);
  }
  std::sort(
      q + 1, q + num_in, [](const Point<T>& A, const Point<T>& B) -> bool {
        T temp = cross_2d<T>(A, B);
        if (fabs(temp) < 1e-6) {
          return dot_2d(A, A) < dot_2d(B, B);
        } else {
          return temp > 0;
        }
      });
  // Step 4:
  // Make sure there are at least 2 points (that don't overlap with each other)
  // in the stack
  int k;  // index of the non-overlapped second point
  for (k = 1; k < num_in; k++) {
    if (dist[k] > 1e-8) {
      break;
    }
  }
  if (k == num_in) {
    // We reach the end, which means the convex hull is just one point
    q[0] = p[t];
    return 1;
  }
  q[1] = q[k];
  int m = 2;  // 2 points in the stack
  // Step 5:
  // Finally we can start the scanning process.
  // When a non-convex relationship between the 3 points is found
  // (either concave shape or duplicated points),
  // we pop the previous point from the stack
  // until the 3-point relationship is convex again, or
  // until the stack only contains two points
  for (int i = k + 1; i < num_in; i++) {
    while (m > 1 && cross_2d(q[i] - q[m - 2], q[m - 1] - q[m - 2]) >= 0) {
      m--;
    }
    q[m++] = q[i];
  }
  // Step 6 (Optional):
  // In general sense we need the original coordinates, so we
  // need to shift the points back (reverting Step 2)
  // But if we're only interested in getting the area/perimeter of the shape
  // We can simply return.
  if (!shift_to_zero) {
    for (int i = 0; i < m; i++) {
      q[i] += start;
    }
  }
  return m;
}

template <typename T>
T polygon_area(const Point<T> (&q)[24], const int& m) {
  if (m <= 2) {
    return 0;
  }
  T area = 0;
  for (int i = 1; i < m - 1; i++) {
    area += fabs(cross_2d(q[i] - q[0], q[i + 1] - q[0]));
  }
  return area / 2.0;
}

template <typename T>
T rotated_boxes_intersection(
    const RotatedBox<T>& box1,
    const RotatedBox<T>& box2) {
  // There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned
  // from rotated_rect_intersection_pts
  Point<T> intersectPts[24], orderedPts[24];
  Point<T> pts1[4];
  Point<T> pts2[4];
  get_rotated_vertices(box1, pts1);
  get_rotated_vertices(box2, pts2);
  int num = get_intersection_points(pts1, pts2, intersectPts);
  if (num <= 2) {
    return 0.0;
  }
  // Convex Hull to order the intersection points in clockwise order and find
  // the contour area.
  int num_convex = convex_hull_graham(intersectPts, num, orderedPts, true);
  return polygon_area(orderedPts, num_convex);
}

template <typename T>
T single_box_iou_rotated(
    T const* const box1_raw,
    T const* const box2_raw) {
  // shift center to the middle point to achieve higher precision in result
  RotatedBox<T> box1, box2;
  auto center_shift_x = (box1_raw[0] + box2_raw[0]) / 2.0;
  auto center_shift_y = (box1_raw[1] + box2_raw[1]) / 2.0;
  box1.x_ctr = box1_raw[0] - center_shift_x;
  box1.y_ctr = box1_raw[1] - center_shift_y;
  box1.w = box1_raw[2];
  box1.h = box1_raw[3];
  box1.a = box1_raw[4];
  box2.x_ctr = box2_raw[0] - center_shift_x;
  box2.y_ctr = box2_raw[1] - center_shift_y;
  box2.w = box2_raw[2];
  box2.h = box2_raw[3];
  box2.a = box2_raw[4];
  const T area1 = box1.w * box1.h;
  const T area2 = box2.w * box2.h;
  if (area1 < 1e-14 || area2 < 1e-14) {
    return 0.f;
  }
  const T inter = rotated_boxes_intersection(box1, box2);
  const T iou = inter / (area1 + area2 - inter);
  return iou;
}

extern "C" {

void apply_cpu_nms(
    double* dets,
    int* indices,
    int& n,
    double threshold) {
  int count = 0;
  for (int i = 0; i < n; i++) {
    bool keep = true;
    auto* box1 = dets + i * 6;
    for (int j = 0; j < count; j++) {
      auto* box2 = dets + indices[j] * 6;
      auto ovr = single_box_iou_rotated(box1, box2);
      if (ovr > threshold) {
        keep = false;
        break;
      }
    }
    if (keep) {
      indices[count] = i;
      count++;
    }
  }
  n = count;
}

void bbox_overlaps(
    double* boxes1,
    double* boxes2,
    int* shape,
    double* overlaps) {
  int N = shape[0], K = shape[1];
  #pragma omp parallel for num_threads(std::min(omp_get_num_procs(), 4))
  for (int i = 0; i < N; i++) {
    auto* box1 = boxes1 + i * 5;
    for (int j = 0; j < K; j++) {
      auto* box2 = boxes2 + j * 5;
      overlaps[i * K + j] = single_box_iou_rotated(box1, box2);
    }
  }
}

}
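A numeric sanity check for ``bbox_overlaps`` (a sketch, assuming the library was built as ``ctypes_rbox.so`` by the script earlier in this commit):

import ctypes
import numpy as np

lib = ctypes.CDLL('./ctypes_rbox.so')
# Two axis-aligned (angle = 0) unit squares, the second shifted by 0.5:
# intersection = 0.5, union = 1.5, so IoU = 1/3.
boxes1 = np.array([[0.0, 0.0, 1.0, 1.0, 0.0]])  # (cx, cy, w, h, angle)
boxes2 = np.array([[0.5, 0.0, 1.0, 1.0, 0.0]])
shape = np.array([1, 1], dtype=np.int32)
overlaps = np.zeros((1, 1))
lib.bbox_overlaps(
    boxes1.ctypes.data_as(ctypes.POINTER(ctypes.c_double)),
    boxes2.ctypes.data_as(ctypes.POINTER(ctypes.c_double)),
    shape.ctypes.data_as(ctypes.POINTER(ctypes.c_int)),
    overlaps.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))
print(overlaps[0, 0])  # -> ~0.3333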
......@@ -22,11 +22,9 @@ MODEL:
NUM_CLASSES: 81
SOLVER:
BASE_LR: 0.02
DECAY_STEPS: [60000, 80000]
MAX_STEPS: 90000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: coco_faster_rcnn
FRCNN:
ROI_XFORM_METHOD: RoIAlign
......
......@@ -22,11 +22,9 @@ MODEL:
NUM_CLASSES: 81
SOLVER:
BASE_LR: 0.02
DECAY_STEPS: [120000, 160000]
MAX_STEPS: 180000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: coco_faster_rcnn
FRCNN:
ROI_XFORM_METHOD: RoIAlign
......
......@@ -13,11 +13,9 @@ MODEL:
NUM_CLASSES: 21
SOLVER:
BASE_LR: 0.002
DECAY_STEPS: [100000, 140000]
MAX_STEPS: 140000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: voc_faster_rcnn
FRCNN:
ROI_XFORM_METHOD: RoIAlign
......
......@@ -14,10 +14,9 @@ MODEL:
SOLVER:
BASE_LR: 0.001
WEIGHT_DECAY: 0.0005
DECAY_STEPS: [100000, 140000]
MAX_STEPS: 140000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: voc_faster_rcnn
RPN:
STRIDES: [16]
......
......@@ -22,11 +22,9 @@ MODEL:
NUM_CLASSES: 81
SOLVER:
BASE_LR: 0.02
DECAY_STEPS: [30000, 40000]
MAX_STEPS: 45000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: coco_retinanet_400
FPN:
RPN_MIN_LEVEL: 3
......
......@@ -22,12 +22,10 @@ MODEL:
NUM_CLASSES: 81
SOLVER:
BASE_LR: 0.02
WARM_UP_STEPS: 2000 # default: 500
DECAY_STEPS: [120000, 160000]
MAX_STEPS: 180000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: coco_retinanet_400
FPN:
RPN_MIN_LEVEL: 3
......@@ -41,9 +39,9 @@ TRAIN:
IMS_PER_BATCH: 8
SCALES: [400]
MAX_SIZE: 666
USE_SCALE_JITTER: True
USE_COLOR_JITTER: True
SCALE_JITTER_RANGE: [0.75, 1.33]
TEST:
DATABASE: '/data/coco_2014_minival'
JSON_FILE: '/data/instances_minival2014.json'
......
......@@ -13,11 +13,9 @@ MODEL:
NUM_CLASSES: 21
SOLVER:
BASE_LR: 0.02
DECAY_STEPS: [40000, 50000, 60000]
MAX_STEPS: 60000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: voc_retinanet_300
FPN:
RPN_MIN_LEVEL: 3
......@@ -28,9 +26,9 @@ TRAIN:
IMS_PER_BATCH: 32
SCALES: [300]
MAX_SIZE: 500
SCALE_JITTER_RANGE: [0.5, 2.0]
USE_SCALE_JITTER: True
USE_COLOR_JITTER: True
TEST:
DATABASE: '/data/voc_2007_test'
PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
......
......@@ -13,12 +13,10 @@ MODEL:
NUM_CLASSES: 21
SOLVER:
BASE_LR: 0.01
DECAY_STEPS: [40000, 50000, 60000]
WARM_UP_STEPS: 2000
MAX_STEPS: 60000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: voc_retinanet_300
FPN:
RPN_MIN_LEVEL: 3
......@@ -29,9 +27,9 @@ TRAIN:
IMS_PER_BATCH: 32
SCALES: [300]
MAX_SIZE: 500
SCALE_JITTER_RANGE: [0.5, 2.0]
USE_SCALE_JITTER: True
USE_COLOR_JITTER: True
TEST:
DATABASE: '/data/voc_2007_test'
PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
......
......@@ -13,12 +13,10 @@ MODEL:
NUM_CLASSES: 21
SOLVER:
BASE_LR: 0.01
DECAY_STEPS: [40000, 50000, 60000]
WARM_UP_STEPS: 2000
MAX_STEPS: 60000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: voc_retinanet_300
FPN:
RPN_MIN_LEVEL: 3
......@@ -29,9 +27,9 @@ TRAIN:
IMS_PER_BATCH: 32
SCALES: [300]
MAX_SIZE: 500
SCALE_JITTER_RANGE: [0.5, 2.0]
USE_SCALE_JITTER: True
USE_COLOR_JITTER: True
TEST:
DATABASE: '/data/voc_2007_test'
PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
......
......@@ -13,11 +13,9 @@ MODEL:
NUM_CLASSES: 21
SOLVER:
BASE_LR: 0.001
DECAY_STEPS: [80000, 100000, 120000]
MAX_STEPS: 120000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: voc_ssd_300
SSD:
RESIZE:
......
......@@ -13,13 +13,12 @@ MODEL:
'sheep', 'sofa', 'train', 'tvmonitor']
NUM_CLASSES: 21
SOLVER:
BASE_LR: 0.001
WARM_UP_FACTOR: 0.
WEIGHT_DECAY: 0.0005
DECAY_STEPS: [80000, 100000, 120000]
MAX_STEPS: 120000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: voc_ssd_300
SSD:
RESIZE:
......
NUM_GPUS: 1
VIS: False
ENABLE_TENSOR_BOARD: False
MODEL:
TYPE: ssd
BACKBONE: resnet50.fpn
CLASSES: ['__background__',
'aeroplane', 'bicycle', 'bird', 'boat',
'bottle', 'bus', 'car', 'cat', 'chair',
'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'pottedplant',
'sheep', 'sofa', 'train', 'tvmonitor']
NUM_CLASSES: 21
FPN:
RPN_MIN_LEVEL: 3
RPN_MAX_LEVEL: 8
SOLVER:
BASE_LR: 0.001
DECAY_STEPS: [80000, 100000, 120000]
MAX_STEPS: 120000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: voc_ssd_320
SSD:
NUM_CONVS: 2
RESIZE:
HEIGHT: 320
WIDTH: 320
MULTIBOX:
STRIDES: [8, 16, 32, 64, 100, 300]
MIN_SIZES: [30, 60, 110, 162, 213, 264]
MAX_SIZES: [60, 110, 162, 213, 264, 315]
ASPECT_RATIOS: [[1, 2, 0.5], [1, 2, 0.5, 3, 0.33], [1, 2, 0.5, 3, 0.33],
[1, 2, 0.5, 3, 0.33], [1, 2, 0.5], [1, 2, 0.5]]
TRAIN:
WEIGHTS: '/model/R-50.Affine.pth'
DATABASE: '/data/voc_0712_trainval'
IMS_PER_BATCH: 32
TEST:
DATABASE: '/data/voc_2007_test'
PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
IMS_PER_BATCH: 8
NMS_TOP_K: 400
NMS: 0.45
SCORE_THRESH: 0.01
DETECTIONS_PER_IM: 200
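A rough reading of the MULTIBOX section above, under the common SSD convention that each aspect ratio yields one prior at MIN_SIZE plus one extra sqrt(MIN_SIZE * MAX_SIZE) prior (SeetaDet's exact recipe is not shown in this diff):

aspect_ratios = [[1, 2, 0.5], [1, 2, 0.5, 3, 0.33], [1, 2, 0.5, 3, 0.33],
                 [1, 2, 0.5, 3, 0.33], [1, 2, 0.5], [1, 2, 0.5]]
for stride, ratios in zip([8, 16, 32, 64, 100, 300], aspect_ratios):
    num_priors = len(ratios) + 1  # +1 for the sqrt(min * max) prior
    print('stride %3d: %d priors per cell' % (stride, num_priors))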
......@@ -20,10 +20,10 @@ from __future__ import print_function
import os.path as osp
import numpy as np
from lib.utils.attrdict import AttrDict

cfg = __C = AttrDict()
###########################################
......@@ -33,7 +33,7 @@ cfg = __C
###########################################
__C.TRAIN = AttrDict()
# Initialize network with weights from this file
__C.TRAIN.WEIGHTS = ''
......@@ -82,17 +82,17 @@ __C.TRAIN.USE_DIFF = True
__C.TRAIN.BBOX_THRESH = 0.5
# If True, randomly scale the image by scale range
__C.TRAIN.USE_SCALE_JITTER = False
__C.TRAIN.SCALE_JITTER_RANGE = [0.75, 1.0]
# If True, randomly distort the image by brightness, contrast, and saturation
__C.TRAIN.USE_COLOR_JITTER = False
# IOU >= thresh: positive example
__C.TRAIN.RPN_POSITIVE_OVERLAP = 0.7
# IOU < thresh: negative example
__C.TRAIN.RPN_NEGATIVE_OVERLAP = 0.3
# If an anchor is matched by both positive and negative conditions, set it to negative
__C.TRAIN.RPN_CLOBBER_POSITIVES = False
# Max number of foreground examples
__C.TRAIN.RPN_FG_FRACTION = 0.5
......@@ -118,7 +118,7 @@ __C.TRAIN.RPN_STRADDLE_THRESH = 0
###########################################
__C.TEST = AttrDict()
# Database to test
__C.TEST.DATABASE = ''
......@@ -151,10 +151,10 @@ __C.TEST.SOFT_NMS_SIGMA = 0.5
# The top-k prior boxes before nms.
__C.TEST.NMS_TOP_K = 400
# The threshold for predicting boxes
__C.TEST.SCORE_THRESH = 0.05
# The threshold for predicting masks
__C.TEST.BINARY_THRESH = 0.5
# NMS threshold used on RPN proposals
......@@ -188,37 +188,32 @@ __C.TEST.DETECTIONS_PER_IM = 100
###########################################
__C.MODEL = AttrDict()
# The type of the model
# ('faster_rcnn',
#  'mask_rcnn',
#  'ssd',
#  'retinanet',
# )
__C.MODEL.TYPE = ''
# The float precision for training and inference
# (FLOAT32, FLOAT16,)
__C.MODEL.PRECISION = 'FLOAT32'
# The backbone
__C.MODEL.BACKBONE = ''
# The number of classes in the dataset
__C.MODEL.NUM_CLASSES = -1
# The name for each object class
__C.MODEL.CLASSES = ['__background__']
# Freeze the bottom layers up to convolution stage K (via StopGrad)
# The value of ``K`` is usually set to 2
__C.MODEL.FREEZE_AT = 2
# Whether to use focal loss for one-stage detectors
# Enabled if the model type is in ('ssd',)
# RetinaNet is forced to use focal loss
__C.MODEL.USE_FOCAL_LOSS = False
# Setting of focal loss
__C.MODEL.FOCAL_LOSS_ALPHA = 0.25
__C.MODEL.FOCAL_LOSS_GAMMA = 2.0
......@@ -234,7 +229,7 @@ __C.MODEL.COARSEST_STRIDE = 32
###########################################
__C.RPN = AttrDict()
# Strides for multiple rpn heads
__C.RPN.STRIDES = [4, 8, 16, 32, 64]
......@@ -253,7 +248,7 @@ __C.RPN.ASPECT_RATIOS = [0.5, 1, 2]
###########################################
__C.RETINANET = AttrDict()
# Anchor aspect ratios to use
__C.RETINANET.ASPECT_RATIOS = (0.5, 1.0, 2.0)
......@@ -291,7 +286,7 @@ __C.RETINANET.NEGATIVE_OVERLAP = 0.4
###########################################
__C.FPN = AttrDict()
# Channel dimension of the FPN feature levels
__C.FPN.DIM = 256
......@@ -317,7 +312,7 @@ __C.FPN.ROI_MIN_LEVEL = 2
###########################################
__C.FRCNN = AttrDict()
# RoI transformation function (e.g., RoIPool or RoIAlign)
__C.FRCNN.ROI_XFORM_METHOD = 'RoIPool'
......@@ -338,7 +333,7 @@ __C.FRCNN.ROI_XFORM_RESOLUTION = 7
###########################################
__C.MRCNN = AttrDict()
# Resolution of mask predictions
__C.MRCNN.RESOLUTION = 28
......@@ -357,10 +352,7 @@ __C.MRCNN.ROI_XFORM_RESOLUTION = 14
###########################################
__C.SSD = AttrDict()
# Convolutions to use in the cls and bbox tower
# NOTE: this doesn't include the last conv for logits
......@@ -369,7 +361,7 @@ __C.SSD.NUM_CONVS = 0
# Weight for bbox regression loss
__C.SSD.BBOX_REG_WEIGHT = 1.
__C.SSD.MULTIBOX = AttrDict()
# MultiBox configs
__C.SSD.MULTIBOX.STRIDES = []
__C.SSD.MULTIBOX.MIN_SIZES = []
......@@ -377,25 +369,25 @@ __C.SSD.MULTIBOX.MAX_SIZES = []
__C.SSD.MULTIBOX.ASPECT_RATIOS = []
__C.SSD.MULTIBOX.ASPECT_ANGLES = []
__C.SSD.OHEM = AttrDict()
# The threshold for selecting negative bbox in hard example mining
__C.SSD.OHEM.NEG_OVERLAP = 0.5
# The ratio used in hard example mining
__C.SSD.OHEM.NEG_POS_RATIO = 3.0
# Distort the image?
__C.SSD.DISTORT = AttrDict()
__C.SSD.DISTORT.BRIGHTNESS_PROB = 0.5
__C.SSD.DISTORT.CONTRAST_PROB = 0.5
__C.SSD.DISTORT.SATURATION_PROB = 0.5
# Expand the image?
__C.SSD.EXPAND = AttrDict()
__C.SSD.EXPAND.PROB = 0.5
__C.SSD.EXPAND.MAX_RATIO = 4.0
# Resize the image?
__C.SSD.RESIZE = AttrDict()
__C.SSD.RESIZE.HEIGHT = 300
__C.SSD.RESIZE.WIDTH = 300
__C.SSD.RESIZE.INTERP_MODE = ['LINEAR', 'AREA', 'NEAREST', 'CUBIC', 'LANCZOS4']
......@@ -403,7 +395,7 @@ __C.SSD.RESIZE.INTERP_MODE = ['LINEAR', 'AREA', 'NEAREST', 'CUBIC', 'LANCZOS4']
# Samplers
# Format as (min_scale, max_scale,
# min_aspect_ratio, max_aspect_ratio,
# min_overlap, max_overlap,
# max_trials, max_sample)
__C.SSD.SAMPLERS = [
(1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1, 1), # Entire image
......@@ -423,7 +415,7 @@ __C.SSD.SAMPLERS = [
###########################################
__C.RESNET = AttrDict()
# Number of groups to use; 1 ==> ResNet; > 1 ==> ResNeXt
__C.RESNET.NUM_GROUPS = 1
......@@ -439,7 +431,7 @@ __C.RESNET.GROUP_WIDTH = 64
###########################################
__C.DROPBLOCK = AttrDict()
# Whether to use drop block for more regularization
__C.DROPBLOCK.DROP_ON = False
......@@ -455,59 +447,46 @@ __C.DROPBLOCK.DECREMENT = 1e-6
###########################################
__C.SOLVER = AttrDict()

# The interval to display logs
__C.SOLVER.DISPLAY = 20
# The interval to snapshot a model
__C.SOLVER.SNAPSHOT_EVERY = 5000
# Prefix to yield the path: <prefix>_iters_XYZ.pth
__C.SOLVER.SNAPSHOT_PREFIX = ''
# Optional scaling factor for total loss
# This option is helpful to scale the magnitude
# of gradients during FP16 training
__C.SOLVER.LOSS_SCALING = 1.
# Maximum number of SGD iterations
__C.SOLVER.MAX_STEPS = 40000
# Base learning rate for the specified schedule
__C.SOLVER.BASE_LR = 0.001
# The uniform interval for LRScheduler
__C.SOLVER.DECAY_STEP = 1
# The custom intervals for LRScheduler
__C.SOLVER.DECAY_STEPS = []
# The decay factor for exponential LRScheduler
__C.SOLVER.DECAY_GAMMA = 0.1
# Warm up to ``BASE_LR`` over this number of steps
__C.SOLVER.WARM_UP_STEPS = 500
# Start the warm up from ``BASE_LR`` * ``FACTOR``
__C.SOLVER.WARM_UP_FACTOR = 0.333
# The type of LRScheduler
__C.SOLVER.LR_POLICY = 'steps_with_decay'
# Momentum to use with SGD
__C.SOLVER.MOMENTUM = 0.9
# L2 regularization hyper parameters
__C.SOLVER.WEIGHT_DECAY = 0.0001
# L2 norm factor for clipping gradients
__C.SOLVER.CLIP_NORM = -1.0
# The steps for accumulating gradients
__C.SOLVER.ITER_SIZE = 1
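How these defaults plausibly combine into the default ``steps_with_decay`` schedule (a sketch inferred from the warm-up and multi-step logic visible elsewhere in this commit, not the actual LRScheduler code):

def lr_at(step,
          base_lr=0.001, decay_steps=(40000,), decay_gamma=0.1,
          warm_up_steps=500, warm_up_factor=0.333):
    # Linear warm-up from base_lr * warm_up_factor to base_lr, then
    # multiply by decay_gamma at each entry of decay_steps.
    if step < warm_up_steps:
        alpha = (step + 1.0) / warm_up_steps
        return base_lr * (warm_up_factor * (1 - alpha) + alpha)
    num_decays = sum(step >= s for s in decay_steps)
    return base_lr * decay_gamma ** num_decays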
###########################################
# #
......@@ -532,9 +511,6 @@ __C.PIXEL_MEANS = [102., 115., 122.]
# These are empirically chosen to approximately lead to unit variance targets
__C.BBOX_REG_WEIGHTS = (10., 10., 5., 5.)
# Default weights on (dx, dy, dw, dh, da) for normalizing rbox regression targets
__C.RBOX_REG_WEIGHTS = (10.0, 10.0, 5., 5., 10.)
# Prior prob for the positives at the beginning of training.
# This is used to set the bias init for the logits layer
__C.PRIOR_PROB = 0.01
......@@ -581,7 +557,7 @@ def _merge_a_into_b(a, b):
# the types must match, too
v = _check_and_coerce_cfg_value_type(v, b[k], k)
# recursively merge dicts
if type(v) is AttrDict:
try:
_merge_a_into_b(a[k], b[k])
except:
......@@ -595,7 +571,7 @@ def cfg_from_file(filename):
"""Load a config file and merge it into the default options."""
import yaml
with open(filename, 'r') as f:
yaml_cfg = AttrDict(yaml.load(f))
global __C
_merge_a_into_b(yaml_cfg, __C)
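A small illustration of the merge semantics (hypothetical values):

# cfg_from_file overlays a YAML dict onto the defaults above, and
# _check_and_coerce_cfg_value_type lets a YAML list override a tuple
# default by coercing it back to the default's type:
defaults = {'SCALES': (600,)}        # stands in for __C.TRAIN.SCALES
override = {'SCALES': [400, 600]}    # as parsed from YAML
coerced = tuple(override['SCALES'])  # what the coercion step produces
assert type(coerced) is type(defaults['SCALES'])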
......@@ -643,8 +619,8 @@ def _check_and_coerce_cfg_value_type(value_a, value_b, key):
value_a = list(value_a)
elif isinstance(value_a, list) and isinstance(value_b, tuple):
value_a = tuple(value_a)
elif isinstance(value_a, dict) and isinstance(value_b, AttrDict):
value_a = AttrDict(value_a)
else:
raise ValueError(
'Type mismatch ({} vs. {}) with values ({} vs. {}) for config '
......
......@@ -23,10 +23,8 @@ from lib.core.config import cfg_from_file
class Coordinator(object):
"""Coordinator is a simple tool to manage the
unique experiments from the YAML configurations.
"""Manage the unique experiments."""
"""
def __init__(self, cfg_file, exp_dir=None):
# Override the default configs
cfg_from_file(cfg_file)
......@@ -44,9 +42,14 @@ class Coordinator(object):
self.experiment_dir = exp_dir
def _path_at(self, file, auto_create=True):
    try:
        path = os.path.abspath(os.path.join(self.experiment_dir, file))
        if auto_create and not os.path.exists(path):
            os.makedirs(path)
    except OSError:
        path = os.path.abspath(os.path.join('/tmp', file))
        if auto_create and not os.path.exists(path):
            os.makedirs(path)
    return path
def checkpoints_dir(self):
......@@ -55,7 +58,9 @@ class Coordinator(object):
def exports_dir(self):
return self._path_at('exports')
def results_dir(self, checkpoint=None, output_dir=None):
if output_dir is not None:
return output_dir
sub_dir = os.path.splitext(os.path.basename(checkpoint))[0] if checkpoint else ''
return self._path_at(os.path.join('results', sub_dir))
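A usage sketch for the updated ``results_dir`` (the new ``output_dir`` argument short-circuits the experiment directory; the config path and exp_dir below are placeholders):

coordinator = Coordinator('configs/voc_ssd_300.yml', exp_dir='/tmp/my_exp')
print(coordinator.checkpoints_dir())                   # -> /tmp/my_exp/checkpoints
print(coordinator.results_dir(output_dir='/data/out'))  # -> /data/out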
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import time
import dragon.vm.torch as torch
from lib.core.config import cfg
from lib.modeling.detector import Detector
from lib.utils import logger
class Solver(object):
def __init__(self):
# Define the generic detector
self.detector = Detector()
# Define the optimizer and its arguments
self.optimizer = None
self.opt_arguments = {
'scale_gradient': 1. / (
cfg.SOLVER.LOSS_SCALING *
cfg.SOLVER.ITER_SIZE
),
'clip_gradient': float(cfg.SOLVER.CLIP_NORM),
'weight_decay': cfg.SOLVER.WEIGHT_DECAY,
}
# Define the global step
self.iter = 0
# Define the decay step
self._current_step = 0
def _get_param_groups(self):
param_groups = [
{
'params': [],
'lr_mult': 1.,
'decay_mult': 1.,
},
# Special treatment for biases (mainly to match historical impl.
# details):
# (1) Do not apply weight decay
# (2) Use a 2x higher learning rate
{
'params': [],
'lr_mult': 2.,
'decay_mult': 0.,
}
]
for name, param in self.detector.named_parameters():
if 'bias' in name:
param_groups[1]['params'].append(param)
else:
param_groups[0]['params'].append(param)
return param_groups
def set_learning_rate(self):
policy = cfg.SOLVER.LR_POLICY
if policy == 'steps_with_decay':
if self._current_step < len(cfg.SOLVER.STEPS) \
and self.iter >= cfg.SOLVER.STEPS[self._current_step]:
self._current_step = self._current_step + 1
logger.info(
'MultiStep Status: Iteration {}, step = {}'
.format(self.iter, self._current_step)
)
new_lr = cfg.SOLVER.BASE_LR * (
cfg.SOLVER.GAMMA ** self._current_step)
self.optimizer.param_groups[0]['lr'] = \
self.optimizer.param_groups[1]['lr'] = new_lr
else:
raise ValueError('Unknown lr policy: ' + policy)
def one_step(self):
def add_loss(x, y):
return y if x is None else x + y
# Forward & Backward & Compute_loss
iter_size = cfg.SOLVER.ITER_SIZE
loss_scaling = cfg.SOLVER.LOSS_SCALING
stats = {'loss': {'total': 0.}, 'iter': self.iter}
run_time, tic = 0., time.time()
if iter_size > 1:
# Dragon is designed for manual gradient accumulation
# ``zero_grad`` is only required if calling ``accumulate_grad``
self.optimizer.zero_grad()
for i in range(iter_size):
outputs, total_loss = self.detector(), None
# Sum the partial losses
for k, v in outputs.items():
if 'loss' in k:
if k not in stats['loss']:
stats['loss'][k] = 0.
total_loss = add_loss(total_loss, v)
stats['loss'][k] += float(v) * loss_scaling
if loss_scaling != 1.:
total_loss *= loss_scaling
stats['loss']['total'] += float(total_loss)
total_loss.backward()
if iter_size > 1:
self.optimizer.accumulate_grad()
run_time += (time.time() - tic)
# Apply Update
self.set_learning_rate()
tic = time.time()
self.optimizer.step()
run_time += (time.time() - tic)
self.iter += 1
# Average loss by the iter size
for k in stats['loss'].keys():
stats['loss'][k] /= cfg.SOLVER.ITER_SIZE
# Misc stats
stats['lr'] = self.base_lr
stats['time'] = run_time
return stats
@property
def base_lr(self):
return self.optimizer.param_groups[0]['lr']
@base_lr.setter
def base_lr(self, value):
self.optimizer.param_groups[0]['lr'] = \
self.optimizer.param_groups[1]['lr'] = value
class SGDSolver(Solver):
def __init__(self):
super(SGDSolver, self).__init__()
self.opt_arguments.update(**{
'lr': cfg.SOLVER.BASE_LR,
'momentum': cfg.SOLVER.MOMENTUM,
})
self.optimizer = torch.optim.SGD(
self._get_param_groups(), **self.opt_arguments)
class NesterovSolver(Solver):
def __init__(self):
super(NesterovSolver, self).__init__()
self.opt_arguments.update(**{
'lr': cfg.SOLVER.BASE_LR,
'momentum': cfg.SOLVER.MOMENTUM,
'nesterov': True,
})
self.optimizer = torch.optim.SGD(
self._get_param_groups(), **self.opt_arguments)
class RMSPropSolver(Solver):
def __init__(self):
super(RMSPropSolver, self).__init__()
self.opt_arguments.update(**{
'lr': cfg.SOLVER.BASE_LR,
'alpha': 0.9,
'eps': 1e-5,
})
self.optimizer = torch.optim.RMSprop(
self._get_param_groups(), **self.opt_arguments)
class AdamSolver(Solver):
def __init__(self):
super(AdamSolver, self).__init__()
self.opt_arguments.update(**{
'lr': cfg.SOLVER.BASE_LR,
'beta1': 0.9,
'beta2': 0.999,
'eps': 1e-5,
})
self.optimizer = torch.optim.Adam(
self._get_param_groups(), **self.opt_arguments)
def get_solver_func(type):
if type == 'MomentumSGD':
return SGDSolver
elif type == 'Nesterov':
return NesterovSolver
elif type == 'RMSProp':
return RMSPropSolver
elif type == 'Adam':
return AdamSolver
else:
raise ValueError(
'Unsupported solver type: {}.\n'
'Expected one of (MomentumSGD, Nesterov, RMSProp, Adam).'
.format(type)
)
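Usage sketch for the factory above:

solver = get_solver_func('MomentumSGD')()  # -> SGDSolver
while solver.iter < cfg.SOLVER.MAX_ITERS:
    stats = solver.one_step()  # forward/backward + parameter update
    print(stats['iter'], stats['lr'], stats['loss']['total'])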
......@@ -34,7 +34,7 @@ class TestServer(object):
self.data_reader = dragon.io.DataReader(
dataset=lambda: dragon.io.SeetaRecordDataset(self.imdb.source))
self.data_transformer = DataTransformer()
self.data_reader.q_out = mp.Queue(cfg.TEST.IMS_PER_BATCH * 5)
self.data_reader.start()
self.gt_recs = collections.OrderedDict()
self.output_dir = output_dir
......@@ -70,6 +70,9 @@ class TestServer(object):
return self.gt_recs
def evaluate_detections(self, all_boxes):
if cfg.TEST.PROTOCOL == 'null':
self.imdb.dump_detections(all_boxes, self.output_dir)
else:
self.imdb.evaluate_detections(
all_boxes,
self.get_records(),
......
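With this change, a test configuration that sets ``PROTOCOL: 'null'`` under ``TEST`` skips evaluation entirely and routes the results through ``imdb.dump_detections`` (see the new method later in this commit); any other protocol keeps the old ``evaluate_detections`` path.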
......@@ -18,53 +18,48 @@ from __future__ import division
from __future__ import print_function
import collections
import os
import dragon.vm.torch as torch
from lib.core.config import cfg
from lib.solver.sgd import SGDSolver
from lib.utils import logger
from lib.utils import time_util
from lib.utils.stats import SmoothedValue
class SolverWrapper(object):
def __init__(self, coordinator):
self.solver = SGDSolver()
self.detector = self.solver.detector
self.output_dir = coordinator.checkpoints_dir()
# Setup the detector
self.detector.load_weights(cfg.TRAIN.WEIGHTS)
if cfg.MODEL.PRECISION.lower() == 'float16':
# Mixed precision training
self.detector.half()
self.detector.cuda(cfg.GPU_ID)
# Plan the metrics
self.board = None
self.metrics = collections.OrderedDict()
if cfg.ENABLE_TENSOR_BOARD and logger.is_root():
try:
from dragon.tools.tensorboard import TensorBoard
log_dir = coordinator.experiment_dir + '/logs'
self.board = TensorBoard(log_dir=log_dir)
except ImportError:
pass
def snapshot(self):
    filename = cfg.SOLVER.SNAPSHOT_PREFIX + \
        '_iter_{}.pth'.format(self.solver.iter)
    filename = os.path.join(self.output_dir, filename)
    if logger.is_root() and not os.path.exists(filename):
        torch.save(self.detector.state_dict(), filename)
        logger.info('Wrote snapshot to: {:s}'.format(filename))
def add_metrics(self, stats):
for k, v in stats['loss'].items():
......@@ -73,7 +68,7 @@ class SolverWrapper(object):
self.metrics[k].AddValue(v)
def send_metrics(self, stats):
if self.board is not None:
self.board.scalar_summary('lr', stats['lr'], stats['iter'])
self.board.scalar_summary('time', stats['time'], stats['iter'])
for k, v in self.metrics.items():
......@@ -90,10 +85,12 @@ class SolverWrapper(object):
stats['iter'],
)
def step(self):
display = self.solver.iter % cfg.SOLVER.DISPLAY == 0
stats = self.solver.one_step()
self.add_metrics(stats)
self.send_metrics(stats)
if display:
logger.info(
'Iteration %d, lr = %.8f, loss = %f, time = %.2fs' % (
......@@ -110,43 +107,28 @@ class SolverWrapper(object):
def train_model(self):
    """Network training loop."""
    timer = time_util.Timer()
    max_steps = cfg.SOLVER.MAX_STEPS
    while self.solver.iter < max_steps:
        # Apply 1-step SGD update
        with timer.tic_and_toc():
            _, global_step = self.step(), self.solver.iter
        if global_step % (10 * cfg.SOLVER.DISPLAY) == 0:
            logger.info(
                time_util.get_progress_info(
                    timer, global_step, max_steps
                )
            )
        if global_step % cfg.SOLVER.SNAPSHOT_EVERY == 0:
            self.snapshot()
def train_net(coordinator, start_iter=0):
sw = SolverWrapper(coordinator)
sw.solver.iter = start_iter
logger.info('Solving...')
sw.train_model()
sw.snapshot()
......@@ -14,6 +14,7 @@
# ------------------------------------------------------------
import os
import shutil
import dragon
from lib.core.config import cfg
......@@ -59,6 +60,35 @@ class imdb(object):
def num_images(self):
return dragon.io.SeetaRecordDataset(self.source).size
def dump_detections(self, all_boxes, output_dir):
dataset = dragon.io.SeetaRecordDataset(self.source)
for file in ('data.data', 'data.index', 'data.meta'):
file = os.path.join(output_dir, file)
if os.path.exists(file):
os.remove(file)
writer = dragon.io.SeetaRecordWriter(output_dir, dataset.protocol)
for i in range(len(dataset)):
example = dataset.get()
example['object'] = []
for cls_ind, cls in enumerate(self.classes):
if cls == '__background__':
continue
detections = all_boxes[cls_ind][i]
if len(detections) == 0:
continue
for k in range(detections.shape[0]):
if detections[k, -1] < cfg.VIS_TH:
continue
example['object'].append({
'name': cls,
'xmin': float(detections[k][0]),
'ymin': float(detections[k][1]),
'xmax': float(detections[k][2]),
'ymax': float(detections[k][3]),
'difficult': 0,
})
writer.write(example)
def evaluate_detections(self, all_boxes, gt_recs, output_dir):
pass
......
......@@ -109,36 +109,6 @@ class TaaS(imdb):
# #
##############################################
def _write_xml_bbox_results(self, all_boxes, gt_recs, output_dir):
from xml.dom import minidom
import xml.etree.ElementTree as ET
ix = 0
for image_id, rec in gt_recs.items():
root = ET.Element('annotation')
ET.SubElement(root, 'filename').text = str(image_id)
for cls_ind, cls in enumerate(self.classes):
if cls == '__background__':
continue
detections = all_boxes[cls_ind][ix]
if len(detections) == 0:
continue
for k in range(detections.shape[0]):
if detections[k, -1] < cfg.VIS_TH:
continue
object = ET.SubElement(root, 'object')
ET.SubElement(object, 'name').text = cls
ET.SubElement(object, 'difficult').text = '0'
bnd_box = ET.SubElement(object, 'bndbox')
ET.SubElement(bnd_box, 'xmin').text = str(detections[k][0])
ET.SubElement(bnd_box, 'ymin').text = str(detections[k][1])
ET.SubElement(bnd_box, 'xmax').text = str(detections[k][2])
ET.SubElement(bnd_box, 'ymax').text = str(detections[k][3])
ix += 1
rawText = ET.tostring(root)
dom = minidom.parseString(rawText)
with open('{}/{}.xml'.format(output_dir, image_id), 'w') as f:
dom.writexml(f, "", "\t", "\n", "utf-8")
def _write_voc_bbox_results(self, all_boxes, gt_recs, output_dir):
for cls_ind, cls in enumerate(self.classes):
if cls == '__background__':
......@@ -486,10 +456,6 @@ class TaaS(imdb):
self._do_voc_bbox_eval(
gt_recs, output_dir, IoU=0.7,
use_07_metric='2007' in protocol)
elif 'xml' in protocol:
if cfg.EXP_DIR != '':
output_dir = cfg.EXP_DIR
self._write_xml_bbox_results(all_boxes, gt_recs, output_dir)
elif 'coco' in protocol:
from lib.pycocotools.coco import COCO
if os.path.exists(cfg.TEST.JSON_FILE):
......
......@@ -20,7 +20,7 @@ import dragon.vm.torch as torch
from lib.core.config import cfg
from lib.faster_rcnn.generate_anchors import generate_anchors
from lib.utils import logger
from lib.utils.blob import array2tensor
from lib.utils.boxes import bbox_overlaps
from lib.utils.boxes import bbox_transform
from lib.utils.boxes import dismantle_gt_boxes
......@@ -194,8 +194,8 @@ class AnchorTargetLayer(torch.nn.Module):
.transpose(0, 3, 1, 2)
return {
'labels': array2tensor(labels),
'bbox_targets': array2tensor(bbox_targets),
'bbox_inside_weights': array2tensor(bbox_inside_weights),
'bbox_outside_weights': array2tensor(bbox_outside_weights),
}
......@@ -92,7 +92,7 @@ class DataBatch(mp.Process):
if self._num_transformers == -1:
self._num_transformers = 2
# Add 1 transformer for color augmentation
if cfg.TRAIN.USE_COLOR_JITTER:
self._num_transformers += 1
self._num_transformers = min(
self._num_transformers, self._max_transformers)
......
......@@ -19,8 +19,10 @@ import cv2
import numpy as np
from lib.core.config import cfg
from lib.utils import rotated_boxes
from lib.utils.blob import prep_im_for_blob
from lib.utils.boxes import flip_boxes
from lib.utils.image import get_image_with_target_size
class DataTransformer(multiprocessing.Process):
......@@ -101,23 +103,29 @@ class DataTransformer(multiprocessing.Process):
def get_annotations(cls, example):
objects = []
for ix, obj in enumerate(example['object']):
if 'x3' in obj:
    bbox = rotated_boxes.vertices2box(
        [obj['x1'], obj['y1'],
         obj['x2'], obj['y2'],
         obj['x3'], obj['y3'],
         obj['x4'], obj['y4']]
    )
elif 'x2' in obj:
    bbox = [obj['x1'], obj['y1'], obj['x2'], obj['y2']]
elif 'xmin' in obj:
    bbox = [obj['xmin'], obj['ymin'], obj['xmax'], obj['ymax']]
else:
    bbox = obj['bbox']
objects.append({
    'name': obj['name'],
    'difficult': obj.get('difficult', 0),
    'bbox': bbox,
})
return example['id'], objects
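``rotated_boxes.vertices2box`` itself is not shown in this diff; a plausible reference-only stand-in (not the actual SeetaDet implementation) using OpenCV:

import numpy as np
import cv2

def vertices2box_ref(v):
    # Fit the minimum-area rotated rectangle to the 4 vertices and
    # return (cx, cy, w, h, angle). Note that the angle convention of
    # cv2.minAreaRect may differ from SeetaDet's.
    pts = np.array(v, dtype=np.float32).reshape(4, 2)
    (cx, cy), (w, h), angle = cv2.minAreaRect(pts)
    return [cx, cy, w, h, angle]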
def get(self, example):
img = np.frombuffer(example['content'], np.uint8)
img = cv2.imdecode(img, 1)
# Scale
scale_indices = np.random.randint(len(cfg.TRAIN.SCALES))
......@@ -137,10 +145,10 @@ class DataTransformer(multiprocessing.Process):
if jitter != 1.0:
# To a rectangle (scale, max_size)
target_size = (np.array(im.shape[0:2]) / jitter).astype(np.int)
im, offsets = get_image_with_target_size(target_size, im)
else:
# To a square (target_size, target_size)
im, offsets = get_image_with_target_size([target_size] * 2, im)
# Example -> RoIDict
roi_dict = self.make_roi_dict(example, im_scale, apply_flip, offsets)
......@@ -166,29 +174,3 @@ class DataTransformer(multiprocessing.Process):
self.q1_out.put(outputs)
else:
self.q2_out.put(outputs)
def _get_image_with_target_size(target_size, img):
im_shape = list(img.shape)
height_diff = target_size[0] - im_shape[0]
width_diff = target_size[1] - im_shape[1]
ofs_crop_width = np.random.randint(max(-width_diff, 0) + 1)
ofs_pad_width = np.random.randint(max(width_diff, 0) + 1)
ofs_crop_height = np.random.randint(max(-height_diff, 0) + 1)
ofs_pad_height = np.random.randint(max(height_diff, 0) + 1)
im_shape[:2] = target_size
new_img = np.empty(im_shape, dtype=img.dtype)
new_img[:] = cfg.PIXEL_MEANS
new_img[ofs_pad_height:ofs_pad_height + img.shape[0],
ofs_pad_width:ofs_pad_width + img.shape[1]] = \
img[ofs_crop_height:ofs_crop_height + target_size[0],
ofs_crop_width:ofs_crop_width + target_size[1]]
return new_img, (
ofs_pad_width - ofs_crop_width,
ofs_pad_height - ofs_crop_height,
target_size,
)
......@@ -18,19 +18,15 @@ import numpy as np
from lib.core.config import cfg
from lib.faster_rcnn.generate_anchors import generate_anchors
from lib.nms import nms_wrapper
from lib.utils.blob import array2tensor
from lib.utils.boxes import bbox_transform_inv
from lib.utils.boxes import clip_tiled_boxes
from lib.utils.boxes import filter_boxes
class ProposalLayer(torch.nn.Module):
"""
Compute proposals by applying estimated bounding-box
transformations to a set of regular boxes (called "anchors").
"""
"""Compute proposals by applying transformations to anchors."""
def __init__(self):
super(ProposalLayer, self).__init__()
......@@ -48,8 +44,8 @@ class ProposalLayer(torch.nn.Module):
def forward(self, features, cls_prob, bbox_pred, ims_info):
cfg_key = 'TRAIN' if self.training else 'TEST'
pre_nms_top_n = cfg[cfg_key].RPN_PRE_NMS_TOP_N
post_nms_top_n = cfg[cfg_key].RPN_POST_NMS_TOP_N
nms_thresh = cfg[cfg_key].RPN_NMS_THRESH
min_size = cfg[cfg_key].RPN_MIN_SIZE
......@@ -86,14 +82,15 @@ class ProposalLayer(torch.nn.Module):
scores = batch_scores[ix].reshape((-1, 1)) # [1, n] -> [n, 1]
deltas = batch_deltas[ix].reshape((-1, 4))
if pre_nms_top_n <= 0 or pre_nms_top_n >= len(scores):
order = np.argsort(-scores.squeeze())
else:
# Avoid sorting possibly large arrays; First partition to get top K
# unsorted and then sort just those (~20x faster for 200k scores)
inds = np.argpartition(-scores.squeeze(), pre_nms_top_n)[:pre_nms_top_n]
order = np.argsort(-scores[inds].squeeze())
order = inds[order]
deltas = deltas[order]
anchors = all_anchors[order]
scores = scores[order]
......@@ -111,11 +108,11 @@ class ProposalLayer(torch.nn.Module):
scores = scores[keep]
# 6. Apply nms (e.g. threshold = 0.7)
# 7. Take after_nms_topN (e.g. 300)
# 7. Take after_nms_top_n (e.g. 300)
# 8. Return the top proposals (-> RoIs top)
keep = nms(np.hstack((proposals, scores)), nms_thresh)
if post_nms_topN > 0:
keep = keep[:post_nms_topN]
keep = nms_wrapper.nms(np.hstack((proposals, scores)), nms_thresh)
if post_nms_top_n > 0:
keep = keep[:post_nms_top_n]
proposals = proposals[keep, :]
# Output rois blob
......@@ -129,4 +126,4 @@ class ProposalLayer(torch.nn.Module):
if cfg_key == 'TRAIN':
return rpn_rois
else:
return [blob_to_tensor(rpn_rois)]
return [array2tensor(rpn_rois)]
......@@ -18,7 +18,7 @@ import numpy as np
import numpy.random as npr
from lib.core.config import cfg
from lib.utils.blob import blob_to_tensor
from lib.utils.blob import array2tensor
from lib.utils.boxes import bbox_overlaps
from lib.utils.boxes import bbox_transform
from lib.utils.boxes import dismantle_gt_boxes
......@@ -73,11 +73,11 @@ class ProposalTargetLayer(torch.nn.Module):
batch_outputs[k] = np.concatenate(batch_outputs[k], axis=0)
return {
'rois': [blob_to_tensor(batch_outputs['rois'])],
'labels': blob_to_tensor(batch_outputs['labels']),
'bbox_targets': blob_to_tensor(batch_outputs['bbox_targets']),
'bbox_inside_weights': blob_to_tensor(batch_outputs['bbox_inside_weights']),
'bbox_outside_weights': blob_to_tensor(batch_outputs['bbox_outside_weights']),
'rois': [array2tensor(batch_outputs['rois'])],
'labels': array2tensor(batch_outputs['labels']),
'bbox_targets': array2tensor(batch_outputs['bbox_targets']),
'bbox_inside_weights': array2tensor(batch_outputs['bbox_inside_weights']),
'bbox_outside_weights': array2tensor(batch_outputs['bbox_outside_weights']),
}
......
......@@ -17,14 +17,13 @@ import dragon.vm.torch as torch
import numpy as np
from lib.core.config import cfg
from lib.nms.nms_wrapper import nms
from lib.nms.nms_wrapper import soft_nms
from lib.nms import nms_wrapper
from lib.utils import framework
from lib.utils import time_util
from lib.utils.blob import im_list_to_blob
from lib.utils.boxes import bbox_transform_inv
from lib.utils.boxes import clip_tiled_boxes
from lib.utils.image import scale_image
from lib.utils.timer import Timer
from lib.utils.graph import FrozenGraph
from lib.utils.vis import vis_one_image
......@@ -48,7 +47,8 @@ def im_detect(detector, raw_image):
with torch.no_grad():
with torch.jit.Recorder(retain_ops=True):
outputs = detector.forward(inputs)
detector.frozen_graph = FrozenGraph(
detector.frozen_graph = \
framework.FrozenGraph(
{'data': inputs['data'],
'ims_info': inputs['ims_info']},
{'rois': outputs['rois'],
......@@ -88,14 +88,13 @@ def test_net(detector, server):
num_classes = server.num_classes
all_boxes = [[[] for _ in range(num_images)] for _ in range(num_classes)]
_t = {'im_detect': Timer(), 'misc': Timer()}
_t = {'im_detect': time_util.Timer(), 'misc': time_util.Timer()}
for i in range(num_images):
image_id, raw_image = server.get_image()
_t['im_detect'].tic()
with _t['im_detect'].tic_and_toc():
scores, boxes = im_detect(detector, raw_image)
_t['im_detect'].toc()
_t['misc'].tic()
boxes_this_image = [[]]
......@@ -107,21 +106,30 @@ def test_net(detector, server):
(cls_boxes, cls_scores[:, np.newaxis])
).astype(np.float32, copy=False)
if cfg.TEST.USE_SOFT_NMS:
keep = soft_nms(
cls_detections, cfg.TEST.NMS,
keep = nms_wrapper.soft_nms(
cls_detections,
thresh=cfg.TEST.NMS,
method=cfg.TEST.SOFT_NMS_METHOD,
sigma=cfg.TEST.SOFT_NMS_SIGMA,
)
else:
keep = nms(cls_detections, cfg.TEST.NMS, force_cpu=True)
keep = nms_wrapper.nms(
cls_detections,
thresh=cfg.TEST.NMS,
force_cpu=True,
)
cls_detections = cls_detections[keep, :]
all_boxes[j][i] = cls_detections
boxes_this_image.append(cls_detections)
if cfg.VIS or cfg.VIS_ON_FILE:
vis_one_image(
raw_image, classes, boxes_this_image,
thresh=cfg.VIS_TH, box_alpha=1.0, show_class=True,
raw_image,
classes,
boxes_this_image,
thresh=cfg.VIS_TH,
box_alpha=1.,
show_class=True,
filename=server.get_save_filename(image_id),
)
......@@ -129,7 +137,8 @@ def test_net(detector, server):
if cfg.TEST.DETECTIONS_PER_IM > 0:
image_scores = []
for j in range(1, num_classes):
if len(all_boxes[j][i]) < 1: continue
if len(all_boxes[j][i]) < 1:
continue
image_scores.append(all_boxes[j][i][:, -1])
if len(image_scores) > 0:
image_scores = np.hstack(image_scores)
......
......@@ -14,6 +14,7 @@ from __future__ import division
from __future__ import print_function
import collections
import dragon.vm.torch as torch
import numpy as np
import numpy.random as npr
......@@ -21,7 +22,7 @@ import numpy.random as npr
from lib.core.config import cfg
from lib.faster_rcnn.generate_anchors import generate_anchors
from lib.utils import logger
from lib.utils.blob import blob_to_tensor
from lib.utils.blob import array2tensor
from lib.utils.boxes import bbox_overlaps
from lib.utils.boxes import bbox_transform
from lib.utils.boxes import dismantle_gt_boxes
......@@ -180,8 +181,8 @@ class AnchorTargetLayer(torch.nn.Module):
bbox_outside_weights = bbox_outside_weights_wide.transpose((0, 2, 1))
return {
'labels': blob_to_tensor(labels),
'bbox_targets': blob_to_tensor(bbox_targets),
'bbox_inside_weights': blob_to_tensor(bbox_inside_weights),
'bbox_outside_weights': blob_to_tensor(bbox_outside_weights),
'labels': array2tensor(labels),
'bbox_targets': array2tensor(bbox_targets),
'bbox_inside_weights': array2tensor(bbox_inside_weights),
'bbox_outside_weights': array2tensor(bbox_outside_weights),
}
......@@ -19,20 +19,16 @@ import numpy as np
from lib.core.config import cfg
from lib.faster_rcnn.generate_anchors import generate_anchors
from lib.nms.nms_wrapper import nms
from lib.nms import nms_wrapper
from lib.utils import logger
from lib.utils.blob import blob_to_tensor
from lib.utils.blob import array2tensor
from lib.utils.boxes import bbox_transform_inv
from lib.utils.boxes import clip_tiled_boxes
from lib.utils.boxes import filter_boxes
class ProposalLayer(torch.nn.Module):
"""
Compute proposals by applying estimated bounding-box
transformations to a set of regular boxes (called "anchors").
"""
"""Compute proposals by applying transformations anchors."""
def __init__(self):
super(ProposalLayer, self).__init__()
......@@ -86,8 +82,8 @@ class ProposalLayer(torch.nn.Module):
def forward(self, features, cls_prob, bbox_pred, ims_info):
cfg_key = 'TRAIN' if self.training else 'TEST'
pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N
post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N
pre_nms_top_n = cfg[cfg_key].RPN_PRE_NMS_TOP_N
post_nms_top_n = cfg[cfg_key].RPN_POST_NMS_TOP_N
nms_thresh = cfg[cfg_key].RPN_NMS_THRESH
min_size = cfg[cfg_key].RPN_MIN_SIZE
......@@ -110,14 +106,15 @@ class ProposalLayer(torch.nn.Module):
scores = batch_scores[ix].reshape((-1, 1)) # [1, n] -> [n, 1]
deltas = batch_deltas[ix] # [n, 4]
if pre_nms_topN <= 0 or pre_nms_topN >= len(scores):
if pre_nms_top_n <= 0 or pre_nms_top_n >= len(scores):
order = np.argsort(-scores.squeeze())
else:
# Avoid sorting possibly large arrays; first partition to get the top K
# unsorted, then sort just those (~20x faster for 200k scores)
inds = np.argpartition(-scores.squeeze(), pre_nms_topN)[:pre_nms_topN]
inds = np.argpartition(-scores.squeeze(), pre_nms_top_n)[:pre_nms_top_n]
order = np.argsort(-scores[inds].squeeze())
order = inds[order]
deltas = deltas[order]
anchors = all_anchors[order]
scores = scores[order]
......@@ -136,9 +133,9 @@ class ProposalLayer(torch.nn.Module):
# 6. Apply nms (e.g. threshold = 0.7)
# 7. Take after_nms_top_n (e.g. 300)
# 8. Return the top proposals (-> RoIs top)
keep = nms(np.hstack((proposals, scores)), nms_thresh)
if post_nms_topN > 0:
keep = keep[:post_nms_topN]
keep = nms_wrapper.nms(np.hstack((proposals, scores)), nms_thresh)
if post_nms_top_n > 0:
keep = keep[:post_nms_top_n]
proposals = proposals[keep, :]
# Output rois blob
......@@ -156,16 +153,16 @@ class ProposalLayer(torch.nn.Module):
# Distribute rois into K levels
min_level = cfg.FPN.ROI_MIN_LEVEL
max_level = cfg.FPN.ROI_MAX_LEVEL
K = max_level - min_level + 1
k = max_level - min_level + 1
fpn_levels = _map_rois_to_fpn_levels(rpn_rois, min_level, max_level)
all_rois = []
for i in range(K):
for i in range(k):
lv_indices = np.where(fpn_levels == (i + min_level))[0]
if len(lv_indices) == 0:
# Fake a tiny roi to avoid empty roi pooling
all_rois.append(blob_to_tensor(np.array([[-1, 0, 0, 1, 1]], dtype=np.float32)))
all_rois.append(array2tensor(np.array([[-1, 0, 0, 1, 1]], dtype=np.float32)))
else:
all_rois.append(blob_to_tensor(rpn_rois[lv_indices]))
all_rois.append(array2tensor(rpn_rois[lv_indices]))
return all_rois
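# `_map_rois_to_fpn_levels` is not shown in this diff; a sketch of the
# FPN-paper heuristic it presumably implements (canonical scale 224,
# canonical level 4), assuming the last four columns are (x1, y1, x2, y2):
import numpy as np

def map_rois_to_fpn_levels(rois, k_min, k_max):
    ws = rois[:, -2] - rois[:, -4] + 1
    hs = rois[:, -1] - rois[:, -3] + 1
    scales = np.sqrt(np.maximum(ws * hs, 1e-12))
    levels = np.floor(4 + np.log2(scales / 224. + 1e-6))
    return np.clip(levels, k_min, k_max).astype(np.int32)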
......
......@@ -13,12 +13,12 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon.vm.torch as torch
import numpy as np
import numpy.random as npr
import dragon.vm.torch as torch
from lib.core.config import cfg
from lib.utils.blob import blob_to_tensor
from lib.utils.blob import array2tensor
from lib.utils.boxes import bbox_overlaps
from lib.utils.boxes import bbox_transform
from lib.utils.boxes import dismantle_gt_boxes
......@@ -87,9 +87,9 @@ class ProposalTargetLayer(torch.nn.Module):
# Distribute rois into K levels
min_level = cfg.FPN.ROI_MIN_LEVEL
max_level = cfg.FPN.ROI_MAX_LEVEL
K = max_level - min_level + 1
k = max_level - min_level + 1
fpn_levels = _map_rois_to_fpn_levels(batch_outputs['rois'], min_level, max_level)
lvs_indices = [np.where(fpn_levels == (i + min_level))[0] for i in range(K)]
lvs_indices = [np.where(fpn_levels == (i + min_level))[0] for i in range(k)]
_fmap_rois(
inputs=[batch_outputs[key] for key in keys],
fake_outputs=self.fake_outputs,
......@@ -99,11 +99,11 @@ class ProposalTargetLayer(torch.nn.Module):
)
return {
'rois': [blob_to_tensor(outputs['rois'][i]) for i in range(K)],
'labels': blob_to_tensor(np.concatenate(outputs['labels'], axis=0)),
'bbox_targets': blob_to_tensor(np.vstack(outputs['bbox_targets'])),
'bbox_inside_weights': blob_to_tensor(np.vstack(outputs['bbox_inside_weights'])),
'bbox_outside_weights': blob_to_tensor(np.vstack(outputs['bbox_outside_weights'])),
'rois': [array2tensor(outputs['rois'][i]) for i in range(k)],
'labels': array2tensor(np.concatenate(outputs['labels'], axis=0)),
'bbox_targets': array2tensor(np.vstack(outputs['bbox_targets'])),
'bbox_inside_weights': array2tensor(np.vstack(outputs['bbox_inside_weights'])),
'bbox_outside_weights': array2tensor(np.vstack(outputs['bbox_outside_weights'])),
}
......
......@@ -29,7 +29,7 @@ from lib.utils.logger import is_root
class Detector(torch.nn.Module):
"""The "Detector" organizes the detection pipelines.
"""Organize the detection pipelines.
A bunch of classic algorithms are integrated; see the
``lib.core.config`` for their hyper-parameters.
......@@ -112,6 +112,7 @@ class Detector(torch.nn.Module):
# 1. Extract features
# Process the data:
# 0) CPU => CUDA
# 1) NHWC => NCHW
# 2) uint8 => float32 or float16
# 3) Mean subtraction
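# A rough PyTorch-style sketch of those four steps (the real work is done
# by the Bootstrap module below; `mean_values` is assumed to be a float
# tensor broadcastable as [1, 3, 1, 1]):
def bootstrap_sketch(x, mean_values, device_id=0):
    x = x.cuda(device_id)          # 0) CPU => CUDA
    x = x.permute(0, 3, 1, 2)      # 1) NHWC => NCHW
    x = x.float()                  # 2) uint8 => float32 (or .half())
    return x - mean_values         # 3) Mean subtraction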
......
......@@ -30,17 +30,18 @@ class FPN(torch.nn.Module):
super(FPN, self).__init__()
self.C = torch.nn.ModuleList()
self.P = torch.nn.ModuleList()
self.apply_func = self.apply_on_rcnn
for lvl in range(cfg.FPN.RPN_MIN_LEVEL, HIGHEST_BACKBONE_LVL + 1):
self.C.append(conv1x1(feature_dims[lvl - 1], cfg.FPN.DIM, bias=True))
self.P.append(conv3x3(cfg.FPN.DIM, cfg.FPN.DIM, bias=True))
if 'retinanet' in cfg.MODEL.TYPE or 'ssd' in cfg.MODEL.TYPE:
if 'rcnn' in cfg.MODEL.TYPE:
self.apply_func = self.apply_on_rcnn
self.maxpool = torch.nn.MaxPool2d(1, 2, ceil_mode=True)
else:
self.apply_func = self.apply_on_generic
self.relu = torch.nn.ReLU(inplace=False)
for lvl in range(HIGHEST_BACKBONE_LVL + 1, cfg.FPN.RPN_MAX_LEVEL + 1):
dim_in = feature_dims[-1] if lvl == HIGHEST_BACKBONE_LVL + 1 else cfg.FPN.DIM
self.P.append(conv3x3(dim_in, cfg.FPN.DIM, stride=2, bias=True))
self.apply_func = self.apply_on_retinanet
self.relu = torch.nn.ReLU(inplace=False)
self.maxpool = torch.nn.MaxPool2d(1, 2, ceil_mode=True)
self.reset_parameters()
self.feature_dims = [cfg.FPN.DIM]
......@@ -69,7 +70,7 @@ class FPN(torch.nn.Module):
outputs.insert(0, self.P[i - min_lvl](fpn_input))
return outputs
def apply_on_retinanet(self, features):
def apply_on_generic(self, features):
fpn_input = self.C[-1](features[-1])
min_lvl, max_lvl = cfg.FPN.RPN_MIN_LEVEL, cfg.FPN.RPN_MAX_LEVEL
outputs = [self.P[HIGHEST_BACKBONE_LVL - min_lvl](fpn_input)]
......
......@@ -37,7 +37,7 @@ def nms(detections, thresh, force_cpu=False):
if detections.shape[0] == 0:
return []
if detections.shape[1] == 6:
return rotated_boxes.nms(detections, thresh)
return rotated_boxes.cpu_nms(detections, thresh)
if cfg.USE_GPU_NMS and not force_cpu:
return gpu_nms(detections, thresh, device_id=cfg.GPU_ID)
else:
......
......@@ -17,7 +17,6 @@ import dragon.vm.torch as torch
from lib.core.config import cfg
from lib.ops import functional as F
from lib.utils.blob import blob_to_tensor
class Bootstrap(torch.nn.Module):
......@@ -25,7 +24,7 @@ class Bootstrap(torch.nn.Module):
def __init__(self):
super(Bootstrap, self).__init__()
self.dtype = cfg.MODEL.DATA_TYPE.lower()
self.dtype = cfg.MODEL.PRECISION.lower()
self.mean_values = cfg.PIXEL_MEANS
self.dummy_buffer = torch.ones(1)
......
......@@ -19,7 +19,7 @@ import numpy as np
from lib.core.config import cfg
from lib.faster_rcnn.generate_anchors import generate_anchors_v2
from lib.utils import logger
from lib.utils.blob import blob_to_tensor
from lib.utils.blob import array2tensor
from lib.utils.boxes import bbox_overlaps
from lib.utils.boxes import bbox_transform
from lib.utils.boxes import dismantle_gt_boxes
......@@ -145,8 +145,8 @@ class AnchorTargetLayer(torch.nn.Module):
bbox_outside_weights = bbox_outside_weights_wide.transpose((0, 2, 1))
return {
'labels': blob_to_tensor(labels),
'bbox_targets': blob_to_tensor(bbox_targets),
'bbox_inside_weights': blob_to_tensor(bbox_inside_weights),
'bbox_outside_weights': blob_to_tensor(bbox_outside_weights),
'labels': array2tensor(labels),
'bbox_targets': array2tensor(bbox_targets),
'bbox_inside_weights': array2tensor(bbox_inside_weights),
'bbox_outside_weights': array2tensor(bbox_outside_weights),
}
......@@ -17,44 +17,14 @@ import dragon.vm.torch as torch
import numpy as np
from lib.core.config import cfg
from lib.nms.nms_wrapper import nms
from lib.nms.nms_wrapper import soft_nms
from lib.nms import nms_wrapper
from lib.utils import framework
from lib.utils import time_util
from lib.utils.blob import im_list_to_blob
from lib.utils.graph import FrozenGraph
from lib.utils.image import scale_image
from lib.utils.timer import Timer
from lib.utils.vis import vis_one_image
def im_detect(detector, raw_image):
"""Detect a image, with single or multiple scales."""
ims, ims_scale = scale_image(raw_image)
# Prepare blobs
blobs = {'data': im_list_to_blob(ims)}
blobs['ims_info'] = np.array([
list(blobs['data'].shape[1:3]) + [im_scale]
for im_scale in ims_scale
], dtype=np.float32)
# Do Forward
if not hasattr(detector, 'frozen_graph'):
inputs = {
'data': torch.from_numpy(blobs['data']),
'ims_info': torch.from_numpy(blobs['ims_info']),
}
with torch.no_grad():
with torch.jit.Recorder(retain_ops=True):
outputs = detector.forward(inputs)
detector.frozen_graph = FrozenGraph(
{'data': inputs['data'],
'ims_info': inputs['ims_info']},
{'detections': outputs['detections']},
)
outputs = detector.frozen_graph(**blobs)
return outputs['detections'][:, 1:]
def ims_detect(detector, raw_images):
"""Detect images, with single or multiple scales."""
ims, ims_scale = scale_image(raw_images[0])
......@@ -81,7 +51,8 @@ def ims_detect(detector, raw_images):
with torch.no_grad():
with torch.jit.Recorder(retain_ops=True):
outputs = detector.forward(inputs)
detector.frozen_graph = FrozenGraph(
detector.frozen_graph = \
framework.FrozenGraph(
{'data': inputs['data'],
'ims_info': inputs['ims_info']},
{'detections': outputs['detections']},
......@@ -111,24 +82,21 @@ def test_net(detector, server):
num_classes = server.num_classes
all_boxes = [[[] for _ in range(num_images)] for _ in range(num_classes)]
_t = {'im_detect': Timer(), 'misc': Timer()}
_t = {'im_detect': time_util.Timer(), 'misc': time_util.Timer()}
for batch_idx in range(0, num_images, cfg.TEST.IMS_PER_BATCH):
# Collect raw images and ground-truths
image_ids, raw_images = [], []
for item_idx in range(cfg.TEST.IMS_PER_BATCH):
if batch_idx + item_idx >= num_images: continue
if batch_idx + item_idx >= num_images:
continue
image_id, raw_image = server.get_image()
image_ids.append(image_id)
raw_images.append(raw_image)
# Run detecting on specific scales
_t['im_detect'].tic()
if cfg.TEST.IMS_PER_BATCH > 1:
with _t['im_detect'].tic_and_toc():
results = ims_detect(detector, raw_images)
else:
results = [im_detect(detector, raw_images[0])]
_t['im_detect'].toc()
# Post-Processing
_t['misc'].tic()
......@@ -139,22 +107,22 @@ def test_net(detector, server):
detections = np.array(detections)
for j in range(1, num_classes):
cls_indices = np.where(detections[:, 5].astype(np.int32) == j)[0]
cls_boxes = detections[cls_indices, 0:4]
cls_boxes = detections[cls_indices, :4]
cls_scores = detections[cls_indices, 4]
cls_detections = np.hstack((
cls_boxes, cls_scores[:, np.newaxis])) \
.astype(np.float32, copy=False)
if cfg.TEST.USE_SOFT_NMS:
keep = soft_nms(
keep = nms_wrapper.soft_nms(
cls_detections,
cfg.TEST.NMS,
thresh=cfg.TEST.NMS,
method=cfg.TEST.SOFT_NMS_METHOD,
sigma=cfg.TEST.SOFT_NMS_SIGMA,
)
else:
keep = nms(
keep = nms_wrapper.nms(
cls_detections,
cfg.TEST.NMS,
thresh=cfg.TEST.NMS,
force_cpu=True,
)
cls_detections = cls_detections[keep, :]
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
from lib.core.config import cfg
class _LRScheduler(object):
def __init__(
self,
lr_max,
lr_min=0.,
warmup_steps=0,
warmup_factor=0.,
):
self._step_count = 0
self._lr_max, self._lr_min = lr_max, lr_min
self._warmup_steps = warmup_steps
self._warmup_factor = warmup_factor
self._last_lr = self._lr_max
self._last_steps = self._warmup_steps
def step(self):
self._step_count += 1
def get_lr(self):
if self._step_count < self._warmup_steps:
alpha = (self._step_count + 1.) / self._warmup_steps
decay_factor = self._warmup_factor * (1 - alpha) + alpha
self._last_lr = self._lr_max * decay_factor
return self._last_lr
return self.schedule_impl()
def schedule_impl(self):
raise NotImplementedError
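# Worked numbers for the warmup branch of get_lr() above, with
# lr_max=0.4, warmup_steps=5, warmup_factor=0.:
lr_max, warmup_steps, warmup_factor = 0.4, 5, 0.
for step in range(warmup_steps):
    alpha = (step + 1.) / warmup_steps
    print(lr_max * (warmup_factor * (1. - alpha) + alpha))
# ~0.08, 0.16, 0.24, 0.32, 0.40; after that, schedule_impl() takes over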
class StepLR(_LRScheduler):
def __init__(
self,
lr_max,
decay_step,
decay_gamma,
warmup_steps=0,
warmup_factor=0.,
):
super(StepLR, self).__init__(
lr_max=lr_max,
warmup_steps=warmup_steps,
warmup_factor=warmup_factor,
)
self._decay_step = decay_step
self._decay_gamma = decay_gamma
def schedule_impl(self):
step_count = self._step_count - self._last_steps
if step_count % self._decay_step == 0:
decay_factor = step_count // self._decay_step
self._last_lr = self._lr_max * (
self._decay_gamma ** decay_factor)
return self._last_lr
class MultiStepLR(_LRScheduler):
def __init__(
self,
lr_max,
decay_steps,
decay_gamma,
warmup_steps=0,
warmup_factor=0.,
):
super(MultiStepLR, self).__init__(
lr_max=lr_max,
warmup_steps=warmup_steps,
warmup_factor=warmup_factor,
)
self._decay_steps = decay_steps
self._decay_gamma = decay_gamma
self._stage_count, self._num_stages = 0, len(self._decay_steps)
def schedule_impl(self):
if self._stage_count < self._num_stages:
k = self._decay_steps[self._stage_count]
while self._step_count >= k:
self._stage_count += 1
if self._stage_count >= self._num_stages:
break
k = self._decay_steps[self._stage_count]
self._last_lr = self._lr_max * (
self._decay_gamma ** self._stage_count)
return self._last_lr
class LinearLR(_LRScheduler):
def __init__(
self,
lr_max,
decay_step,
max_steps,
warmup_steps=0,
warmup_factor=0.,
):
super(LinearLR, self).__init__(
lr_max=lr_max,
lr_min=0.,
warmup_steps=warmup_steps,
warmup_factor=warmup_factor,
)
self._decay_step = decay_step
self._max_steps = max_steps - warmup_steps
def schedule_impl(self):
step_count = self._step_count - self._last_steps
if step_count % self._decay_step == 0:
decay_factor = 1. - float(step_count) / self._max_steps
self._last_lr = self._lr_max * decay_factor
return self._last_lr
class CosineLR(_LRScheduler):
def __init__(
self,
lr_max,
lr_min,
decay_step,
max_steps,
warmup_steps=0,
warmup_factor=0.,
):
super(CosineLR, self).__init__(
lr_max=lr_max,
lr_min=lr_min,
warmup_steps=warmup_steps,
warmup_factor=warmup_factor,
)
self._decay_step = decay_step
self._max_steps = max_steps - warmup_steps
def schedule_impl(self):
step_count = self._step_count - self._last_steps
if step_count % self._decay_step == 0:
decay_factor = 0.5 * (1. + math.cos(
math.pi * step_count / self._max_steps))
self._last_lr = self._lr_min + (
self._lr_max - self._lr_min
) * decay_factor
return self._last_lr
def get_scheduler():
lr_policy = cfg.SOLVER.LR_POLICY
if lr_policy == 'step':
return StepLR(
lr_max=cfg.SOLVER.BASE_LR,
decay_step=cfg.SOLVER.DECAY_STEP,
decay_gamma=cfg.SOLVER.DECAY_GAMMA,
warmup_steps=cfg.SOLVER.WARM_UP_STEPS,
warmup_factor=cfg.SOLVER.WARM_UP_FACTOR,
)
elif lr_policy == 'steps_with_decay':
return MultiStepLR(
lr_max=cfg.SOLVER.BASE_LR,
decay_steps=cfg.SOLVER.DECAY_STEPS,
decay_gamma=cfg.SOLVER.DECAY_GAMMA,
warmup_steps=cfg.SOLVER.WARM_UP_STEPS,
warmup_factor=cfg.SOLVER.WARM_UP_FACTOR,
)
elif lr_policy == 'cosine_decay':
return CosineLR(
lr_max=cfg.SOLVER.BASE_LR,
lr_min=0.,
decay_step=cfg.SOLVER.DECAY_STEP,
max_steps=cfg.SOLVER.MAX_STEPS,
warmup_steps=cfg.SOLVER.WARM_UP_STEPS,
warmup_factor=cfg.SOLVER.WARM_UP_FACTOR,
)
else:
raise ValueError('Unknown lr policy: ' + lr_policy)
if __name__ == '__main__':
def extract_label(scheduler):
class_name = scheduler.__class__.__name__
label = class_name + '('
if class_name == 'StepLR':
label += 'α=' + str(scheduler._decay_step) + ', '
label += 'γ=' + str(scheduler._decay_gamma)
elif class_name == 'MultiStepLR':
label += 'α=' + str(scheduler._decay_steps) + ', '
label += 'γ=' + str(scheduler._decay_gamma)
elif class_name == 'CosineLR':
label += 'α=' + str(scheduler._decay_step)
label += ')'
return label
vis = True
max_steps = 240
shared_args = {
'lr_max': 0.4,
'warmup_steps': 5,
'warmup_factor': 0.,
}
schedulers = [
StepLR(decay_step=1, decay_gamma=0.97, **shared_args),
MultiStepLR(decay_steps=[60, 120, 180], decay_gamma=0.1, **shared_args),
CosineLR(lr_min=0., decay_step=1, max_steps=max_steps, **shared_args),
LinearLR(decay_step=1, max_steps=max_steps, **shared_args),
]
for i in range(max_steps):
info = 'Step = %d\n' % i
for scheduler in schedulers:
if i == 0:
scheduler.lr_seq = []
info += ' * {}: {}\n'.format(
extract_label(scheduler),
scheduler.get_lr())
scheduler.lr_seq.append(scheduler.get_lr())
scheduler.step()
if not vis:
print(info)
if vis:
import matplotlib.pyplot as plt
plt.figure(1)
plt.title('Visualization of different LR Schedulers')
plt.xlabel('Step')
plt.ylabel('Learning Rate')
line = '--'
colors = ['r', 'g', 'b', 'c', 'm', 'y', 'k']
for i, scheduler in enumerate(schedulers):
plt.plot(
range(max_steps),
scheduler.lr_seq,
colors[i] + line,
linewidth=1.,
label=extract_label(scheduler),
)
plt.legend()
plt.show()
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dragon.vm.torch as torch
from lib.core.config import cfg
from lib.modeling.detector import Detector
from lib.solver import lr_scheduler
from lib.utils import framework
from lib.utils import time_util
class SGDSolver(object):
def __init__(self):
# Define the generic detector
self.detector = Detector()
# Define the optimizer and its arguments
self.optimizer = torch.optim.SGD(
framework.get_param_groups(self.detector),
lr=cfg.SOLVER.BASE_LR,
momentum=cfg.SOLVER.MOMENTUM,
weight_decay=cfg.SOLVER.WEIGHT_DECAY,
clip_gradient=float(cfg.SOLVER.CLIP_NORM),
scale_gradient=1. / cfg.SOLVER.LOSS_SCALING,
)
self.lr_scheduler = lr_scheduler.get_scheduler()
def one_step(self):
def add_loss(x, y):
return y if x is None else x + y
stats = {
'iter': self.iter,
'loss': {'total': 0.},
'time': time_util.Timer(),
}
with stats['time'].tic_and_toc():
# Forward pass
outputs = self.detector()
# Backward pass
total_loss = None
loss_scaling = cfg.SOLVER.LOSS_SCALING
for k, v in outputs.items():
if 'loss' in k:
if k not in stats['loss']:
stats['loss'][k] = 0.
total_loss = add_loss(total_loss, v)
stats['loss'][k] += float(v) * loss_scaling
if loss_scaling != 1.:
total_loss *= loss_scaling
stats['loss']['total'] += float(total_loss)
total_loss.backward()
# Apply Update
self.base_lr = self.lr_scheduler.get_lr()
self.optimizer.step()
self.lr_scheduler.step()
# Misc stats
stats['lr'] = self.base_lr
stats['time'] = stats['time'].total_time
return stats
@property
def base_lr(self):
return self.optimizer.param_groups[0]['lr']
@base_lr.setter
def base_lr(self, value):
for group in self.optimizer.param_groups:
group['lr'] = value
@property
def iter(self):
return self.lr_scheduler._step_count
@iter.setter
def iter(self, value):
self.lr_scheduler._step_count = value
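# A hypothetical driver loop for SGDSolver (the display interval and the
# absence of checkpointing are illustrative, not part of this commit):
solver = SGDSolver()
solver.iter = 0  # or restore from a snapshot
while solver.iter < cfg.SOLVER.MAX_STEPS:
    stats = solver.one_step()
    if stats['iter'] % 20 == 0:
        print('iter {}, lr {:.6f}, loss {:.4f}, time {:.3f}s'.format(
            stats['iter'], stats['lr'], stats['loss']['total'], stats['time']))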
......@@ -83,7 +83,7 @@ class DataTransformer(multiprocessing.Process):
]
else:
roi_dict['boxes'][object_idx, :] = \
rotated_boxes.canonicalize(
rotated_boxes.vertices2box(
[obj['x1'], obj['y1'],
obj['x2'], obj['y2'],
obj['x3'], obj['y3'],
......@@ -108,7 +108,7 @@ class DataTransformer(multiprocessing.Process):
def get(self, example):
img = np.frombuffer(example['content'], np.uint8)
img = cv2.imdecode(img, -1)
img = cv2.imdecode(img, 1)
# Flip
flip = False
......
......@@ -17,7 +17,7 @@ import dragon.vm.torch as torch
import numpy as np
from lib.core.config import cfg
from lib.utils.blob import blob_to_tensor
from lib.utils.blob import array2tensor
class HardMiningLayer(torch.nn.Module):
......@@ -63,4 +63,4 @@ class HardMiningLayer(torch.nn.Module):
labels_wide[ix][bg_inds] = 0 # Use hard negatives as bg indices
# Feed labels to compute cls loss
return {'labels': blob_to_tensor(labels_wide)}
return {'labels': array2tensor(labels_wide)}
......@@ -17,7 +17,7 @@ import numpy as np
import dragon.vm.torch as torch
from lib.core.config import cfg
from lib.utils.blob import blob_to_tensor
from lib.utils.blob import array2tensor
from lib.utils.boxes import bbox_overlaps
from lib.utils.boxes import bbox_transform
from lib.utils.boxes import dismantle_gt_boxes
......@@ -121,7 +121,7 @@ class MultiBoxTargetLayer(torch.nn.Module):
bbox_outside_weights_wide[ix][ex_inds] = bbox_reg_weight
return {
'bbox_targets': blob_to_tensor(bbox_targets_wide),
'bbox_inside_weights': blob_to_tensor(bbox_inside_weights_wide),
'bbox_outside_weights': blob_to_tensor(bbox_outside_weights_wide),
'bbox_targets': array2tensor(bbox_targets_wide),
'bbox_inside_weights': array2tensor(bbox_inside_weights_wide),
'bbox_outside_weights': array2tensor(bbox_outside_weights_wide),
}
......@@ -18,12 +18,11 @@ import dragon.vm.torch as torch
import numpy as np
from lib.core.config import cfg
from lib.nms.nms_wrapper import nms
from lib.nms.nms_wrapper import soft_nms
from lib.nms import nms_wrapper
from lib.utils import framework
from lib.utils import time_util
from lib.utils.boxes import bbox_transform_inv
from lib.utils.boxes import clip_boxes
from lib.utils.timer import Timer
from lib.utils.graph import FrozenGraph
from lib.utils.vis import vis_one_image
......@@ -49,7 +48,8 @@ def ims_detect(detector, ims):
with torch.no_grad():
with torch.jit.Recorder(retain_ops=True):
outputs = detector.forward(inputs={'data': image})
detector.frozen_graph = FrozenGraph(
detector.frozen_graph = \
framework.FrozenGraph(
{'data': image},
{'cls_prob': outputs['cls_prob'],
'bbox_pred': outputs['bbox_pred']},
......@@ -81,21 +81,21 @@ def test_net(detector, server):
num_classes = server.num_classes
all_boxes = [[[] for _ in range(num_images)] for _ in range(num_classes)]
_t = {'im_detect': Timer(), 'misc': Timer()}
_t = {'im_detect': time_util.Timer(), 'misc': time_util.Timer()}
for batch_idx in range(0, num_images, cfg.TEST.IMS_PER_BATCH):
# Collect raw images and ground-truths
image_ids, raw_images = [], []
for item_idx in range(cfg.TEST.IMS_PER_BATCH):
if batch_idx + item_idx >= num_images: continue
if batch_idx + item_idx >= num_images:
continue
image_id, raw_image = server.get_image()
image_ids.append(image_id)
raw_images.append(raw_image)
_t['im_detect'].tic()
with _t['im_detect'].tic_and_toc():
batch_scores, batch_boxes = ims_detect(detector, raw_images)
_t['im_detect'].toc()
_t['misc'].tic()
for item_idx in range(len(batch_scores)):
......@@ -114,16 +114,16 @@ def test_net(detector, server):
(cls_boxes, cls_scores[:, np.newaxis])) \
.astype(np.float32, copy=False)
if cfg.TEST.USE_SOFT_NMS:
keep = soft_nms(
keep = nms_wrapper.soft_nms(
cls_detections,
cfg.TEST.NMS,
thresh=cfg.TEST.NMS,
method=cfg.TEST.SOFT_NMS_METHOD,
sigma=cfg.TEST.SOFT_NMS_SIGMA,
)
else:
keep = nms(
keep = nms_wrapper.nms(
cls_detections,
cfg.TEST.NMS,
thresh=cfg.TEST.NMS,
force_cpu=True,
)
cls_detections = cls_detections[keep, :]
......
......@@ -47,18 +47,16 @@ class Distort(object):
def apply(self, img, boxes=None):
img = PIL.Image.fromarray(img)
if npr.uniform() < self._brightness_prob:
delta = npr.uniform(-0.3, 0.3) + 1.
img = PIL.ImageEnhance.Brightness(img)
img = img.enhance(delta)
if npr.uniform() < self._contrast_prob:
delta = npr.uniform(-0.3, 0.3) + 1.
img = PIL.ImageEnhance.Contrast(img)
img = img.enhance(delta)
if npr.uniform() < self._saturation_prob:
delta = npr.uniform(-0.3, 0.3) + 1.
img = PIL.ImageEnhance.Color(img)
img = img.enhance(delta)
transforms = [
(PIL.ImageEnhance.Brightness, self._brightness_prob),
(PIL.ImageEnhance.Contrast, self._contrast_prob),
(PIL.ImageEnhance.Color, self._saturation_prob),
]
npr.shuffle(transforms)
for transform_fn, prob in transforms:
if npr.uniform() < prob:
img = transform_fn(img)
img = img.enhance(1. + npr.uniform(-.4, .4))
return np.array(img), boxes
......
......@@ -21,7 +21,8 @@ import numpy as np
import dragon.vm.torch as torch
from lib.core.config import cfg
from lib.utils.image import resize_image, distort_image
from lib.utils.image import distort_image
from lib.utils.image import resize_image
def im_list_to_blob(ims):
......@@ -60,17 +61,17 @@ def mask_list_to_blob(masks):
return blob
def prep_im_for_blob(im, target_size, max_size):
def prep_im_for_blob(img, target_size, max_size):
"""Scale an image for use in a blob."""
im_shape, jitter = im.shape, 1.
im_shape, jitter = img.shape, 1.
if cfg.TRAIN.COLOR_JITTERING:
im = distort_image(im)
if cfg.TRAIN.USE_COLOR_JITTER:
img = distort_image(img)
if max_size > 0:
# Scale image along the shortest side
im_size_min = np.min(im_shape[0:2])
im_size_max = np.max(im_shape[0:2])
im_size_min = np.min(im_shape[:2])
im_size_max = np.max(im_shape[:2])
im_scale = float(target_size) / float(im_size_min)
# Prevent the biggest axis from being more than MAX_SIZE
......@@ -78,31 +79,31 @@ def prep_im_for_blob(im, target_size, max_size):
im_scale = float(max_size) / float(im_size_max)
else:
# Scale image along the longest side
im_size_max = np.max(im_shape[0:2])
im_size_max = np.max(im_shape[:2])
im_scale = float(target_size) / float(im_size_max)
if cfg.TRAIN.SCALE_JITTERING:
r = cfg.TRAIN.SCALE_RANGE
if cfg.TRAIN.USE_SCALE_JITTER:
r = cfg.TRAIN.SCALE_JITTER_RANGE
jitter = r[0] + np.random.rand() * (r[1] - r[0])
im_scale *= jitter
return resize_image(im, im_scale, im_scale), im_scale, jitter
return resize_image(img, im_scale, im_scale), im_scale, jitter
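# Worked numbers for the scaling rule above (target_size=600, max_size=1000,
# jitter disabled); the helper is illustrative:
def scaled(im_size_min, im_size_max, target_size=600, max_size=1000):
    im_scale = float(target_size) / im_size_min
    if im_scale * im_size_max > max_size:
        im_scale = float(max_size) / im_size_max
    return im_scale

print(scaled(480, 640))   # 1.25 (800 <= 1000, the cap is not hit)
print(scaled(500, 1500))  # ~0.667 (1800 > 1000, longest side capped)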
def blob_to_tensor(blob, enforce_cpu=False):
if isinstance(blob, np.ndarray):
def array2tensor(array, enforce_cpu=False):
if isinstance(array, np.ndarray):
# Zero-Copy from numpy
cpu_tensor = torch.from_numpy(blob)
cpu_tensor = torch.from_numpy(array)
else:
cpu_tensor = blob
cpu_tensor = array
return cpu_tensor if enforce_cpu else \
cpu_tensor.cuda(cfg.GPU_ID)
def tensor_to_blob(blob, copy=False):
if isinstance(blob, torch.Tensor):
def tensor2array(tensor, copy=False):
if isinstance(tensor, torch.Tensor):
# Zero-copy to numpy
array = blob.numpy(True)
array = tensor.numpy(True)
else:
array = blob
array = tensor
return array.copy() if copy else array
......@@ -16,8 +16,100 @@ from __future__ import print_function
import collections
import dragon
import dragon.vm.torch as torch
from dragon.core.framework import tensor_util
from dragon.vm.torch.jit.recorder import get_default_recorder
from dragon.core.util import six
def get_param_groups(module, bias_lr=1., bias_decay=0.):
"""Separate weight and bias into parameters groups.
Parameters
----------
module : dragon.vm.torch.nn.Module
The module to collect parameters.
bias_lr : float, optional, default=1.
The lr multiplier of bias.
bias_decay : float, optional, default=0.
The decay multiplier of bias.
Returns
-------
Sequence[ParamGroup]
The parameter groups.
"""
param_groups = [
{
'params': [],
'lr_mult': 1.,
'decay_mult': 1.,
},
{
'params': [],
'lr_mult': bias_lr,
'decay_mult': bias_decay,
}
]
for name, param in module.named_parameters():
gi = 1 if 'bias' in name else 0
param_groups[gi]['params'].append(param)
if len(param_groups[1]['params']) == 0:
param_groups.pop() # Remove empty group
return param_groups
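# Hypothetical usage: a 2x learning rate and no weight decay for biases,
# assuming the optimizer honors the per-group 'lr_mult'/'decay_mult' keys
# (the Dragon convention used by SGDSolver above):
optimizer = torch.optim.SGD(
    get_param_groups(detector, bias_lr=2., bias_decay=0.),
    lr=cfg.SOLVER.BASE_LR,
    momentum=cfg.SOLVER.MOMENTUM,
    weight_decay=cfg.SOLVER.WEIGHT_DECAY,
)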
def get_workspace():
"""Return the current default workspace.
Returns
-------
dragon.Workspace
The default workspace.
"""
return dragon.workspace.get_default()
def new_workspace(merge_default=True):
"""Create a new workspace.
Parameters
----------
merge_default : bool, optional, default=True
**True** to merge tensors from the default workspace.
Returns
-------
dragon.Workspace
The new workspace.
"""
workspace = dragon.Workspace()
if merge_default:
workspace.merge_from(get_workspace())
return workspace
def reset_workspace(workspace=None, merge_default=True):
"""Reset a workspace and return a new one.
Parameters
----------
workspace : dragon.Workspace, optional
The workspace to reset.
merge_default : bool, optional, default=True
**True** to merge tensors from the default workspace.
Returns
-------
dragon.Workspace
The new workspace.
"""
if workspace is not None:
workspace.Clear() # Block the GIL
return new_workspace(merge_default)
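# A sketch of isolating work in its own workspace (API as defined above):
ws = new_workspace()        # inherits tensors from the current default
with ws.as_default():
    pass                    # e.g. replay a frozen graph against `ws`
ws = reset_workspace(ws)    # clear it and get a fresh one back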
class FrozenGraph(object):
......@@ -41,9 +133,8 @@ class FrozenGraph(object):
self._inputs = canonicalize(inputs)
self._outputs = canonicalize(outputs)
self._constants = canonicalize(constants)
self._graph = dragon.Workspace() \
.merge_from(dragon.workspace.get_default())
self._tape = get_default_recorder()
self._graph = new_workspace()
self._tape = torch.jit.get_default_recorder()
def forward(self, **kwargs):
# Assign inputs
......@@ -70,3 +161,7 @@ class FrozenGraph(object):
def __call__(self, **kwargs):
with self._graph.as_default():
return self.forward(**kwargs)
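# The record-once/replay pattern from im_detect above, condensed
# ('rois' stands in for whichever outputs the caller needs):
def run_frozen(detector, inputs):
    if not hasattr(detector, 'frozen_graph'):
        with torch.no_grad():
            with torch.jit.Recorder(retain_ops=True):
                outputs = detector.forward(inputs)
        detector.frozen_graph = FrozenGraph(
            {'data': inputs['data'], 'ims_info': inputs['ims_info']},
            {'rois': outputs['rois']})
    return detector.frozen_graph(
        data=inputs['data'], ims_info=inputs['ims_info'])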
# Aliases
pickle = six.moves.pickle
......@@ -21,9 +21,50 @@ import PIL.ImageEnhance
from lib.core.config import cfg
def resize_image(im, fx, fy):
def distort_image(img):
img = PIL.Image.fromarray(img)
transforms = [
PIL.ImageEnhance.Brightness,
PIL.ImageEnhance.Contrast,
PIL.ImageEnhance.Color,
]
np.random.shuffle(transforms)
for transform in transforms:
if np.random.uniform() < .5:
img = transform(img)
img = img.enhance(1. + np.random.uniform(-.4, .4))
return np.array(img)
def get_image_with_target_size(target_size, img):
im_shape = list(img.shape)
height_diff = target_size[0] - im_shape[0]
width_diff = target_size[1] - im_shape[1]
ofs_crop_width = np.random.randint(max(-width_diff, 0) + 1)
ofs_pad_width = np.random.randint(max(width_diff, 0) + 1)
ofs_crop_height = np.random.randint(max(-height_diff, 0) + 1)
ofs_pad_height = np.random.randint(max(height_diff, 0) + 1)
im_shape[:2] = target_size
new_img = np.empty(im_shape, dtype=img.dtype)
new_img[:] = cfg.PIXEL_MEANS
new_img[ofs_pad_height:ofs_pad_height + img.shape[0],
ofs_pad_width:ofs_pad_width + img.shape[1]] = \
img[ofs_crop_height:ofs_crop_height + target_size[0],
ofs_crop_width:ofs_crop_width + target_size[1]]
return new_img, (
ofs_pad_width - ofs_crop_width,
ofs_pad_height - ofs_crop_height,
target_size,
)
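# Hypothetical round trip: place an image on a 600x600 canvas (pad with
# PIXEL_MEANS where larger, random-crop where smaller), then shift the
# ground-truth boxes by the returned offsets:
import numpy as np
img = np.zeros((480, 640, 3), dtype=np.uint8)
boxes = np.array([[10., 20., 110., 220.]])  # [x1, y1, x2, y2]
new_img, (ofs_x, ofs_y, target_size) = get_image_with_target_size([600, 600], img)
boxes[:, 0::2] += ofs_x  # x1, x2
boxes[:, 1::2] += ofs_y  # y1, y2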
def resize_image(img, fx, fy):
return cv2.resize(
im,
img,
dsize=None,
fx=fx, fy=fy,
interpolation=cv2.INTER_LINEAR,
......@@ -36,29 +77,12 @@ def resize_mask(mask, size):
return np.array(mask.resize(size, PIL.Image.NEAREST))
def distort_image(im):
im = PIL.Image.fromarray(im)
if np.random.uniform() < 0.5:
delta_brightness = np.random.uniform(-0.3, 0.3) + 1.
im = PIL.ImageEnhance.Brightness(im)
im = im.enhance(delta_brightness)
if np.random.uniform() < 0.5:
delta_contrast = np.random.uniform(-0.3, 0.3) + 1.
im = PIL.ImageEnhance.Contrast(im)
im = im.enhance(delta_contrast)
if np.random.uniform() < 0.5:
delta_saturation = np.random.uniform(-0.3, 0.3) + 1.
im = PIL.ImageEnhance.Color(im)
im = im.enhance(delta_saturation)
return np.array(im)
def scale_image(im):
def scale_image(img):
processed_ims, ims_scales = [], []
if cfg.TEST.MAX_SIZE > 0:
im_size_min = np.min(im.shape[:2])
im_size_max = np.max(im.shape[:2])
im_size_min = np.min(img.shape[:2])
im_size_max = np.max(img.shape[:2])
for target_size in cfg.TEST.SCALES:
im_scale = float(target_size) / float(im_size_min)
# Prevent the biggest axis from being more than MAX_SIZE
......@@ -66,7 +90,7 @@ def scale_image(im):
im_scale = float(cfg.TEST.MAX_SIZE) / float(im_size_max)
processed_ims.append(
cv2.resize(
im,
img,
dsize=None,
fx=im_scale, fy=im_scale,
interpolation=cv2.INTER_LINEAR,
......@@ -74,12 +98,12 @@ def scale_image(im):
ims_scales.append(im_scale)
else:
# Scale image along the longest side
im_size_max = np.max(im.shape[0:2])
im_size_max = np.max(img.shape[:2])
for target_size in cfg.TEST.SCALES:
im_scale = float(target_size) / float(im_size_max)
processed_ims.append(
cv2.resize(
im,
img,
dsize=None,
fx=im_scale, fy=im_scale,
interpolation=cv2.INTER_LINEAR,
......
......@@ -13,138 +13,124 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ctypes import *
import os.path as osp
import ctypes
import math
import os
import numpy as np
class LibRotatedBoxes(object):
def __init__(self):
self._nms = cdll.LoadLibrary(
osp.join(osp.split(
osp.abspath(__file__))[0],
"ctypes_rbox.so")
).NMS
self._overlaps = cdll.LoadLibrary(
osp.join(osp.split(
osp.abspath(__file__))[0],
"ctypes_rbox.so")
).Overlaps
self._nms.argtypes = (
POINTER(c_double),
POINTER(c_int),
POINTER(c_double),
POINTER(c_int),
c_double,
class _CppExtension(object):
dtype_mapping = {
'int32': ctypes.c_int32,
'float64': ctypes.c_double,
}
def __init__(self, library_name):
libc = ctypes.cdll.LoadLibrary(
os.path.join(os.path.split(
os.path.abspath(__file__))[0],
library_name,
)
self._overlaps.argtypes = \
(POINTER(c_double),
POINTER(c_double),
POINTER(c_int),
POINTER(c_double)
)
self._nms.restype = None
self._overlaps.restype = None
def nms(self, dets, thresh):
"""CPU Hard-NMS.
Parameters
----------
dets: (N, 6) ndarray of double [cx, cy, w, h, a, scores]
thresh : float
"""
assert dets.shape[1] == 6
order = dets[:, 5].argsort()[::-1]
sorted_dets = dets[order, :]
N = sorted_dets.shape[0]
num_ctypes = c_int(N)
thresh = c_double(thresh)
pred_boxes = sorted_dets[:, 0:-1].flatten()
pred_scores = sorted_dets[:, -1:].flatten()
indices = np.zeros(N)
_boxes = np.ascontiguousarray(pred_boxes, dtype=np.double)
_scores = np.ascontiguousarray(pred_scores, dtype=np.double)
_inds = np.ascontiguousarray(indices, dtype=np.int32)
boxes_ctypes_ptr = _boxes.ctypes.data_as(POINTER(c_double))
scores_ctypes_ptr = _scores.ctypes.data_as(POINTER(c_double))
inds_ctypes_ptr = _inds.ctypes.data_as(POINTER(c_int32))
self._nms(
boxes_ctypes_ptr,
inds_ctypes_ptr,
scores_ctypes_ptr,
byref(num_ctypes),
thresh,
def load_func(name, arg_types):
func = getattr(libc, name)
func.argtypes = self.get_arg_types(*arg_types)
return func
self._apply_cpu_nms = load_func(
'apply_cpu_nms', (
('float64', 1), # dets
('int32', 1), # indices
('int32', 1), # n
('float64', 0), # thresh
)
keep_indices = np.ctypeslib.as_array(
(c_int32 * num_ctypes.value).from_address(
addressof(inds_ctypes_ptr.contents)))
return list(order[keep_indices.astype(np.int32)])
def overlaps(self, boxes, query_boxes):
"""Computer overlaps between boxes and query boxes.
Parameters
----------
boxes: (N, 5) ndarray of double [cx, cy, w, h, a]
query_boxes: (K, 6) ndarray of double [cx, cy, w, h, a, cls]
Returns
-------
overlaps: (N, K) ndarray of overlap between boxes and query_boxes
"""
assert boxes.shape[1] == 5
if query_boxes.shape[1] == 6:
query_boxes = query_boxes[:, :-1]
N = boxes.shape[0]
K = query_boxes.shape[0]
num_ctypes = (c_int * 2)()
num_ctypes[0] = N
num_ctypes[1] = K
num_ctypes_ptr = cast(num_ctypes, POINTER(c_int))
_boxes = boxes.flatten()
_query_boxes = query_boxes.flatten()
_areas = np.zeros((N, K), dtype=np.double).flatten()
_boxes = np.ascontiguousarray(_boxes, dtype=np.double)
_query_boxes = np.ascontiguousarray(_query_boxes, dtype=np.double)
_areas = np.ascontiguousarray(_areas, dtype=np.double)
boxes_ctypes_ptr = _boxes.ctypes.data_as(POINTER(c_double))
query_boxes_ctypes_ptr = _query_boxes.ctypes.data_as(POINTER(c_double))
areas_ctypes_ptr = _areas.ctypes.data_as(POINTER(c_double))
self._overlaps(
boxes_ctypes_ptr,
query_boxes_ctypes_ptr,
num_ctypes_ptr,
areas_ctypes_ptr,
)
area = np.ctypeslib.as_array(
(c_double * K * N).from_address(
addressof(areas_ctypes_ptr.contents)
self._bbox_overlaps = load_func(
'bbox_overlaps', (
('float64', 1), # boxes1
('float64', 1), # boxes2
('int32', 1), # n, k
('float64', 1) # overlaps
)
)
rarea = np.nan_to_num(area.astype(np.float32))
return rarea
@staticmethod
def array2ptr(array):
return array.ctypes.data_as(
_CppExtension.get_ptr(str(array.dtype)))
@staticmethod
def contiguous(array, dtype='float64'):
return np.ascontiguousarray(array.flatten(), dtype)
@staticmethod
def get_arg_types(*args):
arg_types = []
for (dtype, is_pointer) in args:
arg_types.append(
_CppExtension.get_ptr(dtype) if is_pointer
else _CppExtension.dtype_mapping[dtype]
)
return arg_types
@staticmethod
def get_ptr(dtype):
return ctypes.POINTER(_CppExtension.dtype_mapping[dtype])
@staticmethod
def ptr2array(ptr, shape):
return np.ctypeslib.as_array(
shape.from_address(
ctypes.addressof(ptr.contents)
))
def bbox_overlaps(self, boxes1, boxes2):
"""Computer overlaps between boxes and query boxes."""
def canonicalize(boxes):
box_dim = boxes.shape[1]
if box_dim > 5:
boxes = boxes[:, :5]
elif box_dim < 5:
raise ValueError('Expected box5d.')
return self.contiguous(boxes, 'float64')
n, k = boxes1.shape[0], boxes2.shape[0]
boxes1 = canonicalize(boxes1)
boxes2 = canonicalize(boxes2)
overlaps_shape = (ctypes.c_int32 * 2)()
overlaps_shape[:] = (n, k)
overlaps = np.zeros((n * k,), 'float64')
overlaps_ptr = self.array2ptr(overlaps)
self._bbox_overlaps(
self.array2ptr(boxes1),
self.array2ptr(boxes2),
ctypes.cast(overlaps_shape, self.get_ptr('int32')),
overlaps_ptr,
)
return self.ptr2array(overlaps_ptr, ctypes.c_double * k * n)
libc = LibRotatedBoxes()
def cpu_nms(self, dets, thresh):
"""Apply Hard-NMS."""
if dets.shape[1] != 6:
raise ValueError('Expected det6d.')
order = dets[:, 5].argsort()[::-1]
sorted_dets = dets[order, :]
num_keep = sorted_dets.shape[0]
num_keep_ins = ctypes.c_int32(num_keep)
indices = np.zeros((num_keep,), np.int32)
indices_ptr = self.array2ptr(indices)
def bbox_overlaps(boxes1, boxes2):
"""Compute the overlaps between two group of boxes."""
return libc.overlaps(boxes1, boxes2)
self._apply_cpu_nms(
self.array2ptr(self.contiguous(dets, 'float64')),
indices_ptr,
ctypes.byref(num_keep_ins),
ctypes.c_double(thresh),
)
keep_indices = self.ptr2array(
indices_ptr, (ctypes.c_int32 * num_keep_ins.value))
return list(order[keep_indices])
def bbox_transform(ex_rois, gt_rois, weights=(1., 1., 1., 1., 1.)):
......@@ -214,36 +200,72 @@ def bbox_transform_inv(boxes, deltas, weights=(1., 1., 1., 1., 1.)):
return pred_boxes
def canonicalize(values):
def poly8_to_poly5(values):
pt1, pt2 = values[0:2], values[2:4]
pt3, pt4 = values[4:6], values[6:8]
edge1 = np.sqrt((pt1[0] - pt2[0]) * (pt1[0] - pt2[0]) + (pt1[1] - pt2[1]) * (pt1[1] - pt2[1]))
edge2 = np.sqrt((pt2[0] - pt3[0]) * (pt2[0] - pt3[0]) + (pt2[1] - pt3[1]) * (pt2[1] - pt3[1]))
angle, width, height = 0, 0, 0
if edge1 > edge2:
width = edge1
height = edge2
if pt1[0] - pt2[0] != 0:
angle = -np.arctan(float(pt1[1] - pt2[1]) / float(pt1[0] - pt2[0])) / 3.1415926 * 180
def box2vertices(values):
x_ctr, y_ctr, w, h, a = values
theta = a * 0.01745329251
cos_theta2 = math.cos(theta) * 0.5
sin_theta2 = math.sin(theta) * 0.5
vertices = [
x_ctr - sin_theta2 * h - cos_theta2 * w,
y_ctr + cos_theta2 * h - sin_theta2 * w,
x_ctr + sin_theta2 * h - cos_theta2 * w,
y_ctr - cos_theta2 * h - sin_theta2 * w,
]
vertices.extend([
2 * x_ctr - vertices[0],
2 * y_ctr - vertices[1],
2 * x_ctr - vertices[2],
2 * y_ctr - vertices[3],
])
return vertices
def vertices2box(vertices):
def sort(vertices):
poly = np.array(vertices).reshape((4, 2))
# lt, rt, rb, lb
edge = [
(poly[1][0] - poly[0][0]) * (poly[1][1] + poly[0][1]),
(poly[2][0] - poly[1][0]) * (poly[2][1] + poly[1][1]),
(poly[3][0] - poly[2][0]) * (poly[3][1] + poly[2][1]),
(poly[0][0] - poly[3][0]) * (poly[0][1] + poly[3][1])
]
p_area = np.sum(edge) / 2.
_poly = poly.copy()
if abs(p_area) < 1:
raise ValueError('The polygon is degenerate (near-zero area).')
if p_area > 0:
_poly = _poly[(0, 3, 2, 1), :]  # make clockwise
anchor = np.array([np.min(poly[:, 0]), np.min(poly[:, 1])])
line0 = np.linalg.norm(anchor - _poly[0])
line1 = np.linalg.norm(anchor - _poly[1])
line2 = np.linalg.norm(anchor - _poly[2])
line3 = np.linalg.norm(anchor - _poly[3])
argmin = np.argmin([line0, line1, line2, line3])
lt = _poly[argmin]
rt = _poly[(argmin + 1) % 4]
rb = _poly[(argmin + 2) % 4]
lb = _poly[(argmin + 3) % 4]
return np.array([lt, rt, rb, lb]).flatten()
values = sort(vertices)
y4my3 = values[7] - values[5]
if y4my3 != 0:
x2mx1 = values[2] - values[0]
theta = math.atan(x2mx1 / y4my3)
cos_theta = math.cos(theta)
sin_theta = math.sin(theta)
h = x2mx1 / sin_theta
x2px1 = values[2] + values[0]
x4px3 = values[6] + values[4]
w = (x4px3 - x2px1) / (2. * cos_theta)
a = theta / 0.01745329251
else:
angle = 90.
elif edge2 >= edge1:
width = edge2
height = edge1
if pt2[0] - pt3[0] != 0:
angle = -np.arctan(float(pt2[1] - pt3[1]) / float(pt2[0] - pt3[0])) / 3.1415926 * 180
else:
angle = 90.
if angle < -45.:
angle = angle + 180.
x_ctr = (pt1[0] + pt3[0]) / 2.
y_ctr = (pt1[1] + pt3[1]) / 2.
return x_ctr, y_ctr, width, height, angle
if len(values) == 8:
return poly8_to_poly5(values)
return values
w = values[2] - values[0]
h = values[5] - values[1]
a = 0.
x_ctr = 0.5 * (values[0] + values[4])
y_ctr = 0.5 * (values[1] + values[5])
return x_ctr, y_ctr, w, h, a
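# Sanity check (values hypothetical): an axis-aligned box survives the
# round trip exactly; rotated boxes may come back with w/h swapped and the
# angle shifted by +/-90 degrees, which parameterizes the same rectangle.
box = (50., 50., 20., 10., 0.)
print(vertices2box(box2vertices(box)))  # (50.0, 50.0, 20.0, 10.0, 0.0)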
def clip_angle(d):
......@@ -292,12 +314,16 @@ def flip_boxes(boxes, width):
return flip_boxes
def nms(dets, thresh):
return libc.nms(dets, thresh)
# Aliases
libc = _CppExtension('ctypes_rbox.so')
bbox_overlaps = libc.bbox_overlaps
cpu_nms = libc.cpu_nms
if __name__ == "__main__":
prior_boxes = np.array([[4, 4, 5, 5, 90], [4, 4, 15, 15, 90]], dtype=np.double)
gt_boxes = np.array([[4, 4, 15, 15, 90, 1]], dtype=np.double)
prior_boxes = np.array([[4, 4, 15, 15, 150], [4, 4, 15, 15, 45]], dtype='float64')
gt_boxes = np.array([[4, 4, 15, 15, 45, 1.]], dtype='float64')
ov = bbox_overlaps(prior_boxes, gt_boxes)
indices = cpu_nms(gt_boxes, 0.45)
print(ov)
print(indices)
......@@ -18,6 +18,7 @@ from __future__ import division
from __future__ import print_function
import contextlib
import datetime
import time
......@@ -30,9 +31,16 @@ class Timer(object):
self.diff = 0.
self.average_time = 0.
@contextlib.contextmanager
def tic_and_toc(self):
try:
yield self.tic()
finally:
self.toc()
def tic(self):
# Using time.time instead of time.clock because time.clock
# does not normalize for multi-threading
# does not normalize for multithreading
self.start_time = time.time()
def toc(self, average=True):
......@@ -45,9 +53,28 @@ class Timer(object):
else:
return self.diff
def get_progress_info(timer, curr_step, max_steps):
"""Return a info of current progress.
Parameters
----------
timer : Timer
The timer measuring the per-step time.
curr_step : int
The current step.
max_steps : int
The total number of steps.
Returns
-------
str
The progress info.
"""
average_time = timer.average_time
eta_seconds = average_time * (max_steps - curr_step)
eta = str(datetime.timedelta(seconds=int(eta_seconds)))
progress = (curr_step + 1.) / max_steps
return '< PROGRESS: {:.2%} | SPEED: {:.3f}s / iter | ETA: {} >' \
.format(progress, timer.average_time, eta)
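# Hypothetical usage: time each step and report the ETA periodically.
timer, max_steps = Timer(), 1000
for step in range(max_steps):
    with timer.tic_and_toc():
        pass  # the real per-step work goes here
    if step % 20 == 0:
        print(get_progress_info(timer, step, max_steps))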
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from os import path as osp
from maker import make_record
if __name__ == '__main__':
voc_root = '/data/VOC'
make_record(
record_file=osp.join(voc_root, 'voc_0712_trainval'),
images_path=[osp.join(voc_root, 'VOCdevkit2007/VOC2007/JPEGImages'),
osp.join(voc_root, 'VOCdevkit2012/VOC2012/JPEGImages')],
annotations_path=[osp.join(voc_root, 'VOCdevkit2007/VOC2007/Annotations'),
osp.join(voc_root, 'VOCdevkit2012/VOC2012/Annotations')],
imagesets_path=[osp.join(voc_root, 'VOCdevkit2007/VOC2007/ImageSets/Main'),
osp.join(voc_root, 'VOCdevkit2012/VOC2012/ImageSets/Main')],
splits=['trainval', 'trainval']
)
make_record(
record_file=osp.join(voc_root, 'voc_2007_test'),
images_path=osp.join(voc_root, 'VOCdevkit2007/VOC2007/JPEGImages'),
annotations_path=osp.join(voc_root, 'VOCdevkit2007/VOC2007/Annotations'),
imagesets_path=osp.join(voc_root, 'VOCdevkit2007/VOC2007/ImageSets/Main'),
splits=['test']
)
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
import cv2
import dragon
import numpy as np
import xml.etree.ElementTree as ET
def make_example(image_file, xml_file):
tree = ET.parse(xml_file)
filename = os.path.split(xml_file)[-1]
objs = tree.findall('object')
example = {'id': filename.split('.')[0], 'object': []}
with open(image_file, 'rb') as f:
img_bytes = bytes(f.read())
img = cv2.imdecode(np.frombuffer(img_bytes, 'uint8'), 1)
example['height'], example['width'], example['depth'] = img.shape
example['content'] = img_bytes
for ix, obj in enumerate(objs):
bbox = obj.find('bndbox')
is_diff = 0
if obj.find('difficult') is not None:
is_diff = int(obj.find('difficult').text) == 1
example['object'].append({
'name': obj.find('name').text.strip(),
'x1': float(bbox.find('x1').text),
'y1': float(bbox.find('y1').text),
'x2': float(bbox.find('x2').text),
'y2': float(bbox.find('y2').text),
'x3': float(bbox.find('x3').text),
'y3': float(bbox.find('y3').text),
'x4': float(bbox.find('x4').text),
'y4': float(bbox.find('y4').text),
'difficult': is_diff,
})
return example
def make_record(
record_file,
images_path,
annotations_path,
imagesets_path,
splits
):
if os.path.exists(record_file):
raise ValueError('The record file already exists.')
os.makedirs(record_file)
if not isinstance(images_path, list):
images_path = [images_path]
if not isinstance(annotations_path, list):
annotations_path = [annotations_path]
if not isinstance(imagesets_path, list):
imagesets_path = [imagesets_path]
assert len(splits) == len(imagesets_path)
assert len(splits) == len(images_path)
assert len(splits) == len(annotations_path)
print('Start Time:', time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime()))
writer = dragon.io.SeetaRecordWriter(
path=record_file,
protocol={
'id': 'string',
'content': 'bytes',
'height': 'int64',
'width': 'int64',
'depth': 'int64',
'object': [{
'name': 'string',
'x1': 'float64',
'y1': 'float64',
'x2': 'float64',
'y2': 'float64',
'x3': 'float64',
'y3': 'float64',
'x4': 'float64',
'y4': 'float64',
'difficult': 'int64',
}]
}
)
count, total_line = 0, 0
start_time = time.time()
for db_idx, split in enumerate(splits):
split_file = os.path.join(imagesets_path[db_idx], split + '.txt')
assert os.path.exists(split_file)
with open(split_file, 'r') as f:
lines = f.readlines()
total_line += len(lines)
for line in lines:
count += 1
if count % 2000 == 0:
now_time = time.time()
print('{} / {} in {:.2f} sec'.format(
count, total_line, now_time - start_time))
filename = line.strip()
image_file = os.path.join(images_path[db_idx], filename + '.jpg')
xml_file = os.path.join(annotations_path[db_idx], filename + '.xml')
writer.write(make_example(image_file, xml_file))
now_time = time.time()
print('{} / {} in {:.2f} sec'.format(count, total_line, now_time - start_time))
writer.close()
end_time = time.time()
data_size = os.path.getsize(record_file + '/data.data') * 1e-6
print('{} images take {:.2f} MB in {:.2f} sec.'
.format(total_line, data_size, end_time - start_time))
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from os import path as osp
from maker import make_record
if __name__ == '__main__':
voc_root = '/data/VOC'
make_record(
record_file=osp.join(voc_root, 'voc_0712_trainval'),
images_path=[osp.join(voc_root, 'VOCdevkit2007/VOC2007/JPEGImages'),
osp.join(voc_root, 'VOCdevkit2012/VOC2012/JPEGImages')],
annotations_path=[osp.join(voc_root, 'VOCdevkit2007/VOC2007/Annotations'),
osp.join(voc_root, 'VOCdevkit2012/VOC2012/Annotations')],
imagesets_path=[osp.join(voc_root, 'VOCdevkit2007/VOC2007/ImageSets/Main'),
osp.join(voc_root, 'VOCdevkit2012/VOC2012/ImageSets/Main')],
splits=['trainval', 'trainval']
)
make_record(
record_file=osp.join(voc_root, 'voc_2007_test'),
images_path=osp.join(voc_root, 'VOCdevkit2007/VOC2007/JPEGImages'),
annotations_path=osp.join(voc_root, 'VOCdevkit2007/VOC2007/Annotations'),
imagesets_path=osp.join(voc_root, 'VOCdevkit2007/VOC2007/ImageSets/Main'),
splits=['test']
)
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
import cv2
import dragon
import numpy as np
import xml.etree.ElementTree as ET
def make_example(image_file, xml_file):
tree = ET.parse(xml_file)
filename = os.path.split(xml_file)[-1]
objs = tree.findall('object')
example = {'id': filename.split('.')[0], 'object': []}
with open(image_file, 'rb') as f:
img_bytes = bytes(f.read())
img = cv2.imdecode(np.frombuffer(img_bytes, 'uint8'), 1)
example['height'], example['width'], example['depth'] = img.shape
example['content'] = img_bytes
    for obj in objs:
bbox = obj.find('bndbox')
        diff_node = obj.find('difficult')
        is_diff = int(diff_node is not None and int(diff_node.text) == 1)
example['object'].append({
'name': obj.find('name').text.strip(),
'xmin': float(bbox.find('xmin').text),
'ymin': float(bbox.find('ymin').text),
'xmax': float(bbox.find('xmax').text),
'ymax': float(bbox.find('ymax').text),
'difficult': is_diff,
})
return example
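# The dict returned above mirrors the writer protocol one-to-one; for a
# single-object image it looks roughly like this (hypothetical values,
# not read from any real annotation file):
#
#   {'id': '000005',
#    'content': b'\xff\xd8...',  # encoded JPEG bytes
#    'height': 375, 'width': 500, 'depth': 3,
#    'object': [{'name': 'chair',
#                'xmin': 263.0, 'ymin': 211.0,
#                'xmax': 324.0, 'ymax': 339.0,
#                'difficult': 0}]}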
def make_record(
record_file,
images_path,
annotations_path,
imagesets_path,
splits
):
if os.path.exists(record_file):
        raise ValueError('The record file already exists.')
os.makedirs(record_file)
if not isinstance(images_path, list):
images_path = [images_path]
if not isinstance(annotations_path, list):
annotations_path = [annotations_path]
if not isinstance(imagesets_path, list):
imagesets_path = [imagesets_path]
assert len(splits) == len(imagesets_path)
assert len(splits) == len(images_path)
assert len(splits) == len(annotations_path)
print('Start Time:', time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime()))
writer = dragon.io.SeetaRecordWriter(
path=record_file,
protocol={
'id': 'string',
'content': 'bytes',
'height': 'int64',
'width': 'int64',
'depth': 'int64',
'object': [{
'name': 'string',
'xmin': 'float64',
'ymin': 'float64',
'xmax': 'float64',
'ymax': 'float64',
'difficult': 'int64',
}]
}
)
count, total_line = 0, 0
start_time = time.time()
for db_idx, split in enumerate(splits):
split_file = os.path.join(imagesets_path[db_idx], split + '.txt')
assert os.path.exists(split_file)
with open(split_file, 'r') as f:
lines = f.readlines()
total_line += len(lines)
for line in lines:
count += 1
if count % 2000 == 0:
now_time = time.time()
print('{} / {} in {:.2f} sec'.format(
count, total_line, now_time - start_time))
filename = line.strip()
image_file = os.path.join(images_path[db_idx], filename + '.jpg')
xml_file = os.path.join(annotations_path[db_idx], filename + '.xml')
writer.write(make_example(image_file, xml_file))
now_time = time.time()
print('{} / {} in {:.2f} sec'.format(count, total_line, now_time - start_time))
writer.close()
end_time = time.time()
    data_size = os.path.getsize(os.path.join(record_file, 'data.data')) * 1e-6
print('{} images take {:.2f} MB in {:.2f} sec.'
.format(total_line, data_size, end_time - start_time))
......@@ -74,7 +74,7 @@ if __name__ == '__main__':
detector.optimize_for_inference()
    # Mixed precision inference?
if cfg.MODEL.DATA_TYPE.lower() == 'float16':
if cfg.MODEL.PRECISION.lower() == 'float16':
detector.half() # Powerful FP16 Support
data = torch.zeros(*args.input_shape).byte()
......
......@@ -37,8 +37,14 @@ def parse_args():
parser.add_argument('--exp_dir', dest='exp_dir',
help='experiment dir',
default=None, type=str)
parser.add_argument('--output_dir', dest='output_dir',
help='output dir',
default=None, type=str)
parser.add_argument('--iter', dest='iter', help='global step',
default=0, type=int)
default=None, type=int)
parser.add_argument('--dump', dest='dump',
help='dump the result back to record?',
action='store_true')
parser.add_argument('--wait', dest='wait',
help='wait the checkpoint?',
action='store_true')
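# A hedged sketch of the new flags in action, using the parser built in
# parse_args() above; the experiment directory and step are placeholders:
#
#   args = parser.parse_args(['--exp_dir', 'experiments/voc_frcnn',
#                             '--iter', '80000', '--dump', '--wait'])
#   assert args.dump and args.wait and args.iter == 80000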
......@@ -75,19 +81,19 @@ if __name__ == '__main__':
# Inspect the database
database = get_imdb(cfg.TEST.DATABASE)
cfg.TEST.PROTOCOL = 'null' if args.dump else cfg.TEST.PROTOCOL
    logger.info('Database({}): {} images will be used for testing.'
                .format(cfg.TEST.DATABASE, database.num_images))
# Ready to test the network
logger.info('Results will be saved to `{:s}`'
.format(coordinator.results_dir(checkpoint)))
output_dir = coordinator.results_dir(checkpoint, args.output_dir)
logger.info('Results will be saved to `{:s}`'.format(output_dir))
detector = Detector().eval().cuda(cfg.GPU_ID)
detector.load_weights(checkpoint)
detector.optimize_for_inference()
    # Mixed precision inference?
if cfg.MODEL.DATA_TYPE.lower() == 'float16':
if cfg.MODEL.PRECISION.lower() == 'float16':
detector.half() # Powerful FP16 Support
server = TestServer(coordinator.results_dir(checkpoint))
test_engine.test_net(detector, server)
test_engine.test_net(detector, TestServer(output_dir))