Commit 9d12d142 by Ting PAN

Add Model Zoo

1 parent d240a4fd
Showing with 3828 additions and 4472 deletions
[flake8]
max-line-length = 120
ignore = E741, # ambiguous variable name
F403, # ‘from module import *’ used; unable to detect undefined names
F405, # name may be undefined, or defined from star imports: module
F811, # redefinition of unused name from line N
F821, # undefined name
W503, # line break before binary operator
W504, # line break after binary operator
# module imported but unused
per-file-ignores = __init__.py: F401
exclude = seetadet/utils/pycocotools
......@@ -43,8 +43,13 @@ __pycache__
# VSCode files
.vscode
# IDEA files
.idea
# OSX dir files
.DS_Store
# Android files
.gradle
*.iml
local.properties
------------------------------------------------------------------------
The list of most significant changes made over time in SeetaDet.
SeetaDet 0.4.3 (20200724)
Dragon Minimum Required (Version 0.3.0.dev20200723)
Changes:
- Adapt to the latest dragon preview version.
Preview Features:
- None
Bugs fixed:
- None
------------------------------------------------------------------------
SeetaDet 0.4.2 (20200707)
Dragon Minimum Required (Version 0.3.0.dev20200707)
Changes:
- Adapt to the latest dragon preview version.
Preview Features:
- None
Bugs fixed:
- None
------------------------------------------------------------------------
SeetaDet 0.4.1 (20200421)
Dragon Minimum Required (Version 0.3.0.dev20200421)
Changes:
- Queue the testing images instead of reading them all at once.
Preview Features:
- None
Bugs fixed:
- None
------------------------------------------------------------------------
SeetaDet 0.4.0 (20200408)
Dragon Minimum Required (Version 0.3.0.dev20200408)
Changes:
Preview Features:
- Optimize the code structure.
- DALI support for SSD, RetinaNet, and Faster-RCNN.
- Use KPLRecord instead of SeetaRecord.
Bugs fixed:
- Fix the frozen Affine issue.
------------------------------------------------------------------------
SeetaDet 0.3.0 (20191121)
Dragon Minimum Required (Version 0.3.0.dev20191121)
Changes:
Preview Features:
- New algorithm: Mask R-CNN.
- Add MobileNet (V2 and NAS) as backbones.
- Refactor the testing module; multi-GPU testing is supported.
Bugs fixed:
- Remove rotated boxes, use Mask R-CNN instead.
------------------------------------------------------------------------
SeetaDet 0.2.3 (20191101)
Dragon Minimum Required (Version 0.3.0.dev20191021)
Changes:
Preview Features:
- Refactor the API of rotated boxes.
- Simplify the solver by adding LRScheduler.
- Change the ``ITER`` naming to ``STEP``.
Bugs fixed:
- None
------------------------------------------------------------------------
SeetaDet 0.2.2 (20191021)
Dragon Minimum Required (Version 0.3.0.dev20191021)
Changes:
Preview Features:
- Add the dumping of detection results.
Bugs fixed:
- None
------------------------------------------------------------------------
SeetaDet 0.2.1 (20191017)
Dragon Minimum Required (Version 0.3.0.dev20191017)
Changes:
Preview Features:
- Rotated boxes and FPN support for SSD.
- Freeze the graph to speed up inference.
Bugs fixed:
- None
------------------------------------------------------------------------
SeetaDet 0.2.0 (20190929)
Dragon Minimum Required (Version 0.3.0.dev20190929)
Changes:
Preview Features:
- Use SeetaRecord instead of LMDB.
- Flatten the implementation of layers.
Bugs fixed:
- None
------------------------------------------------------------------------
SeetaDet 0.1.2 (20190723)
Dragon Minimum Required (Version 0.3.0.0)
Changes:
Preview Features:
- Change to the PEP8 code style.
- Adapt to the new Dragon API.
Bugs fixed:
- None
------------------------------------------------------------------------
SeetaDet 0.1.1 (20190409)
Dragon Minimum Required (Version 0.3.0.0)
Changes:
Preview Features:
- Add RandomCrop/RandomPad for ScaleJittering.
- Add ResNet18/ResNet34/AirNet for R-CNN and RetinaNet.
- Use the C++-implemented decoder for RetinaNet instead.
Bugs fixed:
- None
------------------------------------------------------------------------
SeetaDet 0.1.0 (20190314)
Dragon Minimum Required (Version 0.3.0.0)
Changes:
Preview Features:
- Init repository.
Bugs fixed:
- None
Copyright (c) 2017, SeetaTech, Co.,Ltd. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# Benchmark and Model Zoo
## Introduction
### ImageNet Pretrained Models
#### ResNet Models
- [R-50.pkl](https://dragon.seetatech.com/download/models/seetadet/imagenet/R-50.pkl)
- [R-101.pkl](https://dragon.seetatech.com/download/models/seetadet/imagenet/R-101.pkl)
#### VGG Models
- [VGG16.SSD.pkl](https://dragon.seetatech.com/download/models/seetadet/imagenet/VGG16.SSD.pkl)
#### MobileNet Models
- [MobileNetV2.pkl](https://dragon.seetatech.com/download/models/seetadet/imagenet/MobileNetV2.pkl)
- [ProxylessMobile.pkl](https://dragon.seetatech.com/download/models/seetadet/imagenet/ProxylessMobile.pkl)
#### AirNet Models
- [AirNet.pkl](https://dragon.seetatech.com/download/models/seetadet/imagenet/AirNet.pkl)
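The pretrained weights are distributed as ``.pkl`` files. As a rough sketch for inspecting one locally (this assumes an ordinary pickle file holding a name-to-array dictionary, which may differ from the actual serialization):

```python
import pickle

# Path to a downloaded backbone checkpoint (example path).
with open('R-50.pkl', 'rb') as f:
    weights = pickle.load(f)

# List each parameter name and its shape (values are expected to be arrays).
for name, value in weights.items():
    print(name, getattr(value, 'shape', type(value)))
```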
## Baselines
### Faster R-CNN
Please refer to [Faster R-CNN](configs/faster_rcnn) for details.
### Mask R-CNN
Please refer to [Mask R-CNN](configs/mask_rcnn) for details.
### RetinaNet
Please refer to [RetinaNet](configs/retinanet) for details.
### SSD
Please refer to [SSD](configs/ssd) for details.
# SeetaDet
## WHAT's SeetaDet?
SeetaDet is a platform implementing popular object detection algorithms,
including the R-CNN series, SSD, and RetinaNet.
We have achieved the same or higher performance than the baselines reported in the original papers.
This repository is based on [seeta-dragon](https://github.com/seetaresearch/dragon),
while the code follows the torch style.
The torch-style code helps us simplify the hierarchical pipeline of modern detection.
## Requirements
seeta-dragon >= 0.3.0.dev20201014
## Installation
### Build From Source
If you prefer to develop modules as well as run experiments,
the following command builds the package but does not install it to ***site-packages***:
```bash
cd seetadet && python setup.py build
```
### Install From Source
Clone this repository to the local disk and install:
```bash
cd seetadet && python setup.py install
```
### Install From Git
You can also install it from the remote repository:
......@@ -45,16 +40,16 @@ pip install git+https://gitlab.seetatech.com/seetaresearch/seetadet.git@master
## Quick Start
### Train a detection model
```bash
cd tools
python train.py --cfg <MODEL_YAML>
```
We have provided default YAML examples in [configs](configs).
### Test a detection model
```bash
cd tools
......@@ -64,42 +59,33 @@ Or
```bash
cd tools
python test_all.py --cfg <MODEL_YAML> --exp_dir <EXP_DIR> --last 1
```
### Export a detection model to ONNX
```bash
cd tools
python export.py --cfg <MODEL_YAML> --exp_dir <EXP_DIR> --iter <ITERATION>
```
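The exported graph can then be run with any ONNX-compatible runtime. Below is a minimal sketch using ``onnxruntime``; the single image input, its shape, and the output layout are assumptions for illustration rather than the exporter's actual contract, and real preprocessing should follow the ``PIXEL_MEANS``/``PIXEL_STDS`` and ``SCALES`` of the training YAML:

```python
import numpy as np
import onnxruntime as ort

# Example path to the exported graph.
sess = ort.InferenceSession('model.onnx')

# A fake preprocessed image batch in NCHW layout (assumed input shape).
image = np.random.rand(1, 3, 800, 1333).astype('float32')

input_name = sess.get_inputs()[0].name
outputs = sess.run(None, {input_name: image})
for meta, out in zip(sess.get_outputs(), outputs):
    print(meta.name, out.shape)
```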
## Benchmark and Model Zoo
Results and models are available in the [Model Zoo](MODEL_ZOO.md).
### Supported Backbones
- [ResNet](MODEL_ZOO.md#resnet-models)
- [VGG](MODEL_ZOO.md#vgg-models)
- [MobileNet](MODEL_ZOO.md#mobilenet-models)
- [AirNet](MODEL_ZOO.md#airnet-models)
### Supported Algorithms
- [Faster R-CNN](configs/faster_rcnn)
- [Mask R-CNN](configs/mask_rcnn)
- [SSD](configs/ssd)
- [RetinaNet](configs/retinanet)
## License
[BSD 2-Clause license](LICENSE)
# Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks
## Introduction
```
@article{Ren_2017,
title={Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks},
journal={IEEE Transactions on Pattern Analysis and Machine Intelligence},
publisher={Institute of Electrical and Electronics Engineers (IEEE)},
author={Ren, Shaoqing and He, Kaiming and Girshick, Ross and Sun, Jian},
year={2017},
month={Jun},
}
```
## COCO Object Detection Baselines
| Model | Lr sched | Infer time (s/im) | box AP | Download |
| :---: | :------: | :---------------: | :----: | :------: |
| [R-50-FPN-800](coco_faster_rcnn_R-50-FPN_800_1x.yml) | 1x | 0.046 | 38.3 | [model](https://dragon.seetatech.com/download/models/seetadet/faster_rcnn/coco_faster_rcnn_R-50-FPN_800_1x/model_final.pkl) |
| [R-50-FPN-800](coco_faster_rcnn_R-50-FPN_800_2x.yml) | 2x | 0.046 | 39.7 | [model](https://dragon.seetatech.com/download/models/seetadet/faster_rcnn/coco_faster_rcnn_R-50-FPN_800_2x/model_final.pkl) |
## Pascal VOC Object Detection Baselines
| Model | Infer time (s/im) | AP@0.5 | Download |
| :---: | :---------------: | :----: | :------: |
| [R-50-FPN-640](voc_faster_rcnn_R-50-FPN_640.yml) | 0.030 | 80.8 | [model](https://dragon.seetatech.com/download/models/seetadet/faster_rcnn/voc_faster_rcnn_R-50-FPN_640_1x/model_final.pkl) |
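In the tables above, the ``1x`` and ``2x`` schedules differ only in the number of solver steps (90k vs. 180k in the YAMLs below). With 8 GPUs and 2 images per GPU, each step consumes 16 images; a quick back-of-the-envelope check of the schedule length (the epoch count is approximate):

```python
# Rough schedule arithmetic for the COCO Faster R-CNN configs below.
num_gpus = 8
ims_per_gpu = 2
effective_batch = num_gpus * ims_per_gpu  # 16 images per step

for name, max_steps in [('1x', 90000), ('2x', 180000)]:
    images_seen = effective_batch * max_steps
    # trainval35k holds roughly 115k images (approximate figure).
    epochs = images_seen / 115000.0
    print(name, images_seen, round(epochs, 1))  # ~12.5 epochs for 1x, ~25 for 2x
```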
NUM_GPUS: 8
VIS: False
ENABLE_TENSOR_BOARD: False
PIXEL_STDS: [57.375, 57.12, 58.395]
PIXEL_MEANS: [103.53, 116.28, 123.675]
MODEL:
TYPE: faster_rcnn
BACKBONE: resnet50.fpn
CLASSES: ['__background__',
'person', 'bicycle', 'car', 'motorcycle', 'airplane',
'bus', 'train', 'truck', 'boat', 'traffic light',
......@@ -19,30 +19,28 @@ MODEL:
'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors',
'teddy bear', 'hair drier', 'toothbrush']
NUM_CLASSES: 81
SOLVER:
BASE_LR: 0.02
LR_POLICY: steps_with_decay
DECAY_STEPS: [60000, 80000]
MAX_STEPS: 90000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: coco_faster_rcnn_R-50-FPN_800_1x
FRCNN:
ROI_XFORM_METHOD: RoIAlign
BATCH_SIZE: 512
ROI_XFORM_RESOLUTION: 7
TRAIN:
WEIGHTS: '/model/R-50.pkl'
DATASET: '/data/coco_2014_trainval35k'
IMS_PER_BATCH: 2
BATCH_SIZE: 512
SCALES: [640, 672, 704, 736, 768, 800]
MAX_SIZE: 1333
USE_DIFF: False # Do not use crowd objects
TEST:
DATASET: '/data/coco_2014_minival'
JSON_FILE: '/data/instances_minival2014.json'
PROTOCOL: 'coco'
IMS_PER_BATCH: 1
SCALES: [800]
MAX_SIZE: 1333
NMS: 0.5
RPN_POST_NMS_TOP_N: 1000
NUM_GPUS: 8
VIS: False
ENABLE_TENSOR_BOARD: False
PIXEL_STDS: [57.375, 57.12, 58.395]
PIXEL_MEANS: [103.53, 116.28, 123.675]
MODEL:
TYPE: faster_rcnn
BACKBONE: resnet50.fpn
CLASSES: ['__background__',
'person', 'bicycle', 'car', 'motorcycle', 'airplane',
'bus', 'train', 'truck', 'boat', 'traffic light',
......@@ -19,29 +19,28 @@ MODEL:
'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors',
'teddy bear', 'hair drier', 'toothbrush']
NUM_CLASSES: 81
SOLVER:
BASE_LR: 0.02
LR_POLICY: steps_with_decay
DECAY_STEPS: [120000, 160000]
MAX_STEPS: 180000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: coco_faster_rcnn_R-50-FPN_800_2x
FRCNN:
ROI_XFORM_METHOD: RoIAlign
BATCH_SIZE: 512
ROI_XFORM_RESOLUTION: 7
TRAIN:
WEIGHTS: '/model/R-50.pkl'
DATASET: '/data/coco_2014_trainval35k'
IMS_PER_BATCH: 2
BATCH_SIZE: 512
SCALES: [640, 672, 704, 736, 768, 800]
MAX_SIZE: 1333
USE_DIFF: False # Do not use crowd objects
TEST:
DATASET: '/data/coco_2014_minival'
JSON_FILE: '/data/instances_minival2014.json'
PROTOCOL: 'coco'
IMS_PER_BATCH: 1
SCALES: [800]
MAX_SIZE: 1333
NMS: 0.5
RPN_POST_NMS_TOP_N: 1000
NUM_GPUS: 1
VIS: False
ENABLE_TENSOR_BOARD: False
PIXEL_STDS: [57.375, 57.12, 58.395]
PIXEL_MEANS: [103.53, 116.28, 123.675]
MODEL:
TYPE: faster_rcnn
BACKBONE: resnet50.fpn
......@@ -10,27 +10,26 @@ MODEL:
'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'pottedplant',
'sheep', 'sofa', 'train', 'tvmonitor']
NUM_CLASSES: 21
FRCNN:
BATCH_SIZE: 128
ROI_XFORM_RESOLUTION: 7
SOLVER:
BASE_LR: 0.002
DECAY_STEPS: [80000, 100000]
MAX_STEPS: 120000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: voc_faster_rcnn_R-50-FPN_640
TRAIN:
WEIGHTS: '/model/R-50.pkl'
DATASET: '/data/voc_0712_trainval'
IMS_PER_BATCH: 2
BATCH_SIZE: 128
SCALES: [480, 512, 544, 576, 608, 640]
MAX_SIZE: 1066
USE_COLOR_JITTER: True
TEST:
DATASET: '/data/voc_2007_test'
PROTOCOL: 'voc2007'
IMS_PER_BATCH: 1
SCALES: [640]
MAX_SIZE: 1066
NMS: 0.45
RPN_POST_NMS_TOP_N: 1000
NUM_GPUS: 1
VIS: False
ENABLE_TENSOR_BOARD: False
MODEL:
TYPE: faster_rcnn
BACKBONE: vgg16.c4
CLASSES: ['__background__',
'aeroplane', 'bicycle', 'bird', 'boat',
'bottle', 'bus', 'car', 'cat', 'chair',
'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'pottedplant',
'sheep', 'sofa', 'train', 'tvmonitor']
NUM_CLASSES: 21
SOLVER:
BASE_LR: 0.001
WEIGHT_DECAY: 0.0005
DECAY_STEPS: [100000, 140000]
MAX_STEPS: 140000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: voc_faster_rcnn
RPN:
STRIDES: [16]
SCALES: [8, 16, 32] # RField: [128, 256, 512]
ASPECT_RATIOS: [0.5, 1.0, 2.0]
FRCNN:
ROI_XFORM_METHOD: RoIPool
ROI_XFORM_RESOLUTION: 7
MLP_HEAD_DIM: 4096
TRAIN:
WEIGHTS: '/model/VGG16.RCNN.pth'
DATASET: '/data/voc_0712_trainval'
IMS_PER_BATCH: 2
BATCH_SIZE: 128
SCALES: [600]
MAX_SIZE: 1000
RPN_MIN_SIZE: 16
TEST:
DATASET: '/data/voc_2007_test'
PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
SCALES: [600]
MAX_SIZE: 1000
RPN_MIN_SIZE: 16
NMS: 0.45
RPN_POST_NMS_TOP_N: 300
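The ``RPN.SCALES`` comment in the VGG16 config above ("RField: [128, 256, 512]") comes from multiplying each scale by the single stride of 16, which gives the base anchor sizes; the aspect ratios then reshape each anchor while keeping its area. A small sketch of that arithmetic (illustrative only, not the exact anchor generator):

```python
# Reproduce the "RField: [128, 256, 512]" comment from the RPN settings above.
stride = 16
scales = [8, 16, 32]
aspect_ratios = [0.5, 1.0, 2.0]

for scale in scales:
    base = scale * stride  # 128, 256, 512
    for ar in aspect_ratios:
        # Change the aspect ratio while keeping the anchor area constant.
        w = base / ar ** 0.5
        h = base * ar ** 0.5
        print(base, ar, round(w, 1), round(h, 1))
```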
# Mask R-CNN
## Introduction
```
@article{He_2017,
title={Mask R-CNN},
journal={2017 IEEE International Conference on Computer Vision (ICCV)},
publisher={IEEE},
author={He, Kaiming and Gkioxari, Georgia and Dollar, Piotr and Girshick, Ross},
year={2017},
month={Oct}
}
```
## COCO Instance Segmentation Baselines
| Model | Lr sched | Infer time (s/im) | box AP | mask AP | Download |
| :---: | :------: | :---------------: | :----: | :-----: | :------: |
| [R-50-FPN-800](coco_mask_rcnn_R-50-FPN_800_1x.yml) | 1x | 0.056 | 39.2 | 34.8 | [model](https://dragon.seetatech.com/download/models/seetadet/mask_rcnn/coco_mask_rcnn_R-50-FPN_800_1x/model_final.pkl) |
| [R-50-FPN-800](coco_mask_rcnn_R-50-FPN_800_2x.yml) | 2x | 0.056 | 41.4 | 36.5 | [model](https://dragon.seetatech.com/download/models/seetadet/mask_rcnn/coco_mask_rcnn_R-50-FPN_800_2x/model_final.pkl) |
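In the configs below, RoIAlign pools each proposal to 7x7 for the box head and 14x14 for the mask head (``ROI_XFORM_RESOLUTION``). As an illustration of that pooling step, here is a sketch using ``torchvision.ops.roi_align``; SeetaDet uses its own Dragon-based operator, so torchvision is only a stand-in for demonstration:

```python
import torch
from torchvision.ops import roi_align

# A fake feature map with stride 16 and two RoIs in image coordinates.
features = torch.randn(1, 256, 50, 50)
rois = torch.tensor([[0, 32.0, 32.0, 160.0, 160.0],
                     [0, 64.0, 48.0, 320.0, 240.0]])  # (batch_idx, x1, y1, x2, y2)

box_feats = roi_align(features, rois, output_size=(7, 7), spatial_scale=1.0 / 16)
mask_feats = roi_align(features, rois, output_size=(14, 14), spatial_scale=1.0 / 16)
print(box_feats.shape, mask_feats.shape)  # (2, 256, 7, 7) and (2, 256, 14, 14)
```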
NUM_GPUS: 8
VIS: False
ENABLE_TENSOR_BOARD: False
PIXEL_STDS: [57.375, 57.12, 58.395]
PIXEL_MEANS: [103.53, 116.28, 123.675]
MODEL:
TYPE: mask_rcnn
BACKBONE: resnet50.fpn
CLASSES: ['__background__',
'person', 'bicycle', 'car', 'motorcycle', 'airplane',
'bus', 'train', 'truck', 'boat', 'traffic light',
......@@ -19,25 +19,22 @@ MODEL:
'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors',
'teddy bear', 'hair drier', 'toothbrush']
NUM_CLASSES: 81
SOLVER:
BASE_LR: 0.02
DECAY_STEPS: [60000, 80000]
MAX_STEPS: 90000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: coco_mask_rcnn_R-50-FPN_800_1x
FRCNN:
ROI_XFORM_METHOD: RoIAlign
BATCH_SIZE: 512
ROI_XFORM_RESOLUTION: 7
MRCNN:
ROI_XFORM_METHOD: RoIAlign
ROI_XFORM_RESOLUTION: 14
TRAIN:
WEIGHTS: '/model/R-50.pkl'
DATASET: '/data/coco_2014_trainval35k'
IMS_PER_BATCH: 2
BATCH_SIZE: 512
SCALES: [640, 672, 704, 736, 768, 800]
MAX_SIZE: 1333
USE_DIFF: False # Do not use crowd objects
TEST:
......@@ -47,5 +44,3 @@ TEST:
SCALES: [800]
MAX_SIZE: 1333
NMS: 0.5
RPN_POST_NMS_TOP_N: 1000
NUM_GPUS: 8
VIS: False
ENABLE_TENSOR_BOARD: False
PIXEL_STDS: [57.375, 57.12, 58.395]
PIXEL_MEANS: [103.53, 116.28, 123.675]
MODEL:
TYPE: mask_rcnn
BACKBONE: resnet50.fpn
CLASSES: ['__background__',
'person', 'bicycle', 'car', 'motorcycle', 'airplane',
'bus', 'train', 'truck', 'boat', 'traffic light',
......@@ -19,25 +19,22 @@ MODEL:
'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors',
'teddy bear', 'hair drier', 'toothbrush']
NUM_CLASSES: 81
SOLVER:
BASE_LR: 0.02
DECAY_STEPS: [120000, 160000]
MAX_STEPS: 180000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: coco_mask_rcnn_R-50-FPN_800_2x
FRCNN:
ROI_XFORM_METHOD: RoIAlign
BATCH_SIZE: 512
ROI_XFORM_RESOLUTION: 7
MRCNN:
ROI_XFORM_METHOD: RoIAlign
ROI_XFORM_RESOLUTION: 14
TRAIN:
WEIGHTS: '/model/R-50.pkl'
DATASET: '/data/coco_2014_trainval35k'
IMS_PER_BATCH: 2
BATCH_SIZE: 512
SCALES: [640, 672, 704, 736, 768, 800]
MAX_SIZE: 1333
USE_DIFF: False # Do not use crowd objects
TEST:
......@@ -47,4 +44,3 @@ TEST:
SCALES: [800]
MAX_SIZE: 1333
NMS: 0.5
RPN_POST_NMS_TOP_N: 1000
# Focal Loss for Dense Object Detection
## Introduction
```
@inproceedings{lin2017focal,
title={Focal loss for dense object detection},
author={Lin, Tsung-Yi and Goyal, Priya and Girshick, Ross and He, Kaiming and Doll{\'a}r, Piotr},
booktitle={Proceedings of the IEEE international conference on computer vision},
year={2017}
}
```
## COCO Object Detection Baselines
| Model | Lr sched | Infer time (s/im) | box AP | Download |
| :---: | :------: | :---------------: | :----: | :------: |
| [R-50-FPN-416](coco_retinanet_R-50-FPN_416_6x.yml) | 6x | 0.019 | 34.4 | [model](https://dragon.seetatech.com/download/models/seetadet/retinanet/coco_retinanet_R-50-FPN_416_6x/model_final.pkl) |
| [R-50-FPN-512](coco_retinanet_R-50-FPN_512_6x.yml) | 6x | 0.022 | 36.4 | [model](https://dragon.seetatech.com/download/models/seetadet/retinanet/coco_retinanet_R-50-FPN_512_6x/model_final.pkl) |
| [R-50-FPN-800](coco_retinanet_R-50-FPN_800_1x.yml) | 1x | 0.051 | 37.4 | [model](https://dragon.seetatech.com/download/models/seetadet/retinanet/coco_retinanet_R-50-FPN_800_1x/model_final.pkl) |
| [R-50-FPN-800](coco_retinanet_R-50-FPN_800_2x.yml) | 2x | 0.051 | 39.1 | [model](https://dragon.seetatech.com/download/models/seetadet/retinanet/coco_retinanet_R-50-FPN_800_2x/model_final.pkl) |
## Pascal VOC Object Detection Baselines
| Model | Infer time (s/im) | AP@0.5 | Download |
| :---: | :---------------: | :----: | :------: |
| [R-50-FPN-416](voc_retinanet_R-50-FPN_416.yml) | 0.015 | 82.3 | [model](https://dragon.seetatech.com/download/models/seetadet/retinanet/voc_retinanet_R-50-FPN_416/model_final.pkl) |
| [R-50-FPN-512](voc_retinanet_R-50-FPN_512.yml) | 0.017 | 83.0 | [model](https://dragon.seetatech.com/download/models/seetadet/retinanet/voc_retinanet_R-50-FPN_512/model_final.pkl) |
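The configs below attach the RetinaNet head to FPN levels 3 through 7 (``FPN.RPN_MIN_LEVEL``/``RPN_MAX_LEVEL``). Assuming the usual convention that level *l* corresponds to a feature stride of 2^l, that spans strides 8 to 128:

```python
# FPN level -> feature stride, assuming the common 2**level convention.
min_level, max_level = 3, 7
for level in range(min_level, max_level + 1):
    print('P%d' % level, '-> stride', 2 ** level)  # P3 -> 8, ..., P7 -> 128
```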
NUM_GPUS: 8
PIXEL_STDS: [57.375, 57.12, 58.395]
PIXEL_MEANS: [103.53, 116.28, 123.675]
MODEL:
TYPE: retinanet
BACKBONE: resnet50.fpn
CLASSES: ['__background__',
'person', 'bicycle', 'car', 'motorcycle', 'airplane',
'bus', 'train', 'truck', 'boat', 'traffic light',
'fire hydrant', 'stop sign', 'parking meter', 'bench',
'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant',
'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife',
'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli',
'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop',
'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors',
'teddy bear', 'hair drier', 'toothbrush']
FPN:
RPN_MIN_LEVEL: 3
RPN_MAX_LEVEL: 7
SOLVER:
BASE_LR: 0.01
LR_POLICY: steps_with_decay
DECAY_STEPS: [90000, 120000]
MAX_STEPS: 135000
SNAPSHOT_EVERY: 2500
SNAPSHOT_PREFIX: coco_retinanet_R-50-FPN_416_6x
PIPELINE:
TYPE: 'ssd'
TRAIN:
WEIGHTS: '/model/R-50.pkl'
DATASET: '/data/coco_2014_trainval35k'
IMS_PER_BATCH: 8
SCALES: [416]
USE_DIFF: False # Do not use crowd objects
TEST:
DATASET: '/data/coco_2014_minival'
JSON_FILE: '/data/instances_minival2014.json'
PROTOCOL: 'coco'
IMS_PER_BATCH: 1
SCALES: [416]
NMS: 0.5
NUM_GPUS: 8
PIXEL_STDS: [57.375, 57.12, 58.395]
PIXEL_MEANS: [103.53, 116.28, 123.675]
MODEL:
TYPE: retinanet
BACKBONE: resnet50.fpn
CLASSES: ['__background__',
'person', 'bicycle', 'car', 'motorcycle', 'airplane',
'bus', 'train', 'truck', 'boat', 'traffic light',
'fire hydrant', 'stop sign', 'parking meter', 'bench',
'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant',
'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife',
'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli',
'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop',
'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors',
'teddy bear', 'hair drier', 'toothbrush']
FPN:
RPN_MIN_LEVEL: 3
RPN_MAX_LEVEL: 7
SOLVER:
BASE_LR: 0.01
LR_POLICY: steps_with_decay
DECAY_STEPS: [90000, 120000]
MAX_STEPS: 135000
SNAPSHOT_EVERY: 2500
SNAPSHOT_PREFIX: coco_retinanet_R-50-FPN_512_6x
PIPELINE:
TYPE: 'ssd'
TRAIN:
WEIGHTS: '/model/R-50.pkl'
DATASET: '/data/coco_2014_trainval35k'
IMS_PER_BATCH: 8
SCALES: [512]
USE_DIFF: False # Do not use crowd objects
TEST:
DATASET: '/data/coco_2014_minival'
JSON_FILE: '/data/instances_minival2014.json'
PROTOCOL: 'coco'
IMS_PER_BATCH: 1
SCALES: [512]
NMS: 0.5
NUM_GPUS: 8
PIXEL_STDS: [57.375, 57.12, 58.395]
PIXEL_MEANS: [103.53, 116.28, 123.675]
MODEL:
TYPE: retinanet
BACKBONE: resnet50.fpn
CLASSES: ['__background__',
'person', 'bicycle', 'car', 'motorcycle', 'airplane',
'bus', 'train', 'truck', 'boat', 'traffic light',
'fire hydrant', 'stop sign', 'parking meter', 'bench',
'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant',
'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife',
'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli',
'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop',
'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors',
'teddy bear', 'hair drier', 'toothbrush']
FPN:
RPN_MIN_LEVEL: 3
RPN_MAX_LEVEL: 7
SOLVER:
BASE_LR: 0.01
LR_POLICY: steps_with_decay
DECAY_STEPS: [60000, 80000]
MAX_STEPS: 90000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: coco_retinanet_R-50-FPN_800_1x
TRAIN:
WEIGHTS: '/model/R-50.pkl'
DATASET: '/data/coco_2014_trainval35k'
IMS_PER_BATCH: 2
SCALES: [640, 672, 704, 736, 768, 800]
MAX_SIZE: 1333
USE_DIFF: False # Do not use crowd objects
TEST:
DATASET: '/data/coco_2014_minival'
JSON_FILE: '/data/instances_minival2014.json'
PROTOCOL: 'coco'
IMS_PER_BATCH: 1
SCALES: [800]
MAX_SIZE: 1333
NMS: 0.5
NUM_GPUS: 8
PIXEL_STDS: [57.375, 57.12, 58.395]
PIXEL_MEANS: [103.53, 116.28, 123.675]
MODEL:
TYPE: retinanet
BACKBONE: resnet50.fpn
......@@ -19,28 +19,28 @@ MODEL:
'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors',
'teddy bear', 'hair drier', 'toothbrush']
NUM_CLASSES: 81
FPN:
RPN_MIN_LEVEL: 3
RPN_MAX_LEVEL: 7
SOLVER:
BASE_LR: 0.01
LR_POLICY: steps_with_decay
DECAY_STEPS: [120000, 160000]
MAX_STEPS: 180000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: coco_retinanet_R-50-FPN_800_2x
TRAIN:
WEIGHTS: '/model/R-50.pkl'
DATASET: '/data/coco_2014_trainval35k'
IMS_PER_BATCH: 2
SCALES: [640, 672, 704, 736, 768, 800]
MAX_SIZE: 1333
USE_DIFF: False # Do not use crowd objects
TEST:
DATASET: '/data/coco_2014_minival'
JSON_FILE: '/data/instances_minival2014.json'
PROTOCOL: 'coco'
IMS_PER_BATCH: 1
SCALES: [800]
MAX_SIZE: 1333
NMS: 0.5
NUM_GPUS: 1
VIS: False
VIS_ON_FILE: False
PIXEL_STDS: [57.375, 57.12, 58.395]
PIXEL_MEANS: [103.53, 116.28, 123.675]
MODEL:
TYPE: retinanet
BACKBONE: resnet50.fpn
CLASSES: ['__background__',
'aeroplane', 'bicycle', 'bird', 'boat',
'bottle', 'bus', 'car', 'cat', 'chair',
'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'pottedplant',
'sheep', 'sofa', 'train', 'tvmonitor']
NUM_CLASSES: 21
FPN:
RPN_MIN_LEVEL: 3
RPN_MAX_LEVEL: 7
RETINANET:
NUM_CONVS: 2
SOLVER:
BASE_LR: 0.01
DECAY_STEPS: [80000, 100000]
MAX_STEPS: 120000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: voc_retinanet_R-50-FPN_416
PIPELINE:
TYPE: 'ssd'
TRAIN:
WEIGHTS: '/model/R-50.pkl'
DATASET: '/data/voc_0712_trainval'
IMS_PER_BATCH: 16
SCALES: [416]
RANDOM_SCALES: [0.25, 1.0]
USE_COLOR_JITTER: True
TEST:
DATASET: '/data/voc_2007_test'
PROTOCOL: 'voc2007'
IMS_PER_BATCH: 1
SCALES: [416]
NMS: 0.45
RETINANET_PRE_NMS_TOP_N: 1000
NUM_GPUS: 2
PIXEL_STDS: [57.375, 57.12, 58.395]
PIXEL_MEANS: [103.53, 116.28, 123.675]
MODEL:
TYPE: retinanet
BACKBONE: resnet50.fpn
CLASSES: ['__background__',
'aeroplane', 'bicycle', 'bird', 'boat',
'bottle', 'bus', 'car', 'cat', 'chair',
'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'pottedplant',
'sheep', 'sofa', 'train', 'tvmonitor']
NUM_CLASSES: 21
FPN:
RPN_MIN_LEVEL: 3
RPN_MAX_LEVEL: 7
RETINANET:
NUM_CONVS: 2
SOLVER:
BASE_LR: 0.01
DECAY_STEPS: [80000, 100000]
MAX_STEPS: 120000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: voc_retinanet_R-50-FPN_512
PIPELINE:
TYPE: 'ssd'
TRAIN:
WEIGHTS: '/model/R-50.pkl'
DATASET: '/data/voc_0712_trainval'
IMS_PER_BATCH: 8
SCALES: [512]
RANDOM_SCALES: [0.25, 1.0]
USE_COLOR_JITTER: True
TEST:
DATASET: '/data/voc_2007_test'
PROTOCOL: 'voc2007'
IMS_PER_BATCH: 1
SCALES: [512]
NMS: 0.45
RETINANET_PRE_NMS_TOP_N: 1000
# SSD: Single Shot MultiBox Detector
## Introduction
```
@article{Liu_2016,
title={SSD: Single Shot MultiBox Detector},
journal={ECCV},
author={Liu, Wei and Anguelov, Dragomir and Erhan, Dumitru and Szegedy, Christian and Reed, Scott and Fu, Cheng-Yang and Berg, Alexander C.},
year={2016},
}
```
## Pascal VOC Object Detection Baselines
| Model | Infer time (s/im) | AP@0.5 | Download |
| :---: | :---------------: | :----: | :------: |
| [VGG-16-300](voc_ssd_VGG-16_300.yml) | 0.012 | 78.3 | [model](https://dragon.seetatech.com/download/models/seetadet/ssd/voc_ssd_VGG-16_300/model_final.pkl) |
| [VGG-16-512](voc_ssd_VGG-16_512.yml) | 0.021 | 80.1 | [model](https://dragon.seetatech.com/download/models/seetadet/ssd/voc_ssd_VGG-16_512/model_final.pkl) |
NUM_GPUS: 1
VIS: False
ENABLE_TENSOR_BOARD: False
MODEL:
TYPE: ssd
BACKBONE: airnet.fpn
CLASSES: ['__background__',
'aeroplane', 'bicycle', 'bird', 'boat',
'bottle', 'bus', 'car', 'cat', 'chair',
'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'pottedplant',
'sheep', 'sofa', 'train', 'tvmonitor']
NUM_CLASSES: 21
SOLVER:
BASE_LR: 0.001
DECAY_STEPS: [80000, 100000, 120000]
MAX_STEPS: 120000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: voc_ssd_320
FPN:
RPN_MIN_LEVEL: 3
RPN_MAX_LEVEL: 8
SSD:
NUM_CONVS: 2
MULTIBOX:
STRIDES: [8, 16, 32, 64, 100, 300]
MIN_SIZES: [30, 60, 110, 162, 213, 264]
MAX_SIZES: [60, 110, 162, 213, 264, 315]
ASPECT_RATIOS: [
[1, 2, 0.5],
[1, 2, 0.5, 3, 0.33],
[1, 2, 0.5, 3, 0.33],
[1, 2, 0.5, 3, 0.33],
[1, 2, 0.5],
[1, 2, 0.5],
]
TRAIN:
WEIGHTS: '/model/AirNet.Affine.pth'
DATASET: '/data/voc_0712_trainval'
IMS_PER_BATCH: 32
SCALES: [320]
RANDOM_SCALES: [0.25, 1.00]
USE_COLOR_JITTER: True
TEST:
DATASET: '/data/voc_2007_test'
PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
IMS_PER_BATCH: 8
SCALES: [320]
NMS_TOP_K: 400
NMS: 0.45
SCORE_THRESH: 0.01
DETECTIONS_PER_IM: 200
NUM_GPUS: 1
VIS: False
ENABLE_TENSOR_BOARD: False
MODEL:
TYPE: ssd
BACKBONE: resnet50.fpn
CLASSES: ['__background__',
'aeroplane', 'bicycle', 'bird', 'boat',
'bottle', 'bus', 'car', 'cat', 'chair',
'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'pottedplant',
'sheep', 'sofa', 'train', 'tvmonitor']
NUM_CLASSES: 21
FPN:
RPN_MIN_LEVEL: 3
RPN_MAX_LEVEL: 8
SOLVER:
BASE_LR: 0.001
DECAY_STEPS: [80000, 100000, 120000]
MAX_STEPS: 120000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: voc_ssd_320
SSD:
NUM_CONVS: 2
MULTIBOX:
STRIDES: [8, 16, 32, 64, 100, 300]
MIN_SIZES: [30, 60, 110, 162, 213, 264]
MAX_SIZES: [60, 110, 162, 213, 264, 315]
ASPECT_RATIOS: [
[1, 2, 0.5],
[1, 2, 0.5, 3, 0.33],
[1, 2, 0.5, 3, 0.33],
[1, 2, 0.5, 3, 0.33],
[1, 2, 0.5],
[1, 2, 0.5]
]
TRAIN:
WEIGHTS: '/model/R-50.Affine.pth'
DATASET: '/data/voc_0712_trainval'
IMS_PER_BATCH: 32
SCALES: [320]
RANDOM_SCALES: [0.25, 1.00]
USE_COLOR_JITTER: True
TEST:
DATASET: '/data/voc_2007_test'
PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
IMS_PER_BATCH: 8
SCALES: [320]
NMS_TOP_K: 400
NMS: 0.45
SCORE_THRESH: 0.01
DETECTIONS_PER_IM: 200
NUM_GPUS: 1
VIS: False
ENABLE_TENSOR_BOARD: False
PIXEL_STDS: [1.0, 1.0, 1.0]
PIXEL_MEANS: [103.53, 116.28, 123.675]
MODEL:
TYPE: ssd
BACKBONE: vgg16_reduced_300
COARSEST_STRIDE: 0
CLASSES: ['__background__',
'aeroplane', 'bicycle', 'bird', 'boat',
'bottle', 'bus', 'car', 'cat', 'chair',
'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'pottedplant',
'sheep', 'sofa', 'train', 'tvmonitor']
NUM_CLASSES: 21
SSD:
STRIDES: [8, 16, 32, 64, 100, 300]
ANCHOR_SIZES: [[30, 60],
[60, 110],
[110, 162],
[162, 213],
[213, 264],
[264, 315]]
ASPECT_RATIOS: [[1, 2, 0.5],
[1, 2, 0.5, 3, 0.33],
[1, 2, 0.5, 3, 0.33],
[1, 2, 0.5, 3, 0.33],
[1, 2, 0.5],
[1, 2, 0.5]]
SOLVER:
BASE_LR: 0.001
WEIGHT_DECAY: 0.0005
DECAY_STEPS: [80000, 100000]
MAX_STEPS: 120000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: voc_ssd_VGG-16_300
TRAIN:
WEIGHTS: '/model/VGG16.SSD.pkl'
DATASET: '/data/voc_0712_trainval'
IMS_PER_BATCH: 16
SCALES: [300]
RANDOM_SCALES: [0.25, 1.0]
USE_COLOR_JITTER: True
TEST:
DATASET: '/data/voc_2007_test'
PROTOCOL: 'voc2007'
IMS_PER_BATCH: 1
SCALES: [300]
NMS_TOP_K: 400
NMS: 0.45
SCORE_THRESH: 0.01
DETECTIONS_PER_IM: 200
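The ``SSD.STRIDES``, ``ANCHOR_SIZES``, and ``ASPECT_RATIOS`` entries in the 300-input config above jointly determine how many default boxes are predicted. A rough count, assuming the grid at each level is the input size divided by the stride (rounded up) and one extra box per cell for the larger size, as in the original SSD; the exact rule used here may differ:

```python
import math

# Values copied from the voc_ssd_VGG-16_300 config above.
input_size = 300
strides = [8, 16, 32, 64, 100, 300]
aspect_ratios = [[1, 2, 0.5],
                 [1, 2, 0.5, 3, 0.33],
                 [1, 2, 0.5, 3, 0.33],
                 [1, 2, 0.5, 3, 0.33],
                 [1, 2, 0.5],
                 [1, 2, 0.5]]

total = 0
for stride, ratios in zip(strides, aspect_ratios):
    grid = math.ceil(input_size / stride)  # 38, 19, 10, 5, 3, 1
    boxes_per_cell = len(ratios) + 1       # +1 for the max-size box
    level_boxes = grid * grid * boxes_per_cell
    total += level_boxes
    print(stride, grid, boxes_per_cell, level_boxes)

print('total default boxes ~', total)      # 8732 under these assumptions
```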
NUM_GPUS: 2
PIXEL_STDS: [1.0, 1.0, 1.0]
PIXEL_MEANS: [103.53, 116.28, 123.675]
MODEL:
TYPE: ssd
BACKBONE: vgg16_reduced_512
CLASSES: ['__background__',
'aeroplane', 'bicycle', 'bird', 'boat',
'bottle', 'bus', 'car', 'cat', 'chair',
'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'pottedplant',
'sheep', 'sofa', 'train', 'tvmonitor']
SSD:
STRIDES: [8, 16, 32, 64, 128, 256, 512]
ANCHOR_SIZES: [[35.84, 76.8],
[76.8, 153.6],
[153.6, 230.4],
[230.4, 307.2],
[307.2, 384.0],
[384.0, 460.8],
[460.8, 537.6]]
ASPECT_RATIOS: [[1, 2, 0.5],
[1, 2, 0.5, 3, 0.33],
[1, 2, 0.5, 3, 0.33],
[1, 2, 0.5, 3, 0.33],
[1, 2, 0.5, 3, 0.33],
[1, 2, 0.5],
[1, 2, 0.5]]
SOLVER:
BASE_LR: 0.001
WEIGHT_DECAY: 0.0005
DECAY_STEPS: [80000, 100000]
MAX_STEPS: 120000
SNAPSHOT_EVERY: 5000
SNAPSHOT_PREFIX: voc_ssd_VGG-16_512
TRAIN:
WEIGHTS: '/model/VGG16.SSD.pkl'
DATASET: '/data/voc_0712_trainval'
IMS_PER_BATCH: 8
SCALES: [512]
RANDOM_SCALES: [0.25, 1.0]
USE_COLOR_JITTER: True
TEST:
DATASET: '/data/voc_2007_test'
PROTOCOL: 'voc2007'
IMS_PER_BATCH: 1
SCALES: [512]
NMS: 0.45
SCORE_THRESH: 0.01
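The ``ANCHOR_SIZES`` in the 512-input config above follow the standard SSD size rule: each size is a fixed fraction of the input resolution (7%, then 15% up to 105% in 15% steps), and each level is assigned the pair of its own size and the next one. A quick check of that relationship (the fractions are inferred from the values, not read from the code):

```python
input_size = 512
fractions = [0.07, 0.15, 0.30, 0.45, 0.60, 0.75, 0.90, 1.05]
sizes = [round(f * input_size, 2) for f in fractions]
anchor_sizes = list(zip(sizes, sizes[1:]))
print(anchor_sizes)
# [(35.84, 76.8), (76.8, 153.6), ..., (460.8, 537.6)] - matches the YAML above.
```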
......@@ -7,7 +7,6 @@ template <class Context>
template <typename T>
void NonMaxSuppressionOp<Context>::DoRunWithType() {
int num_selected;
utils::detection::ApplyNMS(
Output(0)->count(),
Output(0)->count(),
......@@ -16,7 +15,6 @@ void NonMaxSuppressionOp<Context>::DoRunWithType() {
Output(0)->template mutable_data<int64_t, CPUContext>(),
num_selected,
ctx());
Output(0)->Reshape({num_selected});
}
......@@ -24,14 +22,13 @@ template <class Context>
void NonMaxSuppressionOp<Context>::RunOnDevice() {
CHECK(Input(0).ndim() == 2 && Input(0).dim(1) == 5)
<< "\nThe dimensions of boxes should be (num_boxes, 5).";
Output(0)->Reshape({Input(0).dim(0)});
DispatchHelper<TensorTypes<float>>::Call(this, Input(0));
}
DEPLOY_CPU(NonMaxSuppression);
DEPLOY_CPU_OPERATOR(NonMaxSuppression);
#ifdef USE_CUDA
DEPLOY_CUDA(NonMaxSuppression);
DEPLOY_CUDA_OPERATOR(NonMaxSuppression);
#endif
OPERATOR_SCHEMA(NonMaxSuppression).NumInputs(1).NumOutputs(1);
......
......@@ -22,7 +22,7 @@ class NonMaxSuppressionOp final : public Operator<Context> {
public:
NonMaxSuppressionOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
iou_threshold_(OpArg<float>("iou_threshold", 0.5f)) {}
iou_threshold_(OP_SINGLE_ARG(float, "iou_threshold", 0.5f)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......
......@@ -10,50 +10,48 @@ template <typename T>
void RetinaNetDecoderOp<Context>::DoRunWithType() {
using BT = float; // DType of BBox
using BC = CPUContext; // Context of BBox
int feat_h, feat_w;
int C = Input(-3).dim(2), A, K;
int total_proposals = 0;
int num_candidates, num_boxes, num_proposals;
auto* batch_scores = Input(-3).template data<T, BC>();
auto* batch_deltas = Input(-2).template data<T, BC>();
auto* im_info = Input(-1).template data<BT, BC>();
auto* y = Output(0)->template mutable_data<BT, BC>();
auto* batch_scores = Input(SCORES).template data<T, Context>();
auto* batch_deltas = Input(DELTAS).template data<T, BC>();
auto* im_info = Input(IMAGE_INFO).template data<BT, BC>();
auto* all_proposals = Output(0)->template mutable_data<BT, BC>();
for (int n = 0; n < num_images_; ++n) {
for (int im_idx = 0; im_idx < num_images_; ++im_idx) {
BT im_h = im_info[0];
BT im_w = im_info[1];
BT im_scale_h = im_info[2];
BT im_scale_w = im_info[2];
if (Input(-1).dim(1) == 4) im_scale_w = im_info[3];
auto* scores = batch_scores + n * Input(-3).stride(0);
auto* deltas = batch_deltas + n * Input(-2).stride(0);
if (Input(IMAGE_INFO).dim(1) == 4) im_scale_w = im_info[3];
CHECK_EQ(strides_.size(), InputSize() - 3)
<< "\nGiven " << strides_.size() << " strides "
<< "and " << InputSize() - 3 << " features";
// Select the top-k candidates as proposals
num_boxes = Input(-3).dim(1);
num_candidates = Input(-3).count(1);
roi_indices_.resize(num_candidates);
num_candidates = 0;
for (int i = 0; i < roi_indices_.size(); ++i)
if (scores[i] > score_thr_) roi_indices_[num_candidates++] = i;
scores_.resize(num_candidates);
for (int i = 0; i < num_candidates; ++i)
scores_[i] = scores[roi_indices_[i]];
num_proposals = std::min(num_candidates, (int)pre_nms_topn_);
utils::math::ArgPartition(
num_candidates, num_proposals, true, scores_.data(), indices_);
for (int i = 0; i < num_proposals; ++i)
auto num_boxes = Input(SCORES).dim(1);
auto num_classes = Input(SCORES).dim(2);
utils::detection::SelectProposals(
Input(SCORES).count(1),
score_thr_,
batch_scores + im_idx * Input(SCORES).stride(0),
roi_scores_,
roi_indices_,
ctx());
auto num_candidates = (int)roi_scores_.size();
auto num_proposals = std::min(num_candidates, (int)pre_nms_topn_);
utils::detection::ArgPartition(
num_candidates, num_proposals, true, roi_scores_.data(), indices_);
scores_.resize(indices_.size());
for (int i = 0; i < num_proposals; ++i) {
scores_[i] = roi_scores_[indices_[i]];
indices_[i] = roi_indices_[indices_[i]];
}
// Decode proposals via anchors
int stride_offset = 0;
for (int i = 0; i < strides_.size(); i++) {
feat_h = Input(i).dim(2);
feat_w = Input(i).dim(3);
K = feat_h * feat_w;
A = int(ratios_.size() * scales_.size());
auto feature_h = Input(i).dim(2);
auto feature_w = Input(i).dim(3);
auto K = feature_h * feature_w;
auto A = int(ratios_.size() * scales_.size());
anchors_.resize((size_t)(A * 4));
utils::detection::GenerateAnchors(
strides_[i],
......@@ -62,35 +60,35 @@ void RetinaNetDecoderOp<Context>::DoRunWithType() {
ratios_.data(),
scales_.data(),
anchors_.data());
utils::detection::GenerateGridAnchors(
utils::detection::GetShiftedAnchors(
num_proposals,
C,
num_classes,
A,
feat_h,
feat_w,
feature_h,
feature_w,
strides_[i],
base_offset,
stride_offset,
anchors_.data(),
indices_.data(),
y);
base_offset += (A * K);
all_proposals);
stride_offset += (A * K);
}
utils::detection::GenerateMCProposals(
utils::detection::GenerateDetections(
num_proposals,
num_boxes,
C,
n,
num_classes,
im_idx,
im_h,
im_w,
im_scale_h,
im_scale_w,
scores,
deltas,
scores_.data(),
batch_deltas + im_idx * Input(DELTAS).stride(0),
indices_.data(),
y);
all_proposals);
total_proposals += num_proposals;
y += (num_proposals * 7);
im_info += Input(-1).dim(1);
all_proposals += (num_proposals * 7);
im_info += Input(IMAGE_INFO).dim(1);
}
Output(0)->Reshape({total_proposals, 7});
......@@ -99,20 +97,20 @@ void RetinaNetDecoderOp<Context>::DoRunWithType() {
template <class Context>
void RetinaNetDecoderOp<Context>::RunOnDevice() {
num_images_ = Input(0).dim(0);
CHECK_EQ(Input(-1).dim(0), num_images_)
<< "\nExcepted " << num_images_ << " groups info, got "
<< Input(-1).dim(0) << ".";
Output(0)->Reshape({num_images_ * pre_nms_topn_, 7});
DispatchHelper<TensorTypes<float>>::Call(this, Input(-3));
DispatchHelper<TensorTypes<float>>::Call(this, Input(SCORES));
}
DEPLOY_CPU(RetinaNetDecoder);
DEPLOY_CPU_OPERATOR(RetinaNetDecoder);
#ifdef USE_CUDA
DEPLOY_CUDA(RetinaNetDecoder);
DEPLOY_CUDA_OPERATOR(RetinaNetDecoder);
#endif
OPERATOR_SCHEMA(RetinaNetDecoder).NumInputs(3, INT_MAX).NumOutputs(1, INT_MAX);
NO_GRADIENT(RetinaNetDecoder);
} // namespace dragon
......@@ -22,11 +22,11 @@ class RetinaNetDecoderOp final : public Operator<Context> {
public:
RetinaNetDecoderOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
strides_(OpArgs<int64_t>("strides")),
ratios_(OpArgs<float>("ratios")),
scales_(OpArgs<float>("scales")),
pre_nms_topn_(OpArg<int64_t>("pre_nms_top_n", 6000)),
score_thr_(OpArg<float>("score_thresh", 0.05f)) {}
strides_(OP_REPEATED_ARG(int64_t, "strides")),
ratios_(OP_REPEATED_ARG(float, "ratios")),
scales_(OP_REPEATED_ARG(float, "scales")),
pre_nms_topn_(OP_SINGLE_ARG(int64_t, "pre_nms_top_n", 6000)),
score_thr_(OP_SINGLE_ARG(float, "score_thresh", 0.05f)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......@@ -34,10 +34,13 @@ class RetinaNetDecoderOp final : public Operator<Context> {
template <typename T>
void DoRunWithType();
enum INPUT_TAGS { SCORES = -3, DELTAS = -2, IMAGE_INFO = -1 };
protected:
float score_thr_;
vec64_t strides_, indices_, roi_indices_;
vector<float> ratios_, scales_, scores_, anchors_;
vector<float> ratios_, scales_, anchors_;
vector<float> scores_, roi_scores_;
int64_t num_images_, pre_nms_topn_;
};
......
......@@ -15,153 +15,81 @@ void RPNDecoderOp<Context>::DoRunWithType() {
int total_rois = 0, num_rois;
int num_candidates, num_proposals;
auto* batch_scores = Input(-3).template data<T, BC>();
auto* batch_deltas = Input(-2).template data<T, BC>();
auto* im_info = Input(-1).template data<BT, BC>();
auto* y = Output(0)->template mutable_data<BT, BC>();
auto* batch_scores = Input(SCORES).template data<T, BC>();
auto* batch_deltas = Input(DELTAS).template data<T, BC>();
auto* im_info = Input(IMAGE_INFO).template data<BT, BC>();
auto* all_rois = Output(0)->template mutable_data<BT, BC>();
for (int n = 0; n < num_images_; ++n) {
for (int im_idx = 0; im_idx < num_images_; ++im_idx) {
const BT im_h = im_info[0];
const BT im_w = im_info[1];
const BT scale = im_info[2];
const BT min_box_h = min_size_ * scale;
const BT min_box_w = min_size_ * scale;
auto* scores = batch_scores + n * Input(-3).stride(0);
auto* deltas = batch_deltas + n * Input(-2).stride(0);
if (strides_.size() == 1) {
// Case 1: single stride
feat_h = Input(0).dim(2);
feat_w = Input(0).dim(3);
auto* scores = batch_scores + im_idx * Input(SCORES).stride(0);
auto* deltas = batch_deltas + im_idx * Input(DELTAS).stride(0);
CHECK_EQ(strides_.size(), InputSize() - 3)
<< "\nGiven " << strides_.size() << " strides "
<< "and " << InputSize() - 3 << " feature inputs";
CHECK_EQ(strides_.size(), scales_.size())
<< "\nGiven " << strides_.size() << " strides "
<< "and " << scales_.size() << " scales";
// Select the top-k candidates as proposals
num_candidates = Input(SCORES).dim(1);
num_proposals = std::min(num_candidates, (int)pre_nms_top_n_);
utils::math::ArgPartition(
num_candidates, num_proposals, true, scores, indices_);
// Decode the candidates
int stride_offset = 0;
proposals_.Reshape({num_proposals, 5});
auto* proposals = proposals_.template mutable_data<BT, BC>();
for (int i = 0; i < strides_.size(); i++) {
feat_h = Input(i).dim(2);
feat_w = Input(i).dim(3);
K = feat_h * feat_w;
A = int(ratios_.size() * scales_.size());
// Select the Top-K candidates as proposals
num_candidates = A * K;
num_proposals = std::min(num_candidates, (int)pre_nms_topn_);
utils::math::ArgPartition(
num_candidates, num_proposals, true, scores, indices_);
// Decode the candidates
A = (int)ratios_.size();
anchors_.resize((size_t)(A * 4));
proposals_.Reshape({num_proposals, 5});
utils::detection::GenerateAnchors(
strides_[0],
strides_[i],
(int)ratios_.size(),
(int)scales_.size(),
1,
ratios_.data(),
scales_.data(),
anchors_.data());
utils::detection::GenerateGridAnchors(
utils::detection::GetShiftedAnchors(
num_proposals,
A,
feat_h,
feat_w,
strides_[0],
0,
strides_[i],
stride_offset,
anchors_.data(),
indices_.data(),
proposals_.template mutable_data<BT, BC>());
utils::detection::GenerateSSProposals(
K,
num_proposals,
im_h,
im_w,
min_box_h,
min_box_w,
scores,
deltas,
indices_.data(),
proposals_.template mutable_data<BT, BC>());
// Sort, NMS and Retrieve
utils::detection::SortProposals(
0,
num_proposals - 1,
num_proposals,
proposals_.template mutable_data<BT, BC>());
utils::detection::ApplyNMS(
num_proposals,
post_nms_topn_,
nms_thr_,
proposals_.template mutable_data<BT, Context>(),
roi_indices_.data(),
num_rois,
ctx());
utils::detection::RetrieveRoIs(
num_rois,
n,
proposals_.template data<BT, BC>(),
roi_indices_.data(),
y);
} else if (strides_.size() > 1) {
// Case 2: multiple strides
CHECK_EQ(strides_.size(), InputSize() - 3)
<< "\nGiven " << strides_.size() << " strides "
<< "and " << InputSize() - 3 << " feature inputs";
CHECK_EQ(strides_.size(), scales_.size())
<< "\nGiven " << strides_.size() << " strides "
<< "and " << scales_.size() << " scales";
// Select the top-k candidates as proposals
num_candidates = Input(-3).dim(1);
num_proposals = std::min(num_candidates, (int)pre_nms_topn_);
utils::math::ArgPartition(
num_candidates, num_proposals, true, scores, indices_);
// Decode the candidates
int base_offset = 0;
proposals_.Reshape({num_proposals, 5});
auto* proposals = proposals_.template mutable_data<BT, BC>();
for (int i = 0; i < strides_.size(); i++) {
feat_h = Input(i).dim(2);
feat_w = Input(i).dim(3);
K = feat_h * feat_w;
A = (int)ratios_.size();
anchors_.resize((size_t)(A * 4));
utils::detection::GenerateAnchors(
strides_[i],
(int)ratios_.size(),
1,
ratios_.data(),
scales_.data(),
anchors_.data());
utils::detection::GenerateGridAnchors(
num_proposals,
A,
feat_h,
feat_w,
strides_[i],
base_offset,
anchors_.data(),
indices_.data(),
proposals);
base_offset += (A * K);
}
utils::detection::GenerateMSProposals(
num_candidates,
num_proposals,
im_h,
im_w,
min_box_h,
min_box_w,
scores,
deltas,
&indices_[0],
proposals);
// Sort, NMS and Retrieve
utils::detection::SortProposals(
0, num_proposals - 1, num_proposals, proposals);
utils::detection::ApplyNMS(
num_proposals,
post_nms_topn_,
nms_thr_,
proposals_.template mutable_data<BT, Context>(),
roi_indices_.data(),
num_rois,
ctx());
utils::detection::RetrieveRoIs(
num_rois, n, proposals, roi_indices_.data(), y);
} else {
LOG(FATAL) << "Excepted at least one stride for proposals.";
stride_offset += (A * K);
}
utils::detection::GenerateProposals(
num_candidates,
num_proposals,
im_h,
im_w,
scores,
deltas,
&indices_[0],
proposals);
// Sort, NMS and Retrieve
utils::detection::SortProposals(
0, num_proposals - 1, num_proposals, proposals);
utils::detection::ApplyNMS(
num_proposals,
post_nms_top_n_,
nms_thr_,
proposals_.template mutable_data<BT, Context>(),
roi_indices_.data(),
num_rois,
ctx());
utils::detection::RetrieveRoIs(
num_rois, im_idx, proposals, roi_indices_.data(), all_rois);
total_rois += num_rois;
y += (num_rois * 5);
im_info += Input(-1).dim(1);
all_rois += (num_rois * 5);
im_info += Input(IMAGE_INFO).dim(1);
}
Output(0)->Reshape({total_rois, 5});
......@@ -202,22 +130,21 @@ void RPNDecoderOp<Context>::DoRunWithType() {
template <class Context>
void RPNDecoderOp<Context>::RunOnDevice() {
num_images_ = Input(0).dim(0);
CHECK_EQ(Input(-1).dim(0), num_images_)
CHECK_EQ(Input(IMAGE_INFO).dim(0), num_images_)
<< "\nExcepted " << num_images_ << " groups info, got "
<< Input(-1).dim(0) << ".";
roi_indices_.resize(post_nms_topn_);
Output(0)->Reshape({num_images_ * post_nms_topn_, 5});
DispatchHelper<TensorTypes<float>>::Call(this, Input(-3));
<< Input(IMAGE_INFO).dim(0) << ".";
roi_indices_.resize(post_nms_top_n_);
Output(0)->Reshape({num_images_ * post_nms_top_n_, 5});
DispatchHelper<TensorTypes<float>>::Call(this, Input(SCORES));
}
DEPLOY_CPU(RPNDecoder);
DEPLOY_CPU_OPERATOR(RPNDecoder);
#ifdef USE_CUDA
DEPLOY_CUDA(RPNDecoder);
DEPLOY_CUDA_OPERATOR(RPNDecoder);
#endif
OPERATOR_SCHEMA(RPNDecoder).NumInputs(3, INT_MAX).NumOutputs(1, INT_MAX);
NO_GRADIENT(RPNDecoder);
} // namespace dragon
......@@ -22,17 +22,16 @@ class RPNDecoderOp final : public Operator<Context> {
public:
RPNDecoderOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
strides_(OpArgs<int64_t>("strides")),
ratios_(OpArgs<float>("ratios")),
scales_(OpArgs<float>("scales")),
pre_nms_topn_(OpArg<int64_t>("pre_nms_top_n", 6000)),
post_nms_topn_(OpArg<int64_t>("post_nms_top_n", 300)),
nms_thr_(OpArg<float>("nms_thresh", 0.7f)),
min_size_(OpArg<int64_t>("min_size", 16)),
min_level_(OpArg<int64_t>("min_level", 2)),
max_level_(OpArg<int64_t>("max_level", 5)),
canonical_level_(OpArg<int64_t>("canonical_level", 4)),
canonical_scale_(OpArg<int64_t>("canonical_scale", 224)) {}
strides_(OP_REPEATED_ARG(int64_t, "strides")),
ratios_(OP_REPEATED_ARG(float, "ratios")),
scales_(OP_REPEATED_ARG(float, "scales")),
pre_nms_top_n_(OP_SINGLE_ARG(int64_t, "pre_nms_top_n", 6000)),
post_nms_top_n_(OP_SINGLE_ARG(int64_t, "post_nms_top_n", 1000)),
nms_thr_(OP_SINGLE_ARG(float, "nms_thresh", 0.7f)),
min_level_(OP_SINGLE_ARG(int64_t, "min_level", 2)),
max_level_(OP_SINGLE_ARG(int64_t, "max_level", 5)),
canonical_level_(OP_SINGLE_ARG(int64_t, "canonical_level", 4)),
canonical_scale_(OP_SINGLE_ARG(int64_t, "canonical_scale", 224)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......@@ -40,11 +39,13 @@ class RPNDecoderOp final : public Operator<Context> {
template <typename T>
void DoRunWithType();
enum INPUT_TAGS { SCORES = -3, DELTAS = -2, IMAGE_INFO = -1 };
protected:
float nms_thr_;
vec64_t strides_, indices_, roi_indices_;
vector<float> ratios_, scales_, scores_, anchors_;
int64_t min_size_, pre_nms_topn_, post_nms_topn_;
int64_t pre_nms_top_n_, post_nms_top_n_;
int64_t num_images_, min_level_, max_level_;
int64_t canonical_level_, canonical_scale_;
Tensor proposals_;
......
......@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Build cxx sources."""
from __future__ import absolute_import
......@@ -16,14 +15,14 @@ from __future__ import division
from __future__ import print_function
import glob
from distutils.core import setup
from dragon.tools import cpp_extension
if cpp_extension.CUDA_HOME is not None and \
cpp_extension._cuda.is_available():
Extension = cpp_extension.CUDAExtension
else:
Extension = cpp_extension.CppExtension
from setuptools import setup
Extension = cpp_extension.CppExtension
if cpp_extension.CUDA_HOME is not None:
if cpp_extension._cuda.is_available():
Extension = cpp_extension.CUDAExtension
def find_sources(*dirs):
......@@ -44,11 +43,12 @@ ext_modules = [
Extension(
name='install.lib.modules._C',
sources=find_sources('**'),
define_macros=[('THRUST_IGNORE_CUB_VERSION_CHECK', None)],
),
]
setup(
name='SeetaDet',
ext_modules=ext_modules,
cmdclass={'build_ext': cpp_extension.BuildExtension}
cmdclass={'build_ext': cpp_extension.BuildExtension},
)
......@@ -47,6 +47,26 @@ void ApplyNMS<float, CPUContext>(
num_keep = count;
}
template <>
void SelectProposals<float, CPUContext>(
const int count,
const float score_thresh,
const float* input_scores,
vector<float>& output_scores,
vector<int64_t>& output_indices,
CPUContext* ctx) {
int num_proposals = 0;
output_indices.resize(count);
for (int i = 0; i < count; ++i) {
if (input_scores[i] > score_thresh) {
output_indices[num_proposals++] = i;
}
}
output_indices.resize(num_proposals);
output_scores.resize(num_proposals);
for (int i = 0; i < num_proposals; ++i) {
output_scores[i] = input_scores[output_indices[i]];
}
}
} // namespace detection
} // namespace utils
......
#ifdef USE_CUDA
#include <dragon/core/context_cuda.h>
#include <dragon/core/workspace.h>
#include <dragon/utils/device/common_cub.h>
#include <dragon/utils/device/common_thrust.h>
#include "detection_utils.h"
namespace dragon {
......@@ -15,6 +18,16 @@ namespace detection {
namespace {
template <typename T>
struct ThresholdFunctor {
ThresholdFunctor(float thresh) : thresh_(thresh) {}
inline __device__ bool operator()(
const thrust::tuple<int64_t, T>& key_val) const {
return thrust::get<1>(key_val) > thresh_;
}
float thresh_;
};
template <typename T>
__device__ bool _CheckIoU(const T* a, const T* b, const float thresh) {
const T x1 = max(a[0], b[0]);
const T y1 = max(a[1], b[1]);
......@@ -72,6 +85,41 @@ __global__ void _NonMaxSuppression(
} // namespace
template <>
void SelectProposals<float, CUDAContext>(
const int count,
const float score_thresh,
const float* in_scores,
vector<float>& out_scores,
vector<int64_t>& out_indices,
CUDAContext* ctx) {
auto* in_indices = ctx->workspace()->template data<int64_t, CUDAContext>(
{count}, "data:1")[0];
auto iter = thrust::make_zip_iterator(
thrust::make_tuple(in_indices, const_cast<float*>(in_scores)));
auto policy = thrust::cuda::par.on(ctx->cuda_stream());
thrust::counting_iterator<int64_t> offset(0);
thrust::copy(policy, offset, offset + count, in_indices);
auto last = thrust::partition(
policy, iter, iter + count, ThresholdFunctor<float>(score_thresh));
size_t num_proposals = last - iter;
out_scores.resize(num_proposals);
out_indices.resize(num_proposals);
CUDA_CHECK(cudaMemcpyAsync(
out_scores.data(),
in_scores,
num_proposals * sizeof(float),
cudaMemcpyDeviceToHost,
ctx->cuda_stream()));
CUDA_CHECK(cudaMemcpyAsync(
out_indices.data(),
in_indices,
num_proposals * sizeof(int64_t),
cudaMemcpyDeviceToHost,
ctx->cuda_stream()));
ctx->FinishDeviceComputation();
}
template <>
void ApplyNMS<float, CUDAContext>(
const int num_boxes,
const int max_keeps,
......@@ -83,7 +131,8 @@ void ApplyNMS<float, CUDAContext>(
const int num_blocks = DIV_UP(num_boxes, NUM_THREADS);
vector<uint64_t> mask_host(num_boxes * num_blocks);
auto* mask_dev = (uint64_t*)ctx->New(mask_host.size() * sizeof(uint64_t));
auto* mask_dev = (uint64_t*)ctx->workspace()->data<CUDAContext>(
{mask_host.size() * sizeof(uint64_t)}, "data:1")[0];
_NonMaxSuppression<<<
dim3(num_blocks, num_blocks),
......@@ -115,9 +164,7 @@ void ApplyNMS<float, CUDAContext>(
if (num_selected == max_keeps) break;
}
}
num_keep = num_selected;
ctx->Delete(mask_dev);
}
} // namespace detection
......
......@@ -24,45 +24,37 @@ namespace detection {
#define ROUND(x) ((int)((x) + (T)0.5))
/*!
* Box API
* Functional API
*/
template <typename T>
inline int FilterBoxes(
const T dx,
const T dy,
const T d_log_w,
const T d_log_h,
const T im_w,
const T im_h,
const T min_box_w,
const T min_box_h,
T* bbox) {
const T w = bbox[2] - bbox[0] + 1;
const T h = bbox[3] - bbox[1] + 1;
const T ctr_x = bbox[0] + (T)0.5 * w;
const T ctr_y = bbox[1] + (T)0.5 * h;
const T pred_ctr_x = dx * w + ctr_x;
const T pred_ctr_y = dy * h + ctr_y;
const T pred_w = exp(d_log_w) * w;
const T pred_h = exp(d_log_h) * h;
bbox[0] = pred_ctr_x - (T)0.5 * pred_w;
bbox[1] = pred_ctr_y - (T)0.5 * pred_h;
bbox[2] = pred_ctr_x + (T)0.5 * pred_w;
bbox[3] = pred_ctr_y + (T)0.5 * pred_h;
bbox[0] = std::max((T)0, std::min(bbox[0], im_w - 1));
bbox[1] = std::max((T)0, std::min(bbox[1], im_h - 1));
bbox[2] = std::max((T)0, std::min(bbox[2], im_w - 1));
bbox[3] = std::max((T)0, std::min(bbox[3], im_h - 1));
const T bbox_w = bbox[2] - bbox[0] + 1;
const T bbox_h = bbox[3] - bbox[1] + 1;
return (bbox_w >= min_box_w) * (bbox_h >= min_box_h);
inline void ArgPartition(
const int count,
const int kth,
const bool descend,
const T* v,
vec64_t& indices) {
indices.resize(count);
std::iota(indices.begin(), indices.end(), 0);
if (descend) {
std::nth_element(
indices.begin(),
indices.begin() + kth,
indices.end(),
[&v](int64_t lhs, int64_t rhs) { return v[lhs] > v[rhs]; });
} else {
std::nth_element(
indices.begin(),
indices.begin() + kth,
indices.end(),
[&v](int64_t lhs, int64_t rhs) { return v[lhs] < v[rhs]; });
}
}
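ArgPartition mirrors std::nth_element over an index array: after the call, the first kth indices reference the kth best (or worst) values, in unspecified order. A NumPy sketch of the same idea (illustrative only):

import numpy as np

def arg_partition(values, kth, descend=True):
    # np.argpartition places the kth element in its sorted position;
    # the first kth indices then reference the top-k (descend) values.
    return np.argpartition(-values if descend else values, kth).astype(np.int64)

scores = np.array([0.2, 0.9, 0.5, 0.1, 0.7], dtype=np.float32)
top2 = arg_partition(scores, kth=2)[:2]  # indices 1 and 4, in either order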
/*!
* Box API
*/
template <typename T>
inline void BBoxTransform(
const T dx,
......@@ -126,28 +118,28 @@ inline void GenerateAnchors(
}
template <typename T>
inline void GenerateGridAnchors(
inline void GetShiftedAnchors(
const int num_proposals,
const int num_anchors,
const int feat_h,
const int feat_w,
const int stride,
const int base_offset,
const T* anchors,
const int stride_offset,
const T* base_anchors,
const int64_t* indices,
T* proposals) {
T* shifted_anchors) {
T x, y;
int idx_3d, a, h, w;
int idx_range = num_anchors * feat_h * feat_w;
for (int i = 0; i < num_proposals; ++i) {
idx_3d = (int)indices[i] - base_offset;
idx_3d = (int)indices[i] - stride_offset;
if (idx_3d >= 0 && idx_3d < idx_range) {
w = idx_3d % feat_w;
h = (idx_3d / feat_w) % feat_h;
a = idx_3d / feat_w / feat_h;
x = (T)w * stride, y = (T)h * stride;
auto* A = anchors + a * 4;
auto* P = proposals + i * 5;
auto* A = base_anchors + a * 4;
auto* P = shifted_anchors + i * 5;
P[0] = x + A[0], P[1] = y + A[1];
P[2] = x + A[2], P[3] = y + A[3];
}
......@@ -155,20 +147,20 @@ inline void GenerateGridAnchors(
}
template <typename T>
inline void GenerateGridAnchors(
inline void GetShiftedAnchors(
const int num_proposals,
const int num_classes,
const int num_anchors,
const int feat_h,
const int feat_w,
const int stride,
const int base_offset,
const T* anchors,
const int stride_offset,
const T* base_anchors,
const int64_t* indices,
T* proposals) {
T* shifted_anchors) {
T x, y;
int idx_4d, a, h, w;
int lr = num_classes * base_offset;
int lr = num_classes * stride_offset;
int rr = num_classes * (num_anchors * feat_h * feat_w);
for (int i = 0; i < num_proposals; ++i) {
idx_4d = (int)indices[i] - lr;
......@@ -178,8 +170,8 @@ inline void GenerateGridAnchors(
h = (idx_4d / feat_w) % feat_h;
a = idx_4d / feat_w / feat_h;
x = (T)w * stride, y = (T)h * stride;
auto* A = anchors + a * 4;
auto* P = proposals + i * 7 + 1;
auto* A = base_anchors + a * 4;
auto* P = shifted_anchors + i * 7 + 1;
P[0] = x + A[0], P[1] = y + A[1];
P[2] = x + A[2], P[3] = y + A[3];
}
......@@ -190,22 +182,30 @@ inline void GenerateGridAnchors(
* Proposal API
*/
template <typename T, class Context>
void SelectProposals(
const int count,
const float score_thresh,
const T* input_scores,
vector<T>& output_scores,
vector<int64_t>& output_indices,
Context* ctx);
template <typename T>
void GenerateSSProposals(
void GenerateProposals_v1(
const int K,
const int num_proposals,
const float im_h,
const float im_w,
const float min_box_h,
const float min_box_w,
const T* scores,
const T* deltas,
const int64_t* indices,
T* proposals) {
// Shifted anchors in format: [K, A, 4]
int64_t index, a, k;
const float* delta;
float* proposal = proposals;
float dx, dy, d_log_w, d_log_h;
const T* delta;
T* proposal = proposals;
T dx, dy, d_log_w, d_log_h;
for (int i = 0; i < num_proposals; ++i) {
index = indices[i];
a = index / K, k = index % K;
......@@ -214,61 +214,42 @@ void GenerateSSProposals(
dy = delta[(a * 4 + 1) * K];
d_log_w = delta[(a * 4 + 2) * K];
d_log_h = delta[(a * 4 + 3) * K];
proposal[4] = FilterBoxes(
dx,
dy,
d_log_w,
d_log_h,
im_w,
im_h,
min_box_w,
min_box_h,
proposal) *
scores[index];
BBoxTransform(dx, dy, d_log_w, d_log_h, im_w, im_h, T(1), T(1), proposal);
proposal[4] = scores[index];
proposal += 5;
}
}
template <typename T>
void GenerateMSProposals(
void GenerateProposals(
const int num_candidates,
const int num_proposals,
const float im_h,
const float im_w,
const float min_box_h,
const float min_box_w,
const T* scores,
const T* deltas,
const int64_t* indices,
T* proposals) {
// Shifted anchors in format: [4, A, K]
int64_t index;
int64_t num_candidates_2x = 2 * num_candidates;
int64_t num_candidates_3x = 3 * num_candidates;
float* proposal = proposals;
float dx, dy, d_log_w, d_log_h;
T* proposal = proposals;
T dx, dy, d_log_w, d_log_h;
for (int i = 0; i < num_proposals; ++i) {
index = indices[i];
dx = deltas[index];
dy = deltas[num_candidates + index];
d_log_w = deltas[num_candidates_2x + index];
d_log_h = deltas[num_candidates_3x + index];
proposal[4] = FilterBoxes(
dx,
dy,
d_log_w,
d_log_h,
im_w,
im_h,
min_box_w,
min_box_h,
proposal) *
scores[index];
BBoxTransform(dx, dy, d_log_w, d_log_h, im_w, im_h, T(1), T(1), proposal);
proposal[4] = scores[index];
proposal += 5;
}
}
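GenerateProposals above delegates the box decoding to BBoxTransform, which applies the same center/size arithmetic as the removed FilterBoxes helper: shift the center by (dx, dy) times the box size, scale the size by exp of the log deltas, then clip to the image. A NumPy sketch of that decoding with the scale factors fixed to 1, as in the calls above:

import numpy as np

def decode_box(box, dx, dy, d_log_w, d_log_h, im_w, im_h):
    # Width/height use the +1 pixel convention of the surrounding code.
    w, h = box[2] - box[0] + 1.0, box[3] - box[1] + 1.0
    ctr_x, ctr_y = box[0] + 0.5 * w, box[1] + 0.5 * h
    pred_ctr_x, pred_ctr_y = dx * w + ctr_x, dy * h + ctr_y
    pred_w, pred_h = np.exp(d_log_w) * w, np.exp(d_log_h) * h
    out = np.array([pred_ctr_x - 0.5 * pred_w, pred_ctr_y - 0.5 * pred_h,
                    pred_ctr_x + 0.5 * pred_w, pred_ctr_y + 0.5 * pred_h])
    out[0::2] = np.clip(out[0::2], 0, im_w - 1)  # clip x1, x2
    out[1::2] = np.clip(out[1::2], 0, im_h - 1)  # clip y1, y2
    return out

anchor = np.array([0., 0., 15., 15.])
decoded = decode_box(anchor, 0.1, 0.1, 0.2, 0.2, im_w=800, im_h=600)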
template <typename T>
void GenerateMCProposals(
void GenerateDetections(
const int num_proposals,
const int num_boxes,
const int num_classes,
......@@ -280,11 +261,11 @@ void GenerateMCProposals(
const T* scores,
const T* deltas,
const int64_t* indices,
T* proposals) {
T* detections) {
int64_t index, cls;
int64_t num_boxes_2x = 2 * num_boxes;
int64_t num_boxes_3x = 3 * num_boxes;
float* proposal = proposals;
T* detection = detections;
float dx, dy, d_log_w, d_log_h;
for (int i = 0; i < num_proposals; ++i) {
cls = indices[i] % num_classes;
......@@ -293,7 +274,7 @@ void GenerateMCProposals(
dy = deltas[num_boxes + index];
d_log_w = deltas[num_boxes_2x + index];
d_log_h = deltas[num_boxes_3x + index];
proposal[0] = im_idx;
detection[0] = im_idx;
BBoxTransform(
dx,
dy,
......@@ -303,10 +284,11 @@ void GenerateMCProposals(
im_h,
im_scale_h,
im_scale_w,
proposal + 1);
proposal[5] = scores[indices[i]];
proposal[6] = cls + 1;
proposal += 7;
detection + 1);
// detection[5] = scores[indices[i]];
detection[5] = scores[i];
detection[6] = cls + 1;
detection += 7;
}
}
......
......@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Compile the cython extensions."""
from __future__ import absolute_import
......@@ -36,7 +35,7 @@ ext_modules = [
include_dirs=[np.get_include()]
),
Extension(
'install.lib.pycocotools._mask',
'install.lib.utils.pycocotools._mask',
['maskApi.c', '_mask.pyx'],
include_dirs=[np.get_include(), os.path.dirname(os.path.abspath(__file__))],
extra_compile_args=['-w']
......
......@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Make record file for COCO dataset."""
from __future__ import absolute_import
......@@ -27,14 +26,12 @@ if __name__ == '__main__':
# Encode masks to RLE bytes
if not os.path.exists('build'):
os.makedirs('build')
os.makedirs('build')
make_mask('train', '2014', COCO_ROOT)
make_mask('valminusminival', '2014', COCO_ROOT)
make_mask('minival', '2014', COCO_ROOT)
merge_mask('trainval35k', '2014', [
'build/coco_2014_train_mask.pkl',
'build/coco_2014_valminusminival_mask.pkl']
)
merge_mask('trainval35k', '2014', ['build/coco_2014_train_mask.pkl',
'build/coco_2014_valminusminival_mask.pkl'])
# coco_2014_trainval35k
make_record(
......
......@@ -10,17 +10,13 @@
# ------------------------------------------------------------
import os
import pickle
import time
import cv2
import dragon
import numpy as np
try:
import cPickle
except:
import pickle as cPickle
def make_example(image_file, mask_objects, im_scale=None):
filename = os.path.split(image_file)[-1]
......@@ -52,6 +48,7 @@ def make_example(image_file, mask_objects, im_scale=None):
'xmax': x2,
'ymax': y2,
'mask': obj['mask'],
'polygons': obj['polygons'],
'difficult': obj.get('crowd', 0),
})
......@@ -80,7 +77,7 @@ def make_record(
if mask_file is not None:
with open(mask_file, 'rb') as f:
all_masks = cPickle.load(f)
all_masks = pickle.load(f)
else:
all_masks = {}
......@@ -101,6 +98,7 @@ def make_record(
'xmax': 'float64',
'ymax': 'float64',
'mask': 'bytes',
'polygons': [['float64']],
'difficult': 'int64',
}]
}
......@@ -111,10 +109,22 @@ def make_record(
for db_idx, split in enumerate(splits):
split_file = os.path.join(splits_path[db_idx], split + '.txt')
assert os.path.exists(split_file)
with open(split_file, 'r') as f:
lines = f.readlines()
total_line += len(lines)
if not os.path.exists(split_file):
# Fall back to a JSON-format split file if the TXT split is missing
split_file = os.path.join(splits_path[db_idx], split + '.json')
if not os.path.exists(split_file):
raise FileNotFoundError('Unable to find the split:', split)
with open(split_file, 'r') as f:
import json
images_info = json.load(f)
total_line = len(images_info['images'])
lines = []
for info in images_info['images']:
lines.append(os.path.splitext(info['file_name'])[0])
else:
with open(split_file, 'r') as f:
lines = f.readlines()
total_line += len(lines)
for line in lines:
count += 1
if count % 2000 == 0:
......@@ -123,10 +133,8 @@ def make_record(
count, total_line, now_time - start_time))
filename = line.strip()
image_file = os.path.join(images_path[db_idx], filename + ext)
mask_objects = all_masks[filename] if filename in all_masks else None
if mask_objects is None:
raise ValueError('The image({}) takes invalid mask settings.'.format(filename))
writer.write( make_example(image_file, mask_objects, im_scale))
mask_objects = all_masks[filename] if filename in all_masks else {}
writer.write(make_example(image_file, mask_objects, im_scale))
now_time = time.time()
print('{} / {} in {:.2f} sec'.format(count, total_line, now_time - start_time))
......
......@@ -9,19 +9,17 @@
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import os
import sys
import os.path as osp
from collections import OrderedDict
try:
import cPickle
except:
import pickle as cPickle
import pickle
sys.path.insert(0, '../..')
from seetadet.pycocotools.coco import COCO
from seetadet.pycocotools import mask_utils
from seetadet.utils.pycocotools import mask_utils
from seetadet.utils.pycocotools.coco import COCO
class COCOWrapper(object):
......@@ -31,7 +29,7 @@ class COCOWrapper(object):
self._data_path = osp.join(data_dir)
self.invalid_cnt = 0
self.ignore_cnt = 0
# Load COCO API, classes, class <-> id mappings
self._COCO = COCO(self._get_ann_file())
cats = self._COCO.loadCats(self._COCO.getCatIds())
......@@ -39,9 +37,8 @@ class COCOWrapper(object):
self._class_to_ind = dict(zip(self._classes, range(self.num_classes)))
self._ind_to_class = dict(zip(range(self.num_classes), self._classes))
self._class_to_cat_id = dict(zip([c['name'] for c in cats], self._COCO.getCatIds()))
self._cat_id_to_class_id = dict([(self._class_to_cat_id[cls],
self._class_to_ind[cls])
for cls in self._classes[1:]])
self._cat_id_to_class_id = dict([(self._class_to_cat_id[cls], self._class_to_ind[cls])
for cls in self._classes[1:]])
self._data_name = {
# 5k ``val2014`` subset
'minival2014': 'val2014',
......@@ -56,10 +53,10 @@ class COCOWrapper(object):
if self._image_set.find('test') == -1 \
else 'image_info'
return osp.join(
self._data_path,
self._data_path,
'annotations',
prefix + '_' +
self._image_set +
prefix + '_' +
self._image_set +
self._year + '.json'
)
......@@ -107,31 +104,32 @@ class COCOWrapper(object):
y1 = float(max(0, obj['bbox'][1]))
x2 = float(min(width - 1, x1 + max(0, obj['bbox'][2] - 1)))
y2 = float(min(height - 1, y1 + max(0, obj['bbox'][3] - 1)))
mask, polygons = b'', []
if isinstance(obj['segmentation'], list):
for p in obj['segmentation']:
if len(p) < 6:
print('Remove Invalid segm.')
# Valid polygons have >= 3 points, so require >= 6 coordinates
poly = [p for p in obj['segmentation'] if len(p) >= 6]
mask_bytes = mask_utils.poly2bytes(poly, height, width)
polygons = [p for p in obj['segmentation'] if len(p) >= 6]
# mask_bytes = mask_utils.poly2bytes(poly, height, width)
else:
# Crowd masks.
# Some are encoded with a height or width
# that runs out of the image bound;
# using such masks makes a decoding error inevitable.
mask_bytes = mask_utils.poly2bytes(obj['segmentation'], height, width)
mask = mask_utils.poly2bytes(obj['segmentation'], height, width)
if obj['area'] > 0 and x2 > x1 and y2 > y1:
obj['clean_bbox'] = [x1, y1, x2, y2]
valid_objects.append({
'bbox': [x1, y1, x2, y2],
'mask': mask_bytes,
'mask': mask,
'polygons': polygons,
'category_id': obj['category_id'],
'class_id': self._cat_id_to_class_id[obj['category_id']],
'crowd': obj['iscrowd'],
})
valid_objects[-1]['name'] = \
self._ind_to_class[valid_objects[-1]['class_id']]
return height, width, valid_objects
@property
......@@ -150,31 +148,35 @@ def make_mask(split, year, data_dir):
if not osp.exists(osp.join(coco._data_path, 'splits')):
os.makedirs(osp.join(coco._data_path, 'splits'))
gt_recs = OrderedDict()
gt_recs = collections.OrderedDict()
for i in range(coco.num_images):
filename = (coco.image_path_at(i).split('/')[-1]).split('.')[0]
filename = osp.basename(coco.image_path_at(i)).split('.')[0]
h, w, objects = coco.annotation_at(i)
gt_recs[filename] = objects
with open(osp.join('build', 'coco_' + year + '_' + split + '_mask.pkl'), 'wb') as f:
cPickle.dump(gt_recs, f, cPickle.HIGHEST_PROTOCOL)
with open(osp.join('build',
'coco_' + year +
'_' + split + '_mask.pkl'), 'wb') as f:
pickle.dump(gt_recs, f, pickle.HIGHEST_PROTOCOL)
with open(osp.join(coco._data_path, 'splits', split + '.txt'), 'w') as f:
for i in range(coco.num_images):
filename = (coco.image_path_at(i).split('/')[-1]).split('.')[0]
filename = str(osp.basename(coco.image_path_at(i)).split('.')[0])
if i != coco.num_images - 1:
filename += '\n'
f.write(filename)
def merge_mask(split, year, mask_files):
gt_recs = OrderedDict()
gt_recs = collections.OrderedDict()
data_path = os.path.dirname(mask_files[0])
for mask_file in mask_files:
with open(mask_file, 'rb') as f:
recs = cPickle.load(f)
recs = pickle.load(f)
gt_recs.update(recs)
with open(osp.join(data_path, 'coco_' + year + '_' + split + '_mask.pkl'), 'wb') as f:
cPickle.dump(gt_recs, f, cPickle.HIGHEST_PROTOCOL)
with open(osp.join(data_path,
'coco_' + year +
'_' + split + '_mask.pkl'), 'wb') as f:
pickle.dump(gt_recs, f, pickle.HIGHEST_PROTOCOL)
......@@ -132,4 +132,3 @@ def make_record(
data_size = os.path.getsize(record_file + '/root.data') * 1e-6
print('{} images take {:.2f} MB in {:.2f} sec.'
.format(len(entries), data_size, end_time - start_time))
......@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Make record file for VOC dataset."""
from __future__ import absolute_import
......@@ -29,7 +28,7 @@ if __name__ == '__main__':
annotations_path=[osp.join(voc_root, 'VOCdevkit2007/VOC2007/Annotations'),
osp.join(voc_root, 'VOCdevkit2012/VOC2012/Annotations')],
splits_path=[osp.join(voc_root, 'VOCdevkit2007/VOC2007/ImageSets/Main'),
osp.join(voc_root, 'VOCdevkit2012/VOC2012/ImageSets/Main')],
osp.join(voc_root, 'VOCdevkit2012/VOC2012/ImageSets/Main')],
splits=['trainval', 'trainval']
)
......
......@@ -8,3 +8,11 @@
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""A platform implementing popular object detection algorithms."""
from __future__ import absolute_import as _absolute_import
from __future__ import division as _division
from __future__ import print_function as _print_function
# Version
from seetadet.version import version as __version__
......@@ -8,3 +8,9 @@
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from seetadet.algo.common.anchor_sampler import AnchorSampler
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from seetadet.core.config import cfg
class AnchorSampler(object):
"""Sample precomputed anchors asynchronously."""
def __init__(self):
self._rpn_target = None
self._retinanet_target = None
self._ssd_target = None
if 'rcnn' in cfg.MODEL.TYPE:
from seetadet.algo.faster_rcnn import anchor_target
self._rpn_target = anchor_target.AnchorTarget()
elif cfg.MODEL.TYPE == 'retinanet':
from seetadet.algo.retinanet import anchor_target
self._retinanet_target = anchor_target.AnchorTarget()
elif cfg.MODEL.TYPE == 'ssd':
from seetadet.algo.ssd import anchor_target
self._ssd_target = anchor_target.AnchorTarget()
def __call__(self, **inputs):
"""Return the sample anchors."""
if self._rpn_target:
fg_inds, bg_inds = \
self._rpn_target.sample_anchors(
gt_boxes=inputs['gt_boxes'],
im_info=inputs['im_info'],
)
return {'fg_inds': fg_inds, 'bg_inds': bg_inds}
if self._retinanet_target:
fg_inds, ignore_inds = \
self._retinanet_target.sample_anchors(
gt_boxes=inputs['gt_boxes'],
im_info=inputs['im_info'],
)
return {'fg_inds': fg_inds, 'bg_inds': ignore_inds}
if self._ssd_target:
fg_inds, neg_inds = \
self._ssd_target.sample_anchors(
gt_boxes=inputs['gt_boxes'],
)
return {'fg_inds': fg_inds, 'bg_inds': neg_inds}
return {}
......@@ -17,7 +17,3 @@ from seetadet.algo.faster_rcnn.anchor_target import AnchorTarget
from seetadet.algo.faster_rcnn.data_loader import DataLoader
from seetadet.algo.faster_rcnn.proposal import Proposal
from seetadet.algo.faster_rcnn.proposal_target import ProposalTarget
from seetadet.algo.faster_rcnn.utils import generate_grid_anchors
from seetadet.algo.faster_rcnn.utils import map_blobs_by_levels
from seetadet.algo.faster_rcnn.utils import map_rois_to_levels
from seetadet.algo.faster_rcnn.utils import map_returns_to_blobs
......@@ -13,11 +13,14 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import math
import numpy as np
import numpy.random as npr
from seetadet.algo.faster_rcnn.generate_anchors import generate_anchors
from seetadet.algo.faster_rcnn.utils import generate_grid_anchors
from seetadet.algo.faster_rcnn import generate_anchors as anchor_util
from seetadet.algo.faster_rcnn import utils as rcnn_util
from seetadet.core.config import cfg
from seetadet.utils import boxes as box_util
from seetadet.utils.env import new_tensor
......@@ -33,113 +36,126 @@ class AnchorTarget(object):
self.strides = cfg.RPN.STRIDES
self.ratios = cfg.RPN.ASPECT_RATIOS
self.num_strides = len(self.strides)
self.allowed_border = cfg.TRAIN.RPN_STRADDLE_THRESH
# Generate base anchors
self.base_anchors = []
for i in range(self.num_strides):
self.base_anchors.append(
generate_anchors(
anchor_util.generate_anchors(
self.strides[i],
self.ratios,
np.array([self.scales[i]])
if self.num_strides > 1
else np.array(self.scales)
)
)
def __call__(self, features, gt_boxes, ims_info):
num_images = cfg.TRAIN.IMS_PER_BATCH
gt_boxes_wide = box_util.dismantle_boxes(gt_boxes, num_images)
else np.array(self.scales)))
# Plan the maximum shifted anchor layout
max_size = cfg.TRAIN.MAX_SIZE
if cfg.MODEL.COARSEST_STRIDE > 0:
stride = float(cfg.MODEL.COARSEST_STRIDE)
max_size = int(math.ceil(max_size / stride) * stride)
self.max_shapes = [[math.ceil(max_size / stride)] * 2
for stride in self.strides]
self.all_coords = rcnn_util.get_shifted_coords(
self.max_shapes, self.base_anchors)
self.all_anchors = rcnn_util.get_shifted_anchors(
self.max_shapes, self.base_anchors, self.strides)
def sample_anchors(self, gt_boxes, im_info, all_anchors=None):
if all_anchors is None:
all_anchors = self.all_anchors
# Only keep anchors inside the image
# to get higher quality proposals.
inds_inside = np.where(
(all_anchors[:, 0] >= 0) &
(all_anchors[:, 1] >= 0) &
(all_anchors[:, 2] < im_info[1]) &
(all_anchors[:, 3] < im_info[0]))[0]
anchors = all_anchors[inds_inside, :]
num_inside = len(inds_inside)
labels = np.empty((num_inside,), 'int32')
labels.fill(-1)
# Overlaps between the anchors and the gt boxes.
overlaps = box_util.bbox_overlaps(anchors, gt_boxes)
argmax_overlaps = overlaps.argmax(axis=1)
max_overlaps = overlaps[np.arange(num_inside), argmax_overlaps]
# Overlaps between the gt boxes and anchors with highest IoU.
gt_argmax_overlaps = overlaps.argmax(axis=0)
gt_max_overlaps = overlaps[gt_argmax_overlaps, np.arange(overlaps.shape[1])]
gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]
# Foreground: for each gt, anchor with highest overlap.
labels[gt_argmax_overlaps] = 1
# Foreground: above threshold IoU.
labels[max_overlaps >= cfg.RPN.POSITIVE_OVERLAP] = 1
# Background: below threshold IoU.
labels[max_overlaps < cfg.RPN.NEGATIVE_OVERLAP] = 0
# Undo the background clamping if no foreground anchor survived.
fg_inds = np.where(labels == 1)[0]
if len(fg_inds) == 0:
labels[gt_argmax_overlaps] = 1
fg_inds = np.where(labels == 1)[0]
# Generate grid anchors from base
grid_shapes = [f.shape[-2:] for f in features]
all_anchors = generate_grid_anchors(
grid_shapes, self.base_anchors, self.strides)
num_anchors = all_anchors.shape[0]
# Subsample positive labels if we have too many.
num_fg = int(cfg.RPN.FG_FRACTION * cfg.RPN.BATCH_SIZE)
if len(fg_inds) > num_fg:
fg_inds = npr.choice(fg_inds, num_fg, False)
# Label: ``1`` is positive, ``0`` is negative, ``-1`` is don't care
labels_wide = -np.ones((num_images, num_anchors,), 'float32')
bbox_indices_wide, bbox_anchors_wide, bbox_targets_wide = [], [], []
# Subsample negative labels if we have too many.
num_bg = cfg.RPN.BATCH_SIZE - len(fg_inds)
bg_inds = np.where(labels == 0)[0]
if len(bg_inds) > num_bg:
bg_inds = npr.choice(bg_inds, num_bg, False)
for ix in range(num_images):
# GT boxes (x1, y1, x2, y2, label, ...)
gt_boxes = gt_boxes_wide[ix]
im_info = ims_info[ix]
if self.allowed_border >= 0:
# Only keep anchors inside the image
inds_inside = np.where(
(all_anchors[:, 0] >= -self.allowed_border) &
(all_anchors[:, 1] >= -self.allowed_border) &
(all_anchors[:, 2] < im_info[1] + self.allowed_border) &
(all_anchors[:, 3] < im_info[0] + self.allowed_border))[0]
anchors = all_anchors[inds_inside, :]
else:
inds_inside, anchors = np.arange(num_anchors), all_anchors
num_inside = len(inds_inside)
labels = np.empty((num_inside,), 'float32')
labels.fill(-1)
# Overlaps between the anchors and the gt boxes
overlaps = box_util.bbox_overlaps(anchors, gt_boxes)
argmax_overlaps = overlaps.argmax(axis=1)
max_overlaps = overlaps[np.arange(num_inside), argmax_overlaps]
gt_argmax_overlaps = overlaps.argmax(axis=0)
gt_max_overlaps = overlaps[gt_argmax_overlaps,
np.arange(overlaps.shape[1])]
gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]
# Foreground: for each gt, anchor with highest overlap
labels[gt_argmax_overlaps] = 1
return inds_inside[fg_inds], inds_inside[bg_inds]
# Foreground: above threshold IoU
labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1
def __call__(self, **inputs):
num_images = cfg.TRAIN.IMS_PER_BATCH
shapes = [f.shape[-2:] for f in inputs['features']]
image_stride = sum(self.base_anchors[i].shape[0] * np.prod(shapes[i])
for i in range(len(inputs['features'])))
# Background: below threshold IoU
labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0
narrow_args = [self.all_coords, self.base_anchors, self.max_shapes, shapes]
outputs = collections.defaultdict(list)
# Subsample positive labels if we have too many
num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCHSIZE)
fg_inds = np.where(labels == 1)[0]
if len(fg_inds) > num_fg:
disable_inds = npr.choice(fg_inds, len(fg_inds) - num_fg, False)
labels[disable_inds] = -1
fg_inds = np.where(labels == 1)[0]
# Undo the background clamping if no foreground anchor survived
if len(fg_inds) == 0:
labels[gt_argmax_overlaps] = 1
fg_inds = np.where(labels == 1)[0]
# Subsample negative labels if we have too many
num_bg = cfg.TRAIN.RPN_BATCHSIZE - np.sum(labels == 1)
bg_inds = np.where(labels == 0)[0]
if len(bg_inds) > num_bg:
disable_inds = npr.choice(bg_inds, len(bg_inds) - num_bg, False)
labels[disable_inds] = -1
labels_wide[ix, inds_inside] = labels
bbox_anchors_wide.append(anchors[fg_inds])
bbox_indices_wide.append(inds_inside[fg_inds] + (num_anchors * ix))
bbox_targets_wide.append(
box_util.bbox_transform(
anchors[fg_inds],
gt_boxes[argmax_overlaps[fg_inds], :4],
)
)
if self.num_strides == 1:
A = self.base_anchors[0].shape[0]
height, width = features[0].shape[-2:]
labels_wide = labels_wide \
.reshape((num_images, height, width, A)) \
.transpose(0, 3, 1, 2) \
.reshape((num_images, num_anchors))
for ix in range(num_images):
fg_inds = inputs['fg_inds'][ix]
bg_inds = inputs['bg_inds'][ix]
gt_boxes = inputs['gt_boxes'][ix]
# Narrow anchors to match the feature layout
anchors = self.all_anchors[fg_inds]
bg_inds = rcnn_util.narrow_anchors(*(narrow_args + [bg_inds]))
_, anchors = rcnn_util.narrow_anchors(*(narrow_args + [fg_inds, anchors]))
fg_inds = rcnn_util.narrow_anchors(*(narrow_args + [fg_inds]))
# Compute bbox targets
gt_assignment = box_util.bbox_overlaps(anchors, gt_boxes).argmax(axis=1)
bbox_targets = box_util.bbox_transform(anchors, gt_boxes[gt_assignment, :4])
outputs['bbox_anchors'].append(anchors)
outputs['bbox_targets'].append(bbox_targets)
# Compute sparse indices
fg_inds += ix * image_stride
bg_inds += ix * image_stride
outputs['cls_inds'].extend([fg_inds, bg_inds])
outputs['bbox_inds'].extend([fg_inds])
outputs['labels'].extend([np.ones_like(fg_inds, 'float32'),
np.zeros_like(bg_inds, 'float32')])
return {
'labels': new_tensor(labels_wide),
'bbox_indices': new_tensor(np.concatenate(bbox_indices_wide)),
'bbox_targets': new_tensor(np.concatenate(bbox_targets_wide).astype('float32')),
'bbox_anchors': new_tensor(np.concatenate(bbox_anchors_wide).astype('float32')),
'labels': new_tensor(
np.concatenate(outputs['labels'])),
'cls_inds': new_tensor(
np.concatenate(outputs['cls_inds'])),
'bbox_inds': new_tensor(
np.concatenate(outputs['bbox_inds'])),
'bbox_targets': new_tensor(
np.concatenate(outputs['bbox_targets']).astype('float32')),
'bbox_anchors': new_tensor(
np.concatenate(outputs['bbox_anchors']).astype('float32')),
}
......@@ -13,8 +13,11 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import multiprocessing as mp
import time
import threading
import queue
import dragon
import dragon.vm.torch as torch
......@@ -23,8 +26,8 @@ import numpy as np
from seetadet.algo.faster_rcnn import data_transformer
from seetadet.core.config import cfg
from seetadet.datasets.factory import get_dataset
from seetadet.utils import blob as blob_util
from seetadet.utils import logger
from seetadet.utils.blob import im_list_to_blob
class DataLoader(object):
......@@ -33,28 +36,24 @@ class DataLoader(object):
def __init__(self):
super(DataLoader, self).__init__()
dataset = get_dataset(cfg.TRAIN.DATASET)
if cfg.USE_DALI:
from seetadet.dali import rcnn_pipeline as pipe
self.iterator = pipe.new_iterator(dataset.source)
else:
self.iterator = Iterator(**{
'dataset': dataset.cls,
'source': dataset.source,
'classes': dataset.classes,
'shuffle': cfg.TRAIN.USE_SHUFFLE,
'num_chunks': cfg.TRAIN.SHUFFLE_CHUNKS,
'batch_size': cfg.TRAIN.IMS_PER_BATCH * 2,
'num_transformers': cfg.TRAIN.NUM_THREADS - 1,
})
self.iterator = Iterator(**{
'dataset': dataset.cls,
'source': dataset.source,
'classes': dataset.classes,
'shuffle': cfg.TRAIN.USE_SHUFFLE,
'batch_size': cfg.TRAIN.IMS_PER_BATCH * 2,
'num_transformers': cfg.TRAIN.NUM_THREADS - 1,
})
self.iterator.start()
def __call__(self):
outputs = self.iterator.next()
if isinstance(outputs['data'], np.ndarray):
outputs['data'] = torch.from_numpy(outputs['data'])
if isinstance(outputs['image'], np.ndarray):
outputs['image'] = torch.from_numpy(outputs['image'])
return outputs
class Iterator(mp.Process):
class Iterator(threading.Thread):
"""Iterator to return the batch of data."""
def __init__(self, **kwargs):
......@@ -68,17 +67,16 @@ class Iterator(mp.Process):
rank = dragon.distributed.get_rank(process_group)
# Configuration
self._prefetch = kwargs.get('prefetch', 5)
self._batch_size = kwargs.get('batch_size', 2)
self._num_readers = kwargs.get('num_readers', 1)
self._num_transformers = kwargs.get('num_transformers', 3)
self.daemon = True
# Initialize queues
num_batches = self._prefetch * self._num_readers
self.q_in = mp.Queue(num_batches * self._batch_size)
self.q1_out = mp.Queue(num_batches * self._batch_size)
self.q2_out = mp.Queue(num_batches * self._batch_size)
num_batches = self._num_readers
self._queue1 = mp.Queue(num_batches * self._batch_size)
self._queue2 = mp.Queue(num_batches * self._batch_size)
self._queue3 = queue.Queue(num_batches)
# Initialize readers
self._readers = []
......@@ -89,7 +87,7 @@ class Iterator(mp.Process):
self._readers.append(dragon.io.DataReader(
part_idx=part_idx, num_parts=num_parts, **kwargs))
self._readers[i]._seed += part_idx
self._readers[i].q_out = self.q_in
self._readers[i].q_out = self._queue1
self._readers[i].start()
time.sleep(0.1)
......@@ -98,8 +96,7 @@ class Iterator(mp.Process):
for i in range(self._num_transformers):
p = data_transformer.DataTransformer(**kwargs)
p._seed += (i + rank * self._num_transformers)
p.q_in = self.q_in
p.q1_out, p.q2_out = self.q1_out, self.q2_out
p.q_in, p.q_out = self._queue1, self._queue2
p.start()
self._transformers.append(p)
time.sleep(0.1)
......@@ -122,35 +119,43 @@ class Iterator(mp.Process):
"""Return the next batch of data."""
return self.__next__()
def run(self):
"""Main loop."""
num_images = cfg.TRAIN.IMS_PER_BATCH
num_batches = cfg.TRAIN.ASPECT_GROUPING
logger.info('Initialize prefetching batches...')
example_buffer = [self._queue2.get()
for _ in range(num_images * num_batches)]
next_examples = []
while True:
# Use cached buffer for next N examples
# Examples are sorted to simulate aspect grouping
if len(next_examples) == 0:
next_examples = example_buffer
next_examples.sort(key=lambda d: d['aspect_ratio'])
example_buffer = []
# Prepare the next batch
outputs = collections.defaultdict(list)
for i in range(num_images):
example = next_examples.pop(0)
outputs['image'].append(example['image'])
outputs['gt_boxes'].append(example['boxes'])
outputs['im_info'].append(example['im_info'])
outputs['fg_inds'].append(example.get('fg_inds', None))
outputs['bg_inds'].append(example.get('bg_inds', None))
example_buffer.append(self._queue2.get())
outputs['image'] = blob_util.im_list_to_blob(
outputs['image'], coarsest_stride=cfg.MODEL.COARSEST_STRIDE)
# Send batch data to consumer
self._queue3.put(outputs)
def __iter__(self):
"""Return the iterator self."""
return self
def __next__(self):
"""Return the next batch of data."""
q_out = None
# Two queues to implement aspect grouping.
# This is necessary to reduce the GPU memory
# consumed by fetching a huge square batch blob.
while q_out is None:
if self.q1_out.qsize() >= cfg.TRAIN.IMS_PER_BATCH:
q_out = self.q1_out
elif self.q2_out.qsize() >= cfg.TRAIN.IMS_PER_BATCH:
q_out = self.q2_out
self.q1_out, self.q2_out = self.q2_out, self.q1_out
images, images_info, boxes_to_pack = [], [], []
for i in range(cfg.TRAIN.IMS_PER_BATCH):
image, image_scale, boxes = q_out.get()
images.append(image)
images_info.append(list(image.shape[:2]) + [image_scale])
gt_boxes = np.zeros((boxes.shape[0], boxes.shape[1] + 1), 'float32')
gt_boxes[:, :boxes.shape[1]], gt_boxes[:, -1] = boxes, i
boxes_to_pack.append(gt_boxes)
return {
'data': im_list_to_blob(images),
'ims_info': np.array(images_info, dtype=np.float32),
'gt_boxes': np.concatenate(boxes_to_pack),
}
return self._queue3.get()
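The run() loop above prefetches ASPECT_GROUPING batches of examples and sorts them by aspect ratio, so that each batch of IMS_PER_BATCH images has a similar shape and im_list_to_blob pads as little as possible. A toy sketch of that grouping (the batch sizes here are made up, not the config defaults):

examples = [{'id': i, 'aspect_ratio': ar}
            for i, ar in enumerate([1.5, 0.6, 0.7, 1.4, 0.5, 1.6])]

ims_per_batch, aspect_grouping = 2, 3  # illustrative values only
buffer = examples[:ims_per_batch * aspect_grouping]
# Sorting groups portrait-like and landscape-like images together, so each
# consecutive pair packs into a batch blob with little zero padding.
buffer.sort(key=lambda d: d['aspect_ratio'])
batches = [buffer[i:i + ims_per_batch]
           for i in range(0, len(buffer), ims_per_batch)]
# -> batches of ids [4, 1], [2, 3], [0, 5]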
......@@ -15,109 +15,122 @@ from __future__ import print_function
import multiprocessing
import cv2
import numpy as np
import numpy.random as npr
from seetadet.algo import common as algo_common
from seetadet.core.config import cfg
from seetadet.datasets.example import Example
from seetadet.utils import boxes as box_util
from seetadet.utils.blob import prep_im_for_blob
from seetadet.utils import image as image_util
class DataTransformer(multiprocessing.Process):
def __init__(self, **kwargs):
super(DataTransformer, self).__init__()
self._scales = cfg.TRAIN.SCALES
self._random_scales = cfg.TRAIN.RANDOM_SCALES
self._max_size = cfg.TRAIN.MAX_SIZE
self._seed = cfg.RNG_SEED
self._use_flipped = cfg.TRAIN.USE_FLIPPED
self._use_diff = cfg.TRAIN.USE_DIFF
self._use_flipped = cfg.TRAIN.USE_FLIPPED
self._use_distort = cfg.TRAIN.USE_COLOR_JITTER
self._classes = kwargs.get('classes', ('__background__',))
self._num_classes = len(self._classes)
self._class_to_ind = dict(zip(self._classes, range(self._num_classes)))
self.q_in = self.q1_out = self.q2_out = None
self._anchor_sampler = algo_common.AnchorSampler()
self.q_in = self.q_out = None
self.daemon = True
def make_roi_dict(self, example, im_scale, apply_flip=False):
objects, n_objects = example.objects, 0
def get_boxes(self, example, im_scale):
objects, num_objects = example.objects, 0
height, width = example.height, example.width
if not self._use_diff:
for obj in objects:
if obj.get('difficult', 0) == 0:
n_objects += 1
num_objects += 1
else:
n_objects = len(objects)
num_objects = len(objects)
roi_dict = {
'boxes': np.zeros((n_objects, 4), 'float32'),
'gt_classes': np.zeros((n_objects,), 'int32'),
}
boxes = np.zeros((num_objects, 4), 'float32')
gt_classes = np.zeros((num_objects,), 'float32')
# Filter the difficult instances
object_idx = 0
for obj in objects:
if not self._use_diff and \
obj.get('difficult', 0) > 0:
if not self._use_diff and obj.get('difficult', 0) > 0:
continue
bbox = obj['bbox']
roi_dict['boxes'][object_idx, :] = [
max(0, bbox[0]),
max(0, bbox[1]),
min(bbox[2], width - 1),
min(bbox[3], height - 1),
]
roi_dict['gt_classes'][object_idx] = \
self._class_to_ind[obj['name']]
boxes[object_idx, :] = [max(0, bbox[0]),
max(0, bbox[1]),
min(bbox[2], width - 1),
min(bbox[3], height - 1)]
gt_classes[object_idx] = self._class_to_ind[obj['name']]
object_idx += 1
# Flip the boxes if necessary
if apply_flip:
roi_dict['boxes'] = \
box_util.flip_boxes(
roi_dict['boxes'],
width,
)
# Scale the boxes to the detecting scale
roi_dict['boxes'] *= im_scale
boxes *= im_scale
# Attach the classes
gt_boxes = np.empty((num_objects, 5), dtype=np.float32)
gt_boxes[:, :4], gt_boxes[:, 4] = boxes, gt_classes
return roi_dict
return gt_boxes
def get(self, example):
example = Example(example)
img = example.image
# Scale
target_size = self._scales[np.random.randint(len(self._scales))]
img, im_scale = prep_im_for_blob(img, target_size, self._max_size)
# Resize
img, im_scale = image_util.resize_image_with_target_size(
example.image,
target_size=npr.choice(self._scales),
max_size=self._max_size,
random_scales=self._random_scales,
)
# Flip
apply_flip = False
if self._use_flipped:
if np.random.randint(2) > 0:
img = img[:, ::-1]
apply_flip = True
flipped = False
if self._use_flipped and npr.randint(2) > 0:
img = img[:, ::-1]
flipped = True
# Distort
if self._use_distort:
img = image_util.distort_image(img)
# Boxes
boxes = self.get_boxes(example, im_scale)
# Flip the boxes if necessary
if flipped:
boxes = box_util.flip_boxes(boxes, img.shape[1])
# Example -> RoIDict
roi_dict = self.make_roi_dict(example, im_scale, apply_flip)
# Standard outputs.
outputs = {'image': img,
'boxes': boxes,
'im_info': img.shape[:2] + (im_scale,)}
# Post-Process for gt boxes
# Shape like: [num_objects, {x1, y1, x2, y2, cls}]
gt_boxes = np.empty((len(roi_dict['gt_classes']), 5), dtype=np.float32)
gt_boxes[:, :4], gt_boxes[:, 4] = roi_dict['boxes'], roi_dict['gt_classes']
# Attach precomputed targets.
if len(boxes) > 0:
outputs.update(
self._anchor_sampler(
gt_boxes=boxes,
im_info=outputs['im_info']))
return img, im_scale, gt_boxes
return outputs
def run(self):
# Fix the process-local random seed
# Disable the opencv threading.
cv2.setNumThreads(1)
# Fix the process-local random seed.
np.random.seed(self._seed)
# Main prefetch loop
while True:
outputs = self.get(self.q_in.get())
if len(outputs[2]) < 1:
continue # Ignore the non-object image
aspect_ratio = float(outputs[0].shape[0]) / outputs[0].shape[1]
if aspect_ratio > 1.:
self.q1_out.put(outputs)
else:
self.q2_out.put(outputs)
if len(outputs['boxes']) < 1:
continue # Ignore non-object image.
height, width = outputs['image'].shape[:2]
outputs['aspect_ratio'] = float(height) / float(width)
self.q_out.put(outputs)
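get() above flips the boxes through box_util.flip_boxes when the image is mirrored; a common implementation of that helper, shown here only as an illustrative sketch (the project's version may differ in detail):

import numpy as np

def flip_boxes(boxes, width):
    # Mirror x1/x2 around the image width; y coordinates and extra
    # columns (e.g. the class id) are left untouched.
    flipped = boxes.copy()
    flipped[:, 0] = width - 1 - boxes[:, 2]
    flipped[:, 2] = width - 1 - boxes[:, 0]
    return flipped

boxes = np.array([[10., 20., 50., 80., 1.]])  # x1, y1, x2, y2, class
flip_boxes(boxes, width=100)                  # -> [[49., 20., 89., 80., 1.]]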
......@@ -17,8 +17,8 @@ import collections
import numpy as np
from seetadet.algo.faster_rcnn.generate_anchors import generate_anchors
from seetadet.algo.faster_rcnn.utils import generate_grid_anchors
from seetadet.algo.faster_rcnn import generate_anchors as anchor_util
from seetadet.algo.faster_rcnn import utils as rcnn_util
from seetadet.core.config import cfg
from seetadet.utils import boxes as box_util
from seetadet.utils import nms
......@@ -29,59 +29,50 @@ class Proposal(object):
def __init__(self):
super(Proposal, self).__init__()
# Load the basic configs
# Load basic configs
self.scales = cfg.RPN.SCALES
self.strides = cfg.RPN.STRIDES
self.ratios = cfg.RPN.ASPECT_RATIOS
self.num_strides = len(self.strides)
self.defaults = collections.OrderedDict([
('rois', np.array([[-1, 0, 0, 1, 1]], 'float32')),
])
('rois', np.array([[-1, 0, 0, 1, 1]], 'float32'))])
self.bbox_transform_clip = \
np.log(cfg.TRAIN.MAX_SIZE / min(self.strides))
# Generate base anchors
self.base_anchors = []
for i in range(self.num_strides):
self.base_anchors.append(
generate_anchors(
anchor_util.generate_anchors(
self.strides[i],
self.ratios,
np.array([self.scales[i]])
if self.num_strides > 1
else np.array(self.scales)
)
)
else np.array(self.scales)))
def __call__(self, features, cls_prob, bbox_pred, ims_info):
def __call__(self, **inputs):
num_images = cfg.TRAIN.IMS_PER_BATCH
pre_nms_top_n = cfg.TRAIN.RPN_PRE_NMS_TOP_N
post_nms_top_n = cfg.TRAIN.RPN_POST_NMS_TOP_N
nms_thresh = cfg.TRAIN.RPN_NMS_THRESH
min_size = cfg.TRAIN.RPN_MIN_SIZE
# Get resources
num_images = ims_info.shape[0]
grid_shapes = [f.shape[-2:] for f in features]
all_anchors = generate_grid_anchors(
grid_shapes, self.base_anchors, self.strides)
shapes = [f.shape[-2:] for f in inputs['features']]
all_anchors = rcnn_util.get_shifted_anchors(
shapes, self.base_anchors, self.strides)
# Prepare for the outputs
batch_rois = []
cls_prob = cls_prob.numpy()
bbox_pred = bbox_pred.numpy()
if self.num_strides > 1:
# (?, 4, A * K) -> (?, A * K, 4)
bbox_pred = bbox_pred.transpose((0, 2, 1))
else:
# (?, A * 4, H, W) -> (?, H, W, A * 4)
cls_prob = cls_prob.transpose((0, 2, 3, 1))
bbox_pred = bbox_pred.transpose((0, 2, 3, 1))
cls_prob = inputs['cls_prob'].numpy()
# (?, 4, A * K) -> (?, A * K, 4)
bbox_pred = inputs['bbox_pred'].numpy()
bbox_pred = bbox_pred.transpose((0, 2, 1))
# Extract RoIs separately
for ix in range(num_images):
# [?, N] -> [? * N, 1]
scores = cls_prob[ix].reshape((-1, 1))
if self.num_strides > 1:
deltas = bbox_pred[ix]
else:
deltas = bbox_pred[ix].reshape((-1, 4))
deltas = bbox_pred[ix]
im_info = inputs['im_info'][ix]
if pre_nms_top_n <= 0 or pre_nms_top_n >= len(scores):
order = np.argsort(-scores.squeeze())
......@@ -97,15 +88,11 @@ class Proposal(object):
scores = scores[order]
# Convert anchors into proposals via bbox transformations
proposals = box_util.bbox_transform_inv(anchors, deltas)
proposals = box_util.bbox_transform_inv(
anchors, deltas, clip=self.bbox_transform_clip)
# Clip predicted boxes to image
proposals = box_util.clip_tiled_boxes(proposals, ims_info[ix, :2])
# Remove predicted boxes with either height or width < threshold
keep = box_util.filter_boxes(proposals, min_size * ims_info[ix, 2])
proposals = proposals[keep, :]
scores = scores[keep]
proposals = box_util.clip_tiled_boxes(proposals, im_info[:2])
# Apply nms (e.g. threshold = 0.7)
# Take after_nms_topN (e.g. 300)
......
......@@ -30,19 +30,17 @@ class ProposalTarget(object):
def __init__(self):
super(ProposalTarget, self).__init__()
self.num_strides = len(cfg.RPN.STRIDES)
self.num_classes = cfg.MODEL.NUM_CLASSES
self.num_classes = len(cfg.MODEL.CLASSES)
self.defaults = collections.OrderedDict([
('rois', np.array([[-1, 0, 0, 1, 1]], 'float32')),
('labels', np.array([-1], 'int64')),
('bbox_targets', np.zeros((1, 4), 'float32')),
])
def __call__(self, rpn_rois, gt_boxes):
def __call__(self, **inputs):
num_images = cfg.TRAIN.IMS_PER_BATCH
# Proposal ROIs (0, x1, y1, x2, y2) coming from RPN
all_rois = rpn_rois
# GT boxes (x1, y1, x2, y2, label)
gt_boxes_wide = box_util.dismantle_boxes(gt_boxes, num_images)
all_rois = inputs['rois']
# Prepare for the outputs
keys = self.defaults.keys()
......@@ -50,22 +48,22 @@ class ProposalTarget(object):
# Generate targets separately
for ix in range(num_images):
gt_boxes = gt_boxes_wide[ix]
# GT boxes (x1, y1, x2, y2, label)
gt_boxes = inputs['gt_boxes'][ix]
# Extract proposals for this image
rois = all_rois[np.where(all_rois[:, 0].astype('int32') == ix)[0]]
# Include ground-truth boxes in the set of candidate rois
inds = np.ones((gt_boxes.shape[0], 1), gt_boxes.dtype) * ix
rois = np.vstack((rois, np.hstack((inds, gt_boxes[:, :4]))))
# Sample a batch of RoIs for training
rois_per_image = cfg.TRAIN.BATCH_SIZE
fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image)
rois_per_image = cfg.FRCNN.BATCH_SIZE
fg_rois_per_image = np.round(cfg.FRCNN.FG_FRACTION * rois_per_image)
rcnn_util.map_returns_to_blobs(
sample_rois(
rois,
gt_boxes,
rois_per_image,
fg_rois_per_image,
), blobs, keys,
sample_rois(rois,
gt_boxes,
rois_per_image,
fg_rois_per_image),
blobs, keys,
)
# Stack into continuous blobs
......@@ -95,7 +93,7 @@ class ProposalTarget(object):
return {
'rois': [new_tensor(rois) for rois in rois_wide],
'labels': new_tensor(blobs['labels']),
'bbox_indices': new_tensor(cls_inds[fg_inds] + blobs['labels'][fg_inds]),
'bbox_inds': new_tensor(cls_inds[fg_inds] + blobs['labels'][fg_inds]),
'bbox_targets': new_tensor(blobs['bbox_targets'][fg_inds].astype('float32')),
'bbox_anchors': new_tensor(blobs['rois'][fg_inds, 1:].astype('float32')),
}
......@@ -108,8 +106,8 @@ def sample_rois(all_rois, gt_boxes, num_rois, num_fg_rois):
max_overlaps = overlaps.max(axis=1)
labels = gt_boxes[gt_assignment, 4].astype('int64')
# Select foreground RoIs as those with >= FG_THRESH overlap
fg_thresh = cfg.TRAIN.FG_THRESH
# Select foreground RoIs as those with >= POSITIVE_OVERLAP
fg_thresh = cfg.FRCNN.POSITIVE_OVERLAP
fg_inds = np.where(max_overlaps >= fg_thresh)[0]
while fg_inds.size == 0:
fg_thresh -= 0.01
......@@ -119,9 +117,10 @@ def sample_rois(all_rois, gt_boxes, num_rois, num_fg_rois):
fg_rois_per_this_image = int(min(num_fg_rois, fg_inds.size))
fg_inds = npr.choice(fg_inds, fg_rois_per_this_image, False)
# Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI) &
(max_overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]
# Select background RoIs as those within
# [NEGATIVE_OVERLAP_LO, NEGATIVE_OVERLAP_HI)
bg_inds = np.where((max_overlaps < cfg.FRCNN.NEGATIVE_OVERLAP_HI) &
(max_overlaps >= cfg.FRCNN.NEGATIVE_OVERLAP_LO))[0]
# Compute number of background RoIs to take from this image
bg_rois_per_this_image = num_rois - fg_rois_per_this_image
bg_rois_per_this_image = min(bg_rois_per_this_image, bg_inds.size)
......@@ -129,7 +128,7 @@ def sample_rois(all_rois, gt_boxes, num_rois, num_fg_rois):
if bg_inds.size > 0:
bg_inds = npr.choice(bg_inds, bg_rois_per_this_image, False)
# The indices that we're selecting (both fg and bg)
# The selected indices (both fg and bg)
keep_inds = np.append(fg_inds, bg_inds)
# Select sampled values from various arrays
rois, labels = all_rois[keep_inds], labels[keep_inds]
......@@ -137,12 +136,9 @@ def sample_rois(all_rois, gt_boxes, num_rois, num_fg_rois):
labels[fg_rois_per_this_image:] = 0
# Compute the target from RoIs
return [
rois,
labels,
box_util.bbox_transform(
rois[:, 1:5],
gt_boxes[gt_assignment[keep_inds], :4],
cfg.BBOX_REG_WEIGHTS,
)
]
outputs = [rois, labels]
outputs += [box_util.bbox_transform(
rois[:, 1:5],
gt_boxes[gt_assignment[keep_inds], :4],
cfg.BBOX_REG_WEIGHTS)]
return outputs
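At its core, sample_rois splits candidates into foreground and background by their best IoU with a ground-truth box before subsampling each side; a reduced NumPy sketch (the thresholds are illustrative, not the config defaults):

import numpy as np

def split_fg_bg(max_overlaps, pos_thresh=0.5, neg_hi=0.5, neg_lo=0.0):
    # Foreground: best IoU above the positive threshold.
    fg_inds = np.where(max_overlaps >= pos_thresh)[0]
    # Background: best IoU inside [neg_lo, neg_hi).
    bg_inds = np.where((max_overlaps < neg_hi) &
                       (max_overlaps >= neg_lo))[0]
    return fg_inds, bg_inds

ious = np.array([0.8, 0.3, 0.55, 0.1, 0.45])  # best IoU of five RoIs
fg, bg = split_fg_bg(ious)                    # fg -> [0, 2], bg -> [1, 3, 4]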
......@@ -20,97 +20,131 @@ import numpy as np
from seetadet.core.config import cfg
from seetadet.modeling.detector import new_detector
from seetadet.utils import blob as blob_util
from seetadet.utils import boxes as box_util
from seetadet.utils import image as image_util
from seetadet.utils import logger
from seetadet.utils import nms as nms_util
from seetadet.utils import time_util
from seetadet.utils.blob import im_list_to_blob
from seetadet.utils.image import scale_image
def im_detect(detector, raw_image):
"""Detect a image, with single or multiple scales."""
ims, ims_scale = scale_image(raw_image)
# Prepare blobs
data = im_list_to_blob(ims)
ims_info = np.array([list(data.shape[1:3]) + [im_scale]
for im_scale in ims_scale], dtype=np.float32)
# Do Forward
data = torch.from_numpy(data)
ims_info = torch.from_numpy(ims_info)
def get_data(raw_images):
"""Return the test data."""
max_size = cfg.TEST.MAX_SIZE
images_wide = []
image_shapes_wide, image_scales_wide = [], []
for img in raw_images:
images, image_scales = image_util.scale_image(
img, scales=cfg.TEST.SCALES, max_size=max_size)
images_wide += images
image_scales_wide += image_scales
image_shapes_wide += [img.shape[:2] for img in images]
images = blob_util.im_list_to_blob(
images_wide, coarsest_stride=cfg.MODEL.COARSEST_STRIDE)
image_shapes = np.array(image_shapes_wide)
image_scales = np.array(image_scales_wide).reshape((len(images), -1))
images_info = np.hstack([image_shapes, image_scales]).astype('float32')
return images, images_info
def ims_detect(detector, raw_images, timer=None):
"""Detect images at single or multiple scales."""
images, images_info = get_data(raw_images)
timer.tic() if timer else timer
# Do forward
inputs = {'image': torch.from_numpy(images),
'im_info': torch.from_numpy(images_info)}
if not hasattr(detector, 'script_forward'):
def script_forward(self, data, ims_info):
return self.forward({'data': data, 'ims_info': ims_info})
def script_forward(self, image, im_info):
return self.forward({'image': image, 'im_info': im_info})
detector.script_forward = torch.jit.trace(
func=types.MethodType(script_forward, detector),
example_inputs=[data, ims_info],
example_inputs=[inputs['image'], inputs['im_info']],
)
outputs = detector.script_forward(data, ims_info)
outputs = detector.script_forward(inputs['image'], inputs['im_info'])
outputs = dict((k, outputs[k].numpy()) for k in outputs.keys())
# Decode results
all_scores, all_boxes = [], []
pred_boxes = box_util.bbox_transform_inv(
batch_pred = box_util.bbox_transform_inv(
outputs['rois'][:, 1:5],
outputs['bbox_pred'],
cfg.BBOX_REG_WEIGHTS,
)
for i in range(len(ims)):
cfg.BBOX_REG_WEIGHTS)
results = [([], []) for _ in range(len(raw_images))]
for i in range(len(images)):
ii = i // len(cfg.TEST.SCALES)
inds = np.where(outputs['rois'][:, 0].astype(np.int32) == i)[0]
boxes = pred_boxes[inds] / ims_scale[i]
all_scores.append(outputs['cls_prob'][inds])
all_boxes.append(box_util.clip_tiled_boxes(boxes, raw_image.shape))
return np.vstack(all_scores), np.vstack(all_boxes)
def test_net(weights, num_classes, q_in, q_out, device):
num_classes, cfg.GPU_ID = num_classes, device
boxes = batch_pred[inds] / images_info[i][2]
boxes = box_util.clip_tiled_boxes(boxes, raw_images[ii].shape)
results[ii][0].append(outputs['cls_prob'][inds])
results[ii][1].append(boxes)
# Merge from multiple scales
ret = [(np.vstack(s), np.vstack(b)) for s, b in results]
timer.toc() if timer else timer
return ret
def test_net(weights, q_in, q_out, device, root_logger=True):
"""Test a network trained with Faster R-CNN algorithm."""
cfg.GPU_ID = device
num_classes = len(cfg.MODEL.CLASSES)
logger.set_root_logger(root_logger)
detector = new_detector(device, weights)
_t = time_util.new_timers('im_detect', 'misc')
must_stop = False
timers = time_util.new_timers('im_detect_bbox', 'misc')
empty_detections = np.zeros((0, 5), 'float32')
while True:
i, raw_image = q_in.get()
if i < 0:
if must_stop:
break
boxes_this_image = [[]]
with _t['im_detect'].tic_and_toc():
scores, boxes = im_detect(detector, raw_image)
_t['misc'].tic()
for j in range(1, num_classes):
inds = np.where(scores[:, j] > cfg.TEST.SCORE_THRESH)[0]
cls_scores = scores[inds, j]
cls_boxes = boxes[inds, j * 4:(j + 1) * 4]
cls_detections = np.hstack(
(cls_boxes, cls_scores[:, np.newaxis])
).astype(np.float32, copy=False)
if cfg.TEST.USE_SOFT_NMS:
keep = nms_util.soft_nms(
cls_detections,
thresh=cfg.TEST.NMS,
method=cfg.TEST.SOFT_NMS_METHOD,
sigma=cfg.TEST.SOFT_NMS_SIGMA,
)
else:
keep = nms_util.nms(
cls_detections,
thresh=cfg.TEST.NMS,
)
cls_detections = cls_detections[keep, :]
boxes_this_image.append(cls_detections)
_t['misc'].toc()
q_out.put((
i,
dict([('im_detect', _t['im_detect'].average_time),
('misc', _t['misc'].average_time)]),
dict([('boxes', boxes_this_image)]),
))
indices, raw_images = [], []
for _ in range(cfg.TEST.IMS_PER_BATCH):
i, raw_image = q_in.get()
if i < 0:
must_stop = True
break
indices.append(i)
raw_images.append(raw_image)
if len(raw_images) == 0:
continue
results = ims_detect(detector, raw_images, timers['im_detect_bbox'])
for i, (scores, boxes) in enumerate(results):
timers['misc'].tic()
boxes_this_image = [[]]
for j in range(1, num_classes):
inds = np.where(scores[:, j] > cfg.TEST.SCORE_THRESH)[0]
if len(inds) == 0:
boxes_this_image.append(empty_detections)
continue
cls_scores = scores[inds, j]
cls_boxes = boxes[inds, j * 4:(j + 1) * 4]
cls_detections = np.hstack(
(cls_boxes, cls_scores[:, np.newaxis])) \
.astype(np.float32, copy=False)
if cfg.TEST.USE_SOFT_NMS:
keep = nms_util.soft_nms(
cls_detections,
thresh=cfg.TEST.NMS,
method=cfg.TEST.SOFT_NMS_METHOD,
sigma=cfg.TEST.SOFT_NMS_SIGMA,
)
else:
keep = nms_util.nms(
cls_detections,
thresh=cfg.TEST.NMS,
)
cls_detections = cls_detections[keep, :]
boxes_this_image.append(cls_detections)
timers['misc'].toc()
q_out.put((
indices[i],
dict([('im_detect', timers['im_detect_bbox'].average_time),
('misc', timers['misc'].average_time)]),
dict([('boxes', boxes_this_image)]),
))
......@@ -19,43 +19,78 @@ import numpy as np
from seetadet.core.config import cfg
def generate_grid_anchors(grid_shapes, base_anchors, strides):
num_strides = len(strides)
if len(grid_shapes) != num_strides:
raise ValueError(
'Given %d grids for %d strides.'
% (len(grid_shapes), num_strides)
)
# Generate proposals from shifted anchors
def get_shifted_coords(shapes, base_anchors):
"""Return the x-y coordinates of shifted anchors."""
xs, ys = [], []
for i in range(len(shapes)):
height, width = shapes[i]
x, y = np.arange(0, width), np.arange(0, height)
x, y = np.meshgrid(x, y)
# Tile the K cell coords once per anchor
# to get shift coords laid out as (A, K)
xs.append(np.tile(x.flatten(), base_anchors[i].shape[0]))
ys.append(np.tile(y.flatten(), base_anchors[i].shape[0]))
return np.concatenate(xs), np.concatenate(ys)
def get_shifted_anchors(shapes, base_anchors, strides):
"""Return the shifted anchors on given shapes."""
anchors_to_pack = []
for i in range(len(grid_shapes)):
height, width = grid_shapes[i]
for i in range(len(shapes)):
height, width = shapes[i]
shift_x = np.arange(0, width) * strides[i]
shift_y = np.arange(0, height) * strides[i]
shift_x, shift_y = np.meshgrid(shift_x, shift_y)
shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
shift_x.ravel(), shift_y.ravel())).transpose()
# Add a anchors (1, a, 4) to
# cell k shifts (k, 1, 4) to get
# shift anchors (k, a, 4)
# Reshape to (k * a, 4) shifted anchors
# Add A anchors (A, 1, 4) to cell K shifts (1, K, 4)
# to get shift anchors (A, K, 4)
a = base_anchors[i].shape[0]
k = shifts.shape[0]
anchors = (base_anchors[i].reshape((1, a, 4)) +
shifts.reshape((1, k, 4)).transpose((1, 0, 2)))
if num_strides > 1:
# Transpose from (K, A, 4) to (A, K, 4)
# We will pack it with other strides to
# match the data format of (N, C, H, W)
anchors = anchors.transpose((1, 0, 2))
anchors = anchors.reshape((a * k, 4))
anchors_to_pack.append(anchors)
else:
# Original order of Faster R-CNN
return anchors.reshape((k * a, 4))
anchors = (base_anchors[i].reshape((a, 1, 4)) +
shifts.reshape((1, k, 4)))
anchors_to_pack.append(anchors.reshape((a * k, 4)))
return np.vstack(anchors_to_pack)
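A worked NumPy example of the broadcast in get_shifted_anchors, using a made-up 2x2 feature map, stride 16, and two illustrative base anchors:

import numpy as np

stride, (height, width) = 16, (2, 2)
base_anchors = np.array([[-8., -8., 8., 8.],
                         [-16., -16., 16., 16.]], 'float32')  # A = 2

shift_x = np.arange(0, width) * stride
shift_y = np.arange(0, height) * stride
shift_x, shift_y = np.meshgrid(shift_x, shift_y)
shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
                    shift_x.ravel(), shift_y.ravel())).transpose()  # K = 4 cells

a, k = base_anchors.shape[0], shifts.shape[0]
# (A, 1, 4) + (1, K, 4) broadcasts to (A, K, 4): every base anchor is
# replicated at every grid cell, then flattened to (A * K, 4) rows.
anchors = (base_anchors.reshape((a, 1, 4)) +
           shifts.reshape((1, k, 4))).reshape((a * k, 4))
assert anchors.shape == (8, 4)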
def narrow_anchors(
all_coords,
base_anchors,
max_shapes,
shapes,
inds,
remapping=None,
):
"""Return the valid shifted anchors on given shapes."""
x_coords, y_coords = all_coords
inds_wide, remapping_wide = [], []
offset = num = 0
for i in range(len(max_shapes)):
num += base_anchors[i].shape[0] * np.prod(max_shapes[i])
inds_inside = np.where((inds >= offset) & (inds < num))[0]
inds_wide.append(inds[inds_inside])
if remapping is not None:
remapping_wide.append(remapping[inds_inside])
offset = num
offset1 = offset2 = num1 = num2 = 0
for i in range(len(max_shapes)):
num1 += base_anchors[i].shape[0] * np.prod(max_shapes[i])
num2 += base_anchors[i].shape[0] * np.prod(shapes[i])
inds = inds_wide[i]
x, y = x_coords[inds], y_coords[inds]
a = ((inds - offset1) // max_shapes[i][1]) // max_shapes[i][0]
inds = (a * shapes[i][0] + y) * shapes[i][1] + x + offset2
inds_mask = np.where((x < shapes[i][1]) & (y < shapes[i][0]))[0]
inds_wide[i] = inds[inds_mask]
if remapping is not None:
remapping_wide[i] = remapping_wide[i][inds_mask]
offset1, offset2 = num1, num2
outputs = [np.concatenate(inds_wide)]
if remapping is not None:
outputs += [np.concatenate(remapping_wide)]
return outputs[0] if len(outputs) == 1 else outputs
def map_returns_to_blobs(returns, blobs, keys):
"""Map returns of image to blobs."""
for i, key in enumerate(keys):
......@@ -83,6 +118,5 @@ def map_blobs_by_levels(blobs, defaults, lvl_inds):
outputs[key].append(
blob[inds]
if len(inds) > 0
else defaults[key]
)
else defaults[key])
return outputs
......@@ -13,8 +13,11 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import multiprocessing as mp
import time
import threading
import queue
import dragon
import dragon.vm.torch as torch
......@@ -23,9 +26,8 @@ import numpy as np
from seetadet.algo.mask_rcnn import data_transformer
from seetadet.core.config import cfg
from seetadet.datasets.factory import get_dataset
from seetadet.utils import blob as blob_util
from seetadet.utils import logger
from seetadet.utils.blob import im_list_to_blob
from seetadet.utils.blob import mask_list_to_blob
class DataLoader(object):
......@@ -39,19 +41,19 @@ class DataLoader(object):
'source': dataset.source,
'classes': dataset.classes,
'shuffle': cfg.TRAIN.USE_SHUFFLE,
'num_chunks': cfg.TRAIN.SHUFFLE_CHUNKS,
'batch_size': cfg.TRAIN.IMS_PER_BATCH * 2,
'num_transformers': cfg.TRAIN.NUM_THREADS - 1,
})
self.iterator.start()
def __call__(self):
outputs = self.iterator.next()
if isinstance(outputs['data'], np.ndarray):
outputs['data'] = torch.from_numpy(outputs['data'])
if isinstance(outputs['image'], np.ndarray):
outputs['image'] = torch.from_numpy(outputs['image'])
return outputs
class Iterator(mp.Process):
class Iterator(threading.Thread):
"""Iterator to return the batch of data."""
def __init__(self, **kwargs):
......@@ -65,17 +67,16 @@ class Iterator(mp.Process):
rank = dragon.distributed.get_rank(process_group)
# Configuration
self._prefetch = kwargs.get('prefetch', 5)
self._batch_size = kwargs.get('batch_size', 2)
self._num_readers = kwargs.get('num_readers', 1)
self._num_transformers = kwargs.get('num_transformers', 3)
self.daemon = True
# Initialize queues
num_batches = self._prefetch * self._num_readers
self.q_in = mp.Queue(num_batches * self._batch_size)
self.q1_out = mp.Queue(num_batches * self._batch_size)
self.q2_out = mp.Queue(num_batches * self._batch_size)
num_batches = self._num_readers
self._queue1 = mp.Queue(num_batches * self._batch_size)
self._queue2 = mp.Queue(num_batches * self._batch_size)
self._queue3 = queue.Queue(num_batches)
# Initialize readers
self._readers = []
......@@ -86,7 +87,7 @@ class Iterator(mp.Process):
self._readers.append(dragon.io.DataReader(
part_idx=part_idx, num_parts=num_parts, **kwargs))
self._readers[i]._seed += part_idx
self._readers[i].q_out = self.q_in
self._readers[i].q_out = self._queue1
self._readers[i].start()
time.sleep(0.1)
......@@ -95,8 +96,7 @@ class Iterator(mp.Process):
for i in range(self._num_transformers):
p = data_transformer.DataTransformer(**kwargs)
p._seed += (i + rank * self._num_transformers)
p.q_in = self.q_in
p.q1_out, p.q2_out = self.q1_out, self.q2_out
p.q_in, p.q_out = self._queue1, self._queue2
p.start()
self._transformers.append(p)
time.sleep(0.1)
......@@ -119,38 +119,44 @@ class Iterator(mp.Process):
"""Return the next batch of data."""
return self.__next__()
def run(self):
"""Main loop."""
num_images = cfg.TRAIN.IMS_PER_BATCH
num_batches = cfg.TRAIN.ASPECT_GROUPING
logger.info('Initialize prefetching batches...')
example_buffer = [self._queue2.get()
for _ in range(num_images * num_batches)]
next_examples = []
while True:
# Use cached buffer for next N examples
# Examples are sorted to simulate aspect grouping
if len(next_examples) == 0:
next_examples = example_buffer
next_examples.sort(key=lambda d: d['aspect_ratio'])
example_buffer = []
# Prepare the next batch
outputs = collections.defaultdict(list)
for i in range(num_images):
example = next_examples.pop(0)
outputs['image'].append(example['image'])
outputs['gt_boxes'].append(example['boxes'])
outputs['gt_segms'].append(example['segms'])
outputs['im_info'].append(example['im_info'])
outputs['fg_inds'].append(example.get('fg_inds', None))
outputs['bg_inds'].append(example.get('bg_inds', None))
example_buffer.append(self._queue2.get())
outputs['image'] = blob_util.im_list_to_blob(
outputs['image'], coarsest_stride=cfg.MODEL.COARSEST_STRIDE)
# Send batch data to consumer
self._queue3.put(outputs)
def __iter__(self):
"""Return the iterator self."""
return self
def __next__(self):
"""Return the next batch of data."""
q_out = None
# Two queues to implement aspect-grouping
# This is necessary to reduce the GPU memory
# wasted on fetching a huge square batch blob
while q_out is None:
if self.q1_out.qsize() >= cfg.TRAIN.IMS_PER_BATCH:
q_out = self.q1_out
elif self.q2_out.qsize() >= cfg.TRAIN.IMS_PER_BATCH:
q_out = self.q2_out
self.q1_out, self.q2_out = self.q2_out, self.q1_out
images, images_info = [], []
boxes_to_pack, masks_to_pack = [], []
for i in range(cfg.TRAIN.IMS_PER_BATCH):
image, image_scale, boxes, masks = q_out.get()
images.append(image)
images_info.append(list(image.shape[:2]) + [image_scale])
gt_boxes = np.zeros((boxes.shape[0], boxes.shape[1] + 1), 'float32')
gt_boxes[:, :boxes.shape[1]], gt_boxes[:, -1] = boxes, i
boxes_to_pack.append(gt_boxes)
masks_to_pack.append(masks)
return {
'data': im_list_to_blob(images),
'ims_info': np.array(images_info, 'float32'),
'gt_boxes': np.concatenate(boxes_to_pack),
'gt_masks': mask_list_to_blob(masks_to_pack),
}
return self._queue3.get()
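# --- Illustrative sketch (not part of the diff) ---
# A minimal sketch of the aspect-grouping strategy used by Iterator.run():
# buffer IMS_PER_BATCH * ASPECT_GROUPING examples, sort them by aspect ratio,
# and emit consecutive slices so each batch pads to a similar blob shape.
# The numbers below are illustrative, not read from cfg.
import numpy as np

ims_per_batch, num_groups = 2, 4
buffer = [{'aspect_ratio': float(np.random.uniform(0.5, 2.0))}
          for _ in range(ims_per_batch * num_groups)]
buffer.sort(key=lambda d: d['aspect_ratio'])
batches = [buffer[i:i + ims_per_batch]
           for i in range(0, len(buffer), ims_per_batch)]
for batch in batches:
    print([round(d['aspect_ratio'], 2) for d in batch])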
......@@ -15,134 +15,136 @@ from __future__ import print_function
import multiprocessing
import cv2
import numpy as np
import numpy.random as npr
from seetadet.algo import common as algo_common
from seetadet.core.config import cfg
from seetadet.datasets.example import Example
from seetadet.pycocotools import mask_utils
from seetadet.utils.pycocotools import mask_utils
from seetadet.utils import boxes as box_util
from seetadet.utils.blob import prep_im_for_blob
from seetadet.utils import image as image_util
class DataTransformer(multiprocessing.Process):
def __init__(self, **kwargs):
super(DataTransformer, self).__init__()
self._scales = cfg.TRAIN.SCALES
self._random_scales = cfg.TRAIN.RANDOM_SCALES
self._max_size = cfg.TRAIN.MAX_SIZE
self._seed = cfg.RNG_SEED
self._use_flipped = cfg.TRAIN.USE_FLIPPED
self._use_diff = cfg.TRAIN.USE_DIFF
self._use_flipped = cfg.TRAIN.USE_FLIPPED
self._use_distort = cfg.TRAIN.USE_COLOR_JITTER
self._classes = kwargs.get('classes', ('__background__',))
self._num_classes = len(self._classes)
self._class_to_ind = dict(zip(self._classes, range(self._num_classes)))
self.q_in = self.q1_out = self.q2_out = None
self._anchor_sampler = algo_common.AnchorSampler()
self.q_in = self.q_out = None
self.daemon = True
def make_roi_dict(self, example, im_scale, apply_flip=False):
objects, n_objects = example.objects, 0
def get_boxes_and_segms(self, example, im_scale, flipped):
objects, num_objects = example.objects, 0
height, width = example.height, example.width
if not self._use_diff:
for obj in objects:
if obj.get('difficult', 0) == 0:
n_objects += 1
num_objects += 1
else:
n_objects = len(objects)
num_objects = len(objects)
roi_dict = {
'boxes': np.zeros((n_objects, 4), 'float32'),
'masks': np.empty((n_objects, height, width), 'uint8'),
'gt_classes': np.zeros((n_objects, 1), 'int32'),
'mask_flags': np.ones((n_objects, 1), 'float32'),
}
boxes, segms = np.zeros((num_objects, 4), 'float32'), []
gt_classes = np.zeros((num_objects,), 'float32')
segm_flags = np.ones((num_objects,), 'float32')
# Filter the difficult instances
# Filter the difficult instances.
object_idx = 0
for obj in objects:
if not self._use_diff and \
obj.get('difficult', 0) > 0:
if not self._use_diff and obj.get('difficult', 0) > 0:
continue
bbox, mask = obj['bbox'], obj['mask']
roi_dict['boxes'][object_idx, :] = [
max(0, bbox[0]),
max(0, bbox[1]),
min(bbox[2], width - 1),
min(bbox[3], height - 1),
]
if mask is not None:
roi_dict['masks'][object_idx] = (
mask_utils.bytes2img(
obj['mask'],
height,
width,
))
bbox = obj['bbox']
boxes[object_idx, :] = [max(0, bbox[0]),
max(0, bbox[1]),
min(bbox[2], width - 1),
min(bbox[3], height - 1)]
if 'mask' in obj:
mask_img = mask_utils.bytes2img(obj['mask'], height, width)
segms.append(mask_img[:, ::-1] if flipped else mask_img)
elif 'polygons' in obj:
polygons = obj['polygons']
segms.append(box_util.flip_polygons(
polygons, width) if flipped else polygons)
else:
roi_dict['mask_flags'][object_idx] = 0.
roi_dict['gt_classes'][object_idx] = \
self._class_to_ind[obj['name']]
segms.append(None)
segm_flags[object_idx] = 0.
gt_classes[object_idx] = self._class_to_ind[obj['name']]
object_idx += 1
# Flip the boxes if necessary
if apply_flip:
roi_dict['boxes'] = \
box_util.flip_boxes(
roi_dict['boxes'],
width,
)
# Scale the boxes to the detecting scale.
boxes *= im_scale
# Scale the boxes to the detecting scale
roi_dict['boxes'] *= im_scale
# Attach the classes and mask flags.
gt_boxes = np.empty((num_objects, 6), dtype=np.float32)
gt_boxes[:, :4], gt_boxes[:, 4] = boxes, gt_classes
gt_boxes[:, 5] = segm_flags # Has segmentation or not.
return roi_dict
return gt_boxes, segms
def get(self, example):
example = Example(example)
img = example.image
# Scale
target_size = self._scales[np.random.randint(len(self._scales))]
img, im_scale = prep_im_for_blob(img, target_size, self._max_size)
# Flip
apply_flip = False
if self._use_flipped:
if np.random.randint(2) > 0:
img = img[:, ::-1]
apply_flip = True
# Example -> RoIDict
roi_dict = self.make_roi_dict(example, im_scale, apply_flip)
# Post-Process for gt boxes
# Shape like: [num_objects, {x1, y1, x2, y2, cls, flag}]
gt_boxes = \
np.concatenate([
roi_dict['boxes'],
roi_dict['gt_classes'],
roi_dict['mask_flags']
], axis=1)
# Post-Process for gt masks
# Shape like: [num_objects, im_h, im_w]
if gt_boxes.shape[0] > 0:
gt_masks = roi_dict['masks']
if apply_flip:
gt_masks = gt_masks[:, :, ::-1]
else:
gt_masks = None
return img, im_scale, gt_boxes, gt_masks
# Resize.
img, im_scale = image_util.resize_image_with_target_size(
example.image,
target_size=npr.choice(self._scales),
max_size=self._max_size,
random_scales=self._random_scales,
)
# Flip.
flipped = False
if self._use_flipped and npr.randint(2) > 0:
img = img[:, ::-1]
flipped = True
# Distort.
if self._use_distort:
img = image_util.distort_image(img)
# Boxes and segmentations.
boxes, segms = self.get_boxes_and_segms(example, im_scale, flipped)
# Flip the boxes if necessary.
if flipped:
boxes = box_util.flip_boxes(boxes, img.shape[1])
# Standard outputs.
outputs = {'image': img,
'boxes': boxes,
'segms': segms,
'im_info': img.shape[:2] + (im_scale,)}
# Attach precomputed targets.
if len(boxes) > 0:
outputs.update(
self._anchor_sampler(
gt_boxes=boxes,
im_info=outputs['im_info']))
return outputs
def run(self):
# Fix the process-local random seed
# Disable the opencv threading.
cv2.setNumThreads(1)
# Fix the process-local random seed.
np.random.seed(self._seed)
# Main prefetch loop
while True:
outputs = self.get(self.q_in.get())
if len(outputs[2]) < 1:
continue # Ignore the non-object image
aspect_ratio = float(outputs[0].shape[0]) / outputs[0].shape[1]
if aspect_ratio > 1.:
self.q1_out.put(outputs)
else:
self.q2_out.put(outputs)
if len(outputs['boxes']) < 1:
continue # Ignore non-object image.
height, width = outputs['image'].shape[:2]
outputs['aspect_ratio'] = float(height) / float(width)
self.q_out.put(outputs)
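# --- Illustrative sketch (not part of the diff) ---
# A minimal stand-in for the horizontal box flipping used by the transformer
# (assuming [x1, y1, x2, y2] boxes); it is not the repository's
# seetadet.utils.boxes.flip_boxes itself.
import numpy as np

def flip_boxes(boxes, width):
    flipped = boxes.copy()
    flipped[:, 0] = width - boxes[:, 2] - 1
    flipped[:, 2] = width - boxes[:, 0] - 1
    return flipped

boxes = np.array([[10., 20., 50., 80.]])
print(flip_boxes(boxes, width=100))  # [[49. 20. 89. 80.]]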
......@@ -31,7 +31,7 @@ class ProposalTarget(object):
def __init__(self):
super(ProposalTarget, self).__init__()
self.resolution = cfg.MRCNN.RESOLUTION
self.num_classes = cfg.MODEL.NUM_CLASSES
self.num_classes = len(cfg.MODEL.CLASSES)
self.defaults = collections.OrderedDict([
('rois', np.array([[-1, 0, 0, 1, 1]], 'float32')),
('labels', np.array([-1], 'int64')),
......@@ -39,18 +39,10 @@ class ProposalTarget(object):
('mask_targets', -np.ones((1, self.resolution, self.resolution), 'float32')),
])
def __call__(self, rpn_rois, gt_boxes, gt_masks, ims_info):
def __call__(self, **inputs):
num_images = cfg.TRAIN.IMS_PER_BATCH
# Proposal ROIs (0, x1, y1, x2, y2) coming from RPN
all_rois = rpn_rois
# GT boxes (x1, y1, x2, y2, label)
# GT masks (num_objects, im_h, im_w)
gt_boxes_wide, gt_masks_wide = \
mask_util.dismantle_masks(
gt_boxes,
gt_masks,
num_images,
)
all_rois = inputs['rois']
# Prepare for the outputs
keys = self.defaults.keys()
......@@ -58,24 +50,25 @@ class ProposalTarget(object):
# Generate targets separately
for ix in range(num_images):
gt_boxes = gt_boxes_wide[ix]
gt_masks = gt_masks_wide[ix]
# GT boxes (x1, y1, x2, y2, label)
gt_boxes = inputs['gt_boxes'][ix]
gt_segms = inputs['gt_segms'][ix]
# Extract proposals for this image
rois = all_rois[np.where(all_rois[:, 0].astype('int32') == ix)[0]]
# Include ground-truth boxes in the set of candidate rois
inds = np.ones((gt_boxes.shape[0], 1), gt_boxes.dtype) * ix
rois = np.vstack((rois, np.hstack((inds, gt_boxes[:, :4]))))
# Sample a batch of RoIs for training
rois_per_image = cfg.TRAIN.BATCH_SIZE
fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image)
rois_per_image = cfg.FRCNN.BATCH_SIZE
fg_rois_per_image = np.round(cfg.FRCNN.FG_FRACTION * rois_per_image)
rcnn_util.map_returns_to_blobs(
sample_rois(
rois,
gt_boxes,
gt_masks,
gt_segms,
rois_per_image,
fg_rois_per_image,
ims_info[ix][2],
inputs['im_info'][ix][2],
), blobs, keys,
)
......@@ -122,10 +115,10 @@ class ProposalTarget(object):
'rois': [new_tensor(rois_wide[i]) for i in range(num_levels)],
'mask_rois': [new_tensor(mask_rois_wide[i]) for i in range(num_levels)],
'labels': new_tensor(blobs['labels']),
'bbox_indices': new_tensor(bbox_cls_inds[fg_inds] + blobs['labels'][fg_inds]),
'bbox_inds': new_tensor(bbox_cls_inds[fg_inds] + blobs['labels'][fg_inds]),
'bbox_targets': new_tensor(blobs['bbox_targets'][fg_inds].astype('float32')),
'bbox_anchors': new_tensor(blobs['rois'][fg_inds, 1:].astype('float32')),
'mask_indices': new_tensor(mask_cls_inds + mask_labels),
'mask_inds': new_tensor(mask_cls_inds + mask_labels),
'mask_targets': new_tensor(blobs['mask_targets']),
}
......@@ -134,7 +127,7 @@ def compute_targets(
ex_rois,
gt_rois,
gt_labels,
gt_masks,
gt_segms,
mask_flags,
mask_size,
im_scale,
......@@ -150,29 +143,25 @@ def compute_targets(
# Compute mask classification targets
mask_shape = [mask_size] * 2
ex_rois_ori = np.round(ex_rois / im_scale).astype(int)
gt_rois_ori = np.round(gt_rois / im_scale).astype(int)
mask_targets = -np.ones([len(gt_labels)] + mask_shape, 'float32')
for i in fg_inds:
if mask_flags[i] > 0:
box_mask = \
mask_util.intersect_box_mask(
ex_rois_ori[i],
gt_rois_ori[i],
gt_masks[i],
)
if box_mask is not None:
mask_targets[i] = \
mask_util.resize_mask(
mask=box_mask,
size=mask_shape,
)
if isinstance(gt_segms[i], list):
ret = mask_util.warp_mask_via_polygons(
gt_segms[i], ex_rois_ori[i], mask_shape)
else:
gt_rois_ori = np.round(gt_rois / im_scale).astype(int)
ret = mask_util.warp_mask_via_intersection(
gt_segms[i], ex_rois_ori[i], gt_rois_ori[i], mask_shape)
if ret is not None:
mask_targets[i] = ret.astype('float32')
return bbox_targets, mask_targets
def sample_rois(
all_rois,
gt_boxes,
gt_masks,
gt_segms,
num_rois,
num_fg_rois,
im_scale,
......@@ -184,15 +173,15 @@ def sample_rois(
labels = gt_boxes[gt_assignment, 4].astype('int64')
# Select foreground RoIs as those with >= FG_THRESH overlap
fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0]
fg_inds = np.where(max_overlaps >= cfg.FRCNN.POSITIVE_OVERLAP)[0]
fg_rois_per_this_image = int(min(num_fg_rois, fg_inds.size))
# Sample foreground regions without replacement
if fg_inds.size > 0:
fg_inds = npr.choice(fg_inds, fg_rois_per_this_image, False)
# Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI) &
(max_overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]
bg_inds = np.where((max_overlaps < cfg.FRCNN.NEGATIVE_OVERLAP_HI) &
(max_overlaps >= cfg.FRCNN.NEGATIVE_OVERLAP_LO))[0]
# Compute number of background RoIs to take from this image
bg_rois_per_this_image = num_rois - fg_rois_per_this_image
bg_rois_per_this_image = min(bg_rois_per_this_image, bg_inds.size)
......@@ -213,7 +202,7 @@ def sample_rois(
rois[:, 1:5],
gt_boxes[gt_assignment[keep_inds], :4],
labels,
gt_masks[gt_assignment[fg_inds]],
[gt_segms[i] for i in gt_assignment[fg_inds]],
gt_boxes[gt_assignment[fg_inds], 5],
cfg.MRCNN.RESOLUTION,
im_scale,
......
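# --- Illustrative sketch (not part of the diff) ---
# A minimal sketch of the foreground/background RoI sampling policy above:
# foregrounds are RoIs with IoU >= the positive threshold, backgrounds fall
# in [negative_lo, negative_hi), and both are subsampled to a fixed batch.
# The thresholds and sizes below are illustrative, not read from cfg.
import numpy as np
import numpy.random as npr

max_overlaps = np.array([0.9, 0.7, 0.4, 0.2, 0.05], 'float32')
rois_per_image, fg_fraction = 4, 0.25
fg_rois = int(np.round(fg_fraction * rois_per_image))
fg_inds = np.where(max_overlaps >= 0.5)[0]
fg_inds = npr.choice(fg_inds, min(fg_rois, fg_inds.size), replace=False)
bg_inds = np.where((max_overlaps < 0.5) & (max_overlaps >= 0.1))[0]
bg_inds = npr.choice(bg_inds, min(rois_per_image - fg_inds.size, bg_inds.size),
                     replace=False)
keep_inds = np.append(fg_inds, bg_inds)
print(keep_inds)  # sampled foreground followed by background indices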
......@@ -22,59 +22,71 @@ from seetadet.algo.faster_rcnn import utils as rcnn_util
from seetadet.core.config import cfg
from seetadet.modeling.detector import new_detector
from seetadet.utils import env
from seetadet.utils import blob as blob_util
from seetadet.utils import boxes as box_util
from seetadet.utils import image as image_util
from seetadet.utils import logger
from seetadet.utils import nms as nms_util
from seetadet.utils import time_util
from seetadet.utils import boxes as box_util
from seetadet.utils.blob import im_list_to_blob
from seetadet.utils.image import scale_image
def im_detect(detector, raw_image):
def get_data(raw_images):
"""Return the test data."""
max_size = cfg.TEST.MAX_SIZE
images_wide = []
image_shapes_wide, image_scales_wide = [], []
for img in raw_images:
images, image_scales = image_util.scale_image(
img, scales=cfg.TEST.SCALES, max_size=max_size)
images_wide += images
image_scales_wide += image_scales
image_shapes_wide += [img.shape[:2] for img in images]
images = blob_util.im_list_to_blob(
images_wide, coarsest_stride=cfg.MODEL.COARSEST_STRIDE)
image_shapes = np.array(image_shapes_wide)
image_scales = np.array(image_scales_wide).reshape((len(images), -1))
images_info = np.hstack([image_shapes, image_scales]).astype('float32')
return images, images_info
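# --- Illustrative sketch (not part of the diff) ---
# A minimal sketch of how the (height, width, scale) rows of ``images_info``
# are assembled from per-image shapes and scales; the values are illustrative.
import numpy as np

image_shapes = np.array([[600, 800], [600, 1000]], dtype='float32')
image_scales = np.array([[1.2], [1.0]], dtype='float32')
images_info = np.hstack([image_shapes, image_scales]).astype('float32')
print(images_info)  # [[ 600.  800.  1.2] [ 600. 1000.  1.0]]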
def ims_detect(detector, raw_images, timer=None):
"""Detect a image, with single or multiple scales."""
ims, ims_scale = scale_image(raw_image)
# Prepare blobs
data = im_list_to_blob(ims)
ims_info = np.array([list(data.shape[1:3]) + [im_scale]
for im_scale in ims_scale], dtype=np.float32)
# Do Forward
data = torch.from_numpy(data)
ims_info = torch.from_numpy(ims_info)
images, images_info = get_data(raw_images)
timer.tic() if timer else timer
# Do forward
inputs = {'image': torch.from_numpy(images),
'im_info': torch.from_numpy(images_info)}
if not hasattr(detector, 'script_forward'):
def script_forward(self, data, ims_info):
return self.forward({'data': data, 'ims_info': ims_info})
def script_forward(self, image, im_info):
return self.forward({'image': image, 'im_info': im_info})
detector.script_forward = torch.jit.trace(
func=types.MethodType(script_forward, detector),
example_inputs=[data, ims_info],
example_inputs=[inputs['image'], inputs['im_info']],
)
outputs = detector.script_forward(data, ims_info)
outputs = detector.script_forward(inputs['image'], inputs['im_info'])
outputs = dict((k, outputs[k].numpy()) for k in outputs.keys())
# Decode results
all_scores, all_boxes, batch_inds = [], [], []
pred_boxes = box_util.bbox_transform_inv(
batch_pred = box_util.bbox_transform_inv(
outputs['rois'][:, 1:5],
outputs['bbox_pred'],
cfg.BBOX_REG_WEIGHTS,
)
for i in range(len(ims)):
cfg.BBOX_REG_WEIGHTS)
results = [([], [], []) for _ in range(len(raw_images))]
for i in range(len(images)):
ii = i // len(cfg.TEST.SCALES)
inds = np.where(outputs['rois'][:, 0].astype(np.int32) == i)[0]
boxes = pred_boxes[inds] / ims_scale[i]
all_scores.append(outputs['cls_prob'][inds])
all_boxes.append(box_util.clip_tiled_boxes(boxes, raw_image.shape))
batch_inds.append(np.ones((len(inds), 1), 'int32') * i)
boxes = batch_pred[inds] / images_info[i, 2]
boxes = box_util.clip_tiled_boxes(boxes, raw_images[ii].shape)
results[ii][0].append(outputs['cls_prob'][inds])
results[ii][1].append(boxes)
results[ii][2].append(np.ones((len(inds), 1), 'int32') * i)
return (
np.vstack(all_scores),
np.vstack(all_boxes),
np.vstack(batch_inds),
np.array(ims_scale, 'float64'),
)
# Merge from multiple scales
ret = [(np.vstack(s), np.vstack(b),
np.vstack(i), images_info[:, 2]) for s, b, i in results]
timer.toc() if timer else timer
return ret
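# --- Illustrative sketch (not part of the diff) ---
# A minimal sketch of the multi-scale merge above: outputs from the
# len(TEST.SCALES) scaled copies of each image are regrouped by
# ``i // num_scales`` and stacked. Shapes and values are illustrative.
import numpy as np

num_images, num_scales = 2, 2
per_scale_scores = [np.random.rand(3, 5) for _ in range(num_images * num_scales)]
results = [[] for _ in range(num_images)]
for i, scores in enumerate(per_scale_scores):
    results[i // num_scales].append(scores)
merged = [np.vstack(r) for r in results]
print([m.shape for m in merged])  # [(6, 5), (6, 5)]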
def mask_detect(detector, rois):
......@@ -106,74 +118,92 @@ def mask_detect(detector, rois):
return detector.rcnn.sigmoid(mask_pred).numpy().copy()
def test_net(weights, num_classes, q_in, q_out, device):
num_classes, cfg.GPU_ID = num_classes, device
def test_net(weights, q_in, q_out, device, root_logger=True):
"""Test a network trained with Mask R-CNN algorithm."""
cfg.GPU_ID = device
num_classes = len(cfg.MODEL.CLASSES)
logger.set_root_logger(root_logger)
detector = new_detector(device, weights)
_t = time_util.new_timers('im_detect', 'mask_detect', 'misc')
must_stop = False
timers = time_util.new_timers('im_detect_bbox', 'im_detect_mask', 'misc')
empty_detections = np.zeros((0, 5), 'float32')
empty_rois = np.zeros((0, 6), 'float32')
while True:
i, raw_image = q_in.get()
if i < 0:
if must_stop:
break
rois_this_image = []
boxes_this_image = [[]]
masks_this_image = [[]]
with _t['im_detect'].tic_and_toc():
scores, boxes, batch_inds, ims_scale = \
im_detect(detector, raw_image)
_t['misc'].tic()
for j in range(1, num_classes):
inds = np.where(scores[:, j] > cfg.TEST.SCORE_THRESH)[0]
cls_scores = scores[inds, j]
cls_boxes = boxes[inds, j * 4:(j + 1) * 4]
cls_batch_inds = batch_inds[inds]
cls_detections = np.hstack(
(cls_boxes, cls_scores[:, np.newaxis])
).astype(np.float32, copy=False)
if cfg.TEST.USE_SOFT_NMS:
keep = nms_util.soft_nms(
cls_detections,
thresh=cfg.TEST.NMS,
method=cfg.TEST.SOFT_NMS_METHOD,
sigma=cfg.TEST.SOFT_NMS_SIGMA,
)
else:
keep = nms_util.nms(
cls_detections,
thresh=cfg.TEST.NMS,
)
cls_detections = cls_detections[keep, :]
cls_batch_inds = cls_batch_inds[keep]
boxes_this_image.append(cls_detections)
rois_this_image.append(
np.hstack((
cls_batch_inds,
cls_detections[:, :4] * ims_scale[cls_batch_inds],
np.ones((len(keep), 1)) * (j - 1),
)))
mask_rois = np.concatenate(rois_this_image)
_t['misc'].toc()
if len(mask_rois) > 0:
k = 0
_t['mask_detect'].tic()
mask_pred = mask_detect(detector, mask_rois)
indices, raw_images = [], []
for _ in range(cfg.TEST.IMS_PER_BATCH):
i, raw_image = q_in.get()
if i < 0:
must_stop = True
break
indices.append(i)
raw_images.append(raw_image)
if len(raw_images) == 0:
continue
results = ims_detect(detector, raw_images, timers['im_detect_bbox'])
for i, (scores, boxes, batch_inds, im_scales) in enumerate(results):
timers['misc'].tic()
rois_this_image = []
boxes_this_image = [[]]
masks_this_image = [[]]
for j in range(1, num_classes):
num_pred = len(boxes_this_image[j])
cls_masks = mask_pred[k:k + num_pred]
masks_this_image.append(cls_masks)
k += num_pred
_t['mask_detect'].toc()
q_out.put((
i,
dict([('im_detect', _t['im_detect'].average_time),
('mask_detect', _t['mask_detect'].average_time),
('misc', _t['misc'].average_time)]),
dict([('boxes', boxes_this_image),
('masks', masks_this_image)]),
))
inds = np.where(scores[:, j] > cfg.TEST.SCORE_THRESH)[0]
if len(inds) == 0:
boxes_this_image.append(empty_detections)
rois_this_image.append(empty_rois)
continue
cls_scores = scores[inds, j]
cls_boxes = boxes[inds, j * 4:(j + 1) * 4]
cls_batch_inds = batch_inds[inds]
cls_detections = np.hstack(
(cls_boxes, cls_scores[:, np.newaxis])) \
.astype(np.float32, copy=False)
if cfg.TEST.USE_SOFT_NMS:
keep = nms_util.soft_nms(
cls_detections,
thresh=cfg.TEST.NMS,
method=cfg.TEST.SOFT_NMS_METHOD,
sigma=cfg.TEST.SOFT_NMS_SIGMA,
)
else:
keep = nms_util.nms(
cls_detections,
thresh=cfg.TEST.NMS,
)
cls_detections = cls_detections[keep, :]
cls_batch_inds = cls_batch_inds[keep]
boxes_this_image.append(cls_detections)
rois_this_image.append(
np.hstack((
cls_batch_inds,
cls_detections[:, :4] * im_scales[cls_batch_inds],
np.ones((len(keep), 1)) * (j - 1),
)))
mask_rois = np.concatenate(rois_this_image)
timers['misc'].toc()
if len(mask_rois) > 0:
k = 0
timers['im_detect_mask'].tic()
mask_pred = mask_detect(detector, mask_rois)
for j in range(1, num_classes):
num_pred = len(boxes_this_image[j])
cls_masks = mask_pred[k:k + num_pred]
masks_this_image.append(cls_masks)
k += num_pred
timers['im_detect_mask'].toc()
q_out.put((
indices[i],
dict([('im_detect', (timers['im_detect_bbox'].average_time +
timers['im_detect_mask'].average_time)),
('misc', timers['misc'].average_time)]),
dict([('boxes', boxes_this_image),
('masks', masks_this_image)]),
))
......@@ -13,13 +13,15 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import math
import numpy as np
from seetadet.algo.faster_rcnn import generate_anchors as anchor_util
from seetadet.algo.faster_rcnn import utils as rcnn_util
from seetadet.core.config import cfg
from seetadet.algo.faster_rcnn.generate_anchors import generate_anchors_v2
from seetadet.algo.faster_rcnn.utils import generate_grid_anchors
from seetadet.utils import boxes as box_util
from seetadet.utils import logger
from seetadet.utils.env import new_tensor
......@@ -41,95 +43,113 @@ class AnchorTarget(object):
(2 ** (octave / float(scales_per_octave)))
for octave in range(scales_per_octave)]
self.base_anchors.append(
generate_anchors_v2(
anchor_util.generate_anchors_v2(
stride=stride,
ratios=self.ratios,
sizes=sizes,
))
# Store the cached grid anchors
self.last_grid_shapes = None
self.last_grid_anchors = None
sizes=sizes))
# Plan the maximum anchor layout
max_size = cfg.TRAIN.MAX_SIZE
if max_size == 0:
max_size = cfg.TRAIN.SCALES[0]
if cfg.MODEL.COARSEST_STRIDE > 0:
stride = float(cfg.MODEL.COARSEST_STRIDE)
max_size = int(math.ceil(max_size / stride) * stride)
self.max_shapes = [[math.ceil(max_size / stride)] * 2
for stride in self.strides]
self.all_coords = rcnn_util.get_shifted_coords(
self.max_shapes, self.base_anchors)
self.all_anchors = rcnn_util.get_shifted_anchors(
self.max_shapes, self.base_anchors, self.strides)
def sample_anchors(self, gt_boxes, im_info, all_anchors=None):
all_anchors = self.all_anchors \
if all_anchors is None else all_anchors
# Remove anchors separating from the image
inds_inside = np.where((all_anchors[:, 0] < im_info[1]) &
(all_anchors[:, 1] < im_info[0]))[0]
anchors = all_anchors[inds_inside, :]
num_inside = len(anchors)
labels = np.empty((num_inside,), dtype='int32')
labels.fill(-1)
# Overlaps between the anchors and the gt boxes.
overlaps = box_util.bbox_overlaps(anchors, gt_boxes)
argmax_overlaps = overlaps.argmax(axis=1)
max_overlaps = overlaps[np.arange(num_inside), argmax_overlaps]
# Foreground: for each gt, anchor with highest overlap.
gt_argmax_overlaps = overlaps.argmax(axis=0)
gt_max_overlaps = overlaps[gt_argmax_overlaps, np.arange(overlaps.shape[1])]
gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]
gt_assignment = argmax_overlaps[gt_argmax_overlaps]
labels[gt_argmax_overlaps] = gt_boxes[gt_assignment, 4]
# Foreground: above threshold IoU.
inds = max_overlaps >= cfg.RETINANET.POSITIVE_OVERLAP
gt_assignment = argmax_overlaps[inds]
labels[inds] = gt_boxes[gt_assignment, 4]
# Background: below threshold IoU.
labels[max_overlaps < cfg.RETINANET.NEGATIVE_OVERLAP] = 0
# Retract the background clamping if no foreground anchors remain.
fg_inds = np.where(labels > 0)[0]
if len(fg_inds) == 0:
gt_assignment = argmax_overlaps[gt_argmax_overlaps]
labels[gt_argmax_overlaps] = gt_boxes[gt_assignment, 4]
fg_inds = np.where(labels > 0)[0]
# Select ignore labels to avoid too many negatives
# (~100x faster for 200 background indices)
ignore_inds = np.where(labels < 0)[0]
return inds_inside[fg_inds], inds_inside[ignore_inds]
def __call__(self, features, gt_boxes):
def __call__(self, **inputs):
num_images = cfg.TRAIN.IMS_PER_BATCH
gt_boxes_wide = box_util.dismantle_boxes(gt_boxes, num_images)
if len(gt_boxes_wide) != num_images:
logger.fatal(
'Input {} images, got {} slices of gt boxes.'
.format(num_images, len(gt_boxes_wide))
)
# Generate grid anchors from base
grid_shapes = [f.shape[-2:] for f in features]
if grid_shapes == self.last_grid_shapes:
all_anchors = self.last_grid_anchors
else:
self.last_grid_shapes = grid_shapes
self.last_grid_anchors = all_anchors = \
generate_grid_anchors(
grid_shapes,
self.base_anchors,
self.strides,
)
num_anchors = all_anchors.shape[0]
shapes = [f.shape[-2:] for f in inputs['features']]
image_stride = sum(self.base_anchors[i].shape[0] * np.prod(shapes[i])
for i in range(len(inputs['features'])))
# Label: ``1`` is positive, ``0`` is negative, ``-1`` is don't care
labels_wide = -np.ones((num_images, num_anchors,), 'int64')
bbox_indices_wide, bbox_anchors_wide, bbox_targets_wide = [], [], []
narrow_args = [self.all_coords, self.base_anchors, self.max_shapes, shapes]
outputs = collections.defaultdict(list)
# Different from R-CNN, all anchors will be used
inds_inside, anchors = np.arange(num_anchors), all_anchors
num_inside = len(inds_inside)
# Label: ``1`` is positive, ``0`` is negative, ``-1`` is don't care
output_labels = np.zeros((num_images, image_stride,), 'int64')
for ix in range(num_images):
# GT boxes (x1, y1, x2, y2, label)
gt_boxes = gt_boxes_wide[ix]
# label: 1 is positive, 0 is negative, -1 is don't care
labels = np.empty((num_inside,), dtype='int64')
labels.fill(-1)
# Overlaps between the anchors and the gt boxes
overlaps = box_util.bbox_overlaps(anchors, gt_boxes)
argmax_overlaps = overlaps.argmax(1)
max_overlaps = overlaps[np.arange(num_inside), argmax_overlaps]
# Foreground: for each gt, anchor with highest overlap
gt_argmax_overlaps = overlaps.argmax(0)
gt_max_overlaps = overlaps[gt_argmax_overlaps, np.arange(overlaps.shape[1])]
gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]
gt_inds = argmax_overlaps[gt_argmax_overlaps]
labels[gt_argmax_overlaps] = gt_boxes[gt_inds, 4]
# Foreground: above threshold IoU
inds = max_overlaps >= cfg.RETINANET.POSITIVE_OVERLAP
gt_inds = argmax_overlaps[inds]
labels[inds] = gt_boxes[gt_inds, 4]
fg_inds = np.where(labels > 0)[0]
# Background: below threshold IoU
labels[max_overlaps < cfg.RETINANET.NEGATIVE_OVERLAP] = 0
# Retract the background clamping if no foreground anchors remain
if len(fg_inds) == 0:
gt_inds = argmax_overlaps[gt_argmax_overlaps]
labels[gt_argmax_overlaps] = gt_boxes[gt_inds, 4]
fg_inds = np.where(labels > 0)[0]
labels_wide[ix, inds_inside] = labels
bbox_anchors_wide.append(anchors[fg_inds])
bbox_indices_wide.append(fg_inds + (num_anchors * ix))
bbox_targets_wide.append(
box_util.bbox_transform(
anchors[fg_inds],
gt_boxes[argmax_overlaps[fg_inds], :4],
)
)
fg_inds = inputs['fg_inds'][ix]
ignore_inds = inputs['bg_inds'][ix]
gt_boxes = inputs['gt_boxes'][ix]
# Narrow anchors to match the feature layout
anchors = self.all_anchors[fg_inds]
ignore_inds = rcnn_util.narrow_anchors(*(narrow_args + [ignore_inds]))
_, anchors = rcnn_util.narrow_anchors(*(narrow_args + [fg_inds, anchors]))
fg_inds = rcnn_util.narrow_anchors(*(narrow_args + [fg_inds]))
# Compute bbox targets
gt_assignment = box_util.bbox_overlaps(anchors, gt_boxes).argmax(axis=1)
bbox_targets = box_util.bbox_transform(anchors, gt_boxes[gt_assignment, :4])
outputs['bbox_anchors'].append(anchors)
outputs['bbox_targets'].append(bbox_targets)
# Compute label assignments
output_labels[ix, ignore_inds] = -1
output_labels[ix, fg_inds] = gt_boxes[gt_assignment, 4]
# Compute sparse indices
fg_inds += ix * image_stride
outputs['bbox_inds'].extend([fg_inds])
return {
'labels': new_tensor(labels_wide),
'bbox_indices': new_tensor(np.concatenate(bbox_indices_wide)),
'bbox_anchors': new_tensor(np.concatenate(bbox_anchors_wide).astype('float32')),
'bbox_targets': new_tensor(np.concatenate(bbox_targets_wide).astype('float32')),
'labels': new_tensor(output_labels),
'bbox_inds': new_tensor(
np.concatenate(outputs['bbox_inds'])),
'bbox_targets': new_tensor(
np.concatenate(outputs['bbox_targets']).astype('float32')),
'bbox_anchors': new_tensor(
np.concatenate(outputs['bbox_anchors']).astype('float32')),
}
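# --- Illustrative sketch (not part of the diff) ---
# A minimal sketch of the IoU-based assignment above: compute overlaps
# between anchors and gt boxes, take the per-anchor argmax, and label anchors
# above a positive threshold with the matched gt class. The bbox_overlaps
# stand-in below is a simplified assumption, not the repository's version.
import numpy as np

def bbox_overlaps(a, b):
    """IoU between two [x1, y1, x2, y2] box arrays."""
    area_a = (a[:, 2] - a[:, 0] + 1) * (a[:, 3] - a[:, 1] + 1)
    area_b = (b[:, 2] - b[:, 0] + 1) * (b[:, 3] - b[:, 1] + 1)
    iw = np.maximum(0, np.minimum(a[:, None, 2], b[None, :, 2]) -
                       np.maximum(a[:, None, 0], b[None, :, 0]) + 1)
    ih = np.maximum(0, np.minimum(a[:, None, 3], b[None, :, 3]) -
                       np.maximum(a[:, None, 1], b[None, :, 1]) + 1)
    inter = iw * ih
    return inter / (area_a[:, None] + area_b[None, :] - inter)

anchors = np.array([[0, 0, 31, 31], [32, 0, 63, 31]], 'float32')
gt_boxes = np.array([[4, 4, 27, 27, 1]], 'float32')  # last column is the class
overlaps = bbox_overlaps(anchors, gt_boxes[:, :4])
argmax = overlaps.argmax(axis=1)
labels = np.where(overlaps.max(axis=1) >= 0.5, gt_boxes[argmax, 4], 0)
print(labels)  # anchor 0 matches the gt, anchor 1 stays background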
......@@ -22,7 +22,10 @@ class DataLoader(object):
"""Provide mini-batches of data."""
def __new__(cls):
if cfg.TRAIN.MAX_SIZE > 0:
pipeline_type = cfg.PIPELINE.TYPE.lower()
if pipeline_type == 'default' or pipeline_type == 'rcnn':
return faster_rcnn.DataLoader()
else:
elif pipeline_type == 'ssd':
return ssd.DataLoader()
else:
raise ValueError('Unsupported pipeline: ' + pipeline_type)
......@@ -20,60 +20,79 @@ import numpy as np
from seetadet.core.config import cfg
from seetadet.modeling.detector import new_detector
from seetadet.utils import blob as blob_util
from seetadet.utils import image as image_util
from seetadet.utils import logger
from seetadet.utils import nms as nms_util
from seetadet.utils import time_util
from seetadet.utils.blob import im_list_to_blob
from seetadet.utils.image import scale_image
def ims_detect(detector, raw_images):
"""Detect images, with single or multiple scales."""
ims, ims_scale = [], []
for i in range(len(raw_images)):
im, im_scale = scale_image(raw_images[i])
ims += im
ims_scale += im_scale
num_scales = len(ims_scale) // len(raw_images)
ims_shape = np.array([im.shape[:2] for im in ims])
ims_scale = np.array(ims_scale).reshape((len(ims), -1))
# Prepare blobs
data = im_list_to_blob(ims)
ims_info = np.hstack([ims_shape, ims_scale]).astype('float32')
def get_data(raw_images):
"""Return the test data."""
max_size = cfg.TEST.MAX_SIZE
if cfg.PIPELINE.TYPE.lower() == 'ssd':
max_size = 0 # Warped to a fixed size
images_wide = []
image_shapes_wide, image_scales_wide = [], []
for img in raw_images:
images, image_scales = image_util.scale_image(
img, scales=cfg.TEST.SCALES, max_size=max_size)
images_wide += images
image_scales_wide += image_scales
image_shapes_wide += [img.shape[:2] for img in images]
images = blob_util.im_list_to_blob(
images_wide, coarsest_stride=cfg.MODEL.COARSEST_STRIDE)
image_shapes = np.array(image_shapes_wide)
image_scales = np.array(image_scales_wide).reshape((len(images), -1))
images_info = np.hstack([image_shapes, image_scales]).astype('float32')
return images, images_info
def ims_detect(detector, raw_images, timer=None):
"""Detect images at single or multiple scales."""
images, images_info = get_data(raw_images)
timer.tic() if timer else timer
# Do Forward
data = torch.from_numpy(data)
ims_info = torch.from_numpy(ims_info)
inputs = {'image': torch.from_numpy(images),
'im_info': torch.from_numpy(images_info)}
# with torch.no_grad():
# outputs = detector.forward(inputs)
if not hasattr(detector, 'script_forward'):
def script_forward(self, data, ims_info):
return self.forward({'data': data, 'ims_info': ims_info})
def script_forward(self, image, im_info):
return self.forward({'image': image, 'im_info': im_info})
detector.script_forward = torch.jit.trace(
func=types.MethodType(script_forward, detector),
example_inputs=[data, ims_info],
example_inputs=[inputs['image'], inputs['im_info']],
)
outputs = detector.script_forward(data, ims_info)
outputs = detector.script_forward(inputs['image'], inputs['im_info'])
outputs = dict((k, outputs[k].numpy()) for k in outputs.keys())
# Unpack results
results = outputs['detections']
detections = [[] for _ in range(len(raw_images))]
for i in range(len(ims)):
inds = np.where(results[:, 0].astype(np.int32) == i)[0]
detections[i // num_scales].append(results[inds, 1:])
return [np.vstack(detections[i]) for i in range(len(raw_images))]
def test_net(weights, num_classes, q_in, q_out, device):
num_classes, cfg.GPU_ID = num_classes, device
timer.toc() if timer else timer
# Decode results
detections = outputs['detections']
results = [[] for _ in range(len(raw_images))]
for i in range(len(images)):
inds = np.where(detections[:, 0].astype(np.int32) == i)[0]
results[i // len(cfg.TEST.SCALES)].append(detections[inds, 1:])
# Merge from multiple scales
ret = [np.vstack(d) for d in results]
timer.toc() if timer else timer
return ret
def test_net(weights, q_in, q_out, device, root_logger=True):
"""Test a network trained with RetinaNet algorithm."""
cfg.GPU_ID = device
num_classes = len(cfg.MODEL.CLASSES)
logger.set_root_logger(root_logger)
detector = new_detector(device, weights)
must_stop = False
_t = time_util.new_timers('im_detect', 'misc')
timers = time_util.new_timers('im_detect_bbox', 'misc')
empty_detections = np.zeros((0, 5), 'float32')
while True:
if must_stop:
......@@ -91,17 +110,19 @@ def test_net(weights, num_classes, q_in, q_out, device):
continue
# Run detecting on specific scales
with _t['im_detect'].tic_and_toc():
results = ims_detect(detector, raw_images)
results = ims_detect(detector, raw_images, timers['im_detect_bbox'])
# Post-Processing
# Post-processing
for i, detections in enumerate(results):
_t['misc'].tic()
timers['misc'].tic()
boxes_this_image = [[]]
# {x1, y1, x2, y2, score, cls}
# Detection format: (x1, y1, x2, y2, score, cls)
detections = np.array(detections)
for j in range(1, num_classes):
cls_indices = np.where(detections[:, 5].astype(np.int32) == j)[0]
if len(cls_indices) == 0:
boxes_this_image.append(empty_detections)
continue
cls_boxes = detections[cls_indices, :4]
cls_scores = detections[cls_indices, 4]
cls_detections = np.hstack((
......@@ -121,11 +142,11 @@ def test_net(weights, num_classes, q_in, q_out, device):
)
cls_detections = cls_detections[keep, :]
boxes_this_image.append(cls_detections)
_t['misc'].toc()
timers['misc'].toc()
q_out.put((
indices[i],
dict([('im_detect', _t['im_detect'].average_time),
('misc', _t['misc'].average_time)]),
dict([('im_detect', timers['im_detect_bbox'].average_time),
('misc', timers['misc'].average_time)]),
dict([('boxes', boxes_this_image)]),
))
......@@ -14,7 +14,4 @@ from __future__ import division
from __future__ import print_function
from seetadet.algo.ssd.data_loader import DataLoader
from seetadet.algo.ssd.hard_mining import HardMining
from seetadet.algo.ssd.multibox import MultiBoxMatch
from seetadet.algo.ssd.multibox import MultiBoxTarget
from seetadet.algo.ssd.priorbox import PriorBox
from seetadet.algo.ssd.anchor_target import AnchorTarget
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import math
import numpy as np
from seetadet.algo.ssd import generate_anchors as anchor_util
from seetadet.algo.ssd import utils as ssd_util
from seetadet.core.config import cfg
from seetadet.utils import boxes as box_util
from seetadet.utils.env import new_tensor
class AnchorTarget(object):
"""Assign ground-truth targets to anchors."""
def __init__(self):
super(AnchorTarget, self).__init__()
# Load the basic configs
self.strides = cfg.SSD.STRIDES
anchor_sizes = cfg.SSD.ANCHOR_SIZES
aspect_ratios = cfg.SSD.ASPECT_RATIOS
self.base_anchors = []
for i in range(len(anchor_sizes)):
ratios = aspect_ratios[i]
if not isinstance(ratios, (tuple, list)):
# All strides share the same ratios
ratios = aspect_ratios
self.base_anchors.append(
anchor_util.generate_anchors(
min_sizes=[anchor_sizes[i][0]],
max_sizes=[anchor_sizes[i][1]],
ratios=ratios))
# Plan the fixed anchor layout
max_size = cfg.TRAIN.SCALES[0]
if cfg.MODEL.COARSEST_STRIDE > 0:
stride = float(cfg.MODEL.COARSEST_STRIDE)
max_size = int(math.ceil(max_size / stride) * stride)
shapes = [[math.ceil(max_size / stride)] * 2
for stride in self.strides]
self.all_anchors = ssd_util.get_shifted_anchors(
shapes, self.base_anchors, self.strides)
def sample_anchors(self, gt_boxes, all_anchors=None):
anchors = self.all_anchors \
if all_anchors is None else all_anchors
num_anchors = len(anchors)
labels = np.empty((num_anchors,), dtype='int32')
labels.fill(-1)
# Overlaps between the anchors and the gt boxes.
overlaps = box_util.bbox_overlaps(anchors, gt_boxes)
argmax_overlaps = overlaps.argmax(axis=1)
max_overlaps = overlaps[np.arange(num_anchors), argmax_overlaps]
# Foreground: for each gt, anchor with highest overlap.
gt_argmax_overlaps = overlaps.argmax(axis=0)
gt_assignment = argmax_overlaps[gt_argmax_overlaps]
labels[gt_argmax_overlaps] = gt_boxes[gt_assignment, 4]
# Foreground: above threshold IoU.
inds = max_overlaps >= cfg.SSD.POSITIVE_OVERLAP
gt_assignment = argmax_overlaps[inds]
labels[inds] = gt_boxes[gt_assignment, 4]
fg_inds = np.where(labels > 0)[0]
# Negative: not matched and below threshold IoU.
neg_inds = np.where(labels <= 0)[0]
neg_overlaps = max_overlaps[neg_inds]
eligible_neg_inds = np.where(neg_overlaps < cfg.SSD.NEGATIVE_OVERLAP)[0]
neg_inds = neg_inds[eligible_neg_inds]
return fg_inds, neg_inds
def __call__(self, **inputs):
num_images = cfg.TRAIN.IMS_PER_BATCH
neg_pos_ratio = cfg.SSD.NEGATIVE_POSITIVE_RATIO
image_stride = self.all_anchors.shape[0]
cls_prob = inputs['cls_prob'].numpy()
outputs = collections.defaultdict(list)
# Label: ``1`` is positive, ``0`` is negative, ``-1`` is don't care
output_labels = np.empty((num_images, image_stride,), 'int64')
output_labels.fill(-1)
for ix in range(num_images):
fg_inds = inputs['fg_inds'][ix]
neg_inds = inputs['bg_inds'][ix]
gt_boxes = inputs['gt_boxes'][ix]
# Mining hard negatives as background.
num_pos, num_neg = len(fg_inds), len(neg_inds)
num_bg = min(int(num_pos * neg_pos_ratio), num_neg)
neg_loss = -np.log(np.maximum(
cls_prob[ix, neg_inds][np.arange(num_neg),
np.zeros((num_neg,), 'int32')],
np.finfo(float).eps))
bg_inds = neg_inds[np.argsort(-neg_loss)][:num_bg]
# Compute bbox targets.
anchors = self.all_anchors[fg_inds]
gt_assignment = box_util.bbox_overlaps(
anchors, gt_boxes).argmax(axis=1)
bbox_targets = box_util.bbox_transform(
anchors, gt_boxes[gt_assignment, :4],
cfg.BBOX_REG_WEIGHTS)
outputs['bbox_anchors'].append(anchors)
outputs['bbox_targets'].append(bbox_targets)
# Compute label assignments.
output_labels[ix, bg_inds] = 0
output_labels[ix, fg_inds] = gt_boxes[gt_assignment, 4]
# Compute sparse indices.
fg_inds += ix * image_stride
outputs['bbox_inds'].extend([fg_inds])
return {
'labels': new_tensor(output_labels),
'bbox_inds': new_tensor(
np.concatenate(outputs['bbox_inds'])),
'bbox_targets': new_tensor(
np.concatenate(outputs['bbox_targets']).astype('float32')),
'bbox_anchors': new_tensor(
np.concatenate(outputs['bbox_anchors']).astype('float32')),
}
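# --- Illustrative sketch (not part of the diff) ---
# A minimal sketch of the hard-negative mining step above: score each
# negative anchor by its background-class loss and keep only the hardest
# ones, up to neg_pos_ratio times the number of positives. The probabilities
# below are illustrative, not produced by a real classifier.
import numpy as np

neg_pos_ratio, num_pos = 3, 2
neg_background_prob = np.array([0.9, 0.1, 0.6, 0.05, 0.8], 'float32')
neg_loss = -np.log(np.maximum(neg_background_prob, np.finfo(float).eps))
num_bg = min(int(num_pos * neg_pos_ratio), len(neg_loss))
hard_negatives = np.argsort(-neg_loss)[:num_bg]
print(hard_negatives)  # indices of the most confidently-wrong negatives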
......@@ -13,8 +13,11 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import multiprocessing as mp
import time
import threading
import queue
import dragon
import dragon.vm.torch as torch
......@@ -23,6 +26,7 @@ import numpy as np
from seetadet.algo.ssd import data_transformer
from seetadet.core.config import cfg
from seetadet.datasets.factory import get_dataset
from seetadet.utils import blob as blob_util
from seetadet.utils import logger
......@@ -32,28 +36,24 @@ class DataLoader(object):
def __init__(self):
super(DataLoader, self).__init__()
dataset = get_dataset(cfg.TRAIN.DATASET)
if cfg.USE_DALI:
from seetadet.dali import ssd_pipeline as pipe
self.iterator = pipe.new_iterator(dataset.source)
else:
self.iterator = Iterator(**{
'dataset': dataset.cls,
'source': dataset.source,
'classes': dataset.classes,
'shuffle': cfg.TRAIN.USE_SHUFFLE,
'num_chunks': cfg.TRAIN.SHUFFLE_CHUNKS,
'batch_size': cfg.TRAIN.IMS_PER_BATCH * 2,
'num_transformers': cfg.TRAIN.NUM_THREADS - 1,
})
self.iterator = Iterator(**{
'dataset': dataset.cls,
'source': dataset.source,
'classes': dataset.classes,
'shuffle': cfg.TRAIN.USE_SHUFFLE,
'batch_size': cfg.TRAIN.IMS_PER_BATCH * 2,
'num_transformers': cfg.TRAIN.NUM_THREADS - 1,
})
self.iterator.start()
def __call__(self):
outputs = self.iterator.next()
if isinstance(outputs['data'], np.ndarray):
outputs['data'] = torch.from_numpy(outputs['data'])
if isinstance(outputs['image'], np.ndarray):
outputs['image'] = torch.from_numpy(outputs['image'])
return outputs
class Iterator(object):
class Iterator(threading.Thread):
"""Iterator to return the batch of data."""
def __init__(self, **kwargs):
......@@ -67,15 +67,16 @@ class Iterator(object):
rank = dragon.distributed.get_rank(process_group)
# Configuration
self._prefetch = kwargs.get('prefetch', 5)
self._batch_size = kwargs.get('batch_size', 32)
self._batch_size = kwargs.get('batch_size', 8)
self._num_readers = kwargs.get('num_readers', 1)
self._num_transformers = kwargs.get('num_transformers', 3)
self.daemon = True
# Initialize queues
num_batches = self._prefetch * self._num_readers
self.q_in = mp.Queue(num_batches * self._batch_size)
self.q_out = mp.Queue(num_batches * self._batch_size)
num_batches = self._num_readers
self._queue1 = mp.Queue(num_batches * self._batch_size)
self._queue2 = mp.Queue(num_batches * self._batch_size)
self._queue3 = queue.Queue(num_batches)
# Initialize readers
self._readers = []
......@@ -86,7 +87,7 @@ class Iterator(object):
self._readers.append(dragon.io.DataReader(
part_idx=part_idx, num_parts=num_parts, **kwargs))
self._readers[i]._seed += part_idx
self._readers[i].q_out = self.q_in
self._readers[i].q_out = self._queue1
self._readers[i].start()
time.sleep(0.1)
......@@ -95,7 +96,7 @@ class Iterator(object):
for i in range(self._num_transformers):
p = data_transformer.DataTransformer(**kwargs)
p._seed += (i + rank * self._num_transformers)
p.q_in, p.q_out = self.q_in, self.q_out
p.q_in, p.q_out = self._queue1, self._queue2
p.start()
self._transformers.append(p)
time.sleep(0.1)
......@@ -118,26 +119,41 @@ class Iterator(object):
"""Return the next batch of data."""
return self.__next__()
def run(self):
"""Main loop."""
num_images = cfg.TRAIN.IMS_PER_BATCH
num_batches = cfg.TRAIN.ASPECT_GROUPING
logger.info('Initialize prefetching batches...')
example_buffer = [self._queue2.get()
for _ in range(num_images * num_batches)]
next_examples = []
while True:
# Use cached buffer for next N examples
if len(next_examples) == 0:
next_examples = example_buffer
example_buffer = []
# Prepare the next batch
outputs = collections.defaultdict(list)
for i in range(num_images):
example = next_examples.pop(0)
outputs['image'].append(example['image'])
outputs['gt_boxes'].append(example['boxes'])
outputs['im_info'].append(example['im_info'])
outputs['fg_inds'].append(example.get('fg_inds', None))
outputs['bg_inds'].append(example.get('bg_inds', None))
example_buffer.append(self._queue2.get())
outputs['image'] = blob_util.im_list_to_blob(
outputs['image'], coarsest_stride=cfg.MODEL.COARSEST_STRIDE)
# Send batch data to consumer
self._queue3.put(outputs)
def __iter__(self):
"""Return the iterator self."""
return self
def __next__(self):
"""Return the next batch of data."""
n = cfg.TRAIN.IMS_PER_BATCH
h = w = cfg.TRAIN.SCALES[0]
boxes_to_pack = []
image, boxes = self.q_out.get()
images = np.zeros((n, h, w, 3), image.dtype)
for i in range(n):
images[i] = image
gt_boxes = np.zeros((boxes.shape[0], boxes.shape[1] + 1), 'float32')
gt_boxes[:, :boxes.shape[1]], gt_boxes[:, -1] = boxes, i
boxes_to_pack.append(gt_boxes)
if i != (cfg.TRAIN.IMS_PER_BATCH - 1):
image, boxes = self.q_out.get()
boxes_to_pack = np.concatenate(boxes_to_pack)
return {'data': images, 'gt_boxes': boxes_to_pack}
return self._queue3.get()
......@@ -14,8 +14,12 @@ from __future__ import division
from __future__ import print_function
import multiprocessing
import cv2
import numpy as np
import numpy.random as npr
from seetadet.algo import common as algo_common
from seetadet.algo.ssd import transforms
from seetadet.core.config import cfg
from seetadet.datasets.example import Example
......@@ -27,108 +31,95 @@ class DataTransformer(multiprocessing.Process):
super(DataTransformer, self).__init__()
self._scale = cfg.TRAIN.SCALES[0]
self._seed = cfg.RNG_SEED
self._mirror = cfg.TRAIN.USE_FLIPPED
self._use_diff = cfg.TRAIN.USE_DIFF
self._use_flipped = cfg.TRAIN.USE_FLIPPED
self._classes = kwargs.get('classes', ('__background__',))
self._num_classes = len(self._classes)
self._class_to_ind = dict(zip(self._classes, range(self._num_classes)))
self.augment_image = \
transforms.Compose(
transforms.Distort(), # Color augmentation
transforms.Expand(), # Expand and padding
transforms.Sample(), # Sample a patch randomly
transforms.Resize(), # Resize to a fixed scale
)
self._anchor_sampler = algo_common.AnchorSampler()
self._apply_transform = transforms.Compose(transforms.Distort(),
transforms.Expand(),
transforms.Sample(),
transforms.Resize())
self.q_in = self.q_out = None
self.daemon = True
def make_roi_dict(self, example, apply_flip=False):
objects, n_objects = example.objects, 0
def get_boxes(self, example):
objects, num_objects = example.objects, 0
height, width = example.height, example.width
if not self._use_diff:
for obj in objects:
if obj.get('difficult', 0) == 0:
n_objects += 1
num_objects += 1
else:
n_objects = len(objects)
num_objects = len(objects)
roi_dict = {
'boxes': np.zeros((n_objects, 4), 'float32'),
'gt_classes': np.zeros((n_objects,), 'int32'),
}
boxes = np.zeros((num_objects, 4), 'float32')
gt_classes = np.zeros((num_objects,), 'int32')
# Filter the difficult instances
# Filter the difficult instances.
object_idx = 0
for obj in objects:
if not self._use_diff and \
obj.get('difficult', 0) > 0:
if not self._use_diff and obj.get('difficult', 0) > 0:
continue
bbox = obj['bbox']
roi_dict['boxes'][object_idx, :] = [
max(0, bbox[0]),
max(0, bbox[1]),
min(bbox[2], width - 1),
min(bbox[3], height - 1),
]
roi_dict['gt_classes'][object_idx] = \
self._class_to_ind[obj['name']]
boxes[object_idx, :] = [max(0, bbox[0]),
max(0, bbox[1]),
min(bbox[2], width - 1),
min(bbox[3], height - 1)]
gt_classes[object_idx] = self._class_to_ind[obj['name']]
object_idx += 1
if apply_flip:
roi_dict['boxes'] = \
box_util.flip_boxes(
roi_dict['boxes'],
width,
)
# Normalize.
boxes[:, 0::2] /= width
boxes[:, 1::2] /= height
# Normalize to unit sizes
roi_dict['boxes'][:, 0::2] /= width
roi_dict['boxes'][:, 1::2] /= height
# Attach the classes.
gt_boxes = np.empty((num_objects, 5), dtype=np.float32)
gt_boxes[:, :4], gt_boxes[:, 4] = boxes, gt_classes
return roi_dict
return gt_boxes
def get(self, example):
example = Example(example)
img = example.image
# Flip
apply_flip = False
if self._mirror:
if np.random.randint(2) > 0:
img = img[:, ::-1]
apply_flip = True
# Example -> RoIDict
roi_dict = self.make_roi_dict(example, apply_flip)
# Boxes.
boxes = self.get_boxes(example)
if len(boxes) == 0:
return {'boxes': boxes}
# Post-Process for gt boxes
# Shape like: [num_objects, {x1, y1, x2, y2, cls}]
gt_boxes = np.empty((roi_dict['gt_classes'].size, 5), 'float32')
gt_boxes[:, :4], gt_boxes[:, 4] = roi_dict['boxes'], roi_dict['gt_classes']
# Distort => Expand => Sample => Resize
img, boxes = self._apply_transform(example.image, boxes)
if len(gt_boxes) == 0:
# Ignore the non-object image
return img, gt_boxes
# Restore to the blob scale.
boxes[:, :4] *= self._scale
# Distort => Expand => Sample => Resize
img, gt_boxes = self.augment_image(img, gt_boxes)
# Flip.
if self._use_flipped and npr.randint(2) > 0:
img = img[:, ::-1]
boxes = box_util.flip_boxes(boxes, img.shape[1])
# Restore to the blob scale
gt_boxes[:, :4] *= self._scale
# Standard outputs.
outputs = {'image': img, 'boxes': boxes, 'im_info': img.shape[:2]}
# Post-Process for image
if img.dtype == 'uint16':
img = img.astype('float32') / 256.
# Attach precomputed targets.
if len(boxes) > 0:
outputs.update(
self._anchor_sampler(
gt_boxes=boxes,
im_info=outputs['im_info']))
return img, gt_boxes
return outputs
def run(self):
# Fix the process-local random seed
# Disable the opencv threading.
cv2.setNumThreads(1)
# Fix the process-local random seed.
np.random.seed(self._seed)
# Main prefetch loop
while True:
outputs = self.get(self.q_in.get())
if len(outputs[1]) < 1:
continue # Ignore the non-object image
if len(outputs['boxes']) < 1:
continue # Ignore non-object image.
self.q_out.put(outputs)
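# --- Illustrative sketch (not part of the diff) ---
# A minimal sketch of the normalize-then-rescale convention used above:
# boxes are divided by the original width/height in get_boxes(), transformed
# along with the image, then multiplied by the fixed SSD training scale.
# The sizes below are illustrative.
import numpy as np

width, height, train_scale = 640, 480, 300
boxes = np.array([[64., 48., 320., 240.]], 'float32')
boxes[:, 0::2] /= width      # normalize x coordinates to [0, 1]
boxes[:, 1::2] /= height     # normalize y coordinates to [0, 1]
boxes[:, :4] *= train_scale  # restore to the fixed SSD blob scale
print(boxes)  # [[ 30.  30. 150. 150.]]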
......@@ -19,9 +19,8 @@ import numpy as np
def generate_anchors(min_sizes, max_sizes, ratios):
"""Generate anchors by enumerating aspect ratios and sizes."""
total_anchors = []
for idx, min_size in enumerate(min_sizes):
# Note that SSD assume it is a ctr-anchor
# Note that SSD assumes it is a center anchor
base_anchor = np.array([0, 0, min_size, min_size])
anchors = _ratio_enum(base_anchor, ratios, _mkanchors)
if len(max_sizes) > 0:
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from seetadet.core.config import cfg
from seetadet.utils.env import new_tensor
class HardMining(object):
def __call__(self, prob, labels, overlaps):
label_shape, label_size = labels.shape, labels.size
prob = prob.numpy().reshape((label_size, -1))
labels, overlaps = labels.flatten(), overlaps.flatten()
neg_ovr = cfg.SSD.OHEM.NEG_OVERLAP
neg_ratio = cfg.SSD.OHEM.NEG_POS_RATIO
# label ``-1`` will be ignored
new_labels = -np.ones(labels.shape, 'int64')
cls_loss = -np.log(
np.maximum(
prob[np.arange(label_size), labels],
np.finfo(float).eps,
)
)
# Filter negatives
fg_inds = np.where(labels > 0)[0]
neg_inds = np.where(labels == 0)[0]
neg_overlaps = overlaps[neg_inds]
eligible_neg_inds = np.where(neg_overlaps < neg_ovr)[0]
neg_inds = neg_inds[eligible_neg_inds]
# Apply mining on negatives
neg_cls_loss = cls_loss[neg_inds]
num_pos, num_neg = len(fg_inds), len(neg_inds)
num_bg = min(int(num_pos * neg_ratio), num_neg)
bg_inds = neg_inds[np.argsort(-neg_cls_loss)][:num_bg]
new_labels[fg_inds] = labels[fg_inds] # Keep fg indices
new_labels[bg_inds] = 0 # Use hard negatives as bg indices
# Feed labels to compute cls loss
return {'labels': new_tensor(new_labels.reshape(label_shape))}
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from seetadet.core.config import cfg
from seetadet.utils import boxes as box_util
from seetadet.utils.env import new_tensor
class MultiBoxMatch(object):
def __call__(self, prior_boxes, gt_boxes):
num_images = cfg.TRAIN.IMS_PER_BATCH
gt_boxes_wide = box_util.dismantle_boxes(gt_boxes, num_images)
num_priors, box_dim = prior_boxes.shape[:]
# Do matching between prior boxes and gt boxes
match_inds_wide = -np.ones((num_images, num_priors), 'int32')
match_labels_wide = np.zeros(match_inds_wide.shape, 'int64')
max_overlaps_wide = np.zeros(match_inds_wide.shape, 'float32')
for ix in range(num_images):
# GT boxes (x1, y1, x2, y2, label)
gt_boxes = gt_boxes_wide[ix]
num_gt = gt_boxes.shape[0]
if num_gt == 0:
continue
# Compute the overlaps between prior boxes and gt boxes
overlaps = box_util.bbox_overlaps(prior_boxes, gt_boxes)
argmax_overlaps = overlaps.argmax(1)
max_overlaps = overlaps[np.arange(num_priors), argmax_overlaps]
max_overlaps_wide[ix] = max_overlaps
# Bipartite matching and assignments
bipartite_inds = overlaps.argmax(0)
class_assignment = gt_boxes[:, -1]
match_inds_wide[ix, bipartite_inds] = np.arange(num_gt, dtype='int32')
match_labels_wide[ix, bipartite_inds] = class_assignment
# Per prediction matching and assignments
# Note that SSD matches each prior box only once
# We implement this by clobbering the assignments made in the bipartite step
per_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0]
gt_assignment = argmax_overlaps[per_inds]
class_assignment = gt_boxes[gt_assignment, -1]
match_inds_wide[ix, per_inds] = gt_assignment
match_labels_wide[ix, per_inds] = class_assignment
return {
'match_inds': match_inds_wide,
'match_labels': match_labels_wide,
'max_overlaps': max_overlaps_wide,
}
class MultiBoxTarget(object):
def __call__(
self,
match_inds,
match_labels,
prior_boxes,
gt_boxes,
):
num_images = cfg.TRAIN.IMS_PER_BATCH
# GT assignments between default boxes and gt boxes
match_inds_wide = match_inds
# Matched labels (After hard mining possibly)
match_labels_wide = match_labels
num_priors, box_dim = prior_boxes.shape[:]
gt_boxes_wide = box_util.dismantle_boxes(gt_boxes, num_images)
bbox_indices_wide, bbox_anchors_wide, bbox_targets_wide = [], [], []
for ix in range(num_images):
gt_boxes = gt_boxes_wide[ix]
if gt_boxes.shape[0] == 0:
continue
# Select ground-truth
match_inds = match_inds_wide[ix]
match_labels = match_labels_wide[ix]
ex_inds = np.where(match_labels > 0)[0]
ex_rois = prior_boxes[ex_inds]
gt_assignment = match_inds[ex_inds]
gt_rois = gt_boxes[gt_assignment]
# Assign bbox targets
bbox_anchors_wide.append(ex_rois)
bbox_indices_wide.append(ex_inds + (num_priors * ix))
bbox_targets_wide.append(
box_util.bbox_transform(
ex_rois,
gt_rois,
cfg.BBOX_REG_WEIGHTS,
)
)
return {
'bbox_indices': new_tensor(np.concatenate(bbox_indices_wide)),
'bbox_anchors': new_tensor(np.concatenate(bbox_anchors_wide).astype('float32')),
'bbox_targets': new_tensor(np.concatenate(bbox_targets_wide).astype('float32')),
}
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from seetadet.algo.ssd.generate_anchors import generate_anchors
from seetadet.core.config import cfg
class PriorBox(object):
"""Generate default boxes(anchors)."""
def __init__(self):
super(PriorBox, self).__init__()
min_sizes = cfg.SSD.MULTIBOX.MIN_SIZES
max_sizes = cfg.SSD.MULTIBOX.MAX_SIZES
if len(max_sizes) > 0:
if len(min_sizes) != len(max_sizes):
raise ValueError(
'Got {} min sizes and {} max sizes.'
.format(len(min_sizes), len(max_sizes))
)
self.strides = cfg.SSD.MULTIBOX.STRIDES
aspect_ratios = cfg.SSD.MULTIBOX.ASPECT_RATIOS
self.base_anchors = []
for i in range(len(min_sizes)):
self.base_anchors.append(
generate_anchors(
min_sizes[i] if isinstance(
min_sizes[i], (list, tuple)) else [min_sizes[i]],
max_sizes[i] if isinstance(
max_sizes[i], (list, tuple)) else [max_sizes[i]],
aspect_ratios[i],
)
)
# Store the cached grid anchors
self.last_grid_anchors = None
def __call__(self, features):
if self.last_grid_anchors is not None:
return self.last_grid_anchors
all_anchors = []
for i in range(len(self.strides)):
# 1. Generate base grids
height, width = features[i].shape[-2:]
shift_x = (np.arange(0, width) + 0.5) * self.strides[i]
shift_y = (np.arange(0, height) + 0.5) * self.strides[i]
shift_x, shift_y = np.meshgrid(shift_x, shift_y)
# 2. Apply anchors on base grids
# Add a anchors (1, a, 4) to
# cell k shifts (k, 1, 4) to get
# shift anchors (k, a, 4)
# Reshape to (k * a, 4) shifted anchors
a = self.base_anchors[i].shape[0]
d = self.base_anchors[i].shape[1]
shifts = np.vstack((
shift_x.ravel(),
shift_y.ravel(),
shift_x.ravel(),
shift_y.ravel())
).transpose()
k = shifts.shape[0] # k = map_h * map_w
anchors = (self.base_anchors[i].reshape((1, a, d)) +
shifts.reshape((1, k, d)).transpose((1, 0, 2)))
anchors = anchors.reshape((k * a, d)).astype(np.float32)
all_anchors.append(anchors)
self.last_grid_anchors = np.concatenate(all_anchors)
return self.last_grid_anchors
......@@ -15,69 +15,70 @@ from __future__ import print_function
import types
import cv2
import dragon.vm.torch as torch
import numpy as np
from seetadet.core.config import cfg
from seetadet.modeling.detector import new_detector
from seetadet.utils import blob as blob_util
from seetadet.utils import boxes as box_util
from seetadet.utils import image as image_util
from seetadet.utils import logger
from seetadet.utils import nms as nms_util
from seetadet.utils import time_util
def get_images(ims):
out_size = cfg.TEST.SCALES[0]
processed_ims, im_scales = [], []
for im in ims:
im_scales.append((float(out_size) / im.shape[0],
float(out_size) / im.shape[1]))
processed_ims.append(cv2.resize(
im, (out_size, out_size),
interpolation=cv2.INTER_AREA))
if ims[0].dtype == 'uint16':
ims_blob = np.array(processed_ims, dtype='float32') / 256.
else:
ims_blob = np.array(processed_ims, dtype='uint8')
return ims_blob, im_scales
def get_data(raw_images):
"""Return the test data."""
images_wide, image_scales_wide = [], []
for img in raw_images:
images, image_scales = image_util.scale_image(
img, scales=cfg.TEST.SCALES, max_size=0)
images_wide += images
image_scales_wide += image_scales
images_wide = blob_util.im_list_to_blob(
images_wide, coarsest_stride=cfg.MODEL.COARSEST_STRIDE)
return images_wide, image_scales_wide
def ims_detect(detector, ims):
"""Detect images, with the single scale."""
data, im_scales = get_images(ims)
# Do Forward
data = torch.from_numpy(data)
def ims_detect(detector, raw_images):
"""Detect images at single or multiple scales."""
images, image_scales = get_data(raw_images)
# Do forward
inputs = {'image': torch.from_numpy(images)}
if not hasattr(detector, 'script_forward'):
def script_forward(self, data):
return self.forward({'data': data})
def script_forward(self, image):
return self.forward({'image': image})
detector.script_forward = torch.jit.trace(
func=types.MethodType(script_forward, detector),
example_inputs=[data],
example_inputs=[inputs['image']],
)
outputs = detector.script_forward(data)
cls_prob = outputs['cls_prob'].numpy()
bbox_pred = outputs['bbox_pred'].numpy()
outputs = detector.script_forward(inputs['image'])
# Decode results
batch_boxes = []
for i in range(len(im_scales)):
batch_pred = outputs['bbox_pred'].numpy()
batch_scores = outputs['cls_prob'].numpy()
results = [([], []) for _ in range(len(raw_images))]
for i in range(len(images)):
boxes = box_util.bbox_transform_inv(
outputs['prior_boxes'],
bbox_pred[i],
cfg.BBOX_REG_WEIGHTS,
)
boxes[:, 0::2] /= im_scales[i][1]
boxes[:, 1::2] /= im_scales[i][0]
batch_boxes.append(box_util.clip_boxes(boxes, ims[i].shape))
return cls_prob, batch_boxes
def test_net(weights, num_classes, q_in, q_out, device):
num_classes, cfg.GPU_ID = num_classes, device
outputs['prior_boxes'], batch_pred[i],
cfg.BBOX_REG_WEIGHTS)
boxes[:, 0::2] /= image_scales[i][1]
boxes[:, 1::2] /= image_scales[i][0]
boxes = box_util.clip_boxes(boxes, raw_images[i].shape)
results[i // len(cfg.TEST.SCALES)][0].append(batch_scores[i])
results[i // len(cfg.TEST.SCALES)][1].append(boxes)
# Merge from multiple scales
return [(np.vstack(s), np.vstack(b)) for s, b in results]
def test_net(weights, q_in, q_out, device, root_logger=True):
"""Test a network trained with SSD algorithm."""
cfg.GPU_ID = device
num_classes = len(cfg.MODEL.CLASSES)
logger.set_root_logger(root_logger)
detector = new_detector(device, weights)
must_stop = False
......@@ -99,18 +100,17 @@ def test_net(weights, num_classes, q_in, q_out, device):
continue
with _t['im_detect'].tic_and_toc():
batch_scores, batch_boxes = \
ims_detect(detector, raw_images)
results = ims_detect(detector, raw_images)
for i in range(len(batch_scores)):
for i, (scores, boxes) in enumerate(results):
_t['misc'].tic()
scores, boxes = batch_scores[i], batch_boxes[i]
boxes_this_image = [[]]
# Detection format: (score...), (x1, y1, x2, y2)
for j in range(1, num_classes):
inds = np.where(scores[:, j] > cfg.TEST.SCORE_THRESH)[0]
cls_scores = scores[inds, j]
cls_boxes = boxes[inds]
pre_nms_inds = np.argsort(-cls_scores)[:cfg.TEST.NMS_TOP_K]
pre_nms_inds = np.argsort(-cls_scores)[:cfg.TEST.PRE_NMS_TOP_N]
cls_scores = cls_scores[pre_nms_inds]
cls_boxes = cls_boxes[pre_nms_inds]
cls_detections = np.hstack(
......
......@@ -13,23 +13,22 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import math
import cv2
import PIL.Image
import PIL.ImageEnhance
import numpy as np
import numpy.random as npr
import PIL.Image
import PIL.ImageEnhance
from seetadet.core.config import cfg
from seetadet.utils import boxes as box_util
from seetadet.utils import boxes_v2 as box_util_v2
from seetadet.utils import logger
from seetadet.utils import image as image_util
class Compose(object):
"""Compose the several transforms together."""
def __init__(self, *transforms):
self.transforms = transforms
......@@ -40,26 +39,29 @@ class Compose(object):
class Distort(object):
"""Distort the brightness, contrast and color of image."""
def __init__(self):
self._prob = 0.5
self._transforms = [
(PIL.ImageEnhance.Brightness, self._prob),
(PIL.ImageEnhance.Contrast, self._prob),
(PIL.ImageEnhance.Color, self._prob),
]
self._prob = 0.5 if cfg.TRAIN.USE_COLOR_JITTER else 0
def apply(self, img, boxes=None):
self._prob = 0.5 if cfg.TRAIN.USE_COLOR_JITTER else 0
img = PIL.Image.fromarray(img)
for transform_fn, prob in self._transforms:
if npr.uniform() < prob:
img = transform_fn(img)
img = img.enhance(1. + npr.uniform(-.4, .4))
return np.array(img), boxes
if self._prob > 0:
transforms = [PIL.ImageEnhance.Brightness,
PIL.ImageEnhance.Contrast,
PIL.ImageEnhance.Color]
npr.shuffle(transforms)
img = PIL.Image.fromarray(img)
for transform in transforms:
if npr.uniform() < self._prob:
img = transform(img)
img = img.enhance(1. + npr.uniform(-.4, .4))
img = np.array(img)
return img, boxes
class Expand(object):
"""Expand image to get smaller objects."""
def __init__(self):
self._max_ratio = 1. / cfg.TRAIN.RANDOM_SCALES[0]
self._expand_prob = 0.5 if self._max_ratio > 1 else 0
......@@ -91,51 +93,44 @@ class Expand(object):
class Resize(object):
"""Resize image."""
def __init__(self):
self._target_size = (cfg.TRAIN.SCALES[0],) * 2
self._interp_mode = [
cv2.INTER_LINEAR,
cv2.INTER_AREA,
cv2.INTER_NEAREST,
cv2.INTER_CUBIC,
cv2.INTER_LANCZOS4,
]
def apply(self, img, boxes):
rand = npr.randint(len(self._interp_mode))
return cv2.resize(
img, self._target_size,
interpolation=self._interp_mode[rand],
), boxes
return image_util.resize_image(img, size=self._target_size), boxes
class Sample(object):
"""Crop image by sampling a region restricted by bounding boxes."""
def __init__(self):
samplers = cfg.SSD.SAMPLERS
if not isinstance(samplers, collections.Iterable):
samplers = [samplers]
self._samplers = []
for sampler in samplers:
if len(sampler) != 8:
logger.fatal('The sample params should be a tuple of length 8.')
sample_param = {
'min_scale': sampler[0],
'max_scale': sampler[1],
'min_aspect_ratio': sampler[2],
'max_aspect_ratio': sampler[3],
'min_overlap': sampler[4],
'max_overlap': sampler[5],
'max_trials': sampler[6],
'max_sample': sampler[7],
}
self._samplers.append(sample_param)
min_scale, max_scale = \
cfg.PIPELINE.RANDOM_BBOX_CROP.SCALING
min_aspect_ratio, max_aspect_ratio = \
cfg.PIPELINE.RANDOM_BBOX_CROP.ASPECT_RATIO
self._samplers = [{'min_scale': 1.0,
'max_scale': 1.0,
'min_aspect_ratio': 1.0,
'max_aspect_ratio': 1.0,
'min_overlap': 0.0,
'max_overlap': 1.0,
'max_trials': 1,
'max_sample': 1}]
for min_overlap in cfg.PIPELINE.RANDOM_BBOX_CROP.THRESHOLDS:
self._samplers.append({'min_scale': min_scale,
'max_scale': max_scale,
'min_aspect_ratio': min_aspect_ratio,
'max_aspect_ratio': max_aspect_ratio,
'min_overlap': min_overlap,
'max_overlap': 1.0,
'max_trials': 10,
'max_sample': 1})
@classmethod
def _compute_overlaps(cls, rand_box, gt_boxes):
return box_util_v2.iou(
np.expand_dims(rand_box, 0),
gt_boxes[:, 0:4],
)
return box_util_v2.iou(np.expand_dims(rand_box, 0), gt_boxes[:, 0:4])
@classmethod
def _generate_sample(cls, sample_param):
......@@ -153,14 +148,14 @@ class Sample(object):
h_off = npr.uniform(0., 1. - bbox_h)
return np.array([w_off, h_off, w_off + bbox_w, h_off + bbox_h])
def _check_center(self, sample_box, gt_boxes):
@staticmethod
def _check_center(sample_box, gt_boxes):
ctr_x = (gt_boxes[:, 2] + gt_boxes[:, 0]) / 2.0
ctr_y = (gt_boxes[:, 3] + gt_boxes[:, 1]) / 2.0
# Keep the ground-truth box whose center is in the sample box
# Implement ``EmitConstraint.CENTER`` in the original SSD
keep_inds = np.where((ctr_x >= sample_box[0]) & (ctr_x <= sample_box[2]) &
(ctr_y >= sample_box[1]) & (ctr_y <= sample_box[3]))[0]
return len(keep_inds) > 0
keep_indices = np.where((ctr_x >= sample_box[0]) & (ctr_x <= sample_box[2]) &
(ctr_y >= sample_box[1]) & (ctr_y <= sample_box[3]))[0]
return len(keep_indices) > 0
def _check_overlap(self, sample_box, gt_boxes, constraint):
min_overlap = constraint.get('min_overlap', None)
......@@ -207,9 +202,9 @@ class Sample(object):
if gt_boxes is not None:
ctr_x = (gt_boxes[:, 2] + gt_boxes[:, 0]) / 2.0
ctr_y = (gt_boxes[:, 3] + gt_boxes[:, 1]) / 2.0
keep_inds = np.where((ctr_x >= rand_box[0]) & (ctr_x <= rand_box[2]) &
(ctr_y >= rand_box[1]) & (ctr_y <= rand_box[3]))[0]
gt_boxes = gt_boxes[keep_inds]
keep_indices = np.where((ctr_x >= rand_box[0]) & (ctr_x <= rand_box[2]) &
(ctr_y >= rand_box[1]) & (ctr_y <= rand_box[3]))[0]
gt_boxes = gt_boxes[keep_indices]
new_gt_boxes = gt_boxes.astype(gt_boxes.dtype, copy=True)
new_gt_boxes[:, 0] = (gt_boxes[:, 0] * im_w - w_off)
new_gt_boxes[:, 1] = (gt_boxes[:, 1] * im_h - h_off)
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
sys.path.append('../../')
import cv2
import numpy as np
from seetadet.algo.ssd import transforms
from seetadet.core.config import cfg
if __name__ == '__main__':
np.random.seed(3)
cfg.TRAIN.SCALES = [300]
cfg.TRAIN.RANDOM_SCALES = [0.25, 1.00]
cfg.TRAIN.USE_COLOR_JITTER = True
transformer = transforms.Compose(
transforms.Distort(),
transforms.Expand(),
transforms.Sample(),
transforms.Resize(),
)
while True:
img = cv2.imread('cat.jpg')
boxes = np.array([[0.33, 0.04, 0.71, 0.98]], dtype=np.float32)
img, boxes = transformer(img, boxes)
for box in boxes:
x1 = int(box[0] * img.shape[1])
y1 = int(box[1] * img.shape[0])
x2 = int(box[2] * img.shape[1])
y2 = int(box[3] * img.shape[0])
cv2.rectangle(img, (x1, y1), (x2, y2), (188, 119, 64), 2)
cv2.imshow('Transforms - Preview', img)
cv2.waitKey(0)
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
def get_shifted_anchors(shapes, base_anchors, strides):
"""Return the shifted anchors on given shapes."""
anchors_to_pack = []
for i in range(len(shapes)):
height, width = shapes[i]
shift_x = (np.arange(0, width) + 0.5) * strides[i]
shift_y = (np.arange(0, height) + 0.5) * strides[i]
shift_x, shift_y = np.meshgrid(shift_x, shift_y)
shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
shift_x.ravel(), shift_y.ravel())).transpose()
# Add the A base anchors (1, A, 4) to the K cell shifts (K, 1, 4)
# to get shifted anchors (K, A, 4) and reshape to (K * A, 4)
a = base_anchors[i].shape[0]
k = shifts.shape[0]
anchors = (base_anchors[i].reshape((1, a, 4)) +
shifts.reshape((1, k, 4)).transpose((1, 0, 2)))
anchors_to_pack.append(anchors.reshape((k * a, 4)))
return np.vstack(anchors_to_pack)
......@@ -7,24 +7,41 @@
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# Codes are based on:
#
# <https://github.com/facebookresearch/Detectron/blob/master/lib/core/config.py>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os.path as osp
import numpy as np
from seetadet.utils.attrdict import AttrDict
cfg = __C = AttrDict()
###########################################
# #
# Pipeline Options #
# #
###########################################
__C.PIPELINE = AttrDict()
# The pipeline type
# Values supported as follows:
# - 'ssd'
# - 'rcnn'
# - 'default'
__C.PIPELINE.TYPE = 'default'
# RandomBBoxCrop
__C.PIPELINE.RANDOM_BBOX_CROP = AttrDict()
# - The range of scale for sampling regions
__C.PIPELINE.RANDOM_BBOX_CROP.SCALING = [0.3, 1.0]
# - The range of aspect ratio for sampling regions
__C.PIPELINE.RANDOM_BBOX_CROP.ASPECT_RATIO = [0.5, 2.0]
# - The minimum IoU to satisfy
__C.PIPELINE.RANDOM_BBOX_CROP.THRESHOLDS = [0.0, 0.1, 0.3, 0.5, 0.7, 0.9]
###########################################
# #
......@@ -32,7 +49,6 @@ cfg = __C = AttrDict()
# #
###########################################
__C.TRAIN = AttrDict()
# Initialize network with weights from this file
......@@ -46,19 +62,22 @@ __C.TRAIN.NUM_THREADS = 4
# Scales to use during training (can list multiple scales)
# Each scale is the pixel size of an image's shortest side
__C.TRAIN.SCALES = (300,)
__C.TRAIN.SCALES = (640,)
# Range to jitter the selected scale
__C.TRAIN.RANDOM_SCALES = [1., 1.]
# Max pixel size of the longest side of a scaled input image
# A square will be used if value < 1
__C.TRAIN.MAX_SIZE = 0
# Images to use per mini-batch
__C.TRAIN.IMS_PER_BATCH = 1
# The number of training batches to init for aspect grouping
__C.TRAIN.ASPECT_GROUPING = 64
# Use shuffled images during training?
__C.TRAIN.USE_SHUFFLE = True
# The number of shuffle chunks
__C.TRAIN.SHUFFLE_CHUNKS = 0
# Use horizontally-flipped images during training?
__C.TRAIN.USE_FLIPPED = True
......@@ -66,47 +85,17 @@ __C.TRAIN.USE_FLIPPED = True
# Use the difficult (under occlusion) objects
__C.TRAIN.USE_DIFF = True
# Range to jitter the image scales
__C.TRAIN.RANDOM_SCALES = [1., 1.]
# If True, randomly distort the image by brightness, contrast, and saturation
# If True, distort the brightness, contrast, and saturation
__C.TRAIN.USE_COLOR_JITTER = False
# Mini-batch size (#RoIs) for two stage detector
__C.TRAIN.BATCH_SIZE = 128
# Overlap threshold for a ROI to be considered foreground (if >= FG_THRESH)
__C.TRAIN.FG_THRESH = 0.5
# Fraction of mini-batch that is labeled foreground (i.e. class > 0)
__C.TRAIN.FG_FRACTION = 0.25
# Overlap threshold for a ROI to be considered background (class = 0 if
# overlap in [LO, HI))
__C.TRAIN.BG_THRESH_HI = 0.5
__C.TRAIN.BG_THRESH_LO = 0.0
# IOU >= thresh: positive example
__C.TRAIN.RPN_POSITIVE_OVERLAP = 0.7
# IOU < thresh: negative example
__C.TRAIN.RPN_NEGATIVE_OVERLAP = 0.3
# If an anchor satisfied by positive and negative conditions set to negative
__C.TRAIN.RPN_CLOBBER_POSITIVES = False
# Max number of foreground examples
__C.TRAIN.RPN_FG_FRACTION = 0.5
# Total number of examples
__C.TRAIN.RPN_BATCHSIZE = 256
# NMS threshold used on RPN proposals
__C.TRAIN.RPN_NMS_THRESH = 0.7
# Number of top scoring boxes to keep before apply NMS to RPN proposals
# Number of top scoring boxes to keep before NMS to RPN proposals
__C.TRAIN.RPN_PRE_NMS_TOP_N = 12000
# Number of top scoring boxes to keep after applying NMS to RPN proposals
__C.TRAIN.RPN_POST_NMS_TOP_N = 2000
# Proposal height and width both need to be greater than RPN_MIN_SIZE (at orig image scale)
__C.TRAIN.RPN_MIN_SIZE = 0
# Remove RPN anchors that go outside the image by RPN_STRADDLE_THRESH pixels
# Set to -1 or a large value, e.g. 100000, to disable pruning anchors
__C.TRAIN.RPN_STRADDLE_THRESH = 0
# Number of top scoring boxes to keep after NMS to RPN proposals
__C.TRAIN.RPN_POST_NMS_TOP_N = 2000
###########################################
# #
......@@ -114,28 +103,38 @@ __C.TRAIN.RPN_STRADDLE_THRESH = 0
# #
###########################################
__C.TEST = AttrDict()
# Dataset to test
__C.TEST.DATASET = ''
# The test protocol for dataset
# Available protocols: 'voc2007', 'voc2010', 'coco'
__C.TEST.PROTOCOL = 'voc2007'
# Original json ground-truth file to use
__C.TEST.JSON_FILE = ''
# Scales to use during testing (can list multiple scales)
# Each scale is the pixel size of an image's shortest side
__C.TEST.SCALES = (300,)
__C.TEST.SCALES = (640,)
# Max pixel size of the longest side of a scaled input image
# A square will be used if value < 1
__C.TEST.MAX_SIZE = 0
# Images to use per mini-batch
__C.TEST.IMS_PER_BATCH = 1
# Overlap threshold used for non-maximum suppression (suppress boxes with
# IoU >= this threshold)
# The threshold for predicting boxes
__C.TEST.SCORE_THRESH = 0.05
# The threshold for predicting masks
__C.TEST.BINARY_THRESH = 0.5
# Number of top scoring boxes to keep before NMS to detections
__C.TEST.PRE_NMS_TOP_N = 300
# Overlap threshold used for NMS
__C.TEST.NMS = 0.3
# Use Soft-NMS instead of standard NMS?
......@@ -144,54 +143,40 @@ __C.TEST.USE_SOFT_NMS = False
__C.TEST.SOFT_NMS_METHOD = 'linear'
__C.TEST.SOFT_NMS_SIGMA = 0.5
# The top-k prior boxes before nms.
__C.TEST.NMS_TOP_K = 400
# The threshold for predicting boxes
__C.TEST.SCORE_THRESH = 0.05
# The threshold for predicting masks
__C.TEST.BINARY_THRESH = 0.5
# NMS threshold used on RPN proposals
__C.TEST.RPN_NMS_THRESH = 0.7
# Number of top scoring boxes to keep before apply NMS to RPN proposals
# Number of top scoring boxes to keep before NMS to RPN proposals
__C.TEST.RPN_PRE_NMS_TOP_N = 6000
# Number of top scoring boxes to keep after applying NMS to RPN proposals
__C.TEST.RPN_POST_NMS_TOP_N = 300
# Proposal height and width both need to be greater than RPN_MIN_SIZE (at orig image scale)
__C.TEST.RPN_MIN_SIZE = 0
# Number of top scoring boxes to keep after NMS to RPN proposals
__C.TEST.RPN_POST_NMS_TOP_N = 1000
# Number of top scoring boxes to keep before NMS to RetinaNet detections
__C.TEST.RETINANET_PRE_NMS_TOP_N = 3000
# Save detection results files if True
# If false, results files are cleaned up (they can be large) after local
# evaluation
# If false, results files are cleaned up after evaluation
__C.TEST.COMPETITION_MODE = True
# The optional test protocol for custom dataSet
# Ignored by VOC, COCO dataSets
# Available protocols: 'voc2007', 'voc2010', 'coco'
__C.TEST.PROTOCOL = 'voc2007'
# Maximum number of detections to return per image (100 is based on the limit
# established for the COCO dataset)
# Maximum number of detections to return per image
# 100 is based on the limit established for the COCO dataset
__C.TEST.DETECTIONS_PER_IM = 100
###########################################
# #
# Model Options #
# #
###########################################
__C.MODEL = AttrDict()
# The type of the model
# ('faster_rcnn',
# 'mask_rcnn',
# 'retinanet,
# 'ssd',
# )
# The model type
# Values supported as follows:
# - 'faster_rcnn'
# - 'mask_rcnn'
# - 'retinanet'
# - 'ssd'
__C.MODEL.TYPE = ''
# The float precision for training and inference
......@@ -201,8 +186,10 @@ __C.MODEL.PRECISION = 'FLOAT32'
# The backbone
__C.MODEL.BACKBONE = ''
# The number of classes in the dataset
__C.MODEL.NUM_CLASSES = -1
# The backbone normalization module
# Values supported: 'FrozenBN', 'BN'
__C.MODEL.BACKBONE_NORM = 'FrozenBN'
# The name for each object class
__C.MODEL.CLASSES = ['__background__']
......@@ -211,33 +198,31 @@ __C.MODEL.CLASSES = ['__background__']
__C.MODEL.FREEZE_AT = 2
# The variant of ReLU activation
# ('ReLU', 'ReLU6')
# Values supported: 'ReLU', 'ReLU6'
__C.MODEL.RELU_VARIANT = 'ReLU'
# Setting of focal loss
__C.MODEL.FOCAL_LOSS_ALPHA = 0.25
__C.MODEL.FOCAL_LOSS_GAMMA = 2.0
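# Note: with the standard focal loss form
#   FL(p_t) = -FOCAL_LOSS_ALPHA * (1 - p_t) ** FOCAL_LOSS_GAMMA * log(p_t)
# an easy foreground example with p_t = 0.9 contributes roughly
# 0.25 * 0.1 ** 2 * -log(0.9) ~= 2.6e-4, i.e. it is strongly down-weighted.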
# The optional loss for bbox regression
# ('NORM', 'IOU')
__C.MODEL.REG_LOSS_TYPE = 'NORM'
# Weight for bbox regression loss
__C.MODEL.REG_LOSS_WEIGHT = 1.
# Stride of the coarsest Feature level
# Stride of the coarsest feature level
# This is needed so the input can be padded properly
__C.MODEL.COARSEST_STRIDE = 32
###########################################
# #
# RPN Options #
# #
###########################################
__C.RPN = AttrDict()
# Total number of rpn training examples per image
__C.RPN.BATCH_SIZE = 256
# Target fraction of foreground examples per training batch
__C.RPN.FG_FRACTION = 0.5
# Strides for multiple rpn heads
__C.RPN.STRIDES = [4, 8, 16, 32, 64]
......@@ -247,6 +232,20 @@ __C.RPN.SCALES = [8, 8, 8, 8, 8]
# RPN anchor aspect ratios
__C.RPN.ASPECT_RATIOS = [0.5, 1, 2]
# IoU overlap ratio for labeling an anchor as positive
# Anchors with >= iou overlap are labeled positive
__C.RPN.POSITIVE_OVERLAP = 0.7
# IoU overlap ratio for labeling an anchor as negative
# Anchors with < iou overlap are labeled negative
__C.RPN.NEGATIVE_OVERLAP = 0.3
# The optional loss for bbox regression
# Values supported: 'l1', 'smooth_l1'
__C.RPN.BBOX_REG_LOSS_TYPE = 'l1'
# Weight for bbox regression loss
__C.RPN.BBOX_REG_LOSS_WEIGHT = 1.0
###########################################
# #
......@@ -254,7 +253,6 @@ __C.RPN.ASPECT_RATIOS = [0.5, 1, 2]
# #
###########################################
__C.RETINANET = AttrDict()
# Anchor aspect ratios to use
......@@ -271,9 +269,6 @@ __C.RETINANET.ANCHOR_SCALE = 4
# NOTE: this doesn't include the last conv for logits
__C.RETINANET.NUM_CONVS = 4
# During inference, #locs to select based on cls score before NMS is performed
__C.RETINANET.PRE_NMS_TOP_N = 5000
# IoU overlap ratio for labeling an anchor as positive
# Anchors with >= iou overlap are labeled positive
__C.RETINANET.POSITIVE_OVERLAP = 0.5
......@@ -282,6 +277,12 @@ __C.RETINANET.POSITIVE_OVERLAP = 0.5
# Anchors with < iou overlap are labeled negative
__C.RETINANET.NEGATIVE_OVERLAP = 0.4
# The optional loss for bbox regression
# Values supported: 'l1', 'smooth_l1', 'giou'
__C.RETINANET.BBOX_REG_LOSS_TYPE = 'l1'
# Weight for bbox regression loss
__C.RETINANET.BBOX_REG_LOSS_WEIGHT = 1.0
###########################################
# #
......@@ -289,7 +290,6 @@ __C.RETINANET.NEGATIVE_OVERLAP = 0.4
# #
###########################################
__C.FPN = AttrDict()
# Channel dimension of the FPN feature levels
......@@ -303,32 +303,52 @@ __C.FPN.RPN_MIN_LEVEL = 2
# Hyper-Parameters for the RoI-to-FPN level mapping heuristic
__C.FPN.ROI_CANONICAL_SCALE = 224
__C.FPN.ROI_CANONICAL_LEVEL = 4
# Coarsest level of the FPN pyramid
__C.FPN.ROI_MAX_LEVEL = 5
# Finest level of the FPN pyramid
__C.FPN.ROI_MIN_LEVEL = 2
###########################################
# #
# Fast R-CNN Options #
# #
###########################################
__C.FRCNN = AttrDict()
# RoI transformation function (e.g., RoIPool or RoIAlign)
__C.FRCNN.ROI_XFORM_METHOD = 'RoIPool'
# Total number of training RoIs per image
__C.FRCNN.BATCH_SIZE = 128
# Hidden layer dimension when using an MLP for the RoI box head
__C.FRCNN.MLP_HEAD_DIM = 1024
# Target fraction of foreground RoIs per training batch
__C.FRCNN.FG_FRACTION = 0.25
# IoU overlap ratio for labeling a RoI as positive
# RoIs with >= iou overlap are labeled positive
__C.FRCNN.POSITIVE_OVERLAP = 0.5
# IoU overlap ratio for labeling a RoI as negative
# RoIs with iou overlap in [LO, HI) are labeled negative
__C.FRCNN.NEGATIVE_OVERLAP_HI = 0.5
__C.FRCNN.NEGATIVE_OVERLAP_LO = 0.0
# RoI transform function
# Values supported: 'RoIPool', 'RoIAlign'
__C.FRCNN.ROI_XFORM_METHOD = 'RoIAlign'
# RoI transform output resolution
# Note: some models may have constraints on what they can use, e.g. they use
# pretrained FC layers like in VGG16, and will ignore this option
__C.FRCNN.ROI_XFORM_RESOLUTION = 7
# Resampling window size for RoI transformation
__C.FRCNN.ROI_XFORM_SAMPLING_RATIO = 0
# Hidden layer dimension when using an MLP for the RoI box head
__C.FRCNN.MLP_HEAD_DIM = 1024
# The optional loss for bbox regression
# Values supported: 'l1', 'smooth_l1'
__C.FRCNN.BBOX_REG_LOSS_TYPE = 'l1'
# Weight for bbox regression loss
__C.FRCNN.BBOX_REG_LOSS_WEIGHT = 1.0
###########################################
# #
......@@ -336,18 +356,18 @@ __C.FRCNN.ROI_XFORM_RESOLUTION = 7
# #
###########################################
__C.MRCNN = AttrDict()
# Resolution of mask predictions
__C.MRCNN.RESOLUTION = 28
# RoI transformation function (e.g., RoIPool or RoIAlign)
# RoI transform function
# Values supported: 'RoIPool', 'RoIAlign'
__C.MRCNN.ROI_XFORM_METHOD = 'RoIAlign'
# RoI transformation function (e.g., RoIPool or RoIAlign)
# RoI transform output resolution
__C.MRCNN.ROI_XFORM_RESOLUTION = 14
# Resampling window size for RoI transformation
__C.MRCNN.ROI_XFORM_SAMPLING_RATIO = 0
###########################################
# #
......@@ -355,74 +375,55 @@ __C.MRCNN.ROI_XFORM_RESOLUTION = 14
# #
###########################################
__C.SSD = AttrDict()
# Convolutions to use in the cls and bbox tower
# NOTE: this doesn't include the last conv for logits
__C.SSD.NUM_CONVS = 0
# MultiBox configs
__C.SSD.MULTIBOX = AttrDict()
__C.SSD.MULTIBOX.STRIDES = []
__C.SSD.MULTIBOX.MIN_SIZES = []
__C.SSD.MULTIBOX.MAX_SIZES = []
__C.SSD.MULTIBOX.ASPECT_RATIOS = []
# OHEM configs
__C.SSD.OHEM = AttrDict()
# The threshold for selecting negative bbox in hard example mining
__C.SSD.OHEM.NEG_OVERLAP = 0.5
# The ratio used in hard example mining
__C.SSD.OHEM.NEG_POS_RATIO = 3.0
# Samplers
# Format as (min_scale, max_scale,
# min_aspect_ratio, max_aspect_ratio,
# min_overlap, max_overlap,
# max_trials, max_sample)
__C.SSD.SAMPLERS = [
(1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1, 1), # Entire image
(0.3, 1.0, 0.5, 2.0, 0.1, 1.0, 10, 1), # IoU >= 0.1
(0.3, 1.0, 0.5, 2.0, 0.3, 1.0, 10, 1), # IoU >= 0.3
(0.3, 1.0, 0.5, 2.0, 0.5, 1.0, 5, 1), # IoU >= 0.5
(0.3, 1.0, 0.5, 2.0, 0.7, 1.0, 5, 1), # IoU >= 0.7
(0.3, 1.0, 0.5, 2.0, 0.9, 1.0, 5, 1), # IoU >= 0.9
(0.3, 1.0, 0.5, 2.0, 0.0, 1.0, 1, 1), # Any patches
]
# Anchor aspect ratios to use
__C.SSD.ASPECT_RATIOS = []
# Strides for multiple ssd heads
__C.SSD.STRIDES = []
###########################################
# #
# ResNet Options #
# #
###########################################
# Anchor sizes to use
__C.SSD.ANCHOR_SIZES = []
# IoU overlap ratio for labeling an anchor as positive
# Anchors with >= iou overlap are labeled positive
__C.SSD.POSITIVE_OVERLAP = 0.5
__C.RESNET = AttrDict()
# IoU overlap ratio for labeling an anchor as negative
# Anchors with < iou overlap are labeled negative
__C.SSD.NEGATIVE_OVERLAP = 0.5
# Number of groups to use; 1 ==> ResNet; > 1 ==> ResNeXt
__C.RESNET.NUM_GROUPS = 1
# The ratio to sample negative anchors as background
__C.SSD.NEGATIVE_POSITIVE_RATIO = 3.0
# Baseline width of each group
__C.RESNET.GROUP_WIDTH = 64
# The optional loss for bbox regression
# Values supported: 'l1', 'smooth_l1', 'giou'
__C.SSD.BBOX_REG_LOSS_TYPE = 'l1'
# Weight for bbox regression loss
__C.SSD.BBOX_REG_LOSS_WEIGHT = 1.0
###########################################
# #
# DropBlock Options #
# ResNet Options #
# #
###########################################
__C.RESNET = AttrDict()
__C.DROPBLOCK = AttrDict()
# Whether to use DropBlock for more regularization
__C.DROPBLOCK.DROP_ON = False
# Decrement for scheduling keep prob after each iteration
__C.DROPBLOCK.DECREMENT = 1e-6
# Number of groups to use
# 1 ==> ResNet; > 1 ==> ResNeXt
# ResNeXt 32x8d: NUM_GROUPS, WIDTH_PER_GROUP = 32, 8
# ResNeXt 64x4d: NUM_GROUPS, WIDTH_PER_GROUP = 64, 4
__C.RESNET.NUM_GROUPS = 1
# Baseline width of each group
__C.RESNET.WIDTH_PER_GROUP = 64
###########################################
# #
......@@ -430,7 +431,6 @@ __C.DROPBLOCK.DECREMENT = 1e-6
# #
###########################################
__C.SOLVER = AttrDict()
# The interval to display logs
......@@ -438,13 +438,13 @@ __C.SOLVER.DISPLAY = 20
# The interval to snapshot a model
__C.SOLVER.SNAPSHOT_EVERY = 5000
# Prefix to yield the path: <prefix>_iters_XYZ.pth
# Prefix to yield the path: <prefix>_iter_XYZ.pkl
__C.SOLVER.SNAPSHOT_PREFIX = ''
# Optional scaling factor for total loss
# This option is helpful to scale the magnitude
# of gradients during FP16 training
__C.SOLVER.LOSS_SCALING = 1.
__C.SOLVER.LOSS_SCALING = 1.0
# Maximum number of SGD iterations
__C.SOLVER.MAX_STEPS = 40000
......@@ -468,9 +468,10 @@ __C.SOLVER.LR_POLICY = 'steps_with_decay'
__C.SOLVER.MOMENTUM = 0.9
# L2 regularization for weight parameters
__C.SOLVER.WEIGHT_DECAY = 0.0001
# L2 regularization for legacy bias parameters
__C.SOLVER.WEIGHT_DECAY_BIAS = 0.0
# L2 norm factor for clipping gradients
__C.SOLVER.CLIP_NORM = -1.0
__C.SOLVER.CLIP_NORM = 0.0
###########################################
# #
......@@ -478,21 +479,18 @@ __C.SOLVER.CLIP_NORM = -1.0
# #
###########################################
# Number of GPUs to use (applies to both training and testing)
# Number of GPUs to use during training
__C.NUM_GPUS = 1
# Use NCCL for all reduce, otherwise use cuda-aware mpi
__C.USE_NCCL = True
# Use DALI to load the batch of data instead of original pipeline
__C.USE_DALI = False
# Hosts for Inter-Machine communication
__C.HOSTS = []
# Pixel mean values (BGR order)
__C.PIXEL_MEANS = [102., 115., 122.]
# Pixel stddev and mean values (BGR order)
__C.PIXEL_STDS = [1.0, 1.0, 1.0]
__C.PIXEL_MEANS = [103.53, 116.28, 123.675]
# Default weights on (dx, dy, dw, dh) for normalizing bbox regression targets
# These are empirically chosen to approximately lead to unit variance targets
......@@ -505,57 +503,30 @@ __C.PRIOR_PROB = 0.01
# For reproducibility
__C.RNG_SEED = 3
# Root directory of project
__C.ROOT_DIR = osp.abspath(osp.join(osp.dirname(__file__), '..', '..'))
# Data directory
__C.DATA_DIR = osp.abspath(osp.join(__C.ROOT_DIR, 'data'))
# Place outputs under an experiments directory
__C.EXP_DIR = ''
# Default GPU device id
# Default GPU device index
__C.GPU_ID = 0
# Dump detection visualizations
# Show detection visualizations
__C.VIS = False
# Write detection visualizations instead of showing
__C.VIS_ON_FILE = False
# Score threshold for visualization
__C.VIS_TH = 0.7
# Write summaries by tensor board
# Write summaries by TensorBoard
__C.ENABLE_TENSOR_BOARD = False
def _merge_a_into_b(a, b):
"""Merge config dictionary a into config dictionary b, clobbering the
options in b whenever they are also specified in a.
"""
if not isinstance(a, dict):
return
for k, v in a.items():
# a must specify keys that are in b
if k not in b:
raise KeyError('{} is not a valid config key'.format(k))
# the types must match, too
v = _check_and_coerce_cfg_value_type(v, b[k], k)
# recursively merge dicts
if type(v) is AttrDict:
try:
_merge_a_into_b(a[k], b[k])
except:
print('Error under config key: {}'.format(k))
raise
else:
b[k] = v
def cfg_from_file(filename):
"""Load a config file and merge it into the default options."""
import yaml
with open(filename, 'r') as f:
yaml_cfg = AttrDict(yaml.load(f))
yaml_cfg = AttrDict(yaml.safe_load(f))
global __C
_merge_a_into_b(yaml_cfg, __C)
......@@ -567,35 +538,51 @@ def cfg_from_list(cfg_list):
for k, v in zip(cfg_list[0::2], cfg_list[1::2]):
key_list = k.split('.')
d = __C
for subkey in key_list[:-1]:
assert d.has_key(subkey)
d = d[subkey]
subkey = key_list[-1]
assert subkey in d
for sub_key in key_list[:-1]:
assert sub_key in d
d = d[sub_key]
sub_key = key_list[-1]
assert sub_key in d
try:
value = literal_eval(v)
except:
except: # noqa
# Handle the case when v is a string literal
value = v
assert type(value) == type(d[subkey]), \
'type {} does not match original type {}'\
.format(type(value), type(d[subkey]))
d[subkey] = value
if type(value) != type(d[sub_key]): # noqa
raise TypeError('Type {} does not match original type {}'
.format(type(value), type(d[sub_key])))
d[sub_key] = value
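# A minimal, self-contained sketch of the key/value pair format consumed by
# cfg_from_list above; the option names and values here are illustrative only.
from ast import literal_eval

opts = ['TRAIN.SCALES', '(512,)', 'SOLVER.MAX_STEPS', '80000', 'EXP_DIR', 'exp1']
parsed = {}
for k, v in zip(opts[0::2], opts[1::2]):
    try:
        value = literal_eval(v)      # '(512,)' -> (512,), '80000' -> 80000
    except (ValueError, SyntaxError):
        value = v                    # plain strings such as 'exp1' stay as-is
    parsed[k] = value
print(parsed)  # {'TRAIN.SCALES': (512,), 'SOLVER.MAX_STEPS': 80000, 'EXP_DIR': 'exp1'}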
def _merge_a_into_b(a, b):
"""Merge config dictionary a into config dictionary b, clobbering the
options in b whenever they are also specified in a."""
if not isinstance(a, dict):
return
for k, v in a.items():
# a must specify keys that are in b
if k not in b:
raise KeyError('{} is not a valid config key'.format(k))
# The types must match, too
v = _check_and_coerce_cfg_value_type(v, b[k], k)
# Recursively merge dicts
if type(v) is AttrDict:
try:
_merge_a_into_b(a[k], b[k])
except: # noqa
print('Error under config key: {}'.format(k))
raise
else:
b[k] = v
def _check_and_coerce_cfg_value_type(value_a, value_b, key):
"""Checks that `value_a`, which is intended to replace `value_b` is of the
right type. The type is correct if it matches exactly or is one of a few
cases in which the type can be easily coerced.
"""
# The types must match (with some exceptions)
type_b = type(value_b)
type_a = type(value_a)
"""Check if the value type matched."""
type_a, type_b = type(value_a), type(value_b)
if type_a is type_b:
return value_a
if type_b is float and type_a is int:
return float(value_a)
# Exceptions: numpy arrays, strings, tuple<->list
if isinstance(value_b, np.ndarray):
value_a = np.array(value_a, dtype=value_b.dtype)
......
......@@ -14,12 +14,14 @@ from __future__ import division
from __future__ import print_function
import os
import shutil
import os.path as osp
import time
import numpy as np
from seetadet.core.config import cfg
from seetadet.core.config import cfg_from_file
from seetadet.utils import logger
class Coordinator(object):
......@@ -31,24 +33,24 @@ class Coordinator(object):
if cfg.EXP_DIR != '':
exp_dir = cfg.EXP_DIR
if exp_dir is None:
model_id = time.strftime(
'%Y%m%d_%H%M%S', time.localtime(time.time()))
self.experiment_dir = '../experiments/{}'.format(model_id)
if not os.path.exists(self.experiment_dir):
os.makedirs(self.experiment_dir)
name = time.strftime('%Y%m%d_%H%M%S',
time.localtime(time.time()))
exp_dir = '../experiments/{}'.format(name)
if not osp.exists(exp_dir):
os.makedirs(exp_dir)
else:
if not os.path.exists(exp_dir):
raise ValueError('ExperimentDir({}) does not exist.'.format(exp_dir))
self.experiment_dir = exp_dir
if not osp.exists(exp_dir):
raise ValueError('Invalid experiment dir: ' + exp_dir)
self.exp_dir = exp_dir
def _path_at(self, file, auto_create=True):
try:
path = os.path.abspath(os.path.join(self.experiment_dir, file))
if auto_create and not os.path.exists(path):
path = osp.abspath(osp.join(self.exp_dir, file))
if auto_create and not osp.exists(path):
os.makedirs(path)
except OSError:
path = os.path.abspath(os.path.join('/tmp', file))
if auto_create and not os.path.exists(path):
path = osp.abspath(osp.join('/tmp', file))
if auto_create and not osp.exists(path):
os.makedirs(path)
return path
......@@ -61,32 +63,35 @@ class Coordinator(object):
def results_dir(self, checkpoint=None, output_dir=None):
if output_dir is not None:
return output_dir
sub_dir = os.path.splitext(os.path.basename(checkpoint))[0] if checkpoint else ''
return self._path_at(os.path.join('results', sub_dir))
path = osp.splitext(osp.basename(checkpoint))[0] if checkpoint else ''
return self._path_at(osp.join('results', path))
def checkpoint(self, step=None, last_idx=1, wait=False):
path = self.checkpoints_dir()
def checkpoint(self, global_step=None, wait=True):
def locate():
files = os.listdir(self.checkpoints_dir())
steps = []
for ix, file in enumerate(files):
step = int(file.split('_iter_')[-1].split('.')[0])
if global_step == step:
return os.path.join(self.checkpoints_dir(), files[ix]), step
steps.append(step)
if global_step is None:
def locate(last_idx=None):
files = os.listdir(path)
files = list(filter(lambda x: '_iter_' in x and
x.endswith('.pkl'), files))
file_steps = []
for i, file in enumerate(files):
file_step = int(file.split('_iter_')[-1].split('.')[0])
if step == file_step:
return osp.join(path, files[i]), file_step
file_steps.append(file_step)
if step is None:
if len(files) == 0:
return None, 0
last_idx = int(np.argmax(steps))
last_step = steps[last_idx]
return os.path.join(self.checkpoints_dir(), files[last_idx]), last_step
if last_idx > len(files):
return None, 0
file = files[np.argsort(file_steps)[-last_idx]]
file_step = file_steps[np.argsort(file_steps)[-last_idx]]
return osp.join(path, file), file_step
return None, 0
result = locate()
while result[0] is None and wait:
print('\rWaiting for step_{}.checkpoint to exist...'.format(global_step), end='')
time.sleep(10)
result = locate()
return result
def delete_experiment(self):
if os.path.exists(self.experiment_dir):
shutil.rmtree(self.experiment_dir)
file, file_step = locate(last_idx)
while file is None and wait:
logger.info('Waiting for checkpoint at step {}.'.format(step))
time.sleep(10)
file, file_step = locate(last_idx)
return file, file_step
......@@ -34,10 +34,10 @@ class Registry(object):
if self.has(key):
raise KeyError(
'`%s` has been registered in %s.'
% (key, self._name)
)
self._registry[key] = functools.partial(
inner_function, **kwargs)
% (key, self._name))
self._registry[key] = \
functools.partial(inner_function, **kwargs)
return inner_function
if func is not None:
return decorated(func)
return decorated
......@@ -46,8 +46,7 @@ class Registry(object):
if not self.has(name):
raise KeyError(
"`%s` is not registered in <%s>."
% (name, self._name)
)
% (name, self._name))
return self._registry[name]
def try_get(self, name):
......@@ -56,5 +55,5 @@ class Registry(object):
return None
backbones = Registry('backbones')
models = Registry('models')
backbone = Registry('backbone')
fusion_pass = Registry('fusion_pass')
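# A minimal, self-contained sketch of the register/get pattern above; the
# registry name, the key 'resnet50', and the builder below are illustrative
# only, not the project's actual entries.
import functools

class _RegistrySketch(object):
    """Map string keys to (partially applied) builder functions."""

    def __init__(self, name):
        self._name, self._registry = name, {}

    def register(self, key, **kwargs):
        def decorated(func):
            # Store the builder with its default keyword arguments bound
            self._registry[key] = functools.partial(func, **kwargs)
            return func
        return decorated

    def get(self, key):
        if key not in self._registry:
            raise KeyError('`%s` is not registered in <%s>.' % (key, self._name))
        return self._registry[key]

backbone_sketch = _RegistrySketch('backbone')

@backbone_sketch.register('resnet50', depth=50)
def build_resnet(depth):
    return 'resnet-%d' % depth

print(backbone_sketch.get('resnet50')())  # resnet-50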
......@@ -13,6 +13,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import datetime
import importlib
import multiprocessing
......@@ -23,45 +24,48 @@ from seetadet.utils import time_util
from seetadet.utils.vis import vis_one_image
def run_test_net(checkpoint, server, devices, read_every=1000):
def run_test_net(
checkpoint,
server,
devices,
read_every=1000,
log_every=100,
):
classes = server.classes
num_images = server.num_images
num_classes = server.num_classes
devices = devices if devices else [cfg.GPU_ID]
num_workers = len(devices)
read_stride = float(num_workers * cfg.TEST.IMS_PER_BATCH)
read_every = int(np.ceil(read_every / read_stride) * read_stride)
log_every = log_every if log_every > 0 else num_images
test_module = 'seetadet.algo.%s.test' % cfg.MODEL.TYPE
test_fn = importlib.import_module(test_module).test_net
_t = time_util.new_timers('im_detect', 'mask_detect', 'misc')
test_fn = getattr(importlib.import_module(test_module), 'test_net')
timers = time_util.new_timers('im_detect', 'misc')
vis_image_dict = {}
all_boxes = [[[] for _ in range(num_images)] for _ in range(num_classes)]
all_masks = [[[] for _ in range(num_images)] for _ in range(num_classes)]
queues = [
multiprocessing.Queue()
for _ in range(num_workers + 1)
]
queues = [multiprocessing.Queue() for _ in range(num_workers + 1)]
workers = [
multiprocessing.Process(
target=test_fn,
kwargs={
'weights': checkpoint,
'num_classes': server.num_classes,
'q_in': queues[i],
'q_out': queues[-1],
'device': devices[i],
'root_logger': i == 0,
}
) for i in range(num_workers)
]
for process in workers:
process.start()
num_sends = 0
for count in range(num_images):
if count >= num_sends:
num_to_send = min(read_every, num_images - num_sends)
......@@ -87,7 +91,7 @@ def run_test_net(checkpoint, server, devices, read_every=1000):
# Update time difference
for name, diff in time_diffs.items():
_t[name].add_diff(diff)
timers[name].add_diff(diff)
# Visualize the results if necessary
if cfg.VIS or cfg.VIS_ON_FILE:
......@@ -104,8 +108,6 @@ def run_test_net(checkpoint, server, devices, read_every=1000):
)
del vis_image_dict[i]
_t['misc'].tic()
# Pack the results in the class-major order
for j in range(1, num_classes):
all_boxes[j][i] = boxes_this_image[j]
......@@ -131,14 +133,15 @@ def run_test_net(checkpoint, server, devices, read_every=1000):
if all_masks is not None:
all_masks[j][i] = all_masks[j][i][keep]
_t['misc'].toc()
print('\rim_detect: {:d}/{:d} {:.3f}s|{:.3f}s {:.3f}s'
.format(count + 1, num_images,
_t['im_detect'].average_time,
_t['mask_detect'].average_time,
_t['misc'].average_time),
end='')
if (count + 1) % log_every == 0:
avg_total_time = np.sum([t.average_time for t in timers.values()])
eta_seconds = avg_total_time * (num_images - count - 1)
print('\rim_detect: {:d}/{:d} [{:.3f}s + {:.3f}s] (eta: {})'
.format(count + 1, num_images,
timers['im_detect'].average_time,
timers['misc'].average_time,
str(datetime.timedelta(seconds=int(eta_seconds)))),
end='')
print('\n\n>>> Evaluating detections\n')
server.evaluate_detections(all_boxes)
......
......@@ -26,6 +26,8 @@ from seetadet.datasets.factory import get_dataset
class _Server(object):
"""Base server class."""
def __init__(self, output_dir):
self.output_dir = output_dir
if cfg.VIS_ON_FILE:
......@@ -47,9 +49,11 @@ class _Server(object):
if cfg.VIS_ON_FILE else None
class TestServer(_Server):
class EvaluateServer(_Server):
"""Server to evaluate network with ground-truth."""
def __init__(self, output_dir):
super(TestServer, self).__init__(output_dir)
super(EvaluateServer, self).__init__(output_dir)
self.dataset = get_dataset(cfg.TEST.DATASET)
self.dataset.competition_mode(cfg.TEST.COMPETITION_MODE)
self.classes = self.dataset.classes
......@@ -57,7 +61,7 @@ class TestServer(_Server):
self.num_classes = self.dataset.num_classes
self.data_reader = dragon.io.DataReader(
dataset=self.dataset.cls, source=self.dataset.source)
self.data_reader.q_out = mp.Queue(cfg.TEST.IMS_PER_BATCH * 5)
self.data_reader.q_out = mp.Queue(cfg.TEST.IMS_PER_BATCH * 4)
self.data_reader.start()
self.gt_recs = collections.OrderedDict()
......@@ -75,8 +79,7 @@ class TestServer(_Server):
if len(self.gt_recs) != self.num_images:
raise RuntimeError(
'Loading {} records, while {} required.'
.format(len(self.gt_recs), self.num_images),
)
.format(len(self.gt_recs), self.num_images))
return self.gt_recs
def evaluate_detections(self, all_boxes):
......@@ -99,13 +102,15 @@ class TestServer(_Server):
class InferServer(_Server):
"""Server to run inference."""
def __init__(self, output_dir):
super(InferServer, self).__init__(output_dir)
self.images_dir = cfg.TEST.DATASET
self.images = os.listdir(self.images_dir)
self.classes = cfg.MODEL.CLASSES
self.num_images = len(self.images)
self.num_classes = cfg.MODEL.NUM_CLASSES
self.num_classes = len(cfg.MODEL.CLASSES)
self.output_dir = output_dir
self.image_idx = 0
......
......@@ -7,10 +7,6 @@
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# Codes are based on:
#
# <https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/fast_rcnn/train.py>
#
# ------------------------------------------------------------
from __future__ import absolute_import
......@@ -20,7 +16,7 @@ from __future__ import print_function
import collections
import os
import dragon.vm.torch as torch
from dragon.vm import torch
from seetadet.core.config import cfg
from seetadet.solver.sgd import SGDSolver
......@@ -30,6 +26,8 @@ from seetadet.utils.stats import SmoothedValue
class SolverWrapper(object):
"""Sovler wrapper."""
def __init__(self, coordinator):
self.output_dir = coordinator.checkpoints_dir()
self.solver = SGDSolver()
......@@ -37,10 +35,10 @@ class SolverWrapper(object):
# Setup the detector
self.detector.load_weights(cfg.TRAIN.WEIGHTS)
self.detector.cuda(cfg.GPU_ID)
if cfg.MODEL.PRECISION.lower() == 'float16':
# Mixed precision training
self.detector.half()
self.detector.cuda(cfg.GPU_ID)
# Plan the metrics
self.board = None
......@@ -48,14 +46,14 @@ class SolverWrapper(object):
if cfg.ENABLE_TENSOR_BOARD and logger.is_root():
try:
from dragon.tools.tensorboard import TensorBoard
log_dir = coordinator.experiment_dir + '/logs'
log_dir = coordinator.exp_dir + '/logs'
self.board = TensorBoard(log_dir=log_dir)
except ImportError:
pass
def snapshot(self):
filename = cfg.SOLVER.SNAPSHOT_PREFIX + \
'_iter_{}.pth'.format(self.solver.iter)
filename = (cfg.SOLVER.SNAPSHOT_PREFIX +
'_iter_{}.pkl'.format(self.solver.iter))
filename = os.path.join(self.output_dir, filename)
if logger.is_root() and not os.path.exists(filename):
torch.save(self.detector.state_dict(), filename)
......@@ -74,26 +72,24 @@ class SolverWrapper(object):
for k, v in self.metrics.items():
if k == 'total':
self.board.scalar_summary(
'total_loss', v.get_median(), stats['iter'])
'total_loss', v.median(), stats['iter'])
else:
self.board.scalar_summary(
k, v.get_median(), stats['iter'])
self.board.scalar_summary(k, v.median(), stats['iter'])
def step(self):
display = self.solver.iter % cfg.SOLVER.DISPLAY == 0
stats = self.solver.one_step()
stats = self.solver.step()
self.add_metrics(stats)
if display:
logger.info(
'Iteration %d, lr = %.8f, loss = %f, time = %.2fs'
% (stats['iter'], stats['lr'],
self.metrics['total'].get_median(), stats['time']))
self.metrics['total'].median(), stats['time']))
for k, v in self.metrics.items():
if k == 'total':
continue
logger.info(' ' * 10 + 'Train net output({}): {}'
.format(k, v.get_median()))
.format(k, v.median()))
self.send_metrics(stats)
def train_model(self):
......@@ -102,14 +98,11 @@ class SolverWrapper(object):
max_steps = cfg.SOLVER.MAX_STEPS
while self.solver.iter < max_steps:
# Apply 1-step SGD update
with timer.tic_and_toc():
_, global_step = self.step(), self.solver.iter
if global_step % (10 * cfg.SOLVER.DISPLAY) == 0:
logger.info(time_util.get_progress_info(timer, global_step, max_steps))
if global_step % cfg.SOLVER.SNAPSHOT_EVERY == 0:
_, step = self.step(), self.solver.iter
if step % (10 * cfg.SOLVER.DISPLAY) == 0:
logger.info(time_util.get_progress_info(timer, step, max_steps))
if step % cfg.SOLVER.SNAPSHOT_EVERY == 0:
self.snapshot()
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import dragon.vm.dali as dali
import numpy as np
from seetadet.core.config import cfg
class DataReader(dali.ops.KPLRecordReader):
def __init__(
self,
path,
features,
pipeline,
shard_id=0,
num_shards=1,
shuffle_after_epoch=False,
shuffle_chunks=0,
aspect_grouping=False,
):
super(DataReader, self).__init__(
path=path,
features=features,
pipeline=pipeline,
shard_id=shard_id,
num_shards=num_shards,
shuffle_after_epoch=shuffle_after_epoch,
shuffle_chunks=shuffle_chunks,
)
self._aspect_grouping = aspect_grouping
self._class_to_ind = dict(zip(
cfg.MODEL.CLASSES,
range(len(cfg.MODEL.CLASSES))
))
self._queue1, self._queue2 = [], []
def feed_inputs(self):
if not self._aspect_grouping:
feed_dict = collections.defaultdict(list)
for i in range(self._pipe.batch_size):
while True:
example = self._buffer.get()
if len(example['object']) > 0:
break
data = self.example_to_data(example)
for k, v in data.items():
feed_dict[k].append(v)
for k, v in self.features.items():
self._pipe.feed_input(self.features[k], feed_dict[k])
else:
batch_size = self._pipe.batch_size
while True:
batch_data = None
if len(self._queue1) >= batch_size:
batch_data = self._queue1[:batch_size]
self._queue1 = self._queue1[batch_size:]
elif len(self._queue2) >= batch_size:
batch_data = self._queue2[:batch_size]
self._queue2 = self._queue2[batch_size:]
if batch_data is not None:
feed_dict = collections.defaultdict(list)
for data in batch_data:
for k, v in data.items():
feed_dict[k].append(v)
for k, v in self.features.items():
self._pipe.feed_input(self.features[k], feed_dict[k])
break
while True:
example = self._buffer.get()
if len(example['object']) > 0:
break
data = self.example_to_data(example)
ratio = float(data['shape'][0]) / data['shape'][1]
if ratio > 1:
self._queue1.append(data)
else:
self._queue2.append(data)
def example_to_data(self, example):
bbox_data, bbox_ratio, bbox_label = [], [], []
h, w, c = example['height'], example['width'], example['depth']
for obj in example['object']:
x1 = float(max(obj['xmin'], 0))
y1 = float(max(obj['ymin'], 0))
x2 = float(min(obj['xmax'], w - 1))
y2 = float(min(obj['ymax'], h - 1))
bbox_data.append([x1, y1, x2, y2])
bbox_ratio.append([x1 / w, y1 / h, x2 / w, y2 / h])
bbox_label.append(self._class_to_ind[obj['name']])
return {
'image': example['content'],
'shape': np.array([h, w, c], 'int64'),
'bbox/data': np.array(bbox_data, 'float32'),
'bbox/ratio': np.array(bbox_ratio, 'float32'),
'bbox/label': np.array(bbox_label, 'int32')
}
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from dragon.vm import dali
from dragon.vm.dali.plugin.pytorch import DALIGenericIterator
from seetadet.core.config import cfg
from seetadet.dali.data_reader import DataReader
class Pipeline(dali.Pipeline):
def __init__(self, source):
super(Pipeline, self).__init__(
batch_size=cfg.TRAIN.IMS_PER_BATCH,
num_threads=cfg.TRAIN.NUM_THREADS,
)
random_scales = cfg.TRAIN.RANDOM_SCALES
if random_scales[1] > 1:
raise ValueError('The max scale range should be <= 1.')
mean_values = np.array(cfg.PIXEL_MEANS, 'int64').tolist()
self.max_size = cfg.TRAIN.MAX_SIZE
self.reader = DataReader(
path=source,
features=['image', 'shape', 'bbox/data', 'bbox/label'],
pipeline=self,
shard_id=dali.get_distributed_info()[0],
num_shards=dali.get_distributed_info()[1],
shuffle_after_epoch=cfg.TRAIN.USE_SHUFFLE,
shuffle_chunks=cfg.TRAIN.SHUFFLE_CHUNKS,
aspect_grouping=True,
)
self.decode = dali.ops.ImageDecoder()
self.resize = dali.ops.Resize(max_size=self.max_size)
self.brightness_contrast = dali.ops.BrightnessContrast()
self.hsv = dali.ops.Hsv()
self.cmn = dali.ops.CropMirrorNormalize(
mean=np.array(mean_values, 'int64').tolist(),
std=[1., 1., 1.],
)
self.pad = dali.ops.Pad(
axes=[1, 2],
align=cfg.MODEL.COARSEST_STRIDE
if cfg.MODEL.COARSEST_STRIDE > 0 else None,
)
with dali.device('cpu'):
self.resize_rng = dali.ops.Uniform([
cfg.TRAIN.SCALES[0] * random_scales[0],
cfg.TRAIN.SCALES[0] * random_scales[1],
])
self.twist_rng = dali.ops.Uniform([0.6, 1.4])
self.flip_rng = dali.ops.CoinFlip(0.5 if cfg.TRAIN.USE_FLIPPED else 0.)
def iter_setup(self):
self.reader.feed_inputs()
def define_graph(self):
# Read inputs from file
inputs = self.reader()
shape = inputs['shape']
bbox = inputs['bbox/data']
label = inputs['bbox/label']
# Decode image
image = self.decode(inputs['image'])
# Augment the color space
if cfg.TRAIN.USE_COLOR_JITTER:
image = self.hsv(
self.brightness_contrast(
image,
brightness=self.twist_rng(),
contrast=self.twist_rng(),
),
saturation=self.twist_rng()
)
# Resize to the target size
target_size = self.resize_rng()
image = self.resize(image, resize_shorter=target_size)
# Normalize and pad to blob shape
apply_flip = self.flip_rng()
image = self.cmn(image, mirror=apply_flip)
image = self.pad(image)
return image, bbox, label, target_size, shape, apply_flip
class Iterator(DALIGenericIterator):
def __init__(self, pipeline):
super(Iterator, self).__init__(pipeline)
@property
def handlers(self):
return ([0], self.copy_handler,), \
([1, 2, 3, 4, 5], self.gt_handler)
def next(self):
(images,), (gt_boxes, ims_info) = self.__next__()
return {'data': images, 'gt_boxes': gt_boxes, 'ims_info': ims_info}
def gt_handler(self, tensors):
def impl(box_list, labels, im_shape, target_size, max_size, flip):
num_images = len(box_list)
im_size_min = np.min(im_shape[:, :2], axis=1).astype('float32')
im_size_max = np.max(im_shape[:, :2], axis=1).astype('float32')
im_scales = target_size / im_size_min
inds = np.where(np.round(im_scales * im_size_max) > max_size)[0]
im_scales[inds] = max_size / im_size_max[inds]
box_list = [box_list[i] * im_scales[i] for i in range(num_images)]
for i in (np.where(flip > 0)[0]):
boxes = box_list[i]
boxes_flipped = box_list[i].copy()
width = im_shape[i, 1] * im_scales[i]
boxes_flipped[:, 0] = width - boxes[:, 2] - 1
boxes_flipped[:, 2] = width - boxes[:, 0] - 1
box_list[i] = boxes_flipped
im_scales = np.expand_dims(im_scales, 1)
batch_inds = [np.ones([e.size, 1]) * i for i, e in enumerate(labels)]
boxes = np.concatenate(box_list)
labels = np.expand_dims(np.concatenate(labels), axis=1)
batch_inds = np.concatenate(batch_inds)
gt_boxes = np.hstack([boxes, labels, batch_inds])
ims_info = np.hstack([im_shape[:, :2] * im_scales, im_scales])
return gt_boxes.astype('float32'), ims_info.astype('float32')
bbox, label, target_size, shape, flip = tensors
shape = shape.as_array()
return impl(
box_list=[bbox.at(i) for i in range(len(shape))],
labels=[label.at(i) for i in range(len(shape))],
im_shape=shape,
target_size=target_size.as_array().squeeze(),
max_size=self._pipe.max_size,
flip=flip.as_array()
)
def new_iterator(source):
with dali.device('cuda', cfg.GPU_ID):
return Iterator(Pipeline(source))
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from dragon.vm import dali
from dragon.vm.dali.plugin.pytorch import DALIGenericIterator
from seetadet.core.config import cfg
from seetadet.dali.data_reader import DataReader
class Pipeline(dali.Pipeline):
def __init__(self, source):
super(Pipeline, self).__init__(
batch_size=cfg.TRAIN.IMS_PER_BATCH,
num_threads=cfg.TRAIN.NUM_THREADS,
)
paste_ratio = 1. / cfg.TRAIN.RANDOM_SCALES[0]
mean_values = np.array(cfg.PIXEL_MEANS, 'int64').tolist()
self.target_size = cfg.TRAIN.SCALES[0]
self.reader = DataReader(
path=source,
features=['image', 'bbox/ratio', 'bbox/label'],
pipeline=self,
shard_id=dali.get_distributed_info()[0],
num_shards=dali.get_distributed_info()[1],
shuffle_after_epoch=cfg.TRAIN.USE_SHUFFLE,
shuffle_chunks=cfg.TRAIN.SHUFFLE_CHUNKS,
)
self.decode = dali.ops.ImageDecoder()
self.brightness_contrast = dali.ops.BrightnessContrast()
self.hsv = dali.ops.Hsv()
self.paste = dali.ops.Paste(fill_value=mean_values)
self.slice = dali.ops.Slice()
self.resize = dali.ops.Resize(self.target_size, self.target_size)
self.cmn = dali.ops.CropMirrorNormalize(mean=mean_values, std=[1., 1., 1.])
with dali.device('cpu'):
self.bbox_paste = dali.ops.BBoxPaste()
self.bbox_crop = dali.ops.RandomBBoxCrop()
self.bbox_flip = dali.ops.BbFlip()
self.twist_rng = dali.ops.Uniform([0.6, 1.4])
self.paste_pos = dali.ops.Uniform((0., 1.))
self.paste_ratio = dali.ops.Uniform((0., paste_ratio - 1))
self.flip_rng = dali.ops.CoinFlip(0.5 if cfg.TRAIN.USE_FLIPPED else 0.)
def iter_setup(self):
self.reader.feed_inputs()
def define_graph(self):
# Read inputs from file
inputs = self.reader()
bbox = inputs['bbox/ratio']
label = inputs['bbox/label']
# Decode image
image = self.decode(inputs['image'])
# Augment the color space if necessary
if cfg.TRAIN.USE_COLOR_JITTER:
image = self.hsv(
self.brightness_contrast(
image,
brightness=self.twist_rng(),
contrast=self.twist_rng(),
), saturation=self.twist_rng()
)
# Expand randomly to get smaller objects
pr = self.paste_ratio() * self.flip_rng() + 1.
px, py = self.paste_pos(), self.paste_pos()
image = self.paste(image, paste_x=px, paste_y=py, ratio=pr)
bbox = self.bbox_paste(bbox, paste_x=px, paste_y=py, ratio=pr)
# Sample RoIs with IoU constraint
crop_begin, crop_size, bbox, label = self.bbox_crop(bbox, label)
image = self.slice(image, crop_begin, crop_size)
# Resize image to a fixed size
image = self.resize(image)
# Normalize
apply_flip = self.flip_rng()
image = self.cmn(image, mirror=apply_flip)
bbox = self.bbox_flip(bbox, horizontal=apply_flip)
return image, bbox, label
class Iterator(DALIGenericIterator):
def __init__(self, pipeline):
super(Iterator, self).__init__(pipeline)
@property
def handlers(self):
return ([0], self.copy_handler,), ([1, 2], self.gt_handler)
def next(self):
(images,), gt_boxes = self.__next__()
return {'data': images, 'gt_boxes': gt_boxes}
def gt_handler(self, tensors):
bbox, label = tensors
num_images = self._pipe.batch_size
boxes = np.concatenate([bbox.at(i) for i in range(num_images)])
boxes[:, 0::2] *= self._pipe.target_size
boxes[:, 1::2] *= self._pipe.target_size
labels = [label.at(i) for i in range(num_images)]
batch_inds = [np.ones_like(e) * i for i, e in enumerate(labels)]
labels, batch_inds = np.concatenate(labels), np.concatenate(batch_inds)
return np.hstack([boxes, labels, batch_inds])
def new_iterator(source):
with dali.device('cuda', cfg.GPU_ID):
return Iterator(Pipeline(source))
......@@ -20,10 +20,10 @@ import sys
import numpy as np
from seetadet.core.config import cfg
from seetadet.pycocotools import mask as mask_tools
from seetadet.pycocotools.coco import COCO
from seetadet.pycocotools.cocoeval import COCOeval
from seetadet.utils import mask as mask_util
from seetadet.utils.pycocotools import mask as mask_tools
from seetadet.utils.pycocotools.coco import COCO
from seetadet.utils.pycocotools.cocoeval import COCOeval
class COCOEvaluator(object):
......@@ -258,14 +258,24 @@ class COCOEvaluator(object):
for obj in rec['objects']:
x, y = obj['bbox'][0], obj['bbox'][1]
w, h = obj['bbox'][2] - x + 1, obj['bbox'][3] - y + 1
mask = obj['mask']
if sys.version_info >= (3, 0):
mask = mask.decode()
if 'mask' in obj:
segm = {'size': mask_size, 'counts': obj['mask']}
if sys.version_info >= (3, 0):
segm['counts'] = segm['counts'].decode()
elif 'polygons' in obj:
segm = []
for poly in obj['polygons']:
if isinstance(poly, np.ndarray):
segm.append(poly.tolist())
else:
segm.append(poly)
else:
raise ValueError('Expected mask RLE or polygons.')
dataset['annotations'].append({
'id': str(ann_id),
'bbox': [x, y, w, h],
'area': w * h,
'segmentation': {'size': mask_size, 'counts': mask},
'segmentation': segm,
'iscrowd': obj['difficult'],
'image_id': self.get_image_id(image_name),
'category_id': self.imdb.class_to_ind[obj['name']],
......
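For reference, the two 'segmentation' payloads produced above follow the standard COCO convention; the literal values below are illustrative only:

# Run-length encoding: 'size' is [height, width], 'counts' a decoded RLE string.
rle_segm = {'size': [427, 640], 'counts': 'XYb2f0...'}
# Polygons: a list of flattened [x1, y1, x2, y2, ...] contours.
poly_segm = [[120.5, 80.0, 180.0, 80.0, 180.0, 150.5, 120.5, 150.5]]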
......@@ -13,7 +13,6 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import uuid
from seetadet.core.config import cfg
......@@ -34,13 +33,6 @@ class Dataset(object):
self.config = {'cleanup': True, 'use_salt': True}
@property
def cache_path(self):
cache_path = os.path.abspath(os.path.join(cfg.DATA_DIR, 'cache'))
if not os.path.exists(cache_path):
os.makedirs(cache_path)
return cache_path
@property
def classes(self):
return self._classes
......@@ -85,7 +77,7 @@ class Dataset(object):
evaluator = VOCEvaluator(self)
evaluator.write_bbox_results(all_boxes, gt_recs, output_dir)
if '!' not in protocol:
for ovr in (0.5, 0.7):
for ovr in (0.5,):
evaluator.do_bbox_eval(
gt_recs,
output_dir,
......@@ -111,7 +103,7 @@ class Dataset(object):
evaluator = VOCEvaluator(self)
evaluator.write_segm_results(all_boxes, all_masks, output_dir)
if '!' not in protocol:
for ovr in (0.5, 0.7):
for ovr in (0.5,):
evaluator.do_segm_eval(
gt_recs,
output_dir,
......
......@@ -16,8 +16,6 @@ from __future__ import print_function
import cv2
import numpy as np
from seetadet.pycocotools import mask_utils
class Example(object):
"""Wrapper for annotated example."""
......@@ -32,6 +30,7 @@ class Example(object):
"""
self._datum = datum
self._image = None
@property
def id(self):
......@@ -52,11 +51,13 @@ class Example(object):
Returns
-------
numpy.ndarray
The image.
The image array.
"""
img = np.frombuffer(self._datum['content'], 'uint8')
return cv2.imdecode(img, 3)
if self._image is None:
img_bytes = np.frombuffer(self._datum['content'], 'uint8')
self._image = cv2.imdecode(img_bytes, 3)
return self._image
@property
def height(self):
......@@ -65,7 +66,7 @@ class Example(object):
Returns
-------
int
The height of image.
The image height.
"""
return self._datum['height']
......@@ -83,21 +84,15 @@ class Example(object):
objects = []
for ix, obj in enumerate(self._datum['object']):
mask = obj.get('mask', None)
polygons = obj.get('polygons', None)
if 'x3' in obj:
poly = np.array([
obj['x1'], obj['y1'],
obj['x2'], obj['y2'],
obj['x3'], obj['y3'],
obj['x4'], obj['y4']
], 'float32')
x, y, w, h = cv2.boundingRect(
poly.reshape((-1, 2)))
poly = np.array([obj['x1'], obj['y1'],
obj['x2'], obj['y2'],
obj['x3'], obj['y3'],
obj['x4'], obj['y4']], 'float32')
x, y, w, h = cv2.boundingRect(poly.reshape((-1, 2)))
bbox = [x, y, x + w, y + h]
mask = mask_utils.poly2bytes(
[poly],
self._datum['height'],
self._datum['width'],
)
polygons = [poly]
elif 'x2' in obj:
bbox = [obj['x1'], obj['y1'], obj['x2'], obj['y2']]
elif 'xmin' in obj:
......@@ -107,9 +102,12 @@ class Example(object):
objects.append({
'name': obj['name'],
'bbox': bbox,
'mask': mask,
'difficult': obj.get('difficult', 0),
})
if mask is not None and len(mask) > 0:
objects[-1]['mask'] = mask
elif polygons is not None and len(polygons) > 0:
objects[-1]['polygons'] = [np.array(p) for p in polygons]
return objects
@property
......@@ -119,7 +117,7 @@ class Example(object):
Returns
-------
int
The width of image.
The image width.
"""
return self._datum['width']
......@@ -21,10 +21,10 @@ import cv2
import numpy as np
from seetadet.core.config import cfg
from seetadet.pycocotools import mask_utils
from seetadet.utils import boxes as box_util
from seetadet.utils.env import pickle
from seetadet.utils.mask import mask_overlap
from seetadet.utils.pycocotools import mask_utils
def voc_ap(rec, prec, use_07_metric=False):
......@@ -258,13 +258,10 @@ def voc_segm_eval(
crop_mask = R['mask'][j][gt_mask_bound[1]:gt_mask_bound[3] + 1,
gt_mask_bound[0]:gt_mask_bound[2] + 1]
ov = \
mask_overlap(
gt_mask_bound,
pred_mask_bound,
crop_mask,
pred_mask,
)
ov = mask_overlap(gt_mask_bound,
pred_mask_bound,
crop_mask,
pred_mask)
if ov > ovmax:
ovmax = ov
......
......@@ -13,58 +13,63 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import dragon.vm.torch as torch
from seetadet.core.registry import backbones
from seetadet.core import registry
from seetadet.core.config import cfg
from seetadet.modules import init
from seetadet.modules import nn
class WideResBlock(nn.Module):
class ResBlock(nn.Module):
"""The resnet block."""
def __init__(self, dim_in, dim_out, stride=1, downsample=None):
super(WideResBlock, self).__init__()
super(ResBlock, self).__init__()
norm = cfg.MODEL.BACKBONE_NORM
self.conv1 = nn.Conv3x3(dim_in, dim_out, stride)
self.bn1 = nn.FrozenAffine(dim_out)
self.bn1 = nn.get_norm(norm, dim_out)
self.conv2 = nn.Conv3x3(dim_out, dim_out)
self.bn2 = nn.FrozenAffine(dim_out)
self.bn2 = nn.get_norm(norm, dim_out)
self.downsample = downsample
self.relu = nn.ReLU(inplace=True)
def forward(self, x):
residual = x
identity = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(residual)
out += residual
identity = self.downsample(x)
out += identity
out = self.relu(out)
return out
class InceptionBlock(nn.Module):
"""The inception block."""
def __init__(self, dim_in, dim_out):
super(InceptionBlock, self).__init__()
norm = cfg.MODEL.BACKBONE_NORM
self.conv1 = nn.Conv1x1(dim_in, dim_out)
self.bn1 = nn.FrozenAffine(dim_out)
self.bn1 = nn.get_norm(norm, dim_out)
self.conv2 = nn.Conv3x3(dim_out, dim_out // 2)
self.bn2 = nn.FrozenAffine(dim_out // 2)
self.bn2 = nn.get_norm(norm, dim_out // 2)
self.conv3a = nn.Conv3x3(dim_out // 2, dim_out)
self.bn3a = nn.FrozenAffine(dim_out)
self.bn3a = nn.get_norm(norm, dim_out)
self.conv3b = nn.Conv3x3(dim_out, dim_out)
self.bn3b = nn.FrozenAffine(dim_out)
self.bn3b = nn.get_norm(norm, dim_out)
self.conv4 = nn.Conv3x3(dim_out * 3, dim_out)
self.bn4 = nn.FrozenAffine(dim_out)
self.bn4 = nn.get_norm(norm, dim_out)
self.relu = nn.ReLU(inplace=True)
def forward(self, x):
residual = x
identity = x
out = self.conv1(x)
out_1x1 = self.bn1(out)
......@@ -86,87 +91,67 @@ class InceptionBlock(nn.Module):
out = self.conv4(out)
out = self.bn4(out)
out += residual
out += identity
out = self.relu(out)
return out
class AirNet(nn.Module):
def __init__(self, blocks, num_stages):
"""The airnet class."""
def __init__(self, model_cfg):
super(AirNet, self).__init__()
self.dim_in, filters = 64, [64, 128, 256, 384]
self.feature_dims = [None, None] + filters[1:num_stages - 1]
self.conv1 = nn.Conv2d(
3, 64,
kernel_size=7,
stride=2,
padding=3,
bias=False,
)
self.bn1 = nn.FrozenAffine(self.dim_in)
dim_in, dims, features = 64, [64, 128, 256, 384], []
self.conv1 = nn.Conv2d(3, 64, kernel_size=7,
stride=2, padding=3, bias=False)
self.bn1 = nn.get_norm(cfg.MODEL.BACKBONE_NORM, dim_in)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(
kernel_size=2,
stride=2,
padding=0,
ceil_mode=True,
)
self.layer1 = self.make_blocks(filters[0], blocks[0])
self.layer2 = self.make_blocks(filters[1], blocks[1], 2)
if num_stages >= 4:
self.layer3 = self.make_blocks(filters[2], blocks[2], 2)
if num_stages >= 5:
self.layer4 = self.make_blocks(filters[3], blocks[3], 2)
self.maxpool = nn.MaxPool2d(kernel_size=2, stride=2)
self.feature_dims = collections.OrderedDict(stem=64)
for i, v, dim_out in zip(range(4), model_cfg, dims):
stride = 1 if i == 0 else 2
downsample = nn.Sequential(
nn.Conv1x1(dim_in, dim_out, stride=stride),
nn.get_norm(cfg.MODEL.BACKBONE_NORM, dim_out),
)
features.append(ResBlock(dim_in, dim_out, stride, downsample))
for j in range(1, len(v)):
if v[j] == 'r':
features.append(ResBlock(dim_out, dim_out))
elif v[j] == 'i':
features.append(InceptionBlock(dim_out, dim_out))
else:
raise ValueError('Unknown block flag: ' + v[j])
setattr(self, 'layer%d' % (i + 1), nn.Sequential(*features[-len(v):]))
self.feature_dims[id(features[-1])] = dim_in = dim_out
self.features = features
self.reset_parameters()
def reset_parameters(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
init.xaiver(m.weight)
def make_blocks(self, dim_out, blocks, stride=1):
downsample = nn.Sequential(
nn.Conv1x1(self.dim_in, dim_out, stride=stride),
nn.FrozenAffine(dim_out),
)
layers = [WideResBlock(self.dim_in, dim_out, stride, downsample)]
self.dim_in = dim_out
for i in range(1, len(blocks)):
if blocks[i] == 'r':
layers.append(WideResBlock(dim_out, dim_out))
elif blocks[i] == 'i':
layers.append(InceptionBlock(dim_out, dim_out))
else:
raise ValueError('Unknown block flag: ' + blocks[i])
return nn.Sequential(*layers)
init.kaiming_normal(m.weight, mode='fan_out')
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
outputs = [None, None, self.layer2(x)]
if hasattr(self, 'layer3'):
outputs += [self.layer3(outputs[-1])]
if hasattr(self, 'layer4'):
outputs += [self.layer4(outputs[-1])]
outputs = [None]
for layer in self.features:
x = layer(x)
if self.feature_dims.get(id(layer)):
outputs.append(x)
return outputs
def airnet(num_stages):
blocks = (
('r', 'r'), # conv2
('r', 'i'), # conv3
('r', 'i'), # conv4
('r', 'i'), # conv5
)
return AirNet(blocks, num_stages)
def airnet(num_layers=5):
model_cfg = (('r', 'r'), ('r', 'i'), ('r', 'i'), ('r', 'i'))
return AirNet(model_cfg[:num_layers])
backbones.register('airnet', func=airnet, num_stages=5)
backbones.register('airnet_3b', func=airnet, num_stages=3)
backbones.register('airnet_4b', func=airnet, num_stages=4)
backbones.register('airnet_5b', func=airnet, num_stages=5)
registry.backbone.register('airnet', airnet)
registry.backbone.register('airnet_3b', airnet, num_layers=3)
registry.backbone.register('airnet_4b', airnet, num_layers=4)
registry.backbone.register('airnet_5b', airnet, num_layers=5)
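A sketch of how these registrations are consumed; the Detector below performs the same lookup on the body name parsed from cfg.MODEL.BACKBONE (the 'airnet_4b' key here is just an example):

# Hypothetical lookup of a registered backbone.
conv_body = registry.backbone.get('airnet_4b')()
feature_dims = list(conv_body.feature_dims.values())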
......@@ -15,62 +15,57 @@ from __future__ import print_function
import collections
import importlib
import dragon.vm.torch as torch
from seetadet import modeling as models
from seetadet.core.config import cfg
from seetadet.core.registry import backbones
from seetadet.core import registry
from seetadet.modules import nn
from seetadet.modules import utils as module_util
from seetadet.modules import vision
from seetadet.utils import logger
class Detector(nn.Module):
"""Organize the detection pipelines.
A bunch of classic algorithms are integrated, see the
``lib.core.config`` for their hyper-parameters.
"""
"""Organize the detection pipelines."""
def __init__(self):
super(Detector, self).__init__()
model = cfg.MODEL.TYPE
model_type = cfg.MODEL.TYPE
backbone = cfg.MODEL.BACKBONE.lower().split('.')
body, modules = backbone[0], backbone[1:]
conv_body, conv_modules = backbone[0], backbone[1:]
# DataLoader
self.data_loader = None
self.data_loader_cls = importlib.import_module(
'seetadet.algo.{}'.format(model)).DataLoader
self.bootstrap = vision.Bootstrap()
self.data_loader_cls = getattr(importlib.import_module(
'seetadet.algo.{}'.format(model_type)), 'DataLoader')
self.image_norm = vision.ImageNormalizer()
# FeatureExtractor
self.body = backbones.get(body)()
feature_dims = self.body.feature_dims
self.conv_body = registry.backbone.get(conv_body)()
feature_dims = list(self.conv_body.feature_dims.values())
# FeatureEnhancer
if 'fpn' in modules:
if 'fpn' in conv_modules:
self.fpn = models.FPN(feature_dims)
feature_dims = self.fpn.feature_dims
elif 'mbox' in modules:
pass # Placeholder
else:
feature_dims = [feature_dims[-1]]
feature_dims = [self.fpn.feature_dim]
# Detection Modules
if 'rcnn' in model:
# DetectionHead
if 'rcnn' in model_type:
self.rpn = models.RPN(feature_dims[0])
if 'faster' in model:
if 'faster' in model_type:
self.rcnn = models.FastRCNN(feature_dims[0])
elif 'mask' in model:
elif 'mask' in model_type:
self.rcnn = models.MaskRCNN(feature_dims[0])
if 'retinanet' in model:
else:
raise ValueError('Unsupported model: ' + model_type)
elif model_type == 'retinanet':
self.retinanet = models.RetinaNet(feature_dims[0])
if 'ssd' in model:
elif model_type == 'ssd':
self.ssd = models.SSD(feature_dims)
else:
raise ValueError('Unsupported model: ' + model_type)
def load_weights(self, weights):
"""Load the state dict of this detector.
......@@ -83,11 +78,7 @@ class Detector(nn.Module):
The path of the weights file.
"""
self.load_state_dict(
torch.load(weights),
strict=False,
verbose=logger.is_root(),
)
self.load_state_dict(torch.load(weights), strict=False)
def forward(self, inputs=None):
"""Compute the detection outputs.
......@@ -103,38 +94,26 @@ class Detector(nn.Module):
The outputs.
"""
# 0. Get the inputs
# Get the inputs
if inputs is None:
# 1) Training: <= DataLayer
# 2) Inference: <= Given
if self.data_loader is None:
self.data_loader = self.data_loader_cls()
inputs = self.data_loader()
# 1. Extract features
# Process the data:
# 1) CPU => CUDA
# 2) NHWC => NCHW
# 3) uint8 => float32 or float16
# 4) Mean subtraction
image_data = self.bootstrap(inputs['data'])
features = self.body(image_data)
# Extract features
image = self.image_norm(inputs['image'])
features = self.conv_body(image)
# 2. Apply the FPN to enhance features if necessary
# Apply the FPN to enhance features if necessary
if hasattr(self, 'fpn'):
features = self.fpn(features)
# 3. Collect detection outputs
# Collect detection outputs
outputs = collections.OrderedDict()
# 3.1 Feature -> RPN -> R-CNN
# Features -> RPN -> R-CNN
if hasattr(self, 'rpn'):
outputs.update(
self.rpn(
features=features,
**inputs
)
)
outputs.update(self.rpn(features=features, **inputs))
outputs.update(
self.rcnn(
features=features,
......@@ -144,70 +123,30 @@ class Detector(nn.Module):
)
)
# 3.2 Feature -> RetinaNet
# Features -> RetinaNet
if hasattr(self, 'retinanet'):
outputs.update(
self.retinanet(
features=features,
**inputs
)
)
outputs.update(self.retinanet(features=features, **inputs))
# 3.3 Feature -> SSD
# Features -> SSD
if hasattr(self, 'ssd'):
features = list(filter(None, features))
outputs.update(
self.ssd(
features=features,
**inputs
)
)
outputs.update(self.ssd(features=features, **inputs))
return outputs
def optimize_for_inference(self):
"""Optimize the graph for the inference."""
###################################
# Merge Affine into Convolution #
###################################
last_module = None
for module in self.modules():
if isinstance(module, nn.Affine) and \
isinstance(last_module, nn.Conv2d):
if last_module.bias is None:
delattr(last_module, 'bias')
module.forward = lambda x: x
last_module.bias = module.bias
weight = module.weight.data.view(
0, *([1] * (last_module.weight.ndimension() - 1)))
last_module.weight.data.mul_(weight)
last_module = module
######################################
# Merge BatchNorm into Convolution #
######################################
# Optimization #1: LayerFusion
fusions = set()
last_module = None
for module in self.modules():
if isinstance(module, nn.BatchNorm2d) and \
isinstance(last_module, nn.Conv2d):
if last_module.bias is None:
delattr(last_module, 'bias')
module.forward = lambda x: x
term = torch.sqrt(module.running_var.data + module.eps)
term = module.weight.data / term
last_module.bias = \
module.bias.data - \
term * module.running_mean.data
term = term.view(0, *([1] * (last_module.weight.ndimension() - 1)))
if last_module.weight.dtype == 'float16':
last_module.bias.half_()
weight = last_module.weight.data.float()
weight.mul_(term)
last_module.weight.copy_(weight)
else:
last_module.weight.data.mul_(term)
pass_key, pass_fn = module_util \
.get_fusion_pass(last_module, module)
if pass_fn is not None:
fusions.add(pass_key)
pass_fn(last_module, module)
last_module = module
if len(fusions) > 0:
logger.info('Enable fusions: ' + ', '.join(fusions))
def new_detector(device, weights=None, training=False):
......
......@@ -43,8 +43,8 @@ class FastRCNN(nn.Module):
self.roi_head_dim = dim_in * (cfg.FRCNN.ROI_XFORM_RESOLUTION ** 2)
self.fc6 = nn.Linear(self.roi_head_dim, cfg.FRCNN.MLP_HEAD_DIM)
self.fc7 = nn.Linear(cfg.FRCNN.MLP_HEAD_DIM, cfg.FRCNN.MLP_HEAD_DIM)
self.cls_score = nn.Linear(cfg.FRCNN.MLP_HEAD_DIM, cfg.MODEL.NUM_CLASSES)
self.bbox_pred = nn.Linear(cfg.FRCNN.MLP_HEAD_DIM, cfg.MODEL.NUM_CLASSES * 4)
self.cls_score = nn.Linear(cfg.FRCNN.MLP_HEAD_DIM, len(cfg.MODEL.CLASSES))
self.bbox_pred = nn.Linear(cfg.FRCNN.MLP_HEAD_DIM, len(cfg.MODEL.CLASSES) * 4)
self.rpn_decoder = det.RPNDecoder()
self.proposal = faster_rcnn.Proposal()
self.proposal_target = faster_rcnn.ProposalTarget()
......@@ -53,15 +53,16 @@ class FastRCNN(nn.Module):
self.sigmoid = nn.Sigmoid()
self.box_roi_feature = functools.partial({
'RoIPool': vision.roi_pool,
'RoIAlign': vision.roi_align
}[cfg.FRCNN.ROI_XFORM_METHOD], size=cfg.FRCNN.ROI_XFORM_RESOLUTION)
'RoIAlign': vision.roi_align,
}[cfg.FRCNN.ROI_XFORM_METHOD],
size=cfg.FRCNN.ROI_XFORM_RESOLUTION,
sampling_ratio=cfg.FRCNN.ROI_XFORM_SAMPLING_RATIO)
self.cls_loss = nn.CrossEntropyLoss()
if 'IOU' in cfg.MODEL.REG_LOSS_TYPE.upper():
self.bbox_loss = nn.IoULoss(
delta_weights=cfg.BBOX_REG_WEIGHTS)
if cfg.FRCNN.BBOX_REG_LOSS_TYPE.lower() == 'l1':
self.bbox_loss = nn.L1Loss(reduction='sum')
else:
self.bbox_loss = nn.SmoothL1Loss(reduction='sum')
# Compute spatial scales according to strides
self.bbox_loss = nn.SmoothL1Loss(beta=1.0, reduction='sum')
# Compute spatial scales according to strides.
self.spatial_scales = [
1. / (2 ** lvl)
for lvl in range(
......@@ -71,36 +72,35 @@ class FastRCNN(nn.Module):
self.reset_parameters()
def reset_parameters(self):
# Careful initialization for Fast R-CNN
init.normal(self.cls_score.weight, std=0.01)
init.normal(self.bbox_pred.weight, std=0.001)
for name, p in self.named_parameters():
for name, param in self.named_parameters():
if 'bias' in name:
init.constant(p, 0)
init.constant(param, 0)
def forward(self, **kwargs):
# Generate proposals
proposal_func = self.proposal \
# Generate proposals.
proposal_fn = self.proposal \
if self.training else self.rpn_decoder
self.data = {
'rois': proposal_func(
kwargs['features'],
self.sigmoid(kwargs['rpn_cls_score'].data),
kwargs['rpn_bbox_pred'],
kwargs['ims_info'],
'rois': proposal_fn(
features=kwargs['features'],
cls_prob=self.sigmoid(kwargs['rpn_cls_score'].data),
bbox_pred=kwargs['rpn_bbox_pred'],
im_info=kwargs['im_info'],
)
}
# Generate targets from proposals
# Generate targets from proposals.
if self.training:
self.data.update(
self.proposal_target(
rpn_rois=self.data['rois'],
rois=self.data['rois'],
gt_boxes=kwargs['gt_boxes'],
)
)
# Transform RoI features
# Transform RoI features.
if len(self.data['rois']) > 1:
roi_features = \
torch.cat([
......@@ -118,39 +118,39 @@ class FastRCNN(nn.Module):
1. / cfg.RPN.STRIDES[0],
)
# Apply a simple MLP
# Apply a simple MLP.
roi_features = roi_features.view(-1, self.roi_head_dim)
roi_features = self.relu(self.fc6(roi_features))
roi_features = self.relu(self.fc7(roi_features))
# Compute logits and losses
# Compute logits and losses.
outputs = collections.OrderedDict()
cls_score = self.cls_score(roi_features).float()
outputs['bbox_pred'] = self.bbox_pred(roi_features).float()
if self.training:
# Compute rcnn losses
# Compute rcnn losses.
bbox_pred = outputs['bbox_pred'].view(0, -1, 4) \
.index_select((0, 1), self.data['bbox_indices'])
bbox_loss_weight = cfg.MODEL.REG_LOSS_WEIGHT / (
roi_features.shape[0] if isinstance(
self.bbox_loss, nn.SmoothL1Loss) else 1.)
.index_select((0, 1), self.data['bbox_inds'])
batch_size = roi_features.size(0)
bbox_loss_weight = cfg.FRCNN.BBOX_REG_LOSS_WEIGHT
bbox_loss_weight /= float(batch_size)
outputs.update(collections.OrderedDict([
('cls_loss', self.cls_loss(
cls_score, self.data['labels'])),
cls_score,
self.data['labels'])),
('bbox_loss', self.bbox_loss(
bbox_pred,
self.data['bbox_targets'],
self.data['bbox_anchors'],
) * bbox_loss_weight),
self.data['bbox_anchors']) * bbox_loss_weight),
]))
else:
# Return the rois to decode the refine boxes
# Return the RoIs to decode the refined boxes.
if len(self.data['rois']) > 1:
outputs['rois'] = torch.cat(self.data['rois'], 0)
else:
outputs['rois'] = self.data['rois'][0]
# Return the classification prob
# Return the classification prob.
outputs['cls_prob'] = self.softmax(cls_score)
return outputs
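A quick check of the spatial_scales list built above, assuming the usual RoI levels 2 through 5 (the level range is an assumption about cfg.FPN.ROI_MIN_LEVEL/ROI_MAX_LEVEL, not read from the shipped config):

# Assuming cfg.FPN.ROI_MIN_LEVEL == 2 and cfg.FPN.ROI_MAX_LEVEL == 5.
spatial_scales = [1. / (2 ** lvl) for lvl in range(2, 5 + 1)]
# -> [0.25, 0.125, 0.0625, 0.03125], i.e. feature strides 4, 8, 16 and 32.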
......@@ -25,37 +25,36 @@ class FPN(nn.Module):
def __init__(self, feature_dims):
super(FPN, self).__init__()
dim = cfg.FPN.DIM
self.C = nn.ModuleList()
self.P = nn.ModuleList()
self.feature_dim = dim = cfg.FPN.DIM
self.highest_backbone_lvl = min(cfg.FPN.RPN_MAX_LEVEL, HIGHEST_BACKBONE_LVL)
for lvl in range(cfg.FPN.RPN_MIN_LEVEL, self.highest_backbone_lvl + 1):
self.C.append(nn.Conv1x1(feature_dims[lvl - 1], dim, bias=True))
self.P.append(nn.Conv3x3(dim, dim, bias=True))
if 'rcnn' in cfg.MODEL.TYPE:
self.apply_func = self.apply_on_rcnn
self.apply_func = self.apply_rcnn
self.maxpool = nn.MaxPool2d(kernel_size=1, stride=2)
else:
self.apply_func = self.apply_on_generic
self.apply_func = self.apply_generic
self.relu = nn.ReLU(inplace=False)
for lvl in range(self.highest_backbone_lvl + 1, cfg.FPN.RPN_MAX_LEVEL + 1):
dim_in = feature_dims[-1] if lvl == self.highest_backbone_lvl + 1 else dim
self.P.append(nn.Conv3x3(dim_in, dim, stride=2, bias=True))
self.feature_dims = [dim]
self.coarsest_stride = cfg.MODEL.COARSEST_STRIDE
self.reset_parameters()
def reset_parameters(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
init.xaiver(m.weight)
init.xavier_uniform(m.weight)
init.constant(m.bias, 0)
def apply_on_rcnn(self, features):
def apply_rcnn(self, features):
fpn_input = self.C[-1](features[-1])
min_lvl, max_lvl = cfg.FPN.RPN_MIN_LEVEL, cfg.FPN.RPN_MAX_LEVEL
outputs = [self.P[self.highest_backbone_lvl - min_lvl](fpn_input)]
# Apply max pool for higher features
# Apply max pool for higher features.
for i in range(self.highest_backbone_lvl + 1, max_lvl + 1):
outputs.append(self.maxpool(outputs[-1]))
# Build pyramids between [MIN_LEVEL, HIGHEST_LEVEL]
......@@ -69,11 +68,11 @@ class FPN(nn.Module):
outputs.insert(0, self.P[i - min_lvl](fpn_input))
return outputs
def apply_on_generic(self, features):
def apply_generic(self, features):
fpn_input = self.C[-1](features[-1])
min_lvl, max_lvl = cfg.FPN.RPN_MIN_LEVEL, cfg.FPN.RPN_MAX_LEVEL
outputs = [self.P[self.highest_backbone_lvl - min_lvl](fpn_input)]
# Add extra convolutions for higher features
# Add extra convolutions for higher features.
extra_input = features[-1]
for i in range(self.highest_backbone_lvl + 1, max_lvl + 1):
outputs.append(self.P[i - min_lvl](extra_input))
......
......@@ -49,9 +49,9 @@ class MaskRCNN(nn.Module):
self.fc7 = nn.Linear(cfg.FRCNN.MLP_HEAD_DIM, cfg.FRCNN.MLP_HEAD_DIM)
self.fcn = nn.ModuleList([nn.Conv3x3(dim_in, dim_in, bias=True) for _ in range(4)])
self.fcn += [nn.ConvTranspose2d(dim_in, dim_in, 2, 2, 0)]
self.cls_score = nn.Linear(cfg.FRCNN.MLP_HEAD_DIM, cfg.MODEL.NUM_CLASSES)
self.bbox_pred = nn.Linear(cfg.FRCNN.MLP_HEAD_DIM, cfg.MODEL.NUM_CLASSES * 4)
self.mask_score = nn.Conv1x1(dim_in, cfg.MODEL.NUM_CLASSES - 1, bias=True)
self.cls_score = nn.Linear(cfg.FRCNN.MLP_HEAD_DIM, len(cfg.MODEL.CLASSES))
self.bbox_pred = nn.Linear(cfg.FRCNN.MLP_HEAD_DIM, len(cfg.MODEL.CLASSES) * 4)
self.mask_score = nn.Conv1x1(dim_in, len(cfg.MODEL.CLASSES) - 1, bias=True)
self.rpn_decoder = det.RPNDecoder()
self.proposal = mask_rcnn.Proposal()
self.proposal_target = mask_rcnn.ProposalTarget()
......@@ -61,36 +61,39 @@ class MaskRCNN(nn.Module):
self.box_roi_feature = functools.partial({
'RoIPool': vision.roi_pool,
'RoIAlign': vision.roi_align,
}[cfg.FRCNN.ROI_XFORM_METHOD], size=cfg.FRCNN.ROI_XFORM_RESOLUTION)
}[cfg.FRCNN.ROI_XFORM_METHOD],
size=cfg.FRCNN.ROI_XFORM_RESOLUTION,
sampling_ratio=cfg.FRCNN.ROI_XFORM_SAMPLING_RATIO)
self.mask_roi_feature = functools.partial({
'RoIPool': vision.roi_pool,
'RoIAlign': vision.roi_align,
}[cfg.MRCNN.ROI_XFORM_METHOD], size=cfg.MRCNN.ROI_XFORM_RESOLUTION)
}[cfg.MRCNN.ROI_XFORM_METHOD],
size=cfg.MRCNN.ROI_XFORM_RESOLUTION,
sampling_ratio=cfg.MRCNN.ROI_XFORM_SAMPLING_RATIO)
self.cls_loss = nn.CrossEntropyLoss()
self.bbox_loss = nn.SmoothL1Loss(reduction='sum')
if cfg.FRCNN.BBOX_REG_LOSS_TYPE.lower() == 'l1':
self.bbox_loss = nn.L1Loss(reduction='sum')
else:
self.bbox_loss = nn.SmoothL1Loss(beta=1.0, reduction='sum')
self.mask_loss = nn.BCEWithLogitsLoss()
self.compute_mask_score = None
# Compute spatial scales according to strides
# Compute spatial scales according to strides.
self.spatial_scales = [
1. / (2 ** lvl)
for lvl in range(
cfg.FPN.ROI_MIN_LEVEL,
cfg.FPN.ROI_MAX_LEVEL + 1
)]
for lvl in range(cfg.FPN.ROI_MIN_LEVEL,
cfg.FPN.ROI_MAX_LEVEL + 1)]
self.reset_parameters()
def reset_parameters(self):
# Careful initialization for Fast R-CNN
init.normal(self.cls_score.weight, std=0.01)
init.normal(self.bbox_pred.weight, std=0.001)
# Careful initialization for Mask R-CNN
init.normal(self.mask_score.weight, std=0.001)
for m in self.fcn.modules():
if hasattr(m, 'weight'):
init.kaiming_normal(m.weight)
for name, p in self.named_parameters():
for name, param in self.named_parameters():
if 'bias' in name:
init.constant(p, 0)
init.constant(param, 0)
def get_mask_score(self, features, rois):
roi_features = \
......@@ -104,30 +107,30 @@ class MaskRCNN(nn.Module):
return self.mask_score(roi_features).float()
def forward(self, **kwargs):
# Generate proposals
# Generate proposals.
proposal_func = self.proposal \
if self.training else self.rpn_decoder
self.data = {
'rois': proposal_func(
kwargs['features'],
self.sigmoid(kwargs['rpn_cls_score'].data),
kwargs['rpn_bbox_pred'],
kwargs['ims_info'],
features=kwargs['features'],
cls_prob=self.sigmoid(kwargs['rpn_cls_score'].data),
bbox_pred=kwargs['rpn_bbox_pred'],
im_info=kwargs['im_info'],
)
}
# Generate targets from proposals
# Generate targets from proposals.
if self.training:
self.data.update(
self.proposal_target(
rpn_rois=self.data['rois'],
rois=self.data['rois'],
gt_boxes=kwargs['gt_boxes'],
gt_masks=kwargs['gt_masks'],
ims_info=kwargs['ims_info'],
gt_segms=kwargs['gt_segms'],
im_info=kwargs['im_info'],
)
)
# Transform RoI features
# Transform RoI features.
roi_features = \
torch.cat([
self.box_roi_feature(
......@@ -137,47 +140,48 @@ class MaskRCNN(nn.Module):
) for i, spatial_scale in enumerate(self.spatial_scales)
], dim=0)
# Apply a simple MLP
# Apply a simple MLP.
roi_features = roi_features.view(-1, self.roi_head_dim)
roi_features = self.relu(self.fc6(roi_features))
roi_features = self.relu(self.fc7(roi_features))
# Compute logits and losses
# Compute logits and losses.
outputs = collections.OrderedDict()
cls_score = self.cls_score(roi_features).float()
outputs['bbox_pred'] = self.bbox_pred(roi_features).float()
if self.training:
# Compute the loss of bbox branch
# Compute the loss of bbox branch.
bbox_pred = outputs['bbox_pred'].view(0, -1, 4) \
.index_select((0, 1), self.data['bbox_indices'])
.index_select((0, 1), self.data['bbox_inds'])
batch_size = roi_features.size(0)
bbox_loss_weight = cfg.FRCNN.BBOX_REG_LOSS_WEIGHT
bbox_loss_weight /= float(batch_size)
outputs.update(collections.OrderedDict([
('cls_loss', self.cls_loss(
cls_score, self.data['labels'])),
cls_score,
self.data['labels'])),
('bbox_loss', self.bbox_loss(
bbox_pred, self.data['bbox_targets'],
) / roi_features.shape[0]),
bbox_pred,
self.data['bbox_targets']) * bbox_loss_weight),
]))
# Compute the loss of mask branch
# Compute the loss of mask branch.
mask_score = self.get_mask_score(
kwargs['features'], self.data['mask_rois'])
mask_score = mask_score.index_select(
(0, 1), self.data['mask_indices'])
mask_score = mask_score \
.index_select((0, 1), self.data['mask_inds'])
outputs['mask_loss'] = self.mask_loss(
mask_score, self.data['mask_targets'])
else:
# Return the RoIs to decode the refine boxes
# Return the RoIs to decode the refined boxes.
if len(self.data['rois']) > 1:
outputs['rois'] = torch.cat(self.data['rois'], 0)
else:
outputs['rois'] = self.data['rois'][0]
# Return the classification prob
# Return the classification prob.
outputs['cls_prob'] = self.softmax(cls_score)
# Set a callback to decode mask from refined RoIs
self.compute_mask_score = \
functools.partial(
self.get_mask_score,
features=kwargs['features'],
)
# Set a callback to decode mask from refined RoIs.
self.compute_mask_score = functools.partial(
self.get_mask_score, features=kwargs['features'])
return outputs
......@@ -13,212 +13,162 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import functools
import dragon.vm.torch as torch
from seetadet.core import registry
from seetadet.core.config import cfg
from seetadet.core.registry import backbones
from seetadet.modules import init
from seetadet.modules import nn
from seetadet.modules import vision
def conv_triplet(dim_in, dim_out):
"""1x1 convolution + BN + ReLU."""
return [
nn.Conv2d(dim_in, dim_out, 1, bias=False),
nn.FrozenAffine(dim_out),
nn.ReLU(True),
]
def conv_quintet(dim_in, dim_out, ks, stride):
"""KxK convolution + BN + ReLU."""
return [
nn.DepthwiseConv2d(
dim_in, dim_in,
kernel_size=ks,
stride=stride,
padding=ks // 2,
bias=False,
),
nn.FrozenAffine(dim_in),
nn.ReLU(True),
nn.Conv1x1(dim_in, dim_out),
nn.FrozenAffine(dim_out),
]
class Setting(object):
V2 = (
[2, 3, 4, 3, 3, 1],
[2, 2, 2, 1, 2, 1],
[32, 16, 24, 32, 64, 96, 160, 320, 1280],
)
PROXYLESS_MOBILE = (
[4, 4, 4, 4, 4, 1],
[2, 2, 2, 1, 2, 1],
[32, 16, 32, 40, 80, 96, 192, 320, 1280],
)
PROXYLESS_GPU = (
[4, 4, 4, 4, 4, 1],
[2, 2, 2, 1, 2, 1],
[40, 24, 32, 56, 112, 128, 256, 432, 1280],
)
def stem(dim_out, stride=1):
return torch.nn.Sequential(
torch.nn.Conv2d(
3, dim_out,
kernel_size=3,
stride=stride,
padding=1,
bias=False,
),
nn.FrozenAffine(dim_out),
nn.ReLU(True),
)
class Choice(nn.Module):
def __init__(self, dim_in, dim_out, mb=3, ks=3, stride=1):
super(Choice, self).__init__()
self.mb = mb
dim_hidden = int(round(dim_in * mb))
seq = conv_triplet(dim_in, dim_hidden) if mb != 1 else []
seq += conv_quintet(dim_hidden, dim_out, ks, stride)
self.conv = nn.ModuleList(seq)
def conv_triplet(dim_in, dim_out, kernel_size=1, stride=1):
"""Return a convolution triplet."""
return [nn.Conv2d(dim_in, dim_out,
kernel_size=kernel_size,
stride=stride,
padding=kernel_size // 2,
bias=False),
nn.get_norm(cfg.MODEL.BACKBONE_NORM, dim_out),
nn.ReLU(True)]
def conv_quintet(dim_in, dim_out, kernel_size, stride):
"""Return a convolution quintet."""
return [nn.Conv2d(dim_in, dim_in,
kernel_size=kernel_size,
stride=stride,
padding=kernel_size // 2,
groups=dim_in,
bias=False),
nn.get_norm(cfg.MODEL.BACKBONE_NORM, dim_in),
nn.ReLU(True),
nn.Conv2d(dim_in, dim_out, kernel_size=1, bias=False),
nn.get_norm(cfg.MODEL.BACKBONE_NORM, dim_out)]
class InvertedResidual(nn.Module):
"""The invert residual block."""
def __init__(self, dim_in, dim_out, kernel_size=3, expand_ratio=3, stride=1):
super(InvertedResidual, self).__init__()
self.dim = dim = int(round(dim_in * expand_ratio))
layers = []
if expand_ratio != 1:
layers.append(nn.Sequential(*conv_triplet(dim_in, dim)))
quintet = conv_quintet(dim, dim_out, kernel_size, stride)
layers.append(nn.Sequential(*quintet[:3]))
layers.extend(quintet[3:])
self.conv = nn.Sequential(*layers)
self.stride = stride
self.apply_residual = stride == 1 and dim_in == dim_out
self.feature = None
def forward(self, x):
residual = x if self.apply_residual else None
for i in range(3):
x = self.conv[i](x)
y = x if self.stride == 2 else None
for i in range(3, len(self.conv)):
x = self.conv[i](x)
out = self.conv[0](x)
self.feature = out if self.stride == 2 else None
for layer in self.conv[1:]:
out = layer(out)
if self.apply_residual:
return residual + x, y
else:
return x, y
out += x
return out
class NASMobileNet(nn.Module):
def __init__(self, choices, preset=Setting.PROXYLESS_MOBILE):
"""The NAS variant of mobilenet series."""
# Pre-defined conv blocks
blocks = {
0: functools.partial(InvertedResidual, kernel_size=3, expand_ratio=3),
1: functools.partial(InvertedResidual, kernel_size=3, expand_ratio=6),
2: functools.partial(InvertedResidual, kernel_size=5, expand_ratio=3),
3: functools.partial(InvertedResidual, kernel_size=5, expand_ratio=6),
4: functools.partial(InvertedResidual, kernel_size=7, expand_ratio=3),
5: functools.partial(InvertedResidual, kernel_size=7, expand_ratio=6),
6: nn.Identity,
}
def __init__(self, arch, preset):
super(NASMobileNet, self).__init__()
# Pre-defined blocks
def select_block(choice):
return {
0: functools.partial(Choice, mb=3, ks=3),
1: functools.partial(Choice, mb=6, ks=3),
2: functools.partial(Choice, mb=3, ks=5),
3: functools.partial(Choice, mb=6, ks=5),
4: functools.partial(Choice, mb=3, ks=7),
5: functools.partial(Choice, mb=6, ks=7),
6: nn.Identity,
}[choice]
# Hand-craft configurations
repeats, strides, out_channels = preset
names = ['2!', '3!', '4', '4!', '5', '5!']
self.num_layers = len(choices)
assert sum(repeats) == self.num_layers
assert sum(repeats) == len(arch)
self.feature_dims = collections.OrderedDict()
# Stem
self.bootstrap = vision.Bootstrap()
self.conv1 = stem(out_channels[0], stride=2)
self.stage1 = Choice(out_channels[0], out_channels[1], mb=1, ks=3)
dim_in = out_channels[1]
self.feature_dims = [out_channels[-1]]
features = [nn.Sequential(*conv_triplet(3, out_channels[0], 3, stride=2)),
InvertedResidual(*out_channels[:2], 3, 1)]
# Body
self.layers = []
for name, rep, dim_out, stride in zip(
names, repeats, out_channels[2:], strides):
block_cls = select_block(choices[len(self.layers)])
self.layers.append(block_cls(dim_in, dim_out, stride=stride))
if stride == 2:
self.feature_dims.insert(
-1, dim_in * self.layers[-1].mb)
for i in range(rep - 1):
block_cls = select_block(choices[len(self.layers)])
self.layers.append(block_cls(dim_out, dim_out, stride=1))
fullname = 'stage%s' % name.split('!')[0]
seq = getattr(self, fullname, [])
seq += self.layers[-rep:]
seq = nn.Sequential(*seq) if '!' in name else seq
setattr(self, fullname, seq)
dim_in = dim_out
self.conv6 = nn.Sequential(*conv_triplet(dim_in, out_channels[-1]))
self.last_outputs = None
dim_in = out_channels[1]
for repeat, dim_out, stride in \
zip(repeats, out_channels[2:], strides):
for i in range(repeat):
stride = stride if i == 0 else 1
block = self.blocks[arch[len(features) - 2]]
features.append(block(dim_in, dim_out, stride=stride))
dim_in = dim_out
if stride == 2:
self.feature_dims[id(features[-1])] = features[-1].dim
features.append(nn.Sequential(*conv_triplet(dim_in, out_channels[-1])))
self.feature_dims[id(features[-1])] = out_channels[-1]
self.features = nn.Sequential(*features)
self.reset_parameters()
def reset_parameters(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
init.kaiming_normal(m.weight, 'fan_out')
if m.bias is not None:
init.constant(m.bias, 0)
elif isinstance(m, nn.BatchNorm2d):
init.constant(m.weight, 1)
elif isinstance(m, nn.Linear):
if m.bias is not None:
init.constant(m.bias, 0)
# Stop the gradients if necessary
def freeze_func(m):
if isinstance(m, nn.Conv2d):
m.weight.requires_grad = False
m._buffers['weight'] = m.weight
del m._parameters['weight']
if cfg.MODEL.FREEZE_AT > 0:
self.conv1.apply(freeze_func)
self.stage1.apply(freeze_func)
for i in range(cfg.MODEL.FREEZE_AT, 1, -1):
getattr(self, 'stage{}'.format(i)).apply(freeze_func)
init.kaiming_normal(m.weight, mode='fan_out')
def forward(self, x):
x = self.conv1(x)
x, _ = self.stage1(x)
outputs = []
for layer in self.layers:
for layer in self.features:
x = layer(x)
x, y = x if isinstance(x, tuple) else (x, None)
if y is not None:
outputs.append(y)
outputs.append(self.conv6(x))
if self.training:
self.last_outputs = outputs
if self.feature_dims.get(id(layer)):
if hasattr(layer, 'feature'):
outputs.append(layer.feature)
else:
outputs.append(x)
return outputs
@backbones.register('mobilenet_a1')
def mobilenet_a1():
return NASMobileNet([
4, 6, 6, 6,
3, 3, 4, 6,
2, 4, 0, 4, 1, 5, 3, 5,
2, 4, 2, 4,
1,
], Setting.PROXYLESS_MOBILE)
class ModelSetting(object):
"""Hand-craft model setting."""
V2 = (
[2, 3, 4, 3, 3, 1],
[2, 2, 2, 1, 2, 1],
[32, 16, 24, 32, 64, 96, 160, 320, 1280],
)
PROXYLESS_MOBILE = (
[4, 4, 4, 4, 4, 1],
[2, 2, 2, 1, 2, 1],
[32, 16, 32, 40, 80, 96, 192, 320, 1280],
)
PROXYLESS_GPU = (
[4, 4, 4, 4, 4, 1],
[2, 2, 2, 1, 2, 1],
[40, 24, 32, 56, 112, 128, 256, 432, 1280],
)
@backbones.register('mobilenet_v2')
@registry.backbone.register('mobilenet_v2')
def mobilenet_v2():
return NASMobileNet([
1, 1,
1, 1, 1,
1, 1, 1, 1, 1, 1, 1,
1, 1, 1,
1,
], Setting.V2)
return NASMobileNet([1, 1,
1, 1, 1,
1, 1, 1, 1,
1, 1, 1,
1, 1, 1,
1], ModelSetting.V2)
@registry.backbone.register('proxyless_mobile')
def proxyless_mobile():
return NASMobileNet([2, 0, 6, 6,
4, 0, 2, 2,
5, 2, 2, 2,
3, 2, 2, 2,
5, 5, 4, 4,
5], ModelSetting.PROXYLESS_MOBILE)
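The arch lists passed above have to satisfy the assertion sum(repeats) == len(arch) in NASMobileNet.__init__; a quick sanity check with the preset values:

# Repeats copied from ModelSetting above.
assert sum([2, 3, 4, 3, 3, 1]) == 16   # V2: mobilenet_v2 passes 16 block indices
assert sum([4, 4, 4, 4, 4, 1]) == 21   # PROXYLESS_MOBILE: proxyless_mobile passes 21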
......@@ -7,233 +7,165 @@
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# Codes are based on:
#
# <https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
from seetadet.core import registry
from seetadet.core.config import cfg
from seetadet.core.registry import backbones
from seetadet.modules import nn
from seetadet.modules import init
from seetadet.utils import env
class BasicBlock(nn.Module):
def __init__(
self,
dim_in,
dim_out,
stride=1,
downsample=None,
dropblock=None,
):
"""The basic resnet block."""
expansion = 1
def __init__(self, dim_in, dim, stride=1, downsample=None):
super(BasicBlock, self).__init__()
self.conv1 = nn.Conv3x3(dim_in, dim_out, stride)
self.bn1 = nn.FrozenAffine(dim_out)
norm = cfg.MODEL.BACKBONE_NORM
self.conv1 = nn.Conv3x3(dim_in, dim, stride)
self.bn1 = nn.get_norm(norm, dim)
self.relu = nn.ReLU(inplace=True)
self.conv2 = nn.Conv3x3(dim_out, dim_out)
self.bn2 = nn.FrozenAffine(dim_out)
self.conv2 = nn.Conv3x3(dim, dim)
self.bn2 = nn.get_norm(norm, dim)
self.downsample = downsample
self.dropblock1 = nn.DropBlock2d(**dropblock) if dropblock else None
self.dropblock2 = nn.DropBlock2d(**dropblock) if dropblock else None
def forward(self, x):
residual = x
identity = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
if self.dropblock1 is not None:
out = self.dropblock1(out)
out = self.conv2(out)
out = self.bn2(out)
if self.dropblock2 is not None:
residual = self.dropblock2(residual)
if self.downsample is not None:
residual = self.downsample(residual)
out += residual
identity = self.downsample(x)
out += identity
out = self.relu(out)
return out
class Bottleneck(nn.Module):
# 1x64d => 0.25 (ResNet)
# 32x8d, 64x4d => 1.0 (ResNeXt)
contraction = cfg.RESNET.NUM_GROUPS \
* cfg.RESNET.GROUP_WIDTH / 256.0
def __init__(
self,
dim_in,
dim_out,
stride=1,
downsample=None,
dropblock=None,
):
"""The bottleneck resnet block."""
expansion = 4
def __init__(self, dim_in, dim, stride=1, downsample=None):
super(Bottleneck, self).__init__()
dim = int(dim_out * self.contraction)
self.conv1 = nn.Conv1x1(dim_in, dim)
self.bn1 = nn.FrozenAffine(dim)
self.conv2 = nn.Conv3x3(dim, dim, stride=stride)
self.drop2 = nn.DropBlock2d(**dropblock) if dropblock else None
self.bn2 = nn.FrozenAffine(dim)
self.conv3 = nn.Conv1x1(dim, dim_out)
self.drop3 = nn.DropBlock2d(**dropblock) if dropblock else None
self.bn3 = nn.FrozenAffine(dim_out)
groups = cfg.RESNET.NUM_GROUPS
width_per_group = cfg.RESNET.WIDTH_PER_GROUP
norm = cfg.MODEL.BACKBONE_NORM
width = int(dim * (width_per_group / 64.)) * groups
self.conv1 = nn.Conv1x1(dim_in, width)
self.bn1 = nn.get_norm(norm, width)
self.conv2 = nn.Conv3x3(width, width, stride=stride)
self.bn2 = nn.get_norm(norm, width)
self.conv3 = nn.Conv1x1(width, dim * self.expansion)
self.bn3 = nn.get_norm(norm, dim * self.expansion)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
def forward(self, x):
residual = x
identity = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
if self.drop2 is not None:
out = self.drop2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(residual)
out += residual
if self.drop3 is not None:
out = self.drop3(out)
identity = self.downsample(x)
out += identity
out = self.relu(out)
return out
class ResNet(nn.Module):
def __init__(self, block, layers, filters):
"""The resnet class."""
def __init__(self, block, layers):
super(ResNet, self).__init__()
self.dim_in, filters = filters[0], filters[1:]
self.feature_dims = [self.dim_in] + filters
self.conv1 = nn.Conv2d(
3, 64,
kernel_size=7,
stride=2,
padding=3,
bias=False,
)
self.bn1 = nn.FrozenAffine(self.dim_in)
dim_in, dims, features = 64, [64, 128, 256, 512], []
self.conv1 = nn.Conv2d(3, dim_in, kernel_size=7,
stride=2, padding=3, bias=False)
self.bn1 = nn.get_norm(cfg.MODEL.BACKBONE_NORM, dim_in)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(
kernel_size=3,
stride=2,
padding=0,
ceil_mode=True,
)
drop3 = {
'kp': 0.9,
'block_size': 7,
'alpha': 1.00,
'decrement': cfg.DROPBLOCK.DECREMENT,
'inplace': True,
} if cfg.DROPBLOCK.DROP_ON else None
drop4 = {
'kp': 0.9,
'block_size': 7,
'alpha': 1.00,
'decrement': cfg.DROPBLOCK.DECREMENT,
'inplace': True,
} if cfg.DROPBLOCK.DROP_ON else None
self.layer1 = self.make_blocks(block, filters[0], layers[0])
self.layer2 = self.make_blocks(block, filters[1], layers[1], 2)
self.layer3 = self.make_blocks(block, filters[2], layers[2], 2, drop3)
self.layer4 = self.make_blocks(block, filters[3], layers[3], 2, drop4)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.feature_dims = collections.OrderedDict(stem=64)
for i, repeat, dim in zip(range(4), layers, dims):
stride = 1 if i == 0 else 2
downsample = None
if stride != 1 or dim_in != dim * block.expansion:
downsample = nn.Sequential(
nn.Conv1x1(dim_in, dim * block.expansion, stride=stride),
nn.get_norm(cfg.MODEL.BACKBONE_NORM, dim * block.expansion))
features.append(block(dim_in, dim, stride, downsample))
dim_in = dim * block.expansion
for j in range(repeat - 1):
features.append(block(dim_in, dim))
setattr(self, 'layer%d' % (i + 1), nn.Sequential(*features[-repeat:]))
self.feature_dims[id(features[-1])] = dim_in
self.features = features
self.last_outputs = None
self.reset_parameters()
def reset_parameters(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
init.kaiming_normal(m.weight)
# Stop the gradients if necessary
def freeze_func(m):
if isinstance(m, nn.Conv2d):
m.weight.requires_grad = False
m._buffers['weight'] = m.weight
del m._parameters['weight']
init.kaiming_normal(m.weight, mode='fan_out')
if cfg.MODEL.FREEZE_AT > 0:
self.conv1.apply(freeze_func)
self.conv1.apply(env.freeze_module)
self.bn1.apply(env.freeze_module)
for i in range(cfg.MODEL.FREEZE_AT, 1, -1):
getattr(self, 'layer{}'.format(i - 1)).apply(freeze_func)
def make_blocks(self, block, dim_out, blocks, stride=1, dropblock=None):
downsample = None
if stride != 1 or self.dim_in != dim_out:
downsample = nn.Sequential(
nn.Conv1x1(self.dim_in, dim_out, stride=stride),
nn.FrozenAffine(dim_out),
)
layers = [block(self.dim_in, dim_out, stride, downsample, dropblock)]
self.dim_in = dim_out
for i in range(1, blocks):
layers.append(block(dim_out, dim_out, dropblock=dropblock))
return nn.Sequential(*layers)
getattr(self, 'layer{}'.format(i - 1)).apply(env.freeze_module)
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
outputs = [x]
outputs += [self.layer1(outputs[-1])]
outputs += [self.layer2(outputs[-1])]
outputs += [self.layer3(outputs[-1])]
outputs += [self.layer4(outputs[-1])]
outputs = [None]
for layer in self.features:
x = layer(x)
if self.feature_dims.get(id(layer)):
outputs.append(x)
if self.training:
self.last_outputs = outputs
return outputs
def resnet(depth):
if depth == 18:
units = [2, 2, 2, 2]
layers = [2, 2, 2, 2]
elif depth == 34:
units = [3, 4, 6, 3]
layers = [3, 4, 6, 3]
elif depth == 50:
units = [3, 4, 6, 3]
layers = [3, 4, 6, 3]
elif depth == 101:
units = [3, 4, 23, 3]
layers = [3, 4, 23, 3]
elif depth == 152:
units = [3, 8, 36, 3]
layers = [3, 8, 36, 3]
elif depth == 200:
units = [3, 24, 36, 3]
layers = [3, 24, 36, 3]
elif depth == 269:
units = [3, 30, 48, 8]
layers = [3, 30, 48, 8]
else:
raise ValueError('Unsupported depth: %d' % depth)
block = Bottleneck if depth >= 50 else BasicBlock
filters = [64, 256, 512, 1024, 2048] \
if depth >= 50 else [64, 64, 128, 256, 512]
return ResNet(block, units, filters)
return ResNet(block, layers)
backbones.register(['res18', 'resnet18', 'resnet_18'], func=resnet, depth=18)
backbones.register(['res34', 'resnet34', 'resnet_34'], func=resnet, depth=34)
backbones.register(['res50', 'resnet50', 'resnet_50'], func=resnet, depth=50)
backbones.register(['res101', 'resnet101', 'resnet_101'], func=resnet, depth=101)
backbones.register(['res152', 'resnet152', 'resnet_152'], func=resnet, depth=152)
registry.backbone.register(['res50', 'resnet50', 'resnet_50'],
func=resnet, depth=50)
registry.backbone.register(['res101', 'resnet101', 'resnet_101'],
func=resnet, depth=101)
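For the two registered depths, the stage layouts and the per-stage dims collected into feature_dims work out as below (Bottleneck.expansion is 4, so each recorded dim is dim * expansion):

# Sketch of what ResNet.__init__ records for the registered depths.
expansion = 4                                # Bottleneck.expansion
dims = [64, 128, 256, 512]
stage_dims = [d * expansion for d in dims]   # [256, 512, 1024, 2048]
layers = {50: [3, 4, 6, 3], 101: [3, 4, 23, 3]}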
......@@ -15,6 +15,7 @@ from __future__ import print_function
import collections
import math
import dragon.vm.torch as torch
from seetadet.algo import retinanet
......@@ -22,30 +23,31 @@ from seetadet.core.config import cfg
from seetadet.modules import det
from seetadet.modules import init
from seetadet.modules import nn
from seetadet.utils import stats
class RetinaNet(nn.Module):
def __init__(self, dim_in=256):
super(RetinaNet, self).__init__()
self.data = {}
self.data = dict()
########################################
# RetinaNet outputs #
########################################
self.cls_conv = torch.nn.ModuleList(
self.cls_conv = nn.ModuleList(
nn.Conv3x3(dim_in, dim_in, bias=True)
for _ in range(cfg.RETINANET.NUM_CONVS)
)
self.bbox_conv = torch.nn.ModuleList(
self.bbox_conv = nn.ModuleList(
nn.Conv3x3(dim_in, dim_in, bias=True)
for _ in range(cfg.RETINANET.NUM_CONVS)
)
# Packed as [C, A] not [A, C]
self.C = cfg.MODEL.NUM_CLASSES - 1
A = len(cfg.RETINANET.ASPECT_RATIOS) * cfg.RETINANET.SCALES_PER_OCTAVE
self.cls_score = nn.Conv3x3(dim_in, self.C * A, bias=True)
self.bbox_pred = nn.Conv3x3(dim_in, 4 * A, bias=True)
self.cls_dim = len(cfg.MODEL.CLASSES) - 1
anchor_dim = (len(cfg.RETINANET.ASPECT_RATIOS) *
cfg.RETINANET.SCALES_PER_OCTAVE)
self.cls_score = nn.Conv3x3(dim_in, self.cls_dim * anchor_dim, bias=True)
self.bbox_pred = nn.Conv3x3(dim_in, 4 * anchor_dim, bias=True)
self.cls_prob = nn.Sigmoid(inplace=True)
self.relu = nn.ReLU(inplace=True)
self.decoder = det.RetinaNetDecoder()
......@@ -56,105 +58,83 @@ class RetinaNet(nn.Module):
self.anchor_target = retinanet.AnchorTarget()
self.cls_loss = nn.SigmoidFocalLoss()
if 'IOU' in cfg.MODEL.REG_LOSS_TYPE.upper():
self.bbox_loss = nn.IoULoss()
if cfg.RETINANET.BBOX_REG_LOSS_TYPE.lower() == 'l1':
self.bbox_loss = nn.L1Loss(reduction='sum')
elif cfg.RETINANET.BBOX_REG_LOSS_TYPE.lower() == 'giou':
self.bbox_loss = nn.GIoULoss(reduction='sum')
else:
self.bbox_loss = nn.SmoothL1Loss(0.1111)
self.bbox_loss = nn.SmoothL1Loss(beta=0.1, reduction='sum')
self.normalizer = stats.ExponentialMovingAverage(decay=0.9)
self.reset_parameters()
def reset_parameters(self):
# Initialization following the RPN
# Weight ~ Normal(0, 0.01)
for m in self.modules():
if isinstance(m, torch.nn.Conv2d):
if isinstance(m, nn.Conv2d):
init.normal(m.weight, std=0.01)
init.constant(m.bias, 0)
# Bias prior initialization for Focal Loss
# Bias prior initialization for Focal Loss.
# For details, See the official codes:
# https://github.com/facebookresearch/Detectron
self.cls_score.bias.fill_(
-math.log((1 - cfg.PRIOR_PROB) / cfg.PRIOR_PROB)
)
bias_init = -math.log((1 - cfg.PRIOR_PROB) / cfg.PRIOR_PROB)
self.cls_score.bias.fill_(bias_init)
def compute_outputs(self, features):
"""Compute the RetinaNet logits.
Parameters
----------
features : sequence of dragon.vm.torch.Tensor
The features of specific conv layers.
"""
# Compute logits
"""Compute RetinaNet logits."""
cls_score_wide, bbox_pred_wide = [], []
for j, feature in enumerate(features):
cls_x, bbox_x = feature, feature
cls_input, bbox_input = feature, feature
for i in range(cfg.RETINANET.NUM_CONVS):
cls_x = self.relu(self.cls_conv[i](cls_x))
bbox_x = self.relu(self.bbox_conv[i](bbox_x))
cls_score_wide.append(self.cls_score(cls_x).view(0, self.C, - 1))
bbox_pred_wide.append(self.bbox_pred(bbox_x).view(0, 4, -1))
cls_input = self.relu(self.cls_conv[i](cls_input))
bbox_input = self.relu(self.bbox_conv[i](bbox_input))
cls_score_wide.append(self.cls_score(cls_input).view(0, self.cls_dim, -1))
bbox_pred_wide.append(self.bbox_pred(bbox_input).view(0, 4, -1))
if len(features) > 1:
# Concat them if necessary
return (torch.cat(cls_score_wide, dim=2),
torch.cat(bbox_pred_wide, dim=2))
else:
return cls_score_wide[0], bbox_pred_wide[0]
def compute_losses(self, features, cls_score, bbox_pred, gt_boxes):
"""Compute the RetinaNet classification loss and regression loss.
Parameters
----------
features : Sequence[dragon.vm.torch.Tensor]
The features of specific conv layers.
cls_score : dragon.vm.torch.Tensor
The classification logits.
bbox_pred : dragon.vm.torch.Tensor
The bbox regression logits.
gt_boxes : numpy.ndarray
The packed ground-truth boxes.
"""
self.data = \
self.anchor_target(
features=features,
gt_boxes=gt_boxes,
)
bbox_pred = bbox_pred.permute(0, 2, 1) \
.index_select((0, 1), self.data['bbox_indices'])
def compute_losses(self, **inputs):
"""Compute RetinaNet classification and regression loss."""
self.data = self.anchor_target(**inputs)
bbox_pred = inputs['bbox_pred'].permute(0, 2, 1) \
.index_select((0, 1), self.data['bbox_inds'])
self.normalizer.add_value(self.data['bbox_inds'].size(0))
cls_loss_weight = 1.0 / self.normalizer.running_average()
bbox_loss_weight = (cfg.RETINANET.BBOX_REG_LOSS_WEIGHT /
self.normalizer.running_average())
outputs = collections.OrderedDict([
('cls_loss', self.cls_loss(
cls_score, self.data['labels'])),
inputs['cls_score'],
self.data['labels'],) * cls_loss_weight),
('bbox_loss', self.bbox_loss(
bbox_pred,
self.data['bbox_targets'],
self.data['bbox_anchors'],
))
])
self.data['bbox_anchors']) * bbox_loss_weight)])
return outputs
def forward(self, *args, **kwargs):
def forward(self, **kwargs):
cls_score, bbox_pred = self.compute_outputs(kwargs['features'])
cls_score, bbox_pred = cls_score.float(), bbox_pred.float()
outputs = collections.OrderedDict([('bbox_pred', bbox_pred)])
if self.training:
outputs.update(
self.compute_losses(
features=kwargs['features'],
cls_score=cls_score,
bbox_pred=bbox_pred,
fg_inds=kwargs['fg_inds'],
bg_inds=kwargs['bg_inds'],
gt_boxes=kwargs['gt_boxes'],
)
)
else:
outputs['detections'] = \
self.decoder(
kwargs['features'],
self.cls_prob(cls_score).permute(0, 2, 1),
bbox_pred,
kwargs['ims_info'],
)
outputs['detections'] = self.decoder(
kwargs['features'],
self.cls_prob(cls_score).permute(0, 2, 1),
bbox_pred,
kwargs['im_info'],
)
return outputs
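The focal-loss bias prior set in reset_parameters above is a small negative constant; with the common prior probability of 0.01 (an assumption about cfg.PRIOR_PROB) it evaluates to about -4.6:

import math
prior_prob = 0.01                                   # assumed cfg.PRIOR_PROB
bias_init = -math.log((1 - prior_prob) / prior_prob)
# bias_init ≈ -4.595, so sigmoid(cls_score) starts near 0.01 for every anchor.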
......@@ -45,110 +45,69 @@ class RPN(nn.Module):
##################################
self.anchor_target = faster_rcnn.AnchorTarget()
self.cls_loss = nn.BCEWithLogitsLoss()
self.bbox_loss = nn.SmoothL1Loss(
beta=0.1111, reduction='sum')
self.cls_loss = nn.BCEWithLogitsLoss(reduction='mean')
if cfg.RPN.BBOX_REG_LOSS_TYPE.lower() == 'l1':
self.bbox_loss = nn.L1Loss(reduction='sum')
else:
self.bbox_loss = nn.SmoothL1Loss(beta=0.1, reduction='sum')
self.reset_parameters()
def reset_parameters(self):
# Initialization for the RPN
# Weight ~ Normal(0, 0.01)
for m in self.modules():
if isinstance(m, nn.Conv2d):
init.normal(m.weight, std=0.01)
init.constant(m.bias, 0)
def compute_outputs(self, features):
"""Compute the RPN logits.
Parameters
----------
features : sequence of dragon.vm.torch.Tensor
The features of specific conv layers.
"""
# Compute rpn logits
cls_score_wide, bbox_pred_wide = [], []
for feature in features:
"""Compute the RPN logits."""
cls_score_wide, bbox_pred_wide = [], []
for i, feature in enumerate(features):
x = self.relu(self.output(feature))
if len(features) > 1:
cls_score = self.cls_score(x).view(0, -1)
bbox_pred = self.bbox_pred(x).view(0, 4, -1)
else:
cls_score = self.cls_score(x)
bbox_pred = self.bbox_pred(x)
cls_score_wide.append(cls_score)
bbox_pred_wide.append(bbox_pred)
cls_score_wide.append(self.cls_score(x).view(0, -1))
bbox_pred_wide.append(self.bbox_pred(x).view(0, 4, -1))
if len(features) > 1:
# Concat them if necessary
return torch.cat(cls_score_wide, dim=1), \
torch.cat(bbox_pred_wide, dim=2)
return (torch.cat(cls_score_wide, dim=1),
torch.cat(bbox_pred_wide, dim=2))
else:
return cls_score_wide[0], bbox_pred_wide[0]
def compute_losses(
self,
features,
cls_score,
bbox_pred,
gt_boxes,
ims_info,
):
"""Compute the RPN classification loss and regression loss.
Parameters
----------
features : sequence of dragon.vm.torch.Tensor
The features of specific conv layers.
cls_score : dragon.vm.torch.Tensor
The (binary) classification logits.
bbox_pred : dragon.vm.torch.Tensor
The bbox regression logits.
gt_boxes : numpy.ndarray
The packed ground-truth boxes.
ims_info : numpy.ndarray
The information of input images.
"""
self.data = \
self.anchor_target(
features=features,
gt_boxes=gt_boxes,
ims_info=ims_info,
)
bbox_pred = bbox_pred.permute(0, 2, 1) \
.index_select((0, 1), self.data['bbox_indices'])
bbox_loss_weight = 1. / (
cfg.TRAIN.RPN_BATCHSIZE *
cfg.TRAIN.IMS_PER_BATCH
)
def compute_losses(self, **inputs):
"""Compute the RPN classification loss and regression loss."""
self.data = self.anchor_target(**inputs)
cls_score = inputs['cls_score'] \
.index_select((0, 1), self.data['cls_inds'])
bbox_pred = inputs['bbox_pred'].permute(0, 2, 1) \
.index_select((0, 1), self.data['bbox_inds'])
batch_size = cfg.RPN.BATCH_SIZE * cfg.TRAIN.IMS_PER_BATCH
bbox_loss_weight = cfg.RPN.BBOX_REG_LOSS_WEIGHT / float(batch_size)
return collections.OrderedDict([
('rpn_cls_loss', self.cls_loss(
cls_score, self.data['labels'])),
cls_score,
self.data['labels'])),
('rpn_bbox_loss', self.bbox_loss(
bbox_pred,
self.data['bbox_targets'],
self.data['bbox_anchors'],
) * bbox_loss_weight),
self.data['bbox_anchors']) * bbox_loss_weight),
])
def forward(self, *args, **kwargs):
cls_score, bbox_pred = self.compute_outputs(kwargs['features'])
cls_score, bbox_pred = cls_score.float(), bbox_pred.float()
def forward(self, **kwargs):
cls_score, bbox_pred = \
self.compute_outputs(kwargs['features'])
outputs = collections.OrderedDict([
('rpn_cls_score', cls_score),
('rpn_bbox_pred', bbox_pred),
('rpn_cls_score', cls_score.float()),
('rpn_bbox_pred', bbox_pred.float()),
])
if self.training:
outputs.update(
self.compute_losses(
kwargs['features'],
cls_score,
bbox_pred,
kwargs['gt_boxes'],
kwargs['ims_info'],
features=kwargs['features'],
cls_score=outputs['rpn_cls_score'],
bbox_pred=outputs['rpn_bbox_pred'],
fg_inds=kwargs['fg_inds'],
bg_inds=kwargs['bg_inds'],
gt_boxes=kwargs['gt_boxes'],
)
)
......
......@@ -20,6 +20,7 @@ from seetadet.algo import ssd
from seetadet.core.config import cfg
from seetadet.modules import init
from seetadet.modules import nn
from seetadet.utils import stats
class SSD(nn.Module):
......@@ -30,173 +31,120 @@ class SSD(nn.Module):
########################################
# SSD outputs #
########################################
self.cls_conv = torch.nn.ModuleList(
nn.Conv3x3(feature_dims[0], feature_dims[0], bias=True)
for _ in range(cfg.SSD.NUM_CONVS)
)
for _ in range(cfg.SSD.NUM_CONVS))
self.bbox_conv = torch.nn.ModuleList(
nn.Conv3x3(feature_dims[0], feature_dims[0], bias=True)
for _ in range(cfg.SSD.NUM_CONVS)
)
for _ in range(cfg.SSD.NUM_CONVS))
self.cls_score = nn.ModuleList()
self.bbox_pred = nn.ModuleList()
self.softmax = nn.Softmax(dim=2)
self.relu = nn.ReLU(inplace=True)
self.box_dim = len(cfg.BBOX_REG_WEIGHTS)
if len(feature_dims) == 1 and \
len(feature_dims) != len(cfg.SSD.MULTIBOX.STRIDES):
feature_dims = feature_dims * len(cfg.SSD.MULTIBOX.STRIDES)
feature_dims = list(filter(None, feature_dims))
for i, dim_in in enumerate(feature_dims):
nc = cfg.MODEL.NUM_CLASSES
na = len(cfg.SSD.MULTIBOX.ASPECT_RATIOS[i]) + 1
self.cls_score.append(nn.Conv3x3(dim_in, na * nc, bias=True))
self.bbox_pred.append(nn.Conv3x3(dim_in, na * self.box_dim, bias=True))
if len(feature_dims) != len(cfg.SSD.STRIDES):
# FPN case, all strides share the same feature dim
feature_dims = [feature_dims[0]] * len(cfg.SSD.STRIDES)
self.prior_box = ssd.PriorBox()
for i, dim in enumerate(feature_dims):
ratios = cfg.SSD.ASPECT_RATIOS[i]
if not isinstance(ratios, (tuple, list)):
# Legacy case, all strides share the same ratios
ratios = cfg.SSD.ASPECT_RATIOS
nc, na = len(cfg.MODEL.CLASSES), len(ratios) + 1
self.cls_score.append(nn.Conv3x3(dim, na * nc, bias=True))
self.bbox_pred.append(nn.Conv3x3(dim, na * self.box_dim, bias=True))
########################################
# SSD losses #
########################################
self.box_match = ssd.MultiBoxMatch()
self.hard_mining = ssd.HardMining()
self.box_target = ssd.MultiBoxTarget()
self.cls_loss = nn.CrossEntropyLoss()
if 'IOU' in cfg.MODEL.REG_LOSS_TYPE:
self.bbox_loss = nn.IoULoss(
delta_weights=cfg.BBOX_REG_WEIGHTS)
self.anchor_target = ssd.AnchorTarget()
self.cls_loss = nn.CrossEntropyLoss(reduction='sum')
if cfg.SSD.BBOX_REG_LOSS_TYPE.lower() == 'l1':
self.bbox_loss = nn.L1Loss(reduction='sum')
elif cfg.SSD.BBOX_REG_LOSS_TYPE.lower() == 'giou':
self.bbox_loss = nn.GIoULoss(
reduction='sum', delta_weights=cfg.BBOX_REG_WEIGHTS)
else:
self.bbox_loss = nn.SmoothL1Loss()
self.bbox_loss = nn.SmoothL1Loss(beta=1.0, reduction='sum')
self.normalizer = stats.ExponentialMovingAverage(decay=0.9)
self.reset_parameters()
def reset_parameters(self):
if cfg.SSD.NUM_CONVS > 0:
# Initialization following the RPN
# Weight ~ Normal(0, 0.01)
for m in self.modules():
if isinstance(m, nn.Conv2d):
init.normal(m.weight, std=0.01)
init.constant(m.bias, 0)
else:
# Careful Initialization
# Weight ~ Normal(0, 0.001)
for m in self.modules():
if isinstance(m, nn.Conv2d):
init.normal(m.weight, std=0.001)
init.constant(m.bias, 0)
def compute_outputs(self, features):
"""Compute the SSD logits.
Parameters
----------
features : sequence of dragon.vm.torch.Tensor
The features of specific conv layers.
"""
# Compute logits
"""Compute SSD logits."""
cls_score_wide, bbox_pred_wide = [], []
for i, feature in enumerate(features):
cls_x, bbox_x = feature, feature
cls_input, bbox_input = feature, feature
for j in range(cfg.SSD.NUM_CONVS):
cls_x = self.relu(self.cls_conv[j](cls_x))
bbox_x = self.relu(self.bbox_conv[j](bbox_x))
cls_input = self.relu(self.cls_conv[j](cls_input))
bbox_input = self.relu(self.bbox_conv[j](bbox_input))
cls_score_wide.append(
self.cls_score[i](cls_x)
self.cls_score[i](cls_input)
.permute(0, 2, 3, 1).view(0, -1))
bbox_pred_wide.append(
self.bbox_pred[i](bbox_x)
self.bbox_pred[i](bbox_input)
.permute(0, 2, 3, 1).view(0, -1))
return (torch.cat(cls_score_wide, dim=1)
.view(0, -1, cfg.MODEL.NUM_CLASSES),
.view(0, -1, len(cfg.MODEL.CLASSES)),
torch.cat(bbox_pred_wide, dim=1)
.view(0, -1, self.box_dim))
def compute_losses(
self,
prior_boxes,
gt_boxes,
cls_score,
bbox_pred,
cls_prob,
):
"""Compute the SSD classification loss and regression loss.
Parameters
----------
prior_boxes : numpy.ndarray
The prior boxes(anchors).
gt_boxes : numpy.ndarray
The packed ground-truth boxes.
cls_score : dragon.vm.torch.Tensor
The classification logits.
bbox_pred : dragon.vm.torch.Tensor
The bbox regression logits.
cls_prob : dragon.vm.torch.Tensor
The logits after a softmax function.
"""
# Collect the SSD training data
# See the paper (Liu et al., 2016) for details
self.data = \
self.box_match(
prior_boxes,
gt_boxes,
)
self.data.update(
self.hard_mining(
cls_prob,
self.data['match_labels'],
self.data['max_overlaps'],
)
)
self.data.update(
self.box_target(
self.data['match_inds'],
self.data['match_labels'],
prior_boxes,
gt_boxes,
)
)
bbox_pred = bbox_pred.index_select(
(0, 1), self.data['bbox_indices'])
def compute_losses(self, **inputs):
"""Compute tSSD classification and regression loss."""
self.data = self.anchor_target(**inputs)
bbox_pred = inputs['bbox_pred'] \
.index_select((0, 1), self.data['bbox_inds'])
self.normalizer.add_value(self.data['bbox_inds'].size(0))
cls_loss_weight = 1.0 / self.normalizer.running_average()
bbox_loss_weight = (cfg.SSD.BBOX_REG_LOSS_WEIGHT /
self.normalizer.running_average())
return collections.OrderedDict([
# A compensating factor of 4.0 is used,
# as we normalize over both the positive and negative samples
('cls_loss', self.cls_loss(
cls_score.view(-1, cfg.MODEL.NUM_CLASSES),
self.data['labels']) * 4.),
inputs['cls_score'].view(-1, len(cfg.MODEL.CLASSES)),
self.data['labels']) * cls_loss_weight),
('bbox_loss', self.bbox_loss(
bbox_pred,
self.data['bbox_targets'],
self.data['bbox_anchors'],
) * cfg.MODEL.REG_LOSS_WEIGHT)
self.data['bbox_anchors']) * bbox_loss_weight)
])
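For reference, a minimal sketch of the normalization above; unlike the RPN, both summed losses are divided by a running (EMA) estimate of the matched-anchor count. The numbers are assumptions for illustration:
running_avg_fg = 150.0      # stand-in for self.normalizer.running_average()
bbox_reg_loss_weight = 1.0  # stand-in for cfg.SSD.BBOX_REG_LOSS_WEIGHT
cls_loss_weight = 1.0 / running_avg_fg
bbox_loss_weight = bbox_reg_loss_weight / running_avg_fg
# cls_loss  = sum-reduced CrossEntropy over sampled anchors * cls_loss_weight
# bbox_loss = sum-reduced L1/GIoU/SmoothL1 over fg anchors  * bbox_loss_weight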
def forward(self, *args, **kwargs):
prior_boxes = self.prior_box(kwargs['features'])
def forward(self, **kwargs):
cls_score, bbox_pred = self.compute_outputs(kwargs['features'])
cls_score, bbox_pred = cls_score.float(), bbox_pred.float()
if cls_score.size(1) != self.anchor_target.all_anchors.shape[0]:
raise ValueError('Misalignment between default anchors and features.\n'
'Specify correct <SSD.STRIDES> to avoid this problem.')
outputs = collections.OrderedDict([
('bbox_pred', bbox_pred),
('prior_boxes', prior_boxes),
('prior_boxes', self.anchor_target.all_anchors),
])
if self.training:
outputs.update(
self.compute_losses(
prior_boxes,
kwargs['gt_boxes'],
cls_score,
bbox_pred,
self.softmax(cls_score.data),
cls_score=cls_score,
bbox_pred=bbox_pred,
cls_prob=self.softmax(cls_score.data),
fg_inds=kwargs['fg_inds'],
bg_inds=kwargs['bg_inds'],
gt_boxes=kwargs['gt_boxes'],
)
)
else:
outputs['cls_prob'] = \
self.softmax(cls_score)
outputs['cls_prob'] = self.softmax(cls_score)
return outputs
......@@ -13,165 +13,102 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from seetadet.core.config import cfg
from seetadet.core.registry import backbones
import collections
from seetadet.core import registry
from seetadet.modules import init
from seetadet.modules import nn
class VGG(nn.Module):
def __init__(self, arch, extra_arch=None, reduced=False):
"""The VGG net class."""
def __init__(self, model_cfg, extra_cfg=None):
super(VGG, self).__init__()
self.reduced = reduced
self.units, filter_list = arch
self.feature_dims = filter_list[:]
self.maxpool = nn.MaxPool2d(
kernel_size=2, stride=2, ceil_mode=True)
self.s1pool = nn.MaxPool2d(
kernel_size=3, stride=1, padding=1, ceil_mode=True)
self.relu = nn.ReLU(inplace=True)
for i in range(len(self.units)):
conv_name = 'conv{}'.format(i + 1)
dim_in = 3 if i == 0 else filter_list[i - 1]
for j in range(self.units[i]):
self.__setattr__(
'{}_{}'
.format(conv_name, j + 1),
nn.Conv3x3(dim_in, filter_list[i], bias=True),
)
if j == 0:
dim_in = filter_list[i]
if reduced:
self.conv4_3_norm = nn.L2Normalize(filter_list[3], init=20.)
self.fc6 = nn.Conv2d(
in_channels=filter_list[-1],
out_channels=1024,
kernel_size=3,
padding=6,
stride=1,
dilation=6,
)
self.fc7 = nn.Conv1x1(1024, 1024, bias=True)
self.feature_dims = [filter_list[-2], 1024]
if extra_arch is not None:
strides, filter_list, kps = extra_arch
self.extra_units = [2] * len(strides)
self.feature_dims += [n * 2 for n in filter_list]
for i in range(len(strides)):
conv_name = 'conv{}'.format(i + 6)
dim_in = 1024 if i == 0 else filter_list[i - 1] * 2
self.__setattr__(
'{}_1'.format(conv_name),
nn.Conv1x1(
dim_in,
filter_list[i],
bias=True,
),
)
if strides[i] == 2:
self.__setattr__(
'{}_2'.format(conv_name),
nn.Conv3x3(
filter_list[i],
filter_list[i] * 2,
stride=2,
bias=True,
),
)
layers, features, dim_in = [], [], 3
self.feature_dims = collections.OrderedDict()
self.feature_norms = nn.ModuleList()
for v in model_cfg:
if v == 'M':
features.append(nn.Sequential(*layers))
if extra_cfg and len(features) == 5:
layers = [nn.MaxPool2d(kernel_size=3, padding=1)]
else:
self.__setattr__(
'{}_2'.format(conv_name),
nn.Conv2d(
filter_list[i],
filter_list[i] * 2,
kernel_size=kps[0],
padding=kps[1],
stride=kps[2]
),
)
layers = [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)]
if len(features) > 1:
self.feature_dims[id(features[-1])] = dim_in
if extra_cfg and len(features) == 4:
self.feature_norms.append(nn.L2Normalize(dim_in, init=20.))
else:
conv2d = nn.Conv2d(dim_in, v, kernel_size=3, padding=1)
layers += [conv2d, nn.ReLU(inplace=True)]
dim_in = v
if extra_cfg:
lowest_lvl = id(features[3])
self.feature_dims = collections.OrderedDict(
[(lowest_lvl, self.feature_dims[lowest_lvl])])
layers += [nn.Conv2d(dim_in, 1024, kernel_size=3, padding=6, dilation=6)]
layers += [nn.ReLU(inplace=True)]
layers += [nn.Conv2d(1024, 1024, kernel_size=1)]
layers += [nn.ReLU(inplace=True)]
features.append(nn.Sequential(*layers))
self.feature_dims[id(features[-1])] = dim_in = 1024
for c, (k, s, p) in extra_cfg:
features.append(nn.Sequential(
nn.Conv2d(dim_in, c, kernel_size=1),
nn.ReLU(inplace=True),
nn.Conv2d(c, c * 2, kernel_size=k, stride=s, padding=p),
nn.ReLU(inplace=True),
))
self.feature_dims[id(features[-1])] = dim_in = c * 2
self.features = nn.Sequential(*features)
self.last_outputs = None
self.reset_parameters()
def reset_parameters(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
init.xaiver(m.weight)
init.xavier_uniform(m.weight)
init.constant(m.bias, 0)
# Stop the gradients if necessary
def freeze_func(m):
if isinstance(m, nn.Conv2d):
m.weight.requires_grad = False
m._buffers['weight'] = m.weight
del m._parameters['weight']
m.bias.requires_grad = False
m._buffers['bias'] = m.bias
del m._parameters['bias']
for i in range(cfg.MODEL.FREEZE_AT, 0, -1):
conv_name = 'conv{}'.format(i)
for j in range(self.units[i - 1]):
self.__getattr__(
'{}_{}'.format(conv_name, j + 1)
).apply(freeze_func)
def forward(self, x):
outputs = []
# Conv1.x ~ Conv5.x
for i in range(len(self.units)):
conv_name = 'conv{}'.format(i + 1)
for j in range(self.units[i]):
x = self.relu(self.__getattr__(
'{}_{}'.format(conv_name, j + 1))(x))
if self.reduced and i == 3:
outputs.append(self.conv4_3_norm(x))
if i < 4:
x = self.maxpool(x)
else:
x = self.s1pool(x) if self.reduced else x
# Internal FC layers and Extra Conv Layers
if self.reduced:
x = self.relu(self.fc6(x))
x = self.relu(self.fc7(x))
outputs.append(x)
for i in range(len(self.extra_units)):
conv_name = 'conv{}'.format(i + 6)
for j in range(self.extra_units[i]):
x = self.relu(self.__getattr__(
'{}_{}'.format(conv_name, j + 1))(x))
for layer in self.features:
x = layer(x)
if self.feature_dims.get(id(layer)):
outputs.append(x)
else:
outputs.append(x)
for i, norm_layer in enumerate(self.feature_norms):
outputs[i] = norm_layer(outputs[i])
if self.training:
self.last_outputs = outputs
return outputs
def vgg_16(**kwargs):
return VGG(([2, 2, 3, 3, 3], [64, 128, 256, 512, 512]), **kwargs)
def vgg16(extra_cfg=None):
model_cfg = [64, 64, 'M',
128, 128, 'M',
256, 256, 256, 'M',
512, 512, 512, 'M',
512, 512, 512, 'M']
return VGG(model_cfg, extra_cfg)
def vgg_16_reduced(scale=300):
def vgg16_reduced(scale=300):
if scale == 300:
extra_arch = (
[2, 2, 1, 1],
[256, 128, 128, 128],
[3, 0, 1],
)
extra_cfg = [(256, (3, 2, 1)),
(128, (3, 2, 1)),
(128, (3, 1, 0)),
(128, (3, 1, 0))]
elif scale == 512:
extra_arch = (
[2, 2, 2, 2, 1],
[256, 128, 128, 128, 128],
[4, 1, 1],
)
extra_cfg = [(256, (3, 2, 1)),
(128, (3, 2, 1)),
(128, (3, 2, 1)),
(128, (3, 2, 1)),
(128, (4, 1, 1))]
else:
raise ValueError('Unsupported scale: {}'.format(scale))
return vgg_16(extra_arch=extra_arch, reduced=True)
return vgg16(extra_cfg)
backbones.register('vgg16', func=vgg_16)
backbones.register('vgg16_reduced_300', func=vgg_16_reduced, scale=300)
backbones.register('vgg16_reduced_512', func=vgg_16_reduced, scale=512)
registry.backbone.register('vgg16_reduced_300', vgg16_reduced, scale=300)
registry.backbone.register('vgg16_reduced_512', vgg16_reduced, scale=512)
......@@ -8,6 +8,7 @@
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Detection modules."""
from __future__ import absolute_import
from __future__ import division
......@@ -32,8 +33,8 @@ class _NonMaxSuppression(autograd.Function):
'arguments': {'iou_threshold': self.iou_threshold}
}
def forward(self, dets):
return self.dispatch([dets], [self.alloc()])
def forward(self, input):
return self.dispatch([input], [self.alloc()])
class _RetinaNetDecoder(autograd.Function):
......@@ -78,7 +79,6 @@ class _RPNDecoder(autograd.Function):
'pre_nms_top_n': self.args['pre_nms_top_n'],
'post_nms_top_n': self.args['post_nms_top_n'],
'nms_thresh': self.args['nms_thresh'],
'min_size': self.args['min_size'],
'min_level': self.args['min_level'],
'max_level': self.args['max_level'],
'canonical_scale': self.args['canonical_scale'],
......@@ -86,10 +86,11 @@ class _RPNDecoder(autograd.Function):
}
}
def forward(self, features, cls_prob, bbox_pred, ims_info):
inputs = features + [cls_prob, bbox_pred, ims_info]
self._check_device(inputs[:-1]) # Skip <ims_info>
outputs = [self.alloc() for _ in range(self.args['K'])]
def forward(self, features, cls_prob, bbox_pred, im_info):
inputs = features + [cls_prob, bbox_pred, im_info]
self._check_device(inputs[:-1]) # Skip <im_info>
num_outputs = self.args['max_level'] - self.args['min_level'] + 1
outputs = [self.alloc() for _ in range(num_outputs)]
return self.dispatch(inputs, outputs, check_device=False)
......@@ -119,15 +120,13 @@ def decode_rpn(
features,
cls_prob,
bbox_pred,
ims_info,
num_outputs,
im_info,
strides,
ratios,
scales,
pre_nms_top_n,
post_nms_top_n,
nms_thresh,
min_size,
min_level,
max_level,
canonical_scale,
......@@ -136,27 +135,25 @@ def decode_rpn(
return _RPNDecoder \
.instantiate(
cls_prob.device,
K=num_outputs,
strides=strides,
ratios=ratios,
scales=scales,
pre_nms_top_n=pre_nms_top_n,
post_nms_top_n=post_nms_top_n,
nms_thresh=nms_thresh,
min_size=min_size,
min_level=min_level,
max_level=max_level,
canonical_scale=canonical_scale,
canonical_level=canonical_level,
).apply(features, cls_prob, bbox_pred, ims_info)
).apply(features, cls_prob, bbox_pred, im_info)
def nms(dets, iou_threshold=0.5):
def nms(input, iou_threshold=0.5):
return _NonMaxSuppression \
.instantiate(
dets.device,
input.device,
iou_threshold=iou_threshold,
).apply(dets)
).apply(input)
class RetinaNetDecoder(nn.Module):
......@@ -180,7 +177,7 @@ class RetinaNetDecoder(nn.Module):
strides=self.strides,
ratios=[float(e) for e in cfg.RETINANET.ASPECT_RATIOS],
scales=self.scales,
pre_nms_top_n=cfg.RETINANET.PRE_NMS_TOP_N,
pre_nms_top_n=cfg.TEST.RETINANET_PRE_NMS_TOP_N,
score_thresh=float(cfg.TEST.SCORE_THRESH),
)
......@@ -190,27 +187,21 @@ class RPNDecoder(nn.Module):
def __init__(self):
super(RPNDecoder, self).__init__()
self.K = (cfg.FPN.ROI_MAX_LEVEL -
cfg.FPN.ROI_MIN_LEVEL + 1) \
if len(cfg.RPN.STRIDES) > 1 else 1
def forward(self, features, cls_prob, bbox_pred, ims_info):
outputs = decode_rpn(
def forward(self, features, cls_prob, bbox_pred, im_info):
return decode_rpn(
features=features,
cls_prob=cls_prob,
bbox_pred=bbox_pred,
ims_info=ims_info,
num_outputs=self.K,
im_info=im_info,
strides=cfg.RPN.STRIDES,
ratios=[float(e) for e in cfg.RPN.ASPECT_RATIOS],
scales=[float(e) for e in cfg.RPN.SCALES],
pre_nms_top_n=cfg.TEST.RPN_PRE_NMS_TOP_N,
post_nms_top_n=cfg.TEST.RPN_POST_NMS_TOP_N,
nms_thresh=cfg.TEST.RPN_NMS_THRESH,
min_size=cfg.TEST.RPN_MIN_SIZE,
min_level=cfg.FPN.ROI_MIN_LEVEL,
max_level=cfg.FPN.ROI_MAX_LEVEL,
canonical_scale=cfg.FPN.ROI_CANONICAL_SCALE,
canonical_level=cfg.FPN.ROI_CANONICAL_LEVEL,
)
return [outputs] if self.K == 1 else outputs
......@@ -8,6 +8,7 @@
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Init modules."""
from __future__ import absolute_import
from __future__ import division
......@@ -16,22 +17,14 @@ from __future__ import print_function
from dragon.vm.torch import nn
def xaiver(weight, mode='fan_in'):
def xavier_uniform(weight, mode='fan_in'):
"""The initializer of xavier uniform distribution."""
nn.init.kaiming_uniform_(
weight,
a=1, # Fix the gain for [-127, 127]
mode=mode,
)
nn.init.kaiming_uniform_(weight, mode=mode, nonlinearity='linear')
def kaiming_normal(weight, mode='fan_in'):
"""The initializer of kaiming normal distribution."""
nn.init.kaiming_normal_(
weight,
mode=mode,
nonlinearity='relu',
)
nn.init.kaiming_normal_(weight, mode=mode, nonlinearity='relu')
# Aliases
......
......@@ -8,8 +8,7 @@
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Define some basic structures."""
"""NN modules."""
from __future__ import absolute_import
from __future__ import division
......@@ -22,18 +21,6 @@ from dragon.vm.torch import nn
from seetadet.core.config import cfg
class FrozenAffine(object):
"""Affine transformation with weight and bias fixed."""
def __new__(cls, dim_in, bias=True, inplace=True):
return nn.Affine(
num_features=dim_in,
fix_weight=True,
fix_bias=True,
inplace=inplace,
)
class Conv1x1(object):
"""1x1 convolution."""
......@@ -50,13 +37,23 @@ class Conv1x1(object):
class Conv3x3(object):
"""3x3 convolution."""
def __new__(cls, dim_in, dim_out, stride=1, dilation=1, bias=False):
def __new__(
cls,
dim_in,
dim_out,
stride=1,
dilation=1,
groups=1,
bias=False
):
return nn.Conv2d(
in_channels=dim_in,
out_channels=dim_out,
kernel_size=3,
stride=stride,
padding=1 * dilation,
dilation=dilation,
groups=groups,
bias=bias,
)
......@@ -64,13 +61,71 @@ class Conv3x3(object):
class CrossEntropyLoss(object):
"""Cross entropy loss."""
def __new__(cls):
return nn.CrossEntropyLoss(ignore_index=-1)
def __new__(cls, reduction='valid'):
return nn.CrossEntropyLoss(
reduction=reduction, ignore_index=-1)
class FrozenBatchNorm2d(nn.Module):
"""BatchNorm2d where statistics and the affine parameters are fixed."""
class IoULoss(nn.Module):
def __init__(self, reduction='mean', delta_weights=None):
super(IoULoss, self).__init__()
def __init__(self, num_features, eps=1e-5, inplace=True):
super(FrozenBatchNorm2d, self).__init__()
self.num_features = num_features
self.eps = eps
self.inplace = inplace
self.register_buffer('weight', torch.ones(num_features))
self.register_buffer('bias', torch.zeros(num_features))
self.register_buffer('running_mean', torch.zeros(num_features))
self.register_buffer('running_var', torch.ones(num_features) - eps)
def extra_repr(self):
affine_str = '{num_features}, eps={eps}'.format(**self.__dict__)
inplace_str = ', inplace' if self.inplace else ''
return affine_str + inplace_str
def forward(self, input):
return torch.channel_affine(
input,
self.weight,
self.bias,
dim=1,
out=input if self.inplace else None,
)
def _load_from_state_dict(
self,
state_dict,
prefix,
strict,
missing_keys,
unexpected_keys,
error_msgs,
):
super(FrozenBatchNorm2d, self)._load_from_state_dict(
state_dict,
prefix,
strict,
missing_keys,
unexpected_keys,
error_msgs,
)
# Fuse the running stats into weight and bias.
# Note that this resets the original stats
# to a zero mean and a unit std.
with torch.no_grad():
self.running_var.float_().add_(self.eps).sqrt_()
self.weight.float_().div_(self.running_var)
self.bias.float_().sub_(self.running_mean.float_() * self.weight)
self.running_mean.zero_()
self.running_var.one_().sub_(self.eps)
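To make the fusing comment above concrete, a standalone numpy check of the same algebra (illustrative, not part of this commit):
import numpy as np

# BN with frozen stats reduces to a per-channel affine:
#   weight * (x - mean) / sqrt(var + eps) + bias  ==  w_fused * x + b_fused
rng = np.random.RandomState(0)
x = rng.randn(4)                        # one value per channel
weight, bias = rng.rand(4), rng.rand(4)
mean, var, eps = rng.rand(4), rng.rand(4) + 0.5, 1e-5
y_bn = weight * (x - mean) / np.sqrt(var + eps) + bias
w_fused = weight / np.sqrt(var + eps)   # matches weight.div_(running_var) above
b_fused = bias - mean * w_fused         # matches bias.sub_(running_mean * weight)
assert np.allclose(y_bn, w_fused * x + b_fused)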
class GIoULoss(nn.Module):
"""GIoU loss."""
def __init__(self, reduction='sum', delta_weights=None):
super(GIoULoss, self).__init__()
self.reduction = reduction
self.delta_weights = delta_weights
# Store the detached tensors
......@@ -134,7 +189,7 @@ class IoULoss(nn.Module):
# Compute the differentiable IoU metric
area_union = pred_area + target_area - area_inter
iou = area_inter / (area_union + 1.)
iou_metric = iou - (area_enc - area_union) / area_enc # GIoU
iou_metric = iou - (area_enc - area_union) / area_enc
# Compute the reduced loss
if self.reduction == 'sum':
......@@ -144,7 +199,7 @@ class IoULoss(nn.Module):
def forward(self, *inputs, **kwargs):
# Enter a new detaching scope
with dragon.eager_scope('${IOU}'):
with dragon.eager_scope('${IOU_LOSS}'):
return self.forward_impl(*inputs, **kwargs)
......@@ -159,6 +214,20 @@ class Identity(nn.Module):
return x
class L1Loss(nn.Module):
"""L1 loss."""
def __init__(self, reduction='sum'):
super(L1Loss, self).__init__()
self.reduction = reduction
def forward(self, input, target, *args):
return nn.functional.l1_loss(
input, target,
reduction=self.reduction,
)
class L2Normalize(nn.Module):
"""Normalize the input using L2 norm."""
......@@ -168,7 +237,7 @@ class L2Normalize(nn.Module):
def forward(self, input):
out = nn.functional.normalize(input, p=2, dim=1, eps=1e-5)
out = nn.functional.affine(out, self.weight)
out = torch.channel_affine(out, self.weight, dim=1)
return out
......@@ -182,18 +251,19 @@ class ReLU(object):
class SigmoidFocalLoss(object):
"""Sigmoid focal loss."""
def __new__(cls):
def __new__(cls, reduction='sum'):
return nn.SigmoidFocalLoss(
alpha=cfg.MODEL.FOCAL_LOSS_ALPHA,
gamma=cfg.MODEL.FOCAL_LOSS_GAMMA,
negative_index=0, # Background index
reduction=reduction,
)
class SmoothL1Loss(nn.Module):
"""Smoothed l1 loss."""
def __init__(self, beta=1., reduction='batch_size'):
def __init__(self, beta=1.0, reduction='sum'):
super(SmoothL1Loss, self).__init__()
self.beta = beta
self.reduction = reduction
......@@ -206,8 +276,18 @@ class SmoothL1Loss(nn.Module):
)
# Getters
def get_norm(norm, dim_in):
"""Return a normalization module."""
if isinstance(norm, str):
if len(norm) == 0:
return Identity()
norm = {'BN': BatchNorm2d,
'FrozenBN': FrozenBatchNorm2d}[norm]
return norm(dim_in)
# Aliases
Affine = nn.Affine
AvgPool2d = nn.AvgPool2d
BatchNorm2d = nn.BatchNorm2d
BCEWithLogitsLoss = nn.BCEWithLogitsLoss
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Module utilities."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm import torch
from seetadet.core import registry
@registry.fusion_pass.register([
'Conv2d+BatchNorm2d',
'Conv2d+FrozenBatchNorm2d',
'DepthwiseConv2d+BatchNorm2d',
'DepthwiseConv2d+FrozenBatchNorm2d',
])
def layer_fusion_conv2d_and_bn2d(conv_module, bn_module):
"""Layer fusion between Conv2d and BatchNorm2d."""
if conv_module.bias is None:
with torch.no_grad():
delattr(conv_module, 'bias')
bn_module.forward = lambda x: x
t = torch.sqrt(bn_module.running_var + bn_module.eps)
t = bn_module.weight / t
conv_module.register_buffer(
'bias', bn_module.bias - t * bn_module.running_mean)
t = t.view(0, *([1] * (conv_module.weight.ndimension() - 1)))
if conv_module.weight.dtype == 'float16':
conv_module.bias.half_()
weight = conv_module.weight.float()
weight.mul_(t).half_()
conv_module.weight.copy_(weight)
else:
conv_module.weight.mul_(t)
def get_fusion_pass(*modules):
"""Return the fusion pass between modules."""
pass_key = '+'.join(m.__class__.__name__ for m in modules)
return pass_key, registry.fusion_pass.try_get(pass_key)
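A hedged usage sketch of get_fusion_pass; the child-pairing loop below is an assumption for illustration, not code from this commit:
def fuse_adjacent_children(module):
    """Apply a registered fusion pass to each adjacent pair of children."""
    children = list(module.children())
    for left, right in zip(children, children[1:]):
        pass_key, fuse_fn = get_fusion_pass(left, right)
        if fuse_fn is not None:
            # e.g. pass_key == 'Conv2d+FrozenBatchNorm2d'
            fuse_fn(left, right)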
......@@ -8,6 +8,7 @@
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Vision modules."""
from __future__ import absolute_import
from __future__ import division
......@@ -22,15 +23,17 @@ from dragon.vm.torch import nn
from seetadet.core.config import cfg
def roi_align(input, boxes, spatial_scale, size):
def roi_align(input, boxes, spatial_scale, size, **kwargs):
return torchvision.ops.roi_align(
input, boxes,
output_size=(size, size),
spatial_scale=spatial_scale,
sampling_ratio=kwargs.get('sampling_ratio', 0),
)
def roi_pool(input, boxes, spatial_scale, size):
def roi_pool(input, boxes, spatial_scale, size, **kwargs):
_ = locals() # Unused
return torchvision.ops.roi_pool(
input, boxes,
output_size=(size, size),
......@@ -38,17 +41,17 @@ def roi_pool(input, boxes, spatial_scale, size):
)
class Bootstrap(nn.Module):
"""Process the input to match the computation."""
class ImageNormalizer(nn.Module):
"""Normalize the image to match the computation."""
def __init__(self):
super(Bootstrap, self).__init__()
super(ImageNormalizer, self).__init__()
self._device = torch.device('cpu')
self._dummy_buffer = torch.ones(1)
self._normalize_func = functools.partial(
torch.channel_normalize,
mean=cfg.PIXEL_MEANS,
std=[1., 1., 1.],
std=cfg.PIXEL_STDS,
dim=1,
dims=(0, 3, 1, 2),
dtype=cfg.MODEL.PRECISION.lower(),
......
......@@ -57,8 +57,6 @@ def rpn_decoder_exporter(op_def, shape_dict, ws):
helper.add_attribute(node, 'post_nms_top_n', arg.i)
elif arg.name == 'nms_thresh':
helper.add_attribute(node, 'nms_thresh', arg.f)
elif arg.name == 'min_size':
helper.add_attribute(node, 'min_size', arg.i)
elif arg.name == 'min_level':
helper.add_attribute(node, 'min_level', arg.i)
elif arg.name == 'max_level':
......
......@@ -27,7 +27,8 @@ class _LRScheduler(object):
warmup_factor=0.,
):
self._step_count = 0
self._lr_max, self._lr_min = lr_max, lr_min
self._lr_max = lr_max
self._lr_min = lr_min
self._warmup_steps = warmup_steps
self._warmup_factor = warmup_factor
self._last_lr = self._lr_max
......@@ -72,9 +73,8 @@ class CosineLR(_LRScheduler):
if step_count % self._decay_step == 0:
decay_factor = 0.5 * (1. + math.cos(
math.pi * step_count / self._max_steps))
self._last_lr = self._lr_min + (
self._lr_max - self._lr_min
) * decay_factor
self._last_lr = self._lr_min + \
(self._lr_max - self._lr_min) * decay_factor
return self._last_lr
......@@ -94,7 +94,8 @@ class MultiStepLR(_LRScheduler):
)
self._decay_steps = decay_steps
self._decay_gamma = decay_gamma
self._stage_count, self._num_stages = 0, len(self._decay_steps)
self._stage_count = 0
self._num_stages = len(self._decay_steps)
def schedule_impl(self):
if self._stage_count < self._num_stages:
......@@ -132,12 +133,11 @@ class LinearCosineLR(_LRScheduler):
step_count = self._step_count - self._last_steps
if step_count % self._decay_step == 0:
linear_decay = 1. - float(step_count) / self._max_steps
cosine_decay= 0.5 * (1. + math.cos(
cosine_decay = 0.5 * (1. + math.cos(
math.pi * step_count / self._max_steps))
decay_factor = linear_decay * cosine_decay
self._last_lr = self._lr_min + (
self._lr_max - self._lr_min
) * decay_factor
self._last_lr = \
self._lr_min + (self._lr_max - self._lr_min) * decay_factor
return self._last_lr
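A standalone sketch of the decay rules above, with assumed step counts and learning rates; CosineLR uses the cosine factor alone, while LinearCosineLR multiplies it by a linear term:
import math

lr_max, lr_min, max_steps = 0.02, 0.0, 90000  # assumed values
for step in (0, 30000, 60000, 90000):
    cosine_decay = 0.5 * (1. + math.cos(math.pi * step / max_steps))
    linear_decay = 1. - float(step) / max_steps
    cosine_lr = lr_min + (lr_max - lr_min) * cosine_decay
    linear_cosine_lr = lr_min + (lr_max - lr_min) * linear_decay * cosine_decay
    print(step, round(cosine_lr, 6), round(linear_cosine_lr, 6))
# 0: 0.02 / 0.02, 30000: 0.015 / 0.01, 60000: 0.005 / ~0.001667, 90000: 0.0 / 0.0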
......
......@@ -33,11 +33,11 @@ class SGDSolver(object):
momentum=cfg.SOLVER.MOMENTUM,
weight_decay=cfg.SOLVER.WEIGHT_DECAY,
clip_norm=float(cfg.SOLVER.CLIP_NORM),
scale=1. / cfg.SOLVER.LOSS_SCALING,
scale=1.0 / cfg.SOLVER.LOSS_SCALING,
)
self.lr_scheduler = lr_scheduler.get_scheduler()
def one_step(self):
def step(self):
def add_loss(x, y):
return y if x is None else x + y
......@@ -59,10 +59,10 @@ class SGDSolver(object):
if k not in stats['loss']:
stats['loss'][k] = 0.
total_loss = add_loss(total_loss, v)
stats['loss'][k] += float(v) * loss_scaling
if loss_scaling != 1.:
total_loss *= loss_scaling
stats['loss'][k] += float(v)
stats['loss']['total'] += float(total_loss)
if loss_scaling != 1.0:
total_loss *= loss_scaling
total_loss.backward()
# Apply Update
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################
"""A simple attribute dictionary used for representing configuration options."""
from __future__ import absolute_import
......@@ -44,9 +43,8 @@ class AttrDict(dict):
self[name] = value
else:
raise AttributeError(
'Attempted to set "{}" to "{}", but AttrDict is immutable'.
format(name, value)
)
'Attempted to set "{}" to "{}", but AttrDict is immutable'
.format(name, value))
def immutable(self, is_immutable):
"""Set immutability to is_immutable and recursively apply the setting
......
......@@ -20,21 +20,15 @@ from __future__ import print_function
import numpy as np
from seetadet.core.config import cfg
from seetadet.utils.image import distort_image
from seetadet.utils.image import resize_image
def im_list_to_blob(ims):
"""Convert a list of images into a network input.
Assume that images are not mean-subtracted and are in BGR order.
"""
def im_list_to_blob(ims, coarsest_stride=0):
"""Convert a list of images into a network input."""
blob_dtype = 'uint8' if ims[0].dtype == 'uint8' else 'float32'
max_shape = np.array([im.shape for im in ims]).max(axis=0)
if cfg.MODEL.COARSEST_STRIDE > 0:
stride = float(cfg.MODEL.COARSEST_STRIDE)
if coarsest_stride > 0:
stride = coarsest_stride
max_shape[0] = int(np.ceil(max_shape[0] / stride) * stride)
max_shape[1] = int(np.ceil(max_shape[1] / stride) * stride)
......@@ -48,48 +42,3 @@ def im_list_to_blob(ims):
blob[i, :im.shape[0], :im.shape[1], :] = im
return blob
def mask_list_to_blob(masks):
"""Convert a list of masks into a network input."""
max_shape = np.array([mask.shape[1:] for mask in masks]).max(axis=0)
num_masks = np.array([mask.shape[0] for mask in masks]).sum()
blob_shape = (num_masks, max_shape[0], max_shape[1])
blob = np.zeros(blob_shape, 'uint8')
count = 0
for mask in masks:
n, h, w = mask.shape
blob[count:count + n, :h, :w] = mask
count += n
return blob
def prep_im_for_blob(img, target_size, max_size):
"""Scale an image for use in a blob."""
im_shape, jitter = img.shape, 1.
if cfg.TRAIN.USE_COLOR_JITTER:
img = distort_image(img)
if max_size > 0:
# Scale image along the shortest side
im_size_min = np.min(im_shape[:2])
im_size_max = np.max(im_shape[:2])
im_scale = float(target_size) / float(im_size_min)
# Prevent the biggest axis from being more than MAX_SIZE
if np.round(im_scale * im_size_max) > max_size:
im_scale = float(max_size) / float(im_size_max)
else:
# Scale image along the longest side
im_size_max = np.max(im_shape[:2])
im_scale = float(target_size) / float(im_size_max)
r = cfg.TRAIN.RANDOM_SCALES
jitter = r[0] + np.random.rand() * (r[1] - r[0])
im_scale *= jitter
return resize_image(img, im_scale, im_scale), im_scale
......@@ -7,10 +7,7 @@
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# Codes are based on:
#
# ------------------------------------------------------------
"""Box utilities for original coordinates."""
from __future__ import absolute_import
......@@ -70,11 +67,10 @@ def bbox_centerness(ex_rois, gt_rois):
return centerness, keep_inds, discard_inds
def bbox_transform_inv(boxes, deltas, weights=(1., 1., 1., 1.)):
def bbox_transform_inv(boxes, deltas, weights=(1., 1., 1., 1.), clip=None):
"""Decode the final boxes according to the deltas."""
if boxes.shape[0] == 0:
return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype)
boxes = boxes.astype(deltas.dtype, copy=False)
widths = boxes[:, 2] - boxes[:, 0] + 1.
......@@ -88,6 +84,12 @@ def bbox_transform_inv(boxes, deltas, weights=(1., 1., 1., 1.)):
dw = deltas[:, 2::4] / ww
dh = deltas[:, 3::4] / wh
# Heuristically clip the height and width deltas
# to avoid overly large values in np.exp(...)
if clip is not None:
dw = np.minimum(dw, clip)
dh = np.minimum(dh, clip)
pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis]
pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis]
pred_w = np.exp(dw) * widths[:, np.newaxis]
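A small numeric illustration of why the clip above matters; the clip value here is an assumption (log(1000 / 16) is a common choice):
import numpy as np

dw = np.array([0.3, 25.0])            # the second delta is pathologically large
clip = np.log(1000. / 16.)            # ~4.135, assumed bound
print(np.exp(dw))                     # [~1.35, ~7.2e10] -> decoded widths explode
print(np.exp(np.minimum(dw, clip)))   # [~1.35, 62.5]    -> widths stay bounded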
......@@ -126,31 +128,19 @@ def clip_tiled_boxes(boxes, im_shape):
return boxes
def dismantle_boxes(gt_boxes, num_images):
"""Dismantle the packed ground-truth boxes."""
return [
gt_boxes[
np.where(gt_boxes[:, -1].astype(np.int32) == i)[0]
][:, :-1] for i in range(num_images)
]
def expand_boxes(boxes, scale):
"""Expand an array of boxes by a given scale."""
w_half = (boxes[:, 2] - boxes[:, 0]) * .5
h_half = (boxes[:, 3] - boxes[:, 1]) * .5
x_c = (boxes[:, 2] + boxes[:, 0]) * .5
y_c = (boxes[:, 3] + boxes[:, 1]) * .5
w_half *= scale
h_half *= scale
boxes_exp = np.zeros(boxes.shape)
boxes_exp[:, 0] = x_c - w_half
boxes_exp[:, 2] = x_c + w_half
boxes_exp[:, 1] = y_c - h_half
boxes_exp[:, 3] = y_c + h_half
return boxes_exp
......@@ -162,6 +152,15 @@ def flip_boxes(boxes, width):
return boxes_flipped
def flip_polygons(polygons, width):
"""Flip the polygons horizontally."""
for i, poly in enumerate(polygons):
poly_flipped = poly.copy()
poly_flipped[0::2] = width - poly[0::2] - 1
polygons[i] = poly_flipped
return polygons
def filter_boxes(boxes, min_size):
"""Remove all boxes with any side smaller than min size."""
ws = boxes[:, 2] - boxes[:, 0] + 1
......
......@@ -7,10 +7,7 @@
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# Codes are based on:
#
# ------------------------------------------------------------
"""Box utilities for normalized coordinates."""
from __future__ import absolute_import
......@@ -21,7 +18,7 @@ import numpy as np
def boxes_area(boxes):
"""Compute the area of an array of boxes."""
"""Compute the area of input boxes."""
w = (boxes[:, 2] - boxes[:, 0])
h = (boxes[:, 3] - boxes[:, 1])
area = w * h
......@@ -29,10 +26,10 @@ def boxes_area(boxes):
return area
def intersection(boxes1, boxes2):
"""Compute pairwise intersection areas between boxes."""
[y_min1, x_min1, y_max1, x_max1] = np.split(boxes1, 4, axis=1)
[y_min2, x_min2, y_max2, x_max2] = np.split(boxes2, 4, axis=1)
def intersection(box1, box2):
"""Compute intersection between boxes."""
[y_min1, x_min1, y_max1, x_max1] = np.split(box1, 4, axis=1)
[y_min2, x_min2, y_max2, x_max2] = np.split(box2, 4, axis=1)
all_pairs_min_ymax = np.minimum(y_max1, np.transpose(y_max2))
all_pairs_max_ymin = np.maximum(y_min1, np.transpose(y_min2))
all_pairs_min_xmax = np.minimum(x_max1, np.transpose(x_max2))
......@@ -46,25 +43,25 @@ def intersection(boxes1, boxes2):
return inter_heights * inter_widths
def ioa1(boxes1, boxes2):
"""Computes pairwise intersection-over-area between box collections."""
inter = intersection(boxes1, boxes2)
area = np.expand_dims(boxes_area(boxes1), axis=1)
def ioa1(box1, box2):
"""Compute intersection-over-area1 between boxes."""
inter = intersection(box1, box2)
area = np.expand_dims(boxes_area(box1), axis=1)
return inter / area
def ioa2(boxes1, boxes2):
"""Computes pairwise intersection-over-area between box collections."""
inter = intersection(boxes1, boxes2)
area = np.expand_dims(boxes_area(boxes2), axis=0)
def ioa2(box1, box2):
"""Compute intersection-over-area2 between boxes."""
inter = intersection(box1, box2)
area = np.expand_dims(boxes_area(box2), axis=0)
return inter / area
def iou(boxes1, boxes2):
"""Computes pairwise intersection-over-union between box collections."""
inter = intersection(boxes1, boxes2)
area1 = boxes_area(boxes1)
area2 = boxes_area(boxes2)
def iou(box1, box2):
"""Compute intersection-over-union between boxes."""
inter = intersection(box1, box2)
area1 = boxes_area(box1)
area2 = boxes_area(box2)
union = (np.expand_dims(area1, axis=1) +
np.expand_dims(area2, axis=0) - inter)
return inter / union
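A quick usage check of the pairwise IoU above; the box values are illustrative, in [ymin, xmin, ymax, xmax] normalized coordinates:
import numpy as np

box1 = np.array([[0.00, 0.00, 0.50, 0.50]])  # area 0.25
box2 = np.array([[0.25, 0.25, 0.75, 0.75]])  # area 0.25, intersection 0.0625
print(iou(box1, box2))  # [[0.14285714]] == 0.0625 / (0.25 + 0.25 - 0.0625)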
......@@ -24,8 +24,23 @@ from dragon.vm import torch
from seetadet.core.config import cfg
def freeze_module(module):
"""Freeze parameters of given module.
Parameters
----------
module : dragon.vm.torch.nn.Module
The module whose parameters will be frozen.
"""
for param in list(module._parameters.keys()):
module._parameters[param].requires_grad = False
module._buffers[param] = module._parameters[param]
del module._parameters[param]
def get_param_groups(module):
"""Separate parameters according to weight decay.
"""Separate parameters for different weight decay.
Parameters
----------
......@@ -39,15 +54,20 @@ def get_param_groups(module):
"""
param_groups = [
{'params': []}, # Decayed always
{'params': [], 'weight_decay': -1.}
{'params': [], 'weight_decay': cfg.SOLVER.WEIGHT_DECAY},
{'params': [], 'weight_decay': 0.},
{'params': [], 'weight_decay': cfg.SOLVER.WEIGHT_DECAY_BIAS},
]
legacy_biases = set()
for name, param in module.named_parameters():
if name.endswith('weight') and param.dim() > 1:
legacy_biases.add(name[:-6] + 'bias')
for name, param in module.named_parameters():
gi = 0 if 'weight' in name and param.dim() > 1 else 1
if gi > 0 and name in legacy_biases:
gi = 2
param_groups[gi]['params'].append(param)
if len(param_groups[1]['params']) == 0:
param_groups.pop() # Remove empty group
return param_groups
return list(filter(lambda g: len(g['params']) > 0, param_groups))
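A hedged usage sketch of the grouping above: weights with dim > 1 go to the SOLVER.WEIGHT_DECAY group, other parameters to the zero-decay group, and biases paired with such a weight to the SOLVER.WEIGHT_DECAY_BIAS group. The 'detector' module below is a hypothetical, not from this commit:
param_groups = get_param_groups(detector)  # 'detector' is a hypothetical module
for i, group in enumerate(param_groups):
    print(i, len(group['params']), group['weight_decay'])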
def load_library(library_prefix):
......
......@@ -13,8 +13,8 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import cv2
import numpy as np
import numpy.random as npr
import PIL.Image
import PIL.ImageEnhance
......@@ -22,79 +22,97 @@ from seetadet.core.config import cfg
def distort_image(img):
"""Distort the brightness, contrast and color of an image."""
img = PIL.Image.fromarray(img)
transforms = [
PIL.ImageEnhance.Brightness,
PIL.ImageEnhance.Contrast,
PIL.ImageEnhance.Color,
]
transforms = [PIL.ImageEnhance.Brightness,
PIL.ImageEnhance.Contrast,
PIL.ImageEnhance.Color]
npr.shuffle(transforms)
for transform in transforms:
if np.random.uniform() < 0.5:
if npr.uniform() < 0.5:
img = transform(img)
img = img.enhance(1. + np.random.uniform(-.4, .4))
return np.array(img)
def get_image_with_target_size(target_size, img):
def get_image_with_target_size(img, target_size, no_offset=False):
"""Crop or pad an image with the target size."""
im_shape = list(img.shape)
height_diff = target_size[0] - im_shape[0]
width_diff = target_size[1] - im_shape[1]
if not isinstance(target_size, (tuple, list)):
target_size = [target_size, target_size]
h_diff = target_size[0] - im_shape[0]
w_diff = target_size[1] - im_shape[1]
ofs_crop_width = np.random.randint(max(-width_diff, 0) + 1)
ofs_pad_width = np.random.randint(max(width_diff, 0) + 1)
ofs_crop_height = np.random.randint(max(-height_diff, 0) + 1)
ofs_pad_height = np.random.randint(max(height_diff, 0) + 1)
def get_param(diff, crop, no_offset):
diff = max(-diff if crop else diff, 0)
return 0 if no_offset else npr.randint(diff + 1)
offset_crop_w = get_param(w_diff, True, no_offset)
offset_crop_h = get_param(h_diff, True, no_offset)
im_shape[:2] = target_size
new_img = np.empty(im_shape, dtype=img.dtype)
new_img[:] = cfg.PIXEL_MEANS
new_img[ofs_pad_height:ofs_pad_height + img.shape[0],
ofs_pad_width:ofs_pad_width + img.shape[1]] = \
img[ofs_crop_height:ofs_crop_height + target_size[0],
ofs_crop_width:ofs_crop_width + target_size[1]]
return new_img, (
ofs_pad_width - ofs_crop_width,
ofs_pad_height - ofs_crop_height,
target_size,
)
new_img[:img.shape[0], :img.shape[1]] = \
img[offset_crop_h:offset_crop_h + target_size[0],
offset_crop_w:offset_crop_w + target_size[1]]
def resize_image(img, fx=1, fy=1):
return cv2.resize(
img,
dsize=None,
fx=fx, fy=fy,
interpolation=cv2.INTER_LINEAR,
)
offset_w = -offset_crop_w
offset_h = -offset_crop_h
return new_img, (offset_h, offset_w, target_size)
def scale_image(img):
processed_ims, ims_scales = [], []
if cfg.TEST.MAX_SIZE > 0:
def resize_image(img, fx=1.0, fy=1.0, size=None):
"""Resize an image."""
if size is None:
size = (int(img.shape[1] * fx), int(img.shape[0] * fy))
else:
if not isinstance(size, (tuple, list)):
size = (size, size)
img = PIL.Image.fromarray(img)
return np.array(img.resize(size, PIL.Image.BILINEAR))
def resize_image_with_target_size(
img,
target_size,
max_size=0,
random_scales=(1.0, 1.0),
):
"""Resize an image with the target size."""
im_shape = img.shape
# Scale along the shortest side
im_size_min = np.min(im_shape[:2])
im_size_max = np.max(im_shape[:2])
im_scale = float(target_size) / float(im_size_min)
if max_size > 0:
# Prevent the biggest axis from being more than MAX_SIZE
if np.round(im_scale * im_size_max) > max_size:
im_scale = float(max_size) / float(im_size_max)
# Apply the scale jitter to get a range of dynamic scales
r = random_scales
jitter = r[0] + npr.rand() * (r[1] - r[0])
im_scale *= jitter
return resize_image(img, im_scale, im_scale), im_scale
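A worked example of the scale selection above with assumed sizes (jitter disabled, i.e. random_scales=(1.0, 1.0)):
h, w = 480, 1280                  # assumed input image size
target_size, max_size = 600, 1000
im_scale = float(target_size) / min(h, w)                # 600 / 480 = 1.25
if max_size > 0 and round(im_scale * max(h, w)) > max_size:
    im_scale = float(max_size) / max(h, w)               # capped: 1000 / 1280 = 0.78125
print(im_scale, int(h * im_scale), int(w * im_scale))    # 0.78125 375 1000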
def scale_image(img, scales, max_size=0):
"""Resize image to match the detecting scales."""
processed_images, image_scales = [], []
if max_size > 0:
im_size_min = np.min(img.shape[:2])
im_size_max = np.max(img.shape[:2])
for target_size in cfg.TEST.SCALES:
for target_size in scales:
im_scale = float(target_size) / float(im_size_min)
if np.round(im_scale * im_size_max) > cfg.TEST.MAX_SIZE:
im_scale = float(cfg.TEST.MAX_SIZE) / float(im_size_max)
processed_ims.append(cv2.resize(
img,
dsize=None,
fx=im_scale, fy=im_scale,
interpolation=cv2.INTER_LINEAR))
ims_scales.append(im_scale)
processed_images.append(resize_image(img, im_scale, im_scale))
image_scales.append(im_scale)
else:
# Scale image into a square
for target_size in cfg.TEST.SCALES:
im_scale_h = float(target_size) / img.shape[0]
im_scale_w = float(target_size) / img.shape[1]
processed_ims.append(cv2.resize(
img,
dsize=(target_size, target_size),
interpolation=cv2.INTER_LINEAR))
ims_scales.append([im_scale_h, im_scale_w])
return processed_ims, ims_scales
for target_size in scales:
fy = float(target_size) / img.shape[0]
fx = float(target_size) / img.shape[1]
processed_images.append(resize_image(img, size=target_size))
image_scales.append([fy, fx])
return processed_images, image_scales
......@@ -7,11 +7,8 @@
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# Codes are based on:
#
# <https://github.com/msracver/FCIS/blob/master/lib/mask/mask_transform.py>
#
# ------------------------------------------------------------
"""Mask utilities with boxes."""
from __future__ import absolute_import
from __future__ import division
......@@ -21,38 +18,55 @@ import cv2
import numpy as np
import PIL.Image
from seetadet.utils.pycocotools import mask as mask_tools
from seetadet.utils import boxes as box_util
def dismantle_masks(gt_boxes, gt_masks, num_images):
"""Dismantle the packed ground-truth boxes."""
return ([gt_boxes[np.where(gt_boxes[:, -1].astype(np.int32) == i)[0]][:, :-1]
for i in range(num_images)],
[gt_masks[np.where(gt_boxes[:, -1].astype(np.int32) == i)[0]]
for i in range(num_images)])
def intersect_box_mask(ex_box, gt_box, gt_mask):
x1 = max(ex_box[0], gt_box[0])
y1 = max(ex_box[1], gt_box[1])
x2 = min(ex_box[2], gt_box[2])
y2 = min(ex_box[3], gt_box[3])
def warp_mask_via_intersection(mask, box1, box2, size):
"""Warp mask via intersection."""
x1 = max(box1[0], box2[0])
y1 = max(box1[1], box2[1])
x2 = min(box1[2], min(box2[2], mask.shape[1] - 1))
y2 = min(box1[3], min(box2[3], mask.shape[0] - 1))
if x1 > x2 or y1 > y2:
return None
w = x2 - x1 + 1
h = y2 - y1 + 1
ex_start_y = y1 - ex_box[1]
ex_start_x = x1 - ex_box[0]
inter_mask = gt_mask[y1:y2 + 1, x1:x2 + 1]
target_h = ex_box[3] - ex_box[1] + 1
target_w = ex_box[2] - ex_box[0] + 1
reg_target = np.zeros((target_h, target_w), dtype=np.uint8)
reg_target[ex_start_y:ex_start_y + h,
ex_start_x:ex_start_x + w] = inter_mask
return reg_target
ex_start_y = y1 - box1[1]
ex_start_x = x1 - box1[0]
inter_mask = mask[y1:y2 + 1, x1:x2 + 1]
target_h = box1[3] - box1[1] + 1
target_w = box1[2] - box1[0] + 1
warped_mask = np.zeros((target_h, target_w), dtype=mask.dtype)
warped_mask[ex_start_y:ex_start_y + h,
ex_start_x:ex_start_x + w] = inter_mask
if not isinstance(size, (tuple, list)):
size = (size, size)
mask = PIL.Image.fromarray(warped_mask)
return np.array(mask.resize((size[1], size[0]), PIL.Image.NEAREST))
def warp_mask_via_polygons(polygons, box, size):
"""Warp mask via polygons."""
w = np.maximum(box[2] - box[0], 1)
h = np.maximum(box[3] - box[1], 1)
if not isinstance(size, (tuple, list)):
size = (size, size)
polygons_norm = []
for poly in polygons:
p = np.array(poly, dtype=np.float32)
p[0::2] = (p[0::2] - box[0]) * size[1] / w
p[1::2] = (p[1::2] - box[1]) * size[0] / h
polygons_norm.append(p)
rle = mask_tools.frPyObjects(polygons_norm, size[0], size[1])
mask = np.array(mask_tools.decode(rle))
mask = np.sum(mask, axis=2)
mask = np.array(mask > 0)
return mask
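A hypothetical usage of warp_mask_via_polygons (the polygon and RoI below are made up): the polygon is shifted into the RoI frame, rasterized at the target size, and returned as a boolean mask:
import numpy as np

polygons = [np.array([10., 10., 40., 10., 40., 40., 10., 40.])]  # a square, xy-interleaved
roi = np.array([5., 5., 45., 45.])                               # box enclosing it
mask = warp_mask_via_polygons(polygons, roi, size=28)
print(mask.shape, mask.dtype)  # (28, 28) bool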
def mask_overlap(box1, box2, mask1, mask2):
"""Compute the overlap of two masks."""
x1 = max(box1[0], box2[0])
y1 = max(box1[1], box2[1])
x2 = min(box1[2], box2[2])
......@@ -120,11 +134,11 @@ def project_masks(
else:
raise ValueError('Unknown data format', data_format)
mask_image = np.zeros(mask_shape, 'uint8', data_order)
M = masks[0].shape[0]
scale = (M + 2.) / M
size = masks[0].shape[0]
scale = (size + 2.) / size
ref_boxes = box_util.expand_boxes(boxes, scale)
ref_boxes = ref_boxes.astype(np.int32)
padded_mask = np.zeros((M + 2, M + 2), 'float32')
padded_mask = np.zeros((size + 2, size + 2), 'float32')
for i in range(num_pred):
ref_box = ref_boxes[i, :4]
mask = masks[i]
......@@ -148,26 +162,3 @@ def project_masks(
mask[(y1 - ref_box[1]):(y2 - ref_box[1]),
(x1 - ref_box[0]):(x2 - ref_box[0])]
return mask_image
def resize_mask(mask, size):
"""Resize the mask with nearest neighbor method.
PIL implementation while not OpenCV is used,
as we found the former will provide higher mask AP.
Parameters
----------
mask : numpy.ndarray
The 2d mask array.
size : Sequence[int]
The output width and height.
Returns
-------
numpy.ndarray
The resizing mask.
"""
mask = PIL.Image.fromarray(mask)
return np.array(mask.resize(size, PIL.Image.NEAREST))
\ No newline at end of file
......@@ -20,42 +20,32 @@ from dragon.vm import torch
from seetadet.modules import nn
def dense_conv_flops(m, inputs, output):
"""Hook to compute flops for a dense convolution."""
def conv_flops(m, inputs, output):
"""Hook to compute flops for a convolution."""
_ = locals() # Unused
k_dim = functools.reduce(operator.mul, m.kernel_size)
out_dim = functools.reduce(operator.mul, output.shape[2:])
in_c, out_c = inputs[0].shape[1], output.shape[1]
out_c, in_c = m.weight.shape[:2]
m.__params__ = (k_dim * in_c + (1 if m.bias else 0)) * out_c
m.__flops__ = m.__params__ * out_dim
def depthwise_conv_flops(m, inputs, output):
"""Hook to compute flops for a depthwise convolution."""
k_dim = functools.reduce(operator.mul, m.kernel_size)
out_dim = functools.reduce(operator.mul, output.shape[2:])
out_c = output.shape[1]
m.__params__ = (k_dim + (1 if m.bias else 0)) * out_c
m.__flops__ = m.__params__ * out_dim
def register_flops(module):
"""Register hooks to collect flops info."""
if not hasattr(module, '__flops__'):
module.__flops__ = 0.
for m in module.modules():
if isinstance(m, nn.DepthwiseConv2d):
m.register_forward_hook(depthwise_conv_flops)
elif isinstance(m, nn.Conv2d):
m.register_forward_hook(dense_conv_flops)
if isinstance(m, nn.Conv2d):
m.register_forward_hook(conv_flops)
def collect_flops(module, normalizer=1e6):
"""Collect flops from the last forward."""
total_flops = 0.
total_flops = 0.0
for m in module.modules():
if hasattr(m, '__flops__'):
total_flops += m.__flops__
m.__flops__ = 0.
m.__flops__ = 0.0
return total_flops / normalizer
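A quick arithmetic check of the hook above for a single assumed 3x3 convolution (all sizes are illustrative):
k_dim = 3 * 3              # functools.reduce(operator.mul, kernel_size)
out_c, in_c = 128, 64      # m.weight.shape[:2]; in_c is typically 1 for depthwise convs
out_dim = 56 * 56          # spatial elements of the output feature map
params = (k_dim * in_c + 1) * out_c  # +1 accounts for the bias term
flops = params * out_dim
print(params, flops / 1e6)  # 73856 params, ~231.6 MFLOPs (normalizer=1e6)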
......
......@@ -544,8 +544,8 @@ class Params:
self.imgIds = []
self.catIds = []
# np.arange causes trouble: the data points it produces can be slightly larger than the true values
self.iouThrs = np.linspace(.5, 0.95, np.round((0.95 - .5) / .05) + 1, endpoint=True)
self.recThrs = np.linspace(.0, 1.00, np.round((1.00 - .0) / .01) + 1, endpoint=True)
self.iouThrs = np.linspace(.5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
self.recThrs = np.linspace(.0, 1.00, int(np.round((1.00 - .0) / .01)) + 1, endpoint=True)
self.maxDets = [1, 10, 100]
self.areaRng = [[0 ** 2, 1e5 ** 2], [0 ** 2, 32 ** 2], [32 ** 2, 96 ** 2], [96 ** 2, 1e5 ** 2]]
self.areaRngLbl = ['all', 'small', 'medium', 'large']
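A minimal illustration of the arange comment above; whether the endpoint survives depends on floating-point rounding, while linspace with an integer count hits it exactly:
import numpy as np

thrs = np.arange(0.5, 0.95 + 0.05, 0.05)
print(len(thrs), thrs[-1])  # the last value may come out slightly above 0.95
thrs = np.linspace(0.5, 0.95, int(np.round((0.95 - 0.5) / 0.05)) + 1, endpoint=True)
print(len(thrs), thrs[-1])  # always 10 values ending exactly at 0.95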
......@@ -555,8 +555,8 @@ class Params:
self.imgIds = []
self.catIds = []
# np.arange causes trouble: the data points it produces can be slightly larger than the true values
self.iouThrs = np.linspace(.5, 0.95, np.round((0.95 - .5) / .05) + 1, endpoint=True)
self.recThrs = np.linspace(.0, 1.00, np.round((1.00 - .0) / .01) + 1, endpoint=True)
self.iouThrs = np.linspace(.5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
self.recThrs = np.linspace(.0, 1.00, int(np.round((1.00 - .0) / .01)) + 1, endpoint=True)
self.maxDets = [20]
self.areaRng = [[0 ** 2, 1e5 ** 2], [32 ** 2, 96 ** 2], [96 ** 2, 1e5 ** 2]]
self.areaRngLbl = ['all', 'medium', 'large']
......
__author__ = 'tsungyi'
import seetadet.pycocotools._mask as _mask
import seetadet.utils.pycocotools._mask as _mask
# Interface for manipulating masks stored in RLE format.
#
......
......@@ -15,8 +15,8 @@ from __future__ import print_function
import numpy as np
from seetadet.pycocotools import mask as mask_tools
from seetadet.pycocotools.mask import frPyObjects
from seetadet.utils.pycocotools import mask as mask_tools
from seetadet.utils.pycocotools.mask import frPyObjects
def poly2rle(poly, height, width):
......@@ -127,8 +127,7 @@ def bytes2img(data, height, width):
raise ValueError(
'{} instances are found in data.\n'
'Merge them before compressing.'
.format(mask_image.shape[2])
)
.format(mask_image.shape[2]))
return mask_image[:, :, 0]
......
......@@ -32,11 +32,39 @@ class SmoothedValue(object):
self.count += 1
self.total += value
def get_median(self):
def average(self):
return np.mean(self.deque)
def global_average(self):
return self.total / self.count
def median(self):
return np.median(self.deque)
def get_average(self):
return np.mean(self.deque)
def get_global_average(self):
class ExponentialMovingAverage(object):
"""Track a series of values and provide EMA report."""
def __init__(self, decay=0.9):
self.value = None
self.decay = decay
self.total = 0.0
self.count = 0
def add_value(self, value):
if self.value is None:
self.value = value
else:
self.value = (self.decay * self.value +
(1.0 - self.decay) * value)
self.total += value
self.count += 1
def global_average(self):
return self.total / self.count
def running_average(self):
return float(self.value)
def __float__(self):
return self.running_average()
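A short usage example of the tracker above (the value series is made up), contrasting the running (EMA) and global averages:
ema = ExponentialMovingAverage(decay=0.9)
for value in (100., 200., 300.):
    ema.add_value(value)
print(float(ema))            # 129.0 (EMA: 100 -> 110 -> 129)
print(ema.global_average())  # 200.0 (plain mean of all added values)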
......@@ -7,10 +7,6 @@
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# Codes are based on:
#
# <https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/utils/timer.py>
#
# ------------------------------------------------------------
from __future__ import absolute_import
......@@ -62,14 +58,14 @@ class Timer(object):
return self.diff
def get_progress_info(timer, curr_step, max_steps):
def get_progress_info(timer, step, max_steps):
"""Return a info of current progress.
Parameters
----------
timer : Timer
The timer to get progress.
curr_step : int
step : int
The current step.
max_steps : int
The total number of steps.
......@@ -81,9 +77,9 @@ def get_progress_info(timer, curr_step, max_steps):
"""
average_time = timer.average_time
eta_seconds = average_time * (max_steps - curr_step)
eta_seconds = average_time * (max_steps - step)
eta = str(datetime.timedelta(seconds=int(eta_seconds)))
progress = (curr_step + 1.) / max_steps
progress = (step + 1.) / max_steps
return ('< PROGRESS: {:.2%} | SPEED: {:.3f}s / iter | ETA: {} >'
.format(progress, timer.average_time, eta))
......
......@@ -17,7 +17,6 @@
# <https://github.com/facebookresearch/Detectron/blob/master/detectron/utils/vis.py>
#
##############################################################################
"""Detection output visualization module."""
from __future__ import absolute_import
......@@ -169,7 +168,7 @@ def vis_one_image(
max(boxes[:, -1]) < thresh:
return
im, mask, masks = im[:, :, ::-1], None, None
im, masks = im[:, :, ::-1], None
if segms is not None and len(segms) > 0:
masks = get_mask(boxes, segms, im.shape[:2])
......@@ -245,7 +244,7 @@ def vis_one_image(
contours = results[0] if len(results) == 2 else results[1]
if show_rotated and len(contours) > 1:
counters = [max(contours, key=cv2.contourArea)]
contours = [max(contours, key=cv2.contourArea)]
for c in contours:
if show_rotated:
......
......@@ -21,6 +21,16 @@ import sys
import subprocess
# Read the current version info
with open('version.txt', 'r') as f:
version = f.read().strip()
try:
git_version = subprocess.check_output(
['git', 'rev-parse', 'HEAD'], cwd='../').decode('ascii').strip()
except (OSError, subprocess.CalledProcessError):
git_version = None
def clean():
"""Remove the work directories."""
if os.path.exists('build'):
......@@ -56,6 +66,13 @@ def configure():
os.remove(dest)
shutil.copy(os.path.join('csrc/install', src), dest)
shutil.rmtree('csrc/install')
# Write the version file.
with open('seetadet/version.py', 'w') as f:
f.write("from __future__ import absolute_import\n"
"from __future__ import division\n"
"from __future__ import print_function\n\n"
"version = '{}'\n"
"git_version = '{}'\n".format(version, git_version))
class install(setuptools.command.install.install):
......@@ -88,16 +105,15 @@ def find_package_data():
configure()
setuptools.setup(
name='seeta-det',
version='0.4.0',
version=version,
description='SeetaDet: A platform implementing popular object detection algorithms.',
url='https://gitlab.seetatech.com/seetaresearch/SeetaDet',
url='https://gitlab.seetatech.com/seetaresearch/seetadet',
author='SeetaTech',
license='BSD 2-Clause',
packages=find_packages(),
package_data={'seetadet': find_package_data()},
package_dir={'seetadet': 'seetadet'},
cmdclass={'install': install},
install_requires=['opencv-python', 'Pillow'],
classifiers=[
'Development Status :: 5 - Production/Stable',
'Intended Audience :: Developers',
......@@ -105,13 +121,11 @@ setuptools.setup(
'Intended Audience :: Science/Research',
'License :: OSI Approved :: BSD License',
'Programming Language :: C++',
'Programming Language :: Python',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3 :: Only',
'Topic :: Scientific/Engineering',
'Topic :: Scientific/Engineering :: Mathematics',
'Topic :: Scientific/Engineering :: Artificial Intelligence',
'Topic :: Software Development',
'Topic :: Software Development :: Libraries',
'Topic :: Software Development :: Libraries :: Python Modules',
],
)
clean()
......@@ -7,4 +7,4 @@
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
\ No newline at end of file
# ------------------------------------------------------------
......@@ -19,7 +19,6 @@ import argparse
import dragon.vm.torch as torch
import pprint
sys.path.insert(0, '..')
from seetadet import onnx as _
from seetadet.core.config import cfg
from seetadet.core.coordinator import Coordinator
......@@ -28,25 +27,43 @@ from seetadet.utils import logger
def parse_args():
"""Parse input arguments"""
"""Parse arguments"""
parser = argparse.ArgumentParser(
description='Export a detection network into the onnx model')
parser.add_argument('--cfg', dest='cfg_file',
help='optional config file',
default=None, type=str)
parser.add_argument('--exp_dir', dest='exp_dir',
help='experiment dir',
default='', type=str)
parser.add_argument('--input_shape', dest='input_shape',
help='The shape of dummy input',
default=(1, 224, 224, 3), type=tuple)
parser.add_argument(
'--cfg',
dest='cfg_file',
default=None,
help='config file')
parser.add_argument(
'--exp_dir',
default='',
help='experiment dir')
parser.add_argument(
'--iter',
type=int,
default=None,
help='iteration step of exporting checkpoint')
parser.add_argument(
'--input_shape',
nargs='+',
type=int,
default=(1, 224, 224, 3),
help='spec of input shape')
parser.add_argument(
'--opset',
type=int,
default=None,
help='opset version to export')
parser.add_argument(
'--check_model',
type=bool,
default=True,
help='whether to validate the exported model')
if len(sys.argv) == 1:
parser.print_help()
sys.exit(1)
args = parser.parse_args()
return args
return parser.parse_args()
if __name__ == '__main__':
......@@ -57,7 +74,11 @@ if __name__ == '__main__':
logger.info('Using config:\n' + pprint.pformat(cfg))
# Load the checkpoint and test engine
checkpoint, _ = coordinator.checkpoint(global_step=None, wait=True)
checkpoint, _ = coordinator.checkpoint(args.iter)
if checkpoint is None:
raise RuntimeError(
'The checkpoint of step {} does not exist.'
.format(args.iter))
# Ready to export the network
logger.info('Exporting model will be saved to `{:s}`'
......@@ -71,6 +92,8 @@ if __name__ == '__main__':
model=detector,
args={'data': data, 'ims_info': ims_info},
f=checkpoint.replace('checkpoints', 'exports')
.replace('pth', 'onnx'),
.replace('pkl', 'onnx'),
verbose=True,
opset_version=args.opset,
enable_onnx_checker=args.check_model,
)
......@@ -19,7 +19,6 @@ import argparse
import dragon
import numpy
sys.path.insert(0, '..')
from seetadet.core.config import cfg
from seetadet.core.coordinator import Coordinator
from seetadet.core.train import train_net
......@@ -28,30 +27,29 @@ from seetadet.utils import logger
def parse_args():
"""Parse input arguments."""
"""Parse arguments."""
parser = argparse.ArgumentParser(
description='Train a detection network with mpi utilities')
parser.add_argument('--cfg', dest='cfg_file',
help='config file',
default=None, type=str)
parser.add_argument('--exp_dir', dest='exp_dir',
help='experiment dir',
default='', type=str)
parser.add_argument(
'--cfg',
dest='cfg_file',
default=None,
help='config file')
parser.add_argument(
'--exp_dir',
default='',
help='experiment dir')
if len(sys.argv) == 1:
parser.print_help()
sys.exit(1)
args = parser.parse_args()
return args
return parser.parse_args()
if __name__ == '__main__':
args = parse_args()
coordinator = Coordinator(args.cfg_file, exp_dir=args.exp_dir)
checkpoint, start_iter = coordinator.checkpoint(wait=False)
checkpoint, start_iter = coordinator.checkpoint()
if checkpoint is not None:
cfg.TRAIN.WEIGHTS = checkpoint
......@@ -61,8 +59,7 @@ if __name__ == '__main__':
if cfg.NUM_GPUS != world_size:
raise ValueError(
'Expected starting of {} processes, got {}.'
.format(cfg.NUM_GPUS, world_size)
)
.format(cfg.NUM_GPUS, world_size))
# Setup the logging modules
logger.set_root_logger(world_rank == 0)
......
......@@ -13,52 +13,74 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
import argparse
import pprint
sys.path.insert(0, '..')
from seetadet.core import test_engine
from seetadet.core.config import cfg
from seetadet.core import test_server
from seetadet.core.coordinator import Coordinator
from seetadet.core.test import TestServer
from seetadet.core.config import cfg
from seetadet.datasets.factory import get_dataset
from seetadet.utils import logger
def parse_args():
"""Parse input arguments"""
"""Parse arguments"""
parser = argparse.ArgumentParser(
description='Test a detection network with a specified checkpoint')
parser.add_argument('--gpus', dest='gpus',
help='index of GPUs to use',
default=None, nargs='+', type=int)
parser.add_argument('--cfg', dest='cfg_file',
help='optional config file',
default=None, type=str)
parser.add_argument('--exp_dir', dest='exp_dir',
help='experiment dir',
default='', type=str)
parser.add_argument('--output_dir', dest='output_dir',
help='output dir',
default=None, type=str)
parser.add_argument('--iter', dest='iter', help='global step',
default=None, type=int)
parser.add_argument('--read_every', dest='read_every',
help='read every n images for testing',
default=1000, type=int)
parser.add_argument('--dump', dest='dump',
help='dump the result back to record?',
action='store_true')
parser.add_argument('--wait', dest='wait',
help='wait the checkpoint?',
action='store_true')
parser.add_argument(
'--cfg',
dest='cfg_file',
default=None,
help='config file')
parser.add_argument(
'--exp_dir',
default='',
help='experiment dir')
parser.add_argument(
'--model_dir',
default='',
help='final model dir')
parser.add_argument(
'--gpus',
nargs='+',
type=int,
default=None,
help='index of GPUs to use')
parser.add_argument(
'--iter',
type=int,
default=None,
help='test the checkpoint of a given step')
parser.add_argument(
'--last',
type=int,
default=1,
help='test the last n checkpoints')
parser.add_argument(
'--read_every',
type=int,
default=1000,
help='read every-n images for testing')
parser.add_argument(
'--log_every',
type=int,
default=100,
help='display testing progress every-n images')
parser.add_argument(
'--dump',
action='store_true',
help='dump the results back to the record or not')
parser.add_argument(
'--wait',
action='store_true',
help='wait for the checkpoint or not')
if len(sys.argv) == 1:
parser.print_help()
sys.exit(1)
args = parser.parse_args()
return args
......@@ -67,27 +89,45 @@ if __name__ == '__main__':
args = parse_args()
logger.info('Called with args:\n' + str(args))
coordinator = Coordinator(args.cfg_file, args.exp_dir)
coordinator = Coordinator(args.cfg_file, args.exp_dir or args.model_dir)
logger.info('Using config:\n' + pprint.pformat(cfg))
# Load the checkpoint and test engine
checkpoint, _ = coordinator.checkpoint(args.iter, wait=args.wait)
if checkpoint is None:
raise RuntimeError(
'The checkpoint of step {} does not exist.'
.format(args.iter)
)
# Inspect the dataset
dataset = get_dataset(cfg.TEST.DATASET)
cfg.TEST.PROTOCOL = 'dump' if args.dump else cfg.TEST.PROTOCOL
logger.info('Dataset({}): {} images will be used to test.'
.format(cfg.TEST.DATASET, dataset.num_images))
# Ready to test the network
output_dir = coordinator.results_dir(checkpoint, args.output_dir)
logger.info('Results will be saved to `{:s}`'.format(output_dir))
# Inspect the checkpoints
test_checkpoints = []
if args.model_dir:
for file in os.listdir(args.model_dir):
if file.endswith('.pkl'):
test_checkpoints.append(os.path.join(args.model_dir, file))
else:
if args.iter is not None:
checkpoint, _ = coordinator.checkpoint(args.iter, wait=True)
test_checkpoints.append(checkpoint)
else:
i = 1
while True:
checkpoint, _ = coordinator.checkpoint(last_idx=i)
if checkpoint is not None:
test_checkpoints.append(checkpoint)
i += 1
if args.last is not None and i > args.last:
break
else:
break
# Bind the server and run the test
server = TestServer(coordinator.results_dir(checkpoint))
test_engine.run_test_net(checkpoint, server, args.gpus, args.read_every)
for checkpoint in test_checkpoints:
# Create the server and run the test
output_dir = coordinator.results_dir(checkpoint)
logger.info('Results will be saved to ' + output_dir)
test_engine.run_test_net(
checkpoint=checkpoint,
server=test_server.EvaluateServer(output_dir),
devices=args.gpus,
read_every=args.read_every,
log_every=args.log_every,
)
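Note: the --model_dir branch above collects checkpoints in directory-listing order. A minimal sketch, assuming checkpoint files follow the '<prefix>_iter_<step>.pkl' naming parsed by the all-checkpoints script below, of sorting them by step before testing:

import os

def sorted_checkpoints(model_dir):
    # Minimal sketch: collect '.pkl' checkpoints and sort them by the step
    # embedded in the '<prefix>_iter_<step>.pkl' file name (an assumption
    # based on the naming parsed in the all-checkpoints script below).
    files = [f for f in os.listdir(model_dir) if f.endswith('.pkl')]
    steps = [int(f.split('_iter_')[-1].split('.')[0]) for f in files]
    order = sorted(range(len(files)), key=lambda i: steps[i], reverse=True)
    return [os.path.join(model_dir, files[i]) for i in order]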
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
import argparse
import numpy
sys.path.insert(0, '..')
from seetadet.core.coordinator import Coordinator
from seetadet.utils import logger
def parse_args():
"""Parse input arguments"""
parser = argparse.ArgumentParser(
description='Test a detection network with all checkpoints')
parser.add_argument('--cfg', dest='cfg_file',
help='optional config file',
default=None, type=str)
parser.add_argument('--exp_dir', dest='exp_dir',
help='experiment dir',
default='', type=str)
if len(sys.argv) == 1:
parser.print_help()
sys.exit(1)
args = parser.parse_args()
return args
def test(cfg_file, exp_dir, global_step):
"""Call test.py to test models on specific global step."""
import subprocess
args = '{} {} '.format(sys.executable, 'test.py')
args += '--cfg {} --exp_dir {} --iter {} '.format(
os.path.abspath(cfg_file), exp_dir, global_step)
return subprocess.call(args, shell=True)
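Note: the helper above joins the command into a single string and runs it with shell=True. A minimal sketch of the same invocation passed as an argument list, which skips shell parsing entirely (same script name and flags assumed):

import os
import subprocess
import sys

def test_with_arg_list(cfg_file, exp_dir, global_step):
    # Minimal sketch: equivalent to test() above, but built as an argument
    # list so the paths never pass through a shell.
    return subprocess.call([
        sys.executable, 'test.py',
        '--cfg', os.path.abspath(cfg_file),
        '--exp_dir', exp_dir,
        '--iter', str(global_step)])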
if __name__ == '__main__':
args = parse_args()
logger.info('Called with args:\n' + str(args))
coordinator = Coordinator(args.cfg_file, exp_dir=args.exp_dir)
global_steps = []
files = os.listdir(coordinator.checkpoints_dir())
for file in files:
step = int(file.split('_iter_')[-1].split('.')[0])
global_steps.append(step)
order = numpy.argsort(-numpy.array(global_steps))
for test_idx in order:
logger.info('Testing net at global step: {}......'
.format(global_steps[test_idx]))
logger.info(' - Using model file: {}'.format(files[test_idx]))
test(args.cfg_file, args.exp_dir, global_steps[test_idx])
......@@ -21,7 +21,6 @@ import dragon
import numpy
import pprint
sys.path.insert(0, '..')
from seetadet.core.config import cfg
from seetadet.core.coordinator import Coordinator
from seetadet.core.train import train_net
......@@ -30,22 +29,22 @@ from seetadet.utils import logger
def parse_args():
"""Parse input arguments."""
"""Parse arguments."""
parser = argparse.ArgumentParser(
description='Train a detection network')
parser.add_argument('--cfg', dest='cfg_file',
help='optional config file',
default=None, type=str)
parser.add_argument('--exp_dir', dest='exp_dir',
help='experiment dir',
default=None, type=str)
parser.add_argument(
'--cfg',
dest='cfg_file',
default=None,
help='config file')
parser.add_argument(
'--exp_dir',
default=None,
help='experiment dir')
if len(sys.argv) == 1:
parser.print_help()
sys.exit(1)
args = parser.parse_args()
return args
return parser.parse_args()
def mpi_train(cfg_file, exp_dir):
......@@ -76,10 +75,10 @@ if __name__ == '__main__':
if cfg.NUM_GPUS > 1:
# Dispatch the MPI to start a multi-nodes task
coordinator.checkpoints_dir()
mpi_train(args.cfg_file, coordinator.experiment_dir)
mpi_train(args.cfg_file, coordinator.exp_dir)
else:
# Resume training?
checkpoint, start_iter = coordinator.checkpoint(wait=False)
checkpoint, start_iter = coordinator.checkpoint()
if checkpoint is not None:
cfg.TRAIN.WEIGHTS = checkpoint
......