Fix the bug of scaling flipped box

Ting PAN
Commit e3b9b641 authored Oct 26, 2020 by Ting PAN
Showing with 215 additions and 187 deletions
configs/faster_rcnn/coco_faster_rcnn_R-50-FPN_800_1x.yml
configs/faster_rcnn/coco_faster_rcnn_R-50-FPN_800_2x.yml
configs/faster_rcnn/voc_faster_rcnn_R-50-FPN_640.yml
configs/mask_rcnn/coco_mask_rcnn_R-50-FPN_800_1x.yml
configs/mask_rcnn/coco_mask_rcnn_R-50-FPN_800_2x.yml
configs/retinanet/coco_retinanet_R-50-FPN_416_6x.yml
configs/retinanet/coco_retinanet_R-50-FPN_512_6x.yml
configs/retinanet/coco_retinanet_R-50-FPN_800_1x.yml
configs/retinanet/coco_retinanet_R-50-FPN_800_2x.yml
configs/retinanet/voc_retinanet_R-50-FPN_416.yml
configs/retinanet/voc_retinanet_R-50-FPN_512.yml
configs/ssd/voc_ssd_VGG-16_300.yml
configs/ssd/voc_ssd_VGG-16_512.yml
scripts/coco/im2rec.py
scripts/coco/maker.py
scripts/coco/maskgen.py → scripts/coco/roidb.py
seetadet/algo/faster_rcnn/data_transformer.py
seetadet/algo/mask_rcnn/data_transformer.py
seetadet/algo/mask_rcnn/proposal_target.py
seetadet/algo/ssd/data_transformer.py
--- a/configs/faster_rcnn/coco_faster_rcnn_R-50-FPN_800_1x.yml
+++ b/configs/faster_rcnn/coco_faster_rcnn_R-50-FPN_800_1x.yml
@@ -2,8 +2,8 @@ NUM_GPUS: 8
 PIXEL_STDS: [57.375, 57.12, 58.395]
 PIXEL_MEANS: [103.53, 116.28, 123.675]
 MODEL:
-  TYPE: faster_rcnn
+  TYPE: 'faster_rcnn'
-  BACKBONE: resnet50.fpn
+  BACKBONE: 'resnet50.fpn'
  CLASSES: ['__background__',
            'person', 'bicycle', 'car', 'motorcycle', 'airplane',
            'bus', 'train', 'truck', 'boat', 'traffic light',
@@ -21,24 +21,23 @@ MODEL:
            'teddy bear', 'hair drier', 'toothbrush']
 SOLVER:
  BASE_LR: 0.02
-  LR_POLICY: steps_with_decay
  DECAY_STEPS: [60000, 80000]
  MAX_STEPS: 90000
  SNAPSHOT_EVERY: 5000
-  SNAPSHOT_PREFIX: coco_faster_rcnn_R-50-FPN_800_1x
+  SNAPSHOT_PREFIX: 'coco_faster_rcnn_R-50-FPN_800_1x'
 FRCNN:
  BATCH_SIZE: 512
  ROI_XFORM_RESOLUTION: 7
 TRAIN:
  WEIGHTS: '/model/R-50.pkl'
-  DATASET: '/data/coco_2014_trainval35k'
+  DATASET: '/data/coco_2017_train'
  IMS_PER_BATCH: 2
  SCALES: [640, 672, 704, 736, 768, 800]
  MAX_SIZE: 1333
  USE_DIFF: False # Do not use crowd objects
 TEST:
-  DATASET: '/data/coco_2014_minival'
+  DATASET: '/data/coco_2017_val'
-  JSON_FILE: '/data/instances_minival2014.json'
+  JSON_FILE: '/data/instances_val2017.json'
  PROTOCOL: 'coco'
  IMS_PER_BATCH: 1
  SCALES: [800]

--- a/configs/faster_rcnn/coco_faster_rcnn_R-50-FPN_800_2x.yml
+++ b/configs/faster_rcnn/coco_faster_rcnn_R-50-FPN_800_2x.yml
@@ -2,8 +2,8 @@ NUM_GPUS: 8
 PIXEL_STDS: [57.375, 57.12, 58.395]
 PIXEL_MEANS: [103.53, 116.28, 123.675]
 MODEL:
-  TYPE: faster_rcnn
+  TYPE: 'faster_rcnn'
-  BACKBONE: resnet50.fpn
+  BACKBONE: 'resnet50.fpn'
  CLASSES: ['__background__',
            'person', 'bicycle', 'car', 'motorcycle', 'airplane',
            'bus', 'train', 'truck', 'boat', 'traffic light',
@@ -21,24 +21,23 @@ MODEL:
            'teddy bear', 'hair drier', 'toothbrush']
 SOLVER:
  BASE_LR: 0.02
-  LR_POLICY: steps_with_decay
  DECAY_STEPS: [120000, 160000]
  MAX_STEPS: 180000
  SNAPSHOT_EVERY: 5000
-  SNAPSHOT_PREFIX: coco_faster_rcnn_R-50-FPN_800_2x
+  SNAPSHOT_PREFIX: 'coco_faster_rcnn_R-50-FPN_800_2x'
 FRCNN:
  BATCH_SIZE: 512
  ROI_XFORM_RESOLUTION: 7
 TRAIN:
  WEIGHTS: '/model/R-50.pkl'
-  DATASET: '/data/coco_2014_trainval35k'
+  DATASET: '/data/coco_2017_train'
  IMS_PER_BATCH: 2
  SCALES: [640, 672, 704, 736, 768, 800]
  MAX_SIZE: 1333
  USE_DIFF: False # Do not use crowd objects
 TEST:
-  DATASET: '/data/coco_2014_minival'
+  DATASET: '/data/coco_2017_val'
-  JSON_FILE: '/data/instances_minival2014.json'
+  JSON_FILE: '/data/instances_val2017.json'
  PROTOCOL: 'coco'
  IMS_PER_BATCH: 1
  SCALES: [800]

--- a/configs/faster_rcnn/voc_faster_rcnn_R-50-FPN_640.yml
+++ b/configs/faster_rcnn/voc_faster_rcnn_R-50-FPN_640.yml
@@ -2,8 +2,8 @@ NUM_GPUS: 1
 PIXEL_STDS: [57.375, 57.12, 58.395]
 PIXEL_MEANS: [103.53, 116.28, 123.675]
 MODEL:
-  TYPE: faster_rcnn
+  TYPE: 'faster_rcnn'
-  BACKBONE: resnet50.fpn
+  BACKBONE: 'resnet50.fpn'
  CLASSES: ['__background__',
            'aeroplane', 'bicycle', 'bird', 'boat',
            'bottle', 'bus', 'car', 'cat', 'chair',
@@ -18,7 +18,7 @@ SOLVER:
  DECAY_STEPS: [80000, 100000]
  MAX_STEPS: 120000
  SNAPSHOT_EVERY: 5000
-  SNAPSHOT_PREFIX: voc_faster_rcnn_R-50-FPN_640
+  SNAPSHOT_PREFIX: 'voc_faster_rcnn_R-50-FPN_640'
 TRAIN:
  WEIGHTS: '/model/R-50.pkl'
  DATASET: '/data/voc_0712_trainval'

--- a/configs/mask_rcnn/coco_mask_rcnn_R-50-FPN_800_1x.yml
+++ b/configs/mask_rcnn/coco_mask_rcnn_R-50-FPN_800_1x.yml
@@ -2,8 +2,8 @@ NUM_GPUS: 8
 PIXEL_STDS: [57.375, 57.12, 58.395]
 PIXEL_MEANS: [103.53, 116.28, 123.675]
 MODEL:
-  TYPE: mask_rcnn
+  TYPE: 'mask_rcnn'
-  BACKBONE: resnet50.fpn
+  BACKBONE: 'resnet50.fpn'
  CLASSES: ['__background__',
            'person', 'bicycle', 'car', 'motorcycle', 'airplane',
            'bus', 'train', 'truck', 'boat', 'traffic light',
@@ -24,7 +24,7 @@ SOLVER:
  DECAY_STEPS: [60000, 80000]
  MAX_STEPS: 90000
  SNAPSHOT_EVERY: 5000
-  SNAPSHOT_PREFIX: coco_mask_rcnn_R-50-FPN_800_1x
+  SNAPSHOT_PREFIX: 'coco_mask_rcnn_R-50-FPN_800_1x'
 FRCNN:
  BATCH_SIZE: 512
  ROI_XFORM_RESOLUTION: 7
@@ -32,14 +32,14 @@ MRCNN:
  ROI_XFORM_RESOLUTION: 14
 TRAIN:
  WEIGHTS: '/model/R-50.pkl'
-  DATASET: '/data/coco_2014_trainval35k'
+  DATASET: '/data/coco_2017_train'
  IMS_PER_BATCH: 2
  SCALES: [640, 672, 704, 736, 768, 800]
  MAX_SIZE: 1333
  USE_DIFF: False # Do not use crowd objects
 TEST:
-  DATASET: '/data/coco_2014_minival'
+  DATASET: '/data/coco_2017_val'
-  JSON_FILE: '/data/instances_minival2014.json'
+  JSON_FILE: '/data/instances_val2017.json'
  PROTOCOL: 'coco'
  SCALES: [800]
  MAX_SIZE: 1333

--- a/configs/mask_rcnn/coco_mask_rcnn_R-50-FPN_800_2x.yml
+++ b/configs/mask_rcnn/coco_mask_rcnn_R-50-FPN_800_2x.yml
@@ -2,8 +2,8 @@ NUM_GPUS: 8
 PIXEL_STDS: [57.375, 57.12, 58.395]
 PIXEL_MEANS: [103.53, 116.28, 123.675]
 MODEL:
-  TYPE: mask_rcnn
+  TYPE: 'mask_rcnn'
-  BACKBONE: resnet50.fpn
+  BACKBONE: 'resnet50.fpn'
  CLASSES: ['__background__',
            'person', 'bicycle', 'car', 'motorcycle', 'airplane',
            'bus', 'train', 'truck', 'boat', 'traffic light',
@@ -24,7 +24,7 @@ SOLVER:
  DECAY_STEPS: [120000, 160000]
  MAX_STEPS: 180000
  SNAPSHOT_EVERY: 5000
-  SNAPSHOT_PREFIX: coco_mask_rcnn_R-50-FPN_800_2x
+  SNAPSHOT_PREFIX: 'coco_mask_rcnn_R-50-FPN_800_2x'
 FRCNN:
  BATCH_SIZE: 512
  ROI_XFORM_RESOLUTION: 7
@@ -32,14 +32,14 @@ MRCNN:
  ROI_XFORM_RESOLUTION: 14
 TRAIN:
  WEIGHTS: '/model/R-50.pkl'
-  DATASET: '/data/coco_2014_trainval35k'
+  DATASET: '/data/coco_2017_train'
  IMS_PER_BATCH: 2
  SCALES: [640, 672, 704, 736, 768, 800]
  MAX_SIZE: 1333
  USE_DIFF: False # Do not use crowd objects
 TEST:
-  DATASET: '/data/coco_2014_minival'
+  DATASET: '/data/coco_2017_val'
-  JSON_FILE: '/data/instances_minival2014.json'
+  JSON_FILE: '/data/instances_val2017.json'
  PROTOCOL: 'coco'
  SCALES: [800]
  MAX_SIZE: 1333

--- a/configs/retinanet/coco_retinanet_R-50-FPN_416_6x.yml
+++ b/configs/retinanet/coco_retinanet_R-50-FPN_416_6x.yml
@@ -2,8 +2,8 @@ NUM_GPUS: 8
 PIXEL_STDS: [57.375, 57.12, 58.395]
 PIXEL_MEANS: [103.53, 116.28, 123.675]
 MODEL:
-  TYPE: retinanet
+  TYPE: 'retinanet'
-  BACKBONE: resnet50.fpn
+  BACKBONE: 'resnet50.fpn'
  CLASSES: ['__background__',
            'person', 'bicycle', 'car', 'motorcycle', 'airplane',
            'bus', 'train', 'truck', 'boat', 'traffic light',
@@ -24,22 +24,21 @@ FPN:
  RPN_MAX_LEVEL: 7
 SOLVER:
  BASE_LR: 0.01
-  LR_POLICY: steps_with_decay
  DECAY_STEPS: [90000, 120000]
  MAX_STEPS: 135000
  SNAPSHOT_EVERY: 2500
-  SNAPSHOT_PREFIX: coco_retinanet_R-50-FPN_416_6x
+  SNAPSHOT_PREFIX: 'coco_retinanet_R-50-FPN_416_6x'
 PIPELINE:
  TYPE: 'ssd'
 TRAIN:
  WEIGHTS: '/model/R-50.pkl'
-  DATASET: '/data/coco_2014_trainval35k'
+  DATASET: '/data/coco_2017_train'
  IMS_PER_BATCH: 8
  SCALES: [416]
  USE_DIFF: False # Do not use crowd objects
 TEST:
-  DATASET: '/data/coco_2014_minival'
+  DATASET: '/data/coco_2017_val'
-  JSON_FILE: '/data/instances_minival2014.json'
+  JSON_FILE: '/data/instances_val2017.json'
  PROTOCOL: 'coco'
  IMS_PER_BATCH: 1
  SCALES: [416]

--- a/configs/retinanet/coco_retinanet_R-50-FPN_512_6x.yml
+++ b/configs/retinanet/coco_retinanet_R-50-FPN_512_6x.yml
@@ -2,8 +2,8 @@ NUM_GPUS: 8
 PIXEL_STDS: [57.375, 57.12, 58.395]
 PIXEL_MEANS: [103.53, 116.28, 123.675]
 MODEL:
-  TYPE: retinanet
+  TYPE: 'retinanet'
-  BACKBONE: resnet50.fpn
+  BACKBONE: 'resnet50.fpn'
  CLASSES: ['__background__',
            'person', 'bicycle', 'car', 'motorcycle', 'airplane',
            'bus', 'train', 'truck', 'boat', 'traffic light',
@@ -24,22 +24,21 @@ FPN:
  RPN_MAX_LEVEL: 7
 SOLVER:
  BASE_LR: 0.01
-  LR_POLICY: steps_with_decay
  DECAY_STEPS: [90000, 120000]
  MAX_STEPS: 135000
  SNAPSHOT_EVERY: 2500
-  SNAPSHOT_PREFIX: coco_retinanet_R-50-FPN_512_6x
+  SNAPSHOT_PREFIX: 'coco_retinanet_R-50-FPN_512_6x'
 PIPELINE:
  TYPE: 'ssd'
 TRAIN:
  WEIGHTS: '/model/R-50.pkl'
-  DATASET: '/data/coco_2014_trainval35k'
+  DATASET: '/data/coco_2017_train'
  IMS_PER_BATCH: 8
  SCALES: [512]
  USE_DIFF: False # Do not use crowd objects
 TEST:
-  DATASET: '/data/coco_2014_minival'
+  DATASET: '/data/coco_2017_val'
-  JSON_FILE: '/data/instances_minival2014.json'
+  JSON_FILE: '/data/instances_val2017.json'
  PROTOCOL: 'coco'
  IMS_PER_BATCH: 1
  SCALES: [512]

--- a/configs/retinanet/coco_retinanet_R-50-FPN_800_1x.yml
+++ b/configs/retinanet/coco_retinanet_R-50-FPN_800_1x.yml
@@ -2,8 +2,8 @@ NUM_GPUS: 8
 PIXEL_STDS: [57.375, 57.12, 58.395]
 PIXEL_MEANS: [103.53, 116.28, 123.675]
 MODEL:
-  TYPE: retinanet
+  TYPE: 'retinanet'
-  BACKBONE: resnet50.fpn
+  BACKBONE: 'resnet50.fpn'
  CLASSES: ['__background__',
            'person', 'bicycle', 'car', 'motorcycle', 'airplane',
            'bus', 'train', 'truck', 'boat', 'traffic light',
@@ -24,21 +24,20 @@ FPN:
  RPN_MAX_LEVEL: 7
 SOLVER:
  BASE_LR: 0.01
-  LR_POLICY: steps_with_decay
  DECAY_STEPS: [60000, 80000]
  MAX_STEPS: 90000
  SNAPSHOT_EVERY: 5000
-  SNAPSHOT_PREFIX: coco_retinanet_R-50-FPN_800_1x
+  SNAPSHOT_PREFIX: 'coco_retinanet_R-50-FPN_800_1x'
 TRAIN:
  WEIGHTS: '/model/R-50.pkl'
-  DATASET: '/data/coco_2014_trainval35k'
+  DATASET: '/data/coco_2017_train'
  IMS_PER_BATCH: 2
  SCALES: [640, 672, 704, 736, 768, 800]
  MAX_SIZE: 1333
  USE_DIFF: False # Do not use crowd objects
 TEST:
-  DATASET: '/data/coco_2014_minival'
+  DATASET: '/data/coco_2017_val'
-  JSON_FILE: '/data/instances_minival2014.json'
+  JSON_FILE: '/data/instances_val2017.json'
  PROTOCOL: 'coco'
  IMS_PER_BATCH: 1
  SCALES: [800]

--- a/configs/retinanet/coco_retinanet_R-50-FPN_800_2x.yml
+++ b/configs/retinanet/coco_retinanet_R-50-FPN_800_2x.yml
@@ -2,8 +2,8 @@ NUM_GPUS: 8
 PIXEL_STDS: [57.375, 57.12, 58.395]
 PIXEL_MEANS: [103.53, 116.28, 123.675]
 MODEL:
-  TYPE: retinanet
+  TYPE: 'retinanet'
-  BACKBONE: resnet50.fpn
+  BACKBONE: 'resnet50.fpn'
  CLASSES: ['__background__',
            'person', 'bicycle', 'car', 'motorcycle', 'airplane',
            'bus', 'train', 'truck', 'boat', 'traffic light',
@@ -24,21 +24,20 @@ FPN:
  RPN_MAX_LEVEL: 7
 SOLVER:
  BASE_LR: 0.01
-  LR_POLICY: steps_with_decay
  DECAY_STEPS: [120000, 160000]
  MAX_STEPS: 180000
  SNAPSHOT_EVERY: 5000
-  SNAPSHOT_PREFIX: coco_retinanet_R-50-FPN_800_2x
+  SNAPSHOT_PREFIX: 'coco_retinanet_R-50-FPN_800_2x'
 TRAIN:
  WEIGHTS: '/model/R-50.pkl'
-  DATASET: '/data/coco_2014_trainval35k'
+  DATASET: '/data/coco_2017_train'
  IMS_PER_BATCH: 2
  SCALES: [640, 672, 704, 736, 768, 800]
  MAX_SIZE: 1333
  USE_DIFF: False # Do not use crowd objects
 TEST:
-  DATASET: '/data/coco_2014_minival'
+  DATASET: '/data/coco_2017_val'
-  JSON_FILE: '/data/instances_minival2014.json'
+  JSON_FILE: '/data/instances_val2017.json'
  PROTOCOL: 'coco'
  IMS_PER_BATCH: 1
  SCALES: [800]

--- a/configs/retinanet/voc_retinanet_R-50-FPN_416.yml
+++ b/configs/retinanet/voc_retinanet_R-50-FPN_416.yml
@@ -2,8 +2,8 @@ NUM_GPUS: 1
 PIXEL_STDS: [57.375, 57.12, 58.395]
 PIXEL_MEANS: [103.53, 116.28, 123.675]
 MODEL:
-  TYPE: retinanet
+  TYPE: 'retinanet'
-  BACKBONE: resnet50.fpn
+  BACKBONE: 'resnet50.fpn'
  CLASSES: ['__background__',
            'aeroplane', 'bicycle', 'bird', 'boat',
            'bottle', 'bus', 'car', 'cat', 'chair',
@@ -20,7 +20,7 @@ SOLVER:
  DECAY_STEPS: [80000, 100000]
  MAX_STEPS: 120000
  SNAPSHOT_EVERY: 5000
-  SNAPSHOT_PREFIX: voc_retinanet_R-50-FPN_416
+  SNAPSHOT_PREFIX: 'voc_retinanet_R-50-FPN_416'
 PIPELINE:
  TYPE: 'ssd'
 TRAIN:

--- a/configs/retinanet/voc_retinanet_R-50-FPN_512.yml
+++ b/configs/retinanet/voc_retinanet_R-50-FPN_512.yml
@@ -2,8 +2,8 @@ NUM_GPUS: 2
 PIXEL_STDS: [57.375, 57.12, 58.395]
 PIXEL_MEANS: [103.53, 116.28, 123.675]
 MODEL:
-  TYPE: retinanet
+  TYPE: 'retinanet'
-  BACKBONE: resnet50.fpn
+  BACKBONE: 'resnet50.fpn'
  CLASSES: ['__background__',
            'aeroplane', 'bicycle', 'bird', 'boat',
            'bottle', 'bus', 'car', 'cat', 'chair',
@@ -20,7 +20,7 @@ SOLVER:
  DECAY_STEPS: [80000, 100000]
  MAX_STEPS: 120000
  SNAPSHOT_EVERY: 5000
-  SNAPSHOT_PREFIX: voc_retinanet_R-50-FPN_512
+  SNAPSHOT_PREFIX: 'voc_retinanet_R-50-FPN_512'
 PIPELINE:
  TYPE: 'ssd'
 TRAIN:

--- a/configs/ssd/voc_ssd_VGG-16_300.yml
+++ b/configs/ssd/voc_ssd_VGG-16_300.yml
@@ -2,8 +2,8 @@ NUM_GPUS: 1
 PIXEL_STDS: [1.0, 1.0, 1.0]
 PIXEL_MEANS: [103.53, 116.28, 123.675]
 MODEL:
-  TYPE: ssd
+  TYPE: 'ssd'
-  BACKBONE: vgg16_reduced_300
+  BACKBONE: 'vgg16_reduced_300'
  COARSEST_STRIDE: 0
  CLASSES: ['__background__',
            'aeroplane', 'bicycle', 'bird', 'boat',
@@ -31,7 +31,7 @@ SOLVER:
  DECAY_STEPS: [80000, 100000]
  MAX_STEPS: 120000
  SNAPSHOT_EVERY: 5000
-  SNAPSHOT_PREFIX: voc_ssd_VGG-16_300
+  SNAPSHOT_PREFIX: 'voc_ssd_VGG-16_300'
 TRAIN:
  WEIGHTS: '/model/VGG16.SSD.pkl'
  DATASET: '/data/voc_0712_trainval'

--- a/configs/ssd/voc_ssd_VGG-16_512.yml
+++ b/configs/ssd/voc_ssd_VGG-16_512.yml
@@ -2,8 +2,8 @@ NUM_GPUS: 2
 PIXEL_STDS: [1.0, 1.0, 1.0]
 PIXEL_MEANS: [103.53, 116.28, 123.675]
 MODEL:
-  TYPE: ssd
+  TYPE: 'ssd'
-  BACKBONE: vgg16_reduced_512
+  BACKBONE: 'vgg16_reduced_512'
  CLASSES: ['__background__',
            'aeroplane', 'bicycle', 'bird', 'boat',
            'bottle', 'bus', 'car', 'cat', 'chair',
@@ -32,7 +32,7 @@ SOLVER:
  DECAY_STEPS: [80000, 100000]
  MAX_STEPS: 120000
  SNAPSHOT_EVERY: 5000
-  SNAPSHOT_PREFIX: voc_ssd_VGG-16_512
+  SNAPSHOT_PREFIX: 'voc_ssd_VGG-16_512'
 TRAIN:
  WEIGHTS: '/model/VGG16.SSD.pkl'
  DATASET: '/data/voc_0712_trainval'

--- a/scripts/coco/im2rec.py
+++ b/scripts/coco/im2rec.py
@@ -18,7 +18,7 @@ import os
 import shutil
 from maker import make_record
-from maskgen import make_mask, merge_mask
+from roidb import make_database
 if __name__ == '__main__':
@@ -27,30 +27,25 @@ if __name__ == '__main__':
    # Encode masks to RLE bytes
    if not os.path.exists('build'):
        os.makedirs('build')
-    make_mask('train', '2014', COCO_ROOT)
+    make_database('train', '2017', COCO_ROOT)
-    make_mask('valminusminival', '2014', COCO_ROOT)
+    make_database('val', '2017', COCO_ROOT)
-    make_mask('minival', '2014', COCO_ROOT)
-    merge_mask('trainval35k', '2014', ['build/coco_2014_train_mask.pkl',
-                                       'build/coco_2014_valminusminival_mask.pkl'])
-    # coco_2014_trainval35k
+    # coco_2017_train
    make_record(
-        record_file=os.path.join(COCO_ROOT, 'coco_2014_trainval35k'),
+        db_file='build/coco_2017_train.db.pkl',
-        images_path=[os.path.join(COCO_ROOT, 'images/train2014'),
+        record_file=os.path.join(COCO_ROOT, 'coco_2017_train'),
-                     os.path.join(COCO_ROOT, 'images/val2014')],
+        images_path=[os.path.join(COCO_ROOT, 'images/train2017')],
-        splits_path=[os.path.join(COCO_ROOT, 'splits'),
+        splits_path=[os.path.join(COCO_ROOT, 'splits')],
-                     os.path.join(COCO_ROOT, 'splits')],
+        splits=['train2017'],
-        mask_file='build/coco_2014_trainval35k_mask.pkl',
-        splits=['train', 'valminusminival'],
    )
-    # coco_2014_minival
+    # coco_2017_val
    make_record(
-        record_file=os.path.join(COCO_ROOT, 'coco_2014_minival'),
+        db_file='build/coco_2017_val.db.pkl',
-        images_path=os.path.join(COCO_ROOT, 'images/val2014'),
+        record_file=os.path.join(COCO_ROOT, 'coco_2017_val'),
-        mask_file='build/coco_2014_minival_mask.pkl',
+        images_path=[os.path.join(COCO_ROOT, 'images/val2017')],
-        splits_path=os.path.join(COCO_ROOT, 'splits'),
+        splits_path=[os.path.join(COCO_ROOT, 'splits')],
-        splits=['minival'],
+        splits=['val2017'],
    )
    shutil.rmtree('build')
--- a/scripts/coco/maker.py
+++ b/scripts/coco/maker.py
@@ -18,7 +18,7 @@ import dragon
 import numpy as np
-def make_example(image_file, mask_objects, im_scale=None):
+def make_example(image_file, objects, im_scale=None):
    filename = os.path.split(image_file)[-1]
    example = {'id': filename.split('.')[0], 'object': []}
@@ -39,7 +39,7 @@ def make_example(image_file, mask_objects, im_scale=None):
        example['height'], example['width'], example['depth'] = img.shape
        example['content'] = img_bytes
-    for ix, obj in enumerate(mask_objects):
+    for obj in objects:
        x1, y1, x2, y2 = obj['bbox']
        example['object'].append({
            'name': obj['name'],
@@ -58,7 +58,7 @@ def make_example(image_file, mask_objects, im_scale=None):
 def make_record(
    record_file,
    images_path,
-    mask_file,
+    db_file,
    splits_path,
    splits,
    ext='.jpg',
@@ -75,11 +75,11 @@ def make_record(
    assert len(splits) == len(splits_path)
    assert len(splits) == len(images_path)
-    if mask_file is not None:
+    if db_file is not None:
-        with open(mask_file, 'rb') as f:
+        with open(db_file, 'rb') as f:
-            all_masks = pickle.load(f)
+            all_entries = pickle.load(f)
    else:
-        all_masks = {}
+        all_entries = {}
    print('Start Time:', time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime()))
@@ -133,8 +133,8 @@ def make_record(
                    count, total_line, now_time - start_time))
            filename = line.strip()
            image_file = os.path.join(images_path[db_idx], filename + ext)
-            mask_objects = all_masks[filename] if filename in all_masks else {}
+            objects = all_entries[filename] if filename in all_entries else {}
-            writer.write(make_example(image_file, mask_objects, im_scale))
+            writer.write(make_example(image_file, objects, im_scale))
    now_time = time.time()
    print('{} / {} in {:.2f} sec'.format(count, total_line, now_time - start_time))

--- a/scripts/coco/maskgen.py
+++ b/scripts/coco/maskgen.py
@@ -74,11 +74,13 @@ class COCOWrapper(object):
    def image_path_from_index(self, index):
        """Construct an image path from the image's "index" identifier."""
        # Example image path for index=119993:
-        #   images/train2014/COCO_train2014_000000119993.jpg
+        # images/train2014/COCO_train2014_000000119993.jpg
-        file_name = ('COCO_' + self._data_name + '_' +
+        # images/train2017/000000119993.jpg
-                     str(index).zfill(12) + '.jpg')
+        filename = str(index).zfill(12) + '.jpg'
+        if '2014' in self._data_name:
+            filename = 'COCO_{}_{}'.format(self._data_name, filename)
        image_path = osp.join(self._data_path, 'images',
-                              self._data_name, file_name)
+                              self._data_name, filename)
        assert osp.exists(image_path), \
            'Path does not exist: {}'.format(image_path)
        return image_path
@@ -99,19 +101,18 @@ class COCOWrapper(object):
        objects = self._COCO.loadAnns(ann_ids)
        # Sanitize boxes -- some are invalid
        valid_objects = []
+        mask, polygons = b'', []
        for obj in objects:
            x1 = float(max(0, obj['bbox'][0]))
            y1 = float(max(0, obj['bbox'][1]))
            x2 = float(min(width - 1, x1 + max(0, obj['bbox'][2] - 1)))
            y2 = float(min(height - 1, y1 + max(0, obj['bbox'][3] - 1)))
-            mask, polygons = b'', []
            if isinstance(obj['segmentation'], list):
                for p in obj['segmentation']:
                    if len(p) < 6:
                        print('Remove Invalid segm.')
                # Valid polygons have >= 3 points, so require >= 6 coordinates
                polygons = [p for p in obj['segmentation'] if len(p) >= 6]
-                # mask_bytes = mask_utils.poly2bytes(poly, height, width)
            else:
                # Crowd masks
                # Some are encoded with height or width
@@ -141,25 +142,26 @@ class COCOWrapper(object):
        return len(self._classes)
-def make_mask(split, year, data_dir):
+def make_database(split, year, data_dir):
    coco = COCOWrapper(split, year, data_dir)
    print('Preparing to make split: {}, total {} images'
          .format(split, coco.num_images))
    if not osp.exists(osp.join(coco._data_path, 'splits')):
        os.makedirs(osp.join(coco._data_path, 'splits'))
-    gt_recs = collections.OrderedDict()
+    entries = collections.OrderedDict()
    for i in range(coco.num_images):
        filename = osp.basename(coco.image_path_at(i)).split('.')[0]
        h, w, objects = coco.annotation_at(i)
-        gt_recs[filename] = objects
+        entries[filename] = objects
    with open(osp.join('build',
-                       'coco_' + year +
+                       'coco_' + year + '_' + split +
-                       '_' + split + '_mask.pkl'), 'wb') as f:
+                       '.db.pkl'), 'wb') as f:
-        pickle.dump(gt_recs, f, pickle.HIGHEST_PROTOCOL)
+        pickle.dump(entries, f, pickle.HIGHEST_PROTOCOL)
-    with open(osp.join(coco._data_path, 'splits', split + '.txt'), 'w') as f:
+    with open(osp.join(coco._data_path, 'splits',
+                       split + year + '.txt'), 'w') as f:
        for i in range(coco.num_images):
            filename = str(osp.basename(coco.image_path_at(i)).split('.')[0])
            if i != coco.num_images - 1:
@@ -167,16 +169,16 @@ def make_mask(split, year, data_dir):
            f.write(filename)
-def merge_mask(split, year, mask_files):
+def merge_database(split, year, db_files):
+    """Merge several pickled databases into ``coco_<year>_<split>.db.pkl``.
+
+    The merged file is written next to the first file of ``db_files``.
+    Entries of later files override entries of earlier ones with the same key.
+    """
-    gt_recs = collections.OrderedDict()
+    entries = collections.OrderedDict()
-    data_path = os.path.dirname(mask_files[0])
+    data_path = os.path.dirname(db_files[0])
-    for mask_file in mask_files:
+    for db_file in db_files:
-        with open(mask_file, 'rb') as f:
+        with open(db_file, 'rb') as f:
-            recs = pickle.load(f)
+            # Keep the loaded dict under its own name: rebinding ``entries``
+            # here would discard everything merged so far.
+            loaded = pickle.load(f)
-            gt_recs.update(recs)
+            entries.update(loaded)
    with open(osp.join(data_path,
-                       'coco_' + year +
+                       'coco_' + year + '_' + split +
-                       '_' + split + '_mask.pkl'), 'wb') as f:
+                       '.db.pkl'), 'wb') as f:
-        pickle.dump(gt_recs, f, pickle.HIGHEST_PROTOCOL)
+        pickle.dump(entries, f, pickle.HIGHEST_PROTOCOL)
--- a/seetadet/algo/faster_rcnn/data_transformer.py
+++ b/seetadet/algo/faster_rcnn/data_transformer.py
@@ -27,6 +27,8 @@ from seetadet.utils import image as image_util
 class DataTransformer(multiprocessing.Process):
+    """DataTransformer."""
    def __init__(self, **kwargs):
        super(DataTransformer, self).__init__()
        self._scales = cfg.TRAIN.SCALES
@@ -43,7 +45,7 @@ class DataTransformer(multiprocessing.Process):
        self.q_in = self.q_out = None
        self.daemon = True
-    def get_boxes(self, example, im_scale):
+    def get_boxes(self, example, im_scale, flipped):
        objects, num_objects = example.objects, 0
        height, width = example.height, example.width
        if not self._use_diff:
@@ -56,7 +58,7 @@ class DataTransformer(multiprocessing.Process):
        boxes = np.zeros((num_objects, 4), 'float32')
        gt_classes = np.zeros((num_objects,), 'float32')
-        # Filter the difficult instances
+        # Filter the difficult instances.
        object_idx = 0
        for obj in objects:
            if not self._use_diff and obj.get('difficult', 0) > 0:
@@ -69,10 +71,14 @@ class DataTransformer(multiprocessing.Process):
            gt_classes[object_idx] = self._class_to_ind[obj['name']]
            object_idx += 1
-        # Scale the boxes to the detecting scale
+        # Flip the boxes if necessary.
+        if flipped:
+            boxes = box_util.flip_boxes(boxes, width)
+        # Scale the boxes to the detecting scale.
        boxes *= im_scale
-        # Attach the classes
+        # Attach the classes.
        gt_boxes = np.empty((num_objects, 5), dtype=np.float32)
        gt_boxes[:, :4], gt_boxes[:, 4] = boxes, gt_classes
@@ -81,7 +87,7 @@ class DataTransformer(multiprocessing.Process):
    def get(self, example):
        example = Example(example)
-        # Resize
+        # Resize.
        img, im_scale = image_util.resize_image_with_target_size(
            example.image,
            target_size=npr.choice(self._scales),
@@ -89,22 +95,18 @@ class DataTransformer(multiprocessing.Process):
            random_scales=self._random_scales,
        )
-        # Flip
+        # Flip.
        flipped = False
        if self._use_flipped and npr.randint(2) > 0:
            img = img[:, ::-1]
            flipped = True
-        # Distort
+        # Distort.
        if self._use_distort:
            img = image_util.distort_image(img)
-        # Boxes
+        # Boxes.
-        boxes = self.get_boxes(example, im_scale)
+        boxes = self.get_boxes(example, im_scale, flipped)
-        # Flip the boxes if necessary
-        if flipped:
-            boxes = box_util.flip_boxes(boxes, img.shape[1])
        # Standard outputs.
        outputs = {'image': img,

--- a/seetadet/algo/mask_rcnn/data_transformer.py
+++ b/seetadet/algo/mask_rcnn/data_transformer.py
@@ -28,6 +28,8 @@ from seetadet.utils import image as image_util
 class DataTransformer(multiprocessing.Process):
+    """DataTransformer."""
    def __init__(self, **kwargs):
        super(DataTransformer, self).__init__()
        self._scales = cfg.TRAIN.SCALES
@@ -81,6 +83,10 @@ class DataTransformer(multiprocessing.Process):
            gt_classes[object_idx] = self._class_to_ind[obj['name']]
            object_idx += 1
+        # Flip the boxes if necessary.
+        if flipped:
+            boxes = box_util.flip_boxes(boxes, width)
        # Scale the boxes to the detecting scale.
        boxes *= im_scale
@@ -115,10 +121,6 @@ class DataTransformer(multiprocessing.Process):
        # Boxes and segmentations.
        boxes, segms = self.get_boxes_and_segms(example, im_scale, flipped)
-        # Flip the boxes if necessary.
-        if flipped:
-            boxes = box_util.flip_boxes(boxes, img.shape[1])
        # Standard outputs.
        outputs = {'image': img,
                   'boxes': boxes,

--- a/seetadet/algo/mask_rcnn/proposal_target.py
+++ b/seetadet/algo/mask_rcnn/proposal_target.py
@@ -124,37 +124,37 @@ class ProposalTarget(object):
 def compute_targets(
-    ex_rois,
+    rois,
-    gt_rois,
+    gt_boxes,
    gt_labels,
-    gt_segms,
+    fg_segms,
-    mask_flags,
+    fg_segms_flag,
    mask_size,
    im_scale,
 ):
-    """Compute the bounding-box regression targets."""
+    """Compute the bounding-box regression targets and the mask targets.
+
+    ``rois``, ``gt_boxes`` and ``gt_labels`` are aligned row by row, while
+    ``fg_segms`` and ``fg_segms_flag`` are indexed by the position of a
+    foreground RoI inside ``fg_inds``. Rows of ``mask_targets`` that get
+    no target keep the fill value ``-1``.
+    """
-    assert ex_rois.shape[0] == gt_rois.shape[0]
+    assert rois.shape[0] == gt_boxes.shape[0]
-    assert ex_rois.shape[1] == 4
+    assert rois.shape[1] == 4
-    assert gt_rois.shape[1] == 4
+    assert gt_boxes.shape[1] == 4
    # Compute bbox regression targets
    fg_inds = np.where(gt_labels > 0)[0]
-    bbox_targets = box_util.bbox_transform(
+    bbox_targets = box_util.bbox_transform(rois, gt_boxes, cfg.BBOX_REG_WEIGHTS)
-        ex_rois, gt_rois, cfg.BBOX_REG_WEIGHTS)
    # Compute mask classification targets
    mask_shape = [mask_size] * 2
-    ex_rois_ori = np.round(ex_rois / im_scale).astype(int)
+    mask_targets = -np.ones([len(rois)] + mask_shape, 'float32')
-    mask_targets = -np.ones([len(gt_labels)] + mask_shape, 'float32')
+    rois_ori = rois / im_scale
-    for i in fg_inds:
+    rois_ori_int = np.round(rois_ori).astype(int)
-        if mask_flags[i] > 0:
+    gt_boxes_ori_int = np.round(gt_boxes / im_scale).astype(int)
+    # ``i`` indexes the foreground-only segmentations, ``fg_idx`` indexes the
+    # RoI-aligned arrays; the two only coincide if foregrounds come first.
-            if isinstance(gt_segms[i], list):
+    for i, fg_idx in enumerate(fg_inds):
-                ret = mask_util.warp_mask_via_polygons(
+        if fg_segms_flag[i] > 0:
-                    gt_segms[i], ex_rois_ori[i], mask_shape)
+            if isinstance(fg_segms[i], list):
+                target = mask_util.warp_mask_via_polygons(
+                    fg_segms[i], rois_ori[fg_idx], mask_shape)
            else:
-                gt_rois_ori = np.round(gt_rois / im_scale).astype(int)
+                target = mask_util.warp_mask_via_intersection(
-                ret = mask_util.warp_mask_via_intersection(
+                    fg_segms[i], rois_ori_int[fg_idx],
+                    gt_boxes_ori_int[fg_idx], mask_shape)
-                    gt_segms[i], ex_rois_ori[i], gt_rois_ori[i], mask_shape)
+            if target is not None:
-            if ret is not None:
+                mask_targets[fg_idx] = target.astype(mask_targets.dtype)
-                mask_targets[i] = ret.astype('float32')
    return bbox_targets, mask_targets

--- a/seetadet/algo/ssd/data_transformer.py
+++ b/seetadet/algo/ssd/data_transformer.py
@@ -27,6 +27,8 @@ from seetadet.utils import boxes as box_util
 class DataTransformer(multiprocessing.Process):
+    """DataTransformer."""
    def __init__(self, **kwargs):
        super(DataTransformer, self).__init__()
        self._scale = cfg.TRAIN.SCALES[0]
@@ -44,7 +46,7 @@ class DataTransformer(multiprocessing.Process):
        self.q_in = self.q_out = None
        self.daemon = True
-    def get_boxes(self, example):
+    def get_boxes(self, example, flipped):
        objects, num_objects = example.objects, 0
        height, width = example.height, example.width
        if not self._use_diff:
@@ -70,6 +72,10 @@ class DataTransformer(multiprocessing.Process):
            gt_classes[object_idx] = self._class_to_ind[obj['name']]
            object_idx += 1
+        # Flip the boxes if necessary.
+        if flipped:
+            boxes = box_util.flip_boxes(boxes, width)
        # Normalize.
        boxes[:, 0::2] /= width
        boxes[:, 1::2] /= height
@@ -82,25 +88,31 @@ class DataTransformer(multiprocessing.Process):
    def get(self, example):
        example = Example(example)
+        img = example.image
+        # Flip.
+        flipped = False
+        if self._use_flipped and npr.randint(2) > 0:
+            img = img[:, ::-1]
+            flipped = True
        # Boxes.
-        boxes = self.get_boxes(example)
+        boxes = self.get_boxes(example, flipped)
+        # Return to avoid the invalid transforms.
        if len(boxes) == 0:
            return {'boxes': boxes}
        # Distort => Expand => Sample => Resize
-        img, boxes = self._apply_transform(example.image, boxes)
+        img, boxes = self._apply_transform(img, boxes)
        # Restore to the blob scale.
        boxes[:, :4] *= self._scale
-        # Flip.
-        if self._use_flipped and npr.randint(2) > 0:
-            img = img[:, ::-1]
-            boxes = box_util.flip_boxes(boxes, img.shape[1])
        # Standard outputs.
-        outputs = {'image': img, 'boxes': boxes, 'im_info': img.shape[:2]}
+        outputs = {'image': img,
+                   'boxes': boxes,
+                   'im_info': img.shape[:2]}
        # Attach precomputed targets.
        if len(boxes) > 0:

--- a/seetadet/core/config.py
+++ b/seetadet/core/config.py
@@ -333,10 +333,12 @@ __C.FRCNN.NEGATIVE_OVERLAP_HI = 0.5
 __C.FRCNN.NEGATIVE_OVERLAP_LO = 0.0
 # RoI transform function
-# Values supported: 'RoIAlign', 'RoIAlign'
+# Values supported: 'RoIAlign', 'RoIPool'
 __C.FRCNN.ROI_XFORM_METHOD = 'RoIAlign'
 # RoI transform output resolution
 __C.FRCNN.ROI_XFORM_RESOLUTION = 7
 # Resampling window size for RoI transformation
 __C.FRCNN.ROI_XFORM_SAMPLING_RATIO = 0
@@ -362,10 +364,12 @@ __C.MRCNN = AttrDict()
 __C.MRCNN.RESOLUTION = 28
 # RoI transform function
-# Values supported: 'RoIAlign', 'RoIAlign'
+# Values supported: 'RoIAlign', 'RoIPool'
 __C.MRCNN.ROI_XFORM_METHOD = 'RoIAlign'
 # RoI transform output resolution
 __C.MRCNN.ROI_XFORM_RESOLUTION = 14
 # Resampling window size for RoI transformation
 __C.MRCNN.ROI_XFORM_SAMPLING_RATIO = 0
@@ -438,6 +442,7 @@ __C.SOLVER.DISPLAY = 20
 # The interval to snapshot a model
 __C.SOLVER.SNAPSHOT_EVERY = 5000
 # Prefix to yield the path: <prefix>_iter_XYZ.pkl
 __C.SOLVER.SNAPSHOT_PREFIX = ''
@@ -451,25 +456,34 @@ __C.SOLVER.MAX_STEPS = 40000
 # Base learning rate for the specified schedule
 __C.SOLVER.BASE_LR = 0.001
 # The uniform interval for LRScheduler
 __C.SOLVER.DECAY_STEP = 1
 # The custom intervals for LRScheduler
 __C.SOLVER.DECAY_STEPS = []
 # The decay factor for exponential LRScheduler
 __C.SOLVER.DECAY_GAMMA = 0.1
 # Warm up to ``BASE_LR`` over this number of steps
 __C.SOLVER.WARM_UP_STEPS = 500
 # Start the warm up from ``BASE_LR`` * ``FACTOR``
 __C.SOLVER.WARM_UP_FACTOR = 0.333
 # The type of LRScheduler
 __C.SOLVER.LR_POLICY = 'steps_with_decay'
 # Momentum to use with SGD
 __C.SOLVER.MOMENTUM = 0.9
 # L2 regularization for weight parameters
 __C.SOLVER.WEIGHT_DECAY = 0.0001
 # L2 regularization for legacy bias parameters
 __C.SOLVER.WEIGHT_DECAY_BIAS = 0.0
 # L2 norm factor for clipping gradients
 __C.SOLVER.CLIP_NORM = 0.0

--- a/seetadet/utils/mask.py
+++ b/seetadet/utils/mask.py
@@ -14,6 +14,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+import copy
 import cv2
 import numpy as np
 import PIL.Image
@@ -37,32 +39,37 @@ def warp_mask_via_intersection(mask, box1, box2, size):
    inter_mask = mask[y1:y2 + 1, x1:x2 + 1]
    target_h = box1[3] - box1[1] + 1
    target_w = box1[2] - box1[0] + 1
-    warped_mask = np.zeros((target_h, target_w), dtype=mask.dtype)
+    warped_mask = np.zeros((target_h, target_w), dtype='uint8')
    warped_mask[ex_start_y:ex_start_y + h,
                ex_start_x:ex_start_x + w] = inter_mask
    if not isinstance(size, (tuple, list)):
        size = (size, size)
    mask = PIL.Image.fromarray(warped_mask)
-    return np.array(mask.resize((size[1], size[0]), PIL.Image.NEAREST))
+    mask = mask.resize((size[1], size[0]), PIL.Image.NEAREST)
+    return np.array(mask)
 def warp_mask_via_polygons(polygons, box, size):
     """Warp mask via polygons."""
     # NOTE(review): in this rendering the removed ('-') and added ('+')
     # lines of the function body are interleaved; read the '+' lines
     # together to follow the post-change implementation.
     #
     # Post-change behavior, as shown by the '+' lines:
     #   1. Take the raw box extents (w, h) without clamping them to 1.
     #   2. Expand a scalar ``size`` into a (size, size) pair.
     #   3. Compute per-axis scale ratios ``size / max(extent, 0.1)``; the
     #      0.1 floor keeps the division defined for a zero-extent box.
     #   4. Deep-copy ``polygons`` so that the in-place arithmetic below
     #      does not modify the caller's data.
     #   5. Shift even-indexed coordinates by box[0] and odd-indexed
     #      coordinates by box[1], then scale them by ratio_w / ratio_h
     #      (a single whole-array multiply when both ratios are equal).
     #   6. Rasterize on a size[0] x size[1] canvas, merge the per-polygon
     #      RLEs into one, and return the first channel of the decoded mask.
     #
     # NOTE(review): the slice arithmetic (``p[0::2] - box[0]``, ``p *= r``)
     # presumably relies on every polygon already being a numpy array; the
     # removed code converted each one with np.array(..., float32) itself.
     # TODO confirm against the callers.
-    w = np.maximum(box[2] - box[0], 1)
+    w, h = box[2] - box[0], box[3] - box[1]
-    h = np.maximum(box[3] - box[1], 1)
     if not isinstance(size, (tuple, list)):
         size = (size, size)
-    polygons_norm = []
+    ratio_h = size[0] / max(h, 0.1)
-    for poly in polygons:
+    ratio_w = size[1] / max(w, 0.1)
-        p = np.array(poly, dtype=np.float32)
+    polygons = copy.deepcopy(polygons)
-        p[0::2] = (p[0::2] - box[0]) * size[1] / w
+    for p in polygons:
-        p[1::2] = (p[1::2] - box[1]) * size[0] / h
+        p[0::2] = p[0::2] - box[0]
-        polygons_norm.append(p)
+        p[1::2] = p[1::2] - box[1]
-    rle = mask_tools.frPyObjects(polygons_norm, size[0], size[1])
+    if ratio_h == ratio_w:
-    mask = np.array(mask_tools.decode(rle))
+        for p in polygons:
-    mask = np.sum(mask, axis=2)
+            p *= ratio_h
-    mask = np.array(mask > 0)
+    else:
-    return mask
+        for p in polygons:
+            p[0::2] *= ratio_w
+            p[1::2] *= ratio_h
     # NOTE(review): ``mask_tools`` is imported outside this hunk; it looks
     # like the pycocotools mask API -- confirm before relying on the exact
     # dtype/shape of the decoded result.
+    rle_objs = mask_tools.frPyObjects(polygons, size[0], size[1])
+    rle_objs = [mask_tools.merge(rle_objs)]
+    return mask_tools.decode(rle_objs)[:, :, 0]
 def mask_overlap(box1, box2, mask1, mask2):
@@ -148,7 +155,7 @@ def project_masks(
        w = np.maximum(w, 1)
        h = np.maximum(h, 1)
        mask = cv2.resize(padded_mask, (w, h))
-        mask = np.array(mask > thresh, 'uint8')
+        mask = np.array(mask >= thresh, 'uint8')
        x1 = max(ref_box[0], 0)
        y1 = max(ref_box[1], 0)
        x2 = min(ref_box[2] + 1, width)

--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
 # You should have received a copy of the BSD 2-Clause License
 # along with the software. If not, See,
 #
-#      <https://opensource.org/licenses/BSD-2-Clause>
+#     <https://opensource.org/licenses/BSD-2-Clause>
 #
 # ------------------------------------------------------------