Commit bf361560 by Ting PAN

Bump to 0.1.1

1 parent c8535116
------------------------------------------------------------------------
The most significant changes made to SeetaDet over time.
SeetaDet 0.1.1 (20190409)
Dragon Minimum Required (Version 0.3.0.0)
Changes:
Preview Features:
- Add RandomCrop/RandomPad for ScaleJittering.
- Add ResNet18/ResNet34/AirNet for R-CNN and RetinaNet.
- Use the C++-implemented decoder for RetinaNet instead of the Python proposal layer.
Bugs fixed:
- None
------------------------------------------------------------------------
SeetaDet 0.1.0 (20190314)
Dragon Minimum Required (Version 0.3.0.0)
......@@ -13,4 +33,4 @@ Preview Features:
Bugs fixed:
- None
- None
\ No newline at end of file
......@@ -67,9 +67,11 @@ python export.py --cfg <MODEL_YAML> --exp_dir <EXP_DIR> --iter <ITERATION>
| :------: | :------: |
| [VGG16.SSD](http://dragon.seetatech.com/download/models/SeetaDet/imagenet/VGG16.SSD.pth)| SSD |
| [VGG16.RCNN](http://dragon.seetatech.com/download/models/SeetaDet/imagenet/VGG16.RCNN.pth)| R-CNN |
| [R-18.Affine](http://dragon.seetatech.com/download/models/SeetaDet/imagenet/R-18.Affine.pth)| R-CNN, RetinaNet |
| [R-34.Affine](http://dragon.seetatech.com/download/models/SeetaDet/imagenet/R-34.Affine.pth)| R-CNN, RetinaNet |
| [R-50.Affine](http://dragon.seetatech.com/download/models/SeetaDet/imagenet/R-50.Affine.pth)| R-CNN, RetinaNet |
| [R-101.Affine](http://dragon.seetatech.com/download/models/SeetaDet/imagenet/R-101.Affine.pth)| R-CNN, RetinaNet |
| [AirNet.SSD](http://dragon.seetatech.com/download/models/SeetaDet/imagenet/AirNet.SSD.pth)| SSD |
| [AirNet.Affine](http://dragon.seetatech.com/download/models/SeetaDet/imagenet/AirNet.Affine.pth)| R-CNN, RetinaNet, SSD |
## References
......
......@@ -33,14 +33,14 @@ FRCNN:
ROI_XFORM_RESOLUTION: 7
TRAIN:
WEIGHTS: '/data/models/imagenet/R-101.Affine.pth'
DATABASE: 'taas:/data/coco_2014_trainval35k_lmdb'
DATABASE: '/data/coco_2014_trainval35k_lmdb'
IMS_PER_BATCH: 2
USE_DIFF: False # Do not use crowd objects
BATCH_SIZE: 512
SCALES: [800]
MAX_SIZE: 1333
TEST:
DATABASE: 'taas:/data/coco_2014_minival_lmdb'
DATABASE: '/data/coco_2014_minival_lmdb'
JSON_FILE: '/data/instances_minival2014.json'
PROTOCOL: 'coco'
RPN_POST_NMS_TOP_N: 1000
......
......@@ -33,14 +33,14 @@ FRCNN:
ROI_XFORM_RESOLUTION: 7
TRAIN:
WEIGHTS: '/data/models/imagenet/R-101.Affine.pth'
DATABASE: 'taas:/data/coco_2014_trainval35k_lmdb'
DATABASE: '/data/coco_2014_trainval35k_lmdb'
IMS_PER_BATCH: 2
USE_DIFF: False # Do not use crowd objects
BATCH_SIZE: 512
SCALES: [800]
MAX_SIZE: 1333
TEST:
DATABASE: 'taas:/data/coco_2014_minival_lmdb'
DATABASE: '/data/coco_2014_minival_lmdb'
JSON_FILE: '/data/instances_minival2014.json'
PROTOCOL: 'coco'
RPN_POST_NMS_TOP_N: 1000
......
......@@ -24,13 +24,13 @@ FRCNN:
ROI_XFORM_RESOLUTION: 7
TRAIN:
WEIGHTS: '/data/models/imagenet/R-50.Affine.pth'
DATABASE: 'taas:/data/voc_0712_trainval_lmdb'
DATABASE: '/data/voc_0712_trainval_lmdb'
IMS_PER_BATCH: 2
BATCH_SIZE: 128
SCALES: [600]
MAX_SIZE: 1000
TEST:
DATABASE: 'taas:/data/voc_2007_test_lmdb'
DATABASE: '/data/voc_2007_test_lmdb'
PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
RPN_POST_NMS_TOP_N: 1000
SCALES: [600]
......
......@@ -29,14 +29,14 @@ FRCNN:
MLP_HEAD_DIM: 4096
TRAIN:
WEIGHTS: '/data/models/imagenet/VGG16.RCNN.pth'
DATABASE: 'taas:/data/voc_0712_trainval_lmdb'
DATABASE: '/data/voc_0712_trainval_lmdb'
RPN_MIN_SIZE: 16
IMS_PER_BATCH: 2
BATCH_SIZE: 128
SCALES: [600]
MAX_SIZE: 1000
TEST:
DATABASE: 'taas:/data/voc_2007_test_lmdb'
DATABASE: '/data/voc_2007_test_lmdb'
PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
RPN_MIN_SIZE: 16
RPN_POST_NMS_TOP_N: 300
......
......@@ -33,12 +33,12 @@ FPN:
RPN_MAX_LEVEL: 7
TRAIN:
WEIGHTS: '/data/models/imagenet/R-50.Affine.pth'
DATABASE: 'taas:/data/coco_2014_trainval35k_lmdb'
DATABASE: '/data/coco_2014_trainval35k_lmdb'
IMS_PER_BATCH: 8
SCALES: [400]
MAX_SIZE: 666
TEST:
DATABASE: 'taas:/data/coco_2014_minival_lmdb'
DATABASE: '/data/coco_2014_minival_lmdb'
JSON_FILE: '/data/instances_minival2014.json'
PROTOCOL: 'coco'
IMS_PER_BATCH: 1
......
......@@ -37,15 +37,15 @@ DROPBLOCK:
DECREMENT: 0.000005 # * 20000 = 0.1
TRAIN:
WEIGHTS: '/data/models/imagenet/R-50.Affine.pth'
DATABASE: 'taas:/data/coco_2014_trainval35k_lmdb'
DATABASE: '/data/coco_2014_trainval35k_lmdb'
IMS_PER_BATCH: 8
SCALES: [400]
MAX_SIZE: 666
SCALE_JITTERING: True
COLOR_JITTERING: True
SCALE_RANGE: [0.8, 1.2]
SCALE_RANGE: [0.75, 1.33]
TEST:
DATABASE: 'taas:/data/coco_2014_minival_lmdb'
DATABASE: '/data/coco_2014_minival_lmdb'
JSON_FILE: '/data/instances_minival2014.json'
PROTOCOL: 'coco'
IMS_PER_BATCH: 1
......
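The SCALE_JITTERING/SCALE_RANGE pair in the hunk above controls random rescaling during training. A minimal sketch of the usual interpretation, assuming a NumPy/OpenCV image; the function name and the cv2 resize call are illustrative, not SeetaDet's actual hook (which lives inside prep_im_for_blob):

```python
import numpy as np
import cv2  # illustrative; any resize routine works


def jitter_scale(im, target_size, scale_range=(0.75, 1.33)):
    # Sample a jitter factor, then rescale so the shorter side
    # lands near factor * target_size, as SCALE_RANGE suggests.
    factor = np.random.uniform(scale_range[0], scale_range[1])
    im_scale = factor * target_size / min(im.shape[:2])
    return cv2.resize(im, None, fx=im_scale, fy=im_scale), im_scale
```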
NUM_GPUS: 1
VIS: False
VIS_ON_FILE: False
MODEL:
TYPE: retinanet
BACKBONE: airnet.fpn
CLASSES: ['__background__',
'aeroplane', 'bicycle', 'bird', 'boat',
'bottle', 'bus', 'car', 'cat', 'chair',
'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'pottedplant',
'sheep', 'sofa', 'train', 'tvmonitor']
NUM_CLASSES: 21
SOLVER:
BASE_LR: 0.02
WEIGHT_DECAY: 0.0001
LR_POLICY: steps_with_decay
STEPS: [40000, 50000, 60000]
MAX_ITERS: 60000
SNAPSHOT_ITERS: 5000
SNAPSHOT_PREFIX: voc_retinanet_300
FPN:
RPN_MIN_LEVEL: 3
RPN_MAX_LEVEL: 7
TRAIN:
WEIGHTS: '/data/models/imagenet/AirNet.Affine.pth'
DATABASE: '/data/voc_0712_trainval_lmdb'
IMS_PER_BATCH: 32
SCALES: [300]
MAX_SIZE: 500
SCALE_RANGE: [0.5, 2.0]
SCALE_JITTERING: True
COLOR_JITTERING: True
TEST:
DATABASE: '/data/voc_2007_test_lmdb'
PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
IMS_PER_BATCH: 1
SCALES: [300]
MAX_SIZE: 500
NMS: 0.45
\ No newline at end of file
NUM_GPUS: 1
VIS: False
VIS_ON_FILE: False
MODEL:
TYPE: retinanet
BACKBONE: resnet18.fpn
CLASSES: ['__background__',
'aeroplane', 'bicycle', 'bird', 'boat',
'bottle', 'bus', 'car', 'cat', 'chair',
'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'pottedplant',
'sheep', 'sofa', 'train', 'tvmonitor']
NUM_CLASSES: 21
SOLVER:
BASE_LR: 0.01
WEIGHT_DECAY: 0.0001
LR_POLICY: steps_with_decay
STEPS: [40000, 50000, 60000]
WARM_UP_ITERS: 2000
MAX_ITERS: 60000
SNAPSHOT_ITERS: 5000
SNAPSHOT_PREFIX: voc_retinanet_300
FPN:
RPN_MIN_LEVEL: 3
RPN_MAX_LEVEL: 7
TRAIN:
WEIGHTS: '/data/models/imagenet/R-18.Affine.pth'
DATABASE: '/data/voc_0712_trainval_lmdb'
IMS_PER_BATCH: 32
SCALES: [300]
MAX_SIZE: 500
SCALE_RANGE: [0.5, 2.0]
SCALE_JITTERING: True
COLOR_JITTERING: True
TEST:
DATABASE: '/data/voc_2007_test_lmdb'
PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
IMS_PER_BATCH: 1
SCALES: [300]
MAX_SIZE: 500
NMS: 0.45
\ No newline at end of file
NUM_GPUS: 1
VIS: False
VIS_ON_FILE: False
MODEL:
TYPE: retinanet
BACKBONE: resnet34.fpn
CLASSES: ['__background__',
'aeroplane', 'bicycle', 'bird', 'boat',
'bottle', 'bus', 'car', 'cat', 'chair',
'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'pottedplant',
'sheep', 'sofa', 'train', 'tvmonitor']
NUM_CLASSES: 21
SOLVER:
BASE_LR: 0.01
WEIGHT_DECAY: 0.0001
LR_POLICY: steps_with_decay
STEPS: [40000, 50000, 60000]
WARM_UP_ITERS: 2000
MAX_ITERS: 60000
SNAPSHOT_ITERS: 5000
SNAPSHOT_PREFIX: voc_retinanet_300
FPN:
RPN_MIN_LEVEL: 3
RPN_MAX_LEVEL: 7
TRAIN:
WEIGHTS: '/data/models/imagenet/R-34.Affine.pth'
DATABASE: '/data/voc_0712_trainval_lmdb'
IMS_PER_BATCH: 32
SCALES: [300]
MAX_SIZE: 500
SCALE_RANGE: [0.5, 2.0]
SCALE_JITTERING: True
COLOR_JITTERING: True
TEST:
DATABASE: '/data/voc_2007_test_lmdb'
PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
IMS_PER_BATCH: 1
SCALES: [300]
MAX_SIZE: 500
NMS: 0.45
\ No newline at end of file
......@@ -29,11 +29,11 @@ SSD:
STRIDES: [8, 16, 32]
ASPECT_RATIOS: [[1, 2, 0.5], [1, 2, 0.5], [1, 2, 0.5]]
TRAIN:
WEIGHTS: '/data/models/imagenet/AirNet.SSD.pth'
DATABASE: 'taas:/data/voc_0712_trainval_lmdb'
WEIGHTS: '/data/models/imagenet/AirNet.Affine.pth'
DATABASE: '/data/voc_0712_trainval_lmdb'
IMS_PER_BATCH: 32
TEST:
DATABASE: 'taas:/data/voc_2007_test_lmdb'
DATABASE: '/data/voc_2007_test_lmdb'
PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
IMS_PER_BATCH: 8
NMS_TOP_K: 400
......
......@@ -33,10 +33,10 @@ SSD:
[1, 2, 0.5, 3, 0.33], [1, 2, 0.5], [1, 2, 0.5]]
TRAIN:
WEIGHTS: '/data/models/imagenet/VGG16.SSD.pth'
DATABASE: 'taas:/data/voc_0712_trainval_lmdb'
DATABASE: '/data/voc_0712_trainval_lmdb'
IMS_PER_BATCH: 32
TEST:
DATABASE: 'taas:/data/voc_2007_test_lmdb'
DATABASE: '/data/voc_2007_test_lmdb'
PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
IMS_PER_BATCH: 8
NMS_TOP_K: 400
......
......@@ -13,6 +13,10 @@
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os.path as osp
import numpy as np
......@@ -104,9 +108,6 @@ __C.TRAIN.RPN_MIN_SIZE = 0
# Set to -1 or a large value, e.g. 100000, to disable pruning anchors
__C.TRAIN.RPN_STRADDLE_THRESH = 0
# Resume from the last checkpoint?
__C.TRAIN.RESUME = False
###########################################
# #
......@@ -184,6 +185,7 @@ __C.TEST.DETECTIONS_PER_IM = 100
# #
###########################################
__C.MODEL = edict()
# The type of the model
......@@ -211,11 +213,6 @@ __C.MODEL.CLASSES = ['__background__']
# Add StopGrad at a specified stage so the bottom layers are frozen
__C.MODEL.FREEZE_AT = 2
# Whether to use a bias prior to improve the one-stage detector?
# Enabled if model type in ('ssd',)
# RetinaNet is forced to use the bias prior
__C.MODEL.USE_BIAS_PRIOR = False
# Whether to use focal loss for one-stage detectors?
# Enabled if model type in ('ssd',)
# RetinaNet is forced to use focal loss
......@@ -234,6 +231,7 @@ __C.MODEL.COARSEST_STRIDE = -1
# #
###########################################
__C.RPN = edict()
# Strides for multiple rpn heads
......@@ -252,6 +250,7 @@ __C.RPN.ASPECT_RATIOS = [0.5, 1, 2]
# #
###########################################
__C.RETINANET = edict()
# Anchor aspect ratios to use
......@@ -269,7 +268,7 @@ __C.RETINANET.ANCHOR_SCALE = 4
__C.RETINANET.NUM_CONVS = 4
# During inference, #locs to select based on cls score before NMS is performed
__C.RETINANET.PRE_NMS_TOP_N = 1000
__C.RETINANET.PRE_NMS_TOP_N = 5000
# IoU overlap ratio for labeling an anchor as positive
# Anchors with >= iou overlap are labeled positive
......@@ -279,9 +278,6 @@ __C.RETINANET.POSITIVE_OVERLAP = 0.5
# Anchors with < iou overlap are labeled negative
__C.RETINANET.NEGATIVE_OVERLAP = 0.4
# Whether softmax should be used in classification branch training
__C.RETINANET.SOFTMAX = False
###########################################
# #
......@@ -336,6 +332,7 @@ __C.FRCNN.ROI_XFORM_RESOLUTION = 7
# #
###########################################
__C.MRCNN = edict()
# Resolution of mask predictions
......@@ -354,6 +351,7 @@ __C.MRCNN.ROI_XFORM_RESOLUTION = 14
# #
###########################################
__C.SSD = edict()
# Whether to enable FPN enhancement?
......@@ -412,6 +410,7 @@ __C.SSD.SAMPLERS = [
# #
###########################################
__C.RESNET = edict()
# Number of groups to use; 1 ==> ResNet; > 1 ==> ResNeXt
......@@ -427,6 +426,7 @@ __C.RESNET.GROUP_WIDTH = 64
# #
###########################################
__C.DROPBLOCK = edict()
# Whether to use DropBlock for more regularization
......@@ -442,6 +442,7 @@ __C.DROPBLOCK.DECREMENT = 1e-6
# #
###########################################
__C.SOLVER = edict()
# Base learning rate for the specified schedule
......@@ -502,6 +503,7 @@ __C.SOLVER.SNAPSHOT_PREFIX = ''
# #
###########################################
# Number of GPUs to use (applies to both training and testing)
__C.NUM_GPUS = 1
......@@ -523,14 +525,6 @@ __C.BBOX_REG_WEIGHTS = (10., 10., 5., 5.)
# Default weights on (dx, dy, dw, dh, da) for normalizing rbox regression targets
__C.RBOX_REG_WEIGHTS = (10.0, 10.0, 5.0, 5.0, 10.0)
# Clip bounding box transformation predictions to prevent np.exp from
# overflowing
# Heuristic choice: this would scale a 16-pixel anchor up to 1000 pixels
__C.BBOX_XFORM_CLIP = np.log(1000. / 16.)
# Whether to clip the bbox transformation predictions
__C.USE_XFORM_CLIP = False
# Prior prob for the positives at the beginning of training.
# This is used to set the bias init for the logits layer
__C.PRIOR_PROB = 0.01
......
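On BBOX_XFORM_CLIP: log(1000 / 16) ≈ 4.135 caps the predicted log-scale deltas, so np.exp can grow a 16-pixel anchor to at most ~1000 pixels. A minimal sketch of where such a clip is typically applied in box decoding; the names are illustrative, not SeetaDet's decoder:

```python
import numpy as np

BBOX_XFORM_CLIP = np.log(1000. / 16.)  # ~4.135


def decode_wh(w, h, dw, dh):
    # Clip the log-scale deltas before exponentiating; exp(4.135)
    # is ~62.5, so a 16px anchor can reach at most ~1000px.
    dw = np.minimum(dw, BBOX_XFORM_CLIP)
    dh = np.minimum(dh, BBOX_XFORM_CLIP)
    return w * np.exp(dw), h * np.exp(dh)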
......@@ -13,6 +13,7 @@
#
# ------------------------------------------------------------
import os
from lib.datasets.taas import TaaS
......@@ -26,12 +27,12 @@ def get_imdb(name):
if len(keys) >= 2:
cls, source = keys[0], ':'.join(keys[1:])
if cls not in _GLOBAL_DATA_SETS:
raise KeyError('Unknown dataset: {}'.format(cls))
raise KeyError('Unknown DataSet: {}'.format(cls))
return _GLOBAL_DATA_SETS[cls](source)
elif len(keys) == 1:
return _GLOBAL_DATA_SETS[name]()
elif os.path.exists(name):
return _GLOBAL_DATA_SETS['taas'](name)
else:
raise ValueError('Illegal format of image database: {}'.format(name))
raise ValueError('Illegal database: {}'.format(name))
def list_imdbs():
......
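A hedged usage sketch of the updated factory; the lib.datasets.factory module path and the LMDB locations are assumptions for illustration:

```python
from lib.datasets.factory import get_imdb  # assumed module path

# '<cls>:<source>' looks up a registered dataset class explicitly.
db = get_imdb('taas:/data/voc_0712_trainval_lmdb')

# New in this commit: a bare filesystem path falls back to TaaS,
# which is why the configs above can drop the 'taas:' prefix.
db = get_imdb('/data/voc_0712_trainval_lmdb')
```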
......@@ -45,7 +45,13 @@ class DataTransformer(Process):
self.Q_in = self.Q1_out = self.Q2_out = None
self.daemon = True
def make_roidb(self, ann_datum, im_scale, flip=False, offsets=None):
def make_record(
self,
ann_datum,
im_scale,
flip=False,
offsets=None,
):
annotations = ann_datum.annotation
n_objects = 0
if not self._use_diff:
......@@ -53,35 +59,43 @@ class DataTransformer(Process):
if not ann.difficult: n_objects += 1
else: n_objects = len(annotations)
roidb = {
record = {
'width': ann_datum.datum.width,
'height': ann_datum.datum.height,
'gt_classes': np.zeros((n_objects,), dtype=np.int32),
'boxes': np.zeros((n_objects, 4), dtype=np.float32),
}
ix = 0
# Filter the difficult instances
instance_idx = 0
for ann in annotations:
if not self._use_diff and ann.difficult: continue
roidb['boxes'][ix, :] = [
max(0, ann.x1), max(0, ann.y1),
min(ann.x2, ann_datum.datum.width - 1),
min(ann.y2, ann_datum.datum.height - 1)]
roidb['gt_classes'][ix] = self._class_to_ind[ann.name]
ix += 1
if flip: roidb['boxes'] = _flip_boxes(roidb['boxes'], roidb['width'])
roidb['boxes'] *= im_scale
record['boxes'][instance_idx, :] = [
max(0, ann.x1),
max(0, ann.y1),
min(ann.x2, ann_datum.datum.width - 1),
min(ann.y2, ann_datum.datum.height - 1),
]
record['gt_classes'][instance_idx] = self._class_to_ind[ann.name]
instance_idx += 1
# Flip the boxes if necessary
if flip:
record['boxes'] = _flip_boxes(
record['boxes'], record['width'])
# Scale the boxes to the detecting scale
record['boxes'] *= im_scale
# Apply the offsets from scale jitter
if offsets is not None:
roidb['boxes'][:, 0::2] += offsets[0]
roidb['boxes'][:, 1::2] += offsets[1]
roidb['boxes'][:, :] = np.minimum(
np.maximum(roidb['boxes'][:, :], 0),
record['boxes'][:, 0::2] += offsets[0]
record['boxes'][:, 1::2] += offsets[1]
record['boxes'][:, :] = np.minimum(
np.maximum(record['boxes'][:, :], 0),
[offsets[2][1] - 1, offsets[2][0] - 1] * 2)
return roidb
return record
@classmethod
def get_image(cls, serialized):
......@@ -121,7 +135,14 @@ class DataTransformer(Process):
target_size = cfg.TRAIN.SCALES[scale_indices]
im, im_scale, jitter = prep_im_for_blob(im, target_size, cfg.TRAIN.MAX_SIZE)
# Crop or Pad
# Flip
flip = False
if self._use_flipped:
if npr.randint(0, 2) > 0:
im = im[:, ::-1, :]
flip = True
# RandomCrop or RandomPad
offsets = None
if cfg.TRAIN.MAX_SIZE > 0:
if jitter != 1.0:
......@@ -132,20 +153,13 @@ class DataTransformer(Process):
# To a square (target_size, target_size)
im, offsets = _get_image_with_target_size([target_size] * 2, im)
# Flip
flip = False
if self._use_flipped:
if npr.randint(0, 2) > 0:
im = im[:, ::-1, :]
flip = True
# Datum -> RoIDB
roidb = self.make_roidb(datum, im_scale, flip, offsets)
# Datum -> Record
rec = self.make_record(datum, im_scale, flip, offsets)
# Post-Process for gt boxes
# Shape like: [num_objects, {x1, y1, x2, y2, cls}]
gt_boxes = np.empty((len(roidb['gt_classes']), 5), dtype=np.float32)
gt_boxes[:, 0:4], gt_boxes[:, 4] = roidb['boxes'], roidb['gt_classes']
gt_boxes = np.empty((len(rec['gt_classes']), 5), dtype=np.float32)
gt_boxes[:, 0:4], gt_boxes[:, 4] = rec['boxes'], rec['gt_classes']
return im, im_scale, gt_boxes
......@@ -175,16 +189,16 @@ def _flip_boxes(boxes, width):
def _get_image_with_target_size(target_size, im):
im_shape = list(im.shape)
width_diff = target_size[1] - im_shape[1]
offset_crop_width = max(-width_diff // 2, 0)
offset_pad_width = max(width_diff // 2, 0)
offset_crop_width = np.random.randint(0, max(-width_diff, 0) + 1)
offset_pad_width = np.random.randint(0, max(width_diff, 0) + 1)
height_diff = target_size[0] - im_shape[0]
offset_crop_height = max(-height_diff // 2, 0)
offset_pad_height = max(height_diff // 2, 0)
offset_crop_height = np.random.randint(0, max(-height_diff, 0) + 1)
offset_pad_height = np.random.randint(0, max(height_diff, 0) + 1)
im_shape[0 : 2] = target_size
new_im = np.empty(im_shape, dtype=im.dtype)
new_im.fill(127)
new_im[:] = cfg.PIXEL_MEANS
new_im[offset_pad_height:offset_pad_height + im.shape[0],
offset_pad_width:offset_pad_width + im.shape[1]] = \
......
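The centered crop/pad offsets become uniformly random ones, which is the RandomCrop/RandomPad feature from the changelog. A self-contained mirror of the new sampling for one axis, with a worked case:

```python
import numpy as np


def random_crop_pad_offsets(target, actual):
    # Mirrors the new offset sampling for a single axis.
    diff = target - actual
    crop = np.random.randint(0, max(-diff, 0) + 1)  # used when actual > target
    pad = np.random.randint(0, max(diff, 0) + 1)    # used when actual < target
    return crop, pad


print(random_crop_pad_offsets(300, 400))  # crop in [0, 100], pad == 0
print(random_crop_pad_offsets(300, 250))  # crop == 0, pad in [0, 50]
```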
......@@ -11,8 +11,11 @@
# Import custom modules
from lib.modeling.base import Bootstarp
from lib.modeling.base import RPNDecoder
from lib.modeling.base import RetinaNetDecoder
from lib.modeling.base import conv1x1, conv3x3, bn, affine
from lib.modeling.fpn import FPN
from lib.modeling.rpn import RPN
from lib.modeling.fast_rcnn import FastRCNN
from lib.modeling.retinanet import RetinaNet
from lib.modeling.ssd import SSD
\ No newline at end of file
from lib.modeling.ssd import SSD
......@@ -15,16 +15,16 @@ from __future__ import print_function
import dragon.vm.torch as torch
from lib.modeling.base import conv1x1, conv3x3, bn
from lib.modeling import conv1x1, conv3x3, bn, affine
class WideResBlock(torch.nn.Module):
def __init__(self, dim_in, dim_out, stride=1, downsample=None):
super(WideResBlock, self).__init__()
self.conv1 = conv3x3(dim_in, dim_out, stride)
self.bn1 = bn(dim_out, eps=1e-3)
self.bn1 = affine(dim_out)
self.conv2 = conv3x3(dim_out, dim_out)
self.bn2 = bn(dim_out, eps=1e-3)
self.bn2 = affine(dim_out)
self.downsample = downsample
self.relu = torch.nn.ReLU(inplace=True)
......@@ -50,15 +50,15 @@ class InceptionBlock(torch.nn.Module):
def __init__(self, dim_in, dim_out):
super(InceptionBlock, self).__init__()
self.conv1 = conv1x1(dim_in, dim_out)
self.bn1 = bn(dim_out, eps=1e-3)
self.bn1 = affine(dim_out)
self.conv2 = conv3x3(dim_out, dim_out // 2)
self.bn2 = bn(dim_out // 2, eps=1e-3)
self.bn2 = affine(dim_out // 2)
self.conv3a = conv3x3(dim_out // 2, dim_out)
self.bn3a = bn(dim_out, eps=1e-3)
self.bn3a = affine(dim_out)
self.conv3b = conv3x3(dim_out, dim_out)
self.bn3b = bn(dim_out, eps=1e-3)
self.bn3b = affine(dim_out)
self.conv4 = conv3x3(dim_out * 3, dim_out)
self.bn4 = bn(dim_out, eps=1e-3)
self.bn4 = affine(dim_out)
self.relu = torch.nn.ReLU(inplace=True)
def forward(self, x):
......@@ -93,7 +93,8 @@ class AirNet(torch.nn.Module):
def __init__(self, blocks, num_stages):
super(AirNet, self).__init__()
self.dim_in, filters = 64, [64, 128, 256, 384]
self.feature_dims = filters[1:num_stages - 1]
self.feature_dims = [None, None] + \
filters[1:num_stages - 1]
self.conv1 = torch.nn.Conv2d(
3, 64,
kernel_size=7,
......@@ -101,7 +102,7 @@ class AirNet(torch.nn.Module):
padding=3,
bias=False,
)
self.bn1 = bn(self.dim_in, eps=1e-3)
self.bn1 = affine(self.dim_in)
self.relu = torch.nn.ReLU(inplace=True)
self.maxpool = torch.nn.MaxPool2d(
kernel_size=2,
......@@ -128,7 +129,7 @@ class AirNet(torch.nn.Module):
def make_blocks(self, dim_out, blocks, stride=1):
downsample = torch.nn.Sequential(
conv1x1(self.dim_in, dim_out, stride=stride),
bn(dim_out, eps=1e-3),
affine(dim_out),
)
layers = [WideResBlock(self.dim_in, dim_out, stride, downsample)]
self.dim_in = dim_out
......@@ -148,7 +149,7 @@ class AirNet(torch.nn.Module):
x = self.maxpool(x)
x = self.layer1(x)
outputs = [self.layer2(x)]
outputs = [None, None, self.layer2(x)]
if hasattr(self, 'layer3'): outputs += [self.layer3(outputs[-1])]
if hasattr(self, 'layer4'): outputs += [self.layer4(outputs[-1])]
......@@ -164,7 +165,7 @@ def airnet(num_stages):
)
return AirNet(blocks, num_stages)
def make_airnet_(): return airnet(5)
def make_airnet_3b(): return airnet(3)
def make_airnet_4b(): return airnet(4)
def make_airnet_5b(): return airnet(5)
\ No newline at end of file
......@@ -44,20 +44,21 @@ class Bootstarp(torch.nn.Module):
return self.run(inputs, outputs)
class ProposalCXX(torch.nn.Module):
"""Extended operator to generate proposal regions."""
class RPNDecoder(torch.nn.Module):
"""Generate proposal regions from RPN."""
def __init__(self):
super(ProposalCXX, self).__init__()
super(RPNDecoder, self).__init__()
self.register_op()
self.K = (cfg.FPN.ROI_MAX_LEVEL -
cfg.FPN.ROI_MIN_LEVEL + 1) \
cfg.FPN.ROI_MIN_LEVEL + 1) \
if len(cfg.RPN.STRIDES) > 1 else 1
def register_op(self):
self.op_meta = {
'op_type': 'Proposal',
'arguments': {
'det_type': 'RCNN',
'strides': cfg.RPN.STRIDES,
'ratios': [float(e) for e in cfg.RPN.ASPECT_RATIOS],
'scales': [float(e) for e in cfg.RPN.SCALES],
......@@ -79,6 +80,38 @@ class ProposalCXX(torch.nn.Module):
return outputs if isinstance(outputs, list) else [outputs]
class RetinaNetDecoder(torch.nn.Module):
"""Generate proposal regions from retinanet."""
def __init__(self):
super(RetinaNetDecoder, self).__init__()
k_max, k_min = cfg.FPN.RPN_MAX_LEVEL, cfg.FPN.RPN_MIN_LEVEL
scales_per_octave = cfg.RETINANET.SCALES_PER_OCTAVE
self.strides = [int(2. ** lvl) for lvl in range(k_min, k_max + 1)]
self.scales = [cfg.RETINANET.ANCHOR_SCALE *
(2 ** (octave / float(scales_per_octave)))
for octave in range(scales_per_octave)]
self.register_op()
def register_op(self):
self.op_meta = {
'op_type': 'Proposal',
'arguments': {
'det_type': 'RETINANET',
'strides': self.strides,
'scales': self.scales,
'ratios': [float(e) for e in cfg.RETINANET.ASPECT_RATIOS],
'pre_nms_top_n': cfg.RETINANET.PRE_NMS_TOP_N,
'score_thresh': cfg.TEST.SCORE_THRESH,
}
}
def forward(self, features, cls_prob, bbox_pred, ims_info):
inputs = features + [cls_prob, bbox_pred, to_tensor(ims_info)]
outputs = [self.register_output()]
return self.run(inputs, outputs)
def conv1x1(dim_in, dim_out, stride=1, bias=False):
"""1x1 convolution."""
return torch.nn.Conv2d(
......
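As a sanity check on RetinaNetDecoder's anchor setup, the stride/scale derivation above reduces to pure arithmetic. With RPN_MIN_LEVEL=3, RPN_MAX_LEVEL=7 and ANCHOR_SCALE=4 from the configs above, and assuming SCALES_PER_OCTAVE=3 (the usual default; not shown in this diff):

```python
k_min, k_max = 3, 7
scales_per_octave, anchor_scale = 3, 4

# Same expressions as in RetinaNetDecoder.__init__ above.
strides = [int(2. ** lvl) for lvl in range(k_min, k_max + 1)]
scales = [anchor_scale * (2 ** (octave / float(scales_per_octave)))
          for octave in range(scales_per_octave)]

print(strides)  # [8, 16, 32, 64, 128]
print(scales)   # [4.0, ~5.04, ~6.35]
```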
......@@ -22,9 +22,12 @@ from lib.utils.logger import is_root
from lib.modeling.factory import get_body_func
from lib.modeling import (
Bootstarp, FPN, RPN,
Bootstarp,
FPN,
RPN,
FastRCNN,
RetinaNet, SSD,
RetinaNet,
SSD,
)
......@@ -144,6 +147,7 @@ class Detector(torch.nn.Module):
# 3.3 Feature -> SSD
if hasattr(self, 'ssd'):
features = list(filter(None, features))
outputs.update(
self.ssd(
features=features,
......
......@@ -55,7 +55,7 @@ for D in [16, 19]:
'lib.modeling.vgg.make_vgg_{}{}'.format(D, T)
# AirNet
for D in ['3b', '4b', '5b']:
for D in ['', '3b', '4b', '5b']:
_STORE['BODY']['airnet{}'.format(D)] = \
'lib.modeling.airnet.make_airnet_{}'.format(D)
......
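With the empty suffix registered, a BACKBONE such as 'airnet.fpn' can now resolve to the default 5-stage AirNet. A hedged sketch of the lookup; splitting the BACKBONE string on '.' is an assumption about how get_body_func consumes it, not confirmed by this diff:

```python
_STORE = {'BODY': {}}
for D in ['', '3b', '4b', '5b']:
    _STORE['BODY']['airnet{}'.format(D)] = \
        'lib.modeling.airnet.make_airnet_{}'.format(D)

# 'airnet.fpn' -> body 'airnet' (plus an FPN neck), which now maps
# to 'lib.modeling.airnet.make_airnet_', i.e. the 5-stage AirNet.
body = 'airnet.fpn'.split('.')[0]
print(_STORE['BODY'][body])  # lib.modeling.airnet.make_airnet_
```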
......@@ -17,7 +17,7 @@ import dragon.vm.torch as torch
from collections import OrderedDict
from lib.core.config import cfg
from lib.modeling.base import ProposalCXX
from lib.modeling import RPNDecoder
class FastRCNN(torch.nn.Module):
......@@ -43,7 +43,7 @@ class FastRCNN(torch.nn.Module):
self.fc7 = torch.nn.Linear(cfg.FRCNN.MLP_HEAD_DIM, cfg.FRCNN.MLP_HEAD_DIM)
self.cls_score = torch.nn.Linear(cfg.FRCNN.MLP_HEAD_DIM, cfg.MODEL.NUM_CLASSES)
self.bbox_pred = torch.nn.Linear(cfg.FRCNN.MLP_HEAD_DIM, cfg.MODEL.NUM_CLASSES * 4)
self.proposal_cxx = ProposalCXX()
self.rpn_decoder = RPNDecoder()
self.proposal_layer = ProposalLayer()
self.proposal_target_layer = ProposalTargetLayer()
self.softmax = torch.nn.Softmax(dim=1)
......@@ -80,7 +80,7 @@ class FastRCNN(torch.nn.Module):
# Generate Proposals
# Apply the CXX implementation during inference
proposal_func = self.proposal_layer \
if self.training else self.proposal_cxx
if self.training else self.rpn_decoder
self.rcnn_data = {
'rois': proposal_func(
kwargs['features'],
......
......@@ -16,7 +16,7 @@ from __future__ import print_function
import dragon.vm.torch as torch
from lib.core.config import cfg
from lib.modeling.base import conv1x1, conv3x3
from lib.modeling import conv1x1, conv3x3
HIGHEST_BACKBONE_LVL = 5 # E.g., "conv5"-like level
......
......@@ -20,12 +20,10 @@ from __future__ import print_function
import dragon.vm.torch as torch
from lib.core.config import cfg
from lib.modeling.base import conv1x1, conv3x3, affine
from lib.modeling import conv1x1, conv3x3, affine
class BasicBlock(torch.nn.Module):
expansion = 1
def __init__(self, dim_in, dim_out, stride=1,
downsample=None, dropblock=None):
super(BasicBlock, self).__init__()
......@@ -110,9 +108,9 @@ class Bottleneck(torch.nn.Module):
class ResNet(torch.nn.Module):
def __init__(self, block, layers):
def __init__(self, block, layers, filters):
super(ResNet, self).__init__()
self.dim_in, filters = 64, [256, 512, 1024, 2048]
self.dim_in, filters = filters[0], filters[1:]
self.feature_dims = [self.dim_in] + filters
self.conv1 = torch.nn.Conv2d(
3, 64,
......@@ -200,9 +198,13 @@ def resnet(depth):
elif depth == 269: units = [3, 30, 48, 8]
else: raise ValueError('Unsupported depth: %d' % depth)
block = Bottleneck if depth >= 50 else BasicBlock
return ResNet(block, units)
filters = [64, 256, 512, 1024, 2048] \
if depth >= 50 else [64, 64, 128, 256, 512]
return ResNet(block, units, filters)
def make_resnet_18(): return resnet(18)
def make_resnet_34(): return resnet(34)
def make_resnet_50(): return resnet(50)
def make_resnet_101(): return resnet(101)
def make_resnet_152(): return resnet(152)
\ No newline at end of file
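The new filters argument is what lets ResNet-18/34 (BasicBlock, expansion 1) share the ResNet class with the bottleneck variants. A quick mirror of the selection in resnet() and the feature dims it produces:

```python
def resnet_dims(depth):
    # Mirrors the new selection in resnet(): shallow nets keep the
    # BasicBlock widths, deep nets the 4x-expanded bottleneck widths.
    filters = [64, 256, 512, 1024, 2048] \
        if depth >= 50 else [64, 64, 128, 256, 512]
    dim_in, stage_filters = filters[0], filters[1:]
    return [dim_in] + stage_filters  # feature_dims, as in ResNet.__init__


print(resnet_dims(18))   # [64, 64, 128, 256, 512]
print(resnet_dims(101))  # [64, 256, 512, 1024, 2048]
```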
......@@ -18,8 +18,8 @@ import dragon.vm.torch as torch
from collections import OrderedDict
from lib.core.config import cfg
from lib.modeling.base import conv3x3
from lib.retinanet import AnchorTargetLayer, ProposalLayer
from lib.modeling import conv3x3, RetinaNetDecoder
from lib.retinanet import AnchorTargetLayer
class RetinaNet(torch.nn.Module):
......@@ -37,32 +37,23 @@ class RetinaNet(torch.nn.Module):
conv3x3(dim_in, dim_in, bias=True)
for _ in range(cfg.RETINANET.NUM_CONVS))
# Packed as [C, A] not [A, C]
self.C = cfg.MODEL.NUM_CLASSES \
if cfg.RETINANET.SOFTMAX \
else cfg.MODEL.NUM_CLASSES - 1
self.C = cfg.MODEL.NUM_CLASSES - 1
A = len(cfg.RETINANET.ASPECT_RATIOS) * \
cfg.RETINANET.SCALES_PER_OCTAVE
self.cls_score = conv3x3(dim_in, self.C * A, bias=True)
self.bbox_pred = conv3x3(dim_in, 4 * A, bias=True)
self.cls_prob = torch.nn.Softmax(dim=1, inplace=True) \
if cfg.RETINANET.SOFTMAX else torch.nn.Sigmoid(inplace=True)
self.cls_prob = torch.nn.Sigmoid(inplace=True)
self.relu = torch.nn.ELU(inplace=True)
self.proposal_layer = ProposalLayer()
self.decoder = RetinaNetDecoder()
########################################
# RetinaNet losses #
########################################
self.anchor_target_layer = AnchorTargetLayer()
if cfg.RETINANET.SOFTMAX:
self.cls_loss = torch.nn.SoftmaxFocalLoss(
ignore_index=-1,
alpha=cfg.MODEL.FOCAL_LOSS_ALPHA,
gamma=cfg.MODEL.FOCAL_LOSS_GAMMA)
else:
self.cls_loss = torch.nn.SigmoidFocalLoss(
alpha=cfg.MODEL.FOCAL_LOSS_ALPHA,
gamma=cfg.MODEL.FOCAL_LOSS_GAMMA)
self.cls_loss = torch.nn.SigmoidFocalLoss(
alpha=cfg.MODEL.FOCAL_LOSS_ALPHA,
gamma=cfg.MODEL.FOCAL_LOSS_GAMMA)
self.bbox_loss = torch.nn.SmoothL1Loss(beta=1. / 9.)
self.reset_parameters()
......@@ -77,15 +68,8 @@ class RetinaNet(torch.nn.Module):
# Bias prior initialization for Focal Loss
# For details, see the official code:
# https://github.com/facebookresearch/Detectron
if cfg.RETINANET.SOFTMAX:
bias = self.cls_score.bias.numpy()
bias = bias.reshape((cfg.MODEL.NUM_CLASSES, -1))
bias[0, :] = math.log(
(cfg.MODEL.NUM_CLASSES - 1) *
(1 - cfg.PRIOR_PROB) / cfg.PRIOR_PROB)
else:
self.cls_score.bias.fill_(
-math.log((1 - cfg.PRIOR_PROB) / cfg.PRIOR_PROB))
self.cls_score.bias.fill_(
-math.log((1 - cfg.PRIOR_PROB) / cfg.PRIOR_PROB))
def compute_outputs(self, features):
"""Compute the RetinaNet logits.
......@@ -114,9 +98,12 @@ class RetinaNet(torch.nn.Module):
return cls_score_wide[0], bbox_pred_wide[0]
def compute_losses(
self, features,
cls_score, bbox_pred,
gt_boxes, ims_info,
self,
features,
cls_score,
bbox_pred,
gt_boxes,
ims_info,
):
"""Compute the RetinaNet classification loss and regression loss.
......@@ -173,9 +160,10 @@ class RetinaNet(torch.nn.Module):
)
else:
outputs['detections'] = \
self.proposal_layer(
self.decoder(
kwargs['features'],
self.cls_prob(cls_score),
self.cls_prob(cls_score)
.permute(0, 2, 1),
bbox_pred,
kwargs['ims_info'],
)
......
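With the softmax branch gone, only the sigmoid bias prior remains in reset_parameters above. The init solves sigmoid(b) = PRIOR_PROB for b, i.e. b = -log((1 - p) / p), so every anchor starts with a ~1% foreground probability. A quick verification:

```python
import math

prior_prob = 0.01  # cfg.PRIOR_PROB
bias = -math.log((1 - prior_prob) / prior_prob)
print(bias)  # ~ -4.595

sigmoid = 1. / (1. + math.exp(-bias))
print(sigmoid)  # ~ 0.01, the intended prior
```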
......@@ -17,7 +17,7 @@ import dragon.vm.torch as torch
from collections import OrderedDict
from lib.core.config import cfg
from lib.modeling.base import conv1x1, conv3x3
from lib.modeling import conv1x1, conv3x3
class RPN(torch.nn.Module):
......@@ -59,7 +59,6 @@ class RPN(torch.nn.Module):
for m in self.modules():
if isinstance(m, torch.nn.Conv2d):
torch.nn.init.normal_(m.weight, std=0.01)
torch.nn.init.constant_(m.bias, 0)
def compute_outputs(self, features):
"""Compute the RPN logits.
......@@ -91,9 +90,12 @@ class RPN(torch.nn.Module):
return cls_score_wide[0], bbox_pred_wide[0]
def compute_losses(
self, features,
cls_score, bbox_pred,
gt_boxes, ims_info,
self,
features,
cls_score,
bbox_pred,
gt_boxes,
ims_info,
):
"""Compute the RPN classification loss and regression loss.
......
......@@ -17,11 +17,13 @@ import dragon.vm.torch as torch
from collections import OrderedDict
from lib.core.config import cfg
from lib.modeling.base import conv3x3
from lib.modeling import conv3x3
from lib.ssd import (
PriorBoxLayer, MultiBoxMatchLayer,
HardMiningLayer, MultiBoxTargetLayer,
PriorBoxLayer,
MultiBoxMatchLayer,
HardMiningLayer,
MultiBoxTargetLayer,
)
......@@ -38,6 +40,8 @@ class SSD(torch.nn.Module):
self.softmax = torch.nn.Softmax(dim=2)
C = cfg.MODEL.NUM_CLASSES
feature_dims = list(filter(None, feature_dims))
for i, dim_in in enumerate(feature_dims):
A = len(cfg.SSD.MULTIBOX.ASPECT_RATIOS[i]) + 1
self.cls_score.append(conv3x3(dim_in, A * C, bias=True))
......@@ -89,8 +93,12 @@ class SSD(torch.nn.Module):
torch.cat(bbox_pred_wide, dim=1).view(0, -1, 4)
def compute_losses(
self, prior_boxes, gt_boxes,
cls_score, bbox_pred, cls_prob,
self,
prior_boxes,
gt_boxes,
cls_score,
bbox_pred,
cls_prob,
):
"""Compute the SSD classification loss and regression loss.
......
......@@ -16,7 +16,7 @@ from __future__ import print_function
import dragon.vm.torch as torch
from lib.core.config import cfg
from lib.modeling.base import conv1x1, conv3x3
from lib.modeling import conv1x1, conv3x3
class VGG(torch.nn.Module):
......
......@@ -141,7 +141,7 @@ def test_net(net, server):
keep = soft_nms(cls_dets, cfg.TEST.NMS,
method=cfg.TEST.SOFT_NMS_METHOD,
sigma=cfg.TEST.SOFT_NMS_SIGMA)
else: keep = nms(cls_dets, cfg.TEST.NMS)
else: keep = nms(cls_dets, cfg.TEST.NMS, force_cpu=True)
cls_dets = cls_dets[keep, :]
all_boxes[j][i] = cls_dets
boxes_this_image.append(cls_dets)
......
......@@ -43,7 +43,7 @@ class Expander(object):
w_off = int(math.floor(npr.uniform(0.0, expand_w - im_w)))
new_im = np.empty((expand_h, expand_w, 3), dtype=np.uint8)
new_im.fill(127)
new_im[:] = cfg.PIXEL_MEANS
new_im[h_off: h_off + im_h, w_off: w_off + im_w, :] = im
if gt_boxes is not None:
......