Adapt to the latest dragon preview version

Summary: This commit updates the repository to match dragon 0.3.0.dev20200707.

Adapt to the latest dragon preview version
Summary: This commit updates the repository to match dragon 0.3.0.dev20200707.
Ting PAN
Commit 8558d3df authored Jul 07, 2020 by Ting PAN
Showing with 711 additions and 516 deletions
CHANGES
README.md
configs/faster_rcnn/coco_faster_rcnn_R-101-FPN_1x.yml
configs/faster_rcnn/coco_faster_rcnn_R-101-FPN_2x.yml
configs/faster_rcnn/voc_faster_rcnn_R-50-FPN.yml
configs/faster_rcnn/voc_faster_rcnn_VGG-16-C4.yml
configs/retinanet/coco_retinanet_416_R-50-FPN.yml
configs/retinanet/voc_retinanet_320_AirNet-FPN.yml
configs/retinanet/voc_retinanet_320_R-50-FPN.yml
configs/ssd/voc_ssd_300_VGG-16.yml
configs/ssd/voc_ssd_300_AirNet-5b.yml → configs/ssd/voc_ssd_320_AirNet-FPN.yml
configs/ssd/voc_ssd_320_R-50-FPN.yml
csrc/cxx/.clang-format
csrc/cxx/operators/nms_op.cc
csrc/cxx/operators/nms_op.h
csrc/cxx/operators/retinanet_decoder_op.cc
csrc/cxx/operators/retinanet_decoder_op.h
csrc/cxx/operators/rpn_decoder_op.cc
csrc/cxx/operators/rpn_decoder_op.h
csrc/cxx/setup.py
--- a/CHANGES
+++ b/CHANGES
 ------------------------------------------------------------------------
 The list of most significant changes made over time in SeetaDet.

+SeetaDet 0.4.2 (20200707)
+
+Dragon Minimum Required (Version 0.3.0.dev20200707)
+
+Changes:
+
+- Adapt to the latest dragon preview version.
+
+Preview Features:
+
+- None
+
+Bugs fixed:
+
+- None
+
+------------------------------------------------------------------------
+
 SeetaDet 0.4.1 (20200421)

 Dragon Minimum Required (Version 0.3.0.dev20200421)

--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ The torch-style codes help us to simplify the hierarchical pipeline of modern de

 ## Requirements

-seeta-dragon >= 0.3.0.dev20200421
+seeta-dragon >= 0.3.0.dev20200707

 ## Installation


--- a/configs/faster_rcnn/coco_faster_rcnn_R-101-FPN_1x.yml
+++ b/configs/faster_rcnn/coco_faster_rcnn_R-101-FPN_1x.yml
@@ -32,16 +32,17 @@ FRCNN:
 TRAIN:
  WEIGHTS: '/model/R-101.Affine.pth'
  DATASET: '/data/coco_2014_trainval35k'
-  USE_DIFF: False # Do not use crowd objects
  IMS_PER_BATCH: 2
  BATCH_SIZE: 512
  SCALES: [800]
  MAX_SIZE: 1333
+  USE_DIFF: False # Do not use crowd objects
 TEST:
  DATASET: '/data/coco_2014_minival'
  JSON_FILE: '/data/instances_minival2014.json'
  PROTOCOL: 'coco'
-  RPN_POST_NMS_TOP_N: 1000
  SCALES: [800]
  MAX_SIZE: 1333
  NMS: 0.5
+  RPN_POST_NMS_TOP_N: 1000
+
--- a/configs/faster_rcnn/coco_faster_rcnn_R-101-FPN_2x.yml
+++ b/configs/faster_rcnn/coco_faster_rcnn_R-101-FPN_2x.yml
@@ -32,16 +32,16 @@ FRCNN:
 TRAIN:
  WEIGHTS: '/model/R-101.Affine.pth'
  DATASET: '/data/coco_2014_trainval35k'
-  USE_DIFF: False # Do not use crowd objects
  IMS_PER_BATCH: 2
  BATCH_SIZE: 512
  SCALES: [800]
  MAX_SIZE: 1333
+  USE_DIFF: False # Do not use crowd objects
 TEST:
  DATASET: '/data/coco_2014_minival'
  JSON_FILE: '/data/instances_minival2014.json'
  PROTOCOL: 'coco'
-  RPN_POST_NMS_TOP_N: 1000
  SCALES: [800]
  MAX_SIZE: 1333
  NMS: 0.5
+  RPN_POST_NMS_TOP_N: 1000
--- a/configs/faster_rcnn/voc_faster_rcnn_R-50-FPN.yml
+++ b/configs/faster_rcnn/voc_faster_rcnn_R-50-FPN.yml
@@ -30,7 +30,7 @@ TRAIN:
 TEST:
  DATASET: '/data/voc_2007_test'
  PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
-  RPN_POST_NMS_TOP_N: 1000
  SCALES: [600]
  MAX_SIZE: 1000
-  NMS: 0.45
\ No newline at end of file
+  NMS: 0.45
+  RPN_POST_NMS_TOP_N: 1000
\ No newline at end of file
--- a/configs/faster_rcnn/voc_faster_rcnn_VGG-16-C4.yml
+++ b/configs/faster_rcnn/voc_faster_rcnn_VGG-16-C4.yml
@@ -29,16 +29,16 @@ FRCNN:
 TRAIN:
  WEIGHTS: '/model/VGG16.RCNN.pth'
  DATASET: '/data/voc_0712_trainval'
-  RPN_MIN_SIZE: 16
  IMS_PER_BATCH: 2
  BATCH_SIZE: 128
  SCALES: [600]
  MAX_SIZE: 1000
+  RPN_MIN_SIZE: 16
 TEST:
  DATASET: '/data/voc_2007_test'
  PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
-  RPN_MIN_SIZE: 16
-  RPN_POST_NMS_TOP_N: 300
  SCALES: [600]
  MAX_SIZE: 1000
-  NMS: 0.45
\ No newline at end of file
+  RPN_MIN_SIZE: 16
+  NMS: 0.45
+  RPN_POST_NMS_TOP_N: 300
\ No newline at end of file
--- a/configs/retinanet/coco_retinanet_416_R-50-FPN.yml
+++ b/configs/retinanet/coco_retinanet_416_R-50-FPN.yml
@@ -32,11 +32,11 @@ FPN:
 TRAIN:
  WEIGHTS: '/model/R-50.Affine.pth'
  DATASET: '/data/coco_2014_trainval35k'
-  USE_DIFF: False  # Do not use crowd objects
-  USE_COLOR_JITTER: True
  IMS_PER_BATCH: 16
  SCALES: [416]
  RANDOM_SCALES: [0.25, 1.0]
+  USE_DIFF: False  # Do not use crowd objects
+  USE_COLOR_JITTER: False
 TEST:
  DATASET: '/data/coco_2014_minival'
  JSON_FILE: '/data/instances_minival2014.json'

--- a/configs/retinanet/voc_retinanet_320_AirNet-FPN.yml
+++ b/configs/retinanet/voc_retinanet_320_AirNet-FPN.yml
@@ -23,10 +23,10 @@ FPN:
 TRAIN:
  WEIGHTS: '/model/AirNet.Affine.pth'
  DATASET: '/data/voc_0712_trainval'
-  USE_COLOR_JITTER: True
  IMS_PER_BATCH: 32
  SCALES: [320]
  RANDOM_SCALES: [0.25, 1.0]
+  USE_COLOR_JITTER: True
 TEST:
  DATASET: '/data/voc_2007_test'
  PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'

--- a/configs/retinanet/voc_retinanet_320_R-50-FPN.yml
+++ b/configs/retinanet/voc_retinanet_320_R-50-FPN.yml
@@ -24,10 +24,10 @@ FPN:
 TRAIN:
  WEIGHTS: '/model/R-50.Affine.pth'
  DATASET: '/data/voc_0712_trainval'
-  USE_COLOR_JITTER: True
  IMS_PER_BATCH: 32
  SCALES: [320]
  RANDOM_SCALES: [0.25, 2.0]
+  USE_COLOR_JITTER: True
 TEST:
  DATASET: '/data/voc_2007_test'
  PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'

--- a/configs/ssd/voc_ssd_300_VGG-16.yml
+++ b/configs/ssd/voc_ssd_300_VGG-16.yml
@@ -38,6 +38,7 @@ TRAIN:
  IMS_PER_BATCH: 32
  SCALES: [300]
  RANDOM_SCALES: [0.25, 1.00]
+  USE_COLOR_JITTER: True
 TEST:
  DATASET: '/data/voc_2007_test'
  PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'

--- a/configs/ssd/voc_ssd_300_AirNet-5b.yml
+++ b/configs/ssd/voc_ssd_320_AirNet-FPN.yml
@@ -3,7 +3,7 @@ VIS: False
 ENABLE_TENSOR_BOARD: False
 MODEL:
  TYPE: ssd
-  BACKBONE: airnet5b.mbox
+  BACKBONE: airnet.fpn
  CLASSES: ['__background__',
            'aeroplane', 'bicycle', 'bird', 'boat',
            'bottle', 'bus', 'car', 'cat', 'chair',
@@ -17,19 +17,30 @@ SOLVER:
  MAX_STEPS: 120000
  SNAPSHOT_EVERY: 5000
  SNAPSHOT_PREFIX: voc_ssd_320
+FPN:
+  RPN_MIN_LEVEL: 3
+  RPN_MAX_LEVEL: 8
 SSD:
  NUM_CONVS: 2
  MULTIBOX:
-    STRIDES: [8, 16, 32]
-    MIN_SIZES: [30, 90, 150]
-    MAX_SIZES: [90, 150, 210]
-    ASPECT_RATIOS: [[1, 2, 0.5], [1, 2, 0.5], [1, 2, 0.5]]
+    STRIDES: [8, 16, 32, 64, 100, 300]
+    MIN_SIZES: [30, 60, 110, 162, 213, 264]
+    MAX_SIZES: [60, 110, 162, 213, 264, 315]
+    ASPECT_RATIOS: [
+      [1, 2, 0.5],
+      [1, 2, 0.5, 3, 0.33],
+      [1, 2, 0.5, 3, 0.33],
+      [1, 2, 0.5, 3, 0.33],
+      [1, 2, 0.5],
+      [1, 2, 0.5],
+    ]
 TRAIN:
  WEIGHTS: '/model/AirNet.Affine.pth'
  DATASET: '/data/voc_0712_trainval'
+  IMS_PER_BATCH: 32
  SCALES: [320]
  RANDOM_SCALES: [0.25, 1.00]
-  IMS_PER_BATCH: 32
+  USE_COLOR_JITTER: True
 TEST:
  DATASET: '/data/voc_2007_test'
  PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'

--- a/configs/ssd/voc_ssd_320_R-50-FPN.yml
+++ b/configs/ssd/voc_ssd_320_R-50-FPN.yml
@@ -37,9 +37,10 @@ SSD:
 TRAIN:
  WEIGHTS: '/model/R-50.Affine.pth'
  DATASET: '/data/voc_0712_trainval'
+  IMS_PER_BATCH: 32
  SCALES: [320]
  RANDOM_SCALES: [0.25, 1.00]
-  IMS_PER_BATCH: 32
+  USE_COLOR_JITTER: True
 TEST:
  DATASET: '/data/voc_2007_test'
  PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'

--- a/csrc/cxx/.clang-format
+++ b/csrc/cxx/.clang-format
+---
+AccessModifierOffset: -1
+AlignAfterOpenBracket: AlwaysBreak
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlinesLeft: true
+AlignOperands: false
+AlignTrailingComments: false
+AllowAllParametersOfDeclarationOnNextLine: false
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: Empty
+AllowShortIfStatementsOnASingleLine: true
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: true
+AlwaysBreakTemplateDeclarations: true
+BinPackArguments: false
+BinPackParameters: false
+BraceWrapping:
+  AfterClass: false
+  AfterControlStatement: false
+  AfterEnum: false
+  AfterFunction: false
+  AfterNamespace: false
+  AfterObjCDeclaration: false
+  AfterStruct: false
+  AfterUnion: false
+  BeforeCatch: false
+  BeforeElse: false
+  IndentBraces: false
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Attach
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: false
+ColumnLimit: 80
+CommentPragmas: '^ IWYU pragma:'
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: false
+DisableFormat: false
+ForEachMacros: [ FOR_EACH_RANGE, FOR_EACH, ]
+IncludeCategories:
+  - Regex: '^<.*\.h(pp)?>'
+    Priority: 1
+  - Regex: '^<.*'
+    Priority: 2
+  - Regex: '.*'
+    Priority: 3
+IndentCaseLabels: true
+IndentWidth: 2
+IndentWrappedFunctionNames: false
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd: ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBlockIndentWidth: 2
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: false
+PenaltyBreakBeforeFirstCallParameter: 1
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 200
+PointerAlignment: Left
+ReflowComments: true
+SortIncludes: true
+SpaceAfterCStyleCast: false
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeParens: ControlStatements
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles: false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard: Cpp11
+TabWidth: 8
+UseTab: Never
+...
--- a/csrc/cxx/operators/nms_op.cc
+++ b/csrc/cxx/operators/nms_op.cc
-#include <dragon/core/workspace.h>
-#include <dragon/utils/math_utils.h>
-
-#include "../utils/detection_utils.h"
 #include "nms_op.h"
+#include "../utils/detection_utils.h"

 namespace dragon {

-template <class Context> template <typename T>
+template <class Context>
+template <typename T>
 void NonMaxSuppressionOp<Context>::DoRunWithType() {
-    int num_selected;
-
-    utils::detection::ApplyNMS(
-        Output(0)->count(),
-        Output(0)->count(),
-        iou_threshold_,
-        Input(0).template mutable_data<T, Context>(),
-        Output(0)->template mutable_data<int64_t, CPUContext>(),
-        num_selected, ctx()
-    );
-
-    Output(0)->Reshape({ num_selected });
+  int num_selected;
+
+  utils::detection::ApplyNMS(
+      Output(0)->count(),
+      Output(0)->count(),
+      iou_threshold_,
+      Input(0).template mutable_data<T, Context>(),
+      Output(0)->template mutable_data<int64_t, CPUContext>(),
+      num_selected,
+      ctx());
+
+  Output(0)->Reshape({num_selected});
 }

 template <class Context>
 void NonMaxSuppressionOp<Context>::RunOnDevice() {
-    CHECK(Input(0).ndim() == 2 && Input(0).dim(1) == 5)
-        << "\nThe dimensions of boxes should be (num_boxes, 5).";
-
-    Output(0)->Reshape({ Input(0).dim(0) });
+  CHECK(Input(0).ndim() == 2 && Input(0).dim(1) == 5)
+      << "\nThe dimensions of boxes should be (num_boxes, 5).";

-    DispatchHelper<TensorTypes<float>>::Call(this, Input(0));
+  Output(0)->Reshape({Input(0).dim(0)});
+  DispatchHelper<TensorTypes<float>>::Call(this, Input(0));
 }

 DEPLOY_CPU(NonMaxSuppression);
@@ -41,4 +38,4 @@ OPERATOR_SCHEMA(NonMaxSuppression).NumInputs(1).NumOutputs(1);

 NO_GRADIENT(NonMaxSuppression);

-}  // namespace dragon
+} // namespace dragon
--- a/csrc/cxx/operators/nms_op.h
+++ b/csrc/cxx/operators/nms_op.h
@@ -5,7 +5,7 @@
 * You should have received a copy of the BSD 2-Clause License
 * along with the software. If not, See,
 *
- *      <https://opensource.org/licenses/BSD-2-Clause>
+ *    <https://opensource.org/licenses/BSD-2-Clause>
 *
 * ------------------------------------------------------------
 */
@@ -20,20 +20,20 @@ namespace dragon {
 template <class Context>
 class NonMaxSuppressionOp final : public Operator<Context> {
 public:
-    NonMaxSuppressionOp(const OperatorDef& def, Workspace* ws)
-        : Operator<Context>(def, ws),
-          iou_threshold_(OpArg<float>("iou_threshold", 0.5f)) {}
-    USE_OPERATOR_FUNCTIONS;
+  NonMaxSuppressionOp(const OperatorDef& def, Workspace* ws)
+      : Operator<Context>(def, ws),
+        iou_threshold_(OpArg<float>("iou_threshold", 0.5f)) {}
+  USE_OPERATOR_FUNCTIONS;

-    void RunOnDevice() override;
+  void RunOnDevice() override;

-    template <typename T>
-    void DoRunWithType();
+  template <typename T>
+  void DoRunWithType();

 protected:
-    float iou_threshold_;
+  float iou_threshold_;
 };

-}  // namespace dragon
+} // namespace dragon

-#endif  // SEETADET_CXX_OPERATORS_NMS_OP_H_
+#endif // SEETADET_CXX_OPERATORS_NMS_OP_H_
--- a/csrc/cxx/operators/retinanet_decoder_op.cc
+++ b/csrc/cxx/operators/retinanet_decoder_op.cc
-#include <dragon/core/workspace.h>
-#include <dragon/utils/math_utils.h>
+#include <dragon/utils/math_functions.h>

 #include "../utils/detection_utils.h"
 #include "retinanet_decoder_op.h"

 namespace dragon {

-template <class Context> template <typename T>
+template <class Context>
+template <typename T>
 void RetinaNetDecoderOp<Context>::DoRunWithType() {
-    using BT = float;  // DType of BBox
-    using BC = CPUContext;  // Context of BBox
+  using BT = float; // DType of BBox
+  using BC = CPUContext; // Context of BBox

-    int feat_h, feat_w;
-    int C = Input(-3).dim(2), A, K;
-    int total_proposals = 0;
-    int num_candidates, num_boxes, num_proposals;
+  int feat_h, feat_w;
+  int C = Input(-3).dim(2), A, K;
+  int total_proposals = 0;
+  int num_candidates, num_boxes, num_proposals;

-    auto* batch_scores = Input(-3).template data<T, BC>();
-    auto* batch_deltas = Input(-2).template data<T, BC>();
-    auto* im_info = Input(-1).template data<BT, BC>();
-    auto* y = Output(0)->template mutable_data<BT, BC>();
+  auto* batch_scores = Input(-3).template data<T, BC>();
+  auto* batch_deltas = Input(-2).template data<T, BC>();
+  auto* im_info = Input(-1).template data<BT, BC>();
+  auto* y = Output(0)->template mutable_data<BT, BC>();

-    for (int n = 0; n < num_images_; ++n) {
-        BT im_h = im_info[0];
-        BT im_w = im_info[1];
-        BT im_scale_h = im_info[2];
-        BT im_scale_w = im_info[2];
-        if (Input(-1).dim(1) == 4) im_scale_w = im_info[3];
-        auto* scores = batch_scores + n * Input(-3).stride(0);
-        auto* deltas = batch_deltas + n * Input(-2).stride(0);
-        CHECK_EQ(strides_.size(), InputSize() - 3)
-            << "\nGiven " << strides_.size() << " strides "
-            << "and " << InputSize() - 3 << " features";
-        // Select the top-k candidates as proposals
-        num_boxes = Input(-3).dim(1);
-        num_candidates = Input(-3).count(1);
-        roi_indices_.resize(num_candidates);
-        num_candidates = 0;
-        for (int i = 0; i < roi_indices_.size(); ++i)
-            if (scores[i] > score_thr_)
-                roi_indices_[num_candidates++] = i;
-        scores_.resize(num_candidates);
-        for (int i = 0; i < num_candidates; ++i)
-            scores_[i] = scores[roi_indices_[i]];
-        num_proposals = std::min(
-            num_candidates,
-            (int)pre_nms_topn_
-        );
-        utils::math::ArgPartition(
-            num_candidates,
-            num_proposals,
-            true,
-            scores_.data(),
-            indices_
-        );
-        for (int i = 0; i < num_proposals; ++i)
-            indices_[i] = roi_indices_[indices_[i]];
-        // Decode the candidates
-        int base_offset = 0;
-        for (int i = 0; i < strides_.size(); i++) {
-            feat_h = Input(i).dim(2);
-            feat_w = Input(i).dim(3);
-            K = feat_h * feat_w;
-            A = int(ratios_.size() * scales_.size());
-            anchors_.resize((size_t)(A * 4));
-            utils::detection::GenerateAnchors(
-                strides_[i],
-                (int)ratios_.size(),
-                (int)scales_.size(),
-                ratios_.data(),
-                scales_.data(),
-                anchors_.data()
-            );
-            utils::detection::GenerateGridAnchors(
-                num_proposals, C, A,
-                feat_h, feat_w,
-                strides_[i],
-                base_offset,
-                anchors_.data(),
-                indices_.data(),
-                y
-            );
-            base_offset += (A * K);
-        }
-        utils::detection::GenerateMCProposals(
-            num_proposals,
-            num_boxes, C,
-            n,
-            im_h,
-            im_w,
-            im_scale_h,
-            im_scale_w,
-            scores,
-            deltas,
-            indices_.data(),
-            y
-        );
-        total_proposals += num_proposals;
-        y += (num_proposals * 7);
-        im_info += Input(-1).dim(1);
+  for (int n = 0; n < num_images_; ++n) {
+    BT im_h = im_info[0];
+    BT im_w = im_info[1];
+    BT im_scale_h = im_info[2];
+    BT im_scale_w = im_info[2];
+    if (Input(-1).dim(1) == 4) im_scale_w = im_info[3];
+    auto* scores = batch_scores + n * Input(-3).stride(0);
+    auto* deltas = batch_deltas + n * Input(-2).stride(0);
+    CHECK_EQ(strides_.size(), InputSize() - 3)
+        << "\nGiven " << strides_.size() << " strides "
+        << "and " << InputSize() - 3 << " features";
+    // Select the top-k candidates as proposals
+    num_boxes = Input(-3).dim(1);
+    num_candidates = Input(-3).count(1);
+    roi_indices_.resize(num_candidates);
+    num_candidates = 0;
+    for (int i = 0; i < roi_indices_.size(); ++i)
+      if (scores[i] > score_thr_) roi_indices_[num_candidates++] = i;
+    scores_.resize(num_candidates);
+    for (int i = 0; i < num_candidates; ++i)
+      scores_[i] = scores[roi_indices_[i]];
+    num_proposals = std::min(num_candidates, (int)pre_nms_topn_);
+    utils::math::ArgPartition(
+        num_candidates, num_proposals, true, scores_.data(), indices_);
+    for (int i = 0; i < num_proposals; ++i)
+      indices_[i] = roi_indices_[indices_[i]];
+    // Decode the candidates
+    int base_offset = 0;
+    for (int i = 0; i < strides_.size(); i++) {
+      feat_h = Input(i).dim(2);
+      feat_w = Input(i).dim(3);
+      K = feat_h * feat_w;
+      A = int(ratios_.size() * scales_.size());
+      anchors_.resize((size_t)(A * 4));
+      utils::detection::GenerateAnchors(
+          strides_[i],
+          (int)ratios_.size(),
+          (int)scales_.size(),
+          ratios_.data(),
+          scales_.data(),
+          anchors_.data());
+      utils::detection::GenerateGridAnchors(
+          num_proposals,
+          C,
+          A,
+          feat_h,
+          feat_w,
+          strides_[i],
+          base_offset,
+          anchors_.data(),
+          indices_.data(),
+          y);
+      base_offset += (A * K);
    }
+    utils::detection::GenerateMCProposals(
+        num_proposals,
+        num_boxes,
+        C,
+        n,
+        im_h,
+        im_w,
+        im_scale_h,
+        im_scale_w,
+        scores,
+        deltas,
+        indices_.data(),
+        y);
+    total_proposals += num_proposals;
+    y += (num_proposals * 7);
+    im_info += Input(-1).dim(1);
+  }

-    Output(0)->Reshape({ total_proposals, 7 });
+  Output(0)->Reshape({total_proposals, 7});
 }

 template <class Context>
 void RetinaNetDecoderOp<Context>::RunOnDevice() {
-    num_images_ = Input(0).dim(0);
-
-    CHECK_EQ(Input(-1).dim(0), num_images_)
-        << "\nExcepted " << num_images_
-        << " groups info, got "
-        << Input(-1).dim(0) << ".";
+  num_images_ = Input(0).dim(0);

-    Output(0)->Reshape({ num_images_ * pre_nms_topn_, 7 });
+  CHECK_EQ(Input(-1).dim(0), num_images_)
+      << "\nExcepted " << num_images_ << " groups info, got "
+      << Input(-1).dim(0) << ".";

-    DispatchHelper<TensorTypes<float>>::Call(this, Input(-3));
+  Output(0)->Reshape({num_images_ * pre_nms_topn_, 7});
+  DispatchHelper<TensorTypes<float>>::Call(this, Input(-3));
 }

 DEPLOY_CPU(RetinaNetDecoder);
@@ -123,8 +113,6 @@ DEPLOY_CPU(RetinaNetDecoder);
 DEPLOY_CUDA(RetinaNetDecoder);
 #endif

-OPERATOR_SCHEMA(RetinaNetDecoder)
-    .NumInputs(3, INT_MAX)
-    .NumOutputs(1, INT_MAX);
+OPERATOR_SCHEMA(RetinaNetDecoder).NumInputs(3, INT_MAX).NumOutputs(1, INT_MAX);

-}  // namespace dragon
+} // namespace dragon
--- a/csrc/cxx/operators/retinanet_decoder_op.h
+++ b/csrc/cxx/operators/retinanet_decoder_op.h
@@ -5,7 +5,7 @@
 * You should have received a copy of the BSD 2-Clause License
 * along with the software. If not, See,
 *
- *      <https://opensource.org/licenses/BSD-2-Clause>
+ *    <https://opensource.org/licenses/BSD-2-Clause>
 *
 * ------------------------------------------------------------
 */
@@ -20,27 +20,27 @@ namespace dragon {
 template <class Context>
 class RetinaNetDecoderOp final : public Operator<Context> {
 public:
-    RetinaNetDecoderOp(const OperatorDef& def, Workspace* ws)
-        : Operator<Context>(def, ws),
-          strides_(OpArgs<int64_t>("strides")),
-          ratios_(OpArgs<float>("ratios")),
-          scales_(OpArgs<float>("scales")),
-          pre_nms_topn_(OpArg<int64_t>("pre_nms_top_n", 6000)),
-          score_thr_(OpArg<float>("score_thresh", 0.05f)) {}
-    USE_OPERATOR_FUNCTIONS;
+  RetinaNetDecoderOp(const OperatorDef& def, Workspace* ws)
+      : Operator<Context>(def, ws),
+        strides_(OpArgs<int64_t>("strides")),
+        ratios_(OpArgs<float>("ratios")),
+        scales_(OpArgs<float>("scales")),
+        pre_nms_topn_(OpArg<int64_t>("pre_nms_top_n", 6000)),
+        score_thr_(OpArg<float>("score_thresh", 0.05f)) {}
+  USE_OPERATOR_FUNCTIONS;

-    void RunOnDevice() override;
+  void RunOnDevice() override;

-    template <typename T>
-    void DoRunWithType();
+  template <typename T>
+  void DoRunWithType();

 protected:
-    float score_thr_;
-    vec64_t strides_, indices_, roi_indices_;
-    vector<float> ratios_, scales_, scores_, anchors_;
-    int64_t num_images_, pre_nms_topn_;
+  float score_thr_;
+  vec64_t strides_, indices_, roi_indices_;
+  vector<float> ratios_, scales_, scores_, anchors_;
+  int64_t num_images_, pre_nms_topn_;
 };

-}  // namespace dragon
+} // namespace dragon

-#endif  // SEETADET_CXX_OPERATORS_RETINANET_DECODER_OP_H_
+#endif // SEETADET_CXX_OPERATORS_RETINANET_DECODER_OP_H_
--- a/csrc/cxx/operators/rpn_decoder_op.cc
+++ b/csrc/cxx/operators/rpn_decoder_op.cc
--- a/csrc/cxx/operators/rpn_decoder_op.h
+++ b/csrc/cxx/operators/rpn_decoder_op.h
@@ -5,7 +5,7 @@
 * You should have received a copy of the BSD 2-Clause License
 * along with the software. If not, See,
 *
- *      <https://opensource.org/licenses/BSD-2-Clause>
+ *    <https://opensource.org/licenses/BSD-2-Clause>
 *
 * ------------------------------------------------------------
 */
@@ -20,36 +20,36 @@ namespace dragon {
 template <class Context>
 class RPNDecoderOp final : public Operator<Context> {
 public:
-    RPNDecoderOp(const OperatorDef& def, Workspace* ws)
-        : Operator<Context>(def, ws),
-          strides_(OpArgs<int64_t>("strides")),
-          ratios_(OpArgs<float>("ratios")),
-          scales_(OpArgs<float>("scales")),
-          pre_nms_topn_(OpArg<int64_t>("pre_nms_top_n", 6000)),
-          post_nms_topn_(OpArg<int64_t>("post_nms_top_n", 300)),
-          nms_thr_(OpArg<float>("nms_thresh", 0.7f)),
-          min_size_(OpArg<int64_t>("min_size", 16)),
-          min_level_(OpArg<int64_t>("min_level", 2)),
-          max_level_(OpArg<int64_t>("max_level", 5)),
-          canonical_level_(OpArg<int64_t>("canonical_level", 4)),
-          canonical_scale_(OpArg<int64_t>("canonical_scale", 224)) {}
-    USE_OPERATOR_FUNCTIONS;
-
-    void RunOnDevice() override;
-
-    template <typename T>
-    void DoRunWithType();
+  RPNDecoderOp(const OperatorDef& def, Workspace* ws)
+      : Operator<Context>(def, ws),
+        strides_(OpArgs<int64_t>("strides")),
+        ratios_(OpArgs<float>("ratios")),
+        scales_(OpArgs<float>("scales")),
+        pre_nms_topn_(OpArg<int64_t>("pre_nms_top_n", 6000)),
+        post_nms_topn_(OpArg<int64_t>("post_nms_top_n", 300)),
+        nms_thr_(OpArg<float>("nms_thresh", 0.7f)),
+        min_size_(OpArg<int64_t>("min_size", 16)),
+        min_level_(OpArg<int64_t>("min_level", 2)),
+        max_level_(OpArg<int64_t>("max_level", 5)),
+        canonical_level_(OpArg<int64_t>("canonical_level", 4)),
+        canonical_scale_(OpArg<int64_t>("canonical_scale", 224)) {}
+  USE_OPERATOR_FUNCTIONS;
+
+  void RunOnDevice() override;
+
+  template <typename T>
+  void DoRunWithType();

 protected:
-    float nms_thr_;
-    vec64_t strides_, indices_, roi_indices_;
-    vector<float> ratios_, scales_, scores_, anchors_;
-    int64_t min_size_, pre_nms_topn_, post_nms_topn_;
-    int64_t num_images_, min_level_, max_level_;
-    int64_t canonical_level_, canonical_scale_;
-    Tensor proposals_;
+  float nms_thr_;
+  vec64_t strides_, indices_, roi_indices_;
+  vector<float> ratios_, scales_, scores_, anchors_;
+  int64_t min_size_, pre_nms_topn_, post_nms_topn_;
+  int64_t num_images_, min_level_, max_level_;
+  int64_t canonical_level_, canonical_scale_;
+  Tensor proposals_;
 };

-}  // namespace dragon
+} // namespace dragon

-#endif  // SEETADET_CXX_OPERATORS_RPN_DECODER_OP_H_
+#endif // SEETADET_CXX_OPERATORS_RPN_DECODER_OP_H_
--- a/csrc/cxx/setup.py
+++ b/csrc/cxx/setup.py
@@ -5,7 +5,7 @@
 # You should have received a copy of the BSD 2-Clause License
 # along with the software. If not, See,
 #
-#      <https://opensource.org/licenses/BSD-2-Clause>
+#    <https://opensource.org/licenses/BSD-2-Clause>
 #
 # ------------------------------------------------------------

@@ -15,25 +15,35 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

+import glob
 from distutils.core import setup
-from dragon.tools import cpp_extension

+from dragon.tools import cpp_extension
 if cpp_extension.CUDA_HOME is not None and \
        cpp_extension._cuda.is_available():
    Extension = cpp_extension.CUDAExtension
 else:
    Extension = cpp_extension.CppExtension

+
+def find_sources(*dirs):
+    ext_suffixes = ['.cc']
+    if Extension is cpp_extension.CUDAExtension:
+        ext_suffixes.append('.cu')
+    sources = []
+    for path in dirs:
+        for ext_suffix in ext_suffixes:
+            sources += glob.glob(
+                path + '/*' + ext_suffix,
+                recursive=True,
+            )
+    return sources
+
+
 ext_modules = [
    Extension(
        name='install.lib.modules._C',
-        sources=[
-            'utils/detection_utils.cc',
-            'utils/detection_utils.cu',
-            'operators/nms_op.cc',
-            'operators/retinanet_decoder_op.cc',
-            'operators/rpn_decoder_op.cc',
-        ],
+        sources=find_sources('**'),
    ),
 ]


--- a/csrc/cxx/utils/detection_utils.cc
+++ b/csrc/cxx/utils/detection_utils.cc
-#include <dragon/core/context.h>
 #include "detection_utils.h"
+#include <dragon/core/context.h>

 namespace dragon {

@@ -9,45 +9,46 @@ namespace detection {

 template <typename T>
 T IoU(const T A[], const T B[]) {
-    if (A[0] > B[2] || A[1] > B[3] ||
-        A[2] < B[0] || A[3] < B[1]) return 0;
-    const T x1 = std::max(A[0], B[0]);
-    const T y1 = std::max(A[1], B[1]);
-    const T x2 = std::min(A[2], B[2]);
-    const T y2 = std::min(A[3], B[3]);
-    const T width = std::max((T)0, x2 - x1 + 1);
-    const T height = std::max((T)0, y2 - y1 + 1);
-    const T area = width * height;
-    const T A_area = (A[2] - A[0] + 1) * (A[3] - A[1] + 1);
-    const T B_area = (B[2] - B[0] + 1) * (B[3] - B[1] + 1);
-    return area / (A_area + B_area - area);
+  if (A[0] > B[2] || A[1] > B[3] || A[2] < B[0] || A[3] < B[1]) return 0;
+  const T x1 = std::max(A[0], B[0]);
+  const T y1 = std::max(A[1], B[1]);
+  const T x2 = std::min(A[2], B[2]);
+  const T y2 = std::min(A[3], B[3]);
+  const T width = std::max((T)0, x2 - x1 + 1);
+  const T height = std::max((T)0, y2 - y1 + 1);
+  const T area = width * height;
+  const T A_area = (A[2] - A[0] + 1) * (A[3] - A[1] + 1);
+  const T B_area = (B[2] - B[0] + 1) * (B[3] - B[1] + 1);
+  return area / (A_area + B_area - area);
 }

-template <> void ApplyNMS<float, CPUContext>(
-    const int               num_boxes,
-    const int               max_keeps,
-    const float             thresh,
-    const float*            boxes,
-    int64_t*                keep_indices,
-    int&                    num_keep,
-    CPUContext*             ctx) {
-    int count = 0;
-    std::vector<char> is_dead(num_boxes);
-    for (int i = 0; i < num_boxes; ++i) is_dead[i] = 0;
-    for (int i = 0; i < num_boxes; ++i) {
-        if (is_dead[i]) continue;
-        keep_indices[count++] = i;
-        if (count == max_keeps) break;
-        for (int j = i + 1; j < num_boxes; ++j)
-            if (!is_dead[j] && IoU(&boxes[i * 5],
-                                   &boxes[j * 5]) > thresh)
-                is_dead[j] = 1;
-    }
-    num_keep = count;
+template <>
+void ApplyNMS<float, CPUContext>(
+    const int num_boxes,
+    const int max_keeps,
+    const float thresh,
+    const float* boxes,
+    int64_t* keep_indices,
+    int& num_keep,
+    CPUContext* ctx) {
+  int count = 0;
+  std::vector<char> is_dead(num_boxes);
+  for (int i = 0; i < num_boxes; ++i)
+    is_dead[i] = 0;
+  for (int i = 0; i < num_boxes; ++i) {
+    if (is_dead[i]) continue;
+    keep_indices[count++] = i;
+    if (count == max_keeps) break;
+    for (int j = i + 1; j < num_boxes; ++j)
+      if (!is_dead[j] && IoU(&boxes[i * 5], &boxes[j * 5]) > thresh) {
+        is_dead[j] = 1;
+      }
+  }
+  num_keep = count;
 }

-}  // namespace detection
+} // namespace detection

-}  // namespace utils
+} // namespace utils

-}  // namespace dragon
+} // namespace dragon
--- a/csrc/cxx/utils/detection_utils.cu
+++ b/csrc/cxx/utils/detection_utils.cu
@@ -9,127 +9,121 @@ namespace utils {

 namespace detection {

-#define DIV_UP(m,n) ((m) / (n) + ((m) % (n) > 0))
+#define DIV_UP(m, n) ((m) / (n) + ((m) % (n) > 0))
 #define NUM_THREADS 64

 namespace {

 template <typename T>
-__device__ bool _CheckIoU(
-    const T*               a,
-    const T*               b,
-    const float            thresh) {
-    const T x1 = max(a[0], b[0]);
-    const T y1 = max(a[1], b[1]);
-    const T x2 = min(a[2], b[2]);
-    const T y2 = min(a[3], b[3]);
-    const T width = max(T(0), x2 - x1 + 1);
-    const T height = max(T(0), y2 - y1 + 1);
-    const T inter = width * height;
-    const T Sa = (a[2] - a[0] + T(1)) * (a[3] - a[1] + T(1));
-    const T Sb = (b[2] - b[0] + T(1)) * (b[3] - b[1] + T(1));
-    return inter > thresh * (Sa + Sb - inter);
+__device__ bool _CheckIoU(const T* a, const T* b, const float thresh) {
+  const T x1 = max(a[0], b[0]);
+  const T y1 = max(a[1], b[1]);
+  const T x2 = min(a[2], b[2]);
+  const T y2 = min(a[3], b[3]);
+  const T width = max(T(0), x2 - x1 + 1);
+  const T height = max(T(0), y2 - y1 + 1);
+  const T inter = width * height;
+  const T Sa = (a[2] - a[0] + T(1)) * (a[3] - a[1] + T(1));
+  const T Sb = (b[2] - b[0] + T(1)) * (b[3] - b[1] + T(1));
+  return inter > thresh * (Sa + Sb - inter);
 }

 template <typename T>
 __global__ void _NonMaxSuppression(
-    const int               num_blocks,
-    const int               num_boxes,
-    const T                 thresh,
-    const T*                dev_boxes,
-    uint64_t*               dev_mask) {
-    const int row_start = blockIdx.y;
-    const int col_start = blockIdx.x;
-    if (row_start > col_start) return;
-
-    const int row_size = min(num_boxes - row_start * NUM_THREADS, NUM_THREADS);
-    const int col_size = min(num_boxes - col_start * NUM_THREADS, NUM_THREADS);
-
-    __shared__ T block_boxes[NUM_THREADS * 4];
-
-    if (threadIdx.x < col_size) {
-        const int c1 = threadIdx.x * 4;
-        const int c2 = (col_start * NUM_THREADS + threadIdx.x) * 5;
-        block_boxes[c1] = dev_boxes[c2];
-        block_boxes[c1 + 1] = dev_boxes[c2 + 1];
-        block_boxes[c1 + 2] = dev_boxes[c2 + 2];
-        block_boxes[c1 + 3] = dev_boxes[c2 + 3];
-    }
-
-    __syncthreads();
-
-    if (threadIdx.x < row_size) {
-        const int index = row_start * NUM_THREADS + threadIdx.x;
-        const T* dev_box = dev_boxes + index * 5;
-        unsigned long long val = 0;
-        const int start = (row_start == col_start) ? (threadIdx.x + 1) : 0;
-        for (int i = start; i < col_size; ++i) {
-            if (_CheckIoU(dev_box, block_boxes + i * 4, thresh)) {
-                val |= 1ULL << i;
-            }
-        }
-        dev_mask[index * num_blocks + col_start] = val;
+    const int num_blocks,
+    const int num_boxes,
+    const T thresh,
+    const T* dev_boxes,
+    uint64_t* dev_mask) {
+  const int row_start = blockIdx.y;
+  const int col_start = blockIdx.x;
+  if (row_start > col_start) return;
+
+  const int row_size = min(num_boxes - row_start * NUM_THREADS, NUM_THREADS);
+  const int col_size = min(num_boxes - col_start * NUM_THREADS, NUM_THREADS);
+
+  __shared__ T block_boxes[NUM_THREADS * 4];
+
+  if (threadIdx.x < col_size) {
+    const int c1 = threadIdx.x * 4;
+    const int c2 = (col_start * NUM_THREADS + threadIdx.x) * 5;
+    block_boxes[c1] = dev_boxes[c2];
+    block_boxes[c1 + 1] = dev_boxes[c2 + 1];
+    block_boxes[c1 + 2] = dev_boxes[c2 + 2];
+    block_boxes[c1 + 3] = dev_boxes[c2 + 3];
+  }
+
+  __syncthreads();
+
+  if (threadIdx.x < row_size) {
+    const int index = row_start * NUM_THREADS + threadIdx.x;
+    const T* dev_box = dev_boxes + index * 5;
+    unsigned long long val = 0;
+    const int start = (row_start == col_start) ? (threadIdx.x + 1) : 0;
+    for (int i = start; i < col_size; ++i) {
+      if (_CheckIoU(dev_box, block_boxes + i * 4, thresh)) {
+        val |= 1ULL << i;
+      }
    }
+    dev_mask[index * num_blocks + col_start] = val;
+  }
 }

-}  // namespace
-
-template <> void ApplyNMS<float, CUDAContext>(
-    const int               num_boxes,
-    const int               max_keeps,
-    const float             thresh,
-    const float*            boxes,
-    int64_t*                keep_indices,
-    int&                    num_keep,
-    CUDAContext*            ctx) {
-    const int num_blocks = DIV_UP(num_boxes, NUM_THREADS);
-
-    vector<uint64_t> mask_host(num_boxes * num_blocks);
-    auto* mask_dev = (uint64_t*)ctx->New(mask_host.size() * sizeof(uint64_t));
-    
-    _NonMaxSuppression
-         <<< dim3(num_blocks, num_blocks), NUM_THREADS,
-             0, ctx->cuda_stream() >>>(
-        num_blocks,
-        num_boxes,
-        thresh,
-        boxes,
-        mask_dev
-    );
-    
-    CUDA_CHECK(cudaMemcpyAsync(
-        mask_host.data(),
-        mask_dev,
-        mask_host.size() * sizeof(uint64_t),
-        cudaMemcpyDeviceToHost,
-        ctx->cuda_stream()
-    ));
-
-    ctx->FinishDeviceComputation();
-
-    vector<uint64_t> dead_bit(num_blocks);
-    memset(&dead_bit[0], 0, sizeof(uint64_t) * num_blocks);
-
-    int num_selected = 0;
-    for (int i = 0; i < num_boxes; ++i) {
-        const int nblock = i / NUM_THREADS;
-        const int inblock = i % NUM_THREADS;
-        if (!(dead_bit[nblock] & (1ULL << inblock))) {
-            keep_indices[num_selected++] = i;
-            auto* mask_i = &mask_host[0] + i * num_blocks;
-            for (int j = nblock; j < num_blocks; ++j) dead_bit[j] |= mask_i[j];
-            if (num_selected == max_keeps) break;
-        }
+} // namespace
+
+template <>
+void ApplyNMS<float, CUDAContext>(
+    const int num_boxes,
+    const int max_keeps,
+    const float thresh,
+    const float* boxes,
+    int64_t* keep_indices,
+    int& num_keep,
+    CUDAContext* ctx) {
+  const int num_blocks = DIV_UP(num_boxes, NUM_THREADS);
+
+  vector<uint64_t> mask_host(num_boxes * num_blocks);
+  auto* mask_dev = (uint64_t*)ctx->New(mask_host.size() * sizeof(uint64_t));
+
+  _NonMaxSuppression<<<
+      dim3(num_blocks, num_blocks),
+      NUM_THREADS,
+      0,
+      ctx->cuda_stream()>>>(num_blocks, num_boxes, thresh, boxes, mask_dev);
+
+  CUDA_CHECK(cudaMemcpyAsync(
+      mask_host.data(),
+      mask_dev,
+      mask_host.size() * sizeof(uint64_t),
+      cudaMemcpyDeviceToHost,
+      ctx->cuda_stream()));
+
+  ctx->FinishDeviceComputation();
+
+  vector<uint64_t> dead_bit(num_blocks);
+  memset(&dead_bit[0], 0, sizeof(uint64_t) * num_blocks);
+
+  int num_selected = 0;
+  for (int i = 0; i < num_boxes; ++i) {
+    const int nblock = i / NUM_THREADS;
+    const int inblock = i % NUM_THREADS;
+    if (!(dead_bit[nblock] & (1ULL << inblock))) {
+      keep_indices[num_selected++] = i;
+      auto* mask_i = &mask_host[0] + i * num_blocks;
+      for (int j = nblock; j < num_blocks; ++j)
+        dead_bit[j] |= mask_i[j];
+      if (num_selected == max_keeps) break;
    }
-    num_keep = num_selected;
+  }

-    ctx->Delete(mask_dev);
+  num_keep = num_selected;
+  ctx->Delete(mask_dev);
 }

-}  // namespace detection
+} // namespace detection

-}  // namespace utils
+} // namespace utils

-}  // namespace dragon
+} // namespace dragon

-#endif  // USE_CUDA
+#endif // USE_CUDA
--- a/csrc/cxx/utils/detection_utils.h
+++ b/csrc/cxx/utils/detection_utils.h
--- a/seetadet/algo/faster_rcnn/anchor_target.py
+++ b/seetadet/algo/faster_rcnn/anchor_target.py
@@ -52,12 +52,9 @@ class AnchorTarget(object):
        gt_boxes_wide = box_util.dismantle_boxes(gt_boxes, num_images)

        # Generate grid anchors from base
-        all_anchors = \
-            generate_grid_anchors(
-                features,
-                self.base_anchors,
-                self.strides,
-            )
+        grid_shapes = [f.shape[-2:] for f in features]
+        all_anchors = generate_grid_anchors(
+            grid_shapes, self.base_anchors, self.strides)
        num_anchors = all_anchors.shape[0]

-        # Label: ``1`` is positive, ``0`` is negative, ``-1` is don't care
+        # Label: ``1`` is positive, ``0`` is negative, ``-1`` is don't care

--- a/seetadet/algo/faster_rcnn/proposal.py
+++ b/seetadet/algo/faster_rcnn/proposal.py
@@ -58,12 +58,9 @@ class Proposal(object):

        # Get resources
        num_images = ims_info.shape[0]
-        all_anchors = \
-            generate_grid_anchors(
-                features,
-                self.base_anchors,
-                self.strides,
-            )
+        grid_shapes = [f.shape[-2:] for f in features]
+        all_anchors = generate_grid_anchors(
+            grid_shapes, self.base_anchors, self.strides)

        # Prepare for the outputs
        batch_rois = []

--- a/seetadet/algo/faster_rcnn/utils.py
+++ b/seetadet/algo/faster_rcnn/utils.py
@@ -19,40 +19,40 @@ import numpy as np
 from seetadet.core.config import cfg


-def generate_grid_anchors(features, base_anchors, strides):
+def generate_grid_anchors(grid_shapes, base_anchors, strides):
    num_strides = len(strides)
-    if len(features) != num_strides:
+    if len(grid_shapes) != num_strides:
        raise ValueError(
-            'Given %d features for %d strides.'
-            % (len(features), num_strides)
+            'Given %d grids for %d strides.'
+            % (len(grid_shapes), num_strides)
        )
    # Generate proposals from shifted anchors
    anchors_to_pack = []
-    for i in range(len(features)):
-        height, width = features[i].shape[-2:]
+    for i in range(len(grid_shapes)):
+        height, width = grid_shapes[i]
        shift_x = np.arange(0, width) * strides[i]
        shift_y = np.arange(0, height) * strides[i]
        shift_x, shift_y = np.meshgrid(shift_x, shift_y)
        shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
                            shift_x.ravel(), shift_y.ravel())).transpose()
-        # Add A anchors (1, A, 4) to
-        # cell K shifts (K, 1, 4) to get
-        # shift anchors (K, A, 4)
-        # Reshape to (K * A, 4) shifted anchors
-        A = base_anchors[i].shape[0]
-        K = shifts.shape[0]
-        anchors = (base_anchors[i].reshape((1, A, 4)) +
-                   shifts.reshape((1, K, 4)).transpose((1, 0, 2)))
+        # Add a anchors (1, a, 4) to
+        # cell k shifts (k, 1, 4) to get
+        # shift anchors (k, a, 4)
+        # Reshape to (k * a, 4) shifted anchors
+        a = base_anchors[i].shape[0]
+        k = shifts.shape[0]
+        anchors = (base_anchors[i].reshape((1, a, 4)) +
+                   shifts.reshape((1, k, 4)).transpose((1, 0, 2)))
        if num_strides > 1:
-            # Transpose from (K, A, 4) to (A, K, 4)
+            # Transpose from (k, a, 4) to (a, k, 4)
            # We will pack it with other strides to
            # match the data format of (N, C, H, W)
            anchors = anchors.transpose((1, 0, 2))
-            anchors = anchors.reshape((A * K, 4))
+            anchors = anchors.reshape((a * k, 4))
            anchors_to_pack.append(anchors)
        else:
            # Original order of Faster R-CNN
-            return anchors.reshape((K * A, 4))
+            return anchors.reshape((k * a, 4))
    return np.vstack(anchors_to_pack)



--- a/seetadet/algo/retinanet/anchor_target.py
+++ b/seetadet/algo/retinanet/anchor_target.py
@@ -46,6 +46,9 @@ class AnchorTarget(object):
                    ratios=self.ratios,
                    sizes=sizes,
                ))
+        # Store the cached grid anchors
+        self.last_grid_shapes = None
+        self.last_grid_anchors = None

    def __call__(self, features, gt_boxes):
        num_images = cfg.TRAIN.IMS_PER_BATCH
@@ -58,12 +61,17 @@ class AnchorTarget(object):
            )

        # Generate grid anchors from base
-        all_anchors = \
-            generate_grid_anchors(
-                features,
-                self.base_anchors,
-                self.strides,
-            )
+        grid_shapes = [f.shape[-2:] for f in features]
+        if grid_shapes == self.last_grid_shapes:
+            all_anchors = self.last_grid_anchors
+        else:
+            self.last_grid_shapes = grid_shapes
+            self.last_grid_anchors = all_anchors = \
+                generate_grid_anchors(
+                    grid_shapes,
+                    self.base_anchors,
+                    self.strides,
+                )
        num_anchors = all_anchors.shape[0]

-        # Label: ``1`` is positive, ``0`` is negative, ``-1` is don't care
+        # Label: ``1`` is positive, ``0`` is negative, ``-1`` is don't care

--- a/seetadet/algo/retinanet/test.py
+++ b/seetadet/algo/retinanet/test.py
@@ -15,6 +15,7 @@ from __future__ import print_function

 import types

+import dragon
 import dragon.vm.torch as torch
 import numpy as np

@@ -59,7 +60,7 @@ def ims_detect(detector, raw_images):

    # Unpack results
    results = outputs['detections']
-    detections = [[] for _ in range(len((raw_images)))]
+    detections = [[] for _ in range(len(raw_images))]

    for i in range(len(ims)):
        inds = np.where(results[:, 0].astype(np.int32) == i)[0]
@@ -126,6 +127,6 @@ def test_net(weights, num_classes, q_in, q_out, device):
            q_out.put((
                indices[i],
                dict([('im_detect', _t['im_detect'].average_time),
-                      ('misc',_t['misc'].average_time)]),
+                      ('misc', _t['misc'].average_time)]),
                dict([('boxes', boxes_this_image)]),
            ))
--- a/seetadet/algo/ssd/priorbox.py
+++ b/seetadet/algo/ssd/priorbox.py
@@ -45,14 +45,14 @@ class PriorBox(object):
                    aspect_ratios[i],
                )
            )
-        self.grid_anchors = None
+        # Store the cached grid anchors
+        self.last_grid_anchors = None

    def __call__(self, features):
-        if self.grid_anchors is not None:
-            return self.grid_anchors
-
-        self.grid_anchors = []
+        if self.last_grid_anchors is not None:
+            return self.last_grid_anchors

+        all_anchors = []
        for i in range(len(self.strides)):
            # 1. Generate base grids
            height, width = features[i].shape[-2:]
@@ -61,23 +61,23 @@ class PriorBox(object):
            shift_x, shift_y = np.meshgrid(shift_x, shift_y)

            # 2. Apply anchors on base grids
-            # Add A anchors (1, A, 4) to
-            # cell K shifts (K, 1, 4) to get
-            # shift anchors (K, A, 4)
-            # Reshape to (K * A, 4) shifted anchors
-            A = self.base_anchors[i].shape[0]
-            D = self.base_anchors[i].shape[1]
+            # Add a anchors (1, a, 4) to
+            # cell k shifts (k, 1, 4) to get
+            # shift anchors (k, a, 4)
+            # Reshape to (k * a, 4) shifted anchors
+            a = self.base_anchors[i].shape[0]
+            d = self.base_anchors[i].shape[1]
            shifts = np.vstack((
                shift_x.ravel(),
                shift_y.ravel(),
                shift_x.ravel(),
                shift_y.ravel())
            ).transpose()
-            K = shifts.shape[0]  # K = map_h * map_w
-            anchors = (self.base_anchors[i].reshape((1, A, D)) +
-                       shifts.reshape((1, K, D)).transpose((1, 0, 2)))
-            anchors = anchors.reshape((K * A, D)).astype(np.float32)
-            self.grid_anchors.append(anchors)
-        self.grid_anchors = np.concatenate(self.grid_anchors)
+            k = shifts.shape[0]  # k = map_h * map_w
+            anchors = (self.base_anchors[i].reshape((1, a, d)) +
+                       shifts.reshape((1, k, d)).transpose((1, 0, 2)))
+            anchors = anchors.reshape((k * a, d)).astype(np.float32)
+            all_anchors.append(anchors)

-        return self.grid_anchors
+        self.last_grid_anchors = np.concatenate(all_anchors)
+        return self.last_grid_anchors
--- a/seetadet/algo/ssd/test.py
+++ b/seetadet/algo/ssd/test.py
@@ -32,11 +32,9 @@ def get_images(ims):
    for im in ims:
        im_scales.append((float(out_size) / im.shape[0],
                          float(out_size) / im.shape[1]))
-        processed_ims.append(
-            cv2.resize(
+        processed_ims.append(cv2.resize(
            im, (out_size, out_size),
-            interpolation=cv2.INTER_AREA,
-        ))
+            interpolation=cv2.INTER_AREA))
    if ims[0].dtype == 'uint16':
        ims_blob = np.array(processed_ims, dtype='float32') / 256.
    else:

--- a/seetadet/algo/ssd/transforms.py
+++ b/seetadet/algo/ssd/transforms.py
@@ -49,12 +49,13 @@ class Distort(object):
        ]

    def apply(self, img, boxes=None):
+        self._prob = 0.5 if cfg.TRAIN.USE_COLOR_JITTER else 0
        if self._prob > 0:
            img = PIL.Image.fromarray(img)
            for transform_fn, prob in self._transforms:
                if npr.uniform() < prob:
                    img = transform_fn(img)
                    img = img.enhance(1. + npr.uniform(-.4, .4))
            return np.array(img), boxes
        return img, boxes


--- a/seetadet/algo/ssd/transforms_test.py
+++ b/seetadet/algo/ssd/transforms_test.py
@@ -27,8 +27,9 @@ if __name__ == '__main__':
    np.random.seed(3)
    cfg.TRAIN.SCALES = [300]
    cfg.TRAIN.RANDOM_SCALES = [0.25, 1.00]
+    cfg.TRAIN.USE_COLOR_JITTER = True

-    augmentor = transforms.Compose(
+    transformer = transforms.Compose(
        transforms.Distort(),
        transforms.Expand(),
        transforms.Sample(),
@@ -38,12 +39,12 @@ if __name__ == '__main__':
    while True:
        img = cv2.imread('cat.jpg')
        boxes = np.array([[0.33, 0.04, 0.71, 0.98]], dtype=np.float32)
-        img, boxes = augmentor(img, boxes)
+        img, boxes = transformer(img, boxes)
        for box in boxes:
            x1 = int(box[0] * img.shape[1])
            y1 = int(box[1] * img.shape[0])
            x2 = int(box[2] * img.shape[1])
            y2 = int(box[3] * img.shape[0])
            cv2.rectangle(img, (x1, y1), (x2, y2), (188, 119, 64), 2)
-        cv2.imshow('Sample', img)
+        cv2.imshow('Transforms - Preview', img)
        cv2.waitKey(0)
--- a/seetadet/dali/ssd_pipeline.py
+++ b/seetadet/dali/ssd_pipeline.py
@@ -70,14 +70,15 @@ class Pipeline(dali.Pipeline):
        # Decode image
        image = self.decode(inputs['image'])

-        # Augment the color space
-        image = self.hsv(
-            self.brightness_contrast(
-                image,
-                brightness=self.twist_rng(),
-                contrast=self.twist_rng(),
-            ), saturation=self.twist_rng()
-        )
+        # Augment the color space if necessary
+        if cfg.TRAIN.USE_COLOR_JITTER:
+            image = self.hsv(
+                self.brightness_contrast(
+                    image,
+                    brightness=self.twist_rng(),
+                    contrast=self.twist_rng(),
+                ), saturation=self.twist_rng()
+            )

        # Expand randomly to get smaller objects
        pr = self.paste_ratio() * self.flip_rng() + 1.

--- a/seetadet/datasets/factory.py
+++ b/seetadet/datasets/factory.py
@@ -18,7 +18,7 @@ from __future__ import division
 from __future__ import print_function

 import os
-from seetadet.datasets import kpl_record
+from seetadet.datasets import kpl_dataset


 def get_dataset(name):
@@ -42,5 +42,5 @@ def list_dataset():

 _GLOBAL_REGISTERED_DATASET = {
    'default': lambda source:
-        kpl_record.KPLRecordDataset(source),
+        kpl_dataset.KPLRecordDataset(source),
 }
--- a/seetadet/datasets/kpl_record.py
+++ b/seetadet/datasets/kpl_record.py
--- a/seetadet/modeling/airnet.py
+++ b/seetadet/modeling/airnet.py
@@ -149,8 +149,10 @@ class AirNet(nn.Module):

        x = self.layer1(x)
        outputs = [None, None, self.layer2(x)]
-        if hasattr(self, 'layer3'): outputs += [self.layer3(outputs[-1])]
-        if hasattr(self, 'layer4'): outputs += [self.layer4(outputs[-1])]
+        if hasattr(self, 'layer3'):
+            outputs += [self.layer3(outputs[-1])]
+        if hasattr(self, 'layer4'):
+            outputs += [self.layer4(outputs[-1])]

        return outputs


--- a/seetadet/modeling/detector.py
+++ b/seetadet/modeling/detector.py
@@ -39,16 +39,17 @@ class Detector(nn.Module):
        backbone = cfg.MODEL.BACKBONE.lower().split('.')
        body, modules = backbone[0], backbone[1:]

-        # + DataLoader
+        # DataLoader
+        self.data_loader = None
        self.data_loader_cls = importlib.import_module(
            'seetadet.algo.{}'.format(model)).DataLoader
        self.bootstrap = vision.Bootstrap()

-        # + FeatureExtractor
+        # FeatureExtractor
        self.body = backbones.get(body)()
        feature_dims = self.body.feature_dims

-        # + FeatureEnhancer
+        # FeatureEnhancer
        if 'fpn' in modules:
            self.fpn = models.FPN(feature_dims)
            feature_dims = self.fpn.feature_dims
@@ -57,7 +58,7 @@ class Detector(nn.Module):
        else:
            feature_dims = [feature_dims[-1]]

-        # + Detection Modules
+        # Detection Modules
        if 'rcnn' in model:
            self.rpn = models.RPN(feature_dims[0])
            if 'faster' in model:
@@ -106,7 +107,7 @@ class Detector(nn.Module):
        if inputs is None:
            # 1) Training: <= DataLayer
            # 2) Inference: <= Given
-            if not hasattr(self, 'data_loader'):
+            if self.data_loader is None:
                self.data_loader = self.data_loader_cls()
            inputs = self.data_loader()

@@ -171,29 +172,34 @@ class Detector(nn.Module):
        #  Merge Affine into Convolution  #
        ###################################
        last_module = None
-        for e in self.modules():
-            if isinstance(e, nn.Affine) and \
+        for module in self.modules():
+            if isinstance(module, nn.Affine) and \
                    isinstance(last_module, nn.Conv2d):
                if last_module.bias is None:
                    delattr(last_module, 'bias')
-                    e.forward = lambda x: x
-                    last_module.bias = e.bias
-                    last_module.weight.data.mul_(e.weight.data)
-            last_module = e
+                    module.forward = lambda x: x
+                    last_module.bias = module.bias
+                    weight = module.weight.data.view(
+                        0, *([1] * (last_module.weight.ndimension() - 1)))
+                    last_module.weight.data.mul_(weight)
+            last_module = module

        ######################################
        #  Merge BatchNorm into Convolution  #
        ######################################
        last_module = None
-        for e in self.modules():
-            if isinstance(e, nn.BatchNorm2d) and \
+        for module in self.modules():
+            if isinstance(module, nn.BatchNorm2d) and \
                    isinstance(last_module, nn.Conv2d):
                if last_module.bias is None:
                    delattr(last_module, 'bias')
-                    e.forward = lambda x: x
-                    term = torch.sqrt(e.running_var.data + e.eps)
-                    term = e.weight.data / term
-                    last_module.bias = e.bias.data - term * e.running_mean.data
+                    module.forward = lambda x: x
+                    term = torch.sqrt(module.running_var.data + module.eps)
+                    term = module.weight.data / term
+                    last_module.bias = \
+                        module.bias.data - \
+                        term * module.running_mean.data
+                    term = term.view(0, *([1] * (last_module.weight.ndimension() - 1)))
                    if last_module.weight.dtype == 'float16':
                        last_module.bias.half_()
                        weight = last_module.weight.data.float()
@@ -201,7 +207,7 @@ class Detector(nn.Module):
                        last_module.weight.copy_(weight)
                    else:
                        last_module.weight.data.mul_(term)
-            last_module = e
+            last_module = module


 def new_detector(device, weights=None, training=False):

--- a/seetadet/modeling/fpn.py
+++ b/seetadet/modeling/fpn.py
@@ -31,7 +31,8 @@ class FPN(nn.Module):
        dim = cfg.FPN.DIM
        self.C = nn.ModuleList()
        self.P = nn.ModuleList()
-        for lvl in range(cfg.FPN.RPN_MIN_LEVEL, HIGHEST_BACKBONE_LVL + 1):
+        self.highest_backbone_lvl = min(cfg.FPN.RPN_MAX_LEVEL, HIGHEST_BACKBONE_LVL)
+        for lvl in range(cfg.FPN.RPN_MIN_LEVEL, self.highest_backbone_lvl + 1):
            self.C.append(nn.Conv1x1(feature_dims[lvl - 1], dim, bias=True))
            self.P.append(nn.Conv3x3(dim, dim, bias=True))
        if 'rcnn' in cfg.MODEL.TYPE:
@@ -40,8 +41,8 @@ class FPN(nn.Module):
        else:
            self.apply_func = self.apply_on_generic
            self.relu = nn.ReLU(inplace=False)
-            for lvl in range(HIGHEST_BACKBONE_LVL + 1, cfg.FPN.RPN_MAX_LEVEL + 1):
-                dim_in = feature_dims[-1] if lvl == HIGHEST_BACKBONE_LVL + 1 else dim
+            for lvl in range(self.highest_backbone_lvl + 1, cfg.FPN.RPN_MAX_LEVEL + 1):
+                dim_in = feature_dims[-1] if lvl == self.highest_backbone_lvl + 1 else dim
                self.P.append(nn.Conv3x3(dim_in, dim, stride=2, bias=True))
        self.feature_dims = [dim]
        self.coarsest_stride = cfg.MODEL.COARSEST_STRIDE
@@ -56,12 +57,12 @@ class FPN(nn.Module):
    def apply_on_rcnn(self, features):
        fpn_input = self.C[-1](features[-1])
        min_lvl, max_lvl = cfg.FPN.RPN_MIN_LEVEL, cfg.FPN.RPN_MAX_LEVEL
-        outputs = [self.P[HIGHEST_BACKBONE_LVL - min_lvl](fpn_input)]
+        outputs = [self.P[self.highest_backbone_lvl - min_lvl](fpn_input)]
        # Apply max pool for higher features
-        for i in range(HIGHEST_BACKBONE_LVL + 1, max_lvl + 1):
+        for i in range(self.highest_backbone_lvl + 1, max_lvl + 1):
            outputs.append(self.maxpool(outputs[-1]))
        # Build pyramids between [MIN_LEVEL, HIGHEST_LEVEL]
-        for i in range(HIGHEST_BACKBONE_LVL - 1, min_lvl - 1, -1):
+        for i in range(self.highest_backbone_lvl - 1, min_lvl - 1, -1):
            lateral_output = self.C[i - min_lvl](features[i - 1])
            if self.coarsest_stride > 0:
                upscale_output = nn_funcs.upsample(
@@ -76,15 +77,15 @@ class FPN(nn.Module):
    def apply_on_generic(self, features):
        fpn_input = self.C[-1](features[-1])
        min_lvl, max_lvl = cfg.FPN.RPN_MIN_LEVEL, cfg.FPN.RPN_MAX_LEVEL
-        outputs = [self.P[HIGHEST_BACKBONE_LVL - min_lvl](fpn_input)]
+        outputs = [self.P[self.highest_backbone_lvl - min_lvl](fpn_input)]
        # Add extra convolutions for higher features
        extra_input = features[-1]
-        for i in range(HIGHEST_BACKBONE_LVL + 1, max_lvl + 1):
+        for i in range(self.highest_backbone_lvl + 1, max_lvl + 1):
            outputs.append(self.P[i - min_lvl](extra_input))
            if i != max_lvl:
                extra_input = self.relu(outputs[-1])
        # Build pyramids between [MIN_LEVEL, HIGHEST_LEVEL]
-        for i in range(HIGHEST_BACKBONE_LVL - 1, min_lvl - 1, -1):
+        for i in range(self.highest_backbone_lvl - 1, min_lvl - 1, -1):
            lateral_output = self.C[i - min_lvl](features[i - 1])
            if self.coarsest_stride > 0:
                upscale_output = nn_funcs.upsample(

--- a/seetadet/modeling/mobilenet.py
+++ b/seetadet/modeling/mobilenet.py
@@ -161,7 +161,7 @@ class NASMobileNet(nn.Module):

    def reset_parameters(self):
        for m in self.modules():
-            if nn.is_conv2d(m):
+            if isinstance(m, nn.Conv2d):
                init.kaiming_normal(m.weight, 'fan_out')
                if m.bias is not None:
                    init.constant(m.bias, 0)
@@ -173,7 +173,7 @@ class NASMobileNet(nn.Module):

        # Stop the gradients if necessary
        def freeze_func(m):
-            if nn.is_conv2d(m):
+            if isinstance(m, nn.Conv2d):
                m.weight.requires_grad = False
                m._buffers['weight'] = m.weight
                del m._parameters['weight']

--- a/seetadet/modeling/resnet.py
+++ b/seetadet/modeling/resnet.py
@@ -17,8 +17,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

-import dragon.vm.torch as torch
-
 from seetadet.core.config import cfg
 from seetadet.core.registry import backbones
 from seetadet.modules import nn
@@ -37,11 +35,12 @@ class BasicBlock(nn.Module):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv3x3(dim_in, dim_out, stride)
        self.bn1 = nn.FrozenAffine(dim_out)
-        self.relu = torch.nn.ReLU(inplace=True)
+        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv3x3(dim_out, dim_out)
        self.bn2 = nn.FrozenAffine(dim_out)
        self.downsample = downsample
-        self.dropblock = dropblock
+        self.dropblock1 = nn.DropBlock2d(**dropblock) if dropblock else None
+        self.dropblock2 = nn.DropBlock2d(**dropblock) if dropblock else None

    def forward(self, x):
        residual = x
@@ -50,14 +49,14 @@ class BasicBlock(nn.Module):
        out = self.bn1(out)
        out = self.relu(out)

-        if self.dropblock is not None:
-            out = self.dropblock(out)
+        if self.dropblock1 is not None:
+            out = self.dropblock1(out)

        out = self.conv2(out)
        out = self.bn2(out)

-        if self.dropblock is not None:
-            residual = self.dropblock(residual)
+        if self.dropblock2 is not None:
+            residual = self.dropblock2(residual)

        if self.downsample is not None:
            residual = self.downsample(residual)
@@ -67,7 +66,7 @@ class BasicBlock(nn.Module):
        return out


-class Bottleneck(torch.nn.Module):
+class Bottleneck(nn.Module):
    # 1x64d => 0.25 (ResNet)
    # 32x8d, 64x4d => 1.0 (ResNeXt)
    contraction = cfg.RESNET.NUM_GROUPS \
@@ -86,12 +85,13 @@ class Bottleneck(torch.nn.Module):
        self.conv1 = nn.Conv1x1(dim_in, dim)
        self.bn1 = nn.FrozenAffine(dim)
        self.conv2 = nn.Conv3x3(dim, dim, stride=stride)
+        self.drop2 = nn.DropBlock2d(**dropblock) if dropblock else None
        self.bn2 = nn.FrozenAffine(dim)
        self.conv3 = nn.Conv1x1(dim, dim_out)
+        self.drop3 = nn.DropBlock2d(**dropblock) if dropblock else None
        self.bn3 = nn.FrozenAffine(dim_out)
-        self.relu = torch.nn.ReLU(inplace=True)
+        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
-        self.dropblock = dropblock

    def forward(self, x):
        residual = x
@@ -101,32 +101,30 @@ class Bottleneck(torch.nn.Module):
        out = self.relu(out)

        out = self.conv2(out)
+        if self.drop2 is not None:
+            out = self.drop2(out)
        out = self.bn2(out)
        out = self.relu(out)

-        if self.dropblock is not None:
-            out = self.dropblock(out)
-
        out = self.conv3(out)
        out = self.bn3(out)

-        if self.dropblock is not None:
-            residual = self.dropblock(residual)
-
        if self.downsample is not None:
            residual = self.downsample(residual)

        out += residual
+        if self.drop3 is not None:
+            out = self.drop3(out)
        out = self.relu(out)
        return out


-class ResNet(torch.nn.Module):
+class ResNet(nn.Module):
    def __init__(self, block, layers, filters):
        super(ResNet, self).__init__()
        self.dim_in, filters = filters[0], filters[1:]
        self.feature_dims = [self.dim_in] + filters
-        self.conv1 = torch.nn.Conv2d(
+        self.conv1 = nn.Conv2d(
            3, 64,
            kernel_size=7,
            stride=2,
@@ -134,29 +132,31 @@ class ResNet(torch.nn.Module):
            bias=False,
        )
        self.bn1 = nn.FrozenAffine(self.dim_in)
-        self.relu = torch.nn.ReLU(inplace=True)
-        self.maxpool = torch.nn.MaxPool2d(
+        self.relu = nn.ReLU(inplace=True)
+        self.maxpool = nn.MaxPool2d(
            kernel_size=3,
            stride=2,
            padding=0,
            ceil_mode=True,
        )
-        self.drop3 = torch.nn.DropBlock2d(
-            kp=0.9,
-            block_size=7,
-            alpha=0.25,
-            decrement=cfg.DROPBLOCK.DECREMENT
-        ) if cfg.DROPBLOCK.DROP_ON else None
-        self.drop4 = torch.nn.DropBlock2d(
-            kp=0.9,
-            block_size=7,
-            alpha=1.00,
-            decrement=cfg.DROPBLOCK.DECREMENT
-        ) if cfg.DROPBLOCK.DROP_ON else None
+        drop3 = {
+            'kp': 0.9,
+            'block_size': 7,
+            'alpha': 1.00,
+            'decrement': cfg.DROPBLOCK.DECREMENT,
+            'inplace': True,
+        } if cfg.DROPBLOCK.DROP_ON else None
+        drop4 = {
+            'kp': 0.9,
+            'block_size': 7,
+            'alpha': 1.00,
+            'decrement': cfg.DROPBLOCK.DECREMENT,
+            'inplace': True,
+        } if cfg.DROPBLOCK.DROP_ON else None
        self.layer1 = self.make_blocks(block, filters[0], layers[0])
        self.layer2 = self.make_blocks(block, filters[1], layers[1], 2)
-        self.layer3 = self.make_blocks(block, filters[2], layers[2], 2, self.drop3)
-        self.layer4 = self.make_blocks(block, filters[3], layers[3], 2, self.drop4)
+        self.layer3 = self.make_blocks(block, filters[2], layers[2], 2, drop3)
+        self.layer4 = self.make_blocks(block, filters[3], layers[3], 2, drop4)
        self.reset_parameters()

    def reset_parameters(self):
@@ -166,7 +166,7 @@ class ResNet(torch.nn.Module):

        # Stop the gradients if necessary
        def freeze_func(m):
-            if isinstance(m, torch.nn.Conv2d):
+            if isinstance(m, nn.Conv2d):
                m.weight.requires_grad = False
                m._buffers['weight'] = m.weight
                del m._parameters['weight']

--- a/seetadet/modeling/ssd.py
+++ b/seetadet/modeling/ssd.py
@@ -29,7 +29,6 @@ class SSD(nn.Module):
        ########################################
        #             SSD outputs              #
        ########################################
-
        self.cls_conv = torch.nn.ModuleList(
            nn.Conv3x3(feature_dims[0], feature_dims[0], bias=True)
            for _ in range(cfg.SSD.NUM_CONVS)

--- a/seetadet/modules/det.py
+++ b/seetadet/modules/det.py
@@ -36,7 +36,6 @@ class _NonMaxSuppression(Function):
        return self.dispatch([dets], [self.alloc()])


-
 class _RetinaNetDecoder(Function):
    """Decode predictions from RetinaNet."""


--- a/seetadet/modules/init.py
+++ b/seetadet/modules/init.py
@@ -33,6 +33,7 @@ def kaiming_normal(weight, mode='fan_in'):
        nonlinearity='relu',
    )

+
 # Aliases
 constant = nn.init.constant_
 normal = nn.init.normal_
--- a/seetadet/modules/nn.py
+++ b/seetadet/modules/nn.py
@@ -185,6 +185,7 @@ class SigmoidFocalLoss(object):
        return nn.SigmoidFocalLoss(
            alpha=cfg.MODEL.FOCAL_LOSS_ALPHA,
            gamma=cfg.MODEL.FOCAL_LOSS_GAMMA,
+            negative_index=0,  # Background index
        )


@@ -211,6 +212,7 @@ BCEWithLogitsLoss = nn.BCEWithLogitsLoss
 Conv2d = nn.Conv2d
 ConvTranspose2d = nn.ConvTranspose2d
 DepthwiseConv2d = nn.DepthwiseConv2d
+DropBlock2d = nn.DropBlock2d
 Linear = nn.Linear
 MaxPool2d = nn.MaxPool2d
 Module = nn.Module

--- a/seetadet/modules/vision.py
+++ b/seetadet/modules/vision.py
@@ -15,7 +15,7 @@ from __future__ import print_function

 import functools

-import dragon.vm.torch as torch
+from dragon.vm import torch

 from seetadet.core.config import cfg

@@ -41,7 +41,9 @@ class Bootstrap(torch.nn.Module):

    def __init__(self):
        super(Bootstrap, self).__init__()
-        self.normalize_func = functools.partial(
+        self._device = torch.device('cpu')
+        self._dummy_buffer = torch.ones(1)
+        self._normalize_func = functools.partial(
            torch.channel_normalize,
            mean=cfg.PIXEL_MEANS,
            std=[1., 1., 1.],
@@ -49,10 +51,9 @@ class Bootstrap(torch.nn.Module):
            dims=(0, 3, 1, 2),
            dtype=cfg.MODEL.PRECISION.lower(),
        )
-        self.dummy_buffer = torch.ones(1)

    def _apply(self, fn):
-        fn(self.dummy_buffer)
+        fn(self._dummy_buffer)

    def cpu(self):
        self._device = torch.device('cpu')
@@ -61,12 +62,11 @@ class Bootstrap(torch.nn.Module):
        self._device = torch.device('cuda', device)

    def device(self):
-        """Return the device of this module."""
-        return self.dummy_buffer.device
+        return self._dummy_buffer.device

    def forward(self, input):
        if isinstance(input, torch.Tensor):
-            if input.size(1) <= 3:
+            if input.shape[1] <= 3:
                return input
        cur_device = self.device()
        if input._device != cur_device:
@@ -74,4 +74,4 @@ class Bootstrap(torch.nn.Module):
                input = input.cpu()
            else:
                input = input.cuda(cur_device.index)
-        return self.normalize_func(input)
+        return self._normalize_func(input)
--- a/seetadet/solver/sgd.py
+++ b/seetadet/solver/sgd.py
@@ -32,8 +32,8 @@ class SGDSolver(object):
            lr=cfg.SOLVER.BASE_LR,
            momentum=cfg.SOLVER.MOMENTUM,
            weight_decay=cfg.SOLVER.WEIGHT_DECAY,
-            clip_gradient=float(cfg.SOLVER.CLIP_NORM),
-            scale_gradient=1. / cfg.SOLVER.LOSS_SCALING,
+            clip_norm=float(cfg.SOLVER.CLIP_NORM),
+            scale=1. / cfg.SOLVER.LOSS_SCALING,
        )
        self.lr_scheduler = lr_scheduler.get_scheduler()


--- a/seetadet/utils/observer.py
+++ b/seetadet/utils/observer.py
+# ------------------------------------------------------------
+# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
+#
+# Licensed under the BSD 2-Clause License.
+# You should have received a copy of the BSD 2-Clause License
+# along with the software. If not, See,
+#
+#      <https://opensource.org/licenses/BSD-2-Clause>
+#
+# ------------------------------------------------------------
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import operator
+
+from dragon.vm import torch
+from seetadet.modules import nn
+
+
+def dense_conv_flops(m, inputs, output):
+    """Hook to compute flops for a dense convolution.
+
+    Writes two attributes on ``m``: ``__params__`` (kernel elements
+    times input channels, plus one if a bias is present, times output
+    channels) and ``__flops__`` (``__params__`` times the product of
+    ``output.shape[2:]``).
+
+    Parameters
+    ----------
+    m : object
+        The module; ``kernel_size`` and ``bias`` are read from it.
+    inputs : Sequence
+        The forward inputs; only ``inputs[0].shape[1]`` is read.
+    output : object
+        The forward output; only ``output.shape`` is read.
+
+    """
+    # NOTE(review): assumes a channels-first layout (dim 1 is channels,
+    # dims 2+ are spatial) and ignores any channel grouping -- confirm.
+    k_dim = functools.reduce(operator.mul, m.kernel_size)
+    out_dim = functools.reduce(operator.mul, output.shape[2:])
+    in_c, out_c = inputs[0].shape[1], output.shape[1]
+    # Check for presence instead of truthiness: evaluating a tensor-like
+    # bias as a boolean may raise or depend on the stored values.
+    bias = m.bias
+    has_bias = not (bias is None or bias is False)
+    m.__params__ = (k_dim * in_c + (1 if has_bias else 0)) * out_c
+    m.__flops__ = m.__params__ * out_dim
+
+
+def depthwise_conv_flops(m, inputs, output):
+    """Hook to compute flops for a depthwise convolution.
+
+    Writes two attributes on ``m``: ``__params__`` (kernel elements,
+    plus one if a bias is present, times output channels) and
+    ``__flops__`` (``__params__`` times the product of
+    ``output.shape[2:]``).
+
+    Parameters
+    ----------
+    m : object
+        The module; ``kernel_size`` and ``bias`` are read from it.
+    inputs : Sequence
+        The forward inputs; unused, kept for the hook signature.
+    output : object
+        The forward output; only ``output.shape`` is read.
+
+    """
+    # NOTE(review): assumes a channels-first layout (dim 1 is channels,
+    # dims 2+ are spatial) -- confirm against the callers.
+    k_dim = functools.reduce(operator.mul, m.kernel_size)
+    out_dim = functools.reduce(operator.mul, output.shape[2:])
+    out_c = output.shape[1]
+    # Check for presence instead of truthiness: evaluating a tensor-like
+    # bias as a boolean may raise or depend on the stored values.
+    bias = m.bias
+    has_bias = not (bias is None or bias is False)
+    m.__params__ = (k_dim + (1 if has_bias else 0)) * out_c
+    m.__flops__ = m.__params__ * out_dim
+
+
+def register_flops(module):
+    """Attach the flops hooks to the convolutions under ``module``.
+
+    Does nothing when ``module`` already has a ``__flops__`` attribute;
+    otherwise that attribute is initialized to zero and a forward hook
+    is registered on each convolution yielded by ``module.modules()``.
+
+    """
+    if hasattr(module, '__flops__'):
+        return
+    module.__flops__ = 0.
+    for layer in module.modules():
+        # The depthwise check is kept ahead of the dense one.
+        if isinstance(layer, nn.DepthwiseConv2d):
+            hook = depthwise_conv_flops
+        elif isinstance(layer, nn.Conv2d):
+            hook = dense_conv_flops
+        else:
+            continue
+        layer.register_forward_hook(hook)
+
+
+def collect_flops(module, normalizer=1e6):
+    """Sum and reset the flops recorded under ``module``.
+
+    Every item of ``module.modules()`` carrying a ``__flops__``
+    attribute contributes its value and is then reset to zero.
+    The sum divided by ``normalizer`` is returned.
+
+    """
+    accumulated = 0.
+    counted = (
+        sub for sub in module.modules()
+        if hasattr(sub, '__flops__')
+    )
+    for sub in counted:
+        accumulated = accumulated + sub.__flops__
+        sub.__flops__ = 0.
+    return accumulated / normalizer
+
+
+def benchmark_flops(module, normalizer=1e6):
+    """Return the flops by running benchmark once.
+
+    Registers the flops hooks, clears previously recorded counts,
+    calls ``module()`` without arguments under ``torch.no_grad()``
+    and returns the collected flops divided by ``normalizer``.
+    A module found in training mode is switched to evaluation for
+    the forward and switched back afterwards.
+
+    """
+    register_flops(module)
+    # Drop the counts left by earlier forwards; the result is unused.
+    collect_flops(module)
+    original_training = module.training
+    if original_training:
+        module.eval()
+    try:
+        with torch.no_grad():
+            module()
+    finally:
+        # Restore the training mode even if the forward raises.
+        if original_training:
+            module.train()
+    return collect_flops(module, normalizer)