Commit 8558d3df by Ting PAN

Adapt to the latest dragon preview version

Summary:
This commit updates the repository to match dragon.0.3.0.dev20200707.
1 parent 4bcab266
Showing with 1252 additions and 1064 deletions
------------------------------------------------------------------------
The list of most significant changes made over time in SeetaDet.
SeetaDet 0.4.2 (20200707)
Dragon Minimum Required (Version 0.3.0.dev20200707)
Changes:
- Adapt to the latest dragon preview version.
Preview Features:
- None
Bugs fixed:
- None
------------------------------------------------------------------------
SeetaDet 0.4.1 (20200421)
Dragon Minimum Required (Version 0.3.0.dev20200421)
......
...@@ -14,7 +14,7 @@ The torch-style codes help us to simplify the hierarchical pipeline of modern de
## Requirements
-seeta-dragon >= 0.3.0.dev20200421
+seeta-dragon >= 0.3.0.dev20200707
## Installation
......
...@@ -32,16 +32,17 @@ FRCNN:
TRAIN:
  WEIGHTS: '/model/R-101.Affine.pth'
  DATASET: '/data/coco_2014_trainval35k'
-  USE_DIFF: False # Do not use crowd objects
  IMS_PER_BATCH: 2
  BATCH_SIZE: 512
  SCALES: [800]
  MAX_SIZE: 1333
+  USE_DIFF: False # Do not use crowd objects
TEST:
  DATASET: '/data/coco_2014_minival'
  JSON_FILE: '/data/instances_minival2014.json'
  PROTOCOL: 'coco'
-  RPN_POST_NMS_TOP_N: 1000
  SCALES: [800]
  MAX_SIZE: 1333
  NMS: 0.5
+  RPN_POST_NMS_TOP_N: 1000
...@@ -32,16 +32,16 @@ FRCNN:
TRAIN:
  WEIGHTS: '/model/R-101.Affine.pth'
  DATASET: '/data/coco_2014_trainval35k'
-  USE_DIFF: False # Do not use crowd objects
  IMS_PER_BATCH: 2
  BATCH_SIZE: 512
  SCALES: [800]
  MAX_SIZE: 1333
+  USE_DIFF: False # Do not use crowd objects
TEST:
  DATASET: '/data/coco_2014_minival'
  JSON_FILE: '/data/instances_minival2014.json'
  PROTOCOL: 'coco'
-  RPN_POST_NMS_TOP_N: 1000
  SCALES: [800]
  MAX_SIZE: 1333
  NMS: 0.5
+  RPN_POST_NMS_TOP_N: 1000
...@@ -30,7 +30,7 @@ TRAIN:
TEST:
  DATASET: '/data/voc_2007_test'
  PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
-  RPN_POST_NMS_TOP_N: 1000
  SCALES: [600]
  MAX_SIZE: 1000
  NMS: 0.45
+  RPN_POST_NMS_TOP_N: 1000
\ No newline at end of file
...@@ -29,16 +29,16 @@ FRCNN:
TRAIN:
  WEIGHTS: '/model/VGG16.RCNN.pth'
  DATASET: '/data/voc_0712_trainval'
-  RPN_MIN_SIZE: 16
  IMS_PER_BATCH: 2
  BATCH_SIZE: 128
  SCALES: [600]
  MAX_SIZE: 1000
+  RPN_MIN_SIZE: 16
TEST:
  DATASET: '/data/voc_2007_test'
  PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
-  RPN_MIN_SIZE: 16
-  RPN_POST_NMS_TOP_N: 300
  SCALES: [600]
  MAX_SIZE: 1000
+  RPN_MIN_SIZE: 16
  NMS: 0.45
+  RPN_POST_NMS_TOP_N: 300
\ No newline at end of file
...@@ -32,11 +32,11 @@ FPN:
TRAIN:
  WEIGHTS: '/model/R-50.Affine.pth'
  DATASET: '/data/coco_2014_trainval35k'
-  USE_DIFF: False # Do not use crowd objects
-  USE_COLOR_JITTER: True
  IMS_PER_BATCH: 16
  SCALES: [416]
  RANDOM_SCALES: [0.25, 1.0]
+  USE_DIFF: False # Do not use crowd objects
+  USE_COLOR_JITTER: False
TEST:
  DATASET: '/data/coco_2014_minival'
  JSON_FILE: '/data/instances_minival2014.json'
......
...@@ -23,10 +23,10 @@ FPN:
TRAIN:
  WEIGHTS: '/model/AirNet.Affine.pth'
  DATASET: '/data/voc_0712_trainval'
-  USE_COLOR_JITTER: True
  IMS_PER_BATCH: 32
  SCALES: [320]
  RANDOM_SCALES: [0.25, 1.0]
+  USE_COLOR_JITTER: True
TEST:
  DATASET: '/data/voc_2007_test'
  PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
......
...@@ -24,10 +24,10 @@ FPN:
TRAIN:
  WEIGHTS: '/model/R-50.Affine.pth'
  DATASET: '/data/voc_0712_trainval'
-  USE_COLOR_JITTER: True
  IMS_PER_BATCH: 32
  SCALES: [320]
  RANDOM_SCALES: [0.25, 2.0]
+  USE_COLOR_JITTER: True
TEST:
  DATASET: '/data/voc_2007_test'
  PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
......
...@@ -38,6 +38,7 @@ TRAIN:
  IMS_PER_BATCH: 32
  SCALES: [300]
  RANDOM_SCALES: [0.25, 1.00]
+  USE_COLOR_JITTER: True
TEST:
  DATASET: '/data/voc_2007_test'
  PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
......
...@@ -3,7 +3,7 @@ VIS: False
ENABLE_TENSOR_BOARD: False
MODEL:
  TYPE: ssd
-  BACKBONE: airnet5b.mbox
+  BACKBONE: airnet.fpn
  CLASSES: ['__background__',
            'aeroplane', 'bicycle', 'bird', 'boat',
            'bottle', 'bus', 'car', 'cat', 'chair',
...@@ -17,19 +17,30 @@ SOLVER:
  MAX_STEPS: 120000
  SNAPSHOT_EVERY: 5000
  SNAPSHOT_PREFIX: voc_ssd_320
+FPN:
+  RPN_MIN_LEVEL: 3
+  RPN_MAX_LEVEL: 8
SSD:
  NUM_CONVS: 2
  MULTIBOX:
-    STRIDES: [8, 16, 32]
-    MIN_SIZES: [30, 90, 150]
-    MAX_SIZES: [90, 150, 210]
-    ASPECT_RATIOS: [[1, 2, 0.5], [1, 2, 0.5], [1, 2, 0.5]]
+    STRIDES: [8, 16, 32, 64, 100, 300]
+    MIN_SIZES: [30, 60, 110, 162, 213, 264]
+    MAX_SIZES: [60, 110, 162, 213, 264, 315]
+    ASPECT_RATIOS: [
+      [1, 2, 0.5],
+      [1, 2, 0.5, 3, 0.33],
+      [1, 2, 0.5, 3, 0.33],
+      [1, 2, 0.5, 3, 0.33],
+      [1, 2, 0.5],
+      [1, 2, 0.5],
+    ]
TRAIN:
  WEIGHTS: '/model/AirNet.Affine.pth'
  DATASET: '/data/voc_0712_trainval'
+  IMS_PER_BATCH: 32
  SCALES: [320]
  RANDOM_SCALES: [0.25, 1.00]
-  IMS_PER_BATCH: 32
+  USE_COLOR_JITTER: True
TEST:
  DATASET: '/data/voc_2007_test'
  PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
......
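For orientation (not part of this commit): the MULTIBOX block above pairs each stride with a MIN_SIZES/MAX_SIZES/ASPECT_RATIOS entry, in the usual SSD convention of a min-size square, an extra square of side sqrt(min*max), and ratio-scaled variants per level. The Python sketch below only illustrates that convention with the values from this config; the function name and the exact rule SeetaDet applies are assumptions, not taken from its code.

# Illustrative sketch (not SeetaDet code): per-level SSD anchor sizes from the
# MULTIBOX config above, assuming the common SSD convention.
import math

def multibox_sizes(min_sizes, max_sizes, aspect_ratios):
    levels = []
    for s_min, s_max, ratios in zip(min_sizes, max_sizes, aspect_ratios):
        boxes = [(s_min, s_min), (math.sqrt(s_min * s_max),) * 2]
        for r in ratios:
            if r == 1:
                continue
            boxes.append((s_min * math.sqrt(r), s_min / math.sqrt(r)))
        levels.append(boxes)  # (width, height) pairs for this stride
    return levels

sizes = multibox_sizes(
    [30, 60, 110, 162, 213, 264],
    [60, 110, 162, 213, 264, 315],
    [[1, 2, 0.5], [1, 2, 0.5, 3, 0.33], [1, 2, 0.5, 3, 0.33],
     [1, 2, 0.5, 3, 0.33], [1, 2, 0.5], [1, 2, 0.5]])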
...@@ -37,9 +37,10 @@ SSD:
TRAIN:
  WEIGHTS: '/model/R-50.Affine.pth'
  DATASET: '/data/voc_0712_trainval'
+  IMS_PER_BATCH: 32
  SCALES: [320]
  RANDOM_SCALES: [0.25, 1.00]
-  IMS_PER_BATCH: 32
+  USE_COLOR_JITTER: True
TEST:
  DATASET: '/data/voc_2007_test'
  PROTOCOL: 'voc2007' # 'voc2007', 'voc2010', 'coco'
......
---
AccessModifierOffset: -1
AlignAfterOpenBracket: AlwaysBreak
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlinesLeft: true
AlignOperands: false
AlignTrailingComments: false
AllowAllParametersOfDeclarationOnNextLine: false
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: Empty
AllowShortIfStatementsOnASingleLine: true
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
BraceWrapping:
AfterClass: false
AfterControlStatement: false
AfterEnum: false
AfterFunction: false
AfterNamespace: false
AfterObjCDeclaration: false
AfterStruct: false
AfterUnion: false
BeforeCatch: false
BeforeElse: false
IndentBraces: false
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Attach
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: false
ColumnLimit: 80
CommentPragmas: '^ IWYU pragma:'
ConstructorInitializerAllOnOneLineOrOnePerLine: true
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DerivePointerAlignment: false
DisableFormat: false
ForEachMacros: [ FOR_EACH_RANGE, FOR_EACH, ]
IncludeCategories:
- Regex: '^<.*\.h(pp)?>'
Priority: 1
- Regex: '^<.*'
Priority: 2
- Regex: '.*'
Priority: 3
IndentCaseLabels: true
IndentWidth: 2
IndentWrappedFunctionNames: false
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBlockIndentWidth: 2
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: false
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Left
ReflowComments: true
SortIncludes: true
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 1
SpacesInAngles: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Cpp11
TabWidth: 8
UseTab: Never
...
-#include <dragon/core/workspace.h>
-#include <dragon/utils/math_utils.h>
#include "nms_op.h"
#include "../utils/detection_utils.h"

namespace dragon {

template <class Context>
template <typename T>
void NonMaxSuppressionOp<Context>::DoRunWithType() {
  int num_selected;
  utils::detection::ApplyNMS(
      Output(0)->count(),
      Output(0)->count(),
      iou_threshold_,
      Input(0).template mutable_data<T, Context>(),
      Output(0)->template mutable_data<int64_t, CPUContext>(),
      num_selected,
      ctx());
  Output(0)->Reshape({num_selected});
}

template <class Context>
void NonMaxSuppressionOp<Context>::RunOnDevice() {
  CHECK(Input(0).ndim() == 2 && Input(0).dim(1) == 5)
      << "\nThe dimensions of boxes should be (num_boxes, 5).";
  Output(0)->Reshape({Input(0).dim(0)});
  DispatchHelper<TensorTypes<float>>::Call(this, Input(0));
}

DEPLOY_CPU(NonMaxSuppression);
...@@ -41,4 +38,4 @@ OPERATOR_SCHEMA(NonMaxSuppression).NumInputs(1).NumOutputs(1);
NO_GRADIENT(NonMaxSuppression);

} // namespace dragon
...@@ -5,7 +5,7 @@
 * You should have received a copy of the BSD 2-Clause License
 * along with the software. If not, See,
 *
 * <https://opensource.org/licenses/BSD-2-Clause>
 *
 * ------------------------------------------------------------
 */
...@@ -20,20 +20,20 @@ namespace dragon {
template <class Context>
class NonMaxSuppressionOp final : public Operator<Context> {
 public:
  NonMaxSuppressionOp(const OperatorDef& def, Workspace* ws)
      : Operator<Context>(def, ws),
        iou_threshold_(OpArg<float>("iou_threshold", 0.5f)) {}
  USE_OPERATOR_FUNCTIONS;

  void RunOnDevice() override;

  template <typename T>
  void DoRunWithType();

 protected:
  float iou_threshold_;
};

} // namespace dragon

#endif // SEETADET_CXX_OPERATORS_NMS_OP_H_
-#include <dragon/core/workspace.h>
-#include <dragon/utils/math_utils.h>
+#include <dragon/utils/math_functions.h>
#include "../utils/detection_utils.h"
#include "retinanet_decoder_op.h"

namespace dragon {

template <class Context>
template <typename T>
void RetinaNetDecoderOp<Context>::DoRunWithType() {
  using BT = float; // DType of BBox
  using BC = CPUContext; // Context of BBox
  int feat_h, feat_w;
  int C = Input(-3).dim(2), A, K;
  int total_proposals = 0;
  int num_candidates, num_boxes, num_proposals;
  auto* batch_scores = Input(-3).template data<T, BC>();
  auto* batch_deltas = Input(-2).template data<T, BC>();
  auto* im_info = Input(-1).template data<BT, BC>();
  auto* y = Output(0)->template mutable_data<BT, BC>();
  for (int n = 0; n < num_images_; ++n) {
    BT im_h = im_info[0];
    BT im_w = im_info[1];
    BT im_scale_h = im_info[2];
    BT im_scale_w = im_info[2];
    if (Input(-1).dim(1) == 4) im_scale_w = im_info[3];
    auto* scores = batch_scores + n * Input(-3).stride(0);
    auto* deltas = batch_deltas + n * Input(-2).stride(0);
    CHECK_EQ(strides_.size(), InputSize() - 3)
        << "\nGiven " << strides_.size() << " strides "
        << "and " << InputSize() - 3 << " features";
    // Select the top-k candidates as proposals
    num_boxes = Input(-3).dim(1);
    num_candidates = Input(-3).count(1);
    roi_indices_.resize(num_candidates);
    num_candidates = 0;
    for (int i = 0; i < roi_indices_.size(); ++i)
      if (scores[i] > score_thr_) roi_indices_[num_candidates++] = i;
    scores_.resize(num_candidates);
    for (int i = 0; i < num_candidates; ++i)
      scores_[i] = scores[roi_indices_[i]];
    num_proposals = std::min(num_candidates, (int)pre_nms_topn_);
    utils::math::ArgPartition(
        num_candidates, num_proposals, true, scores_.data(), indices_);
    for (int i = 0; i < num_proposals; ++i)
      indices_[i] = roi_indices_[indices_[i]];
    // Decode the candidates
    int base_offset = 0;
    for (int i = 0; i < strides_.size(); i++) {
      feat_h = Input(i).dim(2);
      feat_w = Input(i).dim(3);
      K = feat_h * feat_w;
      A = int(ratios_.size() * scales_.size());
      anchors_.resize((size_t)(A * 4));
      utils::detection::GenerateAnchors(
          strides_[i],
          (int)ratios_.size(),
          (int)scales_.size(),
          ratios_.data(),
          scales_.data(),
          anchors_.data());
      utils::detection::GenerateGridAnchors(
          num_proposals,
          C,
          A,
          feat_h,
          feat_w,
          strides_[i],
          base_offset,
          anchors_.data(),
          indices_.data(),
          y);
      base_offset += (A * K);
    }
    utils::detection::GenerateMCProposals(
        num_proposals,
        num_boxes,
        C,
        n,
        im_h,
        im_w,
        im_scale_h,
        im_scale_w,
        scores,
        deltas,
        indices_.data(),
        y);
    total_proposals += num_proposals;
    y += (num_proposals * 7);
    im_info += Input(-1).dim(1);
  }
  Output(0)->Reshape({total_proposals, 7});
}

template <class Context>
void RetinaNetDecoderOp<Context>::RunOnDevice() {
  num_images_ = Input(0).dim(0);
  CHECK_EQ(Input(-1).dim(0), num_images_)
      << "\nExcepted " << num_images_ << " groups info, got "
      << Input(-1).dim(0) << ".";
  Output(0)->Reshape({num_images_ * pre_nms_topn_, 7});
  DispatchHelper<TensorTypes<float>>::Call(this, Input(-3));
}

DEPLOY_CPU(RetinaNetDecoder);
...@@ -123,8 +113,6 @@ DEPLOY_CPU(RetinaNetDecoder);
DEPLOY_CUDA(RetinaNetDecoder);
#endif
-OPERATOR_SCHEMA(RetinaNetDecoder)
-    .NumInputs(3, INT_MAX)
-    .NumOutputs(1, INT_MAX);
+OPERATOR_SCHEMA(RetinaNetDecoder).NumInputs(3, INT_MAX).NumOutputs(1, INT_MAX);

} // namespace dragon
...@@ -5,7 +5,7 @@
 * You should have received a copy of the BSD 2-Clause License
 * along with the software. If not, See,
 *
 * <https://opensource.org/licenses/BSD-2-Clause>
 *
 * ------------------------------------------------------------
 */
...@@ -20,27 +20,27 @@ namespace dragon {
template <class Context>
class RetinaNetDecoderOp final : public Operator<Context> {
 public:
  RetinaNetDecoderOp(const OperatorDef& def, Workspace* ws)
      : Operator<Context>(def, ws),
        strides_(OpArgs<int64_t>("strides")),
        ratios_(OpArgs<float>("ratios")),
        scales_(OpArgs<float>("scales")),
        pre_nms_topn_(OpArg<int64_t>("pre_nms_top_n", 6000)),
        score_thr_(OpArg<float>("score_thresh", 0.05f)) {}
  USE_OPERATOR_FUNCTIONS;

  void RunOnDevice() override;

  template <typename T>
  void DoRunWithType();

 protected:
  float score_thr_;
  vec64_t strides_, indices_, roi_indices_;
  vector<float> ratios_, scales_, scores_, anchors_;
  int64_t num_images_, pre_nms_topn_;
};

} // namespace dragon

#endif // SEETADET_CXX_OPERATORS_RETINANET_DECODER_OP_H_
-#include <dragon/core/workspace.h>
-#include <dragon/utils/math_utils.h>
+#include <dragon/utils/math_functions.h>
#include "../utils/detection_utils.h"
#include "rpn_decoder_op.h"

namespace dragon {

template <class Context>
template <typename T>
void RPNDecoderOp<Context>::DoRunWithType() {
  using BT = float; // DType of BBox
  using BC = CPUContext; // Context of BBox
  int feat_h, feat_w, K, A;
  int total_rois = 0, num_rois;
  int num_candidates, num_proposals;
  auto* batch_scores = Input(-3).template data<T, BC>();
  auto* batch_deltas = Input(-2).template data<T, BC>();
  auto* im_info = Input(-1).template data<BT, BC>();
  auto* y = Output(0)->template mutable_data<BT, BC>();
  for (int n = 0; n < num_images_; ++n) {
    const BT im_h = im_info[0];
    const BT im_w = im_info[1];
    const BT scale = im_info[2];
    const BT min_box_h = min_size_ * scale;
    const BT min_box_w = min_size_ * scale;
    auto* scores = batch_scores + n * Input(-3).stride(0);
    auto* deltas = batch_deltas + n * Input(-2).stride(0);
    if (strides_.size() == 1) {
      // Case 1: single stride
      feat_h = Input(0).dim(2);
      feat_w = Input(0).dim(3);
      K = feat_h * feat_w;
      A = int(ratios_.size() * scales_.size());
      // Select the Top-K candidates as proposals
      num_candidates = A * K;
      num_proposals = std::min(num_candidates, (int)pre_nms_topn_);
      utils::math::ArgPartition(
          num_candidates, num_proposals, true, scores, indices_);
      // Decode the candidates
      anchors_.resize((size_t)(A * 4));
      proposals_.Reshape({num_proposals, 5});
      utils::detection::GenerateAnchors(
          strides_[0],
          (int)ratios_.size(),
          (int)scales_.size(),
          ratios_.data(),
          scales_.data(),
          anchors_.data());
      utils::detection::GenerateGridAnchors(
          num_proposals,
          A,
          feat_h,
          feat_w,
          strides_[0],
          0,
          anchors_.data(),
          indices_.data(),
          proposals_.template mutable_data<BT, BC>());
      utils::detection::GenerateSSProposals(
          K,
          num_proposals,
          im_h,
          im_w,
          min_box_h,
          min_box_w,
          scores,
          deltas,
          indices_.data(),
          proposals_.template mutable_data<BT, BC>());
      // Sort, NMS and Retrieve
      utils::detection::SortProposals(
          0,
          num_proposals - 1,
          num_proposals,
          proposals_.template mutable_data<BT, BC>());
      utils::detection::ApplyNMS(
          num_proposals,
          post_nms_topn_,
          nms_thr_,
          proposals_.template mutable_data<BT, Context>(),
          roi_indices_.data(),
          num_rois,
          ctx());
      utils::detection::RetrieveRoIs(
          num_rois,
          n,
          proposals_.template data<BT, BC>(),
          roi_indices_.data(),
          y);
    } else if (strides_.size() > 1) {
      // Case 2: multiple strides
      CHECK_EQ(strides_.size(), InputSize() - 3)
          << "\nGiven " << strides_.size() << " strides "
          << "and " << InputSize() - 3 << " feature inputs";
      CHECK_EQ(strides_.size(), scales_.size())
          << "\nGiven " << strides_.size() << " strides "
          << "and " << scales_.size() << " scales";
      // Select the top-k candidates as proposals
      num_candidates = Input(-3).dim(1);
      num_proposals = std::min(num_candidates, (int)pre_nms_topn_);
      utils::math::ArgPartition(
          num_candidates, num_proposals, true, scores, indices_);
      // Decode the candidates
      int base_offset = 0;
      proposals_.Reshape({num_proposals, 5});
      auto* proposals = proposals_.template mutable_data<BT, BC>();
      for (int i = 0; i < strides_.size(); i++) {
        feat_h = Input(i).dim(2);
        feat_w = Input(i).dim(3);
        K = feat_h * feat_w;
        A = (int)ratios_.size();
        anchors_.resize((size_t)(A * 4));
        utils::detection::GenerateAnchors(
            strides_[i],
            (int)ratios_.size(),
            1,
            ratios_.data(),
            scales_.data(),
            anchors_.data());
        utils::detection::GenerateGridAnchors(
            num_proposals,
            A,
            feat_h,
            feat_w,
            strides_[i],
            base_offset,
            anchors_.data(),
            indices_.data(),
            proposals);
        base_offset += (A * K);
      }
      utils::detection::GenerateMSProposals(
          num_candidates,
          num_proposals,
          im_h,
          im_w,
          min_box_h,
          min_box_w,
          scores,
          deltas,
          &indices_[0],
          proposals);
      // Sort, NMS and Retrieve
      utils::detection::SortProposals(
          0, num_proposals - 1, num_proposals, proposals);
      utils::detection::ApplyNMS(
          num_proposals,
          post_nms_topn_,
          nms_thr_,
          proposals_.template mutable_data<BT, Context>(),
          roi_indices_.data(),
          num_rois,
          ctx());
      utils::detection::RetrieveRoIs(
          num_rois, n, proposals, roi_indices_.data(), y);
    } else {
      LOG(FATAL) << "Excepted at least one stride for proposals.";
    }
    total_rois += num_rois;
    y += (num_rois * 5);
    im_info += Input(-1).dim(1);
  }

  Output(0)->Reshape({total_rois, 5});

  // Distribute rois into K bins
  if (OutputSize() > 1) {
    CHECK_EQ(max_level_ - min_level_ + 1, OutputSize())
        << "\nExcepted " << OutputSize() << " outputs for levels "
        << "between [" << min_level_ << ", " << max_level_ << "].";
    vector<BT*> ys(OutputSize());
    vector<vec64_t> bins(OutputSize());
    Tensor RoIs;
    RoIs.ReshapeLike(*Output(0));

    auto* rois = RoIs.template mutable_data<BT, BC>();
    ctx()->template Copy<BT, BC, BC>(
        Output(0)->count(), rois, Output(0)->template data<BT, BC>());

    utils::detection::CollectRoIs(
        total_rois,
        min_level_,
        max_level_,
        canonical_level_,
        canonical_scale_,
        rois,
        bins);

    for (int i = 0; i < OutputSize(); i++) {
      Output(i)->Reshape({std::max((int)bins[i].size(), 1), 5});
      ys[i] = Output(i)->template mutable_data<BT, BC>();
    }

    utils::detection::DistributeRoIs(bins, rois, ys);
  }
}

template <class Context>
void RPNDecoderOp<Context>::RunOnDevice() {
  num_images_ = Input(0).dim(0);
  CHECK_EQ(Input(-1).dim(0), num_images_)
      << "\nExcepted " << num_images_ << " groups info, got "
      << Input(-1).dim(0) << ".";
  roi_indices_.resize(post_nms_topn_);
  Output(0)->Reshape({num_images_ * post_nms_topn_, 5});
  DispatchHelper<TensorTypes<float>>::Call(this, Input(-3));
}

DEPLOY_CPU(RPNDecoder);
...@@ -241,8 +218,6 @@ DEPLOY_CPU(RPNDecoder);
DEPLOY_CUDA(RPNDecoder);
#endif
-OPERATOR_SCHEMA(RPNDecoder)
-    .NumInputs(3, INT_MAX)
-    .NumOutputs(1, INT_MAX);
+OPERATOR_SCHEMA(RPNDecoder).NumInputs(3, INT_MAX).NumOutputs(1, INT_MAX);

} // namespace dragon
...@@ -5,7 +5,7 @@
 * You should have received a copy of the BSD 2-Clause License
 * along with the software. If not, See,
 *
 * <https://opensource.org/licenses/BSD-2-Clause>
 *
 * ------------------------------------------------------------
 */
...@@ -20,36 +20,36 @@ namespace dragon {
template <class Context>
class RPNDecoderOp final : public Operator<Context> {
 public:
  RPNDecoderOp(const OperatorDef& def, Workspace* ws)
      : Operator<Context>(def, ws),
        strides_(OpArgs<int64_t>("strides")),
        ratios_(OpArgs<float>("ratios")),
        scales_(OpArgs<float>("scales")),
        pre_nms_topn_(OpArg<int64_t>("pre_nms_top_n", 6000)),
        post_nms_topn_(OpArg<int64_t>("post_nms_top_n", 300)),
        nms_thr_(OpArg<float>("nms_thresh", 0.7f)),
        min_size_(OpArg<int64_t>("min_size", 16)),
        min_level_(OpArg<int64_t>("min_level", 2)),
        max_level_(OpArg<int64_t>("max_level", 5)),
        canonical_level_(OpArg<int64_t>("canonical_level", 4)),
        canonical_scale_(OpArg<int64_t>("canonical_scale", 224)) {}
  USE_OPERATOR_FUNCTIONS;

  void RunOnDevice() override;

  template <typename T>
  void DoRunWithType();

 protected:
  float nms_thr_;
  vec64_t strides_, indices_, roi_indices_;
  vector<float> ratios_, scales_, scores_, anchors_;
  int64_t min_size_, pre_nms_topn_, post_nms_topn_;
  int64_t num_images_, min_level_, max_level_;
  int64_t canonical_level_, canonical_scale_;
  Tensor proposals_;
};

} // namespace dragon

#endif // SEETADET_CXX_OPERATORS_RPN_DECODER_OP_H_
...@@ -5,7 +5,7 @@
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
...@@ -15,25 +15,35 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

+import glob

from distutils.core import setup
from dragon.tools import cpp_extension

if cpp_extension.CUDA_HOME is not None and \
        cpp_extension._cuda.is_available():
    Extension = cpp_extension.CUDAExtension
else:
    Extension = cpp_extension.CppExtension

+def find_sources(*dirs):
+    ext_suffixes = ['.cc']
+    if Extension is cpp_extension.CUDAExtension:
+        ext_suffixes.append('.cu')
+    sources = []
+    for path in dirs:
+        for ext_suffix in ext_suffixes:
+            sources += glob.glob(
+                path + '/*' + ext_suffix,
+                recursive=True,
+            )
+    return sources

ext_modules = [
    Extension(
        name='install.lib.modules._C',
-        sources=[
-            'utils/detection_utils.cc',
-            'utils/detection_utils.cu',
-            'operators/nms_op.cc',
-            'operators/retinanet_decoder_op.cc',
-            'operators/rpn_decoder_op.cc',
-        ],
+        sources=find_sources('**'),
    ),
]
......
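For reference (a sketch, not output from the actual build): the new find_sources('**') call globs compilation units recursively instead of listing them by hand. Given the files this commit touches, it is expected to pick up the explicit source list removed above when run from the same directory; the exact result depends on the working directory at build time.

# Sketch only: mimics the glob pattern used by find_sources('**') above.
import glob

sources = []
for suffix in ('.cc', '.cu'):
    sources += glob.glob('**/*' + suffix, recursive=True)
# Expected to include, among others:
#   operators/nms_op.cc, operators/retinanet_decoder_op.cc,
#   operators/rpn_decoder_op.cc, utils/detection_utils.cc,
#   utils/detection_utils.cu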
-#include <dragon/core/context.h>
#include "detection_utils.h"
+#include <dragon/core/context.h>

namespace dragon {
...@@ -9,45 +9,46 @@ namespace detection {
template <typename T>
T IoU(const T A[], const T B[]) {
  if (A[0] > B[2] || A[1] > B[3] || A[2] < B[0] || A[3] < B[1]) return 0;
  const T x1 = std::max(A[0], B[0]);
  const T y1 = std::max(A[1], B[1]);
  const T x2 = std::min(A[2], B[2]);
  const T y2 = std::min(A[3], B[3]);
  const T width = std::max((T)0, x2 - x1 + 1);
  const T height = std::max((T)0, y2 - y1 + 1);
  const T area = width * height;
  const T A_area = (A[2] - A[0] + 1) * (A[3] - A[1] + 1);
  const T B_area = (B[2] - B[0] + 1) * (B[3] - B[1] + 1);
  return area / (A_area + B_area - area);
}

template <>
void ApplyNMS<float, CPUContext>(
    const int num_boxes,
    const int max_keeps,
    const float thresh,
    const float* boxes,
    int64_t* keep_indices,
    int& num_keep,
    CPUContext* ctx) {
  int count = 0;
  std::vector<char> is_dead(num_boxes);
  for (int i = 0; i < num_boxes; ++i)
    is_dead[i] = 0;
  for (int i = 0; i < num_boxes; ++i) {
    if (is_dead[i]) continue;
    keep_indices[count++] = i;
    if (count == max_keeps) break;
    for (int j = i + 1; j < num_boxes; ++j)
      if (!is_dead[j] && IoU(&boxes[i * 5], &boxes[j * 5]) > thresh) {
        is_dead[j] = 1;
      }
  }
  num_keep = count;
}

} // namespace detection
} // namespace utils
} // namespace dragon
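A standalone check of the IoU convention used above (inclusive pixel coordinates, hence the +1 terms); this Python sketch is illustrative only and not part of the repository.

# Standalone check of the IoU above (inclusive coordinates, hence the +1).
def iou(a, b):
    if a[0] > b[2] or a[1] > b[3] or a[2] < b[0] or a[3] < b[1]:
        return 0.0
    x1, y1 = max(a[0], b[0]), max(a[1], b[1])
    x2, y2 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0.0, x2 - x1 + 1) * max(0.0, y2 - y1 + 1)
    area_a = (a[2] - a[0] + 1) * (a[3] - a[1] + 1)
    area_b = (b[2] - b[0] + 1) * (b[3] - b[1] + 1)
    return inter / (area_a + area_b - inter)

# Two 10x10 boxes overlapping in a 5x10 strip: IoU = 50 / (100 + 100 - 50) = 1/3.
assert abs(iou([0, 0, 9, 9], [5, 0, 14, 9]) - 1.0 / 3.0) < 1e-6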
...@@ -9,127 +9,121 @@ namespace utils {
namespace detection {

-#define DIV_UP(m,n) ((m) / (n) + ((m) % (n) > 0))
+#define DIV_UP(m, n) ((m) / (n) + ((m) % (n) > 0))
#define NUM_THREADS 64

namespace {

template <typename T>
__device__ bool _CheckIoU(const T* a, const T* b, const float thresh) {
  const T x1 = max(a[0], b[0]);
  const T y1 = max(a[1], b[1]);
  const T x2 = min(a[2], b[2]);
  const T y2 = min(a[3], b[3]);
  const T width = max(T(0), x2 - x1 + 1);
  const T height = max(T(0), y2 - y1 + 1);
  const T inter = width * height;
  const T Sa = (a[2] - a[0] + T(1)) * (a[3] - a[1] + T(1));
  const T Sb = (b[2] - b[0] + T(1)) * (b[3] - b[1] + T(1));
  return inter > thresh * (Sa + Sb - inter);
}

template <typename T>
__global__ void _NonMaxSuppression(
    const int num_blocks,
    const int num_boxes,
    const T thresh,
    const T* dev_boxes,
    uint64_t* dev_mask) {
  const int row_start = blockIdx.y;
  const int col_start = blockIdx.x;
  if (row_start > col_start) return;
  const int row_size = min(num_boxes - row_start * NUM_THREADS, NUM_THREADS);
  const int col_size = min(num_boxes - col_start * NUM_THREADS, NUM_THREADS);
  __shared__ T block_boxes[NUM_THREADS * 4];
  if (threadIdx.x < col_size) {
    const int c1 = threadIdx.x * 4;
    const int c2 = (col_start * NUM_THREADS + threadIdx.x) * 5;
    block_boxes[c1] = dev_boxes[c2];
    block_boxes[c1 + 1] = dev_boxes[c2 + 1];
    block_boxes[c1 + 2] = dev_boxes[c2 + 2];
    block_boxes[c1 + 3] = dev_boxes[c2 + 3];
  }
  __syncthreads();
  if (threadIdx.x < row_size) {
    const int index = row_start * NUM_THREADS + threadIdx.x;
    const T* dev_box = dev_boxes + index * 5;
    unsigned long long val = 0;
    const int start = (row_start == col_start) ? (threadIdx.x + 1) : 0;
    for (int i = start; i < col_size; ++i) {
      if (_CheckIoU(dev_box, block_boxes + i * 4, thresh)) {
        val |= 1ULL << i;
      }
    }
    dev_mask[index * num_blocks + col_start] = val;
  }
}

} // namespace

template <>
void ApplyNMS<float, CUDAContext>(
    const int num_boxes,
    const int max_keeps,
    const float thresh,
    const float* boxes,
    int64_t* keep_indices,
    int& num_keep,
    CUDAContext* ctx) {
  const int num_blocks = DIV_UP(num_boxes, NUM_THREADS);

  vector<uint64_t> mask_host(num_boxes * num_blocks);
  auto* mask_dev = (uint64_t*)ctx->New(mask_host.size() * sizeof(uint64_t));

  _NonMaxSuppression<<<
      dim3(num_blocks, num_blocks),
      NUM_THREADS,
      0,
      ctx->cuda_stream()>>>(num_blocks, num_boxes, thresh, boxes, mask_dev);

  CUDA_CHECK(cudaMemcpyAsync(
      mask_host.data(),
      mask_dev,
      mask_host.size() * sizeof(uint64_t),
      cudaMemcpyDeviceToHost,
      ctx->cuda_stream()));

  ctx->FinishDeviceComputation();

  vector<uint64_t> dead_bit(num_blocks);
  memset(&dead_bit[0], 0, sizeof(uint64_t) * num_blocks);

  int num_selected = 0;
  for (int i = 0; i < num_boxes; ++i) {
    const int nblock = i / NUM_THREADS;
    const int inblock = i % NUM_THREADS;
    if (!(dead_bit[nblock] & (1ULL << inblock))) {
      keep_indices[num_selected++] = i;
      auto* mask_i = &mask_host[0] + i * num_blocks;
      for (int j = nblock; j < num_blocks; ++j)
        dead_bit[j] |= mask_i[j];
      if (num_selected == max_keeps) break;
    }
  }
  num_keep = num_selected;
  ctx->Delete(mask_dev);
}

} // namespace detection
} // namespace utils
} // namespace dragon

#endif // USE_CUDA
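The CUDA kernel above only fills per-box suppression bitmasks; the greedy keep/drop pass still runs on the host. A small Python sketch of that selection logic (illustrative only, with the 64-bit masks abstracted as plain sets of suppressed indices):

# Illustrative sketch of the host-side greedy pass above: each box owns a set
# of later boxes it suppresses; a box is kept only if no previously kept box
# has already marked it dead.
def greedy_select(suppress, num_boxes, max_keeps):
    dead = set()
    keep = []
    for i in range(num_boxes):
        if i in dead:
            continue
        keep.append(i)
        if len(keep) == max_keeps:
            break
        dead |= suppress[i]
    return keep

# Boxes 0 and 1 overlap heavily, box 2 is separate: keep [0, 2].
assert greedy_select({0: {1}, 1: set(), 2: set()}, 3, 10) == [0, 2]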
...@@ -5,7 +5,7 @@
 * You should have received a copy of the BSD 2-Clause License
 * along with the software. If not, See,
 *
 * <https://opensource.org/licenses/BSD-2-Clause>
 *
 * ------------------------------------------------------------
 */
...@@ -13,8 +13,7 @@
#ifndef SEETADET_CXX_UTILS_DETECTION_UTILS_H_
#define SEETADET_CXX_UTILS_DETECTION_UTILS_H_

-#include "dragon/core/context.h"
-#include "dragon/core/operator.h"
+#include "dragon/core/common.h"

namespace dragon {
...@@ -24,390 +23,409 @@ namespace detection {
#define ROUND(x) ((int)((x) + (T)0.5))

-/******************** BBox ********************/
+/*!
+ * Box API
+ */

template <typename T>
inline int FilterBoxes(
    const T dx,
    const T dy,
    const T d_log_w,
    const T d_log_h,
    const T im_w,
    const T im_h,
    const T min_box_w,
    const T min_box_h,
    T* bbox) {
  const T w = bbox[2] - bbox[0] + 1;
  const T h = bbox[3] - bbox[1] + 1;
  const T ctr_x = bbox[0] + (T)0.5 * w;
  const T ctr_y = bbox[1] + (T)0.5 * h;
  const T pred_ctr_x = dx * w + ctr_x;
  const T pred_ctr_y = dy * h + ctr_y;
  const T pred_w = exp(d_log_w) * w;
  const T pred_h = exp(d_log_h) * h;
  bbox[0] = pred_ctr_x - (T)0.5 * pred_w;
  bbox[1] = pred_ctr_y - (T)0.5 * pred_h;
  bbox[2] = pred_ctr_x + (T)0.5 * pred_w;
  bbox[3] = pred_ctr_y + (T)0.5 * pred_h;
  bbox[0] = std::max((T)0, std::min(bbox[0], im_w - 1));
  bbox[1] = std::max((T)0, std::min(bbox[1], im_h - 1));
  bbox[2] = std::max((T)0, std::min(bbox[2], im_w - 1));
  bbox[3] = std::max((T)0, std::min(bbox[3], im_h - 1));
  const T bbox_w = bbox[2] - bbox[0] + 1;
  const T bbox_h = bbox[3] - bbox[1] + 1;
  return (bbox_w >= min_box_w) * (bbox_h >= min_box_h);
}

template <typename T>
inline void BBoxTransform(
    const T dx,
    const T dy,
    const T d_log_w,
    const T d_log_h,
    const T im_w,
    const T im_h,
    const T im_scale_h,
    const T im_scale_w,
    T* bbox) {
  const T w = bbox[2] - bbox[0] + 1;
  const T h = bbox[3] - bbox[1] + 1;
  const T ctr_x = bbox[0] + (T)0.5 * w;
  const T ctr_y = bbox[1] + (T)0.5 * h;
  const T pred_ctr_x = dx * w + ctr_x;
  const T pred_ctr_y = dy * h + ctr_y;
  const T pred_w = exp(d_log_w) * w;
  const T pred_h = exp(d_log_h) * h;
  bbox[0] = pred_ctr_x - (T)0.5 * pred_w;
  bbox[1] = pred_ctr_y - (T)0.5 * pred_h;
  bbox[2] = pred_ctr_x + (T)0.5 * pred_w;
  bbox[3] = pred_ctr_y + (T)0.5 * pred_h;
  bbox[0] = std::max((T)0, std::min(bbox[0], im_w - 1)) / im_scale_w;
  bbox[1] = std::max((T)0, std::min(bbox[1], im_h - 1)) / im_scale_h;
  bbox[2] = std::max((T)0, std::min(bbox[2], im_w - 1)) / im_scale_w;
  bbox[3] = std::max((T)0, std::min(bbox[3], im_h - 1)) / im_scale_h;
}
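A Python rendering of the (dx, dy, d_log_w, d_log_h) decoding shared by FilterBoxes and BBoxTransform above; a sketch for readability only, using the same inclusive-coordinate clipping as the C++ code.

# Sketch of the delta decoding above (box = [x1, y1, x2, y2], clipped to image).
import math

def decode_box(box, dx, dy, d_log_w, d_log_h, im_w, im_h):
    w = box[2] - box[0] + 1
    h = box[3] - box[1] + 1
    ctr_x = box[0] + 0.5 * w
    ctr_y = box[1] + 0.5 * h
    pred_w = math.exp(d_log_w) * w
    pred_h = math.exp(d_log_h) * h
    pred_ctr_x = dx * w + ctr_x
    pred_ctr_y = dy * h + ctr_y
    x1 = min(max(pred_ctr_x - 0.5 * pred_w, 0), im_w - 1)
    y1 = min(max(pred_ctr_y - 0.5 * pred_h, 0), im_h - 1)
    x2 = min(max(pred_ctr_x + 0.5 * pred_w, 0), im_w - 1)
    y2 = min(max(pred_ctr_y + 0.5 * pred_h, 0), im_h - 1)
    return [x1, y1, x2, y2]

# Zero deltas keep the center and size; the +1 width convention means the
# corners are not an exact identity.
assert decode_box([10, 10, 19, 19], 0, 0, 0, 0, 100, 100) == [10.0, 10.0, 20.0, 20.0]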
-/******************** Anchor ********************/
+/*!
+ * Anchor API
+ */

template <typename T>
inline void GenerateAnchors(
    int base_size,
    const int num_ratios,
    const int num_scales,
    const T* ratios,
    const T* scales,
    T* anchors) {
  const T base_area = (T)(base_size * base_size);
  const T center = (T)0.5 * (base_size - (T)1);
  T* offset_anchors = anchors;
  for (int i = 0; i < num_ratios; ++i) {
    const T ratio_w = (T)ROUND(sqrt(base_area / ratios[i]));
    const T ratio_h = (T)ROUND(ratio_w * ratios[i]);
    for (int j = 0; j < num_scales; ++j) {
      const T scale_w = (T)0.5 * (ratio_w * scales[j] - (T)1);
      const T scale_h = (T)0.5 * (ratio_h * scales[j] - (T)1);
      offset_anchors[0] = center - scale_w;
      offset_anchors[1] = center - scale_h;
      offset_anchors[2] = center + scale_w;
      offset_anchors[3] = center + scale_h;
      offset_anchors += 4;
    }
  }
}
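A Python sketch of the anchor layout GenerateAnchors produces: num_ratios * num_scales boxes, all centered on the same point, stored as flat (x1, y1, x2, y2) rows. The base size, ratios, and scales below are illustrative values, not ones fixed by this file.

# Sketch mirroring GenerateAnchors above (ROUND emulated as int(x + 0.5)).
import math

def generate_anchors(base_size, ratios, scales):
    base_area = float(base_size * base_size)
    center = 0.5 * (base_size - 1)
    anchors = []
    for r in ratios:
        ratio_w = int(math.sqrt(base_area / r) + 0.5)
        ratio_h = int(ratio_w * r + 0.5)
        for s in scales:
            half_w = 0.5 * (ratio_w * s - 1)
            half_h = 0.5 * (ratio_h * s - 1)
            anchors.append(
                [center - half_w, center - half_h, center + half_w, center + half_h])
    return anchors

# Example: 3 ratios x 3 scales -> 9 anchors per grid cell.
assert len(generate_anchors(16, [0.5, 1, 2], [8, 16, 32])) == 9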
template <typename T>
inline void GenerateGridAnchors(
    const int num_proposals,
    const int num_anchors,
    const int feat_h,
    const int feat_w,
    const int stride,
    const int base_offset,
    const T* anchors,
    const int64_t* indices,
    T* proposals) {
  T x, y;
  int idx_3d, a, h, w;
  int idx_range = num_anchors * feat_h * feat_w;
  for (int i = 0; i < num_proposals; ++i) {
    idx_3d = (int)indices[i] - base_offset;
    if (idx_3d >= 0 && idx_3d < idx_range) {
      w = idx_3d % feat_w;
      h = (idx_3d / feat_w) % feat_h;
      a = idx_3d / feat_w / feat_h;
      x = (T)w * stride, y = (T)h * stride;
      auto* A = anchors + a * 4;
      auto* P = proposals + i * 5;
      P[0] = x + A[0], P[1] = y + A[1];
      P[2] = x + A[2], P[3] = y + A[3];
    }
  }
}

template <typename T>
inline void GenerateGridAnchors(
    const int num_proposals,
    const int num_classes,
    const int num_anchors,
    const int feat_h,
    const int feat_w,
    const int stride,
    const int base_offset,
    const T* anchors,
    const int64_t* indices,
    T* proposals) {
  T x, y;
  int idx_4d, a, h, w;
  int lr = num_classes * base_offset;
  int rr = num_classes * (num_anchors * feat_h * feat_w);
  for (int i = 0; i < num_proposals; ++i) {
    idx_4d = (int)indices[i] - lr;
    if (idx_4d >= 0 && idx_4d < rr) {
      idx_4d /= num_classes;
      w = idx_4d % feat_w;
      h = (idx_4d / feat_w) % feat_h;
      a = idx_4d / feat_w / feat_h;
      x = (T)w * stride, y = (T)h * stride;
      auto* A = anchors + a * 4;
      auto* P = proposals + i * 7 + 1;
      P[0] = x + A[0], P[1] = y + A[1];
      P[2] = x + A[2], P[3] = y + A[3];
    }
  }
}

-/******************** Proposal ********************/
+/*!
+ * Proposal API
+ */

template <typename T>
void GenerateSSProposals(
    const int K,
    const int num_proposals,
    const float im_h,
    const float im_w,
    const float min_box_h,
    const float min_box_w,
    const T* scores,
    const T* deltas,
    const int64_t* indices,
    T* proposals) {
  int64_t index, a, k;
  const float* delta;
  float* proposal = proposals;
  float dx, dy, d_log_w, d_log_h;
  for (int i = 0; i < num_proposals; ++i) {
    index = indices[i];
    a = index / K, k = index % K;
    delta = deltas + k;
    dx = delta[(a * 4 + 0) * K];
    dy = delta[(a * 4 + 1) * K];
    d_log_w = delta[(a * 4 + 2) * K];
    d_log_h = delta[(a * 4 + 3) * K];
    proposal[4] = FilterBoxes(
                      dx,
                      dy,
                      d_log_w,
                      d_log_h,
                      im_w,
                      im_h,
                      min_box_w,
                      min_box_h,
                      proposal) *
        scores[index];
    proposal += 5;
  }
}

template <typename T>
void GenerateMSProposals(
    const int num_candidates,
    const int num_proposals,
    const float im_h,
    const float im_w,
    const float min_box_h,
    const float min_box_w,
    const T* scores,
    const T* deltas,
    const int64_t* indices,
    T* proposals) {
  int64_t index;
  int64_t num_candidates_2x = 2 * num_candidates;
  int64_t num_candidates_3x = 3 * num_candidates;
  float* proposal = proposals;
  float dx, dy, d_log_w, d_log_h;
  for (int i = 0; i < num_proposals; ++i) {
    index = indices[i];
    dx = deltas[index];
    dy = deltas[num_candidates + index];
    d_log_w = deltas[num_candidates_2x + index];
    d_log_h = deltas[num_candidates_3x + index];
    proposal[4] = FilterBoxes(
                      dx,
                      dy,
                      d_log_w,
                      d_log_h,
                      im_w,
                      im_h,
                      min_box_w,
                      min_box_h,
                      proposal) *
        scores[index];
    proposal += 5;
  }
}

template <typename T>
void GenerateMCProposals(
    const int num_proposals,
    const int num_boxes,
    const int num_classes,
    const int im_idx,
    const float im_h,
    const float im_w,
    const float im_scale_h,
    const float im_scale_w,
    const T* scores,
    const T* deltas,
    const int64_t* indices,
    T* proposals) {
  int64_t index, cls;
  int64_t num_boxes_2x = 2 * num_boxes;
  int64_t num_boxes_3x = 3 * num_boxes;
  float* proposal = proposals;
  float dx, dy, d_log_w, d_log_h;
  for (int i = 0; i < num_proposals; ++i) {
    cls = indices[i] % num_classes;
    index = indices[i] / num_classes;
    dx = deltas[index];
    dy = deltas[num_boxes + index];
    d_log_w = deltas[num_boxes_2x + index];
    d_log_h = deltas[num_boxes_3x + index];
    proposal[0] = im_idx;
    BBoxTransform(
        dx,
d_log_w, d_log_h, dy,
im_w, im_h, d_log_w,
im_scale_h, im_scale_w, d_log_h,
proposal + 1 im_w,
); im_h,
proposal[5] = scores[indices[i]]; im_scale_h,
proposal[6] = cls + 1; im_scale_w,
proposal += 7; proposal + 1);
} proposal[5] = scores[indices[i]];
proposal[6] = cls + 1;
proposal += 7;
}
} }
template <typename T> template <typename T>
inline void SortProposals( inline void
const int start, SortProposals(const int start, const int end, const int num_top, T* proposals) {
const int end, const T pivot_score = proposals[start * 5 + 4];
const int num_top, int left = start + 1, right = end;
T* proposals) { while (left <= right) {
const T pivot_score = proposals[start * 5 + 4]; while (left <= end && proposals[left * 5 + 4] >= pivot_score)
int left = start + 1, right = end; ++left;
while (left <= right) { while (right > start && proposals[right * 5 + 4] <= pivot_score)
while (left <= end && proposals[left * 5 + 4] >= pivot_score) ++left; --right;
while (right > start && proposals[right * 5 + 4] <= pivot_score) --right; if (left <= right) {
if (left <= right) { for (int i = 0; i < 5; ++i)
for (int i = 0; i < 5; ++i) std::swap(proposals[left * 5 + i], proposals[right * 5 + i]);
std::swap(proposals[left * 5 + i], proposals[right * 5 + i]); ++left;
++left; --right;
--right;
}
} }
if (right > start) { }
for (int i = 0; i < 5; ++i) if (right > start) {
std::swap(proposals[start * 5 + i], proposals[right * 5 + i]); for (int i = 0; i < 5; ++i)
} std::swap(proposals[start * 5 + i], proposals[right * 5 + i]);
if (start < right - 1) SortProposals(start, right - 1, num_top, proposals); }
if (right + 1 < num_top && right + 1 < end) if (start < right - 1) SortProposals(start, right - 1, num_top, proposals);
SortProposals(right + 1, end, num_top, proposals); if (right + 1 < num_top && right + 1 < end)
SortProposals(right + 1, end, num_top, proposals);
} }
template <typename T> template <typename T>
inline void RetrieveRoIs( inline void RetrieveRoIs(
const int num_rois, const int num_rois,
const int roi_batch_ind, const int roi_batch_ind,
const T* proposals, const T* proposals,
const int64_t* roi_indices, const int64_t* roi_indices,
T* rois) { T* rois) {
for (int i = 0; i < num_rois; ++i) { for (int i = 0; i < num_rois; ++i) {
const T* proposal = proposals + roi_indices[i] * 5; const T* proposal = proposals + roi_indices[i] * 5;
rois[i * 5 + 0] = (T)roi_batch_ind; rois[i * 5 + 0] = (T)roi_batch_ind;
rois[i * 5 + 1] = proposal[0]; rois[i * 5 + 1] = proposal[0];
rois[i * 5 + 2] = proposal[1]; rois[i * 5 + 2] = proposal[1];
rois[i * 5 + 3] = proposal[2]; rois[i * 5 + 3] = proposal[2];
rois[i * 5 + 4] = proposal[3]; rois[i * 5 + 4] = proposal[3];
} }
} }
template <typename T> template <typename T>
inline int roi_level( inline int roi_level(
const int min_level, const int min_level,
const int max_level, const int max_level,
const int canonical_level, const int canonical_level,
const int canonical_scale, const int canonical_scale,
T* roi) { T* roi) {
T w = roi[3] - roi[1] + 1; T w = roi[3] - roi[1] + 1;
T h = roi[4] - roi[2] + 1; T h = roi[4] - roi[2] + 1;
// Refer to the settings of the paper // Refer to the settings of the paper
int level = canonical_level + std::log2( int level = canonical_level +
std::max(std::sqrt(w * h), (T)1) / (T)canonical_scale); std::log2(std::max(std::sqrt(w * h), (T)1) / (T)canonical_scale);
return std::min(max_level, std::max(min_level, level)); return std::min(max_level, std::max(min_level, level));
} }
template <typename T> template <typename T>
inline void CollectRoIs( inline void CollectRoIs(
const int num_rois, const int num_rois,
const int min_level, const int min_level,
const int max_level, const int max_level,
const int canonical_level, const int canonical_level,
const int canonical_scale, const int canonical_scale,
const T* rois, const T* rois,
vector<vec64_t>& roi_bins) { vector<vec64_t>& roi_bins) {
const T* roi = rois; const T* roi = rois;
for (int i = 0; i < num_rois; ++i) { for (int i = 0; i < num_rois; ++i) {
int bin_idx = roi_level(min_level, max_level, int bin_idx =
canonical_level, canonical_scale, roi); roi_level(min_level, max_level, canonical_level, canonical_scale, roi);
bin_idx = std::max(bin_idx - min_level, 0); bin_idx = std::max(bin_idx - min_level, 0);
roi_bins[bin_idx].push_back(i); roi_bins[bin_idx].push_back(i);
roi += 5; roi += 5;
} }
} }
template <typename T> template <typename T>
inline void DistributeRoIs( inline void DistributeRoIs(
const vector<vec64_t>& roi_bins, const vector<vec64_t>& roi_bins,
const T* rois, const T* rois,
vector<T*> outputs) { vector<T*> outputs) {
for (int i = 0; i < roi_bins.size(); i++) { for (int i = 0; i < roi_bins.size(); i++) {
auto* y = outputs[i]; auto* y = outputs[i];
if (roi_bins[i].size() == 0) { if (roi_bins[i].size() == 0) {
// Fake a tiny roi to avoid empty roi pooling // Fake a tiny roi to avoid empty roi pooling
y[0] = 0, y[1] = 0, y[2] = 0, y[3] = 1, y[4] = 1; y[0] = 0, y[1] = 0, y[2] = 0, y[3] = 1, y[4] = 1;
} else { } else {
for (int j = 0; j < roi_bins[i].size(); ++j) { for (int j = 0; j < roi_bins[i].size(); ++j) {
const T* roi = rois + roi_bins[i][j] * 5; const T* roi = rois + roi_bins[i][j] * 5;
for (int k = 0; k < 5; ++k) y[k] = roi[k]; for (int k = 0; k < 5; ++k)
y += 5; y[k] = roi[k];
} y += 5;
} }
} }
}
} }
/******************** NMS ********************/ /*!
* NMS API
*/
template <typename T, class Context> template <typename T, class Context>
void ApplyNMS( void ApplyNMS(
const int num_boxes, const int num_boxes,
const int max_keeps, const int max_keeps,
const T thresh, const T thresh,
const T* boxes, const T* boxes,
int64_t* keep_indices, int64_t* keep_indices,
int& num_keep, int& num_keep,
Context* ctx); Context* ctx);
} // namespace detection } // namespace detection
} // namespace utils } // namespace utils
} // namespace dragon } // namespace dragon
#endif // SEETADET_CXX_UTILS_DETECTION_UTILS_H_ #endif // SEETADET_CXX_UTILS_DETECTION_UTILS_H_
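For reference, the roi_level helper declared in this header follows the FPN assignment rule: a RoI of width w and height h goes to level canonical_level + log2(max(sqrt(w * h), 1) / canonical_scale), clamped to [min_level, max_level]. Below is a minimal Python sketch of the same rule; the default values 4 and 224 are the ones from the FPN paper and are only illustrative, not read from this header or from the project config.

import numpy as np

def assign_fpn_level(roi, min_level=2, max_level=5,
                     canonical_level=4, canonical_scale=224):
    # roi is [x1, y1, x2, y2]; mirror the truncation of the C++ int cast.
    w = roi[2] - roi[0] + 1
    h = roi[3] - roi[1] + 1
    level = int(canonical_level +
                np.log2(max(np.sqrt(w * h), 1.) / canonical_scale))
    return min(max_level, max(min_level, level))

print(assign_fpn_level([0, 0, 223, 223]))  # 224x224 RoI -> canonical level 4
print(assign_fpn_level([0, 0, 111, 111]))  # 112x112 RoI -> level 3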
...@@ -52,12 +52,9 @@ class AnchorTarget(object): ...@@ -52,12 +52,9 @@ class AnchorTarget(object):
gt_boxes_wide = box_util.dismantle_boxes(gt_boxes, num_images) gt_boxes_wide = box_util.dismantle_boxes(gt_boxes, num_images)
# Generate grid anchors from base # Generate grid anchors from base
all_anchors = \ grid_shapes = [f.shape[-2:] for f in features]
generate_grid_anchors( all_anchors = generate_grid_anchors(
features, grid_shapes, self.base_anchors, self.strides)
self.base_anchors,
self.strides,
)
num_anchors = all_anchors.shape[0] num_anchors = all_anchors.shape[0]
# Label: ``1`` is positive, ``0`` is negative, ``-1`` is don't care # Label: ``1`` is positive, ``0`` is negative, ``-1`` is don't care
......
...@@ -58,12 +58,9 @@ class Proposal(object): ...@@ -58,12 +58,9 @@ class Proposal(object):
# Get resources # Get resources
num_images = ims_info.shape[0] num_images = ims_info.shape[0]
all_anchors = \ grid_shapes = [f.shape[-2:] for f in features]
generate_grid_anchors( all_anchors = generate_grid_anchors(
features, grid_shapes, self.base_anchors, self.strides)
self.base_anchors,
self.strides,
)
# Prepare for the outputs # Prepare for the outputs
batch_rois = [] batch_rois = []
......
...@@ -19,40 +19,40 @@ import numpy as np ...@@ -19,40 +19,40 @@ import numpy as np
from seetadet.core.config import cfg from seetadet.core.config import cfg
def generate_grid_anchors(features, base_anchors, strides): def generate_grid_anchors(grid_shapes, base_anchors, strides):
num_strides = len(strides) num_strides = len(strides)
if len(features) != num_strides: if len(grid_shapes) != num_strides:
raise ValueError( raise ValueError(
'Given %d features for %d strides.' 'Given %d grids for %d strides.'
% (len(features), num_strides) % (len(grid_shapes), num_strides)
) )
# Generate proposals from shifted anchors # Generate proposals from shifted anchors
anchors_to_pack = [] anchors_to_pack = []
for i in range(len(features)): for i in range(len(grid_shapes)):
height, width = features[i].shape[-2:] height, width = grid_shapes[i]
shift_x = np.arange(0, width) * strides[i] shift_x = np.arange(0, width) * strides[i]
shift_y = np.arange(0, height) * strides[i] shift_y = np.arange(0, height) * strides[i]
shift_x, shift_y = np.meshgrid(shift_x, shift_y) shift_x, shift_y = np.meshgrid(shift_x, shift_y)
shifts = np.vstack((shift_x.ravel(), shift_y.ravel(), shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
shift_x.ravel(), shift_y.ravel())).transpose() shift_x.ravel(), shift_y.ravel())).transpose()
# Add A anchors (1, A, 4) to # Add a anchors (1, a, 4) to
# cell K shifts (K, 1, 4) to get # cell k shifts (k, 1, 4) to get
# shift anchors (K, A, 4) # shift anchors (k, a, 4)
# Reshape to (K * A, 4) shifted anchors # Reshape to (k * a, 4) shifted anchors
A = base_anchors[i].shape[0] a = base_anchors[i].shape[0]
K = shifts.shape[0] k = shifts.shape[0]
anchors = (base_anchors[i].reshape((1, A, 4)) + anchors = (base_anchors[i].reshape((1, a, 4)) +
shifts.reshape((1, K, 4)).transpose((1, 0, 2))) shifts.reshape((1, k, 4)).transpose((1, 0, 2)))
if num_strides > 1: if num_strides > 1:
# Transpose from (K, A, 4) to (A, K, 4) # Transpose from (K, A, 4) to (A, K, 4)
# We will pack it with other strides to # We will pack it with other strides to
# match the data format of (N, C, H, W) # match the data format of (N, C, H, W)
anchors = anchors.transpose((1, 0, 2)) anchors = anchors.transpose((1, 0, 2))
anchors = anchors.reshape((A * K, 4)) anchors = anchors.reshape((a * k, 4))
anchors_to_pack.append(anchors) anchors_to_pack.append(anchors)
else: else:
# Original order of Faster R-CNN # Original order of Faster R-CNN
return anchors.reshape((K * A, 4)) return anchors.reshape((k * a, 4))
return np.vstack(anchors_to_pack) return np.vstack(anchors_to_pack)
......
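The reshape-and-add in generate_grid_anchors above is plain NumPy broadcasting: (1, a, 4) base anchors plus (k, 1, 4) per-cell shifts give (k, a, 4) shifted anchors, flattened to (k * a, 4). A self-contained toy example with a 2x3 grid, stride 16 and two made-up base anchors (all values are illustrative):

import numpy as np

stride, height, width = 16, 2, 3                  # toy feature-map size
base_anchors = np.array([[-8., -8., 8., 8.],
                         [-16., -16., 16., 16.]]) # two made-up anchors

shift_x = np.arange(0, width) * stride
shift_y = np.arange(0, height) * stride
shift_x, shift_y = np.meshgrid(shift_x, shift_y)
shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
                    shift_x.ravel(), shift_y.ravel())).transpose()   # (k, 4)

a, k = base_anchors.shape[0], shifts.shape[0]
anchors = (base_anchors.reshape((1, a, 4)) +
           shifts.reshape((1, k, 4)).transpose((1, 0, 2)))           # (k, a, 4)
print(anchors.reshape((k * a, 4)).shape)          # (12, 4): 6 cells x 2 anchors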
...@@ -46,6 +46,9 @@ class AnchorTarget(object): ...@@ -46,6 +46,9 @@ class AnchorTarget(object):
ratios=self.ratios, ratios=self.ratios,
sizes=sizes, sizes=sizes,
)) ))
# Store the cached grid anchors
self.last_grid_shapes = None
self.last_grid_anchors = None
def __call__(self, features, gt_boxes): def __call__(self, features, gt_boxes):
num_images = cfg.TRAIN.IMS_PER_BATCH num_images = cfg.TRAIN.IMS_PER_BATCH
...@@ -58,12 +61,17 @@ class AnchorTarget(object): ...@@ -58,12 +61,17 @@ class AnchorTarget(object):
) )
# Generate grid anchors from base # Generate grid anchors from base
all_anchors = \ grid_shapes = [f.shape[-2:] for f in features]
generate_grid_anchors( if grid_shapes == self.last_grid_shapes:
features, all_anchors = self.last_grid_anchors
self.base_anchors, else:
self.strides, self.last_grid_shapes = grid_shapes
) self.last_grid_anchors = all_anchors = \
generate_grid_anchors(
grid_shapes,
self.base_anchors,
self.strides,
)
num_anchors = all_anchors.shape[0] num_anchors = all_anchors.shape[0]
# Label: ``1`` is positive, ``0`` is negative, ``-1`` is don't care # Label: ``1`` is positive, ``0`` is negative, ``-1`` is don't care
......
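The cache added above only regenerates grid anchors when the feature-map shapes change, so once the training scale repeats across iterations the NumPy work is done a single time. A minimal sketch of the same memoization pattern; generate_fn is a stand-in for generate_grid_anchors and is not part of the repository:

class GridAnchorCache(object):
    """Memoize grid anchors keyed on the feature-map shapes (sketch)."""

    def __init__(self, generate_fn):
        self.generate_fn = generate_fn
        self.last_grid_shapes = None
        self.last_grid_anchors = None

    def __call__(self, grid_shapes, base_anchors, strides):
        # Recompute only when the list of (h, w) grid shapes changes.
        if grid_shapes != self.last_grid_shapes:
            self.last_grid_shapes = grid_shapes
            self.last_grid_anchors = self.generate_fn(
                grid_shapes, base_anchors, strides)
        return self.last_grid_anchors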
...@@ -15,6 +15,7 @@ from __future__ import print_function ...@@ -15,6 +15,7 @@ from __future__ import print_function
import types import types
import dragon
import dragon.vm.torch as torch import dragon.vm.torch as torch
import numpy as np import numpy as np
...@@ -59,7 +60,7 @@ def ims_detect(detector, raw_images): ...@@ -59,7 +60,7 @@ def ims_detect(detector, raw_images):
# Unpack results # Unpack results
results = outputs['detections'] results = outputs['detections']
detections = [[] for _ in range(len((raw_images)))] detections = [[] for _ in range(len(raw_images))]
for i in range(len(ims)): for i in range(len(ims)):
inds = np.where(results[:, 0].astype(np.int32) == i)[0] inds = np.where(results[:, 0].astype(np.int32) == i)[0]
...@@ -126,6 +127,6 @@ def test_net(weights, num_classes, q_in, q_out, device): ...@@ -126,6 +127,6 @@ def test_net(weights, num_classes, q_in, q_out, device):
q_out.put(( q_out.put((
indices[i], indices[i],
dict([('im_detect', _t['im_detect'].average_time), dict([('im_detect', _t['im_detect'].average_time),
('misc',_t['misc'].average_time)]), ('misc', _t['misc'].average_time)]),
dict([('boxes', boxes_this_image)]), dict([('boxes', boxes_this_image)]),
)) ))
...@@ -45,14 +45,14 @@ class PriorBox(object): ...@@ -45,14 +45,14 @@ class PriorBox(object):
aspect_ratios[i], aspect_ratios[i],
) )
) )
self.grid_anchors = None # Store the cached grid anchors
self.last_grid_anchors = None
def __call__(self, features): def __call__(self, features):
if self.grid_anchors is not None: if self.last_grid_anchors is not None:
return self.grid_anchors return self.last_grid_anchors
self.grid_anchors = []
all_anchors = []
for i in range(len(self.strides)): for i in range(len(self.strides)):
# 1. Generate base grids # 1. Generate base grids
height, width = features[i].shape[-2:] height, width = features[i].shape[-2:]
...@@ -61,23 +61,23 @@ class PriorBox(object): ...@@ -61,23 +61,23 @@ class PriorBox(object):
shift_x, shift_y = np.meshgrid(shift_x, shift_y) shift_x, shift_y = np.meshgrid(shift_x, shift_y)
# 2. Apply anchors on base grids # 2. Apply anchors on base grids
# Add A anchors (1, A, 4) to # Add a anchors (1, a, 4) to
# cell K shifts (K, 1, 4) to get # cell k shifts (k, 1, 4) to get
# shift anchors (K, A, 4) # shift anchors (k, a, 4)
# Reshape to (K * A, 4) shifted anchors # Reshape to (k * a, 4) shifted anchors
A = self.base_anchors[i].shape[0] a = self.base_anchors[i].shape[0]
D = self.base_anchors[i].shape[1] d = self.base_anchors[i].shape[1]
shifts = np.vstack(( shifts = np.vstack((
shift_x.ravel(), shift_x.ravel(),
shift_y.ravel(), shift_y.ravel(),
shift_x.ravel(), shift_x.ravel(),
shift_y.ravel()) shift_y.ravel())
).transpose() ).transpose()
K = shifts.shape[0] # K = map_h * map_w k = shifts.shape[0] # k = map_h * map_w
anchors = (self.base_anchors[i].reshape((1, A, D)) + anchors = (self.base_anchors[i].reshape((1, a, d)) +
shifts.reshape((1, K, D)).transpose((1, 0, 2))) shifts.reshape((1, k, d)).transpose((1, 0, 2)))
anchors = anchors.reshape((K * A, D)).astype(np.float32) anchors = anchors.reshape((k * a, d)).astype(np.float32)
self.grid_anchors.append(anchors) all_anchors.append(anchors)
self.grid_anchors = np.concatenate(self.grid_anchors)
return self.grid_anchors self.last_grid_anchors = np.concatenate(all_anchors)
return self.last_grid_anchors
...@@ -32,11 +32,9 @@ def get_images(ims): ...@@ -32,11 +32,9 @@ def get_images(ims):
for im in ims: for im in ims:
im_scales.append((float(out_size) / im.shape[0], im_scales.append((float(out_size) / im.shape[0],
float(out_size) / im.shape[1])) float(out_size) / im.shape[1]))
processed_ims.append( processed_ims.append(cv2.resize(
cv2.resize(
im, (out_size, out_size), im, (out_size, out_size),
interpolation=cv2.INTER_AREA, interpolation=cv2.INTER_AREA))
))
if ims[0].dtype == 'uint16': if ims[0].dtype == 'uint16':
ims_blob = np.array(processed_ims, dtype='float32') / 256. ims_blob = np.array(processed_ims, dtype='float32') / 256.
else: else:
......
...@@ -49,12 +49,12 @@ class Distort(object): ...@@ -49,12 +49,12 @@ class Distort(object):
] ]
def apply(self, img, boxes=None): def apply(self, img, boxes=None):
if self._prob > 0: self._prob = 0.5 if cfg.TRAIN.USE_COLOR_JITTER else 0
img = PIL.Image.fromarray(img) img = PIL.Image.fromarray(img)
for transform_fn, prob in self._transforms: for transform_fn, prob in self._transforms:
if npr.uniform() < prob: if npr.uniform() < prob:
img = transform_fn(img) img = transform_fn(img)
img = img.enhance(1. + npr.uniform(-.4, .4)) img = img.enhance(1. + npr.uniform(-.4, .4))
return np.array(img), boxes return np.array(img), boxes
return img, boxes return img, boxes
......
...@@ -27,8 +27,9 @@ if __name__ == '__main__': ...@@ -27,8 +27,9 @@ if __name__ == '__main__':
np.random.seed(3) np.random.seed(3)
cfg.TRAIN.SCALES = [300] cfg.TRAIN.SCALES = [300]
cfg.TRAIN.RANDOM_SCALES = [0.25, 1.00] cfg.TRAIN.RANDOM_SCALES = [0.25, 1.00]
cfg.TRAIN.USE_COLOR_JITTER = True
augmentor = transforms.Compose( transformer = transforms.Compose(
transforms.Distort(), transforms.Distort(),
transforms.Expand(), transforms.Expand(),
transforms.Sample(), transforms.Sample(),
...@@ -38,12 +39,12 @@ if __name__ == '__main__': ...@@ -38,12 +39,12 @@ if __name__ == '__main__':
while True: while True:
img = cv2.imread('cat.jpg') img = cv2.imread('cat.jpg')
boxes = np.array([[0.33, 0.04, 0.71, 0.98]], dtype=np.float32) boxes = np.array([[0.33, 0.04, 0.71, 0.98]], dtype=np.float32)
img, boxes = augmentor(img, boxes) img, boxes = transformer(img, boxes)
for box in boxes: for box in boxes:
x1 = int(box[0] * img.shape[1]) x1 = int(box[0] * img.shape[1])
y1 = int(box[1] * img.shape[0]) y1 = int(box[1] * img.shape[0])
x2 = int(box[2] * img.shape[1]) x2 = int(box[2] * img.shape[1])
y2 = int(box[3] * img.shape[0]) y2 = int(box[3] * img.shape[0])
cv2.rectangle(img, (x1, y1), (x2, y2), (188, 119, 64), 2) cv2.rectangle(img, (x1, y1), (x2, y2), (188, 119, 64), 2)
cv2.imshow('Sample', img) cv2.imshow('Transforms - Preview', img)
cv2.waitKey(0) cv2.waitKey(0)
...@@ -70,14 +70,15 @@ class Pipeline(dali.Pipeline): ...@@ -70,14 +70,15 @@ class Pipeline(dali.Pipeline):
# Decode image # Decode image
image = self.decode(inputs['image']) image = self.decode(inputs['image'])
# Augment the color space # Augment the color space if necessary
image = self.hsv( if cfg.TRAIN.USE_COLOR_JITTER:
self.brightness_contrast( image = self.hsv(
image, self.brightness_contrast(
brightness=self.twist_rng(), image,
contrast=self.twist_rng(), brightness=self.twist_rng(),
), saturation=self.twist_rng() contrast=self.twist_rng(),
) ), saturation=self.twist_rng()
)
# Expand randomly to get smaller objects # Expand randomly to get smaller objects
pr = self.paste_ratio() * self.flip_rng() + 1. pr = self.paste_ratio() * self.flip_rng() + 1.
......
...@@ -18,7 +18,7 @@ from __future__ import division ...@@ -18,7 +18,7 @@ from __future__ import division
from __future__ import print_function from __future__ import print_function
import os import os
from seetadet.datasets import kpl_record from seetadet.datasets import kpl_dataset
def get_dataset(name): def get_dataset(name):
...@@ -42,5 +42,5 @@ def list_dataset(): ...@@ -42,5 +42,5 @@ def list_dataset():
_GLOBAL_REGISTERED_DATASET = { _GLOBAL_REGISTERED_DATASET = {
'default': lambda source: 'default': lambda source:
kpl_record.KPLRecordDataset(source), kpl_dataset.KPLRecordDataset(source),
} }
...@@ -149,8 +149,10 @@ class AirNet(nn.Module): ...@@ -149,8 +149,10 @@ class AirNet(nn.Module):
x = self.layer1(x) x = self.layer1(x)
outputs = [None, None, self.layer2(x)] outputs = [None, None, self.layer2(x)]
if hasattr(self, 'layer3'): outputs += [self.layer3(outputs[-1])] if hasattr(self, 'layer3'):
if hasattr(self, 'layer4'): outputs += [self.layer4(outputs[-1])] outputs += [self.layer3(outputs[-1])]
if hasattr(self, 'layer4'):
outputs += [self.layer4(outputs[-1])]
return outputs return outputs
......
...@@ -39,16 +39,17 @@ class Detector(nn.Module): ...@@ -39,16 +39,17 @@ class Detector(nn.Module):
backbone = cfg.MODEL.BACKBONE.lower().split('.') backbone = cfg.MODEL.BACKBONE.lower().split('.')
body, modules = backbone[0], backbone[1:] body, modules = backbone[0], backbone[1:]
# + DataLoader # DataLoader
self.data_loader = None
self.data_loader_cls = importlib.import_module( self.data_loader_cls = importlib.import_module(
'seetadet.algo.{}'.format(model)).DataLoader 'seetadet.algo.{}'.format(model)).DataLoader
self.bootstrap = vision.Bootstrap() self.bootstrap = vision.Bootstrap()
# + FeatureExtractor # FeatureExtractor
self.body = backbones.get(body)() self.body = backbones.get(body)()
feature_dims = self.body.feature_dims feature_dims = self.body.feature_dims
# + FeatureEnhancer # FeatureEnhancer
if 'fpn' in modules: if 'fpn' in modules:
self.fpn = models.FPN(feature_dims) self.fpn = models.FPN(feature_dims)
feature_dims = self.fpn.feature_dims feature_dims = self.fpn.feature_dims
...@@ -57,7 +58,7 @@ class Detector(nn.Module): ...@@ -57,7 +58,7 @@ class Detector(nn.Module):
else: else:
feature_dims = [feature_dims[-1]] feature_dims = [feature_dims[-1]]
# + Detection Modules # Detection Modules
if 'rcnn' in model: if 'rcnn' in model:
self.rpn = models.RPN(feature_dims[0]) self.rpn = models.RPN(feature_dims[0])
if 'faster' in model: if 'faster' in model:
...@@ -106,7 +107,7 @@ class Detector(nn.Module): ...@@ -106,7 +107,7 @@ class Detector(nn.Module):
if inputs is None: if inputs is None:
# 1) Training: <= DataLayer # 1) Training: <= DataLayer
# 2) Inference: <= Given # 2) Inference: <= Given
if not hasattr(self, 'data_loader'): if self.data_loader is None:
self.data_loader = self.data_loader_cls() self.data_loader = self.data_loader_cls()
inputs = self.data_loader() inputs = self.data_loader()
...@@ -171,29 +172,34 @@ class Detector(nn.Module): ...@@ -171,29 +172,34 @@ class Detector(nn.Module):
# Merge Affine into Convolution # # Merge Affine into Convolution #
################################### ###################################
last_module = None last_module = None
for e in self.modules(): for module in self.modules():
if isinstance(e, nn.Affine) and \ if isinstance(module, nn.Affine) and \
isinstance(last_module, nn.Conv2d): isinstance(last_module, nn.Conv2d):
if last_module.bias is None: if last_module.bias is None:
delattr(last_module, 'bias') delattr(last_module, 'bias')
e.forward = lambda x: x module.forward = lambda x: x
last_module.bias = e.bias last_module.bias = module.bias
last_module.weight.data.mul_(e.weight.data) weight = module.weight.data.view(
last_module = e 0, *([1] * (last_module.weight.ndimension() - 1)))
last_module.weight.data.mul_(weight)
last_module = module
###################################### ######################################
# Merge BatchNorm into Convolution # # Merge BatchNorm into Convolution #
###################################### ######################################
last_module = None last_module = None
for e in self.modules(): for module in self.modules():
if isinstance(e, nn.BatchNorm2d) and \ if isinstance(module, nn.BatchNorm2d) and \
isinstance(last_module, nn.Conv2d): isinstance(last_module, nn.Conv2d):
if last_module.bias is None: if last_module.bias is None:
delattr(last_module, 'bias') delattr(last_module, 'bias')
e.forward = lambda x: x module.forward = lambda x: x
term = torch.sqrt(e.running_var.data + e.eps) term = torch.sqrt(module.running_var.data + module.eps)
term = e.weight.data / term term = module.weight.data / term
last_module.bias = e.bias.data - term * e.running_mean.data last_module.bias = \
module.bias.data - \
term * module.running_mean.data
term = term.view(0, *([1] * (last_module.weight.ndimension() - 1)))
if last_module.weight.dtype == 'float16': if last_module.weight.dtype == 'float16':
last_module.bias.half_() last_module.bias.half_()
weight = last_module.weight.data.float() weight = last_module.weight.data.float()
...@@ -201,7 +207,7 @@ class Detector(nn.Module): ...@@ -201,7 +207,7 @@ class Detector(nn.Module):
last_module.weight.copy_(weight) last_module.weight.copy_(weight)
else: else:
last_module.weight.data.mul_(term) last_module.weight.data.mul_(term)
last_module = e last_module = module
def new_detector(device, weights=None, training=False): def new_detector(device, weights=None, training=False):
......
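Both merge passes above fold a per-channel Affine or BatchNorm that follows a convolution into that convolution: with scale gamma, shift beta and running statistics mu, var, the fused weight is W * gamma / sqrt(var + eps) and the fused bias is beta - gamma * mu / sqrt(var + eps); the Affine case is the same formula with mu = 0, var = 1 and eps = 0. Below is a NumPy sketch of the BatchNorm case, assuming (out_c, in_c, kh, kw) weights; the toy check convolves a single 3x3 patch so the convolution reduces to a weighted sum:

import numpy as np

def fold_batchnorm(weight, gamma, beta, mean, var, eps=1e-5):
    """Fold y = gamma * (conv(x) - mean) / sqrt(var + eps) + beta into the conv."""
    scale = gamma / np.sqrt(var + eps)                    # (out_c,)
    fused_weight = weight * scale.reshape((-1, 1, 1, 1))  # broadcast over in_c, kh, kw
    fused_bias = beta - scale * mean
    return fused_weight, fused_bias

w = np.random.randn(4, 3, 3, 3)
g, b = np.random.rand(4) + 0.5, np.random.randn(4)
mu, var = np.random.randn(4), np.random.rand(4) + 0.1
fw, fb = fold_batchnorm(w, g, b, mu, var)

x = np.random.randn(3, 3, 3)                              # one 3x3 input patch
y_bn = g * ((w * x).sum(axis=(1, 2, 3)) - mu) / np.sqrt(var + 1e-5) + b
y_fused = (fw * x).sum(axis=(1, 2, 3)) + fb
print(np.allclose(y_bn, y_fused))                         # True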
...@@ -31,7 +31,8 @@ class FPN(nn.Module): ...@@ -31,7 +31,8 @@ class FPN(nn.Module):
dim = cfg.FPN.DIM dim = cfg.FPN.DIM
self.C = nn.ModuleList() self.C = nn.ModuleList()
self.P = nn.ModuleList() self.P = nn.ModuleList()
for lvl in range(cfg.FPN.RPN_MIN_LEVEL, HIGHEST_BACKBONE_LVL + 1): self.highest_backbone_lvl = min(cfg.FPN.RPN_MAX_LEVEL, HIGHEST_BACKBONE_LVL)
for lvl in range(cfg.FPN.RPN_MIN_LEVEL, self.highest_backbone_lvl + 1):
self.C.append(nn.Conv1x1(feature_dims[lvl - 1], dim, bias=True)) self.C.append(nn.Conv1x1(feature_dims[lvl - 1], dim, bias=True))
self.P.append(nn.Conv3x3(dim, dim, bias=True)) self.P.append(nn.Conv3x3(dim, dim, bias=True))
if 'rcnn' in cfg.MODEL.TYPE: if 'rcnn' in cfg.MODEL.TYPE:
...@@ -40,8 +41,8 @@ class FPN(nn.Module): ...@@ -40,8 +41,8 @@ class FPN(nn.Module):
else: else:
self.apply_func = self.apply_on_generic self.apply_func = self.apply_on_generic
self.relu = nn.ReLU(inplace=False) self.relu = nn.ReLU(inplace=False)
for lvl in range(HIGHEST_BACKBONE_LVL + 1, cfg.FPN.RPN_MAX_LEVEL + 1): for lvl in range(self.highest_backbone_lvl + 1, cfg.FPN.RPN_MAX_LEVEL + 1):
dim_in = feature_dims[-1] if lvl == HIGHEST_BACKBONE_LVL + 1 else dim dim_in = feature_dims[-1] if lvl == self.highest_backbone_lvl + 1 else dim
self.P.append(nn.Conv3x3(dim_in, dim, stride=2, bias=True)) self.P.append(nn.Conv3x3(dim_in, dim, stride=2, bias=True))
self.feature_dims = [dim] self.feature_dims = [dim]
self.coarsest_stride = cfg.MODEL.COARSEST_STRIDE self.coarsest_stride = cfg.MODEL.COARSEST_STRIDE
...@@ -56,12 +57,12 @@ class FPN(nn.Module): ...@@ -56,12 +57,12 @@ class FPN(nn.Module):
def apply_on_rcnn(self, features): def apply_on_rcnn(self, features):
fpn_input = self.C[-1](features[-1]) fpn_input = self.C[-1](features[-1])
min_lvl, max_lvl = cfg.FPN.RPN_MIN_LEVEL, cfg.FPN.RPN_MAX_LEVEL min_lvl, max_lvl = cfg.FPN.RPN_MIN_LEVEL, cfg.FPN.RPN_MAX_LEVEL
outputs = [self.P[HIGHEST_BACKBONE_LVL - min_lvl](fpn_input)] outputs = [self.P[self.highest_backbone_lvl - min_lvl](fpn_input)]
# Apply max pool for higher features # Apply max pool for higher features
for i in range(HIGHEST_BACKBONE_LVL + 1, max_lvl + 1): for i in range(self.highest_backbone_lvl + 1, max_lvl + 1):
outputs.append(self.maxpool(outputs[-1])) outputs.append(self.maxpool(outputs[-1]))
# Build pyramids between [MIN_LEVEL, HIGHEST_LEVEL] # Build pyramids between [MIN_LEVEL, HIGHEST_LEVEL]
for i in range(HIGHEST_BACKBONE_LVL - 1, min_lvl - 1, -1): for i in range(self.highest_backbone_lvl - 1, min_lvl - 1, -1):
lateral_output = self.C[i - min_lvl](features[i - 1]) lateral_output = self.C[i - min_lvl](features[i - 1])
if self.coarsest_stride > 0: if self.coarsest_stride > 0:
upscale_output = nn_funcs.upsample( upscale_output = nn_funcs.upsample(
...@@ -76,15 +77,15 @@ class FPN(nn.Module): ...@@ -76,15 +77,15 @@ class FPN(nn.Module):
def apply_on_generic(self, features): def apply_on_generic(self, features):
fpn_input = self.C[-1](features[-1]) fpn_input = self.C[-1](features[-1])
min_lvl, max_lvl = cfg.FPN.RPN_MIN_LEVEL, cfg.FPN.RPN_MAX_LEVEL min_lvl, max_lvl = cfg.FPN.RPN_MIN_LEVEL, cfg.FPN.RPN_MAX_LEVEL
outputs = [self.P[HIGHEST_BACKBONE_LVL - min_lvl](fpn_input)] outputs = [self.P[self.highest_backbone_lvl - min_lvl](fpn_input)]
# Add extra convolutions for higher features # Add extra convolutions for higher features
extra_input = features[-1] extra_input = features[-1]
for i in range(HIGHEST_BACKBONE_LVL + 1, max_lvl + 1): for i in range(self.highest_backbone_lvl + 1, max_lvl + 1):
outputs.append(self.P[i - min_lvl](extra_input)) outputs.append(self.P[i - min_lvl](extra_input))
if i != max_lvl: if i != max_lvl:
extra_input = self.relu(outputs[-1]) extra_input = self.relu(outputs[-1])
# Build pyramids between [MIN_LEVEL, HIGHEST_LEVEL] # Build pyramids between [MIN_LEVEL, HIGHEST_LEVEL]
for i in range(HIGHEST_BACKBONE_LVL - 1, min_lvl - 1, -1): for i in range(self.highest_backbone_lvl - 1, min_lvl - 1, -1):
lateral_output = self.C[i - min_lvl](features[i - 1]) lateral_output = self.C[i - min_lvl](features[i - 1])
if self.coarsest_stride > 0: if self.coarsest_stride > 0:
upscale_output = nn_funcs.upsample( upscale_output = nn_funcs.upsample(
......
...@@ -161,7 +161,7 @@ class NASMobileNet(nn.Module): ...@@ -161,7 +161,7 @@ class NASMobileNet(nn.Module):
def reset_parameters(self): def reset_parameters(self):
for m in self.modules(): for m in self.modules():
if nn.is_conv2d(m): if isinstance(m, nn.Conv2d):
init.kaiming_normal(m.weight, 'fan_out') init.kaiming_normal(m.weight, 'fan_out')
if m.bias is not None: if m.bias is not None:
init.constant(m.bias, 0) init.constant(m.bias, 0)
...@@ -173,7 +173,7 @@ class NASMobileNet(nn.Module): ...@@ -173,7 +173,7 @@ class NASMobileNet(nn.Module):
# Stop the gradients if necessary # Stop the gradients if necessary
def freeze_func(m): def freeze_func(m):
if nn.is_conv2d(m): if isinstance(m, nn.Conv2d):
m.weight.requires_grad = False m.weight.requires_grad = False
m._buffers['weight'] = m.weight m._buffers['weight'] = m.weight
del m._parameters['weight'] del m._parameters['weight']
......
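freeze_func above (and the matching one in resnet.py further down) stops gradients for early-stage convolutions; the repository additionally moves the weight from _parameters into _buffers so parameters() no longer yields it. A minimal sketch of the simpler requires_grad part, written against standard PyTorch names rather than the dragon.vm.torch wrappers used here; nn.Sequential and the layer sizes are purely illustrative:

import torch
from torch import nn

def freeze_conv_weights(m):
    """Stop gradients for convolution weights (sketch)."""
    if isinstance(m, nn.Conv2d):
        m.weight.requires_grad = False

model = nn.Sequential(nn.Conv2d(3, 8, 3), nn.ReLU(), nn.Conv2d(8, 8, 3))
model[0].apply(freeze_conv_weights)          # freeze only the first stage
trainable = [p for p in model.parameters() if p.requires_grad]
print(len(trainable))                        # 3: both biases + the second conv weight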
...@@ -17,8 +17,6 @@ from __future__ import absolute_import ...@@ -17,8 +17,6 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
import dragon.vm.torch as torch
from seetadet.core.config import cfg from seetadet.core.config import cfg
from seetadet.core.registry import backbones from seetadet.core.registry import backbones
from seetadet.modules import nn from seetadet.modules import nn
...@@ -37,11 +35,12 @@ class BasicBlock(nn.Module): ...@@ -37,11 +35,12 @@ class BasicBlock(nn.Module):
super(BasicBlock, self).__init__() super(BasicBlock, self).__init__()
self.conv1 = nn.Conv3x3(dim_in, dim_out, stride) self.conv1 = nn.Conv3x3(dim_in, dim_out, stride)
self.bn1 = nn.FrozenAffine(dim_out) self.bn1 = nn.FrozenAffine(dim_out)
self.relu = torch.nn.ReLU(inplace=True) self.relu = nn.ReLU(inplace=True)
self.conv2 = nn.Conv3x3(dim_out, dim_out) self.conv2 = nn.Conv3x3(dim_out, dim_out)
self.bn2 = nn.FrozenAffine(dim_out) self.bn2 = nn.FrozenAffine(dim_out)
self.downsample = downsample self.downsample = downsample
self.dropblock = dropblock self.dropblock1 = nn.DropBlock2d(**dropblock) if dropblock else None
self.dropblock2 = nn.DropBlock2d(**dropblock) if dropblock else None
def forward(self, x): def forward(self, x):
residual = x residual = x
...@@ -50,14 +49,14 @@ class BasicBlock(nn.Module): ...@@ -50,14 +49,14 @@ class BasicBlock(nn.Module):
out = self.bn1(out) out = self.bn1(out)
out = self.relu(out) out = self.relu(out)
if self.dropblock is not None: if self.dropblock1 is not None:
out = self.dropblock(out) out = self.dropblock1(out)
out = self.conv2(out) out = self.conv2(out)
out = self.bn2(out) out = self.bn2(out)
if self.dropblock is not None: if self.dropblock2 is not None:
residual = self.dropblock(residual) residual = self.dropblock2(residual)
if self.downsample is not None: if self.downsample is not None:
residual = self.downsample(residual) residual = self.downsample(residual)
...@@ -67,7 +66,7 @@ class BasicBlock(nn.Module): ...@@ -67,7 +66,7 @@ class BasicBlock(nn.Module):
return out return out
class Bottleneck(torch.nn.Module): class Bottleneck(nn.Module):
# 1x64d => 0.25 (ResNet) # 1x64d => 0.25 (ResNet)
# 32x8d, 64x4d => 1.0 (ResNeXt) # 32x8d, 64x4d => 1.0 (ResNeXt)
contraction = cfg.RESNET.NUM_GROUPS \ contraction = cfg.RESNET.NUM_GROUPS \
...@@ -86,12 +85,13 @@ class Bottleneck(torch.nn.Module): ...@@ -86,12 +85,13 @@ class Bottleneck(torch.nn.Module):
self.conv1 = nn.Conv1x1(dim_in, dim) self.conv1 = nn.Conv1x1(dim_in, dim)
self.bn1 = nn.FrozenAffine(dim) self.bn1 = nn.FrozenAffine(dim)
self.conv2 = nn.Conv3x3(dim, dim, stride=stride) self.conv2 = nn.Conv3x3(dim, dim, stride=stride)
self.drop2 = nn.DropBlock2d(**dropblock) if dropblock else None
self.bn2 = nn.FrozenAffine(dim) self.bn2 = nn.FrozenAffine(dim)
self.conv3 = nn.Conv1x1(dim, dim_out) self.conv3 = nn.Conv1x1(dim, dim_out)
self.drop3 = nn.DropBlock2d(**dropblock) if dropblock else None
self.bn3 = nn.FrozenAffine(dim_out) self.bn3 = nn.FrozenAffine(dim_out)
self.relu = torch.nn.ReLU(inplace=True) self.relu = nn.ReLU(inplace=True)
self.downsample = downsample self.downsample = downsample
self.dropblock = dropblock
def forward(self, x): def forward(self, x):
residual = x residual = x
...@@ -101,32 +101,30 @@ class Bottleneck(torch.nn.Module): ...@@ -101,32 +101,30 @@ class Bottleneck(torch.nn.Module):
out = self.relu(out) out = self.relu(out)
out = self.conv2(out) out = self.conv2(out)
if self.drop2 is not None:
out = self.drop2(out)
out = self.bn2(out) out = self.bn2(out)
out = self.relu(out) out = self.relu(out)
if self.dropblock is not None:
out = self.dropblock(out)
out = self.conv3(out) out = self.conv3(out)
out = self.bn3(out) out = self.bn3(out)
if self.dropblock is not None:
residual = self.dropblock(residual)
if self.downsample is not None: if self.downsample is not None:
residual = self.downsample(residual) residual = self.downsample(residual)
out += residual out += residual
if self.drop3 is not None:
out = self.drop3(out)
out = self.relu(out) out = self.relu(out)
return out return out
class ResNet(torch.nn.Module): class ResNet(nn.Module):
def __init__(self, block, layers, filters): def __init__(self, block, layers, filters):
super(ResNet, self).__init__() super(ResNet, self).__init__()
self.dim_in, filters = filters[0], filters[1:] self.dim_in, filters = filters[0], filters[1:]
self.feature_dims = [self.dim_in] + filters self.feature_dims = [self.dim_in] + filters
self.conv1 = torch.nn.Conv2d( self.conv1 = nn.Conv2d(
3, 64, 3, 64,
kernel_size=7, kernel_size=7,
stride=2, stride=2,
...@@ -134,29 +132,31 @@ class ResNet(torch.nn.Module): ...@@ -134,29 +132,31 @@ class ResNet(torch.nn.Module):
bias=False, bias=False,
) )
self.bn1 = nn.FrozenAffine(self.dim_in) self.bn1 = nn.FrozenAffine(self.dim_in)
self.relu = torch.nn.ReLU(inplace=True) self.relu = nn.ReLU(inplace=True)
self.maxpool = torch.nn.MaxPool2d( self.maxpool = nn.MaxPool2d(
kernel_size=3, kernel_size=3,
stride=2, stride=2,
padding=0, padding=0,
ceil_mode=True, ceil_mode=True,
) )
self.drop3 = torch.nn.DropBlock2d( drop3 = {
kp=0.9, 'kp': 0.9,
block_size=7, 'block_size': 7,
alpha=0.25, 'alpha': 1.00,
decrement=cfg.DROPBLOCK.DECREMENT 'decrement': cfg.DROPBLOCK.DECREMENT,
) if cfg.DROPBLOCK.DROP_ON else None 'inplace': True,
self.drop4 = torch.nn.DropBlock2d( } if cfg.DROPBLOCK.DROP_ON else None
kp=0.9, drop4 = {
block_size=7, 'kp': 0.9,
alpha=1.00, 'block_size': 7,
decrement=cfg.DROPBLOCK.DECREMENT 'alpha': 1.00,
) if cfg.DROPBLOCK.DROP_ON else None 'decrement': cfg.DROPBLOCK.DECREMENT,
'inplace': True,
} if cfg.DROPBLOCK.DROP_ON else None
self.layer1 = self.make_blocks(block, filters[0], layers[0]) self.layer1 = self.make_blocks(block, filters[0], layers[0])
self.layer2 = self.make_blocks(block, filters[1], layers[1], 2) self.layer2 = self.make_blocks(block, filters[1], layers[1], 2)
self.layer3 = self.make_blocks(block, filters[2], layers[2], 2, self.drop3) self.layer3 = self.make_blocks(block, filters[2], layers[2], 2, drop3)
self.layer4 = self.make_blocks(block, filters[3], layers[3], 2, self.drop4) self.layer4 = self.make_blocks(block, filters[3], layers[3], 2, drop4)
self.reset_parameters() self.reset_parameters()
def reset_parameters(self): def reset_parameters(self):
...@@ -166,7 +166,7 @@ class ResNet(torch.nn.Module): ...@@ -166,7 +166,7 @@ class ResNet(torch.nn.Module):
# Stop the gradients if necessary # Stop the gradients if necessary
def freeze_func(m): def freeze_func(m):
if isinstance(m, torch.nn.Conv2d): if isinstance(m, nn.Conv2d):
m.weight.requires_grad = False m.weight.requires_grad = False
m._buffers['weight'] = m.weight m._buffers['weight'] = m.weight
del m._parameters['weight'] del m._parameters['weight']
......
...@@ -29,7 +29,6 @@ class SSD(nn.Module): ...@@ -29,7 +29,6 @@ class SSD(nn.Module):
######################################## ########################################
# SSD outputs # # SSD outputs #
######################################## ########################################
self.cls_conv = torch.nn.ModuleList( self.cls_conv = torch.nn.ModuleList(
nn.Conv3x3(feature_dims[0], feature_dims[0], bias=True) nn.Conv3x3(feature_dims[0], feature_dims[0], bias=True)
for _ in range(cfg.SSD.NUM_CONVS) for _ in range(cfg.SSD.NUM_CONVS)
......
...@@ -36,7 +36,6 @@ class _NonMaxSuppression(Function): ...@@ -36,7 +36,6 @@ class _NonMaxSuppression(Function):
return self.dispatch([dets], [self.alloc()]) return self.dispatch([dets], [self.alloc()])
class _RetinaNetDecoder(Function): class _RetinaNetDecoder(Function):
"""Decode predictions from RetinaNet.""" """Decode predictions from RetinaNet."""
......
...@@ -33,6 +33,7 @@ def kaiming_normal(weight, mode='fan_in'): ...@@ -33,6 +33,7 @@ def kaiming_normal(weight, mode='fan_in'):
nonlinearity='relu', nonlinearity='relu',
) )
# Aliases # Aliases
constant = nn.init.constant_ constant = nn.init.constant_
normal = nn.init.normal_ normal = nn.init.normal_
...@@ -185,6 +185,7 @@ class SigmoidFocalLoss(object): ...@@ -185,6 +185,7 @@ class SigmoidFocalLoss(object):
return nn.SigmoidFocalLoss( return nn.SigmoidFocalLoss(
alpha=cfg.MODEL.FOCAL_LOSS_ALPHA, alpha=cfg.MODEL.FOCAL_LOSS_ALPHA,
gamma=cfg.MODEL.FOCAL_LOSS_GAMMA, gamma=cfg.MODEL.FOCAL_LOSS_GAMMA,
negative_index=0, # Background index
) )
...@@ -211,6 +212,7 @@ BCEWithLogitsLoss = nn.BCEWithLogitsLoss ...@@ -211,6 +212,7 @@ BCEWithLogitsLoss = nn.BCEWithLogitsLoss
Conv2d = nn.Conv2d Conv2d = nn.Conv2d
ConvTranspose2d = nn.ConvTranspose2d ConvTranspose2d = nn.ConvTranspose2d
DepthwiseConv2d = nn.DepthwiseConv2d DepthwiseConv2d = nn.DepthwiseConv2d
DropBlock2d = nn.DropBlock2d
Linear = nn.Linear Linear = nn.Linear
MaxPool2d = nn.MaxPool2d MaxPool2d = nn.MaxPool2d
Module = nn.Module Module = nn.Module
......
...@@ -15,7 +15,7 @@ from __future__ import print_function ...@@ -15,7 +15,7 @@ from __future__ import print_function
import functools import functools
import dragon.vm.torch as torch from dragon.vm import torch
from seetadet.core.config import cfg from seetadet.core.config import cfg
...@@ -41,7 +41,9 @@ class Bootstrap(torch.nn.Module): ...@@ -41,7 +41,9 @@ class Bootstrap(torch.nn.Module):
def __init__(self): def __init__(self):
super(Bootstrap, self).__init__() super(Bootstrap, self).__init__()
self.normalize_func = functools.partial( self._device = torch.device('cpu')
self._dummy_buffer = torch.ones(1)
self._normalize_func = functools.partial(
torch.channel_normalize, torch.channel_normalize,
mean=cfg.PIXEL_MEANS, mean=cfg.PIXEL_MEANS,
std=[1., 1., 1.], std=[1., 1., 1.],
...@@ -49,10 +51,9 @@ class Bootstrap(torch.nn.Module): ...@@ -49,10 +51,9 @@ class Bootstrap(torch.nn.Module):
dims=(0, 3, 1, 2), dims=(0, 3, 1, 2),
dtype=cfg.MODEL.PRECISION.lower(), dtype=cfg.MODEL.PRECISION.lower(),
) )
self.dummy_buffer = torch.ones(1)
def _apply(self, fn): def _apply(self, fn):
fn(self.dummy_buffer) fn(self._dummy_buffer)
def cpu(self): def cpu(self):
self._device = torch.device('cpu') self._device = torch.device('cpu')
...@@ -61,12 +62,11 @@ class Bootstrap(torch.nn.Module): ...@@ -61,12 +62,11 @@ class Bootstrap(torch.nn.Module):
self._device = torch.device('cuda', device) self._device = torch.device('cuda', device)
def device(self): def device(self):
"""Return the device of this module.""" return self._dummy_buffer.device
return self.dummy_buffer.device
def forward(self, input): def forward(self, input):
if isinstance(input, torch.Tensor): if isinstance(input, torch.Tensor):
if input.size(1) <= 3: if input.shape[1] <= 3:
return input return input
cur_device = self.device() cur_device = self.device()
if input._device != cur_device: if input._device != cur_device:
...@@ -74,4 +74,4 @@ class Bootstrap(torch.nn.Module): ...@@ -74,4 +74,4 @@ class Bootstrap(torch.nn.Module):
input = input.cpu() input = input.cpu()
else: else:
input = input.cuda(cur_device.index) input = input.cuda(cur_device.index)
return self.normalize_func(input) return self._normalize_func(input)
...@@ -32,8 +32,8 @@ class SGDSolver(object): ...@@ -32,8 +32,8 @@ class SGDSolver(object):
lr=cfg.SOLVER.BASE_LR, lr=cfg.SOLVER.BASE_LR,
momentum=cfg.SOLVER.MOMENTUM, momentum=cfg.SOLVER.MOMENTUM,
weight_decay=cfg.SOLVER.WEIGHT_DECAY, weight_decay=cfg.SOLVER.WEIGHT_DECAY,
clip_gradient=float(cfg.SOLVER.CLIP_NORM), clip_norm=float(cfg.SOLVER.CLIP_NORM),
scale_gradient=1. / cfg.SOLVER.LOSS_SCALING, scale=1. / cfg.SOLVER.LOSS_SCALING,
) )
self.lr_scheduler = lr_scheduler.get_scheduler() self.lr_scheduler = lr_scheduler.get_scheduler()
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import operator
from dragon.vm import torch
from seetadet.modules import nn
def dense_conv_flops(m, inputs, output):
"""Hook to compute flops for a dense convolution."""
k_dim = functools.reduce(operator.mul, m.kernel_size)
out_dim = functools.reduce(operator.mul, output.shape[2:])
in_c, out_c = inputs[0].shape[1], output.shape[1]
m.__params__ = (k_dim * in_c + (1 if m.bias else 0)) * out_c
m.__flops__ = m.__params__ * out_dim
def depthwise_conv_flops(m, inputs, output):
"""Hook to compute flops for a depthwise convolution."""
k_dim = functools.reduce(operator.mul, m.kernel_size)
out_dim = functools.reduce(operator.mul, output.shape[2:])
out_c = output.shape[1]
m.__params__ = (k_dim + (1 if m.bias else 0)) * out_c
m.__flops__ = m.__params__ * out_dim
def register_flops(module):
"""Register hooks to collect flops info."""
if not hasattr(module, '__flops__'):
module.__flops__ = 0.
for m in module.modules():
if isinstance(m, nn.DepthwiseConv2d):
m.register_forward_hook(depthwise_conv_flops)
elif isinstance(m, nn.Conv2d):
m.register_forward_hook(dense_conv_flops)
def collect_flops(module, normalizer=1e6):
"""Collect flops from the last forward."""
total_flops = 0.
for m in module.modules():
if hasattr(m, '__flops__'):
total_flops += m.__flops__
m.__flops__ = 0.
return total_flops / normalizer
def benchmark_flops(module, normalizer=1e6):
"""Return the flops by running benchmark once."""
register_flops(module)
collect_flops(module)
original_training = module.training
if original_training:
module.eval()
with torch.no_grad():
module()
if original_training:
module.train()
return collect_flops(module, normalizer)
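As a sanity check on the hook arithmetic above: a dense 3x3 convolution with 64 input channels, 128 output channels, no bias and a 56x56 output map has (3 * 3 * 64) * 128 = 73,728 parameters and 73,728 * 56 * 56, roughly 231.2 million multiply-accumulates, under this counting. A standalone recomputation of the same numbers (plain Python, not using the hooks):

import functools
import operator

kernel_size, in_c, out_c = (3, 3), 64, 128
out_h, out_w = 56, 56

k_dim = functools.reduce(operator.mul, kernel_size)
params = (k_dim * in_c + 0) * out_c      # "+ 1" per output channel if biased
flops = params * out_h * out_w
print(params, flops / 1e6)               # 73728 231.211008

In the module above, benchmark_flops(module) registers the hooks, runs one forward pass under torch.no_grad(), and returns the accumulated total divided by the normalizer.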