Disable sharing gradients on shape ops

Ting PAN
Commit 2d1b7752 authored Dec 30, 2017 by Ting PAN
Showing with 128 additions and 92 deletions
Dragon/include/operators/ndarray/concat_op.h
Dragon/include/operators/ndarray/crop_op.h
Dragon/include/operators/ndarray/expand_dims_op.h
Dragon/include/operators/ndarray/flatten_op.h
Dragon/include/operators/ndarray/pad_op.h
Dragon/include/operators/ndarray/random_pick_op.h
Dragon/include/operators/ndarray/reshape_op.h
Dragon/include/operators/ndarray/tile_op.h
Dragon/include/operators/vision/roi_align_op.h
Dragon/include/utils/op_kernel.h
Dragon/python/dragon/vm/caffe/layers/__init__.py
Dragon/python/dragon/vm/caffe/layers/common.py
Dragon/python/dragon/vm/caffe/layers/loss.py
Dragon/python/dragon/vm/caffe/net.py
Dragon/python/dragon/vm/caffe/proto/caffe.proto
Dragon/python/dragon/vm/caffe/proto/caffe_pb2.py
Dragon/python/dragon/vm/caffe/solver.py
Dragon/python/dragon/vm/theano/compile/function.py
Dragon/src/operators/ndarray/concat_op.cc
Dragon/src/operators/vision/roi_align_op.cc
--- a/Dragon/include/operators/ndarray/concat_op.h
+++ b/Dragon/include/operators/ndarray/concat_op.h
@@ -31,12 +31,13 @@ class ConcatOp : public Operator<Context> {
 template <class Context>
 class ConcatGradientOp : public Operator<Context> {
 public:
-    ConcatGradientOp(const OperatorDef& op_def, Workspace* ws) 
+    ConcatGradientOp(const OperatorDef& op_def, Workspace* ws)
        : Operator<Context>(op_def, ws),
          axis(OperatorBase::GetSingleArg<int>("axis", 1)),
-          nin(OperatorBase::GetSingleArg<int>("num_input", 1)) {}
+          nin(OperatorBase::GetSingleArg<int>("num_input", 1)) {
+        DISABLE_SHARE_GRADIENT;
+    }
-    void ShareGradient() override;
    void RunOnDevice() override;
    template <typename T> void RunWithType();

--- a/Dragon/include/operators/ndarray/crop_op.h
+++ b/Dragon/include/operators/ndarray/crop_op.h
@@ -46,7 +46,9 @@ class CropGradientOp final : public Operator<Context > {
          start_axis(OperatorBase::GetSingleArg<int>("start_axis", -1)),
          offsets(OperatorBase::GetRepeatedArg<int>("offsets")),
          shape(OperatorBase::GetRepeatedArg<int>("shape")),
-          shape_like(OperatorBase::GetSingleArg<string>("shape_like", "")) {}
+          shape_like(OperatorBase::GetSingleArg<string>("shape_like", "")) {
+        DISABLE_SHARE_GRADIENT;
+    }
    void Setup();
    void RunOnDevice() override;

--- a/Dragon/include/operators/ndarray/expand_dims_op.h
+++ b/Dragon/include/operators/ndarray/expand_dims_op.h
@@ -28,7 +28,9 @@ template <class Context>
 class ExpandDimsGradientOp final : public Operator<Context> {
 public:
    ExpandDimsGradientOp(const OperatorDef& op_def, Workspace* ws)
-        : Operator<Context>(op_def, ws) {}
+        : Operator<Context>(op_def, ws) {
+        DISABLE_SHARE_GRADIENT;
+    }
    void RunOnDevice() override;
 };

--- a/Dragon/include/operators/ndarray/flatten_op.h
+++ b/Dragon/include/operators/ndarray/flatten_op.h
@@ -32,7 +32,9 @@ template <class Context>
 class FlattenGradientOp final : public Operator<Context> {
 public:
    FlattenGradientOp(const OperatorDef& op_def, Workspace* ws)
-         : Operator<Context>(op_def, ws) {}
+         : Operator<Context>(op_def, ws) {
+        DISABLE_SHARE_GRADIENT;
+    }
    void RunOnDevice() override;
 };

--- a/Dragon/include/operators/ndarray/pad_op.h
+++ b/Dragon/include/operators/ndarray/pad_op.h
@@ -63,6 +63,7 @@ class PadGradientOp final : public Operator<Context> {
        }
        std::sort(process_axes.begin(), process_axes.end());
        std::reverse(process_axes.begin(), process_axes.end());
+        DISABLE_SHARE_GRADIENT;
    }
    void RunOnDevice() override;

--- a/Dragon/include/operators/ndarray/random_pick_op.h
+++ b/Dragon/include/operators/ndarray/random_pick_op.h
@@ -34,7 +34,9 @@ class RandomPickGradientOp final : public Operator<Context> {
 public:
    RandomPickGradientOp(const OperatorDef& op_def, Workspace* ws)
        : Operator<Context>(op_def, ws),
-        axis(OperatorBase::GetSingleArg<int>("axis", 0)) {}
+        axis(OperatorBase::GetSingleArg<int>("axis", 0)) {
+        DISABLE_SHARE_GRADIENT;
+    }
    void RunOnDevice() override;
    template <typename T> void RunWithType();

--- a/Dragon/include/operators/ndarray/reshape_op.h
+++ b/Dragon/include/operators/ndarray/reshape_op.h
@@ -31,7 +31,9 @@ template <class Context>
 class ReshapeGradientOp final : public Operator<Context> {
 public:
    ReshapeGradientOp(const OperatorDef& op_def, Workspace* ws)
-        : Operator<Context>(op_def, ws) {}
+        : Operator<Context>(op_def, ws) {
+        DISABLE_SHARE_GRADIENT;
+    }
    void RunOnDevice() override;
 };

--- a/Dragon/include/operators/ndarray/tile_op.h
+++ b/Dragon/include/operators/ndarray/tile_op.h
@@ -44,6 +44,7 @@ class TileGradientOp : public Operator<Context> {
                process_axes.push_back({ multiples[i], i });
        std::sort(process_axes.begin(), process_axes.end());
        std::reverse(process_axes.begin(), process_axes.end());
+        DISABLE_SHARE_GRADIENT;
    }
    void RunOnDevice() override;

--- a/Dragon/include/operators/vision/roi_align_op.h
+++ b/Dragon/include/operators/vision/roi_align_op.h
@@ -29,7 +29,7 @@ class ROIAlignOp : public Operator<Context> {
 protected:
    int pool_h, pool_w;
    float spatial_scale;
-    Tensor* mask;
+    Tensor* mask_h, *mask_w;
 };
 template <class Context>
@@ -50,7 +50,7 @@ class ROIAlignGradientOp : public Operator<Context> {
 protected:
    int pool_h, pool_w;
    float spatial_scale;
-    Tensor* mask;
+    Tensor* mask_h, *mask_w;
 };
 }    // namespace dragon

--- a/Dragon/include/utils/op_kernel.h
+++ b/Dragon/include/utils/op_kernel.h
@@ -813,7 +813,8 @@ void ROIAlign(const float spatial_scale,
              const int pool_w,
              Tensor* x,
              Tensor* roi,
-              Tensor* mask,
+              Tensor* mask_h,
+              Tensor* mask_w,
              Tensor* y);
 template <typename T, class Context>
@@ -822,7 +823,8 @@ void ROIAlignGrad(const float spatial_scale,
                  const int pool_w,
                  Tensor* dy,
                  Tensor* roi,
-                  Tensor* mask,
+                  Tensor* mask_h,
+                  Tensor* mask_w,
                  Tensor* dx);
 }    // namespace kernel

--- a/Dragon/python/dragon/vm/caffe/layers/__init__.py
+++ b/Dragon/python/dragon/vm/caffe/layers/__init__.py
@@ -21,6 +21,7 @@ from .neuron import ReLULayer, \
                    ELULayer, \
                    SELULayer, \
                    DropoutLayer, \
+                    SigmoidLayer, \
                    TanHLayer, \
                    PowerLayer
@@ -53,6 +54,7 @@ from .common import InnerProductLayer, \
                    NormalizeLayer, \
                    InstanceNormLayer, \
                    TileLayer, \
+                    ReductionLayer, \
                    ExpandDimsLayer, \
                    ProposalLayer, \
                    DenseConcatLayer
\ No newline at end of file
--- a/Dragon/python/dragon/vm/caffe/layers/common.py
+++ b/Dragon/python/dragon/vm/caffe/layers/common.py
@@ -553,6 +553,32 @@ class TileLayer(Layer):
        return ops.Tile(input, **self._param)
+class ReductionLayer(Layer):
+    """The extended implementation of ``ReductionLayer``.
+    Parameters
+    ----------
+    operation : caffe_pb2.ReductionOp
+        The operation. Refer `ReductionParameter.operation`_.
+    axis : int
+        The axis to to reduce. Refer `ReductionParameter.axis`_.
+    """
+    def __init__(self, LayerParameter):
+        super(ReductionLayer, self).__init__(LayerParameter)
+        param = LayerParameter.reduction_param
+        if param.axis < 0:
+            if param.axis != -1:
+                raise ValueError('The negative axis can only be -1(reduce all).')
+        self._param = {'operation': {1: 'SUM', 4: 'MEAN'}[param.operation],
+                       'axis': param.axis}
+    def Setup(self, bottom):
+        super(ReductionLayer, self).Setup(bottom)
+        input = bottom[0] if isinstance(bottom, list) else bottom
+        return ops.Reduce(input, **self._param)
 class ExpandDimsLayer(Layer):
    """The implementation of ``ExpandDimsLayer``.

--- a/Dragon/python/dragon/vm/caffe/layers/loss.py
+++ b/Dragon/python/dragon/vm/caffe/layers/loss.py
@@ -27,7 +27,7 @@ class SoftmaxWithLossLayer(Layer):
        super(SoftmaxWithLossLayer, self).__init__(LayerParameter)
        param = LayerParameter.loss_param
        softmax_param = LayerParameter.softmax_param
-        norm_mode = {0: 'FULL', 1: 'VALID', 2: 'BATCH_SIZE', 3: 'NONE'}
+        norm_mode = {0: 'FULL', 1: 'VALID', 2: 'BATCH_SIZE', 3: 'NONE', 4: 'UNIT'}
        normalization = 'VALID'
        if param.HasField('normalize'):
            if not param.normalize: normalization = 'BATCH_SIZE'
@@ -57,7 +57,7 @@ class SigmoidCrossEntropyLossLayer(Layer):
    def __init__(self, LayerParameter):
        super(SigmoidCrossEntropyLossLayer, self).__init__(LayerParameter)
        param = LayerParameter.loss_param
-        norm_mode = {0: 'FULL', 1: 'BATCH_SIZE', 2: 'BATCH_SIZE', 3: 'NONE'}
+        norm_mode = {0: 'FULL', 1: 'BATCH_SIZE', 2: 'BATCH_SIZE', 3: 'NONE', 4: 'UNIT'}
        normalization = 'BATCH_SIZE'
        if param.HasField('normalize'):
            if param.normalize: normalization = 'FULL'
@@ -157,7 +157,7 @@ class SoftmaxWithFocalLossLayer(Layer):
        param = LayerParameter.loss_param
        softmax_param = LayerParameter.softmax_param
        focal_loss_param = LayerParameter.focal_loss_param
-        norm_mode = {0: 'FULL', 1: 'VALID', 2: 'BATCH_SIZE', 3: 'NONE'}
+        norm_mode = {0: 'FULL', 1: 'VALID', 2: 'BATCH_SIZE', 3: 'NONE', 4: 'UNIT'}
        normalization = 'VALID'
        if param.HasField('normalize'):
            if not param.normalize: normalization = 'BATCH_SIZE'
@@ -174,4 +174,4 @@ class SoftmaxWithFocalLossLayer(Layer):
        super(SoftmaxWithFocalLossLayer, self).Setup(bottom)
        loss = ops.SparseSoftmaxFocalLoss(bottom, **self._param)
        if self._loss_weight is not None: loss *= self._loss_weight
        return loss
\ No newline at end of file
--- a/Dragon/python/dragon/vm/caffe/net.py
+++ b/Dragon/python/dragon/vm/caffe/net.py
@@ -215,8 +215,13 @@ class Net(object):
            if len(LayerParameter.loss_weight) == 0:
                LayerParameter.loss_weight.extend([1.0])
            for idx, loss_weight in enumerate(LayerParameter.loss_weight):
-                  if loss_weight <= 0: continue
+                if loss_weight <= 0: continue
-                  self._costs.append(self.blobs[LayerParameter.top[idx]].data)
+                self._costs.append(self.blobs[LayerParameter.top[idx]].data)
+        else:
+            if len(LayerParameter.loss_weight) != 0:
+                for idx, loss_weight in enumerate(LayerParameter.loss_weight):
+                    if loss_weight <= 0: continue
+                    self._costs.append(self.blobs[LayerParameter.top[idx]].data)
        if self._phase != 'TRAIN': return

--- a/Dragon/python/dragon/vm/caffe/proto/caffe.proto
+++ b/Dragon/python/dragon/vm/caffe/proto/caffe.proto
@@ -473,6 +473,8 @@ message LossParameter {
    BATCH_SIZE = 2;
    // Do not normalize the loss.
    NONE = 3;
+    // Do not reduce the loss.
+    UNIT = 4;
  }
  optional NormalizationMode normalization = 3 [default = VALID];
  // Deprecated.  Ignored if normalization is specified.  If normalization

--- a/Dragon/python/dragon/vm/caffe/proto/caffe_pb2.py
+++ b/Dragon/python/dragon/vm/caffe/proto/caffe_pb2.py
--- a/Dragon/python/dragon/vm/caffe/solver.py
+++ b/Dragon/python/dragon/vm/caffe/solver.py
@@ -296,7 +296,10 @@ class Solver(object):
            for i in xrange(self._param.iter_size):
                self.train(return_outputs=False)
                if root_solver():
-                    for cost in self._net._costs: loss += ws.FetchTensor(cost)[0]
+                    for cost in self._net._costs:
+                        cost_value = ws.FetchTensor(cost)
+                        if cost_value.size == 1:
+                            loss += cost_value[0]
            if root_solver():
                loss /= self._param.iter_size

--- a/Dragon/python/dragon/vm/theano/compile/function.py
+++ b/Dragon/python/dragon/vm/theano/compile/function.py
@@ -279,6 +279,7 @@ def function(inputs=None, outputs=None, givens=None, updater=None):
                    external_input_exprs = OrderedDict(external_input_exprs, **new_tensor.expressions)
                else:
                    external_input_exprs = dict(external_input_exprs, **new_tensor.expressions)
+                    external_input_exprs = OrderedDict(sorted(external_input_exprs.items(), lambda x, y: cmp(x[1], y[1])))
            elif isinstance(new_tensor, np.ndarray):
                ws.FeedTensor(new_tensor, GetTensorName())
        external_input_ops = [v for k, v in external_input_exprs.items()]

--- a/Dragon/src/operators/ndarray/concat_op.cc
+++ b/Dragon/src/operators/ndarray/concat_op.cc
@@ -104,17 +104,6 @@ void ConcatGradientOp<Context>::RunOnDevice() {
    else LOG(FATAL) << "Unsupported input types.";
 }
-template <class Context>
-void ConcatGradientOp<Context>::ShareGradient() {
-    for (int i = 0; i < OutputSize(); i++) {
-        if (output(i)->name() != "ignore") {
-            Tensor* dX = ws()->GetBuffer("Grad");
-            ws()->CreateAvatar(output(i), dX);
-            break;
-        }
-    }
-}
 DEPLOY_CPU(ConcatGradient);
 #ifdef WITH_CUDA
 DEPLOY_CUDA(ConcatGradient);

--- a/Dragon/src/operators/vision/roi_align_op.cc
+++ b/Dragon/src/operators/vision/roi_align_op.cc
@@ -11,17 +11,20 @@ void ROIAlignOp<Context>::RunWithType() {
                                pool_h, pool_w,
                                     &input(0),
                                     &input(1),
-                                          mask,
+                                        mask_h,
+                                        mask_w,
                                    output(0));
 }
 template <class Context>
 void ROIAlignOp<Context>::RunOnDevice() {
-    mask = ws()->CreateTensor("/mnt/" + anchor() + "/roi_align_mask");
+    mask_h = ws()->CreateTensor("/mnt/" + anchor() + "/roi_align_mask_h");
+    mask_w = ws()->CreateTensor("/mnt/" + anchor() + "/roi_align_mask_w");
    vector<TIndex> dims({input(1).dim(0), input(0).dim(1), pool_h, pool_w});
    output(0)->Reshape(dims);
-    mask->Reshape(dims);
+    mask_h->Reshape(dims);
+    mask_w->Reshape(dims);
    if (input(0).template IsType<float>()) return RunWithType<float>();
    else LOG(FATAL) << "Unsupported input types.";
@@ -39,13 +42,15 @@ void ROIAlignGradientOp<Context>::RunWithType() {
                                    pool_h, pool_w,
                                        &input(-1),
                                         &input(1),
-                                              mask,
+                                            mask_h,
+                                            mask_w,
                                        output(0));
 }
 template <class Context>
 void ROIAlignGradientOp<Context>::RunOnDevice() {
-    mask = ws()->GetTensor("/mnt/" + anchor() + "/roi_align_mask");
+    mask_h = ws()->GetTensor("/mnt/" + anchor() + "/roi_align_mask_h");
+    mask_w = ws()->GetTensor("/mnt/" + anchor() + "/roi_align_mask_w");
    output(0)->ReshapeLike(input(0));

--- a/Dragon/src/utils/op_kernel.cc
+++ b/Dragon/src/utils/op_kernel.cc
@@ -2640,7 +2640,8 @@ template<> void ROIAlign<float, CPUContext>(const float spatial_scale,
                                            const int pool_h, const int pool_w,
                                            Tensor* x,
                                            Tensor* roi,
-                                            Tensor* mask,
+                                            Tensor* mask_h,
+                                            Tensor* mask_w,
                                            Tensor* y) {
    NOT_IMPLEMENTED;
 }
@@ -2649,7 +2650,8 @@ template<> void ROIAlignGrad<float, CPUContext>(const float spatial_scale,
                                                const int pool_h, const int pool_w,
                                                Tensor* dy,
                                                Tensor* roi,
-                                                Tensor* mask,
+                                                Tensor* mask_h,
+                                                Tensor* mask_w,
                                                Tensor* dx) {
    NOT_IMPLEMENTED;
 }

--- a/Dragon/src/utils/op_kernel.cu
+++ b/Dragon/src/utils/op_kernel.cu
@@ -3937,7 +3937,8 @@ __global__ void _ROIAlign(const int count,
                          const int pool_h, const int pool_w, 
                          const T* x,
                          const T* roi,
-                          T* mask,
+                          T* mask_h,
+                          T* mask_w,
                          T* y) {
    CUDA_KERNEL_LOOP(idx, count) {
        int pw = idx % pool_w;
@@ -3970,18 +3971,17 @@ __global__ void _ROIAlign(const int count,
        bool is_empty = (hend <= hstart) || (wend <= wstart);
        T maxval = is_empty ? 0 : -FLT_MAX;
-        int maxidx = -1;
+        T max_h_idx = -1;
-        int x_idx = 0;
+        T max_w_idx = -1;
        x += (roi_batch_ind * channels + c) * height * width;
        T h_stride = (hend - hstart) / 3.0;
        T w_stride = (wend - wstart) / 3.0;
        for (T h = hstart + h_stride; h <= hend - h_stride + 0.01; h += max(h_stride, 0.01)) {
            for (T w = wstart + w_stride; w <= wend - w_stride + 0.01; w += max(w_stride, 0.01)) {
-                x_idx++;
                int hlow = min(max(static_cast<int>(floor(h)), 0), height - 1);
-                int hhigh = min(hlow + 1, height - 1);
+                int hhigh = min(max(static_cast<int>(ceil(h)), 0), height - 1);
                int wleft = min(max(static_cast<int>(floor(w)), 0), width - 1);
-                int wright = min(wleft + 1, width - 1);
+                int wright = min(max(static_cast<int>(ceil(w)), 0), width - 1);
                int topleft = hlow * width + wleft;
                int topright = hlow * width + wright;
                int bottomleft = hhigh * width + wleft;
@@ -3994,12 +3994,14 @@ __global__ void _ROIAlign(const int count,
                if (value > maxval) {
                    maxval = value;
-                    maxidx = x_idx;
+                    max_h_idx = h;
+                    max_w_idx = w;
                }
            }
        }
        y[idx] = maxval;
-        mask[idx] = maxidx;
+        mask_h[idx] = max_h_idx;
+        mask_w[idx] = max_w_idx;
    }
 }
@@ -4007,12 +4009,14 @@ template<> void ROIAlign<float, CUDAContext>(const float spatial_scale,
                                             const int pool_h, const int pool_w,
                                             Tensor* x,
                                             Tensor* roi,
-                                             Tensor* mask,
+                                             Tensor* mask_h,
+                                             Tensor* mask_w,
                                             Tensor* y) {
    auto* Xdata = x->data<float, CUDAContext>();
    auto* Rdata = roi->data<float, CUDAContext>();
    auto* Ydata = y->mutable_data<float, CUDAContext>();
-    auto* Mdata = mask->mutable_data<float, CUDAContext>();
+    auto* MHdata = mask_h->mutable_data<float, CUDAContext>();
+    auto* MWdata = mask_w->mutable_data<float, CUDAContext>();
    TIndex channels = x->dim(1), count = y->count();
    TIndex height = x->dim(2), width = x->dim(3);
    _ROIAlign<float> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count, 
@@ -4022,7 +4026,8 @@ template<> void ROIAlign<float, CUDAContext>(const float spatial_scale,
                                                         pool_h, pool_w,
                                                                  Xdata,
                                                                  Rdata,
-                                                                  Mdata,
+                                                                 MHdata,
+                                                                 MWdata,
                                                                 Ydata);
    CUDA_POST_KERNEL_CHECK;
 }
@@ -4036,7 +4041,8 @@ __global__ void _ROIAlignGrad(const int count,
                              const int pool_h, const int pool_w, 
                              const T* dy,
                              const T* roi,
-                              const T* mask,
+                              const T* mask_h,
+                              const T* mask_w,
                              T* dx) {
    CUDA_KERNEL_LOOP(idx, count) {
        int w = idx % width;
@@ -4063,53 +4069,28 @@ __global__ void _ROIAlignGrad(const int count,
            int offset = (roi_n * channels + c) * pool_h * pool_w;
            const T* offset_dy = dy + offset;
-            const T* offset_mask = mask + offset;
+            const T* offset_mask_h = mask_h + offset;
+            const T* offset_mask_w = mask_w + offset;
            T roi_width = max(roi_end_w - roi_start_w, static_cast<T>(1));
            T roi_height = max(roi_end_h - roi_start_h, static_cast<T>(1));
-            T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pool_h);
-            T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pool_w);
            for (int ph = 0; ph < pool_h; ++ph) {
                for (int pw = 0; pw < pool_w; ++pw) {
-                    T hstart = static_cast<T>((ph)* bin_size_h);
-                    T wstart = static_cast<T>((pw)* bin_size_w);
-                    T hend = static_cast<T>((ph + 1) * bin_size_h);
-                    T wend = static_cast<T>((pw + 1) * bin_size_w);
-                    hstart = min(max(hstart + roi_start_h, static_cast<T>(0)), static_cast<T>(height));
-                    hend = min(max(hend + roi_start_h, static_cast<T>(0)), static_cast<T>(height));
-                    wstart = min(max(wstart + roi_start_w, static_cast<T>(0)), static_cast<T>(width));
-                    wend = min(max(wend + roi_start_w, static_cast<T>(0)), static_cast<T>(width));
-                    bool in_bin = (w > wstart - 1.0 && 
-                                   w < wend + 1.0 && 
-                                   h > hstart - 1.0 
-                                   && h < hend + 1.0);
-                    if (!in_bin) continue;
                    const int pool_idx = ph * pool_w + pw;
-                    int x_idx = 0;
+                    T a_h = offset_mask_h[pool_idx];
-                    T h_stride = (hend - hstart) / 3.0;
+                    T a_w = offset_mask_w[pool_idx];
-                    T w_stride = (wend - wstart) / 3.0;
+                    int hlow = min(max(static_cast<int>(floor(a_h)), 0), height - 1);
-                    for (T rh = hstart + h_stride; rh <= hend - h_stride + 0.01; rh += max(h_stride, 0.01)) {
+                    int hhigh = min(max(static_cast<int>(ceil(a_h)), 0), height - 1);
-                        for (T rw = wstart + w_stride; rw <= wend - w_stride + 0.01; rw += max(w_stride, 0.01)) {
+                    int wleft = min(max(static_cast<int>(floor(a_w)), 0), width - 1);
-                            x_idx++;
+                    int wright = min(max(static_cast<int>(ceil(a_w)), 0), width - 1);
-                            if (offset_mask[pool_idx] != x_idx) continue;
+                    if (h != hlow && h != hhigh && w != wleft && w != wright) continue;
-                            int hlow = min(max(static_cast<int>(floor(rh)), 0), height - 1);
+                    T alpha = (hlow == hhigh) ? static_cast<T>(0.5) : (a_h - hlow) / (hhigh - hlow);
-                            int hhigh = min(hlow + 1, height - 1);
+                    T beta = (wleft == wright) ? static_cast<T>(0.5) : (a_w - wleft) / (wright - wleft);
-                            int wleft = min(max(static_cast<int>(floor(rw)), 0), width - 1);
+                    if (h == hlow && w == wleft) gradient += offset_dy[pool_idx] * (1 - alpha) * (1 - beta);
-                            int wright = min(wleft + 1, width - 1);
+                    else if (h == hlow && w == wright) gradient += offset_dy[pool_idx] * (1 - alpha) * beta;
-                            if (h != hlow && h != hhigh && w != wleft && w != wright) continue;
+                    else if (h == hhigh && w == wleft) gradient += offset_dy[pool_idx] * alpha * (1 - beta);
-                            T alpha = (hlow == hhigh) ? static_cast<T>(0.5) : (rh - hlow) / (hhigh - hlow);
+                    else if (h == hhigh && w == wright) gradient += offset_dy[pool_idx] * alpha * beta;
-                            T beta = (wleft == wright) ? static_cast<T>(0.5) : (rw - wleft) / (wright - wleft);
-                            if (h == hlow && w == wleft) gradient += offset_dy[pool_idx] * (1 - alpha) * (1 - beta);
-                            else if (h == hlow && w == wright) gradient += offset_dy[pool_idx] * (1 - alpha) * beta;
-                            else if (h == hhigh && w == wleft) gradient += offset_dy[pool_idx] * alpha * (1 - beta);
-                            else if (h == hhigh && w == wright) gradient += offset_dy[pool_idx] * alpha * beta;
-                        }
-                    }
                }
            }
        }
@@ -4121,11 +4102,13 @@ template<> void ROIAlignGrad<float, CUDAContext>(const float spatial_scale,
                                                 const int pool_h, const int pool_w,
                                                 Tensor* dy,
                                                 Tensor* roi,
-                                                 Tensor* mask,
+                                                 Tensor* mask_h,
+                                                 Tensor* mask_w,
                                                 Tensor* dx) {
    auto* dYdata = dy->data<float, CUDAContext>();
    auto* Rdata = roi->data<float, CUDAContext>();
-    auto* Mdata = mask->data<float, CUDAContext>();
+    auto* MHdata = mask_h->data<float, CUDAContext>();
+    auto* MWdata = mask_w->data<float, CUDAContext>();
    auto* dXdata = dx->mutable_data<float, CUDAContext>();
    TIndex channels = dx->dim(1), count = dx->count();
    TIndex height = dx->dim(2), width = dx->dim(3);
@@ -4137,7 +4120,8 @@ template<> void ROIAlignGrad<float, CUDAContext>(const float spatial_scale,
                                                             pool_h, pool_w,
                                                                     dYdata,
                                                                      Rdata,
-                                                                      Mdata,
+                                                                     MHdata,
+                                                                     MWdata,
                                                                    dXdata);
    CUDA_POST_KERNEL_CHECK;
 }