Commit 9de0f1a3 by Ting PAN

Fix the missing defs in blending assign operators

Summary:

This commit attaches the input and output together in assign operators,
which fixes the input defs that went missing because these operators
acted as a pure identity from input to output.
1 parent 746f2cbb
Showing with 455 additions and 1472 deletions
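
Why missing defs break things, in miniature: a graph executor keeps only the operator defs it can reach by walking backward from the requested outputs through each def's recorded inputs. The sketch below is a hypothetical model of that bookkeeping, not Dragon's actual implementation (prune_defs and the def dicts are invented for illustration); it shows how an assign operator that acts as a pure identity from input to output, without recording the input in its def, lets the producer of that input be pruned away, and how attaching input and output together restores the dependency.

# A hypothetical sketch of def pruning; every name here is made up
# for illustration and is not Dragon's real API.

def prune_defs(defs, wanted_outputs):
    """Keep only the defs reachable backward from the wanted outputs."""
    kept, wanted = [], set(wanted_outputs)
    for op in reversed(defs):
        if any(y in wanted for y in op["outputs"]):
            kept.append(op)
            wanted.update(op["inputs"])  # follow the recorded input defs
    return list(reversed(kept))

# Broken: "Assign" behaves as an identity from input to output but
# never records 'x' as an input, so the def that produces 'x' is lost.
broken = [
    {"type": "RandomNormal", "inputs": [], "outputs": ["x"]},
    {"type": "Assign", "inputs": [], "outputs": ["y"]},
]
# Fixed, conceptually what this commit does: input and output are
# attached together in the assign def, keeping 'x' visible.
fixed = [
    {"type": "RandomNormal", "inputs": [], "outputs": ["x"]},
    {"type": "Assign", "inputs": ["x"], "outputs": ["y"]},
]

print([op["type"] for op in prune_defs(broken, ["y"])])  # ['Assign']
print([op["type"] for op in prune_defs(fixed, ["y"])])   # ['RandomNormal', 'Assign']

In the broken trace, the graph that computes 'y' no longer knows how 'x' was produced, which is exactly the missing-input-defs symptom described in the summary.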
@@ -39,14 +39,14 @@ html:
 	@echo "Build finished. The HTML pages are in $(BUILDDIR)."
 latex:
-	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)-latex
+	$(SPHINXBUILD) -b latex -j ${NUMBER_OF_PROCESSORS} $(ALLSPHINXOPTS) $(BUILDDIR)-latex
 	@echo
 	@echo "Build finished; the LaTeX files are in $(BUILDDIR)-latex."
 	@echo "Run \`make' in that directory to run these through (pdf)latex" \
 	      "(use \`make latexpdf' here to do that automatically)."
 latexpdf:
-	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)-latex
+	$(SPHINXBUILD) -b latex -j ${NUMBER_OF_PROCESSORS} $(ALLSPHINXOPTS) $(BUILDDIR)-latex
 	@echo "Running LaTeX files through pdflatex..."
-	$(MAKE) -C $(BUILDDIR)/latex all-pdf
+	$(MAKE) -C $(BUILDDIR)-latex all-pdf
 	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)-latex."
@@ -39,7 +39,7 @@ extensions = ['sphinx.ext.autodoc', 'sphinxcontrib.katex', 'breathe']
 # Project
 project = 'dragon'
 copyright = 'Copyright (c) 2017-present, SeetaTech, Co.,Ltd'
-author = 'SeetaTech'
+author = 'SeetaTech, Co.,Ltd'
 with open('../../../dragon/version.txt', 'r') as f:
     version = f.read().strip()
@@ -114,6 +114,7 @@ latex_elements = {
 \fvset{breaklines=true, breakanywhere=true}
 \setlength{\headheight}{13.6pt}
 \setlength{\itemindent}{-1pt}
+\addto\captionsenglish{\renewcommand{\chaptername}{}}
 \makeatletter
 \renewcommand*\l@subsection{\@dottedtocline{2}{3.8em}{3.8em}}
 \fancypagestyle{normal}{
@@ -146,13 +147,18 @@ latex_elements = {
 \vspace*{40mm}
 \LARGE \@author
+\vspace*{40mm}
+\LARGE \today
 \end{titlepage}
 \makeatother
 \pagenumbering{arabic}
 ''',
 'pointsize': '10pt',
+'classoptions': ',oneside',
 'figure_align': 'H',
+'fncychap': '\\usepackage[Sonny]{fncychap}',
 'printindex': '',
 'sphinxsetup': ' \
 hmargin={0.75in,0.75in}, \
......
@@ -65,7 +65,7 @@ if "%1" == "doxygen" (
 )
 if "%1" == "html" (
-	%SPHINXBUILD% -b html -j %NUMBER_OF_PROCESSORS% %ALLSPHINXOPTS% %BUILDDIR%
+	%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%
 	if errorlevel 1 exit /b 1
 	echo.
 	echo.Build finished. The HTML pages are in %BUILDDIR%.
......
@@ -34,14 +34,14 @@ html:
 	@echo "Build finished. The HTML pages are in $(BUILDDIR)."
 latex:
-	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)-latex
+	$(SPHINXBUILD) -b latex -j ${NUMBER_OF_PROCESSORS} $(ALLSPHINXOPTS) $(BUILDDIR)-latex
 	@echo
 	@echo "Build finished; the LaTeX files are in $(BUILDDIR)-latex."
 	@echo "Run \`make' in that directory to run these through (pdf)latex" \
 	      "(use \`make latexpdf' here to do that automatically)."
 latexpdf:
-	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)-latex
+	$(SPHINXBUILD) -b latex -j ${NUMBER_OF_PROCESSORS} $(ALLSPHINXOPTS) $(BUILDDIR)-latex
 	@echo "Running LaTeX files through pdflatex..."
-	$(MAKE) -C $(BUILDDIR)/latex all-pdf
+	$(MAKE) -C $(BUILDDIR)-latex all-pdf
 	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)-latex."
@@ -41,7 +41,7 @@ napoleon_use_rtype = False
 # Project
 project = 'dragon'
 copyright = 'Copyright (c) 2017-present, SeetaTech, Co.,Ltd'
-author = 'SeetaTech'
+author = 'SeetaTech, Co.,Ltd'
 with open('../../../dragon/version.txt', 'r') as f:
     version = f.read().strip()
@@ -122,6 +122,7 @@ latex_elements = {
 \fvset{breaklines=true, breakanywhere=true}
 \setlength{\headheight}{13.6pt}
 \setlength{\itemindent}{-1pt}
+\addto\captionsenglish{\renewcommand{\chaptername}{}}
 \makeatletter
 \renewcommand*\l@subsection{\@dottedtocline{2}{3.8em}{3.8em}}
 \fancypagestyle{normal}{
@@ -154,13 +155,18 @@ latex_elements = {
 \vspace*{40mm}
 \LARGE \@author
+\vspace*{40mm}
+\LARGE \today
 \end{titlepage}
 \makeatother
 \pagenumbering{arabic}
 ''',
 'pointsize': '10pt',
+'classoptions': ',oneside',
 'figure_align': 'H',
+'fncychap': '\\usepackage[Sonny]{fncychap}',
 'printindex': '',
 'sphinxsetup': ' \
 hmargin={0.75in,0.75in}, \
......
@@ -48,9 +48,6 @@ dragon
 `constant(...) <dragon/constant.html>`_
 : Return a tensor initialized from the value.
 
-`copy(...) <dragon/copy.html>`_
-: Copy the input.
-
 `create_function(...) <dragon/create_function.html>`_
 : Create a callable graph from the specified outputs.
@@ -93,6 +90,9 @@ dragon
 `graph_mode(...) <dragon/graph_mode.html>`_
 : Context-manager set the graph execution mode.
 
+`identity(...) <dragon/identity.html>`_
+: Return a tensor copied from the input.
+
 `index_select(...) <dragon/index_select.html>`_
 : Select the elements according to the index along the given axis.
@@ -199,7 +199,6 @@ dragon
   dragon/channel_shuffle
   dragon/concat
   dragon/constant
-  dragon/copy
   dragon/create_function
   dragon/device
   dragon/eager_mode
@@ -214,6 +213,7 @@ dragon
   dragon/get_workspace
   dragon/gradients
   dragon/graph_mode
+  dragon/identity
   dragon/index_select
   dragon/linspace
   dragon/load_library
......
@@ -174,8 +174,8 @@ __truediv__
 .. _dragon.assign(...): assign.html
 .. _dragon.cast(...): cast.html
-.. _dragon.copy(...): copy.html
 .. _dragon.fill(...): fill.html
+.. _dragon.identity(...): identity.html
 .. _dragon.masked_assign(...): masked_assign.html
 .. _dragon.masked_select(...): masked_select.html
 .. _dragon.math.add(...): math/add.html
......
@@ -154,8 +154,8 @@ __truediv__
 .. _dragon.assign(...): assign.html
 .. _dragon.cast(...): cast.html
-.. _dragon.copy(...): copy.html
 .. _dragon.fill(...): fill.html
+.. _dragon.identity(...): identity.html
 .. _dragon.masked_assign(...): masked_assign.html
 .. _dragon.masked_select(...): masked_select.html
 .. _dragon.math.add(...): math/add.html
......
-copy
-====
+identity
+========
 
-.. autofunction:: dragon.copy
+.. autofunction:: dragon.identity
 
 .. raw:: html
......
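
The user-visible API change in the documentation above is the rename of dragon.copy to dragon.identity. A minimal usage sketch, assuming the one-argument call implied by the autofunction entry (the real signature may accept extra keyword arguments):

import dragon

x = dragon.constant([1, 2, 3])  # "Return a tensor initialized from the value."
y = dragon.identity(x)          # "Return a tensor copied from the input."
                                # Previously spelled: y = dragon.copy(x)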
@@ -55,7 +55,7 @@ if errorlevel 9009 (
 :sphinx_ok
 if "%1" == "html" (
-	%SPHINXBUILD% -b html -j %NUMBER_OF_PROCESSORS% %ALLSPHINXOPTS% %BUILDDIR%
+	%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%
 	if errorlevel 1 exit /b 1
 	echo.
 	echo.Build finished. The HTML pages are in %BUILDDIR%.
......
@@ -65,7 +65,7 @@ Name Supported Reference
 `Greater`_                |v|  :func:`dragon.math.greater`
 `HardSigmoid`_            |v|  :func:`dragon.nn.hardsigmoid`
 `Hardmax`_
-`Identity`_
+`Identity`_               |v|  :func:`dragon.identity`
 `If`_
 `InstanceNormalization`_  |v|  :func:`dragon.nn.instance_norm`
 `IsInf`_                  |v|  :func:`dragon.math.is_inf`
......
@@ -58,7 +58,7 @@ vm.tensorflow
 : Compute the symbolic derivatives of ``ys`` w.r.t. ``xs`` .
 
 `identity(...) <tensorflow/identity.html>`_
-: Return a new tensor copying the content of input.
+: Return a tensor copied from the input.
 
 `linspace(...) <tensorflow/linspace.html>`_
 : Generate evenly spaced values within intervals along the given axis.
......
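
The TensorFlow-flavored frontend documents the same operator. A small sketch, assuming the virtual module is importable as dragon.vm.tensorflow (the import path is an assumption; only the identity(...) doc entry is confirmed by this diff):

import dragon
from dragon.vm import tensorflow as tf  # assumed import path

x = dragon.constant([1.0, 2.0, 3.0])
y = tf.identity(x)  # returns a tensor copied from the input,
                    # equivalent to dragon.identity(x)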
@@ -68,11 +68,9 @@ void _EluGrad<float16>(
 DEFINE_KERNEL_LAUNCHER(float16);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float16);
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -170,10 +170,8 @@ void EluGrad<float16, CUDAContext>(
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -83,11 +83,9 @@ void _HardSigmoidGrad<float16>(
 DEFINE_KERNEL_LAUNCHER(float16);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float16);
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -126,10 +126,8 @@ void HardSigmoidGrad<float16, CUDAContext>(
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -88,11 +88,9 @@ void _HardSwishGrad<float16>(
 DEFINE_KERNEL_LAUNCHER(float16);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float16);
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -153,10 +153,8 @@ void HardSwishGrad<float16, CUDAContext>(
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -243,10 +243,8 @@ void PReluWGrad<float16, CPUContext>(
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -601,10 +601,8 @@ void PReluWGrad<float16, CUDAContext>(
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -124,11 +124,9 @@ void _ReluNGrad<float16>(
 DEFINE_KERNEL_LAUNCHER(float16);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float16);
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -401,10 +401,8 @@ void ReluNGrad<float16, CUDAContext>(
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -85,11 +85,9 @@ void _SeluGrad<float16>(
 DEFINE_KERNEL_LAUNCHER(float16);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float16);
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -191,10 +191,8 @@ void SeluGrad<float16, CUDAContext>(
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -56,11 +56,9 @@ void _SigmoidGrad<float16>(
 DEFINE_KERNEL_LAUNCHER(float16);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float16);
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -135,10 +135,8 @@ void SigmoidGrad<float16, CUDAContext>(
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -122,11 +122,9 @@ void _SoftmaxGrad<float16>(
 DEFINE_KERNEL_LAUNCHER(float16);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float16);
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -257,10 +257,8 @@ void SoftmaxGrad<float16, CUDAContext>(
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -60,11 +60,9 @@ void _SwishGrad<float16>(
 DEFINE_KERNEL_LAUNCHER(float16);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float16);
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -121,10 +121,8 @@ void SwishGrad<float16, CUDAContext>(
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -55,11 +55,9 @@ void _TanhGrad<float16>(
 DEFINE_KERNEL_LAUNCHER(float16);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float16);
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -124,10 +124,8 @@ void TanhGrad<float16, CUDAContext>(
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -93,7 +93,6 @@ DEFINE_KERNEL_LAUNCHER(int);
 DEFINE_KERNEL_LAUNCHER(int64_t);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -206,7 +206,6 @@ DEFINE_KERNEL_LAUNCHER(int);
 DEFINE_KERNEL_LAUNCHER(int64_t);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -105,14 +105,12 @@ DEFINE_KERNEL_LAUNCHER(float, float);
 DEFINE_KERNEL_LAUNCHER(float, double);
 DEFINE_KERNEL_LAUNCHER(double, float);
 DEFINE_KERNEL_LAUNCHER(double, double);
-
 DEFINE_FP16_KERNEL_LAUNCHER(int8_t);
 DEFINE_FP16_KERNEL_LAUNCHER(uint8_t);
 DEFINE_FP16_KERNEL_LAUNCHER(int);
 DEFINE_FP16_KERNEL_LAUNCHER(int64_t);
 DEFINE_FP16_KERNEL_LAUNCHER(float);
 DEFINE_FP16_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_FP16_KERNEL_LAUNCHER
......
@@ -268,14 +268,12 @@ DEFINE_KERNEL_LAUNCHER(float, float);
 DEFINE_KERNEL_LAUNCHER(float, double);
 DEFINE_KERNEL_LAUNCHER(double, float);
 DEFINE_KERNEL_LAUNCHER(double, double);
-
 DEFINE_FP16_KERNEL_LAUNCHER(int8_t);
 DEFINE_FP16_KERNEL_LAUNCHER(uint8_t);
 DEFINE_FP16_KERNEL_LAUNCHER(int);
 DEFINE_FP16_KERNEL_LAUNCHER(int64_t);
 DEFINE_FP16_KERNEL_LAUNCHER(float);
 DEFINE_FP16_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_FP16_KERNEL_LAUNCHER
......
@@ -51,7 +51,6 @@ DEFINE_KERNEL_LAUNCHER(int64_t);
 DEFINE_KERNEL_LAUNCHER(float16);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -57,7 +57,6 @@ DEFINE_KERNEL_LAUNCHER(int64_t);
 DEFINE_KERNEL_LAUNCHER(float16);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -38,7 +38,6 @@ DEFINE_KERNEL_LAUNCHER(int64_t);
 DEFINE_KERNEL_LAUNCHER(float16);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -75,7 +75,6 @@ DEFINE_KERNEL_LAUNCHER(int);
 DEFINE_KERNEL_LAUNCHER(int64_t);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -77,7 +77,6 @@ DEFINE_KERNEL_LAUNCHER(IndexSelect, int64_t);
 DEFINE_KERNEL_LAUNCHER(IndexSelect, float16);
 DEFINE_KERNEL_LAUNCHER(IndexSelect, float);
 DEFINE_KERNEL_LAUNCHER(IndexSelect, double);
-
 DEFINE_KERNEL_LAUNCHER(IndexSelectGrad, int8_t);
 DEFINE_KERNEL_LAUNCHER(IndexSelectGrad, uint8_t);
 DEFINE_KERNEL_LAUNCHER(IndexSelectGrad, int);
@@ -85,7 +84,6 @@ DEFINE_KERNEL_LAUNCHER(IndexSelectGrad, int64_t);
 DEFINE_KERNEL_LAUNCHER(IndexSelectGrad, float16);
 DEFINE_KERNEL_LAUNCHER(IndexSelectGrad, float);
 DEFINE_KERNEL_LAUNCHER(IndexSelectGrad, double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -112,11 +112,9 @@ DEFINE_KERNEL_LAUNCHER(int64_t);
 DEFINE_KERNEL_LAUNCHER(float16);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float16);
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -80,14 +80,12 @@ DEFINE_KERNEL_LAUNCHER(int64_t, int64_t);
 DEFINE_KERNEL_LAUNCHER(int64_t, float16);
 DEFINE_KERNEL_LAUNCHER(int64_t, float);
 DEFINE_KERNEL_LAUNCHER(int64_t, double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(int, float16);
 DEFINE_GRAD_KERNEL_LAUNCHER(int, float);
 DEFINE_GRAD_KERNEL_LAUNCHER(int, double);
 DEFINE_GRAD_KERNEL_LAUNCHER(int64_t, float16);
 DEFINE_GRAD_KERNEL_LAUNCHER(int64_t, float);
 DEFINE_GRAD_KERNEL_LAUNCHER(int64_t, double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -85,14 +85,12 @@ DEFINE_KERNEL_LAUNCHER(int64_t, int64_t);
 DEFINE_KERNEL_LAUNCHER(int64_t, float16);
 DEFINE_KERNEL_LAUNCHER(int64_t, float);
 DEFINE_KERNEL_LAUNCHER(int64_t, double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(int, float16);
 DEFINE_GRAD_KERNEL_LAUNCHER(int, float);
 DEFINE_GRAD_KERNEL_LAUNCHER(int, double);
 DEFINE_GRAD_KERNEL_LAUNCHER(int64_t, float16);
 DEFINE_GRAD_KERNEL_LAUNCHER(int64_t, float);
 DEFINE_GRAD_KERNEL_LAUNCHER(int64_t, double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -42,7 +42,6 @@ void _OneHot(
 DEFINE_KERNEL_LAUNCHER(int);
 DEFINE_KERNEL_LAUNCHER(int64_t);
 DEFINE_KERNEL_LAUNCHER(float);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -42,7 +42,6 @@ __global__ void _OneHot(
 DEFINE_KERNEL_LAUNCHER(int);
 DEFINE_KERNEL_LAUNCHER(int64_t);
 DEFINE_KERNEL_LAUNCHER(float);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -132,7 +132,6 @@ DEFINE_CONST_KERNEL_LAUNCHER(int64_t);
 DEFINE_CONST_KERNEL_LAUNCHER(float16);
 DEFINE_CONST_KERNEL_LAUNCHER(float);
 DEFINE_CONST_KERNEL_LAUNCHER(double);
-
 DEFINE_KERNEL_LAUNCHER(ReflectPad, bool);
 DEFINE_KERNEL_LAUNCHER(ReflectPad, int8_t);
 DEFINE_KERNEL_LAUNCHER(ReflectPad, uint8_t);
@@ -141,7 +140,6 @@ DEFINE_KERNEL_LAUNCHER(ReflectPad, int64_t);
 DEFINE_KERNEL_LAUNCHER(ReflectPad, float16);
 DEFINE_KERNEL_LAUNCHER(ReflectPad, float);
 DEFINE_KERNEL_LAUNCHER(ReflectPad, double);
-
 DEFINE_KERNEL_LAUNCHER(EdgePad, bool);
 DEFINE_KERNEL_LAUNCHER(EdgePad, int8_t);
 DEFINE_KERNEL_LAUNCHER(EdgePad, uint8_t);
@@ -150,9 +148,8 @@ DEFINE_KERNEL_LAUNCHER(EdgePad, int64_t);
 DEFINE_KERNEL_LAUNCHER(EdgePad, float16);
 DEFINE_KERNEL_LAUNCHER(EdgePad, float);
 DEFINE_KERNEL_LAUNCHER(EdgePad, double);
-
-#undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_CONST_KERNEL_LAUNCHER
+#undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -151,7 +151,6 @@ DEFINE_CONST_KERNEL_LAUNCHER(int64_t);
 DEFINE_CONST_KERNEL_LAUNCHER(float16);
 DEFINE_CONST_KERNEL_LAUNCHER(float);
 DEFINE_CONST_KERNEL_LAUNCHER(double);
-
 DEFINE_KERNEL_LAUNCHER(ReflectPad, bool);
 DEFINE_KERNEL_LAUNCHER(ReflectPad, int8_t);
 DEFINE_KERNEL_LAUNCHER(ReflectPad, uint8_t);
@@ -160,7 +159,6 @@ DEFINE_KERNEL_LAUNCHER(ReflectPad, int64_t);
 DEFINE_KERNEL_LAUNCHER(ReflectPad, float16);
 DEFINE_KERNEL_LAUNCHER(ReflectPad, float);
 DEFINE_KERNEL_LAUNCHER(ReflectPad, double);
-
 DEFINE_KERNEL_LAUNCHER(EdgePad, bool);
 DEFINE_KERNEL_LAUNCHER(EdgePad, int8_t);
 DEFINE_KERNEL_LAUNCHER(EdgePad, uint8_t);
@@ -169,9 +167,8 @@ DEFINE_KERNEL_LAUNCHER(EdgePad, int64_t);
 DEFINE_KERNEL_LAUNCHER(EdgePad, float16);
 DEFINE_KERNEL_LAUNCHER(EdgePad, float);
 DEFINE_KERNEL_LAUNCHER(EdgePad, double);
-
-#undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_CONST_KERNEL_LAUNCHER
+#undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -63,7 +63,6 @@ void ReduceSumGrad<float16, CPUContext>(
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -128,7 +128,6 @@ void ReduceSumGrad<float16, CUDAContext>(
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -75,11 +75,9 @@ DEFINE_KERNEL_LAUNCHER(Repeat, int64_t);
 DEFINE_KERNEL_LAUNCHER(Repeat, float16);
 DEFINE_KERNEL_LAUNCHER(Repeat, float);
 DEFINE_KERNEL_LAUNCHER(Repeat, double);
-
 DEFINE_KERNEL_LAUNCHER(RepeatGrad, float16);
 DEFINE_KERNEL_LAUNCHER(RepeatGrad, float);
 DEFINE_KERNEL_LAUNCHER(RepeatGrad, double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -139,10 +139,8 @@ DEFINE_KERNEL_LAUNCHER(int64_t);
 DEFINE_KERNEL_LAUNCHER(float16);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -76,7 +76,6 @@ DEFINE_KERNEL_LAUNCHER(Slice, int64_t);
 DEFINE_KERNEL_LAUNCHER(Slice, float16);
 DEFINE_KERNEL_LAUNCHER(Slice, float);
 DEFINE_KERNEL_LAUNCHER(Slice, double);
-
 DEFINE_KERNEL_LAUNCHER(SliceGrad, bool);
 DEFINE_KERNEL_LAUNCHER(SliceGrad, int8_t);
 DEFINE_KERNEL_LAUNCHER(SliceGrad, uint8_t);
@@ -85,7 +84,6 @@ DEFINE_KERNEL_LAUNCHER(SliceGrad, int64_t);
 DEFINE_KERNEL_LAUNCHER(SliceGrad, float16);
 DEFINE_KERNEL_LAUNCHER(SliceGrad, float);
 DEFINE_KERNEL_LAUNCHER(SliceGrad, double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -85,7 +85,6 @@ DEFINE_KERNEL_LAUNCHER(Slice, int64_t);
 DEFINE_KERNEL_LAUNCHER(Slice, float16);
 DEFINE_KERNEL_LAUNCHER(Slice, float);
 DEFINE_KERNEL_LAUNCHER(Slice, double);
-
 DEFINE_KERNEL_LAUNCHER(SliceGrad, bool);
 DEFINE_KERNEL_LAUNCHER(SliceGrad, int8_t);
 DEFINE_KERNEL_LAUNCHER(SliceGrad, uint8_t);
@@ -94,7 +93,6 @@ DEFINE_KERNEL_LAUNCHER(SliceGrad, int64_t);
 DEFINE_KERNEL_LAUNCHER(SliceGrad, float16);
 DEFINE_KERNEL_LAUNCHER(SliceGrad, float);
 DEFINE_KERNEL_LAUNCHER(SliceGrad, double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -85,11 +85,9 @@ DEFINE_KERNEL_LAUNCHER(int64_t);
 DEFINE_KERNEL_LAUNCHER(float16);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float16);
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -141,10 +141,8 @@ DEFINE_KERNEL_LAUNCHER(int64_t);
 DEFINE_KERNEL_LAUNCHER(float16);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -73,11 +73,9 @@ DEFINE_KERNEL_LAUNCHER(Transpose, int64_t);
 DEFINE_KERNEL_LAUNCHER(Transpose, float16);
 DEFINE_KERNEL_LAUNCHER(Transpose, float);
 DEFINE_KERNEL_LAUNCHER(Transpose, double);
-
 DEFINE_KERNEL_LAUNCHER(TransposeGrad, float16);
 DEFINE_KERNEL_LAUNCHER(TransposeGrad, float);
 DEFINE_KERNEL_LAUNCHER(TransposeGrad, double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -81,11 +81,9 @@ DEFINE_KERNEL_LAUNCHER(Transpose, int64_t);
 DEFINE_KERNEL_LAUNCHER(Transpose, float16);
 DEFINE_KERNEL_LAUNCHER(Transpose, float);
 DEFINE_KERNEL_LAUNCHER(Transpose, double);
-
 DEFINE_KERNEL_LAUNCHER(TransposeGrad, float16);
 DEFINE_KERNEL_LAUNCHER(TransposeGrad, float);
 DEFINE_KERNEL_LAUNCHER(TransposeGrad, double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -75,6 +75,7 @@ DEFINE_KERNEL_LAUNCHER(int);
 DEFINE_KERNEL_LAUNCHER(int64_t);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
+#undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -54,7 +54,6 @@ DEFINE_KERNEL_LAUNCHER(int64_t);
 DEFINE_KERNEL_LAUNCHER(float16);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -65,7 +65,6 @@ DEFINE_KERNEL_LAUNCHER(int64_t);
 DEFINE_KERNEL_LAUNCHER(float16);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -108,10 +108,8 @@ void BroadcastLossGrad<float16, CPUContext>(
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -231,10 +231,8 @@ void BroadcastLossGrad<float16, CUDAContext>(
 DEFINE_KERNEL_LAUNCHER(float16);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -88,12 +88,10 @@ DEFINE_KERNEL_LAUNCHER(NLLLoss, float, float);
 DEFINE_KERNEL_LAUNCHER(NLLLoss, float, int64_t);
 DEFINE_KERNEL_LAUNCHER(NLLLoss, double, double);
 DEFINE_KERNEL_LAUNCHER(NLLLoss, double, int64_t);
-
 DEFINE_KERNEL_LAUNCHER(NLLLossGrad, float, float);
 DEFINE_KERNEL_LAUNCHER(NLLLossGrad, float, int64_t);
 DEFINE_KERNEL_LAUNCHER(NLLLossGrad, double, double);
 DEFINE_KERNEL_LAUNCHER(NLLLossGrad, double, int64_t);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -87,12 +87,10 @@ DEFINE_KERNEL_LAUNCHER(NLLLoss, float, float);
 DEFINE_KERNEL_LAUNCHER(NLLLoss, float, int64_t);
 DEFINE_KERNEL_LAUNCHER(NLLLoss, double, double);
 DEFINE_KERNEL_LAUNCHER(NLLLoss, double, int64_t);
-
 DEFINE_KERNEL_LAUNCHER(NLLLossGrad, float, float);
 DEFINE_KERNEL_LAUNCHER(NLLLossGrad, float, int64_t);
 DEFINE_KERNEL_LAUNCHER(NLLLossGrad, double, double);
 DEFINE_KERNEL_LAUNCHER(NLLLossGrad, double, int64_t);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -68,10 +68,8 @@ void _SigmoidCrossEntropyGrad(
 DEFINE_KERNEL_LAUNCHER(SigmoidCrossEntropy, float);
 DEFINE_KERNEL_LAUNCHER(SigmoidCrossEntropy, double);
-
 DEFINE_KERNEL_LAUNCHER(SigmoidCrossEntropyGrad, float);
 DEFINE_KERNEL_LAUNCHER(SigmoidCrossEntropyGrad, double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -64,10 +64,8 @@ __global__ void _SigmoidCrossEntropyGrad(
 DEFINE_KERNEL_LAUNCHER(SigmoidCrossEntropy, float);
 DEFINE_KERNEL_LAUNCHER(SigmoidCrossEntropy, double);
-
 DEFINE_KERNEL_LAUNCHER(SigmoidCrossEntropyGrad, float);
 DEFINE_KERNEL_LAUNCHER(SigmoidCrossEntropyGrad, double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -137,12 +137,10 @@ DEFINE_KERNEL_LAUNCHER(SigmoidFocalLoss, float, float);
 DEFINE_KERNEL_LAUNCHER(SigmoidFocalLoss, float, int64_t);
 DEFINE_KERNEL_LAUNCHER(SigmoidFocalLoss, double, double);
 DEFINE_KERNEL_LAUNCHER(SigmoidFocalLoss, double, int64_t);
-
 DEFINE_KERNEL_LAUNCHER(SigmoidFocalLossGrad, float, float);
 DEFINE_KERNEL_LAUNCHER(SigmoidFocalLossGrad, float, int64_t);
 DEFINE_KERNEL_LAUNCHER(SigmoidFocalLossGrad, double, double);
 DEFINE_KERNEL_LAUNCHER(SigmoidFocalLossGrad, double, int64_t);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -132,12 +132,10 @@ DEFINE_KERNEL_LAUNCHER(SigmoidFocalLoss, float, float);
 DEFINE_KERNEL_LAUNCHER(SigmoidFocalLoss, float, int64_t);
 DEFINE_KERNEL_LAUNCHER(SigmoidFocalLoss, double, double);
 DEFINE_KERNEL_LAUNCHER(SigmoidFocalLoss, double, int64_t);
-
 DEFINE_KERNEL_LAUNCHER(SigmoidFocalLossGrad, float, float);
 DEFINE_KERNEL_LAUNCHER(SigmoidFocalLossGrad, float, int64_t);
 DEFINE_KERNEL_LAUNCHER(SigmoidFocalLossGrad, double, double);
 DEFINE_KERNEL_LAUNCHER(SigmoidFocalLossGrad, double, int64_t);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -57,10 +57,8 @@ void SmoothL1Grad<float16, CPUContext>(
 DEFINE_KERNEL_LAUNCHER(SmoothL1, float);
 DEFINE_KERNEL_LAUNCHER(SmoothL1, double);
-
 DEFINE_KERNEL_LAUNCHER(SmoothL1Grad, float);
 DEFINE_KERNEL_LAUNCHER(SmoothL1Grad, double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -90,10 +90,8 @@ void SmoothL1Grad<float16, CUDAContext>(
 DEFINE_KERNEL_LAUNCHER(SmoothL1, float);
 DEFINE_KERNEL_LAUNCHER(SmoothL1, double);
-
 DEFINE_KERNEL_LAUNCHER(SmoothL1Grad, float);
 DEFINE_KERNEL_LAUNCHER(SmoothL1Grad, double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -37,7 +37,6 @@ void _SoftmaxCrossEntropy(
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -41,7 +41,6 @@ __global__ void _SoftmaxCrossEntropy(
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -95,12 +95,10 @@ DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropy, float, float);
 DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropy, float, int64_t);
 DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropy, double, double);
 DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropy, double, int64_t);
-
 DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropyGrad, float, float);
 DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropyGrad, float, int64_t);
 DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropyGrad, double, double);
 DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropyGrad, double, int64_t);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -93,12 +93,10 @@ DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropy, float, float);
 DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropy, float, int64_t);
 DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropy, double, double);
 DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropy, double, int64_t);
-
 DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropyGrad, float, float);
 DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropyGrad, float, int64_t);
 DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropyGrad, double, double);
 DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropyGrad, double, int64_t);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -104,11 +104,9 @@ DEFINE_KERNEL_LAUNCHER(int64_t);
 DEFINE_KERNEL_LAUNCHER(float16);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float16);
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -160,10 +160,8 @@ DEFINE_KERNEL_LAUNCHER(int);
 DEFINE_KERNEL_LAUNCHER(int64_t);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
...@@ -155,7 +155,7 @@ void Moments<float16, float, CPUContext>( ...@@ -155,7 +155,7 @@ void Moments<float16, float, CPUContext>(
CPU_FP16_NOT_SUPPORTED; CPU_FP16_NOT_SUPPORTED;
} }
#define DEFINE_MOMENTS_KERNEL_LAUNCHER(Tx, Ty) \ #define DEFINE_KERNEL_LAUNCHER(Tx, Ty) \
template <> \ template <> \
void Moments<Tx, Ty, CPUContext>( \ void Moments<Tx, Ty, CPUContext>( \
const int num_dims, \ const int num_dims, \
...@@ -169,14 +169,13 @@ void Moments<float16, float, CPUContext>( ...@@ -169,14 +169,13 @@ void Moments<float16, float, CPUContext>(
_Moments(num_dims, dims, num_axes, axes, x, mean, var, ctx); \ _Moments(num_dims, dims, num_axes, axes, x, mean, var, ctx); \
} }
DEFINE_MOMENTS_KERNEL_LAUNCHER(int8_t, float); DEFINE_KERNEL_LAUNCHER(int8_t, float);
DEFINE_MOMENTS_KERNEL_LAUNCHER(uint8_t, float); DEFINE_KERNEL_LAUNCHER(uint8_t, float);
DEFINE_MOMENTS_KERNEL_LAUNCHER(int, float); DEFINE_KERNEL_LAUNCHER(int, float);
DEFINE_MOMENTS_KERNEL_LAUNCHER(int64_t, float); DEFINE_KERNEL_LAUNCHER(int64_t, float);
DEFINE_MOMENTS_KERNEL_LAUNCHER(float, float); DEFINE_KERNEL_LAUNCHER(float, float);
DEFINE_MOMENTS_KERNEL_LAUNCHER(double, double); DEFINE_KERNEL_LAUNCHER(double, double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_MOMENTS_KERNEL_LAUNCHER
} // namespace kernel } // namespace kernel
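The Moments launchers reduce the input over a chosen axis set into per-slice mean and variance. A plain C++ sketch of the single-axis case; it uses the one-pass identity var(x) = E[x^2] - E[x]^2 that such reductions typically rely on, while the real kernel additionally handles arbitrary dims/axes:

// Reduce each row of an [outer, reduce]-shaped buffer to mean/variance.
void Moments1D(int outer, int reduce, const float* x, float* mean, float* var) {
  for (int i = 0; i < outer; ++i) {
    double s = 0.0, sq = 0.0;
    for (int j = 0; j < reduce; ++j) {
      const double v = x[i * reduce + j];
      s += v;
      sq += v * v;
    }
    const double m = s / reduce;
    mean[i] = static_cast<float>(m);
    var[i] = static_cast<float>(sq / reduce - m * m);  // E[x^2] - E[x]^2
  }
}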
......
...@@ -316,7 +316,6 @@ DEFINE_KERNEL_LAUNCHER(int, float); ...@@ -316,7 +316,6 @@ DEFINE_KERNEL_LAUNCHER(int, float);
DEFINE_KERNEL_LAUNCHER(int64_t, float); DEFINE_KERNEL_LAUNCHER(int64_t, float);
DEFINE_KERNEL_LAUNCHER(float, float); DEFINE_KERNEL_LAUNCHER(float, float);
DEFINE_KERNEL_LAUNCHER(double, double); DEFINE_KERNEL_LAUNCHER(double, double);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel } // namespace kernel
......
...@@ -177,12 +177,6 @@ void L2NormalizeGrad<float16, CPUContext>( ...@@ -177,12 +177,6 @@ void L2NormalizeGrad<float16, CPUContext>(
_##name(outer_dim, inner_dim, reduce_dim, (T)scale, (T)eps, x, y); \ _##name(outer_dim, inner_dim, reduce_dim, (T)scale, (T)eps, x, y); \
} }
DEFINE_KERNEL_LAUNCHER(L1Normalize, float);
DEFINE_KERNEL_LAUNCHER(L1Normalize, double);
DEFINE_KERNEL_LAUNCHER(L2Normalize, float);
DEFINE_KERNEL_LAUNCHER(L2Normalize, double);
#undef DEFINE_KERNEL_LAUNCHER
#define DEFINE_GRAD_KERNEL_LAUNCHER(name, T) \ #define DEFINE_GRAD_KERNEL_LAUNCHER(name, T) \
template <> \ template <> \
void name<T, CPUContext>( \ void name<T, CPUContext>( \
...@@ -198,10 +192,15 @@ DEFINE_KERNEL_LAUNCHER(L2Normalize, double); ...@@ -198,10 +192,15 @@ DEFINE_KERNEL_LAUNCHER(L2Normalize, double);
_##name(outer_dim, inner_dim, reduce_dim, (T)scale, (T)eps, dy, x, dx); \ _##name(outer_dim, inner_dim, reduce_dim, (T)scale, (T)eps, dy, x, dx); \
} }
DEFINE_KERNEL_LAUNCHER(L1Normalize, float);
DEFINE_KERNEL_LAUNCHER(L1Normalize, double);
DEFINE_KERNEL_LAUNCHER(L2Normalize, float);
DEFINE_KERNEL_LAUNCHER(L2Normalize, double);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, float); DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, double); DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, double);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, float); DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, double); DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER #undef DEFINE_GRAD_KERNEL_LAUNCHER
} // namespace kernel } // namespace kernel
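The launchers above pass (outer_dim, inner_dim, reduce_dim, scale, eps) down to the L1/L2 kernels; the usual form of such an op, assumed here since the kernel bodies are elided from the diff, is y = scale * x / (eps + ||x||_p) computed independently per reduced slice. A CPU sketch of the contiguous (inner_dim == 1) L2 case under that assumption:

#include <cmath>

// L2-normalize each contiguous row: y = scale * x / (eps + ||x||_2).
void L2NormalizeRows(int rows, int cols, float scale, float eps,
                     const float* x, float* y) {
  for (int i = 0; i < rows; ++i) {
    float sq = 0.f;
    for (int j = 0; j < cols; ++j) {
      sq += x[i * cols + j] * x[i * cols + j];
    }
    const float denom = eps + std::sqrt(sq);
    for (int j = 0; j < cols; ++j) {
      y[i * cols + j] = scale * x[i * cols + j] / denom;
    }
  }
}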
......
...@@ -170,14 +170,6 @@ __global__ void _L2NormalizeGrad( ...@@ -170,14 +170,6 @@ __global__ void _L2NormalizeGrad(
reinterpret_cast<ScalarT*>(y)); \ reinterpret_cast<ScalarT*>(y)); \
} }
DEFINE_KERNEL_LAUNCHER(L1Normalize, float16, half, float);
DEFINE_KERNEL_LAUNCHER(L1Normalize, float, float, float);
DEFINE_KERNEL_LAUNCHER(L1Normalize, double, double, double);
DEFINE_KERNEL_LAUNCHER(L2Normalize, float16, half, float);
DEFINE_KERNEL_LAUNCHER(L2Normalize, float, float, float);
DEFINE_KERNEL_LAUNCHER(L2Normalize, double, double, double);
#undef DEFINE_KERNEL_LAUNCHER
#define DEFINE_GRAD_KERNEL_LAUNCHER(name, T, ScalarT, AccT) \ #define DEFINE_GRAD_KERNEL_LAUNCHER(name, T, ScalarT, AccT) \
template <> \ template <> \
void name<T, CUDAContext>( \ void name<T, CUDAContext>( \
...@@ -203,12 +195,19 @@ DEFINE_KERNEL_LAUNCHER(L2Normalize, double, double, double); ...@@ -203,12 +195,19 @@ DEFINE_KERNEL_LAUNCHER(L2Normalize, double, double, double);
reinterpret_cast<ScalarT*>(dx)); \ reinterpret_cast<ScalarT*>(dx)); \
} }
DEFINE_KERNEL_LAUNCHER(L1Normalize, float16, half, float);
DEFINE_KERNEL_LAUNCHER(L1Normalize, float, float, float);
DEFINE_KERNEL_LAUNCHER(L1Normalize, double, double, double);
DEFINE_KERNEL_LAUNCHER(L2Normalize, float16, half, float);
DEFINE_KERNEL_LAUNCHER(L2Normalize, float, float, float);
DEFINE_KERNEL_LAUNCHER(L2Normalize, double, double, double);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, float16, half, float); DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, float16, half, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, float, float, float); DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, float, float, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, double, double, double); DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, double, double, double);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, float16, half, float); DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, float16, half, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, float, float, float); DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, float, float, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, double, double, double); DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, double, double, double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER #undef DEFINE_GRAD_KERNEL_LAUNCHER
} // namespace kernel } // namespace kernel
......
...@@ -293,10 +293,8 @@ void _AvgPool2dGradNHWC( ...@@ -293,10 +293,8 @@ void _AvgPool2dGradNHWC(
DEFINE_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double); DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float); DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double); DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
...@@ -325,10 +325,8 @@ __global__ void _AvgPool2dGradNHWC( ...@@ -325,10 +325,8 @@ __global__ void _AvgPool2dGradNHWC(
DEFINE_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double); DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float); DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double); DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
...@@ -76,7 +76,6 @@ DEFINE_KERNEL_LAUNCHER(int); ...@@ -76,7 +76,6 @@ DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t); DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double); DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel } // namespace kernel
......
...@@ -103,7 +103,6 @@ DEFINE_KERNEL_LAUNCHER(int); ...@@ -103,7 +103,6 @@ DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t); DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double); DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel } // namespace kernel
......
...@@ -304,7 +304,6 @@ void _Col2Im2dNHWC( ...@@ -304,7 +304,6 @@ void _Col2Im2dNHWC(
DEFINE_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double); DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel } // namespace kernel
......
...@@ -356,7 +356,6 @@ __global__ void _Col2Im2dNHWC( ...@@ -356,7 +356,6 @@ __global__ void _Col2Im2dNHWC(
DEFINE_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double); DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel } // namespace kernel
......
...@@ -243,6 +243,7 @@ void DepthwiseConv2d<float, CPUContext>( ...@@ -243,6 +243,7 @@ void DepthwiseConv2d<float, CPUContext>(
DEFINE_GRAD_KERNEL_LAUNCHER(float16); DEFINE_GRAD_KERNEL_LAUNCHER(float16);
DEFINE_GRAD_KERNEL_LAUNCHER(float); DEFINE_GRAD_KERNEL_LAUNCHER(float);
#undef DEFINE_GRAD_KERNEL_LAUNCHER #undef DEFINE_GRAD_KERNEL_LAUNCHER
#undef DISPATCH_DATA_KERNEL
} // namespace kernel } // namespace kernel
......
...@@ -12,9 +12,9 @@ namespace kernel { ...@@ -12,9 +12,9 @@ namespace kernel {
namespace { namespace {
#if __CUDA_ARCH__ >= 350 #if __CUDA_ARCH__ >= 350
#define LOAD(x, i) __ldg(x + i) #define LDG(x, i) __ldg(x + i)
#else #else
#define LOAD(x, i) x[i] #define LDG(x, i) x[i]
#endif #endif
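The renamed LDG macro wraps __ldg(), which loads through the read-only (texture) data cache and only exists from compute capability 3.5 on, hence the __CUDA_ARCH__ guard; below 3.5 it degrades to a plain indexed load. The guard pattern in isolation, on a trivial kernel:

#if __CUDA_ARCH__ >= 350
#define LDG(x, i) __ldg((x) + (i))
#else
#define LDG(x, i) (x)[i]
#endif

template <typename T>
__global__ void CopyKernel(const int n, const T* in, T* out) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  // Safe because 'in' is never written during this kernel's lifetime.
  if (i < n) out[i] = LDG(in, i);
}
#undef LDG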
template <typename T, typename AccT, int KKH, int KKW> template <typename T, typename AccT, int KKH, int KKW>
...@@ -60,7 +60,7 @@ __global__ void _DepthwiseConv2dNCHW( ...@@ -60,7 +60,7 @@ __global__ void _DepthwiseConv2dNCHW(
iw = iw_start + kw * dilation_w; iw = iw_start + kw * dilation_w;
if (ih >= 0 && ih < H && iw >= 0 && iw < W) { if (ih >= 0 && ih < H && iw >= 0 && iw < W) {
xi = x_start + ih * W + iw; xi = x_start + ih * W + iw;
sum_val += convert::To<AccT>(Multiplies(LOAD(x, xi), LOAD(w, wi))); sum_val += convert::To<AccT>(Multiplies(LDG(x, xi), LDG(w, wi)));
} }
++wi; ++wi;
} // End kw } // End kw
...@@ -112,7 +112,7 @@ __global__ void _DepthwiseConv2dNHWC( ...@@ -112,7 +112,7 @@ __global__ void _DepthwiseConv2dNHWC(
iw = iw_start + kw * dilation_w; iw = iw_start + kw * dilation_w;
if (ih >= 0 && ih < H && iw >= 0 && iw < W) { if (ih >= 0 && ih < H && iw >= 0 && iw < W) {
xi = ((x_start + ih) * W + iw) * C + c; xi = ((x_start + ih) * W + iw) * C + c;
sum_val += convert::To<AccT>(Multiplies(LOAD(x, xi), LOAD(w, wi))); sum_val += convert::To<AccT>(Multiplies(LDG(x, xi), LDG(w, wi)));
} }
++wi; ++wi;
} // End kw } // End kw
...@@ -164,7 +164,7 @@ __global__ void _DepthwiseConv2dGradNCHW( ...@@ -164,7 +164,7 @@ __global__ void _DepthwiseConv2dGradNCHW(
ow = ow / stride_w; ow = ow / stride_w;
if (oh >= 0 && oh < out_h && ow >= 0 && ow < out_w) { if (oh >= 0 && oh < out_h && ow >= 0 && ow < out_w) {
yi = y_start + oh * out_w + ow; yi = y_start + oh * out_w + ow;
sum_val += convert::To<AccT>(Multiplies(LOAD(dy, yi), LOAD(w, wi))); sum_val += convert::To<AccT>(Multiplies(LDG(dy, yi), LDG(w, wi)));
} }
} }
++wi; ++wi;
...@@ -217,7 +217,7 @@ __global__ void _DepthwiseConv2dGradNHWC( ...@@ -217,7 +217,7 @@ __global__ void _DepthwiseConv2dGradNHWC(
ow = ow / stride_w; ow = ow / stride_w;
if (oh >= 0 && oh < out_h && ow >= 0 && ow < out_w) { if (oh >= 0 && oh < out_h && ow >= 0 && ow < out_w) {
yi = ((y_start + oh) * out_w + ow) * C + c; yi = ((y_start + oh) * out_w + ow) * C + c;
sum_val += convert::To<AccT>(Multiplies(LOAD(dy, yi), LOAD(w, wi))); sum_val += convert::To<AccT>(Multiplies(LDG(dy, yi), LDG(w, wi)));
} }
} }
++wi; ++wi;
...@@ -267,7 +267,7 @@ __global__ void _DepthwiseConv2dWGradNCHW( ...@@ -267,7 +267,7 @@ __global__ void _DepthwiseConv2dWGradNCHW(
if (ih >= 0 && iw >= 0 && ih < H && iw < W) { if (ih >= 0 && iw >= 0 && ih < H && iw < W) {
xi = ((i * C + c) * H + ih) * W + iw; xi = ((i * C + c) * H + ih) * W + iw;
yi = (i * C + c) * out_h * out_w + j; yi = (i * C + c) * out_h * out_w + j;
sum_val += convert::To<AccT>(Multiplies(LOAD(dy, yi), LOAD(x, xi))); sum_val += convert::To<AccT>(Multiplies(LDG(dy, yi), LDG(x, xi)));
} }
} }
} }
...@@ -320,7 +320,7 @@ __global__ void _DepthwiseConv2dWGradNHWC( ...@@ -320,7 +320,7 @@ __global__ void _DepthwiseConv2dWGradNHWC(
if (ih >= 0 && iw >= 0 && ih < H && iw < W) { if (ih >= 0 && iw >= 0 && ih < H && iw < W) {
xi = ((i * H + ih) * W + iw) * C + c; xi = ((i * H + ih) * W + iw) * C + c;
yi = (i * ohw + j) * C + c; yi = (i * ohw + j) * C + c;
sum_val += convert::To<AccT>(Multiplies(LOAD(dy, yi), LOAD(x, xi))); sum_val += convert::To<AccT>(Multiplies(LDG(dy, yi), LDG(x, xi)));
} }
} }
} }
...@@ -333,7 +333,7 @@ __global__ void _DepthwiseConv2dWGradNHWC( ...@@ -333,7 +333,7 @@ __global__ void _DepthwiseConv2dWGradNHWC(
} }
} }
#undef LOAD #undef LDG
} // namespace } // namespace
...@@ -528,10 +528,8 @@ __global__ void _DepthwiseConv2dWGradNHWC( ...@@ -528,10 +528,8 @@ __global__ void _DepthwiseConv2dWGradNHWC(
DEFINE_KERNEL_LAUNCHER(float16, half, float); DEFINE_KERNEL_LAUNCHER(float16, half, float);
DEFINE_KERNEL_LAUNCHER(float, float, float); DEFINE_KERNEL_LAUNCHER(float, float, float);
DEFINE_GRAD_KERNEL_LAUNCHER(float16, half, float); DEFINE_GRAD_KERNEL_LAUNCHER(float16, half, float);
DEFINE_GRAD_KERNEL_LAUNCHER(float, float, float); DEFINE_GRAD_KERNEL_LAUNCHER(float, float, float);
#undef DISPATCH_DATA_KERNEL #undef DISPATCH_DATA_KERNEL
#undef DISPATCH_WEIGHT_KERNEL #undef DISPATCH_WEIGHT_KERNEL
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
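A depthwise convolution pairs every input channel with its own single filter, so unlike a dense convolution there is no reduction across channels; the kernels above just walk one kernel window per (n, c, oh, ow) output element. A naive NCHW reference sketch, assuming a (C, kh, kw) weight layout and no channel multiplier:

void DepthwiseConv2dRef(int N, int C, int H, int W, int out_h, int out_w,
                        int kh, int kw, int stride, int pad,
                        const float* x, const float* w, float* y) {
  for (int n = 0; n < N; ++n)
    for (int c = 0; c < C; ++c)
      for (int oh = 0; oh < out_h; ++oh)
        for (int ow = 0; ow < out_w; ++ow) {
          float sum = 0.f;
          for (int i = 0; i < kh; ++i)
            for (int j = 0; j < kw; ++j) {
              const int ih = oh * stride - pad + i;
              const int iw = ow * stride - pad + j;
              if (ih >= 0 && ih < H && iw >= 0 && iw < W)
                sum += x[((n * C + c) * H + ih) * W + iw] *
                       w[(c * kh + i) * kw + j];  // one filter per channel
            }
          y[((n * C + c) * out_h + oh) * out_w + ow] = sum;
        }
}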
......
...@@ -287,10 +287,8 @@ void _MaxPool2dGradNHWC( ...@@ -287,10 +287,8 @@ void _MaxPool2dGradNHWC(
DEFINE_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double); DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float); DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double); DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
...@@ -335,10 +335,8 @@ __global__ void _MaxPool2dGradNHWC( ...@@ -335,10 +335,8 @@ __global__ void _MaxPool2dGradNHWC(
DEFINE_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double); DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float); DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double); DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
...@@ -272,10 +272,8 @@ DEFINE_KERNEL_LAUNCHER(int); ...@@ -272,10 +272,8 @@ DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t); DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double); DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float); DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double); DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
...@@ -11,11 +11,9 @@ namespace kernel { ...@@ -11,11 +11,9 @@ namespace kernel {
namespace { namespace {
#if __CUDA_ARCH__ >= 350 #if __CUDA_ARCH__ >= 350
#define LOAD(x, i) __ldg(x + i) #define LDG(x, i) convert::To<float>(__ldg(x + i))
#define LOADF(x, i) __half2float(__ldg(x + i))
#else #else
#define LOAD(x, i) x[i] #define LDG(x, i) convert::To<float>(x[i])
#define LOADF(x, i) __half2float(x[i])
#endif #endif
template <typename T> template <typename T>
...@@ -70,53 +68,13 @@ __global__ void _ResizeLinearNCHW( ...@@ -70,53 +68,13 @@ __global__ void _ResizeLinearNCHW(
const float u = w_in - li; const float u = w_in - li;
const int offset = (n * C + c) * H; const int offset = (n * C + c) * H;
const float tl = LOAD(x, ((offset + ti) * W + li)); const float tl = LDG(x, ((offset + ti) * W + li));
const float tr = LOAD(x, ((offset + ti) * W + ri)); const float tr = LDG(x, ((offset + ti) * W + ri));
const float bl = LOAD(x, ((offset + bi) * W + li)); const float bl = LDG(x, ((offset + bi) * W + li));
const float br = LOAD(x, ((offset + bi) * W + ri)); const float br = LDG(x, ((offset + bi) * W + ri));
const float t = tl + (tr - tl) * u; const float t = tl + (tr - tl) * u;
const float b = bl + (br - bl) * u; const float b = bl + (br - bl) * u;
y[yi] = (T)(t + (b - t) * v); y[yi] = convert::To<T>(t + (b - t) * v);
}
}
template <>
__global__ void _ResizeLinearNCHW<half>(
const int nthreads,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const float scale_h,
const float scale_w,
const bool align_corners,
const half* x,
half* y) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int w = yi % out_w;
const int h = (yi / out_w) % out_h;
const int c = (yi / out_w / out_h) % C;
const int n = yi / out_w / out_h / C;
const float h_in = TransformCoordinate(h, scale_h, align_corners);
const int ti = floorf(h_in);
const int bi = h_in < H - 1 ? ceilf(h_in) : H - 1;
const float v = h_in - ti;
const float w_in = TransformCoordinate(w, scale_w, align_corners);
const int li = floorf(w_in);
const int ri = (w_in < W - 1) ? ceilf(w_in) : W - 1;
const float u = w_in - li;
const int offset = (n * C + c) * H;
const float tl = LOADF(x, ((offset + ti) * W + li));
const float tr = LOADF(x, ((offset + ti) * W + ri));
const float bl = LOADF(x, ((offset + bi) * W + li));
const float br = LOADF(x, ((offset + bi) * W + ri));
const float t = tl + (tr - tl) * u;
const float b = bl + (br - bl) * u;
y[yi] = __float2half(t + (b - t) * v);
} }
} }
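The four LDG loads above are the corners of a standard bilinear interpolation: with horizontal fraction u and vertical fraction v, the kernel lerps along the top and bottom rows, then lerps between the two results. Extracted as a standalone helper:

// Bilinear blend of four corner samples; u, v in [0, 1).
inline float Bilinear(float tl, float tr, float bl, float br, float u, float v) {
  const float t = tl + (tr - tl) * u;  // top-row lerp
  const float b = bl + (br - bl) * u;  // bottom-row lerp
  return t + (b - t) * v;              // vertical lerp
}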
...@@ -150,53 +108,13 @@ __global__ void _ResizeLinearNHWC( ...@@ -150,53 +108,13 @@ __global__ void _ResizeLinearNHWC(
const float u = w_in - li; const float u = w_in - li;
const int offset = n * H; const int offset = n * H;
const float tl = LOAD(x, (((offset + ti) * W + li) * C + c)); const float tl = LDG(x, (((offset + ti) * W + li) * C + c));
const float tr = LOAD(x, (((offset + ti) * W + ri) * C + c)); const float tr = LDG(x, (((offset + ti) * W + ri) * C + c));
const float bl = LOAD(x, (((offset + bi) * W + li) * C + c)); const float bl = LDG(x, (((offset + bi) * W + li) * C + c));
const float br = LOAD(x, (((offset + bi) * W + ri) * C + c)); const float br = LDG(x, (((offset + bi) * W + ri) * C + c));
const float t = tl + (tr - tl) * u;
const float b = bl + (br - bl) * u;
y[yi] = (T)(t + (b - t) * v);
}
}
template <>
__global__ void _ResizeLinearNHWC<half>(
const int nthreads,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const float scale_h,
const float scale_w,
const bool align_corners,
const half* x,
half* y) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int c = yi % C;
const int w = (yi / C) % out_w;
const int h = (yi / C / out_w) % out_h;
const int n = yi / C / out_w / out_h;
const float h_in = TransformCoordinate(h, scale_h, align_corners);
const int ti = floorf(h_in);
const int bi = (h_in < H - 1) ? ceilf(h_in) : H - 1;
const float v = h_in - ti;
const float w_in = TransformCoordinate(w, scale_w, align_corners);
const int li = floorf(w_in);
const int ri = (w_in < W - 1) ? ceilf(w_in) : W - 1;
const float u = w_in - li;
const int offset = n * H;
const float tl = LOADF(x, (((offset + ti) * W + li) * C + c));
const float tr = LOADF(x, (((offset + ti) * W + ri) * C + c));
const float bl = LOADF(x, (((offset + bi) * W + li) * C + c));
const float br = LOADF(x, (((offset + bi) * W + ri) * C + c));
const float t = tl + (tr - tl) * u; const float t = tl + (tr - tl) * u;
const float b = bl + (br - bl) * u; const float b = bl + (br - bl) * u;
y[yi] = __float2half(t + (b - t) * v); y[yi] = convert::To<T>(t + (b - t) * v);
} }
} }
...@@ -229,48 +147,8 @@ __global__ void _ResizeLinearGradNCHW( ...@@ -229,48 +147,8 @@ __global__ void _ResizeLinearGradNCHW(
const int ri = (w_in < W - 1) ? ceilf(w_in) : W - 1; const int ri = (w_in < W - 1) ? ceilf(w_in) : W - 1;
const float u = w_in - li; const float u = w_in - li;
const float dt = (1.f - v) * LOAD(dy, yi); const float dt = (1.f - v) * LDG(dy, yi);
const float db = v * LOAD(dy, yi); const float db = v * LDG(dy, yi);
const int offset = (n * C + c) * H;
atomicAdd(&dx[(offset + ti) * W + li], (1.f - u) * dt);
atomicAdd(&dx[(offset + ti) * W + ri], u * dt);
atomicAdd(&dx[(offset + bi) * W + li], (1.f - u) * db);
atomicAdd(&dx[(offset + bi) * W + ri], u * db);
}
}
template <>
__global__ void _ResizeLinearGradNCHW<half>(
const int nthreads,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const float scale_h,
const float scale_w,
const bool align_corners,
const half* dy,
float* dx) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int w = yi % out_w;
const int h = (yi / out_w) % out_h;
const int c = (yi / out_w / out_h) % C;
const int n = yi / out_w / out_h / C;
const float h_in = TransformCoordinate(h, scale_h, align_corners);
const int ti = floorf(h_in);
const int bi = (h_in < H - 1) ? ceilf(h_in) : H - 1;
const float v = h_in - ti;
const float w_in = TransformCoordinate(w, scale_w, align_corners);
const int li = floorf(w_in);
const int ri = (w_in < W - 1) ? ceilf(w_in) : W - 1;
const float u = w_in - li;
const float dt = (1.f - v) * LOADF(dy, yi);
const float db = v * LOADF(dy, yi);
const int offset = (n * C + c) * H; const int offset = (n * C + c) * H;
atomicAdd(&dx[(offset + ti) * W + li], (1.f - u) * dt); atomicAdd(&dx[(offset + ti) * W + li], (1.f - u) * dt);
...@@ -309,48 +187,8 @@ __global__ void _ResizeLinearGradNHWC( ...@@ -309,48 +187,8 @@ __global__ void _ResizeLinearGradNHWC(
const int ri = (w_in < W - 1) ? ceilf(w_in) : W - 1; const int ri = (w_in < W - 1) ? ceilf(w_in) : W - 1;
const float u = w_in - li; const float u = w_in - li;
const float dt = (1.f - v) * LOAD(dy, yi); const float dt = (1.f - v) * LDG(dy, yi);
const float db = v * LOAD(dy, yi); const float db = v * LDG(dy, yi);
const int offset = n * H;
atomicAdd(&dx[((offset + ti) * W + li) * C + c], (1.f - u) * dt);
atomicAdd(&dx[((offset + ti) * W + ri) * C + c], u * dt);
atomicAdd(&dx[((offset + bi) * W + li) * C + c], (1.f - u) * db);
atomicAdd(&dx[((offset + bi) * W + ri) * C + c], u * db);
}
}
template <>
__global__ void _ResizeLinearGradNHWC<half>(
const int nthreads,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const float scale_h,
const float scale_w,
const bool align_corners,
const half* dy,
float* dx) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int c = yi % C;
const int w = (yi / C) % out_w;
const int h = (yi / C / out_w) % out_h;
const int n = yi / C / out_w / out_h;
const float h_in = TransformCoordinate(h, scale_h, align_corners);
const int ti = floorf(h_in);
const int bi = (h_in < H - 1) ? ceilf(h_in) : H - 1;
const float v = h_in - ti;
const float w_in = TransformCoordinate(w, scale_w, align_corners);
const int li = floorf(w_in);
const int ri = (w_in < W - 1) ? ceilf(w_in) : W - 1;
const float u = w_in - li;
const float dt = (1.f - v) * LOADF(dy, yi);
const float db = v * LOADF(dy, yi);
const int offset = n * H; const int offset = n * H;
atomicAdd(&dx[((offset + ti) * W + li) * C + c], (1.f - u) * dt); atomicAdd(&dx[((offset + ti) * W + li) * C + c], (1.f - u) * dt);
...@@ -360,125 +198,22 @@ __global__ void _ResizeLinearGradNHWC<half>( ...@@ -360,125 +198,22 @@ __global__ void _ResizeLinearGradNHWC<half>(
} }
} }
#undef LOAD #undef LDG
#undef LOADF
} // namespace } // namespace
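The backward kernels invert that interpolation: each upstream element dy[yi] is scattered to the same four corners with the same (1-u, u) x (1-v, v) weights, and atomicAdd is required because adjacent output pixels map to overlapping corner sets. The weight algebra on its own (the device code uses CUDA atomicAdd in place of the plain += here):

// Distribute one upstream gradient g across the four bilinear corners.
void ScatterBilinearGrad(float g, float u, float v,
                         float& tl, float& tr, float& bl, float& br) {
  const float dt = (1.f - v) * g;  // share assigned to the top row
  const float db = v * g;          // share assigned to the bottom row
  tl += (1.f - u) * dt;
  tr += u * dt;
  bl += (1.f - u) * db;
  br += u * db;                    // the four weights sum to g
}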
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
template <> #define DISPATCH_RESIZE_KERNEL(name, T, nblocks, nthreads, ...) \
void ResizeLinear<float16, CUDAContext>( if (data_format == "NCHW") { \
const int N, name##NCHW<<<nblocks, nthreads, 0, ctx->cuda_stream()>>>(__VA_ARGS__); \
const int C, } else if (data_format == "NHWC") { \
const int H, name##NHWC<<<nblocks, nthreads, 0, ctx->cuda_stream()>>>(__VA_ARGS__); \
const int W, } else { \
const int out_h, LOG(FATAL) << "Unknown DataFormat: " << data_format; \
const int out_w,
const bool align_corners,
const string& data_format,
const float16* x,
float16* y,
CUDAContext* ctx) {
auto nthreads = N * C * out_h * out_w;
auto scale_h = ComputeScale(H, out_h, align_corners);
auto scale_w = ComputeScale(W, out_w, align_corners);
if (data_format == "NCHW") {
_ResizeLinearNCHW<<<
CUDA_BLOCKS(nthreads),
CUDA_THREADS,
0,
ctx->cuda_stream()>>>(
nthreads,
C,
H,
W,
out_h,
out_w,
scale_h,
scale_w,
align_corners,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
} else if (data_format == "NHWC") {
_ResizeLinearNHWC<<<
CUDA_BLOCKS(nthreads),
CUDA_THREADS,
0,
ctx->cuda_stream()>>>(
nthreads,
C,
H,
W,
out_h,
out_w,
scale_h,
scale_w,
align_corners,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
} else {
LOG(FATAL) << "Unknown data format: " << data_format;
}
}
template <>
void ResizeLinearGrad<float16, CUDAContext>(
const int N,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const bool align_corners,
const string& data_format,
const float16* dy,
float* dx,
CUDAContext* ctx) {
auto nthreads = N * C * out_h * out_w;
auto scale_h = ComputeScale(H, out_h, align_corners);
auto scale_w = ComputeScale(W, out_w, align_corners);
math::Set(N * C * H * W, 0.f, dx, ctx);
if (data_format == "NCHW") {
_ResizeLinearGradNCHW<<<
CUDA_BLOCKS(nthreads),
CUDA_THREADS,
0,
ctx->cuda_stream()>>>(
nthreads,
C,
H,
W,
out_h,
out_w,
scale_h,
scale_w,
align_corners,
reinterpret_cast<const half*>(dy),
dx);
} else if (data_format == "NHWC") {
_ResizeLinearGradNHWC<<<
CUDA_BLOCKS(nthreads),
CUDA_THREADS,
0,
ctx->cuda_stream()>>>(
nthreads,
C,
H,
W,
out_h,
out_w,
scale_h,
scale_w,
align_corners,
reinterpret_cast<const half*>(dy),
dx);
} else {
LOG(FATAL) << "Unknown data format: " << data_format;
} }
}
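The per-format if/else ladder deleted above is what the new DISPATCH_RESIZE_KERNEL macro factors out: name##NCHW / name##NHWC token-pasting picks the kernel, and __VA_ARGS__ forwards its argument list unchanged. A compilable miniature of the same dispatch idea, with illustrative kernel names and a simplified launch configuration:

#include <string>

__global__ void ScaleNCHW(const int n, float* y) { /* layout-specific body */ }
__global__ void ScaleNHWC(const int n, float* y) { /* layout-specific body */ }

#define DISPATCH_RESIZE_KERNEL(name, nblocks, nthreads, ...) \
  if (data_format == "NCHW") {                               \
    name##NCHW<<<nblocks, nthreads>>>(__VA_ARGS__);          \
  } else if (data_format == "NHWC") {                        \
    name##NHWC<<<nblocks, nthreads>>>(__VA_ARGS__);          \
  }

void LaunchScale(const std::string& data_format, int n, float* y) {
  DISPATCH_RESIZE_KERNEL(Scale, (n + 255) / 256, 256, n, y);
}
#undef DISPATCH_RESIZE_KERNEL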
#define DEFINE_KERNEL_LAUNCHER(T) \ #define DEFINE_KERNEL_LAUNCHER(T, ScalarT) \
template <> \ template <> \
void ResizeLinear<T, CUDAContext>( \ void ResizeLinear<T, CUDAContext>( \
const int N, \ const int N, \
...@@ -495,29 +230,11 @@ void ResizeLinearGrad<float16, CUDAContext>( ...@@ -495,29 +230,11 @@ void ResizeLinearGrad<float16, CUDAContext>(
auto nthreads = N * C * out_h * out_w; \ auto nthreads = N * C * out_h * out_w; \
auto scale_h = ComputeScale(H, out_h, align_corners); \ auto scale_h = ComputeScale(H, out_h, align_corners); \
auto scale_w = ComputeScale(W, out_w, align_corners); \ auto scale_w = ComputeScale(W, out_w, align_corners); \
if (data_format == "NCHW") { \ DISPATCH_RESIZE_KERNEL( \
_ResizeLinearNCHW<<< \ _ResizeLinear, \
CUDA_BLOCKS(nthreads), \ ScalarT, \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>( \
nthreads, \
C, \
H, \
W, \
out_h, \
out_w, \
scale_h, \
scale_w, \
align_corners, \
x, \
y); \
} else if (data_format == "NHWC") { \
_ResizeLinearNHWC<<< \
CUDA_BLOCKS(nthreads), \ CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \ CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>( \
nthreads, \ nthreads, \
C, \ C, \
H, \ H, \
...@@ -527,14 +244,11 @@ void ResizeLinearGrad<float16, CUDAContext>( ...@@ -527,14 +244,11 @@ void ResizeLinearGrad<float16, CUDAContext>(
scale_h, \ scale_h, \
scale_w, \ scale_w, \
align_corners, \ align_corners, \
x, \ reinterpret_cast<const ScalarT*>(x), \
y); \ reinterpret_cast<ScalarT*>(y)); \
} else { \
LOG(FATAL) << "Unknown data format: " << data_format; \
} \
} }
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \ #define DEFINE_GRAD_KERNEL_LAUNCHER(T, ScalarT) \
template <> \ template <> \
void ResizeLinearGrad<T, CUDAContext>( \ void ResizeLinearGrad<T, CUDAContext>( \
const int N, \ const int N, \
...@@ -552,12 +266,11 @@ void ResizeLinearGrad<float16, CUDAContext>( ...@@ -552,12 +266,11 @@ void ResizeLinearGrad<float16, CUDAContext>(
auto scale_h = ComputeScale(H, out_h, align_corners); \ auto scale_h = ComputeScale(H, out_h, align_corners); \
auto scale_w = ComputeScale(W, out_w, align_corners); \ auto scale_w = ComputeScale(W, out_w, align_corners); \
math::Set(N* C* H* W, 0.f, dx, ctx); \ math::Set(N* C* H* W, 0.f, dx, ctx); \
if (data_format == "NCHW") { \ DISPATCH_RESIZE_KERNEL( \
_ResizeLinearGradNCHW<<< \ _ResizeLinearGrad, \
ScalarT, \
CUDA_BLOCKS(nthreads), \ CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \ CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>( \
nthreads, \ nthreads, \
C, \ C, \
H, \ H, \
...@@ -567,42 +280,23 @@ void ResizeLinearGrad<float16, CUDAContext>( ...@@ -567,42 +280,23 @@ void ResizeLinearGrad<float16, CUDAContext>(
scale_h, \ scale_h, \
scale_w, \ scale_w, \
align_corners, \ align_corners, \
dy, \ reinterpret_cast<const ScalarT*>(dy), \
dx); \ dx); \
} else if (data_format == "NHWC") { \
_ResizeLinearGradNHWC<<< \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>( \
nthreads, \
C, \
H, \
W, \
out_h, \
out_w, \
scale_h, \
scale_w, \
align_corners, \
dy, \
dx); \
} else { \
LOG(FATAL) << "Unknown data format: " << data_format; \
} \
} }
DEFINE_KERNEL_LAUNCHER(int8_t); DEFINE_KERNEL_LAUNCHER(int8_t, int8_t);
DEFINE_KERNEL_LAUNCHER(uint8_t); DEFINE_KERNEL_LAUNCHER(uint8_t, uint8_t);
DEFINE_KERNEL_LAUNCHER(int); DEFINE_KERNEL_LAUNCHER(int, int);
DEFINE_KERNEL_LAUNCHER(int64_t); DEFINE_KERNEL_LAUNCHER(int64_t, int64_t);
DEFINE_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(float16, half);
DEFINE_KERNEL_LAUNCHER(double); DEFINE_KERNEL_LAUNCHER(float, float);
DEFINE_KERNEL_LAUNCHER(double, double);
DEFINE_GRAD_KERNEL_LAUNCHER(float); DEFINE_GRAD_KERNEL_LAUNCHER(float16, half);
DEFINE_GRAD_KERNEL_LAUNCHER(double); DEFINE_GRAD_KERNEL_LAUNCHER(float, float);
DEFINE_GRAD_KERNEL_LAUNCHER(double, double);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER #undef DEFINE_GRAD_KERNEL_LAUNCHER
#undef DISPATCH_RESIZE_KERNEL
} // namespace kernel } // namespace kernel
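ComputeScale maps output coordinates back into input space. Its body is not shown in this diff, but the convention such resize ops commonly follow is scale = (in - 1) / (out - 1) when align_corners is set, so the first and last samples coincide exactly, and scale = in / out otherwise; a sketch under that assumption:

// Assumed align_corners convention for resize scales.
inline float ComputeScaleRef(int in, int out, bool align_corners) {
  if (align_corners) return out > 1 ? float(in - 1) / float(out - 1) : 0.f;
  return float(in) / float(out);
}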
......
...@@ -181,10 +181,8 @@ DEFINE_KERNEL_LAUNCHER(int64_t); ...@@ -181,10 +181,8 @@ DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float16); DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double); DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float); DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double); DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
...@@ -10,6 +10,14 @@ namespace kernel { ...@@ -10,6 +10,14 @@ namespace kernel {
namespace { namespace {
#if __CUDA_ARCH__ >= 350
#define LDG(x, i) __ldg(x + i)
#define LDG2(x, i) convert::To<float>(__ldg(x + i))
#else
#define LDG(x, i) x[i]
#define LDG2(x, i) convert::To<float>(x[i])
#endif
template <typename T> template <typename T>
__global__ void _ResizeNearestNCHW( __global__ void _ResizeNearestNCHW(
const int nthreads, const int nthreads,
...@@ -29,11 +37,7 @@ __global__ void _ResizeNearestNCHW( ...@@ -29,11 +37,7 @@ __global__ void _ResizeNearestNCHW(
const int n = yi / out_w / out_h / C; const int n = yi / out_w / out_h / C;
const int h_in = min(int(h * scale_h), H - 1); const int h_in = min(int(h * scale_h), H - 1);
const int w_in = min(int(w * scale_w), W - 1); const int w_in = min(int(w * scale_w), W - 1);
#if __CUDA_ARCH__ >= 350 y[yi] = LDG(x, (((n * C + c) * H + h_in) * W + w_in));
y[yi] = __ldg(x + (((n * C + c) * H + h_in) * W + w_in));
#else
y[yi] = x[((n * C + c) * H + h_in) * W + w_in];
#endif
} }
} }
...@@ -56,11 +60,7 @@ __global__ void _ResizeNearestNHWC( ...@@ -56,11 +60,7 @@ __global__ void _ResizeNearestNHWC(
const int n = yi / C / out_w / out_h; const int n = yi / C / out_w / out_h;
const int h_in = min(int(h * scale_h), H - 1); const int h_in = min(int(h * scale_h), H - 1);
const int w_in = min(int(w * scale_w), W - 1); const int w_in = min(int(w * scale_w), W - 1);
#if __CUDA_ARCH__ >= 350 y[yi] = LDG(x, (((n * H + h_in) * W + w_in) * C + c));
y[yi] = __ldg(x + (((n * H + h_in) * W + w_in) * C + c));
#else
y[yi] = x[((n * H + h_in) * W + w_in) * C + c];
#endif
} }
} }
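Nearest-neighbor resize needs no interpolation at all: each output pixel reads the input pixel its scaled coordinate truncates to, clamped at the border, which is why these kernels reduce to a single LDG per element. The index mapping alone:

#include <algorithm>

// Map an output index to its nearest-neighbor source index.
inline int NearestIndex(int out_idx, float scale, int in_size) {
  return std::min(int(out_idx * scale), in_size - 1);  // truncate + clamp
}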
...@@ -83,39 +83,7 @@ __global__ void _ResizeNearestGradNCHW( ...@@ -83,39 +83,7 @@ __global__ void _ResizeNearestGradNCHW(
const int n = yi / out_w / out_h / C; const int n = yi / out_w / out_h / C;
const int h_in = min(int(h * scale_h), H - 1); const int h_in = min(int(h * scale_h), H - 1);
const int w_in = min(int(w * scale_w), W - 1); const int w_in = min(int(w * scale_w), W - 1);
#if __CUDA_ARCH__ >= 350 atomicAdd(&dx[((n * C + c) * H + h_in) * W + w_in], LDG2(dy, yi));
atomicAdd(&dx[((n * C + c) * H + h_in) * W + w_in], (float)__ldg(dy + yi));
#else
atomicAdd(&dx[((n * C + c) * H + h_in) * W + w_in], (float)dy[yi]);
#endif
}
}
template <>
__global__ void _ResizeNearestGradNCHW<half>(
const int nthreads,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const float scale_h,
const float scale_w,
const half* dy,
float* dx) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int w = yi % out_w;
const int h = (yi / out_w) % out_h;
const int c = (yi / out_w / out_h) % C;
const int n = yi / out_w / out_h / C;
const int h_in = min(int(h * scale_h), H - 1);
const int w_in = min(int(w * scale_w), W - 1);
#if __CUDA_ARCH__ >= 350
atomicAdd(
&dx[((n * C + c) * H + h_in) * W + w_in], __half2float(__ldg(dy + yi)));
#else
atomicAdd(&dx[((n * C + c) * H + h_in) * W + w_in], __half2float(dy[yi]));
#endif
} }
} }
...@@ -138,152 +106,27 @@ __global__ void _ResizeNearestGradNHWC( ...@@ -138,152 +106,27 @@ __global__ void _ResizeNearestGradNHWC(
const int n = yi / C / out_w / out_h; const int n = yi / C / out_w / out_h;
const int h_in = min(int(h * scale_h), H - 1); const int h_in = min(int(h * scale_h), H - 1);
const int w_in = min(int(w * scale_w), W - 1); const int w_in = min(int(w * scale_w), W - 1);
#if __CUDA_ARCH__ >= 350 atomicAdd(&dx[((n * H + h_in) * W + w_in) * C + c], LDG2(dy, yi));
atomicAdd(&dx[((n * H + h_in) * W + w_in) * C + c], (float)__ldg(dy + yi));
#else
atomicAdd(&dx[((n * H + h_in) * W + w_in) * C + c], (float)dy[yi]);
#endif
} }
} }
template <> #undef LDG
__global__ void _ResizeNearestGradNHWC<half>( #undef LDG2
const int nthreads,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const float scale_h,
const float scale_w,
const half* dy,
float* dx) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int c = yi % C;
const int w = (yi / C) % out_w;
const int h = (yi / C / out_w) % out_h;
const int n = yi / C / out_w / out_h;
const int h_in = min(int(h * scale_h), H - 1);
const int w_in = min(int(w * scale_w), W - 1);
#if __CUDA_ARCH__ >= 350
atomicAdd(
&dx[((n * H + h_in) * W + w_in) * C + c], __half2float(__ldg(dy + yi)));
#else
atomicAdd(&dx[((n * H + h_in) * W + w_in) * C + c], __half2float(dy[yi]));
#endif
}
}
} // namespace } // namespace
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
template <> #define DISPATCH_RESIZE_KERNEL(name, T, nblocks, nthreads, ...) \
void ResizeNearest<float16, CUDAContext>( if (data_format == "NCHW") { \
const int N, name##NCHW<<<nblocks, nthreads, 0, ctx->cuda_stream()>>>(__VA_ARGS__); \
const int C, } else if (data_format == "NHWC") { \
const int H, name##NHWC<<<nblocks, nthreads, 0, ctx->cuda_stream()>>>(__VA_ARGS__); \
const int W, } else { \
const int out_h, LOG(FATAL) << "Unknown DataFormat: " << data_format; \
const int out_w,
const string& data_format,
const float16* x,
float16* y,
CUDAContext* ctx) {
auto nthreads = N * C * out_h * out_w;
auto scale_h = (float)H / (float)out_h;
auto scale_w = (float)W / (float)out_w;
if (data_format == "NCHW") {
_ResizeNearestNCHW<<<
CUDA_BLOCKS(nthreads),
CUDA_THREADS,
0,
ctx->cuda_stream()>>>(
nthreads,
C,
H,
W,
out_h,
out_w,
scale_h,
scale_w,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
} else if (data_format == "NHWC") {
_ResizeNearestNHWC<<<
CUDA_BLOCKS(nthreads),
CUDA_THREADS,
0,
ctx->cuda_stream()>>>(
nthreads,
C,
H,
W,
out_h,
out_w,
scale_h,
scale_w,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
} else {
LOG(FATAL) << "Unknown data format: " << data_format;
}
}
template <>
void ResizeNearestGrad<float16, CUDAContext>(
const int N,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const string& data_format,
const float16* dy,
float* dx,
CUDAContext* ctx) {
auto nthreads = N * C * out_h * out_w;
auto scale_h = (float)H / (float)out_h;
auto scale_w = (float)W / (float)out_w;
math::Set(N * C * H * W, 0.f, dx, ctx);
if (data_format == "NCHW") {
_ResizeNearestGradNCHW<<<
CUDA_BLOCKS(nthreads),
CUDA_THREADS,
0,
ctx->cuda_stream()>>>(
nthreads,
C,
H,
W,
out_h,
out_w,
scale_h,
scale_w,
reinterpret_cast<const half*>(dy),
dx);
} else if (data_format == "NHWC") {
_ResizeNearestGradNHWC<<<
CUDA_BLOCKS(nthreads),
CUDA_THREADS,
0,
ctx->cuda_stream()>>>(
nthreads,
C,
H,
W,
out_h,
out_w,
scale_h,
scale_w,
reinterpret_cast<const half*>(dy),
dx);
} else {
LOG(FATAL) << "Unknown data format: " << data_format;
} }
}
#define DEFINE_KERNEL_LAUNCHER(T) \ #define DEFINE_KERNEL_LAUNCHER(T, ScalarT) \
template <> \ template <> \
void ResizeNearest<T, CUDAContext>( \ void ResizeNearest<T, CUDAContext>( \
const int N, \ const int N, \
...@@ -299,26 +142,24 @@ void ResizeNearestGrad<float16, CUDAContext>( ...@@ -299,26 +142,24 @@ void ResizeNearestGrad<float16, CUDAContext>(
auto nthreads = N * C * out_h * out_w; \ auto nthreads = N * C * out_h * out_w; \
auto scale_h = (float)H / (float)out_h; \ auto scale_h = (float)H / (float)out_h; \
auto scale_w = (float)W / (float)out_w; \ auto scale_w = (float)W / (float)out_w; \
if (data_format == "NCHW") { \ DISPATCH_RESIZE_KERNEL( \
_ResizeNearestNCHW<<< \ _ResizeNearest, \
ScalarT, \
CUDA_BLOCKS(nthreads), \ CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \ CUDA_THREADS, \
0, \ nthreads, \
ctx->cuda_stream()>>>( \ C, \
nthreads, C, H, W, out_h, out_w, scale_h, scale_w, x, y); \ H, \
} else if (data_format == "NHWC") { \ W, \
_ResizeNearestNHWC<<< \ out_h, \
CUDA_BLOCKS(nthreads), \ out_w, \
CUDA_THREADS, \ scale_h, \
0, \ scale_w, \
ctx->cuda_stream()>>>( \ reinterpret_cast<const ScalarT*>(x), \
nthreads, C, H, W, out_h, out_w, scale_h, scale_w, x, y); \ reinterpret_cast<ScalarT*>(y)); \
} else { \
LOG(FATAL) << "Unknown data format: " << data_format; \
} \
} }
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \ #define DEFINE_GRAD_KERNEL_LAUNCHER(T, ScalarT) \
template <> \ template <> \
void ResizeNearestGrad<T, CUDAContext>( \ void ResizeNearestGrad<T, CUDAContext>( \
const int N, \ const int N, \
...@@ -335,37 +176,36 @@ void ResizeNearestGrad<float16, CUDAContext>( ...@@ -335,37 +176,36 @@ void ResizeNearestGrad<float16, CUDAContext>(
auto scale_h = (float)H / (float)out_h; \ auto scale_h = (float)H / (float)out_h; \
auto scale_w = (float)W / (float)out_w; \ auto scale_w = (float)W / (float)out_w; \
math::Set(N* C* H* W, 0.f, dx, ctx); \ math::Set(N* C* H* W, 0.f, dx, ctx); \
if (data_format == "NCHW") { \ DISPATCH_RESIZE_KERNEL( \
_ResizeNearestGradNCHW<<< \ _ResizeNearestGrad, \
ScalarT, \
CUDA_BLOCKS(nthreads), \ CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \ CUDA_THREADS, \
0, \ nthreads, \
ctx->cuda_stream()>>>( \ C, \
nthreads, C, H, W, out_h, out_w, scale_h, scale_w, dy, dx); \ H, \
} else if (data_format == "NHWC") { \ W, \
_ResizeNearestGradNHWC<<< \ out_h, \
CUDA_BLOCKS(nthreads), \ out_w, \
CUDA_THREADS, \ scale_h, \
0, \ scale_w, \
ctx->cuda_stream()>>>( \ reinterpret_cast<const ScalarT*>(dy), \
nthreads, C, H, W, out_h, out_w, scale_h, scale_w, dy, dx); \ dx); \
} else { \
LOG(FATAL) << "Unknown data format: " << data_format; \
} \
} }
DEFINE_KERNEL_LAUNCHER(int8_t); DEFINE_KERNEL_LAUNCHER(int8_t, int8_t);
DEFINE_KERNEL_LAUNCHER(uint8_t); DEFINE_KERNEL_LAUNCHER(uint8_t, uint8_t);
DEFINE_KERNEL_LAUNCHER(int); DEFINE_KERNEL_LAUNCHER(int, int);
DEFINE_KERNEL_LAUNCHER(int64_t); DEFINE_KERNEL_LAUNCHER(int64_t, int64_t);
DEFINE_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(float16, half);
DEFINE_KERNEL_LAUNCHER(double); DEFINE_KERNEL_LAUNCHER(float, float);
DEFINE_KERNEL_LAUNCHER(double, double);
DEFINE_GRAD_KERNEL_LAUNCHER(float); DEFINE_GRAD_KERNEL_LAUNCHER(float16, half);
DEFINE_GRAD_KERNEL_LAUNCHER(double); DEFINE_GRAD_KERNEL_LAUNCHER(float, float);
DEFINE_GRAD_KERNEL_LAUNCHER(double, double);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER #undef DEFINE_GRAD_KERNEL_LAUNCHER
#undef DISPATCH_RESIZE_KERNEL
} // namespace kernel } // namespace kernel
......
...@@ -185,11 +185,9 @@ void RoiAlign<float16, CPUContext>( ...@@ -185,11 +185,9 @@ void RoiAlign<float16, CPUContext>(
DEFINE_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double); DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float16); DEFINE_GRAD_KERNEL_LAUNCHER(float16);
DEFINE_GRAD_KERNEL_LAUNCHER(float); DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double); DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
...@@ -10,11 +10,9 @@ namespace kernel { ...@@ -10,11 +10,9 @@ namespace kernel {
namespace { namespace {
#if __CUDA_ARCH__ >= 350 #if __CUDA_ARCH__ >= 350
#define LOAD(x, i) __ldg(x + i) #define LDG(x, i) convert::To<float>(__ldg(x + i))
#define LOADF(x, i) __half2float(__ldg(x + i))
#else #else
#define LOAD(x, i) x[i] #define LDG(x, i) convert::To<float>(x[i])
#define LOADF(x, i) __half2float(x[i])
#endif #endif
template <typename T> template <typename T>
...@@ -42,48 +40,10 @@ _RoiAlignIntp(const int H, const int W, float h, float w, const T* x) { ...@@ -42,48 +40,10 @@ _RoiAlignIntp(const int H, const int W, float h, float w, const T* x) {
w = (float)li; w = (float)li;
} }
const float tl = LOAD(x, (ti * W + li)); const float tl = LDG(x, (ti * W + li));
const float tr = LOAD(x, (ti * W + ri)); const float tr = LDG(x, (ti * W + ri));
const float bl = LOAD(x, (bi * W + li)); const float bl = LDG(x, (bi * W + li));
const float br = LOAD(x, (bi * W + ri)); const float br = LDG(x, (bi * W + ri));
const float v = h - ti;
const float u = w - li;
const float t = tl + (tr - tl) * u;
const float b = bl + (br - bl) * u;
return t + (b - t) * v;
}
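_RoiAlignIntp is the same bilinear lerp as in the resize kernels, plus RoiAlign's border policy: samples more than one pixel outside the feature map contribute zero, samples in the [-1, 0] band are snapped onto the border, and the bottom/right neighbor indices are pinned so they never leave the map. That clamping logic, isolated for one coordinate:

// Border handling for one RoiAlign sample coordinate (len = H or W).
// Returns false when the sample is too far outside to contribute.
inline bool ClampSample(float& h, int len, int& lo, int& hi) {
  if (h < -1.f || h > float(len)) return false;  // fully outside: zero
  if (h <= 0.f) h = 0.f;                         // snap the [-1, 0] band
  lo = int(h);
  if (lo < len - 1) {
    hi = lo + 1;
  } else {
    lo = hi = len - 1;  // pin both neighbors to the last row/column
    h = float(lo);
  }
  return true;
}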
template <>
__device__ float
_RoiAlignIntp<half>(const int H, const int W, float h, float w, const half* x) {
if (h < -1.f || h > H || w < -1.f || w > W) return 0.f;
if (h <= 0.f) h = 0.f;
if (w <= 0.f) w = 0.f;
int ti = (int)h, bi;
int li = (int)w, ri;
if (ti < H - 1) {
bi = ti + 1;
} else {
ti = bi = H - 1;
h = (float)ti;
}
if (li < W - 1) {
ri = li + 1;
} else {
ri = li = W - 1;
w = (float)li;
}
const float tl = LOADF(x, (ti * W + li));
const float tr = LOADF(x, (ti * W + ri));
const float bl = LOADF(x, (bi * W + li));
const float br = LOADF(x, (bi * W + ri));
const float v = h - ti; const float v = h - ti;
const float u = w - li; const float u = w - li;
...@@ -133,7 +93,7 @@ __device__ void _RoiAlignIntpParam( ...@@ -133,7 +93,7 @@ __device__ void _RoiAlignIntpParam(
u = w - li; u = w - li;
} }
template <typename T> template <typename T, typename AccT>
__global__ void _RoiAlign( __global__ void _RoiAlign(
const int nthreads, const int nthreads,
const int C, const int C,
...@@ -156,7 +116,7 @@ __global__ void _RoiAlign( ...@@ -156,7 +116,7 @@ __global__ void _RoiAlign(
const int batch_ind = roi[0]; const int batch_ind = roi[0];
if (batch_ind < 0) { if (batch_ind < 0) {
y[yi] = T(0); y[yi] = convert::To<T>(0.f);
continue; continue;
} }
...@@ -180,67 +140,7 @@ __global__ void _RoiAlign( ...@@ -180,67 +140,7 @@ __global__ void _RoiAlign(
const T* offset_x = x + (batch_ind * C + c) * H * W; const T* offset_x = x + (batch_ind * C + c) * H * W;
T val = T(0); AccT val = AccT(0);
for (int i = 0; i < grid_h; i++) {
const float h = hstart + (i + .5f) * bin_h / grid_h;
for (int j = 0; j < grid_w; j++) {
const float w = wstart + (j + .5f) * bin_w / grid_w;
val += _RoiAlignIntp(H, W, h, w, offset_x);
}
}
y[yi] = val / T(grid_h * grid_w);
}
}
template <>
__global__ void _RoiAlign<half>(
const int nthreads,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const float spatial_scale,
const int sampling_ratio,
const half* x,
const float* rois,
half* y) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int ow = yi % out_w;
const int oh = (yi / out_w) % out_h;
const int c = (yi / out_w / out_h) % C;
const int n = yi / out_w / out_h / C;
const float* roi = rois + n * 5;
const int batch_ind = roi[0];
if (batch_ind < 0) {
y[yi] = __float2half(0.f);
continue;
}
const float roi_start_w = roi[1] * spatial_scale;
const float roi_start_h = roi[2] * spatial_scale;
const float roi_end_w = roi[3] * spatial_scale;
const float roi_end_h = roi[4] * spatial_scale;
const float roi_w = max(roi_end_w - roi_start_w, 1.f);
const float roi_h = max(roi_end_h - roi_start_h, 1.f);
const float bin_h = roi_h / (float)out_h;
const float bin_w = roi_w / (float)out_w;
const int grid_h =
sampling_ratio > 0 ? sampling_ratio : ceil(roi_h / out_h);
const int grid_w =
sampling_ratio > 0 ? sampling_ratio : ceil(roi_w / out_w);
const float hstart = roi_start_h + oh * bin_h;
const float wstart = roi_start_w + ow * bin_w;
const half* offset_x = x + (batch_ind * C + c) * H * W;
float val = 0.f;
for (int i = 0; i < grid_h; i++) { for (int i = 0; i < grid_h; i++) {
const float h = hstart + (i + .5f) * bin_h / grid_h; const float h = hstart + (i + .5f) * bin_h / grid_h;
for (int j = 0; j < grid_w; j++) { for (int j = 0; j < grid_w; j++) {
...@@ -249,11 +149,11 @@ __global__ void _RoiAlign<half>( ...@@ -249,11 +149,11 @@ __global__ void _RoiAlign<half>(
} }
} }
y[yi] = __float2half(val / float(grid_h * grid_w)); y[yi] = convert::To<T>(val / AccT(grid_h * grid_w));
} }
} }
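Each RoiAlign output bin averages a small grid of bilinear samples. A positive sampling_ratio fixes the grid edge explicitly; otherwise it adapts to ceil(bin size), which keeps roughly one sample per covered input pixel. The sample-placement arithmetic by itself:

#include <cmath>

// Samples along one bin edge, and the center of the k-th sub-cell.
inline int GridSize(int sampling_ratio, float roi_len, int out_len) {
  return sampling_ratio > 0 ? sampling_ratio
                            : int(std::ceil(roi_len / out_len));
}
inline float SampleCoord(float start, float bin, int k, int grid) {
  return start + (k + .5f) * bin / grid;  // sub-cell centers, not edges
}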
template <typename T> template <typename T, typename AccT>
__global__ void _RoiAlignGrad( __global__ void _RoiAlignGrad(
const int nthreads, const int nthreads,
const int C, const int C,
...@@ -265,7 +165,7 @@ __global__ void _RoiAlignGrad( ...@@ -265,7 +165,7 @@ __global__ void _RoiAlignGrad(
const int sampling_ratio, const int sampling_ratio,
const T* dy, const T* dy,
const float* rois, const float* rois,
float* dx) { AccT* dx) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) { CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int ow = yi % out_w; const int ow = yi % out_w;
const int oh = (yi / out_w) % out_h; const int oh = (yi / out_w) % out_h;
...@@ -295,7 +195,7 @@ __global__ void _RoiAlignGrad( ...@@ -295,7 +195,7 @@ __global__ void _RoiAlignGrad(
const float hstart = roi_start_h + oh * bin_h; const float hstart = roi_start_h + oh * bin_h;
const float wstart = roi_start_w + ow * bin_w; const float wstart = roi_start_w + ow * bin_w;
const float dyi = dy[yi] / T(grid_h * grid_w); const float dyi = convert::To<float>(dy[yi]) / float(grid_h * grid_w);
float* offset_dx = dx + (batch_ind * C + c) * H * W; float* offset_dx = dx + (batch_ind * C + c) * H * W;
for (int i = 0; i < grid_h; i++) { for (int i = 0; i < grid_h; i++) {
...@@ -318,137 +218,13 @@ __global__ void _RoiAlignGrad( ...@@ -318,137 +218,13 @@ __global__ void _RoiAlignGrad(
} }
} }
template <> #undef LDG
__global__ void _RoiAlignGrad<half>(
const int nthreads,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const float spatial_scale,
const int sampling_ratio,
const half* dy,
const float* rois,
float* dx) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int ow = yi % out_w;
const int oh = (yi / out_w) % out_h;
const int c = (yi / out_w / out_h) % C;
const int n = yi / out_w / out_h / C;
const float* roi = rois + n * 5;
const int batch_ind = roi[0];
if (batch_ind < 0) continue;
const float roi_start_w = roi[1] * spatial_scale;
const float roi_start_h = roi[2] * spatial_scale;
const float roi_end_w = roi[3] * spatial_scale;
const float roi_end_h = roi[4] * spatial_scale;
const float roi_w = max(roi_end_w - roi_start_w, 1.f);
const float roi_h = max(roi_end_h - roi_start_h, 1.f);
const float bin_h = roi_h / (float)out_h;
const float bin_w = roi_w / (float)out_w;
const int grid_h =
sampling_ratio > 0 ? sampling_ratio : ceil(roi_h / out_h);
const int grid_w =
sampling_ratio > 0 ? sampling_ratio : ceil(roi_w / out_w);
const float hstart = roi_start_h + oh * bin_h;
const float wstart = roi_start_w + ow * bin_w;
const float dyi = __half2float(dy[yi]) / float(grid_h * grid_w);
float* offset_dx = dx + (batch_ind * C + c) * H * W;
for (int i = 0; i < grid_h; i++) {
const float h = hstart + (i + .5f) * bin_h / grid_h;
for (int j = 0; j < grid_w; j++) {
const float w = wstart + (j + .5f) * bin_w / grid_w;
int ti, bi, li, ri;
float v, u;
_RoiAlignIntpParam(H, W, h, w, ti, bi, li, ri, v, u);
if (li >= 0 && ri >= 0 && ti >= 0 && bi >= 0) {
const float db = dyi * v;
const float dt = dyi * (1.f - v);
atomicAdd(offset_dx + ti * W + li, (1.f - u) * dt);
atomicAdd(offset_dx + ti * W + ri, u * dt);
atomicAdd(offset_dx + bi * W + li, (1.f - u) * db);
atomicAdd(offset_dx + bi * W + ri, u * db);
}
} // End j
} // End i
}
}
#undef LOAD
#undef LOADF
} // namespace } // namespace
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
template <> #define DEFINE_KERNEL_LAUNCHER(T, ScalarT) \
void RoiAlign<float16, CUDAContext>(
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const int num_rois,
const float spatial_scale,
const int sampling_ratio,
const float16* x,
const float* rois,
float16* y,
CUDAContext* ctx) {
auto nthreads = num_rois * C * out_h * out_w;
_RoiAlign<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
nthreads,
C,
H,
W,
out_h,
out_w,
spatial_scale,
sampling_ratio,
reinterpret_cast<const half*>(x),
rois,
reinterpret_cast<half*>(y));
}
template <>
void RoiAlignGrad<float16, CUDAContext>(
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const int num_rois,
const float spatial_scale,
const int sampling_ratio,
const float16* dy,
const float* rois,
float* dx,
CUDAContext* ctx) {
auto nthreads = num_rois * C * out_h * out_w;
_RoiAlignGrad<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
nthreads,
C,
H,
W,
out_h,
out_w,
spatial_scale,
sampling_ratio,
reinterpret_cast<const half*>(dy),
rois,
dx);
}
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \ template <> \
void RoiAlign<T, CUDAContext>( \ void RoiAlign<T, CUDAContext>( \
const int C, \ const int C, \
...@@ -464,7 +240,8 @@ void RoiAlignGrad<float16, CUDAContext>( ...@@ -464,7 +240,8 @@ void RoiAlignGrad<float16, CUDAContext>(
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
auto nthreads = num_rois * C * out_h * out_w; \ auto nthreads = num_rois * C * out_h * out_w; \
_RoiAlign<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ _RoiAlign<ScalarT, float> \
<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
nthreads, \ nthreads, \
C, \ C, \
H, \ H, \
...@@ -473,12 +250,12 @@ void RoiAlignGrad<float16, CUDAContext>( ...@@ -473,12 +250,12 @@ void RoiAlignGrad<float16, CUDAContext>(
out_w, \ out_w, \
spatial_scale, \ spatial_scale, \
sampling_ratio, \ sampling_ratio, \
x, \ reinterpret_cast<const ScalarT*>(x), \
rois, \ rois, \
y); \ reinterpret_cast<ScalarT*>(y)); \
} }
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \ #define DEFINE_GRAD_KERNEL_LAUNCHER(T, ScalarT) \
template <> \ template <> \
void RoiAlignGrad<T, CUDAContext>( \ void RoiAlignGrad<T, CUDAContext>( \
const int C, \ const int C, \
...@@ -507,17 +284,17 @@ void RoiAlignGrad<float16, CUDAContext>( ...@@ -507,17 +284,17 @@ void RoiAlignGrad<float16, CUDAContext>(
out_w, \ out_w, \
spatial_scale, \ spatial_scale, \
sampling_ratio, \ sampling_ratio, \
dy, \ reinterpret_cast<const ScalarT*>(dy), \
rois, \ rois, \
dx); \ dx); \
} }
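The (T, ScalarT) pairs used below exist because the framework's host-side float16 and CUDA's device-side half are distinct types with identical storage, so the launchers reinterpret_cast at the boundary and the kernel templates are written purely against the device type. The idea in miniature, assuming a bit-compatible 16-bit wrapper:

#include <cstdint>
#include <cuda_fp16.h>

struct float16 { std::uint16_t x; };  // assumed host-side storage type
static_assert(sizeof(float16) == sizeof(__half), "must be layout-compatible");

__global__ void HalveKernel(const int n, const __half* x, __half* y) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  // Round-trip through float so the kernel also builds for pre-5.3 archs.
  if (i < n) y[i] = __float2half(__half2float(x[i]) * 0.5f);
}

void Halve(const int n, const float16* x, float16* y) {
  HalveKernel<<<(n + 255) / 256, 256>>>(
      n, reinterpret_cast<const __half*>(x), reinterpret_cast<__half*>(y));
}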
DEFINE_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(float16, half);
DEFINE_KERNEL_LAUNCHER(double); DEFINE_KERNEL_LAUNCHER(float, float);
DEFINE_KERNEL_LAUNCHER(double, double);
DEFINE_GRAD_KERNEL_LAUNCHER(float); DEFINE_GRAD_KERNEL_LAUNCHER(float16, half);
DEFINE_GRAD_KERNEL_LAUNCHER(double); DEFINE_GRAD_KERNEL_LAUNCHER(float, float);
DEFINE_GRAD_KERNEL_LAUNCHER(double, double);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
...@@ -149,11 +149,9 @@ void RoiPool<float16, CPUContext>( ...@@ -149,11 +149,9 @@ void RoiPool<float16, CPUContext>(
DEFINE_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double); DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float16); DEFINE_GRAD_KERNEL_LAUNCHER(float16);
DEFINE_GRAD_KERNEL_LAUNCHER(float); DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double); DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
...@@ -9,6 +9,12 @@ namespace kernel { ...@@ -9,6 +9,12 @@ namespace kernel {
namespace { namespace {
#if __CUDA_ARCH__ >= 350
#define LDG(x, i) __ldg(x + i)
#else
#define LDG(x, i) x[i]
#endif
template <typename T> template <typename T>
__global__ void _RoiPool( __global__ void _RoiPool(
const int nthreads, const int nthreads,
...@@ -22,6 +28,7 @@ __global__ void _RoiPool( ...@@ -22,6 +28,7 @@ __global__ void _RoiPool(
const float* rois, const float* rois,
int* mask, int* mask,
T* y) { T* y) {
auto Greater = math::GreaterFunctor<T>();
CUDA_1D_KERNEL_LOOP(yi, nthreads) { CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int ow = yi % out_w; const int ow = yi % out_w;
const int oh = (yi / out_w) % out_h; const int oh = (yi / out_w) % out_h;
...@@ -32,7 +39,7 @@ __global__ void _RoiPool( ...@@ -32,7 +39,7 @@ __global__ void _RoiPool(
const int batch_ind = roi[0]; const int batch_ind = roi[0];
if (batch_ind < 0) { if (batch_ind < 0) {
y[yi] = T(0); y[yi] = convert::To<T>(0.f);
mask[yi] = -1; mask[yi] = -1;
continue; continue;
} }
...@@ -60,119 +67,22 @@ __global__ void _RoiPool( ...@@ -60,119 +67,22 @@ __global__ void _RoiPool(
int max_idx = empty ? -1 : 0; int max_idx = empty ? -1 : 0;
const T* offset_x = x + (batch_ind * C + c) * H * W; const T* offset_x = x + (batch_ind * C + c) * H * W;
T val = empty ? T(0) : offset_x[0]; T val = empty ? convert::To<T>(0.f) : offset_x[0];
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
const int xi = h * W + w;
#if __CUDA_ARCH__ >= 350
if (__ldg(offset_x + xi) > val) {
val = __ldg(offset_x + xi);
max_idx = xi;
}
#else
if (offset_x[xi] > val) {
val = offset_x[xi];
max_idx = xi;
}
#endif
}
}
y[yi] = val;
mask[yi] = max_idx;
}
}
template <>
__global__ void _RoiPool<half>(
const int nthreads,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const float spatial_scale,
const half* x,
const float* rois,
int* mask,
half* y) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int ow = yi % out_w;
const int oh = (yi / out_w) % out_h;
const int c = (yi / out_w / out_h) % C;
const int n = yi / out_w / out_h / C;
const float* roi = rois + n * 5;
const int batch_ind = roi[0];
if (batch_ind < 0) {
y[yi] = __float2half(0.f);
mask[yi] = -1;
continue;
}
const int roi_start_w = round(roi[1] * spatial_scale);
const int roi_start_h = round(roi[2] * spatial_scale);
const int roi_end_w = round(roi[3] * spatial_scale);
const int roi_end_h = round(roi[4] * spatial_scale);
const int roi_w = max(roi_end_w - roi_start_w + 1, 1);
const int roi_h = max(roi_end_h - roi_start_h + 1, 1);
const float bin_h = (float)roi_h / (float)out_h;
const float bin_w = (float)roi_w / (float)out_w;
int hstart = floor(bin_h * oh);
int wstart = floor(bin_w * ow);
int hend = ceil(bin_h * (oh + 1));
int wend = ceil(bin_w * (ow + 1));
hstart = min(max(hstart + roi_start_h, 0), H);
hend = min(max(hend + roi_start_h, 0), H);
wstart = min(max(wstart + roi_start_w, 0), W);
wend = min(max(wend + roi_start_w, 0), W);
const bool empty = (hend <= hstart) || (wend <= wstart);
int max_idx = empty ? -1 : 0;
const half* offset_x = x + ((batch_ind * C + c) * H * W);
#if __CUDA_ARCH__ >= 530
half val = empty ? __float2half(0.f) : __ldg(offset_x);
#else
float val = empty ? 0.f : __half2float(*offset_x);
#endif
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
const int xi = h * W + w; const int xi = h * W + w;
#if __CUDA_ARCH__ >= 530 if (Greater(LDG(offset_x, xi), val)) {
if (__hgt(__ldg(offset_x + xi), val)) { val = LDG(offset_x, xi);
val = __ldg(offset_x + xi);
max_idx = xi; max_idx = xi;
} }
#elif __CUDA_ARCH__ >= 350
if (__half2float(__ldg(offset_x + xi)) > val) {
val = __half2float(__ldg(offset_x + xi));
max_idx = xi;
} }
#else
if (__half2float(offset_x[xi]) > val) {
val = __half2float(offset_x[xi]);
max_idx = xi;
}
#endif
} }
}
#if __CUDA_ARCH__ >= 530
y[yi] = val; y[yi] = val;
#else
y[yi] = __float2half(val);
#endif
mask[yi] = max_idx; mask[yi] = max_idx;
} }
} }
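_RoiPool is max pooling over the clipped bin window, and mask records the flat argmax for every output element; the backward pass below therefore never re-searches the window, it simply routes each dy[yi] to mask[yi] with one atomicAdd. The forward/backward contract in plain C++:

// Forward: max over a window, remembering the winning flat index.
inline float MaxWithArg(const float* x, int hstart, int hend,
                        int wstart, int wend, int W, int& max_idx) {
  const bool empty = hend <= hstart || wend <= wstart;
  max_idx = empty ? -1 : hstart * W + wstart;
  float val = empty ? 0.f : x[max_idx];
  for (int h = hstart; h < hend; ++h)
    for (int w = wstart; w < wend; ++w)
      if (x[h * W + w] > val) {
        val = x[h * W + w];
        max_idx = h * W + w;
      }
  return val;
}

// Backward: with the mask, the gradient is a pure scatter.
inline void PoolGradStep(float dy, int mask_idx, float* dx) {
  if (mask_idx != -1) dx[mask_idx] += dy;  // atomicAdd on device
}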
template <typename T> template <typename T, typename AccT>
__global__ void _RoiPoolGrad( __global__ void _RoiPoolGrad(
const int nthreads, const int nthreads,
const int C, const int C,
...@@ -184,7 +94,7 @@ __global__ void _RoiPoolGrad( ...@@ -184,7 +94,7 @@ __global__ void _RoiPoolGrad(
const T* dy, const T* dy,
const float* rois, const float* rois,
const int* mask, const int* mask,
float* dx) { AccT* dx) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) { CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int c = (yi / out_w / out_h) % C; const int c = (yi / out_w / out_h) % C;
const int n = yi / out_w / out_h / C; const int n = yi / out_w / out_h / C;
...@@ -193,116 +103,20 @@ __global__ void _RoiPoolGrad( ...@@ -193,116 +103,20 @@ __global__ void _RoiPoolGrad(
const int batch_ind = roi[0]; const int batch_ind = roi[0];
if (batch_ind < 0) continue; if (batch_ind < 0) continue;
float* offset_dx = dx + (batch_ind * C + c) * H * W; AccT* offset_dx = dx + (batch_ind * C + c) * H * W;
#if __CUDA_ARCH__ >= 350 if (LDG(mask, yi) != -1) {
if (__ldg(mask + yi) != -1) { atomicAdd(offset_dx + LDG(mask, yi), convert::To<AccT>(dy[yi]));
atomicAdd(offset_dx + __ldg(mask + yi), (float)dy[yi]);
}
#else
if (mask[yi] != -1) {
atomicAdd(offset_dx + mask[yi], (float)dy[yi]);
} }
#endif
} }
} }
template <> #undef LDG
__global__ void _RoiPoolGrad<half>(
const int nthreads,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const float spatial_scale,
const half* dy,
const float* rois,
const int* mask,
float* dx) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int c = (yi / out_w / out_h) % C;
const int n = yi / out_w / out_h / C;
const float* roi = rois + n * 5;
const int batch_ind = roi[0];
if (batch_ind < 0) continue;
float* offset_dx = dx + (batch_ind * C + c) * H * W;
#if __CUDA_ARCH__ >= 350
if (__ldg(mask + yi) != -1) {
atomicAdd(offset_dx + __ldg(mask + yi), __half2float(dy[yi]));
}
#else
if (mask[yi] != -1) {
atomicAdd(offset_dx + mask[yi], __half2float(dy[yi]));
}
#endif
}
}
} // namespace } // namespace
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
template <> #define DEFINE_KERNEL_LAUNCHER(T, ScalarT) \
void RoiPool<float16, CUDAContext>(
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const int num_rois,
const float spatial_scale,
const float16* x,
const float* rois,
int* mask,
float16* y,
CUDAContext* ctx) {
auto nthreads = num_rois * C * out_h * out_w;
_RoiPool<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
nthreads,
C,
H,
W,
out_h,
out_w,
spatial_scale,
reinterpret_cast<const half*>(x),
rois,
mask,
reinterpret_cast<half*>(y));
}
template <>
void RoiPoolGrad<float16, CUDAContext>(
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const int num_rois,
const float spatial_scale,
const float16* dy,
const float* rois,
const int* mask,
float* dx,
CUDAContext* ctx) {
auto nthreads = num_rois * C * out_h * out_w;
_RoiPoolGrad<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
nthreads,
C,
H,
W,
out_h,
out_w,
spatial_scale,
reinterpret_cast<const half*>(dy),
rois,
mask,
dx);
} // RoiPoolGrad
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \ template <> \
void RoiPool<T, CUDAContext>( \ void RoiPool<T, CUDAContext>( \
const int C, \ const int C, \
...@@ -319,10 +133,20 @@ void RoiPoolGrad<float16, CUDAContext>( ...@@ -319,10 +133,20 @@ void RoiPoolGrad<float16, CUDAContext>(
CUDAContext* ctx) { \ CUDAContext* ctx) { \
auto nthreads = num_rois * C * out_h * out_w; \ auto nthreads = num_rois * C * out_h * out_w; \
_RoiPool<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ _RoiPool<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
nthreads, C, H, W, out_h, out_w, spatial_scale, x, rois, mask, y); \ nthreads, \
} C, \
H, \
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \ W, \
out_h, \
out_w, \
spatial_scale, \
reinterpret_cast<const ScalarT*>(x), \
rois, \
mask, \
reinterpret_cast<ScalarT*>(y)); \
}
#define DEFINE_GRAD_KERNEL_LAUNCHER(T, ScalarT) \
template <> \ template <> \
void RoiPoolGrad<T, CUDAContext>( \ void RoiPoolGrad<T, CUDAContext>( \
const int C, \ const int C, \
...@@ -343,15 +167,25 @@ void RoiPoolGrad<float16, CUDAContext>( ...@@ -343,15 +167,25 @@ void RoiPoolGrad<float16, CUDAContext>(
CUDA_THREADS, \ CUDA_THREADS, \
0, \ 0, \
ctx->cuda_stream()>>>( \ ctx->cuda_stream()>>>( \
nthreads, C, H, W, out_h, out_w, spatial_scale, dy, rois, mask, dx); \ nthreads, \
} C, \
H, \
DEFINE_KERNEL_LAUNCHER(float); W, \
DEFINE_KERNEL_LAUNCHER(double); out_h, \
out_w, \
DEFINE_GRAD_KERNEL_LAUNCHER(float); spatial_scale, \
DEFINE_GRAD_KERNEL_LAUNCHER(double); reinterpret_cast<const ScalarT*>(dy), \
rois, \
mask, \
dx); \
}
DEFINE_KERNEL_LAUNCHER(float16, half);
DEFINE_KERNEL_LAUNCHER(float, float);
DEFINE_KERNEL_LAUNCHER(double, double);
DEFINE_GRAD_KERNEL_LAUNCHER(float16, half);
DEFINE_GRAD_KERNEL_LAUNCHER(float, float);
DEFINE_GRAD_KERNEL_LAUNCHER(double, double);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER #undef DEFINE_GRAD_KERNEL_LAUNCHER
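The rewritten kernels fold the old per-architecture `#if` ladders into two helpers, `LDG` and `Greater`, whose definitions sit earlier in the file (hence the `#undef LDG` once the kernels are done). Those definitions are outside this hunk; a minimal sketch of what they plausibly look like, with the bodies here being assumptions:

```cuda
// Sketch only: the real definitions are not part of this hunk.
#if __CUDA_ARCH__ >= 350
#define LDG(x, i) __ldg(x + i) // cached read on sm_35 and newer
#else
#define LDG(x, i) x[i]
#endif

__device__ inline bool Greater(float a, float b) {
  return a > b;
}

__device__ inline bool Greater(half a, half b) {
#if __CUDA_ARCH__ >= 530
  return __hgt(a, b); // native fp16 comparison
#else
  return __half2float(a) > __half2float(b); // promote on older archs
#endif
}
```

This is also why the `float16` specializations of `_RoiPool`, `_RoiPoolGrad`, and their hand-written launchers could be deleted: the generic templates now compile for `half` as well.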
...
#include "dragon/operators/array/reshape_ops.h"
#include "dragon/utils/math_functions.h"
namespace dragon {
template <class Context>
void IdentityOp<Context>::RunOnDevice() {
auto &X = Input(0), *Y = Output(0, {0});
// Store for the gradient calculation
STORE_INPUT_SPEC(0);
// Maybe copy the contents
Y->ReshapeLike(X)->CopyFrom(X, ctx());
}
DEPLOY_CPU_OPERATOR(Identity);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(Identity);
#endif
OPERATOR_SCHEMA(Identity)
/* X */
.NumInputs(1)
/* Y */
.NumOutputs(1)
/* X => Y */
.AllowInplace({{0, 0}});
OPERATOR_SCHEMA(IdentityGradient)
/* dY */
.NumInputs(1)
/* dX */
.NumOutputs(1)
/* dY => dX */
.AllowInplace({{0, 0}});
REGISTER_GRADIENT(Identity, SimpleGradientMaker);
} // namespace dragon
@@ -31,6 +31,15 @@ class ReshapeGradientOpBase : public Operator<Context> {
 };

 template <class Context>
+class IdentityOp final : public Operator<Context> {
+ public:
+  SIMPLE_CTOR_DTOR(IdentityOp);
+  USE_OPERATOR_FUNCTIONS;
+
+  void RunOnDevice() override;
+};
+
+template <class Context>
 class ReshapeOp final : public Operator<Context> {
  public:
   ReshapeOp(const OperatorDef& def, Workspace* ws)
@@ -94,6 +103,7 @@ class SqueezeOp final : public Operator<Context> {
       : ReshapeGradientOpBase<Context>(def, ws) {} \
 };

+DEFINE_GRADIENT_OP(Identity);
 DEFINE_GRADIENT_OP(Reshape);
 DEFINE_GRADIENT_OP(Flatten);
 DEFINE_GRADIENT_OP(ExpandDims);
...
@@ -8,18 +8,17 @@ namespace dragon {
 template <class Context>
 template <typename T>
 void AssignOp<Context>::DoRunWithType() {
-  auto &X = Input(0), *Y = Output(0);
-  int num_starts, num_sizes, num_dims = Y->ndim();
-  vec64_t X_dims(num_dims), X_starts(num_dims);
+  auto &Y_ref = Input(0), &X = Input(1), *Y = Output(0, {0});
   // Determine the interval of each dimension
+  int num_starts, num_sizes, num_dims = Y_ref.ndim();
+  vec64_t X_dims(num_dims), X_starts(num_dims);
   starts(0, &num_starts);
   sizes(0, &num_sizes);
   for (int i = 0; i < num_dims; i++) {
     auto dim_start = i < num_starts ? starts(i) : 0;
-    auto dim_end = Y->dim(i);
+    auto dim_end = Y_ref.dim(i);
     if (i < num_sizes) {
       auto dim_length = sizes(i);
       if (dim_length > 0) {
@@ -28,19 +27,18 @@ void AssignOp<Context>::DoRunWithType() {
         dim_end = dim_start + 1;
       }
     }
-    CHECK(dim_start >= 0 && dim_start < Y->dim(i))
+    CHECK(dim_start >= 0 && dim_start < Y_ref.dim(i))
         << "\nAssigning starts from " << dim_start << " of axis " << i << ", "
-        << "while the dimension of this axis is " << Y->dim(i) << ".";
-    CHECK(dim_end > 0 && dim_end <= Y->dim(i))
+        << "while the dimension of this axis is " << Y_ref.dim(i) << ".";
+    CHECK(dim_end > 0 && dim_end <= Y_ref.dim(i))
         << "\nAssigning ends at " << dim_end << " of axis " << i << ", "
-        << "while the dimension of this axis is " << Y->dim(i) << ".";
+        << "while the dimension of this axis is " << Y_ref.dim(i) << ".";
     X_starts[i] = dim_start;
     X_dims[i] = dim_end - dim_start;
   }
-  Tensor X_broadcast(X_dims);
-  auto* x = X.template data<T, Context>();
+  Tensor XRef(X_dims);
+  auto* new_data = X.template data<T, Context>();
   if (X.dims() != X_dims) {
     vec64_t dims1, dims2;
     if (math::utils::IsBinaryBroadcast(X.dims(), X_dims, dims1)) {
@@ -49,17 +47,17 @@ void AssignOp<Context>::DoRunWithType() {
           << Tensor::DimString(X_dims);
       math::utils::ComputeBinaryBroadcastDims(X.dims(), X_dims, dims1, dims2);
       if (dims1 != dims2) {
-        auto* scratch = ctx()->workspace()->template data<T, Context>(
-            {X_broadcast.count()})[0];
+        auto* scratch =
+            ctx()->workspace()->template data<T, Context>({XRef.count()})[0];
         math::Set(
             X.ndim(),
             X.dims().data(),
-            X_broadcast.ndim(),
-            X_broadcast.dims().data(),
-            x,
+            XRef.ndim(),
+            XRef.dims().data(),
+            new_data,
             scratch,
             ctx());
-        x = scratch;
+        new_data = scratch;
       }
     } else {
       LOG(FATAL) << "Could not broadcast together with shapes " << X.DimString()
@@ -67,12 +65,16 @@ void AssignOp<Context>::DoRunWithType() {
     }
   }
+  // Copy the reference data
+  Y->ReshapeLike(Y_ref)->CopyFrom(Y_ref, ctx());
+  // Update with the new data
   kernel::Assign(
       num_dims,
       X_dims.data(),
       Y->strides().data(),
       X_starts.data(),
-      x,
+      new_data,
       Y->template mutable_data<T, Context>(),
       ctx());
 }
@@ -88,10 +90,12 @@ DEPLOY_CUDA_OPERATOR(Assign);
 #endif

 OPERATOR_SCHEMA(Assign)
-    /* V */
-    .NumInputs(1)
-    /* X */
-    .NumOutputs(1);
+    /* Y_ref, X */
+    .NumInputs(2)
+    /* Y */
+    .NumOutputs(1)
+    /* Y_ref => Y */
+    .AllowInplace({{0, 0}});

 NO_GRADIENT(Assign);
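With the new schema, `Assign` reads the reference tensor as input 0 and emits the updated tensor as a real output, so graph tracing sees the data dependency instead of an op that silently mutates its own input. A hedged sketch of the resulting user-level call (shapes and values are illustrative):

```python
import dragon

x = dragon.zeros(shape=(4, 4), dtype='float32')
# Write 1 into the 2x2 block starting at (1, 1);
# ``x`` is the reference input, ``y`` carries the output def.
y = dragon.assign([x, 1.0], starts=[1, 1], sizes=[2, 2])
```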
...
#include "dragon/operators/control_flow/copy_op.h"
#include "dragon/utils/math_functions.h"
namespace dragon {
template <class Context>
template <typename T>
void CopyOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0);
math::Copy(
X.count(),
X.template data<T, Context>(),
Y->ReshapeLike(X)->template mutable_data<T, Context>(),
ctx());
}
template <class Context>
void CopyOp<Context>::RunOnDevice() {
DispatchHelper<FullTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU_OPERATOR(Copy);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(Copy);
#endif
OPERATOR_SCHEMA(Copy)
/* X */
.NumInputs(1)
/* Y */
.NumOutputs(1);
NO_GRADIENT(Copy);
} // namespace dragon
-/*!
- * Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
- *
- * Licensed under the BSD 2-Clause License.
- * You should have received a copy of the BSD 2-Clause License
- * along with the software. If not, See,
- *
- *     <https://opensource.org/licenses/BSD-2-Clause>
- *
- * ------------------------------------------------------------
- */
-
-#ifndef DRAGON_OPERATORS_CONTROL_FLOW_COPY_OP_H_
-#define DRAGON_OPERATORS_CONTROL_FLOW_COPY_OP_H_
-
-#include "dragon/core/operator.h"
-
-namespace dragon {
-
-template <class Context>
-class CopyOp final : public Operator<Context> {
- public:
-  SIMPLE_CTOR_DTOR(CopyOp);
-  USE_OPERATOR_FUNCTIONS;
-
-  void RunOnDevice() override;
-
-  template <typename T>
-  void DoRunWithType();
-};
-
-} // namespace dragon
-
-#endif // DRAGON_OPERATORS_CONTROL_FLOW_COPY_OP_H_
@@ -8,24 +8,28 @@ namespace dragon {
 template <class Context>
 template <typename T>
 void MaskedAssignOp<Context>::DoRunWithType() {
-  auto &X = Input(0), &X_mask = Input(1), *Y = Output(0);
+  auto &X = Input(1), &X_mask = Input(2);
+  auto &Y_ref = Input(0), *Y = Output(0, {0});
   CHECK(X_mask.template IsType<bool>() || X_mask.template IsType<uint8_t>())
       << "\nExcepted bool or uint8 mask.";
   vec64_t X_dims, Y_dims;
   if (math::utils::IsBinaryBroadcast(X.dims(), X_mask.dims(), X_dims) &&
-      math::utils::IsBinaryBroadcast(X_dims, Y->dims(), Y_dims) &&
-      Y_dims == Y->dims()) {
+      math::utils::IsBinaryBroadcast(X_dims, Y_ref.dims(), Y_dims) &&
+      Y_dims == Y_ref.dims()) {
+    // Copy the reference data
+    Y->ReshapeLike(Y_ref)->CopyFrom(Y_ref, ctx());
+    // Update with the new data
     math::Where(
         X.ndim(),
         X.dims().data(),
-        Y->ndim(),
-        Y->dims().data(),
+        Y_ref.ndim(),
+        Y_ref.dims().data(),
         X_mask.ndim(),
         X_mask.dims().data(),
         X.template data<T, Context>(),
-        Y->template data<T, Context>(),
+        Y_ref.template data<T, Context>(),
         (const bool*)X_mask.template raw_data<Context>(),
         Y->template mutable_data<T, Context>(),
         ctx());
@@ -46,10 +50,12 @@ DEPLOY_CUDA_OPERATOR(MaskedAssign);
 #endif

 OPERATOR_SCHEMA(MaskedAssign)
-    /* X, M */
-    .NumInputs(2)
+    /* Y_ref, X, X_mask */
+    .NumInputs(3)
     /* Y */
-    .NumOutputs(1);
+    .NumOutputs(1)
+    /* Y_ref => Y */
+    .AllowInplace({{0, 0}});

 NO_GRADIENT(MaskedAssign);
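`MaskedAssign` gets the same treatment: the reference tensor becomes input 0 and the result is a proper output. A sketch mirroring the updated unit test further below (values illustrative):

```python
import dragon

x = dragon.ones(shape=(2, 3))
mask = x > 0  # an all-true mask here; purely illustrative
y = dragon.masked_assign([x, 0, mask])
```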
...
@@ -65,6 +65,7 @@ from dragon.core.ops.array_ops import channel_shuffle
 from dragon.core.ops.array_ops import concat
 from dragon.core.ops.array_ops import expand_dims
 from dragon.core.ops.array_ops import flatten
+from dragon.core.ops.array_ops import identity
 from dragon.core.ops.array_ops import index_select
 from dragon.core.ops.array_ops import linspace
 from dragon.core.ops.array_ops import masked_select
@@ -85,7 +86,6 @@ from dragon.core.ops.array_ops import transpose
 from dragon.core.ops.array_ops import unique
 from dragon.core.ops.array_ops import where
 from dragon.core.ops.control_flow_ops import assign
-from dragon.core.ops.control_flow_ops import copy
 from dragon.core.ops.control_flow_ops import masked_assign
 from dragon.core.ops.framework_ops import python_plugin
 from dragon.core.ops.framework_ops import stop_gradient
...
@@ -53,6 +53,7 @@ class OpDef(object):
             outputs=None,
             num_outputs=1,
             extra_inputs=None,
+            extra_outputs=None,
             name=None,
             **kwargs
     ):
@@ -64,6 +65,7 @@ class OpDef(object):
         for input in inputs:
             op_info.merge_from(input)
+        # Collect defs from extra inputs.
         if extra_inputs is not None:
             extra_inputs = nest.flatten(extra_inputs)
             for input in extra_inputs:
...
@@ -59,12 +59,6 @@ def arg_reduce_spec(args, inputs, outputs):
     return outputs

-@register(['Assign', 'MaskedAssign'])
-def assign_spec(args, inputs, outputs):
-    _ = locals()
-    return outputs
-
-
 def binary_shape_spec(inputs, outputs):
     if inputs[0].shape is None or inputs[1].shape is None:
         return outputs
...
@@ -588,6 +588,45 @@ def flatten(inputs, axis=0, num_axes=-1, keep_axes=None, **kwargs):

 @OpSchema.num_inputs(1)
+def identity(inputs, **kwargs):
+    """Return a tensor copied from the input.
+
+    Examples:
+
+    ```python
+    # Copy ``x`` to ``y``
+    x = dragon.zeros(shape=(2, 3))
+    y = dragon.identity(x)
+
+    # ``x`` != ``y``
+    x += 1
+    print(x)
+    print(y)
+    ```
+
+    Parameters
+    ----------
+    inputs : dragon.Tensor
+        The input tensor.
+
+    Returns
+    -------
+    dragon.Tensor
+        The output tensor.
+
+    """
+    args = parse_args(locals())
+    inplace = args.pop('inplace') if 'inplace' in args else False
+    op_lib = array_ops_lib.Identity
+    if context.executing_eagerly():
+        return op_lib.instantiate().apply([inputs], inplace=inplace)
+    else:
+        return op_lib.blend(**args)
+
+
+@OpSchema.num_inputs(1)
 def index_select(inputs, index, axis=0, **kwargs):
     """Select the elements according to the index along the given axis.
...
@@ -259,6 +259,20 @@ class Flatten(Operator):
         return self.dispatch(inputs, outputs)

+class Identity(Operator):
+    """Identity operator."""
+
+    def __init__(self, key, dev, **kwargs):
+        super(Identity, self).__init__(key, dev, **kwargs)
+
+    def attributes(self):
+        return {'op_type': 'Identity', 'arguments': {}}
+
+    def forward(self, inputs, inplace=False):
+        outputs = [self.alloc(inputs[0]) if inplace else self.alloc()]
+        return self.dispatch(inputs, outputs)
+
+
 class IndexSelect(Operator):
     """IndexSelect operator."""
...
@@ -20,7 +20,6 @@ from dragon.core.ops import control_flow_ops_lib
 from dragon.core.ops.utils import ArgHelper
 from dragon.core.ops.utils import OpSchema
 from dragon.core.ops.utils import parse_args
-from dragon.core.util import nest

 @OpSchema.num_inputs(2)
@@ -43,66 +42,22 @@ def assign(inputs, starts=None, sizes=None, **kwargs):
     Returns
     -------
     dragon.Tensor
-        The input tensor.
+        The output tensor.

     """
     args = parse_args(locals())
+    inplace = args.pop('inplace') if 'inplace' in args else False
     inputs[1] = ops.scalar_to_tensor(inputs[1], inputs[0].dtype)
     op_lib = control_flow_ops_lib.Assign
     if context.executing_eagerly():
         return op_lib \
             .instantiate(
                 ndim=len(starts) if starts is not None else 0,
-            ).apply(inputs, starts, sizes)
+            ).apply(inputs, starts, sizes, inplace=inplace)
     else:
-        args['outputs'] = [args['inputs'][0]]
-        args['inputs'] = [args['inputs'][1]]
         return op_lib.blend(**args)

-@OpSchema.num_inputs(1, 2)
-def copy(inputs, **kwargs):
-    """Copy the input.
-
-    Examples:
-
-    ```python
-    # Copy ``x`` to ``y``
-    x = dragon.ones(shape=(2, 3))
-    y = dragon.zeros(shape=(2, 4))
-    dragon.copy([x, y])
-
-    # Copy to a new tensor from ``x``
-    y = dragon.copy(x)
-    ```
-
-    Parameters
-    ----------
-    inputs : Union[dragon.Tensor, Sequence[dragon.Tensor]]
-        The input tensor.
-
-    Returns
-    -------
-    dragon.Tensor
-        The output tensor.
-
-    """
-    args = parse_args(locals())
-    args['inputs'] = nest.flatten(inputs)
-    if len(args['inputs']) == 2:
-        args['outputs'] = [args['inputs'][1]]
-        args['inputs'] = [args['inputs'][0]]
-    else:
-        args['outputs'] = None
-    op_lib = control_flow_ops_lib.Copy
-    if context.executing_eagerly():
-        return op_lib \
-            .instantiate() \
-            .apply(args['inputs'], args['outputs'])
-    else:
-        return op_lib.blend('Copy', **args)
-
-
 @OpSchema.num_inputs(3)
 def masked_assign(inputs, **kwargs):
     r"""Assign the value to input where mask is 1.
@@ -126,13 +81,10 @@ def masked_assign(inputs, **kwargs):
     """
     args = parse_args(locals())
+    inplace = args.pop('inplace') if 'inplace' in args else False
     inputs[1] = ops.scalar_to_tensor(inputs[1], inputs[0].dtype)
     op_lib = control_flow_ops_lib.MaskedAssign
     if context.executing_eagerly():
-        return op_lib.instantiate().apply(inputs)
+        return op_lib.instantiate().apply(inputs, inplace=inplace)
     else:
-        args.update({
-            'outputs': [args['inputs'][0]],
-            'inputs': [args['inputs'][1:]],
-        })
         return op_lib.blend(**args)
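The removed `dragon.copy` overloaded one- and two-tensor calling conventions; its common single-tensor use maps directly onto the new `dragon.identity`. A sketch of the migration (the two-tensor form, which copied into an existing tensor, has no one-line equivalent here):

```python
import dragon

x = dragon.ones(shape=(2, 3))
# Before this commit: y = dragon.copy(x)
y = dragon.identity(x)
```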
@@ -46,29 +46,16 @@ class Assign(Operator):
                 ws, '{}/sizes[{}]'.format(handle, i),
                 sizes[i], 'int64')

-    def forward(self, inputs, starts, sizes):
+    def forward(self, inputs, starts, sizes, inplace=False):
+        outputs = [self.alloc(inputs[0]) if inplace else self.alloc()]
         return self.dispatch(
-            [inputs[1]], [inputs[0]],
+            inputs, outputs,
             callback=lambda ws, handle:
                 self.feed(ws, handle, starts, sizes),
             no_grad=True,
         )

-class Copy(Operator):
-    """Copy operator."""
-
-    def __init__(self, key, dev, **kwargs):
-        super(Copy, self).__init__(key, dev, **kwargs)
-
-    def attributes(self):
-        return {'op_type': 'Copy', 'arguments': {}}
-
-    def forward(self, inputs, outputs):
-        outputs = outputs if outputs else [self.alloc()]
-        return self.dispatch(inputs, outputs, no_grad=True)
-
-
 class MaskedAssign(Operator):
     """MaskedAssign operator."""
@@ -78,5 +65,6 @@ class MaskedAssign(Operator):
     def attributes(self):
         return {'op_type': 'MaskedAssign', 'arguments': {}}

-    def forward(self, inputs):
-        return self.dispatch(inputs[1:], [inputs[0]], no_grad=True)
+    def forward(self, inputs, inplace=False):
+        outputs = [self.alloc(inputs[0]) if inplace else self.alloc()]
+        return self.dispatch(inputs, outputs, no_grad=True)
@@ -104,11 +104,10 @@ def copy(self):
     See Also
     --------
-    `dragon.copy(...)`_
+    `dragon.identity(...)`_

     """
-    return control_flow_ops_lib.Copy \
-        .instantiate().apply([self], None)
+    return array_ops_lib.Identity.instantiate().apply([self])

 def div(self, other):
@@ -194,7 +193,7 @@ def getitem(self, item):
     if axis is not None:
         return _index_select(self, item[axis], axis)
     starts, sizes = _process_index(item)
-    return _section_select(self, starts, sizes)
+    return _sliced_select(self, starts, sizes)

 def glorot_normal(self, mode='fan_in', scale=2.0):
@@ -599,7 +598,7 @@ def setitem(self, key, value):
         _masked_assign(self, value, key)
     else:
         starts, sizes = _process_index(key)
-        _section_assign(self, value, starts, sizes)
+        _sliced_assign(self, value, starts, sizes)

 def sub(self, other):
@@ -705,7 +704,7 @@ def _masked_assign(ref, value, mask):
     """Assign value according to the mask."""
     value = ops.scalar_to_tensor(value, ref.dtype)
     return control_flow_ops_lib.MaskedAssign \
-        .instantiate().apply([ref, value, mask])
+        .instantiate().apply([ref, value, mask], inplace=True)

 def _masked_select(x, mask):
@@ -747,16 +746,16 @@ def _process_index(item):
     return starts, sizes

-def _section_assign(ref, value, starts, sizes):
-    """Apply the section-assign operation."""
+def _sliced_assign(ref, value, starts, sizes):
+    """Assign value according to the slices."""
     value = ops.scalar_to_tensor(value, ref.dtype)
     return control_flow_ops_lib.Assign \
         .instantiate(ndim=len(starts) if starts is not None else 0) \
-        .apply([ref, value], starts, sizes)
+        .apply([ref, value], starts, sizes, inplace=True)

-def _section_select(x, starts, sizes):
-    """Apply the section-select operation."""
+def _sliced_select(x, starts, sizes):
+    """Select elements according to the slices."""
     return array_ops_lib.Slice \
         .instantiate(ndim=len(starts)).apply([x], starts, sizes)
...
@@ -79,11 +79,10 @@ def copy(self):
     See Also
     --------
-    `dragon.copy(...)`_
+    `dragon.identity(...)`_

     """
-    outputs = [Tensor(shape=self.shape, dtype=self.dtype)]
-    return OpDef.apply('Copy', [self], [outputs])
+    return OpDef.apply('Identity', [self])

 def div(self, other):
@@ -169,7 +168,7 @@ def getitem(self, item):
     if axis is not None:
         return _index_select(self, item[axis], axis)
     starts, sizes = _process_index(item)
-    return _section_select(self, starts, sizes)
+    return _sliced_select(self, starts, sizes)

 def get_value(self):
@@ -402,10 +401,13 @@ def setitem(self, key, value):
     """
     if isinstance(key, Tensor):
-        _masked_assign(self, value, key)
+        raise RuntimeError(
+            'Assigning via mask is an ambiguous behavior in graph mode. '
+            'Use `dragon.masked_assign(...)` instead.')
     else:
-        starts, sizes = _process_index(key)
-        _section_assign(self, value, starts, sizes)
+        raise RuntimeError(
+            'Assigning via slices is an ambiguous behavior in graph mode. '
+            'Use `dragon.assign(...)` instead.')

 def set_value(self, value):
@@ -458,12 +460,6 @@ def _index_select(x, index, axis):
     return OpDef.apply('IndexSelect', [x, index], axis=axis, num_axes=1)

-def _masked_assign(ref, value, mask):
-    """Assign value according to the mask."""
-    value = ops.scalar_to_tensor(value, ref.dtype)
-    return OpDef.apply('MaskedAssign', [value, mask], [ref])
-
-
 def _masked_select(x, mask):
     """Select elements according to the mask."""
     return OpDef.apply('MaskedSelect', [x, mask])
@@ -502,14 +498,8 @@ def _process_index(item):
     return starts, sizes

-def _section_assign(ref, value, starts, sizes):
-    """Create the section-assign operator."""
-    value = ops.scalar_to_tensor(value, ref.dtype)
-    return OpDef.apply('Assign', [value], [ref], starts=starts, sizes=sizes)
-
-
-def _section_select(x, starts, sizes):
-    """Create the section-select operator."""
+def _sliced_select(x, starts, sizes):
+    """Select elements according to the slices."""
     return OpDef.apply('Slice', [x], starts=starts, sizes=sizes)
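Symbolic `__setitem__` now raises instead of silently rewriting the tensor, so graph-mode code must spell the update out and keep the returned tensor, which is exactly what makes the def visible to the graph. A sketch of the replacements named in the error messages (shapes illustrative):

```python
import dragon

x = dragon.zeros(shape=(4,))

# GRAPH_MODE: instead of ``x[0:2] = 1`` (now a RuntimeError):
y = dragon.assign([x, 1], starts=[0], sizes=[2])

# ...and instead of ``x[x > 2] = 0``:
z = dragon.masked_assign([x, 0, x > 2])
```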
...
@@ -24,7 +24,6 @@ from dragon.core.autograph.tensor import TensorRef
 from dragon.core.framework import context
 from dragon.core.framework import workspace
 from dragon.core.ops import array_ops
-from dragon.core.ops import control_flow_ops
 from dragon.core.ops import init_ops
 from dragon.core.ops import vision_ops
@@ -260,19 +259,19 @@ def gather(params, indices, axis=0, name=None):

 def identity(input, name=None):
-    """Return a new tensor copying the content of input.
+    """Return a tensor copied from the input.

     Examples:

     ```python
-    # Copy ``x`` to ``xx``
+    # Copy ``x`` to ``y``
     x = tf.zeros(shape=(2, 3))
-    xx = tf.identity(x)
+    y = tf.identity(x)

-    # ``x`` != ``xx``
+    # ``x`` != ``y``
     x += 1
     print(x)
-    print(xx)
+    print(y)
     ```

     Parameters
@@ -288,7 +287,7 @@ def identity(input, name=None):
         The output tensor.

     """
-    return control_flow_ops.copy(input, name=name if name else 'Identity')
+    return array_ops.identity(input, name=name if name else 'Identity')

 def ones(shape, dtype='float32', name=None):
...
@@ -655,6 +655,19 @@ class TestArrayOps(OpTestCase):
         with dragon.device('cuda'):
             self.test_flatten()

+    def test_identity(self):
+        for execution in ('EAGER_MODE', 'GRAPH_MODE'):
+            with execution_context().mode(execution):
+                data = arange((4,))
+                x = new_tensor(data)
+                y = dragon.identity(x)
+                self.assertEqual(y, data)
+
+    @unittest.skipIf(not TEST_CUDA, 'CUDA unavailable')
+    def test_identity_cuda(self):
+        with dragon.device('cuda'):
+            self.test_identity()
+
     def test_index_select(self):
         entries = [1, (1, 2)]
         for execution in ('EAGER_MODE', 'GRAPH_MODE'):
@@ -1116,30 +1129,14 @@ class TestControlFlowOps(OpTestCase):
         with dragon.device('cuda'):
             self.test_assign()

-    def test_copy(self):
-        for execution in ('EAGER_MODE', 'GRAPH_MODE'):
-            with execution_context().mode(execution):
-                data = arange((4,))
-                x = new_tensor(data)
-                y = dragon.zeros((4,), dtype='float32')
-                y = dragon.copy([x, y])
-                z = dragon.copy(x)
-                self.assertEqual(y, data)
-                self.assertEqual(z, data)
-
-    @unittest.skipIf(not TEST_CUDA, 'CUDA unavailable')
-    def test_copy_cuda(self):
-        with dragon.device('cuda'):
-            self.test_copy()
-
     def test_masked_assign(self):
         for execution in ('EAGER_MODE', 'GRAPH_MODE'):
             with execution_context().mode(execution):
                 data = arange((2, 3))
                 x = new_tensor(data)
-                dragon.masked_assign([x, 0, x > 2])
+                y = dragon.masked_assign([x, 0, x > 2])
                 data[data > 2] = 0
-                self.assertEqual(x, data)
+                self.assertEqual(y, data)

     @unittest.skipIf(not TEST_CUDA, 'CUDA unavailable')
     def test_masked_assign_cuda(self):
@@ -3021,9 +3018,12 @@ class TestTensorOps(OpTestCase):
             with execution_context().mode(execution):
                 data = arange((2, 3))
                 x = new_tensor(data)
-                x[x > 2] = 0
-                data[data > 2] = 0
-                self.assertEqual(x, data)
+                try:
+                    x[x > 2] = 0
+                    data[data > 2] = 0
+                    self.assertEqual(x, data)
+                except RuntimeError:
+                    pass
         entries = [0,
                    slice(None, None, None),
                    slice(0, None, None),
@@ -3037,7 +3037,7 @@ class TestTensorOps(OpTestCase):
                     x.__setitem__(item, 0)
                     data.__setitem__(item, 0)
                     self.assertEqual(x, data)
-                except (NotImplementedError, ValueError, TypeError):
+                except (NotImplementedError, ValueError, TypeError, RuntimeError):
                     pass

     def test_rsub(self):
...
@@ -71,7 +71,7 @@ class Assign(function.Function):
     def forward(self, out, starts, sizes, input):
         self._check_device([input, out])
         return self.dispatch(
-            [input], [out],
+            [out, input], [self.alloc(out)],
             callback=lambda ws, handle:
                 self.feed(ws, handle, starts, sizes),
             no_grad=True,
@@ -306,7 +306,7 @@ class MaskedAssign(function.Function):
         return {'op_type': 'MaskedAssign', 'arguments': {}}

     def forward(self, out, mask, input):
-        return self.dispatch([input, mask], [self.alloc(out)])
+        return self.dispatch([out, input, mask], [self.alloc(out)])

 class MaskedSelect(function.Function):
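The PyTorch front end keeps its in-place semantics (`self.alloc(out)` reuses the destination buffer), but the destination now also enters the input list, so the dependency is recorded in the def. A hedged sketch of calls that exercise these paths, assuming the usual `dragon.vm.torch` entry point:

```python
from dragon.vm import torch

x = torch.zeros(4)
x[1:3] = 1    # routes through Assign.forward with [out, input]
x[x > 0] = 2  # routes through MaskedAssign.forward with [out, input, mask]
```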
...