Commit 9de0f1a3 by Ting PAN

Fix the bug of missing defs on blending assign operators

Summary:

This commit attaches the input and output together in assign operators,
which fixes the missing input defs caused by the identity mapping from input to output.
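A hypothetical sketch (not Dragon's actual def format) of the issue: when an assign-style operator aliases its output to its input, a def that records only the output loses the input tensor, while attaching the two together preserves it.

# Hypothetical illustration only; the names and structure are not Dragon's real op defs.
op_def_missing_input = {
    "type": "Assign",
    "inputs": [],        # the aliased input was dropped from the def
    "outputs": ["x"],    # output is an identity of the input
}
op_def_fixed = {
    "type": "Assign",
    "inputs": ["x"],     # input attached together with the output
    "outputs": ["x"],
}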
1 parent 746f2cbb
Showing 589 additions and 1606 deletions
......@@ -39,14 +39,14 @@ html:
@echo "Build finished. The HTML pages are in $(BUILDDIR)."
latex:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)-latex
$(SPHINXBUILD) -b latex -j ${NUMBER_OF_PROCESSORS} $(ALLSPHINXOPTS) $(BUILDDIR)-latex
@echo
@echo "Build finished; the LaTeX files are in $(BUILDDIR)-latex."
@echo "Run \`make' in that directory to run these through (pdf)latex" \
"(use \`make latexpdf' here to do that automatically)."
latexpdf:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)-latex
$(SPHINXBUILD) -b latex -j ${NUMBER_OF_PROCESSORS} $(ALLSPHINXOPTS) $(BUILDDIR)-latex
@echo "Running LaTeX files through pdflatex..."
$(MAKE) -C $(BUILDDIR)/latex all-pdf
$(MAKE) -C $(BUILDDIR)-latex all-pdf
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)-latex."
......@@ -39,7 +39,7 @@ extensions = ['sphinx.ext.autodoc', 'sphinxcontrib.katex', 'breathe']
# Project
project = 'dragon'
copyright = 'Copyright (c) 2017-present, SeetaTech, Co.,Ltd'
author = 'SeetaTech'
author = 'SeetaTech, Co.,Ltd'
with open('../../../dragon/version.txt', 'r') as f:
version = f.read().strip()
......@@ -114,6 +114,7 @@ latex_elements = {
\fvset{breaklines=true, breakanywhere=true}
\setlength{\headheight}{13.6pt}
\setlength{\itemindent}{-1pt}
\addto\captionsenglish{\renewcommand{\chaptername}{}}
\makeatletter
\renewcommand*\l@subsection{\@dottedtocline{2}{3.8em}{3.8em}}
\fancypagestyle{normal}{
......@@ -146,13 +147,18 @@ latex_elements = {
\vspace*{40mm}
\LARGE \@author
\vspace*{40mm}
\LARGE \today
\end{titlepage}
\makeatother
\pagenumbering{arabic}
''',
'pointsize': '10pt',
'classoptions': ',oneside',
'figure_align': 'H',
'fncychap': '\\usepackage[Sonny]{fncychap}',
'printindex': '',
'sphinxsetup': ' \
hmargin={0.75in,0.75in}, \
......
......@@ -65,7 +65,7 @@ if "%1" == "doxygen" (
)
if "%1" == "html" (
%SPHINXBUILD% -b html -j %NUMBER_OF_PROCESSORS% %ALLSPHINXOPTS% %BUILDDIR%
%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The HTML pages are in %BUILDDIR%.
......
......@@ -34,14 +34,14 @@ html:
@echo "Build finished. The HTML pages are in $(BUILDDIR)."
latex:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)-latex
$(SPHINXBUILD) -b latex -j ${NUMBER_OF_PROCESSORS} $(ALLSPHINXOPTS) $(BUILDDIR)-latex
@echo
@echo "Build finished; the LaTeX files are in $(BUILDDIR)-latex."
@echo "Run \`make' in that directory to run these through (pdf)latex" \
"(use \`make latexpdf' here to do that automatically)."
latexpdf:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)-latex
$(SPHINXBUILD) -b latex -j ${NUMBER_OF_PROCESSORS} $(ALLSPHINXOPTS) $(BUILDDIR)-latex
@echo "Running LaTeX files through pdflatex..."
$(MAKE) -C $(BUILDDIR)/latex all-pdf
$(MAKE) -C $(BUILDDIR)-latex all-pdf
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)-latex."
......@@ -41,7 +41,7 @@ napoleon_use_rtype = False
# Project
project = 'dragon'
copyright = 'Copyright (c) 2017-present, SeetaTech, Co.,Ltd'
author = 'SeetaTech'
author = 'SeetaTech, Co.,Ltd'
with open('../../../dragon/version.txt', 'r') as f:
version = f.read().strip()
......@@ -122,6 +122,7 @@ latex_elements = {
\fvset{breaklines=true, breakanywhere=true}
\setlength{\headheight}{13.6pt}
\setlength{\itemindent}{-1pt}
\addto\captionsenglish{\renewcommand{\chaptername}{}}
\makeatletter
\renewcommand*\l@subsection{\@dottedtocline{2}{3.8em}{3.8em}}
\fancypagestyle{normal}{
......@@ -154,13 +155,18 @@ latex_elements = {
\vspace*{40mm}
\LARGE \@author
\vspace*{40mm}
\LARGE \today
\end{titlepage}
\makeatother
\pagenumbering{arabic}
''',
'pointsize': '10pt',
'classoptions': ',oneside',
'figure_align': 'H',
'fncychap': '\\usepackage[Sonny]{fncychap}',
'printindex': '',
'sphinxsetup': ' \
hmargin={0.75in,0.75in}, \
......
......@@ -48,9 +48,6 @@ dragon
`constant(...) <dragon/constant.html>`_
: Return a tensor initialized from the value.
`copy(...) <dragon/copy.html>`_
: Copy the input.
`create_function(...) <dragon/create_function.html>`_
: Create a callable graph from the specified outputs.
......@@ -93,6 +90,9 @@ dragon
`graph_mode(...) <dragon/graph_mode.html>`_
: Context-manager set the graph execution mode.
`identity(...) <dragon/identity.html>`_
: Return a tensor copied from the input.
`index_select(...) <dragon/index_select.html>`_
: Select the elements according to the index along the given axis.
......@@ -199,7 +199,6 @@ dragon
dragon/channel_shuffle
dragon/concat
dragon/constant
dragon/copy
dragon/create_function
dragon/device
dragon/eager_mode
......@@ -214,6 +213,7 @@ dragon
dragon/get_workspace
dragon/gradients
dragon/graph_mode
dragon/identity
dragon/index_select
dragon/linspace
dragon/load_library
......
......@@ -174,8 +174,8 @@ __truediv__
.. _dragon.assign(...): assign.html
.. _dragon.cast(...): cast.html
.. _dragon.copy(...): copy.html
.. _dragon.fill(...): fill.html
.. _dragon.identity(...): identity.html
.. _dragon.masked_assign(...): masked_assign.html
.. _dragon.masked_select(...): masked_select.html
.. _dragon.math.add(...): math/add.html
......
......@@ -154,8 +154,8 @@ __truediv__
.. _dragon.assign(...): assign.html
.. _dragon.cast(...): cast.html
.. _dragon.copy(...): copy.html
.. _dragon.fill(...): fill.html
.. _dragon.identity(...): identity.html
.. _dragon.masked_assign(...): masked_assign.html
.. _dragon.masked_select(...): masked_select.html
.. _dragon.math.add(...): math/add.html
......
copy
====
identity
========
.. autofunction:: dragon.copy
.. autofunction:: dragon.identity
.. raw:: html
......
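A minimal usage sketch for the renamed operator, assuming the documented signatures of dragon.constant and dragon.identity:

import dragon

x = dragon.constant([1, 2, 3])
y = dragon.identity(x)  # y is a tensor copied from x; writes to y do not affect x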
......@@ -55,7 +55,7 @@ if errorlevel 9009 (
:sphinx_ok
if "%1" == "html" (
%SPHINXBUILD% -b html -j %NUMBER_OF_PROCESSORS% %ALLSPHINXOPTS% %BUILDDIR%
%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The HTML pages are in %BUILDDIR%.
......
......@@ -65,7 +65,7 @@ Name Supported Reference
`Greater`_ |v| :func:`dragon.math.greater`
`HardSigmoid`_ |v| :func:`dragon.nn.hardsigmoid`
`Hardmax`_
`Identity`_
`Identity`_ |v| :func:`dragon.identity`
`If`_
`InstanceNormalization`_ |v| :func:`dragon.nn.instance_norm`
`IsInf`_ |v| :func:`dragon.math.is_inf`
......
......@@ -58,7 +58,7 @@ vm.tensorflow
: Compute the symbolic derivatives of ``ys`` w.r.t. ``xs`` .
`identity(...) <tensorflow/identity.html>`_
: Return a new tensor copying the content of input.
: Return a tensor copied from the input.
`linspace(...) <tensorflow/linspace.html>`_
: Generate evenly spaced values within intervals along the given axis.
......
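A minimal usage sketch for the TensorFlow-style binding, assuming the compat module mirrors stock TensorFlow's constant and identity:

from dragon.vm import tensorflow as tf

x = tf.constant([1, 2, 3])
y = tf.identity(x)  # a tensor copied from the input, per the updated description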
......@@ -68,11 +68,9 @@ void _EluGrad<float16>(
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float16);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -170,10 +170,8 @@ void EluGrad<float16, CUDAContext>(
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -83,11 +83,9 @@ void _HardSigmoidGrad<float16>(
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float16);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -126,10 +126,8 @@ void HardSigmoidGrad<float16, CUDAContext>(
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -88,11 +88,9 @@ void _HardSwishGrad<float16>(
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float16);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -153,10 +153,8 @@ void HardSwishGrad<float16, CUDAContext>(
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -243,10 +243,8 @@ void PReluWGrad<float16, CPUContext>(
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -601,10 +601,8 @@ void PReluWGrad<float16, CUDAContext>(
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -124,11 +124,9 @@ void _ReluNGrad<float16>(
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float16);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -401,10 +401,8 @@ void ReluNGrad<float16, CUDAContext>(
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -85,11 +85,9 @@ void _SeluGrad<float16>(
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float16);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -191,10 +191,8 @@ void SeluGrad<float16, CUDAContext>(
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -56,11 +56,9 @@ void _SigmoidGrad<float16>(
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float16);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -135,10 +135,8 @@ void SigmoidGrad<float16, CUDAContext>(
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -122,11 +122,9 @@ void _SoftmaxGrad<float16>(
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float16);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -257,10 +257,8 @@ void SoftmaxGrad<float16, CUDAContext>(
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -60,11 +60,9 @@ void _SwishGrad<float16>(
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float16);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -121,10 +121,8 @@ void SwishGrad<float16, CUDAContext>(
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -55,11 +55,9 @@ void _TanhGrad<float16>(
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float16);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -124,10 +124,8 @@ void TanhGrad<float16, CUDAContext>(
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -93,7 +93,6 @@ DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -206,7 +206,6 @@ DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -105,14 +105,12 @@ DEFINE_KERNEL_LAUNCHER(float, float);
DEFINE_KERNEL_LAUNCHER(float, double);
DEFINE_KERNEL_LAUNCHER(double, float);
DEFINE_KERNEL_LAUNCHER(double, double);
DEFINE_FP16_KERNEL_LAUNCHER(int8_t);
DEFINE_FP16_KERNEL_LAUNCHER(uint8_t);
DEFINE_FP16_KERNEL_LAUNCHER(int);
DEFINE_FP16_KERNEL_LAUNCHER(int64_t);
DEFINE_FP16_KERNEL_LAUNCHER(float);
DEFINE_FP16_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_FP16_KERNEL_LAUNCHER
......
......@@ -268,14 +268,12 @@ DEFINE_KERNEL_LAUNCHER(float, float);
DEFINE_KERNEL_LAUNCHER(float, double);
DEFINE_KERNEL_LAUNCHER(double, float);
DEFINE_KERNEL_LAUNCHER(double, double);
DEFINE_FP16_KERNEL_LAUNCHER(int8_t);
DEFINE_FP16_KERNEL_LAUNCHER(uint8_t);
DEFINE_FP16_KERNEL_LAUNCHER(int);
DEFINE_FP16_KERNEL_LAUNCHER(int64_t);
DEFINE_FP16_KERNEL_LAUNCHER(float);
DEFINE_FP16_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_FP16_KERNEL_LAUNCHER
......
......@@ -51,7 +51,6 @@ DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -57,7 +57,6 @@ DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -38,7 +38,6 @@ DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -75,7 +75,6 @@ DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -77,7 +77,6 @@ DEFINE_KERNEL_LAUNCHER(IndexSelect, int64_t);
DEFINE_KERNEL_LAUNCHER(IndexSelect, float16);
DEFINE_KERNEL_LAUNCHER(IndexSelect, float);
DEFINE_KERNEL_LAUNCHER(IndexSelect, double);
DEFINE_KERNEL_LAUNCHER(IndexSelectGrad, int8_t);
DEFINE_KERNEL_LAUNCHER(IndexSelectGrad, uint8_t);
DEFINE_KERNEL_LAUNCHER(IndexSelectGrad, int);
......@@ -85,7 +84,6 @@ DEFINE_KERNEL_LAUNCHER(IndexSelectGrad, int64_t);
DEFINE_KERNEL_LAUNCHER(IndexSelectGrad, float16);
DEFINE_KERNEL_LAUNCHER(IndexSelectGrad, float);
DEFINE_KERNEL_LAUNCHER(IndexSelectGrad, double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -112,11 +112,9 @@ DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float16);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -80,14 +80,12 @@ DEFINE_KERNEL_LAUNCHER(int64_t, int64_t);
DEFINE_KERNEL_LAUNCHER(int64_t, float16);
DEFINE_KERNEL_LAUNCHER(int64_t, float);
DEFINE_KERNEL_LAUNCHER(int64_t, double);
DEFINE_GRAD_KERNEL_LAUNCHER(int, float16);
DEFINE_GRAD_KERNEL_LAUNCHER(int, float);
DEFINE_GRAD_KERNEL_LAUNCHER(int, double);
DEFINE_GRAD_KERNEL_LAUNCHER(int64_t, float16);
DEFINE_GRAD_KERNEL_LAUNCHER(int64_t, float);
DEFINE_GRAD_KERNEL_LAUNCHER(int64_t, double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -85,14 +85,12 @@ DEFINE_KERNEL_LAUNCHER(int64_t, int64_t);
DEFINE_KERNEL_LAUNCHER(int64_t, float16);
DEFINE_KERNEL_LAUNCHER(int64_t, float);
DEFINE_KERNEL_LAUNCHER(int64_t, double);
DEFINE_GRAD_KERNEL_LAUNCHER(int, float16);
DEFINE_GRAD_KERNEL_LAUNCHER(int, float);
DEFINE_GRAD_KERNEL_LAUNCHER(int, double);
DEFINE_GRAD_KERNEL_LAUNCHER(int64_t, float16);
DEFINE_GRAD_KERNEL_LAUNCHER(int64_t, float);
DEFINE_GRAD_KERNEL_LAUNCHER(int64_t, double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -42,7 +42,6 @@ void _OneHot(
DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -42,7 +42,6 @@ __global__ void _OneHot(
DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -132,7 +132,6 @@ DEFINE_CONST_KERNEL_LAUNCHER(int64_t);
DEFINE_CONST_KERNEL_LAUNCHER(float16);
DEFINE_CONST_KERNEL_LAUNCHER(float);
DEFINE_CONST_KERNEL_LAUNCHER(double);
DEFINE_KERNEL_LAUNCHER(ReflectPad, bool);
DEFINE_KERNEL_LAUNCHER(ReflectPad, int8_t);
DEFINE_KERNEL_LAUNCHER(ReflectPad, uint8_t);
......@@ -141,7 +140,6 @@ DEFINE_KERNEL_LAUNCHER(ReflectPad, int64_t);
DEFINE_KERNEL_LAUNCHER(ReflectPad, float16);
DEFINE_KERNEL_LAUNCHER(ReflectPad, float);
DEFINE_KERNEL_LAUNCHER(ReflectPad, double);
DEFINE_KERNEL_LAUNCHER(EdgePad, bool);
DEFINE_KERNEL_LAUNCHER(EdgePad, int8_t);
DEFINE_KERNEL_LAUNCHER(EdgePad, uint8_t);
......@@ -150,9 +148,8 @@ DEFINE_KERNEL_LAUNCHER(EdgePad, int64_t);
DEFINE_KERNEL_LAUNCHER(EdgePad, float16);
DEFINE_KERNEL_LAUNCHER(EdgePad, float);
DEFINE_KERNEL_LAUNCHER(EdgePad, double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_CONST_KERNEL_LAUNCHER
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -151,7 +151,6 @@ DEFINE_CONST_KERNEL_LAUNCHER(int64_t);
DEFINE_CONST_KERNEL_LAUNCHER(float16);
DEFINE_CONST_KERNEL_LAUNCHER(float);
DEFINE_CONST_KERNEL_LAUNCHER(double);
DEFINE_KERNEL_LAUNCHER(ReflectPad, bool);
DEFINE_KERNEL_LAUNCHER(ReflectPad, int8_t);
DEFINE_KERNEL_LAUNCHER(ReflectPad, uint8_t);
......@@ -160,7 +159,6 @@ DEFINE_KERNEL_LAUNCHER(ReflectPad, int64_t);
DEFINE_KERNEL_LAUNCHER(ReflectPad, float16);
DEFINE_KERNEL_LAUNCHER(ReflectPad, float);
DEFINE_KERNEL_LAUNCHER(ReflectPad, double);
DEFINE_KERNEL_LAUNCHER(EdgePad, bool);
DEFINE_KERNEL_LAUNCHER(EdgePad, int8_t);
DEFINE_KERNEL_LAUNCHER(EdgePad, uint8_t);
......@@ -169,9 +167,8 @@ DEFINE_KERNEL_LAUNCHER(EdgePad, int64_t);
DEFINE_KERNEL_LAUNCHER(EdgePad, float16);
DEFINE_KERNEL_LAUNCHER(EdgePad, float);
DEFINE_KERNEL_LAUNCHER(EdgePad, double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_CONST_KERNEL_LAUNCHER
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -63,7 +63,6 @@ void ReduceSumGrad<float16, CPUContext>(
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_GRAD_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -128,7 +128,6 @@ void ReduceSumGrad<float16, CUDAContext>(
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_GRAD_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -75,11 +75,9 @@ DEFINE_KERNEL_LAUNCHER(Repeat, int64_t);
DEFINE_KERNEL_LAUNCHER(Repeat, float16);
DEFINE_KERNEL_LAUNCHER(Repeat, float);
DEFINE_KERNEL_LAUNCHER(Repeat, double);
DEFINE_KERNEL_LAUNCHER(RepeatGrad, float16);
DEFINE_KERNEL_LAUNCHER(RepeatGrad, float);
DEFINE_KERNEL_LAUNCHER(RepeatGrad, double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -139,10 +139,8 @@ DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -76,7 +76,6 @@ DEFINE_KERNEL_LAUNCHER(Slice, int64_t);
DEFINE_KERNEL_LAUNCHER(Slice, float16);
DEFINE_KERNEL_LAUNCHER(Slice, float);
DEFINE_KERNEL_LAUNCHER(Slice, double);
DEFINE_KERNEL_LAUNCHER(SliceGrad, bool);
DEFINE_KERNEL_LAUNCHER(SliceGrad, int8_t);
DEFINE_KERNEL_LAUNCHER(SliceGrad, uint8_t);
......@@ -85,7 +84,6 @@ DEFINE_KERNEL_LAUNCHER(SliceGrad, int64_t);
DEFINE_KERNEL_LAUNCHER(SliceGrad, float16);
DEFINE_KERNEL_LAUNCHER(SliceGrad, float);
DEFINE_KERNEL_LAUNCHER(SliceGrad, double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -85,7 +85,6 @@ DEFINE_KERNEL_LAUNCHER(Slice, int64_t);
DEFINE_KERNEL_LAUNCHER(Slice, float16);
DEFINE_KERNEL_LAUNCHER(Slice, float);
DEFINE_KERNEL_LAUNCHER(Slice, double);
DEFINE_KERNEL_LAUNCHER(SliceGrad, bool);
DEFINE_KERNEL_LAUNCHER(SliceGrad, int8_t);
DEFINE_KERNEL_LAUNCHER(SliceGrad, uint8_t);
......@@ -94,7 +93,6 @@ DEFINE_KERNEL_LAUNCHER(SliceGrad, int64_t);
DEFINE_KERNEL_LAUNCHER(SliceGrad, float16);
DEFINE_KERNEL_LAUNCHER(SliceGrad, float);
DEFINE_KERNEL_LAUNCHER(SliceGrad, double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -85,11 +85,9 @@ DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float16);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -141,10 +141,8 @@ DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -73,11 +73,9 @@ DEFINE_KERNEL_LAUNCHER(Transpose, int64_t);
DEFINE_KERNEL_LAUNCHER(Transpose, float16);
DEFINE_KERNEL_LAUNCHER(Transpose, float);
DEFINE_KERNEL_LAUNCHER(Transpose, double);
DEFINE_KERNEL_LAUNCHER(TransposeGrad, float16);
DEFINE_KERNEL_LAUNCHER(TransposeGrad, float);
DEFINE_KERNEL_LAUNCHER(TransposeGrad, double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -81,11 +81,9 @@ DEFINE_KERNEL_LAUNCHER(Transpose, int64_t);
DEFINE_KERNEL_LAUNCHER(Transpose, float16);
DEFINE_KERNEL_LAUNCHER(Transpose, float);
DEFINE_KERNEL_LAUNCHER(Transpose, double);
DEFINE_KERNEL_LAUNCHER(TransposeGrad, float16);
DEFINE_KERNEL_LAUNCHER(TransposeGrad, float);
DEFINE_KERNEL_LAUNCHER(TransposeGrad, double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -75,6 +75,7 @@ DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -54,7 +54,6 @@ DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -65,7 +65,6 @@ DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -108,10 +108,8 @@ void BroadcastLossGrad<float16, CPUContext>(
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -231,10 +231,8 @@ void BroadcastLossGrad<float16, CUDAContext>(
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -88,12 +88,10 @@ DEFINE_KERNEL_LAUNCHER(NLLLoss, float, float);
DEFINE_KERNEL_LAUNCHER(NLLLoss, float, int64_t);
DEFINE_KERNEL_LAUNCHER(NLLLoss, double, double);
DEFINE_KERNEL_LAUNCHER(NLLLoss, double, int64_t);
DEFINE_KERNEL_LAUNCHER(NLLLossGrad, float, float);
DEFINE_KERNEL_LAUNCHER(NLLLossGrad, float, int64_t);
DEFINE_KERNEL_LAUNCHER(NLLLossGrad, double, double);
DEFINE_KERNEL_LAUNCHER(NLLLossGrad, double, int64_t);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -87,12 +87,10 @@ DEFINE_KERNEL_LAUNCHER(NLLLoss, float, float);
DEFINE_KERNEL_LAUNCHER(NLLLoss, float, int64_t);
DEFINE_KERNEL_LAUNCHER(NLLLoss, double, double);
DEFINE_KERNEL_LAUNCHER(NLLLoss, double, int64_t);
DEFINE_KERNEL_LAUNCHER(NLLLossGrad, float, float);
DEFINE_KERNEL_LAUNCHER(NLLLossGrad, float, int64_t);
DEFINE_KERNEL_LAUNCHER(NLLLossGrad, double, double);
DEFINE_KERNEL_LAUNCHER(NLLLossGrad, double, int64_t);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -68,10 +68,8 @@ void _SigmoidCrossEntropyGrad(
DEFINE_KERNEL_LAUNCHER(SigmoidCrossEntropy, float);
DEFINE_KERNEL_LAUNCHER(SigmoidCrossEntropy, double);
DEFINE_KERNEL_LAUNCHER(SigmoidCrossEntropyGrad, float);
DEFINE_KERNEL_LAUNCHER(SigmoidCrossEntropyGrad, double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -64,10 +64,8 @@ __global__ void _SigmoidCrossEntropyGrad(
DEFINE_KERNEL_LAUNCHER(SigmoidCrossEntropy, float);
DEFINE_KERNEL_LAUNCHER(SigmoidCrossEntropy, double);
DEFINE_KERNEL_LAUNCHER(SigmoidCrossEntropyGrad, float);
DEFINE_KERNEL_LAUNCHER(SigmoidCrossEntropyGrad, double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -137,12 +137,10 @@ DEFINE_KERNEL_LAUNCHER(SigmoidFocalLoss, float, float);
DEFINE_KERNEL_LAUNCHER(SigmoidFocalLoss, float, int64_t);
DEFINE_KERNEL_LAUNCHER(SigmoidFocalLoss, double, double);
DEFINE_KERNEL_LAUNCHER(SigmoidFocalLoss, double, int64_t);
DEFINE_KERNEL_LAUNCHER(SigmoidFocalLossGrad, float, float);
DEFINE_KERNEL_LAUNCHER(SigmoidFocalLossGrad, float, int64_t);
DEFINE_KERNEL_LAUNCHER(SigmoidFocalLossGrad, double, double);
DEFINE_KERNEL_LAUNCHER(SigmoidFocalLossGrad, double, int64_t);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -132,12 +132,10 @@ DEFINE_KERNEL_LAUNCHER(SigmoidFocalLoss, float, float);
DEFINE_KERNEL_LAUNCHER(SigmoidFocalLoss, float, int64_t);
DEFINE_KERNEL_LAUNCHER(SigmoidFocalLoss, double, double);
DEFINE_KERNEL_LAUNCHER(SigmoidFocalLoss, double, int64_t);
DEFINE_KERNEL_LAUNCHER(SigmoidFocalLossGrad, float, float);
DEFINE_KERNEL_LAUNCHER(SigmoidFocalLossGrad, float, int64_t);
DEFINE_KERNEL_LAUNCHER(SigmoidFocalLossGrad, double, double);
DEFINE_KERNEL_LAUNCHER(SigmoidFocalLossGrad, double, int64_t);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -57,10 +57,8 @@ void SmoothL1Grad<float16, CPUContext>(
DEFINE_KERNEL_LAUNCHER(SmoothL1, float);
DEFINE_KERNEL_LAUNCHER(SmoothL1, double);
DEFINE_KERNEL_LAUNCHER(SmoothL1Grad, float);
DEFINE_KERNEL_LAUNCHER(SmoothL1Grad, double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -90,10 +90,8 @@ void SmoothL1Grad<float16, CUDAContext>(
DEFINE_KERNEL_LAUNCHER(SmoothL1, float);
DEFINE_KERNEL_LAUNCHER(SmoothL1, double);
DEFINE_KERNEL_LAUNCHER(SmoothL1Grad, float);
DEFINE_KERNEL_LAUNCHER(SmoothL1Grad, double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -37,7 +37,6 @@ void _SoftmaxCrossEntropy(
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -41,7 +41,6 @@ __global__ void _SoftmaxCrossEntropy(
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -95,12 +95,10 @@ DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropy, float, float);
DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropy, float, int64_t);
DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropy, double, double);
DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropy, double, int64_t);
DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropyGrad, float, float);
DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropyGrad, float, int64_t);
DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropyGrad, double, double);
DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropyGrad, double, int64_t);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -93,12 +93,10 @@ DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropy, float, float);
DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropy, float, int64_t);
DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropy, double, double);
DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropy, double, int64_t);
DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropyGrad, float, float);
DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropyGrad, float, int64_t);
DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropyGrad, double, double);
DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropyGrad, double, int64_t);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -104,11 +104,9 @@ DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float16);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -160,10 +160,8 @@ DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -155,7 +155,7 @@ void Moments<float16, float, CPUContext>(
CPU_FP16_NOT_SUPPORTED;
}
#define DEFINE_MOMENTS_KERNEL_LAUNCHER(Tx, Ty) \
#define DEFINE_KERNEL_LAUNCHER(Tx, Ty) \
template <> \
void Moments<Tx, Ty, CPUContext>( \
const int num_dims, \
......@@ -169,14 +169,13 @@ void Moments<float16, float, CPUContext>(
_Moments(num_dims, dims, num_axes, axes, x, mean, var, ctx); \
}
DEFINE_MOMENTS_KERNEL_LAUNCHER(int8_t, float);
DEFINE_MOMENTS_KERNEL_LAUNCHER(uint8_t, float);
DEFINE_MOMENTS_KERNEL_LAUNCHER(int, float);
DEFINE_MOMENTS_KERNEL_LAUNCHER(int64_t, float);
DEFINE_MOMENTS_KERNEL_LAUNCHER(float, float);
DEFINE_MOMENTS_KERNEL_LAUNCHER(double, double);
#undef DEFINE_MOMENTS_KERNEL_LAUNCHER
DEFINE_KERNEL_LAUNCHER(int8_t, float);
DEFINE_KERNEL_LAUNCHER(uint8_t, float);
DEFINE_KERNEL_LAUNCHER(int, float);
DEFINE_KERNEL_LAUNCHER(int64_t, float);
DEFINE_KERNEL_LAUNCHER(float, float);
DEFINE_KERNEL_LAUNCHER(double, double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -316,7 +316,6 @@ DEFINE_KERNEL_LAUNCHER(int, float);
DEFINE_KERNEL_LAUNCHER(int64_t, float);
DEFINE_KERNEL_LAUNCHER(float, float);
DEFINE_KERNEL_LAUNCHER(double, double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -177,12 +177,6 @@ void L2NormalizeGrad<float16, CPUContext>(
_##name(outer_dim, inner_dim, reduce_dim, (T)scale, (T)eps, x, y); \
}
DEFINE_KERNEL_LAUNCHER(L1Normalize, float);
DEFINE_KERNEL_LAUNCHER(L1Normalize, double);
DEFINE_KERNEL_LAUNCHER(L2Normalize, float);
DEFINE_KERNEL_LAUNCHER(L2Normalize, double);
#undef DEFINE_KERNEL_LAUNCHER
#define DEFINE_GRAD_KERNEL_LAUNCHER(name, T) \
template <> \
void name<T, CPUContext>( \
......@@ -198,10 +192,15 @@ DEFINE_KERNEL_LAUNCHER(L2Normalize, double);
_##name(outer_dim, inner_dim, reduce_dim, (T)scale, (T)eps, dy, x, dx); \
}
DEFINE_KERNEL_LAUNCHER(L1Normalize, float);
DEFINE_KERNEL_LAUNCHER(L1Normalize, double);
DEFINE_KERNEL_LAUNCHER(L2Normalize, float);
DEFINE_KERNEL_LAUNCHER(L2Normalize, double);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, double);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -170,14 +170,6 @@ __global__ void _L2NormalizeGrad(
reinterpret_cast<ScalarT*>(y)); \
}
DEFINE_KERNEL_LAUNCHER(L1Normalize, float16, half, float);
DEFINE_KERNEL_LAUNCHER(L1Normalize, float, float, float);
DEFINE_KERNEL_LAUNCHER(L1Normalize, double, double, double);
DEFINE_KERNEL_LAUNCHER(L2Normalize, float16, half, float);
DEFINE_KERNEL_LAUNCHER(L2Normalize, float, float, float);
DEFINE_KERNEL_LAUNCHER(L2Normalize, double, double, double);
#undef DEFINE_KERNEL_LAUNCHER
#define DEFINE_GRAD_KERNEL_LAUNCHER(name, T, ScalarT, AccT) \
template <> \
void name<T, CUDAContext>( \
......@@ -203,12 +195,19 @@ DEFINE_KERNEL_LAUNCHER(L2Normalize, double, double, double);
reinterpret_cast<ScalarT*>(dx)); \
}
DEFINE_KERNEL_LAUNCHER(L1Normalize, float16, half, float);
DEFINE_KERNEL_LAUNCHER(L1Normalize, float, float, float);
DEFINE_KERNEL_LAUNCHER(L1Normalize, double, double, double);
DEFINE_KERNEL_LAUNCHER(L2Normalize, float16, half, float);
DEFINE_KERNEL_LAUNCHER(L2Normalize, float, float, float);
DEFINE_KERNEL_LAUNCHER(L2Normalize, double, double, double);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, float16, half, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, float, float, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, double, double, double);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, float16, half, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, float, float, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, double, double, double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -293,10 +293,8 @@ void _AvgPool2dGradNHWC(
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -325,10 +325,8 @@ __global__ void _AvgPool2dGradNHWC(
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -76,7 +76,6 @@ DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -103,7 +103,6 @@ DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -304,7 +304,6 @@ void _Col2Im2dNHWC(
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -356,7 +356,6 @@ __global__ void _Col2Im2dNHWC(
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -243,6 +243,7 @@ void DepthwiseConv2d<float, CPUContext>(
DEFINE_GRAD_KERNEL_LAUNCHER(float16);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
#undef DEFINE_GRAD_KERNEL_LAUNCHER
#undef DISPATCH_DATA_KERNEL
} // namespace kernel
......
......@@ -12,9 +12,9 @@ namespace kernel {
namespace {
#if __CUDA_ARCH__ >= 350
#define LOAD(x, i) __ldg(x + i)
#define LDG(x, i) __ldg(x + i)
#else
#define LOAD(x, i) x[i]
#define LDG(x, i) x[i]
#endif
template <typename T, typename AccT, int KKH, int KKW>
......@@ -60,7 +60,7 @@ __global__ void _DepthwiseConv2dNCHW(
iw = iw_start + kw * dilation_w;
if (ih >= 0 && ih < H && iw >= 0 && iw < W) {
xi = x_start + ih * W + iw;
sum_val += convert::To<AccT>(Multiplies(LOAD(x, xi), LOAD(w, wi)));
sum_val += convert::To<AccT>(Multiplies(LDG(x, xi), LDG(w, wi)));
}
++wi;
} // End kw
......@@ -112,7 +112,7 @@ __global__ void _DepthwiseConv2dNHWC(
iw = iw_start + kw * dilation_w;
if (ih >= 0 && ih < H && iw >= 0 && iw < W) {
xi = ((x_start + ih) * W + iw) * C + c;
sum_val += convert::To<AccT>(Multiplies(LOAD(x, xi), LOAD(w, wi)));
sum_val += convert::To<AccT>(Multiplies(LDG(x, xi), LDG(w, wi)));
}
++wi;
} // End kw
......@@ -164,7 +164,7 @@ __global__ void _DepthwiseConv2dGradNCHW(
ow = ow / stride_w;
if (oh >= 0 && oh < out_h && ow >= 0 && ow < out_w) {
yi = y_start + oh * out_w + ow;
sum_val += convert::To<AccT>(Multiplies(LOAD(dy, yi), LOAD(w, wi)));
sum_val += convert::To<AccT>(Multiplies(LDG(dy, yi), LDG(w, wi)));
}
}
++wi;
......@@ -217,7 +217,7 @@ __global__ void _DepthwiseConv2dGradNHWC(
ow = ow / stride_w;
if (oh >= 0 && oh < out_h && ow >= 0 && ow < out_w) {
yi = ((y_start + oh) * out_w + ow) * C + c;
sum_val += convert::To<AccT>(Multiplies(LOAD(dy, yi), LOAD(w, wi)));
sum_val += convert::To<AccT>(Multiplies(LDG(dy, yi), LDG(w, wi)));
}
}
++wi;
......@@ -267,7 +267,7 @@ __global__ void _DepthwiseConv2dWGradNCHW(
if (ih >= 0 && iw >= 0 && ih < H && iw < W) {
xi = ((i * C + c) * H + ih) * W + iw;
yi = (i * C + c) * out_h * out_w + j;
sum_val += convert::To<AccT>(Multiplies(LOAD(dy, yi), LOAD(x, xi)));
sum_val += convert::To<AccT>(Multiplies(LDG(dy, yi), LDG(x, xi)));
}
}
}
......@@ -320,7 +320,7 @@ __global__ void _DepthwiseConv2dWGradNHWC(
if (ih >= 0 && iw >= 0 && ih < H && iw < W) {
xi = ((i * H + ih) * W + iw) * C + c;
yi = (i * ohw + j) * C + c;
sum_val += convert::To<AccT>(Multiplies(LOAD(dy, yi), LOAD(x, xi)));
sum_val += convert::To<AccT>(Multiplies(LDG(dy, yi), LDG(x, xi)));
}
}
}
......@@ -333,7 +333,7 @@ __global__ void _DepthwiseConv2dWGradNHWC(
}
}
#undef LOAD
#undef LDG
} // namespace
......@@ -528,10 +528,8 @@ __global__ void _DepthwiseConv2dWGradNHWC(
DEFINE_KERNEL_LAUNCHER(float16, half, float);
DEFINE_KERNEL_LAUNCHER(float, float, float);
DEFINE_GRAD_KERNEL_LAUNCHER(float16, half, float);
DEFINE_GRAD_KERNEL_LAUNCHER(float, float, float);
#undef DISPATCH_DATA_KERNEL
#undef DISPATCH_WEIGHT_KERNEL
#undef DEFINE_KERNEL_LAUNCHER
......
......@@ -287,10 +287,8 @@ void _MaxPool2dGradNHWC(
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -335,10 +335,8 @@ __global__ void _MaxPool2dGradNHWC(
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -272,10 +272,8 @@ DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -11,11 +11,9 @@ namespace kernel {
namespace {
#if __CUDA_ARCH__ >= 350
#define LOAD(x, i) __ldg(x + i)
#define LOADF(x, i) __half2float(__ldg(x + i))
#define LDG(x, i) convert::To<float>(__ldg(x + i))
#else
#define LOAD(x, i) x[i]
#define LOADF(x, i) __half2float(x[i])
#define LDG(x, i) convert::To<float>(x[i])
#endif
template <typename T>
......@@ -70,53 +68,13 @@ __global__ void _ResizeLinearNCHW(
const float u = w_in - li;
const int offset = (n * C + c) * H;
const float tl = LOAD(x, ((offset + ti) * W + li));
const float tr = LOAD(x, ((offset + ti) * W + ri));
const float bl = LOAD(x, ((offset + bi) * W + li));
const float br = LOAD(x, ((offset + bi) * W + ri));
const float tl = LDG(x, ((offset + ti) * W + li));
const float tr = LDG(x, ((offset + ti) * W + ri));
const float bl = LDG(x, ((offset + bi) * W + li));
const float br = LDG(x, ((offset + bi) * W + ri));
const float t = tl + (tr - tl) * u;
const float b = bl + (br - bl) * u;
y[yi] = (T)(t + (b - t) * v);
}
}
template <>
__global__ void _ResizeLinearNCHW<half>(
const int nthreads,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const float scale_h,
const float scale_w,
const bool align_corners,
const half* x,
half* y) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int w = yi % out_w;
const int h = (yi / out_w) % out_h;
const int c = (yi / out_w / out_h) % C;
const int n = yi / out_w / out_w / C;
const float h_in = TransformCoordinate(h, scale_h, align_corners);
const int ti = floorf(h_in);
const int bi = h_in < H - 1 ? ceilf(h_in) : H - 1;
const float v = h_in - ti;
const float w_in = TransformCoordinate(w, scale_w, align_corners);
const int li = floorf(w_in);
const int ri = (w_in < W - 1) ? ceilf(w_in) : W - 1;
const float u = w_in - li;
const int offset = (n * C + c) * H;
const float tl = LOADF(x, ((offset + ti) * W + li));
const float tr = LOADF(x, ((offset + ti) * W + ri));
const float bl = LOADF(x, ((offset + bi) * W + li));
const float br = LOADF(x, ((offset + bi) * W + ri));
const float t = tl + (tr - tl) * u;
const float b = bl + (br - bl) * u;
y[yi] = __float2half(t + (b - t) * v);
y[yi] = convert::To<T>(t + (b - t) * v);
}
}
......@@ -150,53 +108,13 @@ __global__ void _ResizeLinearNHWC(
const float u = w_in - li;
const int offset = n * H;
const float tl = LOAD(x, (((offset + ti) * W + li) * C + c));
const float tr = LOAD(x, (((offset + ti) * W + ri) * C + c));
const float bl = LOAD(x, (((offset + bi) * W + li) * C + c));
const float br = LOAD(x, (((offset + bi) * W + ri) * C + c));
const float t = tl + (tr - tl) * u;
const float b = bl + (br - bl) * u;
y[yi] = (T)(t + (b - t) * v);
}
}
template <>
__global__ void _ResizeLinearNHWC<half>(
const int nthreads,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const float scale_h,
const float scale_w,
const bool align_corners,
const half* x,
half* y) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int c = yi % C;
const int w = (yi / C) % out_w;
const int h = (yi / C / out_w) % out_h;
const int n = yi / C / out_w / out_h;
const float h_in = TransformCoordinate(h, scale_h, align_corners);
const int ti = floorf(h_in);
const int bi = (h_in < H - 1) ? ceilf(h_in) : H - 1;
const float v = h_in - ti;
const float w_in = TransformCoordinate(w, scale_w, align_corners);
const int li = floorf(w_in);
const int ri = (w_in < W - 1) ? ceilf(w_in) : W - 1;
const float u = w_in - li;
const int offset = n * H;
const float tl = LOADF(x, (((offset + ti) * W + li) * C + c));
const float tr = LOADF(x, (((offset + ti) * W + ri) * C + c));
const float bl = LOADF(x, (((offset + bi) * W + li) * C + c));
const float br = LOADF(x, (((offset + bi) * W + ri) * C + c));
const float tl = LDG(x, (((offset + ti) * W + li) * C + c));
const float tr = LDG(x, (((offset + ti) * W + ri) * C + c));
const float bl = LDG(x, (((offset + bi) * W + li) * C + c));
const float br = LDG(x, (((offset + bi) * W + ri) * C + c));
const float t = tl + (tr - tl) * u;
const float b = bl + (br - bl) * u;
y[yi] = __float2half(t + (b - t) * v);
y[yi] = convert::To<T>(t + (b - t) * v);
}
}
......@@ -229,48 +147,8 @@ __global__ void _ResizeLinearGradNCHW(
const int ri = (w_in < W - 1) ? ceilf(w_in) : W - 1;
const float u = w_in - li;
const float dt = (1.f - v) * LOAD(dy, yi);
const float db = v * LOAD(dy, yi);
const int offset = (n * C + c) * H;
atomicAdd(&dx[(offset + ti) * W + li], (1.f - u) * dt);
atomicAdd(&dx[(offset + ti) * W + ri], u * dt);
atomicAdd(&dx[(offset + bi) * W + li], (1.f - u) * db);
atomicAdd(&dx[(offset + bi) * W + ri], u * db);
}
}
template <>
__global__ void _ResizeLinearGradNCHW<half>(
const int nthreads,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const float scale_h,
const float scale_w,
const bool align_corners,
const half* dy,
float* dx) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int w = yi % out_w;
const int h = (yi / out_w) % out_h;
const int c = (yi / out_w / out_h) % C;
const int n = yi / out_w / out_w / C;
const float h_in = TransformCoordinate(h, scale_h, align_corners);
const int ti = floorf(h_in);
const int bi = (h_in < H - 1) ? ceilf(h_in) : H - 1;
const float v = h_in - ti;
const float w_in = TransformCoordinate(w, scale_w, align_corners);
const int li = floorf(w_in);
const int ri = (w_in < W - 1) ? ceilf(w_in) : W - 1;
const float u = w_in - li;
const float dt = (1.f - v) * LOADF(dy, yi);
const float db = v * LOADF(dy, yi);
const float dt = (1.f - v) * LDG(dy, yi);
const float db = v * LDG(dy, yi);
const int offset = (n * C + c) * H;
atomicAdd(&dx[(offset + ti) * W + li], (1.f - u) * dt);
......@@ -309,48 +187,8 @@ __global__ void _ResizeLinearGradNHWC(
const int ri = (w_in < W - 1) ? ceilf(w_in) : W - 1;
const float u = w_in - li;
const float dt = (1.f - v) * LOAD(dy, yi);
const float db = v * LOAD(dy, yi);
const int offset = n * H;
atomicAdd(&dx[((offset + ti) * W + li) * C + c], (1.f - u) * dt);
atomicAdd(&dx[((offset + ti) * W + ri) * C + c], u * dt);
atomicAdd(&dx[((offset + bi) * W + li) * C + c], (1.f - u) * db);
atomicAdd(&dx[((offset + bi) * W + ri) * C + c], u * db);
}
}
template <>
__global__ void _ResizeLinearGradNHWC<half>(
const int nthreads,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const float scale_h,
const float scale_w,
const bool align_corners,
const half* dy,
float* dx) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int c = yi % C;
const int w = (yi / C) % out_w;
const int h = (yi / C / out_w) % out_h;
const int n = yi / C / out_w / out_h;
const float h_in = TransformCoordinate(h, scale_h, align_corners);
const int ti = floorf(h_in);
const int bi = (h_in < H - 1) ? ceilf(h_in) : H - 1;
const float v = h_in - ti;
const float w_in = TransformCoordinate(w, scale_w, align_corners);
const int li = floorf(w_in);
const int ri = (w_in < W - 1) ? ceilf(w_in) : W - 1;
const float u = w_in - li;
const float dt = (1.f - v) * LOADF(dy, yi);
const float db = v * LOADF(dy, yi);
const float dt = (1.f - v) * LDG(dy, yi);
const float db = v * LDG(dy, yi);
const int offset = n * H;
atomicAdd(&dx[((offset + ti) * W + li) * C + c], (1.f - u) * dt);
......@@ -360,249 +198,105 @@ __global__ void _ResizeLinearGradNHWC<half>(
}
}
#undef LOAD
#undef LOADF
#undef LDG
} // namespace
/* ------------------- Launcher Separator ------------------- */
template <>
void ResizeLinear<float16, CUDAContext>(
const int N,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const bool align_corners,
const string& data_format,
const float16* x,
float16* y,
CUDAContext* ctx) {
auto nthreads = N * C * out_h * out_w;
auto scale_h = ComputeScale(H, out_h, align_corners);
auto scale_w = ComputeScale(W, out_w, align_corners);
if (data_format == "NCHW") {
_ResizeLinearNCHW<<<
CUDA_BLOCKS(nthreads),
CUDA_THREADS,
0,
ctx->cuda_stream()>>>(
nthreads,
C,
H,
W,
out_h,
out_w,
scale_h,
scale_w,
align_corners,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
} else if (data_format == "NHWC") {
_ResizeLinearNHWC<<<
CUDA_BLOCKS(nthreads),
CUDA_THREADS,
0,
ctx->cuda_stream()>>>(
nthreads,
C,
H,
W,
out_h,
out_w,
scale_h,
scale_w,
align_corners,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
} else {
LOG(FATAL) << "Unknown data format: " << data_format;
}
}
template <>
void ResizeLinearGrad<float16, CUDAContext>(
const int N,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const bool align_corners,
const string& data_format,
const float16* dy,
float* dx,
CUDAContext* ctx) {
auto nthreads = N * C * out_h * out_w;
auto scale_h = ComputeScale(H, out_h, align_corners);
auto scale_w = ComputeScale(W, out_w, align_corners);
math::Set(N * C * H * W, 0.f, dx, ctx);
if (data_format == "NCHW") {
_ResizeLinearGradNCHW<<<
CUDA_BLOCKS(nthreads),
CUDA_THREADS,
0,
ctx->cuda_stream()>>>(
nthreads,
C,
H,
W,
out_h,
out_w,
scale_h,
scale_w,
align_corners,
reinterpret_cast<const half*>(dy),
dx);
} else if (data_format == "NHWC") {
_ResizeLinearGradNHWC<<<
CUDA_BLOCKS(nthreads),
CUDA_THREADS,
0,
ctx->cuda_stream()>>>(
nthreads,
C,
H,
W,
out_h,
out_w,
scale_h,
scale_w,
align_corners,
reinterpret_cast<const half*>(dy),
dx);
} else {
LOG(FATAL) << "Unknown data format: " << data_format;
#define DISPATCH_RESIZE_KERNEL(name, T, nblocks, nthreads, ...) \
if (data_format == "NCHW") { \
name##NCHW<<<nblocks, nthreads, 0, ctx->cuda_stream()>>>(__VA_ARGS__); \
} else if (data_format == "NHWC") { \
name##NHWC<<<nblocks, nthreads, 0, ctx->cuda_stream()>>>(__VA_ARGS__); \
} else { \
LOG(FATAL) << "Unknown DataFormat: " << data_format; \
}
}
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void ResizeLinear<T, CUDAContext>( \
const int N, \
const int C, \
const int H, \
const int W, \
const int out_h, \
const int out_w, \
const bool align_corners, \
const string& data_format, \
const T* x, \
T* y, \
CUDAContext* ctx) { \
auto nthreads = N * C * out_h * out_w; \
auto scale_h = ComputeScale(H, out_h, align_corners); \
auto scale_w = ComputeScale(W, out_w, align_corners); \
if (data_format == "NCHW") { \
_ResizeLinearNCHW<<< \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>( \
nthreads, \
C, \
H, \
W, \
out_h, \
out_w, \
scale_h, \
scale_w, \
align_corners, \
x, \
y); \
} else if (data_format == "NHWC") { \
_ResizeLinearNHWC<<< \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>( \
nthreads, \
C, \
H, \
W, \
out_h, \
out_w, \
scale_h, \
scale_w, \
align_corners, \
x, \
y); \
} else { \
LOG(FATAL) << "Unknown data format: " << data_format; \
} \
#define DEFINE_KERNEL_LAUNCHER(T, ScalarT) \
template <> \
void ResizeLinear<T, CUDAContext>( \
const int N, \
const int C, \
const int H, \
const int W, \
const int out_h, \
const int out_w, \
const bool align_corners, \
const string& data_format, \
const T* x, \
T* y, \
CUDAContext* ctx) { \
auto nthreads = N * C * out_h * out_w; \
auto scale_h = ComputeScale(H, out_h, align_corners); \
auto scale_w = ComputeScale(W, out_w, align_corners); \
DISPATCH_RESIZE_KERNEL( \
_ResizeLinear, \
ScalarT, \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
nthreads, \
C, \
H, \
W, \
out_h, \
out_w, \
scale_h, \
scale_w, \
align_corners, \
reinterpret_cast<const ScalarT*>(x), \
reinterpret_cast<ScalarT*>(y)); \
}
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void ResizeLinearGrad<T, CUDAContext>( \
const int N, \
const int C, \
const int H, \
const int W, \
const int out_h, \
const int out_w, \
const bool align_corners, \
const string& data_format, \
const T* dy, \
float* dx, \
CUDAContext* ctx) { \
auto nthreads = N * C * out_h * out_w; \
auto scale_h = ComputeScale(H, out_h, align_corners); \
auto scale_w = ComputeScale(W, out_w, align_corners); \
math::Set(N* C* H* W, 0.f, dx, ctx); \
if (data_format == "NCHW") { \
_ResizeLinearGradNCHW<<< \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>( \
nthreads, \
C, \
H, \
W, \
out_h, \
out_w, \
scale_h, \
scale_w, \
align_corners, \
dy, \
dx); \
} else if (data_format == "NHWC") { \
_ResizeLinearGradNHWC<<< \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>( \
nthreads, \
C, \
H, \
W, \
out_h, \
out_w, \
scale_h, \
scale_w, \
align_corners, \
dy, \
dx); \
} else { \
LOG(FATAL) << "Unknown data format: " << data_format; \
} \
#define DEFINE_GRAD_KERNEL_LAUNCHER(T, ScalarT) \
template <> \
void ResizeLinearGrad<T, CUDAContext>( \
const int N, \
const int C, \
const int H, \
const int W, \
const int out_h, \
const int out_w, \
const bool align_corners, \
const string& data_format, \
const T* dy, \
float* dx, \
CUDAContext* ctx) { \
auto nthreads = N * C * out_h * out_w; \
auto scale_h = ComputeScale(H, out_h, align_corners); \
auto scale_w = ComputeScale(W, out_w, align_corners); \
math::Set(N* C* H* W, 0.f, dx, ctx); \
DISPATCH_RESIZE_KERNEL( \
_ResizeLinearGrad, \
ScalarT, \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
nthreads, \
C, \
H, \
W, \
out_h, \
out_w, \
scale_h, \
scale_w, \
align_corners, \
reinterpret_cast<const ScalarT*>(dy), \
dx); \
}
DEFINE_KERNEL_LAUNCHER(int8_t);
DEFINE_KERNEL_LAUNCHER(uint8_t);
DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
DEFINE_KERNEL_LAUNCHER(int8_t, int8_t);
DEFINE_KERNEL_LAUNCHER(uint8_t, uint8_t);
DEFINE_KERNEL_LAUNCHER(int, int);
DEFINE_KERNEL_LAUNCHER(int64_t, int64_t);
DEFINE_KERNEL_LAUNCHER(float16, half);
DEFINE_KERNEL_LAUNCHER(float, float);
DEFINE_KERNEL_LAUNCHER(double, double);
DEFINE_GRAD_KERNEL_LAUNCHER(float16, half);
DEFINE_GRAD_KERNEL_LAUNCHER(float, float);
DEFINE_GRAD_KERNEL_LAUNCHER(double, double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
#undef DISPATCH_RESIZE_KERNEL
} // namespace kernel
......
......@@ -181,10 +181,8 @@ DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -10,6 +10,14 @@ namespace kernel {
namespace {
#if __CUDA_ARCH__ >= 350
#define LDG(x, i) __ldg(x + i)
#define LDG2(x, i) convert::To<float>(__ldg(x + i))
#else
#define LDG(x, i) x[i]
#define LDG2(x, i) convert::To<float>(x[i])
#endif
template <typename T>
__global__ void _ResizeNearestNCHW(
const int nthreads,
......@@ -29,11 +37,7 @@ __global__ void _ResizeNearestNCHW(
const int n = yi / out_w / out_h / C;
const int h_in = min(int(h * scale_h), H - 1);
const int w_in = min(int(w * scale_w), W - 1);
#if __CUDA_ARCH__ >= 350
y[yi] = __ldg(x + (((n * C + c) * H + h_in) * W + w_in));
#else
y[yi] = x[((n * C + c) * H + h_in) * W + w_in];
#endif
y[yi] = LDG(x, (((n * C + c) * H + h_in) * W + w_in));
}
}
......@@ -56,11 +60,7 @@ __global__ void _ResizeNearestNHWC(
const int n = yi / C / out_w / out_h;
const int h_in = min(int(h * scale_h), H - 1);
const int w_in = min(int(w * scale_w), W - 1);
#if __CUDA_ARCH__ >= 350
y[yi] = __ldg(x + (((n * H + h_in) * W + w_in) * C + c));
#else
y[yi] = x[((n * H + h_in) * W + w_in) * C + c];
#endif
y[yi] = LDG(x, (((n * H + h_in) * W + w_in) * C + c));
}
}
......@@ -83,39 +83,7 @@ __global__ void _ResizeNearestGradNCHW(
const int n = yi / out_w / out_h / C;
const int h_in = min(int(h * scale_h), H - 1);
const int w_in = min(int(w * scale_w), W - 1);
#if __CUDA_ARCH__ >= 350
atomicAdd(&dx[((n * C + c) * H + h_in) * W + w_in], (float)__ldg(dy + yi));
#else
atomicAdd(&dx[((n * C + c) * H + h_in) * W + w_in], (float)dy[yi]);
#endif
}
}
template <>
__global__ void _ResizeNearestGradNCHW<half>(
const int nthreads,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const float scale_h,
const float scale_w,
const half* dy,
float* dx) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int w = yi % out_w;
const int h = (yi / out_w) % out_h;
const int c = (yi / out_w / out_h) % C;
const int n = yi / out_w / out_h / C;
const int h_in = min(int(h * scale_h), H - 1);
const int w_in = min(int(w * scale_w), W - 1);
#if __CUDA_ARCH__ >= 350
atomicAdd(
&dx[((n * C + c) * H + h_in) * W + w_in], __half2float(__ldg(dy + yi)));
#else
atomicAdd(&dx[((n * C + c) * H + h_in) * W + w_in], __half2float(dy[yi]));
#endif
atomicAdd(&dx[((n * C + c) * H + h_in) * W + w_in], LDG2(dy, yi));
}
}
......@@ -138,234 +106,106 @@ __global__ void _ResizeNearestGradNHWC(
const int n = yi / C / out_w / out_h;
const int h_in = min(int(h * scale_h), H - 1);
const int w_in = min(int(w * scale_w), W - 1);
#if __CUDA_ARCH__ >= 350
atomicAdd(&dx[((n * H + h_in) * W + w_in) * C + c], (float)__ldg(dy + yi));
#else
atomicAdd(&dx[((n * H + h_in) * W + w_in) * C + c], (float)dy[yi]);
#endif
atomicAdd(&dx[((n * H + h_in) * W + w_in) * C + c], LDG2(dy, yi));
}
}
template <>
__global__ void _ResizeNearestGradNHWC<half>(
const int nthreads,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const float scale_h,
const float scale_w,
const half* dy,
float* dx) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int c = yi % C;
const int w = (yi / C) % out_w;
const int h = (yi / C / out_w) % out_h;
const int n = yi / C / out_w / out_h;
const int h_in = min(int(h * scale_h), H - 1);
const int w_in = min(int(w * scale_w), W - 1);
#if __CUDA_ARCH__ >= 350
atomicAdd(
&dx[((n * H + h_in) * W + w_in) * C + c], __half2float(__ldg(dy + yi)));
#else
atomicAdd(&dx[((n * H + h_in) * W + w_in) * C + c], __half2float(dy[yi]));
#endif
}
}
#undef LDG
#undef LDG2
} // namespace
/* ------------------- Launcher Separator ------------------- */
template <>
void ResizeNearest<float16, CUDAContext>(
const int N,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const string& data_format,
const float16* x,
float16* y,
CUDAContext* ctx) {
auto nthreads = N * C * out_h * out_w;
auto scale_h = (float)H / (float)out_h;
auto scale_w = (float)W / (float)out_w;
if (data_format == "NCHW") {
_ResizeNearestNCHW<<<
CUDA_BLOCKS(nthreads),
CUDA_THREADS,
0,
ctx->cuda_stream()>>>(
nthreads,
C,
H,
W,
out_h,
out_w,
scale_h,
scale_w,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
} else if (data_format == "NHWC") {
_ResizeNearestNHWC<<<
CUDA_BLOCKS(nthreads),
CUDA_THREADS,
0,
ctx->cuda_stream()>>>(
nthreads,
C,
H,
W,
out_h,
out_w,
scale_h,
scale_w,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
} else {
LOG(FATAL) << "Unknown data format: " << data_format;
}
}
template <>
void ResizeNearestGrad<float16, CUDAContext>(
const int N,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const string& data_format,
const float16* dy,
float* dx,
CUDAContext* ctx) {
auto nthreads = N * C * out_h * out_w;
auto scale_h = (float)H / (float)out_h;
auto scale_w = (float)W / (float)out_w;
math::Set(N * C * H * W, 0.f, dx, ctx);
if (data_format == "NCHW") {
_ResizeNearestGradNCHW<<<
CUDA_BLOCKS(nthreads),
CUDA_THREADS,
0,
ctx->cuda_stream()>>>(
nthreads,
C,
H,
W,
out_h,
out_w,
scale_h,
scale_w,
reinterpret_cast<const half*>(dy),
dx);
} else if (data_format == "NHWC") {
_ResizeNearestGradNHWC<<<
CUDA_BLOCKS(nthreads),
CUDA_THREADS,
0,
ctx->cuda_stream()>>>(
nthreads,
C,
H,
W,
out_h,
out_w,
scale_h,
scale_w,
reinterpret_cast<const half*>(dy),
dx);
} else {
LOG(FATAL) << "Unknown data format: " << data_format;
#define DISPATCH_RESIZE_KERNEL(name, T, nblocks, nthreads, ...) \
if (data_format == "NCHW") { \
name##NCHW<<<nblocks, nthreads, 0, ctx->cuda_stream()>>>(__VA_ARGS__); \
} else if (data_format == "NHWC") { \
name##NHWC<<<nblocks, nthreads, 0, ctx->cuda_stream()>>>(__VA_ARGS__); \
} else { \
LOG(FATAL) << "Unknown DataFormat: " << data_format; \
}
}
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void ResizeNearest<T, CUDAContext>( \
const int N, \
const int C, \
const int H, \
const int W, \
const int out_h, \
const int out_w, \
const string& data_format, \
const T* x, \
T* y, \
CUDAContext* ctx) { \
auto nthreads = N * C * out_h * out_w; \
auto scale_h = (float)H / (float)out_h; \
auto scale_w = (float)W / (float)out_w; \
if (data_format == "NCHW") { \
_ResizeNearestNCHW<<< \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>( \
nthreads, C, H, W, out_h, out_w, scale_h, scale_w, x, y); \
} else if (data_format == "NHWC") { \
_ResizeNearestNHWC<<< \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>( \
nthreads, C, H, W, out_h, out_w, scale_h, scale_w, x, y); \
} else { \
LOG(FATAL) << "Unknown data format: " << data_format; \
} \
#define DEFINE_KERNEL_LAUNCHER(T, ScalarT) \
template <> \
void ResizeNearest<T, CUDAContext>( \
const int N, \
const int C, \
const int H, \
const int W, \
const int out_h, \
const int out_w, \
const string& data_format, \
const T* x, \
T* y, \
CUDAContext* ctx) { \
auto nthreads = N * C * out_h * out_w; \
auto scale_h = (float)H / (float)out_h; \
auto scale_w = (float)W / (float)out_w; \
DISPATCH_RESIZE_KERNEL( \
_ResizeNearest, \
ScalarT, \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
nthreads, \
C, \
H, \
W, \
out_h, \
out_w, \
scale_h, \
scale_w, \
reinterpret_cast<const ScalarT*>(x), \
reinterpret_cast<ScalarT*>(y)); \
}
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void ResizeNearestGrad<T, CUDAContext>( \
const int N, \
const int C, \
const int H, \
const int W, \
const int out_h, \
const int out_w, \
const string& data_format, \
const T* dy, \
float* dx, \
CUDAContext* ctx) { \
auto nthreads = N * C * out_h * out_w; \
auto scale_h = (float)H / (float)out_h; \
auto scale_w = (float)W / (float)out_w; \
math::Set(N* C* H* W, 0.f, dx, ctx); \
if (data_format == "NCHW") { \
_ResizeNearestGradNCHW<<< \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>( \
nthreads, C, H, W, out_h, out_w, scale_h, scale_w, dy, dx); \
} else if (data_format == "NHWC") { \
_ResizeNearestGradNHWC<<< \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>( \
nthreads, C, H, W, out_h, out_w, scale_h, scale_w, dy, dx); \
} else { \
LOG(FATAL) << "Unknown data format: " << data_format; \
} \
#define DEFINE_GRAD_KERNEL_LAUNCHER(T, ScalarT) \
template <> \
void ResizeNearestGrad<T, CUDAContext>( \
const int N, \
const int C, \
const int H, \
const int W, \
const int out_h, \
const int out_w, \
const string& data_format, \
const T* dy, \
float* dx, \
CUDAContext* ctx) { \
auto nthreads = N * C * out_h * out_w; \
auto scale_h = (float)H / (float)out_h; \
auto scale_w = (float)W / (float)out_w; \
math::Set(N* C* H* W, 0.f, dx, ctx); \
DISPATCH_RESIZE_KERNEL( \
_ResizeNearestGrad, \
ScalarT, \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
nthreads, \
C, \
H, \
W, \
out_h, \
out_w, \
scale_h, \
scale_w, \
reinterpret_cast<const ScalarT*>(dy), \
dx); \
}
DEFINE_KERNEL_LAUNCHER(int8_t);
DEFINE_KERNEL_LAUNCHER(uint8_t);
DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
DEFINE_KERNEL_LAUNCHER(int8_t, int8_t);
DEFINE_KERNEL_LAUNCHER(uint8_t, uint8_t);
DEFINE_KERNEL_LAUNCHER(int, int);
DEFINE_KERNEL_LAUNCHER(int64_t, int64_t);
DEFINE_KERNEL_LAUNCHER(float16, half);
DEFINE_KERNEL_LAUNCHER(float, float);
DEFINE_KERNEL_LAUNCHER(double, double);
DEFINE_GRAD_KERNEL_LAUNCHER(float16, half);
DEFINE_GRAD_KERNEL_LAUNCHER(float, float);
DEFINE_GRAD_KERNEL_LAUNCHER(double, double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
#undef DISPATCH_RESIZE_KERNEL
} // namespace kernel
......
......@@ -185,11 +185,9 @@ void RoiAlign<float16, CPUContext>(
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float16);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -10,11 +10,9 @@ namespace kernel {
namespace {
#if __CUDA_ARCH__ >= 350
#define LOAD(x, i) __ldg(x + i)
#define LOADF(x, i) __half2float(__ldg(x + i))
#define LDG(x, i) convert::To<float>(__ldg(x + i))
#else
#define LOAD(x, i) x[i]
#define LOADF(x, i) __half2float(x[i])
#define LDG(x, i) convert::To<float>(x[i])
#endif
template <typename T>
......@@ -42,48 +40,10 @@ _RoiAlignIntp(const int H, const int W, float h, float w, const T* x) {
w = (float)li;
}
const float tl = LOAD(x, (ti * W + li));
const float tr = LOAD(x, (ti * W + ri));
const float bl = LOAD(x, (bi * W + li));
const float br = LOAD(x, (bi * W + ri));
const float v = h - ti;
const float u = w - li;
const float t = tl + (tr - tl) * u;
const float b = bl + (br - bl) * u;
return t + (b - t) * v;
}
template <>
__device__ float
_RoiAlignIntp<half>(const int H, const int W, float h, float w, const half* x) {
if (h < -1.f || h > H || w < -1.f || w > W) return 0.f;
if (h <= 0.f) h = 0.f;
if (w <= 0.f) w = 0.f;
int ti = (int)h, bi;
int li = (int)w, ri;
if (ti < H - 1) {
bi = ti + 1;
} else {
ti = bi = H - 1;
h = (float)ti;
}
if (li < W - 1) {
ri = li + 1;
} else {
ri = li = W - 1;
w = (float)li;
}
const float tl = LOADF(x, (ti * W + li));
const float tr = LOADF(x, (ti * W + ri));
const float bl = LOADF(x, (bi * W + li));
const float br = LOADF(x, (bi * W + ri));
const float tl = LDG(x, (ti * W + li));
const float tr = LDG(x, (ti * W + ri));
const float bl = LDG(x, (bi * W + li));
const float br = LDG(x, (bi * W + ri));
const float v = h - ti;
const float u = w - li;
......@@ -133,7 +93,7 @@ __device__ void _RoiAlignIntpParam(
u = w - li;
}
template <typename T>
template <typename T, typename AccT>
__global__ void _RoiAlign(
const int nthreads,
const int C,
......@@ -156,7 +116,7 @@ __global__ void _RoiAlign(
const int batch_ind = roi[0];
if (batch_ind < 0) {
y[yi] = T(0);
y[yi] = convert::To<T>(0.f);
continue;
}
......@@ -180,67 +140,7 @@ __global__ void _RoiAlign(
const T* offset_x = x + (batch_ind * C + c) * H * W;
T val = T(0);
for (int i = 0; i < grid_h; i++) {
const float h = hstart + (i + .5f) * bin_h / grid_h;
for (int j = 0; j < grid_w; j++) {
const float w = wstart + (j + .5f) * bin_w / grid_w;
val += _RoiAlignIntp(H, W, h, w, offset_x);
}
}
y[yi] = val / T(grid_h * grid_w);
}
}
template <>
__global__ void _RoiAlign<half>(
const int nthreads,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const float spatial_scale,
const int sampling_ratio,
const half* x,
const float* rois,
half* y) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int ow = yi % out_w;
const int oh = (yi / out_w) % out_h;
const int c = (yi / out_w / out_h) % C;
const int n = yi / out_w / out_h / C;
const float* roi = rois + n * 5;
const int batch_ind = roi[0];
if (batch_ind < 0) {
y[yi] = __float2half(0.f);
continue;
}
const float roi_start_w = roi[1] * spatial_scale;
const float roi_start_h = roi[2] * spatial_scale;
const float roi_end_w = roi[3] * spatial_scale;
const float roi_end_h = roi[4] * spatial_scale;
const float roi_w = max(roi_end_w - roi_start_w, 1.f);
const float roi_h = max(roi_end_h - roi_start_h, 1.f);
const float bin_h = roi_h / (float)out_h;
const float bin_w = roi_w / (float)out_w;
const int grid_h =
sampling_ratio > 0 ? sampling_ratio : ceil(roi_h / out_h);
const int grid_w =
sampling_ratio > 0 ? sampling_ratio : ceil(roi_w / out_w);
const float hstart = roi_start_h + oh * bin_h;
const float wstart = roi_start_w + ow * bin_w;
const half* offset_x = x + (batch_ind * C + c) * H * W;
float val = 0.f;
AccT val = AccT(0);
for (int i = 0; i < grid_h; i++) {
const float h = hstart + (i + .5f) * bin_h / grid_h;
for (int j = 0; j < grid_w; j++) {
......@@ -249,11 +149,11 @@ __global__ void _RoiAlign<half>(
}
}
y[yi] = __float2half(val / float(grid_h * grid_w));
y[yi] = convert::To<T>(val / AccT(grid_h * grid_w));
}
}
template <typename T>
template <typename T, typename AccT>
__global__ void _RoiAlignGrad(
const int nthreads,
const int C,
......@@ -265,7 +165,7 @@ __global__ void _RoiAlignGrad(
const int sampling_ratio,
const T* dy,
const float* rois,
float* dx) {
AccT* dx) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int ow = yi % out_w;
const int oh = (yi / out_w) % out_h;
......@@ -295,7 +195,7 @@ __global__ void _RoiAlignGrad(
const float hstart = roi_start_h + oh * bin_h;
const float wstart = roi_start_w + ow * bin_w;
const float dyi = dy[yi] / T(grid_h * grid_w);
const float dyi = convert::To<float>(dy[yi]) / float(grid_h * grid_w);
float* offset_dx = dx + (batch_ind * C + c) * H * W;
for (int i = 0; i < grid_h; i++) {
......@@ -318,167 +218,44 @@ __global__ void _RoiAlignGrad(
}
}
template <>
__global__ void _RoiAlignGrad<half>(
const int nthreads,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const float spatial_scale,
const int sampling_ratio,
const half* dy,
const float* rois,
float* dx) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int ow = yi % out_w;
const int oh = (yi / out_w) % out_h;
const int c = (yi / out_w / out_h) % C;
const int n = yi / out_w / out_h / C;
const float* roi = rois + n * 5;
const int batch_ind = roi[0];
if (batch_ind < 0) continue;
const float roi_start_w = roi[1] * spatial_scale;
const float roi_start_h = roi[2] * spatial_scale;
const float roi_end_w = roi[3] * spatial_scale;
const float roi_end_h = roi[4] * spatial_scale;
const float roi_w = max(roi_end_w - roi_start_w, 1.f);
const float roi_h = max(roi_end_h - roi_start_h, 1.f);
const float bin_h = roi_h / (float)out_h;
const float bin_w = roi_w / (float)out_w;
const int grid_h =
sampling_ratio > 0 ? sampling_ratio : ceil(roi_h / out_h);
const int grid_w =
sampling_ratio > 0 ? sampling_ratio : ceil(roi_w / out_w);
const float hstart = roi_start_h + oh * bin_h;
const float wstart = roi_start_w + ow * bin_w;
const float dyi = __half2float(dy[yi]) / float(grid_h * grid_w);
float* offset_dx = dx + (batch_ind * C + c) * H * W;
for (int i = 0; i < grid_h; i++) {
const float h = hstart + (i + .5f) * bin_h / grid_h;
for (int j = 0; j < grid_w; j++) {
const float w = wstart + (j + .5f) * bin_w / grid_w;
int ti, bi, li, ri;
float v, u;
_RoiAlignIntpParam(H, W, h, w, ti, bi, li, ri, v, u);
if (li >= 0 && ri >= 0 && ti >= 0 && bi >= 0) {
const float db = dyi * v;
const float dt = dyi * (1.f - v);
atomicAdd(offset_dx + ti * W + li, (1.f - u) * dt);
atomicAdd(offset_dx + ti * W + ri, u * dt);
atomicAdd(offset_dx + bi * W + li, (1.f - u) * db);
atomicAdd(offset_dx + bi * W + ri, u * db);
}
      } // End j
    } // End i
}
}
#undef LOAD
#undef LOADF
#undef LDG
} // namespace
/* ------------------- Launcher Separator ------------------- */
template <>
void RoiAlign<float16, CUDAContext>(
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const int num_rois,
const float spatial_scale,
const int sampling_ratio,
const float16* x,
const float* rois,
float16* y,
CUDAContext* ctx) {
auto nthreads = num_rois * C * out_h * out_w;
_RoiAlign<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
nthreads,
C,
H,
W,
out_h,
out_w,
spatial_scale,
sampling_ratio,
reinterpret_cast<const half*>(x),
rois,
reinterpret_cast<half*>(y));
}
template <>
void RoiAlignGrad<float16, CUDAContext>(
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const int num_rois,
const float spatial_scale,
const int sampling_ratio,
const float16* dy,
const float* rois,
float* dx,
CUDAContext* ctx) {
auto nthreads = num_rois * C * out_h * out_w;
_RoiAlignGrad<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
nthreads,
C,
H,
W,
out_h,
out_w,
spatial_scale,
sampling_ratio,
reinterpret_cast<const half*>(dy),
rois,
dx);
}
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void RoiAlign<T, CUDAContext>( \
const int C, \
const int H, \
const int W, \
const int out_h, \
const int out_w, \
const int num_rois, \
const float spatial_scale, \
const int sampling_ratio, \
const T* x, \
const float* rois, \
T* y, \
CUDAContext* ctx) { \
auto nthreads = num_rois * C * out_h * out_w; \
_RoiAlign<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
nthreads, \
C, \
H, \
W, \
out_h, \
out_w, \
spatial_scale, \
sampling_ratio, \
x, \
rois, \
y); \
#define DEFINE_KERNEL_LAUNCHER(T, ScalarT) \
template <> \
void RoiAlign<T, CUDAContext>( \
const int C, \
const int H, \
const int W, \
const int out_h, \
const int out_w, \
const int num_rois, \
const float spatial_scale, \
const int sampling_ratio, \
const T* x, \
const float* rois, \
T* y, \
CUDAContext* ctx) { \
auto nthreads = num_rois * C * out_h * out_w; \
_RoiAlign<ScalarT, float> \
<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
nthreads, \
C, \
H, \
W, \
out_h, \
out_w, \
spatial_scale, \
sampling_ratio, \
reinterpret_cast<const ScalarT*>(x), \
rois, \
reinterpret_cast<ScalarT*>(y)); \
}
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
#define DEFINE_GRAD_KERNEL_LAUNCHER(T, ScalarT) \
template <> \
void RoiAlignGrad<T, CUDAContext>( \
const int C, \
......@@ -507,17 +284,17 @@ void RoiAlignGrad<float16, CUDAContext>(
out_w, \
spatial_scale, \
sampling_ratio, \
dy, \
reinterpret_cast<const ScalarT*>(dy), \
rois, \
dx); \
}
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
DEFINE_KERNEL_LAUNCHER(float16, half);
DEFINE_KERNEL_LAUNCHER(float, float);
DEFINE_KERNEL_LAUNCHER(double, double);
DEFINE_GRAD_KERNEL_LAUNCHER(float16, half);
DEFINE_GRAD_KERNEL_LAUNCHER(float, float);
DEFINE_GRAD_KERNEL_LAUNCHER(double, double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -149,11 +149,9 @@ void RoiPool<float16, CPUContext>(
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float16);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -9,6 +9,12 @@ namespace kernel {
namespace {
#if __CUDA_ARCH__ >= 350
#define LDG(x, i) __ldg(x + i)
#else
#define LDG(x, i) x[i]
#endif
template <typename T>
__global__ void _RoiPool(
const int nthreads,
......@@ -22,6 +28,7 @@ __global__ void _RoiPool(
const float* rois,
int* mask,
T* y) {
auto Greater = math::GreaterFunctor<T>();
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int ow = yi % out_w;
const int oh = (yi / out_w) % out_h;
......@@ -32,7 +39,7 @@ __global__ void _RoiPool(
const int batch_ind = roi[0];
if (batch_ind < 0) {
y[yi] = T(0);
y[yi] = convert::To<T>(0.f);
mask[yi] = -1;
continue;
}
......@@ -60,119 +67,22 @@ __global__ void _RoiPool(
int max_idx = empty ? -1 : 0;
const T* offset_x = x + (batch_ind * C + c) * H * W;
T val = empty ? T(0) : offset_x[0];
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
const int xi = h * W + w;
#if __CUDA_ARCH__ >= 350
if (__ldg(offset_x + xi) > val) {
val = __ldg(offset_x + xi);
max_idx = xi;
}
#else
if (offset_x[xi] > val) {
val = offset_x[xi];
max_idx = xi;
}
#endif
}
}
y[yi] = val;
mask[yi] = max_idx;
}
}
template <>
__global__ void _RoiPool<half>(
const int nthreads,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const float spatial_scale,
const half* x,
const float* rois,
int* mask,
half* y) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int ow = yi % out_w;
const int oh = (yi / out_w) % out_h;
const int c = (yi / out_w / out_h) % C;
const int n = yi / out_w / out_h / C;
const float* roi = rois + n * 5;
const int batch_ind = roi[0];
if (batch_ind < 0) {
y[yi] = __float2half(0.f);
mask[yi] = -1;
continue;
}
const int roi_start_w = round(roi[1] * spatial_scale);
const int roi_start_h = round(roi[2] * spatial_scale);
const int roi_end_w = round(roi[3] * spatial_scale);
const int roi_end_h = round(roi[4] * spatial_scale);
const int roi_w = max(roi_end_w - roi_start_w + 1, 1);
const int roi_h = max(roi_end_h - roi_start_h + 1, 1);
const float bin_h = (float)roi_h / (float)out_h;
const float bin_w = (float)roi_w / (float)out_w;
int hstart = floor(bin_h * oh);
int wstart = floor(bin_w * ow);
int hend = ceil(bin_h * (oh + 1));
int wend = ceil(bin_w * (ow + 1));
hstart = min(max(hstart + roi_start_h, 0), H);
hend = min(max(hend + roi_start_h, 0), H);
wstart = min(max(wstart + roi_start_w, 0), W);
wend = min(max(wend + roi_start_w, 0), W);
const bool empty = (hend <= hstart) || (wend <= wstart);
int max_idx = empty ? -1 : 0;
const half* offset_x = x + ((batch_ind * C + c) * H * W);
#if __CUDA_ARCH__ >= 530
half val = empty ? __float2half(0.f) : __ldg(offset_x);
#else
float val = empty ? 0.f : __half2float(*offset_x);
#endif
T val = empty ? convert::To<T>(0.f) : offset_x[0];
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
const int xi = h * W + w;
#if __CUDA_ARCH__ >= 530
if (__hgt(__ldg(offset_x + xi), val)) {
val = __ldg(offset_x + xi);
max_idx = xi;
}
#elif __CUDA_ARCH__ >= 350
if (__half2float(__ldg(offset_x + xi)) > val) {
val = __half2float(__ldg(offset_x + xi));
if (Greater(LDG(offset_x, xi), val)) {
val = LDG(offset_x, xi);
max_idx = xi;
}
#else
if (__half2float(offset_x[xi]) > val) {
val = __half2float(offset_x[xi]);
max_idx = xi;
}
#endif
}
}
#if __CUDA_ARCH__ >= 530
y[yi] = val;
#else
y[yi] = __float2half(val);
#endif
mask[yi] = max_idx;
}
}
template <typename T>
template <typename T, typename AccT>
__global__ void _RoiPoolGrad(
const int nthreads,
const int C,
......@@ -184,7 +94,7 @@ __global__ void _RoiPoolGrad(
const T* dy,
const float* rois,
const int* mask,
float* dx) {
AccT* dx) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int c = (yi / out_w / out_h) % C;
const int n = yi / out_w / out_h / C;
......@@ -193,116 +103,20 @@ __global__ void _RoiPoolGrad(
const int batch_ind = roi[0];
if (batch_ind < 0) continue;
float* offset_dx = dx + (batch_ind * C + c) * H * W;
#if __CUDA_ARCH__ >= 350
if (__ldg(mask + yi) != -1) {
atomicAdd(offset_dx + __ldg(mask + yi), (float)dy[yi]);
AccT* offset_dx = dx + (batch_ind * C + c) * H * W;
if (LDG(mask, yi) != -1) {
atomicAdd(offset_dx + LDG(mask, yi), convert::To<AccT>(dy[yi]));
}
#else
if (mask[yi] != -1) {
atomicAdd(offset_dx + mask[yi], (float)dy[yi]);
}
#endif
}
}
template <>
__global__ void _RoiPoolGrad<half>(
const int nthreads,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const float spatial_scale,
const half* dy,
const float* rois,
const int* mask,
float* dx) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int c = (yi / out_w / out_h) % C;
const int n = yi / out_w / out_h / C;
const float* roi = rois + n * 5;
const int batch_ind = roi[0];
if (batch_ind < 0) continue;
float* offset_dx = dx + (batch_ind * C + c) * H * W;
#if __CUDA_ARCH__ >= 350
if (__ldg(mask + yi) != -1) {
atomicAdd(offset_dx + __ldg(mask + yi), __half2float(dy[yi]));
}
#else
if (mask[yi] != -1) {
atomicAdd(offset_dx + mask[yi], __half2float(dy[yi]));
}
#endif
}
}
#undef LDG
} // namespace
/* ------------------- Launcher Separator ------------------- */
template <>
void RoiPool<float16, CUDAContext>(
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const int num_rois,
const float spatial_scale,
const float16* x,
const float* rois,
int* mask,
float16* y,
CUDAContext* ctx) {
auto nthreads = num_rois * C * out_h * out_w;
_RoiPool<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
nthreads,
C,
H,
W,
out_h,
out_w,
spatial_scale,
reinterpret_cast<const half*>(x),
rois,
mask,
reinterpret_cast<half*>(y));
}
template <>
void RoiPoolGrad<float16, CUDAContext>(
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const int num_rois,
const float spatial_scale,
const float16* dy,
const float* rois,
const int* mask,
float* dx,
CUDAContext* ctx) {
auto nthreads = num_rois * C * out_h * out_w;
_RoiPoolGrad<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
nthreads,
C,
H,
W,
out_h,
out_w,
spatial_scale,
reinterpret_cast<const half*>(dy),
rois,
mask,
dx);
} // RoiPoolGrad
#define DEFINE_KERNEL_LAUNCHER(T) \
#define DEFINE_KERNEL_LAUNCHER(T, ScalarT) \
template <> \
void RoiPool<T, CUDAContext>( \
const int C, \
......@@ -319,39 +133,59 @@ void RoiPoolGrad<float16, CUDAContext>(
CUDAContext* ctx) { \
auto nthreads = num_rois * C * out_h * out_w; \
_RoiPool<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
nthreads, C, H, W, out_h, out_w, spatial_scale, x, rois, mask, y); \
nthreads, \
C, \
H, \
W, \
out_h, \
out_w, \
spatial_scale, \
reinterpret_cast<const ScalarT*>(x), \
rois, \
mask, \
reinterpret_cast<ScalarT*>(y)); \
}
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void RoiPoolGrad<T, CUDAContext>( \
const int C, \
const int H, \
const int W, \
const int out_h, \
const int out_w, \
const int num_rois, \
const float spatial_scale, \
const T* dy, \
const float* rois, \
const int* mask, \
float* dx, \
CUDAContext* ctx) { \
auto nthreads = num_rois * C * out_h * out_w; \
_RoiPoolGrad<<< \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>( \
nthreads, C, H, W, out_h, out_w, spatial_scale, dy, rois, mask, dx); \
#define DEFINE_GRAD_KERNEL_LAUNCHER(T, ScalarT) \
template <> \
void RoiPoolGrad<T, CUDAContext>( \
const int C, \
const int H, \
const int W, \
const int out_h, \
const int out_w, \
const int num_rois, \
const float spatial_scale, \
const T* dy, \
const float* rois, \
const int* mask, \
float* dx, \
CUDAContext* ctx) { \
auto nthreads = num_rois * C * out_h * out_w; \
_RoiPoolGrad<<< \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>( \
nthreads, \
C, \
H, \
W, \
out_h, \
out_w, \
spatial_scale, \
reinterpret_cast<const ScalarT*>(dy), \
rois, \
mask, \
dx); \
}
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
DEFINE_KERNEL_LAUNCHER(float16, half);
DEFINE_KERNEL_LAUNCHER(float, float);
DEFINE_KERNEL_LAUNCHER(double, double);
DEFINE_GRAD_KERNEL_LAUNCHER(float16, half);
DEFINE_GRAD_KERNEL_LAUNCHER(float, float);
DEFINE_GRAD_KERNEL_LAUNCHER(double, double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
#include "dragon/operators/array/reshape_ops.h"
#include "dragon/utils/math_functions.h"
namespace dragon {
template <class Context>
void IdentityOp<Context>::RunOnDevice() {
auto &X = Input(0), *Y = Output(0, {0});
// Store for the gradient calculation
STORE_INPUT_SPEC(0);
// Maybe copy the contents
Y->ReshapeLike(X)->CopyFrom(X, ctx());
}
DEPLOY_CPU_OPERATOR(Identity);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(Identity);
#endif
OPERATOR_SCHEMA(Identity)
/* X */
.NumInputs(1)
/* Y */
.NumOutputs(1)
/* X => Y */
.AllowInplace({{0, 0}});
OPERATOR_SCHEMA(IdentityGradient)
/* dY */
.NumInputs(1)
/* dX */
.NumOutputs(1)
/* dY => dX */
.AllowInplace({{0, 0}});
REGISTER_GRADIENT(Identity, SimpleGradientMaker);
} // namespace dragon
......@@ -31,6 +31,15 @@ class ReshapeGradientOpBase : public Operator<Context> {
};
template <class Context>
class IdentityOp final : public Operator<Context> {
public:
SIMPLE_CTOR_DTOR(IdentityOp);
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
};
template <class Context>
class ReshapeOp final : public Operator<Context> {
public:
ReshapeOp(const OperatorDef& def, Workspace* ws)
......@@ -94,6 +103,7 @@ class SqueezeOp final : public Operator<Context> {
: ReshapeGradientOpBase<Context>(def, ws) {} \
};
DEFINE_GRADIENT_OP(Identity);
DEFINE_GRADIENT_OP(Reshape);
DEFINE_GRADIENT_OP(Flatten);
DEFINE_GRADIENT_OP(ExpandDims);
......
......@@ -8,18 +8,17 @@ namespace dragon {
template <class Context>
template <typename T>
void AssignOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0);
int num_starts, num_sizes, num_dims = Y->ndim();
vec64_t X_dims(num_dims), X_starts(num_dims);
auto &Y_ref = Input(0), &X = Input(1), *Y = Output(0, {0});
// Determine the interval of each dimension
int num_starts, num_sizes, num_dims = Y_ref.ndim();
vec64_t X_dims(num_dims), X_starts(num_dims);
starts(0, &num_starts);
sizes(0, &num_sizes);
for (int i = 0; i < num_dims; i++) {
auto dim_start = i < num_starts ? starts(i) : 0;
auto dim_end = Y->dim(i);
auto dim_end = Y_ref.dim(i);
if (i < num_sizes) {
auto dim_length = sizes(i);
if (dim_length > 0) {
......@@ -28,19 +27,18 @@ void AssignOp<Context>::DoRunWithType() {
dim_end = dim_start + 1;
}
}
CHECK(dim_start >= 0 && dim_start < Y->dim(i))
CHECK(dim_start >= 0 && dim_start < Y_ref.dim(i))
<< "\nAssigning starts from " << dim_start << " of axis " << i << ", "
<< "while the dimension of this axis is " << Y->dim(i) << ".";
CHECK(dim_end > 0 && dim_end <= Y->dim(i))
<< "while the dimension of this axis is " << Y_ref.dim(i) << ".";
CHECK(dim_end > 0 && dim_end <= Y_ref.dim(i))
<< "\nAssigning ends at " << dim_end << " of axis " << i << ", "
<< "while the dimension of this axis is " << Y->dim(i) << ".";
<< "while the dimension of this axis is " << Y_ref.dim(i) << ".";
X_starts[i] = dim_start;
X_dims[i] = dim_end - dim_start;
}
Tensor X_broadcast(X_dims);
auto* x = X.template data<T, Context>();
Tensor XRef(X_dims);
auto* new_data = X.template data<T, Context>();
if (X.dims() != X_dims) {
vec64_t dims1, dims2;
if (math::utils::IsBinaryBroadcast(X.dims(), X_dims, dims1)) {
......@@ -49,17 +47,17 @@ void AssignOp<Context>::DoRunWithType() {
<< Tensor::DimString(X_dims);
math::utils::ComputeBinaryBroadcastDims(X.dims(), X_dims, dims1, dims2);
if (dims1 != dims2) {
auto* scratch = ctx()->workspace()->template data<T, Context>(
{X_broadcast.count()})[0];
auto* scratch =
ctx()->workspace()->template data<T, Context>({XRef.count()})[0];
math::Set(
X.ndim(),
X.dims().data(),
X_broadcast.ndim(),
X_broadcast.dims().data(),
x,
XRef.ndim(),
XRef.dims().data(),
new_data,
scratch,
ctx());
x = scratch;
new_data = scratch;
}
} else {
LOG(FATAL) << "Could not broadcast together with shapes " << X.DimString()
......@@ -67,12 +65,16 @@ void AssignOp<Context>::DoRunWithType() {
}
}
// Copy the reference data
Y->ReshapeLike(Y_ref)->CopyFrom(Y_ref, ctx());
// Update with the new data
kernel::Assign(
num_dims,
X_dims.data(),
Y->strides().data(),
X_starts.data(),
x,
new_data,
Y->template mutable_data<T, Context>(),
ctx());
}
......@@ -88,10 +90,12 @@ DEPLOY_CUDA_OPERATOR(Assign);
#endif
OPERATOR_SCHEMA(Assign)
/* V */
.NumInputs(1)
/* X */
.NumOutputs(1);
/* Y_ref, X */
.NumInputs(2)
/* Y */
.NumOutputs(1)
/* Y_ref => Y */
.AllowInplace({{0, 0}});
NO_GRADIENT(Assign);
......
#include "dragon/operators/control_flow/copy_op.h"
#include "dragon/utils/math_functions.h"
namespace dragon {
template <class Context>
template <typename T>
void CopyOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0);
math::Copy(
X.count(),
X.template data<T, Context>(),
Y->ReshapeLike(X)->template mutable_data<T, Context>(),
ctx());
}
template <class Context>
void CopyOp<Context>::RunOnDevice() {
DispatchHelper<FullTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU_OPERATOR(Copy);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(Copy);
#endif
OPERATOR_SCHEMA(Copy)
/* X */
.NumInputs(1)
/* Y */
.NumOutputs(1);
NO_GRADIENT(Copy);
} // namespace dragon
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_CONTROL_FLOW_COPY_OP_H_
#define DRAGON_OPERATORS_CONTROL_FLOW_COPY_OP_H_
#include "dragon/core/operator.h"
namespace dragon {
template <class Context>
class CopyOp final : public Operator<Context> {
public:
SIMPLE_CTOR_DTOR(CopyOp);
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T>
void DoRunWithType();
};
} // namespace dragon
#endif // DRAGON_OPERATORS_CONTROL_FLOW_COPY_OP_H_
......@@ -8,24 +8,28 @@ namespace dragon {
template <class Context>
template <typename T>
void MaskedAssignOp<Context>::DoRunWithType() {
auto &X = Input(0), &X_mask = Input(1), *Y = Output(0);
auto &X = Input(1), &X_mask = Input(2);
auto &Y_ref = Input(0), *Y = Output(0, {0});
CHECK(X_mask.template IsType<bool>() || X_mask.template IsType<uint8_t>())
<< "\nExcepted bool or uint8 mask.";
vec64_t X_dims, Y_dims;
if (math::utils::IsBinaryBroadcast(X.dims(), X_mask.dims(), X_dims) &&
math::utils::IsBinaryBroadcast(X_dims, Y->dims(), Y_dims) &&
Y_dims == Y->dims()) {
math::utils::IsBinaryBroadcast(X_dims, Y_ref.dims(), Y_dims) &&
Y_dims == Y_ref.dims()) {
// Copy the reference data
Y->ReshapeLike(Y_ref)->CopyFrom(Y_ref, ctx());
// Update with the new data
math::Where(
X.ndim(),
X.dims().data(),
Y->ndim(),
Y->dims().data(),
Y_ref.ndim(),
Y_ref.dims().data(),
X_mask.ndim(),
X_mask.dims().data(),
X.template data<T, Context>(),
Y->template data<T, Context>(),
Y_ref.template data<T, Context>(),
(const bool*)X_mask.template raw_data<Context>(),
Y->template mutable_data<T, Context>(),
ctx());
......@@ -46,10 +50,12 @@ DEPLOY_CUDA_OPERATOR(MaskedAssign);
#endif
OPERATOR_SCHEMA(MaskedAssign)
/* X, M */
.NumInputs(2)
/* Y_ref, X, X_mask */
.NumInputs(3)
/* Y */
.NumOutputs(1);
.NumOutputs(1)
/* Y_ref => Y */
.AllowInplace({{0, 0}});
NO_GRADIENT(MaskedAssign);
......
......@@ -65,6 +65,7 @@ from dragon.core.ops.array_ops import channel_shuffle
from dragon.core.ops.array_ops import concat
from dragon.core.ops.array_ops import expand_dims
from dragon.core.ops.array_ops import flatten
from dragon.core.ops.array_ops import identity
from dragon.core.ops.array_ops import index_select
from dragon.core.ops.array_ops import linspace
from dragon.core.ops.array_ops import masked_select
......@@ -85,7 +86,6 @@ from dragon.core.ops.array_ops import transpose
from dragon.core.ops.array_ops import unique
from dragon.core.ops.array_ops import where
from dragon.core.ops.control_flow_ops import assign
from dragon.core.ops.control_flow_ops import copy
from dragon.core.ops.control_flow_ops import masked_assign
from dragon.core.ops.framework_ops import python_plugin
from dragon.core.ops.framework_ops import stop_gradient
......
......@@ -53,6 +53,7 @@ class OpDef(object):
outputs=None,
num_outputs=1,
extra_inputs=None,
extra_outputs=None,
name=None,
**kwargs
):
......@@ -64,6 +65,7 @@ class OpDef(object):
for input in inputs:
op_info.merge_from(input)
# Collect defs from extra inputs.
if extra_inputs is not None:
extra_inputs = nest.flatten(extra_inputs)
for input in extra_inputs:
......
......@@ -59,12 +59,6 @@ def arg_reduce_spec(args, inputs, outputs):
return outputs
@register(['Assign', 'MaskedAssign'])
def assign_spec(args, inputs, outputs):
_ = locals()
return outputs
def binary_shape_spec(inputs, outputs):
if inputs[0].shape is None or inputs[1].shape is None:
return outputs
......
......@@ -588,6 +588,45 @@ def flatten(inputs, axis=0, num_axes=-1, keep_axes=None, **kwargs):
@OpSchema.num_inputs(1)
def identity(inputs, **kwargs):
"""Return a tensor copied from the input.
    Examples:
```python
# Copy ``x`` to ``y``
x = dragon.zeros(shape=(2, 3))
y = dragon.identity(x)
# ``x`` != ``y``
x += 1
print(x)
print(y)
```
Parameters
----------
inputs : dragon.Tensor
The input tensor.
Returns
-------
dragon.Tensor
The output tensor.
"""
args = parse_args(locals())
inplace = args.pop('inplace') if 'inplace' in args else False
op_lib = array_ops_lib.Identity
if context.executing_eagerly():
return op_lib.instantiate().apply([inputs], inplace=inplace)
else:
return op_lib.blend(**args)
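A brief eager-mode sketch of the wrapper above; the ``inplace`` keyword is inferred from the ``args.pop('inplace')`` call and is an assumption, not a documented parameter:
import dragon
x = dragon.zeros(shape=(2, 3))
# Allocates a fresh output buffer; inplace=True (assumed) would reuse ``x``'s buffer in eager mode.
y = dragon.identity(x, inplace=False)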
@OpSchema.num_inputs(1)
def index_select(inputs, index, axis=0, **kwargs):
"""Select the elements according to the index along the given axis.
......
......@@ -259,6 +259,20 @@ class Flatten(Operator):
return self.dispatch(inputs, outputs)
class Identity(Operator):
"""Identity operator."""
def __init__(self, key, dev, **kwargs):
super(Identity, self).__init__(key, dev, **kwargs)
def attributes(self):
return {'op_type': 'Identity', 'arguments': {}}
def forward(self, inputs, inplace=False):
outputs = [self.alloc(inputs[0]) if inplace else self.alloc()]
return self.dispatch(inputs, outputs)
class IndexSelect(Operator):
"""IndexSelect operator."""
......
......@@ -20,7 +20,6 @@ from dragon.core.ops import control_flow_ops_lib
from dragon.core.ops.utils import ArgHelper
from dragon.core.ops.utils import OpSchema
from dragon.core.ops.utils import parse_args
from dragon.core.util import nest
@OpSchema.num_inputs(2)
......@@ -43,66 +42,22 @@ def assign(inputs, starts=None, sizes=None, **kwargs):
Returns
-------
dragon.Tensor
The input tensor.
The output tensor.
"""
args = parse_args(locals())
inplace = args.pop('inplace') if 'inplace' in args else False
inputs[1] = ops.scalar_to_tensor(inputs[1], inputs[0].dtype)
op_lib = control_flow_ops_lib.Assign
if context.executing_eagerly():
return op_lib \
.instantiate(
ndim=len(starts) if starts is not None else 0,
).apply(inputs, starts, sizes)
).apply(inputs, starts, sizes, inplace=inplace)
else:
args['outputs'] = [args['inputs'][0]]
args['inputs'] = [args['inputs'][1]]
return op_lib.blend(**args)
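A minimal usage sketch of the two-input signature above, with illustrative values; scalar values pass through ``scalar_to_tensor`` as shown in the wrapper:
import dragon
x = dragon.zeros(shape=(4, 4))
# Returns a tensor equal to ``x`` with a 2x2 block of ones written at offset (1, 1);
# in eager mode an inplace=True keyword (assumed) reuses ``x`` as the output.
y = dragon.assign([x, 1], starts=[1, 1], sizes=[2, 2])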
@OpSchema.num_inputs(1, 2)
def copy(inputs, **kwargs):
"""Copy the input.
Examples:
```python
# Copy ``x`` to ``y``
x = dragon.ones(shape=(2, 3))
y = dragon.zeros(shape=(2, 4))
dragon.copy([x, y])
# Copy to a new tensor from ``x``
y = dragon.copy(x)
```
Parameters
----------
inputs : Union[dragon.Tensor, Sequence[dragon.Tensor]]
The input tensor.
Returns
-------
dragon.Tensor
The output tensor.
"""
args = parse_args(locals())
args['inputs'] = nest.flatten(inputs)
if len(args['inputs']) == 2:
args['outputs'] = [args['inputs'][1]]
args['inputs'] = [args['inputs'][0]]
else:
args['outputs'] = None
op_lib = control_flow_ops_lib.Copy
if context.executing_eagerly():
return op_lib \
.instantiate() \
.apply(args['inputs'], args['outputs'])
else:
return op_lib.blend('Copy', **args)
@OpSchema.num_inputs(3)
def masked_assign(inputs, **kwargs):
r"""Assign the value to input where mask is 1.
......@@ -126,13 +81,10 @@ def masked_assign(inputs, **kwargs):
"""
args = parse_args(locals())
inplace = args.pop('inplace') if 'inplace' in args else False
inputs[1] = ops.scalar_to_tensor(inputs[1], inputs[0].dtype)
op_lib = control_flow_ops_lib.MaskedAssign
if context.executing_eagerly():
return op_lib.instantiate().apply(inputs)
return op_lib.instantiate().apply(inputs, inplace=inplace)
else:
args.update({
'outputs': [args['inputs'][0]],
'inputs': [args['inputs'][1:]],
})
return op_lib.blend(**args)
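A minimal usage sketch matching the three-input order (reference, value, mask) above; it mirrors the updated ``test_masked_assign`` case later in this diff:
import dragon
x = dragon.ones(shape=(2, 3))
# Returns a tensor with the masked elements set to 0; ``x`` itself is unchanged
# unless inplace=True (assumed keyword) is passed in eager mode.
y = dragon.masked_assign([x, 0, x > 0])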
......@@ -46,29 +46,16 @@ class Assign(Operator):
ws, '{}/sizes[{}]'.format(handle, i),
sizes[i], 'int64')
def forward(self, inputs, starts, sizes):
def forward(self, inputs, starts, sizes, inplace=False):
outputs = [self.alloc(inputs[0]) if inplace else self.alloc()]
return self.dispatch(
[inputs[1]], [inputs[0]],
inputs, outputs,
callback=lambda ws, handle:
self.feed(ws, handle, starts, sizes),
no_grad=True,
)
class Copy(Operator):
"""Copy operator."""
def __init__(self, key, dev, **kwargs):
super(Copy, self).__init__(key, dev, **kwargs)
def attributes(self):
return {'op_type': 'Copy', 'arguments': {}}
def forward(self, inputs, outputs):
outputs = outputs if outputs else [self.alloc()]
return self.dispatch(inputs, outputs, no_grad=True)
class MaskedAssign(Operator):
"""MaskedAssign operator."""
......@@ -78,5 +65,6 @@ class MaskedAssign(Operator):
def attributes(self):
return {'op_type': 'MaskedAssign', 'arguments': {}}
def forward(self, inputs):
return self.dispatch(inputs[1:], [inputs[0]], no_grad=True)
def forward(self, inputs, inplace=False):
outputs = [self.alloc(inputs[0]) if inplace else self.alloc()]
return self.dispatch(inputs, outputs, no_grad=True)
......@@ -104,11 +104,10 @@ def copy(self):
See Also
--------
`dragon.copy(...)`_
`dragon.identity(...)`_
"""
return control_flow_ops_lib.Copy \
.instantiate().apply([self], None)
return array_ops_lib.Identity.instantiate().apply([self])
def div(self, other):
......@@ -194,7 +193,7 @@ def getitem(self, item):
if axis is not None:
return _index_select(self, item[axis], axis)
starts, sizes = _process_index(item)
return _section_select(self, starts, sizes)
return _sliced_select(self, starts, sizes)
def glorot_normal(self, mode='fan_in', scale=2.0):
......@@ -599,7 +598,7 @@ def setitem(self, key, value):
_masked_assign(self, value, key)
else:
starts, sizes = _process_index(key)
_section_assign(self, value, starts, sizes)
_sliced_assign(self, value, starts, sizes)
def sub(self, other):
......@@ -705,7 +704,7 @@ def _masked_assign(ref, value, mask):
"""Assign value according to the mask."""
value = ops.scalar_to_tensor(value, ref.dtype)
return control_flow_ops_lib.MaskedAssign \
.instantiate().apply([ref, value, mask])
.instantiate().apply([ref, value, mask], inplace=True)
def _masked_select(x, mask):
......@@ -747,16 +746,16 @@ def _process_index(item):
return starts, sizes
def _section_assign(ref, value, starts, sizes):
"""Apply the section-assign operation."""
def _sliced_assign(ref, value, starts, sizes):
"""Assign value according to the slices."""
value = ops.scalar_to_tensor(value, ref.dtype)
return control_flow_ops_lib.Assign \
.instantiate(ndim=len(starts) if starts is not None else 0) \
.apply([ref, value], starts, sizes)
.apply([ref, value], starts, sizes, inplace=True)
def _section_select(x, starts, sizes):
"""Apply the section-select operation."""
def _sliced_select(x, starts, sizes):
"""Select elements according to the slices."""
return array_ops_lib.Slice \
.instantiate(ndim=len(starts)).apply([x], starts, sizes)
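A short sketch of how the renamed helpers above are reached from the eager tensor, assuming eager execution and illustrative values:
import dragon
x = dragon.ones(shape=(4,))
x[1:3] = 0    # dispatched to _sliced_assign(..., inplace=True)
y = x[0:2]    # dispatched to _sliced_select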
......
......@@ -79,11 +79,10 @@ def copy(self):
See Also
--------
`dragon.copy(...)`_
`dragon.identity(...)`_
"""
outputs = [Tensor(shape=self.shape, dtype=self.dtype)]
return OpDef.apply('Copy', [self], [outputs])
return OpDef.apply('Identity', [self])
def div(self, other):
......@@ -169,7 +168,7 @@ def getitem(self, item):
if axis is not None:
return _index_select(self, item[axis], axis)
starts, sizes = _process_index(item)
return _section_select(self, starts, sizes)
return _sliced_select(self, starts, sizes)
def get_value(self):
......@@ -402,10 +401,13 @@ def setitem(self, key, value):
"""
if isinstance(key, Tensor):
_masked_assign(self, value, key)
raise RuntimeError(
'Assigning via mask is an ambiguous behavior in graph mode. '
'Use `dragon.masked_assign(...)` instead.')
else:
starts, sizes = _process_index(key)
_section_assign(self, value, starts, sizes)
raise RuntimeError(
'Assigning via slices is an ambiguous behavior in graph mode. '
'Use `dragon.assign(...)` instead.')
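A short annotation of the graph-mode replacements that the error messages above point to (names illustrative):
# Instead of ``x[1:3] = value`` or ``x[mask] = value`` on a symbolic tensor:
#   y = dragon.assign([x, value], starts=[1], sizes=[2])
#   y = dragon.masked_assign([x, value, mask])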
def set_value(self, value):
......@@ -458,12 +460,6 @@ def _index_select(x, index, axis):
return OpDef.apply('IndexSelect', [x, index], axis=axis, num_axes=1)
def _masked_assign(ref, value, mask):
"""Assign value according to the mask."""
value = ops.scalar_to_tensor(value, ref.dtype)
return OpDef.apply('MaskedAssign', [value, mask], [ref])
def _masked_select(x, mask):
"""Select elements according to the mask."""
return OpDef.apply('MaskedSelect', [x, mask])
......@@ -502,14 +498,8 @@ def _process_index(item):
return starts, sizes
def _section_assign(ref, value, starts, sizes):
"""Create the section-assign operator."""
value = ops.scalar_to_tensor(value, ref.dtype)
return OpDef.apply('Assign', [value], [ref], starts=starts, sizes=sizes)
def _section_select(x, starts, sizes):
"""Create the section-select operator."""
def _sliced_select(x, starts, sizes):
"""Select elements according to the slices."""
return OpDef.apply('Slice', [x], starts=starts, sizes=sizes)
......
......@@ -24,7 +24,6 @@ from dragon.core.autograph.tensor import TensorRef
from dragon.core.framework import context
from dragon.core.framework import workspace
from dragon.core.ops import array_ops
from dragon.core.ops import control_flow_ops
from dragon.core.ops import init_ops
from dragon.core.ops import vision_ops
......@@ -260,19 +259,19 @@ def gather(params, indices, axis=0, name=None):
def identity(input, name=None):
"""Return a new tensor copying the content of input.
"""Return a tensor copied from the input.
Examples:
```python
# Copy ``x`` to ``xx``
# Copy ``x`` to ``y``
x = tf.zeros(shape=(2, 3))
xx = tf.identity(x)
y = tf.identity(x)
# ``x`` != ``xx``
# ``x`` != ``y``
x += 1
print(x)
print(xx)
print(y)
```
Parameters
......@@ -288,7 +287,7 @@ def identity(input, name=None):
The output tensor.
"""
return control_flow_ops.copy(input, name=name if name else 'Identity')
return array_ops.identity(input, name=name if name else 'Identity')
def ones(shape, dtype='float32', name=None):
......
......@@ -655,6 +655,19 @@ class TestArrayOps(OpTestCase):
with dragon.device('cuda'):
self.test_flatten()
def test_identity(self):
for execution in ('EAGER_MODE', 'GRAPH_MODE'):
with execution_context().mode(execution):
data = arange((4,))
x = new_tensor(data)
y = dragon.identity(x)
self.assertEqual(y, data)
@unittest.skipIf(not TEST_CUDA, 'CUDA unavailable')
def test_identity_cuda(self):
with dragon.device('cuda'):
self.test_identity()
def test_index_select(self):
entries = [1, (1, 2)]
for execution in ('EAGER_MODE', 'GRAPH_MODE'):
......@@ -1116,30 +1129,14 @@ class TestControlFlowOps(OpTestCase):
with dragon.device('cuda'):
self.test_assign()
def test_copy(self):
for execution in ('EAGER_MODE', 'GRAPH_MODE'):
with execution_context().mode(execution):
data = arange((4,))
x = new_tensor(data)
y = dragon.zeros((4,), dtype='float32')
y = dragon.copy([x, y])
z = dragon.copy(x)
self.assertEqual(y, data)
self.assertEqual(z, data)
@unittest.skipIf(not TEST_CUDA, 'CUDA unavailable')
def test_copy_cuda(self):
with dragon.device('cuda'):
self.test_copy()
def test_masked_assign(self):
for execution in ('EAGER_MODE', 'GRAPH_MODE'):
with execution_context().mode(execution):
data = arange((2, 3))
x = new_tensor(data)
dragon.masked_assign([x, 0, x > 2])
y = dragon.masked_assign([x, 0, x > 2])
data[data > 2] = 0
self.assertEqual(x, data)
self.assertEqual(y, data)
@unittest.skipIf(not TEST_CUDA, 'CUDA unavailable')
def test_masked_assign_cuda(self):
......@@ -3021,9 +3018,12 @@ class TestTensorOps(OpTestCase):
with execution_context().mode(execution):
data = arange((2, 3))
x = new_tensor(data)
x[x > 2] = 0
data[data > 2] = 0
self.assertEqual(x, data)
try:
x[x > 2] = 0
data[data > 2] = 0
self.assertEqual(x, data)
except RuntimeError:
pass
entries = [0,
slice(None, None, None),
slice(0, None, None),
......@@ -3037,7 +3037,7 @@ class TestTensorOps(OpTestCase):
x.__setitem__(item, 0)
data.__setitem__(item, 0)
self.assertEqual(x, data)
except (NotImplementedError, ValueError, TypeError):
except (NotImplementedError, ValueError, TypeError, RuntimeError):
pass
def test_rsub(self):
......
......@@ -71,7 +71,7 @@ class Assign(function.Function):
def forward(self, out, starts, sizes, input):
self._check_device([input, out])
return self.dispatch(
[input], [out],
[out, input], [self.alloc(out)],
callback=lambda ws, handle:
self.feed(ws, handle, starts, sizes),
no_grad=True,
......@@ -306,7 +306,7 @@ class MaskedAssign(function.Function):
return {'op_type': 'MaskedAssign', 'arguments': {}}
def forward(self, out, mask, input):
return self.dispatch([input, mask], [self.alloc(out)])
return self.dispatch([out, input, mask], [self.alloc(out)])
class MaskedSelect(function.Function):
......