Commit 9de0f1a3 by Ting PAN

Fix the missing defs in blending assign operators

Summary:

This commit attaches the input and output together in assign operators,
which fixes the input defs that went missing because these operators
acted as a pure identity from input to output.
1 parent 746f2cbb
Showing with 455 additions and 1472 deletions
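
Why missing defs break things, in miniature: a graph executor keeps only the operator defs it can reach by walking backward from the requested outputs through each def's recorded inputs. The sketch below is a hypothetical model of that bookkeeping, not Dragon's actual implementation (prune_defs and the def dicts are invented for illustration); it shows how an assign operator that acts as a pure identity from input to output, without recording the input in its def, lets the producer of that input be pruned away, and how attaching input and output together restores the dependency.

# A hypothetical sketch of def pruning; every name here is made up
# for illustration and is not Dragon's real API.

def prune_defs(defs, wanted_outputs):
    """Keep only the defs reachable backward from the wanted outputs."""
    kept, wanted = [], set(wanted_outputs)
    for op in reversed(defs):
        if any(y in wanted for y in op["outputs"]):
            kept.append(op)
            wanted.update(op["inputs"])  # follow the recorded input defs
    return list(reversed(kept))

# Broken: "Assign" behaves as an identity from input to output but
# never records 'x' as an input, so the def that produces 'x' is lost.
broken = [
    {"type": "RandomNormal", "inputs": [], "outputs": ["x"]},
    {"type": "Assign", "inputs": [], "outputs": ["y"]},
]
# Fixed, conceptually what this commit does: input and output are
# attached together in the assign def, keeping 'x' visible.
fixed = [
    {"type": "RandomNormal", "inputs": [], "outputs": ["x"]},
    {"type": "Assign", "inputs": ["x"], "outputs": ["y"]},
]

print([op["type"] for op in prune_defs(broken, ["y"])])  # ['Assign']
print([op["type"] for op in prune_defs(fixed, ["y"])])   # ['RandomNormal', 'Assign']

In the broken trace, the graph that computes 'y' no longer knows how 'x' was produced, which is exactly the missing-input-defs symptom described in the summary.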
@@ -39,14 +39,14 @@ html:
 	@echo "Build finished. The HTML pages are in $(BUILDDIR)."
 latex:
-	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)-latex
+	$(SPHINXBUILD) -b latex -j ${NUMBER_OF_PROCESSORS} $(ALLSPHINXOPTS) $(BUILDDIR)-latex
 	@echo
 	@echo "Build finished; the LaTeX files are in $(BUILDDIR)-latex."
 	@echo "Run \`make' in that directory to run these through (pdf)latex" \
 	      "(use \`make latexpdf' here to do that automatically)."
 latexpdf:
-	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)-latex
+	$(SPHINXBUILD) -b latex -j ${NUMBER_OF_PROCESSORS} $(ALLSPHINXOPTS) $(BUILDDIR)-latex
 	@echo "Running LaTeX files through pdflatex..."
-	$(MAKE) -C $(BUILDDIR)/latex all-pdf
+	$(MAKE) -C $(BUILDDIR)-latex all-pdf
 	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)-latex."
@@ -39,7 +39,7 @@ extensions = ['sphinx.ext.autodoc', 'sphinxcontrib.katex', 'breathe']
 # Project
 project = 'dragon'
 copyright = 'Copyright (c) 2017-present, SeetaTech, Co.,Ltd'
-author = 'SeetaTech'
+author = 'SeetaTech, Co.,Ltd'
 with open('../../../dragon/version.txt', 'r') as f:
     version = f.read().strip()
@@ -114,6 +114,7 @@ latex_elements = {
 \fvset{breaklines=true, breakanywhere=true}
 \setlength{\headheight}{13.6pt}
 \setlength{\itemindent}{-1pt}
+\addto\captionsenglish{\renewcommand{\chaptername}{}}
 \makeatletter
 \renewcommand*\l@subsection{\@dottedtocline{2}{3.8em}{3.8em}}
 \fancypagestyle{normal}{
@@ -146,13 +147,18 @@ latex_elements = {
 \vspace*{40mm}
 \LARGE \@author
+\vspace*{40mm}
+\LARGE \today
 \end{titlepage}
 \makeatother
 \pagenumbering{arabic}
 ''',
 'pointsize': '10pt',
+'classoptions': ',oneside',
 'figure_align': 'H',
+'fncychap': '\\usepackage[Sonny]{fncychap}',
 'printindex': '',
 'sphinxsetup': ' \
 hmargin={0.75in,0.75in}, \
......
@@ -65,7 +65,7 @@ if "%1" == "doxygen" (
 )
 if "%1" == "html" (
-	%SPHINXBUILD% -b html -j %NUMBER_OF_PROCESSORS% %ALLSPHINXOPTS% %BUILDDIR%
+	%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%
 	if errorlevel 1 exit /b 1
 	echo.
 	echo.Build finished. The HTML pages are in %BUILDDIR%.
......
@@ -34,14 +34,14 @@ html:
 	@echo "Build finished. The HTML pages are in $(BUILDDIR)."
 latex:
-	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)-latex
+	$(SPHINXBUILD) -b latex -j ${NUMBER_OF_PROCESSORS} $(ALLSPHINXOPTS) $(BUILDDIR)-latex
 	@echo
 	@echo "Build finished; the LaTeX files are in $(BUILDDIR)-latex."
 	@echo "Run \`make' in that directory to run these through (pdf)latex" \
 	      "(use \`make latexpdf' here to do that automatically)."
 latexpdf:
-	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)-latex
+	$(SPHINXBUILD) -b latex -j ${NUMBER_OF_PROCESSORS} $(ALLSPHINXOPTS) $(BUILDDIR)-latex
 	@echo "Running LaTeX files through pdflatex..."
-	$(MAKE) -C $(BUILDDIR)/latex all-pdf
+	$(MAKE) -C $(BUILDDIR)-latex all-pdf
 	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)-latex."
@@ -41,7 +41,7 @@ napoleon_use_rtype = False
 # Project
 project = 'dragon'
 copyright = 'Copyright (c) 2017-present, SeetaTech, Co.,Ltd'
-author = 'SeetaTech'
+author = 'SeetaTech, Co.,Ltd'
 with open('../../../dragon/version.txt', 'r') as f:
     version = f.read().strip()
@@ -122,6 +122,7 @@ latex_elements = {
 \fvset{breaklines=true, breakanywhere=true}
 \setlength{\headheight}{13.6pt}
 \setlength{\itemindent}{-1pt}
+\addto\captionsenglish{\renewcommand{\chaptername}{}}
 \makeatletter
 \renewcommand*\l@subsection{\@dottedtocline{2}{3.8em}{3.8em}}
 \fancypagestyle{normal}{
@@ -154,13 +155,18 @@ latex_elements = {
 \vspace*{40mm}
 \LARGE \@author
+\vspace*{40mm}
+\LARGE \today
 \end{titlepage}
 \makeatother
 \pagenumbering{arabic}
 ''',
 'pointsize': '10pt',
+'classoptions': ',oneside',
 'figure_align': 'H',
+'fncychap': '\\usepackage[Sonny]{fncychap}',
 'printindex': '',
 'sphinxsetup': ' \
 hmargin={0.75in,0.75in}, \
......
@@ -48,9 +48,6 @@ dragon
 `constant(...) <dragon/constant.html>`_
 : Return a tensor initialized from the value.
 
-`copy(...) <dragon/copy.html>`_
-: Copy the input.
-
 `create_function(...) <dragon/create_function.html>`_
 : Create a callable graph from the specified outputs.
@@ -93,6 +90,9 @@ dragon
 `graph_mode(...) <dragon/graph_mode.html>`_
 : Context-manager set the graph execution mode.
 
+`identity(...) <dragon/identity.html>`_
+: Return a tensor copied from the input.
+
 `index_select(...) <dragon/index_select.html>`_
 : Select the elements according to the index along the given axis.
@@ -199,7 +199,6 @@ dragon
   dragon/channel_shuffle
   dragon/concat
   dragon/constant
-  dragon/copy
   dragon/create_function
   dragon/device
   dragon/eager_mode
@@ -214,6 +213,7 @@ dragon
   dragon/get_workspace
   dragon/gradients
   dragon/graph_mode
+  dragon/identity
   dragon/index_select
   dragon/linspace
   dragon/load_library
......
@@ -174,8 +174,8 @@ __truediv__
 .. _dragon.assign(...): assign.html
 .. _dragon.cast(...): cast.html
-.. _dragon.copy(...): copy.html
 .. _dragon.fill(...): fill.html
+.. _dragon.identity(...): identity.html
 .. _dragon.masked_assign(...): masked_assign.html
 .. _dragon.masked_select(...): masked_select.html
 .. _dragon.math.add(...): math/add.html
......
@@ -154,8 +154,8 @@ __truediv__
 .. _dragon.assign(...): assign.html
 .. _dragon.cast(...): cast.html
-.. _dragon.copy(...): copy.html
 .. _dragon.fill(...): fill.html
+.. _dragon.identity(...): identity.html
 .. _dragon.masked_assign(...): masked_assign.html
 .. _dragon.masked_select(...): masked_select.html
 .. _dragon.math.add(...): math/add.html
......
-copy
-====
+identity
+========
 
-.. autofunction:: dragon.copy
+.. autofunction:: dragon.identity
 
 .. raw:: html
......
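
The user-visible API change in the documentation above is the rename of dragon.copy to dragon.identity. A minimal usage sketch, assuming the one-argument call implied by the autofunction entry (the real signature may accept extra keyword arguments):

import dragon

x = dragon.constant([1, 2, 3])  # "Return a tensor initialized from the value."
y = dragon.identity(x)          # "Return a tensor copied from the input."
                                # Previously spelled: y = dragon.copy(x)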
@@ -55,7 +55,7 @@ if errorlevel 9009 (
 :sphinx_ok
 if "%1" == "html" (
-	%SPHINXBUILD% -b html -j %NUMBER_OF_PROCESSORS% %ALLSPHINXOPTS% %BUILDDIR%
+	%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%
 	if errorlevel 1 exit /b 1
 	echo.
 	echo.Build finished. The HTML pages are in %BUILDDIR%.
......
@@ -65,7 +65,7 @@ Name Supported Reference
 `Greater`_                |v|  :func:`dragon.math.greater`
 `HardSigmoid`_            |v|  :func:`dragon.nn.hardsigmoid`
 `Hardmax`_
-`Identity`_
+`Identity`_               |v|  :func:`dragon.identity`
 `If`_
 `InstanceNormalization`_  |v|  :func:`dragon.nn.instance_norm`
 `IsInf`_                  |v|  :func:`dragon.math.is_inf`
......
@@ -58,7 +58,7 @@ vm.tensorflow
 : Compute the symbolic derivatives of ``ys`` w.r.t. ``xs`` .
 
 `identity(...) <tensorflow/identity.html>`_
-: Return a new tensor copying the content of input.
+: Return a tensor copied from the input.
 
 `linspace(...) <tensorflow/linspace.html>`_
 : Generate evenly spaced values within intervals along the given axis.
......
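
The TensorFlow-flavored frontend documents the same operator. A small sketch, assuming the virtual module is importable as dragon.vm.tensorflow (the import path is an assumption; only the identity(...) doc entry is confirmed by this diff):

import dragon
from dragon.vm import tensorflow as tf  # assumed import path

x = dragon.constant([1.0, 2.0, 3.0])
y = tf.identity(x)  # returns a tensor copied from the input,
                    # equivalent to dragon.identity(x)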
@@ -68,11 +68,9 @@ void _EluGrad<float16>(
 DEFINE_KERNEL_LAUNCHER(float16);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float16);
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -170,10 +170,8 @@ void EluGrad<float16, CUDAContext>(
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -83,11 +83,9 @@ void _HardSigmoidGrad<float16>(
 DEFINE_KERNEL_LAUNCHER(float16);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float16);
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -126,10 +126,8 @@ void HardSigmoidGrad<float16, CUDAContext>(
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -88,11 +88,9 @@ void _HardSwishGrad<float16>(
 DEFINE_KERNEL_LAUNCHER(float16);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float16);
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -153,10 +153,8 @@ void HardSwishGrad<float16, CUDAContext>(
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -243,10 +243,8 @@ void PReluWGrad<float16, CPUContext>(
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -601,10 +601,8 @@ void PReluWGrad<float16, CUDAContext>(
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -124,11 +124,9 @@ void _ReluNGrad<float16>(
 DEFINE_KERNEL_LAUNCHER(float16);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float16);
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -401,10 +401,8 @@ void ReluNGrad<float16, CUDAContext>(
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -85,11 +85,9 @@ void _SeluGrad<float16>(
 DEFINE_KERNEL_LAUNCHER(float16);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float16);
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -191,10 +191,8 @@ void SeluGrad<float16, CUDAContext>(
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -56,11 +56,9 @@ void _SigmoidGrad<float16>(
 DEFINE_KERNEL_LAUNCHER(float16);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float16);
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -135,10 +135,8 @@ void SigmoidGrad<float16, CUDAContext>(
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -122,11 +122,9 @@ void _SoftmaxGrad<float16>(
 DEFINE_KERNEL_LAUNCHER(float16);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float16);
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -257,10 +257,8 @@ void SoftmaxGrad<float16, CUDAContext>(
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -60,11 +60,9 @@ void _SwishGrad<float16>(
 DEFINE_KERNEL_LAUNCHER(float16);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float16);
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -121,10 +121,8 @@ void SwishGrad<float16, CUDAContext>(
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -55,11 +55,9 @@ void _TanhGrad<float16>(
 DEFINE_KERNEL_LAUNCHER(float16);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float16);
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -124,10 +124,8 @@ void TanhGrad<float16, CUDAContext>(
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -93,7 +93,6 @@ DEFINE_KERNEL_LAUNCHER(int);
 DEFINE_KERNEL_LAUNCHER(int64_t);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -206,7 +206,6 @@ DEFINE_KERNEL_LAUNCHER(int);
 DEFINE_KERNEL_LAUNCHER(int64_t);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -105,14 +105,12 @@ DEFINE_KERNEL_LAUNCHER(float, float);
 DEFINE_KERNEL_LAUNCHER(float, double);
 DEFINE_KERNEL_LAUNCHER(double, float);
 DEFINE_KERNEL_LAUNCHER(double, double);
-
 DEFINE_FP16_KERNEL_LAUNCHER(int8_t);
 DEFINE_FP16_KERNEL_LAUNCHER(uint8_t);
 DEFINE_FP16_KERNEL_LAUNCHER(int);
 DEFINE_FP16_KERNEL_LAUNCHER(int64_t);
 DEFINE_FP16_KERNEL_LAUNCHER(float);
 DEFINE_FP16_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_FP16_KERNEL_LAUNCHER
......
@@ -268,14 +268,12 @@ DEFINE_KERNEL_LAUNCHER(float, float);
 DEFINE_KERNEL_LAUNCHER(float, double);
 DEFINE_KERNEL_LAUNCHER(double, float);
 DEFINE_KERNEL_LAUNCHER(double, double);
-
 DEFINE_FP16_KERNEL_LAUNCHER(int8_t);
 DEFINE_FP16_KERNEL_LAUNCHER(uint8_t);
 DEFINE_FP16_KERNEL_LAUNCHER(int);
 DEFINE_FP16_KERNEL_LAUNCHER(int64_t);
 DEFINE_FP16_KERNEL_LAUNCHER(float);
 DEFINE_FP16_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_FP16_KERNEL_LAUNCHER
......
@@ -51,7 +51,6 @@ DEFINE_KERNEL_LAUNCHER(int64_t);
 DEFINE_KERNEL_LAUNCHER(float16);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -57,7 +57,6 @@ DEFINE_KERNEL_LAUNCHER(int64_t);
 DEFINE_KERNEL_LAUNCHER(float16);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -38,7 +38,6 @@ DEFINE_KERNEL_LAUNCHER(int64_t);
 DEFINE_KERNEL_LAUNCHER(float16);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -75,7 +75,6 @@ DEFINE_KERNEL_LAUNCHER(int);
 DEFINE_KERNEL_LAUNCHER(int64_t);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -77,7 +77,6 @@ DEFINE_KERNEL_LAUNCHER(IndexSelect, int64_t);
 DEFINE_KERNEL_LAUNCHER(IndexSelect, float16);
 DEFINE_KERNEL_LAUNCHER(IndexSelect, float);
 DEFINE_KERNEL_LAUNCHER(IndexSelect, double);
-
 DEFINE_KERNEL_LAUNCHER(IndexSelectGrad, int8_t);
 DEFINE_KERNEL_LAUNCHER(IndexSelectGrad, uint8_t);
 DEFINE_KERNEL_LAUNCHER(IndexSelectGrad, int);
@@ -85,7 +84,6 @@ DEFINE_KERNEL_LAUNCHER(IndexSelectGrad, int64_t);
 DEFINE_KERNEL_LAUNCHER(IndexSelectGrad, float16);
 DEFINE_KERNEL_LAUNCHER(IndexSelectGrad, float);
 DEFINE_KERNEL_LAUNCHER(IndexSelectGrad, double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -112,11 +112,9 @@ DEFINE_KERNEL_LAUNCHER(int64_t);
 DEFINE_KERNEL_LAUNCHER(float16);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float16);
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -80,14 +80,12 @@ DEFINE_KERNEL_LAUNCHER(int64_t, int64_t);
 DEFINE_KERNEL_LAUNCHER(int64_t, float16);
 DEFINE_KERNEL_LAUNCHER(int64_t, float);
 DEFINE_KERNEL_LAUNCHER(int64_t, double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(int, float16);
 DEFINE_GRAD_KERNEL_LAUNCHER(int, float);
 DEFINE_GRAD_KERNEL_LAUNCHER(int, double);
 DEFINE_GRAD_KERNEL_LAUNCHER(int64_t, float16);
 DEFINE_GRAD_KERNEL_LAUNCHER(int64_t, float);
 DEFINE_GRAD_KERNEL_LAUNCHER(int64_t, double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -85,14 +85,12 @@ DEFINE_KERNEL_LAUNCHER(int64_t, int64_t);
 DEFINE_KERNEL_LAUNCHER(int64_t, float16);
 DEFINE_KERNEL_LAUNCHER(int64_t, float);
 DEFINE_KERNEL_LAUNCHER(int64_t, double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(int, float16);
 DEFINE_GRAD_KERNEL_LAUNCHER(int, float);
 DEFINE_GRAD_KERNEL_LAUNCHER(int, double);
 DEFINE_GRAD_KERNEL_LAUNCHER(int64_t, float16);
 DEFINE_GRAD_KERNEL_LAUNCHER(int64_t, float);
 DEFINE_GRAD_KERNEL_LAUNCHER(int64_t, double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -42,7 +42,6 @@ void _OneHot(
 DEFINE_KERNEL_LAUNCHER(int);
 DEFINE_KERNEL_LAUNCHER(int64_t);
 DEFINE_KERNEL_LAUNCHER(float);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -42,7 +42,6 @@ __global__ void _OneHot(
 DEFINE_KERNEL_LAUNCHER(int);
 DEFINE_KERNEL_LAUNCHER(int64_t);
 DEFINE_KERNEL_LAUNCHER(float);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -132,7 +132,6 @@ DEFINE_CONST_KERNEL_LAUNCHER(int64_t);
 DEFINE_CONST_KERNEL_LAUNCHER(float16);
 DEFINE_CONST_KERNEL_LAUNCHER(float);
 DEFINE_CONST_KERNEL_LAUNCHER(double);
-
 DEFINE_KERNEL_LAUNCHER(ReflectPad, bool);
 DEFINE_KERNEL_LAUNCHER(ReflectPad, int8_t);
 DEFINE_KERNEL_LAUNCHER(ReflectPad, uint8_t);
@@ -141,7 +140,6 @@ DEFINE_KERNEL_LAUNCHER(ReflectPad, int64_t);
 DEFINE_KERNEL_LAUNCHER(ReflectPad, float16);
 DEFINE_KERNEL_LAUNCHER(ReflectPad, float);
 DEFINE_KERNEL_LAUNCHER(ReflectPad, double);
-
 DEFINE_KERNEL_LAUNCHER(EdgePad, bool);
 DEFINE_KERNEL_LAUNCHER(EdgePad, int8_t);
 DEFINE_KERNEL_LAUNCHER(EdgePad, uint8_t);
@@ -150,9 +148,8 @@ DEFINE_KERNEL_LAUNCHER(EdgePad, int64_t);
 DEFINE_KERNEL_LAUNCHER(EdgePad, float16);
 DEFINE_KERNEL_LAUNCHER(EdgePad, float);
 DEFINE_KERNEL_LAUNCHER(EdgePad, double);
-
-#undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_CONST_KERNEL_LAUNCHER
+#undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -151,7 +151,6 @@ DEFINE_CONST_KERNEL_LAUNCHER(int64_t);
 DEFINE_CONST_KERNEL_LAUNCHER(float16);
 DEFINE_CONST_KERNEL_LAUNCHER(float);
 DEFINE_CONST_KERNEL_LAUNCHER(double);
-
 DEFINE_KERNEL_LAUNCHER(ReflectPad, bool);
 DEFINE_KERNEL_LAUNCHER(ReflectPad, int8_t);
 DEFINE_KERNEL_LAUNCHER(ReflectPad, uint8_t);
@@ -160,7 +159,6 @@ DEFINE_KERNEL_LAUNCHER(ReflectPad, int64_t);
 DEFINE_KERNEL_LAUNCHER(ReflectPad, float16);
 DEFINE_KERNEL_LAUNCHER(ReflectPad, float);
 DEFINE_KERNEL_LAUNCHER(ReflectPad, double);
-
 DEFINE_KERNEL_LAUNCHER(EdgePad, bool);
 DEFINE_KERNEL_LAUNCHER(EdgePad, int8_t);
 DEFINE_KERNEL_LAUNCHER(EdgePad, uint8_t);
@@ -169,9 +167,8 @@ DEFINE_KERNEL_LAUNCHER(EdgePad, int64_t);
 DEFINE_KERNEL_LAUNCHER(EdgePad, float16);
 DEFINE_KERNEL_LAUNCHER(EdgePad, float);
 DEFINE_KERNEL_LAUNCHER(EdgePad, double);
-
-#undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_CONST_KERNEL_LAUNCHER
+#undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -63,7 +63,6 @@ void ReduceSumGrad<float16, CPUContext>(
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -128,7 +128,6 @@ void ReduceSumGrad<float16, CUDAContext>(
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -75,11 +75,9 @@ DEFINE_KERNEL_LAUNCHER(Repeat, int64_t);
 DEFINE_KERNEL_LAUNCHER(Repeat, float16);
 DEFINE_KERNEL_LAUNCHER(Repeat, float);
 DEFINE_KERNEL_LAUNCHER(Repeat, double);
-
 DEFINE_KERNEL_LAUNCHER(RepeatGrad, float16);
 DEFINE_KERNEL_LAUNCHER(RepeatGrad, float);
 DEFINE_KERNEL_LAUNCHER(RepeatGrad, double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -139,10 +139,8 @@ DEFINE_KERNEL_LAUNCHER(int64_t);
 DEFINE_KERNEL_LAUNCHER(float16);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -76,7 +76,6 @@ DEFINE_KERNEL_LAUNCHER(Slice, int64_t);
 DEFINE_KERNEL_LAUNCHER(Slice, float16);
 DEFINE_KERNEL_LAUNCHER(Slice, float);
 DEFINE_KERNEL_LAUNCHER(Slice, double);
-
 DEFINE_KERNEL_LAUNCHER(SliceGrad, bool);
 DEFINE_KERNEL_LAUNCHER(SliceGrad, int8_t);
 DEFINE_KERNEL_LAUNCHER(SliceGrad, uint8_t);
@@ -85,7 +84,6 @@ DEFINE_KERNEL_LAUNCHER(SliceGrad, int64_t);
 DEFINE_KERNEL_LAUNCHER(SliceGrad, float16);
 DEFINE_KERNEL_LAUNCHER(SliceGrad, float);
 DEFINE_KERNEL_LAUNCHER(SliceGrad, double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -85,7 +85,6 @@ DEFINE_KERNEL_LAUNCHER(Slice, int64_t);
 DEFINE_KERNEL_LAUNCHER(Slice, float16);
 DEFINE_KERNEL_LAUNCHER(Slice, float);
 DEFINE_KERNEL_LAUNCHER(Slice, double);
-
 DEFINE_KERNEL_LAUNCHER(SliceGrad, bool);
 DEFINE_KERNEL_LAUNCHER(SliceGrad, int8_t);
 DEFINE_KERNEL_LAUNCHER(SliceGrad, uint8_t);
@@ -94,7 +93,6 @@ DEFINE_KERNEL_LAUNCHER(SliceGrad, int64_t);
 DEFINE_KERNEL_LAUNCHER(SliceGrad, float16);
 DEFINE_KERNEL_LAUNCHER(SliceGrad, float);
 DEFINE_KERNEL_LAUNCHER(SliceGrad, double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -85,11 +85,9 @@ DEFINE_KERNEL_LAUNCHER(int64_t);
 DEFINE_KERNEL_LAUNCHER(float16);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float16);
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -141,10 +141,8 @@ DEFINE_KERNEL_LAUNCHER(int64_t);
 DEFINE_KERNEL_LAUNCHER(float16);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -73,11 +73,9 @@ DEFINE_KERNEL_LAUNCHER(Transpose, int64_t);
 DEFINE_KERNEL_LAUNCHER(Transpose, float16);
 DEFINE_KERNEL_LAUNCHER(Transpose, float);
 DEFINE_KERNEL_LAUNCHER(Transpose, double);
-
 DEFINE_KERNEL_LAUNCHER(TransposeGrad, float16);
 DEFINE_KERNEL_LAUNCHER(TransposeGrad, float);
 DEFINE_KERNEL_LAUNCHER(TransposeGrad, double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -81,11 +81,9 @@ DEFINE_KERNEL_LAUNCHER(Transpose, int64_t);
 DEFINE_KERNEL_LAUNCHER(Transpose, float16);
 DEFINE_KERNEL_LAUNCHER(Transpose, float);
 DEFINE_KERNEL_LAUNCHER(Transpose, double);
-
 DEFINE_KERNEL_LAUNCHER(TransposeGrad, float16);
 DEFINE_KERNEL_LAUNCHER(TransposeGrad, float);
 DEFINE_KERNEL_LAUNCHER(TransposeGrad, double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -75,6 +75,7 @@ DEFINE_KERNEL_LAUNCHER(int);
 DEFINE_KERNEL_LAUNCHER(int64_t);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
+#undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -54,7 +54,6 @@ DEFINE_KERNEL_LAUNCHER(int64_t);
 DEFINE_KERNEL_LAUNCHER(float16);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -65,7 +65,6 @@ DEFINE_KERNEL_LAUNCHER(int64_t);
 DEFINE_KERNEL_LAUNCHER(float16);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -108,10 +108,8 @@ void BroadcastLossGrad<float16, CPUContext>(
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -231,10 +231,8 @@ void BroadcastLossGrad<float16, CUDAContext>(
 DEFINE_KERNEL_LAUNCHER(float16);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -88,12 +88,10 @@ DEFINE_KERNEL_LAUNCHER(NLLLoss, float, float);
 DEFINE_KERNEL_LAUNCHER(NLLLoss, float, int64_t);
 DEFINE_KERNEL_LAUNCHER(NLLLoss, double, double);
 DEFINE_KERNEL_LAUNCHER(NLLLoss, double, int64_t);
-
 DEFINE_KERNEL_LAUNCHER(NLLLossGrad, float, float);
 DEFINE_KERNEL_LAUNCHER(NLLLossGrad, float, int64_t);
 DEFINE_KERNEL_LAUNCHER(NLLLossGrad, double, double);
 DEFINE_KERNEL_LAUNCHER(NLLLossGrad, double, int64_t);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -87,12 +87,10 @@ DEFINE_KERNEL_LAUNCHER(NLLLoss, float, float);
 DEFINE_KERNEL_LAUNCHER(NLLLoss, float, int64_t);
 DEFINE_KERNEL_LAUNCHER(NLLLoss, double, double);
 DEFINE_KERNEL_LAUNCHER(NLLLoss, double, int64_t);
-
 DEFINE_KERNEL_LAUNCHER(NLLLossGrad, float, float);
 DEFINE_KERNEL_LAUNCHER(NLLLossGrad, float, int64_t);
 DEFINE_KERNEL_LAUNCHER(NLLLossGrad, double, double);
 DEFINE_KERNEL_LAUNCHER(NLLLossGrad, double, int64_t);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -68,10 +68,8 @@ void _SigmoidCrossEntropyGrad(
 DEFINE_KERNEL_LAUNCHER(SigmoidCrossEntropy, float);
 DEFINE_KERNEL_LAUNCHER(SigmoidCrossEntropy, double);
-
 DEFINE_KERNEL_LAUNCHER(SigmoidCrossEntropyGrad, float);
 DEFINE_KERNEL_LAUNCHER(SigmoidCrossEntropyGrad, double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -64,10 +64,8 @@ __global__ void _SigmoidCrossEntropyGrad(
 DEFINE_KERNEL_LAUNCHER(SigmoidCrossEntropy, float);
 DEFINE_KERNEL_LAUNCHER(SigmoidCrossEntropy, double);
-
 DEFINE_KERNEL_LAUNCHER(SigmoidCrossEntropyGrad, float);
 DEFINE_KERNEL_LAUNCHER(SigmoidCrossEntropyGrad, double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -137,12 +137,10 @@ DEFINE_KERNEL_LAUNCHER(SigmoidFocalLoss, float, float);
 DEFINE_KERNEL_LAUNCHER(SigmoidFocalLoss, float, int64_t);
 DEFINE_KERNEL_LAUNCHER(SigmoidFocalLoss, double, double);
 DEFINE_KERNEL_LAUNCHER(SigmoidFocalLoss, double, int64_t);
-
 DEFINE_KERNEL_LAUNCHER(SigmoidFocalLossGrad, float, float);
 DEFINE_KERNEL_LAUNCHER(SigmoidFocalLossGrad, float, int64_t);
 DEFINE_KERNEL_LAUNCHER(SigmoidFocalLossGrad, double, double);
 DEFINE_KERNEL_LAUNCHER(SigmoidFocalLossGrad, double, int64_t);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -132,12 +132,10 @@ DEFINE_KERNEL_LAUNCHER(SigmoidFocalLoss, float, float);
 DEFINE_KERNEL_LAUNCHER(SigmoidFocalLoss, float, int64_t);
 DEFINE_KERNEL_LAUNCHER(SigmoidFocalLoss, double, double);
 DEFINE_KERNEL_LAUNCHER(SigmoidFocalLoss, double, int64_t);
-
 DEFINE_KERNEL_LAUNCHER(SigmoidFocalLossGrad, float, float);
 DEFINE_KERNEL_LAUNCHER(SigmoidFocalLossGrad, float, int64_t);
 DEFINE_KERNEL_LAUNCHER(SigmoidFocalLossGrad, double, double);
 DEFINE_KERNEL_LAUNCHER(SigmoidFocalLossGrad, double, int64_t);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -57,10 +57,8 @@ void SmoothL1Grad<float16, CPUContext>(
 DEFINE_KERNEL_LAUNCHER(SmoothL1, float);
 DEFINE_KERNEL_LAUNCHER(SmoothL1, double);
-
 DEFINE_KERNEL_LAUNCHER(SmoothL1Grad, float);
 DEFINE_KERNEL_LAUNCHER(SmoothL1Grad, double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -90,10 +90,8 @@ void SmoothL1Grad<float16, CUDAContext>(
 DEFINE_KERNEL_LAUNCHER(SmoothL1, float);
 DEFINE_KERNEL_LAUNCHER(SmoothL1, double);
-
 DEFINE_KERNEL_LAUNCHER(SmoothL1Grad, float);
 DEFINE_KERNEL_LAUNCHER(SmoothL1Grad, double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -37,7 +37,6 @@ void _SoftmaxCrossEntropy(
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -41,7 +41,6 @@ __global__ void _SoftmaxCrossEntropy(
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -95,12 +95,10 @@ DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropy, float, float);
 DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropy, float, int64_t);
 DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropy, double, double);
 DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropy, double, int64_t);
-
 DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropyGrad, float, float);
 DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropyGrad, float, int64_t);
 DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropyGrad, double, double);
 DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropyGrad, double, int64_t);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -93,12 +93,10 @@ DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropy, float, float);
 DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropy, float, int64_t);
 DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropy, double, double);
 DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropy, double, int64_t);
-
 DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropyGrad, float, float);
 DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropyGrad, float, int64_t);
 DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropyGrad, double, double);
 DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropyGrad, double, int64_t);
-
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
@@ -104,11 +104,9 @@ DEFINE_KERNEL_LAUNCHER(int64_t);
 DEFINE_KERNEL_LAUNCHER(float16);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float16);
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
@@ -160,10 +160,8 @@ DEFINE_KERNEL_LAUNCHER(int);
 DEFINE_KERNEL_LAUNCHER(int64_t);
 DEFINE_KERNEL_LAUNCHER(float);
 DEFINE_KERNEL_LAUNCHER(double);
-
 DEFINE_GRAD_KERNEL_LAUNCHER(float);
 DEFINE_GRAD_KERNEL_LAUNCHER(double);
-
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
...@@ -155,7 +155,7 @@ void Moments<float16, float, CPUContext>( ...@@ -155,7 +155,7 @@ void Moments<float16, float, CPUContext>(
CPU_FP16_NOT_SUPPORTED; CPU_FP16_NOT_SUPPORTED;
} }
#define DEFINE_MOMENTS_KERNEL_LAUNCHER(Tx, Ty) \ #define DEFINE_KERNEL_LAUNCHER(Tx, Ty) \
template <> \ template <> \
void Moments<Tx, Ty, CPUContext>( \ void Moments<Tx, Ty, CPUContext>( \
const int num_dims, \ const int num_dims, \
...@@ -169,14 +169,13 @@ void Moments<float16, float, CPUContext>( ...@@ -169,14 +169,13 @@ void Moments<float16, float, CPUContext>(
_Moments(num_dims, dims, num_axes, axes, x, mean, var, ctx); \ _Moments(num_dims, dims, num_axes, axes, x, mean, var, ctx); \
} }
DEFINE_MOMENTS_KERNEL_LAUNCHER(int8_t, float); DEFINE_KERNEL_LAUNCHER(int8_t, float);
DEFINE_MOMENTS_KERNEL_LAUNCHER(uint8_t, float); DEFINE_KERNEL_LAUNCHER(uint8_t, float);
DEFINE_MOMENTS_KERNEL_LAUNCHER(int, float); DEFINE_KERNEL_LAUNCHER(int, float);
DEFINE_MOMENTS_KERNEL_LAUNCHER(int64_t, float); DEFINE_KERNEL_LAUNCHER(int64_t, float);
DEFINE_MOMENTS_KERNEL_LAUNCHER(float, float); DEFINE_KERNEL_LAUNCHER(float, float);
DEFINE_MOMENTS_KERNEL_LAUNCHER(double, double); DEFINE_KERNEL_LAUNCHER(double, double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_MOMENTS_KERNEL_LAUNCHER
} // namespace kernel } // namespace kernel
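The Moments launchers reduce the input over a chosen axis set into per-slice mean and variance. A plain C++ sketch of the single-axis case; it uses the one-pass identity var(x) = E[x^2] - E[x]^2 that such reductions typically rely on, while the real kernel additionally handles arbitrary dims/axes:

// Reduce each row of an [outer, reduce]-shaped buffer to mean/variance.
void Moments1D(int outer, int reduce, const float* x, float* mean, float* var) {
  for (int i = 0; i < outer; ++i) {
    double s = 0.0, sq = 0.0;
    for (int j = 0; j < reduce; ++j) {
      const double v = x[i * reduce + j];
      s += v;
      sq += v * v;
    }
    const double m = s / reduce;
    mean[i] = static_cast<float>(m);
    var[i] = static_cast<float>(sq / reduce - m * m);  // E[x^2] - E[x]^2
  }
}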
......
...@@ -316,7 +316,6 @@ DEFINE_KERNEL_LAUNCHER(int, float); ...@@ -316,7 +316,6 @@ DEFINE_KERNEL_LAUNCHER(int, float);
DEFINE_KERNEL_LAUNCHER(int64_t, float); DEFINE_KERNEL_LAUNCHER(int64_t, float);
DEFINE_KERNEL_LAUNCHER(float, float); DEFINE_KERNEL_LAUNCHER(float, float);
DEFINE_KERNEL_LAUNCHER(double, double); DEFINE_KERNEL_LAUNCHER(double, double);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel } // namespace kernel
......
...@@ -177,12 +177,6 @@ void L2NormalizeGrad<float16, CPUContext>( ...@@ -177,12 +177,6 @@ void L2NormalizeGrad<float16, CPUContext>(
_##name(outer_dim, inner_dim, reduce_dim, (T)scale, (T)eps, x, y); \ _##name(outer_dim, inner_dim, reduce_dim, (T)scale, (T)eps, x, y); \
} }
DEFINE_KERNEL_LAUNCHER(L1Normalize, float);
DEFINE_KERNEL_LAUNCHER(L1Normalize, double);
DEFINE_KERNEL_LAUNCHER(L2Normalize, float);
DEFINE_KERNEL_LAUNCHER(L2Normalize, double);
#undef DEFINE_KERNEL_LAUNCHER
#define DEFINE_GRAD_KERNEL_LAUNCHER(name, T) \ #define DEFINE_GRAD_KERNEL_LAUNCHER(name, T) \
template <> \ template <> \
void name<T, CPUContext>( \ void name<T, CPUContext>( \
...@@ -198,10 +192,15 @@ DEFINE_KERNEL_LAUNCHER(L2Normalize, double); ...@@ -198,10 +192,15 @@ DEFINE_KERNEL_LAUNCHER(L2Normalize, double);
_##name(outer_dim, inner_dim, reduce_dim, (T)scale, (T)eps, dy, x, dx); \ _##name(outer_dim, inner_dim, reduce_dim, (T)scale, (T)eps, dy, x, dx); \
} }
DEFINE_KERNEL_LAUNCHER(L1Normalize, float);
DEFINE_KERNEL_LAUNCHER(L1Normalize, double);
DEFINE_KERNEL_LAUNCHER(L2Normalize, float);
DEFINE_KERNEL_LAUNCHER(L2Normalize, double);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, float); DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, double); DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, double);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, float); DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, double); DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER #undef DEFINE_GRAD_KERNEL_LAUNCHER
} // namespace kernel } // namespace kernel
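The launchers above pass (outer_dim, inner_dim, reduce_dim, scale, eps) down to the L1/L2 kernels; the usual form of such an op, assumed here since the kernel bodies are elided from the diff, is y = scale * x / (eps + ||x||_p) computed independently per reduced slice. A CPU sketch of the contiguous (inner_dim == 1) L2 case under that assumption:

#include <cmath>

// L2-normalize each contiguous row: y = scale * x / (eps + ||x||_2).
void L2NormalizeRows(int rows, int cols, float scale, float eps,
                     const float* x, float* y) {
  for (int i = 0; i < rows; ++i) {
    float sq = 0.f;
    for (int j = 0; j < cols; ++j) {
      sq += x[i * cols + j] * x[i * cols + j];
    }
    const float denom = eps + std::sqrt(sq);
    for (int j = 0; j < cols; ++j) {
      y[i * cols + j] = scale * x[i * cols + j] / denom;
    }
  }
}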
......
...@@ -170,14 +170,6 @@ __global__ void _L2NormalizeGrad( ...@@ -170,14 +170,6 @@ __global__ void _L2NormalizeGrad(
reinterpret_cast<ScalarT*>(y)); \ reinterpret_cast<ScalarT*>(y)); \
} }
DEFINE_KERNEL_LAUNCHER(L1Normalize, float16, half, float);
DEFINE_KERNEL_LAUNCHER(L1Normalize, float, float, float);
DEFINE_KERNEL_LAUNCHER(L1Normalize, double, double, double);
DEFINE_KERNEL_LAUNCHER(L2Normalize, float16, half, float);
DEFINE_KERNEL_LAUNCHER(L2Normalize, float, float, float);
DEFINE_KERNEL_LAUNCHER(L2Normalize, double, double, double);
#undef DEFINE_KERNEL_LAUNCHER
#define DEFINE_GRAD_KERNEL_LAUNCHER(name, T, ScalarT, AccT) \ #define DEFINE_GRAD_KERNEL_LAUNCHER(name, T, ScalarT, AccT) \
template <> \ template <> \
void name<T, CUDAContext>( \ void name<T, CUDAContext>( \
...@@ -203,12 +195,19 @@ DEFINE_KERNEL_LAUNCHER(L2Normalize, double, double, double); ...@@ -203,12 +195,19 @@ DEFINE_KERNEL_LAUNCHER(L2Normalize, double, double, double);
reinterpret_cast<ScalarT*>(dx)); \ reinterpret_cast<ScalarT*>(dx)); \
} }
DEFINE_KERNEL_LAUNCHER(L1Normalize, float16, half, float);
DEFINE_KERNEL_LAUNCHER(L1Normalize, float, float, float);
DEFINE_KERNEL_LAUNCHER(L1Normalize, double, double, double);
DEFINE_KERNEL_LAUNCHER(L2Normalize, float16, half, float);
DEFINE_KERNEL_LAUNCHER(L2Normalize, float, float, float);
DEFINE_KERNEL_LAUNCHER(L2Normalize, double, double, double);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, float16, half, float); DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, float16, half, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, float, float, float); DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, float, float, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, double, double, double); DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, double, double, double);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, float16, half, float); DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, float16, half, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, float, float, float); DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, float, float, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, double, double, double); DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, double, double, double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER #undef DEFINE_GRAD_KERNEL_LAUNCHER
} // namespace kernel } // namespace kernel
......
...@@ -293,10 +293,8 @@ void _AvgPool2dGradNHWC( ...@@ -293,10 +293,8 @@ void _AvgPool2dGradNHWC(
DEFINE_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double); DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float); DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double); DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
...@@ -325,10 +325,8 @@ __global__ void _AvgPool2dGradNHWC( ...@@ -325,10 +325,8 @@ __global__ void _AvgPool2dGradNHWC(
DEFINE_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double); DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float); DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double); DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
...@@ -76,7 +76,6 @@ DEFINE_KERNEL_LAUNCHER(int); ...@@ -76,7 +76,6 @@ DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t); DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double); DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel } // namespace kernel
......
...@@ -103,7 +103,6 @@ DEFINE_KERNEL_LAUNCHER(int); ...@@ -103,7 +103,6 @@ DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t); DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double); DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel } // namespace kernel
......
...@@ -304,7 +304,6 @@ void _Col2Im2dNHWC( ...@@ -304,7 +304,6 @@ void _Col2Im2dNHWC(
DEFINE_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double); DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel } // namespace kernel
......
...@@ -356,7 +356,6 @@ __global__ void _Col2Im2dNHWC( ...@@ -356,7 +356,6 @@ __global__ void _Col2Im2dNHWC(
DEFINE_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double); DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel } // namespace kernel
......
...@@ -243,6 +243,7 @@ void DepthwiseConv2d<float, CPUContext>( ...@@ -243,6 +243,7 @@ void DepthwiseConv2d<float, CPUContext>(
DEFINE_GRAD_KERNEL_LAUNCHER(float16); DEFINE_GRAD_KERNEL_LAUNCHER(float16);
DEFINE_GRAD_KERNEL_LAUNCHER(float); DEFINE_GRAD_KERNEL_LAUNCHER(float);
#undef DEFINE_GRAD_KERNEL_LAUNCHER #undef DEFINE_GRAD_KERNEL_LAUNCHER
#undef DISPATCH_DATA_KERNEL
} // namespace kernel } // namespace kernel
......
...@@ -12,9 +12,9 @@ namespace kernel { ...@@ -12,9 +12,9 @@ namespace kernel {
namespace { namespace {
#if __CUDA_ARCH__ >= 350 #if __CUDA_ARCH__ >= 350
#define LOAD(x, i) __ldg(x + i) #define LDG(x, i) __ldg(x + i)
#else #else
#define LOAD(x, i) x[i] #define LDG(x, i) x[i]
#endif #endif
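The renamed LDG macro wraps __ldg(), which loads through the read-only (texture) data cache and only exists from compute capability 3.5 on, hence the __CUDA_ARCH__ guard; below 3.5 it degrades to a plain indexed load. The guard pattern in isolation, on a trivial kernel:

#if __CUDA_ARCH__ >= 350
#define LDG(x, i) __ldg((x) + (i))
#else
#define LDG(x, i) (x)[i]
#endif

template <typename T>
__global__ void CopyKernel(const int n, const T* in, T* out) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  // Safe because 'in' is never written during this kernel's lifetime.
  if (i < n) out[i] = LDG(in, i);
}
#undef LDG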
template <typename T, typename AccT, int KKH, int KKW> template <typename T, typename AccT, int KKH, int KKW>
...@@ -60,7 +60,7 @@ __global__ void _DepthwiseConv2dNCHW( ...@@ -60,7 +60,7 @@ __global__ void _DepthwiseConv2dNCHW(
iw = iw_start + kw * dilation_w; iw = iw_start + kw * dilation_w;
if (ih >= 0 && ih < H && iw >= 0 && iw < W) { if (ih >= 0 && ih < H && iw >= 0 && iw < W) {
xi = x_start + ih * W + iw; xi = x_start + ih * W + iw;
sum_val += convert::To<AccT>(Multiplies(LOAD(x, xi), LOAD(w, wi))); sum_val += convert::To<AccT>(Multiplies(LDG(x, xi), LDG(w, wi)));
} }
++wi; ++wi;
} // End kw } // End kw
...@@ -112,7 +112,7 @@ __global__ void _DepthwiseConv2dNHWC( ...@@ -112,7 +112,7 @@ __global__ void _DepthwiseConv2dNHWC(
iw = iw_start + kw * dilation_w; iw = iw_start + kw * dilation_w;
if (ih >= 0 && ih < H && iw >= 0 && iw < W) { if (ih >= 0 && ih < H && iw >= 0 && iw < W) {
xi = ((x_start + ih) * W + iw) * C + c; xi = ((x_start + ih) * W + iw) * C + c;
sum_val += convert::To<AccT>(Multiplies(LOAD(x, xi), LOAD(w, wi))); sum_val += convert::To<AccT>(Multiplies(LDG(x, xi), LDG(w, wi)));
} }
++wi; ++wi;
} // End kw } // End kw
...@@ -164,7 +164,7 @@ __global__ void _DepthwiseConv2dGradNCHW( ...@@ -164,7 +164,7 @@ __global__ void _DepthwiseConv2dGradNCHW(
ow = ow / stride_w; ow = ow / stride_w;
if (oh >= 0 && oh < out_h && ow >= 0 && ow < out_w) { if (oh >= 0 && oh < out_h && ow >= 0 && ow < out_w) {
yi = y_start + oh * out_w + ow; yi = y_start + oh * out_w + ow;
sum_val += convert::To<AccT>(Multiplies(LOAD(dy, yi), LOAD(w, wi))); sum_val += convert::To<AccT>(Multiplies(LDG(dy, yi), LDG(w, wi)));
} }
} }
++wi; ++wi;
...@@ -217,7 +217,7 @@ __global__ void _DepthwiseConv2dGradNHWC( ...@@ -217,7 +217,7 @@ __global__ void _DepthwiseConv2dGradNHWC(
ow = ow / stride_w; ow = ow / stride_w;
if (oh >= 0 && oh < out_h && ow >= 0 && ow < out_w) { if (oh >= 0 && oh < out_h && ow >= 0 && ow < out_w) {
yi = ((y_start + oh) * out_w + ow) * C + c; yi = ((y_start + oh) * out_w + ow) * C + c;
sum_val += convert::To<AccT>(Multiplies(LOAD(dy, yi), LOAD(w, wi))); sum_val += convert::To<AccT>(Multiplies(LDG(dy, yi), LDG(w, wi)));
} }
} }
++wi; ++wi;
...@@ -267,7 +267,7 @@ __global__ void _DepthwiseConv2dWGradNCHW( ...@@ -267,7 +267,7 @@ __global__ void _DepthwiseConv2dWGradNCHW(
if (ih >= 0 && iw >= 0 && ih < H && iw < W) { if (ih >= 0 && iw >= 0 && ih < H && iw < W) {
xi = ((i * C + c) * H + ih) * W + iw; xi = ((i * C + c) * H + ih) * W + iw;
yi = (i * C + c) * out_h * out_w + j; yi = (i * C + c) * out_h * out_w + j;
sum_val += convert::To<AccT>(Multiplies(LOAD(dy, yi), LOAD(x, xi))); sum_val += convert::To<AccT>(Multiplies(LDG(dy, yi), LDG(x, xi)));
} }
} }
} }
...@@ -320,7 +320,7 @@ __global__ void _DepthwiseConv2dWGradNHWC( ...@@ -320,7 +320,7 @@ __global__ void _DepthwiseConv2dWGradNHWC(
if (ih >= 0 && iw >= 0 && ih < H && iw < W) { if (ih >= 0 && iw >= 0 && ih < H && iw < W) {
xi = ((i * H + ih) * W + iw) * C + c; xi = ((i * H + ih) * W + iw) * C + c;
yi = (i * ohw + j) * C + c; yi = (i * ohw + j) * C + c;
sum_val += convert::To<AccT>(Multiplies(LOAD(dy, yi), LOAD(x, xi))); sum_val += convert::To<AccT>(Multiplies(LDG(dy, yi), LDG(x, xi)));
} }
} }
} }
...@@ -333,7 +333,7 @@ __global__ void _DepthwiseConv2dWGradNHWC( ...@@ -333,7 +333,7 @@ __global__ void _DepthwiseConv2dWGradNHWC(
} }
} }
#undef LOAD #undef LDG
} // namespace } // namespace
...@@ -528,10 +528,8 @@ __global__ void _DepthwiseConv2dWGradNHWC( ...@@ -528,10 +528,8 @@ __global__ void _DepthwiseConv2dWGradNHWC(
DEFINE_KERNEL_LAUNCHER(float16, half, float); DEFINE_KERNEL_LAUNCHER(float16, half, float);
DEFINE_KERNEL_LAUNCHER(float, float, float); DEFINE_KERNEL_LAUNCHER(float, float, float);
DEFINE_GRAD_KERNEL_LAUNCHER(float16, half, float); DEFINE_GRAD_KERNEL_LAUNCHER(float16, half, float);
DEFINE_GRAD_KERNEL_LAUNCHER(float, float, float); DEFINE_GRAD_KERNEL_LAUNCHER(float, float, float);
#undef DISPATCH_DATA_KERNEL #undef DISPATCH_DATA_KERNEL
#undef DISPATCH_WEIGHT_KERNEL #undef DISPATCH_WEIGHT_KERNEL
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
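A depthwise convolution pairs every input channel with its own single filter, so unlike a dense convolution there is no reduction across channels; the kernels above just walk one kernel window per (n, c, oh, ow) output element. A naive NCHW reference sketch, assuming a (C, kh, kw) weight layout and no channel multiplier:

void DepthwiseConv2dRef(int N, int C, int H, int W, int out_h, int out_w,
                        int kh, int kw, int stride, int pad,
                        const float* x, const float* w, float* y) {
  for (int n = 0; n < N; ++n)
    for (int c = 0; c < C; ++c)
      for (int oh = 0; oh < out_h; ++oh)
        for (int ow = 0; ow < out_w; ++ow) {
          float sum = 0.f;
          for (int i = 0; i < kh; ++i)
            for (int j = 0; j < kw; ++j) {
              const int ih = oh * stride - pad + i;
              const int iw = ow * stride - pad + j;
              if (ih >= 0 && ih < H && iw >= 0 && iw < W)
                sum += x[((n * C + c) * H + ih) * W + iw] *
                       w[(c * kh + i) * kw + j];  // one filter per channel
            }
          y[((n * C + c) * out_h + oh) * out_w + ow] = sum;
        }
}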
......
...@@ -287,10 +287,8 @@ void _MaxPool2dGradNHWC( ...@@ -287,10 +287,8 @@ void _MaxPool2dGradNHWC(
DEFINE_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double); DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float); DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double); DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
...@@ -335,10 +335,8 @@ __global__ void _MaxPool2dGradNHWC( ...@@ -335,10 +335,8 @@ __global__ void _MaxPool2dGradNHWC(
DEFINE_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double); DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float); DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double); DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
...@@ -272,10 +272,8 @@ DEFINE_KERNEL_LAUNCHER(int); ...@@ -272,10 +272,8 @@ DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t); DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double); DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float); DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double); DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
...@@ -11,11 +11,9 @@ namespace kernel { ...@@ -11,11 +11,9 @@ namespace kernel {
namespace { namespace {
#if __CUDA_ARCH__ >= 350 #if __CUDA_ARCH__ >= 350
#define LOAD(x, i) __ldg(x + i) #define LDG(x, i) convert::To<float>(__ldg(x + i))
#define LOADF(x, i) __half2float(__ldg(x + i))
#else #else
#define LOAD(x, i) x[i] #define LDG(x, i) convert::To<float>(x[i])
#define LOADF(x, i) __half2float(x[i])
#endif #endif
template <typename T> template <typename T>
...@@ -70,53 +68,13 @@ __global__ void _ResizeLinearNCHW( ...@@ -70,53 +68,13 @@ __global__ void _ResizeLinearNCHW(
const float u = w_in - li; const float u = w_in - li;
const int offset = (n * C + c) * H; const int offset = (n * C + c) * H;
const float tl = LOAD(x, ((offset + ti) * W + li)); const float tl = LDG(x, ((offset + ti) * W + li));
const float tr = LOAD(x, ((offset + ti) * W + ri)); const float tr = LDG(x, ((offset + ti) * W + ri));
const float bl = LOAD(x, ((offset + bi) * W + li)); const float bl = LDG(x, ((offset + bi) * W + li));
const float br = LOAD(x, ((offset + bi) * W + ri)); const float br = LDG(x, ((offset + bi) * W + ri));
const float t = tl + (tr - tl) * u; const float t = tl + (tr - tl) * u;
const float b = bl + (br - bl) * u; const float b = bl + (br - bl) * u;
y[yi] = (T)(t + (b - t) * v); y[yi] = convert::To<T>(t + (b - t) * v);
}
}
template <>
__global__ void _ResizeLinearNCHW<half>(
const int nthreads,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const float scale_h,
const float scale_w,
const bool align_corners,
const half* x,
half* y) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int w = yi % out_w;
const int h = (yi / out_w) % out_h;
const int c = (yi / out_w / out_h) % C;
const int n = yi / out_w / out_h / C;
const float h_in = TransformCoordinate(h, scale_h, align_corners);
const int ti = floorf(h_in);
const int bi = h_in < H - 1 ? ceilf(h_in) : H - 1;
const float v = h_in - ti;
const float w_in = TransformCoordinate(w, scale_w, align_corners);
const int li = floorf(w_in);
const int ri = (w_in < W - 1) ? ceilf(w_in) : W - 1;
const float u = w_in - li;
const int offset = (n * C + c) * H;
const float tl = LOADF(x, ((offset + ti) * W + li));
const float tr = LOADF(x, ((offset + ti) * W + ri));
const float bl = LOADF(x, ((offset + bi) * W + li));
const float br = LOADF(x, ((offset + bi) * W + ri));
const float t = tl + (tr - tl) * u;
const float b = bl + (br - bl) * u;
y[yi] = __float2half(t + (b - t) * v);
} }
} }
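The four LDG loads above are the corners of a standard bilinear interpolation: with horizontal fraction u and vertical fraction v, the kernel lerps along the top and bottom rows, then lerps between the two results. Extracted as a standalone helper:

// Bilinear blend of four corner samples; u, v in [0, 1).
inline float Bilinear(float tl, float tr, float bl, float br, float u, float v) {
  const float t = tl + (tr - tl) * u;  // top-row lerp
  const float b = bl + (br - bl) * u;  // bottom-row lerp
  return t + (b - t) * v;              // vertical lerp
}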
...@@ -150,53 +108,13 @@ __global__ void _ResizeLinearNHWC( ...@@ -150,53 +108,13 @@ __global__ void _ResizeLinearNHWC(
const float u = w_in - li; const float u = w_in - li;
const int offset = n * H; const int offset = n * H;
const float tl = LOAD(x, (((offset + ti) * W + li) * C + c)); const float tl = LDG(x, (((offset + ti) * W + li) * C + c));
const float tr = LOAD(x, (((offset + ti) * W + ri) * C + c)); const float tr = LDG(x, (((offset + ti) * W + ri) * C + c));
const float bl = LOAD(x, (((offset + bi) * W + li) * C + c)); const float bl = LDG(x, (((offset + bi) * W + li) * C + c));
const float br = LOAD(x, (((offset + bi) * W + ri) * C + c)); const float br = LDG(x, (((offset + bi) * W + ri) * C + c));
const float t = tl + (tr - tl) * u;
const float b = bl + (br - bl) * u;
y[yi] = (T)(t + (b - t) * v);
}
}
template <>
__global__ void _ResizeLinearNHWC<half>(
const int nthreads,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const float scale_h,
const float scale_w,
const bool align_corners,
const half* x,
half* y) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int c = yi % C;
const int w = (yi / C) % out_w;
const int h = (yi / C / out_w) % out_h;
const int n = yi / C / out_w / out_h;
const float h_in = TransformCoordinate(h, scale_h, align_corners);
const int ti = floorf(h_in);
const int bi = (h_in < H - 1) ? ceilf(h_in) : H - 1;
const float v = h_in - ti;
const float w_in = TransformCoordinate(w, scale_w, align_corners);
const int li = floorf(w_in);
const int ri = (w_in < W - 1) ? ceilf(w_in) : W - 1;
const float u = w_in - li;
const int offset = n * H;
const float tl = LOADF(x, (((offset + ti) * W + li) * C + c));
const float tr = LOADF(x, (((offset + ti) * W + ri) * C + c));
const float bl = LOADF(x, (((offset + bi) * W + li) * C + c));
const float br = LOADF(x, (((offset + bi) * W + ri) * C + c));
const float t = tl + (tr - tl) * u; const float t = tl + (tr - tl) * u;
const float b = bl + (br - bl) * u; const float b = bl + (br - bl) * u;
y[yi] = __float2half(t + (b - t) * v); y[yi] = convert::To<T>(t + (b - t) * v);
} }
} }
...@@ -229,48 +147,8 @@ __global__ void _ResizeLinearGradNCHW( ...@@ -229,48 +147,8 @@ __global__ void _ResizeLinearGradNCHW(
const int ri = (w_in < W - 1) ? ceilf(w_in) : W - 1; const int ri = (w_in < W - 1) ? ceilf(w_in) : W - 1;
const float u = w_in - li; const float u = w_in - li;
const float dt = (1.f - v) * LOAD(dy, yi); const float dt = (1.f - v) * LDG(dy, yi);
const float db = v * LOAD(dy, yi); const float db = v * LDG(dy, yi);
const int offset = (n * C + c) * H;
atomicAdd(&dx[(offset + ti) * W + li], (1.f - u) * dt);
atomicAdd(&dx[(offset + ti) * W + ri], u * dt);
atomicAdd(&dx[(offset + bi) * W + li], (1.f - u) * db);
atomicAdd(&dx[(offset + bi) * W + ri], u * db);
}
}
template <>
__global__ void _ResizeLinearGradNCHW<half>(
const int nthreads,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const float scale_h,
const float scale_w,
const bool align_corners,
const half* dy,
float* dx) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int w = yi % out_w;
const int h = (yi / out_w) % out_h;
const int c = (yi / out_w / out_h) % C;
const int n = yi / out_w / out_h / C;
const float h_in = TransformCoordinate(h, scale_h, align_corners);
const int ti = floorf(h_in);
const int bi = (h_in < H - 1) ? ceilf(h_in) : H - 1;
const float v = h_in - ti;
const float w_in = TransformCoordinate(w, scale_w, align_corners);
const int li = floorf(w_in);
const int ri = (w_in < W - 1) ? ceilf(w_in) : W - 1;
const float u = w_in - li;
const float dt = (1.f - v) * LOADF(dy, yi);
const float db = v * LOADF(dy, yi);
const int offset = (n * C + c) * H; const int offset = (n * C + c) * H;
atomicAdd(&dx[(offset + ti) * W + li], (1.f - u) * dt); atomicAdd(&dx[(offset + ti) * W + li], (1.f - u) * dt);
...@@ -309,48 +187,8 @@ __global__ void _ResizeLinearGradNHWC( ...@@ -309,48 +187,8 @@ __global__ void _ResizeLinearGradNHWC(
const int ri = (w_in < W - 1) ? ceilf(w_in) : W - 1; const int ri = (w_in < W - 1) ? ceilf(w_in) : W - 1;
const float u = w_in - li; const float u = w_in - li;
const float dt = (1.f - v) * LOAD(dy, yi); const float dt = (1.f - v) * LDG(dy, yi);
const float db = v * LOAD(dy, yi); const float db = v * LDG(dy, yi);
const int offset = n * H;
atomicAdd(&dx[((offset + ti) * W + li) * C + c], (1.f - u) * dt);
atomicAdd(&dx[((offset + ti) * W + ri) * C + c], u * dt);
atomicAdd(&dx[((offset + bi) * W + li) * C + c], (1.f - u) * db);
atomicAdd(&dx[((offset + bi) * W + ri) * C + c], u * db);
}
}
template <>
__global__ void _ResizeLinearGradNHWC<half>(
const int nthreads,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const float scale_h,
const float scale_w,
const bool align_corners,
const half* dy,
float* dx) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int c = yi % C;
const int w = (yi / C) % out_w;
const int h = (yi / C / out_w) % out_h;
const int n = yi / C / out_w / out_h;
const float h_in = TransformCoordinate(h, scale_h, align_corners);
const int ti = floorf(h_in);
const int bi = (h_in < H - 1) ? ceilf(h_in) : H - 1;
const float v = h_in - ti;
const float w_in = TransformCoordinate(w, scale_w, align_corners);
const int li = floorf(w_in);
const int ri = (w_in < W - 1) ? ceilf(w_in) : W - 1;
const float u = w_in - li;
const float dt = (1.f - v) * LOADF(dy, yi);
const float db = v * LOADF(dy, yi);
const int offset = n * H; const int offset = n * H;
atomicAdd(&dx[((offset + ti) * W + li) * C + c], (1.f - u) * dt); atomicAdd(&dx[((offset + ti) * W + li) * C + c], (1.f - u) * dt);
...@@ -360,125 +198,22 @@ __global__ void _ResizeLinearGradNHWC<half>( ...@@ -360,125 +198,22 @@ __global__ void _ResizeLinearGradNHWC<half>(
} }
} }
#undef LOAD #undef LDG
#undef LOADF
} // namespace } // namespace
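The backward kernels invert that interpolation: each upstream element dy[yi] is scattered to the same four corners with the same (1-u, u) x (1-v, v) weights, and atomicAdd is required because adjacent output pixels map to overlapping corner sets. The weight algebra on its own (the device code uses CUDA atomicAdd in place of the plain += here):

// Distribute one upstream gradient g across the four bilinear corners.
void ScatterBilinearGrad(float g, float u, float v,
                         float& tl, float& tr, float& bl, float& br) {
  const float dt = (1.f - v) * g;  // share assigned to the top row
  const float db = v * g;          // share assigned to the bottom row
  tl += (1.f - u) * dt;
  tr += u * dt;
  bl += (1.f - u) * db;
  br += u * db;                    // the four weights sum to g
}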
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
template <> #define DISPATCH_RESIZE_KERNEL(name, T, nblocks, nthreads, ...) \
void ResizeLinear<float16, CUDAContext>( if (data_format == "NCHW") { \
const int N, name##NCHW<<<nblocks, nthreads, 0, ctx->cuda_stream()>>>(__VA_ARGS__); \
const int C, } else if (data_format == "NHWC") { \
const int H, name##NHWC<<<nblocks, nthreads, 0, ctx->cuda_stream()>>>(__VA_ARGS__); \
const int W, } else { \
const int out_h, LOG(FATAL) << "Unknown DataFormat: " << data_format; \
const int out_w,
const bool align_corners,
const string& data_format,
const float16* x,
float16* y,
CUDAContext* ctx) {
auto nthreads = N * C * out_h * out_w;
auto scale_h = ComputeScale(H, out_h, align_corners);
auto scale_w = ComputeScale(W, out_w, align_corners);
if (data_format == "NCHW") {
_ResizeLinearNCHW<<<
CUDA_BLOCKS(nthreads),
CUDA_THREADS,
0,
ctx->cuda_stream()>>>(
nthreads,
C,
H,
W,
out_h,
out_w,
scale_h,
scale_w,
align_corners,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
} else if (data_format == "NHWC") {
_ResizeLinearNHWC<<<
CUDA_BLOCKS(nthreads),
CUDA_THREADS,
0,
ctx->cuda_stream()>>>(
nthreads,
C,
H,
W,
out_h,
out_w,
scale_h,
scale_w,
align_corners,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
} else {
LOG(FATAL) << "Unknown data format: " << data_format;
}
}
template <>
void ResizeLinearGrad<float16, CUDAContext>(
const int N,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const bool align_corners,
const string& data_format,
const float16* dy,
float* dx,
CUDAContext* ctx) {
auto nthreads = N * C * out_h * out_w;
auto scale_h = ComputeScale(H, out_h, align_corners);
auto scale_w = ComputeScale(W, out_w, align_corners);
math::Set(N * C * H * W, 0.f, dx, ctx);
if (data_format == "NCHW") {
_ResizeLinearGradNCHW<<<
CUDA_BLOCKS(nthreads),
CUDA_THREADS,
0,
ctx->cuda_stream()>>>(
nthreads,
C,
H,
W,
out_h,
out_w,
scale_h,
scale_w,
align_corners,
reinterpret_cast<const half*>(dy),
dx);
} else if (data_format == "NHWC") {
_ResizeLinearGradNHWC<<<
CUDA_BLOCKS(nthreads),
CUDA_THREADS,
0,
ctx->cuda_stream()>>>(
nthreads,
C,
H,
W,
out_h,
out_w,
scale_h,
scale_w,
align_corners,
reinterpret_cast<const half*>(dy),
dx);
} else {
LOG(FATAL) << "Unknown data format: " << data_format;
} }
}
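The per-format if/else ladder deleted above is what the new DISPATCH_RESIZE_KERNEL macro factors out: name##NCHW / name##NHWC token-pasting picks the kernel, and __VA_ARGS__ forwards its argument list unchanged. A compilable miniature of the same dispatch idea, with illustrative kernel names and a simplified launch configuration:

#include <string>

__global__ void ScaleNCHW(const int n, float* y) { /* layout-specific body */ }
__global__ void ScaleNHWC(const int n, float* y) { /* layout-specific body */ }

#define DISPATCH_RESIZE_KERNEL(name, nblocks, nthreads, ...) \
  if (data_format == "NCHW") {                               \
    name##NCHW<<<nblocks, nthreads>>>(__VA_ARGS__);          \
  } else if (data_format == "NHWC") {                        \
    name##NHWC<<<nblocks, nthreads>>>(__VA_ARGS__);          \
  }

void LaunchScale(const std::string& data_format, int n, float* y) {
  DISPATCH_RESIZE_KERNEL(Scale, (n + 255) / 256, 256, n, y);
}
#undef DISPATCH_RESIZE_KERNEL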
#define DEFINE_KERNEL_LAUNCHER(T) \ #define DEFINE_KERNEL_LAUNCHER(T, ScalarT) \
template <> \ template <> \
void ResizeLinear<T, CUDAContext>( \ void ResizeLinear<T, CUDAContext>( \
const int N, \ const int N, \
...@@ -495,29 +230,11 @@ void ResizeLinearGrad<float16, CUDAContext>( ...@@ -495,29 +230,11 @@ void ResizeLinearGrad<float16, CUDAContext>(
auto nthreads = N * C * out_h * out_w; \ auto nthreads = N * C * out_h * out_w; \
auto scale_h = ComputeScale(H, out_h, align_corners); \ auto scale_h = ComputeScale(H, out_h, align_corners); \
auto scale_w = ComputeScale(W, out_w, align_corners); \ auto scale_w = ComputeScale(W, out_w, align_corners); \
if (data_format == "NCHW") { \ DISPATCH_RESIZE_KERNEL( \
_ResizeLinearNCHW<<< \ _ResizeLinear, \
CUDA_BLOCKS(nthreads), \ ScalarT, \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>( \
nthreads, \
C, \
H, \
W, \
out_h, \
out_w, \
scale_h, \
scale_w, \
align_corners, \
x, \
y); \
} else if (data_format == "NHWC") { \
_ResizeLinearNHWC<<< \
CUDA_BLOCKS(nthreads), \ CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \ CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>( \
nthreads, \ nthreads, \
C, \ C, \
H, \ H, \
...@@ -527,14 +244,11 @@ void ResizeLinearGrad<float16, CUDAContext>( ...@@ -527,14 +244,11 @@ void ResizeLinearGrad<float16, CUDAContext>(
scale_h, \ scale_h, \
scale_w, \ scale_w, \
align_corners, \ align_corners, \
x, \ reinterpret_cast<const ScalarT*>(x), \
y); \ reinterpret_cast<ScalarT*>(y)); \
} else { \
LOG(FATAL) << "Unknown data format: " << data_format; \
} \
} }
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \ #define DEFINE_GRAD_KERNEL_LAUNCHER(T, ScalarT) \
template <> \ template <> \
void ResizeLinearGrad<T, CUDAContext>( \ void ResizeLinearGrad<T, CUDAContext>( \
const int N, \ const int N, \
...@@ -552,12 +266,11 @@ void ResizeLinearGrad<float16, CUDAContext>( ...@@ -552,12 +266,11 @@ void ResizeLinearGrad<float16, CUDAContext>(
auto scale_h = ComputeScale(H, out_h, align_corners); \ auto scale_h = ComputeScale(H, out_h, align_corners); \
auto scale_w = ComputeScale(W, out_w, align_corners); \ auto scale_w = ComputeScale(W, out_w, align_corners); \
math::Set(N* C* H* W, 0.f, dx, ctx); \ math::Set(N* C* H* W, 0.f, dx, ctx); \
if (data_format == "NCHW") { \ DISPATCH_RESIZE_KERNEL( \
_ResizeLinearGradNCHW<<< \ _ResizeLinearGrad, \
ScalarT, \
CUDA_BLOCKS(nthreads), \ CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \ CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>( \
nthreads, \ nthreads, \
C, \ C, \
H, \ H, \
...@@ -567,42 +280,23 @@ void ResizeLinearGrad<float16, CUDAContext>( ...@@ -567,42 +280,23 @@ void ResizeLinearGrad<float16, CUDAContext>(
scale_h, \ scale_h, \
scale_w, \ scale_w, \
align_corners, \ align_corners, \
dy, \ reinterpret_cast<const ScalarT*>(dy), \
dx); \ dx); \
} else if (data_format == "NHWC") { \
_ResizeLinearGradNHWC<<< \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>( \
nthreads, \
C, \
H, \
W, \
out_h, \
out_w, \
scale_h, \
scale_w, \
align_corners, \
dy, \
dx); \
} else { \
LOG(FATAL) << "Unknown data format: " << data_format; \
} \
} }
DEFINE_KERNEL_LAUNCHER(int8_t); DEFINE_KERNEL_LAUNCHER(int8_t, int8_t);
DEFINE_KERNEL_LAUNCHER(uint8_t); DEFINE_KERNEL_LAUNCHER(uint8_t, uint8_t);
DEFINE_KERNEL_LAUNCHER(int); DEFINE_KERNEL_LAUNCHER(int, int);
DEFINE_KERNEL_LAUNCHER(int64_t); DEFINE_KERNEL_LAUNCHER(int64_t, int64_t);
DEFINE_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(float16, half);
DEFINE_KERNEL_LAUNCHER(double); DEFINE_KERNEL_LAUNCHER(float, float);
DEFINE_KERNEL_LAUNCHER(double, double);
DEFINE_GRAD_KERNEL_LAUNCHER(float); DEFINE_GRAD_KERNEL_LAUNCHER(float16, half);
DEFINE_GRAD_KERNEL_LAUNCHER(double); DEFINE_GRAD_KERNEL_LAUNCHER(float, float);
DEFINE_GRAD_KERNEL_LAUNCHER(double, double);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER #undef DEFINE_GRAD_KERNEL_LAUNCHER
#undef DISPATCH_RESIZE_KERNEL
} // namespace kernel } // namespace kernel
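ComputeScale maps output coordinates back into input space. Its body is not shown in this diff, but the convention such resize ops commonly follow is scale = (in - 1) / (out - 1) when align_corners is set, so the first and last samples coincide exactly, and scale = in / out otherwise; a sketch under that assumption:

// Assumed align_corners convention for resize scales.
inline float ComputeScaleRef(int in, int out, bool align_corners) {
  if (align_corners) return out > 1 ? float(in - 1) / float(out - 1) : 0.f;
  return float(in) / float(out);
}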
......
...@@ -181,10 +181,8 @@ DEFINE_KERNEL_LAUNCHER(int64_t); ...@@ -181,10 +181,8 @@ DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float16); DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double); DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float); DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double); DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
...@@ -10,6 +10,14 @@ namespace kernel { ...@@ -10,6 +10,14 @@ namespace kernel {
namespace { namespace {
#if __CUDA_ARCH__ >= 350
#define LDG(x, i) __ldg(x + i)
#define LDG2(x, i) convert::To<float>(__ldg(x + i))
#else
#define LDG(x, i) x[i]
#define LDG2(x, i) convert::To<float>(x[i])
#endif
template <typename T> template <typename T>
__global__ void _ResizeNearestNCHW( __global__ void _ResizeNearestNCHW(
const int nthreads, const int nthreads,
...@@ -29,11 +37,7 @@ __global__ void _ResizeNearestNCHW( ...@@ -29,11 +37,7 @@ __global__ void _ResizeNearestNCHW(
const int n = yi / out_w / out_h / C; const int n = yi / out_w / out_h / C;
const int h_in = min(int(h * scale_h), H - 1); const int h_in = min(int(h * scale_h), H - 1);
const int w_in = min(int(w * scale_w), W - 1); const int w_in = min(int(w * scale_w), W - 1);
#if __CUDA_ARCH__ >= 350 y[yi] = LDG(x, (((n * C + c) * H + h_in) * W + w_in));
y[yi] = __ldg(x + (((n * C + c) * H + h_in) * W + w_in));
#else
y[yi] = x[((n * C + c) * H + h_in) * W + w_in];
#endif
} }
} }
...@@ -56,11 +60,7 @@ __global__ void _ResizeNearestNHWC( ...@@ -56,11 +60,7 @@ __global__ void _ResizeNearestNHWC(
const int n = yi / C / out_w / out_h; const int n = yi / C / out_w / out_h;
const int h_in = min(int(h * scale_h), H - 1); const int h_in = min(int(h * scale_h), H - 1);
const int w_in = min(int(w * scale_w), W - 1); const int w_in = min(int(w * scale_w), W - 1);
#if __CUDA_ARCH__ >= 350 y[yi] = LDG(x, (((n * H + h_in) * W + w_in) * C + c));
y[yi] = __ldg(x + (((n * H + h_in) * W + w_in) * C + c));
#else
y[yi] = x[((n * H + h_in) * W + w_in) * C + c];
#endif
} }
} }
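Nearest-neighbor resize needs no interpolation at all: each output pixel reads the input pixel its scaled coordinate truncates to, clamped at the border, which is why these kernels reduce to a single LDG per element. The index mapping alone:

#include <algorithm>

// Map an output index to its nearest-neighbor source index.
inline int NearestIndex(int out_idx, float scale, int in_size) {
  return std::min(int(out_idx * scale), in_size - 1);  // truncate + clamp
}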
...@@ -83,39 +83,7 @@ __global__ void _ResizeNearestGradNCHW( ...@@ -83,39 +83,7 @@ __global__ void _ResizeNearestGradNCHW(
const int n = yi / out_w / out_h / C; const int n = yi / out_w / out_h / C;
const int h_in = min(int(h * scale_h), H - 1); const int h_in = min(int(h * scale_h), H - 1);
const int w_in = min(int(w * scale_w), W - 1); const int w_in = min(int(w * scale_w), W - 1);
#if __CUDA_ARCH__ >= 350 atomicAdd(&dx[((n * C + c) * H + h_in) * W + w_in], LDG2(dy, yi));
atomicAdd(&dx[((n * C + c) * H + h_in) * W + w_in], (float)__ldg(dy + yi));
#else
atomicAdd(&dx[((n * C + c) * H + h_in) * W + w_in], (float)dy[yi]);
#endif
}
}
template <>
__global__ void _ResizeNearestGradNCHW<half>(
const int nthreads,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const float scale_h,
const float scale_w,
const half* dy,
float* dx) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int w = yi % out_w;
const int h = (yi / out_w) % out_h;
const int c = (yi / out_w / out_h) % C;
const int n = yi / out_w / out_h / C;
const int h_in = min(int(h * scale_h), H - 1);
const int w_in = min(int(w * scale_w), W - 1);
#if __CUDA_ARCH__ >= 350
atomicAdd(
&dx[((n * C + c) * H + h_in) * W + w_in], __half2float(__ldg(dy + yi)));
#else
atomicAdd(&dx[((n * C + c) * H + h_in) * W + w_in], __half2float(dy[yi]));
#endif
} }
} }
...@@ -138,152 +106,27 @@ __global__ void _ResizeNearestGradNHWC( ...@@ -138,152 +106,27 @@ __global__ void _ResizeNearestGradNHWC(
const int n = yi / C / out_w / out_h; const int n = yi / C / out_w / out_h;
const int h_in = min(int(h * scale_h), H - 1); const int h_in = min(int(h * scale_h), H - 1);
const int w_in = min(int(w * scale_w), W - 1); const int w_in = min(int(w * scale_w), W - 1);
#if __CUDA_ARCH__ >= 350 atomicAdd(&dx[((n * H + h_in) * W + w_in) * C + c], LDG2(dy, yi));
atomicAdd(&dx[((n * H + h_in) * W + w_in) * C + c], (float)__ldg(dy + yi));
#else
atomicAdd(&dx[((n * H + h_in) * W + w_in) * C + c], (float)dy[yi]);
#endif
} }
} }
template <> #undef LDG
__global__ void _ResizeNearestGradNHWC<half>( #undef LDG2
const int nthreads,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const float scale_h,
const float scale_w,
const half* dy,
float* dx) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int c = yi % C;
const int w = (yi / C) % out_w;
const int h = (yi / C / out_w) % out_h;
const int n = yi / C / out_w / out_h;
const int h_in = min(int(h * scale_h), H - 1);
const int w_in = min(int(w * scale_w), W - 1);
#if __CUDA_ARCH__ >= 350
atomicAdd(
&dx[((n * H + h_in) * W + w_in) * C + c], __half2float(__ldg(dy + yi)));
#else
atomicAdd(&dx[((n * H + h_in) * W + w_in) * C + c], __half2float(dy[yi]));
#endif
}
}
} // namespace } // namespace
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
template <> #define DISPATCH_RESIZE_KERNEL(name, T, nblocks, nthreads, ...) \
void ResizeNearest<float16, CUDAContext>( if (data_format == "NCHW") { \
const int N, name##NCHW<<<nblocks, nthreads, 0, ctx->cuda_stream()>>>(__VA_ARGS__); \
const int C, } else if (data_format == "NHWC") { \
const int H, name##NHWC<<<nblocks, nthreads, 0, ctx->cuda_stream()>>>(__VA_ARGS__); \
const int W, } else { \
const int out_h, LOG(FATAL) << "Unknown DataFormat: " << data_format; \
const int out_w,
const string& data_format,
const float16* x,
float16* y,
CUDAContext* ctx) {
auto nthreads = N * C * out_h * out_w;
auto scale_h = (float)H / (float)out_h;
auto scale_w = (float)W / (float)out_w;
if (data_format == "NCHW") {
_ResizeNearestNCHW<<<
CUDA_BLOCKS(nthreads),
CUDA_THREADS,
0,
ctx->cuda_stream()>>>(
nthreads,
C,
H,
W,
out_h,
out_w,
scale_h,
scale_w,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
} else if (data_format == "NHWC") {
_ResizeNearestNHWC<<<
CUDA_BLOCKS(nthreads),
CUDA_THREADS,
0,
ctx->cuda_stream()>>>(
nthreads,
C,
H,
W,
out_h,
out_w,
scale_h,
scale_w,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
} else {
LOG(FATAL) << "Unknown data format: " << data_format;
}
}
template <>
void ResizeNearestGrad<float16, CUDAContext>(
const int N,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const string& data_format,
const float16* dy,
float* dx,
CUDAContext* ctx) {
auto nthreads = N * C * out_h * out_w;
auto scale_h = (float)H / (float)out_h;
auto scale_w = (float)W / (float)out_w;
math::Set(N * C * H * W, 0.f, dx, ctx);
if (data_format == "NCHW") {
_ResizeNearestGradNCHW<<<
CUDA_BLOCKS(nthreads),
CUDA_THREADS,
0,
ctx->cuda_stream()>>>(
nthreads,
C,
H,
W,
out_h,
out_w,
scale_h,
scale_w,
reinterpret_cast<const half*>(dy),
dx);
} else if (data_format == "NHWC") {
_ResizeNearestGradNHWC<<<
CUDA_BLOCKS(nthreads),
CUDA_THREADS,
0,
ctx->cuda_stream()>>>(
nthreads,
C,
H,
W,
out_h,
out_w,
scale_h,
scale_w,
reinterpret_cast<const half*>(dy),
dx);
} else {
LOG(FATAL) << "Unknown data format: " << data_format;
} }
}
#define DEFINE_KERNEL_LAUNCHER(T) \ #define DEFINE_KERNEL_LAUNCHER(T, ScalarT) \
template <> \ template <> \
void ResizeNearest<T, CUDAContext>( \ void ResizeNearest<T, CUDAContext>( \
const int N, \ const int N, \
...@@ -299,26 +142,24 @@ void ResizeNearestGrad<float16, CUDAContext>( ...@@ -299,26 +142,24 @@ void ResizeNearestGrad<float16, CUDAContext>(
auto nthreads = N * C * out_h * out_w; \ auto nthreads = N * C * out_h * out_w; \
auto scale_h = (float)H / (float)out_h; \ auto scale_h = (float)H / (float)out_h; \
auto scale_w = (float)W / (float)out_w; \ auto scale_w = (float)W / (float)out_w; \
if (data_format == "NCHW") { \ DISPATCH_RESIZE_KERNEL( \
_ResizeNearestNCHW<<< \ _ResizeNearest, \
ScalarT, \
CUDA_BLOCKS(nthreads), \ CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \ CUDA_THREADS, \
0, \ nthreads, \
ctx->cuda_stream()>>>( \ C, \
nthreads, C, H, W, out_h, out_w, scale_h, scale_w, x, y); \ H, \
} else if (data_format == "NHWC") { \ W, \
_ResizeNearestNHWC<<< \ out_h, \
CUDA_BLOCKS(nthreads), \ out_w, \
CUDA_THREADS, \ scale_h, \
0, \ scale_w, \
ctx->cuda_stream()>>>( \ reinterpret_cast<const ScalarT*>(x), \
nthreads, C, H, W, out_h, out_w, scale_h, scale_w, x, y); \ reinterpret_cast<ScalarT*>(y)); \
} else { \
LOG(FATAL) << "Unknown data format: " << data_format; \
} \
} }
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \ #define DEFINE_GRAD_KERNEL_LAUNCHER(T, ScalarT) \
template <> \ template <> \
void ResizeNearestGrad<T, CUDAContext>( \ void ResizeNearestGrad<T, CUDAContext>( \
const int N, \ const int N, \
...@@ -335,37 +176,36 @@ void ResizeNearestGrad<float16, CUDAContext>( ...@@ -335,37 +176,36 @@ void ResizeNearestGrad<float16, CUDAContext>(
auto scale_h = (float)H / (float)out_h; \ auto scale_h = (float)H / (float)out_h; \
auto scale_w = (float)W / (float)out_w; \ auto scale_w = (float)W / (float)out_w; \
math::Set(N* C* H* W, 0.f, dx, ctx); \ math::Set(N* C* H* W, 0.f, dx, ctx); \
if (data_format == "NCHW") { \ DISPATCH_RESIZE_KERNEL( \
_ResizeNearestGradNCHW<<< \ _ResizeNearestGrad, \
ScalarT, \
CUDA_BLOCKS(nthreads), \ CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \ CUDA_THREADS, \
0, \ nthreads, \
ctx->cuda_stream()>>>( \ C, \
nthreads, C, H, W, out_h, out_w, scale_h, scale_w, dy, dx); \ H, \
} else if (data_format == "NHWC") { \ W, \
_ResizeNearestGradNHWC<<< \ out_h, \
CUDA_BLOCKS(nthreads), \ out_w, \
CUDA_THREADS, \ scale_h, \
0, \ scale_w, \
ctx->cuda_stream()>>>( \ reinterpret_cast<const ScalarT*>(dy), \
nthreads, C, H, W, out_h, out_w, scale_h, scale_w, dy, dx); \ dx); \
} else { \
LOG(FATAL) << "Unknown data format: " << data_format; \
} \
} }
DEFINE_KERNEL_LAUNCHER(int8_t); DEFINE_KERNEL_LAUNCHER(int8_t, int8_t);
DEFINE_KERNEL_LAUNCHER(uint8_t); DEFINE_KERNEL_LAUNCHER(uint8_t, uint8_t);
DEFINE_KERNEL_LAUNCHER(int); DEFINE_KERNEL_LAUNCHER(int, int);
DEFINE_KERNEL_LAUNCHER(int64_t); DEFINE_KERNEL_LAUNCHER(int64_t, int64_t);
DEFINE_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(float16, half);
DEFINE_KERNEL_LAUNCHER(double); DEFINE_KERNEL_LAUNCHER(float, float);
DEFINE_KERNEL_LAUNCHER(double, double);
DEFINE_GRAD_KERNEL_LAUNCHER(float); DEFINE_GRAD_KERNEL_LAUNCHER(float16, half);
DEFINE_GRAD_KERNEL_LAUNCHER(double); DEFINE_GRAD_KERNEL_LAUNCHER(float, float);
DEFINE_GRAD_KERNEL_LAUNCHER(double, double);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER #undef DEFINE_GRAD_KERNEL_LAUNCHER
#undef DISPATCH_RESIZE_KERNEL
} // namespace kernel } // namespace kernel
......
...@@ -185,11 +185,9 @@ void RoiAlign<float16, CPUContext>( ...@@ -185,11 +185,9 @@ void RoiAlign<float16, CPUContext>(
DEFINE_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double); DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float16); DEFINE_GRAD_KERNEL_LAUNCHER(float16);
DEFINE_GRAD_KERNEL_LAUNCHER(float); DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double); DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
...@@ -10,11 +10,9 @@ namespace kernel { ...@@ -10,11 +10,9 @@ namespace kernel {
namespace { namespace {
#if __CUDA_ARCH__ >= 350 #if __CUDA_ARCH__ >= 350
#define LOAD(x, i) __ldg(x + i) #define LDG(x, i) convert::To<float>(__ldg(x + i))
#define LOADF(x, i) __half2float(__ldg(x + i))
#else #else
#define LOAD(x, i) x[i] #define LDG(x, i) convert::To<float>(x[i])
#define LOADF(x, i) __half2float(x[i])
#endif #endif
template <typename T> template <typename T>
...@@ -42,48 +40,10 @@ _RoiAlignIntp(const int H, const int W, float h, float w, const T* x) { ...@@ -42,48 +40,10 @@ _RoiAlignIntp(const int H, const int W, float h, float w, const T* x) {
w = (float)li; w = (float)li;
} }
const float tl = LOAD(x, (ti * W + li)); const float tl = LDG(x, (ti * W + li));
const float tr = LOAD(x, (ti * W + ri)); const float tr = LDG(x, (ti * W + ri));
const float bl = LOAD(x, (bi * W + li)); const float bl = LDG(x, (bi * W + li));
const float br = LOAD(x, (bi * W + ri)); const float br = LDG(x, (bi * W + ri));
const float v = h - ti;
const float u = w - li;
const float t = tl + (tr - tl) * u;
const float b = bl + (br - bl) * u;
return t + (b - t) * v;
}
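_RoiAlignIntp is the same bilinear lerp as in the resize kernels, plus RoiAlign's border policy: samples more than one pixel outside the feature map contribute zero, samples in the [-1, 0] band are snapped onto the border, and the bottom/right neighbor indices are pinned so they never leave the map. That clamping logic, isolated for one coordinate:

// Border handling for one RoiAlign sample coordinate (len = H or W).
// Returns false when the sample is too far outside to contribute.
inline bool ClampSample(float& h, int len, int& lo, int& hi) {
  if (h < -1.f || h > float(len)) return false;  // fully outside: zero
  if (h <= 0.f) h = 0.f;                         // snap the [-1, 0] band
  lo = int(h);
  if (lo < len - 1) {
    hi = lo + 1;
  } else {
    lo = hi = len - 1;  // pin both neighbors to the last row/column
    h = float(lo);
  }
  return true;
}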
template <>
__device__ float
_RoiAlignIntp<half>(const int H, const int W, float h, float w, const half* x) {
if (h < -1.f || h > H || w < -1.f || w > W) return 0.f;
if (h <= 0.f) h = 0.f;
if (w <= 0.f) w = 0.f;
int ti = (int)h, bi;
int li = (int)w, ri;
if (ti < H - 1) {
bi = ti + 1;
} else {
ti = bi = H - 1;
h = (float)ti;
}
if (li < W - 1) {
ri = li + 1;
} else {
ri = li = W - 1;
w = (float)li;
}
const float tl = LOADF(x, (ti * W + li));
const float tr = LOADF(x, (ti * W + ri));
const float bl = LOADF(x, (bi * W + li));
const float br = LOADF(x, (bi * W + ri));
const float v = h - ti; const float v = h - ti;
const float u = w - li; const float u = w - li;
...@@ -133,7 +93,7 @@ __device__ void _RoiAlignIntpParam( ...@@ -133,7 +93,7 @@ __device__ void _RoiAlignIntpParam(
u = w - li; u = w - li;
} }
template <typename T> template <typename T, typename AccT>
__global__ void _RoiAlign( __global__ void _RoiAlign(
const int nthreads, const int nthreads,
const int C, const int C,
...@@ -156,7 +116,7 @@ __global__ void _RoiAlign( ...@@ -156,7 +116,7 @@ __global__ void _RoiAlign(
const int batch_ind = roi[0]; const int batch_ind = roi[0];
if (batch_ind < 0) { if (batch_ind < 0) {
y[yi] = T(0); y[yi] = convert::To<T>(0.f);
continue; continue;
} }
...@@ -180,67 +140,7 @@ __global__ void _RoiAlign( ...@@ -180,67 +140,7 @@ __global__ void _RoiAlign(
const T* offset_x = x + (batch_ind * C + c) * H * W; const T* offset_x = x + (batch_ind * C + c) * H * W;
T val = T(0); AccT val = AccT(0);
for (int i = 0; i < grid_h; i++) {
const float h = hstart + (i + .5f) * bin_h / grid_h;
for (int j = 0; j < grid_w; j++) {
const float w = wstart + (j + .5f) * bin_w / grid_w;
val += _RoiAlignIntp(H, W, h, w, offset_x);
}
}
y[yi] = val / T(grid_h * grid_w);
}
}
template <>
__global__ void _RoiAlign<half>(
const int nthreads,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const float spatial_scale,
const int sampling_ratio,
const half* x,
const float* rois,
half* y) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int ow = yi % out_w;
const int oh = (yi / out_w) % out_h;
const int c = (yi / out_w / out_h) % C;
const int n = yi / out_w / out_h / C;
const float* roi = rois + n * 5;
const int batch_ind = roi[0];
if (batch_ind < 0) {
y[yi] = __float2half(0.f);
continue;
}
const float roi_start_w = roi[1] * spatial_scale;
const float roi_start_h = roi[2] * spatial_scale;
const float roi_end_w = roi[3] * spatial_scale;
const float roi_end_h = roi[4] * spatial_scale;
const float roi_w = max(roi_end_w - roi_start_w, 1.f);
const float roi_h = max(roi_end_h - roi_start_h, 1.f);
const float bin_h = roi_h / (float)out_h;
const float bin_w = roi_w / (float)out_w;
const int grid_h =
sampling_ratio > 0 ? sampling_ratio : ceil(roi_h / out_h);
const int grid_w =
sampling_ratio > 0 ? sampling_ratio : ceil(roi_w / out_w);
const float hstart = roi_start_h + oh * bin_h;
const float wstart = roi_start_w + ow * bin_w;
const half* offset_x = x + (batch_ind * C + c) * H * W;
float val = 0.f;
for (int i = 0; i < grid_h; i++) { for (int i = 0; i < grid_h; i++) {
const float h = hstart + (i + .5f) * bin_h / grid_h; const float h = hstart + (i + .5f) * bin_h / grid_h;
for (int j = 0; j < grid_w; j++) { for (int j = 0; j < grid_w; j++) {
...@@ -249,11 +149,11 @@ __global__ void _RoiAlign<half>( ...@@ -249,11 +149,11 @@ __global__ void _RoiAlign<half>(
} }
} }
y[yi] = __float2half(val / float(grid_h * grid_w)); y[yi] = convert::To<T>(val / AccT(grid_h * grid_w));
} }
} }
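Each RoiAlign output bin averages a small grid of bilinear samples. A positive sampling_ratio fixes the grid edge explicitly; otherwise it adapts to ceil(bin size), which keeps roughly one sample per covered input pixel. The sample-placement arithmetic by itself:

#include <cmath>

// Samples along one bin edge, and the center of the k-th sub-cell.
inline int GridSize(int sampling_ratio, float roi_len, int out_len) {
  return sampling_ratio > 0 ? sampling_ratio
                            : int(std::ceil(roi_len / out_len));
}
inline float SampleCoord(float start, float bin, int k, int grid) {
  return start + (k + .5f) * bin / grid;  // sub-cell centers, not edges
}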
template <typename T> template <typename T, typename AccT>
__global__ void _RoiAlignGrad( __global__ void _RoiAlignGrad(
const int nthreads, const int nthreads,
const int C, const int C,
...@@ -265,7 +165,7 @@ __global__ void _RoiAlignGrad( ...@@ -265,7 +165,7 @@ __global__ void _RoiAlignGrad(
const int sampling_ratio, const int sampling_ratio,
const T* dy, const T* dy,
const float* rois, const float* rois,
float* dx) { AccT* dx) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) { CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int ow = yi % out_w; const int ow = yi % out_w;
const int oh = (yi / out_w) % out_h; const int oh = (yi / out_w) % out_h;
...@@ -295,7 +195,7 @@ __global__ void _RoiAlignGrad( ...@@ -295,7 +195,7 @@ __global__ void _RoiAlignGrad(
const float hstart = roi_start_h + oh * bin_h; const float hstart = roi_start_h + oh * bin_h;
const float wstart = roi_start_w + ow * bin_w; const float wstart = roi_start_w + ow * bin_w;
const float dyi = dy[yi] / T(grid_h * grid_w); const float dyi = convert::To<float>(dy[yi]) / float(grid_h * grid_w);
float* offset_dx = dx + (batch_ind * C + c) * H * W; float* offset_dx = dx + (batch_ind * C + c) * H * W;
for (int i = 0; i < grid_h; i++) { for (int i = 0; i < grid_h; i++) {
...@@ -318,137 +218,13 @@ __global__ void _RoiAlignGrad( ...@@ -318,137 +218,13 @@ __global__ void _RoiAlignGrad(
} }
} }
template <> #undef LDG
__global__ void _RoiAlignGrad<half>(
const int nthreads,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const float spatial_scale,
const int sampling_ratio,
const half* dy,
const float* rois,
float* dx) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int ow = yi % out_w;
const int oh = (yi / out_w) % out_h;
const int c = (yi / out_w / out_h) % C;
const int n = yi / out_w / out_h / C;
const float* roi = rois + n * 5;
const int batch_ind = roi[0];
if (batch_ind < 0) continue;
const float roi_start_w = roi[1] * spatial_scale;
const float roi_start_h = roi[2] * spatial_scale;
const float roi_end_w = roi[3] * spatial_scale;
const float roi_end_h = roi[4] * spatial_scale;
const float roi_w = max(roi_end_w - roi_start_w, 1.f);
const float roi_h = max(roi_end_h - roi_start_h, 1.f);
const float bin_h = roi_h / (float)out_h;
const float bin_w = roi_w / (float)out_w;
const int grid_h =
sampling_ratio > 0 ? sampling_ratio : ceil(roi_h / out_h);
const int grid_w =
sampling_ratio > 0 ? sampling_ratio : ceil(roi_w / out_w);
const float hstart = roi_start_h + oh * bin_h;
const float wstart = roi_start_w + ow * bin_w;
const float dyi = __half2float(dy[yi]) / float(grid_h * grid_w);
float* offset_dx = dx + (batch_ind * C + c) * H * W;
for (int i = 0; i < grid_h; i++) {
const float h = hstart + (i + .5f) * bin_h / grid_h;
for (int j = 0; j < grid_w; j++) {
const float w = wstart + (j + .5f) * bin_w / grid_w;
int ti, bi, li, ri;
float v, u;
_RoiAlignIntpParam(H, W, h, w, ti, bi, li, ri, v, u);
if (li >= 0 && ri >= 0 && ti >= 0 && bi >= 0) {
const float db = dyi * v;
const float dt = dyi * (1.f - v);
atomicAdd(offset_dx + ti * W + li, (1.f - u) * dt);
atomicAdd(offset_dx + ti * W + ri, u * dt);
atomicAdd(offset_dx + bi * W + li, (1.f - u) * db);
atomicAdd(offset_dx + bi * W + ri, u * db);
}
} // End j
} // End i
}
}
#undef LOAD
#undef LOADF
} // namespace } // namespace
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
template <> #define DEFINE_KERNEL_LAUNCHER(T, ScalarT) \
void RoiAlign<float16, CUDAContext>(
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const int num_rois,
const float spatial_scale,
const int sampling_ratio,
const float16* x,
const float* rois,
float16* y,
CUDAContext* ctx) {
auto nthreads = num_rois * C * out_h * out_w;
_RoiAlign<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
nthreads,
C,
H,
W,
out_h,
out_w,
spatial_scale,
sampling_ratio,
reinterpret_cast<const half*>(x),
rois,
reinterpret_cast<half*>(y));
}
template <>
void RoiAlignGrad<float16, CUDAContext>(
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const int num_rois,
const float spatial_scale,
const int sampling_ratio,
const float16* dy,
const float* rois,
float* dx,
CUDAContext* ctx) {
auto nthreads = num_rois * C * out_h * out_w;
_RoiAlignGrad<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
nthreads,
C,
H,
W,
out_h,
out_w,
spatial_scale,
sampling_ratio,
reinterpret_cast<const half*>(dy),
rois,
dx);
}
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \ template <> \
void RoiAlign<T, CUDAContext>( \ void RoiAlign<T, CUDAContext>( \
const int C, \ const int C, \
...@@ -464,7 +240,8 @@ void RoiAlignGrad<float16, CUDAContext>( ...@@ -464,7 +240,8 @@ void RoiAlignGrad<float16, CUDAContext>(
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
auto nthreads = num_rois * C * out_h * out_w; \ auto nthreads = num_rois * C * out_h * out_w; \
_RoiAlign<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ _RoiAlign<ScalarT, float> \
<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
nthreads, \ nthreads, \
C, \ C, \
H, \ H, \
...@@ -473,12 +250,12 @@ void RoiAlignGrad<float16, CUDAContext>( ...@@ -473,12 +250,12 @@ void RoiAlignGrad<float16, CUDAContext>(
out_w, \ out_w, \
spatial_scale, \ spatial_scale, \
sampling_ratio, \ sampling_ratio, \
x, \ reinterpret_cast<const ScalarT*>(x), \
rois, \ rois, \
y); \ reinterpret_cast<ScalarT*>(y)); \
} }
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \ #define DEFINE_GRAD_KERNEL_LAUNCHER(T, ScalarT) \
template <> \ template <> \
void RoiAlignGrad<T, CUDAContext>( \ void RoiAlignGrad<T, CUDAContext>( \
const int C, \ const int C, \
...@@ -507,17 +284,17 @@ void RoiAlignGrad<float16, CUDAContext>( ...@@ -507,17 +284,17 @@ void RoiAlignGrad<float16, CUDAContext>(
out_w, \ out_w, \
spatial_scale, \ spatial_scale, \
sampling_ratio, \ sampling_ratio, \
dy, \ reinterpret_cast<const ScalarT*>(dy), \
rois, \ rois, \
dx); \ dx); \
} }
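The (T, ScalarT) pairs used below exist because the framework's host-side float16 and CUDA's device-side half are distinct types with identical storage, so the launchers reinterpret_cast at the boundary and the kernel templates are written purely against the device type. The idea in miniature, assuming a bit-compatible 16-bit wrapper:

#include <cstdint>
#include <cuda_fp16.h>

struct float16 { std::uint16_t x; };  // assumed host-side storage type
static_assert(sizeof(float16) == sizeof(__half), "must be layout-compatible");

__global__ void HalveKernel(const int n, const __half* x, __half* y) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  // Round-trip through float so the kernel also builds for pre-5.3 archs.
  if (i < n) y[i] = __float2half(__half2float(x[i]) * 0.5f);
}

void Halve(const int n, const float16* x, float16* y) {
  HalveKernel<<<(n + 255) / 256, 256>>>(
      n, reinterpret_cast<const __half*>(x), reinterpret_cast<__half*>(y));
}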
DEFINE_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(float16, half);
DEFINE_KERNEL_LAUNCHER(double); DEFINE_KERNEL_LAUNCHER(float, float);
DEFINE_KERNEL_LAUNCHER(double, double);
DEFINE_GRAD_KERNEL_LAUNCHER(float); DEFINE_GRAD_KERNEL_LAUNCHER(float16, half);
DEFINE_GRAD_KERNEL_LAUNCHER(double); DEFINE_GRAD_KERNEL_LAUNCHER(float, float);
DEFINE_GRAD_KERNEL_LAUNCHER(double, double);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
...@@ -149,11 +149,9 @@ void RoiPool<float16, CPUContext>( ...@@ -149,11 +149,9 @@ void RoiPool<float16, CPUContext>(
DEFINE_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double); DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float16); DEFINE_GRAD_KERNEL_LAUNCHER(float16);
DEFINE_GRAD_KERNEL_LAUNCHER(float); DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double); DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
...@@ -9,6 +9,12 @@ namespace kernel { ...@@ -9,6 +9,12 @@ namespace kernel {
namespace { namespace {
#if __CUDA_ARCH__ >= 350
#define LDG(x, i) __ldg(x + i)
#else
#define LDG(x, i) x[i]
#endif
template <typename T> template <typename T>
__global__ void _RoiPool( __global__ void _RoiPool(
const int nthreads, const int nthreads,
...@@ -22,6 +28,7 @@ __global__ void _RoiPool( ...@@ -22,6 +28,7 @@ __global__ void _RoiPool(
const float* rois, const float* rois,
int* mask, int* mask,
T* y) { T* y) {
auto Greater = math::GreaterFunctor<T>();
CUDA_1D_KERNEL_LOOP(yi, nthreads) { CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int ow = yi % out_w; const int ow = yi % out_w;
const int oh = (yi / out_w) % out_h; const int oh = (yi / out_w) % out_h;
...@@ -32,7 +39,7 @@ __global__ void _RoiPool( ...@@ -32,7 +39,7 @@ __global__ void _RoiPool(
const int batch_ind = roi[0]; const int batch_ind = roi[0];
if (batch_ind < 0) { if (batch_ind < 0) {
y[yi] = T(0); y[yi] = convert::To<T>(0.f);
mask[yi] = -1; mask[yi] = -1;
continue; continue;
} }
...@@ -60,119 +67,22 @@ __global__ void _RoiPool( ...@@ -60,119 +67,22 @@ __global__ void _RoiPool(
int max_idx = empty ? -1 : 0; int max_idx = empty ? -1 : 0;
const T* offset_x = x + (batch_ind * C + c) * H * W; const T* offset_x = x + (batch_ind * C + c) * H * W;
T val = empty ? T(0) : offset_x[0]; T val = empty ? convert::To<T>(0.f) : offset_x[0];
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
const int xi = h * W + w;
#if __CUDA_ARCH__ >= 350
if (__ldg(offset_x + xi) > val) {
val = __ldg(offset_x + xi);
max_idx = xi;
}
#else
if (offset_x[xi] > val) {
val = offset_x[xi];
max_idx = xi;
}
#endif
}
}
y[yi] = val;
mask[yi] = max_idx;
}
}
template <>
__global__ void _RoiPool<half>(
const int nthreads,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const float spatial_scale,
const half* x,
const float* rois,
int* mask,
half* y) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int ow = yi % out_w;
const int oh = (yi / out_w) % out_h;
const int c = (yi / out_w / out_h) % C;
const int n = yi / out_w / out_h / C;
const float* roi = rois + n * 5;
const int batch_ind = roi[0];
if (batch_ind < 0) {
y[yi] = __float2half(0.f);
mask[yi] = -1;
continue;
}
const int roi_start_w = round(roi[1] * spatial_scale);
const int roi_start_h = round(roi[2] * spatial_scale);
const int roi_end_w = round(roi[3] * spatial_scale);
const int roi_end_h = round(roi[4] * spatial_scale);
const int roi_w = max(roi_end_w - roi_start_w + 1, 1);
const int roi_h = max(roi_end_h - roi_start_h + 1, 1);
const float bin_h = (float)roi_h / (float)out_h;
const float bin_w = (float)roi_w / (float)out_w;
int hstart = floor(bin_h * oh);
int wstart = floor(bin_w * ow);
int hend = ceil(bin_h * (oh + 1));
int wend = ceil(bin_w * (ow + 1));
hstart = min(max(hstart + roi_start_h, 0), H);
hend = min(max(hend + roi_start_h, 0), H);
wstart = min(max(wstart + roi_start_w, 0), W);
wend = min(max(wend + roi_start_w, 0), W);
const bool empty = (hend <= hstart) || (wend <= wstart);
int max_idx = empty ? -1 : 0;
const half* offset_x = x + ((batch_ind * C + c) * H * W);
#if __CUDA_ARCH__ >= 530
half val = empty ? __float2half(0.f) : __ldg(offset_x);
#else
float val = empty ? 0.f : __half2float(*offset_x);
#endif
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
const int xi = h * W + w; const int xi = h * W + w;
#if __CUDA_ARCH__ >= 530 if (Greater(LDG(offset_x, xi), val)) {
if (__hgt(__ldg(offset_x + xi), val)) { val = LDG(offset_x, xi);
val = __ldg(offset_x + xi);
max_idx = xi; max_idx = xi;
} }
#elif __CUDA_ARCH__ >= 350
if (__half2float(__ldg(offset_x + xi)) > val) {
val = __half2float(__ldg(offset_x + xi));
max_idx = xi;
} }
#else
if (__half2float(offset_x[xi]) > val) {
val = __half2float(offset_x[xi]);
max_idx = xi;
}
#endif
} }
}
#if __CUDA_ARCH__ >= 530
y[yi] = val; y[yi] = val;
#else
y[yi] = __float2half(val);
#endif
mask[yi] = max_idx; mask[yi] = max_idx;
} }
} }
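_RoiPool is max pooling over the clipped bin window, and mask records the flat argmax for every output element; the backward pass below therefore never re-searches the window, it simply routes each dy[yi] to mask[yi] with one atomicAdd. The forward/backward contract in plain C++:

// Forward: max over a window, remembering the winning flat index.
inline float MaxWithArg(const float* x, int hstart, int hend,
                        int wstart, int wend, int W, int& max_idx) {
  const bool empty = hend <= hstart || wend <= wstart;
  max_idx = empty ? -1 : hstart * W + wstart;
  float val = empty ? 0.f : x[max_idx];
  for (int h = hstart; h < hend; ++h)
    for (int w = wstart; w < wend; ++w)
      if (x[h * W + w] > val) {
        val = x[h * W + w];
        max_idx = h * W + w;
      }
  return val;
}

// Backward: with the mask, the gradient is a pure scatter.
inline void PoolGradStep(float dy, int mask_idx, float* dx) {
  if (mask_idx != -1) dx[mask_idx] += dy;  // atomicAdd on device
}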
template <typename T> template <typename T, typename AccT>
__global__ void _RoiPoolGrad( __global__ void _RoiPoolGrad(
const int nthreads, const int nthreads,
const int C, const int C,
...@@ -184,7 +94,7 @@ __global__ void _RoiPoolGrad( ...@@ -184,7 +94,7 @@ __global__ void _RoiPoolGrad(
const T* dy, const T* dy,
const float* rois, const float* rois,
const int* mask, const int* mask,
float* dx) { AccT* dx) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) { CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int c = (yi / out_w / out_h) % C; const int c = (yi / out_w / out_h) % C;
const int n = yi / out_w / out_h / C; const int n = yi / out_w / out_h / C;
...@@ -193,116 +103,20 @@ __global__ void _RoiPoolGrad( ...@@ -193,116 +103,20 @@ __global__ void _RoiPoolGrad(
const int batch_ind = roi[0]; const int batch_ind = roi[0];
if (batch_ind < 0) continue; if (batch_ind < 0) continue;
float* offset_dx = dx + (batch_ind * C + c) * H * W; AccT* offset_dx = dx + (batch_ind * C + c) * H * W;
#if __CUDA_ARCH__ >= 350 if (LDG(mask, yi) != -1) {
if (__ldg(mask + yi) != -1) { atomicAdd(offset_dx + LDG(mask, yi), convert::To<AccT>(dy[yi]));
atomicAdd(offset_dx + __ldg(mask + yi), (float)dy[yi]);
}
#else
if (mask[yi] != -1) {
atomicAdd(offset_dx + mask[yi], (float)dy[yi]);
} }
#endif
} }
} }
template <> #undef LDG
__global__ void _RoiPoolGrad<half>(
const int nthreads,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const float spatial_scale,
const half* dy,
const float* rois,
const int* mask,
float* dx) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int c = (yi / out_w / out_h) % C;
const int n = yi / out_w / out_h / C;
const float* roi = rois + n * 5;
const int batch_ind = roi[0];
if (batch_ind < 0) continue;
float* offset_dx = dx + (batch_ind * C + c) * H * W;
#if __CUDA_ARCH__ >= 350
if (__ldg(mask + yi) != -1) {
atomicAdd(offset_dx + __ldg(mask + yi), __half2float(dy[yi]));
}
#else
if (mask[yi] != -1) {
atomicAdd(offset_dx + mask[yi], __half2float(dy[yi]));
}
#endif
}
}
} // namespace } // namespace
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
template <> #define DEFINE_KERNEL_LAUNCHER(T, ScalarT) \
void RoiPool<float16, CUDAContext>(
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const int num_rois,
const float spatial_scale,
const float16* x,
const float* rois,
int* mask,
float16* y,
CUDAContext* ctx) {
auto nthreads = num_rois * C * out_h * out_w;
_RoiPool<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
nthreads,
C,
H,
W,
out_h,
out_w,
spatial_scale,
reinterpret_cast<const half*>(x),
rois,
mask,
reinterpret_cast<half*>(y));
}
template <>
void RoiPoolGrad<float16, CUDAContext>(
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const int num_rois,
const float spatial_scale,
const float16* dy,
const float* rois,
const int* mask,
float* dx,
CUDAContext* ctx) {
auto nthreads = num_rois * C * out_h * out_w;
_RoiPoolGrad<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
nthreads,
C,
H,
W,
out_h,
out_w,
spatial_scale,
reinterpret_cast<const half*>(dy),
rois,
mask,
dx);
} // RoiPoolGrad
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \ template <> \
void RoiPool<T, CUDAContext>( \ void RoiPool<T, CUDAContext>( \
const int C, \ const int C, \
...@@ -319,10 +133,20 @@ void RoiPoolGrad<float16, CUDAContext>( ...@@ -319,10 +133,20 @@ void RoiPoolGrad<float16, CUDAContext>(
CUDAContext* ctx) { \ CUDAContext* ctx) { \
auto nthreads = num_rois * C * out_h * out_w; \ auto nthreads = num_rois * C * out_h * out_w; \
_RoiPool<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ _RoiPool<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
nthreads, C, H, W, out_h, out_w, spatial_scale, x, rois, mask, y); \ nthreads, \
} C, \
H, \
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \ W, \
out_h, \
out_w, \
spatial_scale, \
reinterpret_cast<const ScalarT*>(x), \
rois, \
mask, \
reinterpret_cast<ScalarT*>(y)); \
}
#define DEFINE_GRAD_KERNEL_LAUNCHER(T, ScalarT) \
template <> \ template <> \
void RoiPoolGrad<T, CUDAContext>( \ void RoiPoolGrad<T, CUDAContext>( \
const int C, \ const int C, \
...@@ -343,15 +167,25 @@ void RoiPoolGrad<float16, CUDAContext>( ...@@ -343,15 +167,25 @@ void RoiPoolGrad<float16, CUDAContext>(
CUDA_THREADS, \ CUDA_THREADS, \
0, \ 0, \
ctx->cuda_stream()>>>( \ ctx->cuda_stream()>>>( \
nthreads, C, H, W, out_h, out_w, spatial_scale, dy, rois, mask, dx); \ nthreads, \
} C, \
H, \
DEFINE_KERNEL_LAUNCHER(float); W, \
DEFINE_KERNEL_LAUNCHER(double); out_h, \
out_w, \
DEFINE_GRAD_KERNEL_LAUNCHER(float); spatial_scale, \
DEFINE_GRAD_KERNEL_LAUNCHER(double); reinterpret_cast<const ScalarT*>(dy), \
rois, \
mask, \
dx); \
}
DEFINE_KERNEL_LAUNCHER(float16, half);
DEFINE_KERNEL_LAUNCHER(float, float);
DEFINE_KERNEL_LAUNCHER(double, double);
DEFINE_GRAD_KERNEL_LAUNCHER(float16, half);
DEFINE_GRAD_KERNEL_LAUNCHER(float, float);
DEFINE_GRAD_KERNEL_LAUNCHER(double, double);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER #undef DEFINE_GRAD_KERNEL_LAUNCHER
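The rewritten kernels fold the old per-architecture `#if` ladders into two helpers, `LDG` and `Greater`, whose definitions sit earlier in the file (hence the `#undef LDG` once the kernels are done). Those definitions are outside this hunk; a minimal sketch of what they plausibly look like, with the bodies here being assumptions:

```cuda
// Sketch only: the real definitions are not part of this hunk.
#if __CUDA_ARCH__ >= 350
#define LDG(x, i) __ldg(x + i) // cached read on sm_35 and newer
#else
#define LDG(x, i) x[i]
#endif

__device__ inline bool Greater(float a, float b) {
  return a > b;
}

__device__ inline bool Greater(half a, half b) {
#if __CUDA_ARCH__ >= 530
  return __hgt(a, b); // native fp16 comparison
#else
  return __half2float(a) > __half2float(b); // promote on older archs
#endif
}
```

This is also why the `float16` specializations of `_RoiPool`, `_RoiPoolGrad`, and their hand-written launchers could be deleted: the generic templates now compile for `half` as well.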
...
#include "dragon/operators/array/reshape_ops.h"
#include "dragon/utils/math_functions.h"
namespace dragon {
template <class Context>
void IdentityOp<Context>::RunOnDevice() {
auto &X = Input(0), *Y = Output(0, {0});
// Store for the gradient calculation
STORE_INPUT_SPEC(0);
// Maybe copy the contents
Y->ReshapeLike(X)->CopyFrom(X, ctx());
}
DEPLOY_CPU_OPERATOR(Identity);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(Identity);
#endif
OPERATOR_SCHEMA(Identity)
/* X */
.NumInputs(1)
/* Y */
.NumOutputs(1)
/* X => Y */
.AllowInplace({{0, 0}});
OPERATOR_SCHEMA(IdentityGradient)
/* dY */
.NumInputs(1)
/* dX */
.NumOutputs(1)
/* dY => dX */
.AllowInplace({{0, 0}});
REGISTER_GRADIENT(Identity, SimpleGradientMaker);
} // namespace dragon
@@ -31,6 +31,15 @@ class ReshapeGradientOpBase : public Operator<Context> {
 };

 template <class Context>
+class IdentityOp final : public Operator<Context> {
+ public:
+  SIMPLE_CTOR_DTOR(IdentityOp);
+  USE_OPERATOR_FUNCTIONS;
+
+  void RunOnDevice() override;
+};
+
+template <class Context>
 class ReshapeOp final : public Operator<Context> {
  public:
   ReshapeOp(const OperatorDef& def, Workspace* ws)
@@ -94,6 +103,7 @@ class SqueezeOp final : public Operator<Context> {
       : ReshapeGradientOpBase<Context>(def, ws) {} \
 };

+DEFINE_GRADIENT_OP(Identity);
 DEFINE_GRADIENT_OP(Reshape);
 DEFINE_GRADIENT_OP(Flatten);
 DEFINE_GRADIENT_OP(ExpandDims);
...
@@ -8,18 +8,17 @@ namespace dragon {
 template <class Context>
 template <typename T>
 void AssignOp<Context>::DoRunWithType() {
-  auto &X = Input(0), *Y = Output(0);
-  int num_starts, num_sizes, num_dims = Y->ndim();
-  vec64_t X_dims(num_dims), X_starts(num_dims);
+  auto &Y_ref = Input(0), &X = Input(1), *Y = Output(0, {0});
   // Determine the interval of each dimension
+  int num_starts, num_sizes, num_dims = Y_ref.ndim();
+  vec64_t X_dims(num_dims), X_starts(num_dims);
   starts(0, &num_starts);
   sizes(0, &num_sizes);
   for (int i = 0; i < num_dims; i++) {
     auto dim_start = i < num_starts ? starts(i) : 0;
-    auto dim_end = Y->dim(i);
+    auto dim_end = Y_ref.dim(i);
     if (i < num_sizes) {
       auto dim_length = sizes(i);
       if (dim_length > 0) {
@@ -28,19 +27,18 @@ void AssignOp<Context>::DoRunWithType() {
         dim_end = dim_start + 1;
       }
     }
-    CHECK(dim_start >= 0 && dim_start < Y->dim(i))
+    CHECK(dim_start >= 0 && dim_start < Y_ref.dim(i))
         << "\nAssigning starts from " << dim_start << " of axis " << i << ", "
-        << "while the dimension of this axis is " << Y->dim(i) << ".";
-    CHECK(dim_end > 0 && dim_end <= Y->dim(i))
+        << "while the dimension of this axis is " << Y_ref.dim(i) << ".";
+    CHECK(dim_end > 0 && dim_end <= Y_ref.dim(i))
         << "\nAssigning ends at " << dim_end << " of axis " << i << ", "
-        << "while the dimension of this axis is " << Y->dim(i) << ".";
+        << "while the dimension of this axis is " << Y_ref.dim(i) << ".";
     X_starts[i] = dim_start;
     X_dims[i] = dim_end - dim_start;
   }
-  Tensor X_broadcast(X_dims);
-  auto* x = X.template data<T, Context>();
+  Tensor XRef(X_dims);
+  auto* new_data = X.template data<T, Context>();
   if (X.dims() != X_dims) {
     vec64_t dims1, dims2;
     if (math::utils::IsBinaryBroadcast(X.dims(), X_dims, dims1)) {
@@ -49,17 +47,17 @@ void AssignOp<Context>::DoRunWithType() {
           << Tensor::DimString(X_dims);
       math::utils::ComputeBinaryBroadcastDims(X.dims(), X_dims, dims1, dims2);
       if (dims1 != dims2) {
-        auto* scratch = ctx()->workspace()->template data<T, Context>(
-            {X_broadcast.count()})[0];
+        auto* scratch =
+            ctx()->workspace()->template data<T, Context>({XRef.count()})[0];
         math::Set(
             X.ndim(),
             X.dims().data(),
-            X_broadcast.ndim(),
-            X_broadcast.dims().data(),
-            x,
+            XRef.ndim(),
+            XRef.dims().data(),
+            new_data,
             scratch,
             ctx());
-        x = scratch;
+        new_data = scratch;
       }
     } else {
       LOG(FATAL) << "Could not broadcast together with shapes " << X.DimString()
@@ -67,12 +65,16 @@ void AssignOp<Context>::DoRunWithType() {
     }
   }
+  // Copy the reference data
+  Y->ReshapeLike(Y_ref)->CopyFrom(Y_ref, ctx());
+  // Update with the new data
   kernel::Assign(
       num_dims,
       X_dims.data(),
       Y->strides().data(),
       X_starts.data(),
-      x,
+      new_data,
       Y->template mutable_data<T, Context>(),
       ctx());
 }
@@ -88,10 +90,12 @@ DEPLOY_CUDA_OPERATOR(Assign);
 #endif

 OPERATOR_SCHEMA(Assign)
-    /* V */
-    .NumInputs(1)
-    /* X */
-    .NumOutputs(1);
+    /* Y_ref, X */
+    .NumInputs(2)
+    /* Y */
+    .NumOutputs(1)
+    /* Y_ref => Y */
+    .AllowInplace({{0, 0}});

 NO_GRADIENT(Assign);
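With the new schema, `Assign` reads the reference tensor as input 0 and emits the updated tensor as a real output, so graph tracing sees the data dependency instead of an op that silently mutates its own input. A hedged sketch of the resulting user-level call (shapes and values are illustrative):

```python
import dragon

x = dragon.zeros(shape=(4, 4), dtype='float32')
# Write 1 into the 2x2 block starting at (1, 1);
# ``x`` is the reference input, ``y`` carries the output def.
y = dragon.assign([x, 1.0], starts=[1, 1], sizes=[2, 2])
```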
...
#include "dragon/operators/control_flow/copy_op.h"
#include "dragon/utils/math_functions.h"
namespace dragon {
template <class Context>
template <typename T>
void CopyOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0);
math::Copy(
X.count(),
X.template data<T, Context>(),
Y->ReshapeLike(X)->template mutable_data<T, Context>(),
ctx());
}
template <class Context>
void CopyOp<Context>::RunOnDevice() {
DispatchHelper<FullTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU_OPERATOR(Copy);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(Copy);
#endif
OPERATOR_SCHEMA(Copy)
/* X */
.NumInputs(1)
/* Y */
.NumOutputs(1);
NO_GRADIENT(Copy);
} // namespace dragon
-/*!
- * Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
- *
- * Licensed under the BSD 2-Clause License.
- * You should have received a copy of the BSD 2-Clause License
- * along with the software. If not, See,
- *
- *     <https://opensource.org/licenses/BSD-2-Clause>
- *
- * ------------------------------------------------------------
- */
-
-#ifndef DRAGON_OPERATORS_CONTROL_FLOW_COPY_OP_H_
-#define DRAGON_OPERATORS_CONTROL_FLOW_COPY_OP_H_
-
-#include "dragon/core/operator.h"
-
-namespace dragon {
-
-template <class Context>
-class CopyOp final : public Operator<Context> {
- public:
-  SIMPLE_CTOR_DTOR(CopyOp);
-  USE_OPERATOR_FUNCTIONS;
-
-  void RunOnDevice() override;
-
-  template <typename T>
-  void DoRunWithType();
-};
-
-} // namespace dragon
-
-#endif // DRAGON_OPERATORS_CONTROL_FLOW_COPY_OP_H_
@@ -8,24 +8,28 @@ namespace dragon {
 template <class Context>
 template <typename T>
 void MaskedAssignOp<Context>::DoRunWithType() {
-  auto &X = Input(0), &X_mask = Input(1), *Y = Output(0);
+  auto &X = Input(1), &X_mask = Input(2);
+  auto &Y_ref = Input(0), *Y = Output(0, {0});
   CHECK(X_mask.template IsType<bool>() || X_mask.template IsType<uint8_t>())
       << "\nExcepted bool or uint8 mask.";
   vec64_t X_dims, Y_dims;
   if (math::utils::IsBinaryBroadcast(X.dims(), X_mask.dims(), X_dims) &&
-      math::utils::IsBinaryBroadcast(X_dims, Y->dims(), Y_dims) &&
-      Y_dims == Y->dims()) {
+      math::utils::IsBinaryBroadcast(X_dims, Y_ref.dims(), Y_dims) &&
+      Y_dims == Y_ref.dims()) {
+    // Copy the reference data
+    Y->ReshapeLike(Y_ref)->CopyFrom(Y_ref, ctx());
+    // Update with the new data
     math::Where(
         X.ndim(),
         X.dims().data(),
-        Y->ndim(),
-        Y->dims().data(),
+        Y_ref.ndim(),
+        Y_ref.dims().data(),
         X_mask.ndim(),
         X_mask.dims().data(),
         X.template data<T, Context>(),
-        Y->template data<T, Context>(),
+        Y_ref.template data<T, Context>(),
         (const bool*)X_mask.template raw_data<Context>(),
         Y->template mutable_data<T, Context>(),
         ctx());
@@ -46,10 +50,12 @@ DEPLOY_CUDA_OPERATOR(MaskedAssign);
 #endif

 OPERATOR_SCHEMA(MaskedAssign)
-    /* X, M */
-    .NumInputs(2)
+    /* Y_ref, X, X_mask */
+    .NumInputs(3)
     /* Y */
-    .NumOutputs(1);
+    .NumOutputs(1)
+    /* Y_ref => Y */
+    .AllowInplace({{0, 0}});

 NO_GRADIENT(MaskedAssign);
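`MaskedAssign` gets the same treatment: the reference tensor becomes input 0 and the result is a proper output. A sketch mirroring the updated unit test further below (values illustrative):

```python
import dragon

x = dragon.ones(shape=(2, 3))
mask = x > 0  # an all-true mask here; purely illustrative
y = dragon.masked_assign([x, 0, mask])
```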
...
@@ -65,6 +65,7 @@ from dragon.core.ops.array_ops import channel_shuffle
 from dragon.core.ops.array_ops import concat
 from dragon.core.ops.array_ops import expand_dims
 from dragon.core.ops.array_ops import flatten
+from dragon.core.ops.array_ops import identity
 from dragon.core.ops.array_ops import index_select
 from dragon.core.ops.array_ops import linspace
 from dragon.core.ops.array_ops import masked_select
@@ -85,7 +86,6 @@ from dragon.core.ops.array_ops import transpose
 from dragon.core.ops.array_ops import unique
 from dragon.core.ops.array_ops import where
 from dragon.core.ops.control_flow_ops import assign
-from dragon.core.ops.control_flow_ops import copy
 from dragon.core.ops.control_flow_ops import masked_assign
 from dragon.core.ops.framework_ops import python_plugin
 from dragon.core.ops.framework_ops import stop_gradient
...
@@ -53,6 +53,7 @@ class OpDef(object):
             outputs=None,
             num_outputs=1,
             extra_inputs=None,
+            extra_outputs=None,
             name=None,
             **kwargs
     ):
@@ -64,6 +65,7 @@ class OpDef(object):
         for input in inputs:
             op_info.merge_from(input)
+        # Collect defs from extra inputs.
         if extra_inputs is not None:
             extra_inputs = nest.flatten(extra_inputs)
             for input in extra_inputs:
...
@@ -59,12 +59,6 @@ def arg_reduce_spec(args, inputs, outputs):
     return outputs

-@register(['Assign', 'MaskedAssign'])
-def assign_spec(args, inputs, outputs):
-    _ = locals()
-    return outputs
-
-
 def binary_shape_spec(inputs, outputs):
     if inputs[0].shape is None or inputs[1].shape is None:
         return outputs
...
@@ -588,6 +588,45 @@ def flatten(inputs, axis=0, num_axes=-1, keep_axes=None, **kwargs):

 @OpSchema.num_inputs(1)
+def identity(inputs, **kwargs):
+    """Return a tensor copied from the input.
+
+    Examples:
+
+    ```python
+    # Copy ``x`` to ``y``
+    x = dragon.zeros(shape=(2, 3))
+    y = dragon.identity(x)
+
+    # ``x`` != ``y``
+    x += 1
+    print(x)
+    print(y)
+    ```
+
+    Parameters
+    ----------
+    inputs : dragon.Tensor
+        The input tensor.
+
+    Returns
+    -------
+    dragon.Tensor
+        The output tensor.
+
+    """
+    args = parse_args(locals())
+    inplace = args.pop('inplace') if 'inplace' in args else False
+    op_lib = array_ops_lib.Identity
+    if context.executing_eagerly():
+        return op_lib.instantiate().apply([inputs], inplace=inplace)
+    else:
+        return op_lib.blend(**args)
+
+
+@OpSchema.num_inputs(1)
 def index_select(inputs, index, axis=0, **kwargs):
     """Select the elements according to the index along the given axis.
...
@@ -259,6 +259,20 @@ class Flatten(Operator):
         return self.dispatch(inputs, outputs)

+class Identity(Operator):
+    """Identity operator."""
+
+    def __init__(self, key, dev, **kwargs):
+        super(Identity, self).__init__(key, dev, **kwargs)
+
+    def attributes(self):
+        return {'op_type': 'Identity', 'arguments': {}}
+
+    def forward(self, inputs, inplace=False):
+        outputs = [self.alloc(inputs[0]) if inplace else self.alloc()]
+        return self.dispatch(inputs, outputs)
+
+
 class IndexSelect(Operator):
     """IndexSelect operator."""
...
@@ -20,7 +20,6 @@ from dragon.core.ops import control_flow_ops_lib
 from dragon.core.ops.utils import ArgHelper
 from dragon.core.ops.utils import OpSchema
 from dragon.core.ops.utils import parse_args
-from dragon.core.util import nest

 @OpSchema.num_inputs(2)
@@ -43,66 +42,22 @@ def assign(inputs, starts=None, sizes=None, **kwargs):
     Returns
     -------
     dragon.Tensor
-        The input tensor.
+        The output tensor.

     """
     args = parse_args(locals())
+    inplace = args.pop('inplace') if 'inplace' in args else False
     inputs[1] = ops.scalar_to_tensor(inputs[1], inputs[0].dtype)
     op_lib = control_flow_ops_lib.Assign
     if context.executing_eagerly():
         return op_lib \
             .instantiate(
                 ndim=len(starts) if starts is not None else 0,
-            ).apply(inputs, starts, sizes)
+            ).apply(inputs, starts, sizes, inplace=inplace)
     else:
-        args['outputs'] = [args['inputs'][0]]
-        args['inputs'] = [args['inputs'][1]]
         return op_lib.blend(**args)

-@OpSchema.num_inputs(1, 2)
-def copy(inputs, **kwargs):
-    """Copy the input.
-
-    Examples:
-
-    ```python
-    # Copy ``x`` to ``y``
-    x = dragon.ones(shape=(2, 3))
-    y = dragon.zeros(shape=(2, 4))
-    dragon.copy([x, y])
-
-    # Copy to a new tensor from ``x``
-    y = dragon.copy(x)
-    ```
-
-    Parameters
-    ----------
-    inputs : Union[dragon.Tensor, Sequence[dragon.Tensor]]
-        The input tensor.
-
-    Returns
-    -------
-    dragon.Tensor
-        The output tensor.
-
-    """
-    args = parse_args(locals())
-    args['inputs'] = nest.flatten(inputs)
-    if len(args['inputs']) == 2:
-        args['outputs'] = [args['inputs'][1]]
-        args['inputs'] = [args['inputs'][0]]
-    else:
-        args['outputs'] = None
-    op_lib = control_flow_ops_lib.Copy
-    if context.executing_eagerly():
-        return op_lib \
-            .instantiate() \
-            .apply(args['inputs'], args['outputs'])
-    else:
-        return op_lib.blend('Copy', **args)
-
-
 @OpSchema.num_inputs(3)
 def masked_assign(inputs, **kwargs):
     r"""Assign the value to input where mask is 1.
@@ -126,13 +81,10 @@ def masked_assign(inputs, **kwargs):
     """
     args = parse_args(locals())
+    inplace = args.pop('inplace') if 'inplace' in args else False
     inputs[1] = ops.scalar_to_tensor(inputs[1], inputs[0].dtype)
     op_lib = control_flow_ops_lib.MaskedAssign
     if context.executing_eagerly():
-        return op_lib.instantiate().apply(inputs)
+        return op_lib.instantiate().apply(inputs, inplace=inplace)
     else:
-        args.update({
-            'outputs': [args['inputs'][0]],
-            'inputs': [args['inputs'][1:]],
-        })
         return op_lib.blend(**args)
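The removed `dragon.copy` overloaded one- and two-tensor calling conventions; its common single-tensor use maps directly onto the new `dragon.identity`. A sketch of the migration (the two-tensor form, which copied into an existing tensor, has no one-line equivalent here):

```python
import dragon

x = dragon.ones(shape=(2, 3))
# Before this commit: y = dragon.copy(x)
y = dragon.identity(x)
```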
@@ -46,29 +46,16 @@ class Assign(Operator):
                 ws, '{}/sizes[{}]'.format(handle, i),
                 sizes[i], 'int64')

-    def forward(self, inputs, starts, sizes):
+    def forward(self, inputs, starts, sizes, inplace=False):
+        outputs = [self.alloc(inputs[0]) if inplace else self.alloc()]
         return self.dispatch(
-            [inputs[1]], [inputs[0]],
+            inputs, outputs,
             callback=lambda ws, handle:
                 self.feed(ws, handle, starts, sizes),
             no_grad=True,
         )

-class Copy(Operator):
-    """Copy operator."""
-
-    def __init__(self, key, dev, **kwargs):
-        super(Copy, self).__init__(key, dev, **kwargs)
-
-    def attributes(self):
-        return {'op_type': 'Copy', 'arguments': {}}
-
-    def forward(self, inputs, outputs):
-        outputs = outputs if outputs else [self.alloc()]
-        return self.dispatch(inputs, outputs, no_grad=True)
-
-
 class MaskedAssign(Operator):
     """MaskedAssign operator."""
@@ -78,5 +65,6 @@ class MaskedAssign(Operator):
     def attributes(self):
         return {'op_type': 'MaskedAssign', 'arguments': {}}

-    def forward(self, inputs):
-        return self.dispatch(inputs[1:], [inputs[0]], no_grad=True)
+    def forward(self, inputs, inplace=False):
+        outputs = [self.alloc(inputs[0]) if inplace else self.alloc()]
+        return self.dispatch(inputs, outputs, no_grad=True)
@@ -104,11 +104,10 @@ def copy(self):
     See Also
     --------
-    `dragon.copy(...)`_
+    `dragon.identity(...)`_

     """
-    return control_flow_ops_lib.Copy \
-        .instantiate().apply([self], None)
+    return array_ops_lib.Identity.instantiate().apply([self])

 def div(self, other):
@@ -194,7 +193,7 @@ def getitem(self, item):
     if axis is not None:
         return _index_select(self, item[axis], axis)
     starts, sizes = _process_index(item)
-    return _section_select(self, starts, sizes)
+    return _sliced_select(self, starts, sizes)

 def glorot_normal(self, mode='fan_in', scale=2.0):
@@ -599,7 +598,7 @@ def setitem(self, key, value):
         _masked_assign(self, value, key)
     else:
         starts, sizes = _process_index(key)
-        _section_assign(self, value, starts, sizes)
+        _sliced_assign(self, value, starts, sizes)

 def sub(self, other):
@@ -705,7 +704,7 @@ def _masked_assign(ref, value, mask):
     """Assign value according to the mask."""
     value = ops.scalar_to_tensor(value, ref.dtype)
     return control_flow_ops_lib.MaskedAssign \
-        .instantiate().apply([ref, value, mask])
+        .instantiate().apply([ref, value, mask], inplace=True)

 def _masked_select(x, mask):
@@ -747,16 +746,16 @@ def _process_index(item):
     return starts, sizes

-def _section_assign(ref, value, starts, sizes):
-    """Apply the section-assign operation."""
+def _sliced_assign(ref, value, starts, sizes):
+    """Assign value according to the slices."""
     value = ops.scalar_to_tensor(value, ref.dtype)
     return control_flow_ops_lib.Assign \
         .instantiate(ndim=len(starts) if starts is not None else 0) \
-        .apply([ref, value], starts, sizes)
+        .apply([ref, value], starts, sizes, inplace=True)

-def _section_select(x, starts, sizes):
-    """Apply the section-select operation."""
+def _sliced_select(x, starts, sizes):
+    """Select elements according to the slices."""
     return array_ops_lib.Slice \
         .instantiate(ndim=len(starts)).apply([x], starts, sizes)
...
@@ -79,11 +79,10 @@ def copy(self):
     See Also
     --------
-    `dragon.copy(...)`_
+    `dragon.identity(...)`_

     """
-    outputs = [Tensor(shape=self.shape, dtype=self.dtype)]
-    return OpDef.apply('Copy', [self], [outputs])
+    return OpDef.apply('Identity', [self])

 def div(self, other):
@@ -169,7 +168,7 @@ def getitem(self, item):
     if axis is not None:
         return _index_select(self, item[axis], axis)
     starts, sizes = _process_index(item)
-    return _section_select(self, starts, sizes)
+    return _sliced_select(self, starts, sizes)

 def get_value(self):
@@ -402,10 +401,13 @@ def setitem(self, key, value):
     """
     if isinstance(key, Tensor):
-        _masked_assign(self, value, key)
+        raise RuntimeError(
+            'Assigning via mask is an ambiguous behavior in graph mode. '
+            'Use `dragon.masked_assign(...)` instead.')
     else:
-        starts, sizes = _process_index(key)
-        _section_assign(self, value, starts, sizes)
+        raise RuntimeError(
+            'Assigning via slices is an ambiguous behavior in graph mode. '
+            'Use `dragon.assign(...)` instead.')

 def set_value(self, value):
@@ -458,12 +460,6 @@ def _index_select(x, index, axis):
     return OpDef.apply('IndexSelect', [x, index], axis=axis, num_axes=1)

-def _masked_assign(ref, value, mask):
-    """Assign value according to the mask."""
-    value = ops.scalar_to_tensor(value, ref.dtype)
-    return OpDef.apply('MaskedAssign', [value, mask], [ref])
-
-
 def _masked_select(x, mask):
     """Select elements according to the mask."""
     return OpDef.apply('MaskedSelect', [x, mask])
@@ -502,14 +498,8 @@ def _process_index(item):
     return starts, sizes

-def _section_assign(ref, value, starts, sizes):
-    """Create the section-assign operator."""
-    value = ops.scalar_to_tensor(value, ref.dtype)
-    return OpDef.apply('Assign', [value], [ref], starts=starts, sizes=sizes)
-
-
-def _section_select(x, starts, sizes):
-    """Create the section-select operator."""
+def _sliced_select(x, starts, sizes):
+    """Select elements according to the slices."""
     return OpDef.apply('Slice', [x], starts=starts, sizes=sizes)
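Symbolic `__setitem__` now raises instead of silently rewriting the tensor, so graph-mode code must spell the update out and keep the returned tensor, which is exactly what makes the def visible to the graph. A sketch of the replacements named in the error messages (shapes illustrative):

```python
import dragon

x = dragon.zeros(shape=(4,))

# GRAPH_MODE: instead of ``x[0:2] = 1`` (now a RuntimeError):
y = dragon.assign([x, 1], starts=[0], sizes=[2])

# ...and instead of ``x[x > 2] = 0``:
z = dragon.masked_assign([x, 0, x > 2])
```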
...
@@ -24,7 +24,6 @@ from dragon.core.autograph.tensor import TensorRef
 from dragon.core.framework import context
 from dragon.core.framework import workspace
 from dragon.core.ops import array_ops
-from dragon.core.ops import control_flow_ops
 from dragon.core.ops import init_ops
 from dragon.core.ops import vision_ops
@@ -260,19 +259,19 @@ def gather(params, indices, axis=0, name=None):

 def identity(input, name=None):
-    """Return a new tensor copying the content of input.
+    """Return a tensor copied from the input.

     Examples:

     ```python
-    # Copy ``x`` to ``xx``
+    # Copy ``x`` to ``y``
     x = tf.zeros(shape=(2, 3))
-    xx = tf.identity(x)
+    y = tf.identity(x)

-    # ``x`` != ``xx``
+    # ``x`` != ``y``
     x += 1
     print(x)
-    print(xx)
+    print(y)
     ```

     Parameters
@@ -288,7 +287,7 @@ def identity(input, name=None):
         The output tensor.

     """
-    return control_flow_ops.copy(input, name=name if name else 'Identity')
+    return array_ops.identity(input, name=name if name else 'Identity')

 def ones(shape, dtype='float32', name=None):
...
@@ -655,6 +655,19 @@ class TestArrayOps(OpTestCase):
         with dragon.device('cuda'):
             self.test_flatten()

+    def test_identity(self):
+        for execution in ('EAGER_MODE', 'GRAPH_MODE'):
+            with execution_context().mode(execution):
+                data = arange((4,))
+                x = new_tensor(data)
+                y = dragon.identity(x)
+                self.assertEqual(y, data)
+
+    @unittest.skipIf(not TEST_CUDA, 'CUDA unavailable')
+    def test_identity_cuda(self):
+        with dragon.device('cuda'):
+            self.test_identity()
+
     def test_index_select(self):
         entries = [1, (1, 2)]
         for execution in ('EAGER_MODE', 'GRAPH_MODE'):
@@ -1116,30 +1129,14 @@ class TestControlFlowOps(OpTestCase):
         with dragon.device('cuda'):
             self.test_assign()

-    def test_copy(self):
-        for execution in ('EAGER_MODE', 'GRAPH_MODE'):
-            with execution_context().mode(execution):
-                data = arange((4,))
-                x = new_tensor(data)
-                y = dragon.zeros((4,), dtype='float32')
-                y = dragon.copy([x, y])
-                z = dragon.copy(x)
-                self.assertEqual(y, data)
-                self.assertEqual(z, data)
-
-    @unittest.skipIf(not TEST_CUDA, 'CUDA unavailable')
-    def test_copy_cuda(self):
-        with dragon.device('cuda'):
-            self.test_copy()
-
     def test_masked_assign(self):
         for execution in ('EAGER_MODE', 'GRAPH_MODE'):
             with execution_context().mode(execution):
                 data = arange((2, 3))
                 x = new_tensor(data)
-                dragon.masked_assign([x, 0, x > 2])
+                y = dragon.masked_assign([x, 0, x > 2])
                 data[data > 2] = 0
-                self.assertEqual(x, data)
+                self.assertEqual(y, data)

     @unittest.skipIf(not TEST_CUDA, 'CUDA unavailable')
     def test_masked_assign_cuda(self):
@@ -3021,9 +3018,12 @@ class TestTensorOps(OpTestCase):
             with execution_context().mode(execution):
                 data = arange((2, 3))
                 x = new_tensor(data)
-                x[x > 2] = 0
-                data[data > 2] = 0
-                self.assertEqual(x, data)
+                try:
+                    x[x > 2] = 0
+                    data[data > 2] = 0
+                    self.assertEqual(x, data)
+                except RuntimeError:
+                    pass
         entries = [0,
                    slice(None, None, None),
                    slice(0, None, None),
@@ -3037,7 +3037,7 @@ class TestTensorOps(OpTestCase):
                     x.__setitem__(item, 0)
                     data.__setitem__(item, 0)
                     self.assertEqual(x, data)
-                except (NotImplementedError, ValueError, TypeError):
+                except (NotImplementedError, ValueError, TypeError, RuntimeError):
                     pass

     def test_rsub(self):
...
@@ -71,7 +71,7 @@ class Assign(function.Function):
     def forward(self, out, starts, sizes, input):
         self._check_device([input, out])
         return self.dispatch(
-            [input], [out],
+            [out, input], [self.alloc(out)],
             callback=lambda ws, handle:
                 self.feed(ws, handle, starts, sizes),
             no_grad=True,
@@ -306,7 +306,7 @@ class MaskedAssign(function.Function):
         return {'op_type': 'MaskedAssign', 'arguments': {}}

     def forward(self, out, mask, input):
-        return self.dispatch([input, mask], [self.alloc(out)])
+        return self.dispatch([out, input, mask], [self.alloc(out)])

 class MaskedSelect(function.Function):
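The PyTorch front end keeps its in-place semantics (`self.alloc(out)` reuses the destination buffer), but the destination now also enters the input list, so the dependency is recorded in the def. A hedged sketch of calls that exercise these paths, assuming the usual `dragon.vm.torch` entry point:

```python
from dragon.vm import torch

x = torch.zeros(4)
x[1:3] = 1    # routes through Assign.forward with [out, input]
x[x > 0] = 2  # routes through MaskedAssign.forward with [out, input, mask]
```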
...