SeetaResearch / Dragon
Commit 6eeac5fe, authored Aug 26, 2017 by Ting PAN
add omp optimization
1 parent: 007d9c21
Showing 27 changed files with 462 additions and 114 deletions
Dragon/CMakeLists.txt
Dragon/include/utils/omp_alternative.h
Dragon/include/utils/sse_device.h
Dragon/src/operators/arithmetic/add_op.cc
Dragon/src/operators/arithmetic/div_op.cc
Dragon/src/operators/arithmetic/dot_op.cc
Dragon/src/operators/arithmetic/eltwise_op.cc
Dragon/src/operators/arithmetic/gram_matrix_op.cc
Dragon/src/operators/arithmetic/matmul_op.cc
Dragon/src/operators/arithmetic/mul_op.cc
Dragon/src/operators/arithmetic/pow_op.cc
Dragon/src/operators/arithmetic/scale_op.cc
Dragon/src/operators/arithmetic/sub_op.cc
Dragon/src/operators/common/concat_op.cc
Dragon/src/operators/common/transpose_op.cc
Dragon/src/operators/norm/batch_norm_op.cc
Dragon/src/operators/norm/l2_norm_op.cc
Dragon/src/operators/utils/cast_op.cpp
Dragon/src/operators/utils/gradient_op.cc
Dragon/src/operators/utils/memory_data_op.cc
Dragon/src/operators/vision/dense_concat_op.cc
Dragon/src/utils/math_functions.cc
Dragon/src/utils/math_functions.cu
Dragon/src/utils/op_kernel.cc
Dragon/src/utils/op_kernel.cu
Dragon/src/utils/sse_alternative.cc
README.md
Dragon/CMakeLists.txt

@@ -12,6 +12,7 @@ option(WITH_PYTHON3 "Set ON to use PYTHON3 otherwise PYTHON2" OF
 option(WITH_CUDA      "Set ON to use CUDA"      ON)
 option(WITH_CUDNN     "Set ON to use CUDNN"     OFF)
 option(WITH_BLAS      "Set ON to use BLAS"      OFF)
+option(WITH_OMP       "Set ON to use OpenMP"    OFF)
 option(WITH_SSE       "Set ON to use SSE 4.1"   ON)
 option(WITH_MPI       "Set ON to use MPI"       OFF)
 option(WITH_MPI_CUDA  "Set ON to use MPI-CUDA"  OFF)

@@ -22,7 +23,7 @@ option(WITH_CUDA_FP16 "Set ON to use FP16" ON)
 set(3RDPARTY_DIR ${PROJECT_SOURCE_DIR}/../3rdparty)
 # set your python environment
 set(PYTHON_DIR /usr/include/python2.7)    # preferred
 #set(PYTHON_DIR /usr/include/python3.x)   # optional, set specific version
 #set(ANACONDA_DIR /xxx/anaconda)          # optional, root folder of anaconda, preset for 2.7, 3.5, and 3.6
 set(NUMPY_DIR /xxx/numpy)                 # required, root folder of numpy package

@@ -118,6 +119,10 @@ else()
                 "\n -- > GEMM/GEMV is disabled"
                 "\n -- > prefer not to run as CPU Mode")
 endif()
+if (WITH_OMP)
+    ADD_DEFINITIONS(-DWITH_OMP)
+    message(STATUS "Use OpenMP [Optional]")
+endif()
 if (WITH_SSE)
     ADD_DEFINITIONS(-DWITH_SSE)
     message(STATUS "Use SSE [Optional]")

@@ -145,11 +150,18 @@ endif()
 # ---[ Flags
 set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${CUDA_ARCH}")
 if (WIN32)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP /O2")
+    if (WITH_OMP)
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /openmp")
+    endif()
 endif()
 if (UNIX)
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -O2 -m64 -fpermissive -std=c++11")
+    if (WITH_OMP)
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fopenmp")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
+    endif()
 endif()

 # ---[ Warnings

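With this option in place, OpenMP support is opt-in at configure time: passing -DWITH_OMP=ON to cmake (for example, cmake -DWITH_OMP=ON -DWITH_SSE=ON <path to Dragon/Dragon>, where the build directory and generator are whatever you normally use rather than anything this commit prescribes) defines WITH_OMP for the compiler and appends -fopenmp on UNIX or /openmp on WIN32, which is what enables the pragmas added in the source files below.
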
Dragon/include/utils/omp_alternative.h (new file, mode 100644)

// --------------------------------------------------------
// Dragon
// Copyright(c) 2017 SeetaTech
// Written by Ting Pan
// --------------------------------------------------------

#ifndef DRAGON_UTILS_OMP_ALTERNATIVE_H_
#define DRAGON_UTILS_OMP_ALTERNATIVE_H_

#ifdef WITH_OMP

#include <algorithm>
#include <omp.h>

namespace dragon {

#define OMP_MIN_ITERATORS_PER_CORE 256

inline int GET_OMP_THREADS(const int N) {
    int threads = std::max(N / OMP_MIN_ITERATORS_PER_CORE, 1);
    return std::min(threads, omp_get_num_procs());
}

}    // namespace dragon

#endif // WITH_OMP

#endif // DRAGON_UTILS_OMP_ALTERNATIVE_H_
(no newline at end of file)

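GET_OMP_THREADS scales the thread count with the problem size, one thread per OMP_MIN_ITERATORS_PER_CORE (256) iterations and never more than the processor count, so small tensors do not pay fork/join overhead for nothing. A self-contained sketch of how the kernels in math_functions.cc use it (the AddScalar body mirrors the committed code; the standalone framing here is an illustration, not Dragon source):

#include <algorithm>
#include <omp.h>

#define OMP_MIN_ITERATORS_PER_CORE 256

// One thread per 256 iterations, at least 1, at most omp_get_num_procs().
inline int GET_OMP_THREADS(const int N) {
    int threads = std::max(N / OMP_MIN_ITERATORS_PER_CORE, 1);
    return std::min(threads, omp_get_num_procs());
}

// Mirrors the pattern this commit adds to the CPU kernels;
// build with -DWITH_OMP and -fopenmp (or /openmp on MSVC).
void AddScalar(const int n, const float alpha, float* y) {
#ifdef WITH_OMP
    #pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
    for (int i = 0; i < n; ++i) y[i] += alpha;
}

// Example: n = 1024 on an 8-core machine -> 1024 / 256 = 4 threads; n = 100 -> 1 thread.
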
Dragon/include/utils/sse_device.h

@@ -15,11 +15,10 @@
 namespace dragon {

 #define SSE_LOOP1(i, n) \
-    int32_t i; \
     for (i = 0; i < n - 4; i += 4) \

 #define SSE_LOOP2(i, n) \
-    for (; i < n; i++)
+    for (; i < n; ++i)

 #define SSE_FP32_LOAD _mm_loadu_ps
 #define SSE_FP32_STORE _mm_storeu_ps

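The loop-pair macros encode the usual SSE idiom: a four-wide vectorized main loop followed by a scalar tail for the leftover elements. A self-contained sketch of that pattern with the intrinsics the header aliases (the function name and the element-wise add are illustrative, not code from this commit):

#include <xmmintrin.h>
#include <cstdint>

// SSE_FP32_LOAD / SSE_FP32_STORE alias _mm_loadu_ps / _mm_storeu_ps (unaligned).
void add_f32(const float* a, const float* b, float* y, const int32_t n) {
    int32_t i;
    for (i = 0; i < n - 4; i += 4) {                  // SSE_LOOP1: 4 floats per step
        __m128 va = _mm_loadu_ps(a + i);
        __m128 vb = _mm_loadu_ps(b + i);
        _mm_storeu_ps(y + i, _mm_add_ps(va, vb));
    }
    for (; i < n; ++i) y[i] = a[i] + b[i];            // SSE_LOOP2: scalar remainder
}
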
Dragon/src/operators/arithmetic/add_op.cc

@@ -53,18 +53,24 @@ void AddOp<Context>::RunOnDevice() {
     }
     else if (input(0).dim(0) == input(1).dim(0) && input(1).count(1) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(2);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(2);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(0).dim(-1) == input(1).dim(-1) &&
             input(1).count(0, input(1).axis(-1)) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(1);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(1);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(1).ndim() == 1 && input(1).dim(0) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(0);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(0);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else {

@@ -139,18 +145,24 @@ void AddGradientOp<Context>::RunOnDevice() {
     }
     else if (input(-1).dim(0) == input(0).dim(0) && input(0).count(1) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(2);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(2);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(-1).dim(-1) == input(0).dim(-1) &&
             input(0).count(0, input(0).axis(-1)) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(1);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(1);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(0).ndim() == 1 && input(0).dim(0) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(0);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(0);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else {

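The two added lines above are the whole of the change, and the same guard is applied to every operator that follows: the float16 branch is only meaningful in builds configured with WITH_CUDA_FP16, so it is compiled out everywhere else. A condensed, self-contained sketch of the dispatch shape (stand-in types and names, not Dragon's classes):

#include <cstdio>
#include <cstdint>

struct float16 { std::uint16_t bits; };               // stand-in for Dragon's float16
enum class DType { kFloat32, kFloat16 };

template <typename T> void RunWithType() { std::printf("run %zu-byte type\n", sizeof(T)); }

void RunOnDevice(const DType type) {
    if (type == DType::kFloat32) RunWithType<float>();
#ifdef WITH_CUDA_FP16
    else if (type == DType::kFloat16) RunWithType<float16>();   // absent without the define
#endif
    else std::printf("unsupported input types.\n");   // Dragon calls LOG(FATAL) here
}
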
Dragon/src/operators/arithmetic/div_op.cc

@@ -54,18 +54,24 @@ void DivOp<Context>::RunOnDevice() {
     }
     else if (input(0).dim(0) == input(1).dim(0) && input(1).count(1) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(2);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(2);
+#endif
         else LOG(FATAL) << "unsupported input types";
     }
     else if (input(0).dim(-1) == input(1).dim(-1) &&
             input(1).count(0, input(1).axis(-1)) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(1);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(1);
+#endif
         else LOG(FATAL) << "unsupported input types";
     }
     else if (input(1).ndim() == 1 && input(1).dim(0) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(0);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(0);
+#endif
         else LOG(FATAL) << "unsupported input types";
     }
     else {

@@ -170,18 +176,24 @@ void DivGradientOp<Context>::RunOnDevice() {
     }
     else if (input(0).dim(0) == input(1).dim(0) && input(1).count(1) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(2);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(2);
+#endif
         else LOG(FATAL) << "unsupported input types";
     }
     else if (input(0).dim(-1) == input(1).dim(-1) &&
             input(1).count(0, input(1).axis(-1)) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(1);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(1);
+#endif
         else LOG(FATAL) << "unsupported input types";
     }
     else if (input(1).ndim() == 1 && input(1).dim(0) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(0);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(0);
+#endif
         else LOG(FATAL) << "unsupported input types";
     }
     else {

Dragon/src/operators/arithmetic/dot_op.cc

@@ -55,7 +55,9 @@ void DotOp<Context>::RunOnDevice() {
         dims[dims.size() - 1] = N1;
         output(0)->Reshape(dims);
         if (input(0).template IsType<float>()) GemmRunWithType<float>();
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) GemmRunWithType<float16>();
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(0).ndim() >= 2 && input(1).ndim() == 1) {

@@ -70,7 +72,9 @@ void DotOp<Context>::RunOnDevice() {
         dims.pop_back();
         output(0)->Reshape(dims);
         if (input(0).template IsType<float>()) GemvRunWithType<float>();
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) GemvRunWithType<float16>();
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else {

@@ -148,7 +152,9 @@ void DotGradientOp<Context>::RunOnDevice() {
             << input(0).dim_string() << " can not Dot with Tensor"
             << "(" << input(1).name() << "): " << input(1).dim_string();
         if (input(0).template IsType<float>()) GemmRunWithType<float>();
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) GemmRunWithType<float16>();
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(0).ndim() >= 2 && input(1).ndim() == 1) {

@@ -160,7 +166,9 @@ void DotGradientOp<Context>::RunOnDevice() {
             << input(0).dim_string() << " can not Dot with Tensor"
             << "(" << input(1).name() << "): " << input(1).dim_string();
         if (input(0).template IsType<float>()) GemvRunWithType<float>();
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) GemvRunWithType<float16>();
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else {

Dragon/src/operators/arithmetic/eltwise_op.cc

@@ -41,12 +41,16 @@ void EltwiseOp<Context>::RunOnDevice() {
     if (operation == "SUM") {
         if (input(0).template IsType<float>()) SumRunWithType<float>();
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) SumRunWithType<float16>();
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (operation == "PROD") {
         if (input(0).template IsType<float>()) ProdRunWithType<float>();
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) ProdRunWithType<float16>();
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else {

@@ -104,12 +108,16 @@ void EltwiseGradientOp<Context>::RunOnDevice() {
     if (operation == "SUM") {
         if (input(0).template IsType<float>()) SumRunWithType<float>();
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) SumRunWithType<float16>();
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (operation == "PROD") {
         if (input(0).template IsType<float>()) ProdRunWithType<float>();
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) ProdRunWithType<float16>();
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else {

Dragon/src/operators/arithmetic/gram_matrix_op.cc

@@ -25,7 +25,9 @@ void GramMatrixOp<Context>::RunOnDevice() {
     output(0)->Reshape(vector<TIndex>({ outer_dim, dim, dim }));
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }

@@ -57,7 +59,9 @@ void GramMatrixGradientOp<Context>::RunOnDevice() {
     output(0)->ReshapeLike(input(0));
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }

Dragon/src/operators/arithmetic/matmul_op.cc

@@ -48,7 +48,9 @@ void MatmulOp<Context>::RunOnDevice() {
     dims[dims.size() - 1] = N;
     output(0)->Reshape(dims);
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }

@@ -105,7 +107,9 @@ void MatmulGradientOp<Context>::RunOnDevice() {
     output(0)->ReshapeLike(input(0));
     output(1)->ReshapeLike(input(1));
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }

Dragon/src/operators/arithmetic/mul_op.cc

@@ -54,18 +54,24 @@ void MulOp<Context>::RunOnDevice() {
     }
     else if (input(0).dim(0) == input(1).dim(0) && input(1).count(1) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(2);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(2);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(0).dim(-1) == input(1).dim(-1) &&
             input(1).count(0, input(1).axis(-1)) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(1);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(1);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(1).ndim() == 1 && input(1).dim(0) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(0);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(0);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else {

@@ -158,18 +164,24 @@ void MulGradientOp<Context>::RunOnDevice() {
     }
     else if (input(0).dim(0) == input(1).dim(0) && input(1).count(1) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(2);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(2);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(0).dim(-1) == input(1).dim(-1) &&
             input(1).count(0, input(1).axis(-1)) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(1);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(1);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(1).ndim() == 1 && input(1).dim(0) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(0);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(0);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else {

Dragon/src/operators/arithmetic/pow_op.cc

@@ -26,7 +26,9 @@ void PowOp<Context>::RunOnDevice() {
     output(0)->ReshapeLike(input(0));
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }

@@ -76,7 +78,9 @@ void PowGradientOp<Context>::RunOnDevice() {
     output(0)->ReshapeLike(input(0));
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }

Dragon/src/operators/arithmetic/scale_op.cc

@@ -37,7 +37,9 @@ void ScaleOp<Context>::RunOnDevice() {
     output(0)->ReshapeLike(input(0));
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }

Dragon/src/operators/arithmetic/sub_op.cc

@@ -53,18 +53,24 @@ void SubOp<Context>::RunOnDevice() {
     }
     else if (input(0).dim(0) == input(1).dim(0) && input(1).count(1) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(2);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(2);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(0).dim(-1) == input(1).dim(-1) &&
             input(1).count(0, input(1).axis(-1)) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(1);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(1);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(1).ndim() == 1 && input(1).dim(0) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(0);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(0);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else {

@@ -139,18 +145,24 @@ void SubGradientOp<Context>::RunOnDevice() {
     }
     else if (input(-1).dim(0) == input(0).dim(0) && input(0).count(1) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(2);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(2);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(-1).dim(-1) == input(0).dim(-1) &&
             input(0).count(0, input(0).axis(-1)) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(1);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(1);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(0).ndim() == 1 && input(0).dim(0) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(0);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(0);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else {

Dragon/src/operators/common/concat_op.cc

@@ -49,7 +49,9 @@ void ConcatOp<Context>::RunOnDevice() {
     }
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }

@@ -96,7 +98,9 @@ void ConcatGradientOp<Context>::RunOnDevice() {
     }
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }

Dragon/src/operators/common/transpose_op.cc

@@ -45,7 +45,9 @@ void TransposeOp<Context>::RunOnDevice() {
     }
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }

@@ -75,7 +77,9 @@ void TransposeGradientOp<Context>::RunOnDevice() {
     new_steps = ws()->GetTensor("_t_" + anchor() + "_new_steps");
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }

Dragon/src/operators/norm/batch_norm_op.cc

@@ -127,7 +127,9 @@ void BatchNormOp<Context>::RunOnDevice() {
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }

@@ -247,7 +249,9 @@ void BatchNormGradientOp<Context>::RunOnDevice() {
     else use_global_stats = use_stats == 1 ? true : false;
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }

Dragon/src/operators/norm/l2_norm_op.cc

@@ -78,7 +78,9 @@ void L2NormOp<Context>::RunOnDevice() {
     output(0)->ReshapeLike(input(0));
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }

Dragon/src/operators/utils/cast_op.cpp

@@ -4,6 +4,8 @@
 namespace dragon {

+#ifdef WITH_CUDA_FP16
+
 template <class Context>
 void FloatToHalfOp<Context>::RunOnDevice() {
     CHECK(input(0).template IsType<float>())

@@ -28,4 +30,6 @@ OPERATOR_SCHEMA(FloatToHalf).NumInputs(1).NumOutputs(1);
 NO_GRADIENT(FloatToHalf);

+#endif
+
 }    // namespace dragon
(no newline at end of file)

Dragon/src/operators/utils/gradient_op.cc

@@ -19,7 +19,9 @@ void GradientGenerateOp<Context>::RunWithType() {
 template <class Context>
 void GradientGenerateOp<Context>::RunOnDevice() {
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }

Dragon/src/operators/utils/memory_data_op.cc

@@ -23,12 +23,16 @@ void MemoryDataOp<Context>::RunOnDevice() {
     if (input(0).template IsType<float>()) {
         if (data_type == TensorProto_DataType_FLOAT) RunWithType<float, float>();
+#ifdef WITH_CUDA_FP16
         else if (data_type == TensorProto_DataType_FLOAT16) RunWithType<float, float16>();
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(0).template IsType<uint8_t>()) {
         if (data_type == TensorProto_DataType_FLOAT) RunWithType<uint8_t, float>();
+#ifdef WITH_CUDA_FP16
         if (data_type == TensorProto_DataType_FLOAT16) RunWithType<uint8_t, float16>();
+#endif
     }
     else { LOG(FATAL) << "unsupported input types."; }
 }

Dragon/src/operators/vision/dense_concat_op.cc

@@ -58,7 +58,9 @@ void DenseConcatGradientOp<Context>::ElimateCorruption() {
             input(0).Move(buffer->memory());
             head_data[idx] = input(0).name();
             if (input(-2).template IsType<float>()) RestoreX1<float>();
+#ifdef WITH_CUDA_FP16
             else if (input(-2).template IsType<float16>()) RestoreX1<float16>();
+#endif
             else LOG(FATAL) << "unsupported input types.";
             // post-process
             if (input(0).memory() != buffer->memory()) buffer->Move(input(0).memory());

Dragon/src/utils/math_functions.cc
View file @
6eeac5f
#include <random>
#include <random>
#include "core/context.h"
#include "core/context.h"
#include "utils/math_functions.h"
#include "utils/omp_alternative.h"
#ifdef WITH_SSE
#include "utils/sse_alternative.h"
#include "utils/sse_alternative.h"
#
endif
#
include "utils/math_functions.h"
namespace
dragon
{
namespace
dragon
{
...
@@ -22,9 +20,12 @@ template <> void Set<float, CPUContext>(const int n,
...
@@ -22,9 +20,12 @@ template <> void Set<float, CPUContext>(const int n,
}
}
#ifdef WITH_SSE
#ifdef WITH_SSE
sse
::
Set
<
float
>
(
n
,
alpha
,
x
);
sse
::
Set
<
float
>
(
n
,
alpha
,
x
);
#else // naive implement
#else
for
(
int
i
=
0
;
i
<
n
;
++
i
)
x
[
i
]
=
alpha
;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
#endif
for
(
int
i
=
0
;
i
<
n
;
++
i
)
x
[
i
]
=
alpha
;
#endif // WITH_SSE
}
}
template
<>
void
Set
<
int
,
CPUContext
>
(
const
int
n
,
template
<>
void
Set
<
int
,
CPUContext
>
(
const
int
n
,
...
@@ -36,9 +37,12 @@ template <> void Set<int, CPUContext>(const int n,
...
@@ -36,9 +37,12 @@ template <> void Set<int, CPUContext>(const int n,
}
}
#ifdef WITH_SSE
#ifdef WITH_SSE
sse
::
Set
<
int
>
(
n
,
alpha
,
x
);
sse
::
Set
<
int
>
(
n
,
alpha
,
x
);
#else // naive implement
#else
for
(
int
i
=
0
;
i
<
n
;
++
i
)
x
[
i
]
=
alpha
;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
#endif
for
(
int
i
=
0
;
i
<
n
;
++
i
)
x
[
i
]
=
alpha
;
#endif // WITH_SSE
}
}
template
<>
void
Set
<
float16
,
CPUContext
>
(
const
int
n
,
template
<>
void
Set
<
float16
,
CPUContext
>
(
const
int
n
,
...
@@ -52,9 +56,10 @@ template <> void RandomUniform<float, CPUContext>(const int n,
...
@@ -52,9 +56,10 @@ template <> void RandomUniform<float, CPUContext>(const int n,
const
float
high
,
const
float
high
,
float
*
x
)
{
float
*
x
)
{
std
::
uniform_real_distribution
<
float
>
distribution
(
low
,
high
);
std
::
uniform_real_distribution
<
float
>
distribution
(
low
,
high
);
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
#ifdef WITH_OMP
x
[
i
]
=
distribution
(
*
rand_generator
());
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
}
#endif
for
(
int
i
=
0
;
i
<
n
;
++
i
)
x
[
i
]
=
distribution
(
*
rand_generator
());
}
}
template
<>
void
RandomUniform
<
float16
,
CPUContext
>
(
const
int
n
,
template
<>
void
RandomUniform
<
float16
,
CPUContext
>
(
const
int
n
,
...
@@ -69,9 +74,10 @@ template <> void RandomUniform<uint32_t, CPUContext>(const int n,
...
@@ -69,9 +74,10 @@ template <> void RandomUniform<uint32_t, CPUContext>(const int n,
const
float
high
,
const
float
high
,
uint32_t
*
x
)
{
uint32_t
*
x
)
{
std
::
uniform_int_distribution
<
uint32_t
>
distribution
(
low
,
high
);
std
::
uniform_int_distribution
<
uint32_t
>
distribution
(
low
,
high
);
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
#ifdef WITH_OMP
x
[
i
]
=
distribution
(
*
rand_generator
());
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
}
#endif
for
(
int
i
=
0
;
i
<
n
;
++
i
)
x
[
i
]
=
distribution
(
*
rand_generator
());
}
}
template
<>
void
RandomNormal
<
float
,
CPUContext
>
(
const
int
n
,
template
<>
void
RandomNormal
<
float
,
CPUContext
>
(
const
int
n
,
...
@@ -79,9 +85,10 @@ template <> void RandomNormal<float, CPUContext>(const int n,
...
@@ -79,9 +85,10 @@ template <> void RandomNormal<float, CPUContext>(const int n,
const
float
sigma
,
const
float
sigma
,
float
*
x
)
{
float
*
x
)
{
std
::
normal_distribution
<
float
>
distribution
(
mu
,
sigma
);
std
::
normal_distribution
<
float
>
distribution
(
mu
,
sigma
);
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
#ifdef WITH_OMP
x
[
i
]
=
distribution
(
*
rand_generator
());
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
}
#endif
for
(
int
i
=
0
;
i
<
n
;
++
i
)
x
[
i
]
=
distribution
(
*
rand_generator
());
}
}
template
<>
void
RandomNormal
<
float16
,
CPUContext
>
(
const
int
n
,
template
<>
void
RandomNormal
<
float16
,
CPUContext
>
(
const
int
n
,
...
@@ -121,9 +128,10 @@ template <> void RandomBernoulli<float, CPUContext>(const int n,
...
@@ -121,9 +128,10 @@ template <> void RandomBernoulli<float, CPUContext>(const int n,
const
float
p
,
const
float
p
,
uint32_t
*
x
)
{
uint32_t
*
x
)
{
std
::
bernoulli_distribution
distribution
(
p
);
std
::
bernoulli_distribution
distribution
(
p
);
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
#ifdef WITH_OMP
x
[
i
]
=
distribution
(
*
rand_generator
());
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
}
#endif
for
(
int
i
=
0
;
i
<
n
;
++
i
)
x
[
i
]
=
distribution
(
*
rand_generator
());
}
}
/******************** Level-1 ********************/
/******************** Level-1 ********************/
...
@@ -134,9 +142,12 @@ template <> void Add<float, CPUContext>(const int n,
...
@@ -134,9 +142,12 @@ template <> void Add<float, CPUContext>(const int n,
float
*
y
)
{
float
*
y
)
{
#ifdef WITH_SSE
#ifdef WITH_SSE
sse
::
Add
<
float
>
(
n
,
a
,
b
,
y
);
sse
::
Add
<
float
>
(
n
,
a
,
b
,
y
);
#else // naive implement
#else
for
(
int
i
=
0
;
i
<
n
;
++
i
)
y
[
i
]
=
a
[
i
]
+
b
[
i
];
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
#endif
for
(
int
i
=
0
;
i
<
n
;
++
i
)
y
[
i
]
=
a
[
i
]
+
b
[
i
];
#endif // WITH_SSE
}
}
template
<>
void
Sub
<
float
,
CPUContext
>
(
const
int
n
,
template
<>
void
Sub
<
float
,
CPUContext
>
(
const
int
n
,
...
@@ -145,9 +156,12 @@ template <> void Sub<float, CPUContext>(const int n,
...
@@ -145,9 +156,12 @@ template <> void Sub<float, CPUContext>(const int n,
float
*
y
)
{
float
*
y
)
{
#ifdef WITH_SSE
#ifdef WITH_SSE
sse
::
Sub
<
float
>
(
n
,
a
,
b
,
y
);
sse
::
Sub
<
float
>
(
n
,
a
,
b
,
y
);
#else // naive implement
#else
for
(
int
i
=
0
;
i
<
n
;
++
i
)
y
[
i
]
=
a
[
i
]
-
b
[
i
];
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
#endif
for
(
int
i
=
0
;
i
<
n
;
++
i
)
y
[
i
]
=
a
[
i
]
-
b
[
i
];
#endif // WITH_SSE
}
}
template
<>
void
Mul
<
float
,
CPUContext
>
(
const
int
n
,
template
<>
void
Mul
<
float
,
CPUContext
>
(
const
int
n
,
...
@@ -156,9 +170,12 @@ template <> void Mul<float, CPUContext>(const int n,
...
@@ -156,9 +170,12 @@ template <> void Mul<float, CPUContext>(const int n,
float
*
y
)
{
float
*
y
)
{
#ifdef WITH_SSE
#ifdef WITH_SSE
sse
::
Mul
<
float
>
(
n
,
a
,
b
,
y
);
sse
::
Mul
<
float
>
(
n
,
a
,
b
,
y
);
#else // naive implement
#else
for
(
int
i
=
0
;
i
<
n
;
++
i
)
y
[
i
]
=
a
[
i
]
*
b
[
i
];
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
#endif
for
(
int
i
=
0
;
i
<
n
;
++
i
)
y
[
i
]
=
a
[
i
]
*
b
[
i
];
#endif // WITH_SSE
}
}
template
<>
void
Mul
<
float16
,
CPUContext
>
(
const
int
n
,
template
<>
void
Mul
<
float16
,
CPUContext
>
(
const
int
n
,
...
@@ -174,9 +191,12 @@ template <> void Div<float, CPUContext>(const int n,
...
@@ -174,9 +191,12 @@ template <> void Div<float, CPUContext>(const int n,
float
*
y
)
{
float
*
y
)
{
#ifdef WITH_SSE
#ifdef WITH_SSE
sse
::
Div
<
float
>
(
n
,
a
,
b
,
y
);
sse
::
Div
<
float
>
(
n
,
a
,
b
,
y
);
#else // naive implement
#else
for
(
int
i
=
0
;
i
<
n
;
++
i
)
y
[
i
]
=
a
[
i
]
/
b
[
i
];
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
#endif
for
(
int
i
=
0
;
i
<
n
;
++
i
)
y
[
i
]
=
a
[
i
]
/
b
[
i
];
#endif // WITH_SSE
}
}
template
<>
void
Div
<
float16
,
CPUContext
>
(
const
int
n
,
template
<>
void
Div
<
float16
,
CPUContext
>
(
const
int
n
,
...
@@ -190,6 +210,9 @@ template <> void Clip<float, CPUContext>(const int n,
...
@@ -190,6 +210,9 @@ template <> void Clip<float, CPUContext>(const int n,
const
float
low
,
const
float
low
,
const
float
high
,
const
float
high
,
float
*
x
)
{
float
*
x
)
{
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
x
[
i
]
=
std
::
max
(
low
,
std
::
min
(
x
[
i
],
high
));
x
[
i
]
=
std
::
max
(
low
,
std
::
min
(
x
[
i
],
high
));
}
}
...
@@ -198,6 +221,9 @@ template <> void Clip<float, CPUContext>(const int n,
...
@@ -198,6 +221,9 @@ template <> void Clip<float, CPUContext>(const int n,
template
<>
void
Exp
<
float
,
CPUContext
>
(
int
n
,
template
<>
void
Exp
<
float
,
CPUContext
>
(
int
n
,
const
float
*
x
,
const
float
*
x
,
float
*
y
)
{
float
*
y
)
{
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
std
::
exp
(
x
[
i
]);
y
[
i
]
=
std
::
exp
(
x
[
i
]);
}
}
...
@@ -206,6 +232,9 @@ template <> void Exp<float, CPUContext>(int n,
...
@@ -206,6 +232,9 @@ template <> void Exp<float, CPUContext>(int n,
template
<>
void
Log
<
float
,
CPUContext
>
(
int
n
,
template
<>
void
Log
<
float
,
CPUContext
>
(
int
n
,
const
float
*
x
,
const
float
*
x
,
float
*
y
)
{
float
*
y
)
{
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
std
::
log
(
x
[
i
]);
y
[
i
]
=
std
::
log
(
x
[
i
]);
}
}
...
@@ -214,6 +243,9 @@ template <> void Log<float, CPUContext>(int n,
...
@@ -214,6 +243,9 @@ template <> void Log<float, CPUContext>(int n,
template
<>
void
Square
<
float
,
CPUContext
>
(
int
n
,
template
<>
void
Square
<
float
,
CPUContext
>
(
int
n
,
const
float
*
x
,
const
float
*
x
,
float
*
y
)
{
float
*
y
)
{
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
x
[
i
]
*
x
[
i
];
y
[
i
]
=
x
[
i
]
*
x
[
i
];
}
}
...
@@ -228,6 +260,9 @@ template <> void Square<float16, CPUContext>(int n,
...
@@ -228,6 +260,9 @@ template <> void Square<float16, CPUContext>(int n,
template
<>
void
Sqrt
<
float
,
CPUContext
>
(
int
n
,
template
<>
void
Sqrt
<
float
,
CPUContext
>
(
int
n
,
const
float
*
x
,
const
float
*
x
,
float
*
y
)
{
float
*
y
)
{
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
std
::
sqrt
(
x
[
i
]);
y
[
i
]
=
std
::
sqrt
(
x
[
i
]);
}
}
...
@@ -243,6 +278,9 @@ template <> void Pow<float, CPUContext>(int n,
...
@@ -243,6 +278,9 @@ template <> void Pow<float, CPUContext>(int n,
const
float
alpha
,
const
float
alpha
,
const
float
*
x
,
const
float
*
x
,
float
*
y
)
{
float
*
y
)
{
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
std
::
pow
(
x
[
i
],
alpha
);
y
[
i
]
=
std
::
pow
(
x
[
i
],
alpha
);
}
}
...
@@ -259,6 +297,9 @@ template <> void Inv<float, CPUContext>(const int n,
...
@@ -259,6 +297,9 @@ template <> void Inv<float, CPUContext>(const int n,
const
float
numerator
,
const
float
numerator
,
const
float
*
x
,
const
float
*
x
,
float
*
y
)
{
float
*
y
)
{
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
1.0
/
y
[
i
];
y
[
i
]
=
1.0
/
y
[
i
];
}
}
...
@@ -280,9 +321,12 @@ template <> void Scal<float, CPUContext>(const int n,
...
@@ -280,9 +321,12 @@ template <> void Scal<float, CPUContext>(const int n,
cblas_sscal
(
n
,
alpha
,
y
,
1
);
cblas_sscal
(
n
,
alpha
,
y
,
1
);
#elif WITH_SSE
#elif WITH_SSE
sse
::
Scal
<
float
>
(
n
,
alpha
,
y
);
sse
::
Scal
<
float
>
(
n
,
alpha
,
y
);
#else // naive implement
#else
for
(
int
i
=
0
;
i
<
n
;
++
i
)
y
[
i
]
=
y
[
i
]
*
alpha
;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
#endif
for
(
int
i
=
0
;
i
<
n
;
++
i
)
y
[
i
]
=
y
[
i
]
*
alpha
;
#endif // WITH_BLAS
}
}
template
<>
void
Scal
<
float16
,
CPUContext
>
(
const
int
n
,
template
<>
void
Scal
<
float16
,
CPUContext
>
(
const
int
n
,
...
@@ -307,9 +351,12 @@ template <> void Scale<float, CPUContext>(const int n,
...
@@ -307,9 +351,12 @@ template <> void Scale<float, CPUContext>(const int n,
cblas_sscal
(
n
,
alpha
,
y
,
1
);
cblas_sscal
(
n
,
alpha
,
y
,
1
);
#elif WITH_SSE
#elif WITH_SSE
sse
::
Scale
<
float
>
(
n
,
alpha
,
x
,
y
);
sse
::
Scale
<
float
>
(
n
,
alpha
,
x
,
y
);
#else // naive implement
#else
for
(
int
i
=
0
;
i
<
n
;
++
i
)
y
[
i
]
=
x
[
i
]
*
alpha
;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
#endif
for
(
int
i
=
0
;
i
<
n
;
++
i
)
y
[
i
]
=
x
[
i
]
*
alpha
;
#endif // WITH_BLAS
}
}
template
<>
float
StridedDot
<
float
,
CPUContext
>
(
const
int
n
,
template
<>
float
StridedDot
<
float
,
CPUContext
>
(
const
int
n
,
...
@@ -319,11 +366,14 @@ template <> float StridedDot<float, CPUContext>(const int n,
...
@@ -319,11 +366,14 @@ template <> float StridedDot<float, CPUContext>(const int n,
const
int
incy
)
{
const
int
incy
)
{
#ifdef WITH_BLAS
#ifdef WITH_BLAS
return
cblas_sdot
(
n
,
a
,
incx
,
b
,
incy
);
return
cblas_sdot
(
n
,
a
,
incx
,
b
,
incy
);
#else
// naive implement
#else
float
ret
=
0.
f
;
float
ret
=
0.
f
;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for
(
int
i
=
0
;
i
<
n
;
++
i
)
ret
+=
a
[
i
]
*
b
[
i
];
for
(
int
i
=
0
;
i
<
n
;
++
i
)
ret
+=
a
[
i
]
*
b
[
i
];
return
ret
;
return
ret
;
#endif
#endif
// WITH_BLAS
}
}
template
<>
float
Dot
<
float
,
CPUContext
>
(
int
n
,
template
<>
float
Dot
<
float
,
CPUContext
>
(
int
n
,
...
@@ -333,11 +383,14 @@ template <> float Dot<float, CPUContext>(int n,
...
@@ -333,11 +383,14 @@ template <> float Dot<float, CPUContext>(int n,
return
StridedDot
<
float
,
CPUContext
>
(
n
,
a
,
1
,
b
,
1
);
return
StridedDot
<
float
,
CPUContext
>
(
n
,
a
,
1
,
b
,
1
);
#elif WITH_SSE
#elif WITH_SSE
return
sse
::
Dot
<
float
>
(
n
,
a
,
b
);
return
sse
::
Dot
<
float
>
(
n
,
a
,
b
);
#else
// naive implement
#else
float
ret
=
0.
f
;
float
ret
=
0.
f
;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for
(
int
i
=
0
;
i
<
n
;
++
i
)
ret
+=
a
[
i
]
*
b
[
i
];
for
(
int
i
=
0
;
i
<
n
;
++
i
)
ret
+=
a
[
i
]
*
b
[
i
];
return
ret
;
return
ret
;
#endif
#endif
// WITH_BLAS
}
}
template
<>
float
Dot
<
float16
,
CPUContext
>
(
int
n
,
template
<>
float
Dot
<
float16
,
CPUContext
>
(
int
n
,
...
@@ -352,11 +405,14 @@ template <> float ASum<float, CPUContext>(const int n, const float* x) {
...
@@ -352,11 +405,14 @@ template <> float ASum<float, CPUContext>(const int n, const float* x) {
return
cblas_sasum
(
n
,
x
,
1
);
return
cblas_sasum
(
n
,
x
,
1
);
#elif WITH_SSE
#elif WITH_SSE
return
sse
::
ASum
<
float
>
(
n
,
x
);
return
sse
::
ASum
<
float
>
(
n
,
x
);
#else
// naive implement
#else
float
ret
=
0.
f
;
float
ret
=
0.
f
;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for
(
int
i
=
0
;
i
<
n
;
++
i
)
ret
+=
x
[
i
];
for
(
int
i
=
0
;
i
<
n
;
++
i
)
ret
+=
x
[
i
];
return
ret
;
return
ret
;
#endif
#endif
// WITH_BLAS
}
}
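A note on the three naive reduction loops above (StridedDot, Dot, ASum): a bare parallel-for over "ret += ..." would race on ret, which is why the reconstructed pragmas carry a reduction clause. A minimal, self-contained sketch of the pattern, not taken from the repository:

    // Sum n floats in parallel; each thread accumulates a private partial sum
    // that OpenMP combines into ret when the loop finishes.
    float parallel_sum(const int n, const float* x) {
        float ret = 0.f;
    #ifdef WITH_OMP
    #pragma omp parallel for reduction(+:ret)
    #endif
        for (int i = 0; i < n; ++i) ret += x[i];
        return ret;
    }

Element-wise kernels such as AddScalar or Axpy do not need this, because every iteration writes a distinct y[i].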
template <> void AddScalar<float, CPUContext>(const int n,
...
@@ -364,9 +420,12 @@ template <> void AddScalar<float, CPUContext>(const int n,
                                              float* y) {
#ifdef WITH_SSE
    sse::AddScalar<float>(n, alpha, y);
#else
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
    for (int i = 0; i < n; ++i) y[i] += alpha;
#endif // WITH_SSE
}

template <> void AddScalar<float16, CPUContext>(const int n,
...
@@ -380,9 +439,12 @@ template <> void MulScalar<float, CPUContext>(const int n,
                                              float* y) {
#ifdef WITH_SSE
    sse::MulScalar<float>(n, alpha, y);
#else
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
    for (int i = 0; i < n; ++i) y[i] *= alpha;
#endif // WITH_SSE
}

template <> void Axpy<float, CPUContext>(const int n,
@@ -393,9 +455,12 @@ template <> void Axpy<float, CPUContext>(const int n,
...
@@ -393,9 +455,12 @@ template <> void Axpy<float, CPUContext>(const int n,
cblas_saxpy
(
n
,
alpha
,
x
,
1
,
y
,
1
);
cblas_saxpy
(
n
,
alpha
,
x
,
1
,
y
,
1
);
#elif WITH_SSE
#elif WITH_SSE
sse
::
Axpy
<
float
>
(
n
,
alpha
,
x
,
y
);
sse
::
Axpy
<
float
>
(
n
,
alpha
,
x
,
y
);
#else // naive implement
#else
for
(
int
i
=
0
;
i
<
n
;
++
i
)
y
[
i
]
=
alpha
*
x
[
i
]
+
y
[
i
];
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
#endif
for
(
int
i
=
0
;
i
<
n
;
++
i
)
y
[
i
]
=
alpha
*
x
[
i
]
+
y
[
i
];
#endif // WITH_BLAS
}
}
template
<>
void
Axpy
<
float16
,
CPUContext
>
(
const
int
n
,
template
<>
void
Axpy
<
float16
,
CPUContext
>
(
const
int
n
,
...
@@ -415,9 +480,12 @@ template <> void Axpby<float, CPUContext>(const int n,
...
@@ -415,9 +480,12 @@ template <> void Axpby<float, CPUContext>(const int n,
cblas_saxpy
(
n
,
alpha
,
x
,
1
,
y
,
1
);
cblas_saxpy
(
n
,
alpha
,
x
,
1
,
y
,
1
);
#elif WITH_SSE
#elif WITH_SSE
sse
::
Axpby
<
float
>
(
n
,
alpha
,
x
,
beta
,
y
);
sse
::
Axpby
<
float
>
(
n
,
alpha
,
x
,
beta
,
y
);
#else // naive implement
#else
for
(
int
i
=
0
;
i
<
n
;
++
i
)
y
[
i
]
=
alpha
*
x
[
i
]
+
beta
*
y
[
i
];
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
#endif
for
(
int
i
=
0
;
i
<
n
;
++
i
)
y
[
i
]
=
alpha
*
x
[
i
]
+
beta
*
y
[
i
];
#endif // WITH_BLAS
}
}
template
<>
void
Axpby
<
float16
,
CPUContext
>
(
const
int
n
,
template
<>
void
Axpby
<
float16
,
CPUContext
>
(
const
int
n
,
...
...
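All of the CPU paths above follow the same recipe: guard the pragma behind WITH_OMP and size the thread team with GET_OMP_THREADS(n) from the new utils/omp_alternative.h. That header is not part of the hunks shown here, so the following is only a sketch of what such a helper could look like; the constant and the scaling rule are assumptions, not Dragon's actual definition:

    #ifdef WITH_OMP
    #include <omp.h>
    #include <algorithm>

    // Hypothetical helper: ask for more threads only when the element count is
    // large enough to amortize the fork/join overhead of the parallel region.
    inline int GET_OMP_THREADS(const int n) {
        const int kMinElementsPerThread = 1 << 16;   // assumed tuning constant
        int threads = std::max(n / kMinElementsPerThread, 1);
        return std::min(threads, omp_get_max_threads());
    }
    #endif  // WITH_OMP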
Dragon/src/utils/math_functions.cu  View file @ 6eeac5f
...
@@ -40,6 +40,7 @@ template <> void Set<int, CUDAContext>(const int n,
    _Set<int> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, alpha, x);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _SetHalf2(const int n, const half2 alpha, half2* x) {
    CUDA_KERNEL_LOOP(idx, n) {
...
@@ -61,6 +62,7 @@ template <> void Set<float16, CUDAContext>(const int n,
        _Set<float16> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, alpha, x);
    }
}
#endif
template <> void RandomUniform<uint32_t, CUDAContext>(const int n,
                                                      const float low,
...
@@ -144,6 +146,7 @@ template <> void Mul<float, CUDAContext>(int n,
    _Mul<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, a, b, y);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _MulHalf(const int n, const half* a, const half* b, half* y) {
    CUDA_KERNEL_LOOP(idx, n) {
...
@@ -176,6 +179,7 @@ template <> void Mul<float16, CUDAContext>(int n,
                        reinterpret_cast<const half*>(b),
                        reinterpret_cast<half*>(y));
}
#endif
template <typename T>
__global__ void _Div(const int n, const T* a, const T* b, T* y) {
...
@@ -191,6 +195,7 @@ template <> void Div<float, CUDAContext>(int n,
    _Div<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, a, b, y);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _DivHalf(const int n, const half* a, const half* b, half* y) {
    CUDA_KERNEL_LOOP(idx, n) {
...
@@ -209,6 +214,7 @@ template <> void Div<float16, CUDAContext>(int n,
                        reinterpret_cast<const half*>(b),
                        reinterpret_cast<half*>(y));
}
#endif
template <typename T>
__global__ void _Clip(const int n, const T low, const T high, T* x) {
...
@@ -260,6 +266,7 @@ template <> void Square<float, CUDAContext>(int n,
    _Square<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, x, y);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _SquareHalf(const int n, const half* x, half* y) {
    CUDA_KERNEL_LOOP(idx, n) {
...
@@ -290,6 +297,7 @@ template <> void Square<float16, CUDAContext>(int n,
                        reinterpret_cast<half*>(y));
    CUDA_POST_KERNEL_CHECK;
}
#endif
template <typename T>
__global__ void _Sqrt(const int n, const T* x, T* y) {
...
@@ -304,6 +312,7 @@ template <> void Sqrt<float, CUDAContext>(int n,
    _Sqrt<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, x, y);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _SqrtHalf(const int n, const half* x, half* y) {
    CUDA_KERNEL_LOOP(idx, n) {
...
@@ -334,6 +343,7 @@ template <> void Sqrt<float16, CUDAContext>(int n,
                        reinterpret_cast<half*>(y));
    CUDA_POST_KERNEL_CHECK;
}
#endif
template <typename T>
__global__ void _Pow(const int n, const T alpha, const T* a, T* y) {
...
@@ -349,6 +359,7 @@ template <> void Pow<float, CUDAContext>(int n,
    _Pow<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, alpha, x, y);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _PowHalf(const int n, const float alpha, const half* a, half* y) {
    CUDA_KERNEL_LOOP(idx, n) {
...
@@ -384,6 +395,7 @@ template <> void Pow<float16, CUDAContext>(int n,
                        reinterpret_cast<half*>(y));
    CUDA_POST_KERNEL_CHECK;
}
#endif
template <typename T>
__global__ void _Inv(const int n, const float numerator, const T* x, T* y) {
...
@@ -399,6 +411,7 @@ template <> void Inv<float, CUDAContext>(const int n,
    _Inv<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, numerator, x, y);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _InvHalf(const int n, const half numerator, const half* x, half* y) {
    CUDA_KERNEL_LOOP(idx, n) {
...
@@ -439,6 +452,7 @@ template <> void Inv<float16, CUDAContext>(const int n,
    }
    CUDA_POST_KERNEL_CHECK;
}
#endif
/******************** Level-2 ********************/
...
@@ -518,6 +532,7 @@ template <> void AddScalar<float, CUDAContext>(const int n, const float alpha, f
    _AddScalar<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, alpha, y);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _AddScalarHalf(const int n, half alpha, half* y) {
    CUDA_KERNEL_LOOP(idx, n) {
...
@@ -552,6 +567,7 @@ template <> void AddScalar<float16, CUDAContext>(const int n, const float alpha,
    }
    CUDA_POST_KERNEL_CHECK;
}
#endif
template <typename T>
__global__ void _MulScalar(const int n, T alpha, T* y) {
...
@@ -641,6 +657,7 @@ template <> void Gemm<float, CUDAContext>(const CBLAS_TRANSPOSE transA,
                        C, N));
}
#ifdef WITH_CUDA_FP16
template <> void Gemm<float16, CUDAContext>(const CBLAS_TRANSPOSE transA,
                                            const CBLAS_TRANSPOSE transB,
                                            const int M,
...
@@ -682,6 +699,7 @@ template <> void Gemm<float16, CUDAContext>(const CBLAS_TRANSPOSE transA,
        LOG(FATAL) << "unsupported math type";
    }
}
#endif
template <> void Gemv<float, CUDAContext>(const CBLAS_TRANSPOSE transA,
                                          const int M, const int N,
...
@@ -702,6 +720,7 @@ template <> void Gemv<float, CUDAContext>(const CBLAS_TRANSPOSE transA,
                        y, 1));
}
#ifdef WITH_CUDA_FP16
template <> void Gemv<float16, CUDAContext>(const CBLAS_TRANSPOSE transA,
                                            const int M,
                                            const int N,
...
@@ -742,6 +761,7 @@ template <> void Gemv<float16, CUDAContext>(const CBLAS_TRANSPOSE transA,
        LOG(FATAL) << "unsupported math type";
    }
}
#endif
} // namespace math
...
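The launches in math_functions.cu rely on GET_BLOCKS and CUDA_NUM_THREADS, which are defined elsewhere in Dragon. For orientation, a Caffe-style host-side definition usually looks like the sketch below; this is an assumption, not the project's actual header:

    // Assumed launch-configuration helper: one thread per element, rounded up
    // to whole blocks of CUDA_NUM_THREADS threads.
    const int CUDA_NUM_THREADS = 1024;

    inline int GET_BLOCKS(const int n) {
        return (n + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
    }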
Dragon/src/utils/op_kernel.cc  View file @ 6eeac5f
...
@@ -3,11 +3,9 @@
#include "core/tensor.h"
#include "utils/op_kernel.h"
#include "utils/omp_alternative.h"
#include "utils/sse_alternative.h"
#include "utils/math_functions.h"

bool judge(int a, int b) { return unsigned(a) < unsigned(b); }
...
@@ -28,8 +26,10 @@ template<> void Dropout<float, CPUContext>(const int count,
                                           CPUContext* context) {
    uint32_t thresh = static_cast<uint32_t>(UINT_MAX * prob);
    math::RandomBernoulli<float, CPUContext>(count, 1 - prob, mask);
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
    for (int i = 0; i < count; ++i)
        y[i] = x[i] * mask[i] * scale;
}

template<> void DropoutGrad<float, CPUContext>(const int count,
...
@@ -38,8 +38,10 @@ template<> void DropoutGrad<float, CPUContext>(const int count,
                                               const float* dy,
                                               const uint32_t* mask,
                                               float* dx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
    for (int i = 0; i < count; ++i)
        dx[i] = dy[i] * mask[i] * scale;
}
/******************** activation.relu ********************/
...
@@ -48,6 +50,9 @@ template<> void Relu<float, CPUContext>(const int count,
                                        const float* x,
                                        const float slope,
                                        float* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
    for (int i = 0; i < count; ++i) {
        y[i] = std::max(x[i], 0.f) + slope * std::min(x[i], 0.f);
    }
...
@@ -58,10 +63,12 @@ template<> void ReluGrad<float, CPUContext>(const int count,
                                            const float* y,
                                            const float slope,
                                            float* dx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
    for (int i = 0; i < count; ++i) {
        dx[i] = dy[i] * ((y[i] > 0) + slope * (y[i] <= 0));
    }
}

/******************** activation.sigmoid ********************/
...
@@ -70,15 +77,19 @@ template <typename T>
T _sigmoid(T x) { return T(1) / (T(1) + exp(-x)); }

template <> void Sigmoid<float, CPUContext>(const int count, const float* x, float* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
    for (int i = 0; i < count; ++i)
        y[i] = _sigmoid<float>(x[i]);
}

template <> void SigmoidGrad<float, CPUContext>(const int count,
                                                const float* dy,
                                                const float* y,
                                                float* dx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
    for (int i = 0; i < count; ++i) {
        dx[i] = dy[i] * y[i] * (1 - y[i]);
    }
...
@@ -149,6 +160,9 @@ template<> void SoftmaxGrad<float, CPUContext>(const int count,
/******************** activation.tanh ********************/

template <> void Tanh<float, CPUContext>(const int count, const float* x, float* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
    for (int i = 0; i < count; ++i) {
        y[i] = std::tanh(x[i]);
    }
...
@@ -158,6 +172,9 @@ template<> void TanhGrad<float, CPUContext>(const int count,
                                            const float* dy,
                                            const float* y,
                                            float* dx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
    for (int i = 0; i < count; ++i) {
        dx[i] = dy[i] * (1 - y[i] * y[i]);
    }
...
@@ -197,6 +214,9 @@ template <> void Clip<float, CPUContext>(const int count,
                                         const float* x,
                                         float* mask,
                                         float* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
    for (int i = 0; i < count; ++i) {
        mask[i] = 1.0;
        if (x[i] < low || x[i] > high) mask[i] = 0.0;
...
@@ -300,8 +320,10 @@ template<> void Argmax<float, CPUContext>(const int count,

/******************** common.at ********************/

template <> void CanonicalAxis<float, CPUContext>(const int count, const int dim, float* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
    for (int i = 0; i < count; ++i)
        if (y[i] < 0) y[i] += dim;
}

template <> void At<float, CPUContext>(const int count,
...
@@ -478,6 +500,9 @@ template<> void Sum<float, CPUContext>(const int count,
int
inner_dim
,
const
int
inner_dim
,
const
float
*
x
,
const
float
*
x
,
float
*
y
)
{
float
*
y
)
{
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for
(
int
i
=
0
;
i
<
count
;
++
i
)
{
for
(
int
i
=
0
;
i
<
count
;
++
i
)
{
float
sum_val
=
0.0
;
float
sum_val
=
0.0
;
for
(
int
j
=
0
;
j
<
axis_dim
;
++
j
)
for
(
int
j
=
0
;
j
<
axis_dim
;
++
j
)
...
@@ -492,6 +517,9 @@ template<> void SumGrad<float, CPUContext>(const int count,
...
@@ -492,6 +517,9 @@ template<> void SumGrad<float, CPUContext>(const int count,
const
float
coeff
,
const
float
coeff
,
const
float
*
dy
,
const
float
*
dy
,
float
*
dx
)
{
float
*
dx
)
{
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for
(
int
i
=
0
;
i
<
count
;
++
i
)
{
for
(
int
i
=
0
;
i
<
count
;
++
i
)
{
for
(
int
j
=
0
;
j
<
axis_dim
;
++
j
)
for
(
int
j
=
0
;
j
<
axis_dim
;
++
j
)
dx
[(
i
/
inner_dim
*
axis_dim
+
j
)
*
inner_dim
+
i
%
inner_dim
]
=
dy
[
i
]
*
coeff
;
dx
[(
i
/
inner_dim
*
axis_dim
+
j
)
*
inner_dim
+
i
%
inner_dim
]
=
dy
[
i
]
*
coeff
;
...
@@ -585,6 +613,9 @@ template <> void Transpose<float, CPUContext>(const int count,
...
@@ -585,6 +613,9 @@ template <> void Transpose<float, CPUContext>(const int count,
const
int
*
new_steps
,
const
int
*
new_steps
,
const
float
*
x
,
const
float
*
x
,
float
*
y
)
{
float
*
y
)
{
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for
(
int
i
=
0
;
i
<
count
;
++
i
)
{
for
(
int
i
=
0
;
i
<
count
;
++
i
)
{
int
x_idx
=
0
,
y_idx
=
i
;
int
x_idx
=
0
,
y_idx
=
i
;
for
(
int
j
=
0
;
j
<
ndim
;
++
j
)
{
for
(
int
j
=
0
;
j
<
ndim
;
++
j
)
{
...
@@ -603,15 +634,7 @@ template <> void Transpose<float16, CPUContext>(const int count,
...
@@ -603,15 +634,7 @@ template <> void Transpose<float16, CPUContext>(const int count,
const
int
*
new_steps
,
const
int
*
new_steps
,
const
float16
*
x
,
const
float16
*
x
,
float16
*
y
)
{
float16
*
y
)
{
for
(
int
i
=
0
;
i
<
count
;
++
i
)
{
LOG
(
FATAL
)
<<
"unsupport float16 with CPU"
;
int
x_idx
=
0
,
y_idx
=
i
;
for
(
int
j
=
0
;
j
<
ndim
;
++
j
)
{
int
k
=
order
[
j
];
x_idx
+=
(
y_idx
/
new_steps
[
j
])
*
old_steps
[
k
];
y_idx
%=
new_steps
[
j
];
}
y
[
i
]
=
x
[
x_idx
];
}
}
}
template
<>
void
TransposeGrad
<
float
,
CPUContext
>
(
const
int
count
,
template
<>
void
TransposeGrad
<
float
,
CPUContext
>
(
const
int
count
,
...
@@ -621,6 +644,9 @@ template <> void TransposeGrad<float, CPUContext>(const int count,
...
@@ -621,6 +644,9 @@ template <> void TransposeGrad<float, CPUContext>(const int count,
const
int
*
new_steps
,
const
int
*
new_steps
,
const
float
*
dy
,
const
float
*
dy
,
float
*
dx
)
{
float
*
dx
)
{
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for
(
int
i
=
0
;
i
<
count
;
++
i
)
{
for
(
int
i
=
0
;
i
<
count
;
++
i
)
{
int
x_idx
=
0
,
y_idx
=
i
;
int
x_idx
=
0
,
y_idx
=
i
;
for
(
int
j
=
0
;
j
<
ndim
;
++
j
)
{
for
(
int
j
=
0
;
j
<
ndim
;
++
j
)
{
...
@@ -639,20 +665,15 @@ template <> void TransposeGrad<float16, CPUContext>(const int count,
...
@@ -639,20 +665,15 @@ template <> void TransposeGrad<float16, CPUContext>(const int count,
const
int
*
new_steps
,
const
int
*
new_steps
,
const
float16
*
dy
,
const
float16
*
dy
,
float16
*
dx
)
{
float16
*
dx
)
{
for
(
int
i
=
0
;
i
<
count
;
++
i
)
{
LOG
(
FATAL
)
<<
"unsupport float16 with CPU"
;
int
x_idx
=
0
,
y_idx
=
i
;
for
(
int
j
=
0
;
j
<
ndim
;
++
j
)
{
int
k
=
order
[
j
];
x_idx
+=
(
y_idx
/
new_steps
[
j
])
*
old_steps
[
k
];
y_idx
%=
new_steps
[
j
];
}
dx
[
x_idx
]
=
dy
[
i
];
}
}
}
/******************** loss.l1_loss ********************/

template <> void AbsGrad<float, CPUContext>(const int count, const float* dy, float* dx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
    for (int i = 0; i < count; ++i) {
        const float val = dy[i];
        // val > 0: 1 | val == 0: 0 | val < 0: -1
...
@@ -666,6 +687,9 @@ template <> void SigmoidCrossEntropy<float, CPUContext>(const int count,
                                                        const float* x,
                                                        const float* target,
                                                        float* loss) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
    for (int i = 0; i < count; ++i) {
        loss[i] = std::log(1 + std::exp(x[i] - 2 * x[i] * (x[i] >= 0)))
                + x[i] * ((x[i] >= 0) - target[i]);
...
@@ -678,6 +702,9 @@ template<> void SmoothL1<float, CPUContext>(const int count,
                                            const float sigma2,
                                            const float* x,
                                            float* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
    for (int i = 0; i < count; ++i) {
        const float val = x[i];
        const float abs_val = abs(val);
...
@@ -690,6 +717,9 @@ template<> void SmoothL1Grad<float, CPUContext>(const int count,
                                                const float sigma2,
                                                const float* dy,
                                                float* dx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
    for (int i = 0; i < count; ++i) {
        const float val = dy[i];
        const float abs_val = abs(val);
...
@@ -705,6 +735,9 @@ template <> void SoftmaxCrossEntropy<float, CPUContext>(const int count,
                                                        const float* prob,
                                                        const float* target,
                                                        float* loss) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
    for (int i = 0; i < count; ++i) {
        loss[i] = -target[i] * std::log(std::max(prob[i], FLT_MIN));
    }
...
@@ -1019,6 +1052,9 @@ template <> void Equal<float, CPUContext>(const int count,
                                          const float* a,
                                          const float* b,
                                          float* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
    for (int i = 0; i < count; ++i)
        y[i] = fabs(a[i] - b[i]) < FLT_EPSILON ? 1.0 : 0.0;
}
...
@@ -1096,6 +1132,9 @@ template <> void OneHot<float, CPUContext>(const int count,
                                           const int on_value,
                                           const float* x,
                                           float* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
    for (int i = 0; i < count; ++i) {
        const int val = x[i];
        y[i * depth + val] = on_value;
...
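Taken together, the op_kernel.cc changes apply one mechanical recipe to every element-wise CPU kernel: an optional, compile-time guarded parallel for. A self-contained sketch of that recipe, with illustrative names rather than Dragon's:

    #include <algorithm>

    // Element-wise leaky ReLU over count values; the loop runs in parallel only
    // when the project is built with WITH_OMP, otherwise the guard compiles away.
    void relu_cpu(const int count, const float* x, const float slope, float* y) {
    #ifdef WITH_OMP
    #pragma omp parallel for
    #endif
        for (int i = 0; i < count; ++i)
            y[i] = std::max(x[i], 0.f) + slope * std::min(x[i], 0.f);
    }

Each iteration writes its own y[i], so no reduction or locking is required for kernels of this shape.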
Dragon/src/utils/op_kernel.cu  View file @ 6eeac5f
...
@@ -21,7 +21,7 @@ template<> void Empty<float, CUDAContext>() {
}
template<> void Empty<float16, CUDAContext>() {
    _Empty<float16> << <1, 1 >> >();
    CUDA_POST_KERNEL_CHECK;
}
...
@@ -102,6 +102,7 @@ template<> void Relu<float, CUDAContext>(const int count,
    CUDA_POST_KERNEL_CHECK;
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _ReluHalf(const int count, const half* x, const float slope, half* y) {
    const half kSlope = __float2half(slope);
...
@@ -123,6 +124,7 @@ template<> void Relu<float16, CUDAContext>(const int count,
                        reinterpret_cast<half*>(y));
    CUDA_POST_KERNEL_CHECK;
}
#endif
template <typename T>
__global__ void _ReluGrad(const int count,
...
@@ -477,6 +479,7 @@ template<> void Scale<float, CUDAContext>(const int axis,
                        Ydata);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _ScaleWithoutBiasHalf(const int n,
                                      const half* x,
...
@@ -538,6 +541,7 @@ template<> void Scale<float16, CUDAContext>(const int axis,
                        inner_dim,
                        reinterpret_cast<half*>(Ydata));
}
#endif
template <> void ScaleGrad<float, CUDAContext>(const int axis,
                                               Tensor* dy,
...
@@ -730,6 +734,7 @@ template <> void Concat<float, CUDAContext>(const int count,
    CUDA_POST_KERNEL_CHECK;
}
#ifdef WITH_CUDA_FP16
template <> void Concat<float16, CUDAContext>(const int count,
                                              const int outer_dim,
                                              const int inner_dim,
...
@@ -749,6 +754,7 @@ template <> void Concat<float16, CUDAContext>(const int count,
                        reinterpret_cast<half*>(y));
    CUDA_POST_KERNEL_CHECK;
}
#endif
template <typename T>
__global__ void _ConcatGrad(const int count,
...
@@ -789,6 +795,7 @@ template <> void ConcatGrad<float, CUDAContext>(const int count,
    CUDA_POST_KERNEL_CHECK;
}
#ifdef WITH_CUDA_FP16
template <> void ConcatGrad<float16, CUDAContext>(const int count,
                                                  const int outer_dim,
                                                  const int inner_dim,
...
@@ -808,6 +815,7 @@ template <> void ConcatGrad<float16, CUDAContext>(const int count,
                        reinterpret_cast<half*>(dx));
    CUDA_POST_KERNEL_CHECK;
}
#endif
/******************** common.crop ********************/
...
@@ -1134,6 +1142,7 @@ template <> void Transpose<float, CUDAContext>(const int count,
    CUDA_POST_KERNEL_CHECK;
}
#ifdef WITH_CUDA_FP16
template <> void Transpose<float16, CUDAContext>(const int count,
                                                 const int ndim,
                                                 const int* order,
...
@@ -1150,6 +1159,7 @@ template <> void Transpose<float16, CUDAContext>(const int count,
                        reinterpret_cast<half*>(y));
    CUDA_POST_KERNEL_CHECK;
}
#endif
template <typename T>
__global__ void _TransposeGrad(const int count,
...
@@ -1187,6 +1197,7 @@ template <> void TransposeGrad<float, CUDAContext>(const int count,
    CUDA_POST_KERNEL_CHECK;
}
#ifdef WITH_CUDA_FP16
template <> void TransposeGrad<float16, CUDAContext>(const int count,
                                                     const int ndim,
                                                     const int* order,
...
@@ -1203,6 +1214,7 @@ template <> void TransposeGrad<float16, CUDAContext>(const int count,
                        reinterpret_cast<half*>(dx));
    CUDA_POST_KERNEL_CHECK;
}
#endif
/******************** loss.l1_loss ********************/
...
@@ -1834,6 +1846,7 @@ template <> void RMSPropUpdate<float, CUDAContext>(const int count,
/******************** utils.cast ********************/
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _FloatToHalfKernel(const int count, const float* x, half* y) {
    CUDA_KERNEL_LOOP(idx, count) {
...
@@ -1849,6 +1862,7 @@ template <> void Float2Half<float, CUDAContext>(const int count,
                        reinterpret_cast<half*>(y));
    CUDA_POST_KERNEL_CHECK;
}
#endif
/******************** utils.compare ********************/
...
@@ -1943,6 +1957,7 @@ template <> void MemoryData<uint8_t, float, CUDAContext>(const int count,
    CUDA_POST_KERNEL_CHECK;
}
#ifdef WITH_CUDA_FP16
template <> void MemoryData<float, float16, CUDAContext>(const int count,
                                                         const int num,
                                                         const int channels,
...
@@ -1976,6 +1991,7 @@ template <> void MemoryData<uint8_t, float16, CUDAContext>(const int count,
                        reinterpret_cast<half*>(y));
    CUDA_POST_KERNEL_CHECK;
}
#endif
/******************** utils.one_hot ********************/
...
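CUDA_POST_KERNEL_CHECK appears after most launches above; its definition lives elsewhere in Dragon. A typical shape for such a macro, sketched on top of the standard runtime API and not copied from the project, is:

    #include <cuda_runtime.h>
    #include <cstdio>
    #include <cstdlib>

    // Assumed error-check macro: surface asynchronous launch failures right after
    // the launch instead of at the next synchronizing call.
    #define CUDA_POST_KERNEL_CHECK                                          \
        do {                                                                \
            cudaError_t err = cudaPeekAtLastError();                        \
            if (err != cudaSuccess) {                                       \
                std::fprintf(stderr, "CUDA kernel failed: %s\n",            \
                             cudaGetErrorString(err));                      \
                std::abort();                                               \
            }                                                               \
        } while (0)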
Dragon/src/utils/sse_alternative.cc  View file @ 6eeac5f
...
@@ -3,27 +3,40 @@
#include <cmath>
#include <algorithm>
#include "utils/omp_alternative.h"
#include "utils/sse_alternative.h"

namespace dragon {

namespace sse {

    template <> void Set(const int n, const float alpha, float* x) {
        __m128 scalar = SSE_FP32_SCALAR(alpha);
        int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
        SSE_LOOP1(i, n) SSE_FP32_STORE(x + i, scalar);
        SSE_LOOP2(i, n) x[i] = alpha;
    }

    template <> void Set(const int n, const int alpha, int* x) {
        __m128i scalar = SSE_INT32_SCALAR(alpha);
        __m128i* x1 = reinterpret_cast<__m128i*>(x);
        int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
        SSE_LOOP1(i, n) SSE_INT128_STORE(x1++, scalar);
        SSE_LOOP2(i, n) x[i] = alpha;
    }

    template <> void Add(const int n, const float* a, const float* b, float* y) {
        __m128 x1, y1, z1;
        int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
        SSE_LOOP1(i, n) {
            x1 = SSE_FP32_LOAD(a + i);
            y1 = SSE_FP32_LOAD(b + i);
...
@@ -31,10 +44,14 @@ namespace sse {
            SSE_FP32_STORE(y + i, z1);
        }
        SSE_LOOP2(i, n) y[i] = a[i] + b[i];
    }
    template <> void Sub(const int n, const float* a, const float* b, float* y) {
        __m128 x1, y1, z1;
        int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
        SSE_LOOP1(i, n) {
            x1 = SSE_FP32_LOAD(a + i);
            y1 = SSE_FP32_LOAD(b + i);
...
@@ -42,10 +59,14 @@ namespace sse {
            SSE_FP32_STORE(y + i, z1);
        }
        SSE_LOOP2(i, n) y[i] = a[i] - b[i];
    }

    template <> void Mul(const int n, const float* a, const float* b, float* y) {
        __m128 x1, y1, z1;
        int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
        SSE_LOOP1(i, n) {
            x1 = SSE_FP32_LOAD(a + i);
            y1 = SSE_FP32_LOAD(b + i);
...
@@ -53,10 +74,14 @@ namespace sse {
            SSE_FP32_STORE(y + i, z1);
        }
        SSE_LOOP2(i, n) y[i] = a[i] * b[i];
    }

    template <> void Div(const int n, const float* a, const float* b, float* y) {
        __m128 x1, y1, z1;
        int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
        SSE_LOOP1(i, n) {
            x1 = SSE_FP32_LOAD(a + i);
            y1 = SSE_FP32_LOAD(b + i);
...
@@ -64,31 +89,42 @@ namespace sse {
            SSE_FP32_STORE(y + i, z1);
        }
        SSE_LOOP2(i, n) y[i] = a[i] / b[i];
    }

    template <> void Scal(const int n, const float alpha, float* y) {
        __m128 y1, scalar = SSE_FP32_SCALAR(alpha);
        int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
        SSE_LOOP1(i, n) {
            y1 = SSE_FP32_LOAD(y + i);
            y1 = SSE_FP32_MUL(y1, scalar);
            SSE_FP32_STORE(y + i, y1);
        }
        SSE_LOOP2(i, n) y[i] *= alpha;
    }

    template <> void Scale(const int n, const float alpha, const float* x, float* y) {
        __m128 x1, scalar = SSE_FP32_SCALAR(alpha);
        int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
        SSE_LOOP1(i, n) {
            x1 = SSE_FP32_LOAD(x + i);
            x1 = SSE_FP32_MUL(x1, scalar);
            SSE_FP32_STORE(y + i, x1);
        }
        SSE_LOOP2(i, n) y[i] = x[i] * alpha;
    }

    template <> void Axpy(const int n, float alpha, const float* x, float* y) {
        __m128 x1, y1, scalar = SSE_FP32_SCALAR(alpha);
        int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
        SSE_LOOP1(i, n) {
            x1 = SSE_FP32_LOAD(x + i);
            y1 = SSE_FP32_LOAD(y + i);
...
@@ -97,13 +133,20 @@ namespace sse {
            SSE_FP32_STORE(y + i, y1);
        }
        SSE_LOOP2(i, n) y[i] = alpha * x[i] + y[i];
    }
    template <> void Axpby(const int n,
                           float alpha,
                           const float* x,
                           const float beta,
                           float* y) {
        __m128 x1, y1, z1;
        __m128 scalar1 = SSE_FP32_SCALAR(alpha);
        __m128 scalar2 = SSE_FP32_SCALAR(beta);
        int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
        SSE_LOOP1(i, n) {
            x1 = SSE_FP32_LOAD(x + i);
            y1 = SSE_FP32_LOAD(y + i);
...
@@ -113,10 +156,14 @@ namespace sse {
            SSE_FP32_STORE(y + i, z1);
        }
        SSE_LOOP2(i, n) y[i] = alpha * x[i] + beta * y[i];
    }

    template <> float ASum(const int n, const float* x) {
        __m128 x1, sum = SSE_FP32_ZERO;
        int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
        SSE_LOOP1(i, n) {
            x1 = SSE_FP32_LOAD(x + i);
            sum = SSE_FP32_ADD(sum, x1);
...
@@ -126,30 +173,42 @@ namespace sse {
        float ret = buf[0] + buf[1] + buf[2] + buf[3];
        SSE_LOOP2(i, n) ret += x[i];
        return ret;
    }

    template <> void AddScalar(const int n, const float alpha, float* y) {
        __m128 y1, scalar = SSE_FP32_SCALAR(alpha);
        int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
        SSE_LOOP1(i, n) {
            y1 = SSE_FP32_LOAD(y + i);
            y1 = SSE_FP32_ADD(y1, scalar);
            SSE_FP32_STORE(y + i, y1);
        }
        SSE_LOOP2(i, n) y[i] += alpha;
    }

    template <> void MulScalar(const int n, const float alpha, float* y) {
        __m128 y1, scalar = SSE_FP32_SCALAR(alpha);
        int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
        SSE_LOOP1(i, n) {
            y1 = SSE_FP32_LOAD(y + i);
            y1 = SSE_FP32_MUL(y1, scalar);
            SSE_FP32_STORE(y + i, y1);
        }
        SSE_LOOP2(i, n) y[i] *= alpha;
    }

    template <> float Dot(const int n, const float* a, const float* b) {
        __m128 x1, y1, sum = SSE_FP32_ZERO;
        int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
        SSE_LOOP1(i, n) {
            x1 = SSE_FP32_LOAD(a + i);
            y1 = SSE_FP32_LOAD(b + i);
...
@@ -160,7 +219,7 @@ namespace sse {
        float ret = buf[0] + buf[1] + buf[2] + buf[3];
        SSE_LOOP2(i, n) ret += a[i] * b[i];
        return ret;
    }
}    // namespace sse
...
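Every kernel in sse_alternative.cc splits its work into a vector main loop (SSE_LOOP1) and a scalar remainder loop (SSE_LOOP2); both macros live in sse_device.h and are not shown in this commit. A plain-intrinsics sketch of the same split, with names of my own choosing, would be:

    #include <xmmintrin.h>   // SSE: _mm_set1_ps / _mm_loadu_ps / _mm_mul_ps / _mm_storeu_ps

    // y[i] = x[i] * alpha: four floats per iteration, then a scalar tail for
    // the last n % 4 elements.
    void scale_sse(const int n, const float alpha, const float* x, float* y) {
        const __m128 scalar = _mm_set1_ps(alpha);
        int i = 0;
        for (; i + 4 <= n; i += 4) {
            __m128 v = _mm_loadu_ps(x + i);
            _mm_storeu_ps(y + i, _mm_mul_ps(v, scalar));
        }
        for (; i < n; ++i) y[i] = x[i] * alpha;   // scalar remainder
    }

The remainder loop has to resume from wherever the vector loop stopped, which is why the original code threads the same index i through SSE_LOOP1 and SSE_LOOP2.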
README.md  View file @ 6eeac5f
# Dragon: A Computation Graph Virtual Machine Based Deep Learning Framework
![](http://images.cnblogs.com/cnblogs_com/neopenx/690760/o_dragon_logo.png)
-----
### Compile Requirements for C++
0. Google Protocol Buffer
...