SeetaResearch / Dragon
Commit 6eeac5fe authored Aug 26, 2017 by Ting PAN
add omp optimization
1 parent 007d9c21
Showing 27 changed files with 580 additions and 232 deletions
Dragon/CMakeLists.txt
Dragon/include/utils/omp_alternative.h
Dragon/include/utils/sse_device.h
Dragon/src/operators/arithmetic/add_op.cc
Dragon/src/operators/arithmetic/div_op.cc
Dragon/src/operators/arithmetic/dot_op.cc
Dragon/src/operators/arithmetic/eltwise_op.cc
Dragon/src/operators/arithmetic/gram_matrix_op.cc
Dragon/src/operators/arithmetic/matmul_op.cc
Dragon/src/operators/arithmetic/mul_op.cc
Dragon/src/operators/arithmetic/pow_op.cc
Dragon/src/operators/arithmetic/scale_op.cc
Dragon/src/operators/arithmetic/sub_op.cc
Dragon/src/operators/common/concat_op.cc
Dragon/src/operators/common/transpose_op.cc
Dragon/src/operators/norm/batch_norm_op.cc
Dragon/src/operators/norm/l2_norm_op.cc
Dragon/src/operators/utils/cast_op.cpp
Dragon/src/operators/utils/gradient_op.cc
Dragon/src/operators/utils/memory_data_op.cc
Dragon/src/operators/vision/dense_concat_op.cc
Dragon/src/utils/math_functions.cc
Dragon/src/utils/math_functions.cu
Dragon/src/utils/op_kernel.cc
Dragon/src/utils/op_kernel.cu
Dragon/src/utils/sse_alternative.cc
README.md
Dragon/CMakeLists.txt

@@ -12,6 +12,7 @@ option(WITH_PYTHON3 "Set ON to use PYTHON3 otherwise PYTHON2" OF
 option(WITH_CUDA      "Set ON to use CUDA"      ON)
 option(WITH_CUDNN     "Set ON to use CUDNN"     OFF)
 option(WITH_BLAS      "Set ON to use BLAS"      OFF)
+option(WITH_OMP       "Set ON to use OpenMP"    OFF)
 option(WITH_SSE       "Set ON to use SSE 4.1"   ON)
 option(WITH_MPI       "Set ON to use MPI"       OFF)
 option(WITH_MPI_CUDA  "Set ON to use MPI-CUDA"  OFF)

@@ -22,7 +23,7 @@ option(WITH_CUDA_FP16 "Set ON to use FP16" ON)
 set(3RDPARTY_DIR ${PROJECT_SOURCE_DIR}/../3rdparty)
 # set your python environment
 set(PYTHON_DIR /usr/include/python2.7)   # preferred
 #set(PYTHON_DIR /usr/include/python3.x)  # optional, set specific version
 #set(ANACONDA_DIR /xxx/anaconda)         # optional, root folder of anaconda, preset for 2.7, 3.5, and 3.6
 set(NUMPY_DIR /xxx/numpy)                # required, root folder of numpy package

@@ -118,6 +119,10 @@ else()
            "\n -- > GEMM/GEMV is disabled"
            "\n -- > prefer not to run as CPU Mode")
 endif()
+if (WITH_OMP)
+    ADD_DEFINITIONS(-DWITH_OMP)
+    message(STATUS "Use OpenMP [Optional]")
+endif()
 if (WITH_SSE)
     ADD_DEFINITIONS(-DWITH_SSE)
     message(STATUS "Use SSE [Optional]")

@@ -145,11 +150,18 @@ endif()
 # ---[ Flags
 set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${CUDA_ARCH}")
 if (WIN32)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP /O2")
+    if (WITH_OMP)
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /openmp")
+    endif()
 endif()
 if (UNIX)
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -O2 -m64 -fpermissive -std=c++11")
+    if (WITH_OMP)
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fopenmp")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
+    endif()
 endif()
 # ---[ Warnings
Dragon/include/utils/omp_alternative.h
0 → 100644

// --------------------------------------------------------
// Dragon
// Copyright(c) 2017 SeetaTech
// Written by Ting Pan
// --------------------------------------------------------

#ifndef DRAGON_UTILS_OMP_ALTERNATIVE_H_
#define DRAGON_UTILS_OMP_ALTERNATIVE_H_

#ifdef WITH_OMP

#include <algorithm>
#include <omp.h>

namespace dragon {

#define OMP_MIN_ITERATORS_PER_CORE 256

inline int GET_OMP_THREADS(const int N) {
    int threads = std::max(N / OMP_MIN_ITERATORS_PER_CORE, 1);
    return std::min(threads, omp_get_num_procs());
}

}

#endif // WITH_OMP

#endif // DRAGON_UTILS_OMP_ALTERNATIVE_H_
\ No newline at end of file
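GET_OMP_THREADS sizes the OpenMP team from the problem size: with OMP_MIN_ITERATORS_PER_CORE = 256 and, say, 8 cores, n = 100000 yields min(390, 8) = 8 threads, n = 1000 yields min(3, 8) = 3, and anything under 256 elements effectively stays single threaded. A minimal sketch of how the rest of the commit applies this helper to a CPU loop; the names omp_threads_for and axpy_cpu are illustrative stand-ins, not part of the commit:

#include <algorithm>
#include <cstdio>
#ifdef WITH_OMP
#include <omp.h>
#endif

// Illustrative stand-in for the commit's GET_OMP_THREADS / OMP_MIN_ITERATORS_PER_CORE.
static const int kMinItersPerCore = 256;

inline int omp_threads_for(const int n) {
#ifdef WITH_OMP
    int threads = std::max(n / kMinItersPerCore, 1);
    return std::min(threads, omp_get_num_procs());
#else
    return 1;
#endif
}

// y[i] = alpha * x[i] + y[i], parallelized the same way the commit's CPU kernels are.
void axpy_cpu(const int n, const float alpha, const float* x, float* y) {
#ifdef WITH_OMP
    #pragma omp parallel for num_threads(omp_threads_for(n))
#endif
    for (int i = 0; i < n; ++i) y[i] = alpha * x[i] + y[i];
}

int main() {
    const int n = 100000;
    float* x = new float[n];
    float* y = new float[n];
    for (int i = 0; i < n; ++i) { x[i] = 1.f; y[i] = 2.f; }
    axpy_cpu(n, 0.5f, x, y);
    std::printf("y[0] = %.2f, threads capped at %d\n", y[0], omp_threads_for(n));
    delete[] x; delete[] y;
    return 0;
}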
Dragon/include/utils/sse_device.h

@@ -15,11 +15,10 @@
 namespace dragon {

 #define SSE_LOOP1(i, n) \
-    int32_t i; \
     for (i = 0; i < n - 4; i += 4)

 #define SSE_LOOP2(i, n) \
-    for (; i < n; i++)
+    for (; i < n; ++i)

 #define SSE_FP32_LOAD _mm_loadu_ps
 #define SSE_FP32_STORE _mm_storeu_ps
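These macros split each SSE kernel into a 4-wide vectorized main loop (SSE_LOOP1) and a scalar tail loop (SSE_LOOP2) for the remaining elements; moving the int32_t declaration out of SSE_LOOP1 lets the callers in sse_alternative.cc declare the counter themselves and place an OpenMP pragma directly on the vector loop. A minimal sketch of how the expanded pair behaves, written with the raw SSE intrinsics rather than the project's macros:

#include <xmmintrin.h>  // _mm_loadu_ps, _mm_add_ps, _mm_storeu_ps
#include <cstdint>

// Element-wise add, written the way the SSE_LOOP1 / SSE_LOOP2 pair expands:
// a 4-float vector loop over the bulk of the array, then a scalar tail.
void add_sse(const int n, const float* a, const float* b, float* y) {
    int32_t i = 0;                              // declared outside so both loops share it
    for (i = 0; i < n - 4; i += 4) {            // SSE_LOOP1: 4 floats per iteration
        __m128 x1 = _mm_loadu_ps(a + i);
        __m128 y1 = _mm_loadu_ps(b + i);
        _mm_storeu_ps(y + i, _mm_add_ps(x1, y1));
    }
    for (; i < n; ++i) y[i] = a[i] + b[i];      // SSE_LOOP2: scalar remainder
}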
Dragon/src/operators/arithmetic/add_op.cc

@@ -53,18 +53,24 @@ void AddOp<Context>::RunOnDevice() {
     }
     else if (input(0).dim(0) == input(1).dim(0) && input(1).count(1) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(2);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(2);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(0).dim(-1) == input(1).dim(-1) &&
              input(1).count(0, input(1).axis(-1)) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(1);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(1);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(1).ndim() == 1 && input(1).dim(0) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(0);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(0);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else {

@@ -139,18 +145,24 @@ void AddGradientOp<Context>::RunOnDevice() {
     }
     else if (input(-1).dim(0) == input(0).dim(0) && input(0).count(1) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(2);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(2);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(-1).dim(-1) == input(0).dim(-1) &&
              input(0).count(0, input(0).axis(-1)) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(1);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(1);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(0).ndim() == 1 && input(0).dim(0) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(0);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(0);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else {
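The same edit repeats across the operator files below: each float16 dispatch branch is wrapped in #ifdef WITH_CUDA_FP16 so that builds configured without CUDA FP16 skip the half-precision kernels entirely while the float path and the fatal-error fallback stay intact. A stripped-down sketch of the dispatch pattern; the TensorLike struct, float16 placeholder, and RunWithType here are illustrative stand-ins, not the framework's real types:

#include <iostream>
#include <typeinfo>

struct float16 { unsigned short bits; };   // placeholder for the framework's half type

// Illustrative stand-in for a tensor that knows its element type.
struct TensorLike {
    const std::type_info* type;
    template <typename T> bool IsType() const { return *type == typeid(T); }
};

template <typename T> void RunWithType() {
    std::cout << "running kernel for " << typeid(T).name() << "\n";
}

void RunOnDevice(const TensorLike& input) {
    if (input.IsType<float>()) RunWithType<float>();
#ifdef WITH_CUDA_FP16                       // float16 kernels exist only in FP16-enabled builds
    else if (input.IsType<float16>()) RunWithType<float16>();
#endif
    else std::cerr << "unsupported input types." << std::endl;
}

int main() {
    TensorLike t{ &typeid(float) };
    RunOnDevice(t);   // dispatches to the float kernel in every build
    return 0;
}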
Dragon/src/operators/arithmetic/div_op.cc

@@ -54,18 +54,24 @@ void DivOp<Context>::RunOnDevice() {
     }
     else if (input(0).dim(0) == input(1).dim(0) && input(1).count(1) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(2);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(2);
+#endif
         else LOG(FATAL) << "unsupported input types";
     }
     else if (input(0).dim(-1) == input(1).dim(-1) &&
              input(1).count(0, input(1).axis(-1)) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(1);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(1);
+#endif
         else LOG(FATAL) << "unsupported input types";
     }
     else if (input(1).ndim() == 1 && input(1).dim(0) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(0);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(0);
+#endif
         else LOG(FATAL) << "unsupported input types";
     }
     else {

@@ -170,18 +176,24 @@ void DivGradientOp<Context>::RunOnDevice() {
     }
     else if (input(0).dim(0) == input(1).dim(0) && input(1).count(1) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(2);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(2);
+#endif
         else LOG(FATAL) << "unsupported input types";
     }
     else if (input(0).dim(-1) == input(1).dim(-1) &&
              input(1).count(0, input(1).axis(-1)) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(1);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(1);
+#endif
         else LOG(FATAL) << "unsupported input types";
     }
     else if (input(1).ndim() == 1 && input(1).dim(0) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(0);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(0);
+#endif
         else LOG(FATAL) << "unsupported input types";
     }
     else {
Dragon/src/operators/arithmetic/dot_op.cc

@@ -55,7 +55,9 @@ void DotOp<Context>::RunOnDevice() {
         dims[dims.size() - 1] = N1;
         output(0)->Reshape(dims);
         if (input(0).template IsType<float>()) GemmRunWithType<float>();
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) GemmRunWithType<float16>();
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(0).ndim() >= 2 && input(1).ndim() == 1) {

@@ -70,7 +72,9 @@ void DotOp<Context>::RunOnDevice() {
         dims.pop_back();
         output(0)->Reshape(dims);
         if (input(0).template IsType<float>()) GemvRunWithType<float>();
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) GemvRunWithType<float16>();
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else {

@@ -148,7 +152,9 @@ void DotGradientOp<Context>::RunOnDevice() {
             << input(0).dim_string() << " can not Dot with Tensor"
             << "(" << input(1).name() << "): " << input(1).dim_string();
         if (input(0).template IsType<float>()) GemmRunWithType<float>();
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) GemmRunWithType<float16>();
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(0).ndim() >= 2 && input(1).ndim() == 1) {

@@ -160,7 +166,9 @@ void DotGradientOp<Context>::RunOnDevice() {
             << input(0).dim_string() << " can not Dot with Tensor"
             << "(" << input(1).name() << "): " << input(1).dim_string();
         if (input(0).template IsType<float>()) GemvRunWithType<float>();
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) GemvRunWithType<float16>();
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else {
Dragon/src/operators/arithmetic/eltwise_op.cc

@@ -41,12 +41,16 @@ void EltwiseOp<Context>::RunOnDevice() {
     if (operation == "SUM") {
         if (input(0).template IsType<float>()) SumRunWithType<float>();
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) SumRunWithType<float16>();
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (operation == "PROD") {
         if (input(0).template IsType<float>()) ProdRunWithType<float>();
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) ProdRunWithType<float16>();
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else {

@@ -104,12 +108,16 @@ void EltwiseGradientOp<Context>::RunOnDevice() {
     if (operation == "SUM") {
         if (input(0).template IsType<float>()) SumRunWithType<float>();
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) SumRunWithType<float16>();
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (operation == "PROD") {
         if (input(0).template IsType<float>()) ProdRunWithType<float>();
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) ProdRunWithType<float16>();
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else {
Dragon/src/operators/arithmetic/gram_matrix_op.cc

@@ -25,7 +25,9 @@ void GramMatrixOp<Context>::RunOnDevice() {
     output(0)->Reshape(vector<TIndex>({ outer_dim, dim, dim }));
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }

@@ -57,7 +59,9 @@ void GramMatrixGradientOp<Context>::RunOnDevice() {
     output(0)->ReshapeLike(input(0));
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }
Dragon/src/operators/arithmetic/matmul_op.cc

@@ -48,7 +48,9 @@ void MatmulOp<Context>::RunOnDevice() {
     dims[dims.size() - 1] = N;
     output(0)->Reshape(dims);
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }

@@ -105,7 +107,9 @@ void MatmulGradientOp<Context>::RunOnDevice() {
     output(0)->ReshapeLike(input(0));
     output(1)->ReshapeLike(input(1));
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }
Dragon/src/operators/arithmetic/mul_op.cc

@@ -54,18 +54,24 @@ void MulOp<Context>::RunOnDevice() {
     }
     else if (input(0).dim(0) == input(1).dim(0) && input(1).count(1) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(2);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(2);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(0).dim(-1) == input(1).dim(-1) &&
              input(1).count(0, input(1).axis(-1)) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(1);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(1);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(1).ndim() == 1 && input(1).dim(0) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(0);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(0);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else {

@@ -158,18 +164,24 @@ void MulGradientOp<Context>::RunOnDevice() {
     }
     else if (input(0).dim(0) == input(1).dim(0) && input(1).count(1) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(2);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(2);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(0).dim(-1) == input(1).dim(-1) &&
              input(1).count(0, input(1).axis(-1)) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(1);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(1);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(1).ndim() == 1 && input(1).dim(0) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(0);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(0);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else {
Dragon/src/operators/arithmetic/pow_op.cc

@@ -26,7 +26,9 @@ void PowOp<Context>::RunOnDevice() {
     output(0)->ReshapeLike(input(0));
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }

@@ -76,7 +78,9 @@ void PowGradientOp<Context>::RunOnDevice() {
     output(0)->ReshapeLike(input(0));
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }
Dragon/src/operators/arithmetic/scale_op.cc

@@ -37,7 +37,9 @@ void ScaleOp<Context>::RunOnDevice() {
     output(0)->ReshapeLike(input(0));
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }
Dragon/src/operators/arithmetic/sub_op.cc

@@ -53,18 +53,24 @@ void SubOp<Context>::RunOnDevice() {
     }
     else if (input(0).dim(0) == input(1).dim(0) && input(1).count(1) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(2);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(2);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(0).dim(-1) == input(1).dim(-1) &&
              input(1).count(0, input(1).axis(-1)) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(1);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(1);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(1).ndim() == 1 && input(1).dim(0) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(0);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(0);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else {

@@ -139,18 +145,24 @@ void SubGradientOp<Context>::RunOnDevice() {
     }
     else if (input(-1).dim(0) == input(0).dim(0) && input(0).count(1) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(2);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(2);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(-1).dim(-1) == input(0).dim(-1) &&
              input(0).count(0, input(0).axis(-1)) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(1);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(1);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(0).ndim() == 1 && input(0).dim(0) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(0);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(0);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else {
Dragon/src/operators/common/concat_op.cc

@@ -49,7 +49,9 @@ void ConcatOp<Context>::RunOnDevice() {
     }
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }

@@ -96,7 +98,9 @@ void ConcatGradientOp<Context>::RunOnDevice() {
     }
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }
Dragon/src/operators/common/transpose_op.cc

@@ -45,7 +45,9 @@ void TransposeOp<Context>::RunOnDevice() {
     }
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }

@@ -75,7 +77,9 @@ void TransposeGradientOp<Context>::RunOnDevice() {
     new_steps = ws()->GetTensor("_t_" + anchor() + "_new_steps");
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }
Dragon/src/operators/norm/batch_norm_op.cc

@@ -127,7 +127,9 @@ void BatchNormOp<Context>::RunOnDevice() {
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }

@@ -247,7 +249,9 @@ void BatchNormGradientOp<Context>::RunOnDevice() {
     else use_global_stats = use_stats == 1 ? true : false;
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }
Dragon/src/operators/norm/l2_norm_op.cc

@@ -78,7 +78,9 @@ void L2NormOp<Context>::RunOnDevice() {
     output(0)->ReshapeLike(input(0));
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }
Dragon/src/operators/utils/cast_op.cpp

@@ -4,6 +4,8 @@
 namespace dragon {

+#ifdef WITH_CUDA_FP16
 template <class Context>
 void FloatToHalfOp<Context>::RunOnDevice() {
     CHECK(input(0).template IsType<float>())

@@ -28,4 +30,6 @@ OPERATOR_SCHEMA(FloatToHalf).NumInputs(1).NumOutputs(1);
 NO_GRADIENT(FloatToHalf);
+#endif

 }    // namespace dragon
\ No newline at end of file
Dragon/src/operators/utils/gradient_op.cc

@@ -19,7 +19,9 @@ void GradientGenerateOp<Context>::RunWithType() {
 template <class Context>
 void GradientGenerateOp<Context>::RunOnDevice() {
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }
Dragon/src/operators/utils/memory_data_op.cc

@@ -23,12 +23,16 @@ void MemoryDataOp<Context>::RunOnDevice() {
     if (input(0).template IsType<float>()) {
         if (data_type == TensorProto_DataType_FLOAT) RunWithType<float, float>();
+#ifdef WITH_CUDA_FP16
         else if (data_type == TensorProto_DataType_FLOAT16) RunWithType<float, float16>();
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(0).template IsType<uint8_t>()) {
         if (data_type == TensorProto_DataType_FLOAT) RunWithType<uint8_t, float>();
+#ifdef WITH_CUDA_FP16
         if (data_type == TensorProto_DataType_FLOAT16) RunWithType<uint8_t, float16>();
+#endif
     }
     else {
         LOG(FATAL) << "unsupported input types.";
     }
 }
Dragon/src/operators/vision/dense_concat_op.cc

@@ -58,7 +58,9 @@ void DenseConcatGradientOp<Context>::ElimateCorruption() {
     input(0).Move(buffer->memory());
     head_data[idx] = input(0).name();
     if (input(-2).template IsType<float>()) RestoreX1<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(-2).template IsType<float16>()) RestoreX1<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
     // post-process
     if (input(0).memory() != buffer->memory()) buffer->Move(input(0).memory());
Dragon/src/utils/math_functions.cc

 #include <random>
 #include "core/context.h"
+#include "utils/math_functions.h"
-#ifdef WITH_SSE
+#include "utils/omp_alternative.h"
 #include "utils/sse_alternative.h"
-#endif
-#include "utils/math_functions.h"

 namespace dragon {

@@ -22,9 +20,12 @@ template <> void Set<float, CPUContext>(const int n,
     }
 #ifdef WITH_SSE
     sse::Set<float>(n, alpha, x);
-#else // naive implement
+#else
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
+#endif
     for (int i = 0; i < n; ++i) x[i] = alpha;
-#endif
+#endif // WITH_SSE
 }
 template <> void Set<int, CPUContext>(const int n, ...

@@ -36,9 +37,12 @@ template <> void Set<int, CPUContext>(const int n,
     }
 #ifdef WITH_SSE
     sse::Set<int>(n, alpha, x);
-#else // naive implement
+#else
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
+#endif
     for (int i = 0; i < n; ++i) x[i] = alpha;
-#endif
+#endif // WITH_SSE
 }
 template <> void Set<float16, CPUContext>(const int n, ...

@@ -52,9 +56,10 @@ template <> void RandomUniform<float, CPUContext>(const int n,
     const float low, const float high, float* x) {
     std::uniform_real_distribution<float> distribution(low, high);
-    for (int i = 0; i < n; ++i) { x[i] = distribution(*rand_generator()); }
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
+#endif
+    for (int i = 0; i < n; ++i) x[i] = distribution(*rand_generator());
 }
 template <> void RandomUniform<float16, CPUContext>(const int n, ...

@@ -69,9 +74,10 @@ template <> void RandomUniform<uint32_t, CPUContext>(const int n,
     const float low, const float high, uint32_t* x) {
     std::uniform_int_distribution<uint32_t> distribution(low, high);
-    for (int i = 0; i < n; ++i) { x[i] = distribution(*rand_generator()); }
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
+#endif
+    for (int i = 0; i < n; ++i) x[i] = distribution(*rand_generator());
 }
 template <> void RandomNormal<float, CPUContext>(const int n, ...

@@ -79,9 +85,10 @@ template <> void RandomNormal<float, CPUContext>(const int n,
     const float mu, const float sigma, float* x) {
     std::normal_distribution<float> distribution(mu, sigma);
-    for (int i = 0; i < n; ++i) { x[i] = distribution(*rand_generator()); }
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
+#endif
+    for (int i = 0; i < n; ++i) x[i] = distribution(*rand_generator());
 }
 template <> void RandomNormal<float16, CPUContext>(const int n, ...

@@ -121,9 +128,10 @@ template <> void RandomBernoulli<float, CPUContext>(const int n,
     const float p, uint32_t* x) {
     std::bernoulli_distribution distribution(p);
-    for (int i = 0; i < n; ++i) { x[i] = distribution(*rand_generator()); }
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
+#endif
+    for (int i = 0; i < n; ++i) x[i] = distribution(*rand_generator());
 }

 /******************** Level-1 ********************/

@@ -134,9 +142,12 @@ template <> void Add<float, CPUContext>(const int n, ... float* y) {
 #ifdef WITH_SSE
     sse::Add<float>(n, a, b, y);
-#else // naive implement
-    for (int i = 0; i < n; ++i) y[i] = a[i] + b[i];
+#else
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
+#endif
+    for (int i = 0; i < n; ++i) y[i] = a[i] + b[i];
 #endif // WITH_SSE
 }
 template <> void Sub<float, CPUContext>(const int n, ...

@@ -145,9 +156,12 @@ template <> void Sub<float, CPUContext>(const int n, ... float* y) {
 #ifdef WITH_SSE
     sse::Sub<float>(n, a, b, y);
-#else // naive implement
-    for (int i = 0; i < n; ++i) y[i] = a[i] - b[i];
+#else
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
+#endif
+    for (int i = 0; i < n; ++i) y[i] = a[i] - b[i];
 #endif // WITH_SSE
 }
 template <> void Mul<float, CPUContext>(const int n, ...

@@ -156,9 +170,12 @@ template <> void Mul<float, CPUContext>(const int n, ... float* y) {
 #ifdef WITH_SSE
     sse::Mul<float>(n, a, b, y);
-#else // naive implement
-    for (int i = 0; i < n; ++i) y[i] = a[i] * b[i];
+#else
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
+#endif
+    for (int i = 0; i < n; ++i) y[i] = a[i] * b[i];
 #endif // WITH_SSE
 }
 template <> void Mul<float16, CPUContext>(const int n, ...

@@ -174,9 +191,12 @@ template <> void Div<float, CPUContext>(const int n, ... float* y) {
 #ifdef WITH_SSE
     sse::Div<float>(n, a, b, y);
-#else // naive implement
+#else
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
+#endif
     for (int i = 0; i < n; ++i) y[i] = a[i] / b[i];
-#endif
+#endif // WITH_SSE
 }
 template <> void Div<float16, CPUContext>(const int n, ...

@@ -190,6 +210,9 @@ template <> void Clip<float, CPUContext>(const int n,
     const float low, const float high, float* x) {
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
+#endif
     for (int i = 0; i < n; ++i) {
         x[i] = std::max(low, std::min(x[i], high));
     }

@@ -198,6 +221,9 @@ template <> void Exp<float, CPUContext>(int n, const float* x, float* y) {
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
+#endif
     for (int i = 0; i < n; ++i) {
         y[i] = std::exp(x[i]);
     }

@@ -206,6 +232,9 @@ template <> void Log<float, CPUContext>(int n, const float* x, float* y) {
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
+#endif
     for (int i = 0; i < n; ++i) {
         y[i] = std::log(x[i]);
     }

@@ -214,6 +243,9 @@ template <> void Square<float, CPUContext>(int n, const float* x, float* y) {
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
+#endif
     for (int i = 0; i < n; ++i) {
         y[i] = x[i] * x[i];
     }

@@ -228,6 +260,9 @@ template <> void Sqrt<float, CPUContext>(int n, const float* x, float* y) {
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
+#endif
     for (int i = 0; i < n; ++i) {
         y[i] = std::sqrt(x[i]);
     }

@@ -243,6 +278,9 @@ template <> void Pow<float, CPUContext>(int n,
     const float alpha, const float* x, float* y) {
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
+#endif
     for (int i = 0; i < n; ++i) {
         y[i] = std::pow(x[i], alpha);
     }

@@ -259,6 +297,9 @@ template <> void Inv<float, CPUContext>(const int n,
     const float numerator, const float* x, float* y) {
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
+#endif
     for (int i = 0; i < n; ++i) {
         y[i] = 1.0 / y[i];
     }

@@ -280,9 +321,12 @@ template <> void Scal<float, CPUContext>(const int n,
     cblas_sscal(n, alpha, y, 1);
 #elif WITH_SSE
     sse::Scal<float>(n, alpha, y);
-#else // naive implement
-    for (int i = 0; i < n; ++i) y[i] = y[i] * alpha;
+#else
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
+#endif
+    for (int i = 0; i < n; ++i) y[i] = y[i] * alpha;
 #endif // WITH_BLAS
 }
 template <> void Scal<float16, CPUContext>(const int n, ...

@@ -307,9 +351,12 @@ template <> void Scale<float, CPUContext>(const int n,
     cblas_sscal(n, alpha, y, 1);
 #elif WITH_SSE
     sse::Scale<float>(n, alpha, x, y);
-#else // naive implement
-    for (int i = 0; i < n; ++i) y[i] = x[i] * alpha;
+#else
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
+#endif
+    for (int i = 0; i < n; ++i) y[i] = x[i] * alpha;
 #endif // WITH_BLAS
 }
 template <> float StridedDot<float, CPUContext>(const int n, ...

@@ -319,11 +366,14 @@ template <> float StridedDot<float, CPUContext>(const int n,
     ... const int incy) {
 #ifdef WITH_BLAS
     return cblas_sdot(n, a, incx, b, incy);
-#else // naive implement
+#else
     float ret = 0.f;
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
+#endif
     for (int i = 0; i < n; ++i) ret += a[i] * b[i];
     return ret;
-#endif
+#endif // WITH_BLAS
 }
 template <> float Dot<float, CPUContext>(int n, ...

@@ -333,11 +383,14 @@ template <> float Dot<float, CPUContext>(int n,
     return StridedDot<float, CPUContext>(n, a, 1, b, 1);
 #elif WITH_SSE
     return sse::Dot<float>(n, a, b);
-#else // naive implement
+#else
     float ret = 0.f;
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
+#endif
     for (int i = 0; i < n; ++i) ret += a[i] * b[i];
     return ret;
-#endif
+#endif // WITH_BLAS
 }
 template <> float Dot<float16, CPUContext>(int n, ...

@@ -350,23 +403,29 @@ template <> float Dot<float16, CPUContext>(int n,
 template <> float ASum<float, CPUContext>(const int n, const float* x) {
 #ifdef WITH_BLAS
     return cblas_sasum(n, x, 1);
-#elif
-    WITH_SSE
+#elif WITH_SSE
     return sse::ASum<float>(n, x);
-#else // naive implement
+#else
     float ret = 0.f;
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
+#endif
     for (int i = 0; i < n; ++i) ret += x[i];
     return ret;
-#endif
+#endif // WITH_BLAS
 }
 template <> void AddScalar<float, CPUContext>(const int n, const float alpha, float* y) {
-#ifdef
-    WITH_SSE
+#ifdef WITH_SSE
     sse::AddScalar<float>(n, alpha, y);
-#else // naive implement
-    for (int i = 0; i < n; ++i) y[i] += alpha;
+#else
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
+#endif
+    for (int i = 0; i < n; ++i) y[i] += alpha;
 #endif // WITH_SSE
 }
 template <> void AddScalar<float16, CPUContext>(const int n, ...

@@ -378,11 +437,14 @@ template <> void AddScalar<float16, CPUContext>(const int n,
 template <> void MulScalar<float, CPUContext>(const int n, const float alpha, float* y) {
-#ifdef
-    WITH_SSE
+#ifdef WITH_SSE
     sse::MulScalar<float>(n, alpha, y);
-#else // naive implement
-    for (int i = 0; i < n; ++i) y[i] *= alpha;
+#else
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
+#endif
+    for (int i = 0; i < n; ++i) y[i] *= alpha;
 #endif // WITH_SSE
 }
 template <> void Axpy<float, CPUContext>(const int n, ...

@@ -393,9 +455,12 @@ template <> void Axpy<float, CPUContext>(const int n,
     cblas_saxpy(n, alpha, x, 1, y, 1);
 #elif WITH_SSE
     sse::Axpy<float>(n, alpha, x, y);
-#else // naive implement
-    for (int i = 0; i < n; ++i) y[i] = alpha * x[i] + y[i];
+#else
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
+#endif
+    for (int i = 0; i < n; ++i) y[i] = alpha * x[i] + y[i];
 #endif // WITH_BLAS
 }
 template <> void Axpy<float16, CPUContext>(const int n, ...

@@ -415,9 +480,12 @@ template <> void Axpby<float, CPUContext>(const int n,
     cblas_saxpy(n, alpha, x, 1, y, 1);
 #elif WITH_SSE
     sse::Axpby<float>(n, alpha, x, beta, y);
-#else // naive implement
-    for (int i = 0; i < n; ++i) y[i] = alpha * x[i] + beta * y[i];
+#else
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
+#endif
+    for (int i = 0; i < n; ++i) y[i] = alpha * x[i] + beta * y[i];
 #endif // WITH_BLAS
 }
 template <> void Axpby<float16, CPUContext>(const int n, ...
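One thing to note about the accumulation kernels above (StridedDot, Dot, ASum): the added #pragma omp parallel for sits on loops that sum into a single scalar, and in OpenMP such a loop needs a reduction clause for the per-thread partial sums to be combined safely. A minimal, self-contained sketch of the reduction form; this is illustrative code, not part of the commit:

#include <cstdio>
#ifdef WITH_OMP
#include <omp.h>
#endif

// Parallel accumulation sketch: without reduction(+:ret), concurrent "ret += ..."
// updates race; the clause gives each thread a private partial sum and combines them.
float dot_cpu(const int n, const float* a, const float* b) {
    float ret = 0.f;
#ifdef WITH_OMP
    #pragma omp parallel for reduction(+:ret)
#endif
    for (int i = 0; i < n; ++i) ret += a[i] * b[i];
    return ret;
}

int main() {
    const int n = 1 << 20;
    float* a = new float[n];
    float* b = new float[n];
    for (int i = 0; i < n; ++i) { a[i] = 1.f; b[i] = 2.f; }
    std::printf("dot = %.0f\n", dot_cpu(n, a, b));   // expect 2 * 2^20 = 2097152
    delete[] a; delete[] b;
    return 0;
}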
Dragon/src/utils/math_functions.cu
...
...
@@ -40,6 +40,7 @@ template <> void Set<int, CUDAContext>(const int n,
_Set<int> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, alpha, x);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _SetHalf2(const int n, const half2 alpha, half2* x) {
CUDA_KERNEL_LOOP(idx, n) {
...
...
@@ -61,6 +62,7 @@ template <> void Set<float16, CUDAContext>(const int n,
_Set<float16> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, alpha, x);
}
}
#endif
template <> void RandomUniform<uint32_t, CUDAContext>(const int n,
const float low,
...
...
@@ -144,6 +146,7 @@ template <> void Mul<float, CUDAContext>(int n,
_Mul<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, a, b, y);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _MulHalf(const int n, const half* a, const half* b, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
...
...
@@ -161,7 +164,7 @@ __global__ void _MulHalf2(const int n, const half2* a, const half2* b, half2* y)
#endif
}
}
template <> void Mul<float16, CUDAContext>(int n,
const float16* a,
const float16* b,
...
...
@@ -176,6 +179,7 @@ template <> void Mul<float16, CUDAContext>(int n,
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
}
#endif
template <typename T>
__global__ void _Div(const int n, const T* a, const T* b, T* y) {
...
...
@@ -191,6 +195,7 @@ template <> void Div<float, CUDAContext>(int n,
_Div<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, a, b, y);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _DivHalf(const int n, const half* a, const half* b, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
...
...
@@ -209,6 +214,7 @@ template <> void Div<float16, CUDAContext>(int n,
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
}
#endif
template <typename T>
__global__ void _Clip(const int n, const T low, const T high, T* x) {
...
...
@@ -260,6 +266,7 @@ template <> void Square<float, CUDAContext>(int n,
_Square<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, x, y);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _SquareHalf(const int n, const half* x, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
...
...
@@ -290,6 +297,7 @@ template <> void Square<float16, CUDAContext>(int n,
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
}
#endif
template <typename T>
__global__ void _Sqrt(const int n, const T* x, T* y) {
...
...
@@ -304,6 +312,7 @@ template <> void Sqrt<float, CUDAContext>(int n,
_Sqrt<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, x, y);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _SqrtHalf(const int n, const half* x, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
...
...
@@ -334,6 +343,7 @@ template <> void Sqrt<float16, CUDAContext>(int n,
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
}
#endif
template <typename T>
__global__ void _Pow(const int n, const T alpha, const T* a, T* y) {
...
...
@@ -349,6 +359,7 @@ template <> void Pow<float, CUDAContext>(int n,
_Pow<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, alpha, x, y);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _PowHalf(const int n, const float alpha, const half* a, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
...
...
@@ -384,6 +395,7 @@ template <> void Pow<float16, CUDAContext>(int n,
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
}
#endif
template <typename T>
__global__ void _Inv(const int n, const float numerator, const T* x, T* y) {
...
...
@@ -399,6 +411,7 @@ template <> void Inv<float, CUDAContext>(const int n,
_Inv<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, numerator, x, y);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _InvHalf(const int n, const half numerator, const half* x, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
...
...
@@ -439,6 +452,7 @@ template <> void Inv<float16, CUDAContext>(const int n,
}
CUDA_POST_KERNEL_CHECK;
}
#endif
/******************** Level-2 ********************/
...
...
@@ -518,6 +532,7 @@ template <> void AddScalar<float, CUDAContext>(const int n, const float alpha, f
_AddScalar<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, alpha, y);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _AddScalarHalf(const int n, half alpha, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
...
...
@@ -552,6 +567,7 @@ template <> void AddScalar<float16, CUDAContext>(const int n, const float alpha,
}
CUDA_POST_KERNEL_CHECK;
}
#endif
template <typename T>
__global__ void _MulScalar(const int n, T alpha, T* y) {
...
...
@@ -641,6 +657,7 @@ template <> void Gemm<float, CUDAContext>(const CBLAS_TRANSPOSE transA,
C, N));
}
#ifdef WITH_CUDA_FP16
template <> void Gemm<float16, CUDAContext>(const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB,
const int M,
...
...
@@ -682,6 +699,7 @@ template <> void Gemm<float16, CUDAContext>(const CBLAS_TRANSPOSE transA,
LOG(FATAL) << "unsupported math type";
}
}
#endif
template <> void Gemv<float, CUDAContext>(const CBLAS_TRANSPOSE transA,
const int M, const int N,
...
...
@@ -702,6 +720,7 @@ template <> void Gemv<float, CUDAContext>(const CBLAS_TRANSPOSE transA,
y, 1));
}
#ifdef WITH_CUDA_FP16
template <> void Gemv<float16, CUDAContext>(const CBLAS_TRANSPOSE transA,
const int M,
const int N,
...
...
@@ -742,6 +761,7 @@ template <> void Gemv<float16, CUDAContext>(const CBLAS_TRANSPOSE transA,
LOG(FATAL) << "unsupported math type";
}
}
#endif
} // namespace math
...
...
Dragon/src/utils/op_kernel.cc

@@ -3,11 +3,9 @@
 #include "core/tensor.h"
 #include "utils/op_kernel.h"
+#include "utils/math_functions.h"
-#ifdef WITH_SSE
+#include "utils/omp_alternative.h"
 #include "utils/sse_alternative.h"
-#endif
-#include "utils/math_functions.h"

 bool judge(int a, int b) { return unsigned(a) < unsigned(b); }

@@ -28,8 +26,10 @@ template<> void Dropout<float, CPUContext>(const int count,
     CPUContext* context) {
     uint32_t thresh = static_cast<uint32_t>(UINT_MAX * prob);
     math::RandomBernoulli<float, CPUContext>(count, 1 - prob, mask);
-    for (int i = 0; i < count; ++i) y[i] = x[i] * mask[i] * scale;
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
+#endif
+    for (int i = 0; i < count; ++i) y[i] = x[i] * mask[i] * scale;
 }
 template<> void DropoutGrad<float, CPUContext>(const int count, ...

@@ -38,8 +38,10 @@ template<> void DropoutGrad<float, CPUContext>(const int count,
     const float* dy, const uint32_t* mask, float* dx) {
-    for (int i = 0; i < count; ++i) dx[i] = dy[i] * mask[i] * scale;
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
+#endif
+    for (int i = 0; i < count; ++i) dx[i] = dy[i] * mask[i] * scale;
 }

 /******************** activation.relu ********************/

@@ -48,6 +50,9 @@ template<> void Relu<float, CPUContext>(const int count,
     const float* x, const float slope, float* y) {
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
+#endif
     for (int i = 0; i < count; ++i) {
         y[i] = std::max(x[i], 0.f) + slope * std::min(x[i], 0.f);
     }

@@ -58,10 +63,12 @@ template<> void ReluGrad<float, CPUContext>(const int count,
     const float* y, const float slope, float* dx) {
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
+#endif
     for (int i = 0; i < count; ++i) {
         dx[i] = dy[i] * ((y[i] > 0) + slope * (y[i] <= 0));
     }
 }

 /******************** activation.sigmoid ********************/

@@ -70,15 +77,19 @@ template <typename T>
 T _sigmoid(T x) { return T(1) / (T(1) + exp(-x)); }

 template <> void Sigmoid<float, CPUContext>(const int count, const float* x, float* y) {
-    for (int i = 0; i < count; ++i) { y[i] = _sigmoid<float>(x[i]); }
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
+#endif
+    for (int i = 0; i < count; ++i) y[i] = _sigmoid<float>(x[i]);
 }

 template <> void SigmoidGrad<float, CPUContext>(const int count,
     const float* dy, const float* y, float* dx) {
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
+#endif
     for (int i = 0; i < count; ++i) {
         dx[i] = dy[i] * y[i] * (1 - y[i]);
     }

@@ -149,6 +160,9 @@ template<> void SoftmaxGrad<float, CPUContext>(const int count,
 /******************** activation.tanh ********************/

 template <> void Tanh<float, CPUContext>(const int count, const float* x, float* y) {
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
+#endif
     for (int i = 0; i < count; ++i) {
         y[i] = std::tanh(x[i]);
     }

@@ -158,6 +172,9 @@ template<> void TanhGrad<float, CPUContext>(const int count,
     const float* dy, const float* y, float* dx) {
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
+#endif
     for (int i = 0; i < count; ++i) {
         dx[i] = dy[i] * (1 - y[i] * y[i]);
     }

@@ -197,6 +214,9 @@ template <> void Clip<float, CPUContext>(const int count,
     const float* x, float* mask, float* y) {
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
+#endif
     for (int i = 0; i < count; ++i) {
         mask[i] = 1.0;
         if (x[i] < low || x[i] > high) mask[i] = 0.0;

@@ -300,8 +320,10 @@ template<> void Argmax<float, CPUContext>(const int count,
 /******************** common.at ********************/

 template <> void CanonicalAxis<float, CPUContext>(const int count, const int dim, float* y) {
-    for (int i = 0; i < count; ++i) if (y[i] < 0) y[i] += dim;
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
+#endif
+    for (int i = 0; i < count; ++i) if (y[i] < 0) y[i] += dim;
 }
 template <> void At<float, CPUContext>(const int count, ...

@@ -478,6 +500,9 @@ template<> void Sum<float, CPUContext>(const int count,
     const int inner_dim, const float* x, float* y) {
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
+#endif
     for (int i = 0; i < count; ++i) {
         float sum_val = 0.0;
         for (int j = 0; j < axis_dim; ++j)

@@ -492,6 +517,9 @@ template<> void SumGrad<float, CPUContext>(const int count,
     const float coeff, const float* dy, float* dx) {
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
+#endif
     for (int i = 0; i < count; ++i) {
         for (int j = 0; j < axis_dim; ++j)
             dx[(i / inner_dim * axis_dim + j) * inner_dim + i % inner_dim] = dy[i] * coeff;

@@ -585,6 +613,9 @@ template <> void Transpose<float, CPUContext>(const int count,
     const int* new_steps, const float* x, float* y) {
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
+#endif
     for (int i = 0; i < count; ++i) {
         int x_idx = 0, y_idx = i;
         for (int j = 0; j < ndim; ++j) {

@@ -603,15 +634,7 @@ template <> void Transpose<float16, CPUContext>(const int count,
     const int* new_steps, const float16* x, float16* y) {
-    for (int i = 0; i < count; ++i) {
-        int x_idx = 0, y_idx = i;
-        for (int j = 0; j < ndim; ++j) {
-            int k = order[j];
-            x_idx += (y_idx / new_steps[j]) * old_steps[k];
-            y_idx %= new_steps[j];
-        }
-        y[i] = x[x_idx];
-    }
+    LOG(FATAL) << "unsupport float16 with CPU";
 }
 template <> void TransposeGrad<float, CPUContext>(const int count, ...

@@ -621,6 +644,9 @@ template <> void TransposeGrad<float, CPUContext>(const int count,
     const int* new_steps, const float* dy, float* dx) {
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
+#endif
     for (int i = 0; i < count; ++i) {
         int x_idx = 0, y_idx = i;
         for (int j = 0; j < ndim; ++j) {

@@ -639,20 +665,15 @@ template <> void TransposeGrad<float16, CPUContext>(const int count,
     const int* new_steps, const float16* dy, float16* dx) {
-    for (int i = 0; i < count; ++i) {
-        int x_idx = 0, y_idx = i;
-        for (int j = 0; j < ndim; ++j) {
-            int k = order[j];
-            x_idx += (y_idx / new_steps[j]) * old_steps[k];
-            y_idx %= new_steps[j];
-        }
-        dx[x_idx] = dy[i];
-    }
+    LOG(FATAL) << "unsupport float16 with CPU";
 }

 /******************** loss.l1_loss ********************/

 template <> void AbsGrad<float, CPUContext>(const int count, const float* dy, float* dx) {
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
+#endif
     for (int i = 0; i < count; ++i) {
         const float val = dy[i];
         // val > 0: 1 | val == 0: 0 | val < 0: -1

@@ -666,6 +687,9 @@ template <> void SigmoidCrossEntropy<float, CPUContext>(const int count,
     const float* x, const float* target, float* loss) {
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
+#endif
     for (int i = 0; i < count; ++i) {
         loss[i] = std::log(1 + std::exp(x[i] - 2 * x[i] * (x[i] >= 0)))
                 + x[i] * ((x[i] >= 0) - target[i]);

@@ -678,6 +702,9 @@ template<> void SmoothL1<float, CPUContext>(const int count,
     const float sigma2, const float* x, float* y) {
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
+#endif
     for (int i = 0; i < count; ++i) {
         const float val = x[i];
         const float abs_val = abs(val);

@@ -690,6 +717,9 @@ template<> void SmoothL1Grad<float, CPUContext>(const int count,
     const float sigma2, const float* dy, float* dx) {
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
+#endif
     for (int i = 0; i < count; ++i) {
         const float val = dy[i];
         const float abs_val = abs(val);

@@ -705,6 +735,9 @@ template <> void SoftmaxCrossEntropy<float, CPUContext>(const int count,
     const float* prob, const float* target, float* loss) {
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
+#endif
     for (int i = 0; i < count; ++i) {
         loss[i] = -target[i] * std::log(std::max(prob[i], FLT_MIN));
     }

@@ -1016,9 +1049,12 @@ template <> void RMSPropUpdate<float, CPUContext>(const int count,
 /******************** utils.compare ********************/

 template <> void Equal<float, CPUContext>(const int count,
     const float* a, const float* b, float* y) {
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
+#endif
     for (int i = 0; i < count; ++i)
         y[i] = fabs(a[i] - b[i]) < FLT_EPSILON ? 1.0 : 0.0;
 }

@@ -1096,6 +1132,9 @@ template <> void OneHot<float, CPUContext>(const int count,
     const int on_value, const float* x, float* y) {
+#ifdef WITH_OMP
+#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
+#endif
     for (int i = 0; i < count; ++i) {
         const int val = x[i];
         y[i * depth + val] = on_value;
Dragon/src/utils/op_kernel.cu
...
...
@@ -21,7 +21,7 @@ template<> void Empty<float, CUDAContext>() {
}
template<> void Empty<float16, CUDAContext>() {
-    _Empty<float> << <1, 1 >> >();
+    _Empty<float16> << <1, 1 >> >();
CUDA_POST_KERNEL_CHECK;
}
...
...
@@ -102,6 +102,7 @@ template<> void Relu<float, CUDAContext>(const int count,
CUDA_POST_KERNEL_CHECK;
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _ReluHalf(const int count, const half* x, const float slope, half* y) {
const half kSlope = __float2half(slope);
...
...
@@ -123,6 +124,7 @@ template<> void Relu<float16, CUDAContext>(const int count,
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
}
#endif
template <typename T>
__global__ void _ReluGrad(const int count,
...
...
@@ -477,6 +479,7 @@ template<> void Scale<float, CUDAContext>(const int axis,
Ydata);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _ScaleWithoutBiasHalf(const int n,
const half* x,
...
...
@@ -538,6 +541,7 @@ template<> void Scale<float16, CUDAContext>(const int axis,
inner_dim,
reinterpret_cast<half*>(Ydata));
}
#endif
template <> void ScaleGrad<float, CUDAContext>(const int axis,
Tensor* dy,
...
...
@@ -730,6 +734,7 @@ template <> void Concat<float, CUDAContext>(const int count,
CUDA_POST_KERNEL_CHECK;
}
#ifdef WITH_CUDA_FP16
template <> void Concat<float16, CUDAContext>(const int count,
const int outer_dim,
const int inner_dim,
...
...
@@ -749,6 +754,7 @@ template <> void Concat<float16, CUDAContext>(const int count,
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
}
#endif
template <typename T>
__global__ void _ConcatGrad(const int count,
...
...
@@ -789,6 +795,7 @@ template <> void ConcatGrad<float, CUDAContext>(const int count,
CUDA_POST_KERNEL_CHECK;
}
#ifdef WITH_CUDA_FP16
template <> void ConcatGrad<float16, CUDAContext>(const int count,
const int outer_dim,
const int inner_dim,
...
...
@@ -808,6 +815,7 @@ template <> void ConcatGrad<float16, CUDAContext>(const int count,
reinterpret_cast<half*>(dx));
CUDA_POST_KERNEL_CHECK;
}
#endif
/******************** common.crop ********************/
...
...
@@ -1134,6 +1142,7 @@ template <> void Transpose<float, CUDAContext>(const int count,
CUDA_POST_KERNEL_CHECK;
}
#ifdef WITH_CUDA_FP16
template <> void Transpose<float16, CUDAContext>(const int count,
const int ndim,
const int* order,
...
...
@@ -1150,6 +1159,7 @@ template <> void Transpose<float16, CUDAContext>(const int count,
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
}
#endif
template <typename T>
__global__ void _TransposeGrad(const int count,
...
...
@@ -1187,6 +1197,7 @@ template <> void TransposeGrad<float, CUDAContext>(const int count,
CUDA_POST_KERNEL_CHECK;
}
#ifdef WITH_CUDA_FP16
template <> void TransposeGrad<float16, CUDAContext>(const int count,
const int ndim,
const int* order,
...
...
@@ -1203,6 +1214,7 @@ template <> void TransposeGrad<float16, CUDAContext>(const int count,
reinterpret_cast<half*>(dx));
CUDA_POST_KERNEL_CHECK;
}
#endif
/******************** loss.l1_loss ********************/
...
...
@@ -1834,6 +1846,7 @@ template <> void RMSPropUpdate<float, CUDAContext>(const int count,
/******************** utils.cast ********************/
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _FloatToHalfKernel(const int count, const float* x, half* y) {
CUDA_KERNEL_LOOP(idx, count) {
...
...
@@ -1849,6 +1862,7 @@ template <> void Float2Half<float, CUDAContext>(const int count,
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
}
#endif
/******************** utils.compare ********************/
...
...
@@ -1943,6 +1957,7 @@ template <> void MemoryData<uint8_t, float, CUDAContext>(const int count,
CUDA_POST_KERNEL_CHECK;
}
#ifdef WITH_CUDA_FP16
template <> void MemoryData<float, float16, CUDAContext>(const int count,
const int num,
const int channels,
...
...
@@ -1976,6 +1991,7 @@ template <> void MemoryData<uint8_t, float16, CUDAContext>(const int count,
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
}
#endif
/******************** utils.one_hot ********************/
...
...
Dragon/src/utils/sse_alternative.cc
View file @
6eeac5f
...
...
@@ -3,164 +3,223 @@
#include <cmath>
#include <algorithm>
#include "utils/omp_alternative.h"
#include "utils/sse_alternative.h"
namespace dragon {

namespace sse {

    template <> void Set(const int n, const float alpha, float* x) {
        __m128 scalar = SSE_FP32_SCALAR(alpha);
        int32_t i = 0;
#ifdef WITH_OMP
        #pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
        SSE_LOOP1(i, n) SSE_FP32_STORE(x + i, scalar);
        SSE_LOOP2(i, n) x[i] = alpha;
    }

    template <> void Set(const int n, const int alpha, int* x) {
        __m128i scalar = SSE_INT32_SCALAR(alpha);
        __m128i* x1 = reinterpret_cast<__m128i*>(x);
        int32_t i = 0;
#ifdef WITH_OMP
        #pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
        SSE_LOOP1(i, n) SSE_INT128_STORE(x1++, scalar);
        SSE_LOOP2(i, n) x[i] = alpha;
    }

    template <> void Add(const int n, const float* a, const float* b, float* y) {
        __m128 x1, y1, z1;
        int32_t i = 0;
#ifdef WITH_OMP
        #pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
        SSE_LOOP1(i, n) {
            x1 = SSE_FP32_LOAD(a + i);
            y1 = SSE_FP32_LOAD(b + i);
            z1 = SSE_FP32_ADD(x1, y1);
            SSE_FP32_STORE(y + i, z1);
        }
        SSE_LOOP2(i, n) y[i] = a[i] + b[i];
    }

    template <> void Sub(const int n, const float* a, const float* b, float* y) {
        __m128 x1, y1, z1;
        int32_t i = 0;
#ifdef WITH_OMP
        #pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
        SSE_LOOP1(i, n) {
            x1 = SSE_FP32_LOAD(a + i);
            y1 = SSE_FP32_LOAD(b + i);
            z1 = SSE_FP32_SUB(x1, y1);
            SSE_FP32_STORE(y + i, z1);
        }
        SSE_LOOP2(i, n) y[i] = a[i] - b[i];
    }

    template <> void Mul(const int n, const float* a, const float* b, float* y) {
        __m128 x1, y1, z1;
        int32_t i = 0;
#ifdef WITH_OMP
        #pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
        SSE_LOOP1(i, n) {
            x1 = SSE_FP32_LOAD(a + i);
            y1 = SSE_FP32_LOAD(b + i);
            z1 = SSE_FP32_MUL(x1, y1);
            SSE_FP32_STORE(y + i, z1);
        }
        SSE_LOOP2(i, n) y[i] = a[i] * b[i];
    }

    template <> void Div(const int n, const float* a, const float* b, float* y) {
        __m128 x1, y1, z1;
        int32_t i = 0;
#ifdef WITH_OMP
        #pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
        SSE_LOOP1(i, n) {
            x1 = SSE_FP32_LOAD(a + i);
            y1 = SSE_FP32_LOAD(b + i);
            z1 = SSE_FP32_DIV(x1, y1);
            SSE_FP32_STORE(y + i, z1);
        }
        SSE_LOOP2(i, n) y[i] = a[i] / b[i];
    }

    template <> void Scal(const int n, const float alpha, float* y) {
        __m128 y1, scalar = SSE_FP32_SCALAR(alpha);
        int32_t i = 0;
#ifdef WITH_OMP
        #pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
        SSE_LOOP1(i, n) {
            y1 = SSE_FP32_LOAD(y + i);
            y1 = SSE_FP32_MUL(y1, scalar);
            SSE_FP32_STORE(y + i, y1);
        }
        SSE_LOOP2(i, n) y[i] *= alpha;
    }

    template <> void Scale(const int n, const float alpha, const float* x, float* y) {
        __m128 x1, scalar = SSE_FP32_SCALAR(alpha);
        int32_t i = 0;
#ifdef WITH_OMP
        #pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
        SSE_LOOP1(i, n) {
            x1 = SSE_FP32_LOAD(x + i);
            x1 = SSE_FP32_MUL(x1, scalar);
            SSE_FP32_STORE(y + i, x1);
        }
        SSE_LOOP2(i, n) y[i] = x[i] * alpha;
    }

    template <> void Axpy(const int n, float alpha, const float* x, float* y) {
        __m128 x1, y1, scalar = SSE_FP32_SCALAR(alpha);
        int32_t i = 0;
#ifdef WITH_OMP
        #pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
        SSE_LOOP1(i, n) {
            x1 = SSE_FP32_LOAD(x + i);
            y1 = SSE_FP32_LOAD(y + i);
            x1 = SSE_FP32_MUL(x1, scalar);
            y1 = SSE_FP32_ADD(x1, y1);
            SSE_FP32_STORE(y + i, y1);
        }
        SSE_LOOP2(i, n) y[i] = alpha * x[i] + y[i];
    }

    template <> void Axpby(const int n, float alpha, const float* x, const float beta, float* y) {
        __m128 x1, y1, z1;
        __m128 scalar1 = SSE_FP32_SCALAR(alpha);
        __m128 scalar2 = SSE_FP32_SCALAR(beta);
        int32_t i = 0;
#ifdef WITH_OMP
        #pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
        SSE_LOOP1(i, n) {
            x1 = SSE_FP32_LOAD(x + i);
            y1 = SSE_FP32_LOAD(y + i);
            x1 = SSE_FP32_MUL(x1, scalar1);
            y1 = SSE_FP32_MUL(y1, scalar2);
            z1 = SSE_FP32_ADD(x1, y1);
            SSE_FP32_STORE(y + i, z1);
        }
        SSE_LOOP2(i, n) y[i] = alpha * x[i] + beta * y[i];
    }

    template <> float ASum(const int n, const float* x) {
        __m128 x1, sum = SSE_FP32_ZERO;
        int32_t i = 0;
#ifdef WITH_OMP
        #pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
        SSE_LOOP1(i, n) {
            x1 = SSE_FP32_LOAD(x + i);
            sum = SSE_FP32_ADD(sum, x1);
        }
        float buf[4];
        SSE_FP32_STORE(buf, sum);
        float ret = buf[0] + buf[1] + buf[2] + buf[3];
        SSE_LOOP2(i, n) ret += x[i];
        return ret;
    }

    template <> void AddScalar(const int n, const float alpha, float* y) {
        __m128 y1, scalar = SSE_FP32_SCALAR(alpha);
        int32_t i = 0;
#ifdef WITH_OMP
        #pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
        SSE_LOOP1(i, n) {
            y1 = SSE_FP32_LOAD(y + i);
            y1 = SSE_FP32_ADD(y1, scalar);
            SSE_FP32_STORE(y + i, y1);
        }
        SSE_LOOP2(i, n) y[i] += alpha;
    }

    template <> void MulScalar(const int n, const float alpha, float* y) {
        __m128 y1, scalar = SSE_FP32_SCALAR(alpha);
        int32_t i = 0;
#ifdef WITH_OMP
        #pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
        SSE_LOOP1(i, n) {
            y1 = SSE_FP32_LOAD(y + i);
            y1 = SSE_FP32_MUL(y1, scalar);
            SSE_FP32_STORE(y + i, y1);
        }
        SSE_LOOP2(i, n) y[i] *= alpha;
    }

    template <> float Dot(const int n, const float* a, const float* b) {
        __m128 x1, y1, sum = SSE_FP32_ZERO;
        int32_t i = 0;
#ifdef WITH_OMP
        #pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
        SSE_LOOP1(i, n) {
            x1 = SSE_FP32_LOAD(a + i);
            y1 = SSE_FP32_LOAD(b + i);
            sum = SSE_FP32_ADD(sum, SSE_FP32_MUL(x1, y1));
        }
        float buf[4];
        SSE_FP32_STORE(buf, sum);
        float ret = buf[0] + buf[1] + buf[2] + buf[3];
        SSE_LOOP2(i, n) ret += a[i] * b[i];
        return ret;
    }
}    // namespace sse
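Every function above gets the same treatment: the loop index is hoisted out of the vectorized main loop and, when WITH_OMP is defined, the loop is prefixed with #pragma omp parallel for num_threads(GET_OMP_THREADS(n)) so the SSE body is split across threads. A standalone sketch of that pattern for an elementwise add; the thread-count heuristic below is an assumed stand-in for Dragon's GET_OMP_THREADS (declared in utils/omp_alternative.h), not its actual definition:

#include <algorithm>
#include <cstdint>
#include <smmintrin.h>   // SSE4.1 intrinsics
#ifdef _OPENMP
#include <omp.h>
#endif

// Assumed heuristic: roughly one thread per 64K elements, capped by the
// number of threads OpenMP reports. Dragon's real macro may differ.
static int GetOmpThreadsSketch(int n) {
#ifdef _OPENMP
    return std::max(1, std::min(omp_get_max_threads(), n / 65536));
#else
    return 1;
#endif
}

// y = a + b with a 4-wide SSE main loop and a scalar tail.
void AddSketch(const int n, const float* a, const float* b, float* y) {
    const int m = n - (n % 4);       // elements covered by the SSE loop
    int32_t i = 0;
#ifdef _OPENMP
    #pragma omp parallel for num_threads(GetOmpThreadsSketch(n))
#endif
    for (i = 0; i < m; i += 4) {     // loop index is private per thread
        __m128 x1 = _mm_loadu_ps(a + i);
        __m128 y1 = _mm_loadu_ps(b + i);
        _mm_storeu_ps(y + i, _mm_add_ps(x1, y1));
    }
    for (int j = m; j < n; ++j) y[j] = a[j] + b[j];
}

Built with -fopenmp -msse4.1 (or /openmp on MSVC, matching the flags this commit adds to CMakeLists.txt), the main loop runs across several threads while the leftover elements are handled by the scalar tail.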
...
...
README.md
View file @
6eeac5f
# Dragon: A Computation Graph Virtual Machine Based Deep Learning Framework
![](http://images.cnblogs.com/cnblogs_com/neopenx/690760/o_dragon_logo.png)
-----
### Compile Requirements for C++
0. Google Protocol Buffer
...
...