Commit 6eeac5fe by Ting PAN

add omp optimization

1 parent 007d9c21
......@@ -12,6 +12,7 @@ option(WITH_PYTHON3 "Set ON to use PYTHON3 otherwise PYTHON2" OF
option(WITH_CUDA "Set ON to use CUDA" ON)
option(WITH_CUDNN "Set ON to use CUDNN" OFF)
option(WITH_BLAS "Set ON to use BLAS" OFF)
option(WITH_OMP "Set ON to use OpenMP" OFF)
option(WITH_SSE "Set ON to use SSE 4.1" ON)
option(WITH_MPI "Set ON to use MPI" OFF)
option(WITH_MPI_CUDA "Set ON to use MPI-CUDA" OFF)
......@@ -22,7 +23,7 @@ option(WITH_CUDA_FP16 "Set ON to use FP16" ON)
set(3RDPARTY_DIR ${PROJECT_SOURCE_DIR}/../3rdparty)
# set your python environment
set(PYTHON_DIR /usr/include/python2.7) # prefer
set(PYTHON_DIR /usr/include/python2.7) # preferred
#set(PYTHON_DIR /usr/include/python3.x) # optional, set specific version
#set(ANACONDA_DIR /xxx/anaconda) # optional, root folder of anaconda, preset for 2.7, 3.5, and 3.6
set(NUMPY_DIR /xxx/numpy) # required, root folder of numpy package
......@@ -118,6 +119,10 @@ else()
"\n -- > GEMM/GEMV is disabled"
"\n -- > prefer not to run as CPU Mode")
endif()
if (WITH_OMP)
ADD_DEFINITIONS(-DWITH_OMP)
message(STATUS "Use OpenMP [Optional]")
endif()
if (WITH_SSE)
ADD_DEFINITIONS(-DWITH_SSE)
message(STATUS "Use SSE [Optional]")
......@@ -145,11 +150,18 @@ endif()
# ---[ Flags
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${CUDA_ARCH}")
if(WIN32)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP /O2")
if (WITH_OMP)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /openmp")
endif()
endif()
if(UNIX)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -O2 -m64 -fpermissive -std=c++11")
if (WITH_OMP)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fopenmp")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
endif()
endif()
# ---[ Warnings
......
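Note: the new WITH_OMP option only takes effect when both pieces above are present, the WITH_OMP compile definition (so the #ifdef WITH_OMP guards are compiled in) and the compiler's OpenMP flag (/openmp on MSVC, -fopenmp on GCC/Clang, so the pragmas are honored). A minimal, illustrative C++ sketch of the guard pattern this commit repeats:

#include <vector>
#ifdef WITH_OMP
#include <omp.h>
#endif

// Built with -DWITH_OMP plus -fopenmp (or /openmp), the loop runs in parallel;
// otherwise the pragma is compiled out and the loop stays serial.
void scale_inplace(std::vector<float>& v, const float alpha) {
#ifdef WITH_OMP
    #pragma omp parallel for
#endif
    for (int i = 0; i < (int)v.size(); ++i) v[i] *= alpha;
}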
// --------------------------------------------------------
// Dragon
// Copyright(c) 2017 SeetaTech
// Written by Ting Pan
// --------------------------------------------------------
#ifndef DRAGON_UTILS_OMP_ALTERNATIVE_H_
#define DRAGON_UTILS_OMP_ALTERNATIVE_H_
#ifdef WITH_OMP
#include <algorithm>
#include <omp.h>
namespace dragon {
#define OMP_MIN_ITERATORS_PER_CORE 256
inline int GET_OMP_THREADS(const int N) {
int threads = std::max(N / OMP_MIN_ITERATORS_PER_CORE, 1);
return std::min(threads, omp_get_num_procs());
}
}
#endif // WITH_OMP
#endif // DRAGON_UTILS_OMP_ALTERNATIVE_H_
\ No newline at end of file
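The helper above picks a thread count from the problem size: at least 256 iterations per thread, never more threads than processors. For example, n = 4096 requests min(4096 / 256, omp_get_num_procs()) threads, while a tiny loop requests a single thread. A small, illustrative sketch of the arithmetic (compile with OpenMP enabled):

#include <algorithm>
#include <cstdio>
#include <omp.h>

// Same heuristic as GET_OMP_THREADS above: >= 256 iterations per thread,
// capped at the number of available processors.
static int threads_for(const int n) {
    const int threads = std::max(n / 256, 1);
    return std::min(threads, omp_get_num_procs());
}

int main() {
    const int sizes[] = { 100, 4096, 1 << 20 };
    for (int n : sizes)
        std::printf("n = %7d -> %d threads\n", n, threads_for(n));
    return 0;
}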
......@@ -15,11 +15,10 @@
namespace dragon {
#define SSE_LOOP1(i, n) \
int32_t i; \
for (i = 0; i < n - 4; i += 4) \
#define SSE_LOOP2(i, n) \
for (; i < n; i++)
for (; i < n; ++i)
#define SSE_FP32_LOAD _mm_loadu_ps
#define SSE_FP32_STORE _mm_storeu_ps
......
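Expanded by hand, the two loop macros above split a length-n array operation into a 4-wide SSE body and a scalar tail. An illustrative, self-contained sketch of the same pattern, using the unaligned load/store intrinsics named in the hunk:

#include <cstdint>
#include <xmmintrin.h>  // SSE

void add_sse(const int32_t n, const float* a, const float* b, float* y) {
    int32_t i = 0;
    // SSE_LOOP1: process 4 floats per step with unaligned loads/stores
    for (i = 0; i < n - 4; i += 4) {
        const __m128 x1 = _mm_loadu_ps(a + i);
        const __m128 y1 = _mm_loadu_ps(b + i);
        _mm_storeu_ps(y + i, _mm_add_ps(x1, y1));
    }
    // SSE_LOOP2: scalar tail for whatever the vector body did not cover
    for (; i < n; ++i) y[i] = a[i] + b[i];
}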
......@@ -53,18 +53,24 @@ void AddOp<Context>::RunOnDevice() {
}
else if (input(0).dim(0) == input(1).dim(0) && input(1).count(1) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(2);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(2);
#endif
else LOG(FATAL) << "unsupported input types.";
}
else if (input(0).dim(-1) == input(1).dim(-1) &&
input(1).count(0, input(1).axis(-1)) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(1);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(1);
#endif
else LOG(FATAL) << "unsupported input types.";
}
else if (input(1).ndim() == 1 && input(1).dim(0) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(0);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(0);
#endif
else LOG(FATAL) << "unsupported input types.";
}
else {
......@@ -139,18 +145,24 @@ void AddGradientOp<Context>::RunOnDevice() {
}
else if (input(-1).dim(0) == input(0).dim(0) && input(0).count(1) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(2);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(2);
#endif
else LOG(FATAL) << "unsupported input types.";
}
else if (input(-1).dim(-1) == input(0).dim(-1) &&
input(0).count(0, input(0).axis(-1)) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(1);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(1);
#endif
else LOG(FATAL) << "unsupported input types.";
}
else if (input(0).ndim() == 1 && input(0).dim(0) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(0);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(0);
#endif
else LOG(FATAL) << "unsupported input types.";
}
else {
......
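The same WITH_CUDA_FP16 guard is applied to every float16 dispatch throughout the arithmetic ops in this commit. A toy, runnable illustration of the shape of that dispatch (not the framework's own API):

#include <cstdio>

enum class DType { kFloat32, kFloat16 };

// With WITH_CUDA_FP16 defined, float16 inputs get their own path;
// otherwise they fall through to the "unsupported" branch.
static void Run(const DType t) {
    if (t == DType::kFloat32) std::printf("run<float>\n");
#ifdef WITH_CUDA_FP16
    else if (t == DType::kFloat16) std::printf("run<float16>\n");
#endif
    else std::printf("FATAL: unsupported input types.\n");
}

int main() {
    Run(DType::kFloat32);
    Run(DType::kFloat16);
    return 0;
}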
......@@ -54,18 +54,24 @@ void DivOp<Context>::RunOnDevice() {
}
else if (input(0).dim(0) == input(1).dim(0) && input(1).count(1) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(2);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(2);
#endif
else LOG(FATAL) << "unsupported input types";
}
else if (input(0).dim(-1) == input(1).dim(-1) &&
input(1).count(0, input(1).axis(-1)) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(1);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(1);
#endif
else LOG(FATAL) << "unsupported input types";
}
else if (input(1).ndim() == 1 && input(1).dim(0) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(0);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(0);
#endif
else LOG(FATAL) << "unsupported input types";
}
else {
......@@ -170,18 +176,24 @@ void DivGradientOp<Context>::RunOnDevice() {
}
else if (input(0).dim(0) == input(1).dim(0) && input(1).count(1) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(2);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(2);
#endif
else LOG(FATAL) << "unsupported input types";
}
else if (input(0).dim(-1) == input(1).dim(-1) &&
input(1).count(0, input(1).axis(-1)) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(1);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(1);
#endif
else LOG(FATAL) << "unsupported input types";
}
else if (input(1).ndim() == 1 && input(1).dim(0) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(0);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(0);
#endif
else LOG(FATAL) << "unsupported input types";
}
else {
......
......@@ -55,7 +55,9 @@ void DotOp<Context>::RunOnDevice() {
dims[dims.size() - 1] = N1;
output(0)->Reshape(dims);
if (input(0).template IsType<float>()) GemmRunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) GemmRunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
else if (input(0).ndim() >= 2 && input(1).ndim() == 1) {
......@@ -70,7 +72,9 @@ void DotOp<Context>::RunOnDevice() {
dims.pop_back();
output(0)->Reshape(dims);
if (input(0).template IsType<float>()) GemvRunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) GemvRunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
else {
......@@ -148,7 +152,9 @@ void DotGradientOp<Context>::RunOnDevice() {
<< input(0).dim_string() << " can not Dot with Tensor"
<< "(" << input(1).name() << "): " << input(1).dim_string();
if (input(0).template IsType<float>()) GemmRunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) GemmRunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
else if (input(0).ndim() >= 2 && input(1).ndim() == 1) {
......@@ -160,7 +166,9 @@ void DotGradientOp<Context>::RunOnDevice() {
<< input(0).dim_string() << " can not Dot with Tensor"
<< "(" << input(1).name() << "): " << input(1).dim_string();
if (input(0).template IsType<float>()) GemvRunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) GemvRunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
else {
......
......@@ -41,12 +41,16 @@ void EltwiseOp<Context>::RunOnDevice() {
if (operation == "SUM") {
if (input(0).template IsType<float>()) SumRunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) SumRunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
else if (operation == "PROD") {
if (input(0).template IsType<float>()) ProdRunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) ProdRunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
else {
......@@ -104,12 +108,16 @@ void EltwiseGradientOp<Context>::RunOnDevice() {
if (operation == "SUM") {
if (input(0).template IsType<float>()) SumRunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) SumRunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
else if (operation == "PROD") {
if (input(0).template IsType<float>()) ProdRunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) ProdRunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
else {
......
......@@ -25,7 +25,9 @@ void GramMatrixOp<Context>::RunOnDevice() {
output(0)->Reshape(vector<TIndex>({ outer_dim, dim, dim }));
if (input(0).template IsType<float>()) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) RunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
......@@ -57,7 +59,9 @@ void GramMatrixGradientOp<Context>::RunOnDevice() {
output(0)->ReshapeLike(input(0));
if (input(0).template IsType<float>()) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) RunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
......
......@@ -48,7 +48,9 @@ void MatmulOp<Context>::RunOnDevice() {
dims[dims.size() - 1] = N;
output(0)->Reshape(dims);
if (input(0).template IsType<float>()) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) RunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
......@@ -105,7 +107,9 @@ void MatmulGradientOp<Context>::RunOnDevice() {
output(0)->ReshapeLike(input(0));
output(1)->ReshapeLike(input(1));
if (input(0).template IsType<float>()) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) RunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
......
......@@ -54,18 +54,24 @@ void MulOp<Context>::RunOnDevice() {
}
else if (input(0).dim(0) == input(1).dim(0) && input(1).count(1) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(2);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(2);
#endif
else LOG(FATAL) << "unsupported input types.";
}
else if (input(0).dim(-1) == input(1).dim(-1) &&
input(1).count(0, input(1).axis(-1)) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(1);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(1);
#endif
else LOG(FATAL) << "unsupported input types.";
}
else if (input(1).ndim() == 1 && input(1).dim(0) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(0);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(0);
#endif
else LOG(FATAL) << "unsupported input types.";
}
else {
......@@ -158,18 +164,24 @@ void MulGradientOp<Context>::RunOnDevice() {
}
else if (input(0).dim(0) == input(1).dim(0) && input(1).count(1) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(2);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(2);
#endif
else LOG(FATAL) << "unsupported input types.";
}
else if (input(0).dim(-1) == input(1).dim(-1) &&
input(1).count(0, input(1).axis(-1)) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(1);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(1);
#endif
else LOG(FATAL) << "unsupported input types.";
}
else if (input(1).ndim() == 1 && input(1).dim(0) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(0);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(0);
#endif
else LOG(FATAL) << "unsupported input types.";
}
else {
......
......@@ -26,7 +26,9 @@ void PowOp<Context>::RunOnDevice() {
output(0)->ReshapeLike(input(0));
if (input(0).template IsType<float>()) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) RunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
......@@ -76,7 +78,9 @@ void PowGradientOp<Context>::RunOnDevice() {
output(0)->ReshapeLike(input(0));
if (input(0).template IsType<float>()) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) RunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
......
......@@ -37,7 +37,9 @@ void ScaleOp<Context>::RunOnDevice() {
output(0)->ReshapeLike(input(0));
if (input(0).template IsType<float>()) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) RunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
......
......@@ -53,18 +53,24 @@ void SubOp<Context>::RunOnDevice() {
}
else if (input(0).dim(0) == input(1).dim(0) && input(1).count(1) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(2);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(2);
#endif
else LOG(FATAL) << "unsupported input types.";
}
else if (input(0).dim(-1) == input(1).dim(-1) &&
input(1).count(0, input(1).axis(-1)) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(1);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(1);
#endif
else LOG(FATAL) << "unsupported input types.";
}
else if (input(1).ndim() == 1 && input(1).dim(0) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(0);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(0);
#endif
else LOG(FATAL) << "unsupported input types.";
}
else {
......@@ -139,18 +145,24 @@ void SubGradientOp<Context>::RunOnDevice() {
}
else if (input(-1).dim(0) == input(0).dim(0) && input(0).count(1) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(2);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(2);
#endif
else LOG(FATAL) << "unsupported input types.";
}
else if (input(-1).dim(-1) == input(0).dim(-1) &&
input(0).count(0, input(0).axis(-1)) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(1);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(1);
#endif
else LOG(FATAL) << "unsupported input types.";
}
else if (input(0).ndim() == 1 && input(0).dim(0) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(0);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(0);
#endif
else LOG(FATAL) << "unsupported input types.";
}
else {
......
......@@ -49,7 +49,9 @@ void ConcatOp<Context>::RunOnDevice() {
}
if (input(0).template IsType<float>()) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) RunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
......@@ -96,7 +98,9 @@ void ConcatGradientOp<Context>::RunOnDevice() {
}
if (input(0).template IsType<float>()) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) RunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
......
......@@ -45,7 +45,9 @@ void TransposeOp<Context>::RunOnDevice() {
}
if (input(0).template IsType<float>()) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) RunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
......@@ -75,7 +77,9 @@ void TransposeGradientOp<Context>::RunOnDevice() {
new_steps = ws()->GetTensor("_t_" + anchor() + "_new_steps");
if (input(0).template IsType<float>()) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) RunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
......
......@@ -127,7 +127,9 @@ void BatchNormOp<Context>::RunOnDevice() {
if (input(0).template IsType<float>()) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) RunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
......@@ -247,7 +249,9 @@ void BatchNormGradientOp<Context>::RunOnDevice() {
else use_global_stats = use_stats == 1 ? true : false;
if (input(0).template IsType<float>()) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) RunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
......
......@@ -78,7 +78,9 @@ void L2NormOp<Context>::RunOnDevice() {
output(0)->ReshapeLike(input(0));
if (input(0).template IsType<float>()) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) RunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
......
......@@ -4,6 +4,8 @@
namespace dragon {
#ifdef WITH_CUDA_FP16
template <class Context>
void FloatToHalfOp<Context>::RunOnDevice() {
CHECK(input(0).template IsType<float>())
......@@ -28,4 +30,6 @@ OPERATOR_SCHEMA(FloatToHalf).NumInputs(1).NumOutputs(1);
NO_GRADIENT(FloatToHalf);
#endif
} // namespace dragon
\ No newline at end of file
......@@ -19,7 +19,9 @@ void GradientGenerateOp<Context>::RunWithType() {
template <class Context>
void GradientGenerateOp<Context>::RunOnDevice() {
if (input(0).template IsType<float>()) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) RunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
......
......@@ -23,12 +23,16 @@ void MemoryDataOp<Context>::RunOnDevice() {
if (input(0).template IsType<float>()) {
if (data_type == TensorProto_DataType_FLOAT) RunWithType<float, float>();
#ifdef WITH_CUDA_FP16
else if (data_type == TensorProto_DataType_FLOAT16) RunWithType<float, float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
else if (input(0).template IsType<uint8_t>()) {
if (data_type == TensorProto_DataType_FLOAT) RunWithType<uint8_t, float>();
#ifdef WITH_CUDA_FP16
if (data_type == TensorProto_DataType_FLOAT16) RunWithType<uint8_t, float16>();
#endif
}
else { LOG(FATAL) << "unsupported input types."; }
}
......
......@@ -58,7 +58,9 @@ void DenseConcatGradientOp<Context>::ElimateCorruption() {
input(0).Move(buffer->memory());
head_data[idx] = input(0).name();
if (input(-2).template IsType<float>()) RestoreX1<float>();
#ifdef WITH_CUDA_FP16
else if (input(-2).template IsType<float16>()) RestoreX1<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
// post-process
if (input(0).memory() != buffer->memory()) buffer->Move(input(0).memory());
......
#include <random>
#include "core/context.h"
#include "utils/math_functions.h"
#ifdef WITH_SSE
#include "utils/omp_alternative.h"
#include "utils/sse_alternative.h"
#endif
#include "utils/math_functions.h"
namespace dragon {
......@@ -22,9 +20,12 @@ template <> void Set<float, CPUContext>(const int n,
}
#ifdef WITH_SSE
sse::Set<float>(n, alpha, x);
#else // naive implement
#else
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) x[i] = alpha;
#endif
#endif // WITH_SSE
}
template <> void Set<int, CPUContext>(const int n,
......@@ -36,9 +37,12 @@ template <> void Set<int, CPUContext>(const int n,
}
#ifdef WITH_SSE
sse::Set<int>(n, alpha, x);
#else // naive implement
#else
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) x[i] = alpha;
#endif
#endif // WITH_SSE
}
template <> void Set<float16, CPUContext>(const int n,
......@@ -52,9 +56,10 @@ template <> void RandomUniform<float, CPUContext>(const int n,
const float high,
float* x) {
std::uniform_real_distribution<float> distribution(low, high);
for (int i = 0; i < n; ++i) {
x[i] = distribution(*rand_generator());
}
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) x[i] = distribution(*rand_generator());
}
template <> void RandomUniform<float16, CPUContext>(const int n,
......@@ -69,9 +74,10 @@ template <> void RandomUniform<uint32_t, CPUContext>(const int n,
const float high,
uint32_t* x) {
std::uniform_int_distribution<uint32_t> distribution(low, high);
for (int i = 0; i < n; ++i) {
x[i] = distribution(*rand_generator());
}
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) x[i] = distribution(*rand_generator());
}
template <> void RandomNormal<float, CPUContext>(const int n,
......@@ -79,9 +85,10 @@ template <> void RandomNormal<float, CPUContext>(const int n,
const float sigma,
float* x) {
std::normal_distribution<float> distribution(mu, sigma);
for (int i = 0; i < n; ++i) {
x[i] = distribution(*rand_generator());
}
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) x[i] = distribution(*rand_generator());
}
template <> void RandomNormal<float16, CPUContext>(const int n,
......@@ -121,9 +128,10 @@ template <> void RandomBernoulli<float, CPUContext>(const int n,
const float p,
uint32_t* x) {
std::bernoulli_distribution distribution(p);
for (int i = 0; i < n; ++i) {
x[i] = distribution(*rand_generator());
}
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) x[i] = distribution(*rand_generator());
}
/******************** Level-1 ********************/
......@@ -134,9 +142,12 @@ template <> void Add<float, CPUContext>(const int n,
float* y) {
#ifdef WITH_SSE
sse::Add<float>(n, a, b, y);
#else // naive implement
for (int i = 0; i < n; ++i) y[i] = a[i] + b[i];
#else
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) y[i] = a[i] + b[i];
#endif // WITH_SSE
}
template <> void Sub<float, CPUContext>(const int n,
......@@ -145,9 +156,12 @@ template <> void Sub<float, CPUContext>(const int n,
float* y) {
#ifdef WITH_SSE
sse::Sub<float>(n, a, b, y);
#else // naive implement
for (int i = 0; i < n; ++i) y[i] = a[i] - b[i];
#else
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) y[i] = a[i] - b[i];
#endif // WITH_SSE
}
template <> void Mul<float, CPUContext>(const int n,
......@@ -156,9 +170,12 @@ template <> void Mul<float, CPUContext>(const int n,
float* y) {
#ifdef WITH_SSE
sse::Mul<float>(n, a, b, y);
#else // naive implement
for (int i = 0; i < n; ++i) y[i] = a[i] * b[i];
#else
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) y[i] = a[i] * b[i];
#endif // WITH_SSE
}
template <> void Mul<float16, CPUContext>(const int n,
......@@ -174,9 +191,12 @@ template <> void Div<float, CPUContext>(const int n,
float* y) {
#ifdef WITH_SSE
sse::Div<float>(n, a, b, y);
#else // naive implement
#else
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) y[i] = a[i] / b[i];
#endif
#endif // WITH_SSE
}
template <> void Div<float16, CPUContext>(const int n,
......@@ -190,6 +210,9 @@ template <> void Clip<float, CPUContext>(const int n,
const float low,
const float high,
float* x) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) {
x[i] = std::max(low, std::min(x[i], high));
}
......@@ -198,6 +221,9 @@ template <> void Clip<float, CPUContext>(const int n,
template <> void Exp<float, CPUContext>(int n,
const float* x,
float* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) {
y[i] = std::exp(x[i]);
}
......@@ -206,6 +232,9 @@ template <> void Exp<float, CPUContext>(int n,
template <> void Log<float, CPUContext>(int n,
const float* x,
float* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) {
y[i] = std::log(x[i]);
}
......@@ -214,6 +243,9 @@ template <> void Log<float, CPUContext>(int n,
template <> void Square<float, CPUContext>(int n,
const float* x,
float* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) {
y[i] = x[i] * x[i];
}
......@@ -228,6 +260,9 @@ template <> void Square<float16, CPUContext>(int n,
template <> void Sqrt<float, CPUContext>(int n,
const float* x,
float* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) {
y[i] = std::sqrt(x[i]);
}
......@@ -243,6 +278,9 @@ template <> void Pow<float, CPUContext>(int n,
const float alpha,
const float* x,
float* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) {
y[i] = std::pow(x[i], alpha);
}
......@@ -259,6 +297,9 @@ template <> void Inv<float, CPUContext>(const int n,
const float numerator,
const float* x,
float* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) {
y[i] = 1.0 / y[i];
}
......@@ -280,9 +321,12 @@ template <> void Scal<float, CPUContext>(const int n,
cblas_sscal(n, alpha, y, 1);
#elif WITH_SSE
sse::Scal<float>(n, alpha, y);
#else // naive implement
for (int i = 0; i < n; ++i) y[i] = y[i] * alpha;
#else
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) y[i] = y[i] * alpha;
#endif // WITH_BLAS
}
template <> void Scal<float16, CPUContext>(const int n,
......@@ -307,9 +351,12 @@ template <> void Scale<float, CPUContext>(const int n,
cblas_sscal(n, alpha, y, 1);
#elif WITH_SSE
sse::Scale<float>(n, alpha, x, y);
#else // naive implement
for (int i = 0; i < n; ++i) y[i] = x[i] * alpha;
#else
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) y[i] = x[i] * alpha;
#endif // WITH_BLAS
}
template <> float StridedDot<float, CPUContext>(const int n,
......@@ -319,11 +366,14 @@ template <> float StridedDot<float, CPUContext>(const int n,
const int incy) {
#ifdef WITH_BLAS
return cblas_sdot(n, a, incx, b, incy);
#else // naive implement
#else
float ret = 0.f;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) ret += a[i] * b[i];
return ret;
#endif
#endif // WITH_BLAS
}
template <> float Dot<float, CPUContext>(int n,
......@@ -333,11 +383,14 @@ template <> float Dot<float, CPUContext>(int n,
return StridedDot<float, CPUContext>(n, a, 1, b, 1);
#elif WITH_SSE
return sse::Dot<float>(n, a, b);
#else // naive implement
#else
float ret = 0.f;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) ret += a[i] * b[i];
return ret;
#endif
#endif // WITH_BLAS
}
template <> float Dot<float16, CPUContext>(int n,
......@@ -350,23 +403,29 @@ template <> float Dot<float16, CPUContext>(int n,
template <> float ASum<float, CPUContext>(const int n, const float* x) {
#ifdef WITH_BLAS
return cblas_sasum(n, x, 1);
#elif WITH_SSE
#elif WITH_SSE
return sse::ASum<float>(n, x);
#else // naive implement
#else
float ret = 0.f;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) ret += x[i];
return ret;
#endif
#endif // WITH_BLAS
}
template <> void AddScalar<float, CPUContext>(const int n,
const float alpha,
float* y) {
#ifdef WITH_SSE
#ifdef WITH_SSE
sse::AddScalar<float>(n, alpha, y);
#else // naive implement
for (int i = 0; i < n; ++i) y[i] += alpha;
#else
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) y[i] += alpha;
#endif // WITH_SSE
}
template <> void AddScalar<float16, CPUContext>(const int n,
......@@ -378,11 +437,14 @@ template <> void AddScalar<float16, CPUContext>(const int n,
template <> void MulScalar<float, CPUContext>(const int n,
const float alpha,
float* y) {
#ifdef WITH_SSE
#ifdef WITH_SSE
sse::MulScalar<float>(n, alpha, y);
#else // naive implement
for (int i = 0; i < n; ++i) y[i] *= alpha;
#else
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) y[i] *= alpha;
#endif // WITH_SSE
}
template <> void Axpy<float, CPUContext>(const int n,
......@@ -393,9 +455,12 @@ template <> void Axpy<float, CPUContext>(const int n,
cblas_saxpy(n, alpha, x, 1, y, 1);
#elif WITH_SSE
sse::Axpy<float>(n, alpha, x, y);
#else // naive implement
for (int i = 0; i < n; ++i) y[i] = alpha * x[i] + y[i];
#else
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) y[i] = alpha * x[i] + y[i];
#endif // WITH_BLAS
}
template <> void Axpy<float16, CPUContext>(const int n,
......@@ -415,9 +480,12 @@ template <> void Axpby<float, CPUContext>(const int n,
cblas_saxpy(n, alpha, x, 1, y, 1);
#elif WITH_SSE
sse::Axpby<float>(n, alpha, x, beta, y);
#else // naive implement
for (int i = 0; i < n; ++i) y[i] = alpha * x[i] + beta* y[i];
#else
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) y[i] = alpha * x[i] + beta* y[i];
#endif // WITH_BLAS
}
template <> void Axpby<float16, CPUContext>(const int n,
......
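One detail worth noting about the accumulating fallbacks above (StridedDot, Dot, ASum): every iteration adds into a single ret, so when such a loop is run in parallel the standard OpenMP idiom is a reduction clause, which gives each thread a private partial sum and combines them at the end. A minimal, illustrative sketch (not the code above):

#ifdef WITH_OMP
#include <omp.h>
#endif

float dot_naive(const int n, const float* a, const float* b) {
    float ret = 0.f;
#ifdef WITH_OMP
    // reduction(+:ret): per-thread partial sums, combined when the loop finishes
    #pragma omp parallel for reduction(+:ret)
#endif
    for (int i = 0; i < n; ++i) ret += a[i] * b[i];
    return ret;
}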
......@@ -40,6 +40,7 @@ template <> void Set<int, CUDAContext>(const int n,
_Set<int> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, alpha, x);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _SetHalf2(const int n, const half2 alpha, half2* x) {
CUDA_KERNEL_LOOP(idx, n) {
......@@ -61,6 +62,7 @@ template <> void Set<float16, CUDAContext>(const int n,
_Set<float16> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, alpha, x);
}
}
#endif
template <> void RandomUniform<uint32_t, CUDAContext>(const int n,
const float low,
......@@ -144,6 +146,7 @@ template <> void Mul<float, CUDAContext>(int n,
_Mul<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, a, b, y);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _MulHalf(const int n, const half* a, const half* b, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
......@@ -161,7 +164,7 @@ __global__ void _MulHalf2(const int n, const half2* a, const half2* b, half2* y)
#endif
}
}
template <> void Mul<float16, CUDAContext>(int n,
const float16* a,
const float16* b,
......@@ -176,6 +179,7 @@ template <> void Mul<float16, CUDAContext>(int n,
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
}
#endif
template <typename T>
__global__ void _Div(const int n, const T* a, const T* b, T* y) {
......@@ -191,6 +195,7 @@ template <> void Div<float, CUDAContext>(int n,
_Div<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, a, b, y);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _DivHalf(const int n, const half* a, const half* b, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
......@@ -209,6 +214,7 @@ template <> void Div<float16, CUDAContext>(int n,
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
}
#endif
template <typename T>
__global__ void _Clip(const int n, const T low, const T high, T* x) {
......@@ -260,6 +266,7 @@ template <> void Square<float, CUDAContext>(int n,
_Square<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, x, y);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _SquareHalf(const int n, const half* x, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
......@@ -290,6 +297,7 @@ template <> void Square<float16, CUDAContext>(int n,
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
}
#endif
template <typename T>
__global__ void _Sqrt(const int n, const T* x, T* y) {
......@@ -304,6 +312,7 @@ template <> void Sqrt<float, CUDAContext>(int n,
_Sqrt<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, x, y);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _SqrtHalf(const int n, const half* x, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
......@@ -334,6 +343,7 @@ template <> void Sqrt<float16, CUDAContext>(int n,
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
}
#endif
template <typename T>
__global__ void _Pow(const int n, const T alpha, const T* a, T* y) {
......@@ -349,6 +359,7 @@ template <> void Pow<float, CUDAContext>(int n,
_Pow<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, alpha, x, y);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _PowHalf(const int n, const float alpha, const half* a, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
......@@ -384,6 +395,7 @@ template <> void Pow<float16, CUDAContext>(int n,
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
}
#endif
template <typename T>
__global__ void _Inv(const int n, const float numerator, const T* x, T* y) {
......@@ -399,6 +411,7 @@ template <> void Inv<float, CUDAContext>(const int n,
_Inv<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, numerator, x, y);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _InvHalf(const int n, const half numerator, const half* x, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
......@@ -439,6 +452,7 @@ template <> void Inv<float16, CUDAContext>(const int n,
}
CUDA_POST_KERNEL_CHECK;
}
#endif
/******************** Level-2 ********************/
......@@ -518,6 +532,7 @@ template <> void AddScalar<float, CUDAContext>(const int n, const float alpha, f
_AddScalar<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, alpha, y);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _AddScalarHalf(const int n, half alpha, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
......@@ -552,6 +567,7 @@ template <> void AddScalar<float16, CUDAContext>(const int n, const float alpha,
}
CUDA_POST_KERNEL_CHECK;
}
#endif
template <typename T>
__global__ void _MulScalar(const int n, T alpha, T* y) {
......@@ -641,6 +657,7 @@ template <> void Gemm<float, CUDAContext>(const CBLAS_TRANSPOSE transA,
C, N));
}
#ifdef WITH_CUDA_FP16
template <> void Gemm<float16, CUDAContext>(const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB,
const int M,
......@@ -682,6 +699,7 @@ template <> void Gemm<float16, CUDAContext>(const CBLAS_TRANSPOSE transA,
LOG(FATAL) << "unsupported math type";
}
}
#endif
template <> void Gemv<float, CUDAContext>(const CBLAS_TRANSPOSE transA,
const int M, const int N,
......@@ -702,6 +720,7 @@ template <> void Gemv<float, CUDAContext>(const CBLAS_TRANSPOSE transA,
y, 1));
}
#ifdef WITH_CUDA_FP16
template <> void Gemv<float16, CUDAContext>(const CBLAS_TRANSPOSE transA,
const int M,
const int N,
......@@ -742,6 +761,7 @@ template <> void Gemv<float16, CUDAContext>(const CBLAS_TRANSPOSE transA,
LOG(FATAL) << "unsupported math type";
}
}
#endif
} // namespace math
......
......@@ -3,11 +3,9 @@
#include "core/tensor.h"
#include "utils/op_kernel.h"
#include "utils/math_functions.h"
#ifdef WITH_SSE
#include "utils/omp_alternative.h"
#include "utils/sse_alternative.h"
#endif
#include "utils/math_functions.h"
bool judge(int a, int b) { return unsigned(a) < unsigned(b); }
......@@ -28,8 +26,10 @@ template<> void Dropout<float, CPUContext>(const int count,
CPUContext* context) {
uint32_t thresh = static_cast<uint32_t>(UINT_MAX * prob);
math::RandomBernoulli<float, CPUContext>(count, 1 - prob, mask);
for (int i = 0; i < count; ++i)
y[i] = x[i] * mask[i] * scale;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) y[i] = x[i] * mask[i] * scale;
}
template<> void DropoutGrad<float, CPUContext>(const int count,
......@@ -38,8 +38,10 @@ template<> void DropoutGrad<float, CPUContext>(const int count,
const float* dy,
const uint32_t* mask,
float* dx) {
for (int i = 0; i < count; ++i)
dx[i] = dy[i] * mask[i] * scale;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) dx[i] = dy[i] * mask[i] * scale;
}
/******************** activation.relu ********************/
......@@ -48,6 +50,9 @@ template<> void Relu<float, CPUContext>(const int count,
const float* x,
const float slope,
float* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
y[i] = std::max(x[i], 0.f) + slope * std::min(x[i], 0.f);
}
......@@ -58,10 +63,12 @@ template<> void ReluGrad<float, CPUContext>(const int count,
const float* y,
const float slope,
float* dx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
dx[i] = dy[i] * ((y[i] > 0) + slope * (y[i] <= 0));
}
}
/******************** activation.sigmoid ********************/
......@@ -70,15 +77,19 @@ template <typename T>
T _sigmoid(T x) { return T(1) / (T(1) + exp(-x)); }
template<> void Sigmoid<float, CPUContext>(const int count, const float* x, float* y) {
for (int i = 0; i < count; ++i) {
y[i] = _sigmoid<float>(x[i]);
}
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) y[i] = _sigmoid<float>(x[i]);
}
template<> void SigmoidGrad<float, CPUContext>(const int count,
const float* dy,
const float* y,
float* dx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
dx[i] = dy[i] * y[i] * (1 - y[i]);
}
......@@ -149,6 +160,9 @@ template<> void SoftmaxGrad<float, CPUContext>(const int count,
/******************** activation.tanh ********************/
template<> void Tanh<float, CPUContext>(const int count, const float* x, float* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
y[i] = std::tanh(x[i]);
}
......@@ -158,6 +172,9 @@ template<> void TanhGrad<float, CPUContext>(const int count,
const float* dy,
const float* y,
float* dx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
dx[i] = dy[i] * (1 - y[i] * y[i]);
}
......@@ -197,6 +214,9 @@ template <> void Clip<float, CPUContext>(const int count,
const float* x,
float* mask,
float* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
mask[i] = 1.0;
if (x[i] < low || x[i] > high) mask[i] = 0.0;
......@@ -300,8 +320,10 @@ template<> void Argmax<float, CPUContext>(const int count,
/******************** common.at ********************/
template <> void CanonicalAxis<float, CPUContext>(const int count, const int dim, float* y) {
for (int i = 0; i < count; ++i)
if (y[i] < 0) y[i] += dim;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) if (y[i] < 0) y[i] += dim;
}
template <> void At<float, CPUContext>(const int count,
......@@ -478,6 +500,9 @@ template<> void Sum<float, CPUContext>(const int count,
const int inner_dim,
const float* x,
float* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
float sum_val = 0.0;
for (int j = 0; j < axis_dim; ++j)
......@@ -492,6 +517,9 @@ template<> void SumGrad<float, CPUContext>(const int count,
const float coeff,
const float* dy,
float* dx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
for (int j = 0; j < axis_dim; ++j)
dx[(i / inner_dim * axis_dim + j) * inner_dim + i % inner_dim] = dy[i] * coeff;
......@@ -585,6 +613,9 @@ template <> void Transpose<float, CPUContext>(const int count,
const int* new_steps,
const float* x,
float* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
int x_idx = 0, y_idx = i;
for (int j = 0; j < ndim; ++j) {
......@@ -603,15 +634,7 @@ template <> void Transpose<float16, CPUContext>(const int count,
const int* new_steps,
const float16* x,
float16* y) {
for (int i = 0; i < count; ++i) {
int x_idx = 0, y_idx = i;
for (int j = 0; j < ndim; ++j) {
int k = order[j];
x_idx += (y_idx / new_steps[j]) * old_steps[k];
y_idx %= new_steps[j];
}
y[i] = x[x_idx];
}
LOG(FATAL) << "unsupport float16 with CPU";
}
template <> void TransposeGrad<float, CPUContext>(const int count,
......@@ -621,6 +644,9 @@ template <> void TransposeGrad<float, CPUContext>(const int count,
const int* new_steps,
const float* dy,
float* dx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
int x_idx = 0, y_idx = i;
for (int j = 0; j < ndim; ++j) {
......@@ -639,20 +665,15 @@ template <> void TransposeGrad<float16, CPUContext>(const int count,
const int* new_steps,
const float16* dy,
float16* dx) {
for (int i = 0; i < count; ++i) {
int x_idx = 0, y_idx = i;
for (int j = 0; j < ndim; ++j) {
int k = order[j];
x_idx += (y_idx / new_steps[j]) * old_steps[k];
y_idx %= new_steps[j];
}
dx[x_idx] = dy[i];
}
LOG(FATAL) << "unsupport float16 with CPU";
}
/******************** loss.l1_loss ********************/
template<> void AbsGrad<float, CPUContext>(const int count, const float* dy, float* dx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
const float val = dy[i];
// val > 0: 1 | val == 0: 0 | val < 0: -1
......@@ -666,6 +687,9 @@ template <> void SigmoidCrossEntropy<float, CPUContext>(const int count,
const float* x,
const float* target,
float* loss) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
loss[i] = std::log(1 + std::exp(x[i] - 2 * x[i] * (x[i] >= 0)))
+ x[i] * ((x[i] >= 0) - target[i]);
......@@ -678,6 +702,9 @@ template<> void SmoothL1<float, CPUContext>(const int count,
const float sigma2,
const float* x,
float* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
const float val = x[i];
const float abs_val = abs(val);
......@@ -690,6 +717,9 @@ template<> void SmoothL1Grad<float, CPUContext>(const int count,
const float sigma2,
const float* dy,
float* dx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
const float val = dy[i];
const float abs_val = abs(val);
......@@ -705,6 +735,9 @@ template <> void SoftmaxCrossEntropy<float, CPUContext>(const int count,
const float* prob,
const float* target,
float* loss) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
loss[i] = - target[i] * std::log(std::max(prob[i], FLT_MIN));
}
......@@ -1016,9 +1049,12 @@ template <> void RMSPropUpdate<float, CPUContext>(const int count,
/******************** utils.compare ********************/
template <> void Equal<float, CPUContext>(const int count,
const float* a,
const float* b,
float* y) {
const float* a,
const float* b,
float* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i)
y[i] = fabs(a[i] - b[i]) < FLT_EPSILON ? 1.0 : 0.0;
}
......@@ -1096,6 +1132,9 @@ template <> void OneHot<float, CPUContext>(const int count,
const int on_value,
const float* x,
float* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
const int val = x[i];
y[i * depth + val] = on_value;
......
......@@ -21,7 +21,7 @@ template<> void Empty<float, CUDAContext>() {
}
template<> void Empty<float16, CUDAContext>() {
_Empty<float> << <1, 1 >> >();
_Empty<float16> << <1, 1 >> >();
CUDA_POST_KERNEL_CHECK;
}
......@@ -102,6 +102,7 @@ template<> void Relu<float, CUDAContext>(const int count,
CUDA_POST_KERNEL_CHECK;
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _ReluHalf(const int count, const half* x, const float slope, half* y) {
const half kSlope = __float2half(slope);
......@@ -123,6 +124,7 @@ template<> void Relu<float16, CUDAContext>(const int count,
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
}
#endif
template <typename T>
__global__ void _ReluGrad(const int count,
......@@ -477,6 +479,7 @@ template<> void Scale<float, CUDAContext>(const int axis,
Ydata);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _ScaleWithoutBiasHalf(const int n,
const half* x,
......@@ -538,6 +541,7 @@ template<> void Scale<float16, CUDAContext>(const int axis,
inner_dim,
reinterpret_cast<half*>(Ydata));
}
#endif
template <> void ScaleGrad<float, CUDAContext>(const int axis,
Tensor* dy,
......@@ -730,6 +734,7 @@ template <> void Concat<float, CUDAContext>(const int count,
CUDA_POST_KERNEL_CHECK;
}
#ifdef WITH_CUDA_FP16
template <> void Concat<float16, CUDAContext>(const int count,
const int outer_dim,
const int inner_dim,
......@@ -749,6 +754,7 @@ template <> void Concat<float16, CUDAContext>(const int count,
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
}
#endif
template <typename T>
__global__ void _ConcatGrad(const int count,
......@@ -789,6 +795,7 @@ template <> void ConcatGrad<float, CUDAContext>(const int count,
CUDA_POST_KERNEL_CHECK;
}
#ifdef WITH_CUDA_FP16
template <> void ConcatGrad<float16, CUDAContext>(const int count,
const int outer_dim,
const int inner_dim,
......@@ -808,6 +815,7 @@ template <> void ConcatGrad<float16, CUDAContext>(const int count,
reinterpret_cast<half*>(dx));
CUDA_POST_KERNEL_CHECK;
}
#endif
/******************** common.crop ********************/
......@@ -1134,6 +1142,7 @@ template <> void Transpose<float, CUDAContext>(const int count,
CUDA_POST_KERNEL_CHECK;
}
#ifdef WITH_CUDA_FP16
template <> void Transpose<float16, CUDAContext>(const int count,
const int ndim,
const int* order,
......@@ -1150,6 +1159,7 @@ template <> void Transpose<float16, CUDAContext>(const int count,
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
}
#endif
template <typename T>
__global__ void _TransposeGrad(const int count,
......@@ -1187,6 +1197,7 @@ template <> void TransposeGrad<float, CUDAContext>(const int count,
CUDA_POST_KERNEL_CHECK;
}
#ifdef WITH_CUDA_FP16
template <> void TransposeGrad<float16, CUDAContext>(const int count,
const int ndim,
const int* order,
......@@ -1203,6 +1214,7 @@ template <> void TransposeGrad<float16, CUDAContext>(const int count,
reinterpret_cast<half*>(dx));
CUDA_POST_KERNEL_CHECK;
}
#endif
/******************** loss.l1_loss ********************/
......@@ -1834,6 +1846,7 @@ template <> void RMSPropUpdate<float, CUDAContext>(const int count,
/******************** utils.cast ********************/
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _FloatToHalfKernel(const int count, const float* x, half* y) {
CUDA_KERNEL_LOOP(idx, count) {
......@@ -1849,6 +1862,7 @@ template <> void Float2Half<float, CUDAContext>(const int count,
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
}
#endif
/******************** utils.compare ********************/
......@@ -1943,6 +1957,7 @@ template <> void MemoryData<uint8_t, float, CUDAContext>(const int count,
CUDA_POST_KERNEL_CHECK;
}
#ifdef WITH_CUDA_FP16
template <> void MemoryData<float, float16, CUDAContext>(const int count,
const int num,
const int channels,
......@@ -1976,6 +1991,7 @@ template <> void MemoryData<uint8_t, float16, CUDAContext>(const int count,
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
}
#endif
/******************** utils.one_hot ********************/
......
......@@ -3,164 +3,223 @@
#include <cmath>
#include <algorithm>
#include "utils/omp_alternative.h"
#include "utils/sse_alternative.h"
namespace dragon {
namespace sse {
template<> void Set(const int n, const float alpha, float* x) {
__m128 scalar = SSE_FP32_SCALAR(alpha);
SSE_LOOP1(i, n) SSE_FP32_STORE(x + i, scalar);
SSE_LOOP2(i, n) x[i] = alpha;
template<> void Set(const int n, const float alpha, float* x) {
__m128 scalar = SSE_FP32_SCALAR(alpha);
int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
SSE_LOOP1(i, n) SSE_FP32_STORE(x + i, scalar);
SSE_LOOP2(i, n) x[i] = alpha;
}
template<> void Set(const int n, const int alpha, int* x) {
__m128i scalar = SSE_INT32_SCALAR(alpha);
__m128i* x1 = reinterpret_cast<__m128i*>(x);
int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
SSE_LOOP1(i, n) SSE_INT128_STORE(x1++, scalar);
SSE_LOOP2(i, n) x[i] = alpha;
}
template<> void Add(const int n, const float* a, const float* b, float* y) {
__m128 x1, y1, z1;
int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(a + i);
y1 = SSE_FP32_LOAD(b + i);
z1 = SSE_FP32_ADD(x1, y1);
SSE_FP32_STORE(y + i, z1);
}
template<> void Set(const int n, const int alpha, int* x) {
__m128i scalar = SSE_INT32_SCALAR(alpha);
__m128i* x1 = reinterpret_cast<__m128i*>(x);
SSE_LOOP1(i, n) SSE_INT128_STORE(x1++, scalar);
SSE_LOOP2(i, n) x[i] = alpha;
}
template<> void Add(const int n, const float* a, const float* b, float* y) {
__m128 x1, y1, z1;
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(a + i);
y1 = SSE_FP32_LOAD(b + i);
z1 = SSE_FP32_ADD(x1, y1);
SSE_FP32_STORE(y + i, z1);
}
SSE_LOOP2(i, n) y[i] = a[i] + b[i];
}
template<> void Sub(const int n, const float* a, const float* b, float* y) {
__m128 x1, y1, z1;
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(a + i);
y1 = SSE_FP32_LOAD(b + i);
z1 = SSE_FP32_SUB(x1, y1);
SSE_FP32_STORE(y + i, z1);
}
SSE_LOOP2(i, n) y[i] = a[i] - b[i];
}
template<> void Mul(const int n, const float* a, const float* b, float* y) {
__m128 x1, y1, z1;
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(a + i);
y1 = SSE_FP32_LOAD(b + i);
z1 = SSE_FP32_MUL(x1, y1);
SSE_FP32_STORE(y + i, z1);
}
SSE_LOOP2(i, n) y[i] = a[i] * b[i];
SSE_LOOP2(i, n) y[i] = a[i] + b[i];
}
template<> void Sub(const int n, const float* a, const float* b, float* y) {
__m128 x1, y1, z1;
int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(a + i);
y1 = SSE_FP32_LOAD(b + i);
z1 = SSE_FP32_SUB(x1, y1);
SSE_FP32_STORE(y + i, z1);
}
template<> void Div(const int n, const float* a, const float* b, float* y) {
__m128 x1, y1, z1;
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(a + i);
y1 = SSE_FP32_LOAD(b + i);
z1 = SSE_FP32_DIV(x1, y1);
SSE_FP32_STORE(y + i, z1);
}
SSE_LOOP2(i, n) y[i] = a[i] / b[i];
SSE_LOOP2(i, n) y[i] = a[i] - b[i];
}
template<> void Mul(const int n, const float* a, const float* b, float* y) {
__m128 x1, y1, z1;
int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(a + i);
y1 = SSE_FP32_LOAD(b + i);
z1 = SSE_FP32_MUL(x1, y1);
SSE_FP32_STORE(y + i, z1);
}
template<> void Scal(const int n, const float alpha, float* y) {
__m128 y1, scalar = SSE_FP32_SCALAR(alpha);
SSE_LOOP1(i, n) {
y1 = SSE_FP32_LOAD(y + i);
y1 = SSE_FP32_MUL(y1, scalar);
SSE_FP32_STORE(y + i, y1);
}
SSE_LOOP2(i, n) y[i] *= alpha;
SSE_LOOP2(i, n) y[i] = a[i] * b[i];
}
template<> void Div(const int n, const float* a, const float* b, float* y) {
__m128 x1, y1, z1;
int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(a + i);
y1 = SSE_FP32_LOAD(b + i);
z1 = SSE_FP32_DIV(x1, y1);
SSE_FP32_STORE(y + i, z1);
}
template<> void Scale(const int n, const float alpha, const float* x, float* y) {
__m128 x1, scalar = SSE_FP32_SCALAR(alpha);
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(x + i);
x1 = SSE_FP32_MUL(x1, scalar);
SSE_FP32_STORE(y + i, x1);
}
SSE_LOOP2(i, n) y[i] = x[i] * alpha;
SSE_LOOP2(i, n) y[i] = a[i] / b[i];
}
template<> void Scal(const int n, const float alpha, float* y) {
__m128 y1, scalar = SSE_FP32_SCALAR(alpha);
int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
SSE_LOOP1(i, n) {
y1 = SSE_FP32_LOAD(y + i);
y1 = SSE_FP32_MUL(y1, scalar);
SSE_FP32_STORE(y + i, y1);
}
template<> void Axpy(const int n, float alpha, const float* x, float *y) {
__m128 x1, y1, scalar = SSE_FP32_SCALAR(alpha);
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(x + i);
y1 = SSE_FP32_LOAD(y + i);
x1 = SSE_FP32_MUL(x1, scalar);
y1 = SSE_FP32_ADD(x1, y1);
SSE_FP32_STORE(y + i, y1);
}
SSE_LOOP2(i, n) y[i] = alpha * x[i] + y[i];
SSE_LOOP2(i, n) y[i] *= alpha;
}
template<> void Scale(const int n, const float alpha, const float* x, float* y) {
__m128 x1, scalar = SSE_FP32_SCALAR(alpha);
int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(x + i);
x1 = SSE_FP32_MUL(x1, scalar);
SSE_FP32_STORE(y + i, x1);
}
template<> void Axpby(const int n, float alpha, const float* x,
const float beta, float *y) {
__m128 x1, y1, z1;
__m128 scalar1 = SSE_FP32_SCALAR(alpha);
__m128 scalar2 = SSE_FP32_SCALAR(beta);
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(x + i);
y1 = SSE_FP32_LOAD(y + i);
x1 = SSE_FP32_MUL(x1, scalar1);
y1 = SSE_FP32_MUL(y1, scalar2);
z1 = SSE_FP32_ADD(x1, y1);
SSE_FP32_STORE(y + i, z1);
}
SSE_LOOP2(i, n) y[i] = alpha * x[i] + beta* y[i];
SSE_LOOP2(i, n) y[i] = x[i] * alpha;
}
template<> void Axpy(const int n, float alpha, const float* x, float *y) {
__m128 x1, y1, scalar = SSE_FP32_SCALAR(alpha);
int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(x + i);
y1 = SSE_FP32_LOAD(y + i);
x1 = SSE_FP32_MUL(x1, scalar);
y1 = SSE_FP32_ADD(x1, y1);
SSE_FP32_STORE(y + i, y1);
}
template<> float ASum(const int n, const float *x) {
__m128 x1, sum = SSE_FP32_ZERO;
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(x + i);
sum = SSE_FP32_ADD(sum, x1);
}
float buf[4];
SSE_FP32_STORE(buf, sum);
float ret = buf[0] + buf[1] + buf[2] + buf[3];
SSE_LOOP2(i, n) ret += x[i];
return ret;
SSE_LOOP2(i, n) y[i] = alpha * x[i] + y[i];
}
template<> void Axpby(const int n,
float alpha,
const float* x,
const float beta,
float *y) {
__m128 x1, y1, z1;
__m128 scalar1 = SSE_FP32_SCALAR(alpha);
__m128 scalar2 = SSE_FP32_SCALAR(beta);
int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(x + i);
y1 = SSE_FP32_LOAD(y + i);
x1 = SSE_FP32_MUL(x1, scalar1);
y1 = SSE_FP32_MUL(y1, scalar2);
z1 = SSE_FP32_ADD(x1, y1);
SSE_FP32_STORE(y + i, z1);
}
template<> void AddScalar(const int n, const float alpha, float* y) {
__m128 y1, scalar = SSE_FP32_SCALAR(alpha);
SSE_LOOP1(i, n) {
y1 = SSE_FP32_LOAD(y + i);
y1 = SSE_FP32_ADD(y1, scalar);
SSE_FP32_STORE(y + i, y1);
}
SSE_LOOP2(i, n) y[i] += alpha;
SSE_LOOP2(i, n) y[i] = alpha * x[i] + beta* y[i];
}
template<> float ASum(const int n, const float *x) {
__m128 x1, sum = SSE_FP32_ZERO;
int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(x + i);
sum = SSE_FP32_ADD(sum, x1);
}
template<> void MulScalar(const int n, const float alpha, float* y) {
__m128 y1, scalar = SSE_FP32_SCALAR(alpha);
SSE_LOOP1(i, n) {
y1 = SSE_FP32_LOAD(y + i);
y1 = SSE_FP32_MUL(y1, scalar);
SSE_FP32_STORE(y + i, y1);
}
SSE_LOOP2(i, n) y[i] *= alpha;
float buf[4];
SSE_FP32_STORE(buf, sum);
float ret = buf[0] + buf[1] + buf[2] + buf[3];
SSE_LOOP2(i, n) ret += x[i];
return ret;
}
template<> void AddScalar(const int n, const float alpha, float* y) {
__m128 y1, scalar = SSE_FP32_SCALAR(alpha);
int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
SSE_LOOP1(i, n) {
y1 = SSE_FP32_LOAD(y + i);
y1 = SSE_FP32_ADD(y1, scalar);
SSE_FP32_STORE(y + i, y1);
}
SSE_LOOP2(i, n) y[i] += alpha;
}
template<> void MulScalar(const int n, const float alpha, float* y) {
__m128 y1, scalar = SSE_FP32_SCALAR(alpha);
int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
SSE_LOOP1(i, n) {
y1 = SSE_FP32_LOAD(y + i);
y1 = SSE_FP32_MUL(y1, scalar);
SSE_FP32_STORE(y + i, y1);
}
template <> float Dot(const int n, const float* a, const float* b) {
__m128 x1, y1, sum = SSE_FP32_ZERO;
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(a + i);
y1 = SSE_FP32_LOAD(b + i);
sum = SSE_FP32_ADD(sum, SSE_FP32_MUL(x1, y1));
}
float buf[4];
SSE_FP32_STORE(buf, sum);
float ret = buf[0] + buf[1] + buf[2] + buf[3];
SSE_LOOP2(i, n) ret += a[i] * b[i];
return ret;
SSE_LOOP2(i, n) y[i] *= alpha;
}
template <> float Dot(const int n, const float* a, const float* b) {
__m128 x1, y1, sum = SSE_FP32_ZERO;
int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(a + i);
y1 = SSE_FP32_LOAD(b + i);
sum = SSE_FP32_ADD(sum, SSE_FP32_MUL(x1, y1));
}
float buf[4];
SSE_FP32_STORE(buf, sum);
float ret = buf[0] + buf[1] + buf[2] + buf[3];
SSE_LOOP2(i, n) ret += a[i] * b[i];
return ret;
}
}    // namespace sse
......
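When the OpenMP pragma is combined with the SSE body as above, the loop index of a parallel for is per-thread inside the region, so one way to keep the scalar tail well defined is to compute the tail's starting index explicitly. An illustrative sketch under that assumption, using plain SSE intrinsics rather than the framework's macros:

#include <xmmintrin.h>
#ifdef WITH_OMP
#include <omp.h>
#endif

void mul_scalar(const int n, const float alpha, float* y) {
    const __m128 scalar = _mm_set1_ps(alpha);
    const int vec_n = (n / 4) * 4;  // largest multiple of 4 not exceeding n
#ifdef WITH_OMP
    #pragma omp parallel for
#endif
    for (int i = 0; i < vec_n; i += 4) {
        const __m128 y1 = _mm_loadu_ps(y + i);
        _mm_storeu_ps(y + i, _mm_mul_ps(y1, scalar));
    }
    for (int i = vec_n; i < n; ++i) y[i] *= alpha;  // scalar tail, always serial
}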
# Dragon: A Computation Graph Virtual Machine Based Deep Learning Framework
![](http://images.cnblogs.com/cnblogs_com/neopenx/690760/o_dragon_logo.png)
-----
### Compile Requirements for C++
0. Google Protocol Buffer
......