Commit 6eeac5fe by Ting PAN

add omp optimization

1 parent 007d9c21
@@ -12,6 +12,7 @@ option(WITH_PYTHON3 "Set ON to use PYTHON3 otherwise PYTHON2" OF
 option(WITH_CUDA "Set ON to use CUDA" ON)
 option(WITH_CUDNN "Set ON to use CUDNN" OFF)
 option(WITH_BLAS "Set ON to use BLAS" OFF)
+option(WITH_OMP "Set ON to use OpenMP" OFF)
 option(WITH_SSE "Set ON to use SSE 4.1" ON)
 option(WITH_MPI "Set ON to use MPI" OFF)
 option(WITH_MPI_CUDA "Set ON to use MPI-CUDA" OFF)
@@ -22,7 +23,7 @@ option(WITH_CUDA_FP16 "Set ON to use FP16" ON)
 set(3RDPARTY_DIR ${PROJECT_SOURCE_DIR}/../3rdparty)
 # set your python environment
-set(PYTHON_DIR /usr/include/python2.7) # prefer
+set(PYTHON_DIR /usr/include/python2.7) # preferred
 #set(PYTHON_DIR /usr/include/python3.x) # optional, set specific version
 #set(ANACONDA_DIR /xxx/anaconda) # optional, root folder of anaconda, preset for 2.7, 3.5, and 3.6
 set(NUMPY_DIR /xxx/numpy) # required, root folder of numpy package
@@ -118,6 +119,10 @@ else()
     "\n -- > GEMM/GEMV is disabled"
     "\n -- > prefer not to run as CPU Mode")
 endif()
+if (WITH_OMP)
+    ADD_DEFINITIONS(-DWITH_OMP)
+    message(STATUS "Use OpenMP [Optional]")
+endif()
 if (WITH_SSE)
     ADD_DEFINITIONS(-DWITH_SSE)
     message(STATUS "Use SSE [Optional]")
@@ -145,11 +150,18 @@ endif()
 # ---[ Flags
 set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${CUDA_ARCH}")
 if(WIN32)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP /O2")
+    if (WITH_OMP)
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /openmp")
+    endif()
 endif()
 if(UNIX)
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -O2 -m64 -fpermissive -std=c++11")
+    if (WITH_OMP)
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fopenmp")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
+    endif()
 endif()
 # ---[ Warnings
......
+// --------------------------------------------------------
+// Dragon
+// Copyright(c) 2017 SeetaTech
+// Written by Ting Pan
+// --------------------------------------------------------
+#ifndef DRAGON_UTILS_OMP_ALTERNATIVE_H_
+#define DRAGON_UTILS_OMP_ALTERNATIVE_H_
+#ifdef WITH_OMP
+#include <algorithm>
+#include <omp.h>
+namespace dragon {
+#define OMP_MIN_ITERATORS_PER_CORE 256
+inline int GET_OMP_THREADS(const int N) {
+    int threads = std::max(N / OMP_MIN_ITERATORS_PER_CORE, 1);
+    return std::min(threads, omp_get_num_procs());
+}
+}
+#endif    // WITH_OMP
+#endif    // DRAGON_UTILS_OMP_ALTERNATIVE_H_
\ No newline at end of file
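Note: the GET_OMP_THREADS helper above sizes the OpenMP team to the work, targeting at least OMP_MIN_ITERATORS_PER_CORE (256) iterations per thread and never exceeding the processor count, so small tensors do not pay the fork/join cost of a full team. A minimal standalone sketch of the heuristic and the pragma pattern the CPU kernels below rely on (the array and count are illustrative, not from the commit; build with -fopenmp):

#include <algorithm>
#include <cstdio>
#include <vector>
#include <omp.h>

// Same heuristic as GET_OMP_THREADS: ~256 iterations per thread,
// clamped to [1, omp_get_num_procs()].
inline int GetOmpThreads(const int N) {
    int threads = std::max(N / 256, 1);
    return std::min(threads, omp_get_num_procs());
}

int main() {
    const int count = 100000;                        // illustrative size
    std::vector<float> x(count, 2.f), y(count);
    #pragma omp parallel for num_threads(GetOmpThreads(count))
    for (int i = 0; i < count; ++i) y[i] = x[i] * 0.5f;
    // count = 100000 -> min(100000 / 256, num_procs) = min(390, num_procs) threads
    std::printf("team size: %d, y[0] = %.1f\n", GetOmpThreads(count), y[0]);
    return 0;
}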
@@ -15,11 +15,10 @@
 namespace dragon {
 #define SSE_LOOP1(i, n) \
-    int32_t i; \
     for (i = 0; i < n - 4; i += 4) \
 #define SSE_LOOP2(i, n) \
-    for (; i < n; i++)
+    for (; i < n; ++i)
 #define SSE_FP32_LOAD _mm_loadu_ps
 #define SSE_FP32_STORE _mm_storeu_ps
......
@@ -53,18 +53,24 @@ void AddOp<Context>::RunOnDevice() {
     }
     else if (input(0).dim(0) == input(1).dim(0) && input(1).count(1) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(2);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(2);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(0).dim(-1) == input(1).dim(-1) &&
              input(1).count(0, input(1).axis(-1)) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(1);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(1);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(1).ndim() == 1 && input(1).dim(0) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(0);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(0);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else {
@@ -139,18 +145,24 @@ void AddGradientOp<Context>::RunOnDevice() {
     }
     else if (input(-1).dim(0) == input(0).dim(0) && input(0).count(1) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(2);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(2);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(-1).dim(-1) == input(0).dim(-1) &&
              input(0).count(0, input(0).axis(-1)) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(1);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(1);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(0).ndim() == 1 && input(0).dim(0) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(0);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(0);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else {
......
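Note: the operator changes below all repeat the same two-line edit: the float16 dispatch branch is fenced behind WITH_CUDA_FP16 so that builds configured without FP16 kernels still compile, and float16 inputs then fall through to the fatal log. A compilable sketch of the idiom, where Run<T>() is a hypothetical stand-in for BroadcastRunWithType<T>() and the float16 struct is a stand-in tag type for Dragon's half-precision scalar:

#include <cstdio>

struct float16 {};    // stand-in for Dragon's half-precision scalar type

template <typename T> void Run() { std::printf("float path\n"); }
#ifdef WITH_CUDA_FP16
template <> void Run<float16>() { std::printf("float16 path\n"); }
#endif

void Dispatch(bool is_float, bool is_float16) {
    if (is_float) Run<float>();
#ifdef WITH_CUDA_FP16
    else if (is_float16) Run<float16>();    // only compiled in FP16 builds
#endif
    else std::printf("unsupported input types.\n");    // LOG(FATAL) in Dragon
}

int main() {
    Dispatch(true, false);    // float: always supported
    Dispatch(false, true);    // float16: requires -DWITH_CUDA_FP16
    return 0;
}

Because the guard removes only the "else if" arm, the if/else chain stays well formed in both configurations.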
@@ -54,18 +54,24 @@ void DivOp<Context>::RunOnDevice() {
     }
     else if (input(0).dim(0) == input(1).dim(0) && input(1).count(1) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(2);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(2);
+#endif
         else LOG(FATAL) << "unsupported input types";
     }
     else if (input(0).dim(-1) == input(1).dim(-1) &&
              input(1).count(0, input(1).axis(-1)) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(1);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(1);
+#endif
         else LOG(FATAL) << "unsupported input types";
     }
     else if (input(1).ndim() == 1 && input(1).dim(0) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(0);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(0);
+#endif
         else LOG(FATAL) << "unsupported input types";
     }
     else {
@@ -170,18 +176,24 @@ void DivGradientOp<Context>::RunOnDevice() {
     }
     else if (input(0).dim(0) == input(1).dim(0) && input(1).count(1) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(2);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(2);
+#endif
         else LOG(FATAL) << "unsupported input types";
     }
     else if (input(0).dim(-1) == input(1).dim(-1) &&
              input(1).count(0, input(1).axis(-1)) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(1);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(1);
+#endif
         else LOG(FATAL) << "unsupported input types";
     }
     else if (input(1).ndim() == 1 && input(1).dim(0) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(0);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(0);
+#endif
         else LOG(FATAL) << "unsupported input types";
     }
     else {
......
@@ -55,7 +55,9 @@ void DotOp<Context>::RunOnDevice() {
         dims[dims.size() - 1] = N1;
         output(0)->Reshape(dims);
         if (input(0).template IsType<float>()) GemmRunWithType<float>();
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) GemmRunWithType<float16>();
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(0).ndim() >= 2 && input(1).ndim() == 1) {
@@ -70,7 +72,9 @@ void DotOp<Context>::RunOnDevice() {
         dims.pop_back();
         output(0)->Reshape(dims);
         if (input(0).template IsType<float>()) GemvRunWithType<float>();
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) GemvRunWithType<float16>();
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else {
@@ -148,7 +152,9 @@ void DotGradientOp<Context>::RunOnDevice() {
             << input(0).dim_string() << " can not Dot with Tensor"
             << "(" << input(1).name() << "): " << input(1).dim_string();
         if (input(0).template IsType<float>()) GemmRunWithType<float>();
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) GemmRunWithType<float16>();
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(0).ndim() >= 2 && input(1).ndim() == 1) {
@@ -160,7 +166,9 @@ void DotGradientOp<Context>::RunOnDevice() {
             << input(0).dim_string() << " can not Dot with Tensor"
             << "(" << input(1).name() << "): " << input(1).dim_string();
         if (input(0).template IsType<float>()) GemvRunWithType<float>();
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) GemvRunWithType<float16>();
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else {
......
@@ -41,12 +41,16 @@ void EltwiseOp<Context>::RunOnDevice() {
     if (operation == "SUM") {
         if (input(0).template IsType<float>()) SumRunWithType<float>();
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) SumRunWithType<float16>();
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (operation == "PROD") {
         if (input(0).template IsType<float>()) ProdRunWithType<float>();
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) ProdRunWithType<float16>();
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else {
@@ -104,12 +108,16 @@ void EltwiseGradientOp<Context>::RunOnDevice() {
     if (operation == "SUM") {
         if (input(0).template IsType<float>()) SumRunWithType<float>();
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) SumRunWithType<float16>();
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (operation == "PROD") {
         if (input(0).template IsType<float>()) ProdRunWithType<float>();
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) ProdRunWithType<float16>();
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else {
......
@@ -25,7 +25,9 @@ void GramMatrixOp<Context>::RunOnDevice() {
     output(0)->Reshape(vector<TIndex>({ outer_dim, dim, dim }));
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }
@@ -57,7 +59,9 @@ void GramMatrixGradientOp<Context>::RunOnDevice() {
     output(0)->ReshapeLike(input(0));
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }
......
@@ -48,7 +48,9 @@ void MatmulOp<Context>::RunOnDevice() {
     dims[dims.size() - 1] = N;
     output(0)->Reshape(dims);
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }
@@ -105,7 +107,9 @@ void MatmulGradientOp<Context>::RunOnDevice() {
     output(0)->ReshapeLike(input(0));
     output(1)->ReshapeLike(input(1));
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }
......
@@ -54,18 +54,24 @@ void MulOp<Context>::RunOnDevice() {
     }
     else if (input(0).dim(0) == input(1).dim(0) && input(1).count(1) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(2);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(2);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(0).dim(-1) == input(1).dim(-1) &&
              input(1).count(0, input(1).axis(-1)) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(1);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(1);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(1).ndim() == 1 && input(1).dim(0) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(0);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(0);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else {
@@ -158,18 +164,24 @@ void MulGradientOp<Context>::RunOnDevice() {
     }
     else if (input(0).dim(0) == input(1).dim(0) && input(1).count(1) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(2);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(2);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(0).dim(-1) == input(1).dim(-1) &&
              input(1).count(0, input(1).axis(-1)) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(1);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(1);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(1).ndim() == 1 && input(1).dim(0) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(0);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(0);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else {
......
@@ -26,7 +26,9 @@ void PowOp<Context>::RunOnDevice() {
     output(0)->ReshapeLike(input(0));
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }
@@ -76,7 +78,9 @@ void PowGradientOp<Context>::RunOnDevice() {
     output(0)->ReshapeLike(input(0));
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }
......
@@ -37,7 +37,9 @@ void ScaleOp<Context>::RunOnDevice() {
     output(0)->ReshapeLike(input(0));
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }
......
@@ -53,18 +53,24 @@ void SubOp<Context>::RunOnDevice() {
     }
     else if (input(0).dim(0) == input(1).dim(0) && input(1).count(1) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(2);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(2);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(0).dim(-1) == input(1).dim(-1) &&
              input(1).count(0, input(1).axis(-1)) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(1);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(1);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(1).ndim() == 1 && input(1).dim(0) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(0);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(0);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else {
@@ -139,18 +145,24 @@ void SubGradientOp<Context>::RunOnDevice() {
     }
     else if (input(-1).dim(0) == input(0).dim(0) && input(0).count(1) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(2);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(2);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(-1).dim(-1) == input(0).dim(-1) &&
              input(0).count(0, input(0).axis(-1)) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(1);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(1);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(0).ndim() == 1 && input(0).dim(0) == 1) {
         if (input(0).template IsType<float>()) BroadcastRunWithType<float>(0);
+#ifdef WITH_CUDA_FP16
         else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(0);
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else {
......
@@ -49,7 +49,9 @@ void ConcatOp<Context>::RunOnDevice() {
     }
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }
@@ -96,7 +98,9 @@ void ConcatGradientOp<Context>::RunOnDevice() {
     }
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }
......
@@ -45,7 +45,9 @@ void TransposeOp<Context>::RunOnDevice() {
     }
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }
@@ -75,7 +77,9 @@ void TransposeGradientOp<Context>::RunOnDevice() {
     new_steps = ws()->GetTensor("_t_" + anchor() + "_new_steps");
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }
......
@@ -127,7 +127,9 @@ void BatchNormOp<Context>::RunOnDevice() {
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }
@@ -247,7 +249,9 @@ void BatchNormGradientOp<Context>::RunOnDevice() {
     else use_global_stats = use_stats == 1 ? true : false;
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }
......
@@ -78,7 +78,9 @@ void L2NormOp<Context>::RunOnDevice() {
     output(0)->ReshapeLike(input(0));
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }
......
@@ -4,6 +4,8 @@
 namespace dragon {
+#ifdef WITH_CUDA_FP16
 template <class Context>
 void FloatToHalfOp<Context>::RunOnDevice() {
     CHECK(input(0).template IsType<float>())
@@ -28,4 +30,6 @@ OPERATOR_SCHEMA(FloatToHalf).NumInputs(1).NumOutputs(1);
 NO_GRADIENT(FloatToHalf);
+#endif
 }    // namespace dragon
\ No newline at end of file
@@ -19,7 +19,9 @@ void GradientGenerateOp<Context>::RunWithType() {
 template <class Context>
 void GradientGenerateOp<Context>::RunOnDevice() {
     if (input(0).template IsType<float>()) RunWithType<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
 }
......
@@ -23,12 +23,16 @@ void MemoryDataOp<Context>::RunOnDevice() {
     if (input(0).template IsType<float>()) {
         if (data_type == TensorProto_DataType_FLOAT) RunWithType<float, float>();
+#ifdef WITH_CUDA_FP16
         else if (data_type == TensorProto_DataType_FLOAT16) RunWithType<float, float16>();
+#endif
         else LOG(FATAL) << "unsupported input types.";
     }
     else if (input(0).template IsType<uint8_t>()) {
         if (data_type == TensorProto_DataType_FLOAT) RunWithType<uint8_t, float>();
+#ifdef WITH_CUDA_FP16
         if (data_type == TensorProto_DataType_FLOAT16) RunWithType<uint8_t, float16>();
+#endif
     }
     else { LOG(FATAL) << "unsupported input types."; }
 }
......
@@ -58,7 +58,9 @@ void DenseConcatGradientOp<Context>::ElimateCorruption() {
     input(0).Move(buffer->memory());
     head_data[idx] = input(0).name();
     if (input(-2).template IsType<float>()) RestoreX1<float>();
+#ifdef WITH_CUDA_FP16
     else if (input(-2).template IsType<float16>()) RestoreX1<float16>();
+#endif
     else LOG(FATAL) << "unsupported input types.";
     // post-process
     if (input(0).memory() != buffer->memory()) buffer->Move(input(0).memory());
......
@@ -40,6 +40,7 @@ template <> void Set<int, CUDAContext>(const int n,
     _Set<int> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, alpha, x);
 }
+#ifdef WITH_CUDA_FP16
 template <typename T>
 __global__ void _SetHalf2(const int n, const half2 alpha, half2* x) {
     CUDA_KERNEL_LOOP(idx, n) {
@@ -61,6 +62,7 @@ template <> void Set<float16, CUDAContext>(const int n,
         _Set<float16> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, alpha, x);
     }
 }
+#endif
 template <> void RandomUniform<uint32_t, CUDAContext>(const int n,
                                                       const float low,
@@ -144,6 +146,7 @@ template <> void Mul<float, CUDAContext>(int n,
     _Mul<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, a, b, y);
 }
+#ifdef WITH_CUDA_FP16
 template <typename T>
 __global__ void _MulHalf(const int n, const half* a, const half* b, half* y) {
     CUDA_KERNEL_LOOP(idx, n) {
@@ -161,7 +164,7 @@ __global__ void _MulHalf2(const int n, const half2* a, const half2* b, half2* y)
 #endif
     }
 }
 template <> void Mul<float16, CUDAContext>(int n,
                                            const float16* a,
                                            const float16* b,
@@ -176,6 +179,7 @@ template <> void Mul<float16, CUDAContext>(int n,
                           reinterpret_cast<const half*>(b),
                           reinterpret_cast<half*>(y));
 }
+#endif
 template <typename T>
 __global__ void _Div(const int n, const T* a, const T* b, T* y) {
@@ -191,6 +195,7 @@ template <> void Div<float, CUDAContext>(int n,
     _Div<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, a, b, y);
 }
+#ifdef WITH_CUDA_FP16
 template <typename T>
 __global__ void _DivHalf(const int n, const half* a, const half* b, half* y) {
     CUDA_KERNEL_LOOP(idx, n) {
@@ -209,6 +214,7 @@ template <> void Div<float16, CUDAContext>(int n,
                           reinterpret_cast<const half*>(b),
                           reinterpret_cast<half*>(y));
 }
+#endif
 template <typename T>
 __global__ void _Clip(const int n, const T low, const T high, T* x) {
@@ -260,6 +266,7 @@ template <> void Square<float, CUDAContext>(int n,
     _Square<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, x, y);
 }
+#ifdef WITH_CUDA_FP16
 template <typename T>
 __global__ void _SquareHalf(const int n, const half* x, half* y) {
     CUDA_KERNEL_LOOP(idx, n) {
@@ -290,6 +297,7 @@ template <> void Square<float16, CUDAContext>(int n,
                              reinterpret_cast<half*>(y));
     CUDA_POST_KERNEL_CHECK;
 }
+#endif
 template <typename T>
 __global__ void _Sqrt(const int n, const T* x, T* y) {
@@ -304,6 +312,7 @@ template <> void Sqrt<float, CUDAContext>(int n,
     _Sqrt<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, x, y);
 }
+#ifdef WITH_CUDA_FP16
 template <typename T>
 __global__ void _SqrtHalf(const int n, const half* x, half* y) {
     CUDA_KERNEL_LOOP(idx, n) {
@@ -334,6 +343,7 @@ template <> void Sqrt<float16, CUDAContext>(int n,
                            reinterpret_cast<half*>(y));
     CUDA_POST_KERNEL_CHECK;
 }
+#endif
 template <typename T>
 __global__ void _Pow(const int n, const T alpha, const T* a, T* y) {
@@ -349,6 +359,7 @@ template <> void Pow<float, CUDAContext>(int n,
     _Pow<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, alpha, x, y);
 }
+#ifdef WITH_CUDA_FP16
 template <typename T>
 __global__ void _PowHalf(const int n, const float alpha, const half* a, half* y) {
     CUDA_KERNEL_LOOP(idx, n) {
@@ -384,6 +395,7 @@ template <> void Pow<float16, CUDAContext>(int n,
                           reinterpret_cast<half*>(y));
     CUDA_POST_KERNEL_CHECK;
 }
+#endif
 template <typename T>
 __global__ void _Inv(const int n, const float numerator, const T* x, T* y) {
@@ -399,6 +411,7 @@ template <> void Inv<float, CUDAContext>(const int n,
     _Inv<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, numerator, x, y);
 }
+#ifdef WITH_CUDA_FP16
 template <typename T>
 __global__ void _InvHalf(const int n, const half numerator, const half* x, half* y) {
     CUDA_KERNEL_LOOP(idx, n) {
@@ -439,6 +452,7 @@ template <> void Inv<float16, CUDAContext>(const int n,
     }
     CUDA_POST_KERNEL_CHECK;
 }
+#endif
 /******************** Level-2 ********************/
@@ -518,6 +532,7 @@ template <> void AddScalar<float, CUDAContext>(const int n, const float alpha, f
     _AddScalar<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, alpha, y);
 }
+#ifdef WITH_CUDA_FP16
 template <typename T>
 __global__ void _AddScalarHalf(const int n, half alpha, half* y) {
     CUDA_KERNEL_LOOP(idx, n) {
@@ -552,6 +567,7 @@ template <> void AddScalar<float16, CUDAContext>(const int n, const float alpha,
     }
     CUDA_POST_KERNEL_CHECK;
 }
+#endif
 template <typename T>
 __global__ void _MulScalar(const int n, T alpha, T* y) {
@@ -641,6 +657,7 @@ template <> void Gemm<float, CUDAContext>(const CBLAS_TRANSPOSE transA,
                           C, N));
 }
+#ifdef WITH_CUDA_FP16
 template <> void Gemm<float16, CUDAContext>(const CBLAS_TRANSPOSE transA,
                                             const CBLAS_TRANSPOSE transB,
                                             const int M,
@@ -682,6 +699,7 @@ template <> void Gemm<float16, CUDAContext>(const CBLAS_TRANSPOSE transA,
         LOG(FATAL) << "unsupported math type";
     }
 }
+#endif
 template <> void Gemv<float, CUDAContext>(const CBLAS_TRANSPOSE transA,
                                           const int M, const int N,
@@ -702,6 +720,7 @@ template <> void Gemv<float, CUDAContext>(const CBLAS_TRANSPOSE transA,
                           y, 1));
 }
+#ifdef WITH_CUDA_FP16
 template <> void Gemv<float16, CUDAContext>(const CBLAS_TRANSPOSE transA,
                                             const int M,
                                             const int N,
@@ -742,6 +761,7 @@ template <> void Gemv<float16, CUDAContext>(const CBLAS_TRANSPOSE transA,
         LOG(FATAL) << "unsupported math type";
     }
 }
+#endif
 }    // namespace math
......
@@ -3,11 +3,9 @@
 #include "core/tensor.h"
 #include "utils/op_kernel.h"
-#include "utils/math_functions.h"
-#ifdef WITH_SSE
+#include "utils/omp_alternative.h"
 #include "utils/sse_alternative.h"
-#endif
+#include "utils/math_functions.h"
 bool judge(int a, int b) { return unsigned(a) < unsigned(b); }
@@ -28,8 +26,10 @@ template<> void Dropout<float, CPUContext>(const int count,
                                            CPUContext* context) {
     uint32_t thresh = static_cast<uint32_t>(UINT_MAX * prob);
     math::RandomBernoulli<float, CPUContext>(count, 1 - prob, mask);
-    for (int i = 0; i < count; ++i)
-        y[i] = x[i] * mask[i] * scale;
+#ifdef WITH_OMP
+    #pragma omp parallel for num_threads(GET_OMP_THREADS(count))
+#endif
+    for (int i = 0; i < count; ++i) y[i] = x[i] * mask[i] * scale;
 }
 template<> void DropoutGrad<float, CPUContext>(const int count,
@@ -38,8 +38,10 @@ template<> void DropoutGrad<float, CPUContext>(const int count,
                                                const float* dy,
                                                const uint32_t* mask,
                                                float* dx) {
-    for (int i = 0; i < count; ++i)
-        dx[i] = dy[i] * mask[i] * scale;
+#ifdef WITH_OMP
+    #pragma omp parallel for num_threads(GET_OMP_THREADS(count))
+#endif
+    for (int i = 0; i < count; ++i) dx[i] = dy[i] * mask[i] * scale;
 }
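Note: the pragma sits inside its own #ifdef WITH_OMP block, so a build without OpenMP sees nothing but the plain loop, and the loop body was collapsed onto one line so the pragma unambiguously binds to the for statement that follows it. The parallelization is safe here because each iteration writes only its own y[i] (or dx[i]). A sketch of the pattern in isolation (ScaleMasked is an illustrative name; the macro mirrors GET_OMP_THREADS from omp_alternative.h above):

#include <algorithm>
#include <cstdint>
#ifdef WITH_OMP
#include <omp.h>
#define GET_OMP_THREADS(N) \
    std::min(std::max((N) / 256, 1), omp_get_num_procs())
#endif

// Compiles with and without -DWITH_OMP: the pragma vanishes in serial
// builds and the loop body is identical in both modes.
void ScaleMasked(const int count, const float* x, const uint32_t* mask,
                 const float scale, float* y) {
#ifdef WITH_OMP
    #pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
    for (int i = 0; i < count; ++i) y[i] = x[i] * mask[i] * scale;
}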
 /******************** activation.relu ********************/
@@ -48,6 +50,9 @@ template<> void Relu<float, CPUContext>(const int count,
                                         const float* x,
                                         const float slope,
                                         float* y) {
+#ifdef WITH_OMP
+    #pragma omp parallel for num_threads(GET_OMP_THREADS(count))
+#endif
     for (int i = 0; i < count; ++i) {
         y[i] = std::max(x[i], 0.f) + slope * std::min(x[i], 0.f);
     }
@@ -58,10 +63,12 @@ template<> void ReluGrad<float, CPUContext>(const int count,
                                             const float* y,
                                             const float slope,
                                             float* dx) {
+#ifdef WITH_OMP
+    #pragma omp parallel for num_threads(GET_OMP_THREADS(count))
+#endif
     for (int i = 0; i < count; ++i) {
         dx[i] = dy[i] * ((y[i] > 0) + slope * (y[i] <= 0));
     }
 }
 /******************** activation.sigmoid ********************/
@@ -70,15 +77,19 @@
 T _sigmoid(T x) { return T(1) / (T(1) + exp(-x)); }
 template<> void Sigmoid<float, CPUContext>(const int count, const float* x, float* y) {
-    for (int i = 0; i < count; ++i) {
-        y[i] = _sigmoid<float>(x[i]);
-    }
+#ifdef WITH_OMP
+    #pragma omp parallel for num_threads(GET_OMP_THREADS(count))
+#endif
+    for (int i = 0; i < count; ++i) y[i] = _sigmoid<float>(x[i]);
 }
 template<> void SigmoidGrad<float, CPUContext>(const int count,
                                                const float* dy,
                                                const float* y,
                                                float* dx) {
+#ifdef WITH_OMP
+    #pragma omp parallel for num_threads(GET_OMP_THREADS(count))
+#endif
     for (int i = 0; i < count; ++i) {
         dx[i] = dy[i] * y[i] * (1 - y[i]);
     }
@@ -149,6 +160,9 @@ template<> void SoftmaxGrad<float, CPUContext>(const int count,
 /******************** activation.tanh ********************/
 template<> void Tanh<float, CPUContext>(const int count, const float* x, float* y) {
+#ifdef WITH_OMP
+    #pragma omp parallel for num_threads(GET_OMP_THREADS(count))
+#endif
     for (int i = 0; i < count; ++i) {
         y[i] = std::tanh(x[i]);
     }
@@ -158,6 +172,9 @@ template<> void TanhGrad<float, CPUContext>(const int count,
                                             const float* dy,
                                             const float* y,
                                             float* dx) {
+#ifdef WITH_OMP
+    #pragma omp parallel for num_threads(GET_OMP_THREADS(count))
+#endif
     for (int i = 0; i < count; ++i) {
         dx[i] = dy[i] * (1 - y[i] * y[i]);
     }
@@ -197,6 +214,9 @@ template <> void Clip<float, CPUContext>(const int count,
                                          const float* x,
                                          float* mask,
                                          float* y) {
+#ifdef WITH_OMP
+    #pragma omp parallel for num_threads(GET_OMP_THREADS(count))
+#endif
     for (int i = 0; i < count; ++i) {
         mask[i] = 1.0;
         if (x[i] < low || x[i] > high) mask[i] = 0.0;
@@ -300,8 +320,10 @@ template<> void Argmax<float, CPUContext>(const int count,
 /******************** common.at ********************/
 template <> void CanonicalAxis<float, CPUContext>(const int count, const int dim, float* y) {
-    for (int i = 0; i < count; ++i)
-        if (y[i] < 0) y[i] += dim;
+#ifdef WITH_OMP
+    #pragma omp parallel for num_threads(GET_OMP_THREADS(count))
+#endif
+    for (int i = 0; i < count; ++i) if (y[i] < 0) y[i] += dim;
 }
 template <> void At<float, CPUContext>(const int count,
@@ -478,6 +500,9 @@ template<> void Sum<float, CPUContext>(const int count,
                                        const int inner_dim,
                                        const float* x,
                                        float* y) {
+#ifdef WITH_OMP
+    #pragma omp parallel for num_threads(GET_OMP_THREADS(count))
+#endif
     for (int i = 0; i < count; ++i) {
         float sum_val = 0.0;
         for (int j = 0; j < axis_dim; ++j)
@@ -492,6 +517,9 @@ template<> void SumGrad<float, CPUContext>(const int count,
                                            const float coeff,
                                            const float* dy,
                                            float* dx) {
+#ifdef WITH_OMP
+    #pragma omp parallel for num_threads(GET_OMP_THREADS(count))
+#endif
     for (int i = 0; i < count; ++i) {
         for (int j = 0; j < axis_dim; ++j)
             dx[(i / inner_dim * axis_dim + j) * inner_dim + i % inner_dim] = dy[i] * coeff;
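Note: Sum and SumGrad parallelize over the outer index i only; the inner reduction over axis_dim stays serial inside each iteration, so every thread owns a disjoint y[i] (or slice of dx) and no atomics or critical sections are needed. A small runnable sketch with the same indexing scheme (the shape values are illustrative):

#include <cstdio>
#include <vector>

// Parallel over i: each iteration privately reduces axis_dim elements
// and writes the single output y[i], so iterations never conflict.
void SumOverAxis(const int count, const int axis_dim, const int inner_dim,
                 const float* x, float* y) {
    #pragma omp parallel for
    for (int i = 0; i < count; ++i) {
        float sum_val = 0.f;
        for (int j = 0; j < axis_dim; ++j)
            sum_val += x[(i / inner_dim * axis_dim + j) * inner_dim + i % inner_dim];
        y[i] = sum_val;
    }
}

int main() {
    const int outer = 2, axis_dim = 3, inner_dim = 4;    // illustrative shape
    std::vector<float> x(outer * axis_dim * inner_dim, 1.f);
    std::vector<float> y(outer * inner_dim);
    SumOverAxis(outer * inner_dim, axis_dim, inner_dim, x.data(), y.data());
    std::printf("y[0] = %.1f (expect 3.0)\n", y[0]);
    return 0;
}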
@@ -585,6 +613,9 @@ template <> void Transpose<float, CPUContext>(const int count,
                                               const int* new_steps,
                                               const float* x,
                                               float* y) {
+#ifdef WITH_OMP
+    #pragma omp parallel for num_threads(GET_OMP_THREADS(count))
+#endif
     for (int i = 0; i < count; ++i) {
         int x_idx = 0, y_idx = i;
         for (int j = 0; j < ndim; ++j) {
@@ -603,15 +634,7 @@ template <> void Transpose<float16, CPUContext>(const int count,
                                                 const int* new_steps,
                                                 const float16* x,
                                                 float16* y) {
-    for (int i = 0; i < count; ++i) {
-        int x_idx = 0, y_idx = i;
-        for (int j = 0; j < ndim; ++j) {
-            int k = order[j];
-            x_idx += (y_idx / new_steps[j]) * old_steps[k];
-            y_idx %= new_steps[j];
-        }
-        y[i] = x[x_idx];
-    }
+    LOG(FATAL) << "unsupport float16 with CPU";
 }
 template <> void TransposeGrad<float, CPUContext>(const int count,
@@ -621,6 +644,9 @@ template <> void TransposeGrad<float, CPUContext>(const int count,
                                                   const int* new_steps,
                                                   const float* dy,
                                                   float* dx) {
+#ifdef WITH_OMP
+    #pragma omp parallel for num_threads(GET_OMP_THREADS(count))
+#endif
     for (int i = 0; i < count; ++i) {
         int x_idx = 0, y_idx = i;
         for (int j = 0; j < ndim; ++j) {
@@ -639,20 +665,15 @@ template <> void TransposeGrad<float16, CPUContext>(const int count,
                                                     const int* new_steps,
                                                     const float16* dy,
                                                     float16* dx) {
-    for (int i = 0; i < count; ++i) {
-        int x_idx = 0, y_idx = i;
-        for (int j = 0; j < ndim; ++j) {
-            int k = order[j];
-            x_idx += (y_idx / new_steps[j]) * old_steps[k];
-            y_idx %= new_steps[j];
-        }
-        dx[x_idx] = dy[i];
-    }
+    LOG(FATAL) << "unsupport float16 with CPU";
 }
/******************** loss.l1_loss ********************/ /******************** loss.l1_loss ********************/
template<> void AbsGrad<float, CPUContext>(const int count, const float* dy, float* dx) { template<> void AbsGrad<float, CPUContext>(const int count, const float* dy, float* dx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) { for (int i = 0; i < count; ++i) {
const float val = dy[i]; const float val = dy[i];
// val > 0: 1 | val == 0: 0 | val < 0: -1 // val > 0: 1 | val == 0: 0 | val < 0: -1
@@ -666,6 +687,9 @@ template <> void SigmoidCrossEntropy<float, CPUContext>(const int count,
    const float* x,
    const float* target,
    float* loss) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
    for (int i = 0; i < count; ++i) {
        loss[i] = std::log(1 + std::exp(x[i] - 2 * x[i] * (x[i] >= 0)))
                + x[i] * ((x[i] >= 0) - target[i]);
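The expression above is the numerically stable form of the per-element sigmoid cross entropy -t * x + log(1 + exp(x)): writing s for the indicator (x >= 0), the exponent x - 2*x*s equals -|x|, so exp() can never overflow, and the identity log(1 + exp(x)) = x*s + log(1 + exp(-|x|)) recovers the naive formula. A small check that the two forms agree where the naive one is representable (NaiveSCE/StableSCE are illustrative names, not from this codebase):

#include <cassert>
#include <cmath>

// Naive and stable per-element sigmoid cross entropy (t is the target in [0, 1]).
inline float NaiveSCE(const float x, const float t) {
    return -t * x + std::log(1.f + std::exp(x));
}

inline float StableSCE(const float x, const float t) {
    const float s = (x >= 0.f) ? 1.f : 0.f;
    return std::log(1.f + std::exp(x - 2.f * x * s)) + x * (s - t);
}

int main() {
    for (float x = -10.f; x <= 10.f; x += 0.5f)
        assert(std::fabs(NaiveSCE(x, 0.3f) - StableSCE(x, 0.3f)) < 1e-3f);
    return 0;
}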
@@ -678,6 +702,9 @@ template<> void SmoothL1<float, CPUContext>(const int count,
    const float sigma2,
    const float* x,
    float* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
    for (int i = 0; i < count; ++i) {
        const float val = x[i];
        const float abs_val = std::abs(val);
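The branch truncated from the hunk above presumably computes the usual SmoothL1 piecewise form from Fast R-CNN: quadratic inside |x| < 1/sigma2 and linear outside, continuous at the transition point. A sketch (DemoSmoothL1 is an illustrative name):

#include <cmath>

inline float DemoSmoothL1(const float val, const float sigma2) {
    const float abs_val = std::abs(val);
    // Inside the transition point: 0.5 * sigma2 * x^2; outside: |x| - 0.5/sigma2.
    if (abs_val < 1.f / sigma2) return 0.5f * val * val * sigma2;
    return abs_val - 0.5f / sigma2;
}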
@@ -690,6 +717,9 @@ template<> void SmoothL1Grad<float, CPUContext>(const int count,
    const float sigma2,
    const float* dy,
    float* dx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
    for (int i = 0; i < count; ++i) {
        const float val = dy[i];
        const float abs_val = std::abs(val);
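And the matching derivative, again with slope sigma2 * x in the quadratic zone and the sign of x outside it (DemoSmoothL1Grad is an illustrative sketch):

#include <cmath>

inline float DemoSmoothL1Grad(const float val, const float sigma2) {
    const float abs_val = std::abs(val);
    if (abs_val < 1.f / sigma2) return sigma2 * val;
    return (val > 0.f) - (val < 0.f);  // sign of val, as in AbsGrad above
}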
@@ -705,6 +735,9 @@ template <> void SoftmaxCrossEntropy<float, CPUContext>(const int count,
    const float* prob,
    const float* target,
    float* loss) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
    for (int i = 0; i < count; ++i) {
        loss[i] = -target[i] * std::log(std::max(prob[i], FLT_MIN));
    }
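The std::max(prob[i], FLT_MIN) clamp above is what keeps the loss finite: log(0) is -inf, and one infinite element would poison any downstream reduction. A one-line sketch of the same term (DemoXentTerm is an illustrative name):

#include <algorithm>
#include <cfloat>
#include <cmath>

// FLT_MIN (the smallest normal float) bounds log() at about -87.3 instead of -inf.
inline float DemoXentTerm(const float prob, const float target) {
    return -target * std::log(std::max(prob, FLT_MIN));
}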
@@ -1016,9 +1049,12 @@ template <> void RMSPropUpdate<float, CPUContext>(const int count,

/******************** utils.compare ********************/

template <> void Equal<float, CPUContext>(const int count,
    const float* a,
    const float* b,
    float* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
    for (int i = 0; i < count; ++i)
        y[i] = fabs(a[i] - b[i]) < FLT_EPSILON ? 1.0 : 0.0;
}
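One caveat about Equal: FLT_EPSILON is the gap between 1.0f and the next representable float, so the absolute test above is reasonable near unit magnitude but effectively demands bit-equality for large operands and accepts large relative error for tiny ones. A relative-tolerance variant, sketched as an alternative (DemoNearlyEqual is illustrative, not part of this commit):

#include <algorithm>
#include <cfloat>
#include <cmath>

inline bool DemoNearlyEqual(const float a, const float b) {
    const float scale = std::max(std::fabs(a), std::fabs(b));
    return std::fabs(a - b) <= FLT_EPSILON * std::max(scale, 1.f);
}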
@@ -1096,6 +1132,9 @@ template <> void OneHot<float, CPUContext>(const int count,
    const int on_value,
    const float* x,
    float* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
    for (int i = 0; i < count; ++i) {
        const int val = x[i];
        y[i * depth + val] = on_value;
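OneHot writes only the "on" positions, so correctness depends on the output buffer being pre-filled with the off value and every x[i] lying in [0, depth). A usage sketch under those assumptions (the name and the vector-based signature are illustrative; the real kernel takes raw pointers):

#include <vector>

std::vector<float> DemoOneHot(const std::vector<float>& x,
                              const int depth,
                              const float on_value,
                              const float off_value) {
    std::vector<float> y(x.size() * depth, off_value);  // pre-fill with "off"
    for (size_t i = 0; i < x.size(); ++i)
        y[i * depth + static_cast<int>(x[i])] = on_value;
    return y;
}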
......
@@ -21,7 +21,7 @@ template<> void Empty<float, CUDAContext>() {
}

template<> void Empty<float16, CUDAContext>() {
    _Empty<float16> << <1, 1 >> >();
    CUDA_POST_KERNEL_CHECK;
}
@@ -102,6 +102,7 @@ template<> void Relu<float, CUDAContext>(const int count,
    CUDA_POST_KERNEL_CHECK;
}

#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _ReluHalf(const int count, const half* x, const float slope, half* y) {
    const half kSlope = __float2half(slope);
@@ -123,6 +124,7 @@ template<> void Relu<float16, CUDAContext>(const int count,
        reinterpret_cast<half*>(y));
    CUDA_POST_KERNEL_CHECK;
}
#endif
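The WITH_CUDA_FP16 guards introduced here (and repeated below for Scale, Concat, Transpose, Float2Half, and MemoryData) make every half-precision kernel conditional, so the project still builds on CUDA toolchains configured without fp16 support. A minimal sketch of the same guard around a hypothetical kernel (DemoHalfScale is illustrative; __float2half and __hmul are standard cuda_fp16.h intrinsics, with half arithmetic requiring compute capability 5.3+):

#ifdef WITH_CUDA_FP16
#include <cuda_fp16.h>

__global__ void DemoHalfScale(const int n, const half* x, const float s, half* y) {
    const half k = __float2half(s);
    // Grid-stride loop over all n elements.
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
         i += blockDim.x * gridDim.x) {
        y[i] = __hmul(x[i], k);
    }
}
#endif  // WITH_CUDA_FP16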
template <typename T>
__global__ void _ReluGrad(const int count,
@@ -477,6 +479,7 @@ template<> void Scale<float, CUDAContext>(const int axis,
        Ydata);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _ScaleWithoutBiasHalf(const int n,
    const half* x,
@@ -538,6 +541,7 @@ template<> void Scale<float16, CUDAContext>(const int axis,
        inner_dim,
        reinterpret_cast<half*>(Ydata));
}
#endif
template <> void ScaleGrad<float, CUDAContext>(const int axis,
    Tensor* dy,
@@ -730,6 +734,7 @@ template <> void Concat<float, CUDAContext>(const int count,
    CUDA_POST_KERNEL_CHECK;
}
#ifdef WITH_CUDA_FP16
template <> void Concat<float16, CUDAContext>(const int count,
    const int outer_dim,
    const int inner_dim,
@@ -749,6 +754,7 @@ template <> void Concat<float16, CUDAContext>(const int count,
        reinterpret_cast<half*>(y));
    CUDA_POST_KERNEL_CHECK;
}
#endif
template <typename T>
__global__ void _ConcatGrad(const int count,
@@ -789,6 +795,7 @@ template <> void ConcatGrad<float, CUDAContext>(const int count,
    CUDA_POST_KERNEL_CHECK;
}
#ifdef WITH_CUDA_FP16
template <> void ConcatGrad<float16, CUDAContext>(const int count,
    const int outer_dim,
    const int inner_dim,
@@ -808,6 +815,7 @@ template <> void ConcatGrad<float16, CUDAContext>(const int count,
        reinterpret_cast<half*>(dx));
    CUDA_POST_KERNEL_CHECK;
}
#endif
/******************** common.crop ********************/

@@ -1134,6 +1142,7 @@ template <> void Transpose<float, CUDAContext>(const int count,
    CUDA_POST_KERNEL_CHECK;
}
#ifdef WITH_CUDA_FP16
template <> void Transpose<float16, CUDAContext>(const int count,
    const int ndim,
    const int* order,
@@ -1150,6 +1159,7 @@ template <> void Transpose<float16, CUDAContext>(const int count,
        reinterpret_cast<half*>(y));
    CUDA_POST_KERNEL_CHECK;
}
#endif
template <typename T>
__global__ void _TransposeGrad(const int count,
@@ -1187,6 +1197,7 @@ template <> void TransposeGrad<float, CUDAContext>(const int count,
    CUDA_POST_KERNEL_CHECK;
}
#ifdef WITH_CUDA_FP16
template <> void TransposeGrad<float16, CUDAContext>(const int count,
    const int ndim,
    const int* order,
@@ -1203,6 +1214,7 @@ template <> void TransposeGrad<float16, CUDAContext>(const int count,
        reinterpret_cast<half*>(dx));
    CUDA_POST_KERNEL_CHECK;
}
#endif
/******************** loss.l1_loss ********************/

@@ -1834,6 +1846,7 @@ template <> void RMSPropUpdate<float, CUDAContext>(const int count,

/******************** utils.cast ********************/
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _FloatToHalfKernel(const int count, const float* x, half* y) {
    CUDA_KERNEL_LOOP(idx, count) {
@@ -1849,6 +1862,7 @@ template <> void Float2Half<float, CUDAContext>(const int count,
        reinterpret_cast<half*>(y));
    CUDA_POST_KERNEL_CHECK;
}
#endif
/******************** utils.compare ********************/

@@ -1943,6 +1957,7 @@ template <> void MemoryData<uint8_t, float, CUDAContext>(const int count,
    CUDA_POST_KERNEL_CHECK;
}
#ifdef WITH_CUDA_FP16
template <> void MemoryData<float, float16, CUDAContext>(const int count,
    const int num,
    const int channels,
@@ -1976,6 +1991,7 @@ template <> void MemoryData<uint8_t, float16, CUDAContext>(const int count,
        reinterpret_cast<half*>(y));
    CUDA_POST_KERNEL_CHECK;
}
#endif
/******************** utils.one_hot ********************/
......
# Dragon: A Computation Graph Virtual Machine Based Deep Learning Framework
![](http://images.cnblogs.com/cnblogs_com/neopenx/690760/o_dragon_logo.png)
-----
### Compile Requirements for C++
0. Google Protocol Buffer
......