Commit 6eeac5fe by Ting PAN

add omp optimization

1 parent 007d9c21
......@@ -12,6 +12,7 @@ option(WITH_PYTHON3 "Set ON to use PYTHON3 otherwise PYTHON2" OF
option(WITH_CUDA "Set ON to use CUDA" ON)
option(WITH_CUDNN "Set ON to use CUDNN" OFF)
option(WITH_BLAS "Set ON to use BLAS" OFF)
option(WITH_OMP "Set ON to use OpenMP" OFF)
option(WITH_SSE "Set ON to use SSE 4.1" ON)
option(WITH_MPI "Set ON to use MPI" OFF)
option(WITH_MPI_CUDA "Set ON to use MPI-CUDA" OFF)
......@@ -22,7 +23,7 @@ option(WITH_CUDA_FP16 "Set ON to use FP16" ON)
set(3RDPARTY_DIR ${PROJECT_SOURCE_DIR}/../3rdparty)
# set your python environment
set(PYTHON_DIR /usr/include/python2.7) # prefer
set(PYTHON_DIR /usr/include/python2.7) # preferred
#set(PYTHON_DIR /usr/include/python3.x) # optional, set specific version
#set(ANACONDA_DIR /xxx/anaconda) # optional, root folder of anaconda, preset for 2.7, 3.5, and 3.6
set(NUMPY_DIR /xxx/numpy) # required, root folder of numpy package
......@@ -118,6 +119,10 @@ else()
"\n -- > GEMM/GEMV is disabled"
"\n -- > prefer not to run as CPU Mode")
endif()
if (WITH_OMP)
ADD_DEFINITIONS(-DWITH_OMP)
message(STATUS "Use OpenMP [Optional]")
endif()
if (WITH_SSE)
ADD_DEFINITIONS(-DWITH_SSE)
message(STATUS "Use SSE [Optional]")
......@@ -145,11 +150,18 @@ endif()
# ---[ Flags
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${CUDA_ARCH}")
if(WIN32)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP /O2")
if (WITH_OMP)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /openmp")
endif()
endif()
if(UNIX)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -O2 -m64 -fpermissive -std=c++11")
if (WITH_OMP)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fopenmp")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
endif()
endif()
# ---[ Warnings
......
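Note: the new WITH_OMP option only takes effect when both pieces above are present, the WITH_OMP compile definition (so the #ifdef WITH_OMP guards are compiled in) and the compiler's OpenMP flag (/openmp on MSVC, -fopenmp on GCC/Clang, so the pragmas are honored). A minimal, illustrative C++ sketch of the guard pattern this commit repeats:

#include <vector>
#ifdef WITH_OMP
#include <omp.h>
#endif

// Built with -DWITH_OMP plus -fopenmp (or /openmp), the loop runs in parallel;
// otherwise the pragma is compiled out and the loop stays serial.
void scale_inplace(std::vector<float>& v, const float alpha) {
#ifdef WITH_OMP
    #pragma omp parallel for
#endif
    for (int i = 0; i < (int)v.size(); ++i) v[i] *= alpha;
}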
// --------------------------------------------------------
// Dragon
// Copyright(c) 2017 SeetaTech
// Written by Ting Pan
// --------------------------------------------------------
#ifndef DRAGON_UTILS_OMP_ALTERNATIVE_H_
#define DRAGON_UTILS_OMP_ALTERNATIVE_H_
#ifdef WITH_OMP
#include <algorithm>
#include <omp.h>
namespace dragon {
#define OMP_MIN_ITERATORS_PER_CORE 256
inline int GET_OMP_THREADS(const int N) {
int threads = std::max(N / OMP_MIN_ITERATORS_PER_CORE, 1);
return std::min(threads, omp_get_num_procs());
}
}
#endif // WITH_OMP
#endif // DRAGON_UTILS_OMP_ALTERNATIVE_H_
\ No newline at end of file
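The helper above picks a thread count from the problem size: at least 256 iterations per thread, never more threads than processors. For example, n = 4096 requests min(4096 / 256, omp_get_num_procs()) threads, while a tiny loop requests a single thread. A small, illustrative sketch of the arithmetic (compile with OpenMP enabled):

#include <algorithm>
#include <cstdio>
#include <omp.h>

// Same heuristic as GET_OMP_THREADS above: >= 256 iterations per thread,
// capped at the number of available processors.
static int threads_for(const int n) {
    const int threads = std::max(n / 256, 1);
    return std::min(threads, omp_get_num_procs());
}

int main() {
    const int sizes[] = { 100, 4096, 1 << 20 };
    for (int n : sizes)
        std::printf("n = %7d -> %d threads\n", n, threads_for(n));
    return 0;
}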
......@@ -15,11 +15,10 @@
namespace dragon {
#define SSE_LOOP1(i, n) \
int32_t i; \
for (i = 0; i < n - 4; i += 4) \
#define SSE_LOOP2(i, n) \
for (; i < n; i++)
for (; i < n; ++i)
#define SSE_FP32_LOAD _mm_loadu_ps
#define SSE_FP32_STORE _mm_storeu_ps
......
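Expanded by hand, the two loop macros above split a length-n array operation into a 4-wide SSE body and a scalar tail. An illustrative, self-contained sketch of the same pattern, using the unaligned load/store intrinsics named in the hunk:

#include <cstdint>
#include <xmmintrin.h>  // SSE

void add_sse(const int32_t n, const float* a, const float* b, float* y) {
    int32_t i = 0;
    // SSE_LOOP1: process 4 floats per step with unaligned loads/stores
    for (i = 0; i < n - 4; i += 4) {
        const __m128 x1 = _mm_loadu_ps(a + i);
        const __m128 y1 = _mm_loadu_ps(b + i);
        _mm_storeu_ps(y + i, _mm_add_ps(x1, y1));
    }
    // SSE_LOOP2: scalar tail for whatever the vector body did not cover
    for (; i < n; ++i) y[i] = a[i] + b[i];
}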
......@@ -53,18 +53,24 @@ void AddOp<Context>::RunOnDevice() {
}
else if (input(0).dim(0) == input(1).dim(0) && input(1).count(1) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(2);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(2);
#endif
else LOG(FATAL) << "unsupported input types.";
}
else if (input(0).dim(-1) == input(1).dim(-1) &&
input(1).count(0, input(1).axis(-1)) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(1);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(1);
#endif
else LOG(FATAL) << "unsupported input types.";
}
else if (input(1).ndim() == 1 && input(1).dim(0) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(0);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(0);
#endif
else LOG(FATAL) << "unsupported input types.";
}
else {
......@@ -139,18 +145,24 @@ void AddGradientOp<Context>::RunOnDevice() {
}
else if (input(-1).dim(0) == input(0).dim(0) && input(0).count(1) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(2);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(2);
#endif
else LOG(FATAL) << "unsupported input types.";
}
else if (input(-1).dim(-1) == input(0).dim(-1) &&
input(0).count(0, input(0).axis(-1)) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(1);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(1);
#endif
else LOG(FATAL) << "unsupported input types.";
}
else if (input(0).ndim() == 1 && input(0).dim(0) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(0);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(0);
#endif
else LOG(FATAL) << "unsupported input types.";
}
else {
......
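The same WITH_CUDA_FP16 guard is applied to every float16 dispatch throughout the arithmetic ops in this commit. A toy, runnable illustration of the shape of that dispatch (not the framework's own API):

#include <cstdio>

enum class DType { kFloat32, kFloat16 };

// With WITH_CUDA_FP16 defined, float16 inputs get their own path;
// otherwise they fall through to the "unsupported" branch.
static void Run(const DType t) {
    if (t == DType::kFloat32) std::printf("run<float>\n");
#ifdef WITH_CUDA_FP16
    else if (t == DType::kFloat16) std::printf("run<float16>\n");
#endif
    else std::printf("FATAL: unsupported input types.\n");
}

int main() {
    Run(DType::kFloat32);
    Run(DType::kFloat16);
    return 0;
}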
......@@ -54,18 +54,24 @@ void DivOp<Context>::RunOnDevice() {
}
else if (input(0).dim(0) == input(1).dim(0) && input(1).count(1) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(2);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(2);
#endif
else LOG(FATAL) << "unsupported input types";
}
else if (input(0).dim(-1) == input(1).dim(-1) &&
input(1).count(0, input(1).axis(-1)) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(1);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(1);
#endif
else LOG(FATAL) << "unsupported input types";
}
else if (input(1).ndim() == 1 && input(1).dim(0) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(0);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(0);
#endif
else LOG(FATAL) << "unsupported input types";
}
else {
......@@ -170,18 +176,24 @@ void DivGradientOp<Context>::RunOnDevice() {
}
else if (input(0).dim(0) == input(1).dim(0) && input(1).count(1) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(2);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(2);
#endif
else LOG(FATAL) << "unsupported input types";
}
else if (input(0).dim(-1) == input(1).dim(-1) &&
input(1).count(0, input(1).axis(-1)) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(1);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(1);
#endif
else LOG(FATAL) << "unsupported input types";
}
else if (input(1).ndim() == 1 && input(1).dim(0) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(0);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(0);
#endif
else LOG(FATAL) << "unsupported input types";
}
else {
......
......@@ -55,7 +55,9 @@ void DotOp<Context>::RunOnDevice() {
dims[dims.size() - 1] = N1;
output(0)->Reshape(dims);
if (input(0).template IsType<float>()) GemmRunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) GemmRunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
else if (input(0).ndim() >= 2 && input(1).ndim() == 1) {
......@@ -70,7 +72,9 @@ void DotOp<Context>::RunOnDevice() {
dims.pop_back();
output(0)->Reshape(dims);
if (input(0).template IsType<float>()) GemvRunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) GemvRunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
else {
......@@ -148,7 +152,9 @@ void DotGradientOp<Context>::RunOnDevice() {
<< input(0).dim_string() << " can not Dot with Tensor"
<< "(" << input(1).name() << "): " << input(1).dim_string();
if (input(0).template IsType<float>()) GemmRunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) GemmRunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
else if (input(0).ndim() >= 2 && input(1).ndim() == 1) {
......@@ -160,7 +166,9 @@ void DotGradientOp<Context>::RunOnDevice() {
<< input(0).dim_string() << " can not Dot with Tensor"
<< "(" << input(1).name() << "): " << input(1).dim_string();
if (input(0).template IsType<float>()) GemvRunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) GemvRunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
else {
......
......@@ -41,12 +41,16 @@ void EltwiseOp<Context>::RunOnDevice() {
if (operation == "SUM") {
if (input(0).template IsType<float>()) SumRunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) SumRunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
else if (operation == "PROD") {
if (input(0).template IsType<float>()) ProdRunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) ProdRunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
else {
......@@ -104,12 +108,16 @@ void EltwiseGradientOp<Context>::RunOnDevice() {
if (operation == "SUM") {
if (input(0).template IsType<float>()) SumRunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) SumRunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
else if (operation == "PROD") {
if (input(0).template IsType<float>()) ProdRunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) ProdRunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
else {
......
......@@ -25,7 +25,9 @@ void GramMatrixOp<Context>::RunOnDevice() {
output(0)->Reshape(vector<TIndex>({ outer_dim, dim, dim }));
if (input(0).template IsType<float>()) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) RunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
......@@ -57,7 +59,9 @@ void GramMatrixGradientOp<Context>::RunOnDevice() {
output(0)->ReshapeLike(input(0));
if (input(0).template IsType<float>()) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) RunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
......
......@@ -48,7 +48,9 @@ void MatmulOp<Context>::RunOnDevice() {
dims[dims.size() - 1] = N;
output(0)->Reshape(dims);
if (input(0).template IsType<float>()) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) RunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
......@@ -105,7 +107,9 @@ void MatmulGradientOp<Context>::RunOnDevice() {
output(0)->ReshapeLike(input(0));
output(1)->ReshapeLike(input(1));
if (input(0).template IsType<float>()) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) RunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
......
......@@ -54,18 +54,24 @@ void MulOp<Context>::RunOnDevice() {
}
else if (input(0).dim(0) == input(1).dim(0) && input(1).count(1) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(2);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(2);
#endif
else LOG(FATAL) << "unsupported input types.";
}
else if (input(0).dim(-1) == input(1).dim(-1) &&
input(1).count(0, input(1).axis(-1)) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(1);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(1);
#endif
else LOG(FATAL) << "unsupported input types.";
}
else if (input(1).ndim() == 1 && input(1).dim(0) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(0);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(0);
#endif
else LOG(FATAL) << "unsupported input types.";
}
else {
......@@ -158,18 +164,24 @@ void MulGradientOp<Context>::RunOnDevice() {
}
else if (input(0).dim(0) == input(1).dim(0) && input(1).count(1) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(2);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(2);
#endif
else LOG(FATAL) << "unsupported input types.";
}
else if (input(0).dim(-1) == input(1).dim(-1) &&
input(1).count(0, input(1).axis(-1)) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(1);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(1);
#endif
else LOG(FATAL) << "unsupported input types.";
}
else if (input(1).ndim() == 1 && input(1).dim(0) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(0);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(0);
#endif
else LOG(FATAL) << "unsupported input types.";
}
else {
......
......@@ -26,7 +26,9 @@ void PowOp<Context>::RunOnDevice() {
output(0)->ReshapeLike(input(0));
if (input(0).template IsType<float>()) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) RunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
......@@ -76,7 +78,9 @@ void PowGradientOp<Context>::RunOnDevice() {
output(0)->ReshapeLike(input(0));
if (input(0).template IsType<float>()) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) RunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
......
......@@ -37,7 +37,9 @@ void ScaleOp<Context>::RunOnDevice() {
output(0)->ReshapeLike(input(0));
if (input(0).template IsType<float>()) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) RunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
......
......@@ -53,18 +53,24 @@ void SubOp<Context>::RunOnDevice() {
}
else if (input(0).dim(0) == input(1).dim(0) && input(1).count(1) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(2);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(2);
#endif
else LOG(FATAL) << "unsupported input types.";
}
else if (input(0).dim(-1) == input(1).dim(-1) &&
input(1).count(0, input(1).axis(-1)) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(1);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(1);
#endif
else LOG(FATAL) << "unsupported input types.";
}
else if (input(1).ndim() == 1 && input(1).dim(0) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(0);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(0);
#endif
else LOG(FATAL) << "unsupported input types.";
}
else {
......@@ -139,18 +145,24 @@ void SubGradientOp<Context>::RunOnDevice() {
}
else if (input(-1).dim(0) == input(0).dim(0) && input(0).count(1) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(2);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(2);
#endif
else LOG(FATAL) << "unsupported input types.";
}
else if (input(-1).dim(-1) == input(0).dim(-1) &&
input(0).count(0, input(0).axis(-1)) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(1);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(1);
#endif
else LOG(FATAL) << "unsupported input types.";
}
else if (input(0).ndim() == 1 && input(0).dim(0) == 1) {
if (input(0).template IsType<float>()) BroadcastRunWithType<float>(0);
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) BroadcastRunWithType<float16>(0);
#endif
else LOG(FATAL) << "unsupported input types.";
}
else {
......
......@@ -49,7 +49,9 @@ void ConcatOp<Context>::RunOnDevice() {
}
if (input(0).template IsType<float>()) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) RunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
......@@ -96,7 +98,9 @@ void ConcatGradientOp<Context>::RunOnDevice() {
}
if (input(0).template IsType<float>()) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) RunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
......
......@@ -45,7 +45,9 @@ void TransposeOp<Context>::RunOnDevice() {
}
if (input(0).template IsType<float>()) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) RunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
......@@ -75,7 +77,9 @@ void TransposeGradientOp<Context>::RunOnDevice() {
new_steps = ws()->GetTensor("_t_" + anchor() + "_new_steps");
if (input(0).template IsType<float>()) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) RunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
......
......@@ -127,7 +127,9 @@ void BatchNormOp<Context>::RunOnDevice() {
if (input(0).template IsType<float>()) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) RunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
......@@ -247,7 +249,9 @@ void BatchNormGradientOp<Context>::RunOnDevice() {
else use_global_stats = use_stats == 1 ? true : false;
if (input(0).template IsType<float>()) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) RunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
......
......@@ -78,7 +78,9 @@ void L2NormOp<Context>::RunOnDevice() {
output(0)->ReshapeLike(input(0));
if (input(0).template IsType<float>()) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) RunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
......
......@@ -4,6 +4,8 @@
namespace dragon {
#ifdef WITH_CUDA_FP16
template <class Context>
void FloatToHalfOp<Context>::RunOnDevice() {
CHECK(input(0).template IsType<float>())
......@@ -28,4 +30,6 @@ OPERATOR_SCHEMA(FloatToHalf).NumInputs(1).NumOutputs(1);
NO_GRADIENT(FloatToHalf);
#endif
} // namespace dragon
\ No newline at end of file
......@@ -19,7 +19,9 @@ void GradientGenerateOp<Context>::RunWithType() {
template <class Context>
void GradientGenerateOp<Context>::RunOnDevice() {
if (input(0).template IsType<float>()) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (input(0).template IsType<float16>()) RunWithType<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
......
......@@ -23,12 +23,16 @@ void MemoryDataOp<Context>::RunOnDevice() {
if (input(0).template IsType<float>()) {
if (data_type == TensorProto_DataType_FLOAT) RunWithType<float, float>();
#ifdef WITH_CUDA_FP16
else if (data_type == TensorProto_DataType_FLOAT16) RunWithType<float, float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
}
else if (input(0).template IsType<uint8_t>()) {
if (data_type == TensorProto_DataType_FLOAT) RunWithType<uint8_t, float>();
#ifdef WITH_CUDA_FP16
if (data_type == TensorProto_DataType_FLOAT16) RunWithType<uint8_t, float16>();
#endif
}
else { LOG(FATAL) << "unsupported input types."; }
}
......
......@@ -58,7 +58,9 @@ void DenseConcatGradientOp<Context>::ElimateCorruption() {
input(0).Move(buffer->memory());
head_data[idx] = input(0).name();
if (input(-2).template IsType<float>()) RestoreX1<float>();
#ifdef WITH_CUDA_FP16
else if (input(-2).template IsType<float16>()) RestoreX1<float16>();
#endif
else LOG(FATAL) << "unsupported input types.";
// post-process
if (input(0).memory() != buffer->memory()) buffer->Move(input(0).memory());
......
#include <random>
#include "core/context.h"
#include "utils/math_functions.h"
#ifdef WITH_SSE
#include "utils/omp_alternative.h"
#include "utils/sse_alternative.h"
#endif
#include "utils/math_functions.h"
namespace dragon {
......@@ -22,9 +20,12 @@ template <> void Set<float, CPUContext>(const int n,
}
#ifdef WITH_SSE
sse::Set<float>(n, alpha, x);
#else // naive implement
#else
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) x[i] = alpha;
#endif
#endif // WITH_SSE
}
template <> void Set<int, CPUContext>(const int n,
......@@ -36,9 +37,12 @@ template <> void Set<int, CPUContext>(const int n,
}
#ifdef WITH_SSE
sse::Set<int>(n, alpha, x);
#else // naive implement
#else
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) x[i] = alpha;
#endif
#endif // WITH_SSE
}
template <> void Set<float16, CPUContext>(const int n,
......@@ -52,9 +56,10 @@ template <> void RandomUniform<float, CPUContext>(const int n,
const float high,
float* x) {
std::uniform_real_distribution<float> distribution(low, high);
for (int i = 0; i < n; ++i) {
x[i] = distribution(*rand_generator());
}
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) x[i] = distribution(*rand_generator());
}
template <> void RandomUniform<float16, CPUContext>(const int n,
......@@ -69,9 +74,10 @@ template <> void RandomUniform<uint32_t, CPUContext>(const int n,
const float high,
uint32_t* x) {
std::uniform_int_distribution<uint32_t> distribution(low, high);
for (int i = 0; i < n; ++i) {
x[i] = distribution(*rand_generator());
}
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) x[i] = distribution(*rand_generator());
}
template <> void RandomNormal<float, CPUContext>(const int n,
......@@ -79,9 +85,10 @@ template <> void RandomNormal<float, CPUContext>(const int n,
const float sigma,
float* x) {
std::normal_distribution<float> distribution(mu, sigma);
for (int i = 0; i < n; ++i) {
x[i] = distribution(*rand_generator());
}
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) x[i] = distribution(*rand_generator());
}
template <> void RandomNormal<float16, CPUContext>(const int n,
......@@ -121,9 +128,10 @@ template <> void RandomBernoulli<float, CPUContext>(const int n,
const float p,
uint32_t* x) {
std::bernoulli_distribution distribution(p);
for (int i = 0; i < n; ++i) {
x[i] = distribution(*rand_generator());
}
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) x[i] = distribution(*rand_generator());
}
/******************** Level-1 ********************/
......@@ -134,9 +142,12 @@ template <> void Add<float, CPUContext>(const int n,
float* y) {
#ifdef WITH_SSE
sse::Add<float>(n, a, b, y);
#else // naive implement
for (int i = 0; i < n; ++i) y[i] = a[i] + b[i];
#else
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) y[i] = a[i] + b[i];
#endif // WITH_SSE
}
template <> void Sub<float, CPUContext>(const int n,
......@@ -145,9 +156,12 @@ template <> void Sub<float, CPUContext>(const int n,
float* y) {
#ifdef WITH_SSE
sse::Sub<float>(n, a, b, y);
#else // naive implement
for (int i = 0; i < n; ++i) y[i] = a[i] - b[i];
#else
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) y[i] = a[i] - b[i];
#endif // WITH_SSE
}
template <> void Mul<float, CPUContext>(const int n,
......@@ -156,9 +170,12 @@ template <> void Mul<float, CPUContext>(const int n,
float* y) {
#ifdef WITH_SSE
sse::Mul<float>(n, a, b, y);
#else // naive implement
for (int i = 0; i < n; ++i) y[i] = a[i] * b[i];
#else
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) y[i] = a[i] * b[i];
#endif // WITH_SSE
}
template <> void Mul<float16, CPUContext>(const int n,
......@@ -174,9 +191,12 @@ template <> void Div<float, CPUContext>(const int n,
float* y) {
#ifdef WITH_SSE
sse::Div<float>(n, a, b, y);
#else // naive implement
#else
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) y[i] = a[i] / b[i];
#endif
#endif // WITH_SSE
}
template <> void Div<float16, CPUContext>(const int n,
......@@ -190,6 +210,9 @@ template <> void Clip<float, CPUContext>(const int n,
const float low,
const float high,
float* x) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) {
x[i] = std::max(low, std::min(x[i], high));
}
......@@ -198,6 +221,9 @@ template <> void Clip<float, CPUContext>(const int n,
template <> void Exp<float, CPUContext>(int n,
const float* x,
float* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) {
y[i] = std::exp(x[i]);
}
......@@ -206,6 +232,9 @@ template <> void Exp<float, CPUContext>(int n,
template <> void Log<float, CPUContext>(int n,
const float* x,
float* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) {
y[i] = std::log(x[i]);
}
......@@ -214,6 +243,9 @@ template <> void Log<float, CPUContext>(int n,
template <> void Square<float, CPUContext>(int n,
const float* x,
float* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) {
y[i] = x[i] * x[i];
}
......@@ -228,6 +260,9 @@ template <> void Square<float16, CPUContext>(int n,
template <> void Sqrt<float, CPUContext>(int n,
const float* x,
float* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) {
y[i] = std::sqrt(x[i]);
}
......@@ -243,6 +278,9 @@ template <> void Pow<float, CPUContext>(int n,
const float alpha,
const float* x,
float* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) {
y[i] = std::pow(x[i], alpha);
}
......@@ -259,6 +297,9 @@ template <> void Inv<float, CPUContext>(const int n,
const float numerator,
const float* x,
float* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) {
y[i] = 1.0 / y[i];
}
......@@ -280,9 +321,12 @@ template <> void Scal<float, CPUContext>(const int n,
cblas_sscal(n, alpha, y, 1);
#elif WITH_SSE
sse::Scal<float>(n, alpha, y);
#else // naive implement
for (int i = 0; i < n; ++i) y[i] = y[i] * alpha;
#else
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) y[i] = y[i] * alpha;
#endif // WITH_BLAS
}
template <> void Scal<float16, CPUContext>(const int n,
......@@ -307,9 +351,12 @@ template <> void Scale<float, CPUContext>(const int n,
cblas_sscal(n, alpha, y, 1);
#elif WITH_SSE
sse::Scale<float>(n, alpha, x, y);
#else // naive implement
for (int i = 0; i < n; ++i) y[i] = x[i] * alpha;
#else
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) y[i] = x[i] * alpha;
#endif // WITH_BLAS
}
template <> float StridedDot<float, CPUContext>(const int n,
......@@ -319,11 +366,14 @@ template <> float StridedDot<float, CPUContext>(const int n,
const int incy) {
#ifdef WITH_BLAS
return cblas_sdot(n, a, incx, b, incy);
#else // naive implement
#else
float ret = 0.f;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) ret += a[i] * b[i];
return ret;
#endif
#endif // WITH_BLAS
}
template <> float Dot<float, CPUContext>(int n,
......@@ -333,11 +383,14 @@ template <> float Dot<float, CPUContext>(int n,
return StridedDot<float, CPUContext>(n, a, 1, b, 1);
#elif WITH_SSE
return sse::Dot<float>(n, a, b);
#else // naive implement
#else
float ret = 0.f;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) ret += a[i] * b[i];
return ret;
#endif
#endif // WITH_BLAS
}
template <> float Dot<float16, CPUContext>(int n,
......@@ -350,23 +403,29 @@ template <> float Dot<float16, CPUContext>(int n,
template <> float ASum<float, CPUContext>(const int n, const float* x) {
#ifdef WITH_BLAS
return cblas_sasum(n, x, 1);
#elif WITH_SSE
#elif WITH_SSE
return sse::ASum<float>(n, x);
#else // naive implement
#else
float ret = 0.f;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) ret += x[i];
return ret;
#endif
#endif // WITH_BLAS
}
template <> void AddScalar<float, CPUContext>(const int n,
const float alpha,
float* y) {
#ifdef WITH_SSE
#ifdef WITH_SSE
sse::AddScalar<float>(n, alpha, y);
#else // naive implement
for (int i = 0; i < n; ++i) y[i] += alpha;
#else
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) y[i] += alpha;
#endif // WITH_SSE
}
template <> void AddScalar<float16, CPUContext>(const int n,
......@@ -378,11 +437,14 @@ template <> void AddScalar<float16, CPUContext>(const int n,
template <> void MulScalar<float, CPUContext>(const int n,
const float alpha,
float* y) {
#ifdef WITH_SSE
#ifdef WITH_SSE
sse::MulScalar<float>(n, alpha, y);
#else // naive implement
for (int i = 0; i < n; ++i) y[i] *= alpha;
#else
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) y[i] *= alpha;
#endif // WITH_SSE
}
template <> void Axpy<float, CPUContext>(const int n,
......@@ -393,9 +455,12 @@ template <> void Axpy<float, CPUContext>(const int n,
cblas_saxpy(n, alpha, x, 1, y, 1);
#elif WITH_SSE
sse::Axpy<float>(n, alpha, x, y);
#else // naive implement
for (int i = 0; i < n; ++i) y[i] = alpha * x[i] + y[i];
#else
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) y[i] = alpha * x[i] + y[i];
#endif // WITH_BLAS
}
template <> void Axpy<float16, CPUContext>(const int n,
......@@ -415,9 +480,12 @@ template <> void Axpby<float, CPUContext>(const int n,
cblas_saxpy(n, alpha, x, 1, y, 1);
#elif WITH_SSE
sse::Axpby<float>(n, alpha, x, beta, y);
#else // naive implement
for (int i = 0; i < n; ++i) y[i] = alpha * x[i] + beta* y[i];
#else
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) y[i] = alpha * x[i] + beta* y[i];
#endif // WITH_BLAS
}
template <> void Axpby<float16, CPUContext>(const int n,
......
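One detail worth noting about the accumulating fallbacks above (StridedDot, Dot, ASum): every iteration adds into a single ret, so when such a loop is run in parallel the standard OpenMP idiom is a reduction clause, which gives each thread a private partial sum and combines them at the end. A minimal, illustrative sketch (not the code above):

#ifdef WITH_OMP
#include <omp.h>
#endif

float dot_naive(const int n, const float* a, const float* b) {
    float ret = 0.f;
#ifdef WITH_OMP
    // reduction(+:ret): per-thread partial sums, combined when the loop finishes
    #pragma omp parallel for reduction(+:ret)
#endif
    for (int i = 0; i < n; ++i) ret += a[i] * b[i];
    return ret;
}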
......@@ -40,6 +40,7 @@ template <> void Set<int, CUDAContext>(const int n,
_Set<int> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, alpha, x);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _SetHalf2(const int n, const half2 alpha, half2* x) {
CUDA_KERNEL_LOOP(idx, n) {
......@@ -61,6 +62,7 @@ template <> void Set<float16, CUDAContext>(const int n,
_Set<float16> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, alpha, x);
}
}
#endif
template <> void RandomUniform<uint32_t, CUDAContext>(const int n,
const float low,
......@@ -144,6 +146,7 @@ template <> void Mul<float, CUDAContext>(int n,
_Mul<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, a, b, y);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _MulHalf(const int n, const half* a, const half* b, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
......@@ -161,7 +164,7 @@ __global__ void _MulHalf2(const int n, const half2* a, const half2* b, half2* y)
#endif
}
}
template <> void Mul<float16, CUDAContext>(int n,
const float16* a,
const float16* b,
......@@ -176,6 +179,7 @@ template <> void Mul<float16, CUDAContext>(int n,
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
}
#endif
template <typename T>
__global__ void _Div(const int n, const T* a, const T* b, T* y) {
......@@ -191,6 +195,7 @@ template <> void Div<float, CUDAContext>(int n,
_Div<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, a, b, y);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _DivHalf(const int n, const half* a, const half* b, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
......@@ -209,6 +214,7 @@ template <> void Div<float16, CUDAContext>(int n,
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
}
#endif
template <typename T>
__global__ void _Clip(const int n, const T low, const T high, T* x) {
......@@ -260,6 +266,7 @@ template <> void Square<float, CUDAContext>(int n,
_Square<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, x, y);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _SquareHalf(const int n, const half* x, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
......@@ -290,6 +297,7 @@ template <> void Square<float16, CUDAContext>(int n,
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
}
#endif
template <typename T>
__global__ void _Sqrt(const int n, const T* x, T* y) {
......@@ -304,6 +312,7 @@ template <> void Sqrt<float, CUDAContext>(int n,
_Sqrt<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, x, y);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _SqrtHalf(const int n, const half* x, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
......@@ -334,6 +343,7 @@ template <> void Sqrt<float16, CUDAContext>(int n,
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
}
#endif
template <typename T>
__global__ void _Pow(const int n, const T alpha, const T* a, T* y) {
......@@ -349,6 +359,7 @@ template <> void Pow<float, CUDAContext>(int n,
_Pow<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, alpha, x, y);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _PowHalf(const int n, const float alpha, const half* a, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
......@@ -384,6 +395,7 @@ template <> void Pow<float16, CUDAContext>(int n,
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
}
#endif
template <typename T>
__global__ void _Inv(const int n, const float numerator, const T* x, T* y) {
......@@ -399,6 +411,7 @@ template <> void Inv<float, CUDAContext>(const int n,
_Inv<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, numerator, x, y);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _InvHalf(const int n, const half numerator, const half* x, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
......@@ -439,6 +452,7 @@ template <> void Inv<float16, CUDAContext>(const int n,
}
CUDA_POST_KERNEL_CHECK;
}
#endif
/******************** Level-2 ********************/
......@@ -518,6 +532,7 @@ template <> void AddScalar<float, CUDAContext>(const int n, const float alpha, f
_AddScalar<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, alpha, y);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _AddScalarHalf(const int n, half alpha, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
......@@ -552,6 +567,7 @@ template <> void AddScalar<float16, CUDAContext>(const int n, const float alpha,
}
CUDA_POST_KERNEL_CHECK;
}
#endif
template <typename T>
__global__ void _MulScalar(const int n, T alpha, T* y) {
......@@ -641,6 +657,7 @@ template <> void Gemm<float, CUDAContext>(const CBLAS_TRANSPOSE transA,
C, N));
}
#ifdef WITH_CUDA_FP16
template <> void Gemm<float16, CUDAContext>(const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB,
const int M,
......@@ -682,6 +699,7 @@ template <> void Gemm<float16, CUDAContext>(const CBLAS_TRANSPOSE transA,
LOG(FATAL) << "unsupported math type";
}
}
#endif
template <> void Gemv<float, CUDAContext>(const CBLAS_TRANSPOSE transA,
const int M, const int N,
......@@ -702,6 +720,7 @@ template <> void Gemv<float, CUDAContext>(const CBLAS_TRANSPOSE transA,
y, 1));
}
#ifdef WITH_CUDA_FP16
template <> void Gemv<float16, CUDAContext>(const CBLAS_TRANSPOSE transA,
const int M,
const int N,
......@@ -742,6 +761,7 @@ template <> void Gemv<float16, CUDAContext>(const CBLAS_TRANSPOSE transA,
LOG(FATAL) << "unsupported math type";
}
}
#endif
} // namespace math
......
......@@ -3,11 +3,9 @@
#include "core/tensor.h"
#include "utils/op_kernel.h"
#include "utils/math_functions.h"
#ifdef WITH_SSE
#include "utils/omp_alternative.h"
#include "utils/sse_alternative.h"
#endif
#include "utils/math_functions.h"
bool judge(int a, int b) { return unsigned(a) < unsigned(b); }
......@@ -28,8 +26,10 @@ template<> void Dropout<float, CPUContext>(const int count,
CPUContext* context) {
uint32_t thresh = static_cast<uint32_t>(UINT_MAX * prob);
math::RandomBernoulli<float, CPUContext>(count, 1 - prob, mask);
for (int i = 0; i < count; ++i)
y[i] = x[i] * mask[i] * scale;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) y[i] = x[i] * mask[i] * scale;
}
template<> void DropoutGrad<float, CPUContext>(const int count,
......@@ -38,8 +38,10 @@ template<> void DropoutGrad<float, CPUContext>(const int count,
const float* dy,
const uint32_t* mask,
float* dx) {
for (int i = 0; i < count; ++i)
dx[i] = dy[i] * mask[i] * scale;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) dx[i] = dy[i] * mask[i] * scale;
}
/******************** activation.relu ********************/
......@@ -48,6 +50,9 @@ template<> void Relu<float, CPUContext>(const int count,
const float* x,
const float slope,
float* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
y[i] = std::max(x[i], 0.f) + slope * std::min(x[i], 0.f);
}
......@@ -58,10 +63,12 @@ template<> void ReluGrad<float, CPUContext>(const int count,
const float* y,
const float slope,
float* dx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
dx[i] = dy[i] * ((y[i] > 0) + slope * (y[i] <= 0));
}
}
/******************** activation.sigmoid ********************/
......@@ -70,15 +77,19 @@ template <typename T>
T _sigmoid(T x) { return T(1) / (T(1) + exp(-x)); }
template<> void Sigmoid<float, CPUContext>(const int count, const float* x, float* y) {
for (int i = 0; i < count; ++i) {
y[i] = _sigmoid<float>(x[i]);
}
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) y[i] = _sigmoid<float>(x[i]);
}
template<> void SigmoidGrad<float, CPUContext>(const int count,
const float* dy,
const float* y,
float* dx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
dx[i] = dy[i] * y[i] * (1 - y[i]);
}
......@@ -149,6 +160,9 @@ template<> void SoftmaxGrad<float, CPUContext>(const int count,
/******************** activation.tanh ********************/
template<> void Tanh<float, CPUContext>(const int count, const float* x, float* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
y[i] = std::tanh(x[i]);
}
......@@ -158,6 +172,9 @@ template<> void TanhGrad<float, CPUContext>(const int count,
const float* dy,
const float* y,
float* dx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
dx[i] = dy[i] * (1 - y[i] * y[i]);
}
......@@ -197,6 +214,9 @@ template <> void Clip<float, CPUContext>(const int count,
const float* x,
float* mask,
float* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
mask[i] = 1.0;
if (x[i] < low || x[i] > high) mask[i] = 0.0;
......@@ -300,8 +320,10 @@ template<> void Argmax<float, CPUContext>(const int count,
/******************** common.at ********************/
template <> void CanonicalAxis<float, CPUContext>(const int count, const int dim, float* y) {
for (int i = 0; i < count; ++i)
if (y[i] < 0) y[i] += dim;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) if (y[i] < 0) y[i] += dim;
}
template <> void At<float, CPUContext>(const int count,
......@@ -478,6 +500,9 @@ template<> void Sum<float, CPUContext>(const int count,
const int inner_dim,
const float* x,
float* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
float sum_val = 0.0;
for (int j = 0; j < axis_dim; ++j)
......@@ -492,6 +517,9 @@ template<> void SumGrad<float, CPUContext>(const int count,
const float coeff,
const float* dy,
float* dx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
for (int j = 0; j < axis_dim; ++j)
dx[(i / inner_dim * axis_dim + j) * inner_dim + i % inner_dim] = dy[i] * coeff;
......@@ -585,6 +613,9 @@ template <> void Transpose<float, CPUContext>(const int count,
const int* new_steps,
const float* x,
float* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
int x_idx = 0, y_idx = i;
for (int j = 0; j < ndim; ++j) {
......@@ -603,15 +634,7 @@ template <> void Transpose<float16, CPUContext>(const int count,
const int* new_steps,
const float16* x,
float16* y) {
for (int i = 0; i < count; ++i) {
int x_idx = 0, y_idx = i;
for (int j = 0; j < ndim; ++j) {
int k = order[j];
x_idx += (y_idx / new_steps[j]) * old_steps[k];
y_idx %= new_steps[j];
}
y[i] = x[x_idx];
}
LOG(FATAL) << "unsupport float16 with CPU";
}
template <> void TransposeGrad<float, CPUContext>(const int count,
......@@ -621,6 +644,9 @@ template <> void TransposeGrad<float, CPUContext>(const int count,
const int* new_steps,
const float* dy,
float* dx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
int x_idx = 0, y_idx = i;
for (int j = 0; j < ndim; ++j) {
......@@ -639,20 +665,15 @@ template <> void TransposeGrad<float16, CPUContext>(const int count,
const int* new_steps,
const float16* dy,
float16* dx) {
for (int i = 0; i < count; ++i) {
int x_idx = 0, y_idx = i;
for (int j = 0; j < ndim; ++j) {
int k = order[j];
x_idx += (y_idx / new_steps[j]) * old_steps[k];
y_idx %= new_steps[j];
}
dx[x_idx] = dy[i];
}
LOG(FATAL) << "unsupport float16 with CPU";
}
/******************** loss.l1_loss ********************/
template<> void AbsGrad<float, CPUContext>(const int count, const float* dy, float* dx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
const float val = dy[i];
// val > 0: 1 | val == 0: 0 | val < 0: -1
......@@ -666,6 +687,9 @@ template <> void SigmoidCrossEntropy<float, CPUContext>(const int count,
const float* x,
const float* target,
float* loss) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
loss[i] = std::log(1 + std::exp(x[i] - 2 * x[i] * (x[i] >= 0)))
+ x[i] * ((x[i] >= 0) - target[i]);
......@@ -678,6 +702,9 @@ template<> void SmoothL1<float, CPUContext>(const int count,
const float sigma2,
const float* x,
float* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
const float val = x[i];
const float abs_val = abs(val);
......@@ -690,6 +717,9 @@ template<> void SmoothL1Grad<float, CPUContext>(const int count,
const float sigma2,
const float* dy,
float* dx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
const float val = dy[i];
const float abs_val = abs(val);
......@@ -705,6 +735,9 @@ template <> void SoftmaxCrossEntropy<float, CPUContext>(const int count,
const float* prob,
const float* target,
float* loss) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
loss[i] = - target[i] * std::log(std::max(prob[i], FLT_MIN));
}
......@@ -1016,9 +1049,12 @@ template <> void RMSPropUpdate<float, CPUContext>(const int count,
/******************** utils.compare ********************/
template <> void Equal<float, CPUContext>(const int count,
const float* a,
const float* b,
float* y) {
const float* a,
const float* b,
float* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i)
y[i] = fabs(a[i] - b[i]) < FLT_EPSILON ? 1.0 : 0.0;
}
......@@ -1096,6 +1132,9 @@ template <> void OneHot<float, CPUContext>(const int count,
const int on_value,
const float* x,
float* y) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
const int val = x[i];
y[i * depth + val] = on_value;
......
......@@ -21,7 +21,7 @@ template<> void Empty<float, CUDAContext>() {
}
template<> void Empty<float16, CUDAContext>() {
_Empty<float> << <1, 1 >> >();
_Empty<float16> << <1, 1 >> >();
CUDA_POST_KERNEL_CHECK;
}
......@@ -102,6 +102,7 @@ template<> void Relu<float, CUDAContext>(const int count,
CUDA_POST_KERNEL_CHECK;
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _ReluHalf(const int count, const half* x, const float slope, half* y) {
const half kSlope = __float2half(slope);
......@@ -123,6 +124,7 @@ template<> void Relu<float16, CUDAContext>(const int count,
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
}
#endif
template <typename T>
__global__ void _ReluGrad(const int count,
......@@ -477,6 +479,7 @@ template<> void Scale<float, CUDAContext>(const int axis,
Ydata);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _ScaleWithoutBiasHalf(const int n,
const half* x,
......@@ -538,6 +541,7 @@ template<> void Scale<float16, CUDAContext>(const int axis,
inner_dim,
reinterpret_cast<half*>(Ydata));
}
#endif
template <> void ScaleGrad<float, CUDAContext>(const int axis,
Tensor* dy,
......@@ -730,6 +734,7 @@ template <> void Concat<float, CUDAContext>(const int count,
CUDA_POST_KERNEL_CHECK;
}
#ifdef WITH_CUDA_FP16
template <> void Concat<float16, CUDAContext>(const int count,
const int outer_dim,
const int inner_dim,
......@@ -749,6 +754,7 @@ template <> void Concat<float16, CUDAContext>(const int count,
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
}
#endif
template <typename T>
__global__ void _ConcatGrad(const int count,
......@@ -789,6 +795,7 @@ template <> void ConcatGrad<float, CUDAContext>(const int count,
CUDA_POST_KERNEL_CHECK;
}
#ifdef WITH_CUDA_FP16
template <> void ConcatGrad<float16, CUDAContext>(const int count,
const int outer_dim,
const int inner_dim,
......@@ -808,6 +815,7 @@ template <> void ConcatGrad<float16, CUDAContext>(const int count,
reinterpret_cast<half*>(dx));
CUDA_POST_KERNEL_CHECK;
}
#endif
/******************** common.crop ********************/
......@@ -1134,6 +1142,7 @@ template <> void Transpose<float, CUDAContext>(const int count,
CUDA_POST_KERNEL_CHECK;
}
#ifdef WITH_CUDA_FP16
template <> void Transpose<float16, CUDAContext>(const int count,
const int ndim,
const int* order,
......@@ -1150,6 +1159,7 @@ template <> void Transpose<float16, CUDAContext>(const int count,
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
}
#endif
template <typename T>
__global__ void _TransposeGrad(const int count,
......@@ -1187,6 +1197,7 @@ template <> void TransposeGrad<float, CUDAContext>(const int count,
CUDA_POST_KERNEL_CHECK;
}
#ifdef WITH_CUDA_FP16
template <> void TransposeGrad<float16, CUDAContext>(const int count,
const int ndim,
const int* order,
......@@ -1203,6 +1214,7 @@ template <> void TransposeGrad<float16, CUDAContext>(const int count,
reinterpret_cast<half*>(dx));
CUDA_POST_KERNEL_CHECK;
}
#endif
/******************** loss.l1_loss ********************/
......@@ -1834,6 +1846,7 @@ template <> void RMSPropUpdate<float, CUDAContext>(const int count,
/******************** utils.cast ********************/
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _FloatToHalfKernel(const int count, const float* x, half* y) {
CUDA_KERNEL_LOOP(idx, count) {
......@@ -1849,6 +1862,7 @@ template <> void Float2Half<float, CUDAContext>(const int count,
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
}
#endif
/******************** utils.compare ********************/
......@@ -1943,6 +1957,7 @@ template <> void MemoryData<uint8_t, float, CUDAContext>(const int count,
CUDA_POST_KERNEL_CHECK;
}
#ifdef WITH_CUDA_FP16
template <> void MemoryData<float, float16, CUDAContext>(const int count,
const int num,
const int channels,
......@@ -1976,6 +1991,7 @@ template <> void MemoryData<uint8_t, float16, CUDAContext>(const int count,
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
}
#endif
/******************** utils.one_hot ********************/
......
......@@ -3,164 +3,223 @@
#include <cmath>
#include <algorithm>
#include "utils/omp_alternative.h"
#include "utils/sse_alternative.h"
namespace dragon {
namespace sse {
template<> void Set(const int n, const float alpha, float* x) {
__m128 scalar = SSE_FP32_SCALAR(alpha);
SSE_LOOP1(i, n) SSE_FP32_STORE(x + i, scalar);
SSE_LOOP2(i, n) x[i] = alpha;
template<> void Set(const int n, const float alpha, float* x) {
__m128 scalar = SSE_FP32_SCALAR(alpha);
int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
SSE_LOOP1(i, n) SSE_FP32_STORE(x + i, scalar);
SSE_LOOP2(i, n) x[i] = alpha;
}
template<> void Set(const int n, const int alpha, int* x) {
__m128i scalar = SSE_INT32_SCALAR(alpha);
__m128i* x1 = reinterpret_cast<__m128i*>(x);
int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
SSE_LOOP1(i, n) SSE_INT128_STORE(x1++, scalar);
SSE_LOOP2(i, n) x[i] = alpha;
}
template<> void Add(const int n, const float* a, const float* b, float* y) {
__m128 x1, y1, z1;
int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(a + i);
y1 = SSE_FP32_LOAD(b + i);
z1 = SSE_FP32_ADD(x1, y1);
SSE_FP32_STORE(y + i, z1);
}
template<> void Set(const int n, const int alpha, int* x) {
__m128i scalar = SSE_INT32_SCALAR(alpha);
__m128i* x1 = reinterpret_cast<__m128i*>(x);
SSE_LOOP1(i, n) SSE_INT128_STORE(x1++, scalar);
SSE_LOOP2(i, n) x[i] = alpha;
}
template<> void Add(const int n, const float* a, const float* b, float* y) {
__m128 x1, y1, z1;
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(a + i);
y1 = SSE_FP32_LOAD(b + i);
z1 = SSE_FP32_ADD(x1, y1);
SSE_FP32_STORE(y + i, z1);
}
SSE_LOOP2(i, n) y[i] = a[i] + b[i];
}
template<> void Sub(const int n, const float* a, const float* b, float* y) {
__m128 x1, y1, z1;
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(a + i);
y1 = SSE_FP32_LOAD(b + i);
z1 = SSE_FP32_SUB(x1, y1);
SSE_FP32_STORE(y + i, z1);
}
SSE_LOOP2(i, n) y[i] = a[i] - b[i];
}
template<> void Mul(const int n, const float* a, const float* b, float* y) {
__m128 x1, y1, z1;
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(a + i);
y1 = SSE_FP32_LOAD(b + i);
z1 = SSE_FP32_MUL(x1, y1);
SSE_FP32_STORE(y + i, z1);
}
SSE_LOOP2(i, n) y[i] = a[i] * b[i];
SSE_LOOP2(i, n) y[i] = a[i] + b[i];
}
template<> void Sub(const int n, const float* a, const float* b, float* y) {
__m128 x1, y1, z1;
int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(a + i);
y1 = SSE_FP32_LOAD(b + i);
z1 = SSE_FP32_SUB(x1, y1);
SSE_FP32_STORE(y + i, z1);
}
template<> void Div(const int n, const float* a, const float* b, float* y) {
__m128 x1, y1, z1;
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(a + i);
y1 = SSE_FP32_LOAD(b + i);
z1 = SSE_FP32_DIV(x1, y1);
SSE_FP32_STORE(y + i, z1);
}
SSE_LOOP2(i, n) y[i] = a[i] / b[i];
SSE_LOOP2(i, n) y[i] = a[i] - b[i];
}
template<> void Mul(const int n, const float* a, const float* b, float* y) {
__m128 x1, y1, z1;
int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(a + i);
y1 = SSE_FP32_LOAD(b + i);
z1 = SSE_FP32_MUL(x1, y1);
SSE_FP32_STORE(y + i, z1);
}
template<> void Scal(const int n, const float alpha, float* y) {
__m128 y1, scalar = SSE_FP32_SCALAR(alpha);
SSE_LOOP1(i, n) {
y1 = SSE_FP32_LOAD(y + i);
y1 = SSE_FP32_MUL(y1, scalar);
SSE_FP32_STORE(y + i, y1);
}
SSE_LOOP2(i, n) y[i] *= alpha;
SSE_LOOP2(i, n) y[i] = a[i] * b[i];
}
template<> void Div(const int n, const float* a, const float* b, float* y) {
__m128 x1, y1, z1;
int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(a + i);
y1 = SSE_FP32_LOAD(b + i);
z1 = SSE_FP32_DIV(x1, y1);
SSE_FP32_STORE(y + i, z1);
}
template<> void Scale(const int n, const float alpha, const float* x, float* y) {
__m128 x1, scalar = SSE_FP32_SCALAR(alpha);
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(x + i);
x1 = SSE_FP32_MUL(x1, scalar);
SSE_FP32_STORE(y + i, x1);
}
SSE_LOOP2(i, n) y[i] = x[i] * alpha;
SSE_LOOP2(i, n) y[i] = a[i] / b[i];
}
template<> void Scal(const int n, const float alpha, float* y) {
__m128 y1, scalar = SSE_FP32_SCALAR(alpha);
int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
SSE_LOOP1(i, n) {
y1 = SSE_FP32_LOAD(y + i);
y1 = SSE_FP32_MUL(y1, scalar);
SSE_FP32_STORE(y + i, y1);
}
template<> void Axpy(const int n, float alpha, const float* x, float *y) {
__m128 x1, y1, scalar = SSE_FP32_SCALAR(alpha);
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(x + i);
y1 = SSE_FP32_LOAD(y + i);
x1 = SSE_FP32_MUL(x1, scalar);
y1 = SSE_FP32_ADD(x1, y1);
SSE_FP32_STORE(y + i, y1);
}
SSE_LOOP2(i, n) y[i] = alpha * x[i] + y[i];
SSE_LOOP2(i, n) y[i] *= alpha;
}
template<> void Scale(const int n, const float alpha, const float* x, float* y) {
__m128 x1, scalar = SSE_FP32_SCALAR(alpha);
int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(x + i);
x1 = SSE_FP32_MUL(x1, scalar);
SSE_FP32_STORE(y + i, x1);
}
template<> void Axpby(const int n, float alpha, const float* x,
const float beta, float *y) {
__m128 x1, y1, z1;
__m128 scalar1 = SSE_FP32_SCALAR(alpha);
__m128 scalar2 = SSE_FP32_SCALAR(beta);
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(x + i);
y1 = SSE_FP32_LOAD(y + i);
x1 = SSE_FP32_MUL(x1, scalar1);
y1 = SSE_FP32_MUL(y1, scalar2);
z1 = SSE_FP32_ADD(x1, y1);
SSE_FP32_STORE(y + i, z1);
}
SSE_LOOP2(i, n) y[i] = alpha * x[i] + beta* y[i];
SSE_LOOP2(i, n) y[i] = x[i] * alpha;
}
template<> void Axpy(const int n, float alpha, const float* x, float *y) {
__m128 x1, y1, scalar = SSE_FP32_SCALAR(alpha);
int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(x + i);
y1 = SSE_FP32_LOAD(y + i);
x1 = SSE_FP32_MUL(x1, scalar);
y1 = SSE_FP32_ADD(x1, y1);
SSE_FP32_STORE(y + i, y1);
}
template<> float ASum(const int n, const float *x) {
__m128 x1, sum = SSE_FP32_ZERO;
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(x + i);
sum = SSE_FP32_ADD(sum, x1);
}
float buf[4];
SSE_FP32_STORE(buf, sum);
float ret = buf[0] + buf[1] + buf[2] + buf[3];
SSE_LOOP2(i, n) ret += x[i];
return ret;
SSE_LOOP2(i, n) y[i] = alpha * x[i] + y[i];
}
template<> void Axpby(const int n,
float alpha,
const float* x,
const float beta,
float *y) {
__m128 x1, y1, z1;
__m128 scalar1 = SSE_FP32_SCALAR(alpha);
__m128 scalar2 = SSE_FP32_SCALAR(beta);
int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(x + i);
y1 = SSE_FP32_LOAD(y + i);
x1 = SSE_FP32_MUL(x1, scalar1);
y1 = SSE_FP32_MUL(y1, scalar2);
z1 = SSE_FP32_ADD(x1, y1);
SSE_FP32_STORE(y + i, z1);
}
template<> void AddScalar(const int n, const float alpha, float* y) {
__m128 y1, scalar = SSE_FP32_SCALAR(alpha);
SSE_LOOP1(i, n) {
y1 = SSE_FP32_LOAD(y + i);
y1 = SSE_FP32_ADD(y1, scalar);
SSE_FP32_STORE(y + i, y1);
}
SSE_LOOP2(i, n) y[i] += alpha;
SSE_LOOP2(i, n) y[i] = alpha * x[i] + beta* y[i];
}
template<> float ASum(const int n, const float *x) {
__m128 x1, sum = SSE_FP32_ZERO;
int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(x + i);
sum = SSE_FP32_ADD(sum, x1);
}
template<> void MulScalar(const int n, const float alpha, float* y) {
__m128 y1, scalar = SSE_FP32_SCALAR(alpha);
SSE_LOOP1(i, n) {
y1 = SSE_FP32_LOAD(y + i);
y1 = SSE_FP32_MUL(y1, scalar);
SSE_FP32_STORE(y + i, y1);
}
SSE_LOOP2(i, n) y[i] *= alpha;
float buf[4];
SSE_FP32_STORE(buf, sum);
float ret = buf[0] + buf[1] + buf[2] + buf[3];
SSE_LOOP2(i, n) ret += x[i];
return ret;
}
template<> void AddScalar(const int n, const float alpha, float* y) {
__m128 y1, scalar = SSE_FP32_SCALAR(alpha);
int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
SSE_LOOP1(i, n) {
y1 = SSE_FP32_LOAD(y + i);
y1 = SSE_FP32_ADD(y1, scalar);
SSE_FP32_STORE(y + i, y1);
}
SSE_LOOP2(i, n) y[i] += alpha;
}
template<> void MulScalar(const int n, const float alpha, float* y) {
__m128 y1, scalar = SSE_FP32_SCALAR(alpha);
int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
SSE_LOOP1(i, n) {
y1 = SSE_FP32_LOAD(y + i);
y1 = SSE_FP32_MUL(y1, scalar);
SSE_FP32_STORE(y + i, y1);
}
template <> float Dot(const int n, const float* a, const float* b) {
__m128 x1, y1, sum = SSE_FP32_ZERO;
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(a + i);
y1 = SSE_FP32_LOAD(b + i);
sum = SSE_FP32_ADD(sum, SSE_FP32_MUL(x1, y1));
}
float buf[4];
SSE_FP32_STORE(buf, sum);
float ret = buf[0] + buf[1] + buf[2] + buf[3];
SSE_LOOP2(i, n) ret += a[i] * b[i];
return ret;
SSE_LOOP2(i, n) y[i] *= alpha;
}
template <> float Dot(const int n, const float* a, const float* b) {
__m128 x1, y1, sum = SSE_FP32_ZERO;
int32_t i = 0;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(a + i);
y1 = SSE_FP32_LOAD(b + i);
sum = SSE_FP32_ADD(sum, SSE_FP32_MUL(x1, y1));
}
float buf[4];
SSE_FP32_STORE(buf, sum);
float ret = buf[0] + buf[1] + buf[2] + buf[3];
SSE_LOOP2(i, n) ret += a[i] * b[i];
return ret;
}
}    // namespace sse
......
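When the OpenMP pragma is combined with the SSE body as above, the loop index of a parallel for is per-thread inside the region, so one way to keep the scalar tail well defined is to compute the tail's starting index explicitly. An illustrative sketch under that assumption, using plain SSE intrinsics rather than the framework's macros:

#include <xmmintrin.h>
#ifdef WITH_OMP
#include <omp.h>
#endif

void mul_scalar(const int n, const float alpha, float* y) {
    const __m128 scalar = _mm_set1_ps(alpha);
    const int vec_n = (n / 4) * 4;  // largest multiple of 4 not exceeding n
#ifdef WITH_OMP
    #pragma omp parallel for
#endif
    for (int i = 0; i < vec_n; i += 4) {
        const __m128 y1 = _mm_loadu_ps(y + i);
        _mm_storeu_ps(y + i, _mm_mul_ps(y1, scalar));
    }
    for (int i = vec_n; i < n; ++i) y[i] *= alpha;  // scalar tail, always serial
}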
# Dragon: A Computation Graph Virtual Machine Based Deep Learning Framework
![](http://images.cnblogs.com/cnblogs_com/neopenx/690760/o_dragon_logo.png)
-----
### Compile Requirements for C++
0. Google Protocol Buffer
......