Commit c40eaf7b by Ting PAN

Add support for building with Ampere GPUs, CUDA 11 and cuDNN 8

Summary:
This commit fixes the issues with building against CUDA 11 and cuDNN 8.
In addition, C++14 is now enabled by default instead of C++11 to support CUB 1.9+;
for this reason, the compiler is required to be gcc5/clang6/msvc141 or higher.
1 parent d8f612c8
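
Before the diff: the toolchain requirement stated above can be checked mechanically. The guard below is an illustrative sketch only (not part of this commit), using the standard `__cplusplus`/`_MSVC_LANG` macros:

```cpp
// Illustrative only, not from this commit: fail fast if the compiler is not
// in C++14 mode. GCC 5+, Clang 6+ and MSVC 14.1+ can all satisfy this.
#if defined(_MSVC_LANG)
// MSVC keeps __cplusplus at 199711L unless /Zc:__cplusplus is given,
// so _MSVC_LANG is the reliable macro there.
#if _MSVC_LANG < 201402L
#error "A C++14 compiler (msvc141 or higher) is required."
#endif
#elif __cplusplus < 201402L
#error "A C++14 compiler (gcc5/clang6 or higher) is required."
#endif
```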
@@ -20,6 +20,8 @@ if (USE_CUDA)
   if (MSVC)
     # Suppress all warnings for msvc compiler
     set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -w")
+  else()
+    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++14")
   endif()
 endif()
 if (USE_TENSORRT)
......
@@ -49,11 +49,6 @@ foreach(_proto ${ARGN})
     -I=${_proto_dir}
     --cpp_out=${PROTOBUF_DLLEXPORT_STRING}${_proto_dir}
     ${_proto})
-  if (MSVC)
-    string(REPLACE ".proto" ".pb.h" _pb_h "${_proto}")
-    string(REPLACE ".proto" ".pb.cc" _pb_cc "${_proto}")
-    protobuf_remove_constexpr(${_pb_h} ${_pb_cc})
-  endif()
 endforeach()
 endfunction()
@@ -69,11 +64,6 @@ foreach(_proto ${ARGN})
     -I=${_proto_dir}
     --cpp_out=${PROTOBUF_DLLEXPORT_STRING}${_proto_dir}
     ${_proto})
-  if (MSVC)
-    string(REPLACE ".proto" ".pb.h" _pb_h "${_proto}")
-    string(REPLACE ".proto" ".pb.cc" _pb_cc "${_proto}")
-    protobuf_remove_constexpr(${_pb_h} ${_pb_cc})
-  endif()
 endforeach()
 endfunction()
......
 include(CheckCXXCompilerFlag)
-# ---[ Check if CXX11 is supported
-set(CMAKE_CXX_STANDARD 11)
+# ---[ Check if CXX14 is supported
+set(CMAKE_CXX_STANDARD 14)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 # ---[ Use ``-fPIC`` for all compilers
@@ -30,7 +30,7 @@ if (MSVC)
   endif()
 else() # GNU, Clang, AppleClang
   set(CMAKE_ORIGIN)
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -std=c++11")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -std=c++14")
   if (USE_NATIVE_ARCH)
     check_cxx_compiler_flag("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE)
     if (COMPILER_SUPPORTS_MARCH_NATIVE)
......
@@ -5,9 +5,9 @@
 # - "Auto" detects local machine GPU compute arch at runtime.
 # - "Common" and "All" cover common and entire subsets of architectures
 # ARCH_AND_PTX : NAME | NUM.NUM | NUM.NUM(NUM.NUM) | NUM.NUM+PTX
-# NAME: Kepler Maxwell Kepler+Tesla Maxwell+Tegra Pascal Volta Turing
+# NAME: Kepler Maxwell Kepler+Tesla Maxwell+Tegra Pascal Volta Turing Ampere
 # NUM: Any number. Only those pairs are currently accepted by NVCC though:
-#   3.5 3.7 5.0 5.2 5.3 6.0 6.1 6.2 7.0 7.2 7.5
+#   3.5 3.7 5.0 5.2 5.3 6.0 6.1 6.2 7.0 7.2 7.5 8.0
 # Returns LIST of flags to be added to CUDA_NVCC_FLAGS in ${out_variable}
 # Additionally, sets ${out_variable}_readable to the resulting numeric list
 # Example:
@@ -55,27 +55,39 @@ if(CUDA_VERSION VERSION_GREATER "7.5")
   list(APPEND CUDA_ALL_GPU_ARCHITECTURES "6.0" "6.1" "6.2")
   if(CUDA_VERSION VERSION_LESS "9.0")
-    list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "6.1+PTX")
+    list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "6.2+PTX")
     set(CUDA_LIMIT_GPU_ARCHITECTURE "7.0")
   endif()
 endif ()
 if(CUDA_VERSION VERSION_GREATER "8.5")
   list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Volta")
-  list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "7.0" "7.0+PTX")
-  list(APPEND CUDA_ALL_GPU_ARCHITECTURES "7.0" "7.0+PTX" "7.2" "7.2+PTX")
+  list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "7.0")
+  list(APPEND CUDA_ALL_GPU_ARCHITECTURES "7.0" "7.2")
   if(CUDA_VERSION VERSION_LESS "10.0")
+    list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "7.2+PTX")
     set(CUDA_LIMIT_GPU_ARCHITECTURE "8.0")
   endif()
 endif()
 if(CUDA_VERSION VERSION_GREATER "9.5")
   list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Turing")
-  list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "7.5" "7.5+PTX")
-  list(APPEND CUDA_ALL_GPU_ARCHITECTURES "7.5" "7.5+PTX")
+  list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "7.5")
+  list(APPEND CUDA_ALL_GPU_ARCHITECTURES "7.5")
   if(CUDA_VERSION VERSION_LESS "11.0")
+    list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "7.5+PTX")
+    set(CUDA_LIMIT_GPU_ARCHITECTURE "8.0")
+  endif()
+endif()
+if(CUDA_VERSION VERSION_GREATER "10.5")
+  list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Ampere")
+  list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.0" "8.0+PTX")
+  list(APPEND CUDA_ALL_GPU_ARCHITECTURES "8.0")
+  if(CUDA_VERSION VERSION_LESS "12.0")
     set(CUDA_LIMIT_GPU_ARCHITECTURE "9.0")
   endif()
 endif()
@@ -211,6 +223,9 @@ function(CUDA_SELECT_NVCC_ARCH_FLAGS out_variable)
   elseif(${arch_name} STREQUAL "Turing")
     set(arch_bin 7.5)
     set(arch_ptx 7.5)
+  elseif(${arch_name} STREQUAL "Ampere")
+    set(arch_bin 8.0)
+    set(arch_ptx 8.0)
   else()
     message(SEND_ERROR "Unknown CUDA Architecture Name ${arch_name} in CUDA_SELECT_NVCC_ARCH_FLAGS")
   endif()
......
@@ -9,10 +9,9 @@ std::mutex& CUDAContext::mutex() {
   return m;
 }
-CUDAObject* CUDAContext::object() {
-  static TLS_OBJECT CUDAObject* cuda_object_;
-  if (!cuda_object_) cuda_object_ = new CUDAObject();
-  return cuda_object_;
+CUDAObjects& CUDAContext::objects() {
+  static thread_local CUDAObjects cuda_objects_;
+  return cuda_objects_;
 }
 #endif // USE_CUDA
......
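The hunk above replaces the hand-rolled `TLS_OBJECT` pointer (the macro itself is deleted near the end of this commit) with a function-local `static thread_local` object, which all required toolchains support and which is destroyed automatically at thread exit instead of being leaked. A minimal sketch of the pattern, with a hypothetical `Resources` type standing in for `CUDAObjects`:

```cpp
#include <thread>

struct Resources {};  // hypothetical stand-in for CUDAObjects

// Each calling thread lazily constructs its own instance; unlike the old
// `new CUDAObject()` path, it is destructed when the thread exits.
Resources& resources() {
  static thread_local Resources instance;
  return instance;
}

int main() {
  std::thread worker([] { resources(); });  // a distinct per-thread instance
  resources();                              // the main thread's instance
  worker.join();
  return 0;
}
```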
@@ -21,10 +21,10 @@ namespace dragon {
 #ifdef USE_CUDA
-class CUDAObject {
+class CUDAObjects {
  public:
   /*! \brief Default Constructor */
-  CUDAObject() {
+  CUDAObjects() {
     for (int i = 0; i < CUDA_MAX_DEVICES; i++) {
       cuda_streams_[i] = vector<cudaStream_t>();
       cublas_handles_[i] = vector<cublasHandle_t>();
@@ -38,7 +38,7 @@ class CUDAObject {
   }
   /*! \brief Destructor */
-  ~CUDAObject() {
+  ~CUDAObjects() {
     for (int i = 0; i < CUDA_MAX_DEVICES; i++) {
       for (int j = 0; j < cuda_streams_[i].size(); j++) {
         auto& stream = cuda_streams_[i][j];
@@ -164,7 +164,7 @@ class CUDAObject {
   bool cudnn_benchmark_ = false;
  private:
-  DISABLE_COPY_AND_ASSIGN(CUDAObject);
+  DISABLE_COPY_AND_ASSIGN(CUDAObjects);
 };
 /*!
@@ -197,7 +197,7 @@ class DRAGON_API CUDAContext {
   /*! \brief Set a memory block to the given value */
   static void Memset(size_t n, void* ptr, int value = 0) {
-    auto stream = object()->default_stream();
+    auto stream = objects().default_stream();
     CUDA_CHECK(cudaMemsetAsync(ptr, value, n, stream));
     SynchronizeStream(stream);
   }
@@ -216,7 +216,7 @@ class DRAGON_API CUDAContext {
   /*! \brief Copy a memory block to the destination using given device */
   template <class DestContext, class SrcContext>
   static void Memcpy(size_t n, void* dest, const void* src, int device) {
-    auto stream = object()->default_stream(device);
+    auto stream = objects().default_stream(device);
     CUDA_CHECK(cudaMemcpyAsync(dest, src, n, cudaMemcpyDefault, stream));
     SynchronizeStream(stream);
   }
@@ -269,12 +269,12 @@ class DRAGON_API CUDAContext {
   /*! \brief Return the specified cuda stream */
   cudaStream_t cuda_stream(int device, int stream) {
-    return object()->stream(device, stream);
+    return objects().stream(device, stream);
   }
   /*! \brief Return the cublas handle */
   cublasHandle_t cublas_handle() {
-    return object()->cublas_handle(device_id_, stream_id_);
+    return objects().cublas_handle(device_id_, stream_id_);
   }
   /*! \brief Return the curand generator */
@@ -293,7 +293,7 @@ class DRAGON_API CUDAContext {
   /*! \brief Return the cudnn handle */
 #ifdef USE_CUDNN
   cudnnHandle_t cudnn_handle() {
-    return object()->cudnn_handle(device_id_, stream_id_);
+    return objects().cudnn_handle(device_id_, stream_id_);
   }
 #endif
@@ -315,8 +315,8 @@ class DRAGON_API CUDAContext {
   /*! \brief Return the shared context mutex */
   static std::mutex& mutex();
-  /*! \brief Return the thread-local cuda object */
-  static CUDAObject* object();
+  /*! \brief Return the thread-local cuda objects */
+  static CUDAObjects& objects();
   /*! \brief Return the random generator */
   std::mt19937* rand_generator() {
......
@@ -158,7 +158,7 @@ TryCreateOperator(const string& key, const OperatorDef& def, Workspace* ws) {
     case PROTO_CUDA:
 #ifdef USE_CUDNN
       if (CUDNNOperatorRegistry()->Has(key) &&
-          CUDAContext::object()->cudnn_enabled_) {
+          CUDAContext::objects().cudnn_enabled_) {
         return CUDNNOperatorRegistry()->Create(key, def, ws);
       }
 #endif
......
@@ -98,9 +98,9 @@ void RegisterModule(py::module& m) {
   /*! \brief Activate the CuDNN engine */
   m.def("cudaEnableDNN", [](bool enabled, bool benchmark) {
 #ifdef USE_CUDA
-    auto* cuda_object = CUDAContext::object();
-    cuda_object->cudnn_enabled_ = enabled;
-    cuda_object->cudnn_benchmark_ = benchmark;
+    auto& cuda_objects = CUDAContext::objects();
+    cuda_objects.cudnn_enabled_ = enabled;
+    cuda_objects.cudnn_benchmark_ = benchmark;
 #endif
   });
@@ -129,7 +129,7 @@ void RegisterModule(py::module& m) {
   m.def("cudaStreamSynchronize", [](int device_id, int stream_id) {
 #ifdef USE_CUDA
     if (device_id < 0) device_id = CUDAContext::current_device();
-    auto stream = CUDAContext::object()->stream(device_id, stream_id);
+    auto stream = CUDAContext::objects().stream(device_id, stream_id);
     CUDAContext::SynchronizeStream(stream);
 #endif
   });
......
@@ -9,7 +9,6 @@ template <typename T>
 void CuDNNReluOp<Context>::DoRunWithType() {
   auto &X = Input(0), *Y = Output(0, {0});
   CuDNNSetTensorDesc<T>(&input_desc_, X.dims());
-#if CUDNN_VERSION_MIN(5, 0, 0)
   CUDNN_CHECK(cudnnActivationForward(
       ctx()->cudnn_handle(),
       act_desc_,
@@ -19,17 +18,6 @@ void CuDNNReluOp<Context>::DoRunWithType() {
       CuDNNType<T>::zero,
       input_desc_,
       Y->ReshapeLike(X)->template mutable_data<T, Context>()));
-#else
-  CUDNN_CHECK(cudnnActivationForward_v4(
-      ctx()->cudnn_handle(),
-      act_desc_,
-      CuDNNType<Dtype>::one,
-      input_desc_,
-      X.template data<T, Context>(),
-      CuDNNType<Dtype>::zero,
-      input_desc_,
-      Y->ReshapeLike(X)->template mutable_data<T, Context>()));
-#endif
 }
 template <class Context>
@@ -46,7 +34,6 @@ template <typename T>
 void CuDNNReluGradientOp<Context>::DoRunWithType() {
   auto &Y = Input(0), &dY = Input(1), *dX = Output(0);
   CuDNNSetTensorDesc<T>(&input_desc_, Y.dims());
-#if CUDNN_VERSION_MIN(5, 0, 0)
   CUDNN_CHECK(cudnnActivationBackward(
       ctx()->cudnn_handle(),
       act_desc_,
@@ -60,21 +47,6 @@ void CuDNNReluGradientOp<Context>::DoRunWithType() {
       CuDNNType<T>::zero,
       input_desc_,
       dX->ReshapeLike(Y)->template mutable_data<T, Context>()));
-#else
-  CUDNN_CHECK(cudnnActivationBackward_v4(
-      ctx()->cudnn_handle(),
-      act_desc_,
-      CuDNNType<T>::one,
-      input_desc_,
-      Y.template data<T, Context>(),
-      input_desc_,
-      dY.template data<T, Context>(),
-      input_desc_,
-      Y.template data<T, Context>(),
-      CuDNNType<T>::zero,
-      input_desc_,
-      dX->ReshapeLike(Y)->template mutable_data<T, Context>()));
-#endif
 }
 template <class Context>
......
@@ -9,7 +9,6 @@ template <typename T>
 void CuDNNSigmoidOp<Context>::DoRunWithType() {
   auto &X = Input(0), *Y = Output(0, {0});
   CuDNNSetTensorDesc<T>(&input_desc_, X.dims());
-#if CUDNN_VERSION_MIN(5, 0, 0)
   CUDNN_CHECK(cudnnActivationForward(
       ctx()->cudnn_handle(),
       act_desc_,
@@ -19,17 +18,6 @@ void CuDNNSigmoidOp<Context>::DoRunWithType() {
       CuDNNType<T>::zero,
       input_desc_,
       Y->ReshapeLike(X)->template mutable_data<T, Context>()));
-#else
-  CUDNN_CHECK(cudnnActivationForward_v4(
-      ctx()->cudnn_handle(),
-      act_desc_,
-      CuDNNType<T>::one,
-      input_desc_,
-      X.template data<T, Context>(),
-      CuDNNType<T>::zero,
-      input_desc_,
-      Y->ReshapeLike(X)->template mutable_data<T, Context>()));
-#endif
 }
 template <class Context>
@@ -42,7 +30,6 @@ template <typename T>
 void CuDNNSigmoidGradientOp<Context>::DoRunWithType() {
   auto &Y = Input(0), &dY = Input(1), *dX = Output(0);
   CuDNNSetTensorDesc<T>(&input_desc_, Y.dims());
-#if CUDNN_VERSION_MIN(5, 0, 0)
   CUDNN_CHECK(cudnnActivationBackward(
       ctx()->cudnn_handle(),
       act_desc_,
@@ -56,21 +43,6 @@ void CuDNNSigmoidGradientOp<Context>::DoRunWithType() {
       CuDNNType<T>::zero,
       input_desc_,
       dX->ReshapeLike(Y)->template mutable_data<T, Context>()));
-#else
-  CUDNN_CHECK(cudnnActivationBackward_v4(
-      ctx()->cudnn_handle(),
-      act_desc_,
-      CuDNNType<T>::one,
-      input_desc_,
-      Y.template data<T, Context>(),
-      input_desc_,
-      dY.template data<T, Context>(),
-      input_desc_,
-      y,
-      CuDNNType<T>::zero,
-      input_desc_,
-      dX->ReshapeLike(Y)->template mutable_data<T, Context>()));
-#endif
 }
 template <class Context>
......
@@ -9,7 +9,6 @@ template <typename T>
 void CuDNNTanhOp<Context>::DoRunWithType() {
   auto &X = Input(0), *Y = Output(0, {0});
   CuDNNSetTensorDesc<T>(&input_desc_, X.dims());
-#if CUDNN_VERSION_MIN(5, 0, 0)
   CUDNN_CHECK(cudnnActivationForward(
       ctx()->cudnn_handle(),
       act_desc_,
@@ -19,17 +18,6 @@ void CuDNNTanhOp<Context>::DoRunWithType() {
       CuDNNType<T>::zero,
       input_desc_,
       Y->ReshapeLike(X)->template mutable_data<T, Context>()));
-#else
-  CUDNN_CHECK(cudnnActivationForward_v4(
-      ctx()->cudnn_handle(),
-      act_desc_,
-      CuDNNType<T>::one,
-      input_desc_,
-      X.template data<T, Context>(),
-      CuDNNType<T>::zero,
-      output_desc_,
-      Y->ReshapeLike(X)->template mutable_data<T, Context>()));
-#endif
 }
 template <class Context>
@@ -42,7 +30,6 @@ template <typename T>
 void CuDNNTanhGradientOp<Context>::DoRunWithType() {
   auto &Y = Input(0), &dY = Input(1), *dX = Output(0);
   CuDNNSetTensorDesc<T>(&input_desc_, Y.dims());
-#if CUDNN_VERSION_MIN(5, 0, 0)
   CUDNN_CHECK(cudnnActivationBackward(
       ctx()->cudnn_handle(),
       act_desc_,
@@ -56,21 +43,6 @@ void CuDNNTanhGradientOp<Context>::DoRunWithType() {
       CuDNNType<T>::zero,
       input_desc_,
       dX->ReshapeLike(Y)->template mutable_data<T, Context>()));
-#else
-  CUDNN_CHECK(cudnnActivationBackward_v4(
-      ctx()->cudnn_handle(),
-      act_desc_,
-      CuDNNType<T>::one,
-      input_desc_,
-      Y.template data<T, Context>(),
-      input_desc_,
-      dY.template data<T, Context>(),
-      input_desc_,
-      Y.template data<T, Context>(),
-      CuDNNType<T>::zero,
-      input_desc_,
-      dX->ReshapeLike(Y)->template mutable_data<T, Context>()));
-#endif
 }
 template <class Context>
......
@@ -148,7 +148,7 @@ class CollectiveOpBase : public Operator<Context> {
   }
   ncclComm_t nccl_comm() {
-    auto ret = CUDAContext::object()->nccl_comm(
+    auto ret = CUDAContext::objects().nccl_comm(
         this->ctx()->template device(),
         group_str_,
         nullptr,
@@ -161,7 +161,7 @@ class CollectiveOpBase : public Operator<Context> {
       NCCL_CHECK(ncclGetUniqueId(&comm_uuid));
     }
     Broadcast((uint8_t*)&comm_uuid, sizeof(comm_uuid));
-    ret = CUDAContext::object()->nccl_comm(
+    ret = CUDAContext::objects().nccl_comm(
        this->ctx()->template device(),
        group_str_,
        &comm_uuid,
......
@@ -144,8 +144,6 @@ class SyncBatchNormGradientOp : public BatchNormGradientOp<Context> {
 #ifdef USE_CUDNN
-#if CUDNN_VERSION_MIN(5, 0, 0)
 template <class Context>
 class CuDNNBatchNormOp final : public BatchNormOpBase<Context> {
  public:
@@ -211,8 +209,6 @@ class CuDNNBatchNormGradientOp final : public BatchNormGradientOp<Context> {
   cudnnBatchNormMode_t bn_mode_;
 };
-#endif // CUDNN_VERSION_MIN(5, 0, 0)
 #endif // USE_CUDNN
 } // namespace dragon
......
@@ -4,8 +4,6 @@
 #ifdef USE_CUDNN
-#if CUDNN_VERSION_MIN(5, 0, 0)
 namespace dragon {
 template <class Context>
@@ -171,6 +169,4 @@ DEPLOY_CUDNN(BatchNormGradient);
 } // namespace dragon
-#endif // CUDNN_VERSION_MIN(5, 0, 0)
 #endif // USE_CUDNN
@@ -4,8 +4,6 @@
 #include "dragon/core/workspace.h"
 #include "dragon/utils/filler.h"
-#if CUDNN_VERSION_MIN(5, 0, 0)
 namespace dragon {
 template <class Context>
@@ -56,7 +54,7 @@ void CuDNNRecurrentOpBase<Context>::ResetDesc() {
   // Setup RNN
 #if CUDNN_VERSION_MIN(7, 0, 0)
-  CUDNN_CHECK(cudnnSetRNNDescriptor(
+  CUDNN_CHECK(cudnnSetRNNDescriptor_v6(
       ctx()->cudnn_handle(),
       rnn_desc_,
       hidden_size_,
@@ -323,6 +321,4 @@ DEPLOY_CUDNN(RecurrentGradient);
 } // namespace dragon
-#endif // CUDNN_VERSION_MIN(5, 0, 0)
 #endif // USE_CUDNN
@@ -19,8 +19,6 @@ namespace dragon {
 #ifdef USE_CUDNN
-#if CUDNN_VERSION_MIN(5, 0, 0)
 class CuDNNTensorDescs {
  public:
   CuDNNTensorDescs(int num_descs) {
@@ -174,8 +172,6 @@ class CuDNNRecurrentGradientOp final : public CuDNNRecurrentOpBase<Context> {
   void DoRunWithType();
 };
-#endif // CUDNN_VERSION_MIN(5, 0, 0)
 #endif // USE_CUDNN
 } // namespace dragon
......
@@ -68,7 +68,6 @@ void CuDNNConv2dOp<Context>::ResetDesc() {
   if (filter_changed) {
     // Determine the weight shape
     filter_dims_ = W.dims();
-#if CUDNN_VERSION_MIN(5, 0, 0)
     CUDNN_CHECK(cudnnSetFilter4dDescriptor(
         filter_desc_,
         CuDNNType<T>::type,
@@ -77,16 +76,6 @@ void CuDNNConv2dOp<Context>::ResetDesc() {
         in_channels_ / group_,
         kshape_[0],
         kshape_[1]));
-#else
-    CUDNN_CHECK(cudnnSetFilter4dDescriptor_v4(
-        filter_desc_,
-        CuDNNType<T>::type,
-        format_,
-        out_channels_ / cudnn_group_,
-        in_channels_ / group_,
-        kshape_[0],
-        kshape_[1]));
-#endif
     // Determine the bias shape
     if (HasBias()) {
       CuDNNSetBiasDesc<T>(
@@ -96,9 +85,34 @@ void CuDNNConv2dOp<Context>::ResetDesc() {
     // Set the conv configuration
     SetConvDesc<T>();
     // Get or search the appropriate algorithm
-    if (CUDAContext::object()->cudnn_benchmark_) {
+    if (CUDAContext::objects().cudnn_benchmark_) {
       exhaustive_search_ = true;
     } else {
+#if CUDNN_VERSION_MIN(7, 0, 0)
+      int num_valid_algos;
+      constexpr int num_algos = CUDNN_CONV_NUM_FWD_ALGOS;
+      cudnnConvolutionFwdAlgoPerf_t stats[num_algos];
+      CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm_v7(
+          ctx()->cudnn_handle(),
+          input_desc_,
+          filter_desc_,
+          conv_desc_,
+          output_desc_,
+          num_algos,
+          &num_valid_algos,
+          stats));
+      bool algo_is_found = false;
+      for (int i = 0; i < num_valid_algos; ++i) {
+        if (stats[i].memory <= CUDNN_CONV_WORKSPACE_LIMIT_BYTES) {
+          fwd_algo_ = stats[i].algo;
+          algo_is_found = true;
+          break;
+        }
+      }
+      CHECK(algo_is_found)
+          << "\nNo algorithms available for <cudnnConvolutionForward> "
+          << "under the current desc and workspace limit.";
+#else
       CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
           ctx()->cudnn_handle(),
           input_desc_,
@@ -108,6 +122,7 @@ void CuDNNConv2dOp<Context>::ResetDesc() {
          CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
          CUDNN_CONV_WORKSPACE_LIMIT_BYTES,
          &fwd_algo_));
+#endif // CUDNN_VERSION_MIN(7, 0, 0)
     }
     cudnn_ws_nbytes_ = SIZE_MAX; // Request a new size
   }
@@ -136,8 +151,9 @@ void CuDNNConv2dOp<Context>::DoRunWithType() {
     scratch =
         ws()->template data<Context>({CUDNN_CONV_WORKSPACE_LIMIT_BYTES})[0];
     auto algo = algo_cache_.get(X.dims(), W.dims(), compute_type_, [&]() {
-      int returned_algo_count;
-      std::array<cudnnConvolutionFwdAlgoPerf_t, CUDNN_CONV_NUM_FWD_ALGOS> stat;
+      int num_valid_algos;
+      constexpr int num_algos = CUDNN_CONV_NUM_FWD_ALGOS;
+      cudnnConvolutionFwdAlgoPerf_t stats[num_algos];
       CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithmEx(
           ctx()->cudnn_handle(),
           input_desc_,
@@ -147,12 +163,12 @@ void CuDNNConv2dOp<Context>::DoRunWithType() {
           conv_desc_,
           output_desc_,
           y,
-          CUDNN_CONV_NUM_FWD_ALGOS,
-          &returned_algo_count,
-          stat.data(),
+          num_algos,
+          &num_valid_algos,
+          stats,
           scratch,
           CUDNN_CONV_WORKSPACE_LIMIT_BYTES));
-      return FwdAlgoWithCost(stat[0].algo, stat[0].time);
+      return FwdAlgoWithCost(stats[0].algo, stats[0].time);
     });
     exhaustive_search_ = false;
     fwd_algo_ = std::get<0>(algo);
@@ -273,6 +289,7 @@ template <class Context>
 template <typename T>
 void CuDNNConv2dGradientOp<Context>::ResetDesc() {
   auto &X = Input(0), &W = Input(1), &dY = Input(-1);
+  auto *dX = Output(0), *dW = Output(1);
   bool input_changed = (X.dims() != input_dims_);
   bool filter_changed = (W.dims() != filter_dims_);
   if (input_changed || filter_changed) {
@@ -290,7 +307,6 @@ void CuDNNConv2dGradientOp<Context>::ResetDesc() {
     if (filter_changed) {
       // Determine the weight shape
       filter_dims_ = W.dims();
-#if CUDNN_VERSION_MIN(5, 0, 0)
       CUDNN_CHECK(cudnnSetFilter4dDescriptor(
           filter_desc_,
           CuDNNType<T>::type,
@@ -299,16 +315,6 @@ void CuDNNConv2dGradientOp<Context>::ResetDesc() {
           in_channels_ / group_,
           kshape_[0],
           kshape_[1]));
-#else
-      CUDNN_CHECK(cudnnSetFilter4dDescriptor_v4(
-          filter_desc_,
-          CuDNNType<T>::type,
-          format_,
-          out_channels_ / cudnn_group_,
-          in_channels_ / group_,
-          kshape_[0],
-          kshape_[1]));
-#endif
     // Determine the bias shape
     if (HasBias()) {
       CuDNNSetBiasDesc<T>(
@@ -318,10 +324,36 @@ void CuDNNConv2dGradientOp<Context>::ResetDesc() {
     // Set the conv configuration
     SetConvDesc<T>();
     // Get the appropriate algorithm
-    if (CUDAContext::object()->cudnn_benchmark_) {
+    if (CUDAContext::objects().cudnn_benchmark_) {
       exhaustive_search_data_ = true;
       exhaustive_search_filter_ = true;
     } else {
+      if (dW->has_name()) {
+#if CUDNN_VERSION_MIN(7, 0, 0)
+        int num_valid_algos;
+        constexpr int num_algos = CUDNN_CONV_NUM_BWD_FILTER_ALGOS;
+        cudnnConvolutionBwdFilterAlgoPerf_t stats[num_algos];
+        CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm_v7(
+            ctx()->cudnn_handle(),
+            output_desc_,
+            input_desc_,
+            conv_desc_,
+            filter_desc_,
+            num_algos,
+            &num_valid_algos,
+            stats));
+        bool algo_is_found = false;
+        for (int i = 0; i < num_valid_algos; ++i) {
+          if (stats[i].memory <= CUDNN_CONV_WORKSPACE_LIMIT_BYTES) {
+            bwd_filter_algo_ = stats[i].algo;
+            algo_is_found = true;
+            break;
+          }
+        }
+        CHECK(algo_is_found)
+            << "\nNo algorithms available for <cudnnConvolutionBackwardFilter> "
+            << "under the current desc and workspace limit.";
+#else
        CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
            ctx()->cudnn_handle(),
            output_desc_,
@@ -331,6 +363,34 @@ void CuDNNConv2dGradientOp<Context>::ResetDesc() {
            CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
            CUDNN_CONV_WORKSPACE_LIMIT_BYTES,
            &bwd_filter_algo_));
+#endif // CUDNN_VERSION_MIN(7, 0, 0)
+      }
+      if (dX->has_name()) {
+#if CUDNN_VERSION_MIN(7, 0, 0)
+        int num_valid_algos;
+        constexpr int num_algos = CUDNN_CONV_NUM_BWD_DATA_ALGOS;
+        cudnnConvolutionBwdDataAlgoPerf_t stats[num_algos];
+        CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm_v7(
+            ctx()->cudnn_handle(),
+            filter_desc_,
+            input_desc_,
+            conv_desc_,
+            output_desc_,
+            num_algos,
+            &num_valid_algos,
+            stats));
+        bool algo_is_found = false;
+        for (int i = 0; i < num_valid_algos; ++i) {
+          if (stats[i].memory <= CUDNN_CONV_WORKSPACE_LIMIT_BYTES) {
+            bwd_data_algo_ = stats[i].algo;
+            algo_is_found = true;
+            break;
+          }
+        }
+        CHECK(algo_is_found)
+            << "\nNo algorithms available for <cudnnConvolutionBackwardData> "
+            << "under the current desc and workspace limit.";
+#else
        CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
            ctx()->cudnn_handle(),
            filter_desc_,
@@ -340,6 +400,8 @@ void CuDNNConv2dGradientOp<Context>::ResetDesc() {
            CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
            CUDNN_CONV_WORKSPACE_LIMIT_BYTES,
            &bwd_data_algo_));
+#endif // CUDNN_VERSION_MIN(7, 0, 0)
+      }
     }
     cudnn_ws_nbytes_ = SIZE_MAX; // Request a new size
   }
@@ -365,11 +427,9 @@ void CuDNNConv2dGradientOp<Context>::DoRunWithType() {
     dw = dW->template mutable_data<T, Context>();
     auto algo =
         filter_algo_cache_.get(X.dims(), W.dims(), compute_type_, [&]() {
-          int returned_algo_count;
-          std::array<
-              cudnnConvolutionBwdFilterAlgoPerf_t,
-              CUDNN_CONV_NUM_BWD_FILTER_ALGOS>
-              stat;
+          int num_valid_algos;
+          constexpr int num_algos = CUDNN_CONV_NUM_BWD_FILTER_ALGOS;
+          cudnnConvolutionBwdFilterAlgoPerf_t stats[num_algos];
           CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithmEx(
               ctx()->cudnn_handle(),
               output_desc_,
@@ -379,12 +439,12 @@ void CuDNNConv2dGradientOp<Context>::DoRunWithType() {
               conv_desc_,
               filter_desc_,
               dw,
-              CUDNN_CONV_NUM_BWD_FILTER_ALGOS,
-              &returned_algo_count,
-              stat.data(),
+              num_algos,
+              &num_valid_algos,
+              stats,
              scratch,
              CUDNN_CONV_WORKSPACE_LIMIT_BYTES));
-          return BwdFilterAlgoWithCost(stat[0].algo, stat[0].time);
+          return BwdFilterAlgoWithCost(stats[0].algo, stats[0].time);
         });
     exhaustive_search_filter_ = false;
     bwd_filter_algo_ = std::get<0>(algo);
@@ -396,11 +456,9 @@ void CuDNNConv2dGradientOp<Context>::DoRunWithType() {
     w = W.template data<T, Context>();
     dx = dX->template mutable_data<T, Context>();
     auto algo = data_algo_cache_.get(X.dims(), W.dims(), compute_type_, [&]() {
-      int returned_algo_count;
-      std::array<
-          cudnnConvolutionBwdDataAlgoPerf_t,
-          CUDNN_CONV_NUM_BWD_DATA_ALGOS>
-          stat;
+      int num_valid_algos;
+      constexpr int num_algos = CUDNN_CONV_NUM_BWD_DATA_ALGOS;
+      cudnnConvolutionBwdDataAlgoPerf_t stats[num_algos];
       CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithmEx(
           ctx()->cudnn_handle(),
           filter_desc_,
@@ -410,12 +468,12 @@ void CuDNNConv2dGradientOp<Context>::DoRunWithType() {
           conv_desc_,
           output_desc_,
           dx,
-          CUDNN_CONV_NUM_BWD_DATA_ALGOS,
-          &returned_algo_count,
-          stat.data(),
+          num_algos,
+          &num_valid_algos,
+          stats,
          scratch,
          CUDNN_CONV_WORKSPACE_LIMIT_BYTES));
-      return BwdDataAlgoWithCost(stat[0].algo, stat[0].time);
+      return BwdDataAlgoWithCost(stats[0].algo, stats[0].time);
     });
     exhaustive_search_data_ = false;
     bwd_data_algo_ = std::get<0>(algo);
@@ -464,7 +522,7 @@ void CuDNNConv2dGradientOp<Context>::DoRunWithType() {
         db));
   }
-  if (Output(1)->has_name()) {
+  if (dW->has_name()) {
     x = X.template data<T, Context>();
     dw = dW->template mutable_data<T, Context>();
     for (int g = 0; g < cudnn_group_; g++) {
@@ -485,7 +543,7 @@ void CuDNNConv2dGradientOp<Context>::DoRunWithType() {
     }
   }
-  if (Output(0)->has_name()) {
+  if (dX->has_name()) {
    w = W.template data<T, Context>();
    dx = dX->template mutable_data<T, Context>();
    for (int g = 0; g < cudnn_group_; g++) {
......
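A note on the pattern repeated in the hunks above: cuDNN 8 removed `cudnnGetConvolutionForwardAlgorithm` and its backward-pass siblings, so the code switches to the `*_v7` queries, which return a list of candidates ranked by expected time; the first one whose workspace fits the limit wins. A condensed, hedged sketch of that selection loop (descriptor setup is assumed to be done by the caller, and the fallback choice here is this sketch's, not the commit's):

```cpp
#include <cudnn.h>

// Pick the fastest forward algorithm whose workspace fits the given limit.
cudnnConvolutionFwdAlgo_t PickFwdAlgo(
    cudnnHandle_t handle,
    cudnnTensorDescriptor_t x_desc,
    cudnnFilterDescriptor_t w_desc,
    cudnnConvolutionDescriptor_t conv_desc,
    cudnnTensorDescriptor_t y_desc,
    size_t workspace_limit) {
  int num_valid = 0;
  cudnnConvolutionFwdAlgoPerf_t perf[CUDNN_CONVOLUTION_FWD_ALGO_COUNT];
  cudnnGetConvolutionForwardAlgorithm_v7(
      handle, x_desc, w_desc, conv_desc, y_desc,
      CUDNN_CONVOLUTION_FWD_ALGO_COUNT, &num_valid, perf);
  for (int i = 0; i < num_valid; ++i) {
    // Results arrive sorted by expected time; take the first that fits.
    if (perf[i].status == CUDNN_STATUS_SUCCESS &&
        perf[i].memory <= workspace_limit) {
      return perf[i].algo;
    }
  }
  return CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;  // conservative fallback
}
```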
@@ -66,7 +66,6 @@ void CuDNNConvTranspose2dOp<Context>::ResetDesc() {
     }
   }
   if (filter_changed) {
-#if CUDNN_VERSION_MIN(5, 0, 0)
     CUDNN_CHECK(cudnnSetFilter4dDescriptor(
         filter_desc_,
         CuDNNType<T>::type,
@@ -75,16 +74,6 @@ void CuDNNConvTranspose2dOp<Context>::ResetDesc() {
         out_channels_ / group_,
         kshape_[0],
         kshape_[1]));
-#else
-    CUDNN_CHECK(cudnnSetFilter4dDescriptor_v4(
-        filter_desc_,
-        CuDNNType<T>::type,
-        format_,
-        in_channels_ / cudnn_group_,
-        out_channels_ / group_,
-        kshape_[0],
-        kshape_[1]));
-#endif
     // Determine the bias shape
     if (HasBias()) {
       CuDNNSetBiasDesc<T>(
@@ -94,9 +83,34 @@ void CuDNNConvTranspose2dOp<Context>::ResetDesc() {
     // Set the conv configuration
     SetConvDesc<T>();
     // Get or search the appropriate algorithm
-    if (CUDAContext::object()->cudnn_benchmark_) {
+    if (CUDAContext::objects().cudnn_benchmark_) {
       exhaustive_search_ = true;
     } else {
+#if CUDNN_VERSION_MIN(7, 0, 0)
+      int num_valid_algos;
+      constexpr int num_algos = CUDNN_CONV_NUM_BWD_DATA_ALGOS;
+      cudnnConvolutionBwdDataAlgoPerf_t stats[num_algos];
+      CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm_v7(
+          ctx()->cudnn_handle(),
+          filter_desc_,
+          input_desc_,
+          conv_desc_,
+          output_desc_,
+          num_algos,
+          &num_valid_algos,
+          stats));
+      bool algo_is_found = false;
+      for (int i = 0; i < num_valid_algos; ++i) {
+        if (stats[i].memory <= CUDNN_CONV_WORKSPACE_LIMIT_BYTES) {
+          fwd_algo_ = stats[i].algo;
+          algo_is_found = true;
+          break;
+        }
+      }
+      CHECK(algo_is_found)
+          << "\nNo algorithms available for <cudnnConvolutionBackwardData> "
+          << "under the current desc and workspace limit.";
+#else
       CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
           ctx()->cudnn_handle(),
           filter_desc_,
@@ -106,6 +120,7 @@ void CuDNNConvTranspose2dOp<Context>::ResetDesc() {
          CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
          CUDNN_CONV_WORKSPACE_LIMIT_BYTES,
          &fwd_algo_));
+#endif // CUDNN_VERSION_MIN(7, 0, 0)
     }
     cudnn_ws_nbytes_ = SIZE_MAX; // Request a new size
   }
@@ -134,11 +149,9 @@ void CuDNNConvTranspose2dOp<Context>::DoRunWithType() {
     scratch =
         ws()->template data<Context>({CUDNN_CONV_WORKSPACE_LIMIT_BYTES})[0];
     auto algo = algo_cache_.get(X.dims(), W.dims(), compute_type_, [&]() {
-      int returned_algo_count;
-      std::array<
-          cudnnConvolutionBwdDataAlgoPerf_t,
-          CUDNN_CONV_NUM_BWD_DATA_ALGOS>
-          stat;
+      int num_valid_algos;
+      constexpr int num_algos = CUDNN_CONV_NUM_BWD_DATA_ALGOS;
+      cudnnConvolutionBwdDataAlgoPerf_t stats[num_algos];
       CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithmEx(
           ctx()->cudnn_handle(),
           filter_desc_,
@@ -148,12 +161,12 @@ void CuDNNConvTranspose2dOp<Context>::DoRunWithType() {
           conv_desc_,
           output_desc_,
           y,
-          CUDNN_CONV_NUM_BWD_DATA_ALGOS,
-          &returned_algo_count,
-          stat.data(),
+          num_algos,
+          &num_valid_algos,
+          stats,
           scratch,
           CUDNN_CONV_WORKSPACE_LIMIT_BYTES));
-      return FwdAlgoWithCost(stat[0].algo, stat[0].time);
+      return FwdAlgoWithCost(stats[0].algo, stats[0].time);
     });
     exhaustive_search_ = false;
     fwd_algo_ = std::get<0>(algo);
@@ -274,6 +287,7 @@ template <class Context>
 template <typename T>
 void CuDNNConvTranspose2dGradientOp<Context>::ResetDesc() {
   auto &X = Input(0), &W = Input(1), &dY = Input(-1);
+  auto *dX = Output(0), *dW = Output(1);
   bool input_changed = (X.dims() != input_dims_);
   bool filter_changed = (W.dims() != filter_dims_);
   if (input_changed || filter_changed) {
@@ -289,7 +303,6 @@ void CuDNNConvTranspose2dGradientOp<Context>::ResetDesc() {
     }
   }
   if (filter_changed) {
-#if CUDNN_VERSION_MIN(5, 0, 0)
     CUDNN_CHECK(cudnnSetFilter4dDescriptor(
         filter_desc_,
         CuDNNType<T>::type,
@@ -298,16 +311,6 @@ void CuDNNConvTranspose2dGradientOp<Context>::ResetDesc() {
         out_channels_ / group_,
         kshape_[0],
         kshape_[1]));
-#else
-    CUDNN_CHECK(cudnnSetFilter4dDescriptor_v4(
-        filter_desc,
-        CuDNNType<T>::type,
-        format_,
-        in_channels_ / cudnn_group_,
-        out_channels_ / group_,
-        kshape_[0],
-        kshape_[1]));
-#endif
     // Determine the bias shape
     if (HasBias()) {
       CuDNNSetBiasDesc<T>(
@@ -317,10 +320,36 @@ void CuDNNConvTranspose2dGradientOp<Context>::ResetDesc() {
     // Set the conv configuration
     SetConvDesc<T>();
     // Get the appropriate algorithm
-    if (CUDAContext::object()->cudnn_benchmark_) {
+    if (CUDAContext::objects().cudnn_benchmark_) {
       exhaustive_search_data_ = true;
       exhaustive_search_filter_ = true;
     } else {
+      if (dW->has_name()) {
+#if CUDNN_VERSION_MIN(7, 0, 0)
+        int num_valid_algos;
+        constexpr int num_algos = CUDNN_CONV_NUM_BWD_FILTER_ALGOS;
+        cudnnConvolutionBwdFilterAlgoPerf_t stats[num_algos];
+        CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm_v7(
+            ctx()->cudnn_handle(),
+            input_desc_,
+            output_desc_,
+            conv_desc_,
+            filter_desc_,
+            num_algos,
+            &num_valid_algos,
+            stats));
+        bool algo_is_found = false;
+        for (int i = 0; i < num_valid_algos; ++i) {
+          if (stats[i].memory <= CUDNN_CONV_WORKSPACE_LIMIT_BYTES) {
+            bwd_filter_algo_ = stats[i].algo;
+            algo_is_found = true;
+            break;
+          }
+        }
+        CHECK(algo_is_found)
+            << "\nNo algorithms available for <cudnnConvolutionBackwardFilter> "
+            << "under the current desc and workspace limit.";
+#else
        CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
            ctx()->cudnn_handle(),
            input_desc_,
@@ -330,6 +359,34 @@ void CuDNNConvTranspose2dGradientOp<Context>::ResetDesc() {
            CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
            CUDNN_CONV_WORKSPACE_LIMIT_BYTES,
            &bwd_filter_algo_));
+#endif // CUDNN_VERSION_MIN(7, 0, 0)
+      }
+      if (dX->has_name()) {
+#if CUDNN_VERSION_MIN(7, 0, 0)
+        int num_valid_algos;
+        constexpr int num_algos = CUDNN_CONV_NUM_FWD_ALGOS;
+        cudnnConvolutionFwdAlgoPerf_t stats[num_algos];
+        CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm_v7(
+            ctx()->cudnn_handle(),
+            input_desc_,
+            filter_desc_,
+            conv_desc_,
+            output_desc_,
+            num_algos,
+            &num_valid_algos,
+            stats));
+        bool algo_is_found = false;
+        for (int i = 0; i < num_valid_algos; ++i) {
+          if (stats[i].memory <= CUDNN_CONV_WORKSPACE_LIMIT_BYTES) {
+            bwd_data_algo_ = stats[i].algo;
+            algo_is_found = true;
+            break;
+          }
+        }
+        CHECK(algo_is_found)
+            << "\nNo algorithms available for <cudnnConvolutionForward> "
+            << "under the current desc and workspace limit.";
+#else
        CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
            ctx()->cudnn_handle(),
            input_desc_,
@@ -339,6 +396,8 @@ void CuDNNConvTranspose2dGradientOp<Context>::ResetDesc() {
            CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
            CUDNN_CONV_WORKSPACE_LIMIT_BYTES,
            &bwd_data_algo_));
+#endif // CUDNN_VERSION_MIN(7, 0, 0)
+      }
     }
     cudnn_ws_nbytes_ = SIZE_MAX; // Request a new size
   }
@@ -364,11 +423,9 @@ void CuDNNConvTranspose2dGradientOp<Context>::DoRunWithType() {
     dw = dW->template mutable_data<T, Context>();
     auto algo =
         filter_algo_cache_.get(X.dims(), W.dims(), compute_type_, [&]() {
-          int returned_algo_count;
-          std::array<
-              cudnnConvolutionBwdFilterAlgoPerf_t,
-              CUDNN_CONV_NUM_BWD_FILTER_ALGOS>
-              stat;
+          int num_valid_algos;
+          constexpr int num_algos = CUDNN_CONV_NUM_BWD_FILTER_ALGOS;
+          cudnnConvolutionBwdFilterAlgoPerf_t stats[num_algos];
           CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithmEx(
               ctx()->cudnn_handle(),
               input_desc_,
@@ -378,12 +435,12 @@ void CuDNNConvTranspose2dGradientOp<Context>::DoRunWithType() {
               conv_desc_,
               filter_desc_,
               dw,
-              CUDNN_CONV_NUM_BWD_FILTER_ALGOS,
-              &returned_algo_count,
-              stat.data(),
+              num_algos,
+              &num_valid_algos,
+              stats,
              scratch,
              CUDNN_CONV_WORKSPACE_LIMIT_BYTES));
-          return BwdFilterAlgoWithCost(stat[0].algo, stat[0].time);
+          return BwdFilterAlgoWithCost(stats[0].algo, stats[0].time);
         });
     exhaustive_search_filter_ = false;
     bwd_filter_algo_ = std::get<0>(algo);
@@ -395,8 +452,9 @@ void CuDNNConvTranspose2dGradientOp<Context>::DoRunWithType() {
     w = W.template data<T, Context>();
     dx = dX->template mutable_data<T, Context>();
     auto algo = data_algo_cache_.get(X.dims(), W.dims(), compute_type_, [&]() {
-      int returned_algo_count;
-      std::array<cudnnConvolutionFwdAlgoPerf_t, CUDNN_CONV_NUM_FWD_ALGOS> stat;
+      int num_valid_algos;
+      constexpr int num_algos = CUDNN_CONV_NUM_FWD_ALGOS;
+      cudnnConvolutionFwdAlgoPerf_t stats[num_algos];
       CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithmEx(
           ctx()->cudnn_handle(),
           input_desc_,
@@ -406,12 +464,12 @@ void CuDNNConvTranspose2dGradientOp<Context>::DoRunWithType() {
           conv_desc_,
           output_desc_,
           dx,
-          CUDNN_CONV_NUM_FWD_ALGOS,
-          &returned_algo_count,
-          stat.data(),
+          num_algos,
+          &num_valid_algos,
+          stats,
          scratch,
          CUDNN_CONV_WORKSPACE_LIMIT_BYTES));
-      return BwdDataAlgoWithCost(stat[0].algo, stat[0].time);
+      return BwdDataAlgoWithCost(stats[0].algo, stats[0].time);
     });
     exhaustive_search_data_ = false;
     bwd_data_algo_ = std::get<0>(algo);
......
@@ -13,7 +13,6 @@ void CuDNNPool2dOp<Context>::DoRunWithType() {
   CuDNNSetTensorDesc<T>(&input_desc_, X.dims(), data_format());
   CuDNNSetTensorDesc<T>(&output_desc_, out_shape_, data_format());
-#if CUDNN_VERSION_MIN(5, 0, 0)
   CUDNN_CHECK(cudnnSetPooling2dDescriptor(
       pool_desc_,
       pool_mode_,
@@ -24,18 +23,6 @@ void CuDNNPool2dOp<Context>::DoRunWithType() {
       pad_l_[1],
       stride_[0],
       stride_[1]));
-#else
-  CUDNN_CHECK(cudnnSetPooling2dDescriptor_v4(
-      pool_desc_,
-      pool_mode_,
-      CUDNN_PROPAGATE_NAN,
-      kshape_[0],
-      kshape_[1],
-      pad_l_[0],
-      pad_l_[1],
-      stride_[0],
-      stride_[1]));
-#endif
   CUDNN_CHECK(cudnnPoolingForward(
       ctx()->cudnn_handle(),
@@ -63,7 +50,6 @@ void CuDNNPool2dGradientOp<Context>::DoRunWithType() {
   CuDNNSetTensorDesc<T>(&input_desc_, dY.dims(), data_format());
   CuDNNSetTensorDesc<T>(&output_desc_, X.dims(), data_format());
-#if CUDNN_VERSION_MIN(5, 0, 0)
   CUDNN_CHECK(cudnnSetPooling2dDescriptor(
       pool_desc_,
       pool_mode_,
@@ -74,18 +60,6 @@ void CuDNNPool2dGradientOp<Context>::DoRunWithType() {
      pad_l_[1],
      stride_[0],
      stride_[1]));
-#else
-  CUDNN_CHECK(cudnnSetPooling2dDescriptor_v4(
-      pool_desc_,
-      pool_mode_,
-      CUDNN_PROPAGATE_NAN,
-      kshape_[0],
-      kshape_[1],
-      pad_l_[0],
-      pad_l_[1],
-      stride_[0],
-      stride_[1]));
-#endif
   CUDNN_CHECK(cudnnPoolingBackward(
       ctx()->cudnn_handle(),
......
@@ -138,16 +138,15 @@ class BuildExtension(_build_ext):
                 self.compiler.set_executable('compiler_so', nvcc)
                 if isinstance(cflags, dict):
                     cflags = cflags['nvcc']
-                cflags = \
-                    COMMON_NVCC_FLAGS + \
-                    ['--compiler-options', "'-fPIC'"] + \
-                    cflags + _get_cuda_arch_flags(cflags)
+                cflags = (COMMON_NVCC_FLAGS +
+                          ['--compiler-options', "'-fPIC'"] +
+                          cflags + _get_cuda_arch_flags(cflags))
             else:
                 if isinstance(cflags, dict):
                     cflags = cflags['cxx']
                 cflags += COMMON_CC_FLAGS
             if not any(flag.startswith('-std=') for flag in cflags):
-                cflags.append('-std=c++11')
+                cflags.append('-std=c++14')
             original_compile(obj, src, ext, cc_args, cflags, pp_opts)
         finally:
             self.compiler.set_executable('compiler_so', original_compiler)
@@ -328,6 +327,7 @@ def _get_cuda_arch_flags(cflags=None):
         '5.0', '5.2', '5.3',
         '6.0', '6.1', '6.2',
         '7.0', '7.2', '7.5',
+        '8.0',
     ]
     valid_arch_strings = supported_arches + [s + "+PTX" for s in supported_arches]
     capability = _cuda.get_device_capability()
@@ -365,6 +365,6 @@ CUDA_HOME = _find_cuda()
 CUDNN_HOME = _os.environ.get('CUDNN_HOME') or _os.environ.get('CUDNN_PATH')
 COMMON_CC_FLAGS = ['-Wno-sign-compare', '-Wno-unused-variable', '-Wno-reorder']
 COMMON_MSVC_FLAGS = ['/EHsc', '/wd4819', '/wd4244', '/wd4251', '/wd4275', '/wd4800', '/wd4996']
-COMMON_NVCC_FLAGS = ['-w'] if IS_WINDOWS else []
+COMMON_NVCC_FLAGS = ['-w'] if IS_WINDOWS else ['-std=c++14']
 COMMON_LINK_LIBRARIES = ['protobuf'] if IS_WINDOWS else []
 DLLIMPORT_STR = '__declspec(dllimport)' if IS_WINDOWS else ''
@@ -32,16 +32,16 @@ namespace dragon {
 #ifdef USE_CUDA
 /*! \brief The number of cuda threads to use */
-const int CUDA_THREADS = 256;
+constexpr int CUDA_THREADS = 256;
 /*! \brief The maximum number of blocks to use in the default kernel call */
-const int CUDA_MAX_BLOCKS = 4096;
+constexpr int CUDA_MAX_BLOCKS = 4096;
 /*! \brief The maximum number of devices in a single machine */
-const int CUDA_MAX_DEVICES = 16;
+constexpr int CUDA_MAX_DEVICES = 16;
 /*! \brief The maximum number of tensor dimsensions */
-const int CUDA_TENSOR_MAX_DIMS = 8;
+constexpr int CUDA_TENSOR_MAX_DIMS = 8;
 #define CUDA_VERSION_MIN(major, minor, patch) \
   (CUDA_VERSION >= (major * 1000 + minor * 100 + patch))
......
@@ -34,19 +34,19 @@ namespace dragon {
       << cudnnGetErrorString(status); \
   } while (0)
-static const size_t CUDNN_CONV_WORKSPACE_LIMIT_BYTES = 64 * 1024 * 1024;
+constexpr size_t CUDNN_CONV_WORKSPACE_LIMIT_BYTES = 64 * 1024 * 1024;
 #if CUDNN_VERSION_MIN(7, 0, 0)
-static const size_t CUDNN_CONV_NUM_FWD_ALGOS =
+constexpr size_t CUDNN_CONV_NUM_FWD_ALGOS =
     2 * CUDNN_CONVOLUTION_FWD_ALGO_COUNT;
-static const size_t CUDNN_CONV_NUM_BWD_FILTER_ALGOS =
+constexpr size_t CUDNN_CONV_NUM_BWD_FILTER_ALGOS =
     2 * CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT;
-static const size_t CUDNN_CONV_NUM_BWD_DATA_ALGOS =
+constexpr size_t CUDNN_CONV_NUM_BWD_DATA_ALGOS =
     2 * CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT;
 #else
-static const size_t CUDNN_CONV_NUM_FWD_ALGOS = 7;
-static const size_t CUDNN_CONV_NUM_BWD_FILTER_ALGOS = 4;
-static const size_t CUDNN_CONV_NUM_BWD_DATA_ALGOS = 5;
+constexpr size_t CUDNN_CONV_NUM_FWD_ALGOS = 7;
+constexpr size_t CUDNN_CONV_NUM_BWD_FILTER_ALGOS = 4;
+constexpr size_t CUDNN_CONV_NUM_BWD_DATA_ALGOS = 5;
 #endif
 class Tensor;
......
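On the `const` → `constexpr` switch just above: `constexpr` makes the compile-time nature of these constants explicit and guarantees they can appear anywhere a constant expression is required (array bounds, `static_assert`, non-type template arguments), which is what lets the new algorithm-search code declare plain stack arrays such as `cudnnConvolutionFwdAlgoPerf_t stats[num_algos]`. A tiny self-contained illustration, where `kNumAlgos` and `PerfResult` are hypothetical stand-ins:

```cpp
#include <cstddef>

// Hypothetical stand-in for 2 * CUDNN_CONVOLUTION_FWD_ALGO_COUNT.
constexpr std::size_t kNumAlgos = 2 * 8;

struct PerfResult { float time; std::size_t memory; };

int main() {
  PerfResult stats[kNumAlgos] = {};  // constexpr value is a valid array bound
  static_assert(kNumAlgos == 16, "known at compile time");
  return stats[0].memory == 0 ? 0 : 1;
}
```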
@@ -17,13 +17,6 @@
 #define DRAGON_API
 #endif
-// Avoid using of "thread_local" for VS2013 or older Xcode
-#if defined(__clang__) || defined(__GNUC__)
-#define TLS_OBJECT __thread
-#else
-#define TLS_OBJECT __declspec(thread)
-#endif
 // Disable the copy and assignment operator for a class
 #define DISABLE_COPY_AND_ASSIGN(classname) \
   classname(const classname&) = delete; \
......
:: ##############################################################
:: Command file to build on Windows for Visual Studio 2013 (VC12)
:: ##############################################################
@echo off
setlocal
:: Build variables
set ORIGINAL_DIR=%cd%
set REPO_ROOT=%~dp0%..
set DRAGON_ROOT=%REPO_ROOT%\dragon
set THIRD_PARTY_DIR=%REPO_ROOT%\third_party
set CMAKE_GENERATOR="Visual Studio 12 2013 Win64"
:: Build options
set BUILD_PYTHON=ON
set BUILD_RUNTIME=OFF
:: Optional libraries
set USE_CUDA=ON
set USE_CUDNN=ON
set USE_OPENMP=ON
set USE_AVX=ON
set USE_AVX2=ON
set USE_FMA=ON
:: Protobuf SDK options
set PROTOBUF_SDK_ROOT_DIR=%THIRD_PARTY_DIR%\protobuf
:: Protobuf Compiler options
:: Set the protobuf compiler (i.e., protoc) if necessary.
:: If not, a compiler from the SDK or the environment will be used.
set PROTOBUF_PROTOC_EXECUTABLE=%PROTOBUF_SDK_ROOT_DIR%\bin\protoc
:: Python options
:: Set your python "interpreter" if necessary.
:: If not, a default interpreter will be used.
:: set PYTHON_EXECUTABLE=X:/Anaconda3/python
if %BUILD_PYTHON% == ON (
if NOT DEFINED PYTHON_EXECUTABLE (
for /F %%i in ('python -c "import sys;print(sys.executable)"') do (set PYTHON_EXECUTABLE=%%i)
)
)
echo=
echo ------------------------- BUILDING CONFIGS -------------------------
echo=
echo -- DRAGON_ROOT=%DRAGON_ROOT%
echo -- CMAKE_GENERATOR=%CMAKE_GENERATOR%
if not exist %DRAGON_ROOT%\build mkdir %DRAGON_ROOT%\build
cd %DRAGON_ROOT%\build
cmake .. ^
-G%CMAKE_GENERATOR% ^
-DBUILD_PYTHON=%BUILD_PYTHON% ^
-DBUILD_RUNTIME=%BUILD_RUNTIME% ^
  -DUSE_CUDA=%USE_CUDA% ^
  -DUSE_CUDNN=%USE_CUDNN% ^
  -DUSE_OPENMP=%USE_OPENMP% ^
  -DUSE_AVX=%USE_AVX% ^
  -DUSE_AVX2=%USE_AVX2% ^
  -DUSE_FMA=%USE_FMA% ^
-DTHIRD_PARTY_DIR=%THIRD_PARTY_DIR% ^
-DPROTOBUF_SDK_ROOT_DIR=%PROTOBUF_SDK_ROOT_DIR% ^
-DPROTOBUF_PROTOC_EXECUTABLE=%PROTOBUF_PROTOC_EXECUTABLE% ^
-DPYTHON_EXECUTABLE=%PYTHON_EXECUTABLE% ^
|| goto :label_error
echo=
echo ------------------------- BUILDING CONFIGS -------------------------
echo=
cmake --build . --target INSTALL --config Release -- /maxcpucount:%NUMBER_OF_PROCESSORS% || goto :label_error
cd %DRAGON_ROOT%
%PYTHON_EXECUTABLE% setup.py install || goto :label_error
echo=
echo Built successfully
cd %ORIGINAL_DIR%
endlocal
pause
exit /b 0
:label_error
echo=
echo Building failed
cd %ORIGINAL_DIR%
endlocal
pause
exit /b 1
:: ##############################################################
:: Command file to build on Windows for Visual Studio 2015 (VC14)
:: ##############################################################
@echo off
setlocal
:: Build variables
set ORIGINAL_DIR=%cd%
set REPO_ROOT=%~dp0%..
set DRAGON_ROOT=%REPO_ROOT%\dragon
set THIRD_PARTY_DIR=%REPO_ROOT%\third_party
set CMAKE_GENERATOR="Visual Studio 14 2015 Win64"
:: Build options
set BUILD_PYTHON=ON
set BUILD_RUNTIME=OFF
:: Optional libraries
set USE_CUDA=ON
set USE_CUDNN=ON
set USE_OPENMP=ON
set USE_AVX=ON
set USE_AVX2=ON
set USE_FMA=ON
:: Protobuf SDK options
set PROTOBUF_SDK_ROOT_DIR=%THIRD_PARTY_DIR%\protobuf
:: Protobuf Compiler options
:: Set the protobuf compiler (i.e., protoc) if necessary.
:: If not, a compiler from the SDK or the environment will be used.
set PROTOBUF_PROTOC_EXECUTABLE=%PROTOBUF_SDK_ROOT_DIR%\bin\protoc
:: Python options
:: Set your python "interpreter" if necessary.
:: If not, a default interpreter will be used.
:: set PYTHON_EXECUTABLE=X:/Anaconda3/python
if %BUILD_PYTHON% == ON (
if NOT DEFINED PYTHON_EXECUTABLE (
for /F %%i in ('python -c "import sys;print(sys.executable)"') do (set PYTHON_EXECUTABLE=%%i)
)
)
echo=
echo ------------------------- BUILDING CONFIGS -------------------------
echo=
echo -- DRAGON_ROOT=%DRAGON_ROOT%
echo -- CMAKE_GENERATOR=%CMAKE_GENERATOR%
if not exist %DRAGON_ROOT%\build mkdir %DRAGON_ROOT%\build
cd %DRAGON_ROOT%\build
cmake .. ^
-G%CMAKE_GENERATOR% ^
-DBUILD_PYTHON=%BUILD_PYTHON% ^
-DBUILD_RUNTIME=%BUILD_RUNTIME% ^
  -DUSE_CUDA=%USE_CUDA% ^
  -DUSE_CUDNN=%USE_CUDNN% ^
  -DUSE_OPENMP=%USE_OPENMP% ^
  -DUSE_AVX=%USE_AVX% ^
  -DUSE_AVX2=%USE_AVX2% ^
  -DUSE_FMA=%USE_FMA% ^
-DTHIRD_PARTY_DIR=%THIRD_PARTY_DIR% ^
-DPROTOBUF_SDK_ROOT_DIR=%PROTOBUF_SDK_ROOT_DIR% ^
-DPROTOBUF_PROTOC_EXECUTABLE=%PROTOBUF_PROTOC_EXECUTABLE% ^
-DPYTHON_EXECUTABLE=%PYTHON_EXECUTABLE% ^
|| goto :label_error
echo=
echo ------------------------- BUILDING CONFIGS -------------------------
echo=
cmake --build . --target INSTALL --config Release -- /maxcpucount:%NUMBER_OF_PROCESSORS% || goto :label_error
cd %DRAGON_ROOT%
%PYTHON_EXECUTABLE% setup.py install || goto :label_error
echo=
echo Built successfully
cd %ORIGINAL_DIR%
endlocal
pause
exit /b 0
:label_error
echo=
echo Building failed
cd %ORIGINAL_DIR%
endlocal
pause
exit /b 1
@@ -8,6 +8,7 @@
 # <https://opensource.org/licenses/BSD-2-Clause>
 #
 # ------------------------------------------------------------
+"""Command line to run tests."""
 from __future__ import absolute_import
 from __future__ import division
......