Commit 3a97dc22 by Ting PAN

Move transpose kernels into math namespace

Summary:
This commit uses "math::Transpose" instead of "kernels::Transpose"
to allow for more optimized routines in the future.
1 parent 46feba80
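For orientation, the change at a typical call site looks like the following sketch (names such as X, Y_axes, and scratch are taken from the transpose operator diff below; the snippet is illustrative only):

// Before: the operator collapsed axes itself and handed strides/dims to the kernel launcher.
kernels::Transpose(
    num_dims, X_strides.data(), Y_dims.data(),
    X.template data<T, Context>(), scratch, ctx());

// After: the operator passes the raw input dims and the permutation;
// axis collapsing and stride setup now happen inside math::Transpose.
math::Transpose(
    num_dims, X.dims().data(), Y_axes.data(),
    X.template data<T, Context>(), scratch, ctx());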
@@ -165,6 +165,10 @@ clamp\_
#######
.. automethod:: dragon.vm.torch.Tensor.clamp_
+ contiguous
+ ##########
+ .. automethod:: dragon.vm.torch.Tensor.contiguous
+
copy\_
######
.. automethod:: dragon.vm.torch.Tensor.copy_
@@ -309,6 +313,10 @@ isnan
#####
.. automethod:: dragon.vm.torch.Tensor.isnan
+ is_contiguous
+ #############
+ .. automethod:: dragon.vm.torch.Tensor.is_contiguous
+
is_floating_point
#################
.. automethod:: dragon.vm.torch.Tensor.is_floating_point
......
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
namespace kernels {
namespace {
template <typename T>
void _Transpose(
const int num_dims,
const int64_t* x_strides,
const int64_t* y_dims,
const T* x,
T* y) {
const auto N =
std::accumulate(y_dims, y_dims + num_dims, 1, std::multiplies<int64_t>());
vec64_t index(num_dims, 0);
for (int yi = 0; yi < N; ++yi) {
int64_t xi = 0;
for (int d = num_dims - 1; d >= 0; --d) {
xi += index[d] * x_strides[d];
}
y[yi] = x[xi];
math::utils::IncreaseIndexInDims(num_dims, y_dims, index.data());
}
}
} // namespace
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(name, T) \
template <> \
void name<T, CPUContext>( \
const int num_dims, \
const int64_t* x_strides, \
const int64_t* y_dims, \
const T* x, \
T* y, \
CPUContext* ctx) { \
_##name(num_dims, x_strides, y_dims, x, y); \
}
DEFINE_KERNEL_LAUNCHER(Transpose, bool);
DEFINE_KERNEL_LAUNCHER(Transpose, uint8_t);
DEFINE_KERNEL_LAUNCHER(Transpose, int8_t);
DEFINE_KERNEL_LAUNCHER(Transpose, int);
DEFINE_KERNEL_LAUNCHER(Transpose, int64_t);
DEFINE_KERNEL_LAUNCHER(Transpose, float16);
DEFINE_KERNEL_LAUNCHER(Transpose, float);
DEFINE_KERNEL_LAUNCHER(Transpose, double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernels
} // namespace dragon
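To see what the loop above computes, here is a self-contained sketch (not part of the commit) that applies the same stride walk to a 2x3 row-major array with the permutation {1, 0}, i.e. a plain matrix transpose:

#include <cstdint>
#include <cstdio>

int main() {
  const float x[6] = {0, 1, 2, 3, 4, 5};  // dims {2, 3}, row-major strides {3, 1}
  const int64_t y_dims[2] = {3, 2};       // input dims permuted by {1, 0}
  const int64_t x_strides[2] = {1, 3};    // input strides permuted by {1, 0}
  float y[6];
  int64_t index[2] = {0, 0};              // multi-index into y, last dim fastest
  for (int yi = 0; yi < 6; ++yi) {
    int64_t xi = 0;
    for (int d = 1; d >= 0; --d) xi += index[d] * x_strides[d];
    y[yi] = x[xi];
    for (int d = 1; d >= 0; --d) {        // what IncreaseIndexInDims does
      if (++index[d] < y_dims[d]) break;
      index[d] = 0;
    }
  }
  for (int i = 0; i < 6; ++i) printf("%g ", y[i]);  // prints: 0 3 1 4 2 5
  printf("\n");
  return 0;
}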
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
namespace kernels {
namespace {
template <typename T, int D>
__global__ void _Transpose(
const int N,
const SimpleArray<int, D> X_strides,
const SimpleArray<int, D> Y_dims,
const T* x,
T* y) {
CUDA_1D_KERNEL_LOOP(yi, N) {
int xi = 0, tmp = yi;
#pragma unroll
for (int d = D - 1; d >= 0; --d) {
int r;
FIXED_DIVISOR_DIV_MOD(Y_dims.data[d], tmp, &tmp, &r);
xi += r * X_strides.data[d];
}
y[yi] = x[xi];
}
}
template <typename T, int D>
void _TransposeImpl(
const int64_t* x_strides,
const int64_t* y_dims,
const T* x,
T* y,
CUDAContext* ctx) {
const auto N =
std::accumulate(y_dims, y_dims + D, 1, std::multiplies<int64_t>());
SimpleArray<int, D> X_strides, Y_dims;
for (int i = 0; i < D; ++i) {
X_strides.data[i] = x_strides[i];
Y_dims.data[i] = y_dims[i];
}
_Transpose<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, X_strides, Y_dims, x, y);
}
} // namespace
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void Transpose<T, CUDAContext>( \
const int num_dims, \
const int64_t* x_strides, \
const int64_t* y_dims, \
const T* x, \
T* y, \
CUDAContext* ctx) { \
CUDA_TENSOR_DIMS_CHECK(num_dims); \
DISPATCH_FUNC_BY_VALUE_WITH_TYPE_1( \
_TransposeImpl, T, num_dims, x_strides, y_dims, x, y, ctx); \
}
DEFINE_KERNEL_LAUNCHER(bool);
DEFINE_KERNEL_LAUNCHER(uint8_t);
DEFINE_KERNEL_LAUNCHER(int8_t);
DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernels
} // namespace dragon
#endif // USE_CUDA
#include "dragon/operators/array/transpose_op.h" #include "dragon/operators/array/transpose_op.h"
#include "dragon/core/workspace.h" #include "dragon/core/workspace.h"
#include "dragon/utils/math_functions.h" #include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
...@@ -17,45 +16,30 @@ void TransposeOp<Context>::DoRunWithType() { ...@@ -17,45 +16,30 @@ void TransposeOp<Context>::DoRunWithType() {
<< "\nProviding " << num_axes << " dimensions to permute, " << "\nProviding " << num_axes << " dimensions to permute, "
<< "while Tensor(" << X.name() << ")'s dims are " << X.DimString(); << "while Tensor(" << X.name() << ")'s dims are " << X.DimString();
vec64_t new_axes(num_dims), new_dims(num_dims); vec64_t Y_axes(num_dims), Y_dims(num_dims);
for (int i = 0; i < num_dims; ++i) { for (int i = 0; i < num_dims; ++i) {
new_axes[i] = num_axes > 0 ? perm(i) : num_dims - i - 1; Y_axes[i] = num_axes > 0 ? perm(i) : num_dims - i - 1;
} }
if (def().type() == "TransposeGradient") { if (def().type() == "TransposeGradient") {
auto old_axes(new_axes); const auto X_axes(Y_axes);
for (int i = 0; i < num_dims; ++i) { for (int i = 0; i < num_dims; ++i) {
new_axes[old_axes[i]] = i; Y_axes[X_axes[i]] = i;
} }
} }
for (int i = 0; i < num_dims; ++i) { for (int i = 0; i < num_dims; ++i) {
new_dims[i] = X.dim(new_axes[i]); Y_dims[i] = X.dim(Y_axes[i]);
}
vec64_t transpose_dims, transpose_axes;
math::utils::CollapseTransposeAxes(
num_dims,
X.dims().data(),
new_axes.data(),
transpose_dims,
transpose_axes);
Tensor X_collapse(transpose_dims);
num_dims = X_collapse.ndim();
vec64_t X_strides(num_dims), Y_dims(num_dims);
for (int i = 0; i < num_dims; ++i) {
X_strides[i] = X_collapse.stride(transpose_axes[i]);
Y_dims[i] = X_collapse.dim(transpose_axes[i]);
} }
auto* scratch = ((void*)&X == (void*)Y) auto* scratch = ((void*)&X == (void*)Y)
? ctx()->workspace()->template data<T, Context>({X.count()})[0] ? ctx()->workspace()->template data<T, Context>({X.count()})[0]
: Y->Reshape(new_dims)->template mutable_data<T, Context>(); : Y->Reshape(Y_dims)->template mutable_data<T, Context>();
kernels::Transpose( math::Transpose(
num_dims, num_dims,
X_strides.data(), X.dims().data(),
Y_dims.data(), Y_axes.data(),
X.template data<T, Context>(), X.template data<T, Context>(),
scratch, scratch,
ctx()); ctx());
...@@ -64,7 +48,7 @@ void TransposeOp<Context>::DoRunWithType() { ...@@ -64,7 +48,7 @@ void TransposeOp<Context>::DoRunWithType() {
math::Copy( math::Copy(
X.count(), X.count(),
scratch, scratch,
Y->Reshape(new_dims)->template mutable_data<T, Context>(), Y->Reshape(Y_dims)->template mutable_data<T, Context>(),
ctx()); ctx());
} }
} }
......
@@ -68,9 +68,8 @@ class CuDNNConvOp final : public CuDNNConvOpBase<Context> {
CuDNNConvOp(const OperatorDef& def, Workspace* ws)
: CuDNNConvOpBase<Context>(def, ws) {
CuDNNCreateTensorDesc(&input_desc_);
- CuDNNCreateTensorDesc(&bias_desc_);
CuDNNCreateTensorDesc(&output_desc_);
- CuDNNCreateTensorDesc(&output_desc_for_bias_);
+ CuDNNCreateTensorDesc(&bias_desc_);
CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc_));
}
@@ -80,9 +79,8 @@ class CuDNNConvOp final : public CuDNNConvOpBase<Context> {
~CuDNNConvOp() {
CuDNNDestroyTensorDesc(&input_desc_);
- CuDNNDestroyTensorDesc(&bias_desc_);
CuDNNDestroyTensorDesc(&output_desc_);
- CuDNNDestroyTensorDesc(&output_desc_for_bias_);
+ CuDNNDestroyTensorDesc(&bias_desc_);
CUDNN_CHECK(cudnnDestroyFilterDescriptor(filter_desc_));
CUDNN_CHECK(cudnnDestroyConvolutionDescriptor(conv_desc_));
}
@@ -105,8 +103,7 @@ class CuDNNConvOp final : public CuDNNConvOpBase<Context> {
bool exhaustive_search_ = false;
bool algo_deterministic_ = false;
cudnnConvolutionFwdAlgo_t fwd_algo_;
- cudnnTensorDescriptor_t input_desc_, output_desc_;
- cudnnTensorDescriptor_t bias_desc_, output_desc_for_bias_;
+ cudnnTensorDescriptor_t input_desc_, output_desc_, bias_desc_;
using FwdAlgoWithCost = std::tuple<cudnnConvolutionFwdAlgo_t, float>;
ConvAlgorithmCache<FwdAlgoWithCost> algo_cache_;
};
@@ -117,9 +114,8 @@ class CuDNNConvGradientOp final : public CuDNNConvOpBase<Context> {
CuDNNConvGradientOp(const OperatorDef& def, Workspace* ws)
: CuDNNConvOpBase<Context>(def, ws) {
CuDNNCreateTensorDesc(&input_desc_);
- CuDNNCreateTensorDesc(&bias_desc_);
CuDNNCreateTensorDesc(&output_desc_);
- CuDNNCreateTensorDesc(&input_desc_for_bias_);
+ CuDNNCreateTensorDesc(&bias_desc_);
CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc_));
}
@@ -129,9 +125,8 @@ class CuDNNConvGradientOp final : public CuDNNConvOpBase<Context> {
~CuDNNConvGradientOp() {
CuDNNDestroyTensorDesc(&input_desc_);
- CuDNNDestroyTensorDesc(&bias_desc_);
CuDNNDestroyTensorDesc(&output_desc_);
- CuDNNDestroyTensorDesc(&input_desc_for_bias_);
+ CuDNNDestroyTensorDesc(&bias_desc_);
CUDNN_CHECK(cudnnDestroyFilterDescriptor(filter_desc_));
CUDNN_CHECK(cudnnDestroyConvolutionDescriptor(conv_desc_));
}
@@ -157,8 +152,7 @@ class CuDNNConvGradientOp final : public CuDNNConvOpBase<Context> {
bool exhaustive_search_filter_ = false;
cudnnConvolutionBwdFilterAlgo_t bwd_filter_algo_;
cudnnConvolutionBwdDataAlgo_t bwd_data_algo_;
- cudnnTensorDescriptor_t input_desc_, output_desc_;
- cudnnTensorDescriptor_t bias_desc_, input_desc_for_bias_;
+ cudnnTensorDescriptor_t input_desc_, output_desc_, bias_desc_;
using BwdDataAlgoWithCost = std::tuple<cudnnConvolutionBwdDataAlgo_t, float>;
using BwdFilterAlgoWithCost =
std::tuple<cudnnConvolutionBwdFilterAlgo_t, float>;
......
@@ -16,9 +16,6 @@ void CuDNNConvOp<Context>::ResetDesc() {
input_dims_ = X.dims();
CuDNNSetTensorDesc<T>(&input_desc_, X.dims(), data_format());
CuDNNSetTensorDesc<T>(&output_desc_, Y->dims(), data_format());
- if (HasBias()) {
- CuDNNSetTensorDesc<T>(&output_desc_for_bias_, Y->dims(), data_format());
- }
}
if (filter_changed) {
filter_dims_ = W.dims();
@@ -152,14 +149,13 @@ void CuDNNConvOp<Context>::DoRunWithType() {
y));
if (HasBias()) {
- auto* b = Input(2).template data<T, Context>();
CUDNN_CHECK(cudnnAddTensor(
ctx()->cudnn_handle(),
CuDNNType<T>::one,
bias_desc_,
- b,
+ Input(2).template data<T, Context>(),
CuDNNType<T>::one,
- output_desc_for_bias_,
+ output_desc_,
y));
}
}
@@ -181,9 +177,6 @@ void CuDNNConvGradientOp<Context>::ResetDesc() {
input_dims_ = X.dims();
CuDNNSetTensorDesc<T>(&input_desc_, dY.dims(), data_format());
CuDNNSetTensorDesc<T>(&output_desc_, X.dims(), data_format());
- if (HasBias()) {
- CuDNNSetTensorDesc<T>(&input_desc_for_bias_, dY.dims(), data_format());
- }
}
if (filter_changed) {
filter_dims_ = W.dims();
@@ -382,15 +375,14 @@ void CuDNNConvGradientOp<Context>::DoRunWithType() {
}
if (Output(2)->has_name()) {
- auto* db = Output(2)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnConvolutionBackwardBias(
ctx()->cudnn_handle(),
CuDNNType<T>::one,
- input_desc_for_bias_,
+ input_desc_,
dy,
CuDNNType<T>::zero,
bias_desc_,
- db));
+ Output(2)->template mutable_data<T, Context>()));
}
if (dW->has_name()) {
......
@@ -76,9 +76,8 @@ class CuDNNConvTransposeOp final : public CuDNNConvOpBase<Context> {
CuDNNConvTransposeOp(const OperatorDef& def, Workspace* ws)
: CuDNNConvOpBase<Context>(def, ws) {
CuDNNCreateTensorDesc(&input_desc_);
- CuDNNCreateTensorDesc(&bias_desc_);
CuDNNCreateTensorDesc(&output_desc_);
- CuDNNCreateTensorDesc(&output_desc_for_bias_);
+ CuDNNCreateTensorDesc(&bias_desc_);
CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc_));
}
@@ -88,9 +87,8 @@ class CuDNNConvTransposeOp final : public CuDNNConvOpBase<Context> {
~CuDNNConvTransposeOp() {
CuDNNDestroyTensorDesc(&input_desc_);
- CuDNNDestroyTensorDesc(&bias_desc_);
CuDNNDestroyTensorDesc(&output_desc_);
- CuDNNDestroyTensorDesc(&output_desc_for_bias_);
+ CuDNNDestroyTensorDesc(&bias_desc_);
CUDNN_CHECK(cudnnDestroyFilterDescriptor(filter_desc_));
CUDNN_CHECK(cudnnDestroyConvolutionDescriptor(conv_desc_));
}
@@ -117,8 +115,7 @@ class CuDNNConvTransposeOp final : public CuDNNConvOpBase<Context> {
bool exhaustive_search_ = false;
bool algo_deterministic_ = false;
cudnnConvolutionBwdDataAlgo_t fwd_algo_;
- cudnnTensorDescriptor_t input_desc_, output_desc_;
- cudnnTensorDescriptor_t bias_desc_, output_desc_for_bias_;
+ cudnnTensorDescriptor_t input_desc_, output_desc_, bias_desc_;
using FwdAlgoWithCost = std::tuple<cudnnConvolutionBwdDataAlgo_t, float>;
ConvAlgorithmCache<FwdAlgoWithCost> algo_cache_;
};
@@ -129,9 +126,8 @@ class CuDNNConvTransposeGradientOp final : public CuDNNConvOpBase<Context> {
CuDNNConvTransposeGradientOp(const OperatorDef& def, Workspace* ws)
: CuDNNConvOpBase<Context>(def, ws) {
CuDNNCreateTensorDesc(&input_desc_);
- CuDNNCreateTensorDesc(&bias_desc_);
CuDNNCreateTensorDesc(&output_desc_);
- CuDNNCreateTensorDesc(&input_desc_for_bias_);
+ CuDNNCreateTensorDesc(&bias_desc_);
CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc_));
}
@@ -141,9 +137,8 @@ class CuDNNConvTransposeGradientOp final : public CuDNNConvOpBase<Context> {
~CuDNNConvTransposeGradientOp() {
CuDNNDestroyTensorDesc(&input_desc_);
- CuDNNDestroyTensorDesc(&bias_desc_);
CuDNNDestroyTensorDesc(&output_desc_);
- CuDNNDestroyTensorDesc(&input_desc_for_bias_);
+ CuDNNDestroyTensorDesc(&bias_desc_);
CUDNN_CHECK(cudnnDestroyFilterDescriptor(filter_desc_));
CUDNN_CHECK(cudnnDestroyConvolutionDescriptor(conv_desc_));
}
@@ -173,8 +168,7 @@ class CuDNNConvTransposeGradientOp final : public CuDNNConvOpBase<Context> {
bool filter_algo_deterministic_ = false;
cudnnConvolutionBwdFilterAlgo_t bwd_filter_algo_;
cudnnConvolutionFwdAlgo_t bwd_data_algo_;
- cudnnTensorDescriptor_t input_desc_, output_desc_;
- cudnnTensorDescriptor_t bias_desc_, input_desc_for_bias_;
+ cudnnTensorDescriptor_t input_desc_, output_desc_, bias_desc_;
using BwdDataAlgoWithCost = std::tuple<cudnnConvolutionFwdAlgo_t, float>;
using BwdFilterAlgoWithCost =
std::tuple<cudnnConvolutionBwdFilterAlgo_t, float>;
......
@@ -16,9 +16,6 @@ void CuDNNConvTransposeOp<Context>::ResetDesc() {
input_dims_ = X.dims();
CuDNNSetTensorDesc<T>(&input_desc_, X.dims(), data_format());
CuDNNSetTensorDesc<T>(&output_desc_, Y->dims(), data_format());
- if (HasBias()) {
- CuDNNSetTensorDesc<T>(&output_desc_for_bias_, Y->dims(), data_format());
- }
}
if (filter_changed) {
filter_dims_ = W.dims();
@@ -153,14 +150,13 @@ void CuDNNConvTransposeOp<Context>::DoRunWithType() {
y));
if (HasBias()) {
- auto* b = Input(2).template data<T, Context>();
CUDNN_CHECK(cudnnAddTensor(
ctx()->cudnn_handle(),
CuDNNType<T>::one,
bias_desc_,
- b,
+ Input(2).template data<T, Context>(),
CuDNNType<T>::one,
- output_desc_for_bias_,
+ output_desc_,
y));
}
}
@@ -182,9 +178,6 @@ void CuDNNConvTransposeGradientOp<Context>::ResetDesc() {
input_dims_ = X.dims();
CuDNNSetTensorDesc<T>(&input_desc_, dY.dims(), data_format());
CuDNNSetTensorDesc<T>(&output_desc_, X.dims(), data_format());
- if (HasBias()) {
- CuDNNSetTensorDesc<T>(&input_desc_for_bias_, dY.dims(), data_format());
- }
}
if (filter_changed) {
filter_dims_ = W.dims();
@@ -383,15 +376,14 @@ void CuDNNConvTransposeGradientOp<Context>::DoRunWithType() {
}
if (Output(2)->has_name()) {
- auto* db = Output(2)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnConvolutionBackwardBias(
ctx()->cudnn_handle(),
CuDNNType<T>::one,
- input_desc_for_bias_,
+ input_desc_,
dy,
CuDNNType<T>::zero,
bias_desc_,
- db));
+ Output(2)->template mutable_data<T, Context>()));
}
if (dW->has_name()) {
......
#include "dragon/operators/vision/space_to_depth_op.h" #include "dragon/operators/vision/space_to_depth_op.h"
#include "dragon/core/workspace.h" #include "dragon/core/workspace.h"
#include "dragon/utils/math_functions.h" #include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
...@@ -60,29 +59,15 @@ void SpaceToDepthOp<Context>::DoRunWithType() { ...@@ -60,29 +59,15 @@ void SpaceToDepthOp<Context>::DoRunWithType() {
CHECK_EQ(X_reshape.count(), X.count()) CHECK_EQ(X_reshape.count(), X.count())
<< "\nCould not rearrange " << X.DimString() << " to " << "\nCould not rearrange " << X.DimString() << " to "
<< X_reshape.DimString() << " with block size " << block_size_ << "."; << X_reshape.DimString() << " with block size " << block_size_ << ".";
vec64_t transpose_dims, transpose_axes;
math::utils::CollapseTransposeAxes(
X_reshape.ndim(),
X_reshape.dims().data(),
perm.data(),
transpose_dims,
transpose_axes);
Tensor X_collapse(transpose_dims);
num_dims = X_collapse.ndim();
vec64_t X_strides(num_dims), Y_dims(num_dims);
for (int i = 0; i < num_dims; ++i) {
X_strides[i] = X_collapse.stride(transpose_axes[i]);
Y_dims[i] = X_collapse.dim(transpose_axes[i]);
}
auto* scratch = ((void*)&X == (void*)Y) auto* scratch = ((void*)&X == (void*)Y)
? ctx()->workspace()->template data<T, Context>({X.count()})[0] ? ctx()->workspace()->template data<T, Context>({X.count()})[0]
: Y->Reshape(out_shape)->template mutable_data<T, Context>(); : Y->Reshape(out_shape)->template mutable_data<T, Context>();
kernels::Transpose( math::Transpose(
num_dims, X_reshape.ndim(),
X_strides.data(), X_reshape.dims().data(),
Y_dims.data(), perm.data(),
X.template data<T, Context>(), X.template data<T, Context>(),
scratch, scratch,
ctx()); ctx());
...@@ -142,29 +127,15 @@ void DepthToSpaceOp<Context>::DoRunWithType() { ...@@ -142,29 +127,15 @@ void DepthToSpaceOp<Context>::DoRunWithType() {
CHECK_EQ(X_reshape.count(), X.count()) CHECK_EQ(X_reshape.count(), X.count())
<< "\nCould not rearrange " << X.DimString() << " to " << "\nCould not rearrange " << X.DimString() << " to "
<< X_reshape.DimString() << " with block size " << block_size_ << "."; << X_reshape.DimString() << " with block size " << block_size_ << ".";
vec64_t transpose_dims, transpose_axes;
math::utils::CollapseTransposeAxes(
X_reshape.ndim(),
X_reshape.dims().data(),
perm.data(),
transpose_dims,
transpose_axes);
Tensor X_collapse(transpose_dims);
num_dims = X_collapse.ndim();
vec64_t X_strides(num_dims), Y_dims(num_dims);
for (int i = 0; i < num_dims; ++i) {
X_strides[i] = X_collapse.stride(transpose_axes[i]);
Y_dims[i] = X_collapse.dim(transpose_axes[i]);
}
auto* scratch = ((void*)&X == (void*)Y) auto* scratch = ((void*)&X == (void*)Y)
? ctx()->workspace()->template data<T, Context>({X.count()})[0] ? ctx()->workspace()->template data<T, Context>({X.count()})[0]
: Y->Reshape(out_shape)->template mutable_data<T, Context>(); : Y->Reshape(out_shape)->template mutable_data<T, Context>();
kernels::Transpose( math::Transpose(
num_dims, X_reshape.ndim(),
X_strides.data(), X_reshape.dims().data(),
Y_dims.data(), perm.data(),
X.template data<T, Context>(), X.template data<T, Context>(),
scratch, scratch,
ctx()); ctx());
......
@@ -133,7 +133,7 @@ void _BroadcastSetImpl(
const T* x,
T* y,
CUDAContext* ctx) {
- SimpleArray<int, CUDA_TENSOR_MAX_DIMS> X_strides, Y_dims;
+ SimpleArray<int, D> X_strides, Y_dims;
const auto N =
std::accumulate(y_dims, y_dims + D, 1, std::multiplies<int64_t>());
for (int i = 0; i < D; ++i) {
......
#include "dragon/utils/math/transpose.h"
#include "dragon/utils/math/utils.h"
namespace dragon {
namespace math {
namespace {
template <typename T>
void _Transpose(
const int num_dims,
const int64_t* x_strides,
const int64_t* y_dims,
const T* x,
T* y) {
const auto N =
std::accumulate(y_dims, y_dims + num_dims, 1, std::multiplies<int64_t>());
vec64_t index(num_dims, 0);
for (int yi = 0; yi < N; ++yi) {
int64_t xi = 0;
for (int d = num_dims - 1; d >= 0; --d) {
xi += index[d] * x_strides[d];
}
y[yi] = x[xi];
utils::IncreaseIndexInDims(num_dims, y_dims, index.data());
}
}
} // namespace
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_TRANSPOSE_FUNC(T) \
template <> \
void Transpose<T, CPUContext>( \
const int num_dims, \
const int64_t* dims, \
const int64_t* axes, \
const T* x, \
T* y, \
CPUContext* ctx) { \
vec64_t new_dims, new_axes; \
utils::CollapseTransposeAxes(num_dims, dims, axes, new_dims, new_axes); \
const int num_axes = new_dims.size(); \
vec64_t X_strides(num_axes), Y_dims(num_axes); \
utils::ComputeTransposeStrides( \
num_axes, new_dims.data(), new_axes.data(), X_strides.data()); \
for (int i = 0; i < num_axes; ++i) { \
Y_dims[i] = new_dims[new_axes[i]]; \
} \
_Transpose(num_axes, X_strides.data(), Y_dims.data(), x, y); \
}
DEFINE_TRANSPOSE_FUNC(bool);
DEFINE_TRANSPOSE_FUNC(uint8_t);
DEFINE_TRANSPOSE_FUNC(int8_t);
DEFINE_TRANSPOSE_FUNC(int);
DEFINE_TRANSPOSE_FUNC(int64_t);
DEFINE_TRANSPOSE_FUNC(float16);
DEFINE_TRANSPOSE_FUNC(float);
DEFINE_TRANSPOSE_FUNC(double);
#undef DEFINE_TRANSPOSE_FUNC
} // namespace math
} // namespace dragon
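The CPU launcher above delegates axis folding to utils::CollapseTransposeAxes and stride setup to utils::ComputeTransposeStrides. As a rough sketch of the quantities it ends up passing to _Transpose, the hypothetical helper below (not the library's API) reproduces the bookkeeping while ignoring axis collapsing:

#include <cstdint>
#include <vector>

// X_strides[i] = row-major stride of input dim axes[i]; Y_dims[i] = dims[axes[i]].
void TransposeArgs(
    const std::vector<int64_t>& dims,
    const std::vector<int64_t>& axes,
    std::vector<int64_t>* x_strides,
    std::vector<int64_t>* y_dims) {
  const int n = static_cast<int>(dims.size());
  std::vector<int64_t> strides(n, 1);
  for (int i = n - 2; i >= 0; --i) strides[i] = strides[i + 1] * dims[i + 1];
  x_strides->resize(n);
  y_dims->resize(n);
  for (int i = 0; i < n; ++i) {
    (*x_strides)[i] = strides[axes[i]];
    (*y_dims)[i] = dims[axes[i]];
  }
}
// Example: dims = {2, 3, 4}, axes = {0, 2, 1} gives x_strides = {12, 1, 4}
// and y_dims = {2, 4, 3}; CollapseTransposeAxes would additionally merge
// axes that remain adjacent under the permutation.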
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/math/transpose.h"
#include "dragon/utils/math/utils.h"
namespace dragon {
namespace math {
namespace {
constexpr int kTileDim = 32;
constexpr int kBlockRows = 8;
template <typename T, int D>
__global__ void _Transpose(
const int N,
const SimpleArray<int, D> X_strides,
const SimpleArray<int, D> Y_dims,
const T* x,
T* y) {
CUDA_1D_KERNEL_LOOP(yi, N) {
int xi = 0, tmp = yi;
#pragma unroll
for (int d = D - 1; d >= 0; --d) {
int r;
FIXED_DIVISOR_DIV_MOD(Y_dims.data[d], tmp, &tmp, &r);
xi += r * X_strides.data[d];
}
y[yi] = x[xi];
}
}
template <typename T>
__global__ void _BatchTranspose2D(
const int H,
const int W,
const int dh,
const int dw,
const T* X,
T* Y) {
__shared__ T block[kTileDim][kTileDim + 1];
const int k = blockIdx.x % (dh * dw);
const int r = k / dw;
const int c = k % dw;
const int offset = blockIdx.x / (dh * dw) * H * W;
int x = c * kTileDim + threadIdx.x;
int y = r * kTileDim + threadIdx.y;
if (x < W) {
for (int i = 0; threadIdx.y + i < kTileDim && y + i < H; i += kBlockRows) {
block[threadIdx.y + i][threadIdx.x] = X[offset + (y + i) * W + x];
}
}
__syncthreads();
x = r * kTileDim + threadIdx.x;
y = c * kTileDim + threadIdx.y;
if (x < H) {
for (int i = 0; threadIdx.y + i < kTileDim && y + i < W; i += kBlockRows) {
Y[offset + (y + i) * H + x] = block[threadIdx.x][threadIdx.y + i];
}
}
}
template <typename T, int D>
void _TransposeImpl(
const int64_t* x_strides,
const int64_t* y_dims,
const T* x,
T* y,
CUDAContext* ctx) {
const auto N =
std::accumulate(y_dims, y_dims + D, 1, std::multiplies<int64_t>());
SimpleArray<int, D> X_strides, Y_dims;
for (int i = 0; i < D; ++i) {
X_strides.data[i] = x_strides[i];
Y_dims.data[i] = y_dims[i];
}
_Transpose<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, X_strides, Y_dims, x, y);
}
} // namespace
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_TRANSPOSE_FUNC(T) \
template <> \
void Transpose<T, CUDAContext>( \
const int num_dims, \
const int64_t* dims, \
const int64_t* axes, \
const T* x, \
T* y, \
CUDAContext* ctx) { \
vec64_t new_dims, new_axes; \
utils::CollapseTransposeAxes(num_dims, dims, axes, new_dims, new_axes); \
const int num_axes = new_dims.size(); \
if (num_axes == 3 && new_axes == vec64_t({0, 2, 1})) { \
const auto N = new_dims[0], H = new_dims[1], W = new_dims[2]; \
const auto dh = utils::DivUp<int64_t>(H, kTileDim); \
const auto dw = utils::DivUp<int64_t>(W, kTileDim); \
_BatchTranspose2D<<< \
N * dh * dw, \
dim3(kTileDim, kBlockRows), \
0, \
ctx->cuda_stream()>>>(H, W, dh, dw, x, y); \
return; \
} \
CUDA_TENSOR_DIMS_CHECK(num_axes); \
vec64_t X_strides(num_axes), Y_dims(num_axes); \
utils::ComputeTransposeStrides( \
num_axes, new_dims.data(), new_axes.data(), X_strides.data()); \
for (int i = 0; i < num_axes; ++i) { \
Y_dims[i] = new_dims[new_axes[i]]; \
} \
DISPATCH_FUNC_BY_VALUE_WITH_TYPE_1( \
_TransposeImpl, \
T, \
num_axes, \
X_strides.data(), \
Y_dims.data(), \
x, \
y, \
ctx); \
}
DEFINE_TRANSPOSE_FUNC(bool);
DEFINE_TRANSPOSE_FUNC(uint8_t);
DEFINE_TRANSPOSE_FUNC(int8_t);
DEFINE_TRANSPOSE_FUNC(int);
DEFINE_TRANSPOSE_FUNC(int64_t);
DEFINE_TRANSPOSE_FUNC(float16);
DEFINE_TRANSPOSE_FUNC(float);
DEFINE_TRANSPOSE_FUNC(double);
#undef DEFINE_TRANSPOSE_FUNC
} // namespace math
} // namespace dragon
#endif // USE_CUDA
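A brief note on the fast path above: when the collapsed permutation is {0, 2, 1}, the transpose reduces to N independent H x W matrix transposes, and _BatchTranspose2D handles it with 32 x 32 shared-memory tiles (the +1 column is the usual padding to avoid bank conflicts) and 8-row thread blocks, so each thread copies 4 rows of its tile. For example, with H = W = 100 and N batch items, dh = dw = DivUp(100, 32) = 4, giving 16 tiles per item and a grid of N * 16 blocks of dim3(32, 8) threads.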
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef DRAGON_UTILS_MATH_TRANSPOSE_H_
#define DRAGON_UTILS_MATH_TRANSPOSE_H_
#include "dragon/core/context.h"
namespace dragon {
namespace math {
template <typename T, class Context>
DRAGON_API void Transpose(
const int num_dims,
const int64_t* dims,
const int64_t* axes,
const T* x,
T* y,
Context* ctx);
} // namespace math
} // namespace dragon
#endif // DRAGON_UTILS_MATH_TRANSPOSE_H_
@@ -201,6 +201,11 @@ inline __device__ half2 Cube(half2 x) {
*/
template <typename T>
+ inline T DivUp(const T a, const T b) {
+ return (a + b - T(1)) / b;
+ }
+
+ template <typename T>
inline void ArgPartition(
const int count,
const int kth,
......
@@ -20,6 +20,7 @@
#include "dragon/utils/math/functional.h"
#include "dragon/utils/math/random.h"
#include "dragon/utils/math/reduce.h"
+ #include "dragon/utils/math/transpose.h"
#include "dragon/utils/math/utils.h"
#endif // DRAGON_UTILS_MATH_FUNCTIONS_H_
@@ -596,15 +596,6 @@ void TileGrad(
Context* ctx);
template <typename T, class Context>
- void Transpose(
- const int num_dims,
- const int64_t* x_strides,
- const int64_t* y_dims,
- const T* x,
- T* y,
- Context* ctx);
-
- template <typename T, class Context>
void TopK(
const int N,
const int S,
......
@@ -44,6 +44,8 @@ class TestTensor(unittest.TestCase):
        c.backward()
        self.assertEqual(a.is_leaf, True)
        self.assertEqual(a.is_floating_point(), True)
+       self.assertEqual(a.is_contiguous(), True)
+       self.assertEqual(a.contiguous().is_contiguous(), True)
        self.assertEqual(a.volatile, False)
        self.assertEqual(a.numel(), 1)
        self.assertEqual(a.grad_fn, None)
......
@@ -772,6 +772,17 @@ class Tensor(object):
        """

+   def contiguous(self):
+       """Return a tensor with contiguous memory.
+
+       Returns
+       -------
+       dragon.vm.torch.Tensor
+           The output tensor.
+
+       """
+       return self
+
    def copy_(self, src):
        """Copy the elements into this tensor.
@@ -1341,6 +1352,17 @@ class Tensor(object):
        """

+   def is_contiguous(self):
+       """Return whether the memory is contiguous.
+
+       Returns
+       -------
+       bool
+           ``True`` if the memory is contiguous otherwise ``False``.
+
+       """
+       return True
+
    def is_floating_point(self):
        """Return whether the data type is floating.
......