Commit 77dcd71d by Ting PAN

Fix corruption of shared workspace data

Summary:
This commit backs workspace data with uniquely named tensors,
so operator-level and kernel-level scratch buffers no longer
overwrite each other.
1 parent e83c407a
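
For illustration, the sketch below is a minimal, self-contained model of the mechanism; it is not Dragon's implementation, and `ToyWorkspace` and everything in it are hypothetical. Workspace scratch is looked up by a buffer name, so operators keep the default "data:0" while kernels ask for "data:1", and one request can no longer reallocate or overwrite the other's memory.

// Minimal sketch (assumption: a simplified model, not dragon::Workspace).
#include <cstddef>
#include <cstdint>
#include <map>
#include <numeric>
#include <string>
#include <vector>

class ToyWorkspace {  // hypothetical stand-in for dragon::Workspace
 public:
  // Carve `segments` (sizes in bytes) out of one shared buffer chosen by `name`.
  std::vector<void*> data(const std::vector<std::size_t>& segments,
                          const std::string& name = "data:0") {
    const std::size_t total =
        std::accumulate(segments.begin(), segments.end(), std::size_t(0));
    auto& buffer = buffers_["/share/buffer/" + name];
    if (buffer.size() < total) buffer.resize(total);  // may reallocate
    std::vector<void*> group(segments.size());
    std::size_t offset = 0;
    for (std::size_t i = 0; i < segments.size(); ++i) {
      group[i] = buffer.data() + offset;
      offset += segments[i];
    }
    return group;
  }

 private:
  std::map<std::string, std::vector<std::uint8_t>> buffers_;
};

int main() {
  ToyWorkspace ws;
  auto op_scratch = ws.data({1024});                // operator scratch -> "data:0"
  auto kernel_scratch = ws.data({4096}, "data:1");  // kernel scratch  -> "data:1"
  // With a single shared name, the larger second request would have resized the
  // underlying buffer and invalidated `op_scratch`; with unique names it cannot.
  return (op_scratch.empty() || kernel_scratch.empty()) ? 1 : 0;
}
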
@@ -65,11 +65,11 @@ UniqueName
 data
 ####
-.. doxygenfunction:: dragon::Workspace::data(const vector<size_t> &segments)
+.. doxygenfunction:: dragon::Workspace::data(const vector<size_t> &segments, const string &name = "data:0")
 data
 ####
-.. doxygenfunction:: dragon::Workspace::data(const vector<int64_t> &segments)
+.. doxygenfunction:: dragon::Workspace::data(const vector<int64_t> &segments, const string &name = "data:0")
 graphs
 ######
......
@@ -87,9 +87,11 @@ class DRAGON_API Workspace {
   /*! \brief Return a group of the shared raw data */
   template <class Context>
-  vector<void*> data(const vector<size_t>& segments) {
+  vector<void*> data(
+      const vector<size_t>& segments,
+      const string& name = "data:0") {
     vector<void*> group(segments.size());
-    group[0] = CreateTensor("/share/data")
+    group[0] = CreateTensor("/share/buffer/" + name)
                    ->Reshape({(int64_t)std::accumulate(
                        segments.begin(), segments.end(), size_t(0))})
                    ->template mutable_data<uint8_t, Context>();
@@ -101,13 +103,15 @@ class DRAGON_API Workspace {
   /*! \brief Return a group of shared typed data */
   template <typename T, class Context>
-  vector<T*> data(const vector<int64_t>& segments) {
+  vector<T*> data(
+      const vector<int64_t>& segments,
+      const string& name = "data:0") {
     vector<T*> group(segments.size());
     vector<size_t> segments_v2;
     for (const auto size : segments) {
       segments_v2.push_back(size * sizeof(T));
     }
-    auto group_v2 = data<Context>(segments_v2);
+    auto group_v2 = data<Context>(segments_v2, name);
     for (int i = 0; i < segments.size(); ++i) {
       group[i] = (T*)group_v2[i];
     }
......
@@ -32,44 +32,45 @@ __global__ void _UnravelIndex(
 /* ------------------- Launcher Separator ------------------- */
 #define DEFINE_KERNEL_LAUNCHER(IndexType) \
   template <> \
   void Flagged<IndexType, CUDAContext>( \
       const int count, \
       const uint8_t* mask, \
       IndexType* index, \
       int* num_selected, \
       CUDAContext* ctx) { \
     IndexType num_selected_host; \
     auto* num_selected_dev = index + count; \
     size_t ws_nbytes = 0; \
     cub::CountingInputIterator<int> itr(0); \
     cub::DeviceSelect::Flagged( \
         nullptr, \
         ws_nbytes, \
         itr, \
         mask, \
         index, \
         static_cast<int64_t*>(nullptr), \
         count, \
         ctx->cuda_stream()); \
     cub::DeviceSelect::Flagged( \
-        ctx->workspace()->template data<CUDAContext>({ws_nbytes})[0], \
+        ctx->workspace()->template data<CUDAContext>( \
+            {ws_nbytes}, "data:1")[0], \
         ws_nbytes, \
         itr, \
         mask, \
         index, \
         num_selected_dev, \
         count, \
         ctx->cuda_stream()); \
     CUDA_CHECK(cudaMemcpyAsync( \
         &num_selected_host, \
         num_selected_dev, \
         sizeof(IndexType), \
         cudaMemcpyDefault, \
         ctx->cuda_stream())); \
     ctx->FinishDeviceComputation(); \
     num_selected[0] = num_selected_host; \
   }
 DEFINE_KERNEL_LAUNCHER(int);
......
@@ -189,56 +189,56 @@ __global__ void _SelectViaDeviceSort(
         << ") to launch the cuda kernel"; \
   }
 #define DEFINE_KERNEL_LAUNCHER(T1, T2, kLowest, kMax) \
   template <> \
   void TopSelect<T1, CUDAContext>( \
       const int outer_dim, \
       const int inner_dim, \
       const int axis_dim, \
       const int select_dim, \
       const int largest, \
       const T1* x, \
       T1* value, \
       int64_t* index, \
       CUDAContext* ctx) { \
     const int rows = outer_dim * inner_dim; \
     const int cols = axis_dim; \
     if (rows == 1 || cols > CUDA_THREADS * 8) { \
-      const int input_count = outer_dim * inner_dim * axis_dim; \
-      const int output_count = outer_dim * inner_dim * select_dim; \
+      const int in_count = outer_dim * inner_dim * axis_dim; \
+      const int out_count = outer_dim * inner_dim * select_dim; \
       auto data = ctx->workspace()->template data<CUDAContext>( \
-          {input_count * sizeof(T1), input_count * sizeof(int64_t)}); \
-      math::Copy(input_count, x, (T1*)data[0], ctx); \
+          {in_count * sizeof(T1), in_count * sizeof(int64_t)}, "data:1"); \
+      math::Copy(in_count, x, (T1*)data[0], ctx); \
       _DeviceSort( \
           outer_dim, \
           inner_dim, \
           axis_dim, \
           largest, \
           (T1*)data[0], \
           (int64_t*)data[1], \
           ctx); \
       if (rows == 1) { \
-        math::Copy(output_count, (T1*)data[0], value, ctx); \
-        math::Copy(output_count, (int64_t*)data[1], index, ctx); \
+        math::Copy(out_count, (T1*)data[0], value, ctx); \
+        math::Copy(out_count, (int64_t*)data[1], index, ctx); \
       } else { \
         _SelectViaDeviceSort<<< \
-            CUDA_BLOCKS(output_count), \
+            CUDA_BLOCKS(out_count), \
             CUDA_THREADS, \
             0, \
             ctx->cuda_stream()>>>( \
-            output_count, \
+            out_count, \
             axis_dim, \
             inner_dim, \
             select_dim, \
             (T1*)data[0], \
             (int64_t*)data[1], \
             value, \
             index); \
       } \
       return; \
     } \
     T2 init = largest > 0 ? kLowest : kMax; \
     PLACE_BLOCK_SORT_CASES(T2); \
   }
 DEFINE_KERNEL_LAUNCHER(
......
@@ -20,8 +20,9 @@ __global__ void _SigmoidCrossEntropy(
     if (target[i] < 0) {
       loss[i] = mask[i] = T(0);
     } else {
-      loss[i] = log(T(1) + exp(logit[i] - T(2) * logit[i] * (logit[i] >= 0))) +
-          logit[i] * ((logit[i] >= 0) - target[i]);
+      const T lgt = logit[i];
+      loss[i] = log(T(1) + exp(lgt - T(2) * lgt * T(lgt >= 0))) +
+          lgt * (T(lgt >= 0) - target[i]);
       mask[i] = T(1);
     }
   }
......
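
For reference, the loss expression in the kernel above (writing z for logit[i], y for target[i], and s = 1 if z >= 0 else 0; this notation is mine, not the source's) is the numerically stable form of sigmoid cross entropy:

$$ \ell = \log\!\left(1 + e^{\,z - 2zs}\right) + z\,(s - y) = \max(z, 0) - z\,y + \log\!\left(1 + e^{-|z|}\right), $$

which equals log(1 + e^z) - z*y but never exponentiates a positive argument, so it cannot overflow for large |z|. The diff only hoists logit[i] into the local lgt and makes the bool-to-T conversions explicit; the value computed is unchanged.
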
@@ -75,7 +75,7 @@ void UpdateOpBase<Context>::RunOnDevice() {
     ApplyUpdate<float>(&dX, X);
   } else if (dX.template IsType<float16>()) {
     auto* X_master = workspace()->CreateTensor(X->name() + "[float32]");
-    auto* dX_copy = ctx()->workspace()->CreateTensor("/share/data");
+    auto* dX_copy = ctx()->workspace()->CreateTensor("/share/buffer/data:0");
     if (X_master->count() != X->count()) {
       math::Cast(
           X->count(),
......
@@ -116,7 +116,7 @@ __global__ void _GenericReduce(
           cast::to<T>(init), \
           ctx->cuda_stream()); \
       cub::DeviceReduce::Reduce( \
-          ctx->workspace()->data<CUDAContext>({ws_nbytes})[0], \
+          ctx->workspace()->data<CUDAContext>({ws_nbytes}, "data:1")[0], \
           ws_nbytes, \
           x, \
           y, \
......