Commit 77dcd71d by Ting PAN

Fix corruption of shared workspace data

Summary:
This commit backs workspace data with uniquely named tensors,
so that an operator and the kernels it invokes no longer corrupt
each other's shared scratch buffers.
1 parent e83c407a
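The root cause is name-based aliasing: Workspace::data() hands out scratch memory backed by a tensor looked up by name, so an operator and a kernel that both ask for the default buffer receive the same storage and silently overwrite each other. The patch threads a name parameter through so nested callers can request distinct tensors ("data:0" vs. "data:1"). Below is a minimal, self-contained sketch of the hazard and the fix; the toy Workspace is hypothetical and only mirrors the shape of the patched API.

// Sketch of the aliasing hazard this commit fixes. This toy Workspace is
// hypothetical; like the real one, it keys scratch buffers by tensor name,
// so two callers using the same default name share the same storage.
#include <cstdint>
#include <iostream>
#include <map>
#include <numeric>
#include <string>
#include <vector>

class Workspace {
 public:
  // Return one raw pointer per segment, all carved out of a single buffer
  // identified by "/share/buffer/" + name (mirroring the patched header).
  std::vector<void*> data(const std::vector<size_t>& segments,
                          const std::string& name = "data:0") {
    size_t total =
        std::accumulate(segments.begin(), segments.end(), size_t(0));
    auto& storage = buffers_["/share/buffer/" + name];
    storage.resize(total);
    std::vector<void*> group(segments.size());
    size_t offset = 0;
    for (size_t i = 0; i < segments.size(); ++i) {
      group[i] = storage.data() + offset;
      offset += segments[i];
    }
    return group;
  }

 private:
  std::map<std::string, std::vector<uint8_t>> buffers_;
};

int main() {
  Workspace ws;
  // Operator-level scratch uses the default name...
  void* op_scratch = ws.data({1024})[0];
  // ...and a kernel that also asked for the default name would alias it,
  // clobbering the operator's data. Requesting "data:1" instead (as the
  // patched CUB call sites do) yields a distinct tensor.
  void* kernel_scratch = ws.data({1024}, "data:1")[0];
  std::cout << "aliased: " << (op_scratch == kernel_scratch ? "yes" : "no")
            << std::endl;  // prints "no" after the fix
  return 0;
}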
@@ -65,11 +65,11 @@ UniqueName
 data
 ####
-.. doxygenfunction:: dragon::Workspace::data(const vector<size_t> &segments)
+.. doxygenfunction:: dragon::Workspace::data(const vector<size_t> &segments, const string &name = "data:0")
 data
 ####
-.. doxygenfunction:: dragon::Workspace::data(const vector<int64_t> &segments)
+.. doxygenfunction:: dragon::Workspace::data(const vector<int64_t> &segments, const string &name = "data:0")
 graphs
 ######
......
@@ -87,9 +87,11 @@ class DRAGON_API Workspace {
   /*! \brief Return a group of the shared raw data */
   template <class Context>
-  vector<void*> data(const vector<size_t>& segments) {
+  vector<void*> data(
+      const vector<size_t>& segments,
+      const string& name = "data:0") {
     vector<void*> group(segments.size());
-    group[0] = CreateTensor("/share/data")
+    group[0] = CreateTensor("/share/buffer/" + name)
                    ->Reshape({(int64_t)std::accumulate(
                        segments.begin(), segments.end(), size_t(0))})
                    ->template mutable_data<uint8_t, Context>();
@@ -101,13 +103,15 @@ class DRAGON_API Workspace {
   /*! \brief Return a group of shared typed data */
   template <typename T, class Context>
-  vector<T*> data(const vector<int64_t>& segments) {
+  vector<T*> data(
+      const vector<int64_t>& segments,
+      const string& name = "data:0") {
     vector<T*> group(segments.size());
     vector<size_t> segments_v2;
     for (const auto size : segments) {
       segments_v2.push_back(size * sizeof(T));
     }
-    auto group_v2 = data<Context>(segments_v2);
+    auto group_v2 = data<Context>(segments_v2, name);
     for (int i = 0; i < segments.size(); ++i) {
       group[i] = (T*)group_v2[i];
     }
......
@@ -54,7 +54,8 @@ __global__ void _UnravelIndex(
       count, \
       ctx->cuda_stream()); \
   cub::DeviceSelect::Flagged( \
-      ctx->workspace()->template data<CUDAContext>({ws_nbytes})[0], \
+      ctx->workspace()->template data<CUDAContext>( \
+          {ws_nbytes}, "data:1")[0], \
       ws_nbytes, \
       itr, \
       mask, \
......
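For context, the macro above follows CUB's standard two-phase calling convention: a first call with a null scratch pointer only reports the required byte count, and a second call performs the selection using caller-provided scratch, which must not alias any buffer that is still live (hence "data:1" here). A self-contained CUDA sketch of that pattern; the inputs are illustrative, and plain cudaMalloc stands in for the workspace lookup Dragon actually uses:

// Two-phase cub::DeviceSelect::Flagged: query the scratch size, then run.
#include <cub/cub.cuh>
#include <cstdio>

int main() {
  const int count = 8;
  int h_in[count] = {1, 2, 3, 4, 5, 6, 7, 8};
  char h_flags[count] = {1, 0, 1, 0, 1, 0, 1, 0};
  int *d_in, *d_out, *d_num_out;
  char* d_flags;
  cudaMalloc(&d_in, count * sizeof(int));
  cudaMalloc(&d_out, count * sizeof(int));
  cudaMalloc(&d_num_out, sizeof(int));
  cudaMalloc(&d_flags, count);
  cudaMemcpy(d_in, h_in, count * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_flags, h_flags, count, cudaMemcpyHostToDevice);
  // Phase 1: null temp pointer -> CUB writes the required size to ws_nbytes.
  void* d_temp = nullptr;
  size_t ws_nbytes = 0;
  cub::DeviceSelect::Flagged(
      d_temp, ws_nbytes, d_in, d_flags, d_out, d_num_out, count);
  // Phase 2: provide scratch that must not alias any live buffer
  // (in Dragon, the workspace tensor named "data:1").
  cudaMalloc(&d_temp, ws_nbytes);
  cub::DeviceSelect::Flagged(
      d_temp, ws_nbytes, d_in, d_flags, d_out, d_num_out, count);
  int num_out = 0;
  cudaMemcpy(&num_out, d_num_out, sizeof(int), cudaMemcpyDeviceToHost);
  printf("selected %d items\n", num_out);  // expect 4
  cudaFree(d_in); cudaFree(d_out); cudaFree(d_num_out);
  cudaFree(d_flags); cudaFree(d_temp);
  return 0;
}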
@@ -204,11 +204,11 @@ __global__ void _SelectViaDeviceSort(
     const int rows = outer_dim * inner_dim; \
     const int cols = axis_dim; \
     if (rows == 1 || cols > CUDA_THREADS * 8) { \
-      const int input_count = outer_dim * inner_dim * axis_dim; \
-      const int output_count = outer_dim * inner_dim * select_dim; \
+      const int in_count = outer_dim * inner_dim * axis_dim; \
+      const int out_count = outer_dim * inner_dim * select_dim; \
       auto data = ctx->workspace()->template data<CUDAContext>( \
-          {input_count * sizeof(T1), input_count * sizeof(int64_t)}); \
-      math::Copy(input_count, x, (T1*)data[0], ctx); \
+          {in_count * sizeof(T1), in_count * sizeof(int64_t)}, "data:1"); \
+      math::Copy(in_count, x, (T1*)data[0], ctx); \
       _DeviceSort( \
           outer_dim, \
           inner_dim, \
@@ -218,15 +218,15 @@ __global__ void _SelectViaDeviceSort(
           (int64_t*)data[1], \
           ctx); \
       if (rows == 1) { \
-        math::Copy(output_count, (T1*)data[0], value, ctx); \
-        math::Copy(output_count, (int64_t*)data[1], index, ctx); \
+        math::Copy(out_count, (T1*)data[0], value, ctx); \
+        math::Copy(out_count, (int64_t*)data[1], index, ctx); \
       } else { \
         _SelectViaDeviceSort<<< \
-            CUDA_BLOCKS(output_count), \
+            CUDA_BLOCKS(out_count), \
             CUDA_THREADS, \
             0, \
             ctx->cuda_stream()>>>( \
-            output_count, \
+            out_count, \
             axis_dim, \
             inner_dim, \
             select_dim, \
......
@@ -20,8 +20,9 @@ __global__ void _SigmoidCrossEntropy(
     if (target[i] < 0) {
       loss[i] = mask[i] = T(0);
     } else {
-      loss[i] = log(T(1) + exp(logit[i] - T(2) * logit[i] * (logit[i] >= 0))) +
-          logit[i] * ((logit[i] >= 0) - target[i]);
+      const T lgt = logit[i];
+      loss[i] = log(T(1) + exp(lgt - T(2) * lgt * T(lgt >= 0))) +
+          lgt * (T(lgt >= 0) - target[i]);
       mask[i] = T(1);
     }
   }
......
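For reference, the rewritten kernel still computes the numerically stable form of sigmoid cross-entropy. Writing x for the logit, t for the target, and s = 1[x >= 0], the expression satisfies

$$\log(1 + e^{x - 2xs}) + x(s - t) = \max(x, 0) - xt + \log(1 + e^{-|x|}),$$

which never exponentiates a large positive value. Hoisting logit[i] into a local and casting the comparison through T() keeps the whole expression in the template type instead of mixing in raw bool operands.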
@@ -75,7 +75,7 @@ void UpdateOpBase<Context>::RunOnDevice() {
     ApplyUpdate<float>(&dX, X);
   } else if (dX.template IsType<float16>()) {
     auto* X_master = workspace()->CreateTensor(X->name() + "[float32]");
-    auto* dX_copy = ctx()->workspace()->CreateTensor("/share/data");
+    auto* dX_copy = ctx()->workspace()->CreateTensor("/share/buffer/data:0");
     if (X_master->count() != X->count()) {
       math::Cast(
           X->count(),
......
@@ -116,7 +116,7 @@ __global__ void _GenericReduce(
       cast::to<T>(init), \
       ctx->cuda_stream()); \
   cub::DeviceReduce::Reduce( \
-      ctx->workspace()->data<CUDAContext>({ws_nbytes})[0], \
+      ctx->workspace()->data<CUDAContext>({ws_nbytes}, "data:1")[0], \
       ws_nbytes, \
       x, \
       y, \
......