Commit 77dcd71d by Ting PAN

Fix bug of sharing corrupted workspace data

Summary:
This commit provides workspace data through uniquely named tensors
to avoid corruption between operators and kernels.
1 parent e83c407a
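
A note on the failure mode before the diff itself: previously every data()
request was backed by the single tensor "/share/data", so an operator's
scratch buffer and a kernel's scratch buffer (e.g. CUB temporary storage)
aliased the same storage, and a later, larger request could reshape it out
from under an earlier caller. The sketch below is a minimal standalone model
of the idea, not Dragon code: ToyWorkspace is hypothetical, but its
data(segments, name) shape mirrors the overload added in this commit, keying
one backing buffer per name.

```cpp
#include <cstddef>
#include <cstdint>
#include <numeric>
#include <string>
#include <unordered_map>
#include <vector>

class ToyWorkspace {
 public:
  // Mirrors Workspace::data(segments, name): one backing buffer per name,
  // grown on demand, one pointer returned per requested segment.
  std::vector<void*> data(const std::vector<size_t>& segments,
                          const std::string& name = "data:0") {
    auto& buf = buffers_["/share/buffer/" + name];
    size_t total =
        std::accumulate(segments.begin(), segments.end(), size_t(0));
    if (buf.size() < total) buf.resize(total);  // growth may move the storage
    std::vector<void*> group(segments.size());
    size_t offset = 0;
    for (size_t i = 0; i < segments.size(); ++i) {
      group[i] = buf.data() + offset;
      offset += segments[i];
    }
    return group;
  }

 private:
  std::unordered_map<std::string, std::vector<uint8_t>> buffers_;
};

int main() {
  ToyWorkspace ws;
  auto op_buf = ws.data({4096});                // operator scratch, "data:0"
  auto cub_buf = ws.data({1 << 20}, "data:1");  // kernel scratch, distinct name
  // With a single shared buffer, the second (larger) request would have
  // reallocated and invalidated op_buf[0]; with per-name buffers it cannot.
  return op_buf[0] != cub_buf[0] ? 0 : 1;
}
```

With one buffer per name, operators keep the default "data:0" while kernels
running inside them pass "data:1", which is the convention the hunks below adopt.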
@@ -65,11 +65,11 @@ UniqueName
 data
 ####
-.. doxygenfunction:: dragon::Workspace::data(const vector<size_t> &segments)
+.. doxygenfunction:: dragon::Workspace::data(const vector<size_t> &segments, const string &name = "data:0")
 data
 ####
-.. doxygenfunction:: dragon::Workspace::data(const vector<int64_t> &segments)
+.. doxygenfunction:: dragon::Workspace::data(const vector<int64_t> &segments, const string &name = "data:0")
 graphs
 ######
......
@@ -87,9 +87,11 @@ class DRAGON_API Workspace {
   /*! \brief Return a group of the shared raw data */
   template <class Context>
-  vector<void*> data(const vector<size_t>& segments) {
+  vector<void*> data(
+      const vector<size_t>& segments,
+      const string& name = "data:0") {
     vector<void*> group(segments.size());
-    group[0] = CreateTensor("/share/data")
+    group[0] = CreateTensor("/share/buffer/" + name)
                    ->Reshape({(int64_t)std::accumulate(
                        segments.begin(), segments.end(), size_t(0))})
                    ->template mutable_data<uint8_t, Context>();
@@ -101,13 +103,15 @@ class DRAGON_API Workspace {
   /*! \brief Return a group of shared typed data */
   template <typename T, class Context>
-  vector<T*> data(const vector<int64_t>& segments) {
+  vector<T*> data(
+      const vector<int64_t>& segments,
+      const string& name = "data:0") {
     vector<T*> group(segments.size());
     vector<size_t> segments_v2;
     for (const auto size : segments) {
       segments_v2.push_back(size * sizeof(T));
     }
-    auto group_v2 = data<Context>(segments_v2);
+    auto group_v2 = data<Context>(segments_v2, name);
     for (int i = 0; i < segments.size(); ++i) {
       group[i] = (T*)group_v2[i];
     }
......
@@ -32,44 +32,45 @@ __global__ void _UnravelIndex(
 /* ------------------- Launcher Separator ------------------- */
-#define DEFINE_KERNEL_LAUNCHER(IndexType) \
-  template <> \
-  void Flagged<IndexType, CUDAContext>( \
-      const int count, \
-      const uint8_t* mask, \
-      IndexType* index, \
-      int* num_selected, \
-      CUDAContext* ctx) { \
-    IndexType num_selected_host; \
-    auto* num_selected_dev = index + count; \
-    size_t ws_nbytes = 0; \
-    cub::CountingInputIterator<int> itr(0); \
-    cub::DeviceSelect::Flagged( \
-        nullptr, \
-        ws_nbytes, \
-        itr, \
-        mask, \
-        index, \
-        static_cast<int64_t*>(nullptr), \
-        count, \
-        ctx->cuda_stream()); \
-    cub::DeviceSelect::Flagged( \
-        ctx->workspace()->template data<CUDAContext>({ws_nbytes})[0], \
-        ws_nbytes, \
-        itr, \
-        mask, \
-        index, \
-        num_selected_dev, \
-        count, \
-        ctx->cuda_stream()); \
-    CUDA_CHECK(cudaMemcpyAsync( \
-        &num_selected_host, \
-        num_selected_dev, \
-        sizeof(IndexType), \
-        cudaMemcpyDefault, \
-        ctx->cuda_stream())); \
-    ctx->FinishDeviceComputation(); \
-    num_selected[0] = num_selected_host; \
+#define DEFINE_KERNEL_LAUNCHER(IndexType) \
+  template <> \
+  void Flagged<IndexType, CUDAContext>( \
+      const int count, \
+      const uint8_t* mask, \
+      IndexType* index, \
+      int* num_selected, \
+      CUDAContext* ctx) { \
+    IndexType num_selected_host; \
+    auto* num_selected_dev = index + count; \
+    size_t ws_nbytes = 0; \
+    cub::CountingInputIterator<int> itr(0); \
+    cub::DeviceSelect::Flagged( \
+        nullptr, \
+        ws_nbytes, \
+        itr, \
+        mask, \
+        index, \
+        static_cast<int64_t*>(nullptr), \
+        count, \
+        ctx->cuda_stream()); \
+    cub::DeviceSelect::Flagged( \
+        ctx->workspace()->template data<CUDAContext>( \
+            {ws_nbytes}, "data:1")[0], \
+        ws_nbytes, \
+        itr, \
+        mask, \
+        index, \
+        num_selected_dev, \
+        count, \
+        ctx->cuda_stream()); \
+    CUDA_CHECK(cudaMemcpyAsync( \
+        &num_selected_host, \
+        num_selected_dev, \
+        sizeof(IndexType), \
+        cudaMemcpyDefault, \
+        ctx->cuda_stream())); \
+    ctx->FinishDeviceComputation(); \
+    num_selected[0] = num_selected_host; \
   }
 DEFINE_KERNEL_LAUNCHER(int);
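
The two cub::DeviceSelect::Flagged calls above follow CUB's standard
two-pass pattern: the first call, given a null temp-storage pointer, only
writes the required scratch size into ws_nbytes; the second performs the
selection using scratch now taken from the workspace under "data:1". A
minimal standalone illustration of the pattern (plain cudaMalloc standing
in for the workspace; values are illustrative):

```cpp
#include <cub/cub.cuh>
#include <cstdio>

int main() {
  const int count = 8;
  int h_in[count] = {10, 11, 12, 13, 14, 15, 16, 17};
  unsigned char h_mask[count] = {1, 0, 1, 0, 1, 0, 1, 0};
  int *d_in, *d_out, *d_num;
  unsigned char* d_mask;
  cudaMalloc(&d_in, count * sizeof(int));
  cudaMalloc(&d_out, count * sizeof(int));
  cudaMalloc(&d_num, sizeof(int));
  cudaMalloc(&d_mask, count);
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
  cudaMemcpy(d_mask, h_mask, sizeof(h_mask), cudaMemcpyHostToDevice);
  size_t ws_nbytes = 0;
  // Pass 1: null temp storage -> only computes the required scratch size.
  cub::DeviceSelect::Flagged(
      nullptr, ws_nbytes, d_in, d_mask, d_out, d_num, count);
  void* d_ws = nullptr;
  cudaMalloc(&d_ws, ws_nbytes);
  // Pass 2: run the selection for real with the allocated scratch.
  cub::DeviceSelect::Flagged(
      d_ws, ws_nbytes, d_in, d_mask, d_out, d_num, count);
  int h_num = 0;
  cudaMemcpy(&h_num, d_num, sizeof(int), cudaMemcpyDeviceToHost);
  printf("selected %d items\n", h_num);  // expect 4
  cudaFree(d_in); cudaFree(d_out); cudaFree(d_num);
  cudaFree(d_mask); cudaFree(d_ws);
  return 0;
}
```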
......
@@ -189,56 +189,56 @@ __global__ void _SelectViaDeviceSort(
         << ") to launch the cuda kernel"; \
   }
-#define DEFINE_KERNEL_LAUNCHER(T1, T2, kLowest, kMax) \
-  template <> \
-  void TopSelect<T1, CUDAContext>( \
-      const int outer_dim, \
-      const int inner_dim, \
-      const int axis_dim, \
-      const int select_dim, \
-      const int largest, \
-      const T1* x, \
-      T1* value, \
-      int64_t* index, \
-      CUDAContext* ctx) { \
-    const int rows = outer_dim * inner_dim; \
-    const int cols = axis_dim; \
-    if (rows == 1 || cols > CUDA_THREADS * 8) { \
-      const int input_count = outer_dim * inner_dim * axis_dim; \
-      const int output_count = outer_dim * inner_dim * select_dim; \
-      auto data = ctx->workspace()->template data<CUDAContext>( \
-          {input_count * sizeof(T1), input_count * sizeof(int64_t)}); \
-      math::Copy(input_count, x, (T1*)data[0], ctx); \
-      _DeviceSort( \
-          outer_dim, \
-          inner_dim, \
-          axis_dim, \
-          largest, \
-          (T1*)data[0], \
-          (int64_t*)data[1], \
-          ctx); \
-      if (rows == 1) { \
-        math::Copy(output_count, (T1*)data[0], value, ctx); \
-        math::Copy(output_count, (int64_t*)data[1], index, ctx); \
-      } else { \
-        _SelectViaDeviceSort<<< \
-            CUDA_BLOCKS(output_count), \
-            CUDA_THREADS, \
-            0, \
-            ctx->cuda_stream()>>>( \
-            output_count, \
-            axis_dim, \
-            inner_dim, \
-            select_dim, \
-            (T1*)data[0], \
-            (int64_t*)data[1], \
-            value, \
-            index); \
-      } \
-      return; \
-    } \
-    T2 init = largest > 0 ? kLowest : kMax; \
-    PLACE_BLOCK_SORT_CASES(T2); \
+#define DEFINE_KERNEL_LAUNCHER(T1, T2, kLowest, kMax) \
+  template <> \
+  void TopSelect<T1, CUDAContext>( \
+      const int outer_dim, \
+      const int inner_dim, \
+      const int axis_dim, \
+      const int select_dim, \
+      const int largest, \
+      const T1* x, \
+      T1* value, \
+      int64_t* index, \
+      CUDAContext* ctx) { \
+    const int rows = outer_dim * inner_dim; \
+    const int cols = axis_dim; \
+    if (rows == 1 || cols > CUDA_THREADS * 8) { \
+      const int in_count = outer_dim * inner_dim * axis_dim; \
+      const int out_count = outer_dim * inner_dim * select_dim; \
+      auto data = ctx->workspace()->template data<CUDAContext>( \
+          {in_count * sizeof(T1), in_count * sizeof(int64_t)}, "data:1"); \
+      math::Copy(in_count, x, (T1*)data[0], ctx); \
+      _DeviceSort( \
+          outer_dim, \
+          inner_dim, \
+          axis_dim, \
+          largest, \
+          (T1*)data[0], \
+          (int64_t*)data[1], \
+          ctx); \
+      if (rows == 1) { \
+        math::Copy(out_count, (T1*)data[0], value, ctx); \
+        math::Copy(out_count, (int64_t*)data[1], index, ctx); \
+      } else { \
+        _SelectViaDeviceSort<<< \
+            CUDA_BLOCKS(out_count), \
+            CUDA_THREADS, \
+            0, \
+            ctx->cuda_stream()>>>( \
+            out_count, \
+            axis_dim, \
+            inner_dim, \
+            select_dim, \
+            (T1*)data[0], \
+            (int64_t*)data[1], \
+            value, \
+            index); \
+      } \
+      return; \
+    } \
+    T2 init = largest > 0 ? kLowest : kMax; \
+    PLACE_BLOCK_SORT_CASES(T2); \
 }
 DEFINE_KERNEL_LAUNCHER(
......
@@ -20,8 +20,9 @@ __global__ void _SigmoidCrossEntropy(
     if (target[i] < 0) {
       loss[i] = mask[i] = T(0);
     } else {
-      loss[i] = log(T(1) + exp(logit[i] - T(2) * logit[i] * (logit[i] >= 0))) +
-          logit[i] * ((logit[i] >= 0) - target[i]);
+      const T lgt = logit[i];
+      loss[i] = log(T(1) + exp(lgt - T(2) * lgt * T(lgt >= 0))) +
+          lgt * (T(lgt >= 0) - target[i]);
       mask[i] = T(1);
     }
   }
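
Aside from hoisting logit[i] into lgt and making the bool-to-T conversions
explicit, the expression is unchanged; for reference (my derivation, not part
of the diff), with logit x and target t it is the numerically stable form of
sigmoid cross-entropy:

$$\ell(x, t) = \log\bigl(1 + e^{\,x - 2x\,[x \ge 0]}\bigr) + x\bigl([x \ge 0] - t\bigr) = \max(x, 0) - x\,t + \log\bigl(1 + e^{-\lvert x \rvert}\bigr),$$

so exp never sees a positive argument and cannot overflow for large |x|.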
......
@@ -75,7 +75,7 @@ void UpdateOpBase<Context>::RunOnDevice() {
     ApplyUpdate<float>(&dX, X);
   } else if (dX.template IsType<float16>()) {
     auto* X_master = workspace()->CreateTensor(X->name() + "[float32]");
-    auto* dX_copy = ctx()->workspace()->CreateTensor("/share/data");
+    auto* dX_copy = ctx()->workspace()->CreateTensor("/share/buffer/data:0");
     if (X_master->count() != X->count()) {
       math::Cast(
           X->count(),
......
@@ -116,7 +116,7 @@ __global__ void _GenericReduce(
           cast::to<T>(init), \
           ctx->cuda_stream()); \
       cub::DeviceReduce::Reduce( \
-          ctx->workspace()->data<CUDAContext>({ws_nbytes})[0], \
+          ctx->workspace()->data<CUDAContext>({ws_nbytes}, "data:1")[0], \
           ws_nbytes, \
           x, \
           y, \
......