Commit 5d518b6c by Ting PAN

io refactoring

1 parent 31e02b2b
Showing with 502 additions and 430 deletions
...@@ -24,7 +24,7 @@ set(3RDPARTY_DIR ${PROJECT_SOURCE_DIR}/../3rdparty) ...@@ -24,7 +24,7 @@ set(3RDPARTY_DIR ${PROJECT_SOURCE_DIR}/../3rdparty)
set(PYTHON_DIR /usr/include/python2.7) # prefer set(PYTHON_DIR /usr/include/python2.7) # prefer
#set(PYTHON_DIR /usr/include/python3.x) # optional, set specific version #set(PYTHON_DIR /usr/include/python3.x) # optional, set specific version
#set(ANACONDA_DIR /xxx/anaconda) # optional, set specific version below if using py3 #set(ANACONDA_DIR /xxx/anaconda) # optional, set specific version below if using py3
set(NUMPY_DIR /xxx/numpy) # require root folder of numpy package set(NUMPY_DIR /xxx/numpy) # require, root folder of numpy package
# set CUDA compiling architecture # set CUDA compiling architecture
set(CUDA_ARCH -gencode arch=compute_20,code=sm_20 set(CUDA_ARCH -gencode arch=compute_20,code=sm_20
......
...@@ -52,7 +52,7 @@ class CPUContext{ ...@@ -52,7 +52,7 @@ class CPUContext{
inline static void Delete(void* data) { free(data); } inline static void Delete(void* data) { free(data); }
template<typename T, class DstContext, class SrcContext> template<typename T, class DstContext, class SrcContext>
inline static void Copy(int n, T* dst, const T* src){ inline static void Copy(int n, T* dst, const T* src) {
if (dst == src) return; if (dst == src) return;
// only the basic types(e.g. int/float) can memcpy correctly // only the basic types(e.g. int/float) can memcpy correctly
if (std::is_fundamental<T>::value) if (std::is_fundamental<T>::value)
......
...@@ -119,7 +119,7 @@ class CUDAContext { ...@@ -119,7 +119,7 @@ class CUDAContext {
inline static void Delete(void* data) { cudaFree(data); } inline static void Delete(void* data) { cudaFree(data); }
template<typename T, class DstContext, class SrcContext> template<typename T, class DstContext, class SrcContext>
static void Copy(int n, T* dst, const T* src){ static void Copy(int n, T* dst, const T* src) {
if (dst == src) return; if (dst == src) return;
Memcpy<SrcContext, DstContext>(n * sizeof(T), (void*)dst, (const void*)src); Memcpy<SrcContext, DstContext>(n * sizeof(T), (void*)dst, (const void*)src);
} }
...@@ -148,7 +148,7 @@ class CUDAContext { ...@@ -148,7 +148,7 @@ class CUDAContext {
} }
#ifdef WITH_CUDNN #ifdef WITH_CUDNN
cudnnHandle_t cudnn_handle(){ cudnnHandle_t cudnn_handle() {
auto& handle = cuda_object_.cudnn_handle[gpu_id_]; auto& handle = cuda_object_.cudnn_handle[gpu_id_];
if (handle) { if (handle) {
return handle; return handle;
......
...@@ -77,7 +77,7 @@ class Tensor { ...@@ -77,7 +77,7 @@ class Tensor {
inline TIndex offset(const vector<TIndex>& vec) { inline TIndex offset(const vector<TIndex>& vec) {
CHECK_LE(vec.size(), ndim()); CHECK_LE(vec.size(), ndim());
TIndex offset = 0; TIndex offset = 0;
for (int i = 0; i < ndim(); i++){ for (int i = 0; i < ndim(); i++) {
offset = offset * dim(i); offset = offset * dim(i);
if (vec.size() > i) offset += vec[i]; if (vec.size() > i) offset += vec[i];
} }
...@@ -130,7 +130,7 @@ class Tensor { ...@@ -130,7 +130,7 @@ class Tensor {
} }
template <class Context> template <class Context>
void* raw_mutable_data(const TypeMeta& meta){ void* raw_mutable_data(const TypeMeta& meta) {
void* data_ptr; void* data_ptr;
active_data_ptr<Context>(&data_ptr); active_data_ptr<Context>(&data_ptr);
if (meta_ == meta && data_ptr) { if (meta_ == meta && data_ptr) {
......
...@@ -75,20 +75,20 @@ class TypeMeta { ...@@ -75,20 +75,20 @@ class TypeMeta {
bool Match() const { return (id_ == Id<T>()); } bool Match() const { return (id_ == Id<T>()); }
template <typename T> template <typename T>
static void Ctor(void* ptr, size_t n){ static void Ctor(void* ptr, size_t n) {
T* typed_ptr = static_cast<T*>(ptr); T* typed_ptr = static_cast<T*>(ptr);
for (unsigned int i = 0; i < n; i++) new(typed_ptr + i) T; for (unsigned int i = 0; i < n; i++) new(typed_ptr + i) T;
} }
template <typename T> template <typename T>
static void Copy(const void* src, void* dst, size_t n){ static void Copy(const void* src, void* dst, size_t n) {
const T* typed_src = static_cast<const T*>(src); const T* typed_src = static_cast<const T*>(src);
T* typed_dst = static_cast<T*>(dst); T* typed_dst = static_cast<T*>(dst);
for (unsigned int i = 0; i < n; i++) typed_dst[i] = typed_src[i]; for (unsigned int i = 0; i < n; i++) typed_dst[i] = typed_src[i];
} }
template <typename T> template <typename T>
static void Dtor(void* ptr, size_t n){ static void Dtor(void* ptr, size_t n) {
T* typed_ptr = static_cast<T*>(ptr); T* typed_ptr = static_cast<T*>(ptr);
for (unsigned int i = 0; i < n; i++) typed_ptr[i].~T(); for (unsigned int i = 0; i < n; i++) typed_ptr[i].~T();
} }
......
...@@ -44,7 +44,7 @@ class Workspace{ ...@@ -44,7 +44,7 @@ class Workspace{
return tensor_map_.count(query) > 0; return tensor_map_.count(query) > 0;
} }
inline Tensor* CreateTensor(const string& name){ inline Tensor* CreateTensor(const string& name) {
string query = GetTensorName(name); string query = GetTensorName(name);
if (!HasTensor(query)) if (!HasTensor(query))
tensor_map_[query] = unique_ptr<Tensor>(new Tensor(query)); tensor_map_[query] = unique_ptr<Tensor>(new Tensor(query));
...@@ -143,7 +143,7 @@ class Workspace{ ...@@ -143,7 +143,7 @@ class Workspace{
return graph_map_[graph_name]->Run(include, exclude); return graph_map_[graph_name]->Run(include, exclude);
} }
inline vector<string> GetGraphs(){ inline vector<string> GetGraphs() {
vector<string> names; vector<string> names;
for (auto& it : graph_map_) names.push_back(it.first); for (auto& it : graph_map_) names.push_back(it.first);
return names; return names;
......
...@@ -54,11 +54,11 @@ class ScanGradientOp final: public Operator<Context> { ...@@ -54,11 +54,11 @@ class ScanGradientOp final: public Operator<Context> {
step_tensor(OperatorBase::GetSingleArg<string>("step_tensor", "")), step_tensor(OperatorBase::GetSingleArg<string>("step_tensor", "")),
forward_inputs(OperatorBase::GetRepeatedArg<string>("inputs_name")), forward_inputs(OperatorBase::GetRepeatedArg<string>("inputs_name")),
forward_outputs(OperatorBase::GetRepeatedArg<string>("outputs_name")) { forward_outputs(OperatorBase::GetRepeatedArg<string>("outputs_name")) {
// handle GO(x) // handle GO(x)
for (int i = 0; i < forward_outputs.size(); i++) for (int i = 0; i < forward_outputs.size(); i++)
terms[forward_outputs[i] + "_grad"] = input(i + (int)OutputSize()).name(); terms[forward_outputs[i] + "_grad"] = input(i + (int)OutputSize()).name();
// handle GI(x) // handle GI(x)
for (int i = 0; i < forward_inputs.size(); i++) for (int i = 0; i < forward_inputs.size(); i++)
terms[forward_inputs[i] + "_grad"] = output(i)->name(); terms[forward_inputs[i] + "_grad"] = output(i)->name();
} }
......
...@@ -25,9 +25,9 @@ class AccuracyOp final: public Operator<Context> { ...@@ -25,9 +25,9 @@ class AccuracyOp final: public Operator<Context> {
public: public:
AccuracyOp(const OperatorDef& op_def, Workspace* ws) AccuracyOp(const OperatorDef& op_def, Workspace* ws)
: Operator<Context>(op_def, ws), : Operator<Context>(op_def, ws),
top_k(OperatorBase::GetSingleArg<int>("top_k", 1)){ top_k(OperatorBase::GetSingleArg<int>("top_k", 1)) {
vector<int> args = OperatorBase::GetRepeatedArg<int>("ignore_labels"); vector<int> args = OperatorBase::GetRepeatedArg<int>("ignore_labels");
if (args.size()){ if (args.size()) {
ignore_labels.Reshape(vector<TIndex>(1, args.size())); ignore_labels.Reshape(vector<TIndex>(1, args.size()));
int* ignore_data = ignore_labels.mutable_data<int, CPUContext>(); int* ignore_data = ignore_labels.mutable_data<int, CPUContext>();
for (int i = 0; i < args.size(); i++) ignore_data[i] = args[i]; for (int i = 0; i < args.size(); i++) ignore_data[i] = args[i];
......
...@@ -39,7 +39,7 @@ class ROIAlignGradientOp : public Operator<Context> { ...@@ -39,7 +39,7 @@ class ROIAlignGradientOp : public Operator<Context> {
: Operator<Context>(op_def, ws), : Operator<Context>(op_def, ws),
pool_h(OperatorBase::GetSingleArg<int>("pool_h", 0)), pool_h(OperatorBase::GetSingleArg<int>("pool_h", 0)),
pool_w(OperatorBase::GetSingleArg<int>("pool_w", 0)), pool_w(OperatorBase::GetSingleArg<int>("pool_w", 0)),
spatial_scale(OperatorBase::GetSingleArg<float>("spatial_scale", 1.0)){ spatial_scale(OperatorBase::GetSingleArg<float>("spatial_scale", 1.0)) {
CHECK_GT(pool_h, 0) << "\npool_h must > 0"; CHECK_GT(pool_h, 0) << "\npool_h must > 0";
CHECK_GT(pool_w, 0) << "\npool_w must > 0"; CHECK_GT(pool_w, 0) << "\npool_w must > 0";
} }
......
...@@ -54,7 +54,7 @@ inline void LoadCaffeModel(string file, string scope, Workspace* ws) { ...@@ -54,7 +54,7 @@ inline void LoadCaffeModel(string file, string scope, Workspace* ws) {
ReadProtoFromBinaryFile(file.c_str(), &net_param); ReadProtoFromBinaryFile(file.c_str(), &net_param);
LOG(INFO) << "Restore From Model @: " << file << "......"; LOG(INFO) << "Restore From Model @: " << file << "......";
LOG(INFO) << "Model Format: CaffeModel"; LOG(INFO) << "Model Format: CaffeModel";
for (int i = 0; i < net_param.layer_size(); i++){ for (int i = 0; i < net_param.layer_size(); i++) {
const LayerParameter& layer = net_param.layer(i); const LayerParameter& layer = net_param.layer(i);
const string& layer_name = layer.name(); const string& layer_name = layer.name();
string prefix = scope + layer_name + "@param"; string prefix = scope + layer_name + "@param";
......
...@@ -54,7 +54,7 @@ class TruncatedNormalFiller final : public Filler < T, Context > { ...@@ -54,7 +54,7 @@ class TruncatedNormalFiller final : public Filler < T, Context > {
public: public:
TruncatedNormalFiller(const TensorFiller& filler): Filler<T, Context>(filler) {} TruncatedNormalFiller(const TensorFiller& filler): Filler<T, Context>(filler) {}
void Fill(Tensor* tensor) override { void Fill(Tensor* tensor) override {
// implementation on gpu is difficult // implementation on gpu is difficult
math::RandomTruncatedNormal<T, CPUContext>(tensor->count(), math::RandomTruncatedNormal<T, CPUContext>(tensor->count(),
filler().mean(), filler().mean(),
filler().std(), filler().std(),
......
...@@ -22,7 +22,7 @@ inline std::vector<std::string> SplitString(const std::string& str, ...@@ -22,7 +22,7 @@ inline std::vector<std::string> SplitString(const std::string& str,
std::vector<std::string> ret; std::vector<std::string> ret;
std::string temp(str); std::string temp(str);
size_t pos; size_t pos;
while (pos = temp.find(c), pos != std::string::npos){ while (pos = temp.find(c), pos != std::string::npos) {
ret.push_back(temp.substr(0, pos)); ret.push_back(temp.substr(0, pos));
temp.erase(0, pos + 1); temp.erase(0, pos + 1);
} }
......
...@@ -31,7 +31,7 @@ const TypeMeta& NumpyTypeToDragon(int numpy_type) { ...@@ -31,7 +31,7 @@ const TypeMeta& NumpyTypeToDragon(int numpy_type) {
{ NPY_FLOAT16, TypeMeta::Make<float16>() }, { NPY_FLOAT16, TypeMeta::Make<float16>() },
{ NPY_UINT8, TypeMeta::Make<uint8_t>() }}; { NPY_UINT8, TypeMeta::Make<uint8_t>() }};
static TypeMeta unknown_type; // id = 0 static TypeMeta unknown_type;
return dragon_type_map.count(numpy_type) ? dragon_type_map[numpy_type] : unknown_type; return dragon_type_map.count(numpy_type) ? dragon_type_map[numpy_type] : unknown_type;
} }
...@@ -50,7 +50,7 @@ REGISTER_TENSOR_FETCHER(TypeMeta::Id<NumpyFetcher>(), NumpyFetcher); ...@@ -50,7 +50,7 @@ REGISTER_TENSOR_FETCHER(TypeMeta::Id<NumpyFetcher>(), NumpyFetcher);
REGISTER_TENSOR_FETCHER(TypeMeta::Id<StringFetcher>(), StringFetcher); REGISTER_TENSOR_FETCHER(TypeMeta::Id<StringFetcher>(), StringFetcher);
REGISTER_TENSOR_FEEDER(TypeMeta::Id<NumpyFeeder>(), NumpyFeeder); REGISTER_TENSOR_FEEDER(TypeMeta::Id<NumpyFeeder>(), NumpyFeeder);
extern "C"{ extern "C" {
PyObject* RegisteredOperatorsCC(PyObject* self, PyObject* args) { PyObject* RegisteredOperatorsCC(PyObject* self, PyObject* args) {
set<string> all_keys; set<string> all_keys;
...@@ -123,7 +123,7 @@ bool SwitchWorkspaceInternal(const string& name, const bool create_if_missing) { ...@@ -123,7 +123,7 @@ bool SwitchWorkspaceInternal(const string& name, const bool create_if_missing) {
} else if (create_if_missing) { } else if (create_if_missing) {
unique_ptr<Workspace> new_workspace(new Workspace()); unique_ptr<Workspace> new_workspace(new Workspace());
g_workspace = new_workspace.get(); g_workspace = new_workspace.get();
g_workspaces[name] = std::move(new_workspace); // ??? g_workspaces[name] = std::move(new_workspace);
g_current_workspace = name; g_current_workspace = name;
return true; return true;
} else { } else {
......
...@@ -33,7 +33,7 @@ inline PyObject* StdStringToPyBytes(const std::string& str) { ...@@ -33,7 +33,7 @@ inline PyObject* StdStringToPyBytes(const std::string& str) {
return PyBytes_FromStringAndSize(str.c_str(), str.size()); return PyBytes_FromStringAndSize(str.c_str(), str.size());
} }
template <typename T> template <typename T>
inline void MakeStringInternal(std::stringstream& ss, const T& t){ ss << t; } inline void MakeStringInternal(std::stringstream& ss, const T& t) { ss << t; }
template <typename T,typename ... Args> template <typename T,typename ... Args>
inline void MakeStringInternal(std::stringstream& ss, const T& t, const Args& ... args) { inline void MakeStringInternal(std::stringstream& ss, const T& t, const Args& ... args) {
...@@ -124,7 +124,7 @@ class NumpyFeeder : public TensorFeederBase { ...@@ -124,7 +124,7 @@ class NumpyFeeder : public TensorFeederBase {
Tensor* tensor) override { Tensor* tensor) override {
PyArrayObject* array = PyArray_GETCONTIGUOUS(original_array); PyArrayObject* array = PyArray_GETCONTIGUOUS(original_array);
const TypeMeta& meta = NumpyTypeToDragon(PyArray_TYPE(array)); const TypeMeta& meta = NumpyTypeToDragon(PyArray_TYPE(array));
if (meta.id() == 0){ if (meta.id() == 0) {
PyErr_SetString(PyExc_TypeError, "numpy data type is not supported."); PyErr_SetString(PyExc_TypeError, "numpy data type is not supported.");
return nullptr; return nullptr;
} }
......
...@@ -61,13 +61,13 @@ inline PyObject* MPICreateGroupCC(PyObject* self, PyObject* args) { ...@@ -61,13 +61,13 @@ inline PyObject* MPICreateGroupCC(PyObject* self, PyObject* args) {
for (int i = 0; i < world_size; i++) all_ranks.insert(i); for (int i = 0; i < world_size; i++) all_ranks.insert(i);
local_group = world_group; local_group = world_group;
// check include ranks // check include ranks
int size = PyList_Size(incl); int size = PyList_Size(incl);
if (size > 0){ if (size > 0) {
all_ranks.clear(); all_ranks.clear();
unique_ptr<int> incl_ranks(new int[size]); unique_ptr<int> incl_ranks(new int[size]);
int* ranks = incl_ranks.get(); int* ranks = incl_ranks.get();
for (int i = 0; i < size; i++){ for (int i = 0; i < size; i++) {
ranks[i] = _PyInt_AsInt(PyList_GetItem(incl, i)); ranks[i] = _PyInt_AsInt(PyList_GetItem(incl, i));
all_ranks.insert(ranks[i]); all_ranks.insert(ranks[i]);
} }
...@@ -75,13 +75,13 @@ inline PyObject* MPICreateGroupCC(PyObject* self, PyObject* args) { ...@@ -75,13 +75,13 @@ inline PyObject* MPICreateGroupCC(PyObject* self, PyObject* args) {
CHECK(err_code == MPI_SUCCESS) << "failed to create mpi group."; CHECK(err_code == MPI_SUCCESS) << "failed to create mpi group.";
} }
// check exclude ranks // check exclude ranks
size = PyList_Size(excl); size = PyList_Size(excl);
if (size > 0) { if (size > 0) {
all_ranks.clear(); Set<int> tmp; all_ranks.clear(); Set<int> tmp;
unique_ptr<int> excl_ranks(new int[size]); unique_ptr<int> excl_ranks(new int[size]);
int* ranks = excl_ranks.get(); int* ranks = excl_ranks.get();
for (int i = 0; i < size; i++){ for (int i = 0; i < size; i++) {
ranks[i] = _PyInt_AsInt(PyList_GetItem(excl, i)); ranks[i] = _PyInt_AsInt(PyList_GetItem(excl, i));
tmp.insert(ranks[i]); tmp.insert(ranks[i]);
} }
...@@ -97,7 +97,7 @@ inline PyObject* MPICreateGroupCC(PyObject* self, PyObject* args) { ...@@ -97,7 +97,7 @@ inline PyObject* MPICreateGroupCC(PyObject* self, PyObject* args) {
if (local_comm != MPI_COMM_NULL) { if (local_comm != MPI_COMM_NULL) {
int world_rank, local_size; int world_rank, local_size;
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
if (world_rank == local_root){ if (world_rank == local_root) {
MPI_Comm_size(local_comm, &local_size); MPI_Comm_size(local_comm, &local_size);
std::stringstream ss; std::stringstream ss;
ss << "Rank[" << world_rank << "]: " ss << "Rank[" << world_rank << "]: "
......
# --------------------------------------------------------
# Dragon
# Copyright(c) 2017 SeetaTech
# Written by Ting Pan
# --------------------------------------------------------
from .data_batch import DataBatch
# -------------------------------------------------------- # --------------------------------------------------------
# Caffe for Dragon # Dragon
# Copyright(c) 2017 SeetaTech # Copyright(c) 2017 SeetaTech
# Written by Ting Pan # Written by Ting Pan
# -------------------------------------------------------- # --------------------------------------------------------
...@@ -10,7 +10,7 @@ from six.moves import range as xrange ...@@ -10,7 +10,7 @@ from six.moves import range as xrange
from dragon.config import logger from dragon.config import logger
from .__init__ import GetProperty from .utils import GetProperty
class BlobFetcher(Process): class BlobFetcher(Process):
def __init__(self, **kwargs): def __init__(self, **kwargs):
...@@ -30,16 +30,16 @@ class BlobFetcher(Process): ...@@ -30,16 +30,16 @@ class BlobFetcher(Process):
atexit.register(cleanup) atexit.register(cleanup)
def im_list_to_blob(self): def im_list_to_blob(self):
datum = self.Q_in.get() # (h, w, BGR) datum = self.Q_in.get()
im = datum[0]; h, w, c = im.shape im_blob = []
im_blob = np.zeros((self._batch_size, h, w, c), dtype=np.float32)
label_blob = np.zeros((self._batch_size, len(datum[1])), dtype=np.float32) \ label_blob = np.zeros((self._batch_size, len(datum[1])), dtype=np.float32) \
if len(datum) > 1 else None if len(datum) > 1 else None
for i in xrange(0, self._batch_size): for i in xrange(0, self._batch_size):
im_blob[i, 0:h, 0:w, :] = datum[0] im_blob.append(datum[0])
if label_blob is not None: label_blob[i, :] = datum[1] if label_blob is not None: label_blob[i, :] = datum[1]
if i != self._batch_size - 1: datum = self.Q_in.get() if i != self._batch_size - 1: datum = self.Q_in.get()
channel_swap = (0, 3, 1, 2) channel_swap = (0, 3, 1, 2)
im_blob = np.array(im_blob, dtype=np.float32)
im_blob = im_blob.transpose(channel_swap) im_blob = im_blob.transpose(channel_swap)
return (im_blob, label_blob) return (im_blob, label_blob)
......
# -------------------------------------------------------- # --------------------------------------------------------
# Caffe for Dragon # Dragon
# Copyright(c) 2017 SeetaTech # Copyright(c) 2017 SeetaTech
# Written by Ting Pan # Written by Ting Pan
# -------------------------------------------------------- # --------------------------------------------------------
import sys
import time import time
import pprint import pprint
from multiprocessing import Queue from multiprocessing import Queue
if sys.version_info >= (3,0):
from queue import Queue as Queue2
else:
from Queue import Queue as Queue2
import threading
from six.moves import range as xrange from six.moves import range as xrange
import dragon.core.mpi as mpi import dragon.core.mpi as mpi
...@@ -16,10 +22,11 @@ from .data_reader import DataReader ...@@ -16,10 +22,11 @@ from .data_reader import DataReader
from .data_transformer import DataTransformer from .data_transformer import DataTransformer
from .blob_fetcher import BlobFetcher from .blob_fetcher import BlobFetcher
from .__init__ import GetProperty from .utils import GetProperty
class DataBatch(object): class DataBatch(threading.Thread):
def __init__(self, **kwargs): def __init__(self, **kwargs):
super(DataBatch, self).__init__()
"""DataBatch use Triple-Buffering to speed up""" """DataBatch use Triple-Buffering to speed up"""
...@@ -35,10 +42,10 @@ class DataBatch(object): ...@@ -35,10 +42,10 @@ class DataBatch(object):
kwargs['group_size'] = group_size kwargs['group_size'] = group_size
# configuration # configuration
self._prefetch = GetProperty(kwargs, 'prefetch', 40) self._prefetch = GetProperty(kwargs, 'prefetch', 5)
self._num_readers = GetProperty(kwargs, 'num_readers', 1) self._num_readers = GetProperty(kwargs, 'num_readers', 1)
self._num_transformers = GetProperty(kwargs, 'num_transformers', -1) self._num_transformers = GetProperty(kwargs, 'num_transformers', -1)
self._num_fetchers = GetProperty(kwargs, 'num_fetchers', 3) self._num_fetchers = GetProperty(kwargs, 'num_fetchers', 1)
# default policy # default policy
if self._num_transformers == -1: if self._num_transformers == -1:
...@@ -60,6 +67,7 @@ class DataBatch(object): ...@@ -60,6 +67,7 @@ class DataBatch(object):
self.Q_level_1 = Queue(self._prefetch * self._num_readers * self._batch_size) self.Q_level_1 = Queue(self._prefetch * self._num_readers * self._batch_size)
self.Q_level_2 = Queue(self._prefetch * self._num_readers * self._batch_size) self.Q_level_2 = Queue(self._prefetch * self._num_readers * self._batch_size)
self.Q_level_3 = Queue(self._prefetch * self._num_readers) self.Q_level_3 = Queue(self._prefetch * self._num_readers)
self.Q_level_4 = Queue2(self._prefetch * self._num_readers)
# init readers # init readers
self._readers = [] self._readers = []
...@@ -102,11 +110,16 @@ class DataBatch(object): ...@@ -102,11 +110,16 @@ class DataBatch(object):
self._fetchers.append(fetcher) self._fetchers.append(fetcher)
time.sleep(0.1) time.sleep(0.1)
self.daemon = True
self.start()
#self.echo() #self.echo()
@property def run(self):
def blobs(self): while True:
return self.Q_level_3.get() self.Q_level_4.put(self.Q_level_3.get())
def get(self):
return self.Q_level_4.get()
def echo(self): def echo(self):
logger.info('---------------------------------------------------------') logger.info('---------------------------------------------------------')
......
# -------------------------------------------------------- # --------------------------------------------------------
# Caffe for Dragon # Dragon
# Copyright(c) 2017 SeetaTech # Copyright(c) 2017 SeetaTech
# Written by Ting Pan # Written by Ting Pan
# -------------------------------------------------------- # --------------------------------------------------------
...@@ -12,8 +12,7 @@ import dragon.config as config ...@@ -12,8 +12,7 @@ import dragon.config as config
from dragon.config import logger from dragon.config import logger
from dragon.tools.db import LMDB from dragon.tools.db import LMDB
from .__init__ import GetProperty from .utils import GetProperty
class DataReader(Process): class DataReader(Process):
def __init__(self, **kwargs): def __init__(self, **kwargs):
......
# -------------------------------------------------------- # --------------------------------------------------------
# Caffe for Dragon # Dragon
# Copyright(c) 2017 SeetaTech # Copyright(c) 2017 SeetaTech
# Written by Ting Pan # Written by Ting Pan
# -------------------------------------------------------- # --------------------------------------------------------
...@@ -12,7 +12,7 @@ import dragon.config as config ...@@ -12,7 +12,7 @@ import dragon.config as config
from dragon.config import logger from dragon.config import logger
import dragon.vm.caffe.proto.caffe_pb2 as pb import dragon.vm.caffe.proto.caffe_pb2 as pb
from .__init__ import GetProperty from .utils import GetProperty
try: try:
import cv2 import cv2
...@@ -130,6 +130,4 @@ class DataTransformer(Process): ...@@ -130,6 +130,4 @@ class DataTransformer(Process):
npr.seed(self._random_seed) npr.seed(self._random_seed)
while True: while True:
serialized = self.Q_in.get() serialized = self.Q_in.get()
self.Q_out.put(self.transform_image_label(serialized)) self.Q_out.put(self.transform_image_label(serialized))
\ No newline at end of file
# -------------------------------------------------------- # --------------------------------------------------------
# Caffe for Dragon # Dragon
# Copyright(c) 2017 SeetaTech # Copyright(c) 2017 SeetaTech
# Written by Ting Pan # Written by Ting Pan
# -------------------------------------------------------- # --------------------------------------------------------
def GetProperty(kwargs, name, default): def GetProperty(kwargs, name, default):
return kwargs[name] \ return kwargs[name] \
if name in kwargs else default if name in kwargs else default
\ No newline at end of file
# -------------------------------------------------------- # --------------------------------------------------------
# Caffe for Dragon # Dragon
# Copyright(c) 2017 SeetaTech # Copyright(c) 2017 SeetaTech
# Written by Ting Pan # Written by Ting Pan
# -------------------------------------------------------- # --------------------------------------------------------
import dragon.vm.caffe as caffe
import dragon.core.workspace as ws import dragon.core.workspace as ws
from .minibatch import DataBatch from dragon.io.data_batch import DataBatch
class DataLayer(caffe.Layer): class MiniBatchOp(object):
def setup(self, bottom, top):
def setup(self, inputs, outputs):
kwargs = eval(self.param_str) kwargs = eval(self.param_str)
self._data_batch = DataBatch(**kwargs) self._data_batch = DataBatch(**kwargs)
def forward(self, bottom, top): def run(self, inputs, outputs):
blobs = self._data_batch.blobs blobs = self._data_batch.get()
for idx, blob in enumerate(blobs): for idx, blob in enumerate(blobs):
ws.FeedTensor(top[idx], blob) ws.FeedTensor(outputs[idx], blob)
\ No newline at end of file \ No newline at end of file
...@@ -10,7 +10,6 @@ from dragon.operators.utils import Run ...@@ -10,7 +10,6 @@ from dragon.operators.utils import Run
def LMDBData(**kwargs): def LMDBData(**kwargs):
""" """
:param kwargs: a dict of imagenet data param
:param --> mean_value: a list of mean values for channels [B-G-R] :param --> mean_value: a list of mean values for channels [B-G-R]
:param --> source: a str of the images root directory :param --> source: a str of the images root directory
:param --> imageset: a str of text file contains image name / label :param --> imageset: a str of text file contains image name / label
...@@ -30,8 +29,8 @@ def LMDBData(**kwargs): ...@@ -30,8 +29,8 @@ def LMDBData(**kwargs):
args = locals(); kwargs = args['kwargs'] args = locals(); kwargs = args['kwargs']
del args['kwargs']; kwargs = dict(args, **kwargs) del args['kwargs']; kwargs = dict(args, **kwargs)
kwargs['module'] = 'dragon.vm.caffe.io.data_layer' kwargs['module'] = 'dragon.operators.custom.minibatch'
kwargs['op'] = 'DataLayer' kwargs['op'] = 'MiniBatchOp'
return Run([], param_str=str(kwargs), nout=2, **kwargs) return Run([], param_str=str(kwargs), nout=2, **kwargs)
......
# --------------------------------------------------------
# Dragon
# Copyright(c) 2017 SeetaTech
# Written by Ting Pan
# --------------------------------------------------------
""" Generate LMDB from images """
import os
import sys
import time
import shutil
import argparse
import cv2
try:
import numpy as np
except: pass
from dragon.tools.db import LMDB
from dragon.vm.caffe.proto import caffe_pb2
def resize_image(im, resize):
    """Resize an image so that its shorter edge equals ``resize``.

    The other edge is scaled by the same ratio, so the aspect ratio is
    kept (up to integer rounding).

    :param im: the image to resize; only ``im.shape[0]`` and ``im.shape[1]``
               are read here (assumed to be rows and columns, as returned
               by ``cv2.imread`` -- confirm against the caller)
    :param resize: an int, the target length of the shorter edge
    :return: whatever ``cv2.resize`` returns for the computed size
    """
    dim0, dim1 = im.shape[0], im.shape[1]
    # Floor division keeps both entries integers on Python 2 and Python 3;
    # true division would hand floats to cv2.resize on Python 3.
    if dim0 > dim1:
        newsize = (resize, dim0 * resize // dim1)
    else:
        newsize = (dim1 * resize // dim0, resize)
    return cv2.resize(im, newsize)
def make_db(args):
    """Create an LMDB holding one JPEG-encoded ``Datum`` per listed image.

    Each line of the list file is split on whitespace; the first field is
    the image path (relative to ``args.root``) and the second is an integer
    label. Records are stored under zero-padded sequential keys, followed
    by the ``size`` and ``zfill`` entries, and the list file is copied into
    the database folder as ``image_list.txt``.

    :param args: the parsed command-line options; this function reads
                 ``list``, ``database``, ``root``, ``zfill``, ``quality``,
                 ``shuffle``, ``resize`` and ``pad``
    :raises ValueError: if the list file does not exist, if the database
                        folder already exists, or if an image cannot be read
    """
    if os.path.isfile(args.list) is False:
        raise ValueError('the path of image list is invalid.')
    if os.path.isdir(args.database) is True:
        raise ValueError('the database is already exist or invalid.')

    print('start time: ', time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime()))

    db = LMDB(max_commit=10000)
    db.open(args.database, mode='w')

    count = 0
    zfill_flag = '{0:0%d}' % (args.zfill)
    encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), args.quality]
    start_time = time.time()

    # Read the list once inside a context manager: the line total comes
    # from the same read and no file handle is left open.
    with open(args.list, 'r') as input_file:
        records = input_file.readlines()
    total_line = len(records)

    if args.shuffle:
        import random
        random.shuffle(records)

    for record in records:
        count += 1
        # Report progress and flush pending writes every 10000 records.
        if count % 10000 == 0:
            now_time = time.time()
            print('{0} / {1} in {2:.2f} sec'.format(
                count, total_line, now_time - start_time))
            db.commit()

        record = record.split()
        path = record[0]
        label = record[1]

        img_path = os.path.join(args.root, path)
        img = cv2.imread(img_path)
        # cv2.imread signals failure by returning None; stop with a message
        # naming the file instead of failing later on ``img.shape``.
        if img is None:
            raise ValueError('failed to read image: {0}'.format(img_path))
        if args.resize > 0:
            img = resize_image(img, args.resize)
        if args.pad > 0:
            # Zero-pad ``args.pad`` pixels on each side of both spatial axes.
            pad_img = np.zeros((img.shape[0] + 2 * args.pad,
                                img.shape[1] + 2 * args.pad, 3), dtype=img.dtype)
            pad_img[args.pad : args.pad + img.shape[0],
                    args.pad : args.pad + img.shape[1], :] = img
            img = pad_img
        result, imgencode = cv2.imencode('.jpg', img, encode_param)

        datum = caffe_pb2.Datum()
        datum.height, datum.width, datum.channels = img.shape
        datum.label = int(label)
        datum.encoded = True
        # ``tobytes`` is the supported spelling of the deprecated ``tostring``.
        datum.data = imgencode.tobytes()
        db.put(zfill_flag.format(count - 1), datum.SerializeToString())

    now_time = time.time()
    print('{0} / {1} in {2:.2f} sec'.format(count, total_line, now_time - start_time))
    db.put('size', str(count))
    db.put('zfill', str(args.zfill))
    db.commit()
    db.close()

    shutil.copy(args.list, args.database + '/image_list.txt')
    end_time = time.time()
    print('{0} images have been stored in the database.'.format(total_line))
    print('This task finishes within {0:.2f} seconds.'.format(end_time - start_time))
    # Divide by a float so the size is not truncated to whole MB on Python 2.
    print('The size of database is {0} MB.'.
          format(os.path.getsize(args.database + '/data.mdb') / 1000.0 / 1000.0))
def parse_args():
    """Parse the command-line options of the LMDB generation script.

    Prints the help text and exits with status 1 when fewer than three
    extra command-line tokens are given.

    :return: the ``argparse.Namespace`` with ``root``, ``list``,
             ``database``, ``zfill``, ``resize``, ``pad``, ``quality``
             and ``shuffle``
    """
    def str2bool(text):
        # ``type=bool`` would turn every non-empty string (including
        # 'False') into True, so the flag is parsed explicitly instead.
        lowered = text.strip().lower()
        if lowered in ('true', 't', 'yes', 'y', '1'):
            return True
        # The empty string stays False, as it was under ``bool('')``.
        if lowered in ('false', 'f', 'no', 'n', '0', ''):
            return False
        raise argparse.ArgumentTypeError(
            'expected a boolean value, got {0!r}'.format(text))

    parser = argparse.ArgumentParser(description='Create LMDB from images for classification.')
    parser.add_argument('--root', help='the root folder of raw images')
    parser.add_argument('--list', help='the filepath of image list')
    parser.add_argument('--database', help='the filepath of database')
    parser.add_argument('--zfill', type=int, default=8, help='zfill for the key of database')
    parser.add_argument('--resize', type=int, default=0, help='resize the shorter edge of image to the newsize')
    parser.add_argument('--pad', type=int, default=0, help='zero-pad the image')
    parser.add_argument('--quality', type=int, default=95, help='JPEG quality for encoding, 1-100')
    parser.add_argument('--shuffle', type=str2bool, default=True, help='randomize the order in list file True')

    if len(sys.argv) < 4:
        parser.print_help()
        sys.exit(1)

    args = parser.parse_args()
    return args
# Script entry point: parse the command-line options and build the database.
if __name__ == '__main__':
    args = parse_args()
    make_db(args)
\ No newline at end of file
# --------------------------------------------------------
# Dragon
# Copyright(c) 2017 SeetaTech
# Written by Ting Pan
# --------------------------------------------------------
from google.protobuf.message import Message
from protos import dragon_pb2 as pb
import numpy as np
def MakeArgument(key, value):
    """Wrap a python value into a ``pb.Argument`` named ``key``.

    The field written depends on the exact type of ``value``:
    float -> ``f``, int -> ``i``, ``np.int64`` -> ``i64``, text -> ``s``,
    bool -> ``b``, protobuf ``Message`` -> ``s`` (serialized). Any other
    value is treated as a sequence and stored in ``floats``, ``ints`` or
    ``strings`` when all of its items share a supported type.

    :param key: the argument name
    :param value: the value to store
    :return: the filled ``pb.Argument``
    :raises ValueError: if the type of ``value`` (or of its items) is not supported
    """
    # ``unicode`` only exists on Python 2; referring to it unguarded makes
    # every bool / Message / sequence value fail with NameError on Python 3.
    try:
        text_types = (str, unicode)
    except NameError:
        text_types = (str,)

    def unknown():
        return ValueError('unknown argument type: key={} value={} value type={}' \
                          .format(key, value, type(value)))

    argument = pb.Argument()
    argument.name = key
    if type(value) is float: argument.f = value
    elif type(value) is int: argument.i = value
    elif type(value) is np.int64: argument.i64 = int(value)
    elif type(value) in text_types: argument.s = value
    elif type(value) is bool: argument.b = value
    elif isinstance(value, Message): argument.s = value.SerializeToString()
    else:
        # Materialize once: a non-iterable value becomes the documented
        # ValueError rather than a TypeError, and a one-shot iterator is
        # not exhausted by the first type check.
        try:
            items = list(value)
        except TypeError:
            raise unknown()
        if all(type(v) is float for v in items): argument.floats.extend(items)
        elif all(type(v) is int for v in items): argument.ints.extend(items)
        elif all(type(v) in text_types for v in items): argument.strings.extend(items)
        elif all(isinstance(v, Message) for v in items):
            argument.strings.extend([v.SerializeToString() for v in items])
        else: raise unknown()
    return argument
def MakeOperatorDef(op_type, inputs, outputs, name='',
                    device_option=None, arg=None, engine=None, **kwargs):
    """Assemble a ``pb.OperatorDef`` from python values.

    :param op_type: the operator type
    :param inputs: the input tensors; each one is stored as ``str(tensor)``
    :param outputs: the output tensors; each one is stored as ``str(tensor)``
    :param name: the operator name
    :param device_option: an optional message copied into ``device_option``
    :param arg: optional ready-made arguments appended to ``arg``
    :param engine: an optional engine value
    :param kwargs: extra arguments; ``random_seed`` is written to the device
                   option, every other non-None entry is converted with
                   ``MakeArgument``
    :return: the filled ``pb.OperatorDef``
    """
    op_def = pb.OperatorDef()
    op_def.type, op_def.name = op_type, name
    op_def.input.extend(list(map(str, inputs)))
    op_def.output.extend(list(map(str, outputs)))

    if device_option is not None:
        op_def.device_option.CopyFrom(device_option)
    if engine is not None:
        op_def.engine = engine

    # The seed is applied after the CopyFrom above and is removed from
    # kwargs so that it is not emitted as a regular argument below.
    if 'random_seed' in kwargs:
        op_def.device_option.random_seed = kwargs.pop('random_seed')

    if arg is not None:
        op_def.arg.extend(arg)

    for key, value in kwargs.items():
        if value is not None:
            op_def.arg.add().CopyFrom(MakeArgument(key, value))
    return op_def
def MakeDeviceOption(device_type, gpu_id, rng_seed = None):
    """Create a ``pb.DeviceOption`` for the given device type and gpu id.

    ``random_seed`` is only filled in when ``rng_seed`` is not None.
    """
    device_option = pb.DeviceOption()
    device_option.device_type = device_type
    device_option.gpu_id = gpu_id
    if rng_seed is None:
        return device_option
    device_option.random_seed = rng_seed
    return device_option
# Stream wrapper that flushes the underlying stream after every write.
class Unbuffered(object):
    """Proxy around a file-like object that flushes after each write.

    ``write`` and ``writelines`` forward to the wrapped stream and then call
    its ``flush``. Every other attribute is looked up on the wrapped stream.
    """

    def __init__(self, stream):
        self.stream = stream

    def write(self, data):
        """Write ``data``, flush, and return what the stream's write returned."""
        # Keep the wrapped stream's return value (the character count on
        # Python 3 text streams) so callers that use it still work.
        written = self.stream.write(data)
        self.stream.flush()
        return written

    def writelines(self, lines):
        """Write every item of ``lines`` and flush once afterwards."""
        # Without this override the call would fall through __getattr__ to
        # the raw stream and skip the flush.
        self.stream.writelines(lines)
        self.stream.flush()

    def __getattr__(self, attr):
        # Only reached for names not found on the wrapper itself.
        return getattr(self.stream, attr)
# Replace sys.stdout with the flushing wrapper at import time, so every write
# to stdout is flushed immediately. Needed for mpi (c++ & python) output --
# presumably so Python prints are not held back relative to the C++ side;
# TODO confirm.
import sys
sys.stdout = Unbuffered(sys.stdout)
\ No newline at end of file
...@@ -221,7 +221,7 @@ message SolverParameter { ...@@ -221,7 +221,7 @@ message SolverParameter {
// RMSProp decay value // RMSProp decay value
// MeanSquare(t) = rms_decay*MeanSquare(t-1) + (1-rms_decay)*SquareGradient(t) // MeanSquare(t) = rms_decay*MeanSquare(t-1) + (1-rms_decay)*SquareGradient(t)
optional float rms_decay = 38; optional float rms_decay = 38 [default = 0.99];
// If true, print information about the state of the net that may help with // If true, print information about the state of the net that may help with
// debugging learning problems. // debugging learning problems.
...@@ -676,7 +676,7 @@ message DataParameter { ...@@ -676,7 +676,7 @@ message DataParameter {
optional bool force_encoded_color = 9 [default = false]; optional bool force_encoded_color = 9 [default = false];
// Prefetch queue (Number of batches to prefetch to host memory, increase if // Prefetch queue (Number of batches to prefetch to host memory, increase if
// data access bandwidth varies). // data access bandwidth varies).
optional uint32 prefetch = 10 [default = 40]; optional uint32 prefetch = 10 [default = 5];
} }
message DropoutParameter { message DropoutParameter {
......
...@@ -8,9 +8,9 @@ CPUObject CPUContext::cpu_object_; ...@@ -8,9 +8,9 @@ CPUObject CPUContext::cpu_object_;
CUDAObject CUDAContext::cuda_object_; CUDAObject CUDAContext::cuda_object_;
#endif // WITH_CUDA #endif // WITH_CUDA
// cpu <- gpu // cpu <- gpu
template<> void CPUContext::Memcpy<CPUContext, CUDAContext>( template<> void CPUContext::Memcpy<CPUContext, CUDAContext>(
size_t nbytes, void* dst, const void* src){ size_t nbytes, void* dst, const void* src) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
CUDAContext ctx(POINTER_DEVICE(src)); CUDAContext ctx(POINTER_DEVICE(src));
ctx.Memcpy<CPUContext, CUDAContext>(nbytes, dst, src); ctx.Memcpy<CPUContext, CUDAContext>(nbytes, dst, src);
...@@ -19,9 +19,9 @@ template<> void CPUContext::Memcpy<CPUContext, CUDAContext>( ...@@ -19,9 +19,9 @@ template<> void CPUContext::Memcpy<CPUContext, CUDAContext>(
#endif #endif
} }
// gpu <- cpu // gpu <- cpu
template<> void CPUContext::Memcpy<CUDAContext, CPUContext>( template<> void CPUContext::Memcpy<CUDAContext, CPUContext>(
size_t nbytes, void* dst, const void* src){ size_t nbytes, void* dst, const void* src) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
CUDAContext ctx(POINTER_DEVICE(dst)); CUDAContext ctx(POINTER_DEVICE(dst));
ctx.Memcpy<CUDAContext, CPUContext>(nbytes, dst, src); ctx.Memcpy<CUDAContext, CPUContext>(nbytes, dst, src);
......
...@@ -151,13 +151,13 @@ GraphDef Graph::Prune(const GraphDef& graph_def) { ...@@ -151,13 +151,13 @@ GraphDef Graph::Prune(const GraphDef& graph_def) {
OperatorDef op_def; OperatorDef op_def;
op_def.CopyFrom(graph_def.op(it)); op_def.CopyFrom(graph_def.op(it));
// handle inputs // handle inputs
for (int i = 0; i < graph_def.op(it).input_size(); i++){ for (int i = 0; i < graph_def.op(it).input_size(); i++) {
string input = graph_def.op(it).input(i); string input = graph_def.op(it).input(i);
if (!colored_[input] || !outputs.count(input)) if (!colored_[input] || !outputs.count(input))
*op_def.mutable_input(i) = "ignore"; *op_def.mutable_input(i) = "ignore";
} }
// handle outputs // handle outputs
for (int i = 0; i < graph_def.op(it).output_size(); i++){ for (int i = 0; i < graph_def.op(it).output_size(); i++) {
string output = graph_def.op(it).output(i); string output = graph_def.op(it).output(i);
if (!colored_[output]) *op_def.mutable_output(i) = "ignore"; if (!colored_[output]) *op_def.mutable_output(i) = "ignore";
else outputs.insert(op_def.output(i)); else outputs.insert(op_def.output(i));
...@@ -212,7 +212,7 @@ GraphDef Graph::MakeUpdate(const GraphDef& graph_def) { ...@@ -212,7 +212,7 @@ GraphDef Graph::MakeUpdate(const GraphDef& graph_def) {
for (int i = 0; i < graph_def.u_target_size(); i++) { for (int i = 0; i < graph_def.u_target_size(); i++) {
UpdateTarget target = graph_def.u_target(i); UpdateTarget target = graph_def.u_target(i);
vector<string> missing_tensors; vector<string> missing_tensors;
// missing check // missing check
for (auto& tensor : target.tensor()) { for (auto& tensor : target.tensor()) {
if (!ws()->HasTensor(tensor)) { if (!ws()->HasTensor(tensor)) {
LOG(INFO) << "missing Tensor: " << tensor; LOG(INFO) << "missing Tensor: " << tensor;
......
...@@ -18,12 +18,12 @@ CheckTuple GraphGradientMaker::CheckMissingGrad(OperatorDef* forward_op) { ...@@ -18,12 +18,12 @@ CheckTuple GraphGradientMaker::CheckMissingGrad(OperatorDef* forward_op) {
string g_output = output + "_grad"; string g_output = output + "_grad";
if (terms_.count(g_output)) g_output = terms_[g_output]; if (terms_.count(g_output)) g_output = terms_[g_output];
// check if having external grad first // check if having external grad first
if (external_grads_.count(g_output)) if (external_grads_.count(g_output))
inputs_to_grads_[output] = g_output; inputs_to_grads_[output] = g_output;
// consider generate virtual grad // consider generate virtual grad
else if (targets_set_.count(output) && g_output != "ignore"){ else if (targets_set_.count(output) && g_output != "ignore") {
gen_grads.push_back({ output, idx }); gen_grads.push_back({ output, idx });
inputs_to_grads_[output] = g_output; inputs_to_grads_[output] = g_output;
} }
...@@ -36,7 +36,7 @@ CheckTuple GraphGradientMaker::CheckMissingGrad(OperatorDef* forward_op) { ...@@ -36,7 +36,7 @@ CheckTuple GraphGradientMaker::CheckMissingGrad(OperatorDef* forward_op) {
if (forward_op->output_size() == 1) return { true, gen_grads }; if (forward_op->output_size() == 1) return { true, gen_grads };
} }
} }
// check pass, even if missing some grads // check pass, even if missing some grads
return { false, gen_grads }; return { false, gen_grads };
} }
...@@ -50,7 +50,7 @@ GraphDef GraphGradientMaker::Make() { ...@@ -50,7 +50,7 @@ GraphDef GraphGradientMaker::Make() {
Set<string> all_split_grads; Set<string> all_split_grads;
// PLAY for the forward // PLAY for the forward
for (auto& op : forward_def_.op()){ for (auto& op : forward_def_.op()) {
if (NoGradientRegistry()->Has(op.type())) continue; if (NoGradientRegistry()->Has(op.type())) continue;
for (auto& input : op.input()) inputs_count[input]++; for (auto& input : op.input()) inputs_count[input]++;
} }
...@@ -73,17 +73,17 @@ GraphDef GraphGradientMaker::Make() { ...@@ -73,17 +73,17 @@ GraphDef GraphGradientMaker::Make() {
Gradient grad = MakeGradientForOp(*op, g_outputs); Gradient grad = MakeGradientForOp(*op, g_outputs);
// replace terms // replace terms
for (auto& g_op : grad.ops){ for (auto& g_op : grad.ops) {
g_op.set_name(GetOperatorName()); g_op.set_name(GetOperatorName());
for (int i = 0; i < g_op.input_size(); i++){ for (int i = 0; i < g_op.input_size(); i++) {
string* input = g_op.mutable_input(i); string* input = g_op.mutable_input(i);
if (terms_.count(*input)) *input = terms_[*input]; if (terms_.count(*input)) *input = terms_[*input];
} }
for (int i = 0; i < g_op.output_size(); i++){ for (int i = 0; i < g_op.output_size(); i++) {
string* output = g_op.mutable_output(i); string* output = g_op.mutable_output(i);
if (terms_.count(*output)) *output = terms_[*output]; if (terms_.count(*output)) *output = terms_[*output];
} }
for (int i = 0; i < grad.g_inputs.size(); i++){ for (int i = 0; i < grad.g_inputs.size(); i++) {
if (terms_.count(grad.g_inputs[i])) if (terms_.count(grad.g_inputs[i]))
grad.g_inputs[i] = terms_[grad.g_inputs[i]]; grad.g_inputs[i] = terms_[grad.g_inputs[i]];
} }
...@@ -106,14 +106,14 @@ GraphDef GraphGradientMaker::Make() { ...@@ -106,14 +106,14 @@ GraphDef GraphGradientMaker::Make() {
string split_name = *output + "_autosplit_" + str(grads_count[*output]++); string split_name = *output + "_autosplit_" + str(grads_count[*output]++);
if (!is_skip) all_split_grads.insert(split_name); if (!is_skip) all_split_grads.insert(split_name);
// gather // gather
if (grads_count[*output] == inputs_count[original_name]){ if (grads_count[*output] == inputs_count[original_name]) {
gather_op = new OperatorDef(); gather_op = new OperatorDef();
gather_op->set_name(GetOperatorName()); gather_op->set_name(GetOperatorName());
gather_op->set_type("GradientGather"); gather_op->set_type("GradientGather");
gather_op->add_output(*output); gather_op->add_output(*output);
if (g_op.has_device_option()) if (g_op.has_device_option())
gather_op->mutable_device_option()->CopyFrom(g_op.device_option()); gather_op->mutable_device_option()->CopyFrom(g_op.device_option());
for (int j = 0; j < grads_count[*output]; j++){ for (int j = 0; j < grads_count[*output]; j++) {
string key = *output + "_autosplit_" + str(j); string key = *output + "_autosplit_" + str(j);
if (all_split_grads.count(key)) gather_op->add_input(key); if (all_split_grads.count(key)) gather_op->add_input(key);
} }
...@@ -123,7 +123,7 @@ GraphDef GraphGradientMaker::Make() { ...@@ -123,7 +123,7 @@ GraphDef GraphGradientMaker::Make() {
} }
} }
// append ops // append ops
if (!is_skip) { if (!is_skip) {
if (gen_grads.size() > 0) { if (gen_grads.size() > 0) {
vector<string> op_inputs, op_outputs; vector<string> op_inputs, op_outputs;
...@@ -148,7 +148,7 @@ GraphDef GraphGradientMaker::Make() { ...@@ -148,7 +148,7 @@ GraphDef GraphGradientMaker::Make() {
} }
if (gather_op != nullptr) new_def_.add_op()->CopyFrom(*gather_op); if (gather_op != nullptr) new_def_.add_op()->CopyFrom(*gather_op);
// done // done
if (!is_skip) { if (!is_skip) {
for (int i = 0; i < op->input_size(); i++) { for (int i = 0; i < op->input_size(); i++) {
if (!grad.g_inputs[i].empty()) if (!grad.g_inputs[i].empty())
......
...@@ -66,7 +66,7 @@ DEFINE_REGISTRY(GradientRegistry, GradientMakerBase, const OperatorDef&, const v ...@@ -66,7 +66,7 @@ DEFINE_REGISTRY(GradientRegistry, GradientMakerBase, const OperatorDef&, const v
DEFINE_REGISTRY(NoGradientRegistry, GradientMakerBase, const OperatorDef&, const vector<string>&); DEFINE_REGISTRY(NoGradientRegistry, GradientMakerBase, const OperatorDef&, const vector<string>&);
#define INSTANTIATE_GET_SINGLE_ARGUMENT(T, fieldname) \ #define INSTANTIATE_GET_SINGLE_ARGUMENT(T, fieldname) \
template <> T OperatorBase::GetSingleArg(const string& name, const T& default_value){ \ template <> T OperatorBase::GetSingleArg(const string& name, const T& default_value) { \
if(args_.count(name) == 0) { \ if(args_.count(name) == 0) { \
return default_value; \ return default_value; \
} \ } \
...@@ -82,7 +82,7 @@ INSTANTIATE_GET_SINGLE_ARGUMENT(int64_t, i64); ...@@ -82,7 +82,7 @@ INSTANTIATE_GET_SINGLE_ARGUMENT(int64_t, i64);
#define INSTANTIATE_GET_REPEATED_ARGUMENT(T, fieldname) \ #define INSTANTIATE_GET_REPEATED_ARGUMENT(T, fieldname) \
template<> vector<T> OperatorBase::GetRepeatedArg<T>(const string& name){ \ template<> vector<T> OperatorBase::GetRepeatedArg<T>(const string& name) { \
if(args_.count(name) == 0) return vector<T>(); \ if(args_.count(name) == 0) return vector<T>(); \
vector<T> values; \ vector<T> values; \
for(const auto& v : args_[name]->fieldname()) values.push_back(v); \ for(const auto& v : args_[name]->fieldname()) values.push_back(v); \
......
...@@ -17,7 +17,7 @@ bool OpSchema::Verify(const OperatorDef& def) const { ...@@ -17,7 +17,7 @@ bool OpSchema::Verify(const OperatorDef& def) const {
} }
for (int in = 0; in < def.input_size(); in++) { for (int in = 0; in < def.input_size(); in++) {
if (def.input(in) == "ignore") continue; if (def.input(in) == "ignore") continue;
for (int out = 0; out < def.output_size(); out++){ for (int out = 0; out < def.output_size(); out++) {
if (def.output(out) == "ignore") continue; if (def.output(out) == "ignore") continue;
if (def.input(in) == def.output(out) && (!CheckInplace(in, out))) if (def.input(in) == def.output(out) && (!CheckInplace(in, out)))
LOG(FATAL) << "[" << def.name() << "] input(" LOG(FATAL) << "[" << def.name() << "] input("
......
...@@ -24,7 +24,7 @@ void CuDNNReluOp<Context>::RunWithType() { ...@@ -24,7 +24,7 @@ void CuDNNReluOp<Context>::RunWithType() {
template <class Context> template <class Context>
void CuDNNReluOp<Context>::RunOnDevice() { void CuDNNReluOp<Context>::RunOnDevice() {
// cudnn does not support LeakyRelu // cudnn does not support LeakyRelu
if (this->slope != 0) return ReluOp<Context>::RunOnDevice(); if (this->slope != 0) return ReluOp<Context>::RunOnDevice();
output(0)->ReshapeLike(input(0)); output(0)->ReshapeLike(input(0));
...@@ -58,7 +58,7 @@ void CuDNNReluGradientOp<Context>::RunWithType() { ...@@ -58,7 +58,7 @@ void CuDNNReluGradientOp<Context>::RunWithType() {
template <class Context> template <class Context>
void CuDNNReluGradientOp<Context>::RunOnDevice() { void CuDNNReluGradientOp<Context>::RunOnDevice() {
// cudnn does not support LeakyRelu // cudnn does not support LeakyRelu
if (this->slope != 0) return ReluGradientOp<Context>::RunOnDevice(); if (this->slope != 0) return ReluGradientOp<Context>::RunOnDevice();
output(0)->ReshapeLike(input(0)); output(0)->ReshapeLike(input(0));
......
...@@ -71,10 +71,9 @@ void DropoutGradientOp<Context>::RunOnDevice() { ...@@ -71,10 +71,9 @@ void DropoutGradientOp<Context>::RunOnDevice() {
template <class Context> template <class Context>
void DropoutGradientOp<Context>::ClearAfterRun() { void DropoutGradientOp<Context>::ClearAfterRun() {
ws()->ReleaseBuffer(mask); ws()->ReleaseBuffer(mask, true);
} }
DEPLOY_CPU(DropoutGradient); DEPLOY_CPU(DropoutGradient);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(DropoutGradient); DEPLOY_CUDA(DropoutGradient);
......
...@@ -125,7 +125,7 @@ void DotGradientOp<Context>::GemvRunWithType() { ...@@ -125,7 +125,7 @@ void DotGradientOp<Context>::GemvRunWithType() {
} }
template <class Context> template <class Context>
void DotGradientOp<Context>::RunOnDevice(){ void DotGradientOp<Context>::RunOnDevice() {
output(0)->ReshapeLike(input(0)); output(0)->ReshapeLike(input(0));
output(1)->ReshapeLike(input(1)); output(1)->ReshapeLike(input(1));
......
...@@ -65,7 +65,7 @@ void EltwiseGradientOp<Context>::SumRunWithType() { ...@@ -65,7 +65,7 @@ void EltwiseGradientOp<Context>::SumRunWithType() {
auto* dYdata = input(-1).template data<T, Context>(); auto* dYdata = input(-1).template data<T, Context>();
TIndex count = input(-1).count(); TIndex count = input(-1).count();
for (int i = 0; i < OutputSize(); i++){ for (int i = 0; i < OutputSize(); i++) {
if (output(i)->name() == "ignore") continue; if (output(i)->name() == "ignore") continue;
auto* dXdata = output(i)->template mutable_data<T, Context>(); auto* dXdata = output(i)->template mutable_data<T, Context>();
if (coeffs[i] == float(1)) { if (coeffs[i] == float(1)) {
......
...@@ -8,7 +8,7 @@ template <class Context> template <typename T> ...@@ -8,7 +8,7 @@ template <class Context> template <typename T>
void GramMatrixOp<Context>::RunWithType() { void GramMatrixOp<Context>::RunWithType() {
auto* Xdata = input(0).template data<T, Context>(); auto* Xdata = input(0).template data<T, Context>();
auto* Ydata = output(0)->template mutable_data<T, Context>(); auto* Ydata = output(0)->template mutable_data<T, Context>();
for (int i = 0; i < outer_dim; i++){ for (int i = 0; i < outer_dim; i++) {
math::Gemm<T, Context>(CblasNoTrans, CblasTrans, math::Gemm<T, Context>(CblasNoTrans, CblasTrans,
dim, dim, inner_dim, 1.0, Xdata, Xdata, 0.0, Ydata); dim, dim, inner_dim, 1.0, Xdata, Xdata, 0.0, Ydata);
Xdata += x_offset; Xdata += x_offset;
...@@ -40,7 +40,7 @@ void GramMatrixGradientOp<Context>::RunWithType() { ...@@ -40,7 +40,7 @@ void GramMatrixGradientOp<Context>::RunWithType() {
auto* dYdata = input(-1).template data<T, Context>(); auto* dYdata = input(-1).template data<T, Context>();
auto* Xdata = input(0).template data<T, Context>(); auto* Xdata = input(0).template data<T, Context>();
auto* dXdata = output(0)->template mutable_data<T, Context>(); auto* dXdata = output(0)->template mutable_data<T, Context>();
for (int i = 0; i < outer_dim; i++){ for (int i = 0; i < outer_dim; i++) {
math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans, math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans,
dim, inner_dim, dim, 2.0, dYdata, Xdata, 0.0, dXdata); dim, inner_dim, dim, 2.0, dYdata, Xdata, 0.0, dXdata);
dYdata += y_offset; dYdata += y_offset;
...@@ -49,7 +49,7 @@ void GramMatrixGradientOp<Context>::RunWithType() { ...@@ -49,7 +49,7 @@ void GramMatrixGradientOp<Context>::RunWithType() {
} }
template <class Context> template <class Context>
void GramMatrixGradientOp<Context>::RunOnDevice(){ void GramMatrixGradientOp<Context>::RunOnDevice() {
outer_dim = input(0).count(0, axis); outer_dim = input(0).count(0, axis);
dim = input(0).dim(axis); dim = input(0).dim(axis);
inner_dim = input(0).count(axis + 1); inner_dim = input(0).count(axis + 1);
......
...@@ -81,7 +81,7 @@ void MatmulGradientOp<Context>::RunWithType() { ...@@ -81,7 +81,7 @@ void MatmulGradientOp<Context>::RunWithType() {
} }
template <class Context> template <class Context>
void MatmulGradientOp<Context>::RunOnDevice(){ void MatmulGradientOp<Context>::RunOnDevice() {
CHECK(input(0).ndim() == input(1).ndim()) CHECK(input(0).ndim() == input(1).ndim())
<< "both matrices must have the same number of dimensions."; << "both matrices must have the same number of dimensions.";
CHECK_GE(input(0).ndim(), 2) CHECK_GE(input(0).ndim(), 2)
......
...@@ -9,7 +9,7 @@ void PowOp<Context>::RunWithType() { ...@@ -9,7 +9,7 @@ void PowOp<Context>::RunWithType() {
TIndex count = input(0).count(); TIndex count = input(0).count();
auto* Ydata = output(0)->template mutable_data<T, Context>(); auto* Ydata = output(0)->template mutable_data<T, Context>();
if (power_scale == float(0)){ if (power_scale == float(0)) {
float value = (power == float(0)) ? float(1) : pow(shift, power); float value = (power == float(0)) ? float(1) : pow(shift, power);
math::Set<T, Context>(count, dragon_cast<T, float>(value), Ydata); math::Set<T, Context>(count, dragon_cast<T, float>(value), Ydata);
return; return;
......
...@@ -85,12 +85,12 @@ void ScaleGradientOp<Context>::ScaleRunWithType() { ...@@ -85,12 +85,12 @@ void ScaleGradientOp<Context>::ScaleRunWithType() {
T* SRes_data = nullptr; T* SRes_data = nullptr;
if (inner_dim == 1) { if (inner_dim == 1) {
SRes_data = tmp_data; SRes_data = tmp_data;
} else if (sum_result.count() == 1) { // handle inner only } else if (sum_result.count() == 1) { // handle inner only
dScale = output(1)->template mutable_data<T, CPUContext>(); dScale = output(1)->template mutable_data<T, CPUContext>();
T result = math::Dot<T, Context>(inner_dim, tmp_data, SMul_data); T result = math::Dot<T, Context>(inner_dim, tmp_data, SMul_data);
*dScale += result; *dScale += result;
} else { } else {
SRes_data = (outer_dim == 1) ? // handle scale only SRes_data = (outer_dim == 1) ? // handle scale only
dScale : sum_result.template mutable_data<T, Context>(); dScale : sum_result.template mutable_data<T, Context>();
math::Gemv<T, Context>(CblasNoTrans, sum_result.count(), inner_dim, math::Gemv<T, Context>(CblasNoTrans, sum_result.count(), inner_dim,
1.0, 1.0,
...@@ -99,7 +99,7 @@ void ScaleGradientOp<Context>::ScaleRunWithType() { ...@@ -99,7 +99,7 @@ void ScaleGradientOp<Context>::ScaleRunWithType() {
SRes_data); SRes_data);
} }
if (outer_dim != 1) { if (outer_dim != 1) {
if (scale_dim == 1) { // handle outer only if (scale_dim == 1) { // handle outer only
T result = math::Dot<T, Context>(outer_dim, SMul_data, SRes_data); T result = math::Dot<T, Context>(outer_dim, SMul_data, SRes_data);
*dScale += result; *dScale += result;
} else { } else {
......
...@@ -12,7 +12,7 @@ void SquareOp<Context>::RunWithType() { ...@@ -12,7 +12,7 @@ void SquareOp<Context>::RunWithType() {
} }
template <class Context> template <class Context>
void SquareOp<Context>::RunOnDevice(){ void SquareOp<Context>::RunOnDevice() {
output(0)->ReshapeLike(input(0)); output(0)->ReshapeLike(input(0));
if (input(0).template IsType<float>()) RunWithType<float>(); if (input(0).template IsType<float>()) RunWithType<float>();
......
...@@ -44,7 +44,7 @@ void SubOp<Context>::BroadcastRunWithType(int type) { ...@@ -44,7 +44,7 @@ void SubOp<Context>::BroadcastRunWithType(int type) {
} }
template <class Context> template <class Context>
void SubOp<Context>::RunOnDevice(){ void SubOp<Context>::RunOnDevice() {
output(0)->ReshapeLike(input(0)); output(0)->ReshapeLike(input(0));
if (input(0).dims() == input(1).dims()) { if (input(0).dims() == input(1).dims()) {
......
...@@ -6,7 +6,7 @@ namespace dragon { ...@@ -6,7 +6,7 @@ namespace dragon {
template <class Context> template <typename T> template <class Context> template <typename T>
void ArgmaxOp<Context>::RunWithType() { void ArgmaxOp<Context>::RunWithType() {
if (top_k != 1) { if (top_k != 1) {
// it's difficult to implement device code when top_k > 1 // it's difficult to implement device code when top_k > 1
auto* Xdata = input(0).template data<T, CPUContext>(); auto* Xdata = input(0).template data<T, CPUContext>();
auto* Ydata = output(0)->template mutable_data<T, CPUContext>(); auto* Ydata = output(0)->template mutable_data<T, CPUContext>();
kernel::Argmax<T, CPUContext>(count, input(0).dim(axis), inner_dim, kernel::Argmax<T, CPUContext>(count, input(0).dim(axis), inner_dim,
......
...@@ -7,7 +7,7 @@ namespace dragon { ...@@ -7,7 +7,7 @@ namespace dragon {
template <class Context> template <typename T> template <class Context> template <typename T>
void ConcatOp<Context>::RunWithType() { void ConcatOp<Context>::RunWithType() {
auto* Ydata = output(0)->template mutable_data<T, Context>(); auto* Ydata = output(0)->template mutable_data<T, Context>();
for (int i = 0; i < nin; i++){ for (int i = 0; i < nin; i++) {
auto* Xdata = input(i).template data<T, Context>(); auto* Xdata = input(i).template data<T, Context>();
TIndex count = input(i).count(); TIndex count = input(i).count();
x_concat_dim = input(i).dim(axis); x_concat_dim = input(i).dim(axis);
...@@ -25,12 +25,12 @@ void ConcatOp<Context>::RunWithType() { ...@@ -25,12 +25,12 @@ void ConcatOp<Context>::RunWithType() {
} }
template <class Context> template <class Context>
void ConcatOp<Context>::RunOnDevice(){ void ConcatOp<Context>::RunOnDevice() {
concat_dims = input(0).dims(); concat_dims = input(0).dims();
for (int i = 1; i < nin; i++) { for (int i = 1; i < nin; i++) {
CHECK_EQ(concat_dims.size(), input(i).ndim()) CHECK_EQ(concat_dims.size(), input(i).ndim())
<< "\nall inputs must have the same ndim."; << "\nall inputs must have the same ndim.";
for (int j = 0; j < concat_dims.size(); j++){ for (int j = 0; j < concat_dims.size(); j++) {
if (j == axis) continue; if (j == axis) continue;
CHECK_EQ(concat_dims[j], input(i).dim(j)) CHECK_EQ(concat_dims[j], input(i).dim(j))
<< "\nall inputs must have the same dims" << "\nall inputs must have the same dims"
...@@ -82,7 +82,7 @@ void ConcatGradientOp<Context>::RunWithType() { ...@@ -82,7 +82,7 @@ void ConcatGradientOp<Context>::RunWithType() {
} }
template <class Context> template <class Context>
void ConcatGradientOp<Context>::RunOnDevice(){ void ConcatGradientOp<Context>::RunOnDevice() {
if (input(-1).name() == "ignore") return; if (input(-1).name() == "ignore") return;
concat_dims = input(-1).dims(); concat_dims = input(-1).dims();
y_concat_dim = concat_dims[axis]; y_concat_dim = concat_dims[axis];
......
...@@ -27,7 +27,7 @@ OPERATOR_SCHEMA(Flatten).NumInputs(1).NumOutputs(1); ...@@ -27,7 +27,7 @@ OPERATOR_SCHEMA(Flatten).NumInputs(1).NumOutputs(1);
template <class Context> template <class Context>
void FlattenGradientOp<Context>::RunOnDevice(){ void FlattenGradientOp<Context>::RunOnDevice() {
output(0)->ReshapeLike(input(0)); output(0)->ReshapeLike(input(0));
output(0)->Share(input(1)); output(0)->Share(input(1));
} }
......
...@@ -8,7 +8,7 @@ RunOp<Context>::RunOp(const OperatorDef& op_def, Workspace* ws) ...@@ -8,7 +8,7 @@ RunOp<Context>::RunOp(const OperatorDef& op_def, Workspace* ws)
module(OperatorBase::GetSingleArg<string>("module", "")), module(OperatorBase::GetSingleArg<string>("module", "")),
op(OperatorBase::GetSingleArg<string>("op", "")), op(OperatorBase::GetSingleArg<string>("op", "")),
param_str((OperatorBase::GetSingleArg<string>("param_str", ""))) { param_str((OperatorBase::GetSingleArg<string>("param_str", ""))) {
// init interpreter & load module // init interpreter & load module
Py_Initialize(); Py_Initialize();
PyObject* py_module = PyImport_ImportModule(module.c_str()); PyObject* py_module = PyImport_ImportModule(module.c_str());
CHECK(py_module) << "\ncan not import py module: " << module; CHECK(py_module) << "\ncan not import py module: " << module;
...@@ -18,11 +18,11 @@ RunOp<Context>::RunOp(const OperatorDef& op_def, Workspace* ws) ...@@ -18,11 +18,11 @@ RunOp<Context>::RunOp(const OperatorDef& op_def, Workspace* ws)
<< " from module: " << module; << " from module: " << module;
self = PyObject_CallObject(py_op, NULL); self = PyObject_CallObject(py_op, NULL);
// pass param string // pass param string
PyObject_SetAttr(self, String("param_str"), String(param_str.c_str())); PyObject_SetAttr(self, String("param_str"), String(param_str.c_str()));
PyObject_SetAttr(self, String("param_str_"), String(param_str.c_str())); PyObject_SetAttr(self, String("param_str_"), String(param_str.c_str()));
// build inputs and outputs for Python // build inputs and outputs for Python
inputs = PyList_New(InputSize()); inputs = PyList_New(InputSize());
for (int i = 0; i < InputSize(); i++) for (int i = 0; i < InputSize(); i++)
PyList_SetItem(inputs, i, String(input(i).name().c_str())); PyList_SetItem(inputs, i, String(input(i).name().c_str()));
...@@ -31,21 +31,21 @@ RunOp<Context>::RunOp(const OperatorDef& op_def, Workspace* ws) ...@@ -31,21 +31,21 @@ RunOp<Context>::RunOp(const OperatorDef& op_def, Workspace* ws)
PyList_SetItem(outputs, i, String(output(i)->name().c_str())); PyList_SetItem(outputs, i, String(output(i)->name().c_str()));
if (!this->allow_run()) return; if (!this->allow_run()) return;
// setup // setup
if (PyObject_HasAttr(self, String("setup"))) if (PyObject_HasAttr(self, String("setup")))
PyObject_CallMethod(self, "setup", "OO", inputs, outputs); PyObject_CallMethod(self, "setup", "OO", inputs, outputs);
} }
template <class Context> template <class Context>
void RunOp<Context>::RunOnDevice() { void RunOp<Context>::RunOnDevice() {
// init phase // init phase
PyObject_SetAttr(self, String("phase"), String(this->phase().c_str())); PyObject_SetAttr(self, String("phase"), String(this->phase().c_str()));
// reshape // reshape
if (PyObject_HasAttr(self, String("reshape"))) if (PyObject_HasAttr(self, String("reshape")))
PyObject_CallMethod(self, "reshape", "OO", inputs, outputs); PyObject_CallMethod(self, "reshape", "OO", inputs, outputs);
// run // run
if (PyObject_HasAttr(self, String("forward"))) { if (PyObject_HasAttr(self, String("forward"))) {
PyObject_CallMethod(self, "forward", "OO", inputs, outputs); PyObject_CallMethod(self, "forward", "OO", inputs, outputs);
} else if (PyObject_HasAttr(self, String("run"))) { } else if (PyObject_HasAttr(self, String("run"))) {
...@@ -63,14 +63,14 @@ NO_GRADIENT(Run); ...@@ -63,14 +63,14 @@ NO_GRADIENT(Run);
template <class Context> template <class Context>
void TemplateGradientOp<Context>::RunOnDevice() { void TemplateGradientOp<Context>::RunOnDevice() {
// init phase // init phase
PyObject_SetAttr(this->self, String("phase"), String(this->phase().c_str())); PyObject_SetAttr(this->self, String("phase"), String(this->phase().c_str()));
// reshape // reshape
if (PyObject_HasAttr(this->self, String("reshape"))) if (PyObject_HasAttr(this->self, String("reshape")))
PyObject_CallMethod(this->self, "reshape", "OO", this->inputs, this->outputs); PyObject_CallMethod(this->self, "reshape", "OO", this->inputs, this->outputs);
// run // run
if (PyObject_HasAttr(this->self, String("backward"))) { if (PyObject_HasAttr(this->self, String("backward"))) {
PyObject_CallMethod(this->self, "forward", "OO", this->inputs, this->outputs); PyObject_CallMethod(this->self, "forward", "OO", this->inputs, this->outputs);
} else if (PyObject_HasAttr(this->self, String("grad"))) { } else if (PyObject_HasAttr(this->self, String("grad"))) {
......
...@@ -78,7 +78,7 @@ void ReduceGradientOp<Context>::SumRunWithType() { ...@@ -78,7 +78,7 @@ void ReduceGradientOp<Context>::SumRunWithType() {
template <class Context> template <typename T> template <class Context> template <typename T>
void ReduceGradientOp<Context>::MeanRunWithType() { void ReduceGradientOp<Context>::MeanRunWithType() {
auto* dXdata = output(0)->template mutable_data<T, Context>(); auto* dXdata = output(0)->template mutable_data<T, Context>();
if (axis == -1){ if (axis == -1) {
auto* dYdata = input(-1).template data<T, CPUContext>(); auto* dYdata = input(-1).template data<T, CPUContext>();
math::Set<T, Context>(output(0)->count(), dYdata[0] / input(0).count(), dXdata); math::Set<T, Context>(output(0)->count(), dYdata[0] / input(0).count(), dXdata);
} else { } else {
......
...@@ -8,18 +8,18 @@ void ReshapeOp<Context>::RunOnDevice() { ...@@ -8,18 +8,18 @@ void ReshapeOp<Context>::RunOnDevice() {
int infer_dim = -1; int infer_dim = -1;
TIndex total_count = 1; TIndex total_count = 1;
for (int i = 0; i < shape.size(); i++) { for (int i = 0; i < shape.size(); i++) {
// handle unchanged dim // handle unchanged dim
if (shape[i] == 0) { if (shape[i] == 0) {
CHECK_LT(i, (int)Xdims.size()) CHECK_LT(i, (int)Xdims.size())
<< "\ndim(" << i << ") is out of the Xdims range of (0, " << "\ndim(" << i << ") is out of the Xdims range of (0, "
<< Xdims.size() << ")."; << Xdims.size() << ").";
new_shape[i] = Xdims[i]; new_shape[i] = Xdims[i];
} }
// handle reseted dim // handle reseted dim
else if (shape[i] > 0) { else if (shape[i] > 0) {
new_shape[i] = shape[i]; new_shape[i] = shape[i];
} }
// handle inferred dim // handle inferred dim
else { else {
CHECK_EQ(infer_dim, -1) CHECK_EQ(infer_dim, -1)
<< "\ndim(" << infer_dim << ") required infer before" << "\ndim(" << infer_dim << ") required infer before"
...@@ -30,7 +30,7 @@ void ReshapeOp<Context>::RunOnDevice() { ...@@ -30,7 +30,7 @@ void ReshapeOp<Context>::RunOnDevice() {
if (new_shape[i] != -1) total_count *= new_shape[i]; if (new_shape[i] != -1) total_count *= new_shape[i];
} }
// solve inferred dim if necessary // solve inferred dim if necessary
if (infer_dim != -1) { if (infer_dim != -1) {
for (int i = 0; i < new_shape.size(); i++) { for (int i = 0; i < new_shape.size(); i++) {
if (new_shape[i] == -1) { if (new_shape[i] == -1) {
......
...@@ -25,7 +25,7 @@ void ScanOp<Context>::InitTemplate() { ...@@ -25,7 +25,7 @@ void ScanOp<Context>::InitTemplate() {
slice_def.add_arg()->CopyFrom(arg_nout); slice_def.add_arg()->CopyFrom(arg_nout);
template_def.mutable_device_option()->CopyFrom(op_def().device_option()); template_def.mutable_device_option()->CopyFrom(op_def().device_option());
template_def.set_debug_mode(debug_mode); template_def.set_debug_mode(debug_mode);
// init for the first step // init for the first step
for (int i = 0; i < nseqs; i++) { for (int i = 0; i < nseqs; i++) {
OperatorDef* op = template_def.add_op(); OperatorDef* op = template_def.add_op();
op->CopyFrom(slice_def); op->CopyFrom(slice_def);
...@@ -37,19 +37,19 @@ void ScanOp<Context>::InitTemplate() { ...@@ -37,19 +37,19 @@ void ScanOp<Context>::InitTemplate() {
OperatorDef* op = template_def.add_op(); OperatorDef* op = template_def.add_op();
op->CopyFrom(func_def.op(i)); op->CopyFrom(func_def.op(i));
op->set_name(name() + "(BodyOp." + str(i + nseqs) + ")@1"); op->set_name(name() + "(BodyOp." + str(i + nseqs) + ")@1");
// replace inputs term // replace inputs term
for (int j = 0; j < op->input_size(); j++) { for (int j = 0; j < op->input_size(); j++) {
string* input = op->mutable_input(j); string* input = op->mutable_input(j);
if (terms.count(*input)) *input = terms[*input]; if (terms.count(*input)) *input = terms[*input];
} }
// replace outputs term // replace outputs term
for (int j = 0; j < op->output_size(); j++) { for (int j = 0; j < op->output_size(); j++) {
string* output = op->mutable_output(j); string* output = op->mutable_output(j);
terms[*output] = *output + "@1"; terms[*output] = *output + "@1";
*output = terms[*output]; *output = terms[*output];
} }
} }
// handle pre outputs // handle pre outputs
for (int i = 0; i < nout; i++) { for (int i = 0; i < nout; i++) {
if (default_outputs[i].empty()) continue; if (default_outputs[i].empty()) continue;
terms[default_outputs[i]] = func_def.target(i) + "@1"; terms[default_outputs[i]] = func_def.target(i) + "@1";
...@@ -59,15 +59,15 @@ void ScanOp<Context>::InitTemplate() { ...@@ -59,15 +59,15 @@ void ScanOp<Context>::InitTemplate() {
template <class Context> template <class Context>
void ScanOp<Context>::UpdateTerms(int cur_step) { void ScanOp<Context>::UpdateTerms(int cur_step) {
string prev, now; string prev, now;
// update sequences term // update sequences term
for (int i = 0; i < nseqs; i++) { for (int i = 0; i < nseqs; i++) {
prev = input(i).name() + "@" + str(cur_step - 1); prev = input(i).name() + "@" + str(cur_step - 1);
now = input(i).name() + "@" + str(cur_step); now = input(i).name() + "@" + str(cur_step);
terms[prev] = now; terms[prev] = now;
} }
if (cur_step < 3) return; if (cur_step < 3) return;
// update recurrent term // update recurrent term
// only support the latest one-step (as Theano's done) // only support the latest one-step (as Theano's done)
for (int i = 0; i < nout; i++) { for (int i = 0; i < nout; i++) {
if (default_outputs[i].empty()) continue; if (default_outputs[i].empty()) continue;
prev = output(i)->name() + "@" + str(cur_step - 2); prev = output(i)->name() + "@" + str(cur_step - 2);
...@@ -93,15 +93,15 @@ void ScanOp<Context>::UnrollTemplate() { ...@@ -93,15 +93,15 @@ void ScanOp<Context>::UnrollTemplate() {
for (int idx = 0; idx < nseqs; idx++) { for (int idx = 0; idx < nseqs; idx++) {
OperatorDef *op = new_def.mutable_op(idx); OperatorDef *op = new_def.mutable_op(idx);
int nslices = input(idx).dim(axis); int nslices = input(idx).dim(axis);
// alter the num of slices for all sequences // alter the num of slices for all sequences
op->mutable_arg(1)->set_i(nslices); op->mutable_arg(1)->set_i(nslices);
// add slices as outputs // add slices as outputs
for (int t = 1; t <= nslices; t++) { for (int t = 1; t <= nslices; t++) {
string slice = op->input(0) + "@" + str(t); string slice = op->input(0) + "@" + str(t);
op->add_output(slice); op->add_output(slice);
} }
} }
// main loop // main loop
for (int t = 2; t <= nsteps; t++) { for (int t = 2; t <= nsteps; t++) {
UpdateTerms(t); UpdateTerms(t);
int copy_r = new_def.op_size(), copy_l = copy_r - nrepeats; int copy_r = new_def.op_size(), copy_l = copy_r - nrepeats;
...@@ -109,12 +109,12 @@ void ScanOp<Context>::UnrollTemplate() { ...@@ -109,12 +109,12 @@ void ScanOp<Context>::UnrollTemplate() {
OperatorDef* op = new_def.add_op(); OperatorDef* op = new_def.add_op();
op->CopyFrom(new_def.op(idx)); op->CopyFrom(new_def.op(idx));
op->set_name(SplitString(op->name(), "@")[0] + "@" + str(t)); op->set_name(SplitString(op->name(), "@")[0] + "@" + str(t));
// replace inputs // replace inputs
for (int j = 0; j < op->input_size(); j++) { for (int j = 0; j < op->input_size(); j++) {
string* input = op->mutable_input(j); string* input = op->mutable_input(j);
if (terms.count(*input)) *input = terms[*input]; if (terms.count(*input)) *input = terms[*input];
} }
// replace outputs // replace outputs
for (int j = 0; j < op->output_size(); j++) { for (int j = 0; j < op->output_size(); j++) {
string* output = op->mutable_output(j); string* output = op->mutable_output(j);
terms[*output] = SplitString(*output, "@")[0] + "@" + str(t); terms[*output] = SplitString(*output, "@")[0] + "@" + str(t);
...@@ -123,9 +123,9 @@ void ScanOp<Context>::UnrollTemplate() { ...@@ -123,9 +123,9 @@ void ScanOp<Context>::UnrollTemplate() {
} }
} }
for (int i = 0; i < nout; i++) { for (int i = 0; i < nout; i++) {
// solve the last step only // solve the last step only
new_def.add_target(func_def.target(i) + "@" + str(nsteps)); new_def.add_target(func_def.target(i) + "@" + str(nsteps));
// concat all steps if necessary // concat all steps if necessary
if (output(i)->name() == "ignore") continue; if (output(i)->name() == "ignore") continue;
OperatorDef* op = new_def.add_op(); OperatorDef* op = new_def.add_op();
op->set_name(name() + "(BodyOp." + str(nseqs + nrepeats + i) + ")"); op->set_name(name() + "(BodyOp." + str(nseqs + nrepeats + i) + ")");
...@@ -138,10 +138,10 @@ void ScanOp<Context>::UnrollTemplate() { ...@@ -138,10 +138,10 @@ void ScanOp<Context>::UnrollTemplate() {
for (int t = 1; t <= nsteps; t++) for (int t = 1; t <= nsteps; t++)
op->add_input(output(i)->name() + "@" + str(t)); op->add_input(output(i)->name() + "@" + str(t));
op->add_output(output(i)->name()); op->add_output(output(i)->name());
// solve all the all steps // solve all the all steps
new_def.add_target(output(i)->name()); new_def.add_target(output(i)->name());
} }
// upload // upload
Tensor* string_tensor = ws()->CreateTensor("_t_" + anchor() + "_raw_ops"); Tensor* string_tensor = ws()->CreateTensor("_t_" + anchor() + "_raw_ops");
string_tensor->Reshape(vector<TIndex>(1, 1)); string_tensor->Reshape(vector<TIndex>(1, 1));
string* data = string_tensor->mutable_data <string, CPUContext>(); string* data = string_tensor->mutable_data <string, CPUContext>();
...@@ -150,13 +150,11 @@ void ScanOp<Context>::UnrollTemplate() { ...@@ -150,13 +150,11 @@ void ScanOp<Context>::UnrollTemplate() {
template <class Context> template <class Context>
void ScanOp<Context>::RunOnDevice() { void ScanOp<Context>::RunOnDevice() {
// unroll
UnrollTemplate(); UnrollTemplate();
if (!graphs.count(nsteps)) if (!graphs.count(nsteps)) {
graphs[nsteps].reset(new Graph(new_def, ws())); graphs[nsteps].reset(new Graph(new_def, ws()));
}
cur_graph = graphs[nsteps].get(); cur_graph = graphs[nsteps].get();
// forward
cur_graph->Run("", ""); cur_graph->Run("", "");
} }
...@@ -201,13 +199,11 @@ void ScanGradientOp<Context>::MakeGradientOps() { ...@@ -201,13 +199,11 @@ void ScanGradientOp<Context>::MakeGradientOps() {
template <class Context> template <class Context>
void ScanGradientOp<Context>::RunOnDevice() { void ScanGradientOp<Context>::RunOnDevice() {
// make graph
MakeGradientOps(); MakeGradientOps();
if (!graphs.count(nsteps)) if (!graphs.count(nsteps)) {
graphs[nsteps].reset(new Graph(new_def, ws())); graphs[nsteps].reset(new Graph(new_def, ws()));
}
cur_graph = graphs[nsteps].get(); cur_graph = graphs[nsteps].get();
// backward
cur_graph->Run("Gradient", ""); cur_graph->Run("Gradient", "");
} }
......
...@@ -7,7 +7,7 @@ namespace dragon { ...@@ -7,7 +7,7 @@ namespace dragon {
template <class Context> template <typename T> template <class Context> template <typename T>
void SliceOp<Context>::RunWithType() { void SliceOp<Context>::RunWithType() {
auto* Xdata = input(0).template data<T, Context>(); auto* Xdata = input(0).template data<T, Context>();
for (int i = 0; i < nout; i++){ for (int i = 0; i < nout; i++) {
auto* Ydata = output(i)->template mutable_data<T, Context>(); auto* Ydata = output(i)->template mutable_data<T, Context>();
TIndex count = output(i)->count(); TIndex count = output(i)->count();
kernel::Slice<T, Context>(count, outer_dim, inner_dim, kernel::Slice<T, Context>(count, outer_dim, inner_dim,
...@@ -46,7 +46,7 @@ OPERATOR_SCHEMA(Slice).NumInputs(1).NumOutputs(1, INT_MAX); ...@@ -46,7 +46,7 @@ OPERATOR_SCHEMA(Slice).NumInputs(1).NumOutputs(1, INT_MAX);
template <class Context> template <typename T> template <class Context> template <typename T>
void SliceGradientOp<Context>::RunWithType() { void SliceGradientOp<Context>::RunWithType() {
auto* dXdata = output(0)->template mutable_data<T, Context>(); auto* dXdata = output(0)->template mutable_data<T, Context>();
for (int i = 0; i < nout; i++){ for (int i = 0; i < nout; i++) {
if (input(i + 1).name() == "ignore") continue; if (input(i + 1).name() == "ignore") continue;
auto* dYdata = input(i + 1).template data<T, Context>(); auto* dYdata = input(i + 1).template data<T, Context>();
TIndex count = input(i + 1).count(); TIndex count = input(i + 1).count();
......
...@@ -60,11 +60,11 @@ void AccuracyOp<Context>::RunWithType() { ...@@ -60,11 +60,11 @@ void AccuracyOp<Context>::RunWithType() {
} }
} }
count++; count++;
} // end inner_num } // end inner_num
} // end outer_num } // end outer_num
output(0)->template mutable_data<T, CPUContext>()[0] = acc / count; output(0)->template mutable_data<T, CPUContext>()[0] = acc / count;
if (OutputSize() > 1){ if (OutputSize() > 1) {
auto* acc_per_class = output(1)->template mutable_data<T, CPUContext>(); auto* acc_per_class = output(1)->template mutable_data<T, CPUContext>();
for (int i = 0; i < classes; i++) for (int i = 0; i < classes; i++)
acc_per_class[i] = num_per_class[i] == 0 ? 0 : acc_per_class[i] / acc_per_class[i]; acc_per_class[i] = num_per_class[i] == 0 ? 0 : acc_per_class[i] / acc_per_class[i];
......
...@@ -28,7 +28,7 @@ void L1LossOp<Context>::RunWithType() { ...@@ -28,7 +28,7 @@ void L1LossOp<Context>::RunWithType() {
} }
template <class Context> template <class Context>
void L1LossOp<Context>::RunOnDevice(){ void L1LossOp<Context>::RunOnDevice() {
CHECK_EQ(input(0).count(), input(1).count()); CHECK_EQ(input(0).count(), input(1).count());
output(0)->Reshape(vector<TIndex>(1, 1)); output(0)->Reshape(vector<TIndex>(1, 1));
diff = ws()->CreateTensor("_t_" + anchor() + "_l1_loss_diff"); diff = ws()->CreateTensor("_t_" + anchor() + "_l1_loss_diff");
...@@ -53,7 +53,7 @@ void L1LossGradientOp<Context>::RunWithType() { ...@@ -53,7 +53,7 @@ void L1LossGradientOp<Context>::RunWithType() {
else if (normalization == "FULL") normalizer = input(0).count(); else if (normalization == "FULL") normalizer = input(0).count();
else if (normalization == "NONE") normalizer = 1; else if (normalization == "NONE") normalizer = 1;
alpha = alpha / normalizer; alpha = alpha / normalizer;
for (int i = 0; i < 2; i++){ for (int i = 0; i < 2; i++) {
if (output(i)->name() == "ignore") continue; if (output(i)->name() == "ignore") continue;
output(i)->ReshapeLike(input(i)); output(i)->ReshapeLike(input(i));
auto* dXdata = output(i)->template mutable_data<T, Context>(); auto* dXdata = output(i)->template mutable_data<T, Context>();
......
...@@ -11,7 +11,7 @@ void L2LossOp<Context>::RunWithType() { ...@@ -11,7 +11,7 @@ void L2LossOp<Context>::RunWithType() {
auto* diff_data = diff->template mutable_data<T, Context>(); auto* diff_data = diff->template mutable_data<T, Context>();
auto* Ydata = output(0)->template mutable_data<T, CPUContext>(); auto* Ydata = output(0)->template mutable_data<T, CPUContext>();
math::Sub<T, Context>(diff->count(), X0data, X1data, diff_data); math::Sub<T, Context>(diff->count(), X0data, X1data, diff_data);
if (InputSize() > 2){ if (InputSize() > 2) {
CHECK_EQ(input(0).count(), input(2).count()); CHECK_EQ(input(0).count(), input(2).count());
auto* Wdata = input(2).template data<T, Context>(); auto* Wdata = input(2).template data<T, Context>();
math::Mul<T, Context>(diff->count(), Wdata, diff_data, diff_data); math::Mul<T, Context>(diff->count(), Wdata, diff_data, diff_data);
......
...@@ -14,12 +14,12 @@ void SmoothL1LossOp<Context>::RunWithType() { ...@@ -14,12 +14,12 @@ void SmoothL1LossOp<Context>::RunWithType() {
auto* Ydata = output(0)->template mutable_data<T, CPUContext>(); auto* Ydata = output(0)->template mutable_data<T, CPUContext>();
math::Sub<T, Context>(diff->count(), X0data, X1data, diff_data); math::Sub<T, Context>(diff->count(), X0data, X1data, diff_data);
if (InputSize() > 2){ if (InputSize() > 2) {
auto* inside_w_data = input(2).template data<T, Context>(); auto* inside_w_data = input(2).template data<T, Context>();
math::Mul<T, Context>(diff->count(), inside_w_data, diff_data, diff_data); math::Mul<T, Context>(diff->count(), inside_w_data, diff_data, diff_data);
} }
kernel::SmoothL1<T, Context>(diff->count(), sigma2, diff_data, error_data); kernel::SmoothL1<T, Context>(diff->count(), sigma2, diff_data, error_data);
if (InputSize() > 3){ if (InputSize() > 3) {
auto* outside_w_data = input(3).template data<T, Context>(); auto* outside_w_data = input(3).template data<T, Context>();
math::Mul<T, Context>(diff->count(), outside_w_data, error_data, error_data); math::Mul<T, Context>(diff->count(), outside_w_data, error_data, error_data);
} }
......
...@@ -69,7 +69,7 @@ void SoftmaxCrossEntropyLossGradientOp<Context>::RunWithType() { ...@@ -69,7 +69,7 @@ void SoftmaxCrossEntropyLossGradientOp<Context>::RunWithType() {
return; return;
} }
// normalize // normalize
T normalizer; T normalizer;
if (normalization == "BATCH_SIZE") normalizer = outer_dim; if (normalization == "BATCH_SIZE") normalizer = outer_dim;
else if (normalization == "FULL") normalizer = outer_dim * inner_dim; else if (normalization == "FULL") normalizer = outer_dim * inner_dim;
......
...@@ -26,14 +26,14 @@ void MPIBroadcastOp<Context>::RunWithType() { ...@@ -26,14 +26,14 @@ void MPIBroadcastOp<Context>::RunWithType() {
} }
template <class Context> template <class Context>
void MPIBroadcastOp<Context>::RunOnDevice(){ void MPIBroadcastOp<Context>::RunOnDevice() {
CHECK(this->comm != MPI_COMM_NULL) CHECK(this->comm != MPI_COMM_NULL)
<< "\nMPIBroadcastOp, name: " << name() << "\nMPIBroadcastOp, name: " << name()
<< ", does not belong to any group, can't run."; << ", does not belong to any group, can't run.";
size_t ndim[1]; size_t ndim[1];
TIndex* dims = nullptr; TIndex* dims = nullptr;
if (this->comm_rank == this->comm_root){ if (this->comm_rank == this->comm_root) {
ndim[0] = input(0).ndim(); ndim[0] = input(0).ndim();
dims = new TIndex[ndim[0]]; dims = new TIndex[ndim[0]];
for (int i = 0; i < input(0).ndim(); i++) for (int i = 0; i < input(0).ndim(); i++)
...@@ -90,7 +90,7 @@ void MPIBroadcastGradientOp<Context>::RunWithType() { ...@@ -90,7 +90,7 @@ void MPIBroadcastGradientOp<Context>::RunWithType() {
} }
template <class Context> template <class Context>
void MPIBroadcastGradientOp<Context>::RunOnDevice(){ void MPIBroadcastGradientOp<Context>::RunOnDevice() {
output(0)->ReshapeLike(input(-1)); output(0)->ReshapeLike(input(-1));
if (input(0).template IsType<float>()) RunWithType<float>(); if (input(0).template IsType<float>()) RunWithType<float>();
......
...@@ -51,7 +51,7 @@ template <class Context> template <typename T> ...@@ -51,7 +51,7 @@ template <class Context> template <typename T>
void MPIGatherGradientOp<Context>::RunWithType() { void MPIGatherGradientOp<Context>::RunWithType() {
if (this->comm_rank == this->comm_root) { if (this->comm_rank == this->comm_root) {
output(0)->Share(input(this->comm_rank + 1)); output(0)->Share(input(this->comm_rank + 1));
for (int i = 0; i < this->comm_size; i++){ for (int i = 0; i < this->comm_size; i++) {
if (i == this->comm_root) continue; if (i == this->comm_root) continue;
#ifdef WITH_CUDA_AWARE #ifdef WITH_CUDA_AWARE
auto* dYdata = input(this->comm_rank + 1).template data<T, Context>(); auto* dYdata = input(this->comm_rank + 1).template data<T, Context>();
......
...@@ -120,7 +120,7 @@ void BatchNormOp<Context>::RunOnDevice() { ...@@ -120,7 +120,7 @@ void BatchNormOp<Context>::RunOnDevice() {
if (use_stats == -1) use_global_stats = phase() == "TEST" ? true : false; if (use_stats == -1) use_global_stats = phase() == "TEST" ? true : false;
else use_global_stats = use_stats == 1 ? true : false; else use_global_stats = use_stats == 1 ? true : false;
// if true, Act/Exp/Pow/Norm Ops can not exist before when train // if true, Act/Exp/Pow/Norm Ops can not exist before when train
if (inplace) output(0)->Share(input(0)); if (inplace) output(0)->Share(input(0));
if (input(0).template IsType<float>()) RunWithType<float>(); if (input(0).template IsType<float>()) RunWithType<float>();
...@@ -171,7 +171,7 @@ void BatchNormGradientOp<Context>::RunWithType() { ...@@ -171,7 +171,7 @@ void BatchNormGradientOp<Context>::RunWithType() {
auto* Ydata = input(-2).template data<T, Context>(); auto* Ydata = input(-2).template data<T, Context>();
math::Mul<T, Context>(output(0)->count(), Ydata, dYdata, dXdata); math::Mul<T, Context>(output(0)->count(), Ydata, dYdata, dXdata);
// sum(dE/dY \cdot Y) // sum(dE/dY \cdot Y)
math::Gemv<T, Context>(CblasNoTrans, nbychans, spatial_dim, math::Gemv<T, Context>(CblasNoTrans, nbychans, spatial_dim,
1.0, 1.0,
dXdata, SMul_data, dXdata, SMul_data,
...@@ -193,10 +193,10 @@ void BatchNormGradientOp<Context>::RunWithType() { ...@@ -193,10 +193,10 @@ void BatchNormGradientOp<Context>::RunWithType() {
0.0, 0.0,
dXdata); dXdata);
// sum(dE/dY \cdot Y) \cdot Y // sum(dE/dY \cdot Y) \cdot Y
math::Mul<T, Context>(output(0)->count(), Ydata, dXdata, dXdata); math::Mul<T, Context>(output(0)->count(), Ydata, dXdata, dXdata);
// sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y // sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y
math::Gemv<T, Context>(CblasNoTrans, nbychans, spatial_dim, math::Gemv<T, Context>(CblasNoTrans, nbychans, spatial_dim,
1.0, 1.0,
dYdata, SMul_data, dYdata, SMul_data,
...@@ -224,7 +224,7 @@ void BatchNormGradientOp<Context>::RunWithType() { ...@@ -224,7 +224,7 @@ void BatchNormGradientOp<Context>::RunWithType() {
-1.0 / (num * spatial_dim), -1.0 / (num * spatial_dim),
dXdata); dXdata);
// divide by stddev // divide by stddev
math::Div<T, Context>(output(0)->count(), dXdata, Std_data, dXdata); math::Div<T, Context>(output(0)->count(), dXdata, Std_data, dXdata);
// release buffer // release buffer
......
...@@ -60,7 +60,7 @@ void BatchRenormOp<Context>::RunWithType() { ...@@ -60,7 +60,7 @@ void BatchRenormOp<Context>::RunWithType() {
ctx().template Copy<T, Context, Context>(input(0).count(), Ydata, Xdata); ctx().template Copy<T, Context, Context>(input(0).count(), Ydata, Xdata);
} }
// subtract mean // subtract mean
math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans, num, channels, 1, math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans, num, channels, 1,
1.0, 1.0,
NMul_data, tMean_data, NMul_data, tMean_data,
...@@ -73,7 +73,7 @@ void BatchRenormOp<Context>::RunWithType() { ...@@ -73,7 +73,7 @@ void BatchRenormOp<Context>::RunWithType() {
Ydata); Ydata);
if (!use_global_stats) { if (!use_global_stats) {
// Var(X) = E((X - EX) ^ 2) // Var(X) = E((X - EX) ^ 2)
math::Pow<T, Context>(stddev->count(), 2, Ydata, Std_data); math::Pow<T, Context>(stddev->count(), 2, Ydata, Std_data);
math::Gemv<T, Context>(CblasNoTrans, nbychans, spatial_dim, math::Gemv<T, Context>(CblasNoTrans, nbychans, spatial_dim,
1.0 / (num * spatial_dim), 1.0 / (num * spatial_dim),
...@@ -85,7 +85,7 @@ void BatchRenormOp<Context>::RunWithType() { ...@@ -85,7 +85,7 @@ void BatchRenormOp<Context>::RunWithType() {
NByC_data, NMul_data, NByC_data, NMul_data,
0.0, 0.0,
tVar_data); tVar_data);
// update moving average // update moving average
hFact_data[0] *= momentum; hFact_data[0] += 1; hFact_data[0] *= momentum; hFact_data[0] += 1;
int m = input(0).count() / channels; int m = input(0).count() / channels;
T factor = m > 1 ? T(m) / (m - 1) : 1; T factor = m > 1 ? T(m) / (m - 1) : 1;
...@@ -93,31 +93,31 @@ void BatchRenormOp<Context>::RunWithType() { ...@@ -93,31 +93,31 @@ void BatchRenormOp<Context>::RunWithType() {
math::Axpby<T, Context>(mean.count(), factor, tVar_data, momentum, hVar_data); math::Axpby<T, Context>(mean.count(), factor, tVar_data, momentum, hVar_data);
} }
// normalize var // normalize var
math::AddScalar<T, Context>(mean.count(), eps, tVar_data); math::AddScalar<T, Context>(mean.count(), eps, tVar_data);
math::Pow<T, Context>(mean.count(), 0.5, tVar_data, tVar_data); math::Pow<T, Context>(mean.count(), 0.5, tVar_data, tVar_data);
if (!use_global_stats) { if (!use_global_stats) {
// normalize history var // normalize history var
math::AddScalar<T, Context>(mean.count(), eps, thVar_data); math::AddScalar<T, Context>(mean.count(), eps, thVar_data);
math::Pow<T, Context>(mean.count(), 0.5, thVar_data, thVar_data); math::Pow<T, Context>(mean.count(), 0.5, thVar_data, thVar_data);
// compute r // compute r
math::Div<T, Context>(mean.count(), tVar_data, thVar_data, tRdata); math::Div<T, Context>(mean.count(), tVar_data, thVar_data, tRdata);
math::Clip<T, Context>(mean.count(), 1.0 / t_r_max, t_r_max, tRdata); math::Clip<T, Context>(mean.count(), 1.0 / t_r_max, t_r_max, tRdata);
// compute d // compute d
math::Sub<T, Context>(mean.count(), tMean_data, thMean_data, tDdata); math::Sub<T, Context>(mean.count(), tMean_data, thMean_data, tDdata);
math::Div<T, Context>(mean.count(), tDdata, thVar_data, tDdata); math::Div<T, Context>(mean.count(), tDdata, thVar_data, tDdata);
math::Clip<T, Context>(mean.count(), -t_d_max, t_d_max, tDdata); math::Clip<T, Context>(mean.count(), -t_d_max, t_d_max, tDdata);
// update the bound of r & d // update the bound of r & d
t_r_max = r_max / (1.0 + (r_max - 1.0) * exp(-t_val)); t_r_max = r_max / (1.0 + (r_max - 1.0) * exp(-t_val));
t_d_max = d_max / (1.0 + (d_max - 1.0) * exp(-2 * t_val)); t_d_max = d_max / (1.0 + (d_max - 1.0) * exp(-2 * t_val));
t_val += t_delta; t_val += t_delta;
} }
// divide by var // divide by var
math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans, num, channels, 1, math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans, num, channels, 1,
1.0, 1.0,
NMul_data, tVar_data, NMul_data, tVar_data,
...@@ -131,11 +131,11 @@ void BatchRenormOp<Context>::RunWithType() { ...@@ -131,11 +131,11 @@ void BatchRenormOp<Context>::RunWithType() {
math::Div<T, Context>(stddev->count(), Ydata, Std_data, Ydata); math::Div<T, Context>(stddev->count(), Ydata, Std_data, Ydata);
if (!use_global_stats) { if (!use_global_stats) {
// store x_norm for backward // store x_norm for backward
XNorm_data = x_norm->template mutable_data<T, Context>(); XNorm_data = x_norm->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(output(0)->count(), XNorm_data, Ydata); ctx().template Copy<T, Context, Context>(output(0)->count(), XNorm_data, Ydata);
// correction: mul by r // correction: mul by r
math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans, num, channels, 1, math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans, num, channels, 1,
1.0, 1.0,
NMul_data, tRdata, NMul_data, tRdata,
...@@ -183,7 +183,7 @@ void BatchRenormOp<Context>::RunOnDevice() { ...@@ -183,7 +183,7 @@ void BatchRenormOp<Context>::RunOnDevice() {
if (use_stats == -1) use_global_stats = phase() == "TEST" ? true : false; if (use_stats == -1) use_global_stats = phase() == "TEST" ? true : false;
else use_global_stats = use_stats == 1 ? true : false; else use_global_stats = use_stats == 1 ? true : false;
// if true, Act/Exp/Pow/Norm Ops can not exist before when train // if true, Act/Exp/Pow/Norm Ops can not exist before when train
if (inplace) output(0)->Share(input(0)); if (inplace) output(0)->Share(input(0));
if (input(0).template IsType<float>()) RunWithType<float>(); if (input(0).template IsType<float>()) RunWithType<float>();
...@@ -233,7 +233,7 @@ void BatchRenormGradientOp<Context>::RunWithType() { ...@@ -233,7 +233,7 @@ void BatchRenormGradientOp<Context>::RunWithType() {
auto* XNorm_data = x_norm->template data<T, Context>(); auto* XNorm_data = x_norm->template data<T, Context>();
auto* tMean_data = mean.template mutable_data<T, Context>(); auto* tMean_data = mean.template mutable_data<T, Context>();
// buffer <- dE/dY \cdot r // buffer <- dE/dY \cdot r
math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans, num, channels, 1, math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans, num, channels, 1,
1.0, 1.0,
NMul_data, tRdata, NMul_data, tRdata,
...@@ -246,7 +246,7 @@ void BatchRenormGradientOp<Context>::RunWithType() { ...@@ -246,7 +246,7 @@ void BatchRenormGradientOp<Context>::RunWithType() {
Std_data); Std_data);
math::Mul<T, Context>(output(0)->count(), dYdata, Std_data, Std_data); math::Mul<T, Context>(output(0)->count(), dYdata, Std_data, Std_data);
// sum(dE/dY \cdot Y) // sum(dE/dY \cdot Y)
math::Mul<T, Context>(output(0)->count(), XNorm_data, Std_data, dXdata); math::Mul<T, Context>(output(0)->count(), XNorm_data, Std_data, dXdata);
math::Gemv<T, Context>(CblasNoTrans, nbychans, spatial_dim, math::Gemv<T, Context>(CblasNoTrans, nbychans, spatial_dim,
1.0, 1.0,
...@@ -269,10 +269,10 @@ void BatchRenormGradientOp<Context>::RunWithType() { ...@@ -269,10 +269,10 @@ void BatchRenormGradientOp<Context>::RunWithType() {
0.0, 0.0,
dXdata); dXdata);
// sum(dE/dY \cdot Y) \cdot Y // sum(dE/dY \cdot Y) \cdot Y
math::Mul<T, Context>(output(0)->count(), XNorm_data, dXdata, dXdata); math::Mul<T, Context>(output(0)->count(), XNorm_data, dXdata, dXdata);
// sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y // sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y
math::Gemv<T, Context>(CblasNoTrans, nbychans, spatial_dim, math::Gemv<T, Context>(CblasNoTrans, nbychans, spatial_dim,
1.0, 1.0,
Std_data, SMul_data, Std_data, SMul_data,
...@@ -293,13 +293,13 @@ void BatchRenormGradientOp<Context>::RunWithType() { ...@@ -293,13 +293,13 @@ void BatchRenormGradientOp<Context>::RunWithType() {
NByC_data, SMul_data, NByC_data, SMul_data,
1.0, dXdata); 1.0, dXdata);
// dE/dY - mean(dE/dY)- mean(dE/dY \cdot Y) \cdot Y // dE/dY - mean(dE/dY)- mean(dE/dY \cdot Y) \cdot Y
// = dE/dY - mean(sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y) // = dE/dY - mean(sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y)
math::Axpby<T, Context>(output(0)->count(), 1.0, Std_data, math::Axpby<T, Context>(output(0)->count(), 1.0, Std_data,
-1.0 / (num * spatial_dim), -1.0 / (num * spatial_dim),
dXdata); dXdata);
// divide var // divide var
math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans, num, channels, 1, math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans, num, channels, 1,
1.0, 1.0,
NMul_data, tVar_data, NMul_data, tVar_data,
......
...@@ -60,7 +60,7 @@ void InstanceNormOp<Context>::RunWithType() { ...@@ -60,7 +60,7 @@ void InstanceNormOp<Context>::RunWithType() {
} }
template <class Context> template <class Context>
void InstanceNormOp<Context>::RunOnDevice(){ void InstanceNormOp<Context>::RunOnDevice() {
num = input(0).dim(0); channels = input(0).dim(1); num = input(0).dim(0); channels = input(0).dim(1);
spatial_dim = input(0).count(2); nbychans = num * channels; spatial_dim = input(0).count(2); nbychans = num * channels;
vector<TIndex> dims({ num, channels }); vector<TIndex> dims({ num, channels });
...@@ -69,7 +69,7 @@ void InstanceNormOp<Context>::RunOnDevice(){ ...@@ -69,7 +69,7 @@ void InstanceNormOp<Context>::RunOnDevice(){
output(0)->ReshapeLike(input(0)); output(0)->ReshapeLike(input(0));
// if true, Act/Exp/Pow/Norm Ops can not exist before when train // if true, Act/Exp/Pow/Norm Ops can not exist before when train
if (inplace) output(0)->Share(input(0)); if (inplace) output(0)->Share(input(0));
if (input(0).template IsType<float>()) RunWithType<float>(); if (input(0).template IsType<float>()) RunWithType<float>();
...@@ -105,7 +105,7 @@ void InstanceNormGradientOp<Context>::RunWithType() { ...@@ -105,7 +105,7 @@ void InstanceNormGradientOp<Context>::RunWithType() {
auto* Ydata = input(-2).template data<T, Context>(); auto* Ydata = input(-2).template data<T, Context>();
math::Mul<T, Context>(output(0)->count(), Ydata, dYdata, dXdata); math::Mul<T, Context>(output(0)->count(), Ydata, dYdata, dXdata);
// sum(dE/dY \cdot Y) // sum(dE/dY \cdot Y)
math::Gemv<T, Context>(CblasNoTrans, nbychans, spatial_dim, math::Gemv<T, Context>(CblasNoTrans, nbychans, spatial_dim,
1.0, 1.0,
dXdata, SMul_data, dXdata, SMul_data,
...@@ -116,10 +116,10 @@ void InstanceNormGradientOp<Context>::RunWithType() { ...@@ -116,10 +116,10 @@ void InstanceNormGradientOp<Context>::RunWithType() {
0.0, 0.0,
dXdata); dXdata);
// sum(dE/dY \cdot Y) \cdot Y // sum(dE/dY \cdot Y) \cdot Y
math::Mul<T, Context>(output(0)->count(), Ydata, dXdata, dXdata); math::Mul<T, Context>(output(0)->count(), Ydata, dXdata, dXdata);
// sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y // sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y
math::Gemv<T, Context>(CblasNoTrans, nbychans, spatial_dim, math::Gemv<T, Context>(CblasNoTrans, nbychans, spatial_dim,
1.0, 1.0,
dYdata, SMul_data, dYdata, SMul_data,
...@@ -130,13 +130,13 @@ void InstanceNormGradientOp<Context>::RunWithType() { ...@@ -130,13 +130,13 @@ void InstanceNormGradientOp<Context>::RunWithType() {
1.0, 1.0,
dXdata); dXdata);
// dE/dY - mean(dE/dY)- mean(dE/dY \cdot Y) \cdot Y // dE/dY - mean(dE/dY)- mean(dE/dY \cdot Y) \cdot Y
// = dE/dY - mean(sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y) // = dE/dY - mean(sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y)
math::Axpby<T, Context>(output(0)->count(), 1.0, dYdata, math::Axpby<T, Context>(output(0)->count(), 1.0, dYdata,
-1.0 / spatial_dim, -1.0 / spatial_dim,
dXdata); dXdata);
// divide by var // divide by var
math::Div<T, Context>(output(0)->count(), dXdata, Std_data, dXdata); math::Div<T, Context>(output(0)->count(), dXdata, Std_data, dXdata);
// release buffer // release buffer
......
...@@ -41,9 +41,9 @@ void L2NormOp<Context>::RunWithType() { ...@@ -41,9 +41,9 @@ void L2NormOp<Context>::RunWithType() {
Bdata, DMuldata, Bdata, DMuldata,
1.0, 1.0,
Ndata); Ndata);
// compute T2 = \sqrt{T1} // compute T2 = \sqrt{T1}
math::Sqrt<T, Context>(inner_dim, Ndata, Ndata); math::Sqrt<T, Context>(inner_dim, Ndata, Ndata);
// compute T3 = x / [(T2)]_{dim} // compute T3 = x / [(T2)]_{dim}
math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans, dim, inner_dim, 1, math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans, dim, inner_dim, 1,
1.0, 1.0,
DMuldata, Ndata, DMuldata, Ndata,
...@@ -109,7 +109,7 @@ void L2NormGradientOp<Context>::RunWithType() { ...@@ -109,7 +109,7 @@ void L2NormGradientOp<Context>::RunWithType() {
auto* Bdata = buffer->template mutable_data<T, Context>(); auto* Bdata = buffer->template mutable_data<T, Context>();
auto* BInnerdata = buffer_inner->template mutable_data<T, Context>(); auto* BInnerdata = buffer_inner->template mutable_data<T, Context>();
for (int n = 0; n < outer_dim; n++){ for (int n = 0; n < outer_dim; n++) {
if (across_inner) { if (across_inner) {
Ndata = norm->template data<T, CPUContext>(); Ndata = norm->template data<T, CPUContext>();
T sum_of_x_mul_dy = math::Dot<T, Context>(buffer->count(), Xdata, dYdata); T sum_of_x_mul_dy = math::Dot<T, Context>(buffer->count(), Xdata, dYdata);
...@@ -117,21 +117,21 @@ void L2NormGradientOp<Context>::RunWithType() { ...@@ -117,21 +117,21 @@ void L2NormGradientOp<Context>::RunWithType() {
math::Sub<T, Context>(buffer->count(), dYdata, dXdata, dXdata); math::Sub<T, Context>(buffer->count(), dYdata, dXdata, dXdata);
math::Scal<T, Context>(buffer->count(), T(1.0 / Ndata[n]), dXdata); math::Scal<T, Context>(buffer->count(), T(1.0 / Ndata[n]), dXdata);
} else { } else {
// compute \sum_{i} x_{i, j}dy_{i, j} // compute \sum_{i} x_{i, j}dy_{i, j}
math::Mul<T, Context>(buffer->count(), Xdata, dYdata, Bdata); math::Mul<T, Context>(buffer->count(), Xdata, dYdata, Bdata);
math::Gemv<T, Context>(CblasTrans, dim, inner_dim, math::Gemv<T, Context>(CblasTrans, dim, inner_dim,
1.0, 1.0,
Bdata, DMuldata, Bdata, DMuldata,
0.0, 0.0,
BInnerdata); BInnerdata);
// compute T1 = x[(\sum_{i} x_{i, j}dy_{i, j})]_{dim} // compute T1 = x[(\sum_{i} x_{i, j}dy_{i, j})]_{dim}
math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans, dim, inner_dim, 1, math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans, dim, inner_dim, 1,
1.0, 1.0,
DMuldata, BInnerdata, DMuldata, BInnerdata,
0.0, 0.0,
Bdata); Bdata);
math::Mul<T, Context>(buffer->count(), Xdata, Bdata, dXdata); math::Mul<T, Context>(buffer->count(), Xdata, Bdata, dXdata);
// compute T2 = T1 / Normalizer^{2} // compute T2 = T1 / Normalizer^{2}
math::Pow<T, Context>(inner_dim, 2.0, Ndata, BInnerdata); math::Pow<T, Context>(inner_dim, 2.0, Ndata, BInnerdata);
math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans, dim, inner_dim, 1, math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans, dim, inner_dim, 1,
1.0, 1.0,
...@@ -139,7 +139,7 @@ void L2NormGradientOp<Context>::RunWithType() { ...@@ -139,7 +139,7 @@ void L2NormGradientOp<Context>::RunWithType() {
0.0, 0.0,
Bdata); Bdata);
math::Div<T, Context>(buffer->count(), dXdata, Bdata, dXdata); math::Div<T, Context>(buffer->count(), dXdata, Bdata, dXdata);
// compute T3 = (dy - T2) / Normalizer // compute T3 = (dy - T2) / Normalizer
math::Sub<T, Context>(buffer->count(), dYdata, dXdata, dXdata); math::Sub<T, Context>(buffer->count(), dYdata, dXdata, dXdata);
math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans, dim, inner_dim, 1, math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans, dim, inner_dim, 1,
1.0, 1.0,
......
...@@ -18,10 +18,10 @@ void LSTMUnitOp<Context>::RunWithType() { ...@@ -18,10 +18,10 @@ void LSTMUnitOp<Context>::RunWithType() {
template <class Context> template <class Context>
void LSTMUnitOp<Context>::RunOnDevice() { void LSTMUnitOp<Context>::RunOnDevice() {
// input(0): ----- c_t_1 // input(0): ----- c_t_1
// input(1): ----- gate_input // input(1): ----- gate_input
// output(0): ----- c_t // output(0): ----- c_t
// output(1): ----- h_t // output(1): ----- h_t
num = input(0).dim(0); num = input(0).dim(0);
channels = input(0).ndim() == 2 ? input(0).dim(1) : input(0).dim(2); channels = input(0).ndim() == 2 ? input(0).dim(1) : input(0).dim(2);
if (!has_cont.empty()) { if (!has_cont.empty()) {
...@@ -57,13 +57,13 @@ void LSTMUnitGradientOp<Context>::RunWithType() { ...@@ -57,13 +57,13 @@ void LSTMUnitGradientOp<Context>::RunWithType() {
template <class Context> template <class Context>
void LSTMUnitGradientOp<Context>::RunOnDevice() { void LSTMUnitGradientOp<Context>::RunOnDevice() {
// input(0): ----- c_t_1 // input(0): ----- c_t_1
// input(1): ----- x_act // input(1): ----- x_act
// input(2): ----- c_t // input(2): ----- c_t
// input(3): ----- d(c_t) // input(3): ----- d(c_t)
// input(4): ----- d(h_t) // input(4): ----- d(h_t)
// output(0): ----- d(c_t_1) // output(0): ----- d(c_t_1)
// output(1): ----- d(gate_input) // output(1): ----- d(gate_input)
num = input(0).dim(0); num = input(0).dim(0);
channels = input(0).ndim() == 2 ? input(0).dim(1) : input(0).dim(2); channels = input(0).ndim() == 2 ? input(0).dim(1) : input(0).dim(2);
output(0)->ReshapeLike(input(0)); output(0)->ReshapeLike(input(0));
...@@ -92,7 +92,7 @@ public: ...@@ -92,7 +92,7 @@ public:
vector<string> {I(0), I(1), O(0), GO(0), GO(1)}, vector<string> {I(0), I(1), O(0), GO(0), GO(1)},
vector<string> {GI(0), GI(1)}); vector<string> {GI(0), GI(1)});
} }
// fill zero for dc_{T+1} // fill zero for dc_{T+1}
vector<float> DefaultValues() override{ return{ 0.0, 1.0 }; } vector<float> DefaultValues() override{ return{ 0.0, 1.0 }; }
}; };
REGISTER_GRADIENT(LSTMUnit, GetLSTMUnitGradient); REGISTER_GRADIENT(LSTMUnit, GetLSTMUnitGradient);
......
...@@ -21,7 +21,7 @@ void AsyncUpdateOp<Context>::UpdateTimestamp(int tag) { ...@@ -21,7 +21,7 @@ void AsyncUpdateOp<Context>::UpdateTimestamp(int tag) {
} }
template <class Context> template <class Context>
int AsyncUpdateOp<Context>::GetDelay(int tag){ int AsyncUpdateOp<Context>::GetDelay(int tag) {
Tensor* t = ws()->GetTensor("_t_" + this->domain + "async_timestamp"); Tensor* t = ws()->GetTensor("_t_" + this->domain + "async_timestamp");
int* global_timestamp = t->template mutable_data<int, CPUContext>(); int* global_timestamp = t->template mutable_data<int, CPUContext>();
return global_timestamp[tag] - local_timestamp[tag] + 1; return global_timestamp[tag] - local_timestamp[tag] + 1;
...@@ -35,33 +35,33 @@ AsyncUpdateOp<Context>::AsyncUpdateOp(const OperatorDef& op_def, Workspace* ws) ...@@ -35,33 +35,33 @@ AsyncUpdateOp<Context>::AsyncUpdateOp(const OperatorDef& op_def, Workspace* ws)
mode(OperatorBase::GetSingleArg<string>("mode", "Async")), mode(OperatorBase::GetSingleArg<string>("mode", "Async")),
nsync(OperatorBase::GetSingleArg<int>("nsync", -1)) { nsync(OperatorBase::GetSingleArg<int>("nsync", -1)) {
// make key-val tags // make key-val tags
Tensor* t = this->ws()->CreateTensor("_t_" + this->domain + "async_tags"); Tensor* t = this->ws()->CreateTensor("_t_" + this->domain + "async_tags");
t->Reshape(vector<TIndex>(1, InputSize())); t->Reshape(vector<TIndex>(1, InputSize()));
tags = t->template mutable_data<string, CPUContext>(); tags = t->template mutable_data<string, CPUContext>();
for (int i = 0; i < OutputSize(); i++) tags[i] = output(i)->name(); for (int i = 0; i < OutputSize(); i++) tags[i] = output(i)->name();
// make recv logs // make recv logs
t = this->ws()->CreateTensor("_t_" + this->domain + "async_logs"); t = this->ws()->CreateTensor("_t_" + this->domain + "async_logs");
t->Reshape(vector<TIndex>(1, InputSize())); t->Reshape(vector<TIndex>(1, InputSize()));
// make recv buffers // make recv buffers
acc_buffers = new Tensor*[InputSize()]; // for soft-sync acc_buffers = new Tensor*[InputSize()]; // for soft-sync
recv_buffer.reset(new Tensor()); // for async recv_buffer.reset(new Tensor()); // for async
// setup for server // setup for server
if (this->comm_rank == this->comm_root) { if (this->comm_rank == this->comm_root) {
if (nsync == -1) nsync = this->comm_size; // fully async if (nsync == -1) nsync = this->comm_size; // fully async
max_recv = this->comm_size / nsync; max_recv = this->comm_size / nsync;
// make global timestamp // make global timestamp
t = this->ws()->CreateTensor("_t_" + this->domain + "async_timestamp"); t = this->ws()->CreateTensor("_t_" + this->domain + "async_timestamp");
t->Reshape(vector<TIndex>(1, InputSize())); t->Reshape(vector<TIndex>(1, InputSize()));
// make global buffers // make global buffers
for (int i = 0; i < OutputSize(); i++) for (int i = 0; i < OutputSize(); i++)
acc_buffers[i] = this->ws()->CreateTensor(tags[i] + "_grad_async_acc"); acc_buffers[i] = this->ws()->CreateTensor(tags[i] + "_grad_async_acc");
} }
// create independent stream for thread if using cuda-aware // create independent stream for thread if using cuda-aware
#ifdef WITH_CUDA_AWARE #ifdef WITH_CUDA_AWARE
cudaStreamCreate(&stream); cudaStreamCreate(&stream);
cublasCreate_v2(&handle); cublasCreate_v2(&handle);
...@@ -71,7 +71,7 @@ AsyncUpdateOp<Context>::AsyncUpdateOp(const OperatorDef& op_def, Workspace* ws) ...@@ -71,7 +71,7 @@ AsyncUpdateOp<Context>::AsyncUpdateOp(const OperatorDef& op_def, Workspace* ws)
template <class Context> template <typename T> template <class Context> template <typename T>
void AsyncUpdateOp<Context>::RootRunWithType() { void AsyncUpdateOp<Context>::RootRunWithType() {
for (int i = 0; i < InputSize(); i++){ for (int i = 0; i < InputSize(); i++) {
auto* dXdata = input(i).template mutable_data<T, Context>(); auto* dXdata = input(i).template mutable_data<T, Context>();
auto* Xdata = output(i)->template mutable_data<T, Context>(); auto* Xdata = output(i)->template mutable_data<T, Context>();
...@@ -88,7 +88,7 @@ void AsyncUpdateOp<Context>::RootRunWithType() { ...@@ -88,7 +88,7 @@ void AsyncUpdateOp<Context>::RootRunWithType() {
} }
template <class Context> template <class Context>
void AsyncUpdateOp<Context>::RunOnDevice(){ void AsyncUpdateOp<Context>::RunOnDevice() {
if (this->comm_rank != this->comm_root) return; if (this->comm_rank != this->comm_root) return;
if (input(0).template IsType<float>()) { if (input(0).template IsType<float>()) {
...@@ -102,11 +102,11 @@ void AsyncUpdateOp<Context>::RunOnDevice(){ ...@@ -102,11 +102,11 @@ void AsyncUpdateOp<Context>::RunOnDevice(){
template <class Context> template <typename T> template <class Context> template <typename T>
void AsyncUpdateOp<Context>::ThreadRunWithType() { void AsyncUpdateOp<Context>::ThreadRunWithType() {
while (1) { while (1) {
// pull from specfic client // pull from specfic client
MPI_Status status; MPI_Status status;
MPI_Probe(node_id, MPI_ANY_TAG, this->comm, &status); MPI_Probe(node_id, MPI_ANY_TAG, this->comm, &status);
Tensor* X = ws()->GetTensor(tags[status.MPI_TAG]); Tensor* X = ws()->GetTensor(tags[status.MPI_TAG]);
if (X->count() == 0) continue; // wait for server if (X->count() == 0) continue; // wait for server
recv_buffer->ReshapeLike(*X); recv_buffer->ReshapeLike(*X);
#ifdef WITH_CUDA_AWARE #ifdef WITH_CUDA_AWARE
auto* Bdata = recv_buffer->template mutable_data<T, Context>(); auto* Bdata = recv_buffer->template mutable_data<T, Context>();
...@@ -114,7 +114,7 @@ void AsyncUpdateOp<Context>::ThreadRunWithType() { ...@@ -114,7 +114,7 @@ void AsyncUpdateOp<Context>::ThreadRunWithType() {
auto* Bdata = recv_buffer->template mutable_data<T, CPUContext>(); auto* Bdata = recv_buffer->template mutable_data<T, CPUContext>();
#endif #endif
MPI_Recv(Bdata, X->count(), MPI_FLOAT, status.MPI_SOURCE, status.MPI_TAG, this->comm, MPI_STATUS_IGNORE); MPI_Recv(Bdata, X->count(), MPI_FLOAT, status.MPI_SOURCE, status.MPI_TAG, this->comm, MPI_STATUS_IGNORE);
// update // update
#ifdef WITH_CUDA_AWARE #ifdef WITH_CUDA_AWARE
auto* Xdata = X->template mutable_data<T, Context>(); auto* Xdata = X->template mutable_data<T, Context>();
if (mode != "Async_No_Lock") ws()->LockTensor(output(status.MPI_TAG)->name()); if (mode != "Async_No_Lock") ws()->LockTensor(output(status.MPI_TAG)->name());
...@@ -132,9 +132,9 @@ void AsyncUpdateOp<Context>::ThreadRunWithType() { ...@@ -132,9 +132,9 @@ void AsyncUpdateOp<Context>::ThreadRunWithType() {
math::Axpy<T, CPUContext>(X->count(), -1.0 / delay, Bdata, Xdata); math::Axpy<T, CPUContext>(X->count(), -1.0 / delay, Bdata, Xdata);
if (mode != "Async_No_Lock") ws()->UnlockTensor(output(status.MPI_TAG)->name()); if (mode != "Async_No_Lock") ws()->UnlockTensor(output(status.MPI_TAG)->name());
#endif #endif
// push back to this client // push back to this client
MPI_Send(Xdata, X->count(), MPI_FLOAT, status.MPI_SOURCE, status.MPI_TAG, this->comm); MPI_Send(Xdata, X->count(), MPI_FLOAT, status.MPI_SOURCE, status.MPI_TAG, this->comm);
// do statistics // do statistics
update_count++; update_count++;
if (update_count % (100 * InputSize()) == 0) if (update_count % (100 * InputSize()) == 0)
LOG(INFO) << "Server[" << node_id << "]: " LOG(INFO) << "Server[" << node_id << "]: "
......
...@@ -59,7 +59,7 @@ void UpdateOpBase<Context>::ReduceRunWithType() { ...@@ -59,7 +59,7 @@ void UpdateOpBase<Context>::ReduceRunWithType() {
int recv_from = (comm_rank - 1 + comm_size) % comm_size; int recv_from = (comm_rank - 1 + comm_size) % comm_size;
int send_to = (comm_rank + 1) % comm_size; int send_to = (comm_rank + 1) % comm_size;
// scatter-reduce // scatter-reduce
for (int i = 0; i < comm_size - 1; i++) { for (int i = 0; i < comm_size - 1; i++) {
int recv_chunk = (comm_rank - i - 1 + comm_size) % comm_size; int recv_chunk = (comm_rank - i - 1 + comm_size) % comm_size;
int send_chunk = (comm_rank - i + comm_size) % comm_size; int send_chunk = (comm_rank - i + comm_size) % comm_size;
...@@ -83,7 +83,7 @@ void UpdateOpBase<Context>::ReduceRunWithType() { ...@@ -83,7 +83,7 @@ void UpdateOpBase<Context>::ReduceRunWithType() {
} }
ws()->ReleaseBuffer(buffer); ws()->ReleaseBuffer(buffer);
// allgather // allgather
for (int i = 0; i < comm_size - 1; i++) { for (int i = 0; i < comm_size - 1; i++) {
int send_chunk = (comm_rank - i + 1 + comm_size) % comm_size; int send_chunk = (comm_rank - i + 1 + comm_size) % comm_size;
int recv_chunk = (comm_rank - i + comm_size) % comm_size; int recv_chunk = (comm_rank - i + comm_size) % comm_size;
...@@ -97,8 +97,8 @@ void UpdateOpBase<Context>::ReduceRunWithType() { ...@@ -97,8 +97,8 @@ void UpdateOpBase<Context>::ReduceRunWithType() {
0, comm, MPI_STATUS_IGNORE); 0, comm, MPI_STATUS_IGNORE);
} }
// ave-normalize // ave-normalize
if (comm_size > 1){ if (comm_size > 1) {
#ifdef WITH_CUDA_AWARE #ifdef WITH_CUDA_AWARE
math::Scal<T, Context>(count, T(1.0 / comm_size), dXdata); math::Scal<T, Context>(count, T(1.0 / comm_size), dXdata);
#else #else
...@@ -110,13 +110,13 @@ void UpdateOpBase<Context>::ReduceRunWithType() { ...@@ -110,13 +110,13 @@ void UpdateOpBase<Context>::ReduceRunWithType() {
template <class Context> template <typename T> template <class Context> template <typename T>
void UpdateOpBase<Context>::PreprocessRunWithType() { void UpdateOpBase<Context>::PreprocessRunWithType() {
// scale // scale
scale_factor = param("scale_gradient"); scale_factor = param("scale_gradient");
if (scale_factor != 1){ if (scale_factor != 1) {
auto* dXdata = input(0).template mutable_data<T, Context>(); auto* dXdata = input(0).template mutable_data<T, Context>();
math::Scal<T, Context>(input(0).count(), scale_factor, dXdata); math::Scal<T, Context>(input(0).count(), scale_factor, dXdata);
} }
// clip // clip
clip_thresh = param("clip_gradient"); clip_thresh = param("clip_gradient");
if (clip_thresh > 0) { if (clip_thresh > 0) {
auto* dXdata = input(0).template mutable_data<T, Context>(); auto* dXdata = input(0).template mutable_data<T, Context>();
...@@ -127,12 +127,12 @@ void UpdateOpBase<Context>::PreprocessRunWithType() { ...@@ -127,12 +127,12 @@ void UpdateOpBase<Context>::PreprocessRunWithType() {
math::Scal<T, Context>(input(0).count(), factor, dXdata); math::Scal<T, Context>(input(0).count(), factor, dXdata);
} }
} }
// decay // decay
l2_decay = param("l2_decay"); l2_decay = param("l2_decay") * decay_mult;
if (l2_decay > 0){ if (l2_decay > 0) {
auto* dXdata = input(0).template mutable_data<T, Context>(); auto* dXdata = input(0).template mutable_data<T, Context>();
auto* Xdata = output(0)->template data<T, Context>(); auto* Xdata = output(0)->template data<T, Context>();
math::Axpy<T, Context>(input(0).count(), l2_decay * decay_mult, Xdata, dXdata); math::Axpy<T, Context>(input(0).count(), l2_decay, Xdata, dXdata);
} }
} }
...@@ -141,9 +141,9 @@ void UpdateOpBase<Context>::UpdateRunWithType() { ...@@ -141,9 +141,9 @@ void UpdateOpBase<Context>::UpdateRunWithType() {
if (!allow_parallel || (allow_parallel && mode == "Sync")) { if (!allow_parallel || (allow_parallel && mode == "Sync")) {
auto* dXdata = input(0).template mutable_data<T, Context>(); auto* dXdata = input(0).template mutable_data<T, Context>();
auto* Xdata = output(0)->template mutable_data<T, Context>(); auto* Xdata = output(0)->template mutable_data<T, Context>();
// update // update
math::Axpy<T, Context>(output(0)->count(), -1.0, dXdata, Xdata); math::Axpy<T, Context>(output(0)->count(), -1.0, dXdata, Xdata);
// clear accumulated grads // clear accumulated grads
math::Set<T, Context>(input(0).count(), 0, dXdata); math::Set<T, Context>(input(0).count(), 0, dXdata);
} else { } else {
#ifdef WITH_MPI #ifdef WITH_MPI
......
...@@ -10,12 +10,12 @@ void FloatToHalfOp<Context>::RunOnDevice() { ...@@ -10,12 +10,12 @@ void FloatToHalfOp<Context>::RunOnDevice() {
<< "the type of tensor must be float32."; << "the type of tensor must be float32.";
output(0)->ReshapeLike(input(0)); output(0)->ReshapeLike(input(0));
// cast // cast
auto* Xdata = input(0).template data<float, Context>(); auto* Xdata = input(0).template data<float, Context>();
auto* Ydata = output(0)->template mutable_data<float16, Context>(); auto* Ydata = output(0)->template mutable_data<float16, Context>();
kernel::Float2Half<float, Context>(output(0)->count(), Xdata, Ydata); kernel::Float2Half<float, Context>(output(0)->count(), Xdata, Ydata);
// release & share // release & share
input(0).Reset(); input(0).Reset();
input(0).ReshapeLike(*output(0)); input(0).ReshapeLike(*output(0));
input(0).Share(*output(0)); input(0).Share(*output(0));
......
...@@ -12,7 +12,7 @@ void CompareOp<Context>::EqualRunWithType() { ...@@ -12,7 +12,7 @@ void CompareOp<Context>::EqualRunWithType() {
} }
template <class Context> template <class Context>
void CompareOp<Context>::RunOnDevice(){ void CompareOp<Context>::RunOnDevice() {
CHECK_EQ(input(0).count(), input(1).count()) CHECK_EQ(input(0).count(), input(1).count())
<< "both conditioned tensor must have same elements."; << "both conditioned tensor must have same elements.";
output(0)->ReshapeLike(input(0)); output(0)->ReshapeLike(input(0));
......
...@@ -6,7 +6,7 @@ namespace dragon { ...@@ -6,7 +6,7 @@ namespace dragon {
template <class Context> template <typename T> template <class Context> template <typename T>
void GradientGenerateOp<Context>::RunWithType() { void GradientGenerateOp<Context>::RunWithType() {
for (int i = 0; i < OutputSize(); i++){ for (int i = 0; i < OutputSize(); i++) {
if (output(i)->name() == "ignore") continue; if (output(i)->name() == "ignore") continue;
output(i)->ReshapeLike(input(i)); output(i)->ReshapeLike(input(i));
auto* dXdata = output(0)->template mutable_data<T, Context>(); auto* dXdata = output(0)->template mutable_data<T, Context>();
......
...@@ -26,7 +26,7 @@ void InitializeOp<Context>::RunOnDevice() { ...@@ -26,7 +26,7 @@ void InitializeOp<Context>::RunOnDevice() {
RunWithType<float>(); RunWithType<float>();
} }
// constant // constant
DEPLOY_CPU(Fill); DEPLOY_CPU(Fill);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(Fill); DEPLOY_CUDA(Fill);
...@@ -34,7 +34,7 @@ DEPLOY_CUDA(Fill); ...@@ -34,7 +34,7 @@ DEPLOY_CUDA(Fill);
OPERATOR_SCHEMA(Fill).NumInputs(0, 1).NumOutputs(1); OPERATOR_SCHEMA(Fill).NumInputs(0, 1).NumOutputs(1);
NO_GRADIENT(Fill); NO_GRADIENT(Fill);
// uniform // uniform
DEPLOY_CPU(RandomUniform); DEPLOY_CPU(RandomUniform);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(RandomUniform); DEPLOY_CUDA(RandomUniform);
...@@ -42,7 +42,7 @@ DEPLOY_CUDA(RandomUniform); ...@@ -42,7 +42,7 @@ DEPLOY_CUDA(RandomUniform);
OPERATOR_SCHEMA(RandomUniform).NumInputs(0, 1).NumOutputs(1); OPERATOR_SCHEMA(RandomUniform).NumInputs(0, 1).NumOutputs(1);
NO_GRADIENT(RandomUniform); NO_GRADIENT(RandomUniform);
// normal // normal
DEPLOY_CPU(RandomNormal); DEPLOY_CPU(RandomNormal);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(RandomNormal); DEPLOY_CUDA(RandomNormal);
...@@ -50,7 +50,7 @@ DEPLOY_CUDA(RandomNormal); ...@@ -50,7 +50,7 @@ DEPLOY_CUDA(RandomNormal);
OPERATOR_SCHEMA(RandomNormal).NumInputs(0, 1).NumOutputs(1); OPERATOR_SCHEMA(RandomNormal).NumInputs(0, 1).NumOutputs(1);
NO_GRADIENT(RandomNormal); NO_GRADIENT(RandomNormal);
// truncated normal // truncated normal
DEPLOY_CPU(TruncatedNormal); DEPLOY_CPU(TruncatedNormal);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CPU_CUDA(TruncatedNormal); DEPLOY_CPU_CUDA(TruncatedNormal);
...@@ -58,7 +58,7 @@ DEPLOY_CPU_CUDA(TruncatedNormal); ...@@ -58,7 +58,7 @@ DEPLOY_CPU_CUDA(TruncatedNormal);
OPERATOR_SCHEMA(TruncatedNormal).NumInputs(0, 1).NumOutputs(1); OPERATOR_SCHEMA(TruncatedNormal).NumInputs(0, 1).NumOutputs(1);
NO_GRADIENT(TruncatedNormal); NO_GRADIENT(TruncatedNormal);
// glorot uniform // glorot uniform
DEPLOY_CPU(GlorotUniform); DEPLOY_CPU(GlorotUniform);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(GlorotUniform); DEPLOY_CUDA(GlorotUniform);
...@@ -66,7 +66,7 @@ DEPLOY_CUDA(GlorotUniform); ...@@ -66,7 +66,7 @@ DEPLOY_CUDA(GlorotUniform);
OPERATOR_SCHEMA(GlorotUniform).NumInputs(0, 1).NumOutputs(1); OPERATOR_SCHEMA(GlorotUniform).NumInputs(0, 1).NumOutputs(1);
NO_GRADIENT(GlorotUniform); NO_GRADIENT(GlorotUniform);
// glorot normal // glorot normal
DEPLOY_CPU(GlorotNormal); DEPLOY_CPU(GlorotNormal);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(GlorotNormal); DEPLOY_CUDA(GlorotNormal);
......
...@@ -229,16 +229,16 @@ __global__ static void nms_mask(const Dtype boxes[], ...@@ -229,16 +229,16 @@ __global__ static void nms_mask(const Dtype boxes[],
} }
} }
// mask: "num_boxes x num_blocks" array // mask: "num_boxes x num_blocks" array
// for mask[j][bi], "di-th bit = 1" means: // for mask[j][bi], "di-th bit = 1" means:
// box j is significantly overlapped with box i = i_start + di, // box j is significantly overlapped with box i = i_start + di,
// where i_start = bi * block_size // where i_start = bi * block_size
{ {
const int num_blocks = DIV_THEN_CEIL(num_boxes, nms_block_size); const int num_blocks = DIV_THEN_CEIL(num_boxes, nms_block_size);
const int bi = blockIdx.x; const int bi = blockIdx.x;
mask[(j_start + dj) * num_blocks + bi] = mask_j; mask[(j_start + dj) * num_blocks + bi] = mask_j;
} }
} // endif dj < dj_end } // endif dj < dj_end
} }
} }
......
...@@ -4,10 +4,10 @@ namespace dragon { ...@@ -4,10 +4,10 @@ namespace dragon {
template <class Context> template <class Context>
void ShapeOp<Context>::RunOnDevice() { void ShapeOp<Context>::RunOnDevice() {
// reshape // reshape
output(0)->Reshape(vector<TIndex>(1, input(0).ndim())); output(0)->Reshape(vector<TIndex>(1, input(0).ndim()));
// forward // forward
auto* Ydata = output(0)->template mutable_data<float, CPUContext>(); auto* Ydata = output(0)->template mutable_data<float, CPUContext>();
for (int i = 0; i < input(0).ndim(); i++) Ydata[i] = input(0).dim(i); for (int i = 0; i < input(0).ndim(); i++) Ydata[i] = input(0).dim(i);
} }
......
...@@ -7,7 +7,7 @@ namespace dragon { ...@@ -7,7 +7,7 @@ namespace dragon {
template <class Context> template <class Context>
void ConvOp<Context>::ComputeOutputShape() { void ConvOp<Context>::ComputeOutputShape() {
this->output_shape.clear(); this->output_shape.clear();
for (int i = 0; i < this->num_spatial_axes; i++){ for (int i = 0; i < this->num_spatial_axes; i++) {
const int input_dim = this->bottom_shape[this->channel_axis + i + 1]; const int input_dim = this->bottom_shape[this->channel_axis + i + 1];
const int dilated_kernel = this->dilation[i] * (this->kernel_size[i] - 1) + 1; const int dilated_kernel = this->dilation[i] * (this->kernel_size[i] - 1) + 1;
const int output_dim = (input_dim + 2 * this->pad[i] - dilated_kernel) / this->stride[i] + 1; const int output_dim = (input_dim + 2 * this->pad[i] - dilated_kernel) / this->stride[i] + 1;
...@@ -43,7 +43,7 @@ void ConvOp<Context>::RunWithType() { ...@@ -43,7 +43,7 @@ void ConvOp<Context>::RunWithType() {
} }
template <class Context> template <class Context>
void ConvOp<Context>::RunOnDevice(){ void ConvOp<Context>::RunOnDevice() {
Reshape(); Reshape();
if (input(0).template IsType<float>()) RunWithType<float>(); if (input(0).template IsType<float>()) RunWithType<float>();
...@@ -71,7 +71,7 @@ void ConvGradientOp<Context>::RunWithType() { ...@@ -71,7 +71,7 @@ void ConvGradientOp<Context>::RunWithType() {
Db(dYdata + n * this->y_offset, dBdata); Db(dYdata + n * this->y_offset, dBdata);
} }
for (int n = 0; n < input(2).dim(0); n++){ for (int n = 0; n < input(2).dim(0); n++) {
if (output(1)->name() != "ignore") { if (output(1)->name() != "ignore") {
auto* Xdata = input(0).template data<T, Context>(); auto* Xdata = input(0).template data<T, Context>();
auto* dWdata = output(1)->template mutable_data<T, Context>(); auto* dWdata = output(1)->template mutable_data<T, Context>();
......
...@@ -125,7 +125,7 @@ void ConvOpBase<Context>::Reshape() { ...@@ -125,7 +125,7 @@ void ConvOpBase<Context>::Reshape() {
col_offset = kernel_dim * conv_out_spatial_dim; col_offset = kernel_dim * conv_out_spatial_dim;
output_offset = conv_out_channels * conv_out_spatial_dim / group; output_offset = conv_out_channels * conv_out_spatial_dim / group;
// compute col buffer shape // compute col buffer shape
col_buffer_shape.clear(); col_buffer_shape.clear();
col_buffer_shape.push_back(kernel_dim * group); col_buffer_shape.push_back(kernel_dim * group);
for (int i = 0; i < num_spatial_axes; i++) { for (int i = 0; i < num_spatial_axes; i++) {
...@@ -159,9 +159,9 @@ void ConvOpBase<Context>::GradientReshape() { ...@@ -159,9 +159,9 @@ void ConvOpBase<Context>::GradientReshape() {
conv_out_spatial_dim = input(2).count(channel_axis + 1); conv_out_spatial_dim = input(2).count(channel_axis + 1);
} }
// compute input shape // compute input shape
input_shape.clear(); input_shape.clear();
for (int i = 0; i < num_spatial_axes; i++){ for (int i = 0; i < num_spatial_axes; i++) {
if (ReverseDimensions()) { if (ReverseDimensions()) {
input_shape.push_back(input(2).dim(channel_axis + i + 1)); input_shape.push_back(input(2).dim(channel_axis + i + 1));
} else { } else {
...@@ -169,7 +169,7 @@ void ConvOpBase<Context>::GradientReshape() { ...@@ -169,7 +169,7 @@ void ConvOpBase<Context>::GradientReshape() {
} }
} }
kernel_dim = input(1).count(1); // in * kh * kw kernel_dim = input(1).count(1); // in * kh * kw
out_spatial_dim = input(2).count(channel_axis + 1); out_spatial_dim = input(2).count(channel_axis + 1);
x_offset = input(0).count(channel_axis); x_offset = input(0).count(channel_axis);
...@@ -178,10 +178,10 @@ void ConvOpBase<Context>::GradientReshape() { ...@@ -178,10 +178,10 @@ void ConvOpBase<Context>::GradientReshape() {
col_offset = kernel_dim * conv_out_spatial_dim; col_offset = kernel_dim * conv_out_spatial_dim;
output_offset = conv_out_channels * conv_out_spatial_dim / group; output_offset = conv_out_channels * conv_out_spatial_dim / group;
// compute col buffer shape // compute col buffer shape
col_buffer_shape.clear(); col_buffer_shape.clear();
col_buffer_shape.push_back(kernel_dim * group); col_buffer_shape.push_back(kernel_dim * group);
for (int i = 0; i < num_spatial_axes; i++){ for (int i = 0; i < num_spatial_axes; i++) {
if (ReverseDimensions()) { if (ReverseDimensions()) {
col_buffer_shape.push_back(bottom_shape[channel_axis + i + 1]); col_buffer_shape.push_back(bottom_shape[channel_axis + i + 1]);
} else { } else {
......
...@@ -228,13 +228,13 @@ void CuDNNConvGradientOp<Context>::RunWithType() { ...@@ -228,13 +228,13 @@ void CuDNNConvGradientOp<Context>::RunWithType() {
const T* dYdata = input(2).template data<T, Context>(); const T* dYdata = input(2).template data<T, Context>();
for (int g = 0; g < this->group; g++) { for (int g = 0; g < this->group; g++) {
if (output(2)->name() != "ignore"){ if (output(2)->name() != "ignore") {
T* dBdata = output(2)->template mutable_data<T, Context>(); T* dBdata = output(2)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnConvolutionBackwardBias(handle[g], CUDNN_CHECK(cudnnConvolutionBackwardBias(handle[g],
CUDNNType<T>::one, input_desc, dYdata + this->y_offset * g, CUDNNType<T>::one, input_desc, dYdata + this->y_offset * g,
CUDNNType<T>::one, bias_desc, dBdata + bias_offset * g)); CUDNNType<T>::one, bias_desc, dBdata + bias_offset * g));
} }
if (output(1)->name() != "ignore"){ if (output(1)->name() != "ignore") {
auto* Xdata = input(0).template data<T, Context>(); auto* Xdata = input(0).template data<T, Context>();
auto* dWdata = output(1)->template mutable_data<T, Context>(); auto* dWdata = output(1)->template mutable_data<T, Context>();
auto* workspace = buffer2->mutable_data<char, Context>(); auto* workspace = buffer2->mutable_data<char, Context>();
...@@ -246,7 +246,7 @@ void CuDNNConvGradientOp<Context>::RunWithType() { ...@@ -246,7 +246,7 @@ void CuDNNConvGradientOp<Context>::RunWithType() {
workspace + g * workspace_bwd_filter_size, workspace_bwd_filter_size, workspace + g * workspace_bwd_filter_size, workspace_bwd_filter_size,
CUDNNType<T>::one, filter_desc, dWdata + this->weight_offset * g)); CUDNNType<T>::one, filter_desc, dWdata + this->weight_offset * g));
} }
if (output(0)->name() != "ignore"){ if (output(0)->name() != "ignore") {
auto* Wdata = input(1).template data<T, Context>(); auto* Wdata = input(1).template data<T, Context>();
auto* dXdata = output(0)->template mutable_data<T, Context>(); auto* dXdata = output(0)->template mutable_data<T, Context>();
auto* workspace = buffer1->mutable_data<char, Context>(); auto* workspace = buffer1->mutable_data<char, Context>();
......
...@@ -54,7 +54,7 @@ void CuDNNLRNGradientOp<Context>::RunWithType() { ...@@ -54,7 +54,7 @@ void CuDNNLRNGradientOp<Context>::RunWithType() {
} }
template <class Context> template <class Context>
void CuDNNLRNGradientOp<Context>::RunOnDevice(){ void CuDNNLRNGradientOp<Context>::RunOnDevice() {
output(0)->ReshapeLike(input(0)); output(0)->ReshapeLike(input(0));
if (this->mode == ACROSS_CHANNELS) { if (this->mode == ACROSS_CHANNELS) {
......
...@@ -43,7 +43,7 @@ void DeConvOp<Context>::RunWithType() { ...@@ -43,7 +43,7 @@ void DeConvOp<Context>::RunWithType() {
} }
template <class Context> template <class Context>
void DeConvOp<Context>::RunOnDevice(){ void DeConvOp<Context>::RunOnDevice() {
Reshape(); Reshape();
if (input(0).template IsType<float>()) RunWithType<float>(); if (input(0).template IsType<float>()) RunWithType<float>();
......
...@@ -98,7 +98,7 @@ void LRNOp<Context>::ProdRunWithType() { ...@@ -98,7 +98,7 @@ void LRNOp<Context>::ProdRunWithType() {
} }
template <class Context> template <class Context>
void LRNOp<Context>::RunOnDevice(){ void LRNOp<Context>::RunOnDevice() {
if (mode == ACROSS_CHANNELS) { if (mode == ACROSS_CHANNELS) {
if (input(0).template IsType<float>()) { if (input(0).template IsType<float>()) {
AcrossRunWithType<float>(); AcrossRunWithType<float>();
...@@ -223,7 +223,7 @@ void LRNGradientOp<Context>::SplitRunWithType() { ...@@ -223,7 +223,7 @@ void LRNGradientOp<Context>::SplitRunWithType() {
} }
template <class Context> template <class Context>
void LRNGradientOp<Context>::RunOnDevice(){ void LRNGradientOp<Context>::RunOnDevice() {
if (mode == ACROSS_CHANNELS) { if (mode == ACROSS_CHANNELS) {
if (input(0).template IsType<float>()) { if (input(0).template IsType<float>()) {
AcrossRunWithType<float>(); AcrossRunWithType<float>();
......
...@@ -16,7 +16,7 @@ void NNResizeOp<Context>::RunWithType() { ...@@ -16,7 +16,7 @@ void NNResizeOp<Context>::RunWithType() {
template <class Context> template <class Context>
void NNResizeOp<Context>::RunOnDevice() { void NNResizeOp<Context>::RunOnDevice() {
dims = input(0).dims(); dims = input(0).dims();
if (dsize.size() == 0){ if (dsize.size() == 0) {
CHECK(fy != -1.0 && fx != -1.0); CHECK(fy != -1.0 && fx != -1.0);
dims[2] = int(dims[2] * fy); dims[2] = int(dims[2] * fy);
dims[3] = int(dims[3] * fx); dims[3] = int(dims[3] * fx);
...@@ -47,7 +47,7 @@ void NNResizeGradientOp<Context>::RunWithType() { ...@@ -47,7 +47,7 @@ void NNResizeGradientOp<Context>::RunWithType() {
} }
template <class Context> template <class Context>
void NNResizeGradientOp<Context>::RunOnDevice(){ void NNResizeGradientOp<Context>::RunOnDevice() {
output(0)->ReshapeLike(input(0)); output(0)->ReshapeLike(input(0));
if (input(0).template IsType<float>()) return RunWithType<float>(); if (input(0).template IsType<float>()) return RunWithType<float>();
......
...@@ -37,7 +37,7 @@ void cudnnSetTensorDesc(cudnnTensorDescriptor_t* desc, const vector<TIndex>& dim ...@@ -37,7 +37,7 @@ void cudnnSetTensorDesc(cudnnTensorDescriptor_t* desc, const vector<TIndex>& dim
int* dimA = new int[ndim]; int* dimA = new int[ndim];
int* strideA = new int[ndim]; int* strideA = new int[ndim];
TIndex stride = 1; TIndex stride = 1;
for (int i = ndim - 1; i >= 0; i--){ for (int i = ndim - 1; i >= 0; i--) {
strideA[i] = stride; strideA[i] = stride;
dimA[i] = dims[i]; dimA[i] = dims[i];
stride *= dimA[i]; stride *= dimA[i];
...@@ -55,7 +55,7 @@ void cudnnSetTensorDesc(cudnnTensorDescriptor_t* desc, ...@@ -55,7 +55,7 @@ void cudnnSetTensorDesc(cudnnTensorDescriptor_t* desc,
int ndim = (int)dims.size(); int ndim = (int)dims.size();
int* dimA = new int[ndim]; int* dimA = new int[ndim];
int* strideA = new int[ndim]; int* strideA = new int[ndim];
for (int i = ndim - 1; i >= 0; i--){ for (int i = ndim - 1; i >= 0; i--) {
strideA[i] = strides[i]; strideA[i] = strides[i];
dimA[i] = dims[i]; dimA[i] = dims[i];
} }
...@@ -66,10 +66,10 @@ void cudnnSetTensorDesc(cudnnTensorDescriptor_t* desc, ...@@ -66,10 +66,10 @@ void cudnnSetTensorDesc(cudnnTensorDescriptor_t* desc,
template <typename T> template <typename T>
void cudnnSetTensorDesc(cudnnTensorDescriptor_t* desc, Tensor* tensor) { void cudnnSetTensorDesc(cudnnTensorDescriptor_t* desc, Tensor* tensor) {
// CUDNN only support ndim from 3 to 8 // cuDNN requires ndim from 3 to 8
// we fake a reshaped dims to pass check // we fake a reshaped dims to pass check
vector<TIndex> fake_dims(tensor->dims()); vector<TIndex> fake_dims(tensor->dims());
if (fake_dims.size() < 3 || fake_dims.size() > 8){ if (fake_dims.size() < 3 || fake_dims.size() > 8) {
fake_dims.assign({ 1, 1 }); fake_dims.assign({ 1, 1 });
fake_dims.push_back(tensor->count()); fake_dims.push_back(tensor->count());
} }
......
...@@ -142,7 +142,7 @@ template <> void Add<float, CPUContext>(const int n, ...@@ -142,7 +142,7 @@ template <> void Add<float, CPUContext>(const int n,
template <> void Sub<float, CPUContext>(const int n, template <> void Sub<float, CPUContext>(const int n,
const float* a, const float* a,
const float* b, const float* b,
float* y){ float* y) {
#ifdef WITH_SSE #ifdef WITH_SSE
sse::Sub<float>(n, a, b, y); sse::Sub<float>(n, a, b, y);
#else // naive implement #else // naive implement
......
...@@ -66,8 +66,8 @@ template <> void RandomUniform<uint32_t, CUDAContext>(const int n, ...@@ -66,8 +66,8 @@ template <> void RandomUniform<uint32_t, CUDAContext>(const int n,
const float low, const float low,
const float high, const float high,
uint32_t* x) { uint32_t* x) {
// note that we ignore the low / high // note that we ignore the low / high
// curand can only generate values over the full uint32 range // curand can only generate values over the full uint32 range
CURAND_CHECK(curandGenerate(curand_generator(), x, n)); CURAND_CHECK(curandGenerate(curand_generator(), x, n));
} }
...@@ -95,8 +95,8 @@ template <> void RandomNormal<float16, CUDAContext>(const int n, ...@@ -95,8 +95,8 @@ template <> void RandomNormal<float16, CUDAContext>(const int n,
template <> void RandomBernoulli<float, CUDAContext>(const int n, template <> void RandomBernoulli<float, CUDAContext>(const int n,
const float p, const float p,
unsigned int* x) { unsigned int* x) {
// curand cannot generate a Bernoulli distribution // curand cannot generate a Bernoulli distribution
// we recommend implementing it within a specific case, e.g. Dropout // we recommend implementing it within a specific case, e.g. Dropout
NOT_IMPLEMENTED; NOT_IMPLEMENTED;
} }
......
...@@ -98,7 +98,7 @@ template<> void Softmax<float, CPUContext>(const int count, ...@@ -98,7 +98,7 @@ template<> void Softmax<float, CPUContext>(const int count,
const int dim = count / outer_dim; const int dim = count / outer_dim;
for (int i = 0; i < outer_dim; ++i) { for (int i = 0; i < outer_dim; ++i) {
context->Copy<float, CPUContext, CPUContext>(inner_dim, scale, x + i*dim); context->Copy<float, CPUContext, CPUContext>(inner_dim, scale, x + i*dim);
for (int j = 0; j < classes; ++j){ for (int j = 0; j < classes; ++j) {
for (int k = 0; k < inner_dim; k++) for (int k = 0; k < inner_dim; k++)
scale[k] = std::max(scale[k], x[i * dim + j * inner_dim + k]); scale[k] = std::max(scale[k], x[i * dim + j * inner_dim + k]);
} }
...@@ -668,7 +668,7 @@ template <> void OneHot<float, CPUContext>(const int count, ...@@ -668,7 +668,7 @@ template <> void OneHot<float, CPUContext>(const int count,
template<> void AbsGrad<float, CPUContext>(const int count, const float* dy, float* dx) { template<> void AbsGrad<float, CPUContext>(const int count, const float* dy, float* dx) {
for (int i = 0; i < count; ++i) { for (int i = 0; i < count; ++i) {
const float val = dy[i]; const float val = dy[i];
// val > 0: 1 | val == 0: 0 | val < 0: -1 // val > 0: 1 | val == 0: 0 | val < 0: -1
dx[i] = (val > float(0)) - (val < float(0)); dx[i] = (val > float(0)) - (val < float(0));
} }
} }
...@@ -707,7 +707,7 @@ template<> void SmoothL1Grad<float, CPUContext>(const int count, ...@@ -707,7 +707,7 @@ template<> void SmoothL1Grad<float, CPUContext>(const int count,
const float val = dy[i]; const float val = dy[i];
const float abs_val = abs(val); const float abs_val = abs(val);
if (abs_val < 1.0 / sigma2) dx[i] = val * sigma2; if (abs_val < 1.0 / sigma2) dx[i] = val * sigma2;
// val > 0: 1 | val == 0: 0 | val < 0: -1 // val > 0: 1 | val == 0: 0 | val < 0: -1
else dx[i] = (val > float(0)) - (val < float(0)); else dx[i] = (val > float(0)) - (val < float(0));
} }
} }
...@@ -862,11 +862,11 @@ template <> void LSTMUnitGrad<float, CPUContext>(const int count, ...@@ -862,11 +862,11 @@ template <> void LSTMUnitGrad<float, CPUContext>(const int count,
p_df = dx + f_offset + ch; p_df = dx + f_offset + ch;
p_do = dx + o_offset + ch; p_do = dx + o_offset + ch;
p_dg = dx + g_offset + ch; p_dg = dx + g_offset + ch;
// BPTT computes dc_{t-1} at time step t // BPTT computes dc_{t-1} at time step t
// dc_{t-1} = dl / d(h_{t}) * d(h_{t}) / d(c_{t}) * d(c_{t}) / d(c_{t-1}) // dc_{t-1} = dl / d(h_{t}) * d(h_{t}) / d(c_{t}) * d(c_{t}) / d(c_{t-1})
// + d(c_{t+1}) / d(c_{t}) * d(c_{t}) / d(c_{t-1}) // + d(c_{t+1}) / d(c_{t}) * d(c_{t}) / d(c_{t-1})
// = (dl / d(h_{t}) * d(h_{t}) / d(c_{t}) + d(c_{t+1}) / d(c_{t})) // = (dl / d(h_{t}) * d(h_{t}) / d(c_{t}) + d(c_{t+1}) / d(c_{t}))
// * d(c_{t}) / d(c_{t-1}) // * d(c_{t}) / d(c_{t-1})
tanh_c_t = tanh(c[ch]); tanh_c_t = tanh(c[ch]);
dc_1_sum_term = dh[ch] * o * (1 - tanh_c_t * tanh_c_t) + dc[ch]; dc_1_sum_term = dh[ch] * o * (1 - tanh_c_t * tanh_c_t) + dc[ch];
dc_1[ch] = dc_1_sum_term * f; dc_1[ch] = dc_1_sum_term * f;
...@@ -1182,18 +1182,18 @@ template<> void MAXPooling<float, CPUContext>(const int count, ...@@ -1182,18 +1182,18 @@ template<> void MAXPooling<float, CPUContext>(const int count,
max_val = x[idx]; max_val = x[idx];
max_idx = idx; max_idx = idx;
} }
} // end w } // end w
} // end h } // end h
y[pool_idx] = max_val; y[pool_idx] = max_val;
mask[pool_idx] = max_idx; mask[pool_idx] = max_idx;
} // end pw } // end pw
} // end ph } // end ph
// offset a channel // offset a channel
x += x_offset; x += x_offset;
y += y_offset; y += y_offset;
mask += y_offset; mask += y_offset;
} // end c } // end c
} // end n } // end n
} }
template<> void AVEPooling<float, CPUContext>(const int count, template<> void AVEPooling<float, CPUContext>(const int count,
...@@ -1257,13 +1257,13 @@ template<> void MAXPoolingGrad<float, CPUContext>(const int count, ...@@ -1257,13 +1257,13 @@ template<> void MAXPoolingGrad<float, CPUContext>(const int count,
const int pool_idx = ph * pool_width + pw; const int pool_idx = ph * pool_width + pw;
const int idx = mask[pool_idx]; const int idx = mask[pool_idx];
dx[idx] += dy[pool_idx]; dx[idx] += dy[pool_idx];
} // end pw } // end pw
} // end ph } // end ph
dx += x_offset; dx += x_offset;
dy += y_offset; dy += y_offset;
mask += y_offset; mask += y_offset;
} // end c } // end c
} // end n } // end n
} }
template<> void AVEPoolingGrad<float, CPUContext>(const int count, template<> void AVEPoolingGrad<float, CPUContext>(const int count,
...@@ -1298,12 +1298,12 @@ template<> void AVEPoolingGrad<float, CPUContext>(const int count, ...@@ -1298,12 +1298,12 @@ template<> void AVEPoolingGrad<float, CPUContext>(const int count,
dx[idx] += (dy[pool_idx] / pool_size); dx[idx] += (dy[pool_idx] / pool_size);
} }
} }
} // end pw } // end pw
} // end ph } // end ph
dx += x_offset; dx += x_offset;
dy += y_offset; dy += y_offset;
} // end c } // end c
} // end n } // end n
} }
/******************** vision.roi_pooling ********************/ /******************** vision.roi_pooling ********************/
...@@ -1362,18 +1362,18 @@ template<> void ROIPooling<float, CPUContext>(const float spatial_scale, ...@@ -1362,18 +1362,18 @@ template<> void ROIPooling<float, CPUContext>(const float spatial_scale,
Ydata[pool_idx] = Idata[idx]; Ydata[pool_idx] = Idata[idx];
Mdata[pool_idx] = idx; Mdata[pool_idx] = idx;
} }
} //end w } // end w
} // end h } // end h
} // end pw } // end pw
} // end ph } // end ph
// offset image channels // offset image channels
Idata += x->offset(0, 1); Idata += x->offset(0, 1);
Ydata += y->offset(0, 1); Ydata += y->offset(0, 1);
Mdata += mask->offset(0, 1); Mdata += mask->offset(0, 1);
} // end c } // end c
// offset roi region // offset roi region
Rdata += roi->offset(1); Rdata += roi->offset(1);
} //end n } // end n
} }
template<> void ROIPoolingGrad<float, CPUContext>(const float spatial_scale, template<> void ROIPoolingGrad<float, CPUContext>(const float spatial_scale,
......
...@@ -130,7 +130,7 @@ __global__ void _ReluGrad(const int count, ...@@ -130,7 +130,7 @@ __global__ void _ReluGrad(const int count,
const T* y, const T* y,
const float slope, const float slope,
T* dx) { T* dx) {
CUDA_KERNEL_LOOP(i, count){ CUDA_KERNEL_LOOP(i, count) {
dx[i] = dy[i] * ((y[i] > 0) + slope * (y[i] <= 0)); dx[i] = dy[i] * ((y[i] > 0) + slope * (y[i] <= 0));
} }
} }
...@@ -912,7 +912,7 @@ __global__ void _Sum(const int count, ...@@ -912,7 +912,7 @@ __global__ void _Sum(const int count,
template<> void Sum<float, CUDAContext>( template<> void Sum<float, CUDAContext>(
const int count, const int axis_dim, const int count, const int axis_dim,
const int inner_dim, const float* x, float* y){ const int inner_dim, const float* x, float* y) {
_Sum<float> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count, _Sum<float> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
axis_dim, axis_dim,
inner_dim, inner_dim,
...@@ -954,7 +954,7 @@ template<> void SumGrad<float, CUDAContext>(const int count, ...@@ -954,7 +954,7 @@ template<> void SumGrad<float, CUDAContext>(const int count,
template <typename T> template <typename T>
__global__ void _Slice(const int count, const int outer_dim, const int inner_dim, __global__ void _Slice(const int count, const int outer_dim, const int inner_dim,
const int x_slice_dim, const int y_slice_dim, const int slice_offset, const T* x, T* y){ const int x_slice_dim, const int y_slice_dim, const int slice_offset, const T* x, T* y) {
CUDA_KERNEL_LOOP(idx, count) { CUDA_KERNEL_LOOP(idx, count) {
const int tmp = y_slice_dim * inner_dim; const int tmp = y_slice_dim * inner_dim;
const int outer_idx = idx / tmp; const int outer_idx = idx / tmp;
...@@ -1238,7 +1238,7 @@ template <typename T> ...@@ -1238,7 +1238,7 @@ template <typename T>
__global__ void _AbsGrad(const int count, const T* dy, T* dx) { __global__ void _AbsGrad(const int count, const T* dy, T* dx) {
CUDA_KERNEL_LOOP(idx, count) { CUDA_KERNEL_LOOP(idx, count) {
const T val = dy[idx]; const T val = dy[idx];
// val > 0: 1 | val == 0: 0 | val < 0: -1 // val > 0: 1 | val == 0: 0 | val < 0: -1
dx[idx] = (val > T(0)) - (val < T(0)); dx[idx] = (val > T(0)) - (val < T(0));
} }
} }
...@@ -1298,7 +1298,7 @@ __global__ void _SmoothL1Grad(const int count, const float sigma2, const T* dy, ...@@ -1298,7 +1298,7 @@ __global__ void _SmoothL1Grad(const int count, const float sigma2, const T* dy,
const T val = dy[idx]; const T val = dy[idx];
const T abs_val = abs(val); const T abs_val = abs(val);
if (abs_val < 1.0 / sigma2) dx[idx] = val * sigma2; if (abs_val < 1.0 / sigma2) dx[idx] = val * sigma2;
// val > 0: 1 | val == 0: 0 | val < 0: -1 // val > 0: 1 | val == 0: 0 | val < 0: -1
else dx[idx] = (val > T(0)) - (val < T(0)); else dx[idx] = (val > T(0)) - (val < T(0));
} }
} }
...@@ -1952,8 +1952,8 @@ __global__ void _Col2Im(const int count, ...@@ -1952,8 +1952,8 @@ __global__ void _Col2Im(const int count,
const int ex_kernel_w = (kernel_w - 1) * dilation_w + 1; const int ex_kernel_w = (kernel_w - 1) * dilation_w + 1;
const int w_start = (im_w < ex_kernel_w) ? 0 : (im_w - ex_kernel_w) / stride_w + 1; const int w_start = (im_w < ex_kernel_w) ? 0 : (im_w - ex_kernel_w) / stride_w + 1;
// redundant pixels will be ignored when conv // redundant pixels will be ignored when conv
// note to clip them by min(x,col_w) // note to clip them by min(x,col_w)
const int w_end = min(im_w / stride_w + 1, col_w); const int w_end = min(im_w / stride_w + 1, col_w);
const int h_start = (im_h < ex_kernel_h) ? 0 : (im_h - ex_kernel_h) / stride_h + 1; const int h_start = (im_h < ex_kernel_h) ? 0 : (im_h - ex_kernel_h) / stride_h + 1;
const int h_end = min(im_h / stride_h + 1, col_h); const int h_end = min(im_h / stride_h + 1, col_h);
...@@ -1962,8 +1962,8 @@ __global__ void _Col2Im(const int count, ...@@ -1962,8 +1962,8 @@ __global__ void _Col2Im(const int count,
for (int w = w_start; w < w_end; ++w) { for (int w = w_start; w < w_end; ++w) {
int kh_off = (im_h - h * stride_h); int kh_off = (im_h - h * stride_h);
int kw_off = (im_w - w * stride_w); int kw_off = (im_w - w * stride_w);
// only several im pixels are used in dilated-conv // only several im pixels are used in dilated-conv
// ignore the corresponding col pixels // ignore the corresponding col pixels
if (kh_off % dilation_h == 0 && kw_off % dilation_w == 0) { if (kh_off % dilation_h == 0 && kw_off % dilation_w == 0) {
kh_off /= dilation_h; kh_off /= dilation_h;
kw_off /= dilation_w; kw_off /= dilation_w;
...@@ -2222,11 +2222,11 @@ __global__ void _MAXPoolingGrad(const int count, ...@@ -2222,11 +2222,11 @@ __global__ void _MAXPoolingGrad(const int count,
const int c = (idx / width / height) % channels; const int c = (idx / width / height) % channels;
const int n = idx / width / height / channels; const int n = idx / width / height / channels;
// allow overlapping // allow overlapping
const int start_ph = (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1; const int start_ph = (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;
const int start_pw = (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1; const int start_pw = (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;
// allow clip // allow clip
const int end_ph = min((h + pad_h) / stride_h + 1, pool_height); const int end_ph = min((h + pad_h) / stride_h + 1, pool_height);
const int end_pw = min((w + pad_w) / stride_w + 1, pool_width); const int end_pw = min((w + pad_w) / stride_w + 1, pool_width);
...@@ -2436,7 +2436,7 @@ __global__ void _ROIPoolingGrad(const int count, ...@@ -2436,7 +2436,7 @@ __global__ void _ROIPoolingGrad(const int count,
const T* cur_roi = roi + n * 5; const T* cur_roi = roi + n * 5;
const int im_idx_spec = cur_roi[0]; const int im_idx_spec = cur_roi[0];
// ignore wrong im_batch_idx // ignore wrong im_batch_idx
if (im_idx != im_idx_spec) continue; if (im_idx != im_idx_spec) continue;
int x1 = round(cur_roi[1] * spatial_scale); int x1 = round(cur_roi[1] * spatial_scale);
...@@ -2474,9 +2474,9 @@ __global__ void _ROIPoolingGrad(const int count, ...@@ -2474,9 +2474,9 @@ __global__ void _ROIPoolingGrad(const int count,
if (mask_off[pool_idx] == (h * width + w)) { if (mask_off[pool_idx] == (h * width + w)) {
diff += dy_off[pool_idx]; diff += dy_off[pool_idx];
} }
} // end pw } // end pw
} // end ph } // end ph
} // end n } // end n
dx[idx] = diff; dx[idx] = diff;
} }
} }
...@@ -2647,7 +2647,7 @@ __global__ void _ROIAlignGrad(const int count, ...@@ -2647,7 +2647,7 @@ __global__ void _ROIAlignGrad(const int count,
const T* cur_roi = roi + n * 5; const T* cur_roi = roi + n * 5;
const int im_idx_spec = cur_roi[0]; const int im_idx_spec = cur_roi[0];
// ignore wrong im_batch_idx // ignore wrong im_batch_idx
if (im_idx != im_idx_spec) continue; if (im_idx != im_idx_spec) continue;
T x1 = cur_roi[1] * spatial_scale; T x1 = cur_roi[1] * spatial_scale;
...@@ -2693,9 +2693,9 @@ __global__ void _ROIAlignGrad(const int count, ...@@ -2693,9 +2693,9 @@ __global__ void _ROIAlignGrad(const int count,
else gradient_factor *= mw - w1; else gradient_factor *= mw - w1;
diff += dy_off[ph * pool_w + pw] * gradient_factor; diff += dy_off[ph * pool_w + pw] * gradient_factor;
} }
} // end pw } // end pw
} // end ph } // end ph
} // end n } // end n
dx[idx] = diff; dx[idx] = diff;
} }
} }
......
...@@ -9,20 +9,20 @@ namespace dragon { ...@@ -9,20 +9,20 @@ namespace dragon {
namespace sse { namespace sse {
template<> void Set(const int n, const float alpha, float* x){ template<> void Set(const int n, const float alpha, float* x) {
__m128 scalar = SSE_FP32_SCALAR(alpha); __m128 scalar = SSE_FP32_SCALAR(alpha);
SSE_LOOP1(i, n) SSE_FP32_STORE(x + i, scalar); SSE_LOOP1(i, n) SSE_FP32_STORE(x + i, scalar);
SSE_LOOP2(i, n) x[i] = alpha; SSE_LOOP2(i, n) x[i] = alpha;
} }
template<> void Set(const int n, const int alpha, int* x){ template<> void Set(const int n, const int alpha, int* x) {
__m128i scalar = SSE_INT32_SCALAR(alpha); __m128i scalar = SSE_INT32_SCALAR(alpha);
__m128i* x1 = reinterpret_cast<__m128i*>(x); __m128i* x1 = reinterpret_cast<__m128i*>(x);
SSE_LOOP1(i, n) SSE_INT128_STORE(x1++, scalar); SSE_LOOP1(i, n) SSE_INT128_STORE(x1++, scalar);
SSE_LOOP2(i, n) x[i] = alpha; SSE_LOOP2(i, n) x[i] = alpha;
} }
template<> void Add(const int n, const float* a, const float* b, float* y){ template<> void Add(const int n, const float* a, const float* b, float* y) {
__m128 x1, y1, z1; __m128 x1, y1, z1;
SSE_LOOP1(i, n) { SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(a + i); x1 = SSE_FP32_LOAD(a + i);
...@@ -33,7 +33,7 @@ namespace sse { ...@@ -33,7 +33,7 @@ namespace sse {
SSE_LOOP2(i, n) y[i] = a[i] + b[i]; SSE_LOOP2(i, n) y[i] = a[i] + b[i];
} }
template<> void Sub(const int n, const float* a, const float* b, float* y){ template<> void Sub(const int n, const float* a, const float* b, float* y) {
__m128 x1, y1, z1; __m128 x1, y1, z1;
SSE_LOOP1(i, n) { SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(a + i); x1 = SSE_FP32_LOAD(a + i);
...@@ -44,7 +44,7 @@ namespace sse { ...@@ -44,7 +44,7 @@ namespace sse {
SSE_LOOP2(i, n) y[i] = a[i] - b[i]; SSE_LOOP2(i, n) y[i] = a[i] - b[i];
} }
template<> void Mul(const int n, const float* a, const float* b, float* y){ template<> void Mul(const int n, const float* a, const float* b, float* y) {
__m128 x1, y1, z1; __m128 x1, y1, z1;
SSE_LOOP1(i, n) { SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(a + i); x1 = SSE_FP32_LOAD(a + i);
...@@ -55,7 +55,7 @@ namespace sse { ...@@ -55,7 +55,7 @@ namespace sse {
SSE_LOOP2(i, n) y[i] = a[i] * b[i]; SSE_LOOP2(i, n) y[i] = a[i] * b[i];
} }
template<> void Div(const int n, const float* a, const float* b, float* y){ template<> void Div(const int n, const float* a, const float* b, float* y) {
__m128 x1, y1, z1; __m128 x1, y1, z1;
SSE_LOOP1(i, n) { SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(a + i); x1 = SSE_FP32_LOAD(a + i);
...@@ -66,7 +66,7 @@ namespace sse { ...@@ -66,7 +66,7 @@ namespace sse {
SSE_LOOP2(i, n) y[i] = a[i] / b[i]; SSE_LOOP2(i, n) y[i] = a[i] / b[i];
} }
template<> void Scal(const int n, const float alpha, float* y){ template<> void Scal(const int n, const float alpha, float* y) {
__m128 y1, scalar = SSE_FP32_SCALAR(alpha); __m128 y1, scalar = SSE_FP32_SCALAR(alpha);
SSE_LOOP1(i, n) { SSE_LOOP1(i, n) {
y1 = SSE_FP32_LOAD(y + i); y1 = SSE_FP32_LOAD(y + i);
...@@ -76,7 +76,7 @@ namespace sse { ...@@ -76,7 +76,7 @@ namespace sse {
SSE_LOOP2(i, n) y[i] *= alpha; SSE_LOOP2(i, n) y[i] *= alpha;
} }
template<> void Scale(const int n, const float alpha, const float* x, float* y){ template<> void Scale(const int n, const float alpha, const float* x, float* y) {
__m128 x1, scalar = SSE_FP32_SCALAR(alpha); __m128 x1, scalar = SSE_FP32_SCALAR(alpha);
SSE_LOOP1(i, n) { SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(x + i); x1 = SSE_FP32_LOAD(x + i);
...@@ -87,7 +87,7 @@ namespace sse { ...@@ -87,7 +87,7 @@ namespace sse {
} }
template<> void Axpy(const int n, float alpha, const float* x, float *y){ template<> void Axpy(const int n, float alpha, const float* x, float *y) {
__m128 x1, y1, scalar = SSE_FP32_SCALAR(alpha); __m128 x1, y1, scalar = SSE_FP32_SCALAR(alpha);
SSE_LOOP1(i, n) { SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(x + i); x1 = SSE_FP32_LOAD(x + i);
...@@ -100,7 +100,7 @@ namespace sse { ...@@ -100,7 +100,7 @@ namespace sse {
} }
template<> void Axpby(const int n, float alpha, const float* x, template<> void Axpby(const int n, float alpha, const float* x,
const float beta, float *y){ const float beta, float *y) {
__m128 x1, y1, z1; __m128 x1, y1, z1;
__m128 scalar1 = SSE_FP32_SCALAR(alpha); __m128 scalar1 = SSE_FP32_SCALAR(alpha);
__m128 scalar2 = SSE_FP32_SCALAR(beta); __m128 scalar2 = SSE_FP32_SCALAR(beta);
...@@ -115,7 +115,7 @@ namespace sse { ...@@ -115,7 +115,7 @@ namespace sse {
SSE_LOOP2(i, n) y[i] = alpha * x[i] + beta* y[i]; SSE_LOOP2(i, n) y[i] = alpha * x[i] + beta* y[i];
} }
template<> float ASum(const int n, const float *x){ template<> float ASum(const int n, const float *x) {
__m128 x1, sum = SSE_FP32_ZERO; __m128 x1, sum = SSE_FP32_ZERO;
SSE_LOOP1(i, n) { SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(x + i); x1 = SSE_FP32_LOAD(x + i);
...@@ -128,7 +128,7 @@ namespace sse { ...@@ -128,7 +128,7 @@ namespace sse {
return ret; return ret;
} }
template<> void AddScalar(const int n, const float alpha, float* y){ template<> void AddScalar(const int n, const float alpha, float* y) {
__m128 y1, scalar = SSE_FP32_SCALAR(alpha); __m128 y1, scalar = SSE_FP32_SCALAR(alpha);
SSE_LOOP1(i, n) { SSE_LOOP1(i, n) {
y1 = SSE_FP32_LOAD(y + i); y1 = SSE_FP32_LOAD(y + i);
...@@ -138,7 +138,7 @@ namespace sse { ...@@ -138,7 +138,7 @@ namespace sse {
SSE_LOOP2(i, n) y[i] += alpha; SSE_LOOP2(i, n) y[i] += alpha;
} }
template<> void MulScalar(const int n, const float alpha, float* y){ template<> void MulScalar(const int n, const float alpha, float* y) {
__m128 y1, scalar = SSE_FP32_SCALAR(alpha); __m128 y1, scalar = SSE_FP32_SCALAR(alpha);
SSE_LOOP1(i, n) { SSE_LOOP1(i, n) {
y1 = SSE_FP32_LOAD(y + i); y1 = SSE_FP32_LOAD(y + i);
...@@ -148,7 +148,7 @@ namespace sse { ...@@ -148,7 +148,7 @@ namespace sse {
SSE_LOOP2(i, n) y[i] *= alpha; SSE_LOOP2(i, n) y[i] *= alpha;
} }
template <> float Dot(const int n, const float* a, const float* b){ template <> float Dot(const int n, const float* a, const float* b) {
__m128 x1, y1, sum = SSE_FP32_ZERO; __m128 x1, y1, sum = SSE_FP32_ZERO;
SSE_LOOP1(i, n) { SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(a + i); x1 = SSE_FP32_LOAD(a + i);
......
...@@ -11,6 +11,7 @@ import sys ...@@ -11,6 +11,7 @@ import sys
import time import time
import shutil import shutil
import tarfile import tarfile
import numpy as np
from six.moves import range as xrange from six.moves import range as xrange
import cv2 import cv2
...@@ -78,7 +79,7 @@ def extract_images(): ...@@ -78,7 +79,7 @@ def extract_images():
f.write(item) f.write(item)
def make_db(image_path, label_path, database_path): def make_db(image_path, label_path, database_path, pad=0):
if os.path.isfile(label_path) is False: if os.path.isfile(label_path) is False:
raise ValueError('input path is empty or wrong.') raise ValueError('input path is empty or wrong.')
if os.path.isdir(database_path) is True: if os.path.isdir(database_path) is True:
...@@ -111,6 +112,12 @@ def make_db(image_path, label_path, database_path): ...@@ -111,6 +112,12 @@ def make_db(image_path, label_path, database_path):
label = record[1] label = record[1]
img = cv2.imread(os.path.join(image_path ,path)) img = cv2.imread(os.path.join(image_path ,path))
if pad > 0:
pad_img = np.zeros((img.shape[0] + 2 * pad,
img.shape[1] + 2 * pad, 3), dtype=np.uint8)
pad_img[pad : pad + img.shape[0],
pad : pad + img.shape[1], :] = img
img = pad_img
result, imgencode = cv2.imencode('.jpg', img, encode_param) result, imgencode = cv2.imencode('.jpg', img, encode_param)
datum = caffe_pb2.Datum() datum = caffe_pb2.Datum()
......
Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!