Commit 5d518b6c by Ting PAN

io refactoring

1 parent 31e02b2b
Showing with 502 additions and 430 deletions
......@@ -24,7 +24,7 @@ set(3RDPARTY_DIR ${PROJECT_SOURCE_DIR}/../3rdparty)
set(PYTHON_DIR /usr/include/python2.7) # prefer
#set(PYTHON_DIR /usr/include/python3.x) # optional, set specific version
#set(ANACONDA_DIR /xxx/anaconda) # optional, set specific version below if using py3
set(NUMPY_DIR /xxx/numpy) # require root folder of numpy package
set(NUMPY_DIR /xxx/numpy) # require, root folder of numpy package
# set CUDA compiling architecture
set(CUDA_ARCH -gencode arch=compute_20,code=sm_20
......
......@@ -52,7 +52,7 @@ class CPUContext{
inline static void Delete(void* data) { free(data); }
template<typename T, class DstContext, class SrcContext>
inline static void Copy(int n, T* dst, const T* src){
inline static void Copy(int n, T* dst, const T* src) {
if (dst == src) return;
// only the basic types(e.g. int/float) can memcpy correctly
if (std::is_fundamental<T>::value)
......
......@@ -119,7 +119,7 @@ class CUDAContext {
inline static void Delete(void* data) { cudaFree(data); }
template<typename T, class DstContext, class SrcContext>
static void Copy(int n, T* dst, const T* src){
static void Copy(int n, T* dst, const T* src) {
if (dst == src) return;
Memcpy<SrcContext, DstContext>(n * sizeof(T), (void*)dst, (const void*)src);
}
......@@ -148,7 +148,7 @@ class CUDAContext {
}
#ifdef WITH_CUDNN
cudnnHandle_t cudnn_handle(){
cudnnHandle_t cudnn_handle() {
auto& handle = cuda_object_.cudnn_handle[gpu_id_];
if (handle) {
return handle;
......
......@@ -77,7 +77,7 @@ class Tensor {
inline TIndex offset(const vector<TIndex>& vec) {
CHECK_LE(vec.size(), ndim());
TIndex offset = 0;
for (int i = 0; i < ndim(); i++){
for (int i = 0; i < ndim(); i++) {
offset = offset * dim(i);
if (vec.size() > i) offset += vec[i];
}
......@@ -130,7 +130,7 @@ class Tensor {
}
template <class Context>
void* raw_mutable_data(const TypeMeta& meta){
void* raw_mutable_data(const TypeMeta& meta) {
void* data_ptr;
active_data_ptr<Context>(&data_ptr);
if (meta_ == meta && data_ptr) {
......
......@@ -75,20 +75,20 @@ class TypeMeta {
bool Match() const { return (id_ == Id<T>()); }
template <typename T>
static void Ctor(void* ptr, size_t n){
static void Ctor(void* ptr, size_t n) {
T* typed_ptr = static_cast<T*>(ptr);
for (unsigned int i = 0; i < n; i++) new(typed_ptr + i) T;
}
template <typename T>
static void Copy(const void* src, void* dst, size_t n){
static void Copy(const void* src, void* dst, size_t n) {
const T* typed_src = static_cast<const T*>(src);
T* typed_dst = static_cast<T*>(dst);
for (unsigned int i = 0; i < n; i++) typed_dst[i] = typed_src[i];
}
template <typename T>
static void Dtor(void* ptr, size_t n){
static void Dtor(void* ptr, size_t n) {
T* typed_ptr = static_cast<T*>(ptr);
for (unsigned int i = 0; i < n; i++) typed_ptr[i].~T();
}
......
......@@ -44,7 +44,7 @@ class Workspace{
return tensor_map_.count(query) > 0;
}
inline Tensor* CreateTensor(const string& name){
inline Tensor* CreateTensor(const string& name) {
string query = GetTensorName(name);
if (!HasTensor(query))
tensor_map_[query] = unique_ptr<Tensor>(new Tensor(query));
......@@ -143,7 +143,7 @@ class Workspace{
return graph_map_[graph_name]->Run(include, exclude);
}
inline vector<string> GetGraphs(){
inline vector<string> GetGraphs() {
vector<string> names;
for (auto& it : graph_map_) names.push_back(it.first);
return names;
......
......@@ -54,11 +54,11 @@ class ScanGradientOp final: public Operator<Context> {
step_tensor(OperatorBase::GetSingleArg<string>("step_tensor", "")),
forward_inputs(OperatorBase::GetRepeatedArg<string>("inputs_name")),
forward_outputs(OperatorBase::GetRepeatedArg<string>("outputs_name")) {
// handle GO(x)
// handle GO(x)
for (int i = 0; i < forward_outputs.size(); i++)
terms[forward_outputs[i] + "_grad"] = input(i + (int)OutputSize()).name();
// handle GI(x)
// handle GI(x)
for (int i = 0; i < forward_inputs.size(); i++)
terms[forward_inputs[i] + "_grad"] = output(i)->name();
}
......
......@@ -25,9 +25,9 @@ class AccuracyOp final: public Operator<Context> {
public:
AccuracyOp(const OperatorDef& op_def, Workspace* ws)
: Operator<Context>(op_def, ws),
top_k(OperatorBase::GetSingleArg<int>("top_k", 1)){
top_k(OperatorBase::GetSingleArg<int>("top_k", 1)) {
vector<int> args = OperatorBase::GetRepeatedArg<int>("ignore_labels");
if (args.size()){
if (args.size()) {
ignore_labels.Reshape(vector<TIndex>(1, args.size()));
int* ignore_data = ignore_labels.mutable_data<int, CPUContext>();
for (int i = 0; i < args.size(); i++) ignore_data[i] = args[i];
......
......@@ -39,7 +39,7 @@ class ROIAlignGradientOp : public Operator<Context> {
: Operator<Context>(op_def, ws),
pool_h(OperatorBase::GetSingleArg<int>("pool_h", 0)),
pool_w(OperatorBase::GetSingleArg<int>("pool_w", 0)),
spatial_scale(OperatorBase::GetSingleArg<float>("spatial_scale", 1.0)){
spatial_scale(OperatorBase::GetSingleArg<float>("spatial_scale", 1.0)) {
CHECK_GT(pool_h, 0) << "\npool_h must be > 0";
CHECK_GT(pool_w, 0) << "\npool_w must be > 0";
}
......
......@@ -54,7 +54,7 @@ inline void LoadCaffeModel(string file, string scope, Workspace* ws) {
ReadProtoFromBinaryFile(file.c_str(), &net_param);
LOG(INFO) << "Restore From Model @: " << file << "......";
LOG(INFO) << "Model Format: CaffeModel";
for (int i = 0; i < net_param.layer_size(); i++){
for (int i = 0; i < net_param.layer_size(); i++) {
const LayerParameter& layer = net_param.layer(i);
const string& layer_name = layer.name();
string prefix = scope + layer_name + "@param";
......
......@@ -54,7 +54,7 @@ class TruncatedNormalFiller final : public Filler < T, Context > {
public:
TruncatedNormalFiller(const TensorFiller& filler): Filler<T, Context>(filler) {}
void Fill(Tensor* tensor) override {
// the GPU implementation is difficult
// the GPU implementation is difficult
math::RandomTruncatedNormal<T, CPUContext>(tensor->count(),
filler().mean(),
filler().std(),
......
......@@ -22,7 +22,7 @@ inline std::vector<std::string> SplitString(const std::string& str,
std::vector<std::string> ret;
std::string temp(str);
size_t pos;
while (pos = temp.find(c), pos != std::string::npos){
while (pos = temp.find(c), pos != std::string::npos) {
ret.push_back(temp.substr(0, pos));
temp.erase(0, pos + 1);
}
......
......@@ -31,7 +31,7 @@ const TypeMeta& NumpyTypeToDragon(int numpy_type) {
{ NPY_FLOAT16, TypeMeta::Make<float16>() },
{ NPY_UINT8, TypeMeta::Make<uint8_t>() }};
static TypeMeta unknown_type; // id = 0
static TypeMeta unknown_type;
return dragon_type_map.count(numpy_type) ? dragon_type_map[numpy_type] : unknown_type;
}
......@@ -50,7 +50,7 @@ REGISTER_TENSOR_FETCHER(TypeMeta::Id<NumpyFetcher>(), NumpyFetcher);
REGISTER_TENSOR_FETCHER(TypeMeta::Id<StringFetcher>(), StringFetcher);
REGISTER_TENSOR_FEEDER(TypeMeta::Id<NumpyFeeder>(), NumpyFeeder);
extern "C"{
extern "C" {
PyObject* RegisteredOperatorsCC(PyObject* self, PyObject* args) {
set<string> all_keys;
......@@ -123,7 +123,7 @@ bool SwitchWorkspaceInternal(const string& name, const bool create_if_missing) {
} else if (create_if_missing) {
unique_ptr<Workspace> new_workspace(new Workspace());
g_workspace = new_workspace.get();
g_workspaces[name] = std::move(new_workspace); // ???
g_workspaces[name] = std::move(new_workspace);
g_current_workspace = name;
return true;
} else {
......
......@@ -33,7 +33,7 @@ inline PyObject* StdStringToPyBytes(const std::string& str) {
return PyBytes_FromStringAndSize(str.c_str(), str.size());
}
template <typename T>
inline void MakeStringInternal(std::stringstream& ss, const T& t){ ss << t; }
inline void MakeStringInternal(std::stringstream& ss, const T& t) { ss << t; }
template <typename T,typename ... Args>
inline void MakeStringInternal(std::stringstream& ss, const T& t, const Args& ... args) {
......@@ -124,7 +124,7 @@ class NumpyFeeder : public TensorFeederBase {
Tensor* tensor) override {
PyArrayObject* array = PyArray_GETCONTIGUOUS(original_array);
const TypeMeta& meta = NumpyTypeToDragon(PyArray_TYPE(array));
if (meta.id() == 0){
if (meta.id() == 0) {
PyErr_SetString(PyExc_TypeError, "numpy data type is not supported.");
return nullptr;
}
......
......@@ -61,13 +61,13 @@ inline PyObject* MPICreateGroupCC(PyObject* self, PyObject* args) {
for (int i = 0; i < world_size; i++) all_ranks.insert(i);
local_group = world_group;
// check include ranks
// check include ranks
int size = PyList_Size(incl);
if (size > 0){
if (size > 0) {
all_ranks.clear();
unique_ptr<int> incl_ranks(new int[size]);
int* ranks = incl_ranks.get();
for (int i = 0; i < size; i++){
for (int i = 0; i < size; i++) {
ranks[i] = _PyInt_AsInt(PyList_GetItem(incl, i));
all_ranks.insert(ranks[i]);
}
......@@ -75,13 +75,13 @@ inline PyObject* MPICreateGroupCC(PyObject* self, PyObject* args) {
CHECK(err_code == MPI_SUCCESS) << "failed to create mpi group.";
}
// check exclude ranks
// check exclude ranks
size = PyList_Size(excl);
if (size > 0) {
all_ranks.clear(); Set<int> tmp;
unique_ptr<int> excl_ranks(new int[size]);
int* ranks = excl_ranks.get();
for (int i = 0; i < size; i++){
for (int i = 0; i < size; i++) {
ranks[i] = _PyInt_AsInt(PyList_GetItem(excl, i));
tmp.insert(ranks[i]);
}
......@@ -97,7 +97,7 @@ inline PyObject* MPICreateGroupCC(PyObject* self, PyObject* args) {
if (local_comm != MPI_COMM_NULL) {
int world_rank, local_size;
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
if (world_rank == local_root){
if (world_rank == local_root) {
MPI_Comm_size(local_comm, &local_size);
std::stringstream ss;
ss << "Rank[" << world_rank << "]: "
......
# --------------------------------------------------------
# Dragon
# Copyright(c) 2017 SeetaTech
# Written by Ting Pan
# --------------------------------------------------------
from .data_batch import DataBatch
# --------------------------------------------------------
# Caffe for Dragon
# Dragon
# Copyright(c) 2017 SeetaTech
# Written by Ting Pan
# --------------------------------------------------------
......@@ -10,7 +10,7 @@ from six.moves import range as xrange
from dragon.config import logger
from .__init__ import GetProperty
from .utils import GetProperty
class BlobFetcher(Process):
def __init__(self, **kwargs):
......@@ -30,16 +30,16 @@ class BlobFetcher(Process):
atexit.register(cleanup)
def im_list_to_blob(self):
datum = self.Q_in.get() # (h, w, BGR)
im = datum[0]; h, w, c = im.shape
im_blob = np.zeros((self._batch_size, h, w, c), dtype=np.float32)
datum = self.Q_in.get()
im_blob = []
label_blob = np.zeros((self._batch_size, len(datum[1])), dtype=np.float32) \
if len(datum) > 1 else None
for i in xrange(0, self._batch_size):
im_blob[i, 0:h, 0:w, :] = datum[0]
im_blob.append(datum[0])
if label_blob is not None: label_blob[i, :] = datum[1]
if i != self._batch_size - 1: datum = self.Q_in.get()
channel_swap = (0, 3, 1, 2)
im_blob = np.array(im_blob, dtype=np.float32)
im_blob = im_blob.transpose(channel_swap)
return (im_blob, label_blob)
......
# --------------------------------------------------------
# Caffe for Dragon
# Dragon
# Copyright(c) 2017 SeetaTech
# Written by Ting Pan
# --------------------------------------------------------
import sys
import time
import pprint
from multiprocessing import Queue
if sys.version_info >= (3,0):
from queue import Queue as Queue2
else:
from Queue import Queue as Queue2
import threading
from six.moves import range as xrange
import dragon.core.mpi as mpi
......@@ -16,10 +22,11 @@ from .data_reader import DataReader
from .data_transformer import DataTransformer
from .blob_fetcher import BlobFetcher
from .__init__ import GetProperty
from .utils import GetProperty
class DataBatch(object):
class DataBatch(threading.Thread):
def __init__(self, **kwargs):
super(DataBatch, self).__init__()
"""DataBatch use Triple-Buffering to speed up"""
......@@ -35,10 +42,10 @@ class DataBatch(object):
kwargs['group_size'] = group_size
# configuration
self._prefetch = GetProperty(kwargs, 'prefetch', 40)
self._prefetch = GetProperty(kwargs, 'prefetch', 5)
self._num_readers = GetProperty(kwargs, 'num_readers', 1)
self._num_transformers = GetProperty(kwargs, 'num_transformers', -1)
self._num_fetchers = GetProperty(kwargs, 'num_fetchers', 3)
self._num_fetchers = GetProperty(kwargs, 'num_fetchers', 1)
# default policy
if self._num_transformers == -1:
......@@ -60,6 +67,7 @@ class DataBatch(object):
self.Q_level_1 = Queue(self._prefetch * self._num_readers * self._batch_size)
self.Q_level_2 = Queue(self._prefetch * self._num_readers * self._batch_size)
self.Q_level_3 = Queue(self._prefetch * self._num_readers)
self.Q_level_4 = Queue2(self._prefetch * self._num_readers)
# init readers
self._readers = []
......@@ -102,11 +110,16 @@ class DataBatch(object):
self._fetchers.append(fetcher)
time.sleep(0.1)
self.daemon = True
self.start()
#self.echo()
@property
def blobs(self):
return self.Q_level_3.get()
def run(self):
while True:
self.Q_level_4.put(self.Q_level_3.get())
def get(self):
return self.Q_level_4.get()
def echo(self):
logger.info('---------------------------------------------------------')
......
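A minimal consumption sketch for the threaded DataBatch above, not part of this commit (the kwargs and LMDB path are placeholders; get() blocks on Q_level_4 until a prepared batch is ready):

    batch = DataBatch(source='/data/train_lmdb', batch_size=64)   # hypothetical kwargs
    im_blob, label_blob = batch.get()   # (N, C, H, W) image array and the matching labels, taken from Q_level_4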
# --------------------------------------------------------
# Caffe for Dragon
# Dragon
# Copyright(c) 2017 SeetaTech
# Written by Ting Pan
# --------------------------------------------------------
......@@ -12,8 +12,7 @@ import dragon.config as config
from dragon.config import logger
from dragon.tools.db import LMDB
from .__init__ import GetProperty
from .utils import GetProperty
class DataReader(Process):
def __init__(self, **kwargs):
......
# --------------------------------------------------------
# Caffe for Dragon
# Dragon
# Copyright(c) 2017 SeetaTech
# Written by Ting Pan
# --------------------------------------------------------
......@@ -12,7 +12,7 @@ import dragon.config as config
from dragon.config import logger
import dragon.vm.caffe.proto.caffe_pb2 as pb
from .__init__ import GetProperty
from .utils import GetProperty
try:
import cv2
......@@ -130,6 +130,4 @@ class DataTransformer(Process):
npr.seed(self._random_seed)
while True:
serialized = self.Q_in.get()
self.Q_out.put(self.transform_image_label(serialized))
self.Q_out.put(self.transform_image_label(serialized))
\ No newline at end of file
# --------------------------------------------------------
# Caffe for Dragon
# Dragon
# Copyright(c) 2017 SeetaTech
# Written by Ting Pan
# --------------------------------------------------------
def GetProperty(kwargs, name, default):
return kwargs[name] \
if name in kwargs else default
\ No newline at end of file
if name in kwargs else default
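A minimal usage sketch of GetProperty, not part of this commit (the kwargs below are illustrative):

    kwargs = {'source': '/data/train_lmdb', 'batch_size': 64}
    prefetch = GetProperty(kwargs, 'prefetch', 5)       # key missing, falls back to the default: 5
    batch_size = GetProperty(kwargs, 'batch_size', 1)   # key present, returns 64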
# --------------------------------------------------------
# Caffe for Dragon
# Dragon
# Copyright(c) 2017 SeetaTech
# Written by Ting Pan
# --------------------------------------------------------
import dragon.vm.caffe as caffe
import dragon.core.workspace as ws
from .minibatch import DataBatch
from dragon.io.data_batch import DataBatch
class DataLayer(caffe.Layer):
def setup(self, bottom, top):
class MiniBatchOp(object):
def setup(self, inputs, outputs):
kwargs = eval(self.param_str)
self._data_batch = DataBatch(**kwargs)
def forward(self, bottom, top):
blobs = self._data_batch.blobs
def run(self, inputs, outputs):
blobs = self._data_batch.get()
for idx, blob in enumerate(blobs):
ws.FeedTensor(top[idx], blob)
\ No newline at end of file
ws.FeedTensor(outputs[idx], blob)
\ No newline at end of file
......@@ -10,7 +10,6 @@ from dragon.operators.utils import Run
def LMDBData(**kwargs):
"""
:param kwargs: a dict of imagenet data params
:param --> mean_value: a list of mean values for channels [B-G-R]
:param --> source: a str of the images root directory
:param --> imageset: a str of text file contains image name / label
......@@ -30,8 +29,8 @@ def LMDBData(**kwargs):
args = locals(); kwargs = args['kwargs']
del args['kwargs']; kwargs = dict(args, **kwargs)
kwargs['module'] = 'dragon.vm.caffe.io.data_layer'
kwargs['op'] = 'DataLayer'
kwargs['module'] = 'dragon.operators.custom.minibatch'
kwargs['op'] = 'MiniBatchOp'
return Run([], param_str=str(kwargs), nout=2, **kwargs)
......
# --------------------------------------------------------
# Dragon
# Copyright(c) 2017 SeetaTech
# Written by Ting Pan
# --------------------------------------------------------
""" Generate LMDB from images """
import os
import sys
import time
import shutil
import argparse
import cv2
try:
import numpy as np
except: pass
from dragon.tools.db import LMDB
from dragon.vm.caffe.proto import caffe_pb2
def resize_image(im, resize):
if im.shape[0] > im.shape[1]:
newsize = (resize, im.shape[0] * resize / im.shape[1])
else:
newsize = (im.shape[1] * resize / im.shape[0], resize)
im = cv2.resize(im, newsize)
return im
def make_db(args):
if not os.path.isfile(args.list):
raise ValueError('the path of the image list is invalid.')
if os.path.isdir(args.database):
raise ValueError('the database already exists or the path is invalid.')
print('start time: ', time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime()))
db = LMDB(max_commit=10000)
db.open(args.database, mode='w')
total_line = sum(1 for line in open(args.list))
count = 0
zfill_flag = '{0:0%d}' % (args.zfill)
encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), args.quality]
start_time = time.time()
with open(args.list, 'r') as input_file:
records = input_file.readlines()
if args.shuffle:
import random
random.shuffle(records)
for record in records:
count += 1
if count % 10000 == 0:
now_time = time.time()
print('{0} / {1} in {2:.2f} sec'.format(
count, total_line, now_time - start_time))
db.commit()
record = record.split()
path = record[0]
label = record[1]
img = cv2.imread(os.path.join(args.root, path))
if args.resize > 0:
img = resize_image(img, args.resize)
if args.pad > 0:
pad_img = np.zeros((img.shape[0] + 2 * args.pad,
img.shape[1] + 2 * args.pad, 3), dtype=img.dtype)
pad_img[args.pad : args.pad + img.shape[0],
args.pad : args.pad + img.shape[1], :] = img
img = pad_img
result, imgencode = cv2.imencode('.jpg', img, encode_param)
datum = caffe_pb2.Datum()
datum.height, datum.width, datum.channels = img.shape
datum.label = int(label)
datum.encoded = True
datum.data = imgencode.tostring()
db.put(zfill_flag.format(count - 1), datum.SerializeToString())
now_time = time.time()
print('{0} / {1} in {2:.2f} sec'.format(count, total_line, now_time - start_time))
db.put('size', str(count))
db.put('zfill', str(args.zfill))
db.commit()
db.close()
shutil.copy(args.list, args.database + '/image_list.txt')
end_time = time.time()
print('{0} images have been stored in the database.'.format(total_line))
print('This task finishes within {0:.2f} seconds.'.format(end_time - start_time))
print('The size of database is {0} MB.'.
format(float(os.path.getsize(args.database + '/data.mdb') / 1000 / 1000)))
def parse_args():
parser = argparse.ArgumentParser(description='Create LMDB from images for classification.')
parser.add_argument('--root', help='the root folder of raw images')
parser.add_argument('--list', help='the filepath of image list')
parser.add_argument('--database', help='the filepath of database')
parser.add_argument('--zfill', type=int, default=8, help='zfill for the key of database')
parser.add_argument('--resize', type=int, default=0, help='resize the shorter edge of image to the newsize')
parser.add_argument('--pad', type=int, default=0, help='zero-pad the image')
parser.add_argument('--quality', type=int, default=95, help='JPEG quality for encoding, 1-100')
parser.add_argument('--shuffle', type=bool, default=True, help='randomize the order of records in the list file')
if len(sys.argv) < 4:
parser.print_help()
sys.exit(1)
args = parser.parse_args()
return args
if __name__ == '__main__':
args = parse_args()
make_db(args)
\ No newline at end of file
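A hypothetical invocation of the script above (the script filename and paths are placeholders, not from this commit):

    python make_image_db.py --root /data/imagenet/train \
                            --list /data/imagenet/train_list.txt \
                            --database /data/imagenet/train_lmdb \
                            --resize 256 --zfill 8 --quality 95

Each line of the list file is expected to be '<relative_image_path> <integer_label>', matching the record.split() parsing in make_db().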
# --------------------------------------------------------
# Dragon
# Copyright(c) 2017 SeetaTech
# Written by Ting Pan
# --------------------------------------------------------
from google.protobuf.message import Message
from protos import dragon_pb2 as pb
import numpy as np
def MakeArgument(key, value):
argument = pb.Argument()
argument.name = key
if type(value) is float: argument.f = value
elif type(value) is int : argument.i = value
elif type(value) is np.int64: argument.i64 = int(value)
elif type(value) is str: argument.s = value
elif type(value) is unicode: argument.s = value
elif type(value) is bool: argument.b = value
elif isinstance(value, Message): argument.s = value.SerializeToString()
elif all(type(v) is float for v in value): argument.floats.extend(value)
elif all(type(v) is int for v in value): argument.ints.extend(value)
elif all(type(v) is str for v in value): argument.strings.extend(value)
elif all(type(v) is unicode or type(v) is str for v in value): argument.strings.extend(value)
elif all(isinstance(v,Message) for v in value):
argument.strings.extend([v.SerializeToString() for v in value])
else: raise ValueError('unknown argument type: key={} value={} value type={}' \
.format(key,value,type(value)))
return argument
def MakeOperatorDef(op_type, inputs, outputs, name='',
device_option=None, arg=None, engine=None, **kwargs):
operator = pb.OperatorDef()
operator.type = op_type
operator.name = name
operator.input.extend([str(tensor) for tensor in inputs])
operator.output.extend([str(tensor) for tensor in outputs])
if device_option is not None:
operator.device_option.CopyFrom(device_option)
if engine is not None:
operator.engine = engine
if 'random_seed' in kwargs:
operator.device_option.random_seed = kwargs['random_seed']
del kwargs['random_seed']
if arg is not None:
operator.arg.extend(arg)
for k,v in kwargs.items():
if v is None: continue
operator.arg.add().CopyFrom(MakeArgument(k,v))
return operator
def MakeDeviceOption(device_type, gpu_id, rng_seed = None):
""" return a DeviceOption """
option = pb.DeviceOption()
option.device_type = device_type
option.gpu_id = gpu_id
if rng_seed is not None: option.random_seed = rng_seed
return option
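A minimal sketch of how the helpers above compose, not part of this commit (the op type, argument names, and device encoding are illustrative assumptions):

    option = MakeDeviceOption(1, 0, rng_seed=3)    # assumed encoding: device_type=1 for CUDA, gpu_id=0
    op = MakeOperatorDef('Relu', ['x'], ['y'],
                         name='relu_1', device_option=option,
                         slope=0.2)                # extra kwargs become pb.Argument entries via MakeArgument
    print(op)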
# make the python stdout unbuffered
class Unbuffered(object):
def __init__(self, stream):
self.stream = stream
def write(self, data):
self.stream.write(data)
self.stream.flush()
def __getattr__(self, attr):
return getattr(self.stream, attr)
# flush stdout after every write so MPI (C++ & Python) output is not buffered
import sys
sys.stdout = Unbuffered(sys.stdout)
\ No newline at end of file
......@@ -221,7 +221,7 @@ message SolverParameter {
// RMSProp decay value
// MeanSquare(t) = rms_decay*MeanSquare(t-1) + (1-rms_decay)*SquareGradient(t)
optional float rms_decay = 38;
optional float rms_decay = 38 [default = 0.99];
// If true, print information about the state of the net that may help with
// debugging learning problems.
......@@ -676,7 +676,7 @@ message DataParameter {
optional bool force_encoded_color = 9 [default = false];
// Prefetch queue (Number of batches to prefetch to host memory, increase if
// data access bandwidth varies).
optional uint32 prefetch = 10 [default = 40];
optional uint32 prefetch = 10 [default = 5];
}
message DropoutParameter {
......
......@@ -8,9 +8,9 @@ CPUObject CPUContext::cpu_object_;
CUDAObject CUDAContext::cuda_object_;
#endif // WITH_CUDA
// cpu <- gpu
// cpu <- gpu
template<> void CPUContext::Memcpy<CPUContext, CUDAContext>(
size_t nbytes, void* dst, const void* src){
size_t nbytes, void* dst, const void* src) {
#ifdef WITH_CUDA
CUDAContext ctx(POINTER_DEVICE(src));
ctx.Memcpy<CPUContext, CUDAContext>(nbytes, dst, src);
......@@ -19,9 +19,9 @@ template<> void CPUContext::Memcpy<CPUContext, CUDAContext>(
#endif
}
// gpu <- cpu
// gpu <- cpu
template<> void CPUContext::Memcpy<CUDAContext, CPUContext>(
size_t nbytes, void* dst, const void* src){
size_t nbytes, void* dst, const void* src) {
#ifdef WITH_CUDA
CUDAContext ctx(POINTER_DEVICE(dst));
ctx.Memcpy<CUDAContext, CPUContext>(nbytes, dst, src);
......
......@@ -151,13 +151,13 @@ GraphDef Graph::Prune(const GraphDef& graph_def) {
OperatorDef op_def;
op_def.CopyFrom(graph_def.op(it));
// handle inputs
for (int i = 0; i < graph_def.op(it).input_size(); i++){
for (int i = 0; i < graph_def.op(it).input_size(); i++) {
string input = graph_def.op(it).input(i);
if (!colored_[input] || !outputs.count(input))
*op_def.mutable_input(i) = "ignore";
}
// handle outputs
for (int i = 0; i < graph_def.op(it).output_size(); i++){
for (int i = 0; i < graph_def.op(it).output_size(); i++) {
string output = graph_def.op(it).output(i);
if (!colored_[output]) *op_def.mutable_output(i) = "ignore";
else outputs.insert(op_def.output(i));
......@@ -212,7 +212,7 @@ GraphDef Graph::MakeUpdate(const GraphDef& graph_def) {
for (int i = 0; i < graph_def.u_target_size(); i++) {
UpdateTarget target = graph_def.u_target(i);
vector<string> missing_tensors;
// missing check
// missing check
for (auto& tensor : target.tensor()) {
if (!ws()->HasTensor(tensor)) {
LOG(INFO) << "missing Tensor: " << tensor;
......
......@@ -18,12 +18,12 @@ CheckTuple GraphGradientMaker::CheckMissingGrad(OperatorDef* forward_op) {
string g_output = output + "_grad";
if (terms_.count(g_output)) g_output = terms_[g_output];
// check if having external grad first
// check if having external grad first
if (external_grads_.count(g_output))
inputs_to_grads_[output] = g_output;
// consider generate virtual grad
else if (targets_set_.count(output) && g_output != "ignore"){
// consider generate virtual grad
else if (targets_set_.count(output) && g_output != "ignore") {
gen_grads.push_back({ output, idx });
inputs_to_grads_[output] = g_output;
}
......@@ -36,7 +36,7 @@ CheckTuple GraphGradientMaker::CheckMissingGrad(OperatorDef* forward_op) {
if (forward_op->output_size() == 1) return { true, gen_grads };
}
}
// check passes, even if some grads are missing
// check passes, even if some grads are missing
return { false, gen_grads };
}
......@@ -50,7 +50,7 @@ GraphDef GraphGradientMaker::Make() {
Set<string> all_split_grads;
// PLAY for the forward
for (auto& op : forward_def_.op()){
for (auto& op : forward_def_.op()) {
if (NoGradientRegistry()->Has(op.type())) continue;
for (auto& input : op.input()) inputs_count[input]++;
}
......@@ -73,17 +73,17 @@ GraphDef GraphGradientMaker::Make() {
Gradient grad = MakeGradientForOp(*op, g_outputs);
// replace terms
for (auto& g_op : grad.ops){
for (auto& g_op : grad.ops) {
g_op.set_name(GetOperatorName());
for (int i = 0; i < g_op.input_size(); i++){
for (int i = 0; i < g_op.input_size(); i++) {
string* input = g_op.mutable_input(i);
if (terms_.count(*input)) *input = terms_[*input];
}
for (int i = 0; i < g_op.output_size(); i++){
for (int i = 0; i < g_op.output_size(); i++) {
string* output = g_op.mutable_output(i);
if (terms_.count(*output)) *output = terms_[*output];
}
for (int i = 0; i < grad.g_inputs.size(); i++){
for (int i = 0; i < grad.g_inputs.size(); i++) {
if (terms_.count(grad.g_inputs[i]))
grad.g_inputs[i] = terms_[grad.g_inputs[i]];
}
......@@ -106,14 +106,14 @@ GraphDef GraphGradientMaker::Make() {
string split_name = *output + "_autosplit_" + str(grads_count[*output]++);
if (!is_skip) all_split_grads.insert(split_name);
// gather
if (grads_count[*output] == inputs_count[original_name]){
if (grads_count[*output] == inputs_count[original_name]) {
gather_op = new OperatorDef();
gather_op->set_name(GetOperatorName());
gather_op->set_type("GradientGather");
gather_op->add_output(*output);
if (g_op.has_device_option())
gather_op->mutable_device_option()->CopyFrom(g_op.device_option());
for (int j = 0; j < grads_count[*output]; j++){
for (int j = 0; j < grads_count[*output]; j++) {
string key = *output + "_autosplit_" + str(j);
if (all_split_grads.count(key)) gather_op->add_input(key);
}
......@@ -123,7 +123,7 @@ GraphDef GraphGradientMaker::Make() {
}
}
// append ops
// append ops
if (!is_skip) {
if (gen_grads.size() > 0) {
vector<string> op_inputs, op_outputs;
......@@ -148,7 +148,7 @@ GraphDef GraphGradientMaker::Make() {
}
if (gather_op != nullptr) new_def_.add_op()->CopyFrom(*gather_op);
// done
// done
if (!is_skip) {
for (int i = 0; i < op->input_size(); i++) {
if (!grad.g_inputs[i].empty())
......
......@@ -66,7 +66,7 @@ DEFINE_REGISTRY(GradientRegistry, GradientMakerBase, const OperatorDef&, const v
DEFINE_REGISTRY(NoGradientRegistry, GradientMakerBase, const OperatorDef&, const vector<string>&);
#define INSTANTIATE_GET_SINGLE_ARGUMENT(T, fieldname) \
template <> T OperatorBase::GetSingleArg(const string& name, const T& default_value){ \
template <> T OperatorBase::GetSingleArg(const string& name, const T& default_value) { \
if(args_.count(name) == 0) { \
return default_value; \
} \
......@@ -82,7 +82,7 @@ INSTANTIATE_GET_SINGLE_ARGUMENT(int64_t, i64);
#define INSTANTIATE_GET_REPEATED_ARGUMENT(T, fieldname) \
template<> vector<T> OperatorBase::GetRepeatedArg<T>(const string& name){ \
template<> vector<T> OperatorBase::GetRepeatedArg<T>(const string& name) { \
if(args_.count(name) == 0) return vector<T>(); \
vector<T> values; \
for(const auto& v : args_[name]->fieldname()) values.push_back(v); \
......
......@@ -17,7 +17,7 @@ bool OpSchema::Verify(const OperatorDef& def) const {
}
for (int in = 0; in < def.input_size(); in++) {
if (def.input(in) == "ignore") continue;
for (int out = 0; out < def.output_size(); out++){
for (int out = 0; out < def.output_size(); out++) {
if (def.output(out) == "ignore") continue;
if (def.input(in) == def.output(out) && (!CheckInplace(in, out)))
LOG(FATAL) << "[" << def.name() << "] input("
......
......@@ -24,7 +24,7 @@ void CuDNNReluOp<Context>::RunWithType() {
template <class Context>
void CuDNNReluOp<Context>::RunOnDevice() {
// cudnn does not support LeakyRelu
// cudnn does not support LeakyRelu
if (this->slope != 0) return ReluOp<Context>::RunOnDevice();
output(0)->ReshapeLike(input(0));
......@@ -58,7 +58,7 @@ void CuDNNReluGradientOp<Context>::RunWithType() {
template <class Context>
void CuDNNReluGradientOp<Context>::RunOnDevice() {
// cudnn does not support LeakyRelu
// cudnn does not support LeakyRelu
if (this->slope != 0) return ReluGradientOp<Context>::RunOnDevice();
output(0)->ReshapeLike(input(0));
......
......@@ -71,10 +71,9 @@ void DropoutGradientOp<Context>::RunOnDevice() {
template <class Context>
void DropoutGradientOp<Context>::ClearAfterRun() {
ws()->ReleaseBuffer(mask);
ws()->ReleaseBuffer(mask, true);
}
DEPLOY_CPU(DropoutGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(DropoutGradient);
......
......@@ -125,7 +125,7 @@ void DotGradientOp<Context>::GemvRunWithType() {
}
template <class Context>
void DotGradientOp<Context>::RunOnDevice(){
void DotGradientOp<Context>::RunOnDevice() {
output(0)->ReshapeLike(input(0));
output(1)->ReshapeLike(input(1));
......
......@@ -65,7 +65,7 @@ void EltwiseGradientOp<Context>::SumRunWithType() {
auto* dYdata = input(-1).template data<T, Context>();
TIndex count = input(-1).count();
for (int i = 0; i < OutputSize(); i++){
for (int i = 0; i < OutputSize(); i++) {
if (output(i)->name() == "ignore") continue;
auto* dXdata = output(i)->template mutable_data<T, Context>();
if (coeffs[i] == float(1)) {
......
......@@ -8,7 +8,7 @@ template <class Context> template <typename T>
void GramMatrixOp<Context>::RunWithType() {
auto* Xdata = input(0).template data<T, Context>();
auto* Ydata = output(0)->template mutable_data<T, Context>();
for (int i = 0; i < outer_dim; i++){
for (int i = 0; i < outer_dim; i++) {
math::Gemm<T, Context>(CblasNoTrans, CblasTrans,
dim, dim, inner_dim, 1.0, Xdata, Xdata, 0.0, Ydata);
Xdata += x_offset;
......@@ -40,7 +40,7 @@ void GramMatrixGradientOp<Context>::RunWithType() {
auto* dYdata = input(-1).template data<T, Context>();
auto* Xdata = input(0).template data<T, Context>();
auto* dXdata = output(0)->template mutable_data<T, Context>();
for (int i = 0; i < outer_dim; i++){
for (int i = 0; i < outer_dim; i++) {
math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans,
dim, inner_dim, dim, 2.0, dYdata, Xdata, 0.0, dXdata);
dYdata += y_offset;
......@@ -49,7 +49,7 @@ void GramMatrixGradientOp<Context>::RunWithType() {
}
template <class Context>
void GramMatrixGradientOp<Context>::RunOnDevice(){
void GramMatrixGradientOp<Context>::RunOnDevice() {
outer_dim = input(0).count(0, axis);
dim = input(0).dim(axis);
inner_dim = input(0).count(axis + 1);
......
......@@ -81,7 +81,7 @@ void MatmulGradientOp<Context>::RunWithType() {
}
template <class Context>
void MatmulGradientOp<Context>::RunOnDevice(){
void MatmulGradientOp<Context>::RunOnDevice() {
CHECK(input(0).ndim() == input(1).ndim())
<< "both matrices must have the same number of dimensions.";
CHECK_GE(input(0).ndim(), 2)
......
......@@ -9,7 +9,7 @@ void PowOp<Context>::RunWithType() {
TIndex count = input(0).count();
auto* Ydata = output(0)->template mutable_data<T, Context>();
if (power_scale == float(0)){
if (power_scale == float(0)) {
float value = (power == float(0)) ? float(1) : pow(shift, power);
math::Set<T, Context>(count, dragon_cast<T, float>(value), Ydata);
return;
......
......@@ -85,12 +85,12 @@ void ScaleGradientOp<Context>::ScaleRunWithType() {
T* SRes_data = nullptr;
if (inner_dim == 1) {
SRes_data = tmp_data;
} else if (sum_result.count() == 1) { // handle inner only
} else if (sum_result.count() == 1) { // handle inner only
dScale = output(1)->template mutable_data<T, CPUContext>();
T result = math::Dot<T, Context>(inner_dim, tmp_data, SMul_data);
*dScale += result;
} else {
SRes_data = (outer_dim == 1) ? // handle scale only
SRes_data = (outer_dim == 1) ? // handle scale only
dScale : sum_result.template mutable_data<T, Context>();
math::Gemv<T, Context>(CblasNoTrans, sum_result.count(), inner_dim,
1.0,
......@@ -99,7 +99,7 @@ void ScaleGradientOp<Context>::ScaleRunWithType() {
SRes_data);
}
if (outer_dim != 1) {
if (scale_dim == 1) { // handle outer only
if (scale_dim == 1) { // handle outer only
T result = math::Dot<T, Context>(outer_dim, SMul_data, SRes_data);
*dScale += result;
} else {
......
......@@ -12,7 +12,7 @@ void SquareOp<Context>::RunWithType() {
}
template <class Context>
void SquareOp<Context>::RunOnDevice(){
void SquareOp<Context>::RunOnDevice() {
output(0)->ReshapeLike(input(0));
if (input(0).template IsType<float>()) RunWithType<float>();
......
......@@ -44,7 +44,7 @@ void SubOp<Context>::BroadcastRunWithType(int type) {
}
template <class Context>
void SubOp<Context>::RunOnDevice(){
void SubOp<Context>::RunOnDevice() {
output(0)->ReshapeLike(input(0));
if (input(0).dims() == input(1).dims()) {
......
......@@ -6,7 +6,7 @@ namespace dragon {
template <class Context> template <typename T>
void ArgmaxOp<Context>::RunWithType() {
if (top_k != 1) {
// it's difficult to implement device code when top_k > 1
// it's difficult to implement device code when top_k > 1
auto* Xdata = input(0).template data<T, CPUContext>();
auto* Ydata = output(0)->template mutable_data<T, CPUContext>();
kernel::Argmax<T, CPUContext>(count, input(0).dim(axis), inner_dim,
......
......@@ -7,7 +7,7 @@ namespace dragon {
template <class Context> template <typename T>
void ConcatOp<Context>::RunWithType() {
auto* Ydata = output(0)->template mutable_data<T, Context>();
for (int i = 0; i < nin; i++){
for (int i = 0; i < nin; i++) {
auto* Xdata = input(i).template data<T, Context>();
TIndex count = input(i).count();
x_concat_dim = input(i).dim(axis);
......@@ -25,12 +25,12 @@ void ConcatOp<Context>::RunWithType() {
}
template <class Context>
void ConcatOp<Context>::RunOnDevice(){
void ConcatOp<Context>::RunOnDevice() {
concat_dims = input(0).dims();
for (int i = 1; i < nin; i++) {
CHECK_EQ(concat_dims.size(), input(i).ndim())
<< "\nall inputs must have the same ndim.";
for (int j = 0; j < concat_dims.size(); j++){
for (int j = 0; j < concat_dims.size(); j++) {
if (j == axis) continue;
CHECK_EQ(concat_dims[j], input(i).dim(j))
<< "\nall inputs must have the same dims"
......@@ -82,7 +82,7 @@ void ConcatGradientOp<Context>::RunWithType() {
}
template <class Context>
void ConcatGradientOp<Context>::RunOnDevice(){
void ConcatGradientOp<Context>::RunOnDevice() {
if (input(-1).name() == "ignore") return;
concat_dims = input(-1).dims();
y_concat_dim = concat_dims[axis];
......
......@@ -27,7 +27,7 @@ OPERATOR_SCHEMA(Flatten).NumInputs(1).NumOutputs(1);
template <class Context>
void FlattenGradientOp<Context>::RunOnDevice(){
void FlattenGradientOp<Context>::RunOnDevice() {
output(0)->ReshapeLike(input(0));
output(0)->Share(input(1));
}
......
......@@ -8,7 +8,7 @@ RunOp<Context>::RunOp(const OperatorDef& op_def, Workspace* ws)
module(OperatorBase::GetSingleArg<string>("module", "")),
op(OperatorBase::GetSingleArg<string>("op", "")),
param_str((OperatorBase::GetSingleArg<string>("param_str", ""))) {
// init interpreter & load module
// init interpreter & load module
Py_Initialize();
PyObject* py_module = PyImport_ImportModule(module.c_str());
CHECK(py_module) << "\ncan not import py module: " << module;
......@@ -18,11 +18,11 @@ RunOp<Context>::RunOp(const OperatorDef& op_def, Workspace* ws)
<< " from module: " << module;
self = PyObject_CallObject(py_op, NULL);
// pass param string
// pass param string
PyObject_SetAttr(self, String("param_str"), String(param_str.c_str()));
PyObject_SetAttr(self, String("param_str_"), String(param_str.c_str()));
// build inputs and outputs for Python
// build inputs and outputs for Python
inputs = PyList_New(InputSize());
for (int i = 0; i < InputSize(); i++)
PyList_SetItem(inputs, i, String(input(i).name().c_str()));
......@@ -31,21 +31,21 @@ RunOp<Context>::RunOp(const OperatorDef& op_def, Workspace* ws)
PyList_SetItem(outputs, i, String(output(i)->name().c_str()));
if (!this->allow_run()) return;
// setup
// setup
if (PyObject_HasAttr(self, String("setup")))
PyObject_CallMethod(self, "setup", "OO", inputs, outputs);
}
template <class Context>
void RunOp<Context>::RunOnDevice() {
// init phase
// init phase
PyObject_SetAttr(self, String("phase"), String(this->phase().c_str()));
// reshape
// reshape
if (PyObject_HasAttr(self, String("reshape")))
PyObject_CallMethod(self, "reshape", "OO", inputs, outputs);
// run
// run
if (PyObject_HasAttr(self, String("forward"))) {
PyObject_CallMethod(self, "forward", "OO", inputs, outputs);
} else if (PyObject_HasAttr(self, String("run"))) {
......@@ -63,14 +63,14 @@ NO_GRADIENT(Run);
template <class Context>
void TemplateGradientOp<Context>::RunOnDevice() {
// init phase
// init phase
PyObject_SetAttr(this->self, String("phase"), String(this->phase().c_str()));
// reshape
// reshape
if (PyObject_HasAttr(this->self, String("reshape")))
PyObject_CallMethod(this->self, "reshape", "OO", this->inputs, this->outputs);
// run
// run
if (PyObject_HasAttr(this->self, String("backward"))) {
PyObject_CallMethod(this->self, "forward", "OO", this->inputs, this->outputs);
} else if (PyObject_HasAttr(this->self, String("grad"))) {
......
......@@ -78,7 +78,7 @@ void ReduceGradientOp<Context>::SumRunWithType() {
template <class Context> template <typename T>
void ReduceGradientOp<Context>::MeanRunWithType() {
auto* dXdata = output(0)->template mutable_data<T, Context>();
if (axis == -1){
if (axis == -1) {
auto* dYdata = input(-1).template data<T, CPUContext>();
math::Set<T, Context>(output(0)->count(), dYdata[0] / input(0).count(), dXdata);
} else {
......
......@@ -8,18 +8,18 @@ void ReshapeOp<Context>::RunOnDevice() {
int infer_dim = -1;
TIndex total_count = 1;
for (int i = 0; i < shape.size(); i++) {
// handle unchanged dim
// handle unchanged dim
if (shape[i] == 0) {
CHECK_LT(i, (int)Xdims.size())
<< "\ndim(" << i << ") is out of the Xdims range of (0, "
<< Xdims.size() << ").";
new_shape[i] = Xdims[i];
}
// handle explicitly set dim
// handle explicitly set dim
else if (shape[i] > 0) {
new_shape[i] = shape[i];
}
// handle inferred dim
// handle inferred dim
else {
CHECK_EQ(infer_dim, -1)
<< "\ndim(" << infer_dim << ") required infer before"
......@@ -30,7 +30,7 @@ void ReshapeOp<Context>::RunOnDevice() {
if (new_shape[i] != -1) total_count *= new_shape[i];
}
// solve inferred dim if necessary
// solve inferred dim if necessary
if (infer_dim != -1) {
for (int i = 0; i < new_shape.size(); i++) {
if (new_shape[i] == -1) {
......
......@@ -25,7 +25,7 @@ void ScanOp<Context>::InitTemplate() {
slice_def.add_arg()->CopyFrom(arg_nout);
template_def.mutable_device_option()->CopyFrom(op_def().device_option());
template_def.set_debug_mode(debug_mode);
// init for the first step
// init for the first step
for (int i = 0; i < nseqs; i++) {
OperatorDef* op = template_def.add_op();
op->CopyFrom(slice_def);
......@@ -37,19 +37,19 @@ void ScanOp<Context>::InitTemplate() {
OperatorDef* op = template_def.add_op();
op->CopyFrom(func_def.op(i));
op->set_name(name() + "(BodyOp." + str(i + nseqs) + ")@1");
// replace inputs term
// replace inputs term
for (int j = 0; j < op->input_size(); j++) {
string* input = op->mutable_input(j);
if (terms.count(*input)) *input = terms[*input];
}
// replace outputs term
// replace outputs term
for (int j = 0; j < op->output_size(); j++) {
string* output = op->mutable_output(j);
terms[*output] = *output + "@1";
*output = terms[*output];
}
}
// handle pre outputs
// handle pre outputs
for (int i = 0; i < nout; i++) {
if (default_outputs[i].empty()) continue;
terms[default_outputs[i]] = func_def.target(i) + "@1";
......@@ -59,15 +59,15 @@ void ScanOp<Context>::InitTemplate() {
template <class Context>
void ScanOp<Context>::UpdateTerms(int cur_step) {
string prev, now;
// update sequences term
// update sequences term
for (int i = 0; i < nseqs; i++) {
prev = input(i).name() + "@" + str(cur_step - 1);
now = input(i).name() + "@" + str(cur_step);
terms[prev] = now;
}
if (cur_step < 3) return;
// update recurrent term
// only support the latest one step (as Theano does)
// update recurrent term
// only support the latest one step (as Theano does)
for (int i = 0; i < nout; i++) {
if (default_outputs[i].empty()) continue;
prev = output(i)->name() + "@" + str(cur_step - 2);
......@@ -93,15 +93,15 @@ void ScanOp<Context>::UnrollTemplate() {
for (int idx = 0; idx < nseqs; idx++) {
OperatorDef *op = new_def.mutable_op(idx);
int nslices = input(idx).dim(axis);
// alter the num of slices for all sequences
// alter the num of slices for all sequences
op->mutable_arg(1)->set_i(nslices);
// add slices as outputs
// add slices as outputs
for (int t = 1; t <= nslices; t++) {
string slice = op->input(0) + "@" + str(t);
op->add_output(slice);
}
}
// main loop
// main loop
for (int t = 2; t <= nsteps; t++) {
UpdateTerms(t);
int copy_r = new_def.op_size(), copy_l = copy_r - nrepeats;
......@@ -109,12 +109,12 @@ void ScanOp<Context>::UnrollTemplate() {
OperatorDef* op = new_def.add_op();
op->CopyFrom(new_def.op(idx));
op->set_name(SplitString(op->name(), "@")[0] + "@" + str(t));
// replace inputs
// replace inputs
for (int j = 0; j < op->input_size(); j++) {
string* input = op->mutable_input(j);
if (terms.count(*input)) *input = terms[*input];
}
// replace outputs
// replace outputs
for (int j = 0; j < op->output_size(); j++) {
string* output = op->mutable_output(j);
terms[*output] = SplitString(*output, "@")[0] + "@" + str(t);
......@@ -123,9 +123,9 @@ void ScanOp<Context>::UnrollTemplate() {
}
}
for (int i = 0; i < nout; i++) {
// solve the last step only
// solve the last step only
new_def.add_target(func_def.target(i) + "@" + str(nsteps));
// concat all steps if necessary
// concat all steps if necessary
if (output(i)->name() == "ignore") continue;
OperatorDef* op = new_def.add_op();
op->set_name(name() + "(BodyOp." + str(nseqs + nrepeats + i) + ")");
......@@ -138,10 +138,10 @@ void ScanOp<Context>::UnrollTemplate() {
for (int t = 1; t <= nsteps; t++)
op->add_input(output(i)->name() + "@" + str(t));
op->add_output(output(i)->name());
// solve all the steps
// solve all the steps
new_def.add_target(output(i)->name());
}
// upload
// upload
Tensor* string_tensor = ws()->CreateTensor("_t_" + anchor() + "_raw_ops");
string_tensor->Reshape(vector<TIndex>(1, 1));
string* data = string_tensor->mutable_data <string, CPUContext>();
......@@ -150,13 +150,11 @@ void ScanOp<Context>::UnrollTemplate() {
template <class Context>
void ScanOp<Context>::RunOnDevice() {
// unroll
UnrollTemplate();
if (!graphs.count(nsteps))
if (!graphs.count(nsteps)) {
graphs[nsteps].reset(new Graph(new_def, ws()));
}
cur_graph = graphs[nsteps].get();
// forward
cur_graph->Run("", "");
}
......@@ -201,13 +199,11 @@ void ScanGradientOp<Context>::MakeGradientOps() {
template <class Context>
void ScanGradientOp<Context>::RunOnDevice() {
// make graph
MakeGradientOps();
if (!graphs.count(nsteps))
if (!graphs.count(nsteps)) {
graphs[nsteps].reset(new Graph(new_def, ws()));
}
cur_graph = graphs[nsteps].get();
// backward
cur_graph->Run("Gradient", "");
}
......
......@@ -7,7 +7,7 @@ namespace dragon {
template <class Context> template <typename T>
void SliceOp<Context>::RunWithType() {
auto* Xdata = input(0).template data<T, Context>();
for (int i = 0; i < nout; i++){
for (int i = 0; i < nout; i++) {
auto* Ydata = output(i)->template mutable_data<T, Context>();
TIndex count = output(i)->count();
kernel::Slice<T, Context>(count, outer_dim, inner_dim,
......@@ -46,7 +46,7 @@ OPERATOR_SCHEMA(Slice).NumInputs(1).NumOutputs(1, INT_MAX);
template <class Context> template <typename T>
void SliceGradientOp<Context>::RunWithType() {
auto* dXdata = output(0)->template mutable_data<T, Context>();
for (int i = 0; i < nout; i++){
for (int i = 0; i < nout; i++) {
if (input(i + 1).name() == "ignore") continue;
auto* dYdata = input(i + 1).template data<T, Context>();
TIndex count = input(i + 1).count();
......
......@@ -60,11 +60,11 @@ void AccuracyOp<Context>::RunWithType() {
}
}
count++;
} // end inner_num
} // end inner_num
} // end outer_num
output(0)->template mutable_data<T, CPUContext>()[0] = acc / count;
if (OutputSize() > 1){
if (OutputSize() > 1) {
auto* acc_per_class = output(1)->template mutable_data<T, CPUContext>();
for (int i = 0; i < classes; i++)
acc_per_class[i] = num_per_class[i] == 0 ? 0 : acc_per_class[i] / num_per_class[i];
......
......@@ -28,7 +28,7 @@ void L1LossOp<Context>::RunWithType() {
}
template <class Context>
void L1LossOp<Context>::RunOnDevice(){
void L1LossOp<Context>::RunOnDevice() {
CHECK_EQ(input(0).count(), input(1).count());
output(0)->Reshape(vector<TIndex>(1, 1));
diff = ws()->CreateTensor("_t_" + anchor() + "_l1_loss_diff");
......@@ -53,7 +53,7 @@ void L1LossGradientOp<Context>::RunWithType() {
else if (normalization == "FULL") normalizer = input(0).count();
else if (normalization == "NONE") normalizer = 1;
alpha = alpha / normalizer;
for (int i = 0; i < 2; i++){
for (int i = 0; i < 2; i++) {
if (output(i)->name() == "ignore") continue;
output(i)->ReshapeLike(input(i));
auto* dXdata = output(i)->template mutable_data<T, Context>();
......
......@@ -11,7 +11,7 @@ void L2LossOp<Context>::RunWithType() {
auto* diff_data = diff->template mutable_data<T, Context>();
auto* Ydata = output(0)->template mutable_data<T, CPUContext>();
math::Sub<T, Context>(diff->count(), X0data, X1data, diff_data);
if (InputSize() > 2){
if (InputSize() > 2) {
CHECK_EQ(input(0).count(), input(2).count());
auto* Wdata = input(2).template data<T, Context>();
math::Mul<T, Context>(diff->count(), Wdata, diff_data, diff_data);
......
......@@ -14,12 +14,12 @@ void SmoothL1LossOp<Context>::RunWithType() {
auto* Ydata = output(0)->template mutable_data<T, CPUContext>();
math::Sub<T, Context>(diff->count(), X0data, X1data, diff_data);
if (InputSize() > 2){
if (InputSize() > 2) {
auto* inside_w_data = input(2).template data<T, Context>();
math::Mul<T, Context>(diff->count(), inside_w_data, diff_data, diff_data);
}
kernel::SmoothL1<T, Context>(diff->count(), sigma2, diff_data, error_data);
if (InputSize() > 3){
if (InputSize() > 3) {
auto* outside_w_data = input(3).template data<T, Context>();
math::Mul<T, Context>(diff->count(), outside_w_data, error_data, error_data);
}
......
......@@ -69,7 +69,7 @@ void SoftmaxCrossEntropyLossGradientOp<Context>::RunWithType() {
return;
}
// normalize
// normalize
T normalizer;
if (normalization == "BATCH_SIZE") normalizer = outer_dim;
else if (normalization == "FULL") normalizer = outer_dim * inner_dim;
......
......@@ -26,14 +26,14 @@ void MPIBroadcastOp<Context>::RunWithType() {
}
template <class Context>
void MPIBroadcastOp<Context>::RunOnDevice(){
void MPIBroadcastOp<Context>::RunOnDevice() {
CHECK(this->comm != MPI_COMM_NULL)
<< "\nMPIBroadcastOp, name: " << name()
<< ", does not belong to any group, can't run.";
size_t ndim[1];
TIndex* dims = nullptr;
if (this->comm_rank == this->comm_root){
if (this->comm_rank == this->comm_root) {
ndim[0] = input(0).ndim();
dims = new TIndex[ndim[0]];
for (int i = 0; i < input(0).ndim(); i++)
......@@ -90,7 +90,7 @@ void MPIBroadcastGradientOp<Context>::RunWithType() {
}
template <class Context>
void MPIBroadcastGradientOp<Context>::RunOnDevice(){
void MPIBroadcastGradientOp<Context>::RunOnDevice() {
output(0)->ReshapeLike(input(-1));
if (input(0).template IsType<float>()) RunWithType<float>();
......
......@@ -51,7 +51,7 @@ template <class Context> template <typename T>
void MPIGatherGradientOp<Context>::RunWithType() {
if (this->comm_rank == this->comm_root) {
output(0)->Share(input(this->comm_rank + 1));
for (int i = 0; i < this->comm_size; i++){
for (int i = 0; i < this->comm_size; i++) {
if (i == this->comm_root) continue;
#ifdef WITH_CUDA_AWARE
auto* dYdata = input(this->comm_rank + 1).template data<T, Context>();
......
......@@ -120,7 +120,7 @@ void BatchNormOp<Context>::RunOnDevice() {
if (use_stats == -1) use_global_stats = phase() == "TEST" ? true : false;
else use_global_stats = use_stats == 1 ? true : false;
// if true, Act/Exp/Pow/Norm ops can not precede this op during training
// if true, Act/Exp/Pow/Norm ops can not precede this op during training
if (inplace) output(0)->Share(input(0));
if (input(0).template IsType<float>()) RunWithType<float>();
......@@ -171,7 +171,7 @@ void BatchNormGradientOp<Context>::RunWithType() {
auto* Ydata = input(-2).template data<T, Context>();
math::Mul<T, Context>(output(0)->count(), Ydata, dYdata, dXdata);
// sum(dE/dY \cdot Y)
// sum(dE/dY \cdot Y)
math::Gemv<T, Context>(CblasNoTrans, nbychans, spatial_dim,
1.0,
dXdata, SMul_data,
......@@ -193,10 +193,10 @@ void BatchNormGradientOp<Context>::RunWithType() {
0.0,
dXdata);
// sum(dE/dY \cdot Y) \cdot Y
// sum(dE/dY \cdot Y) \cdot Y
math::Mul<T, Context>(output(0)->count(), Ydata, dXdata, dXdata);
// sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y
// sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y
math::Gemv<T, Context>(CblasNoTrans, nbychans, spatial_dim,
1.0,
dYdata, SMul_data,
......@@ -224,7 +224,7 @@ void BatchNormGradientOp<Context>::RunWithType() {
-1.0 / (num * spatial_dim),
dXdata);
// divide by stddev
// divide by stddev
math::Div<T, Context>(output(0)->count(), dXdata, Std_data, dXdata);
// release buffer
......
......@@ -60,7 +60,7 @@ void BatchRenormOp<Context>::RunWithType() {
ctx().template Copy<T, Context, Context>(input(0).count(), Ydata, Xdata);
}
// subtract mean
// subtract mean
math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans, num, channels, 1,
1.0,
NMul_data, tMean_data,
......@@ -73,7 +73,7 @@ void BatchRenormOp<Context>::RunWithType() {
Ydata);
if (!use_global_stats) {
// Var(X) = E((X - EX) ^ 2)
// Var(X) = E((X - EX) ^ 2)
math::Pow<T, Context>(stddev->count(), 2, Ydata, Std_data);
math::Gemv<T, Context>(CblasNoTrans, nbychans, spatial_dim,
1.0 / (num * spatial_dim),
......@@ -85,7 +85,7 @@ void BatchRenormOp<Context>::RunWithType() {
NByC_data, NMul_data,
0.0,
tVar_data);
// update moving average
// update moving average
hFact_data[0] *= momentum; hFact_data[0] += 1;
int m = input(0).count() / channels;
T factor = m > 1 ? T(m) / (m - 1) : 1;
......@@ -93,31 +93,31 @@ void BatchRenormOp<Context>::RunWithType() {
math::Axpby<T, Context>(mean.count(), factor, tVar_data, momentum, hVar_data);
}
// normalize var
// normalize var
math::AddScalar<T, Context>(mean.count(), eps, tVar_data);
math::Pow<T, Context>(mean.count(), 0.5, tVar_data, tVar_data);
if (!use_global_stats) {
// normalize history var
// normalize history var
math::AddScalar<T, Context>(mean.count(), eps, thVar_data);
math::Pow<T, Context>(mean.count(), 0.5, thVar_data, thVar_data);
// compute r
// compute r
math::Div<T, Context>(mean.count(), tVar_data, thVar_data, tRdata);
math::Clip<T, Context>(mean.count(), 1.0 / t_r_max, t_r_max, tRdata);
// compute d
// compute d
math::Sub<T, Context>(mean.count(), tMean_data, thMean_data, tDdata);
math::Div<T, Context>(mean.count(), tDdata, thVar_data, tDdata);
math::Clip<T, Context>(mean.count(), -t_d_max, t_d_max, tDdata);
// update the bound of r & d
// update the bound of r & d
t_r_max = r_max / (1.0 + (r_max - 1.0) * exp(-t_val));
t_d_max = d_max / (1.0 + (d_max - 1.0) * exp(-2 * t_val));
t_val += t_delta;
}
// divide by var
// divide by var
math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans, num, channels, 1,
1.0,
NMul_data, tVar_data,
......@@ -131,11 +131,11 @@ void BatchRenormOp<Context>::RunWithType() {
math::Div<T, Context>(stddev->count(), Ydata, Std_data, Ydata);
if (!use_global_stats) {
// store x_norm for backward
// store x_norm for backward
XNorm_data = x_norm->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(output(0)->count(), XNorm_data, Ydata);
// correction: mul by r
// correction: mul by r
math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans, num, channels, 1,
1.0,
NMul_data, tRdata,
......@@ -183,7 +183,7 @@ void BatchRenormOp<Context>::RunOnDevice() {
if (use_stats == -1) use_global_stats = phase() == "TEST" ? true : false;
else use_global_stats = use_stats == 1 ? true : false;
// if true, Act/Exp/Pow/Norm ops can not precede this op during training
// if true, Act/Exp/Pow/Norm ops can not precede this op during training
if (inplace) output(0)->Share(input(0));
if (input(0).template IsType<float>()) RunWithType<float>();
......@@ -233,7 +233,7 @@ void BatchRenormGradientOp<Context>::RunWithType() {
auto* XNorm_data = x_norm->template data<T, Context>();
auto* tMean_data = mean.template mutable_data<T, Context>();
// buffer <- dE/dY \cdot r
// buffer <- dE/dY \cdot r
math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans, num, channels, 1,
1.0,
NMul_data, tRdata,
......@@ -246,7 +246,7 @@ void BatchRenormGradientOp<Context>::RunWithType() {
Std_data);
math::Mul<T, Context>(output(0)->count(), dYdata, Std_data, Std_data);
// sum(dE/dY \cdot Y)
// sum(dE/dY \cdot Y)
math::Mul<T, Context>(output(0)->count(), XNorm_data, Std_data, dXdata);
math::Gemv<T, Context>(CblasNoTrans, nbychans, spatial_dim,
1.0,
......@@ -269,10 +269,10 @@ void BatchRenormGradientOp<Context>::RunWithType() {
0.0,
dXdata);
// sum(dE/dY \cdot Y) \cdot Y
// sum(dE/dY \cdot Y) \cdot Y
math::Mul<T, Context>(output(0)->count(), XNorm_data, dXdata, dXdata);
// sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y
// sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y
math::Gemv<T, Context>(CblasNoTrans, nbychans, spatial_dim,
1.0,
Std_data, SMul_data,
......@@ -293,13 +293,13 @@ void BatchRenormGradientOp<Context>::RunWithType() {
NByC_data, SMul_data,
1.0, dXdata);
// dE/dY - mean(dE/dY)- mean(dE/dY \cdot Y) \cdot Y
// = dE/dY - mean(sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y)
// dE/dY - mean(dE/dY)- mean(dE/dY \cdot Y) \cdot Y
// = dE/dY - mean(sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y)
math::Axpby<T, Context>(output(0)->count(), 1.0, Std_data,
-1.0 / (num * spatial_dim),
dXdata);
// divide var
// divide var
math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans, num, channels, 1,
1.0,
NMul_data, tVar_data,
......
......@@ -60,7 +60,7 @@ void InstanceNormOp<Context>::RunWithType() {
}
template <class Context>
void InstanceNormOp<Context>::RunOnDevice(){
void InstanceNormOp<Context>::RunOnDevice() {
num = input(0).dim(0); channels = input(0).dim(1);
spatial_dim = input(0).count(2); nbychans = num * channels;
vector<TIndex> dims({ num, channels });
......@@ -69,7 +69,7 @@ void InstanceNormOp<Context>::RunOnDevice(){
output(0)->ReshapeLike(input(0));
// if true, Act/Exp/Pow/Norm ops can not precede this op during training
// if true, Act/Exp/Pow/Norm ops can not precede this op during training
if (inplace) output(0)->Share(input(0));
if (input(0).template IsType<float>()) RunWithType<float>();
......@@ -105,7 +105,7 @@ void InstanceNormGradientOp<Context>::RunWithType() {
auto* Ydata = input(-2).template data<T, Context>();
math::Mul<T, Context>(output(0)->count(), Ydata, dYdata, dXdata);
// sum(dE/dY \cdot Y)
// sum(dE/dY \cdot Y)
math::Gemv<T, Context>(CblasNoTrans, nbychans, spatial_dim,
1.0,
dXdata, SMul_data,
......@@ -116,10 +116,10 @@ void InstanceNormGradientOp<Context>::RunWithType() {
0.0,
dXdata);
// sum(dE/dY \cdot Y) \cdot Y
// sum(dE/dY \cdot Y) \cdot Y
math::Mul<T, Context>(output(0)->count(), Ydata, dXdata, dXdata);
// sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y
// sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y
math::Gemv<T, Context>(CblasNoTrans, nbychans, spatial_dim,
1.0,
dYdata, SMul_data,
......@@ -130,13 +130,13 @@ void InstanceNormGradientOp<Context>::RunWithType() {
1.0,
dXdata);
// dE/dY - mean(dE/dY)- mean(dE/dY \cdot Y) \cdot Y
// = dE/dY - mean(sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y)
// dE/dY - mean(dE/dY)- mean(dE/dY \cdot Y) \cdot Y
// = dE/dY - mean(sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y)
math::Axpby<T, Context>(output(0)->count(), 1.0, dYdata,
-1.0 / spatial_dim,
dXdata);
// divide by var
// divide by var
math::Div<T, Context>(output(0)->count(), dXdata, Std_data, dXdata);
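// assembled per instance and channel (a sketch of what the calls above build):
//   dE/dX ~= (dE/dY - mean(dE/dY) - mean(dE/dY \cdot Y) \cdot Y) / std
// where Y is the already-normalized output and the means run over spatial_dim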
// release buffer
......
......@@ -41,9 +41,9 @@ void L2NormOp<Context>::RunWithType() {
Bdata, DMuldata,
1.0,
Ndata);
// compute T2 = \sqrt{T1}
// compute T2 = \sqrt{T1}
math::Sqrt<T, Context>(inner_dim, Ndata, Ndata);
// compute T3 = x / [(T2)]_{dim}
// compute T3 = x / [(T2)]_{dim}
math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans, dim, inner_dim, 1,
1.0,
DMuldata, Ndata,
......@@ -109,7 +109,7 @@ void L2NormGradientOp<Context>::RunWithType() {
auto* Bdata = buffer->template mutable_data<T, Context>();
auto* BInnerdata = buffer_inner->template mutable_data<T, Context>();
for (int n = 0; n < outer_dim; n++){
for (int n = 0; n < outer_dim; n++) {
if (across_inner) {
Ndata = norm->template data<T, CPUContext>();
T sum_of_x_mul_dy = math::Dot<T, Context>(buffer->count(), Xdata, dYdata);
......@@ -117,21 +117,21 @@ void L2NormGradientOp<Context>::RunWithType() {
math::Sub<T, Context>(buffer->count(), dYdata, dXdata, dXdata);
math::Scal<T, Context>(buffer->count(), T(1.0 / Ndata[n]), dXdata);
} else {
// compute \sum_{i} x_{i, j}dy_{i, j}
// compute \sum_{i} x_{i, j}dy_{i, j}
math::Mul<T, Context>(buffer->count(), Xdata, dYdata, Bdata);
math::Gemv<T, Context>(CblasTrans, dim, inner_dim,
1.0,
Bdata, DMuldata,
0.0,
BInnerdata);
// compute T1 = x[(\sum_{i} x_{i, j}dy_{i, j})]_{dim}
// compute T1 = x[(\sum_{i} x_{i, j}dy_{i, j})]_{dim}
math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans, dim, inner_dim, 1,
1.0,
DMuldata, BInnerdata,
0.0,
Bdata);
math::Mul<T, Context>(buffer->count(), Xdata, Bdata, dXdata);
// compute T2 = T1 / Normalizer^{2}
// compute T2 = T1 / Normalizer^{2}
math::Pow<T, Context>(inner_dim, 2.0, Ndata, BInnerdata);
math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans, dim, inner_dim, 1,
1.0,
......@@ -139,7 +139,7 @@ void L2NormGradientOp<Context>::RunWithType() {
0.0,
Bdata);
math::Div<T, Context>(buffer->count(), dXdata, Bdata, dXdata);
// compute T3 = (dy - T2) / Normalizer
// compute T3 = (dy - T2) / Normalizer
math::Sub<T, Context>(buffer->count(), dYdata, dXdata, dXdata);
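// assembled, this is the usual L2-norm backward (a sketch):
//   dx = (dy - x \cdot (\sum_{i} x_{i} dy_{i}) / ||x||^2) / ||x||
// with the per-position sums and norms broadcast back over `dim` via DMuldata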
math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans, dim, inner_dim, 1,
1.0,
......
......@@ -18,10 +18,10 @@ void LSTMUnitOp<Context>::RunWithType() {
template <class Context>
void LSTMUnitOp<Context>::RunOnDevice() {
// input(0): ----- c_t_1
// input(1): ----- gate_input
// output(0): ----- c_t
// output(1): ----- h_t
// input(0): ----- c_t_1
// input(1): ----- gate_input
// output(0): ----- c_t
// output(1): ----- h_t
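// gate_input is assumed to pack the i/f/o/g gate values along the channel axis;
// the kernel then forms c_t = f * c_t_1 + i * g and h_t = o * tanh(c_t)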
num = input(0).dim(0);
channels = input(0).ndim() == 2 ? input(0).dim(1) : input(0).dim(2);
if (!has_cont.empty()) {
......@@ -57,13 +57,13 @@ void LSTMUnitGradientOp<Context>::RunWithType() {
template <class Context>
void LSTMUnitGradientOp<Context>::RunOnDevice() {
// input(0): ----- c_t_1
// input(1): ----- x_act
// input(2): ----- c_t
// input(3): ----- d(c_t)
// input(4): ----- d(h_t)
// output(0): ----- d(c_t_1)
// output(1): ----- d(gate_input)
// input(0): ----- c_t_1
// input(1): ----- x_act
// input(2): ----- c_t
// input(3): ----- d(c_t)
// input(4): ----- d(h_t)
// output(0): ----- d(c_t_1)
// output(1): ----- d(gate_input)
num = input(0).dim(0);
channels = input(0).ndim() == 2 ? input(0).dim(1) : input(0).dim(2);
output(0)->ReshapeLike(input(0));
......@@ -92,7 +92,7 @@ public:
vector<string> {I(0), I(1), O(0), GO(0), GO(1)},
vector<string> {GI(0), GI(1)});
}
// fill zero for dc_{T+1}
// fill zero for dc_{T+1}
vector<float> DefaultValues() override{ return{ 0.0, 1.0 }; }
};
REGISTER_GRADIENT(LSTMUnit, GetLSTMUnitGradient);
......
......@@ -21,7 +21,7 @@ void AsyncUpdateOp<Context>::UpdateTimestamp(int tag) {
}
template <class Context>
int AsyncUpdateOp<Context>::GetDelay(int tag){
int AsyncUpdateOp<Context>::GetDelay(int tag) {
Tensor* t = ws()->GetTensor("_t_" + this->domain + "async_timestamp");
int* global_timestamp = t->template mutable_data<int, CPUContext>();
return global_timestamp[tag] - local_timestamp[tag] + 1;
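// the returned delay measures how stale this client's copy is; it is used below
// (see the -1.0 / delay Axpy in ThreadRunWithType) to damp stale updates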
......@@ -35,33 +35,33 @@ AsyncUpdateOp<Context>::AsyncUpdateOp(const OperatorDef& op_def, Workspace* ws)
mode(OperatorBase::GetSingleArg<string>("mode", "Async")),
nsync(OperatorBase::GetSingleArg<int>("nsync", -1)) {
// make key-val tags
// make key-val tags
Tensor* t = this->ws()->CreateTensor("_t_" + this->domain + "async_tags");
t->Reshape(vector<TIndex>(1, InputSize()));
tags = t->template mutable_data<string, CPUContext>();
for (int i = 0; i < OutputSize(); i++) tags[i] = output(i)->name();
// make recv logs
// make recv logs
t = this->ws()->CreateTensor("_t_" + this->domain + "async_logs");
t->Reshape(vector<TIndex>(1, InputSize()));
// make recv buffers
acc_buffers = new Tensor*[InputSize()]; // for soft-sync
recv_buffer.reset(new Tensor()); // for async
// make recv buffers
acc_buffers = new Tensor*[InputSize()]; // for soft-sync
recv_buffer.reset(new Tensor()); // for async
// setup for server
// setup for server
if (this->comm_rank == this->comm_root) {
if (nsync == -1) nsync = this->comm_size; // fully async
if (nsync == -1) nsync = this->comm_size; // fully async
max_recv = this->comm_size / nsync;
// make global timestamp
// make global timestamp
t = this->ws()->CreateTensor("_t_" + this->domain + "async_timestamp");
t->Reshape(vector<TIndex>(1, InputSize()));
// make global buffers
// make global buffers
for (int i = 0; i < OutputSize(); i++)
acc_buffers[i] = this->ws()->CreateTensor(tags[i] + "_grad_async_acc");
}
// create independent stream for thread if using cuda-aware
// create independent stream for thread if using cuda-aware
#ifdef WITH_CUDA_AWARE
cudaStreamCreate(&stream);
cublasCreate_v2(&handle);
......@@ -71,7 +71,7 @@ AsyncUpdateOp<Context>::AsyncUpdateOp(const OperatorDef& op_def, Workspace* ws)
template <class Context> template <typename T>
void AsyncUpdateOp<Context>::RootRunWithType() {
for (int i = 0; i < InputSize(); i++){
for (int i = 0; i < InputSize(); i++) {
auto* dXdata = input(i).template mutable_data<T, Context>();
auto* Xdata = output(i)->template mutable_data<T, Context>();
......@@ -88,7 +88,7 @@ void AsyncUpdateOp<Context>::RootRunWithType() {
}
template <class Context>
void AsyncUpdateOp<Context>::RunOnDevice(){
void AsyncUpdateOp<Context>::RunOnDevice() {
if (this->comm_rank != this->comm_root) return;
if (input(0).template IsType<float>()) {
......@@ -102,11 +102,11 @@ void AsyncUpdateOp<Context>::RunOnDevice(){
template <class Context> template <typename T>
void AsyncUpdateOp<Context>::ThreadRunWithType() {
while (1) {
// pull from a specific client
// pull from a specific client
MPI_Status status;
MPI_Probe(node_id, MPI_ANY_TAG, this->comm, &status);
Tensor* X = ws()->GetTensor(tags[status.MPI_TAG]);
if (X->count() == 0) continue; // wait for server
if (X->count() == 0) continue; // wait for server
recv_buffer->ReshapeLike(*X);
#ifdef WITH_CUDA_AWARE
auto* Bdata = recv_buffer->template mutable_data<T, Context>();
......@@ -114,7 +114,7 @@ void AsyncUpdateOp<Context>::ThreadRunWithType() {
auto* Bdata = recv_buffer->template mutable_data<T, CPUContext>();
#endif
MPI_Recv(Bdata, X->count(), MPI_FLOAT, status.MPI_SOURCE, status.MPI_TAG, this->comm, MPI_STATUS_IGNORE);
// update
// update
#ifdef WITH_CUDA_AWARE
auto* Xdata = X->template mutable_data<T, Context>();
if (mode != "Async_No_Lock") ws()->LockTensor(output(status.MPI_TAG)->name());
......@@ -132,9 +132,9 @@ void AsyncUpdateOp<Context>::ThreadRunWithType() {
math::Axpy<T, CPUContext>(X->count(), -1.0 / delay, Bdata, Xdata);
if (mode != "Async_No_Lock") ws()->UnlockTensor(output(status.MPI_TAG)->name());
#endif
// push back to this client
// push back to this client
MPI_Send(Xdata, X->count(), MPI_FLOAT, status.MPI_SOURCE, status.MPI_TAG, this->comm);
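// one server round trip: probe a client, recv its gradient into the buffer,
// fold it into the shared param X scaled by 1 / delay, then send the refreshed
// X straight back to the same client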
// do statistics
// do statistics
update_count++;
if (update_count % (100 * InputSize()) == 0)
LOG(INFO) << "Server[" << node_id << "]: "
......
......@@ -59,7 +59,7 @@ void UpdateOpBase<Context>::ReduceRunWithType() {
int recv_from = (comm_rank - 1 + comm_size) % comm_size;
int send_to = (comm_rank + 1) % comm_size;
// scatter-reduce
// scatter-reduce
for (int i = 0; i < comm_size - 1; i++) {
int recv_chunk = (comm_rank - i - 1 + comm_size) % comm_size;
int send_chunk = (comm_rank - i + comm_size) % comm_size;
......@@ -83,7 +83,7 @@ void UpdateOpBase<Context>::ReduceRunWithType() {
}
ws()->ReleaseBuffer(buffer);
// allgather
// allgather
for (int i = 0; i < comm_size - 1; i++) {
int send_chunk = (comm_rank - i + 1 + comm_size) % comm_size;
int recv_chunk = (comm_rank - i + comm_size) % comm_size;
......@@ -97,8 +97,8 @@ void UpdateOpBase<Context>::ReduceRunWithType() {
0, comm, MPI_STATUS_IGNORE);
}
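// the two passes above form a ring all-reduce: scatter-reduce leaves each rank
// holding the full sum of one chunk, and allgather circulates those chunks so
// every rank ends up with the complete summed gradient before the averaging below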
// ave-normalize
if (comm_size > 1){
// ave-normalize
if (comm_size > 1) {
#ifdef WITH_CUDA_AWARE
math::Scal<T, Context>(count, T(1.0 / comm_size), dXdata);
#else
......@@ -110,13 +110,13 @@ void UpdateOpBase<Context>::ReduceRunWithType() {
template <class Context> template <typename T>
void UpdateOpBase<Context>::PreprocessRunWithType() {
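// preprocessing applies, in order: gradient scaling (scale_gradient),
// L2-norm clipping (clip_gradient), and L2 weight decay, i.e. dX += l2_decay * X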
// scale
// scale
scale_factor = param("scale_gradient");
if (scale_factor != 1){
if (scale_factor != 1) {
auto* dXdata = input(0).template mutable_data<T, Context>();
math::Scal<T, Context>(input(0).count(), scale_factor, dXdata);
}
// clip
// clip
clip_thresh = param("clip_gradient");
if (clip_thresh > 0) {
auto* dXdata = input(0).template mutable_data<T, Context>();
......@@ -127,12 +127,12 @@ void UpdateOpBase<Context>::PreprocessRunWithType() {
math::Scal<T, Context>(input(0).count(), factor, dXdata);
}
}
// decay
l2_decay = param("l2_decay");
if (l2_decay > 0){
// decay
l2_decay = param("l2_decay") * decay_mult;
if (l2_decay > 0) {
auto* dXdata = input(0).template mutable_data<T, Context>();
auto* Xdata = output(0)->template data<T, Context>();
math::Axpy<T, Context>(input(0).count(), l2_decay * decay_mult, Xdata, dXdata);
math::Axpy<T, Context>(input(0).count(), l2_decay, Xdata, dXdata);
}
}
......@@ -141,9 +141,9 @@ void UpdateOpBase<Context>::UpdateRunWithType() {
if (!allow_parallel || (allow_parallel && mode == "Sync")) {
auto* dXdata = input(0).template mutable_data<T, Context>();
auto* Xdata = output(0)->template mutable_data<T, Context>();
// update
// update
math::Axpy<T, Context>(output(0)->count(), -1.0, dXdata, Xdata);
// clear accumulated grads
// clear accumulated grads
math::Set<T, Context>(input(0).count(), 0, dXdata);
} else {
#ifdef WITH_MPI
......
......@@ -10,12 +10,12 @@ void FloatToHalfOp<Context>::RunOnDevice() {
<< "the type of tensor must be float32.";
output(0)->ReshapeLike(input(0));
// cast
// cast
auto* Xdata = input(0).template data<float, Context>();
auto* Ydata = output(0)->template mutable_data<float16, Context>();
kernel::Float2Half<float, Context>(output(0)->count(), Xdata, Ydata);
// release & share
// release & share
input(0).Reset();
input(0).ReshapeLike(*output(0));
input(0).Share(*output(0));
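// after Share, input(0) aliases the float16 output, so the original
// float32 storage is released and only the half-precision copy is kept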
......
......@@ -12,7 +12,7 @@ void CompareOp<Context>::EqualRunWithType() {
}
template <class Context>
void CompareOp<Context>::RunOnDevice(){
void CompareOp<Context>::RunOnDevice() {
CHECK_EQ(input(0).count(), input(1).count())
<< "both conditioned tensor must have same elements.";
output(0)->ReshapeLike(input(0));
......
......@@ -6,7 +6,7 @@ namespace dragon {
template <class Context> template <typename T>
void GradientGenerateOp<Context>::RunWithType() {
for (int i = 0; i < OutputSize(); i++){
for (int i = 0; i < OutputSize(); i++) {
if (output(i)->name() == "ignore") continue;
output(i)->ReshapeLike(input(i));
auto* dXdata = output(0)->template mutable_data<T, Context>();
......
......@@ -26,7 +26,7 @@ void InitializeOp<Context>::RunOnDevice() {
RunWithType<float>();
}
// constant
// constant
DEPLOY_CPU(Fill);
#ifdef WITH_CUDA
DEPLOY_CUDA(Fill);
......@@ -34,7 +34,7 @@ DEPLOY_CUDA(Fill);
OPERATOR_SCHEMA(Fill).NumInputs(0, 1).NumOutputs(1);
NO_GRADIENT(Fill);
// uniform
// uniform
DEPLOY_CPU(RandomUniform);
#ifdef WITH_CUDA
DEPLOY_CUDA(RandomUniform);
......@@ -42,7 +42,7 @@ DEPLOY_CUDA(RandomUniform);
OPERATOR_SCHEMA(RandomUniform).NumInputs(0, 1).NumOutputs(1);
NO_GRADIENT(RandomUniform);
// normal
// normal
DEPLOY_CPU(RandomNormal);
#ifdef WITH_CUDA
DEPLOY_CUDA(RandomNormal);
......@@ -50,7 +50,7 @@ DEPLOY_CUDA(RandomNormal);
OPERATOR_SCHEMA(RandomNormal).NumInputs(0, 1).NumOutputs(1);
NO_GRADIENT(RandomNormal);
// truncated normal
// truncated normal
DEPLOY_CPU(TruncatedNormal);
#ifdef WITH_CUDA
DEPLOY_CPU_CUDA(TruncatedNormal);
......@@ -58,7 +58,7 @@ DEPLOY_CPU_CUDA(TruncatedNormal);
OPERATOR_SCHEMA(TruncatedNormal).NumInputs(0, 1).NumOutputs(1);
NO_GRADIENT(TruncatedNormal);
// glorot uniform
// glorot uniform
DEPLOY_CPU(GlorotUniform);
#ifdef WITH_CUDA
DEPLOY_CUDA(GlorotUniform);
......@@ -66,7 +66,7 @@ DEPLOY_CUDA(GlorotUniform);
OPERATOR_SCHEMA(GlorotUniform).NumInputs(0, 1).NumOutputs(1);
NO_GRADIENT(GlorotUniform);
// glorot normal
// glorot normal
DEPLOY_CPU(GlorotNormal);
#ifdef WITH_CUDA
DEPLOY_CUDA(GlorotNormal);
......
......@@ -229,16 +229,16 @@ __global__ static void nms_mask(const Dtype boxes[],
}
}
// mask: "num_boxes x num_blocks" array
// for mask[j][bi], "di-th bit = 1" means:
// box j is significantly overlapped with box i = i_start + di,
// where i_start = bi * block_size
// mask: "num_boxes x num_blocks" array
// for mask[j][bi], "di-th bit = 1" means:
// box j is significantly overlapped with box i = i_start + di,
// where i_start = bi * block_size
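// e.g. assuming nms_block_size = 64, each mask entry is a 64-bit word and
// "is box j suppressed by box i" is the single bit probe
// (mask[j * num_blocks + i / 64] >> (i % 64)) & 1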
{
const int num_blocks = DIV_THEN_CEIL(num_boxes, nms_block_size);
const int bi = blockIdx.x;
mask[(j_start + dj) * num_blocks + bi] = mask_j;
}
} // endif dj < dj_end
} // endif dj < dj_end
}
}
......
......@@ -4,10 +4,10 @@ namespace dragon {
template <class Context>
void ShapeOp<Context>::RunOnDevice() {
// reshape
// reshape
output(0)->Reshape(vector<TIndex>(1, input(0).ndim()));
// forward
// forward
auto* Ydata = output(0)->template mutable_data<float, CPUContext>();
for (int i = 0; i < input(0).ndim(); i++) Ydata[i] = input(0).dim(i);
}
......
......@@ -7,7 +7,7 @@ namespace dragon {
template <class Context>
void ConvOp<Context>::ComputeOutputShape() {
this->output_shape.clear();
for (int i = 0; i < this->num_spatial_axes; i++){
for (int i = 0; i < this->num_spatial_axes; i++) {
const int input_dim = this->bottom_shape[this->channel_axis + i + 1];
const int dilated_kernel = this->dilation[i] * (this->kernel_size[i] - 1) + 1;
const int output_dim = (input_dim + 2 * this->pad[i] - dilated_kernel) / this->stride[i] + 1;
......@@ -43,7 +43,7 @@ void ConvOp<Context>::RunWithType() {
}
template <class Context>
void ConvOp<Context>::RunOnDevice(){
void ConvOp<Context>::RunOnDevice() {
Reshape();
if (input(0).template IsType<float>()) RunWithType<float>();
......@@ -71,7 +71,7 @@ void ConvGradientOp<Context>::RunWithType() {
Db(dYdata + n * this->y_offset, dBdata);
}
for (int n = 0; n < input(2).dim(0); n++){
for (int n = 0; n < input(2).dim(0); n++) {
if (output(1)->name() != "ignore") {
auto* Xdata = input(0).template data<T, Context>();
auto* dWdata = output(1)->template mutable_data<T, Context>();
......
......@@ -125,7 +125,7 @@ void ConvOpBase<Context>::Reshape() {
col_offset = kernel_dim * conv_out_spatial_dim;
output_offset = conv_out_channels * conv_out_spatial_dim / group;
// compute col buffer shape
// compute col buffer shape
col_buffer_shape.clear();
col_buffer_shape.push_back(kernel_dim * group);
for (int i = 0; i < num_spatial_axes; i++) {
......@@ -159,9 +159,9 @@ void ConvOpBase<Context>::GradientReshape() {
conv_out_spatial_dim = input(2).count(channel_axis + 1);
}
// compute input shape
// compute input shape
input_shape.clear();
for (int i = 0; i < num_spatial_axes; i++){
for (int i = 0; i < num_spatial_axes; i++) {
if (ReverseDimensions()) {
input_shape.push_back(input(2).dim(channel_axis + i + 1));
} else {
......@@ -169,7 +169,7 @@ void ConvOpBase<Context>::GradientReshape() {
}
}
kernel_dim = input(1).count(1); // in * kh * kw
kernel_dim = input(1).count(1); // in * kh * kw
out_spatial_dim = input(2).count(channel_axis + 1);
x_offset = input(0).count(channel_axis);
......@@ -178,10 +178,10 @@ void ConvOpBase<Context>::GradientReshape() {
col_offset = kernel_dim * conv_out_spatial_dim;
output_offset = conv_out_channels * conv_out_spatial_dim / group;
// compute col buffer shape
// compute col buffer shape
col_buffer_shape.clear();
col_buffer_shape.push_back(kernel_dim * group);
for (int i = 0; i < num_spatial_axes; i++){
for (int i = 0; i < num_spatial_axes; i++) {
if (ReverseDimensions()) {
col_buffer_shape.push_back(bottom_shape[channel_axis + i + 1]);
} else {
......
......@@ -228,13 +228,13 @@ void CuDNNConvGradientOp<Context>::RunWithType() {
const T* dYdata = input(2).template data<T, Context>();
for (int g = 0; g < this->group; g++) {
if (output(2)->name() != "ignore"){
if (output(2)->name() != "ignore") {
T* dBdata = output(2)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnConvolutionBackwardBias(handle[g],
CUDNNType<T>::one, input_desc, dYdata + this->y_offset * g,
CUDNNType<T>::one, bias_desc, dBdata + bias_offset * g));
}
if (output(1)->name() != "ignore"){
if (output(1)->name() != "ignore") {
auto* Xdata = input(0).template data<T, Context>();
auto* dWdata = output(1)->template mutable_data<T, Context>();
auto* workspace = buffer2->mutable_data<char, Context>();
......@@ -246,7 +246,7 @@ void CuDNNConvGradientOp<Context>::RunWithType() {
workspace + g * workspace_bwd_filter_size, workspace_bwd_filter_size,
CUDNNType<T>::one, filter_desc, dWdata + this->weight_offset * g));
}
if (output(0)->name() != "ignore"){
if (output(0)->name() != "ignore") {
auto* Wdata = input(1).template data<T, Context>();
auto* dXdata = output(0)->template mutable_data<T, Context>();
auto* workspace = buffer1->mutable_data<char, Context>();
......
......@@ -54,7 +54,7 @@ void CuDNNLRNGradientOp<Context>::RunWithType() {
}
template <class Context>
void CuDNNLRNGradientOp<Context>::RunOnDevice(){
void CuDNNLRNGradientOp<Context>::RunOnDevice() {
output(0)->ReshapeLike(input(0));
if (this->mode == ACROSS_CHANNELS) {
......
......@@ -43,7 +43,7 @@ void DeConvOp<Context>::RunWithType() {
}
template <class Context>
void DeConvOp<Context>::RunOnDevice(){
void DeConvOp<Context>::RunOnDevice() {
Reshape();
if (input(0).template IsType<float>()) RunWithType<float>();
......
......@@ -98,7 +98,7 @@ void LRNOp<Context>::ProdRunWithType() {
}
template <class Context>
void LRNOp<Context>::RunOnDevice(){
void LRNOp<Context>::RunOnDevice() {
if (mode == ACROSS_CHANNELS) {
if (input(0).template IsType<float>()) {
AcrossRunWithType<float>();
......@@ -223,7 +223,7 @@ void LRNGradientOp<Context>::SplitRunWithType() {
}
template <class Context>
void LRNGradientOp<Context>::RunOnDevice(){
void LRNGradientOp<Context>::RunOnDevice() {
if (mode == ACROSS_CHANNELS) {
if (input(0).template IsType<float>()) {
AcrossRunWithType<float>();
......
......@@ -16,7 +16,7 @@ void NNResizeOp<Context>::RunWithType() {
template <class Context>
void NNResizeOp<Context>::RunOnDevice() {
dims = input(0).dims();
if (dsize.size() == 0){
if (dsize.size() == 0) {
CHECK(fy != -1.0 && fx != -1.0);
dims[2] = int(dims[2] * fy);
dims[3] = int(dims[3] * fx);
......@@ -47,7 +47,7 @@ void NNResizeGradientOp<Context>::RunWithType() {
}
template <class Context>
void NNResizeGradientOp<Context>::RunOnDevice(){
void NNResizeGradientOp<Context>::RunOnDevice() {
output(0)->ReshapeLike(input(0));
if (input(0).template IsType<float>()) return RunWithType<float>();
......
......@@ -37,7 +37,7 @@ void cudnnSetTensorDesc(cudnnTensorDescriptor_t* desc, const vector<TIndex>& dim
int* dimA = new int[ndim];
int* strideA = new int[ndim];
TIndex stride = 1;
for (int i = ndim - 1; i >= 0; i--){
for (int i = ndim - 1; i >= 0; i--) {
strideA[i] = stride;
dimA[i] = dims[i];
stride *= dimA[i];
......@@ -55,7 +55,7 @@ void cudnnSetTensorDesc(cudnnTensorDescriptor_t* desc,
int ndim = (int)dims.size();
int* dimA = new int[ndim];
int* strideA = new int[ndim];
for (int i = ndim - 1; i >= 0; i--){
for (int i = ndim - 1; i >= 0; i--) {
strideA[i] = strides[i];
dimA[i] = dims[i];
}
......@@ -66,10 +66,10 @@ void cudnnSetTensorDesc(cudnnTensorDescriptor_t* desc,
template <typename T>
void cudnnSetTensorDesc(cudnnTensorDescriptor_t* desc, Tensor* tensor) {
// CUDNN only support ndim from 3 to 8
// we fake reshaped dims to pass the check
// cuDNN requires ndim from 3 to 8
// we fake reshaped dims to pass the check
vector<TIndex> fake_dims(tensor->dims());
if (fake_dims.size() < 3 || fake_dims.size() > 8){
if (fake_dims.size() < 3 || fake_dims.size() > 8) {
fake_dims.assign({ 1, 1 });
fake_dims.push_back(tensor->count());
}
......
......@@ -142,7 +142,7 @@ template <> void Add<float, CPUContext>(const int n,
template <> void Sub<float, CPUContext>(const int n,
const float* a,
const float* b,
float* y){
float* y) {
#ifdef WITH_SSE
sse::Sub<float>(n, a, b, y);
#else // naive implement
......
......@@ -66,8 +66,8 @@ template <> void RandomUniform<uint32_t, CUDAContext>(const int n,
const float low,
const float high,
uint32_t* x) {
// note that low / high are ignored here
// curand can only generate uniformly over the full uint32 range
// note that low / high are ignored here
// curand can only generate uniformly over the full uint32 range
CURAND_CHECK(curandGenerate(curand_generator(), x, n));
}
......@@ -95,8 +95,8 @@ template <> void RandomNormal<float16, CUDAContext>(const int n,
template <> void RandomBernoulli<float, CUDAContext>(const int n,
const float p,
unsigned int* x) {
// curand cannot generate a Bernoulli distribution directly
// we recommend implementing it within the specific op, e.g. Dropout
// curand cannot generate a Bernoulli distribution directly
// we recommend implementing it within the specific op, e.g. Dropout
NOT_IMPLEMENTED;
}
......
......@@ -98,7 +98,7 @@ template<> void Softmax<float, CPUContext>(const int count,
const int dim = count / outer_dim;
for (int i = 0; i < outer_dim; ++i) {
context->Copy<float, CPUContext, CPUContext>(inner_dim, scale, x + i*dim);
for (int j = 0; j < classes; ++j){
for (int j = 0; j < classes; ++j) {
for (int k = 0; k < inner_dim; k++)
scale[k] = std::max(scale[k], x[i * dim + j * inner_dim + k]);
}
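// scale now holds, for every inner position, the maximum over the classes;
// it is presumably subtracted before exponentiation to keep the softmax stable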
......@@ -668,7 +668,7 @@ template <> void OneHot<float, CPUContext>(const int count,
template<> void AbsGrad<float, CPUContext>(const int count, const float* dy, float* dx) {
for (int i = 0; i < count; ++i) {
const float val = dy[i];
// val > 0: 1 | val == 0: 0 | val < 0: -1
// val > 0: 1 | val == 0: 0 | val < 0: -1
dx[i] = (val > float(0)) - (val < float(0));
}
}
......@@ -707,7 +707,7 @@ template<> void SmoothL1Grad<float, CPUContext>(const int count,
const float val = dy[i];
const float abs_val = abs(val);
if (abs_val < 1.0 / sigma2) dx[i] = val * sigma2;
// val > 0: 1 | val == 0: 0 | val < 0: -1
// val > 0: 1 | val == 0: 0 | val < 0: -1
else dx[i] = (val > float(0)) - (val < float(0));
}
}
......@@ -862,11 +862,11 @@ template <> void LSTMUnitGrad<float, CPUContext>(const int count,
p_df = dx + f_offset + ch;
p_do = dx + o_offset + ch;
p_dg = dx + g_offset + ch;
// BPTT computes dc_{t-1} at time t
// dc_{t-1} = dl / d(h_{t}) * d(h_{t}) / d(c_{t}) * d(c_{t}) / d(c_{t-1})
// BPTT computes dc_{t-1} at time t
// dc_{t-1} = dl / d(h_{t}) * d(h_{t}) / d(c_{t}) * d(c_{t}) / d(c_{t-1})
// + d(c_{t+1}) / d(c_{t}) * d(c_{t}) / d(c_{t-1})
// = (dl / d(h_{t}) * d(h_{t}) / d(c_{t}) + d(c_{t+1}) / d(c_{t}))
// * d(c_{t}) / d(c_{t-1})
// = (dl / d(h_{t}) * d(h_{t}) / d(c_{t}) + d(c_{t+1}) / d(c_{t}))
// * d(c_{t}) / d(c_{t-1})
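// with c_t = f * c_t_1 + i * g and h_t = o * tanh(c_t), the lines below reduce to
// dc_t_1 = (dh_t * o * (1 - tanh(c_t)^2) + dc_t) * f  (same identity, written out)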
tanh_c_t = tanh(c[ch]);
dc_1_sum_term = dh[ch] * o * (1 - tanh_c_t * tanh_c_t) + dc[ch];
dc_1[ch] = dc_1_sum_term * f;
......@@ -1182,18 +1182,18 @@ template<> void MAXPooling<float, CPUContext>(const int count,
max_val = x[idx];
max_idx = idx;
}
} // end w
} // end h
} // end w
} // end h
y[pool_idx] = max_val;
mask[pool_idx] = max_idx;
} // end pw
} // end ph
// offset a channel
} // end pw
} // end ph
// offset a channel
x += x_offset;
y += y_offset;
mask += y_offset;
} // end c
} // end n
} // end c
} // end n
}
template<> void AVEPooling<float, CPUContext>(const int count,
......@@ -1257,13 +1257,13 @@ template<> void MAXPoolingGrad<float, CPUContext>(const int count,
const int pool_idx = ph * pool_width + pw;
const int idx = mask[pool_idx];
dx[idx] += dy[pool_idx];
} // end pw
} // end ph
} // end pw
} // end ph
dx += x_offset;
dy += y_offset;
mask += y_offset;
} // end c
} // end n
} // end c
} // end n
}
template<> void AVEPoolingGrad<float, CPUContext>(const int count,
......@@ -1298,12 +1298,12 @@ template<> void AVEPoolingGrad<float, CPUContext>(const int count,
dx[idx] += (dy[pool_idx] / pool_size);
}
}
} // end pw
} // end ph
} // end pw
} // end ph
dx += x_offset;
dy += y_offset;
} // end c
} // end n
} // end c
} // end n
}
/******************** vision.roi_pooling ********************/
......@@ -1362,18 +1362,18 @@ template<> void ROIPooling<float, CPUContext>(const float spatial_scale,
Ydata[pool_idx] = Idata[idx];
Mdata[pool_idx] = idx;
}
} //end w
} // end h
} // end pw
} // end ph
// offset image channels
} // end w
} // end h
} // end pw
} // end ph
// offset image channels
Idata += x->offset(0, 1);
Ydata += y->offset(0, 1);
Mdata += mask->offset(0, 1);
} // end c
// offset roi region
} // end c
// offset roi region
Rdata += roi->offset(1);
} //end n
} // end n
}
template<> void ROIPoolingGrad<float, CPUContext>(const float spatial_scale,
......
......@@ -130,7 +130,7 @@ __global__ void _ReluGrad(const int count,
const T* y,
const float slope,
T* dx) {
CUDA_KERNEL_LOOP(i, count){
CUDA_KERNEL_LOOP(i, count) {
dx[i] = dy[i] * ((y[i] > 0) + slope * (y[i] <= 0));
}
}
......@@ -912,7 +912,7 @@ __global__ void _Sum(const int count,
template<> void Sum<float, CUDAContext>(
const int count, const int axis_dim,
const int inner_dim, const float* x, float* y){
const int inner_dim, const float* x, float* y) {
_Sum<float> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
axis_dim,
inner_dim,
......@@ -954,7 +954,7 @@ template<> void SumGrad<float, CUDAContext>(const int count,
template <typename T>
__global__ void _Slice(const int count, const int outer_dim, const int inner_dim,
const int x_slice_dim, const int y_slice_dim, const int slice_offset, const T* x, T* y){
const int x_slice_dim, const int y_slice_dim, const int slice_offset, const T* x, T* y) {
CUDA_KERNEL_LOOP(idx, count) {
const int tmp = y_slice_dim * inner_dim;
const int outer_idx = idx / tmp;
......@@ -1238,7 +1238,7 @@ template <typename T>
__global__ void _AbsGrad(const int count, const T* dy, T* dx) {
CUDA_KERNEL_LOOP(idx, count) {
const T val = dy[idx];
// val > 0: 1 | val == 0: 0 | val < 0: -1
// val > 0: 1 | val == 0: 0 | val < 0: -1
dx[idx] = (val > T(0)) - (val < T(0));
}
}
......@@ -1298,7 +1298,7 @@ __global__ void _SmoothL1Grad(const int count, const float sigma2, const T* dy,
const T val = dy[idx];
const T abs_val = abs(val);
if (abs_val < 1.0 / sigma2) dx[idx] = val * sigma2;
// val > 0: 1 | val == 0: 0 | val < 0: -1
// val > 0: 1 | val == 0: 0 | val < 0: -1
else dx[idx] = (val > T(0)) - (val < T(0));
}
}
......@@ -1952,8 +1952,8 @@ __global__ void _Col2Im(const int count,
const int ex_kernel_w = (kernel_w - 1) * dilation_w + 1;
const int w_start = (im_w < ex_kernel_w) ? 0 : (im_w - ex_kernel_w) / stride_w + 1;
// redundant pixels are ignored by the conv,
// so the range is clipped by min(x, col_w)
// redundant pixels are ignored by the conv,
// so the range is clipped by min(x, col_w)
const int w_end = min(im_w / stride_w + 1, col_w);
const int h_start = (im_h < ex_kernel_h) ? 0 : (im_h - ex_kernel_h) / stride_h + 1;
const int h_end = min(im_h / stride_h + 1, col_h);
......@@ -1962,8 +1962,8 @@ __global__ void _Col2Im(const int count,
for (int w = w_start; w < w_end; ++w) {
int kh_off = (im_h - h * stride_h);
int kw_off = (im_w - w * stride_w);
// in a dilated conv only the im pixels on the dilation grid are used,
// the col pixels of the other offsets are ignored
// in a dilated conv only the im pixels on the dilation grid are used,
// the col pixels of the other offsets are ignored
if (kh_off % dilation_h == 0 && kw_off % dilation_w == 0) {
kh_off /= dilation_h;
kw_off /= dilation_w;
......@@ -2222,11 +2222,11 @@ __global__ void _MAXPoolingGrad(const int count,
const int c = (idx / width / height) % channels;
const int n = idx / width / height / channels;
// allow overlapping
// allow overlapping
const int start_ph = (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;
const int start_pw = (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;
// allow clip
// allow clip
const int end_ph = min((h + pad_h) / stride_h + 1, pool_height);
const int end_pw = min((w + pad_w) / stride_w + 1, pool_width);
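// (start_ph, end_ph) x (start_pw, end_pw) enumerates every pooling window whose
// receptive field covers pixel (h, w); the gradient is then gathered only from
// the windows whose stored argmax selects this pixel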
......@@ -2436,7 +2436,7 @@ __global__ void _ROIPoolingGrad(const int count,
const T* cur_roi = roi + n * 5;
const int im_idx_spec = cur_roi[0];
// ignore wrong im_batch_idx
// ignore wrong im_batch_idx
if (im_idx != im_idx_spec) continue;
int x1 = round(cur_roi[1] * spatial_scale);
......@@ -2474,9 +2474,9 @@ __global__ void _ROIPoolingGrad(const int count,
if (mask_off[pool_idx] == (h * width + w)) {
diff += dy_off[pool_idx];
}
} // end pw
} // end ph
} // end n
} // end pw
} // end ph
} // end n
dx[idx] = diff;
}
}
......@@ -2647,7 +2647,7 @@ __global__ void _ROIAlignGrad(const int count,
const T* cur_roi = roi + n * 5;
const int im_idx_spec = cur_roi[0];
// ignore wrong im_batch_idx
// ignore wrong im_batch_idx
if (im_idx != im_idx_spec) continue;
T x1 = cur_roi[1] * spatial_scale;
......@@ -2693,9 +2693,9 @@ __global__ void _ROIAlignGrad(const int count,
else gradient_factor *= mw - w1;
diff += dy_off[ph * pool_w + pw] * gradient_factor;
}
} // end pw
} // end ph
} // end n
} // end pw
} // end ph
} // end n
dx[idx] = diff;
}
}
......
......@@ -9,20 +9,20 @@ namespace dragon {
namespace sse {
template<> void Set(const int n, const float alpha, float* x){
template<> void Set(const int n, const float alpha, float* x) {
__m128 scalar = SSE_FP32_SCALAR(alpha);
SSE_LOOP1(i, n) SSE_FP32_STORE(x + i, scalar);
SSE_LOOP2(i, n) x[i] = alpha;
}
template<> void Set(const int n, const int alpha, int* x){
template<> void Set(const int n, const int alpha, int* x) {
__m128i scalar = SSE_INT32_SCALAR(alpha);
__m128i* x1 = reinterpret_cast<__m128i*>(x);
SSE_LOOP1(i, n) SSE_INT128_STORE(x1++, scalar);
SSE_LOOP2(i, n) x[i] = alpha;
}
template<> void Add(const int n, const float* a, const float* b, float* y){
template<> void Add(const int n, const float* a, const float* b, float* y) {
__m128 x1, y1, z1;
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(a + i);
......@@ -33,7 +33,7 @@ namespace sse {
SSE_LOOP2(i, n) y[i] = a[i] + b[i];
}
template<> void Sub(const int n, const float* a, const float* b, float* y){
template<> void Sub(const int n, const float* a, const float* b, float* y) {
__m128 x1, y1, z1;
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(a + i);
......@@ -44,7 +44,7 @@ namespace sse {
SSE_LOOP2(i, n) y[i] = a[i] - b[i];
}
template<> void Mul(const int n, const float* a, const float* b, float* y){
template<> void Mul(const int n, const float* a, const float* b, float* y) {
__m128 x1, y1, z1;
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(a + i);
......@@ -55,7 +55,7 @@ namespace sse {
SSE_LOOP2(i, n) y[i] = a[i] * b[i];
}
template<> void Div(const int n, const float* a, const float* b, float* y){
template<> void Div(const int n, const float* a, const float* b, float* y) {
__m128 x1, y1, z1;
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(a + i);
......@@ -66,7 +66,7 @@ namespace sse {
SSE_LOOP2(i, n) y[i] = a[i] / b[i];
}
template<> void Scal(const int n, const float alpha, float* y){
template<> void Scal(const int n, const float alpha, float* y) {
__m128 y1, scalar = SSE_FP32_SCALAR(alpha);
SSE_LOOP1(i, n) {
y1 = SSE_FP32_LOAD(y + i);
......@@ -76,7 +76,7 @@ namespace sse {
SSE_LOOP2(i, n) y[i] *= alpha;
}
template<> void Scale(const int n, const float alpha, const float* x, float* y){
template<> void Scale(const int n, const float alpha, const float* x, float* y) {
__m128 x1, scalar = SSE_FP32_SCALAR(alpha);
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(x + i);
......@@ -87,7 +87,7 @@ namespace sse {
}
template<> void Axpy(const int n, float alpha, const float* x, float *y){
template<> void Axpy(const int n, float alpha, const float* x, float *y) {
__m128 x1, y1, scalar = SSE_FP32_SCALAR(alpha);
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(x + i);
......@@ -100,7 +100,7 @@ namespace sse {
}
template<> void Axpby(const int n, float alpha, const float* x,
const float beta, float *y){
const float beta, float *y) {
__m128 x1, y1, z1;
__m128 scalar1 = SSE_FP32_SCALAR(alpha);
__m128 scalar2 = SSE_FP32_SCALAR(beta);
......@@ -115,7 +115,7 @@ namespace sse {
SSE_LOOP2(i, n) y[i] = alpha * x[i] + beta* y[i];
}
template<> float ASum(const int n, const float *x){
template<> float ASum(const int n, const float *x) {
__m128 x1, sum = SSE_FP32_ZERO;
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(x + i);
......@@ -128,7 +128,7 @@ namespace sse {
return ret;
}
template<> void AddScalar(const int n, const float alpha, float* y){
template<> void AddScalar(const int n, const float alpha, float* y) {
__m128 y1, scalar = SSE_FP32_SCALAR(alpha);
SSE_LOOP1(i, n) {
y1 = SSE_FP32_LOAD(y + i);
......@@ -138,7 +138,7 @@ namespace sse {
SSE_LOOP2(i, n) y[i] += alpha;
}
template<> void MulScalar(const int n, const float alpha, float* y){
template<> void MulScalar(const int n, const float alpha, float* y) {
__m128 y1, scalar = SSE_FP32_SCALAR(alpha);
SSE_LOOP1(i, n) {
y1 = SSE_FP32_LOAD(y + i);
......@@ -148,7 +148,7 @@ namespace sse {
SSE_LOOP2(i, n) y[i] *= alpha;
}
template <> float Dot(const int n, const float* a, const float* b){
template <> float Dot(const int n, const float* a, const float* b) {
__m128 x1, y1, sum = SSE_FP32_ZERO;
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(a + i);
......
......@@ -11,6 +11,7 @@ import sys
import time
import shutil
import tarfile
import numpy as np
from six.moves import range as xrange
import cv2
......@@ -78,7 +79,7 @@ def extract_images():
f.write(item)
def make_db(image_path, label_path, database_path):
def make_db(image_path, label_path, database_path, pad=0):
if os.path.isfile(label_path) is False:
raise ValueError('input path is empty or wrong.')
if os.path.isdir(database_path) is True:
......@@ -111,6 +112,12 @@ def make_db(image_path, label_path, database_path):
label = record[1]
img = cv2.imread(os.path.join(image_path ,path))
if pad > 0:
pad_img = np.zeros((img.shape[0] + 2 * pad,
img.shape[1] + 2 * pad, 3), dtype=np.uint8)
pad_img[pad : pad + img.shape[0],
pad : pad + img.shape[1], :] = img
img = pad_img
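# a zero border of `pad` pixels is added on each side, so an H x W image is
# stored as (H + 2 * pad) x (W + 2 * pad); handy if random crops back to
# H x W are taken at training time (an assumption about the intended use)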
result, imgencode = cv2.imencode('.jpg', img, encode_param)
datum = caffe_pb2.Datum()
......