Commit 5d518b6c by Ting PAN

io refactoring

1 parent 31e02b2b
Showing with 502 additions and 430 deletions
......@@ -24,7 +24,7 @@ set(3RDPARTY_DIR ${PROJECT_SOURCE_DIR}/../3rdparty)
set(PYTHON_DIR /usr/include/python2.7) # prefer
#set(PYTHON_DIR /usr/include/python3.x) # optional, set specific version
#set(ANACONDA_DIR /xxx/anaconda) # optional, set specific version below if using py3
set(NUMPY_DIR /xxx/numpy) # require root folder of numpy package
set(NUMPY_DIR /xxx/numpy) # require, root folder of numpy package
# set CUDA compiling architecture
set(CUDA_ARCH -gencode arch=compute_20,code=sm_20
......
......@@ -52,7 +52,7 @@ class CPUContext{
inline static void Delete(void* data) { free(data); }
template<typename T, class DstContext, class SrcContext>
inline static void Copy(int n, T* dst, const T* src){
inline static void Copy(int n, T* dst, const T* src) {
if (dst == src) return;
// only the basic types(e.g. int/float) can memcpy correctly
if (std::is_fundamental<T>::value)
......
......@@ -119,7 +119,7 @@ class CUDAContext {
inline static void Delete(void* data) { cudaFree(data); }
template<typename T, class DstContext, class SrcContext>
static void Copy(int n, T* dst, const T* src){
static void Copy(int n, T* dst, const T* src) {
if (dst == src) return;
Memcpy<SrcContext, DstContext>(n * sizeof(T), (void*)dst, (const void*)src);
}
......@@ -148,7 +148,7 @@ class CUDAContext {
}
#ifdef WITH_CUDNN
cudnnHandle_t cudnn_handle(){
cudnnHandle_t cudnn_handle() {
auto& handle = cuda_object_.cudnn_handle[gpu_id_];
if (handle) {
return handle;
......
......@@ -77,7 +77,7 @@ class Tensor {
inline TIndex offset(const vector<TIndex>& vec) {
CHECK_LE(vec.size(), ndim());
TIndex offset = 0;
for (int i = 0; i < ndim(); i++){
for (int i = 0; i < ndim(); i++) {
offset = offset * dim(i);
if (vec.size() > i) offset += vec[i];
}
......@@ -130,7 +130,7 @@ class Tensor {
}
template <class Context>
void* raw_mutable_data(const TypeMeta& meta){
void* raw_mutable_data(const TypeMeta& meta) {
void* data_ptr;
active_data_ptr<Context>(&data_ptr);
if (meta_ == meta && data_ptr) {
......
......@@ -75,20 +75,20 @@ class TypeMeta {
bool Match() const { return (id_ == Id<T>()); }
template <typename T>
static void Ctor(void* ptr, size_t n){
static void Ctor(void* ptr, size_t n) {
T* typed_ptr = static_cast<T*>(ptr);
for (unsigned int i = 0; i < n; i++) new(typed_ptr + i) T;
}
template <typename T>
static void Copy(const void* src, void* dst, size_t n){
static void Copy(const void* src, void* dst, size_t n) {
const T* typed_src = static_cast<const T*>(src);
T* typed_dst = static_cast<T*>(dst);
for (unsigned int i = 0; i < n; i++) typed_dst[i] = typed_src[i];
}
template <typename T>
static void Dtor(void* ptr, size_t n){
static void Dtor(void* ptr, size_t n) {
T* typed_ptr = static_cast<T*>(ptr);
for (unsigned int i = 0; i < n; i++) typed_ptr[i].~T();
}
......
......@@ -44,7 +44,7 @@ class Workspace{
return tensor_map_.count(query) > 0;
}
inline Tensor* CreateTensor(const string& name){
inline Tensor* CreateTensor(const string& name) {
string query = GetTensorName(name);
if (!HasTensor(query))
tensor_map_[query] = unique_ptr<Tensor>(new Tensor(query));
......@@ -143,7 +143,7 @@ class Workspace{
return graph_map_[graph_name]->Run(include, exclude);
}
inline vector<string> GetGraphs(){
inline vector<string> GetGraphs() {
vector<string> names;
for (auto& it : graph_map_) names.push_back(it.first);
return names;
......
......@@ -54,11 +54,11 @@ class ScanGradientOp final: public Operator<Context> {
step_tensor(OperatorBase::GetSingleArg<string>("step_tensor", "")),
forward_inputs(OperatorBase::GetRepeatedArg<string>("inputs_name")),
forward_outputs(OperatorBase::GetRepeatedArg<string>("outputs_name")) {
// handle GO(x)
// handle GO(x)
for (int i = 0; i < forward_outputs.size(); i++)
terms[forward_outputs[i] + "_grad"] = input(i + (int)OutputSize()).name();
// handle GI(x)
// handle GI(x)
for (int i = 0; i < forward_inputs.size(); i++)
terms[forward_inputs[i] + "_grad"] = output(i)->name();
}
......
......@@ -25,9 +25,9 @@ class AccuracyOp final: public Operator<Context> {
public:
AccuracyOp(const OperatorDef& op_def, Workspace* ws)
: Operator<Context>(op_def, ws),
top_k(OperatorBase::GetSingleArg<int>("top_k", 1)){
top_k(OperatorBase::GetSingleArg<int>("top_k", 1)) {
vector<int> args = OperatorBase::GetRepeatedArg<int>("ignore_labels");
if (args.size()){
if (args.size()) {
ignore_labels.Reshape(vector<TIndex>(1, args.size()));
int* ignore_data = ignore_labels.mutable_data<int, CPUContext>();
for (int i = 0; i < args.size(); i++) ignore_data[i] = args[i];
......
......@@ -39,7 +39,7 @@ class ROIAlignGradientOp : public Operator<Context> {
: Operator<Context>(op_def, ws),
pool_h(OperatorBase::GetSingleArg<int>("pool_h", 0)),
pool_w(OperatorBase::GetSingleArg<int>("pool_w", 0)),
spatial_scale(OperatorBase::GetSingleArg<float>("spatial_scale", 1.0)){
spatial_scale(OperatorBase::GetSingleArg<float>("spatial_scale", 1.0)) {
CHECK_GT(pool_h, 0) << "\npool_h must be > 0";
CHECK_GT(pool_w, 0) << "\npool_w must be > 0";
}
......
......@@ -54,7 +54,7 @@ inline void LoadCaffeModel(string file, string scope, Workspace* ws) {
ReadProtoFromBinaryFile(file.c_str(), &net_param);
LOG(INFO) << "Restore From Model @: " << file << "......";
LOG(INFO) << "Model Format: CaffeModel";
for (int i = 0; i < net_param.layer_size(); i++){
for (int i = 0; i < net_param.layer_size(); i++) {
const LayerParameter& layer = net_param.layer(i);
const string& layer_name = layer.name();
string prefix = scope + layer_name + "@param";
......
......@@ -54,7 +54,7 @@ class TruncatedNormalFiller final : public Filler < T, Context > {
public:
TruncatedNormalFiller(const TensorFiller& filler): Filler<T, Context>(filler) {}
void Fill(Tensor* tensor) override {
// the GPU implementation is difficult
// the GPU implementation is difficult
math::RandomTruncatedNormal<T, CPUContext>(tensor->count(),
filler().mean(),
filler().std(),
......
......@@ -22,7 +22,7 @@ inline std::vector<std::string> SplitString(const std::string& str,
std::vector<std::string> ret;
std::string temp(str);
size_t pos;
while (pos = temp.find(c), pos != std::string::npos){
while (pos = temp.find(c), pos != std::string::npos) {
ret.push_back(temp.substr(0, pos));
temp.erase(0, pos + 1);
}
......
......@@ -31,7 +31,7 @@ const TypeMeta& NumpyTypeToDragon(int numpy_type) {
{ NPY_FLOAT16, TypeMeta::Make<float16>() },
{ NPY_UINT8, TypeMeta::Make<uint8_t>() }};
static TypeMeta unknown_type; // id = 0
static TypeMeta unknown_type;
return dragon_type_map.count(numpy_type) ? dragon_type_map[numpy_type] : unknown_type;
}
......@@ -50,7 +50,7 @@ REGISTER_TENSOR_FETCHER(TypeMeta::Id<NumpyFetcher>(), NumpyFetcher);
REGISTER_TENSOR_FETCHER(TypeMeta::Id<StringFetcher>(), StringFetcher);
REGISTER_TENSOR_FEEDER(TypeMeta::Id<NumpyFeeder>(), NumpyFeeder);
extern "C"{
extern "C" {
PyObject* RegisteredOperatorsCC(PyObject* self, PyObject* args) {
set<string> all_keys;
......@@ -123,7 +123,7 @@ bool SwitchWorkspaceInternal(const string& name, const bool create_if_missing) {
} else if (create_if_missing) {
unique_ptr<Workspace> new_workspace(new Workspace());
g_workspace = new_workspace.get();
g_workspaces[name] = std::move(new_workspace); // ???
g_workspaces[name] = std::move(new_workspace);
g_current_workspace = name;
return true;
} else {
......
......@@ -33,7 +33,7 @@ inline PyObject* StdStringToPyBytes(const std::string& str) {
return PyBytes_FromStringAndSize(str.c_str(), str.size());
}
template <typename T>
inline void MakeStringInternal(std::stringstream& ss, const T& t){ ss << t; }
inline void MakeStringInternal(std::stringstream& ss, const T& t) { ss << t; }
template <typename T,typename ... Args>
inline void MakeStringInternal(std::stringstream& ss, const T& t, const Args& ... args) {
......@@ -124,7 +124,7 @@ class NumpyFeeder : public TensorFeederBase {
Tensor* tensor) override {
PyArrayObject* array = PyArray_GETCONTIGUOUS(original_array);
const TypeMeta& meta = NumpyTypeToDragon(PyArray_TYPE(array));
if (meta.id() == 0){
if (meta.id() == 0) {
PyErr_SetString(PyExc_TypeError, "numpy data type is not supported.");
return nullptr;
}
......
......@@ -61,13 +61,13 @@ inline PyObject* MPICreateGroupCC(PyObject* self, PyObject* args) {
for (int i = 0; i < world_size; i++) all_ranks.insert(i);
local_group = world_group;
// check include ranks
// check include ranks
int size = PyList_Size(incl);
if (size > 0){
if (size > 0) {
all_ranks.clear();
unique_ptr<int> incl_ranks(new int[size]);
int* ranks = incl_ranks.get();
for (int i = 0; i < size; i++){
for (int i = 0; i < size; i++) {
ranks[i] = _PyInt_AsInt(PyList_GetItem(incl, i));
all_ranks.insert(ranks[i]);
}
......@@ -75,13 +75,13 @@ inline PyObject* MPICreateGroupCC(PyObject* self, PyObject* args) {
CHECK(err_code == MPI_SUCCESS) << "failed to create mpi group.";
}
// check exclude ranks
// check exclude ranks
size = PyList_Size(excl);
if (size > 0) {
all_ranks.clear(); Set<int> tmp;
unique_ptr<int> excl_ranks(new int[size]);
int* ranks = excl_ranks.get();
for (int i = 0; i < size; i++){
for (int i = 0; i < size; i++) {
ranks[i] = _PyInt_AsInt(PyList_GetItem(excl, i));
tmp.insert(ranks[i]);
}
......@@ -97,7 +97,7 @@ inline PyObject* MPICreateGroupCC(PyObject* self, PyObject* args) {
if (local_comm != MPI_COMM_NULL) {
int world_rank, local_size;
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
if (world_rank == local_root){
if (world_rank == local_root) {
MPI_Comm_size(local_comm, &local_size);
std::stringstream ss;
ss << "Rank[" << world_rank << "]: "
......
# --------------------------------------------------------
# Dragon
# Copyright(c) 2017 SeetaTech
# Written by Ting Pan
# --------------------------------------------------------
from .data_batch import DataBatch
# --------------------------------------------------------
# Caffe for Dragon
# Dragon
# Copyright(c) 2017 SeetaTech
# Written by Ting Pan
# --------------------------------------------------------
......@@ -10,7 +10,7 @@ from six.moves import range as xrange
from dragon.config import logger
from .__init__ import GetProperty
from .utils import GetProperty
class BlobFetcher(Process):
def __init__(self, **kwargs):
......@@ -30,16 +30,16 @@ class BlobFetcher(Process):
atexit.register(cleanup)
def im_list_to_blob(self):
datum = self.Q_in.get() # (h, w, BGR)
im = datum[0]; h, w, c = im.shape
im_blob = np.zeros((self._batch_size, h, w, c), dtype=np.float32)
datum = self.Q_in.get()
im_blob = []
label_blob = np.zeros((self._batch_size, len(datum[1])), dtype=np.float32) \
if len(datum) > 1 else None
for i in xrange(0, self._batch_size):
im_blob[i, 0:h, 0:w, :] = datum[0]
im_blob.append(datum[0])
if label_blob is not None: label_blob[i, :] = datum[1]
if i != self._batch_size - 1: datum = self.Q_in.get()
channel_swap = (0, 3, 1, 2)
im_blob = np.array(im_blob, dtype=np.float32)
im_blob = im_blob.transpose(channel_swap)
return (im_blob, label_blob)
......
# --------------------------------------------------------
# Caffe for Dragon
# Dragon
# Copyright(c) 2017 SeetaTech
# Written by Ting Pan
# --------------------------------------------------------
import sys
import time
import pprint
from multiprocessing import Queue
if sys.version_info >= (3,0):
from queue import Queue as Queue2
else:
from Queue import Queue as Queue2
import threading
from six.moves import range as xrange
import dragon.core.mpi as mpi
......@@ -16,10 +22,11 @@ from .data_reader import DataReader
from .data_transformer import DataTransformer
from .blob_fetcher import BlobFetcher
from .__init__ import GetProperty
from .utils import GetProperty
class DataBatch(object):
class DataBatch(threading.Thread):
def __init__(self, **kwargs):
super(DataBatch, self).__init__()
"""DataBatch use Triple-Buffering to speed up"""
......@@ -35,10 +42,10 @@ class DataBatch(object):
kwargs['group_size'] = group_size
# configuration
self._prefetch = GetProperty(kwargs, 'prefetch', 40)
self._prefetch = GetProperty(kwargs, 'prefetch', 5)
self._num_readers = GetProperty(kwargs, 'num_readers', 1)
self._num_transformers = GetProperty(kwargs, 'num_transformers', -1)
self._num_fetchers = GetProperty(kwargs, 'num_fetchers', 3)
self._num_fetchers = GetProperty(kwargs, 'num_fetchers', 1)
# default policy
if self._num_transformers == -1:
......@@ -60,6 +67,7 @@ class DataBatch(object):
self.Q_level_1 = Queue(self._prefetch * self._num_readers * self._batch_size)
self.Q_level_2 = Queue(self._prefetch * self._num_readers * self._batch_size)
self.Q_level_3 = Queue(self._prefetch * self._num_readers)
self.Q_level_4 = Queue2(self._prefetch * self._num_readers)
# init readers
self._readers = []
......@@ -102,11 +110,16 @@ class DataBatch(object):
self._fetchers.append(fetcher)
time.sleep(0.1)
self.daemon = True
self.start()
#self.echo()
@property
def blobs(self):
return self.Q_level_3.get()
def run(self):
while True:
self.Q_level_4.put(self.Q_level_3.get())
def get(self):
return self.Q_level_4.get()
def echo(self):
logger.info('---------------------------------------------------------')
......
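A minimal consumption sketch for the threaded DataBatch above, not part of this commit (the kwargs and LMDB path are placeholders; get() blocks on Q_level_4 until a prepared batch is ready):

    batch = DataBatch(source='/data/train_lmdb', batch_size=64)   # hypothetical kwargs
    im_blob, label_blob = batch.get()   # (N, C, H, W) image array and the matching labels, taken from Q_level_4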
# --------------------------------------------------------
# Caffe for Dragon
# Dragon
# Copyright(c) 2017 SeetaTech
# Written by Ting Pan
# --------------------------------------------------------
......@@ -12,8 +12,7 @@ import dragon.config as config
from dragon.config import logger
from dragon.tools.db import LMDB
from .__init__ import GetProperty
from .utils import GetProperty
class DataReader(Process):
def __init__(self, **kwargs):
......
# --------------------------------------------------------
# Caffe for Dragon
# Dragon
# Copyright(c) 2017 SeetaTech
# Written by Ting Pan
# --------------------------------------------------------
......@@ -12,7 +12,7 @@ import dragon.config as config
from dragon.config import logger
import dragon.vm.caffe.proto.caffe_pb2 as pb
from .__init__ import GetProperty
from .utils import GetProperty
try:
import cv2
......@@ -130,6 +130,4 @@ class DataTransformer(Process):
npr.seed(self._random_seed)
while True:
serialized = self.Q_in.get()
self.Q_out.put(self.transform_image_label(serialized))
self.Q_out.put(self.transform_image_label(serialized))
\ No newline at end of file
# --------------------------------------------------------
# Caffe for Dragon
# Dragon
# Copyright(c) 2017 SeetaTech
# Written by Ting Pan
# --------------------------------------------------------
def GetProperty(kwargs, name, default):
return kwargs[name] \
if name in kwargs else default
\ No newline at end of file
if name in kwargs else default
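A minimal usage sketch of GetProperty, not part of this commit (the kwargs below are illustrative):

    kwargs = {'source': '/data/train_lmdb', 'batch_size': 64}
    prefetch = GetProperty(kwargs, 'prefetch', 5)       # key missing, falls back to the default: 5
    batch_size = GetProperty(kwargs, 'batch_size', 1)   # key present, returns 64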
# --------------------------------------------------------
# Caffe for Dragon
# Dragon
# Copyright(c) 2017 SeetaTech
# Written by Ting Pan
# --------------------------------------------------------
import dragon.vm.caffe as caffe
import dragon.core.workspace as ws
from .minibatch import DataBatch
from dragon.io.data_batch import DataBatch
class DataLayer(caffe.Layer):
def setup(self, bottom, top):
class MiniBatchOp(object):
def setup(self, inputs, outputs):
kwargs = eval(self.param_str)
self._data_batch = DataBatch(**kwargs)
def forward(self, bottom, top):
blobs = self._data_batch.blobs
def run(self, inputs, outputs):
blobs = self._data_batch.get()
for idx, blob in enumerate(blobs):
ws.FeedTensor(top[idx], blob)
\ No newline at end of file
ws.FeedTensor(outputs[idx], blob)
\ No newline at end of file
......@@ -10,7 +10,6 @@ from dragon.operators.utils import Run
def LMDBData(**kwargs):
"""
:param kwargs: a dict of imagenet data params
:param --> mean_value: a list of mean values for channels [B-G-R]
:param --> source: a str of the images root directory
:param --> imageset: a str of text file contains image name / label
......@@ -30,8 +29,8 @@ def LMDBData(**kwargs):
args = locals(); kwargs = args['kwargs']
del args['kwargs']; kwargs = dict(args, **kwargs)
kwargs['module'] = 'dragon.vm.caffe.io.data_layer'
kwargs['op'] = 'DataLayer'
kwargs['module'] = 'dragon.operators.custom.minibatch'
kwargs['op'] = 'MiniBatchOp'
return Run([], param_str=str(kwargs), nout=2, **kwargs)
......
# --------------------------------------------------------
# Dragon
# Copyright(c) 2017 SeetaTech
# Written by Ting Pan
# --------------------------------------------------------
""" Generate LMDB from images """
import os
import sys
import time
import shutil
import argparse
import cv2
try:
import numpy as np
except: pass
from dragon.tools.db import LMDB
from dragon.vm.caffe.proto import caffe_pb2
def resize_image(im, resize):
if im.shape[0] > im.shape[1]:
newsize = (resize, im.shape[0] * resize / im.shape[1])
else:
newsize = (im.shape[1] * resize / im.shape[0], resize)
im = cv2.resize(im, newsize)
return im
def make_db(args):
if not os.path.isfile(args.list):
raise ValueError('the path of the image list is invalid.')
if os.path.isdir(args.database):
raise ValueError('the database already exists or the path is invalid.')
print('start time: ', time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime()))
db = LMDB(max_commit=10000)
db.open(args.database, mode='w')
total_line = sum(1 for line in open(args.list))
count = 0
zfill_flag = '{0:0%d}' % (args.zfill)
encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), args.quality]
start_time = time.time()
with open(args.list, 'r') as input_file:
records = input_file.readlines()
if args.shuffle:
import random
random.shuffle(records)
for record in records:
count += 1
if count % 10000 == 0:
now_time = time.time()
print('{0} / {1} in {2:.2f} sec'.format(
count, total_line, now_time - start_time))
db.commit()
record = record.split()
path = record[0]
label = record[1]
img = cv2.imread(os.path.join(args.root, path))
if args.resize > 0:
img = resize_image(img, args.resize)
if args.pad > 0:
pad_img = np.zeros((img.shape[0] + 2 * args.pad,
img.shape[1] + 2 * args.pad, 3), dtype=img.dtype)
pad_img[args.pad : args.pad + img.shape[0],
args.pad : args.pad + img.shape[1], :] = img
img = pad_img
result, imgencode = cv2.imencode('.jpg', img, encode_param)
datum = caffe_pb2.Datum()
datum.height, datum.width, datum.channels = img.shape
datum.label = int(label)
datum.encoded = True
datum.data = imgencode.tostring()
db.put(zfill_flag.format(count - 1), datum.SerializeToString())
now_time = time.time()
print('{0} / {1} in {2:.2f} sec'.format(count, total_line, now_time - start_time))
db.put('size', str(count))
db.put('zfill', str(args.zfill))
db.commit()
db.close()
shutil.copy(args.list, args.database + '/image_list.txt')
end_time = time.time()
print('{0} images have been stored in the database.'.format(total_line))
print('This task finishes within {0:.2f} seconds.'.format(end_time - start_time))
print('The size of database is {0} MB.'.
format(float(os.path.getsize(args.database + '/data.mdb') / 1000 / 1000)))
def parse_args():
parser = argparse.ArgumentParser(description='Create LMDB from images for classification.')
parser.add_argument('--root', help='the root folder of raw images')
parser.add_argument('--list', help='the filepath of image list')
parser.add_argument('--database', help='the filepath of database')
parser.add_argument('--zfill', type=int, default=8, help='zfill for the key of database')
parser.add_argument('--resize', type=int, default=0, help='resize the shorter edge of image to the newsize')
parser.add_argument('--pad', type=int, default=0, help='zero-pad the image')
parser.add_argument('--quality', type=int, default=95, help='JPEG quality for encoding, 1-100')
parser.add_argument('--shuffle', type=bool, default=True, help='randomize the order of records in the list file')
if len(sys.argv) < 4:
parser.print_help()
sys.exit(1)
args = parser.parse_args()
return args
if __name__ == '__main__':
args = parse_args()
make_db(args)
\ No newline at end of file
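A hypothetical invocation of the script above (the script filename and paths are placeholders, not from this commit):

    python make_image_db.py --root /data/imagenet/train \
                            --list /data/imagenet/train_list.txt \
                            --database /data/imagenet/train_lmdb \
                            --resize 256 --zfill 8 --quality 95

Each line of the list file is expected to be '<relative_image_path> <integer_label>', matching the record.split() parsing in make_db().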
# --------------------------------------------------------
# Dragon
# Copyright(c) 2017 SeetaTech
# Written by Ting Pan
# --------------------------------------------------------
from google.protobuf.message import Message
from protos import dragon_pb2 as pb
import numpy as np
def MakeArgument(key, value):
argument = pb.Argument()
argument.name = key
if type(value) is float: argument.f = value
elif type(value) is int : argument.i = value
elif type(value) is np.int64: argument.i64 = int(value)
elif type(value) is str: argument.s = value
elif type(value) is unicode: argument.s = value
elif type(value) is bool: argument.b = value
elif isinstance(value, Message): argument.s = value.SerializeToString()
elif all(type(v) is float for v in value): argument.floats.extend(value)
elif all(type(v) is int for v in value): argument.ints.extend(value)
elif all(type(v) is str for v in value): argument.strings.extend(value)
elif all(type(v) is unicode or type(v) is str for v in value): argument.strings.extend(value)
elif all(isinstance(v,Message) for v in value):
argument.strings.extend([v.SerializeToString() for v in value])
else: raise ValueError('unknown argument type: key={} value={} value type={}' \
.format(key,value,type(value)))
return argument
def MakeOperatorDef(op_type, inputs, outputs, name='',
device_option=None, arg=None, engine=None, **kwargs):
operator = pb.OperatorDef()
operator.type = op_type
operator.name = name
operator.input.extend([str(tensor) for tensor in inputs])
operator.output.extend([str(tensor) for tensor in outputs])
if device_option is not None:
operator.device_option.CopyFrom(device_option)
if engine is not None:
operator.engine = engine
if 'random_seed' in kwargs:
operator.device_option.random_seed = kwargs['random_seed']
del kwargs['random_seed']
if arg is not None:
operator.arg.extend(arg)
for k,v in kwargs.items():
if v is None: continue
operator.arg.add().CopyFrom(MakeArgument(k,v))
return operator
def MakeDeviceOption(device_type, gpu_id, rng_seed = None):
""" return a DeviceOption """
option = pb.DeviceOption()
option.device_type = device_type
option.gpu_id = gpu_id
if rng_seed is not None: option.random_seed = rng_seed
return option
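A minimal sketch of how the helpers above compose, not part of this commit (the op type, argument names, and device encoding are illustrative assumptions):

    option = MakeDeviceOption(1, 0, rng_seed=3)    # assumed encoding: device_type=1 for CUDA, gpu_id=0
    op = MakeOperatorDef('Relu', ['x'], ['y'],
                         name='relu_1', device_option=option,
                         slope=0.2)                # extra kwargs become pb.Argument entries via MakeArgument
    print(op)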
# make the python stdout unbuffered
class Unbuffered(object):
def __init__(self, stream):
self.stream = stream
def write(self, data):
self.stream.write(data)
self.stream.flush()
def __getattr__(self, attr):
return getattr(self.stream, attr)
# flush stdout after every write so MPI (C++ & Python) output is not buffered
import sys
sys.stdout = Unbuffered(sys.stdout)
\ No newline at end of file
......@@ -221,7 +221,7 @@ message SolverParameter {
// RMSProp decay value
// MeanSquare(t) = rms_decay*MeanSquare(t-1) + (1-rms_decay)*SquareGradient(t)
optional float rms_decay = 38;
optional float rms_decay = 38 [default = 0.99];
// If true, print information about the state of the net that may help with
// debugging learning problems.
......@@ -676,7 +676,7 @@ message DataParameter {
optional bool force_encoded_color = 9 [default = false];
// Prefetch queue (Number of batches to prefetch to host memory, increase if
// data access bandwidth varies).
optional uint32 prefetch = 10 [default = 40];
optional uint32 prefetch = 10 [default = 5];
}
message DropoutParameter {
......
......@@ -8,9 +8,9 @@ CPUObject CPUContext::cpu_object_;
CUDAObject CUDAContext::cuda_object_;
#endif // WITH_CUDA
// cpu <- gpu
// cpu <- gpu
template<> void CPUContext::Memcpy<CPUContext, CUDAContext>(
size_t nbytes, void* dst, const void* src){
size_t nbytes, void* dst, const void* src) {
#ifdef WITH_CUDA
CUDAContext ctx(POINTER_DEVICE(src));
ctx.Memcpy<CPUContext, CUDAContext>(nbytes, dst, src);
......@@ -19,9 +19,9 @@ template<> void CPUContext::Memcpy<CPUContext, CUDAContext>(
#endif
}
// gpu <- cpu
// gpu <- cpu
template<> void CPUContext::Memcpy<CUDAContext, CPUContext>(
size_t nbytes, void* dst, const void* src){
size_t nbytes, void* dst, const void* src) {
#ifdef WITH_CUDA
CUDAContext ctx(POINTER_DEVICE(dst));
ctx.Memcpy<CUDAContext, CPUContext>(nbytes, dst, src);
......
......@@ -151,13 +151,13 @@ GraphDef Graph::Prune(const GraphDef& graph_def) {
OperatorDef op_def;
op_def.CopyFrom(graph_def.op(it));
// handle inputs
for (int i = 0; i < graph_def.op(it).input_size(); i++){
for (int i = 0; i < graph_def.op(it).input_size(); i++) {
string input = graph_def.op(it).input(i);
if (!colored_[input] || !outputs.count(input))
*op_def.mutable_input(i) = "ignore";
}
// handle outputs
for (int i = 0; i < graph_def.op(it).output_size(); i++){
for (int i = 0; i < graph_def.op(it).output_size(); i++) {
string output = graph_def.op(it).output(i);
if (!colored_[output]) *op_def.mutable_output(i) = "ignore";
else outputs.insert(op_def.output(i));
......@@ -212,7 +212,7 @@ GraphDef Graph::MakeUpdate(const GraphDef& graph_def) {
for (int i = 0; i < graph_def.u_target_size(); i++) {
UpdateTarget target = graph_def.u_target(i);
vector<string> missing_tensors;
// missing check
// missing check
for (auto& tensor : target.tensor()) {
if (!ws()->HasTensor(tensor)) {
LOG(INFO) << "missing Tensor: " << tensor;
......
......@@ -18,12 +18,12 @@ CheckTuple GraphGradientMaker::CheckMissingGrad(OperatorDef* forward_op) {
string g_output = output + "_grad";
if (terms_.count(g_output)) g_output = terms_[g_output];
// check if having external grad first
// check if having external grad first
if (external_grads_.count(g_output))
inputs_to_grads_[output] = g_output;
// consider generate virtual grad
else if (targets_set_.count(output) && g_output != "ignore"){
// consider generate virtual grad
else if (targets_set_.count(output) && g_output != "ignore") {
gen_grads.push_back({ output, idx });
inputs_to_grads_[output] = g_output;
}
......@@ -36,7 +36,7 @@ CheckTuple GraphGradientMaker::CheckMissingGrad(OperatorDef* forward_op) {
if (forward_op->output_size() == 1) return { true, gen_grads };
}
}
// check passes, even if some grads are missing
// check passes, even if some grads are missing
return { false, gen_grads };
}
......@@ -50,7 +50,7 @@ GraphDef GraphGradientMaker::Make() {
Set<string> all_split_grads;
// PLAY for the forward
for (auto& op : forward_def_.op()){
for (auto& op : forward_def_.op()) {
if (NoGradientRegistry()->Has(op.type())) continue;
for (auto& input : op.input()) inputs_count[input]++;
}
......@@ -73,17 +73,17 @@ GraphDef GraphGradientMaker::Make() {
Gradient grad = MakeGradientForOp(*op, g_outputs);
// replace terms
for (auto& g_op : grad.ops){
for (auto& g_op : grad.ops) {
g_op.set_name(GetOperatorName());
for (int i = 0; i < g_op.input_size(); i++){
for (int i = 0; i < g_op.input_size(); i++) {
string* input = g_op.mutable_input(i);
if (terms_.count(*input)) *input = terms_[*input];
}
for (int i = 0; i < g_op.output_size(); i++){
for (int i = 0; i < g_op.output_size(); i++) {
string* output = g_op.mutable_output(i);
if (terms_.count(*output)) *output = terms_[*output];
}
for (int i = 0; i < grad.g_inputs.size(); i++){
for (int i = 0; i < grad.g_inputs.size(); i++) {
if (terms_.count(grad.g_inputs[i]))
grad.g_inputs[i] = terms_[grad.g_inputs[i]];
}
......@@ -106,14 +106,14 @@ GraphDef GraphGradientMaker::Make() {
string split_name = *output + "_autosplit_" + str(grads_count[*output]++);
if (!is_skip) all_split_grads.insert(split_name);
// gather
if (grads_count[*output] == inputs_count[original_name]){
if (grads_count[*output] == inputs_count[original_name]) {
gather_op = new OperatorDef();
gather_op->set_name(GetOperatorName());
gather_op->set_type("GradientGather");
gather_op->add_output(*output);
if (g_op.has_device_option())
gather_op->mutable_device_option()->CopyFrom(g_op.device_option());
for (int j = 0; j < grads_count[*output]; j++){
for (int j = 0; j < grads_count[*output]; j++) {
string key = *output + "_autosplit_" + str(j);
if (all_split_grads.count(key)) gather_op->add_input(key);
}
......@@ -123,7 +123,7 @@ GraphDef GraphGradientMaker::Make() {
}
}
// append ops
// append ops
if (!is_skip) {
if (gen_grads.size() > 0) {
vector<string> op_inputs, op_outputs;
......@@ -148,7 +148,7 @@ GraphDef GraphGradientMaker::Make() {
}
if (gather_op != nullptr) new_def_.add_op()->CopyFrom(*gather_op);
// done
// done
if (!is_skip) {
for (int i = 0; i < op->input_size(); i++) {
if (!grad.g_inputs[i].empty())
......
......@@ -66,7 +66,7 @@ DEFINE_REGISTRY(GradientRegistry, GradientMakerBase, const OperatorDef&, const v
DEFINE_REGISTRY(NoGradientRegistry, GradientMakerBase, const OperatorDef&, const vector<string>&);
#define INSTANTIATE_GET_SINGLE_ARGUMENT(T, fieldname) \
template <> T OperatorBase::GetSingleArg(const string& name, const T& default_value){ \
template <> T OperatorBase::GetSingleArg(const string& name, const T& default_value) { \
if(args_.count(name) == 0) { \
return default_value; \
} \
......@@ -82,7 +82,7 @@ INSTANTIATE_GET_SINGLE_ARGUMENT(int64_t, i64);
#define INSTANTIATE_GET_REPEATED_ARGUMENT(T, fieldname) \
template<> vector<T> OperatorBase::GetRepeatedArg<T>(const string& name){ \
template<> vector<T> OperatorBase::GetRepeatedArg<T>(const string& name) { \
if(args_.count(name) == 0) return vector<T>(); \
vector<T> values; \
for(const auto& v : args_[name]->fieldname()) values.push_back(v); \
......
......@@ -17,7 +17,7 @@ bool OpSchema::Verify(const OperatorDef& def) const {
}
for (int in = 0; in < def.input_size(); in++) {
if (def.input(in) == "ignore") continue;
for (int out = 0; out < def.output_size(); out++){
for (int out = 0; out < def.output_size(); out++) {
if (def.output(out) == "ignore") continue;
if (def.input(in) == def.output(out) && (!CheckInplace(in, out)))
LOG(FATAL) << "[" << def.name() << "] input("
......
......@@ -24,7 +24,7 @@ void CuDNNReluOp<Context>::RunWithType() {
template <class Context>
void CuDNNReluOp<Context>::RunOnDevice() {
// cudnn does not support LeakyRelu
// cudnn does not support LeakyRelu
if (this->slope != 0) return ReluOp<Context>::RunOnDevice();
output(0)->ReshapeLike(input(0));
......@@ -58,7 +58,7 @@ void CuDNNReluGradientOp<Context>::RunWithType() {
template <class Context>
void CuDNNReluGradientOp<Context>::RunOnDevice() {
// cudnn does not support LeakyRelu
// cudnn does not support LeakyRelu
if (this->slope != 0) return ReluGradientOp<Context>::RunOnDevice();
output(0)->ReshapeLike(input(0));
......
......@@ -71,10 +71,9 @@ void DropoutGradientOp<Context>::RunOnDevice() {
template <class Context>
void DropoutGradientOp<Context>::ClearAfterRun() {
ws()->ReleaseBuffer(mask);
ws()->ReleaseBuffer(mask, true);
}
DEPLOY_CPU(DropoutGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(DropoutGradient);
......
......@@ -125,7 +125,7 @@ void DotGradientOp<Context>::GemvRunWithType() {
}
template <class Context>
void DotGradientOp<Context>::RunOnDevice(){
void DotGradientOp<Context>::RunOnDevice() {
output(0)->ReshapeLike(input(0));
output(1)->ReshapeLike(input(1));
......
......@@ -65,7 +65,7 @@ void EltwiseGradientOp<Context>::SumRunWithType() {
auto* dYdata = input(-1).template data<T, Context>();
TIndex count = input(-1).count();
for (int i = 0; i < OutputSize(); i++){
for (int i = 0; i < OutputSize(); i++) {
if (output(i)->name() == "ignore") continue;
auto* dXdata = output(i)->template mutable_data<T, Context>();
if (coeffs[i] == float(1)) {
......
......@@ -8,7 +8,7 @@ template <class Context> template <typename T>
void GramMatrixOp<Context>::RunWithType() {
auto* Xdata = input(0).template data<T, Context>();
auto* Ydata = output(0)->template mutable_data<T, Context>();
for (int i = 0; i < outer_dim; i++){
for (int i = 0; i < outer_dim; i++) {
math::Gemm<T, Context>(CblasNoTrans, CblasTrans,
dim, dim, inner_dim, 1.0, Xdata, Xdata, 0.0, Ydata);
Xdata += x_offset;
......@@ -40,7 +40,7 @@ void GramMatrixGradientOp<Context>::RunWithType() {
auto* dYdata = input(-1).template data<T, Context>();
auto* Xdata = input(0).template data<T, Context>();
auto* dXdata = output(0)->template mutable_data<T, Context>();
for (int i = 0; i < outer_dim; i++){
for (int i = 0; i < outer_dim; i++) {
math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans,
dim, inner_dim, dim, 2.0, dYdata, Xdata, 0.0, dXdata);
dYdata += y_offset;
......@@ -49,7 +49,7 @@ void GramMatrixGradientOp<Context>::RunWithType() {
}
template <class Context>
void GramMatrixGradientOp<Context>::RunOnDevice(){
void GramMatrixGradientOp<Context>::RunOnDevice() {
outer_dim = input(0).count(0, axis);
dim = input(0).dim(axis);
inner_dim = input(0).count(axis + 1);
......
......@@ -81,7 +81,7 @@ void MatmulGradientOp<Context>::RunWithType() {
}
template <class Context>
void MatmulGradientOp<Context>::RunOnDevice(){
void MatmulGradientOp<Context>::RunOnDevice() {
CHECK(input(0).ndim() == input(1).ndim())
<< "both matrices must have the same number of dimensions.";
CHECK_GE(input(0).ndim(), 2)
......
......@@ -9,7 +9,7 @@ void PowOp<Context>::RunWithType() {
TIndex count = input(0).count();
auto* Ydata = output(0)->template mutable_data<T, Context>();
if (power_scale == float(0)){
if (power_scale == float(0)) {
float value = (power == float(0)) ? float(1) : pow(shift, power);
math::Set<T, Context>(count, dragon_cast<T, float>(value), Ydata);
return;
......
......@@ -85,12 +85,12 @@ void ScaleGradientOp<Context>::ScaleRunWithType() {
T* SRes_data = nullptr;
if (inner_dim == 1) {
SRes_data = tmp_data;
} else if (sum_result.count() == 1) { // handle inner only
} else if (sum_result.count() == 1) { // handle inner only
dScale = output(1)->template mutable_data<T, CPUContext>();
T result = math::Dot<T, Context>(inner_dim, tmp_data, SMul_data);
*dScale += result;
} else {
SRes_data = (outer_dim == 1) ? // handle scale only
SRes_data = (outer_dim == 1) ? // handle scale only
dScale : sum_result.template mutable_data<T, Context>();
math::Gemv<T, Context>(CblasNoTrans, sum_result.count(), inner_dim,
1.0,
......@@ -99,7 +99,7 @@ void ScaleGradientOp<Context>::ScaleRunWithType() {
SRes_data);
}
if (outer_dim != 1) {
if (scale_dim == 1) { // handle outer only
if (scale_dim == 1) { // handle outer only
T result = math::Dot<T, Context>(outer_dim, SMul_data, SRes_data);
*dScale += result;
} else {
......
......@@ -12,7 +12,7 @@ void SquareOp<Context>::RunWithType() {
}
template <class Context>
void SquareOp<Context>::RunOnDevice(){
void SquareOp<Context>::RunOnDevice() {
output(0)->ReshapeLike(input(0));
if (input(0).template IsType<float>()) RunWithType<float>();
......
......@@ -44,7 +44,7 @@ void SubOp<Context>::BroadcastRunWithType(int type) {
}
template <class Context>
void SubOp<Context>::RunOnDevice(){
void SubOp<Context>::RunOnDevice() {
output(0)->ReshapeLike(input(0));
if (input(0).dims() == input(1).dims()) {
......
......@@ -6,7 +6,7 @@ namespace dragon {
template <class Context> template <typename T>
void ArgmaxOp<Context>::RunWithType() {
if (top_k != 1) {
// it's difficult to implement device code when top_k > 1
// it's difficult to implement device code when top_k > 1
auto* Xdata = input(0).template data<T, CPUContext>();
auto* Ydata = output(0)->template mutable_data<T, CPUContext>();
kernel::Argmax<T, CPUContext>(count, input(0).dim(axis), inner_dim,
......
......@@ -7,7 +7,7 @@ namespace dragon {
template <class Context> template <typename T>
void ConcatOp<Context>::RunWithType() {
auto* Ydata = output(0)->template mutable_data<T, Context>();
for (int i = 0; i < nin; i++){
for (int i = 0; i < nin; i++) {
auto* Xdata = input(i).template data<T, Context>();
TIndex count = input(i).count();
x_concat_dim = input(i).dim(axis);
......@@ -25,12 +25,12 @@ void ConcatOp<Context>::RunWithType() {
}
template <class Context>
void ConcatOp<Context>::RunOnDevice(){
void ConcatOp<Context>::RunOnDevice() {
concat_dims = input(0).dims();
for (int i = 1; i < nin; i++) {
CHECK_EQ(concat_dims.size(), input(i).ndim())
<< "\nall inputs must have the same ndim.";
for (int j = 0; j < concat_dims.size(); j++){
for (int j = 0; j < concat_dims.size(); j++) {
if (j == axis) continue;
CHECK_EQ(concat_dims[j], input(i).dim(j))
<< "\nall inputs must have the same dims"
......@@ -82,7 +82,7 @@ void ConcatGradientOp<Context>::RunWithType() {
}
template <class Context>
void ConcatGradientOp<Context>::RunOnDevice(){
void ConcatGradientOp<Context>::RunOnDevice() {
if (input(-1).name() == "ignore") return;
concat_dims = input(-1).dims();
y_concat_dim = concat_dims[axis];
......
......@@ -27,7 +27,7 @@ OPERATOR_SCHEMA(Flatten).NumInputs(1).NumOutputs(1);
template <class Context>
void FlattenGradientOp<Context>::RunOnDevice(){
void FlattenGradientOp<Context>::RunOnDevice() {
output(0)->ReshapeLike(input(0));
output(0)->Share(input(1));
}
......
......@@ -8,7 +8,7 @@ RunOp<Context>::RunOp(const OperatorDef& op_def, Workspace* ws)
module(OperatorBase::GetSingleArg<string>("module", "")),
op(OperatorBase::GetSingleArg<string>("op", "")),
param_str((OperatorBase::GetSingleArg<string>("param_str", ""))) {
// init interpreter & load module
// init interpreter & load module
Py_Initialize();
PyObject* py_module = PyImport_ImportModule(module.c_str());
CHECK(py_module) << "\ncan not import py module: " << module;
......@@ -18,11 +18,11 @@ RunOp<Context>::RunOp(const OperatorDef& op_def, Workspace* ws)
<< " from module: " << module;
self = PyObject_CallObject(py_op, NULL);
// pass param string
// pass param string
PyObject_SetAttr(self, String("param_str"), String(param_str.c_str()));
PyObject_SetAttr(self, String("param_str_"), String(param_str.c_str()));
// build inputs and outputs for Python
// build inputs and outputs for Python
inputs = PyList_New(InputSize());
for (int i = 0; i < InputSize(); i++)
PyList_SetItem(inputs, i, String(input(i).name().c_str()));
......@@ -31,21 +31,21 @@ RunOp<Context>::RunOp(const OperatorDef& op_def, Workspace* ws)
PyList_SetItem(outputs, i, String(output(i)->name().c_str()));
if (!this->allow_run()) return;
// setup
// setup
if (PyObject_HasAttr(self, String("setup")))
PyObject_CallMethod(self, "setup", "OO", inputs, outputs);
}
template <class Context>
void RunOp<Context>::RunOnDevice() {
// init phase
// init phase
PyObject_SetAttr(self, String("phase"), String(this->phase().c_str()));
// reshape
// reshape
if (PyObject_HasAttr(self, String("reshape")))
PyObject_CallMethod(self, "reshape", "OO", inputs, outputs);
// run
// run
if (PyObject_HasAttr(self, String("forward"))) {
PyObject_CallMethod(self, "forward", "OO", inputs, outputs);
} else if (PyObject_HasAttr(self, String("run"))) {
......@@ -63,14 +63,14 @@ NO_GRADIENT(Run);
template <class Context>
void TemplateGradientOp<Context>::RunOnDevice() {
// init phase
// init phase
PyObject_SetAttr(this->self, String("phase"), String(this->phase().c_str()));
// reshape
// reshape
if (PyObject_HasAttr(this->self, String("reshape")))
PyObject_CallMethod(this->self, "reshape", "OO", this->inputs, this->outputs);
// run
// run
if (PyObject_HasAttr(this->self, String("backward"))) {
PyObject_CallMethod(this->self, "forward", "OO", this->inputs, this->outputs);
} else if (PyObject_HasAttr(this->self, String("grad"))) {
......
......@@ -78,7 +78,7 @@ void ReduceGradientOp<Context>::SumRunWithType() {
template <class Context> template <typename T>
void ReduceGradientOp<Context>::MeanRunWithType() {
auto* dXdata = output(0)->template mutable_data<T, Context>();
if (axis == -1){
if (axis == -1) {
auto* dYdata = input(-1).template data<T, CPUContext>();
math::Set<T, Context>(output(0)->count(), dYdata[0] / input(0).count(), dXdata);
} else {
......
......@@ -8,18 +8,18 @@ void ReshapeOp<Context>::RunOnDevice() {
int infer_dim = -1;
TIndex total_count = 1;
for (int i = 0; i < shape.size(); i++) {
// handle unchanged dim
// handle unchanged dim
if (shape[i] == 0) {
CHECK_LT(i, (int)Xdims.size())
<< "\ndim(" << i << ") is out of the Xdims range of (0, "
<< Xdims.size() << ").";
new_shape[i] = Xdims[i];
}
// handle explicitly set dim
// handle explicitly set dim
else if (shape[i] > 0) {
new_shape[i] = shape[i];
}
// handle inferred dim
// handle inferred dim
else {
CHECK_EQ(infer_dim, -1)
<< "\ndim(" << infer_dim << ") required infer before"
......@@ -30,7 +30,7 @@ void ReshapeOp<Context>::RunOnDevice() {
if (new_shape[i] != -1) total_count *= new_shape[i];
}
// solve inferred dim if necessary
// solve inferred dim if necessary
if (infer_dim != -1) {
for (int i = 0; i < new_shape.size(); i++) {
if (new_shape[i] == -1) {
......
......@@ -25,7 +25,7 @@ void ScanOp<Context>::InitTemplate() {
slice_def.add_arg()->CopyFrom(arg_nout);
template_def.mutable_device_option()->CopyFrom(op_def().device_option());
template_def.set_debug_mode(debug_mode);
// init for the first step
// init for the first step
for (int i = 0; i < nseqs; i++) {
OperatorDef* op = template_def.add_op();
op->CopyFrom(slice_def);
......@@ -37,19 +37,19 @@ void ScanOp<Context>::InitTemplate() {
OperatorDef* op = template_def.add_op();
op->CopyFrom(func_def.op(i));
op->set_name(name() + "(BodyOp." + str(i + nseqs) + ")@1");
// replace inputs term
// replace inputs term
for (int j = 0; j < op->input_size(); j++) {
string* input = op->mutable_input(j);
if (terms.count(*input)) *input = terms[*input];
}
// replace outputs term
// replace outputs term
for (int j = 0; j < op->output_size(); j++) {
string* output = op->mutable_output(j);
terms[*output] = *output + "@1";
*output = terms[*output];
}
}
// handle pre outputs
// handle pre outputs
for (int i = 0; i < nout; i++) {
if (default_outputs[i].empty()) continue;
terms[default_outputs[i]] = func_def.target(i) + "@1";
......@@ -59,15 +59,15 @@ void ScanOp<Context>::InitTemplate() {
template <class Context>
void ScanOp<Context>::UpdateTerms(int cur_step) {
string prev, now;
// update sequences term
// update sequences term
for (int i = 0; i < nseqs; i++) {
prev = input(i).name() + "@" + str(cur_step - 1);
now = input(i).name() + "@" + str(cur_step);
terms[prev] = now;
}
if (cur_step < 3) return;
// update recurrent term
// only support the latest one step (as Theano does)
// update recurrent term
// only support the latest one step (as Theano does)
for (int i = 0; i < nout; i++) {
if (default_outputs[i].empty()) continue;
prev = output(i)->name() + "@" + str(cur_step - 2);
......@@ -93,15 +93,15 @@ void ScanOp<Context>::UnrollTemplate() {
for (int idx = 0; idx < nseqs; idx++) {
OperatorDef *op = new_def.mutable_op(idx);
int nslices = input(idx).dim(axis);
// alter the num of slices for all sequences
// alter the num of slices for all sequences
op->mutable_arg(1)->set_i(nslices);
// add slices as outputs
// add slices as outputs
for (int t = 1; t <= nslices; t++) {
string slice = op->input(0) + "@" + str(t);
op->add_output(slice);
}
}
// main loop
// main loop
for (int t = 2; t <= nsteps; t++) {
UpdateTerms(t);
int copy_r = new_def.op_size(), copy_l = copy_r - nrepeats;
......@@ -109,12 +109,12 @@ void ScanOp<Context>::UnrollTemplate() {
OperatorDef* op = new_def.add_op();
op->CopyFrom(new_def.op(idx));
op->set_name(SplitString(op->name(), "@")[0] + "@" + str(t));
// replace inputs
// replace inputs
for (int j = 0; j < op->input_size(); j++) {
string* input = op->mutable_input(j);
if (terms.count(*input)) *input = terms[*input];
}
// replace outputs
// replace outputs
for (int j = 0; j < op->output_size(); j++) {
string* output = op->mutable_output(j);
terms[*output] = SplitString(*output, "@")[0] + "@" + str(t);
......@@ -123,9 +123,9 @@ void ScanOp<Context>::UnrollTemplate() {
}
}
for (int i = 0; i < nout; i++) {
// solve the last step only
// solve the last step only
new_def.add_target(func_def.target(i) + "@" + str(nsteps));
// concat all steps if necessary
// concat all steps if necessary
if (output(i)->name() == "ignore") continue;
OperatorDef* op = new_def.add_op();
op->set_name(name() + "(BodyOp." + str(nseqs + nrepeats + i) + ")");
......@@ -138,10 +138,10 @@ void ScanOp<Context>::UnrollTemplate() {
for (int t = 1; t <= nsteps; t++)
op->add_input(output(i)->name() + "@" + str(t));
op->add_output(output(i)->name());
// solve all the steps
// solve all the steps
new_def.add_target(output(i)->name());
}
// upload
// upload
Tensor* string_tensor = ws()->CreateTensor("_t_" + anchor() + "_raw_ops");
string_tensor->Reshape(vector<TIndex>(1, 1));
string* data = string_tensor->mutable_data <string, CPUContext>();
......@@ -150,13 +150,11 @@ void ScanOp<Context>::UnrollTemplate() {
template <class Context>
void ScanOp<Context>::RunOnDevice() {
// unroll
UnrollTemplate();
if (!graphs.count(nsteps))
if (!graphs.count(nsteps)) {
graphs[nsteps].reset(new Graph(new_def, ws()));
}
cur_graph = graphs[nsteps].get();
// forward
cur_graph->Run("", "");
}
......@@ -201,13 +199,11 @@ void ScanGradientOp<Context>::MakeGradientOps() {
template <class Context>
void ScanGradientOp<Context>::RunOnDevice() {
// make graph
MakeGradientOps();
if (!graphs.count(nsteps))
if (!graphs.count(nsteps)) {
graphs[nsteps].reset(new Graph(new_def, ws()));
}
cur_graph = graphs[nsteps].get();
// backward
cur_graph->Run("Gradient", "");
}
......
......@@ -7,7 +7,7 @@ namespace dragon {
template <class Context> template <typename T>
void SliceOp<Context>::RunWithType() {
auto* Xdata = input(0).template data<T, Context>();
for (int i = 0; i < nout; i++){
for (int i = 0; i < nout; i++) {
auto* Ydata = output(i)->template mutable_data<T, Context>();
TIndex count = output(i)->count();
kernel::Slice<T, Context>(count, outer_dim, inner_dim,
......@@ -46,7 +46,7 @@ OPERATOR_SCHEMA(Slice).NumInputs(1).NumOutputs(1, INT_MAX);
template <class Context> template <typename T>
void SliceGradientOp<Context>::RunWithType() {
auto* dXdata = output(0)->template mutable_data<T, Context>();
for (int i = 0; i < nout; i++){
for (int i = 0; i < nout; i++) {
if (input(i + 1).name() == "ignore") continue;
auto* dYdata = input(i + 1).template data<T, Context>();
TIndex count = input(i + 1).count();
......
......@@ -60,11 +60,11 @@ void AccuracyOp<Context>::RunWithType() {
}
}
count++;
} // end inner_num
} // end inner_num
} // end outer_num
output(0)->template mutable_data<T, CPUContext>()[0] = acc / count;
if (OutputSize() > 1){
if (OutputSize() > 1) {
auto* acc_per_class = output(1)->template mutable_data<T, CPUContext>();
for (int i = 0; i < classes; i++)
acc_per_class[i] = num_per_class[i] == 0 ? 0 : acc_per_class[i] / num_per_class[i];
......
......@@ -28,7 +28,7 @@ void L1LossOp<Context>::RunWithType() {
}
template <class Context>
void L1LossOp<Context>::RunOnDevice(){
void L1LossOp<Context>::RunOnDevice() {
CHECK_EQ(input(0).count(), input(1).count());
output(0)->Reshape(vector<TIndex>(1, 1));
diff = ws()->CreateTensor("_t_" + anchor() + "_l1_loss_diff");
......@@ -53,7 +53,7 @@ void L1LossGradientOp<Context>::RunWithType() {
else if (normalization == "FULL") normalizer = input(0).count();
else if (normalization == "NONE") normalizer = 1;
alpha = alpha / normalizer;
for (int i = 0; i < 2; i++){
for (int i = 0; i < 2; i++) {
if (output(i)->name() == "ignore") continue;
output(i)->ReshapeLike(input(i));
auto* dXdata = output(i)->template mutable_data<T, Context>();
......
......@@ -11,7 +11,7 @@ void L2LossOp<Context>::RunWithType() {
auto* diff_data = diff->template mutable_data<T, Context>();
auto* Ydata = output(0)->template mutable_data<T, CPUContext>();
math::Sub<T, Context>(diff->count(), X0data, X1data, diff_data);
if (InputSize() > 2){
if (InputSize() > 2) {
CHECK_EQ(input(0).count(), input(2).count());
auto* Wdata = input(2).template data<T, Context>();
math::Mul<T, Context>(diff->count(), Wdata, diff_data, diff_data);
......
......@@ -14,12 +14,12 @@ void SmoothL1LossOp<Context>::RunWithType() {
auto* Ydata = output(0)->template mutable_data<T, CPUContext>();
math::Sub<T, Context>(diff->count(), X0data, X1data, diff_data);
if (InputSize() > 2){
if (InputSize() > 2) {
auto* inside_w_data = input(2).template data<T, Context>();
math::Mul<T, Context>(diff->count(), inside_w_data, diff_data, diff_data);
}
kernel::SmoothL1<T, Context>(diff->count(), sigma2, diff_data, error_data);
if (InputSize() > 3){
if (InputSize() > 3) {
auto* outside_w_data = input(3).template data<T, Context>();
math::Mul<T, Context>(diff->count(), outside_w_data, error_data, error_data);
}
......
......@@ -69,7 +69,7 @@ void SoftmaxCrossEntropyLossGradientOp<Context>::RunWithType() {
return;
}
// normalize
// normalize
T normalizer;
if (normalization == "BATCH_SIZE") normalizer = outer_dim;
else if (normalization == "FULL") normalizer = outer_dim * inner_dim;
......
......@@ -26,14 +26,14 @@ void MPIBroadcastOp<Context>::RunWithType() {
}
template <class Context>
void MPIBroadcastOp<Context>::RunOnDevice(){
void MPIBroadcastOp<Context>::RunOnDevice() {
CHECK(this->comm != MPI_COMM_NULL)
<< "\nMPIBroadcastOp, name: " << name()
<< ", does not belong to any group, can't run.";
size_t ndim[1];
TIndex* dims = nullptr;
if (this->comm_rank == this->comm_root){
if (this->comm_rank == this->comm_root) {
ndim[0] = input(0).ndim();
dims = new TIndex[ndim[0]];
for (int i = 0; i < input(0).ndim(); i++)
......@@ -90,7 +90,7 @@ void MPIBroadcastGradientOp<Context>::RunWithType() {
}
template <class Context>
void MPIBroadcastGradientOp<Context>::RunOnDevice(){
void MPIBroadcastGradientOp<Context>::RunOnDevice() {
output(0)->ReshapeLike(input(-1));
if (input(0).template IsType<float>()) RunWithType<float>();
......
......@@ -51,7 +51,7 @@ template <class Context> template <typename T>
void MPIGatherGradientOp<Context>::RunWithType() {
if (this->comm_rank == this->comm_root) {
output(0)->Share(input(this->comm_rank + 1));
for (int i = 0; i < this->comm_size; i++){
for (int i = 0; i < this->comm_size; i++) {
if (i == this->comm_root) continue;
#ifdef WITH_CUDA_AWARE
auto* dYdata = input(this->comm_rank + 1).template data<T, Context>();
......
......@@ -120,7 +120,7 @@ void BatchNormOp<Context>::RunOnDevice() {
if (use_stats == -1) use_global_stats = phase() == "TEST" ? true : false;
else use_global_stats = use_stats == 1 ? true : false;
// if true, Act/Exp/Pow/Norm ops can not precede this op during training
// if true, Act/Exp/Pow/Norm ops can not precede this op during training
if (inplace) output(0)->Share(input(0));
if (input(0).template IsType<float>()) RunWithType<float>();
......@@ -171,7 +171,7 @@ void BatchNormGradientOp<Context>::RunWithType() {
auto* Ydata = input(-2).template data<T, Context>();
math::Mul<T, Context>(output(0)->count(), Ydata, dYdata, dXdata);
// sum(dE/dY \cdot Y)
// sum(dE/dY \cdot Y)
math::Gemv<T, Context>(CblasNoTrans, nbychans, spatial_dim,
1.0,
dXdata, SMul_data,
......@@ -193,10 +193,10 @@ void BatchNormGradientOp<Context>::RunWithType() {
0.0,
dXdata);
// sum(dE/dY \cdot Y) \cdot Y
// sum(dE/dY \cdot Y) \cdot Y
math::Mul<T, Context>(output(0)->count(), Ydata, dXdata, dXdata);
// sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y
// sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y
math::Gemv<T, Context>(CblasNoTrans, nbychans, spatial_dim,
1.0,
dYdata, SMul_data,
......@@ -224,7 +224,7 @@ void BatchNormGradientOp<Context>::RunWithType() {
-1.0 / (num * spatial_dim),
dXdata);
// divide by stddev
// divide by stddev
math::Div<T, Context>(output(0)->count(), dXdata, Std_data, dXdata);
// release buffer
......
......@@ -60,7 +60,7 @@ void BatchRenormOp<Context>::RunWithType() {
ctx().template Copy<T, Context, Context>(input(0).count(), Ydata, Xdata);
}
// subtract mean
// subtract mean
math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans, num, channels, 1,
1.0,
NMul_data, tMean_data,
......@@ -73,7 +73,7 @@ void BatchRenormOp<Context>::RunWithType() {
Ydata);
if (!use_global_stats) {
// Var(X) = E((X - EX) ^ 2)
// Var(X) = E((X - EX) ^ 2)
math::Pow<T, Context>(stddev->count(), 2, Ydata, Std_data);
math::Gemv<T, Context>(CblasNoTrans, nbychans, spatial_dim,
1.0 / (num * spatial_dim),
......@@ -85,7 +85,7 @@ void BatchRenormOp<Context>::RunWithType() {
NByC_data, NMul_data,
0.0,
tVar_data);
// update moving average
// update moving average
hFact_data[0] *= momentum; hFact_data[0] += 1;
int m = input(0).count() / channels;
T factor = m > 1 ? T(m) / (m - 1) : 1;
......@@ -93,31 +93,31 @@ void BatchRenormOp<Context>::RunWithType() {
math::Axpby<T, Context>(mean.count(), factor, tVar_data, momentum, hVar_data);
}
// normalize var
// normalize var
math::AddScalar<T, Context>(mean.count(), eps, tVar_data);
math::Pow<T, Context>(mean.count(), 0.5, tVar_data, tVar_data);
if (!use_global_stats) {
// normalize history var
// normalize history var
math::AddScalar<T, Context>(mean.count(), eps, thVar_data);
math::Pow<T, Context>(mean.count(), 0.5, thVar_data, thVar_data);
// compute r
// compute r
math::Div<T, Context>(mean.count(), tVar_data, thVar_data, tRdata);
math::Clip<T, Context>(mean.count(), 1.0 / t_r_max, t_r_max, tRdata);
// compute d
// compute d
math::Sub<T, Context>(mean.count(), tMean_data, thMean_data, tDdata);
math::Div<T, Context>(mean.count(), tDdata, thVar_data, tDdata);
math::Clip<T, Context>(mean.count(), -t_d_max, t_d_max, tDdata);
// update the bound of r & d
// update the bound of r & d
t_r_max = r_max / (1.0 + (r_max - 1.0) * exp(-t_val));
t_d_max = d_max / (1.0 + (d_max - 1.0) * exp(-2 * t_val));
t_val += t_delta;
}
// divide by var
// divide by var
math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans, num, channels, 1,
1.0,
NMul_data, tVar_data,
......@@ -131,11 +131,11 @@ void BatchRenormOp<Context>::RunWithType() {
math::Div<T, Context>(stddev->count(), Ydata, Std_data, Ydata);
if (!use_global_stats) {
// store x_norm for backward
// store x_norm for backward
XNorm_data = x_norm->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(output(0)->count(), XNorm_data, Ydata);
// correction: mul by r
// correction: mul by r
math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans, num, channels, 1,
1.0,
NMul_data, tRdata,
......@@ -183,7 +183,7 @@ void BatchRenormOp<Context>::RunOnDevice() {
if (use_stats == -1) use_global_stats = phase() == "TEST" ? true : false;
else use_global_stats = use_stats == 1 ? true : false;
// if true, Act/Exp/Pow/Norm ops can not precede this op during training
// if true, Act/Exp/Pow/Norm ops can not precede this op during training
if (inplace) output(0)->Share(input(0));
if (input(0).template IsType<float>()) RunWithType<float>();
......@@ -233,7 +233,7 @@ void BatchRenormGradientOp<Context>::RunWithType() {
auto* XNorm_data = x_norm->template data<T, Context>();
auto* tMean_data = mean.template mutable_data<T, Context>();
// buffer <- dE/dY \cdot r
// buffer <- dE/dY \cdot r
math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans, num, channels, 1,
1.0,
NMul_data, tRdata,
......@@ -246,7 +246,7 @@ void BatchRenormGradientOp<Context>::RunWithType() {
Std_data);
math::Mul<T, Context>(output(0)->count(), dYdata, Std_data, Std_data);
// sum(dE/dY \cdot Y)
// sum(dE/dY \cdot Y)
math::Mul<T, Context>(output(0)->count(), XNorm_data, Std_data, dXdata);
math::Gemv<T, Context>(CblasNoTrans, nbychans, spatial_dim,
1.0,
......@@ -269,10 +269,10 @@ void BatchRenormGradientOp<Context>::RunWithType() {
0.0,
dXdata);
// sum(dE/dY \cdot Y) \cdot Y
// sum(dE/dY \cdot Y) \cdot Y
math::Mul<T, Context>(output(0)->count(), XNorm_data, dXdata, dXdata);
// sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y
// sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y
math::Gemv<T, Context>(CblasNoTrans, nbychans, spatial_dim,
1.0,
Std_data, SMul_data,
......@@ -293,13 +293,13 @@ void BatchRenormGradientOp<Context>::RunWithType() {
NByC_data, SMul_data,
1.0, dXdata);
// dE/dY - mean(dE/dY)- mean(dE/dY \cdot Y) \cdot Y
// = dE/dY - mean(sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y)
// dE/dY - mean(dE/dY)- mean(dE/dY \cdot Y) \cdot Y
// = dE/dY - mean(sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y)
math::Axpby<T, Context>(output(0)->count(), 1.0, Std_data,
-1.0 / (num * spatial_dim),
dXdata);
// divide var
// divide var
math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans, num, channels, 1,
1.0,
NMul_data, tVar_data,
......
......@@ -60,7 +60,7 @@ void InstanceNormOp<Context>::RunWithType() {
}
template <class Context>
void InstanceNormOp<Context>::RunOnDevice(){
void InstanceNormOp<Context>::RunOnDevice() {
num = input(0).dim(0); channels = input(0).dim(1);
spatial_dim = input(0).count(2); nbychans = num * channels;
vector<TIndex> dims({ num, channels });
......@@ -69,7 +69,7 @@ void InstanceNormOp<Context>::RunOnDevice(){
output(0)->ReshapeLike(input(0));
// if true, Act/Exp/Pow/Norm ops can not precede this op during training
// if true, Act/Exp/Pow/Norm ops can not precede this op during training
if (inplace) output(0)->Share(input(0));
if (input(0).template IsType<float>()) RunWithType<float>();
......@@ -105,7 +105,7 @@ void InstanceNormGradientOp<Context>::RunWithType() {
auto* Ydata = input(-2).template data<T, Context>();
math::Mul<T, Context>(output(0)->count(), Ydata, dYdata, dXdata);
// sum(dE/dY \cdot Y)
// sum(dE/dY \cdot Y)
math::Gemv<T, Context>(CblasNoTrans, nbychans, spatial_dim,
1.0,
dXdata, SMul_data,
......@@ -116,10 +116,10 @@ void InstanceNormGradientOp<Context>::RunWithType() {
0.0,
dXdata);
// sum(dE/dY \cdot Y) \cdot Y
// sum(dE/dY \cdot Y) \cdot Y
math::Mul<T, Context>(output(0)->count(), Ydata, dXdata, dXdata);
// sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y
// sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y
math::Gemv<T, Context>(CblasNoTrans, nbychans, spatial_dim,
1.0,
dYdata, SMul_data,
......@@ -130,13 +130,13 @@ void InstanceNormGradientOp<Context>::RunWithType() {
1.0,
dXdata);
// dE/dY - mean(dE/dY)- mean(dE/dY \cdot Y) \cdot Y
// = dE/dY - mean(sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y)
// dE/dY - mean(dE/dY)- mean(dE/dY \cdot Y) \cdot Y
// = dE/dY - mean(sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y)
math::Axpby<T, Context>(output(0)->count(), 1.0, dYdata,
-1.0 / spatial_dim,
dXdata);
// divide by var
// divide by var
math::Div<T, Context>(output(0)->count(), dXdata, Std_data, dXdata);
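// assembled per instance and channel (a sketch of what the calls above build):
//   dE/dX ~= (dE/dY - mean(dE/dY) - mean(dE/dY \cdot Y) \cdot Y) / std
// where Y is the already-normalized output and the means run over spatial_dim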
// release buffer
......
......@@ -41,9 +41,9 @@ void L2NormOp<Context>::RunWithType() {
Bdata, DMuldata,
1.0,
Ndata);
// compute T2 = \sqrt{T1}
// compute T2 = \sqrt{T1}
math::Sqrt<T, Context>(inner_dim, Ndata, Ndata);
// compute T3 = x / [(T2)]_{dim}
// compute T3 = x / [(T2)]_{dim}
math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans, dim, inner_dim, 1,
1.0,
DMuldata, Ndata,
......@@ -109,7 +109,7 @@ void L2NormGradientOp<Context>::RunWithType() {
auto* Bdata = buffer->template mutable_data<T, Context>();
auto* BInnerdata = buffer_inner->template mutable_data<T, Context>();
for (int n = 0; n < outer_dim; n++){
for (int n = 0; n < outer_dim; n++) {
if (across_inner) {
Ndata = norm->template data<T, CPUContext>();
T sum_of_x_mul_dy = math::Dot<T, Context>(buffer->count(), Xdata, dYdata);
......@@ -117,21 +117,21 @@ void L2NormGradientOp<Context>::RunWithType() {
math::Sub<T, Context>(buffer->count(), dYdata, dXdata, dXdata);
math::Scal<T, Context>(buffer->count(), T(1.0 / Ndata[n]), dXdata);
} else {
// compute \sum_{i} x_{i, j}dy_{i, j}
// compute \sum_{i} x_{i, j}dy_{i, j}
math::Mul<T, Context>(buffer->count(), Xdata, dYdata, Bdata);
math::Gemv<T, Context>(CblasTrans, dim, inner_dim,
1.0,
Bdata, DMuldata,
0.0,
BInnerdata);
// compute T1 = x[(\sum_{i} x_{i, j}dy_{i, j})]_{dim}
// compute T1 = x[(\sum_{i} x_{i, j}dy_{i, j})]_{dim}
math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans, dim, inner_dim, 1,
1.0,
DMuldata, BInnerdata,
0.0,
Bdata);
math::Mul<T, Context>(buffer->count(), Xdata, Bdata, dXdata);
// compute T2 = T1 / Normalizer^{2}
// compute T2 = T1 / Normalizer^{2}
math::Pow<T, Context>(inner_dim, 2.0, Ndata, BInnerdata);
math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans, dim, inner_dim, 1,
1.0,
......@@ -139,7 +139,7 @@ void L2NormGradientOp<Context>::RunWithType() {
0.0,
Bdata);
math::Div<T, Context>(buffer->count(), dXdata, Bdata, dXdata);
// compute T3 = (dy - T2) / Normalizer
// compute T3 = (dy - T2) / Normalizer
math::Sub<T, Context>(buffer->count(), dYdata, dXdata, dXdata);
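// assembled, this is the usual L2-norm backward (a sketch):
//   dx = (dy - x \cdot (\sum_{i} x_{i} dy_{i}) / ||x||^2) / ||x||
// with the per-position sums and norms broadcast back over `dim` via DMuldata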
math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans, dim, inner_dim, 1,
1.0,
......
......@@ -18,10 +18,10 @@ void LSTMUnitOp<Context>::RunWithType() {
template <class Context>
void LSTMUnitOp<Context>::RunOnDevice() {
// input(0): ----- c_t_1
// input(1): ----- gate_input
// output(0): ----- c_t
// output(1): ----- h_t
// input(0): ----- c_t_1
// input(1): ----- gate_input
// output(0): ----- c_t
// output(1): ----- h_t
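// gate_input is assumed to pack the i/f/o/g gate values along the channel axis;
// the kernel then forms c_t = f * c_t_1 + i * g and h_t = o * tanh(c_t)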
num = input(0).dim(0);
channels = input(0).ndim() == 2 ? input(0).dim(1) : input(0).dim(2);
if (!has_cont.empty()) {
......@@ -57,13 +57,13 @@ void LSTMUnitGradientOp<Context>::RunWithType() {
template <class Context>
void LSTMUnitGradientOp<Context>::RunOnDevice() {
// input(0): ----- c_t_1
// input(1): ----- x_act
// input(2): ----- c_t
// input(3): ----- d(c_t)
// input(4): ----- d(h_t)
// output(0): ----- d(c_t_1)
// output(1): ----- d(gate_input)
// input(0): ----- c_t_1
// input(1): ----- x_act
// input(2): ----- c_t
// input(3): ----- d(c_t)
// input(4): ----- d(h_t)
// output(0): ----- d(c_t_1)
// output(1): ----- d(gate_input)
num = input(0).dim(0);
channels = input(0).ndim() == 2 ? input(0).dim(1) : input(0).dim(2);
output(0)->ReshapeLike(input(0));
......@@ -92,7 +92,7 @@ public:
vector<string> {I(0), I(1), O(0), GO(0), GO(1)},
vector<string> {GI(0), GI(1)});
}
// fill zero for dc_{T+1}
// fill zero for dc_{T+1}
vector<float> DefaultValues() override{ return{ 0.0, 1.0 }; }
};
REGISTER_GRADIENT(LSTMUnit, GetLSTMUnitGradient);
......
......@@ -21,7 +21,7 @@ void AsyncUpdateOp<Context>::UpdateTimestamp(int tag) {
}
template <class Context>
int AsyncUpdateOp<Context>::GetDelay(int tag){
int AsyncUpdateOp<Context>::GetDelay(int tag) {
Tensor* t = ws()->GetTensor("_t_" + this->domain + "async_timestamp");
int* global_timestamp = t->template mutable_data<int, CPUContext>();
return global_timestamp[tag] - local_timestamp[tag] + 1;
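// the returned delay measures how stale this client's copy is; it is used below
// (see the -1.0 / delay Axpy in ThreadRunWithType) to damp stale updates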
......@@ -35,33 +35,33 @@ AsyncUpdateOp<Context>::AsyncUpdateOp(const OperatorDef& op_def, Workspace* ws)
mode(OperatorBase::GetSingleArg<string>("mode", "Async")),
nsync(OperatorBase::GetSingleArg<int>("nsync", -1)) {
// make key-val tags
// make key-val tags
Tensor* t = this->ws()->CreateTensor("_t_" + this->domain + "async_tags");
t->Reshape(vector<TIndex>(1, InputSize()));
tags = t->template mutable_data<string, CPUContext>();
for (int i = 0; i < OutputSize(); i++) tags[i] = output(i)->name();
// make recv logs
// make recv logs
t = this->ws()->CreateTensor("_t_" + this->domain + "async_logs");
t->Reshape(vector<TIndex>(1, InputSize()));
// make recv buffers
acc_buffers = new Tensor*[InputSize()]; // for soft-sync
recv_buffer.reset(new Tensor()); // for async
// make recv buffers
acc_buffers = new Tensor*[InputSize()]; // for soft-sync
recv_buffer.reset(new Tensor()); // for async
// setup for server
// setup for server
if (this->comm_rank == this->comm_root) {
if (nsync == -1) nsync = this->comm_size; // fully async
if (nsync == -1) nsync = this->comm_size; // fully async
max_recv = this->comm_size / nsync;
// make global timestamp
// make global timestamp
t = this->ws()->CreateTensor("_t_" + this->domain + "async_timestamp");
t->Reshape(vector<TIndex>(1, InputSize()));
// make global buffers
// make global buffers
for (int i = 0; i < OutputSize(); i++)
acc_buffers[i] = this->ws()->CreateTensor(tags[i] + "_grad_async_acc");
}
// create independent stream for thread if using cuda-aware
// create independent stream for thread if using cuda-aware
#ifdef WITH_CUDA_AWARE
cudaStreamCreate(&stream);
cublasCreate_v2(&handle);
......@@ -71,7 +71,7 @@ AsyncUpdateOp<Context>::AsyncUpdateOp(const OperatorDef& op_def, Workspace* ws)
template <class Context> template <typename T>
void AsyncUpdateOp<Context>::RootRunWithType() {
for (int i = 0; i < InputSize(); i++){
for (int i = 0; i < InputSize(); i++) {
auto* dXdata = input(i).template mutable_data<T, Context>();
auto* Xdata = output(i)->template mutable_data<T, Context>();
......@@ -88,7 +88,7 @@ void AsyncUpdateOp<Context>::RootRunWithType() {
}
template <class Context>
void AsyncUpdateOp<Context>::RunOnDevice(){
void AsyncUpdateOp<Context>::RunOnDevice() {
if (this->comm_rank != this->comm_root) return;
if (input(0).template IsType<float>()) {
......@@ -102,11 +102,11 @@ void AsyncUpdateOp<Context>::RunOnDevice(){
template <class Context> template <typename T>
void AsyncUpdateOp<Context>::ThreadRunWithType() {
while (1) {
// pull from a specific client
// pull from a specific client
MPI_Status status;
MPI_Probe(node_id, MPI_ANY_TAG, this->comm, &status);
Tensor* X = ws()->GetTensor(tags[status.MPI_TAG]);
if (X->count() == 0) continue; // wait for server
if (X->count() == 0) continue; // wait for server
recv_buffer->ReshapeLike(*X);
#ifdef WITH_CUDA_AWARE
auto* Bdata = recv_buffer->template mutable_data<T, Context>();
......@@ -114,7 +114,7 @@ void AsyncUpdateOp<Context>::ThreadRunWithType() {
auto* Bdata = recv_buffer->template mutable_data<T, CPUContext>();
#endif
MPI_Recv(Bdata, X->count(), MPI_FLOAT, status.MPI_SOURCE, status.MPI_TAG, this->comm, MPI_STATUS_IGNORE);
// update
// update
#ifdef WITH_CUDA_AWARE
auto* Xdata = X->template mutable_data<T, Context>();
if (mode != "Async_No_Lock") ws()->LockTensor(output(status.MPI_TAG)->name());
......@@ -132,9 +132,9 @@ void AsyncUpdateOp<Context>::ThreadRunWithType() {
math::Axpy<T, CPUContext>(X->count(), -1.0 / delay, Bdata, Xdata);
if (mode != "Async_No_Lock") ws()->UnlockTensor(output(status.MPI_TAG)->name());
#endif
// push back to this client
// push back to this client
MPI_Send(Xdata, X->count(), MPI_FLOAT, status.MPI_SOURCE, status.MPI_TAG, this->comm);
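// one server round trip: probe a client, recv its gradient into the buffer,
// fold it into the shared param X scaled by 1 / delay, then send the refreshed
// X straight back to the same client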
// do statistics
// do statistics
update_count++;
if (update_count % (100 * InputSize()) == 0)
LOG(INFO) << "Server[" << node_id << "]: "
......
......@@ -59,7 +59,7 @@ void UpdateOpBase<Context>::ReduceRunWithType() {
int recv_from = (comm_rank - 1 + comm_size) % comm_size;
int send_to = (comm_rank + 1) % comm_size;
// scatter-reduce
// scatter-reduce
for (int i = 0; i < comm_size - 1; i++) {
int recv_chunk = (comm_rank - i - 1 + comm_size) % comm_size;
int send_chunk = (comm_rank - i + comm_size) % comm_size;
......@@ -83,7 +83,7 @@ void UpdateOpBase<Context>::ReduceRunWithType() {
}
ws()->ReleaseBuffer(buffer);
// allgather
// allgather
for (int i = 0; i < comm_size - 1; i++) {
int send_chunk = (comm_rank - i + 1 + comm_size) % comm_size;
int recv_chunk = (comm_rank - i + comm_size) % comm_size;
......@@ -97,8 +97,8 @@ void UpdateOpBase<Context>::ReduceRunWithType() {
0, comm, MPI_STATUS_IGNORE);
}
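// the two passes above form a ring all-reduce: scatter-reduce leaves each rank
// holding the full sum of one chunk, and allgather circulates those chunks so
// every rank ends up with the complete summed gradient before the averaging below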
// ave-normalize
if (comm_size > 1){
// ave-normalize
if (comm_size > 1) {
#ifdef WITH_CUDA_AWARE
math::Scal<T, Context>(count, T(1.0 / comm_size), dXdata);
#else
......@@ -110,13 +110,13 @@ void UpdateOpBase<Context>::ReduceRunWithType() {
template <class Context> template <typename T>
void UpdateOpBase<Context>::PreprocessRunWithType() {
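// preprocessing applies, in order: gradient scaling (scale_gradient),
// L2-norm clipping (clip_gradient), and L2 weight decay, i.e. dX += l2_decay * X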
// scale
// scale
scale_factor = param("scale_gradient");
if (scale_factor != 1){
if (scale_factor != 1) {
auto* dXdata = input(0).template mutable_data<T, Context>();
math::Scal<T, Context>(input(0).count(), scale_factor, dXdata);
}
// clip
// clip
clip_thresh = param("clip_gradient");
if (clip_thresh > 0) {
auto* dXdata = input(0).template mutable_data<T, Context>();
......@@ -127,12 +127,12 @@ void UpdateOpBase<Context>::PreprocessRunWithType() {
math::Scal<T, Context>(input(0).count(), factor, dXdata);
}
}
// decay
l2_decay = param("l2_decay");
if (l2_decay > 0){
// decay
l2_decay = param("l2_decay") * decay_mult;
if (l2_decay > 0) {
auto* dXdata = input(0).template mutable_data<T, Context>();
auto* Xdata = output(0)->template data<T, Context>();
math::Axpy<T, Context>(input(0).count(), l2_decay * decay_mult, Xdata, dXdata);
math::Axpy<T, Context>(input(0).count(), l2_decay, Xdata, dXdata);
}
}
......@@ -141,9 +141,9 @@ void UpdateOpBase<Context>::UpdateRunWithType() {
if (!allow_parallel || (allow_parallel && mode == "Sync")) {
auto* dXdata = input(0).template mutable_data<T, Context>();
auto* Xdata = output(0)->template mutable_data<T, Context>();
// update
// update
math::Axpy<T, Context>(output(0)->count(), -1.0, dXdata, Xdata);
// clear accumulated grads
// clear accumulated grads
math::Set<T, Context>(input(0).count(), 0, dXdata);
} else {
#ifdef WITH_MPI
......
......@@ -10,12 +10,12 @@ void FloatToHalfOp<Context>::RunOnDevice() {
<< "the type of tensor must be float32.";
output(0)->ReshapeLike(input(0));
// cast
// cast
auto* Xdata = input(0).template data<float, Context>();
auto* Ydata = output(0)->template mutable_data<float16, Context>();
kernel::Float2Half<float, Context>(output(0)->count(), Xdata, Ydata);
// release & share
// release & share
input(0).Reset();
input(0).ReshapeLike(*output(0));
input(0).Share(*output(0));
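// after Share, input(0) aliases the float16 output, so the original
// float32 storage is released and only the half-precision copy is kept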
......
......@@ -12,7 +12,7 @@ void CompareOp<Context>::EqualRunWithType() {
}
template <class Context>
void CompareOp<Context>::RunOnDevice(){
void CompareOp<Context>::RunOnDevice() {
CHECK_EQ(input(0).count(), input(1).count())
<< "both conditioned tensor must have same elements.";
output(0)->ReshapeLike(input(0));
......
......@@ -6,7 +6,7 @@ namespace dragon {
template <class Context> template <typename T>
void GradientGenerateOp<Context>::RunWithType() {
for (int i = 0; i < OutputSize(); i++){
for (int i = 0; i < OutputSize(); i++) {
if (output(i)->name() == "ignore") continue;
output(i)->ReshapeLike(input(i));
auto* dXdata = output(0)->template mutable_data<T, Context>();
......
......@@ -26,7 +26,7 @@ void InitializeOp<Context>::RunOnDevice() {
RunWithType<float>();
}
// constant
// constant
DEPLOY_CPU(Fill);
#ifdef WITH_CUDA
DEPLOY_CUDA(Fill);
......@@ -34,7 +34,7 @@ DEPLOY_CUDA(Fill);
OPERATOR_SCHEMA(Fill).NumInputs(0, 1).NumOutputs(1);
NO_GRADIENT(Fill);
// uniform
// uniform
DEPLOY_CPU(RandomUniform);
#ifdef WITH_CUDA
DEPLOY_CUDA(RandomUniform);
......@@ -42,7 +42,7 @@ DEPLOY_CUDA(RandomUniform);
OPERATOR_SCHEMA(RandomUniform).NumInputs(0, 1).NumOutputs(1);
NO_GRADIENT(RandomUniform);
// normal
// normal
DEPLOY_CPU(RandomNormal);
#ifdef WITH_CUDA
DEPLOY_CUDA(RandomNormal);
......@@ -50,7 +50,7 @@ DEPLOY_CUDA(RandomNormal);
OPERATOR_SCHEMA(RandomNormal).NumInputs(0, 1).NumOutputs(1);
NO_GRADIENT(RandomNormal);
// truncated normal
// truncated normal
DEPLOY_CPU(TruncatedNormal);
#ifdef WITH_CUDA
DEPLOY_CPU_CUDA(TruncatedNormal);
......@@ -58,7 +58,7 @@ DEPLOY_CPU_CUDA(TruncatedNormal);
OPERATOR_SCHEMA(TruncatedNormal).NumInputs(0, 1).NumOutputs(1);
NO_GRADIENT(TruncatedNormal);
// glorot uniform
// glorot uniform
DEPLOY_CPU(GlorotUniform);
#ifdef WITH_CUDA
DEPLOY_CUDA(GlorotUniform);
......@@ -66,7 +66,7 @@ DEPLOY_CUDA(GlorotUniform);
OPERATOR_SCHEMA(GlorotUniform).NumInputs(0, 1).NumOutputs(1);
NO_GRADIENT(GlorotUniform);
// glorot normal
// glorot normal
DEPLOY_CPU(GlorotNormal);
#ifdef WITH_CUDA
DEPLOY_CUDA(GlorotNormal);
......
......@@ -229,16 +229,16 @@ __global__ static void nms_mask(const Dtype boxes[],
}
}
// mask: "num_boxes x num_blocks" array
// for mask[j][bi], "di-th bit = 1" means:
// box j is significantly overlapped with box i = i_start + di,
// where i_start = bi * block_size
// mask: "num_boxes x num_blocks" array
// for mask[j][bi], "di-th bit = 1" means:
// box j is significantly overlapped with box i = i_start + di,
// where i_start = bi * block_size
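// e.g. assuming nms_block_size = 64, each mask entry is a 64-bit word and
// "is box j suppressed by box i" is the single bit probe
// (mask[j * num_blocks + i / 64] >> (i % 64)) & 1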
{
const int num_blocks = DIV_THEN_CEIL(num_boxes, nms_block_size);
const int bi = blockIdx.x;
mask[(j_start + dj) * num_blocks + bi] = mask_j;
}
} // endif dj < dj_end
} // endif dj < dj_end
}
}
......
......@@ -4,10 +4,10 @@ namespace dragon {
template <class Context>
void ShapeOp<Context>::RunOnDevice() {
// reshape
// reshape
output(0)->Reshape(vector<TIndex>(1, input(0).ndim()));
// forward
// forward
auto* Ydata = output(0)->template mutable_data<float, CPUContext>();
for (int i = 0; i < input(0).ndim(); i++) Ydata[i] = input(0).dim(i);
}
......
......@@ -7,7 +7,7 @@ namespace dragon {
template <class Context>
void ConvOp<Context>::ComputeOutputShape() {
this->output_shape.clear();
for (int i = 0; i < this->num_spatial_axes; i++){
for (int i = 0; i < this->num_spatial_axes; i++) {
const int input_dim = this->bottom_shape[this->channel_axis + i + 1];
const int dilated_kernel = this->dilation[i] * (this->kernel_size[i] - 1) + 1;
const int output_dim = (input_dim + 2 * this->pad[i] - dilated_kernel) / this->stride[i] + 1;
......@@ -43,7 +43,7 @@ void ConvOp<Context>::RunWithType() {
}
template <class Context>
void ConvOp<Context>::RunOnDevice(){
void ConvOp<Context>::RunOnDevice() {
Reshape();
if (input(0).template IsType<float>()) RunWithType<float>();
......@@ -71,7 +71,7 @@ void ConvGradientOp<Context>::RunWithType() {
Db(dYdata + n * this->y_offset, dBdata);
}
for (int n = 0; n < input(2).dim(0); n++){
for (int n = 0; n < input(2).dim(0); n++) {
if (output(1)->name() != "ignore") {
auto* Xdata = input(0).template data<T, Context>();
auto* dWdata = output(1)->template mutable_data<T, Context>();
......
......@@ -125,7 +125,7 @@ void ConvOpBase<Context>::Reshape() {
col_offset = kernel_dim * conv_out_spatial_dim;
output_offset = conv_out_channels * conv_out_spatial_dim / group;
// compute col buffer shape
// compute col buffer shape
col_buffer_shape.clear();
col_buffer_shape.push_back(kernel_dim * group);
for (int i = 0; i < num_spatial_axes; i++) {
......@@ -159,9 +159,9 @@ void ConvOpBase<Context>::GradientReshape() {
conv_out_spatial_dim = input(2).count(channel_axis + 1);
}
// compute input shape
// compute input shape
input_shape.clear();
for (int i = 0; i < num_spatial_axes; i++){
for (int i = 0; i < num_spatial_axes; i++) {
if (ReverseDimensions()) {
input_shape.push_back(input(2).dim(channel_axis + i + 1));
} else {
......@@ -169,7 +169,7 @@ void ConvOpBase<Context>::GradientReshape() {
}
}
kernel_dim = input(1).count(1); // in * kh * kw
kernel_dim = input(1).count(1); // in * kh * kw
out_spatial_dim = input(2).count(channel_axis + 1);
x_offset = input(0).count(channel_axis);
......@@ -178,10 +178,10 @@ void ConvOpBase<Context>::GradientReshape() {
col_offset = kernel_dim * conv_out_spatial_dim;
output_offset = conv_out_channels * conv_out_spatial_dim / group;
// compute col buffer shape
// compute col buffer shape
col_buffer_shape.clear();
col_buffer_shape.push_back(kernel_dim * group);
for (int i = 0; i < num_spatial_axes; i++){
for (int i = 0; i < num_spatial_axes; i++) {
if (ReverseDimensions()) {
col_buffer_shape.push_back(bottom_shape[channel_axis + i + 1]);
} else {
......
......@@ -228,13 +228,13 @@ void CuDNNConvGradientOp<Context>::RunWithType() {
const T* dYdata = input(2).template data<T, Context>();
for (int g = 0; g < this->group; g++) {
if (output(2)->name() != "ignore"){
if (output(2)->name() != "ignore") {
T* dBdata = output(2)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnConvolutionBackwardBias(handle[g],
CUDNNType<T>::one, input_desc, dYdata + this->y_offset * g,
CUDNNType<T>::one, bias_desc, dBdata + bias_offset * g));
}
if (output(1)->name() != "ignore"){
if (output(1)->name() != "ignore") {
auto* Xdata = input(0).template data<T, Context>();
auto* dWdata = output(1)->template mutable_data<T, Context>();
auto* workspace = buffer2->mutable_data<char, Context>();
......@@ -246,7 +246,7 @@ void CuDNNConvGradientOp<Context>::RunWithType() {
workspace + g * workspace_bwd_filter_size, workspace_bwd_filter_size,
CUDNNType<T>::one, filter_desc, dWdata + this->weight_offset * g));
}
if (output(0)->name() != "ignore"){
if (output(0)->name() != "ignore") {
auto* Wdata = input(1).template data<T, Context>();
auto* dXdata = output(0)->template mutable_data<T, Context>();
auto* workspace = buffer1->mutable_data<char, Context>();
......
......@@ -54,7 +54,7 @@ void CuDNNLRNGradientOp<Context>::RunWithType() {
}
template <class Context>
void CuDNNLRNGradientOp<Context>::RunOnDevice(){
void CuDNNLRNGradientOp<Context>::RunOnDevice() {
output(0)->ReshapeLike(input(0));
if (this->mode == ACROSS_CHANNELS) {
......
......@@ -43,7 +43,7 @@ void DeConvOp<Context>::RunWithType() {
}
template <class Context>
void DeConvOp<Context>::RunOnDevice(){
void DeConvOp<Context>::RunOnDevice() {
Reshape();
if (input(0).template IsType<float>()) RunWithType<float>();
......
......@@ -98,7 +98,7 @@ void LRNOp<Context>::ProdRunWithType() {
}
template <class Context>
void LRNOp<Context>::RunOnDevice(){
void LRNOp<Context>::RunOnDevice() {
if (mode == ACROSS_CHANNELS) {
if (input(0).template IsType<float>()) {
AcrossRunWithType<float>();
......@@ -223,7 +223,7 @@ void LRNGradientOp<Context>::SplitRunWithType() {
}
template <class Context>
void LRNGradientOp<Context>::RunOnDevice(){
void LRNGradientOp<Context>::RunOnDevice() {
if (mode == ACROSS_CHANNELS) {
if (input(0).template IsType<float>()) {
AcrossRunWithType<float>();
......
......@@ -16,7 +16,7 @@ void NNResizeOp<Context>::RunWithType() {
template <class Context>
void NNResizeOp<Context>::RunOnDevice() {
dims = input(0).dims();
if (dsize.size() == 0){
if (dsize.size() == 0) {
CHECK(fy != -1.0 && fx != -1.0);
dims[2] = int(dims[2] * fy);
dims[3] = int(dims[3] * fx);
......@@ -47,7 +47,7 @@ void NNResizeGradientOp<Context>::RunWithType() {
}
template <class Context>
void NNResizeGradientOp<Context>::RunOnDevice(){
void NNResizeGradientOp<Context>::RunOnDevice() {
output(0)->ReshapeLike(input(0));
if (input(0).template IsType<float>()) return RunWithType<float>();
......
......@@ -37,7 +37,7 @@ void cudnnSetTensorDesc(cudnnTensorDescriptor_t* desc, const vector<TIndex>& dim
int* dimA = new int[ndim];
int* strideA = new int[ndim];
TIndex stride = 1;
for (int i = ndim - 1; i >= 0; i--){
for (int i = ndim - 1; i >= 0; i--) {
strideA[i] = stride;
dimA[i] = dims[i];
stride *= dimA[i];
......@@ -55,7 +55,7 @@ void cudnnSetTensorDesc(cudnnTensorDescriptor_t* desc,
int ndim = (int)dims.size();
int* dimA = new int[ndim];
int* strideA = new int[ndim];
for (int i = ndim - 1; i >= 0; i--){
for (int i = ndim - 1; i >= 0; i--) {
strideA[i] = strides[i];
dimA[i] = dims[i];
}
......@@ -66,10 +66,10 @@ void cudnnSetTensorDesc(cudnnTensorDescriptor_t* desc,
template <typename T>
void cudnnSetTensorDesc(cudnnTensorDescriptor_t* desc, Tensor* tensor) {
// CUDNN only support ndim from 3 to 8
// we fake reshaped dims to pass the check
// cuDNN requires ndim from 3 to 8
// we fake reshaped dims to pass the check
vector<TIndex> fake_dims(tensor->dims());
if (fake_dims.size() < 3 || fake_dims.size() > 8){
if (fake_dims.size() < 3 || fake_dims.size() > 8) {
fake_dims.assign({ 1, 1 });
fake_dims.push_back(tensor->count());
}
......
......@@ -142,7 +142,7 @@ template <> void Add<float, CPUContext>(const int n,
template <> void Sub<float, CPUContext>(const int n,
const float* a,
const float* b,
float* y){
float* y) {
#ifdef WITH_SSE
sse::Sub<float>(n, a, b, y);
#else // naive implement
......
......@@ -66,8 +66,8 @@ template <> void RandomUniform<uint32_t, CUDAContext>(const int n,
const float low,
const float high,
uint32_t* x) {
// note that low / high are ignored here
// curand can only generate uniformly over the full uint32 range
// note that low / high are ignored here
// curand can only generate uniformly over the full uint32 range
CURAND_CHECK(curandGenerate(curand_generator(), x, n));
}
......@@ -95,8 +95,8 @@ template <> void RandomNormal<float16, CUDAContext>(const int n,
template <> void RandomBernoulli<float, CUDAContext>(const int n,
const float p,
unsigned int* x) {
// curand cannot generate a Bernoulli distribution directly
// we recommend implementing it within the specific op, e.g. Dropout
// curand cannot generate a Bernoulli distribution directly
// we recommend implementing it within the specific op, e.g. Dropout
NOT_IMPLEMENTED;
}
......
......@@ -98,7 +98,7 @@ template<> void Softmax<float, CPUContext>(const int count,
const int dim = count / outer_dim;
for (int i = 0; i < outer_dim; ++i) {
context->Copy<float, CPUContext, CPUContext>(inner_dim, scale, x + i*dim);
for (int j = 0; j < classes; ++j){
for (int j = 0; j < classes; ++j) {
for (int k = 0; k < inner_dim; k++)
scale[k] = std::max(scale[k], x[i * dim + j * inner_dim + k]);
}
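// scale now holds, for every inner position, the maximum over the classes;
// it is presumably subtracted before exponentiation to keep the softmax stable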
......@@ -668,7 +668,7 @@ template <> void OneHot<float, CPUContext>(const int count,
template<> void AbsGrad<float, CPUContext>(const int count, const float* dy, float* dx) {
for (int i = 0; i < count; ++i) {
const float val = dy[i];
// val > 0: 1 | val == 0: 0 | val < 0: -1
// val > 0: 1 | val == 0: 0 | val < 0: -1
dx[i] = (val > float(0)) - (val < float(0));
}
}
......@@ -707,7 +707,7 @@ template<> void SmoothL1Grad<float, CPUContext>(const int count,
const float val = dy[i];
const float abs_val = abs(val);
if (abs_val < 1.0 / sigma2) dx[i] = val * sigma2;
// val > 0: 1 | val == 0: 0 | val < 0: -1
// val > 0: 1 | val == 0: 0 | val < 0: -1
else dx[i] = (val > float(0)) - (val < float(0));
}
}
......@@ -862,11 +862,11 @@ template <> void LSTMUnitGrad<float, CPUContext>(const int count,
p_df = dx + f_offset + ch;
p_do = dx + o_offset + ch;
p_dg = dx + g_offset + ch;
// BPTT computes dc_{t-1} at time t
// dc_{t-1} = dl / d(h_{t}) * d(h_{t}) / d(c_{t}) * d(c_{t}) / d(c_{t-1})
// BPTT computes dc_{t-1} at time t
// dc_{t-1} = dl / d(h_{t}) * d(h_{t}) / d(c_{t}) * d(c_{t}) / d(c_{t-1})
// + d(c_{t+1}) / d(c_{t}) * d(c_{t}) / d(c_{t-1})
// = (dl / d(h_{t}) * d(h_{t}) / d(c_{t}) + d(c_{t+1}) / d(c_{t}))
// * d(c_{t}) / d(c_{t-1})
// = (dl / d(h_{t}) * d(h_{t}) / d(c_{t}) + d(c_{t+1}) / d(c_{t}))
// * d(c_{t}) / d(c_{t-1})
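// with c_t = f * c_t_1 + i * g and h_t = o * tanh(c_t), the lines below reduce to
// dc_t_1 = (dh_t * o * (1 - tanh(c_t)^2) + dc_t) * f  (same identity, written out)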
tanh_c_t = tanh(c[ch]);
dc_1_sum_term = dh[ch] * o * (1 - tanh_c_t * tanh_c_t) + dc[ch];
dc_1[ch] = dc_1_sum_term * f;
......@@ -1182,18 +1182,18 @@ template<> void MAXPooling<float, CPUContext>(const int count,
max_val = x[idx];
max_idx = idx;
}
} // end w
} // end h
} // end w
} // end h
y[pool_idx] = max_val;
mask[pool_idx] = max_idx;
} // end pw
} // end ph
// offset a channel
} // end pw
} // end ph
// offset a channel
x += x_offset;
y += y_offset;
mask += y_offset;
} // end c
} // end n
} // end c
} // end n
}
template<> void AVEPooling<float, CPUContext>(const int count,
......@@ -1257,13 +1257,13 @@ template<> void MAXPoolingGrad<float, CPUContext>(const int count,
const int pool_idx = ph * pool_width + pw;
const int idx = mask[pool_idx];
dx[idx] += dy[pool_idx];
} // end pw
} // end ph
} // end pw
} // end ph
dx += x_offset;
dy += y_offset;
mask += y_offset;
} // end c
} // end n
} // end c
} // end n
}
template<> void AVEPoolingGrad<float, CPUContext>(const int count,
......@@ -1298,12 +1298,12 @@ template<> void AVEPoolingGrad<float, CPUContext>(const int count,
dx[idx] += (dy[pool_idx] / pool_size);
}
}
} // end pw
} // end ph
} // end pw
} // end ph
dx += x_offset;
dy += y_offset;
} // end c
} // end n
} // end c
} // end n
}
/******************** vision.roi_pooling ********************/
......@@ -1362,18 +1362,18 @@ template<> void ROIPooling<float, CPUContext>(const float spatial_scale,
Ydata[pool_idx] = Idata[idx];
Mdata[pool_idx] = idx;
}
} //end w
} // end h
} // end pw
} // end ph
// offset image channels
} // end w
} // end h
} // end pw
} // end ph
// offset image channels
Idata += x->offset(0, 1);
Ydata += y->offset(0, 1);
Mdata += mask->offset(0, 1);
} // end c
// offset roi region
} // end c
// offset roi region
Rdata += roi->offset(1);
} //end n
} // end n
}
template<> void ROIPoolingGrad<float, CPUContext>(const float spatial_scale,
......
......@@ -130,7 +130,7 @@ __global__ void _ReluGrad(const int count,
const T* y,
const float slope,
T* dx) {
CUDA_KERNEL_LOOP(i, count){
CUDA_KERNEL_LOOP(i, count) {
dx[i] = dy[i] * ((y[i] > 0) + slope * (y[i] <= 0));
}
}
......@@ -912,7 +912,7 @@ __global__ void _Sum(const int count,
template<> void Sum<float, CUDAContext>(
const int count, const int axis_dim,
const int inner_dim, const float* x, float* y){
const int inner_dim, const float* x, float* y) {
_Sum<float> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
axis_dim,
inner_dim,
......@@ -954,7 +954,7 @@ template<> void SumGrad<float, CUDAContext>(const int count,
template <typename T>
__global__ void _Slice(const int count, const int outer_dim, const int inner_dim,
const int x_slice_dim, const int y_slice_dim, const int slice_offset, const T* x, T* y){
const int x_slice_dim, const int y_slice_dim, const int slice_offset, const T* x, T* y) {
CUDA_KERNEL_LOOP(idx, count) {
const int tmp = y_slice_dim * inner_dim;
const int outer_idx = idx / tmp;
......@@ -1238,7 +1238,7 @@ template <typename T>
__global__ void _AbsGrad(const int count, const T* dy, T* dx) {
CUDA_KERNEL_LOOP(idx, count) {
const T val = dy[idx];
// val > 0: 1 | val == 0: 0 | val < 0: -1
// val > 0: 1 | val == 0: 0 | val < 0: -1
dx[idx] = (val > T(0)) - (val < T(0));
}
}
......@@ -1298,7 +1298,7 @@ __global__ void _SmoothL1Grad(const int count, const float sigma2, const T* dy,
const T val = dy[idx];
const T abs_val = abs(val);
if (abs_val < 1.0 / sigma2) dx[idx] = val * sigma2;
// val > 0: 1 | val == 0: 0 | val < 0: -1
// val > 0: 1 | val == 0: 0 | val < 0: -1
else dx[idx] = (val > T(0)) - (val < T(0));
}
}
......@@ -1952,8 +1952,8 @@ __global__ void _Col2Im(const int count,
const int ex_kernel_w = (kernel_w - 1) * dilation_w + 1;
const int w_start = (im_w < ex_kernel_w) ? 0 : (im_w - ex_kernel_w) / stride_w + 1;
// redundant pixels are ignored by the conv,
// so the range is clipped by min(x, col_w)
// redundant pixels are ignored by the conv,
// so the range is clipped by min(x, col_w)
const int w_end = min(im_w / stride_w + 1, col_w);
const int h_start = (im_h < ex_kernel_h) ? 0 : (im_h - ex_kernel_h) / stride_h + 1;
const int h_end = min(im_h / stride_h + 1, col_h);
......@@ -1962,8 +1962,8 @@ __global__ void _Col2Im(const int count,
for (int w = w_start; w < w_end; ++w) {
int kh_off = (im_h - h * stride_h);
int kw_off = (im_w - w * stride_w);
// in a dilated conv only the im pixels on the dilation grid are used,
// the col pixels of the other offsets are ignored
// in a dilated conv only the im pixels on the dilation grid are used,
// the col pixels of the other offsets are ignored
if (kh_off % dilation_h == 0 && kw_off % dilation_w == 0) {
kh_off /= dilation_h;
kw_off /= dilation_w;
......@@ -2222,11 +2222,11 @@ __global__ void _MAXPoolingGrad(const int count,
const int c = (idx / width / height) % channels;
const int n = idx / width / height / channels;
// allow overlapping
// allow overlapping
const int start_ph = (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;
const int start_pw = (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;
// allow clip
// allow clip
const int end_ph = min((h + pad_h) / stride_h + 1, pool_height);
const int end_pw = min((w + pad_w) / stride_w + 1, pool_width);
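// (start_ph, end_ph) x (start_pw, end_pw) enumerates every pooling window whose
// receptive field covers pixel (h, w); the gradient is then gathered only from
// the windows whose stored argmax selects this pixel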
......@@ -2436,7 +2436,7 @@ __global__ void _ROIPoolingGrad(const int count,
const T* cur_roi = roi + n * 5;
const int im_idx_spec = cur_roi[0];
// ignore wrong im_batch_idx
// ignore wrong im_batch_idx
if (im_idx != im_idx_spec) continue;
int x1 = round(cur_roi[1] * spatial_scale);
......@@ -2474,9 +2474,9 @@ __global__ void _ROIPoolingGrad(const int count,
if (mask_off[pool_idx] == (h * width + w)) {
diff += dy_off[pool_idx];
}
} // end pw
} // end ph
} // end n
} // end pw
} // end ph
} // end n
dx[idx] = diff;
}
}
......@@ -2647,7 +2647,7 @@ __global__ void _ROIAlignGrad(const int count,
const T* cur_roi = roi + n * 5;
const int im_idx_spec = cur_roi[0];
// ignore wrong im_batch_idx
// ignore wrong im_batch_idx
if (im_idx != im_idx_spec) continue;
T x1 = cur_roi[1] * spatial_scale;
......@@ -2693,9 +2693,9 @@ __global__ void _ROIAlignGrad(const int count,
else gradient_factor *= mw - w1;
diff += dy_off[ph * pool_w + pw] * gradient_factor;
}
} // end pw
} // end ph
} // end n
} // end pw
} // end ph
} // end n
dx[idx] = diff;
}
}
......
......@@ -9,20 +9,20 @@ namespace dragon {
namespace sse {
template<> void Set(const int n, const float alpha, float* x){
template<> void Set(const int n, const float alpha, float* x) {
__m128 scalar = SSE_FP32_SCALAR(alpha);
SSE_LOOP1(i, n) SSE_FP32_STORE(x + i, scalar);
SSE_LOOP2(i, n) x[i] = alpha;
}
template<> void Set(const int n, const int alpha, int* x){
template<> void Set(const int n, const int alpha, int* x) {
__m128i scalar = SSE_INT32_SCALAR(alpha);
__m128i* x1 = reinterpret_cast<__m128i*>(x);
SSE_LOOP1(i, n) SSE_INT128_STORE(x1++, scalar);
SSE_LOOP2(i, n) x[i] = alpha;
}
template<> void Add(const int n, const float* a, const float* b, float* y){
template<> void Add(const int n, const float* a, const float* b, float* y) {
__m128 x1, y1, z1;
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(a + i);
......@@ -33,7 +33,7 @@ namespace sse {
SSE_LOOP2(i, n) y[i] = a[i] + b[i];
}
template<> void Sub(const int n, const float* a, const float* b, float* y){
template<> void Sub(const int n, const float* a, const float* b, float* y) {
__m128 x1, y1, z1;
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(a + i);
......@@ -44,7 +44,7 @@ namespace sse {
SSE_LOOP2(i, n) y[i] = a[i] - b[i];
}
template<> void Mul(const int n, const float* a, const float* b, float* y){
template<> void Mul(const int n, const float* a, const float* b, float* y) {
__m128 x1, y1, z1;
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(a + i);
......@@ -55,7 +55,7 @@ namespace sse {
SSE_LOOP2(i, n) y[i] = a[i] * b[i];
}
template<> void Div(const int n, const float* a, const float* b, float* y){
template<> void Div(const int n, const float* a, const float* b, float* y) {
__m128 x1, y1, z1;
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(a + i);
......@@ -66,7 +66,7 @@ namespace sse {
SSE_LOOP2(i, n) y[i] = a[i] / b[i];
}
template<> void Scal(const int n, const float alpha, float* y){
template<> void Scal(const int n, const float alpha, float* y) {
__m128 y1, scalar = SSE_FP32_SCALAR(alpha);
SSE_LOOP1(i, n) {
y1 = SSE_FP32_LOAD(y + i);
......@@ -76,7 +76,7 @@ namespace sse {
SSE_LOOP2(i, n) y[i] *= alpha;
}
template<> void Scale(const int n, const float alpha, const float* x, float* y){
template<> void Scale(const int n, const float alpha, const float* x, float* y) {
__m128 x1, scalar = SSE_FP32_SCALAR(alpha);
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(x + i);
......@@ -87,7 +87,7 @@ namespace sse {
}
template<> void Axpy(const int n, float alpha, const float* x, float *y){
template<> void Axpy(const int n, float alpha, const float* x, float *y) {
__m128 x1, y1, scalar = SSE_FP32_SCALAR(alpha);
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(x + i);
......@@ -100,7 +100,7 @@ namespace sse {
}
template<> void Axpby(const int n, float alpha, const float* x,
const float beta, float *y){
const float beta, float *y) {
__m128 x1, y1, z1;
__m128 scalar1 = SSE_FP32_SCALAR(alpha);
__m128 scalar2 = SSE_FP32_SCALAR(beta);
......@@ -115,7 +115,7 @@ namespace sse {
SSE_LOOP2(i, n) y[i] = alpha * x[i] + beta* y[i];
}
template<> float ASum(const int n, const float *x){
template<> float ASum(const int n, const float *x) {
__m128 x1, sum = SSE_FP32_ZERO;
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(x + i);
......@@ -128,7 +128,7 @@ namespace sse {
return ret;
}
template<> void AddScalar(const int n, const float alpha, float* y){
template<> void AddScalar(const int n, const float alpha, float* y) {
__m128 y1, scalar = SSE_FP32_SCALAR(alpha);
SSE_LOOP1(i, n) {
y1 = SSE_FP32_LOAD(y + i);
......@@ -138,7 +138,7 @@ namespace sse {
SSE_LOOP2(i, n) y[i] += alpha;
}
template<> void MulScalar(const int n, const float alpha, float* y){
template<> void MulScalar(const int n, const float alpha, float* y) {
__m128 y1, scalar = SSE_FP32_SCALAR(alpha);
SSE_LOOP1(i, n) {
y1 = SSE_FP32_LOAD(y + i);
......@@ -148,7 +148,7 @@ namespace sse {
SSE_LOOP2(i, n) y[i] *= alpha;
}
template <> float Dot(const int n, const float* a, const float* b){
template <> float Dot(const int n, const float* a, const float* b) {
__m128 x1, y1, sum = SSE_FP32_ZERO;
SSE_LOOP1(i, n) {
x1 = SSE_FP32_LOAD(a + i);
......
......@@ -11,6 +11,7 @@ import sys
import time
import shutil
import tarfile
import numpy as np
from six.moves import range as xrange
import cv2
......@@ -78,7 +79,7 @@ def extract_images():
f.write(item)
def make_db(image_path, label_path, database_path):
def make_db(image_path, label_path, database_path, pad=0):
if os.path.isfile(label_path) is False:
raise ValueError('input path is empty or wrong.')
if os.path.isdir(database_path) is True:
......@@ -111,6 +112,12 @@ def make_db(image_path, label_path, database_path):
label = record[1]
img = cv2.imread(os.path.join(image_path ,path))
if pad > 0:
pad_img = np.zeros((img.shape[0] + 2 * pad,
img.shape[1] + 2 * pad, 3), dtype=np.uint8)
pad_img[pad : pad + img.shape[0],
pad : pad + img.shape[1], :] = img
img = pad_img
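# a zero border of `pad` pixels is added on each side, so an H x W image is
# stored as (H + 2 * pad) x (W + 2 * pad); handy if random crops back to
# H x W are taken at training time (an assumption about the intended use)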
result, imgencode = cv2.imencode('.jpg', img, encode_param)
datum = caffe_pb2.Datum()
......