io refactoring

Ting PAN
Commit 5d518b6c authored Aug 06, 2017 by Ting PAN
Showing with 310 additions and 237 deletions
Dragon/CMakeLists.txt
Dragon/include/core/context.h
Dragon/include/core/context_cuda.h
Dragon/include/core/tensor.h
Dragon/include/core/typeid.h
Dragon/include/core/workspace.h
Dragon/include/operators/common/scan_op.h
Dragon/include/operators/common/utils_op.h
Dragon/include/operators/vision/roi_align_op.h
Dragon/include/utils/caffemodel.h
Dragon/include/utils/filler.h
Dragon/include/utils/string.h
Dragon/modules/python/dragon.cc
Dragon/modules/python/dragon.h
Dragon/modules/python/py_mpi.h
Dragon/python/dragon/io/__init__.py
Dragon/python/dragon/vm/caffe/io/blob_fetcher.py → Dragon/python/dragon/io/blob_fetcher.py
Dragon/python/dragon/vm/caffe/io/minibatch.py → Dragon/python/dragon/io/data_batch.py
Dragon/python/dragon/vm/caffe/io/data_reader.py → Dragon/python/dragon/io/data_reader.py
Dragon/python/dragon/vm/caffe/io/data_transformer.py → Dragon/python/dragon/io/data_transformer.py
--- a/Dragon/CMakeLists.txt
+++ b/Dragon/CMakeLists.txt
@@ -24,7 +24,7 @@ set(3RDPARTY_DIR  ${PROJECT_SOURCE_DIR}/../3rdparty)
 set(PYTHON_DIR /usr/include/python2.7)  # prefer
 #set(PYTHON_DIR /usr/include/python3.x)  # optional, set specific version
 #set(ANACONDA_DIR /xxx/anaconda)  # optional, set specific version below if using py3
-set(NUMPY_DIR /xxx/numpy)  # require， root folder of numpy package
+set(NUMPY_DIR /xxx/numpy)  # require, root folder of numpy package

 # set CUDA compiling architecture
 set(CUDA_ARCH     -gencode arch=compute_20,code=sm_20

--- a/Dragon/include/core/context.h
+++ b/Dragon/include/core/context.h
@@ -52,7 +52,7 @@ class CPUContext{
    inline static void Delete(void* data) { free(data); }

    template<typename T, class DstContext, class SrcContext>
-    inline static void Copy(int n, T* dst, const T* src){
+    inline static void Copy(int n, T* dst, const T* src) {
        if (dst == src) return;
        //  only the basic types(e.g. int/float) can memcpy correctly
        if (std::is_fundamental<T>::value)

--- a/Dragon/include/core/context_cuda.h
+++ b/Dragon/include/core/context_cuda.h
@@ -119,7 +119,7 @@ class CUDAContext {
    inline static void Delete(void* data) { cudaFree(data); }

    template<typename T, class DstContext, class SrcContext>
-    static void Copy(int n, T* dst, const T* src){
+    static void Copy(int n, T* dst, const T* src) {
        if (dst == src) return;
        Memcpy<SrcContext, DstContext>(n * sizeof(T), (void*)dst, (const void*)src);
    }
@@ -148,7 +148,7 @@ class CUDAContext {
    }

 #ifdef WITH_CUDNN
-    cudnnHandle_t cudnn_handle(){
+    cudnnHandle_t cudnn_handle() {
        auto& handle = cuda_object_.cudnn_handle[gpu_id_];
        if (handle)  {
            return handle;

--- a/Dragon/include/core/tensor.h
+++ b/Dragon/include/core/tensor.h
@@ -77,7 +77,7 @@ class Tensor {
    inline TIndex offset(const vector<TIndex>& vec) {
        CHECK_LE(vec.size(), ndim());
        TIndex offset = 0;
-        for (int i = 0; i < ndim(); i++){
+        for (int i = 0; i < ndim(); i++) {
            offset = offset * dim(i);
            if (vec.size() > i) offset += vec[i];
        }
@@ -130,7 +130,7 @@ class Tensor {
    }

    template <class Context>
-    void* raw_mutable_data(const TypeMeta& meta){
+    void* raw_mutable_data(const TypeMeta& meta) {
        void* data_ptr;
        active_data_ptr<Context>(&data_ptr);
        if (meta_ == meta && data_ptr) {

--- a/Dragon/include/core/typeid.h
+++ b/Dragon/include/core/typeid.h
@@ -75,20 +75,20 @@ class TypeMeta {
    bool Match() const { return (id_ == Id<T>()); } 

    template <typename T>
-    static void Ctor(void* ptr, size_t n){
+    static void Ctor(void* ptr, size_t n) {
        T* typed_ptr = static_cast<T*>(ptr);
        for (unsigned int i = 0; i < n; i++) new(typed_ptr + i) T;
    }

    template <typename T>
-    static void Copy(const void* src, void* dst, size_t n){
+    static void Copy(const void* src, void* dst, size_t n) {
        const T* typed_src = static_cast<const T*>(src);
        T* typed_dst = static_cast<T*>(dst);
        for (unsigned int i = 0; i < n; i++) typed_dst[i] = typed_src[i];
    }

    template <typename T>
-    static void Dtor(void* ptr, size_t n){
+    static void Dtor(void* ptr, size_t n) {
        T* typed_ptr = static_cast<T*>(ptr);
        for (unsigned int i = 0; i < n; i++) typed_ptr[i].~T();
    }

--- a/Dragon/include/core/workspace.h
+++ b/Dragon/include/core/workspace.h
@@ -44,7 +44,7 @@ class Workspace{
        return tensor_map_.count(query) > 0; 
    }

-    inline Tensor* CreateTensor(const string& name){
+    inline Tensor* CreateTensor(const string& name) {
        string query = GetTensorName(name);
        if (!HasTensor(query))
            tensor_map_[query] = unique_ptr<Tensor>(new Tensor(query));
@@ -143,7 +143,7 @@ class Workspace{
        return graph_map_[graph_name]->Run(include, exclude);
    }

-    inline vector<string> GetGraphs(){
+    inline vector<string> GetGraphs() {
        vector<string> names;
        for (auto& it : graph_map_) names.push_back(it.first);
        return names;

--- a/Dragon/include/operators/common/scan_op.h
+++ b/Dragon/include/operators/common/scan_op.h
--- a/Dragon/include/operators/common/utils_op.h
+++ b/Dragon/include/operators/common/utils_op.h
@@ -25,9 +25,9 @@ class AccuracyOp final: public Operator<Context> {
 public:
    AccuracyOp(const OperatorDef& op_def, Workspace* ws)
        : Operator<Context>(op_def, ws),
-          top_k(OperatorBase::GetSingleArg<int>("top_k", 1)){
+          top_k(OperatorBase::GetSingleArg<int>("top_k", 1)) {
        vector<int> args = OperatorBase::GetRepeatedArg<int>("ignore_labels");
-        if (args.size()){
+        if (args.size()) {
            ignore_labels.Reshape(vector<TIndex>(1, args.size()));
            int* ignore_data = ignore_labels.mutable_data<int, CPUContext>();
            for (int i = 0; i < args.size(); i++) ignore_data[i] = args[i];

--- a/Dragon/include/operators/vision/roi_align_op.h
+++ b/Dragon/include/operators/vision/roi_align_op.h
@@ -39,7 +39,7 @@ class ROIAlignGradientOp : public Operator<Context> {
        : Operator<Context>(op_def, ws),
          pool_h(OperatorBase::GetSingleArg<int>("pool_h", 0)),
          pool_w(OperatorBase::GetSingleArg<int>("pool_w", 0)),
-          spatial_scale(OperatorBase::GetSingleArg<float>("spatial_scale", 1.0)){
+          spatial_scale(OperatorBase::GetSingleArg<float>("spatial_scale", 1.0)) {
        CHECK_GT(pool_h, 0) << "\npool_h must > 0";
        CHECK_GT(pool_w, 0) << "\npool_w must > 0";
    }

--- a/Dragon/include/utils/caffemodel.h
+++ b/Dragon/include/utils/caffemodel.h
@@ -54,7 +54,7 @@ inline void LoadCaffeModel(string file, string scope, Workspace* ws) {
    ReadProtoFromBinaryFile(file.c_str(), &net_param);
    LOG(INFO) << "Restore From Model @: " << file << "......";
    LOG(INFO) << "Model Format: CaffeModel";
-    for (int i = 0; i < net_param.layer_size(); i++){
+    for (int i = 0; i < net_param.layer_size(); i++) {
        const LayerParameter& layer = net_param.layer(i);
        const string& layer_name = layer.name();
        string prefix = scope + layer_name + "@param";

--- a/Dragon/include/utils/filler.h
+++ b/Dragon/include/utils/filler.h
--- a/Dragon/include/utils/string.h
+++ b/Dragon/include/utils/string.h
@@ -22,7 +22,7 @@ inline std::vector<std::string> SplitString(const std::string& str,
    std::vector<std::string> ret;
    std::string temp(str);
    size_t pos;
-    while (pos = temp.find(c), pos != std::string::npos){
+    while (pos = temp.find(c), pos != std::string::npos) {
        ret.push_back(temp.substr(0, pos));
        temp.erase(0, pos + 1);
    }

--- a/Dragon/modules/python/dragon.cc
+++ b/Dragon/modules/python/dragon.cc
@@ -31,7 +31,7 @@ const TypeMeta& NumpyTypeToDragon(int numpy_type) {
            { NPY_FLOAT16, TypeMeta::Make<float16>() },
            { NPY_UINT8, TypeMeta::Make<uint8_t>() }};

-    static TypeMeta unknown_type;  //  id = 0
+    static TypeMeta unknown_type;
    return dragon_type_map.count(numpy_type) ? dragon_type_map[numpy_type] : unknown_type;
 }

@@ -50,7 +50,7 @@ REGISTER_TENSOR_FETCHER(TypeMeta::Id<NumpyFetcher>(), NumpyFetcher);
 REGISTER_TENSOR_FETCHER(TypeMeta::Id<StringFetcher>(), StringFetcher);
 REGISTER_TENSOR_FEEDER(TypeMeta::Id<NumpyFeeder>(), NumpyFeeder);

-extern "C"{
+extern "C" {

 PyObject* RegisteredOperatorsCC(PyObject* self, PyObject* args) {
    set<string> all_keys;
@@ -123,7 +123,7 @@ bool SwitchWorkspaceInternal(const string& name, const bool create_if_missing) {
    } else if (create_if_missing) {
        unique_ptr<Workspace> new_workspace(new Workspace());
        g_workspace = new_workspace.get();
-        g_workspaces[name] = std::move(new_workspace);  //  ???
+        g_workspaces[name] = std::move(new_workspace);
        g_current_workspace = name;
        return true;
    } else {

--- a/Dragon/modules/python/dragon.h
+++ b/Dragon/modules/python/dragon.h
@@ -33,7 +33,7 @@ inline PyObject* StdStringToPyBytes(const std::string& str) {
    return PyBytes_FromStringAndSize(str.c_str(), str.size());
 }
 template <typename T>
-inline void MakeStringInternal(std::stringstream& ss, const T& t){ ss << t; }
+inline void MakeStringInternal(std::stringstream& ss, const T& t) { ss << t; }

 template <typename T,typename ... Args>
 inline void MakeStringInternal(std::stringstream& ss, const T& t, const Args& ... args) {
@@ -124,7 +124,7 @@ class NumpyFeeder : public TensorFeederBase {
                   Tensor* tensor) override {
        PyArrayObject* array = PyArray_GETCONTIGUOUS(original_array);
        const TypeMeta& meta = NumpyTypeToDragon(PyArray_TYPE(array));
-        if (meta.id() == 0){
+        if (meta.id() == 0) {
            PyErr_SetString(PyExc_TypeError, "numpy data type is not supported.");
            return nullptr;
        }

--- a/Dragon/modules/python/py_mpi.h
+++ b/Dragon/modules/python/py_mpi.h
@@ -63,11 +63,11 @@ inline PyObject* MPICreateGroupCC(PyObject* self, PyObject* args) {

    //  check inclue ranks
    int size = PyList_Size(incl);
-    if (size > 0){
+    if (size > 0) {
        all_ranks.clear();
        unique_ptr<int> incl_ranks(new int[size]);
        int* ranks = incl_ranks.get();
-        for (int i = 0; i < size; i++){
+        for (int i = 0; i < size; i++) {
            ranks[i] = _PyInt_AsInt(PyList_GetItem(incl, i));
            all_ranks.insert(ranks[i]);
        }
@@ -81,7 +81,7 @@ inline PyObject* MPICreateGroupCC(PyObject* self, PyObject* args) {
        all_ranks.clear(); Set<int> tmp;
        unique_ptr<int> excl_ranks(new int[size]);
        int* ranks = excl_ranks.get();
-        for (int i = 0; i < size; i++){
+        for (int i = 0; i < size; i++) {
            ranks[i] = _PyInt_AsInt(PyList_GetItem(excl, i));
            tmp.insert(ranks[i]);
        }
@@ -97,7 +97,7 @@ inline PyObject* MPICreateGroupCC(PyObject* self, PyObject* args) {
    if (local_comm != MPI_COMM_NULL) {
        int world_rank, local_size;
        MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
-        if (world_rank == local_root){
+        if (world_rank == local_root) {
            MPI_Comm_size(local_comm, &local_size);
            std::stringstream ss;
            ss << "Rank[" << world_rank << "]: "

--- a/Dragon/python/dragon/io/__init__.py
+++ b/Dragon/python/dragon/io/__init__.py
+# --------------------------------------------------------
+# Dragon
+# Copyright(c) 2017 SeetaTech
+# Written by Ting Pan
+# --------------------------------------------------------
+
+from .data_batch import DataBatch
+
+
--- a/Dragon/python/dragon/vm/caffe/io/blob_fetcher.py
+++ b/Dragon/python/dragon/vm/caffe/io/blob_fetcher.py
 # --------------------------------------------------------
-# Caffe for Dragon
+# Dragon
 # Copyright(c) 2017 SeetaTech
 # Written by Ting Pan
 # --------------------------------------------------------
@@ -10,7 +10,7 @@ from six.moves import range as xrange

 from dragon.config import logger

-from .__init__ import GetProperty
+from .utils import GetProperty

 class BlobFetcher(Process):
    def __init__(self, **kwargs):
@@ -30,16 +30,16 @@ class BlobFetcher(Process):
        atexit.register(cleanup)

    def im_list_to_blob(self):
-        datum = self.Q_in.get()  # (h, w, BGR)
-        im = datum[0]; h, w, c = im.shape
-        im_blob = np.zeros((self._batch_size, h, w, c), dtype=np.float32)
+        datum = self.Q_in.get()
+        im_blob = []
        label_blob = np.zeros((self._batch_size, len(datum[1])), dtype=np.float32) \
                        if len(datum) > 1 else None
        for i in xrange(0, self._batch_size):
-            im_blob[i, 0:h, 0:w, :] = datum[0]
+            im_blob.append(datum[0])
            if label_blob is not None: label_blob[i, :] = datum[1]
            if i != self._batch_size - 1: datum = self.Q_in.get()
        channel_swap = (0, 3, 1, 2)
+        im_blob = np.array(im_blob, dtype=np.float32)
        im_blob = im_blob.transpose(channel_swap)
        return (im_blob, label_blob)


--- a/Dragon/python/dragon/vm/caffe/io/minibatch.py
+++ b/Dragon/python/dragon/vm/caffe/io/minibatch.py
 # --------------------------------------------------------
-# Caffe for Dragon
+# Dragon
 # Copyright(c) 2017 SeetaTech
 # Written by Ting Pan
 # --------------------------------------------------------

+import sys
 import time
 import pprint
 from multiprocessing import Queue
+if sys.version_info >= (3,0):
+    from queue import Queue as Queue2
+else:
+    from Queue import Queue as Queue2
+import threading
 from six.moves import range as xrange

 import dragon.core.mpi as mpi
@@ -16,10 +22,11 @@ from .data_reader import DataReader
 from .data_transformer import DataTransformer
 from .blob_fetcher import BlobFetcher

-from .__init__ import GetProperty
+from .utils import GetProperty

-class DataBatch(object):
+class DataBatch(threading.Thread):
    def __init__(self, **kwargs):
+        super(DataBatch, self).__init__()

        """DataBatch use Triple-Buffering to speed up"""

@@ -35,10 +42,10 @@ class DataBatch(object):
        kwargs['group_size'] = group_size

        # configuration
-        self._prefetch = GetProperty(kwargs, 'prefetch', 40)
+        self._prefetch = GetProperty(kwargs, 'prefetch', 5)
        self._num_readers = GetProperty(kwargs, 'num_readers', 1)
        self._num_transformers = GetProperty(kwargs, 'num_transformers', -1)
-        self._num_fetchers = GetProperty(kwargs, 'num_fetchers', 3)
+        self._num_fetchers = GetProperty(kwargs, 'num_fetchers', 1)

        # default policy
        if self._num_transformers == -1:
@@ -60,6 +67,7 @@ class DataBatch(object):
        self.Q_level_1 = Queue(self._prefetch * self._num_readers * self._batch_size)
        self.Q_level_2 = Queue(self._prefetch * self._num_readers * self._batch_size)
        self.Q_level_3 = Queue(self._prefetch * self._num_readers)
+        self.Q_level_4 = Queue2(self._prefetch * self._num_readers)

        # init readers
        self._readers = []
@@ -102,11 +110,16 @@ class DataBatch(object):
            self._fetchers.append(fetcher)
            time.sleep(0.1)

+        self.daemon = True
+        self.start()
        #self.echo()

-    @property
-    def blobs(self):
-        return self.Q_level_3.get()
+    def run(self):
+        while True:
+            self.Q_level_4.put(self.Q_level_3.get())
+
+    def get(self):
+        return self.Q_level_4.get()

    def echo(self):
        logger.info('---------------------------------------------------------')

--- a/Dragon/python/dragon/vm/caffe/io/data_reader.py
+++ b/Dragon/python/dragon/vm/caffe/io/data_reader.py
 # --------------------------------------------------------
-# Caffe for Dragon
+# Dragon
 # Copyright(c) 2017 SeetaTech
 # Written by Ting Pan
 # --------------------------------------------------------
@@ -12,8 +12,7 @@ import dragon.config as config
 from dragon.config import logger
 from dragon.tools.db import LMDB

-from .__init__ import GetProperty
-
+from .utils import GetProperty

 class DataReader(Process):
    def __init__(self, **kwargs):

--- a/Dragon/python/dragon/vm/caffe/io/data_transformer.py
+++ b/Dragon/python/dragon/vm/caffe/io/data_transformer.py
 # --------------------------------------------------------
-# Caffe for Dragon
+# Dragon
 # Copyright(c) 2017 SeetaTech
 # Written by Ting Pan
 # --------------------------------------------------------
@@ -12,7 +12,7 @@ import dragon.config as config
 from dragon.config import logger
 import dragon.vm.caffe.proto.caffe_pb2 as pb

-from .__init__ import GetProperty
+from .utils import GetProperty

 try:
    import cv2
@@ -131,5 +131,3 @@ class DataTransformer(Process):
        while True:
            serialized = self.Q_in.get()
            self.Q_out.put(self.transform_image_label(serialized))
\ No newline at end of file
-
-
--- a/Dragon/python/dragon/vm/caffe/io/__init__.py
+++ b/Dragon/python/dragon/vm/caffe/io/__init__.py
 # --------------------------------------------------------
-# Caffe for Dragon
+# Dragon
 # Copyright(c) 2017 SeetaTech
 # Written by Ting Pan
 # --------------------------------------------------------

--- a/Dragon/python/dragon/operators/custom/__init__.py
+++ b/Dragon/python/dragon/operators/custom/__init__.py
--- a/Dragon/python/dragon/examples/custom_ops/data_process.py
+++ b/Dragon/python/dragon/examples/custom_ops/data_process.py
--- a/Dragon/python/dragon/examples/custom_ops/vec_mult.py
+++ b/Dragon/python/dragon/examples/custom_ops/vec_mult.py
--- a/Dragon/python/dragon/vm/caffe/io/data_layer.py
+++ b/Dragon/python/dragon/vm/caffe/io/data_layer.py
 # --------------------------------------------------------
-# Caffe for Dragon
+# Dragon
 # Copyright(c) 2017 SeetaTech
 # Written by Ting Pan
 # --------------------------------------------------------

-import dragon.vm.caffe as caffe
 import dragon.core.workspace as ws
-from .minibatch import DataBatch
+from dragon.io.data_batch import DataBatch

-class DataLayer(caffe.Layer):
-    def setup(self, bottom, top):
+class MiniBatchOp(object):
+
+    def setup(self, inputs, outputs):
        kwargs = eval(self.param_str)
        self._data_batch = DataBatch(**kwargs)

-    def forward(self, bottom, top):
-        blobs = self._data_batch.blobs
+    def run(self, inputs, outputs):
+        blobs = self._data_batch.get()
        for idx, blob in enumerate(blobs):
-            ws.FeedTensor(top[idx], blob)
\ No newline at end of file
+            ws.FeedTensor(outputs[idx], blob)
\ No newline at end of file
--- a/Dragon/python/dragon/operators/data.py
+++ b/Dragon/python/dragon/operators/data.py
@@ -10,7 +10,6 @@ from dragon.operators.utils import Run

 def LMDBData(**kwargs):
    """
-    :param kwargs:                   a dict of imagenet data param
    :param --> mean_value:           a list of mean values for channles [B-G-R]
    :param --> source:               a str of the images root directory
    :param --> imageset:             a str of text file contains image name / label
@@ -30,8 +29,8 @@ def LMDBData(**kwargs):
    args = locals(); kwargs = args['kwargs']
    del args['kwargs']; kwargs = dict(args, **kwargs)

-    kwargs['module'] =  'dragon.vm.caffe.io.data_layer'
-    kwargs['op'] = 'DataLayer'
+    kwargs['module'] =  'dragon.operators.custom.minibatch'
+    kwargs['op'] = 'MiniBatchOp'

    return Run([], param_str=str(kwargs), nout=2, **kwargs)


--- a/Dragon/python/dragon/tools/im2lmdb.py
+++ b/Dragon/python/dragon/tools/im2lmdb.py
+# --------------------------------------------------------
+# Dragon
+# Copyright(c) 2017 SeetaTech
+# Written by Ting Pan
+# --------------------------------------------------------
+
+""" Generate LMDB from images """
+
+import os
+import sys
+import time
+import shutil
+import argparse
+
+import cv2
+try:
+    import numpy as np
+except: pass
+
+from dragon.tools.db import LMDB
+from dragon.vm.caffe.proto import caffe_pb2
+
+def resize_image(im, resize):
+    """Resize an image so that its shorter edge equals ``resize``.
+
+    :param im:      image array; ``im.shape[0]`` is read as the height and
+                    ``im.shape[1]`` as the width
+    :param resize:  target length (in pixels) of the shorter edge
+    :return:        the resized image returned by ``cv2.resize``
+    """
+    # cv2.resize takes the target size as (width, height) and needs integers;
+    # floor division keeps the result integral on both Python 2 and Python 3
+    if im.shape[0] > im.shape[1]:
+        newsize = (resize, im.shape[0] * resize // im.shape[1])
+    else:
+        newsize = (im.shape[1] * resize // im.shape[0], resize)
+    im = cv2.resize(im, newsize)
+    return im
+
+def make_db(args):
+    """Build an LMDB of JPEG-encoded images from an image list file.
+
+    Every non-blank line of ``args.list`` is split on whitespace; the first
+    field is the image path relative to ``args.root`` and the second field is
+    the integer label. Each image is optionally resized and zero-padded,
+    encoded as JPEG and stored as a serialized ``caffe_pb2.Datum`` under a
+    zero-filled running index. The keys 'size' and 'zfill' are written last,
+    and the list file is copied into the database folder.
+
+    :param args:  namespace providing ``root``, ``list``, ``database``,
+                  ``zfill``, ``resize``, ``pad``, ``quality`` and ``shuffle``
+    :raises ValueError: if the list file is missing, the database folder
+                        already exists, or an image cannot be read / encoded
+    """
+    if os.path.isfile(args.list) is False:
+        raise ValueError('the path of image list is invalid.')
+    if os.path.isdir(args.database) is True:
+        raise ValueError('the database is already exist or invalid.')
+
+    print('start time: {0}'.format(
+        time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime())))
+
+    # read the list once and close it; blank lines carry no record
+    with open(args.list, 'r') as input_file:
+        records = [line for line in input_file if line.strip()]
+    total_line = len(records)
+    if args.shuffle:
+        import random
+        random.shuffle(records)
+
+    db = LMDB(max_commit=10000)
+    db.open(args.database, mode='w')
+
+    count = 0
+    zfill_flag = '{0:0%d}' % (args.zfill)
+
+    encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), args.quality]
+
+    start_time = time.time()
+
+    for record in records:
+        count += 1
+        # report progress and flush pending writes every 10000 records
+        if count % 10000 == 0:
+            now_time = time.time()
+            print('{0} / {1} in {2:.2f} sec'.format(
+                count, total_line, now_time - start_time))
+            db.commit()
+
+        record = record.split()
+        path = record[0]
+        label = record[1]
+
+        img_path = os.path.join(args.root, path)
+        img = cv2.imread(img_path)
+        # cv2.imread signals failure by returning None instead of raising
+        if img is None:
+            raise ValueError('failed to read image: {0}'.format(img_path))
+        if args.resize > 0:
+            img = resize_image(img, args.resize)
+        if args.pad > 0:
+            # paste the image into the center of a zero canvas
+            pad_img = np.zeros((img.shape[0] + 2 * args.pad,
+                                img.shape[1] + 2 * args.pad, 3), dtype=img.dtype)
+            pad_img[args.pad : args.pad + img.shape[0],
+                    args.pad : args.pad + img.shape[1], :] = img
+            img = pad_img
+        result, imgencode = cv2.imencode('.jpg', img, encode_param)
+        if not result:
+            raise ValueError('failed to encode image: {0}'.format(img_path))
+
+        datum = caffe_pb2.Datum()
+        datum.height, datum.width, datum.channels = img.shape
+        datum.label = int(label)
+        datum.encoded = True
+        # tobytes() is the non-deprecated spelling of tostring()
+        datum.data = imgencode.tobytes()
+        db.put(zfill_flag.format(count - 1), datum.SerializeToString())
+
+    now_time = time.time()
+    print('{0} / {1} in {2:.2f} sec'.format(count, total_line, now_time - start_time))
+    db.put('size', str(count))
+    db.put('zfill', str(args.zfill))
+    db.commit()
+    db.close()
+
+    shutil.copy(args.list, args.database + '/image_list.txt')
+    end_time = time.time()
+    print('{0} images have been stored in the database.'.format(total_line))
+    print('This task finishes within {0:.2f} seconds.'.format(end_time - start_time))
+    # divide by a float so the size is not truncated to whole MB on Python 2
+    print('The size of database is {0} MB.'.
+          format(os.path.getsize(args.database + '/data.mdb') / 1e6))
+
+
+def _str2bool(value):
+    """Convert a command line token such as 'true' / 'False' / '0' into a bool."""
+    lowered = str(value).strip().lower()
+    if lowered in ('true', 't', 'yes', 'y', '1'):
+        return True
+    if lowered in ('false', 'f', 'no', 'n', '0'):
+        return False
+    raise argparse.ArgumentTypeError('expected a boolean value, got: {0}'.format(value))
+
+
+def parse_args():
+    """Parse the command line options of the LMDB generator.
+
+    Prints the help text and exits with status 1 when fewer than three
+    command line tokens follow the script name.
+
+    :return:  the ``argparse.Namespace`` holding root, list, database, zfill,
+              resize, pad, quality and shuffle
+    """
+    parser = argparse.ArgumentParser(description='Create LMDB from images for classification.')
+    parser.add_argument('--root', help='the root folder of raw images')
+    parser.add_argument('--list', help='the filepath of image list')
+    parser.add_argument('--database', help='the filepath of database')
+    parser.add_argument('--zfill', type=int, default=8, help='zfill for the key of database')
+    parser.add_argument('--resize', type=int, default=0, help='resize the shorter edge of image to the newsize')
+    parser.add_argument('--pad', type=int, default=0, help='zero-pad the image')
+    parser.add_argument('--quality', type=int, default=95, help='JPEG quality for encoding, 1-100')
+    # type=bool would turn every non-empty string (even 'False') into True
+    parser.add_argument('--shuffle', type=_str2bool, default=True, help='randomize the order in list file True')
+
+    if len(sys.argv) < 4:
+        parser.print_help()
+        sys.exit(1)
+
+    args = parser.parse_args()
+    return args
+
+if __name__ == '__main__':
+    # script entry point: parse the command line, then build the database
+    make_db(parse_args())
\ No newline at end of file
--- a/Dragon/python/dragon/utils.py
+++ b/Dragon/python/dragon/utils.py
-# --------------------------------------------------------
-# Dragon
-# Copyright(c) 2017 SeetaTech
-# Written by Ting Pan
-# --------------------------------------------------------
-
-from google.protobuf.message import Message
-from protos import dragon_pb2 as pb
-import numpy as np
-
-def MakeArgument(key, value):
-    argument = pb.Argument()
-    argument.name = key
-    if type(value) is float: argument.f = value
-    elif type(value) is int : argument.i = value
-    elif type(value) is np.int64: argument.i64 = int(value)
-    elif type(value) is str: argument.s = value
-    elif type(value) is unicode: argument.s = value
-    elif type(value) is bool: argument.b = value
-    elif isinstance(value, Message): argument.s = value.SerializeToString()
-    elif all(type(v) is float for v in value): argument.floats.extend(value)
-    elif all(type(v) is int for v in value): argument.ints.extend(value)
-    elif all(type(v) is str for v in value): argument.strings.extend(value)
-    elif all(type(v) is unicode or type(v) is str for v in value): argument.strings.extend(value)
-    elif all(isinstance(v,Message) for v in value):
-        argument.strings.extend([v.SerializeToString() for v in value])
-    else: raise ValueError('unknown argument type: key={} value={} value type={}' \
-                           .format(key,value,type(value)))
-    return argument
-
-def MakeOperatorDef(op_type, inputs, outputs, name='',
-                   device_option=None, arg=None, engine=None, **kwargs):
-    operator = pb.OperatorDef()
-    operator.type = op_type
-    operator.name = name
-    operator.input.extend([str(tensor) for tensor in inputs])
-    operator.output.extend([str(tensor) for tensor in outputs])
-    if device_option is not None:
-        operator.device_option.CopyFrom(device_option)
-    if engine is not None:
-        operator.engine = engine
-    if 'random_seed' in kwargs:
-        operator.device_option.random_seed = kwargs['random_seed']
-        del kwargs['random_seed']
-    if arg is not None:
-        operator.arg.extend(arg)
-    for k,v in kwargs.items():
-        if v is None: continue
-        operator.arg.add().CopyFrom(MakeArgument(k,v))
-    return operator
-
-def MakeDeviceOption(device_type, gpu_id, rng_seed = None):
-    """ return a DeviceOption """
-    option = pb.DeviceOption()
-    option.device_type = device_type
-    option.gpu_id = gpu_id
-    if rng_seed is not None: option.random_seed = rng_seed
-    return option
-
-#  fix the python stdout
-class Unbuffered(object):
-   def __init__(self, stream):
-       self.stream = stream
-   def write(self, data):
-       self.stream.write(data)
-       self.stream.flush()
-   def __getattr__(self, attr):
-       return getattr(self.stream, attr)
-# clear the stdout buffer for mpi(c++ & python)
-import sys
-sys.stdout = Unbuffered(sys.stdout)
\ No newline at end of file
--- a/Dragon/python/dragon/vm/caffe/proto/caffe.proto
+++ b/Dragon/python/dragon/vm/caffe/proto/caffe.proto
@@ -221,7 +221,7 @@ message SolverParameter {

  // RMSProp decay value
  // MeanSquare(t) = rms_decay*MeanSquare(t-1) + (1-rms_decay)*SquareGradient(t)
-  optional float rms_decay = 38;
+  optional float rms_decay = 38 [default = 0.99];

  // If true, print information about the state of the net that may help with
  // debugging learning problems.
@@ -676,7 +676,7 @@ message DataParameter {
  optional bool force_encoded_color = 9 [default = false];
  // Prefetch queue (Number of batches to prefetch to host memory, increase if
  // data access bandwidth varies).
-  optional uint32 prefetch = 10 [default = 40];
+  optional uint32 prefetch = 10 [default = 5];
 }

 message DropoutParameter {

--- a/Dragon/python/dragon/vm/caffe/proto/caffe_pb2.py
+++ b/Dragon/python/dragon/vm/caffe/proto/caffe_pb2.py
--- a/Dragon/src/core/context.cc
+++ b/Dragon/src/core/context.cc
@@ -10,7 +10,7 @@ CUDAObject CUDAContext::cuda_object_;

 //  cpu <- gpu
 template<> void CPUContext::Memcpy<CPUContext, CUDAContext>(
-    size_t nbytes, void* dst, const void* src){
+    size_t nbytes, void* dst, const void* src) {
 #ifdef WITH_CUDA
    CUDAContext ctx(POINTER_DEVICE(src));
    ctx.Memcpy<CPUContext, CUDAContext>(nbytes, dst, src);
@@ -21,7 +21,7 @@ template<> void CPUContext::Memcpy<CPUContext, CUDAContext>(

 //  gpu <- cpu
 template<> void CPUContext::Memcpy<CUDAContext, CPUContext>(
-    size_t nbytes, void* dst, const void* src){
+    size_t nbytes, void* dst, const void* src) {
 #ifdef WITH_CUDA
        CUDAContext ctx(POINTER_DEVICE(dst));
        ctx.Memcpy<CUDAContext, CPUContext>(nbytes, dst, src);

--- a/Dragon/src/core/graph.cc
+++ b/Dragon/src/core/graph.cc
@@ -151,13 +151,13 @@ GraphDef Graph::Prune(const GraphDef& graph_def) {
        OperatorDef op_def;
        op_def.CopyFrom(graph_def.op(it));
        //  handle inputs
-        for (int i = 0; i < graph_def.op(it).input_size(); i++){
+        for (int i = 0; i < graph_def.op(it).input_size(); i++) {
            string input = graph_def.op(it).input(i);
            if (!colored_[input] || !outputs.count(input))
                *op_def.mutable_input(i) = "ignore";
        }
        //  handle outputs
-        for (int i = 0; i < graph_def.op(it).output_size(); i++){
+        for (int i = 0; i < graph_def.op(it).output_size(); i++) {
            string output = graph_def.op(it).output(i);
            if (!colored_[output]) *op_def.mutable_output(i) = "ignore";
            else outputs.insert(op_def.output(i));

--- a/Dragon/src/core/graph_gradient.cc
+++ b/Dragon/src/core/graph_gradient.cc
@@ -23,7 +23,7 @@ CheckTuple GraphGradientMaker::CheckMissingGrad(OperatorDef* forward_op) {
                inputs_to_grads_[output] = g_output;

            //  consider generate virtual grad
-            else if (targets_set_.count(output) && g_output != "ignore"){
+            else if (targets_set_.count(output) && g_output != "ignore") {
                gen_grads.push_back({ output, idx });
                inputs_to_grads_[output] = g_output;
            }
@@ -50,7 +50,7 @@ GraphDef GraphGradientMaker::Make() {
    Set<string> all_split_grads;

    // PLAY for the forward
-    for (auto& op : forward_def_.op()){
+    for (auto& op : forward_def_.op()) {
        if (NoGradientRegistry()->Has(op.type())) continue;
        for (auto& input : op.input()) inputs_count[input]++;
    }
@@ -73,17 +73,17 @@ GraphDef GraphGradientMaker::Make() {
        Gradient grad = MakeGradientForOp(*op, g_outputs);

        // replace terms
-        for (auto& g_op : grad.ops){
+        for (auto& g_op : grad.ops) {
            g_op.set_name(GetOperatorName());
-            for (int i = 0; i < g_op.input_size(); i++){
+            for (int i = 0; i < g_op.input_size(); i++) {
                string* input = g_op.mutable_input(i);
                if (terms_.count(*input)) *input = terms_[*input];
            }
-            for (int i = 0; i < g_op.output_size(); i++){
+            for (int i = 0; i < g_op.output_size(); i++) {
                string* output = g_op.mutable_output(i);
                if (terms_.count(*output)) *output = terms_[*output];
            }
-            for (int i = 0; i < grad.g_inputs.size(); i++){
+            for (int i = 0; i < grad.g_inputs.size(); i++) {
                if (terms_.count(grad.g_inputs[i]))
                    grad.g_inputs[i] = terms_[grad.g_inputs[i]];
            }
@@ -106,14 +106,14 @@ GraphDef GraphGradientMaker::Make() {
                    string split_name = *output + "_autosplit_" + str(grads_count[*output]++);
                    if (!is_skip) all_split_grads.insert(split_name);
                    //  gather
-                    if (grads_count[*output] == inputs_count[original_name]){
+                    if (grads_count[*output] == inputs_count[original_name]) {
                        gather_op = new OperatorDef();
                        gather_op->set_name(GetOperatorName());
                        gather_op->set_type("GradientGather");
                        gather_op->add_output(*output);
                        if (g_op.has_device_option())
                            gather_op->mutable_device_option()->CopyFrom(g_op.device_option());
-                        for (int j = 0; j < grads_count[*output]; j++){
+                        for (int j = 0; j < grads_count[*output]; j++) {
                            string key = *output + "_autosplit_" + str(j);
                            if (all_split_grads.count(key)) gather_op->add_input(key);
                        }

--- a/Dragon/src/core/operator.cc
+++ b/Dragon/src/core/operator.cc
@@ -66,7 +66,7 @@ DEFINE_REGISTRY(GradientRegistry, GradientMakerBase, const OperatorDef&, const v
 DEFINE_REGISTRY(NoGradientRegistry, GradientMakerBase, const OperatorDef&, const vector<string>&);

 #define INSTANTIATE_GET_SINGLE_ARGUMENT(T, fieldname) \
-template <> T OperatorBase::GetSingleArg(const string& name, const T& default_value){ \
+template <> T OperatorBase::GetSingleArg(const string& name, const T& default_value) { \
    if(args_.count(name) == 0) { \
        return default_value; \
    } \
@@ -82,7 +82,7 @@ INSTANTIATE_GET_SINGLE_ARGUMENT(int64_t, i64);


 #define INSTANTIATE_GET_REPEATED_ARGUMENT(T, fieldname) \
-template<> vector<T> OperatorBase::GetRepeatedArg<T>(const string& name){ \
+template<> vector<T> OperatorBase::GetRepeatedArg<T>(const string& name) { \
    if(args_.count(name) == 0) return vector<T>(); \
    vector<T> values; \
    for(const auto& v : args_[name]->fieldname()) values.push_back(v); \

--- a/Dragon/src/core/operator_schema.cc
+++ b/Dragon/src/core/operator_schema.cc
@@ -17,7 +17,7 @@ bool OpSchema::Verify(const OperatorDef& def) const {
    }
    for (int in = 0; in < def.input_size(); in++) {
        if (def.input(in) == "ignore") continue;
-        for (int out = 0; out < def.output_size(); out++){
+        for (int out = 0; out < def.output_size(); out++) {
            if (def.output(out) == "ignore") continue;
            if (def.input(in) == def.output(out) && (!CheckInplace(in, out)))
                LOG(FATAL) << "[" << def.name() << "] input("

--- a/Dragon/src/operators/activation/cudnn_relu_op.cc
+++ b/Dragon/src/operators/activation/cudnn_relu_op.cc
--- a/Dragon/src/operators/activation/dropout_op.cc
+++ b/Dragon/src/operators/activation/dropout_op.cc
@@ -71,10 +71,9 @@ void DropoutGradientOp<Context>::RunOnDevice() {

 template <class Context>
 void DropoutGradientOp<Context>::ClearAfterRun() {
-    ws()->ReleaseBuffer(mask);
+    ws()->ReleaseBuffer(mask, true);
 }

-
 DEPLOY_CPU(DropoutGradient);
 #ifdef WITH_CUDA
 DEPLOY_CUDA(DropoutGradient);

--- a/Dragon/src/operators/arithmetic/dot_op.cc
+++ b/Dragon/src/operators/arithmetic/dot_op.cc
@@ -125,7 +125,7 @@ void DotGradientOp<Context>::GemvRunWithType() {
 }

 template <class Context>
-void DotGradientOp<Context>::RunOnDevice(){
+void DotGradientOp<Context>::RunOnDevice() {
    output(0)->ReshapeLike(input(0));
    output(1)->ReshapeLike(input(1));


--- a/Dragon/src/operators/arithmetic/eltwise_op.cc
+++ b/Dragon/src/operators/arithmetic/eltwise_op.cc
@@ -65,7 +65,7 @@ void EltwiseGradientOp<Context>::SumRunWithType() {
    auto* dYdata = input(-1).template data<T, Context>();
    TIndex count = input(-1).count();

-    for (int i = 0; i < OutputSize(); i++){
+    for (int i = 0; i < OutputSize(); i++) {
        if (output(i)->name() == "ignore") continue;
        auto* dXdata = output(i)->template mutable_data<T, Context>();
        if (coeffs[i] == float(1)) {

--- a/Dragon/src/operators/arithmetic/gram_matrix_op.cc
+++ b/Dragon/src/operators/arithmetic/gram_matrix_op.cc
@@ -8,7 +8,7 @@ template <class Context> template <typename T>
 void GramMatrixOp<Context>::RunWithType() {
    auto* Xdata = input(0).template data<T, Context>();
    auto* Ydata = output(0)->template mutable_data<T, Context>();
-    for (int i = 0; i < outer_dim; i++){
+    for (int i = 0; i < outer_dim; i++) {
        math::Gemm<T, Context>(CblasNoTrans, CblasTrans,
            dim, dim, inner_dim, 1.0, Xdata, Xdata, 0.0, Ydata);
        Xdata += x_offset;
@@ -40,7 +40,7 @@ void GramMatrixGradientOp<Context>::RunWithType() {
    auto* dYdata = input(-1).template data<T, Context>();
    auto* Xdata = input(0).template data<T, Context>();
    auto* dXdata = output(0)->template mutable_data<T, Context>();
-    for (int i = 0; i < outer_dim; i++){
+    for (int i = 0; i < outer_dim; i++) {
        math::Gemm<T, Context>(CblasNoTrans, CblasNoTrans,
            dim, inner_dim, dim, 2.0, dYdata, Xdata, 0.0, dXdata);
        dYdata += y_offset;
@@ -49,7 +49,7 @@ void GramMatrixGradientOp<Context>::RunWithType() {
 }

 template <class Context>
-void GramMatrixGradientOp<Context>::RunOnDevice(){
+void GramMatrixGradientOp<Context>::RunOnDevice() {
    outer_dim = input(0).count(0, axis);
    dim = input(0).dim(axis);
    inner_dim = input(0).count(axis + 1);

--- a/Dragon/src/operators/arithmetic/matmul_op.cc
+++ b/Dragon/src/operators/arithmetic/matmul_op.cc
@@ -81,7 +81,7 @@ void MatmulGradientOp<Context>::RunWithType() {
 }

 template <class Context>
-void MatmulGradientOp<Context>::RunOnDevice(){
+void MatmulGradientOp<Context>::RunOnDevice() {
    CHECK(input(0).ndim() == input(1).ndim())
        << "both matrices must have the same number of dimensions.";
    CHECK_GE(input(0).ndim(), 2)

--- a/Dragon/src/operators/arithmetic/pow_op.cc
+++ b/Dragon/src/operators/arithmetic/pow_op.cc
@@ -9,7 +9,7 @@ void PowOp<Context>::RunWithType() {
    TIndex count = input(0).count();
    auto* Ydata = output(0)->template mutable_data<T, Context>();

-    if (power_scale == float(0)){
+    if (power_scale == float(0)) {
        float value = (power == float(0)) ? float(1) : pow(shift, power);
        math::Set<T, Context>(count, dragon_cast<T, float>(value), Ydata);
        return;

--- a/Dragon/src/operators/arithmetic/scale_op.cc
+++ b/Dragon/src/operators/arithmetic/scale_op.cc
--- a/Dragon/src/operators/arithmetic/square_op.cc
+++ b/Dragon/src/operators/arithmetic/square_op.cc
@@ -12,7 +12,7 @@ void SquareOp<Context>::RunWithType() {
 }

 template <class Context>
-void SquareOp<Context>::RunOnDevice(){
+void SquareOp<Context>::RunOnDevice() {
    output(0)->ReshapeLike(input(0));

    if (input(0).template IsType<float>()) RunWithType<float>();

--- a/Dragon/src/operators/arithmetic/sub_op.cc
+++ b/Dragon/src/operators/arithmetic/sub_op.cc
@@ -44,7 +44,7 @@ void SubOp<Context>::BroadcastRunWithType(int type) {
 }

 template <class Context>
-void SubOp<Context>::RunOnDevice(){
+void SubOp<Context>::RunOnDevice() {
    output(0)->ReshapeLike(input(0));

    if (input(0).dims() == input(1).dims()) {

--- a/Dragon/src/operators/common/argmax_op.cc
+++ b/Dragon/src/operators/common/argmax_op.cc
--- a/Dragon/src/operators/common/concat_op.cc
+++ b/Dragon/src/operators/common/concat_op.cc
@@ -7,7 +7,7 @@ namespace dragon {
 template <class Context> template <typename T>
 void ConcatOp<Context>::RunWithType() {
    auto* Ydata = output(0)->template mutable_data<T, Context>();
-    for (int i = 0; i < nin; i++){
+    for (int i = 0; i < nin; i++) {
        auto* Xdata = input(i).template data<T, Context>();
        TIndex count = input(i).count();
        x_concat_dim = input(i).dim(axis);
@@ -25,12 +25,12 @@ void ConcatOp<Context>::RunWithType() {
 }

 template <class Context>
-void ConcatOp<Context>::RunOnDevice(){
+void ConcatOp<Context>::RunOnDevice() {
    concat_dims = input(0).dims();
    for (int i = 1; i < nin; i++) {
        CHECK_EQ(concat_dims.size(), input(i).ndim())
            << "\nall inputs must have the same ndim.";
-        for (int j = 0; j < concat_dims.size(); j++){
+        for (int j = 0; j < concat_dims.size(); j++) {
            if (j == axis) continue;
            CHECK_EQ(concat_dims[j], input(i).dim(j))
                << "\nall inputs must have the same dims"
@@ -82,7 +82,7 @@ void ConcatGradientOp<Context>::RunWithType() {
 }

 template <class Context>
-void ConcatGradientOp<Context>::RunOnDevice(){
+void ConcatGradientOp<Context>::RunOnDevice() {
    if (input(-1).name() == "ignore") return;
    concat_dims = input(-1).dims();
    y_concat_dim = concat_dims[axis];

--- a/Dragon/src/operators/common/flatten_op.cc
+++ b/Dragon/src/operators/common/flatten_op.cc
@@ -27,7 +27,7 @@ OPERATOR_SCHEMA(Flatten).NumInputs(1).NumOutputs(1);


 template <class Context>
-void FlattenGradientOp<Context>::RunOnDevice(){
+void FlattenGradientOp<Context>::RunOnDevice() {
    output(0)->ReshapeLike(input(0));
    output(0)->Share(input(1));
 }

--- a/Dragon/src/operators/common/python_op.cc
+++ b/Dragon/src/operators/common/python_op.cc
--- a/Dragon/src/operators/common/reduce_op.cc
+++ b/Dragon/src/operators/common/reduce_op.cc
@@ -78,7 +78,7 @@ void ReduceGradientOp<Context>::SumRunWithType() {
 template <class Context> template <typename T>
 void ReduceGradientOp<Context>::MeanRunWithType() {
    auto* dXdata = output(0)->template mutable_data<T, Context>();
-    if (axis == -1){
+    if (axis == -1) {
        auto* dYdata = input(-1).template data<T, CPUContext>();
        math::Set<T, Context>(output(0)->count(), dYdata[0] / input(0).count(), dXdata);
    } else {

--- a/Dragon/src/operators/common/reshape_op.cc
+++ b/Dragon/src/operators/common/reshape_op.cc
--- a/Dragon/src/operators/common/scan_op.cc
+++ b/Dragon/src/operators/common/scan_op.cc
@@ -150,13 +150,11 @@ void ScanOp<Context>::UnrollTemplate() {

 template <class Context>
 void ScanOp<Context>::RunOnDevice() {
-    //    unroll
    UnrollTemplate();
-    if (!graphs.count(nsteps))
+    if (!graphs.count(nsteps)) {
        graphs[nsteps].reset(new Graph(new_def, ws()));
+    }
    cur_graph = graphs[nsteps].get();
-
-    //    forward
    cur_graph->Run("", "");
 }

@@ -201,13 +199,11 @@ void ScanGradientOp<Context>::MakeGradientOps() {

 template <class Context>
 void ScanGradientOp<Context>::RunOnDevice() {
-    //    make graph
    MakeGradientOps();
-    if (!graphs.count(nsteps))
+    if (!graphs.count(nsteps)) {
        graphs[nsteps].reset(new Graph(new_def, ws()));
+    }
    cur_graph = graphs[nsteps].get();
-
-    //    backward
    cur_graph->Run("Gradient", "");
 }


--- a/Dragon/src/operators/common/slice_op.cc
+++ b/Dragon/src/operators/common/slice_op.cc
@@ -7,7 +7,7 @@ namespace dragon {
 template <class Context> template <typename T>
 void SliceOp<Context>::RunWithType() {
    auto* Xdata = input(0).template data<T, Context>();
-    for (int i = 0; i < nout; i++){
+    for (int i = 0; i < nout; i++) {
        auto* Ydata = output(i)->template mutable_data<T, Context>();
        TIndex count = output(i)->count();
        kernel::Slice<T, Context>(count, outer_dim, inner_dim,
@@ -46,7 +46,7 @@ OPERATOR_SCHEMA(Slice).NumInputs(1).NumOutputs(1, INT_MAX);
 template <class Context> template <typename T>
 void SliceGradientOp<Context>::RunWithType() {
    auto* dXdata = output(0)->template mutable_data<T, Context>();
-    for (int i = 0; i < nout; i++){
+    for (int i = 0; i < nout; i++) {
        if (input(i + 1).name() == "ignore") continue;
        auto* dYdata = input(i + 1).template data<T, Context>();
        TIndex count = input(i + 1).count();

--- a/Dragon/src/operators/common/utils_op.cc
+++ b/Dragon/src/operators/common/utils_op.cc
@@ -64,7 +64,7 @@ void AccuracyOp<Context>::RunWithType() {
    }    // end outer_num

    output(0)->template mutable_data<T, CPUContext>()[0] = acc / count;
-    if (OutputSize() > 1){
+    if (OutputSize() > 1) {
        auto* acc_per_class = output(1)->template mutable_data<T, CPUContext>();
        for (int i = 0; i < classes; i++)
            acc_per_class[i] = num_per_class[i] == 0 ? 0 : acc_per_class[i] / acc_per_class[i];

--- a/Dragon/src/operators/loss/l1_loss_op.cc
+++ b/Dragon/src/operators/loss/l1_loss_op.cc
@@ -28,7 +28,7 @@ void L1LossOp<Context>::RunWithType() {
 }

 template <class Context>
-void L1LossOp<Context>::RunOnDevice(){
+void L1LossOp<Context>::RunOnDevice() {
    CHECK_EQ(input(0).count(), input(1).count());
    output(0)->Reshape(vector<TIndex>(1, 1));
    diff = ws()->CreateTensor("_t_" + anchor() + "_l1_loss_diff");
@@ -53,7 +53,7 @@ void L1LossGradientOp<Context>::RunWithType() {
    else if (normalization == "FULL") normalizer = input(0).count();
    else if (normalization == "NONE") normalizer = 1;
    alpha = alpha / normalizer;
-    for (int i = 0; i < 2; i++){
+    for (int i = 0; i < 2; i++) {
        if (output(i)->name() == "ignore") continue;
        output(i)->ReshapeLike(input(i));
        auto* dXdata = output(i)->template mutable_data<T, Context>();

--- a/Dragon/src/operators/loss/l2_loss_op.cc
+++ b/Dragon/src/operators/loss/l2_loss_op.cc
@@ -11,7 +11,7 @@ void L2LossOp<Context>::RunWithType() {
    auto* diff_data = diff->template mutable_data<T, Context>();
    auto* Ydata = output(0)->template mutable_data<T, CPUContext>();
    math::Sub<T, Context>(diff->count(), X0data, X1data, diff_data);
-    if (InputSize() > 2){
+    if (InputSize() > 2) {
        CHECK_EQ(input(0).count(), input(2).count());
        auto* Wdata = input(2).template data<T, Context>();
        math::Mul<T, Context>(diff->count(), Wdata, diff_data, diff_data);

--- a/Dragon/src/operators/loss/smooth_l1_loss_op.cc
+++ b/Dragon/src/operators/loss/smooth_l1_loss_op.cc
@@ -14,12 +14,12 @@ void SmoothL1LossOp<Context>::RunWithType() {
    auto* Ydata = output(0)->template mutable_data<T, CPUContext>();

    math::Sub<T, Context>(diff->count(), X0data, X1data, diff_data);
-    if (InputSize() > 2){
+    if (InputSize() > 2) {
        auto* inside_w_data = input(2).template data<T, Context>();
        math::Mul<T, Context>(diff->count(), inside_w_data, diff_data, diff_data);
    }
    kernel::SmoothL1<T, Context>(diff->count(), sigma2, diff_data, error_data);
-    if (InputSize() > 3){
+    if (InputSize() > 3) {
        auto* outside_w_data = input(3).template data<T, Context>();
        math::Mul<T, Context>(diff->count(), outside_w_data, error_data, error_data);
    }

--- a/Dragon/src/operators/loss/softmax_cross_entropy_loss_op.cc
+++ b/Dragon/src/operators/loss/softmax_cross_entropy_loss_op.cc
--- a/Dragon/src/operators/mpi/mpi_broadcast_op.cc
+++ b/Dragon/src/operators/mpi/mpi_broadcast_op.cc
@@ -26,14 +26,14 @@ void MPIBroadcastOp<Context>::RunWithType() {
 }

 template <class Context>
-void MPIBroadcastOp<Context>::RunOnDevice(){
+void MPIBroadcastOp<Context>::RunOnDevice() {
    CHECK(this->comm != MPI_COMM_NULL)
        << "\nMPIBroadcastOp, name: " << name()
        << ", does not belong to any group, can't run.";

    size_t ndim[1];
    TIndex* dims = nullptr;
-    if (this->comm_rank == this->comm_root){
+    if (this->comm_rank == this->comm_root) {
        ndim[0] = input(0).ndim();
        dims = new TIndex[ndim[0]];
        for (int i = 0; i < input(0).ndim(); i++)
@@ -90,7 +90,7 @@ void MPIBroadcastGradientOp<Context>::RunWithType() {
 }

 template <class Context>
-void MPIBroadcastGradientOp<Context>::RunOnDevice(){
+void MPIBroadcastGradientOp<Context>::RunOnDevice() {
    output(0)->ReshapeLike(input(-1));

    if (input(0).template IsType<float>()) RunWithType<float>();

--- a/Dragon/src/operators/mpi/mpi_gather_op.cc
+++ b/Dragon/src/operators/mpi/mpi_gather_op.cc
@@ -51,7 +51,7 @@ template <class Context> template <typename T>
 void MPIGatherGradientOp<Context>::RunWithType() {
    if (this->comm_rank == this->comm_root) {
        output(0)->Share(input(this->comm_rank + 1));
-        for (int i = 0; i < this->comm_size; i++){
+        for (int i = 0; i < this->comm_size; i++) {
            if (i == this->comm_root) continue;
 #ifdef WITH_CUDA_AWARE
            auto* dYdata = input(this->comm_rank + 1).template data<T, Context>();

--- a/Dragon/src/operators/norm/batch_norm_op.cc
+++ b/Dragon/src/operators/norm/batch_norm_op.cc
--- a/Dragon/src/operators/norm/batch_renorm_op.cc
+++ b/Dragon/src/operators/norm/batch_renorm_op.cc
--- a/Dragon/src/operators/norm/instance_norm_op.cc
+++ b/Dragon/src/operators/norm/instance_norm_op.cc
@@ -60,7 +60,7 @@ void InstanceNormOp<Context>::RunWithType() {
 }

 template <class Context>
-void InstanceNormOp<Context>::RunOnDevice(){
+void InstanceNormOp<Context>::RunOnDevice() {
    num = input(0).dim(0); channels = input(0).dim(1);
    spatial_dim = input(0).count(2); nbychans = num * channels;
    vector<TIndex> dims({ num, channels });

--- a/Dragon/src/operators/norm/l2_norm_op.cc
+++ b/Dragon/src/operators/norm/l2_norm_op.cc
@@ -109,7 +109,7 @@ void L2NormGradientOp<Context>::RunWithType() {
    auto* Bdata = buffer->template mutable_data<T, Context>();
    auto* BInnerdata = buffer_inner->template mutable_data<T, Context>();

-    for (int n = 0; n < outer_dim; n++){
+    for (int n = 0; n < outer_dim; n++) {
        if (across_inner) {
            Ndata = norm->template data<T, CPUContext>();
            T sum_of_x_mul_dy = math::Dot<T, Context>(buffer->count(), Xdata, dYdata);

--- a/Dragon/src/operators/recurrent/lstm_unit_op.cc
+++ b/Dragon/src/operators/recurrent/lstm_unit_op.cc
--- a/Dragon/src/operators/update/async_update_op.cc
+++ b/Dragon/src/operators/update/async_update_op.cc
@@ -21,7 +21,7 @@ void AsyncUpdateOp<Context>::UpdateTimestamp(int tag) {
 }

 template <class Context>
-int AsyncUpdateOp<Context>::GetDelay(int tag){
+int AsyncUpdateOp<Context>::GetDelay(int tag) {
    Tensor* t = ws()->GetTensor("_t_" + this->domain + "async_timestamp");
    int* global_timestamp = t->template mutable_data<int, CPUContext>();
    return global_timestamp[tag] - local_timestamp[tag] + 1;
@@ -71,7 +71,7 @@ AsyncUpdateOp<Context>::AsyncUpdateOp(const OperatorDef& op_def, Workspace* ws)

 template <class Context> template <typename T>
 void AsyncUpdateOp<Context>::RootRunWithType() {
-    for (int i = 0; i < InputSize(); i++){
+    for (int i = 0; i < InputSize(); i++) {
        auto* dXdata = input(i).template mutable_data<T, Context>();
        auto* Xdata = output(i)->template mutable_data<T, Context>();

@@ -88,7 +88,7 @@ void AsyncUpdateOp<Context>::RootRunWithType() {
 }

 template <class Context>
-void AsyncUpdateOp<Context>::RunOnDevice(){
+void AsyncUpdateOp<Context>::RunOnDevice() {
    if (this->comm_rank != this->comm_root) return;

    if (input(0).template IsType<float>()) {

--- a/Dragon/src/operators/update/update_op_base.cc
+++ b/Dragon/src/operators/update/update_op_base.cc
@@ -98,7 +98,7 @@ void UpdateOpBase<Context>::ReduceRunWithType() {
    }

    //  ave-normalize
-    if (comm_size > 1){
+    if (comm_size > 1) {
 #ifdef WITH_CUDA_AWARE
        math::Scal<T, Context>(count, T(1.0 / comm_size), dXdata);
 #else
@@ -112,7 +112,7 @@ template <class Context> template <typename T>
 void UpdateOpBase<Context>::PreprocessRunWithType() {
    //  scale
    scale_factor = param("scale_gradient");
-    if (scale_factor != 1){
+    if (scale_factor != 1) {
        auto* dXdata = input(0).template mutable_data<T, Context>();
        math::Scal<T, Context>(input(0).count(), scale_factor, dXdata);
    }
@@ -128,11 +128,11 @@ void UpdateOpBase<Context>::PreprocessRunWithType() {
        }
    }
    //  decay
-    l2_decay = param("l2_decay");
-    if (l2_decay > 0){
+    l2_decay = param("l2_decay") * decay_mult;
+    if (l2_decay > 0) {
        auto* dXdata = input(0).template mutable_data<T, Context>();
        auto* Xdata = output(0)->template data<T, Context>();
-        math::Axpy<T, Context>(input(0).count(), l2_decay * decay_mult, Xdata, dXdata);
+        math::Axpy<T, Context>(input(0).count(), l2_decay, Xdata, dXdata);
    }
 }


--- a/Dragon/src/operators/utils/cast_op.cpp
+++ b/Dragon/src/operators/utils/cast_op.cpp
--- a/Dragon/src/operators/utils/compare_op.cc
+++ b/Dragon/src/operators/utils/compare_op.cc
@@ -12,7 +12,7 @@ void CompareOp<Context>::EqualRunWithType() {
 }

 template <class Context>
-void CompareOp<Context>::RunOnDevice(){
+void CompareOp<Context>::RunOnDevice() {
    CHECK_EQ(input(0).count(), input(1).count())
        << "both conditioned tensor must have same elements.";
    output(0)->ReshapeLike(input(0));

--- a/Dragon/src/operators/utils/gradient_op.cc
+++ b/Dragon/src/operators/utils/gradient_op.cc
@@ -6,7 +6,7 @@ namespace dragon {

 template <class Context> template <typename T>
 void GradientGenerateOp<Context>::RunWithType() {
-    for (int i = 0; i < OutputSize(); i++){
+    for (int i = 0; i < OutputSize(); i++) {
        if (output(i)->name() == "ignore") continue;
        output(i)->ReshapeLike(input(i));
        auto* dXdata = output(0)->template mutable_data<T, Context>();

--- a/Dragon/src/operators/utils/initialize_op.cc
+++ b/Dragon/src/operators/utils/initialize_op.cc
--- a/Dragon/src/operators/utils/proposal_op.cu
+++ b/Dragon/src/operators/utils/proposal_op.cu
--- a/Dragon/src/operators/utils/shape_op.cc
+++ b/Dragon/src/operators/utils/shape_op.cc
--- a/Dragon/src/operators/vision/conv_op.cc
+++ b/Dragon/src/operators/vision/conv_op.cc
@@ -7,7 +7,7 @@ namespace dragon {
 template <class Context>
 void ConvOp<Context>::ComputeOutputShape() {
    this->output_shape.clear();
-    for (int i = 0; i < this->num_spatial_axes; i++){
+    for (int i = 0; i < this->num_spatial_axes; i++) {
        const int input_dim = this->bottom_shape[this->channel_axis + i + 1];
        const int dilated_kernel = this->dilation[i] * (this->kernel_size[i] - 1) + 1;
        const int output_dim = (input_dim + 2 * this->pad[i] - dilated_kernel) / this->stride[i] + 1;
@@ -43,7 +43,7 @@ void ConvOp<Context>::RunWithType() {
 }

 template <class Context>
-void ConvOp<Context>::RunOnDevice(){
+void ConvOp<Context>::RunOnDevice() {
    Reshape();

    if (input(0).template IsType<float>()) RunWithType<float>();
@@ -71,7 +71,7 @@ void ConvGradientOp<Context>::RunWithType() {
            Db(dYdata + n * this->y_offset, dBdata);
    }

-    for (int n = 0; n < input(2).dim(0); n++){
+    for (int n = 0; n < input(2).dim(0); n++) {
        if (output(1)->name() != "ignore") {
            auto* Xdata = input(0).template data<T, Context>();
            auto* dWdata = output(1)->template mutable_data<T, Context>();

--- a/Dragon/src/operators/vision/conv_op_base.cc
+++ b/Dragon/src/operators/vision/conv_op_base.cc
@@ -161,7 +161,7 @@ void ConvOpBase<Context>::GradientReshape() {

    //  compute input shape
    input_shape.clear();
-    for (int i = 0; i < num_spatial_axes; i++){
+    for (int i = 0; i < num_spatial_axes; i++) {
        if (ReverseDimensions()) {
            input_shape.push_back(input(2).dim(channel_axis + i + 1));
        } else {
@@ -181,7 +181,7 @@ void ConvOpBase<Context>::GradientReshape() {
    //  compute col buffer shape
    col_buffer_shape.clear();
    col_buffer_shape.push_back(kernel_dim * group);
-    for (int i = 0; i < num_spatial_axes; i++){
+    for (int i = 0; i < num_spatial_axes; i++) {
        if (ReverseDimensions()) {
            col_buffer_shape.push_back(bottom_shape[channel_axis + i + 1]);
        } else {

--- a/Dragon/src/operators/vision/cudnn_conv_op.cc
+++ b/Dragon/src/operators/vision/cudnn_conv_op.cc
@@ -228,13 +228,13 @@ void CuDNNConvGradientOp<Context>::RunWithType() {

    const T* dYdata = input(2).template data<T, Context>();
    for (int g = 0; g < this->group; g++) {
-        if (output(2)->name() != "ignore"){
+        if (output(2)->name() != "ignore") {
            T* dBdata = output(2)->template mutable_data<T, Context>();
            CUDNN_CHECK(cudnnConvolutionBackwardBias(handle[g],
                        CUDNNType<T>::one, input_desc, dYdata + this->y_offset * g,
                        CUDNNType<T>::one, bias_desc, dBdata + bias_offset * g));
        }
-        if (output(1)->name() != "ignore"){
+        if (output(1)->name() != "ignore") {
            auto* Xdata = input(0).template data<T, Context>();
            auto* dWdata = output(1)->template mutable_data<T, Context>();
            auto* workspace = buffer2->mutable_data<char, Context>();
@@ -246,7 +246,7 @@ void CuDNNConvGradientOp<Context>::RunWithType() {
              workspace + g * workspace_bwd_filter_size, workspace_bwd_filter_size,
                CUDNNType<T>::one, filter_desc, dWdata + this->weight_offset * g));
        }
-        if (output(0)->name() != "ignore"){
+        if (output(0)->name() != "ignore") {
            auto* Wdata = input(1).template data<T, Context>();
            auto* dXdata = output(0)->template mutable_data<T, Context>();
            auto* workspace = buffer1->mutable_data<char, Context>();

--- a/Dragon/src/operators/vision/cudnn_lrn_op.cc
+++ b/Dragon/src/operators/vision/cudnn_lrn_op.cc
@@ -54,7 +54,7 @@ void CuDNNLRNGradientOp<Context>::RunWithType() {
 }

 template <class Context>
-void CuDNNLRNGradientOp<Context>::RunOnDevice(){
+void CuDNNLRNGradientOp<Context>::RunOnDevice() {
    output(0)->ReshapeLike(input(0));

    if (this->mode == ACROSS_CHANNELS) {

--- a/Dragon/src/operators/vision/deconv_op.cc
+++ b/Dragon/src/operators/vision/deconv_op.cc
@@ -43,7 +43,7 @@ void DeConvOp<Context>::RunWithType() {
 }

 template <class Context>
-void DeConvOp<Context>::RunOnDevice(){
+void DeConvOp<Context>::RunOnDevice() {
    Reshape();

    if (input(0).template IsType<float>()) RunWithType<float>();

--- a/Dragon/src/operators/vision/lrn_op.cc
+++ b/Dragon/src/operators/vision/lrn_op.cc
@@ -98,7 +98,7 @@ void LRNOp<Context>::ProdRunWithType() {
 }

 template <class Context>
-void LRNOp<Context>::RunOnDevice(){
+void LRNOp<Context>::RunOnDevice() {
    if (mode == ACROSS_CHANNELS) {
        if (input(0).template IsType<float>()) {
            AcrossRunWithType<float>();
@@ -223,7 +223,7 @@ void LRNGradientOp<Context>::SplitRunWithType() {
 }

 template <class Context>
-void LRNGradientOp<Context>::RunOnDevice(){
+void LRNGradientOp<Context>::RunOnDevice() {
    if (mode == ACROSS_CHANNELS) {
        if (input(0).template IsType<float>()) {
            AcrossRunWithType<float>();

--- a/Dragon/src/operators/vision/nn_resize_op.cc
+++ b/Dragon/src/operators/vision/nn_resize_op.cc
@@ -16,7 +16,7 @@ void NNResizeOp<Context>::RunWithType() {
 template <class Context>
 void NNResizeOp<Context>::RunOnDevice() {
    dims = input(0).dims();
-    if (dsize.size() == 0){
+    if (dsize.size() == 0) {
        CHECK(fy != -1.0 && fx != -1.0);
        dims[2] = int(dims[2] * fy);
        dims[3] = int(dims[3] * fx);
@@ -47,7 +47,7 @@ void NNResizeGradientOp<Context>::RunWithType() {
 }

 template <class Context>
-void NNResizeGradientOp<Context>::RunOnDevice(){
+void NNResizeGradientOp<Context>::RunOnDevice() {
    output(0)->ReshapeLike(input(0));
    
    if (input(0).template IsType<float>()) return RunWithType<float>();

--- a/Dragon/src/utils/cudnn_device.cc
+++ b/Dragon/src/utils/cudnn_device.cc
@@ -37,7 +37,7 @@ void cudnnSetTensorDesc(cudnnTensorDescriptor_t* desc, const vector<TIndex>& dim
    int* dimA = new int[ndim];
    int* strideA = new int[ndim];
    TIndex stride = 1;
-    for (int i = ndim - 1; i >= 0; i--){
+    for (int i = ndim - 1; i >= 0; i--) {
        strideA[i] = stride;
        dimA[i] = dims[i];
        stride *= dimA[i];
@@ -55,7 +55,7 @@ void cudnnSetTensorDesc(cudnnTensorDescriptor_t* desc,
    int ndim = (int)dims.size();
    int* dimA = new int[ndim];
    int* strideA = new int[ndim];
-    for (int i = ndim - 1; i >= 0; i--){
+    for (int i = ndim - 1; i >= 0; i--) {
        strideA[i] = strides[i];
        dimA[i] = dims[i];
    }
@@ -66,10 +66,10 @@ void cudnnSetTensorDesc(cudnnTensorDescriptor_t* desc,

 template <typename T>
 void cudnnSetTensorDesc(cudnnTensorDescriptor_t* desc, Tensor* tensor) {
-    //    CUDNN only support ndim from 3 to 8
+    //  cuDNN requires ndim from 3 to 8
    //  we fake a reshaped dims to pass check
    vector<TIndex> fake_dims(tensor->dims());
-    if (fake_dims.size() < 3 || fake_dims.size() > 8){
+    if (fake_dims.size() < 3 || fake_dims.size() > 8) {
        fake_dims.assign({ 1, 1 });
        fake_dims.push_back(tensor->count());
    }

--- a/Dragon/src/utils/math_functions.cc
+++ b/Dragon/src/utils/math_functions.cc
@@ -142,7 +142,7 @@ template <> void Add<float, CPUContext>(const int n,
 template <> void Sub<float, CPUContext>(const int n, 
                                        const float* a, 
                                        const float* b,
-                                        float* y){
+                                        float* y) {
 #ifdef WITH_SSE
    sse::Sub<float>(n, a, b, y);
 #else  // naive implement

--- a/Dragon/src/utils/math_functions.cu
+++ b/Dragon/src/utils/math_functions.cu
--- a/Dragon/src/utils/op_kernel.cc
+++ b/Dragon/src/utils/op_kernel.cc
@@ -98,7 +98,7 @@ template<> void Softmax<float, CPUContext>(const int count,
    const int dim = count / outer_dim;
    for (int i = 0; i < outer_dim; ++i) {
        context->Copy<float, CPUContext, CPUContext>(inner_dim, scale, x + i*dim);
-        for (int j = 0; j < classes; ++j){
+        for (int j = 0; j < classes; ++j) {
            for (int k = 0; k < inner_dim; k++)
                scale[k] = std::max(scale[k], x[i * dim + j * inner_dim + k]);
        }
@@ -1362,7 +1362,7 @@ template<> void ROIPooling<float, CPUContext>(const float spatial_scale,
                                    Ydata[pool_idx] = Idata[idx];
                                    Mdata[pool_idx] = idx;
                            }
-                        }    //end w
+                        }    //  end w
                    }    //  end h
                }    //  end pw
            }    //  end ph
@@ -1373,7 +1373,7 @@ template<> void ROIPooling<float, CPUContext>(const float spatial_scale,
        }    //  end c
        //  offset roi region
        Rdata += roi->offset(1);
-    }    //end n
+    }    //  end n
 }

 template<> void ROIPoolingGrad<float, CPUContext>(const float spatial_scale, 

--- a/Dragon/src/utils/op_kernel.cu
+++ b/Dragon/src/utils/op_kernel.cu
@@ -130,7 +130,7 @@ __global__ void _ReluGrad(const int count,
                          const T* y, 
                          const float slope, 
                          T* dx) {
-    CUDA_KERNEL_LOOP(i, count){
+    CUDA_KERNEL_LOOP(i, count) {
        dx[i] = dy[i] * ((y[i] > 0) + slope * (y[i] <= 0));
    }
 }
@@ -912,7 +912,7 @@ __global__ void _Sum(const int count,

 template<> void Sum<float, CUDAContext>(
        const int count, const int axis_dim,
-        const int inner_dim, const float* x, float* y){
+        const int inner_dim, const float* x, float* y) {
    _Sum<float> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count, 
                                                          axis_dim, 
                                                         inner_dim, 
@@ -954,7 +954,7 @@ template<> void SumGrad<float, CUDAContext>(const int count,

 template <typename T>
    __global__ void _Slice(const int count, const int outer_dim, const int inner_dim,
-        const int x_slice_dim, const int y_slice_dim, const int slice_offset, const T* x, T* y){
+        const int x_slice_dim, const int y_slice_dim, const int slice_offset, const T* x, T* y) {
        CUDA_KERNEL_LOOP(idx, count) {
            const int tmp = y_slice_dim * inner_dim;
            const int outer_idx = idx / tmp;

--- a/Dragon/src/utils/sse_alternative.cc
+++ b/Dragon/src/utils/sse_alternative.cc
@@ -9,20 +9,20 @@ namespace dragon {

 namespace sse {

-    template<> void Set(const int n, const float alpha, float* x){
+    template<> void Set(const int n, const float alpha, float* x) {
        __m128 scalar = SSE_FP32_SCALAR(alpha);
        SSE_LOOP1(i, n) SSE_FP32_STORE(x + i, scalar);
        SSE_LOOP2(i, n) x[i] = alpha;
    }

-    template<> void Set(const int n, const int alpha, int* x){
+    template<> void Set(const int n, const int alpha, int* x) {
        __m128i scalar = SSE_INT32_SCALAR(alpha);
        __m128i* x1 = reinterpret_cast<__m128i*>(x);
        SSE_LOOP1(i, n) SSE_INT128_STORE(x1++, scalar);
        SSE_LOOP2(i, n) x[i] = alpha;
    }

-    template<> void Add(const int n, const float* a, const float* b, float* y){
+    template<> void Add(const int n, const float* a, const float* b, float* y) {
        __m128 x1, y1, z1;
        SSE_LOOP1(i, n) {
            x1 = SSE_FP32_LOAD(a + i);
@@ -33,7 +33,7 @@ namespace sse {
        SSE_LOOP2(i, n) y[i] = a[i] + b[i];
    }

-    template<> void Sub(const int n, const float* a, const float* b, float* y){
+    template<> void Sub(const int n, const float* a, const float* b, float* y) {
        __m128 x1, y1, z1;
        SSE_LOOP1(i, n) {
            x1 = SSE_FP32_LOAD(a + i);
@@ -44,7 +44,7 @@ namespace sse {
        SSE_LOOP2(i, n) y[i] = a[i] - b[i];
    }

-    template<> void Mul(const int n, const float* a, const float* b, float* y){
+    template<> void Mul(const int n, const float* a, const float* b, float* y) {
        __m128 x1, y1, z1;
        SSE_LOOP1(i, n) {
            x1 = SSE_FP32_LOAD(a + i);
@@ -55,7 +55,7 @@ namespace sse {
        SSE_LOOP2(i, n) y[i] = a[i] * b[i];
    }

-    template<> void Div(const int n, const float* a, const float* b, float* y){
+    template<> void Div(const int n, const float* a, const float* b, float* y) {
        __m128 x1, y1, z1;
        SSE_LOOP1(i, n) {
            x1 = SSE_FP32_LOAD(a + i);
@@ -66,7 +66,7 @@ namespace sse {
        SSE_LOOP2(i, n) y[i] = a[i] / b[i];
    }

-    template<> void Scal(const int n, const float alpha, float* y){
+    template<> void Scal(const int n, const float alpha, float* y) {
        __m128 y1, scalar = SSE_FP32_SCALAR(alpha);
        SSE_LOOP1(i, n) {
            y1 = SSE_FP32_LOAD(y + i);
@@ -76,7 +76,7 @@ namespace sse {
        SSE_LOOP2(i, n) y[i] *= alpha;
    }

-    template<> void Scale(const int n, const float alpha, const float* x, float* y){
+    template<> void Scale(const int n, const float alpha, const float* x, float* y) {
        __m128 x1, scalar = SSE_FP32_SCALAR(alpha);
        SSE_LOOP1(i, n) {
            x1 = SSE_FP32_LOAD(x + i);
@@ -87,7 +87,7 @@ namespace sse {
    }

    
-    template<> void Axpy(const int n, float alpha, const float* x, float *y){
+    template<> void Axpy(const int n, float alpha, const float* x, float *y) {
        __m128 x1, y1, scalar = SSE_FP32_SCALAR(alpha);
        SSE_LOOP1(i, n) {
            x1 = SSE_FP32_LOAD(x + i);
@@ -100,7 +100,7 @@ namespace sse {
    }

    template<> void Axpby(const int n, float alpha, const float* x, 
-        const float beta, float *y){
+        const float beta, float *y) {
        __m128 x1, y1, z1;
        __m128 scalar1 = SSE_FP32_SCALAR(alpha);
        __m128 scalar2 = SSE_FP32_SCALAR(beta);
@@ -115,7 +115,7 @@ namespace sse {
        SSE_LOOP2(i, n) y[i] = alpha * x[i] + beta* y[i];
    }

-    template<> float ASum(const int n, const float *x){
+    template<> float ASum(const int n, const float *x) {
        __m128 x1, sum = SSE_FP32_ZERO;
        SSE_LOOP1(i, n) {
            x1 = SSE_FP32_LOAD(x + i);
@@ -128,7 +128,7 @@ namespace sse {
        return ret;
    }

-    template<> void AddScalar(const int n, const float alpha, float* y){
+    template<> void AddScalar(const int n, const float alpha, float* y) {
        __m128 y1, scalar = SSE_FP32_SCALAR(alpha);
        SSE_LOOP1(i, n) {
            y1 = SSE_FP32_LOAD(y + i);
@@ -138,7 +138,7 @@ namespace sse {
        SSE_LOOP2(i, n) y[i] += alpha;
    }

-    template<> void MulScalar(const int n, const float alpha, float* y){
+    template<> void MulScalar(const int n, const float alpha, float* y) {
        __m128 y1, scalar = SSE_FP32_SCALAR(alpha);
        SSE_LOOP1(i, n) {
            y1 = SSE_FP32_LOAD(y + i);
@@ -148,7 +148,7 @@ namespace sse {
        SSE_LOOP2(i, n) y[i] *= alpha;
    }

-    template <> float Dot(const int n, const float* a, const float* b){
+    template <> float Dot(const int n, const float* a, const float* b) {
        __m128 x1, y1, sum = SSE_FP32_ZERO;
        SSE_LOOP1(i, n) {
            x1 = SSE_FP32_LOAD(a + i);

--- a/examples/cifar10/gen_lmdb.py
+++ b/examples/cifar10/gen_lmdb.py
@@ -11,6 +11,7 @@ import sys
 import time
 import shutil
 import tarfile
+import numpy as np
 from six.moves import range as xrange

 import cv2
@@ -78,7 +79,7 @@ def extract_images():
            f.write(item)


-def make_db(image_path, label_path, database_path):
+def make_db(image_path, label_path, database_path, pad=0):
    if os.path.isfile(label_path) is False:
        raise ValueError('input path is empty or wrong.')
    if os.path.isdir(database_path) is True:
@@ -111,6 +112,12 @@ def make_db(image_path, label_path, database_path):
            label = record[1]

            img = cv2.imread(os.path.join(image_path ,path))
+            if pad > 0:
+                pad_img = np.zeros((img.shape[0] + 2 * pad,
+                                    img.shape[1] + 2 * pad, 3), dtype=np.uint8)
+                pad_img[pad : pad + img.shape[0],
+                        pad : pad + img.shape[1], :] = img
+                img = pad_img
            result, imgencode = cv2.imencode('.jpg', img, encode_param)

            datum = caffe_pb2.Datum()