first commit

daiab
Commit a3557f74 authored May 04, 2018 by daiab
Showing with 4260 additions and 0 deletions
atcfile_template.yml
dragon/__init__.py
helper/.autocnn/.autocnnalgorithm
helper/.autocnnignore
helper/autocnnfile.yml
helper/test.py
mxnet/.autocnnignore
mxnet/cifar10/.autocnn/algorithm
mxnet/cifar10/.autocnnignore
mxnet/cifar10/__init__.py
mxnet/cifar10/autocnn_distribute.yml
mxnet/cifar10/autocnnfile.yml
mxnet/cifar10/common/__init__.py
mxnet/cifar10/common/__pycache__/__init__.cpython-36.pyc
mxnet/cifar10/common/__pycache__/data.cpython-36.pyc
mxnet/cifar10/common/__pycache__/find_mxnet.cpython-36.pyc
mxnet/cifar10/common/__pycache__/fit.cpython-36.pyc
mxnet/cifar10/common/__pycache__/util.cpython-36.pyc
mxnet/cifar10/common/data.py
mxnet/cifar10/common/find_mxnet.py
--- a/atcfile_template.yml
+++ b/atcfile_template.yml
+---
+version: 1
+
+algorithm:
+  name: template
+
+resource:
+  default_resources:
+    cpu:
+      requests: 1
+      limits: 2
+    memory:
+      requests: 256
+      limits: 1024
+  tensorflow:
+    n_workers: 2
+    n_ps: 1
+    worker_resources:
+      - index: 0
+        cpu:
+          requests: 1
+          limits: 2
+        gpu:
+          requests: 1
+          limits: 1
+        memory:
+          requests: 256
+          limits: 1024
+      - index: 1
+        cpu:
+          requests: 1
+          limits: 2
+        gpu:
+          requests: 1
+          limits: 1
+        memory:
+          requests: 256
+          limits: 1024
+    ps_resources:
+      - index: 0
+        cpu:
+          requests: 1
+          limits: 1
+        gpu:
+          requests: 1
+          limits: 1
+        memory:
+          requests: 256
+          limits: 1024
+#  mxnet:
+#    n_workers: 2
+#    n_ps: 1
+#    worker_resources:
+#      - index: 0
+#        cpu:
+#          requests: 1
+#          limits: 2
+#        gpu:
+#          requests: 1
+#          limits: 1
+#        memory:
+#          requests: 256
+#          limits: 1024
+#      - index: 1
+#        cpu:
+#          requests: 1
+#          limits: 2
+#        gpu:
+#          requests: 1
+#          limits: 1
+#        memory:
+#          requests: 256
+#          limits: 1024
+#    ps_resources:
+#      - index: 0
+#        cpu:
+#          requests: 1
+#          limits: 1
+#        gpu:
+#          requests: 1
+#          limits: 1
+#        memory:
+#          requests: 256
+#          limits: 1024
+#  pytorch:
+#    n_workers: 2
+#    worker_resources:
+#      - index: 0
+#        cpu:
+#          requests: 1
+#          limits: 2
+#        gpu:
+#          requests: 1
+#          limits: 1
+#        memory:
+#          requests: 256
+#          limits: 1024
+#      - index: 1
+#        cpu:
+#          requests: 1
+#          limits: 2
+#        gpu:
+#          requests: 1
+#          limits: 1
+#        memory:
+#          requests: 256
+#          limits: 1024
+#  dragon:
+#    n_workers: 2
+#    worker_resources:
+#      - index: 0
+#        cpu:
+#          requests: 1
+#          limits: 2
+#        gpu:
+#          requests: 1
+#          limits: 1
+#        memory:
+#          requests: 256
+#          limits: 1024
+#      - index: 1
+#        cpu:
+#          requests: 1
+#          limits: 2
+#        gpu:
+#          requests: 1
+#          limits: 1
+#        memory:
+#          requests: 256
+#          limits: 1024
+
+image:
+  from_image: mxnet/python:1.1.0_gpu_cuda8
+  runs:
+    - apt-get update
+    - apt-get install -y git
+  envs:
+    LD_LIBRARY_PATH: /usr/local/cuda/lib64
+    PATH: $PATH:/usr/bin
+
+plugin:
+  mount:
+    ceph_hash_1: in_docker_path_1
+    ceph_hash_2: in_docker_path_2
+  output: /output
+  parameter:
+    LR: 0.001
+  cmd: python process.py
+
+train:
+  mount:
+    ceph_hash_1: in_docker_path_1
+    ceph_hash_2: in_docker_path_2
+  output: /output
+  parameter:
+    LR: 0.001
+  cmd: python run.py train
+
+test:
+  ref_model:
+    1: in_docker_path
+  mount:
+    ceph_hash_1: in_docker_path_1
+    ceph_hash_2: in_docker_path_2
+  output: /output
+  parameter:
+    BN: false
+  cmd: python run.py test
--- a/dragon/__init__.py
+++ b/dragon/__init__.py
--- a/helper/.autocnn/.autocnnalgorithm
+++ b/helper/.autocnn/.autocnnalgorithm
+{"name": "helper_test", "user": "daiab", "unique_name": "daiab.helper_test", "uuid": "3e8a62872c794285999b8a24ca0b4a19", "description": null, "is_public": false, "has_code": false, "created_at": "2018-04-13T12:26:36.552071+00:00", "updated_at": "2018-04-13T12:26:36.552116+00:00", "num_tasks": 0, "has_tensorboard": false, "has_notebook": false, "tasks": null}
\ No newline at end of file
--- a/helper/.autocnnignore
+++ b/helper/.autocnnignore
+
+.git
+.eggs
+eggs
+lib
+lib64
+parts
+sdist
+var
+*.pyc
+*.swp
+.DS_Store
+./.autocnn
--- a/helper/autocnnfile.yml
+++ b/helper/autocnnfile.yml
+---
+version: 1
+
+algorithm:
+  name: helper_test
+
+resource:
+  default_resources:
+    cpu:
+      requests: 1
+      limits: 1
+    memory:
+      requests: 1024
+      limits: 1024
+
+  #tensorflow:
+  #  n_workers: 2
+  #  n_ps: 1
+  #  worker_resources:
+  #    - index: 0
+  #      cpu:
+  #        requests: 1
+  #        limits: 2
+  #      memory:
+  #        requests: 256
+  #        limits: 1024
+  #    - index: 1
+  #      cpu:
+  #        requests: 1
+  #        limits: 2
+  #      memory:
+  #        requests: 256
+  #        limits: 1024
+  #  ps_resources:
+  #    - index: 0
+  #      cpu:
+  #        requests: 1
+  #        limits: 1
+  #      memory:
+  #        requests: 256
+  #        limits: 1024
+
+image:
+  from_image: tensorflow/tensorflow:1.4.1-py3
+  runs:
+    # Both apt-get steps pass -y so the image build never waits for a prompt;
+    # "pip install" has no -y option, so none is passed.
+    - apt-get -y update && apt-get install -y python3-pip && pip install requests
+  envs:
+    PYTHONPATH: $PYTHONPATH:/code/helper
+
+train:
+  mount:
+    data/daiab: /code/data
+  parameter:
+    train:
+      child1:
+        node1: 1.0
+        node2: 2.0
+    test:
+      child2:
+        node1: [100, 300]
+        node2: "hello world"
+  cmd: /usr/bin/python3.5 test.py
+
--- a/helper/test.py
+++ b/helper/test.py
+
+import autocnn_helper as ah
+
+# Call every accessor of the autocnn helper in turn and print its name
+# followed by the value it returns.
+for accessor in (
+        'get_api',
+        'get_cluster_def',
+        'get_data_path',
+        'get_job_info',
+        'get_log_level',
+        'get_outputs_path',
+        'get_parameter',
+        'get_task_info',
+        'get_tf_config',
+        'get_user_token',
+):
+    print(accessor, getattr(ah, accessor)())
+
--- a/mxnet/.autocnnignore
+++ b/mxnet/.autocnnignore
+
+.git
+.eggs
+eggs
+lib
+lib64
+parts
+sdist
+var
+*.pyc
+*.swp
+.DS_Store
+./.autocnn
--- a/mxnet/cifar10/.autocnn/algorithm
+++ b/mxnet/cifar10/.autocnn/algorithm
+{"name": "test", "user": "daiab", "unique_name": "daiab.test", "uuid": "32d51a4d310a4825a945826c683681ac", "description": "", "is_public": false, "has_code": true, "created_at": "2018-04-28T05:32:00.256679+00:00", "updated_at": "2018-04-28T05:32:00.256737+00:00", "num_tasks": 8, "has_tensorboard": false, "has_notebook": false, "tasks": null, "framework": "mxnet", "tags": ""}
\ No newline at end of file
--- a/mxnet/cifar10/.autocnnignore
+++ b/mxnet/cifar10/.autocnnignore
+
+.git
+.eggs
+eggs
+lib
+lib64
+parts
+sdist
+var
+*.pyc
+*.swp
+.DS_Store
+./.autocnn
--- a/mxnet/cifar10/__init__.py
+++ b/mxnet/cifar10/__init__.py
--- a/mxnet/cifar10/autocnn_distribute.yml
+++ b/mxnet/cifar10/autocnn_distribute.yml
+---
+version: 1
+
+algorithm:
+  name: cifar10
+
+environment:
+  mxnet:
+    n_workers: 2
+    n_ps: 1
+
+run:
+  image: autocnn/mxnetkv
+  steps:
+    - pip install --no-cache-dir -U autocnn-helper
+  env_vars:
+    - ['PS_VERBOSE', 2]
+  cmd:  python run.py --network resnet --num-layers 110 --batch-size 128 --kv-store dist_sync
--- a/mxnet/cifar10/autocnnfile.yml
+++ b/mxnet/cifar10/autocnnfile.yml
+---
+version: 1
+
+algorithm:
+  name: test
+
+resource:
+  default_resources:
+    cpu:
+      requests: 4
+      limits: 4
+    gpu:
+     limits: 2
+    memory:
+      requests: 10240
+      limits: 10240
+
+image:
+  from_image: mxnet/python:1.1.0_gpu_cuda8
+  runs:
+    - apt-get -y update && apt-get install -y python3-pip
+    - pip3 install atc-beta-helper
+  envs:
+    DAIAB: /daiab
+
+train:
+  mount:
+    data/daiab: /code/data
+    logs/daiab/daiab/independents/6: /code/6
+  output: /output
+  parameter:
+    # fit
+    network: "resnet"
+    num_layers: 110
+    gpus: "0,1"
+    kv_store: "device"
+    num_epochs: 3
+    lr: 0.05
+    lr_factor: 0.1
+    lr_step_epochs: "200,250"
+    initializer: "default"
+    optimizer: "sgd"
+    mom: 0.9
+    wd: 0.0001
+    batch_size: 128
+    disp_batches: 20
+    model_prefix: "/output/train"
+    monitor: 0
+    load_epoch: null
+    top_k: 0
+    loss: ""
+    test_io: 0
+    dtype: float32
+    gc_type: none
+    gc_threshold: 0.5
+    macrobatch_size: 0
+    warmup_epochs: 5
+    warmup_strategy: linear
+    # data
+    data_train: "/code/data/cifar10_train.rec"
+    data_train_idx: ""
+    data_val: "/code/data/cifar10_val.rec"
+    data_val_idx: ""
+    rgb_mean: "123.68,116.779,103.939"
+    pad_size: 4
+    image_shape: "3,28,28"
+    num_classes: 10
+    num_examples: 50000
+    data_nthreads: 4
+    benchmark: 0
+    # data_aug
+    random_crop: 1
+    random_mirror: 1
+    max_random_h: 0
+    max_random_s: 0
+    max_random_l: 0
+    max_random_aspect_ratio: 0
+    max_random_rotate_angle: 0
+    max_random_shear_ratio: 0
+    max_random_scale: 1
+    min_random_scale: 1
+    # data_aug_level
+    level: 2
+  cmd: python3 train_cifar10.py # python3 run.py
+
+test:
+  mount:
+    data/daiab: /code/data
+  # output: /output
+  ref_model:
+    72: /model
+  parameter:
+    model_prefix: "/model/train"
+    epoch: 3
+    data_val: "/code/data/cifar10_test.rec"
+    gpus: "0"
+    batch_size: 64
+    rgb_mean: "123.68,116.779,103.939"
+    image_shape: "3,28,28"
+    data_nthreads: 4
+  cmd: python3 test_cifar10.py
--- a/mxnet/cifar10/common/__init__.py
+++ b/mxnet/cifar10/common/__init__.py
--- a/mxnet/cifar10/common/__pycache__/__init__.cpython-36.pyc
+++ b/mxnet/cifar10/common/__pycache__/__init__.cpython-36.pyc
--- a/mxnet/cifar10/common/__pycache__/data.cpython-36.pyc
+++ b/mxnet/cifar10/common/__pycache__/data.cpython-36.pyc
--- a/mxnet/cifar10/common/__pycache__/find_mxnet.cpython-36.pyc
+++ b/mxnet/cifar10/common/__pycache__/find_mxnet.cpython-36.pyc
--- a/mxnet/cifar10/common/__pycache__/fit.cpython-36.pyc
+++ b/mxnet/cifar10/common/__pycache__/fit.cpython-36.pyc
--- a/mxnet/cifar10/common/__pycache__/util.cpython-36.pyc
+++ b/mxnet/cifar10/common/__pycache__/util.cpython-36.pyc
--- a/mxnet/cifar10/common/data.py
+++ b/mxnet/cifar10/common/data.py
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import mxnet as mx
+from mxnet.io import DataBatch, DataIter
+import numpy as np
+
+
+def add_data_args(parser):
+    """
+    parser : argparse.ArgumentParser
+    return the 'Data' argument group, filled with the options that
+    describe the input images
+    """
+    group = parser.add_argument_group('Data', 'the input images')
+    # (flag, keyword arguments) pairs, registered in declaration order.
+    options = (
+        ('--data-train',
+         dict(type=str, help='the training data')),
+        ('--data-train-idx',
+         dict(type=str, default='', help='the index of training data')),
+        ('--data-val',
+         dict(type=str, help='the validation data')),
+        ('--data-val-idx',
+         dict(type=str, default='', help='the index of validation data')),
+        ('--rgb-mean',
+         dict(type=str, default='123.68,116.779,103.939',
+              help='a tuple of size 3 for the mean rgb')),
+        ('--pad-size',
+         dict(type=int, default=0, help='padding the input image')),
+        ('--image-shape',
+         dict(type=str,
+              help='the image shape feed into the network, e.g. (3,224,224)')),
+        ('--num-classes',
+         dict(type=int, help='the number of classes')),
+        ('--num-examples',
+         dict(type=int, help='the number of training examples')),
+        ('--data-nthreads',
+         dict(type=int, default=4,
+              help='number of threads for data decoding')),
+        ('--benchmark',
+         dict(type=int, default=0,
+              help='if 1, then feed the network with synthetic data')),
+    )
+    for flag, options_kwargs in options:
+        group.add_argument(flag, **options_kwargs)
+    return group
+
+
+def add_data_aug_args(parser):
+    """
+    parser : argparse.ArgumentParser
+    return the 'Image augmentations' argument group, filled with the
+    random crop / mirror / colour / geometry options
+    """
+    group = parser.add_argument_group(
+        'Image augmentations', 'implemented in src/io/image_aug_default.cc')
+    # (flag, value type, default, help text), registered in declaration order.
+    options = (
+        ('--random-crop', int, 1,
+         'if or not randomly crop the image'),
+        ('--random-mirror', int, 1,
+         'if or not randomly flip horizontally'),
+        ('--max-random-h', int, 0,
+         'max change of hue, whose range is [0, 180]'),
+        ('--max-random-s', int, 0,
+         'max change of saturation, whose range is [0, 255]'),
+        ('--max-random-l', int, 0,
+         'max change of intensity, whose range is [0, 255]'),
+        ('--max-random-aspect-ratio', float, 0,
+         'max change of aspect ratio, whose range is [0, 1]'),
+        ('--max-random-rotate-angle', int, 0,
+         'max angle to rotate, whose range is [0, 360]'),
+        ('--max-random-shear-ratio', float, 0,
+         'max ratio to shear, whose range is [0, 1]'),
+        ('--max-random-scale', float, 1,
+         'max ratio to scale'),
+        ('--min-random-scale', float, 1,
+         'min ratio to scale, should >= img_size/input_shape. otherwise use --pad-size'),
+    )
+    for flag, value_type, default, help_text in options:
+        group.add_argument(flag, type=value_type, default=default, help=help_text)
+    return group
+
+
+def set_data_aug_level(aug, level):
+    """
+    aug : argument group (or parser) whose defaults are adjusted
+    level : augmentation level; every tier whose threshold is <= level
+            is applied, so higher levels include the lower ones
+    """
+    tiers = (
+        (1, dict(random_crop=1, random_mirror=1)),
+        (2, dict(max_random_h=36, max_random_s=50, max_random_l=50)),
+        (3, dict(max_random_rotate_angle=10, max_random_shear_ratio=0.1,
+                 max_random_aspect_ratio=0.25)),
+    )
+    for threshold, defaults in tiers:
+        if level >= threshold:
+            aug.set_defaults(**defaults)
+
+
+class SyntheticDataIter(DataIter):
+    """Iterator that serves the same randomly generated batch repeatedly.
+
+    One label array and one data array are drawn at construction time and
+    placed on the `cpu_pinned` context; every call to `next` returns that
+    same pair until `max_iter` batches have been produced.
+    """
+
+    def __init__(self, num_classes, data_shape, max_iter, dtype):
+        """
+        num_classes : labels are drawn uniformly from [0, num_classes)
+        data_shape : shape of one batch; its first entry is the batch size
+        max_iter : number of batches served before StopIteration
+        dtype : dtype used for both the data and the label arrays
+        """
+        self.batch_size = data_shape[0]
+        self.cur_iter = 0
+        self.max_iter = max_iter
+        self.dtype = dtype
+
+        def to_pinned(values):
+            # Copy a numpy array into an NDArray on the cpu_pinned context.
+            return mx.nd.array(values, dtype=self.dtype,
+                               ctx=mx.Context('cpu_pinned', 0))
+
+        # Labels are drawn before the data, so a seeded numpy RNG yields
+        # the same arrays on every run.
+        random_label = np.random.randint(0, num_classes, [self.batch_size, ])
+        random_data = np.random.uniform(-1, 1, data_shape)
+        self.data = to_pinned(random_data)
+        self.label = to_pinned(random_label)
+
+    def __iter__(self):
+        return self
+
+    @property
+    def provide_data(self):
+        """Description (name, shape, dtype) of the single data input."""
+        return [mx.io.DataDesc('data', self.data.shape, self.dtype)]
+
+    @property
+    def provide_label(self):
+        """Description (name, shape, dtype) of the single label input."""
+        return [mx.io.DataDesc('softmax_label', (self.batch_size,), self.dtype)]
+
+    def next(self):
+        """Return the cached batch, or raise StopIteration after max_iter calls."""
+        self.cur_iter += 1
+        if self.cur_iter <= self.max_iter:
+            return DataBatch(data=(self.data,),
+                             label=(self.label,),
+                             pad=0,
+                             index=None,
+                             provide_data=self.provide_data,
+                             provide_label=self.provide_label)
+        raise StopIteration
+
+    def __next__(self):
+        return self.next()
+
+    def reset(self):
+        """Rewind the batch counter so the iterator can be consumed again."""
+        self.cur_iter = 0
+
+
+def get_rec_iter(args, kv=None):
+    """
+    args : parsed arguments (data, augmentation and batch-size options)
+    kv : optional kvstore; when given, its rank / num_workers select the
+         data partition read by this worker, otherwise partition 0 of 1
+    return (train, val) iterators; val is None in benchmark mode or when
+    args.data_val is None
+    """
+    image_shape = tuple(int(dim) for dim in args.image_shape.split(','))
+
+    # Benchmark mode: feed synthetic data and skip the record files.
+    if 'benchmark' in args and args.benchmark:
+        data_shape = (args.batch_size,) + image_shape
+        synthetic = SyntheticDataIter(args.num_classes, data_shape,
+                                      args.num_examples / args.batch_size,
+                                      np.float32)
+        return (synthetic, None)
+
+    rank, nworker = (kv.rank, kv.num_workers) if kv else (0, 1)
+    mean_r, mean_g, mean_b = [float(i) for i in args.rgb_mean.split(',')][:3]
+
+    # Settings shared by the training and the validation iterator.
+    shared = dict(
+        label_width=1,
+        mean_r=mean_r,
+        mean_g=mean_g,
+        mean_b=mean_b,
+        data_name='data',
+        label_name='softmax_label',
+        data_shape=image_shape,
+        batch_size=args.batch_size,
+        preprocess_threads=args.data_nthreads,
+        num_parts=nworker,
+        part_index=rank)
+
+    train = mx.io.ImageRecordIter(
+        path_imgrec=args.data_train,
+        path_imgidx=args.data_train_idx,
+        rand_crop=args.random_crop,
+        max_random_scale=args.max_random_scale,
+        pad=args.pad_size,
+        fill_value=127,
+        min_random_scale=args.min_random_scale,
+        max_aspect_ratio=args.max_random_aspect_ratio,
+        random_h=args.max_random_h,
+        random_s=args.max_random_s,
+        random_l=args.max_random_l,
+        max_rotate_angle=args.max_random_rotate_angle,
+        max_shear_ratio=args.max_random_shear_ratio,
+        rand_mirror=args.random_mirror,
+        shuffle=True,
+        **shared)
+    if args.data_val is None:
+        return (train, None)
+
+    # Validation data is read without random crop or mirror.
+    val = mx.io.ImageRecordIter(
+        path_imgrec=args.data_val,
+        path_imgidx=args.data_val_idx,
+        rand_crop=False,
+        rand_mirror=False,
+        **shared)
+    return (train, val)
--- a/mxnet/cifar10/common/find_mxnet.py
+++ b/mxnet/cifar10/common/find_mxnet.py
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import os, sys
+
+# Prefer an importable mxnet. If the import fails, append the
+# "../../../python" directory (resolved relative to this file) to
+# sys.path and try the import once more.
+try:
+    import mxnet as mx
+except ImportError:
+    curr_path = os.path.abspath(os.path.dirname(__file__))
+    sys.path.append(os.path.join(curr_path, "../../../python"))
+    import mxnet as mx
--- a/mxnet/cifar10/common/fit.py
+++ b/mxnet/cifar10/common/fit.py
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+""" example train fit utility """
+import logging
+import os
+import time
+import re
+import math
+import mxnet as mx
+
+
+def _get_lr_scheduler(args, kv):
+    """
+    args : parsed arguments (lr, lr_factor, lr_step_epochs, num_examples,
+           batch_size, kv_store, load_epoch, num_epochs)
+    kv : kvstore; only its num_workers is read, and only when
+         args.kv_store contains 'dist'
+    return (lr, scheduler) where lr is the learning rate to start from and
+    scheduler is an mx.lr_scheduler instance, or None for a constant rate
+
+    No scheduler is returned when lr_factor is missing or >= 1, when
+    lr_step_epochs is unset or empty, or when training resumes after the
+    last step epoch (no future step is left to schedule).
+    """
+    if 'lr_factor' not in args or args.lr_factor >= 1:
+        return (args.lr, None)
+    step_spec = getattr(args, 'lr_step_epochs', None)
+    if not step_spec:
+        # No decay points were requested, so the rate stays constant.
+        return (args.lr, None)
+
+    # Number of updates per epoch seen by this worker.
+    epoch_size = args.num_examples / args.batch_size
+    if 'dist' in args.kv_store:
+        epoch_size /= kv.num_workers
+    begin_epoch = args.load_epoch if args.load_epoch else 0
+
+    if 'pow' in step_spec:
+        # "pow<N>" selects polynomial decay with power N over the whole run.
+        lr = args.lr
+        max_up = args.num_epochs * epoch_size
+        pwr = float(re.sub('pow[- ]*', '', step_spec))
+        poly_sched = mx.lr_scheduler.PolyScheduler(max_up, lr, pwr)
+        return (lr, poly_sched)
+
+    step_epochs = [int(epoch) for epoch in step_spec.split(',')]
+    # Apply the decay of every step that lies at or before the resume epoch.
+    lr = args.lr
+    for epoch in step_epochs:
+        if begin_epoch >= epoch:
+            lr *= args.lr_factor
+    if lr != args.lr:
+        logging.info('Adjust learning rate to %e for epoch %d',
+                     lr, begin_epoch)
+
+    steps = [epoch_size * (epoch - begin_epoch)
+             for epoch in step_epochs if epoch - begin_epoch > 0]
+    if not steps:
+        # Every step epoch is already behind us; MultiFactorScheduler
+        # rejects an empty step list, so keep the adjusted rate constant.
+        return (lr, None)
+    return (lr, mx.lr_scheduler.MultiFactorScheduler(step=steps, factor=args.lr_factor))
+
+
+def _load_model(args, rank=0):
+    """
+    args : parsed arguments (load_epoch, model_prefix)
+    rank : worker rank; for rank > 0 the prefix "<model_prefix>-<rank>" is
+           used when its symbol file exists, otherwise the plain prefix
+    return (sym, arg_params, aux_params) from the checkpoint, or
+    (None, None, None) when no load_epoch was requested
+    """
+    if 'load_epoch' not in args or args.load_epoch is None:
+        return (None, None, None)
+    assert args.model_prefix is not None, 'load_epoch requires model_prefix'
+    model_prefix = args.model_prefix
+    if rank > 0 and os.path.exists("%s-%d-symbol.json" % (model_prefix, rank)):
+        model_prefix += "-%d" % (rank)
+    sym, arg_params, aux_params = mx.model.load_checkpoint(
+        model_prefix, args.load_epoch)
+    # Checkpoint files are named "<prefix>-<epoch>.params" (hyphen), so the
+    # message reports the file name that was actually read.
+    logging.info('Loaded model %s-%04d.params', model_prefix, args.load_epoch)
+    return (sym, arg_params, aux_params)
+
+
+def _save_model(args, rank=0):
+    """
+    args : parsed arguments (model_prefix)
+    rank : worker rank; workers other than 0 save under
+           "<model_prefix>-<rank>"
+    return an epoch-end checkpoint callback, or None when no model_prefix
+    is configured
+    """
+    if args.model_prefix is None:
+        return None
+    dst_dir = os.path.dirname(args.model_prefix)
+    # A bare prefix such as "train" has no directory part; nothing to create.
+    # makedirs also creates missing parents and tolerates another worker
+    # having created the directory first.
+    if dst_dir:
+        os.makedirs(dst_dir, exist_ok=True)
+    prefix = args.model_prefix if rank == 0 else "%s-%d" % (
+        args.model_prefix, rank)
+    return mx.callback.do_checkpoint(prefix)
+
+
+def add_fit_args(parser):
+    """
+    parser : argparse.ArgumentParser
+    return the 'Training' argument group holding every option required
+    by fit (all options, including --monitor, belong to that group)
+    """
+    train = parser.add_argument_group('Training', 'model training')
+    train.add_argument('--network', type=str,
+                       help='the neural network to use')
+    train.add_argument('--num-layers', type=int,
+                       help='number of layers in the neural network, \
+                             required by some networks such as resnet')
+    train.add_argument('--gpus', type=str,
+                       help='list of gpus to run, e.g. 0 or 0,2,5. empty means using cpu')
+    train.add_argument('--kv-store', type=str, default='device',
+                       help='key-value store type')
+    train.add_argument('--num-epochs', type=int, default=100,
+                       help='max num of epochs')
+    train.add_argument('--lr', type=float, default=0.1,
+                       help='initial learning rate')
+    train.add_argument('--lr-factor', type=float, default=0.1,
+                       help='the ratio to reduce lr on each step')
+    train.add_argument('--lr-step-epochs', type=str,
+                       help='the epochs to reduce the lr, e.g. 30,60')
+    train.add_argument('--initializer', type=str, default='default',
+                       help='the initializer type')
+    train.add_argument('--optimizer', type=str, default='sgd',
+                       help='the optimizer type')
+    train.add_argument('--mom', type=float, default=0.9,
+                       help='momentum for sgd')
+    train.add_argument('--wd', type=float, default=0.0001,
+                       help='weight decay for sgd')
+    train.add_argument('--batch-size', type=int, default=128,
+                       help='the batch size')
+    train.add_argument('--disp-batches', type=int, default=20,
+                       help='show progress for every n batches')
+    train.add_argument('--model-prefix', type=str,
+                       help='model prefix')
+    # Registered on the group like every other option so that it is listed
+    # under 'Training' in --help; parsing is unaffected.
+    train.add_argument('--monitor', dest='monitor', type=int, default=0,
+                       help='log network parameters every N iters if larger than 0')
+    train.add_argument('--load-epoch', type=int,
+                       help='load the model on an epoch using the model-prefix')
+    train.add_argument('--top-k', type=int, default=0,
+                       help='report the top-k accuracy. 0 means no report.')
+    train.add_argument('--loss', type=str, default='',
+                       help='show the cross-entropy or nll loss. ce stands for cross-entropy, nll-loss stands for likelihood loss')
+    train.add_argument('--test-io', type=int, default=0,
+                       help='1 means test reading speed without training')
+    train.add_argument('--dtype', type=str, default='float32',
+                       help='precision: float32 or float16')
+    train.add_argument('--gc-type', type=str, default='none',
+                       help='type of gradient compression to use, \
+                             takes `2bit` or `none` for now')
+    train.add_argument('--gc-threshold', type=float, default=0.5,
+                       help='threshold for 2bit gradient compression')
+    # additional parameters for large batch sgd
+    train.add_argument('--macrobatch-size', type=int, default=0,
+                       help='distributed effective batch size')
+    train.add_argument('--warmup-epochs', type=int, default=5,
+                       help='the epochs to ramp-up lr to scaled large-batch value')
+    train.add_argument('--warmup-strategy', type=str, default='linear',
+                       help='the ramping-up strategy for large batch sgd')
+    return train
+
+
+def _create_initializer(args):
+    """Return the weight initializer selected by args.initializer.
+
+    'default' picks an initializer from args.network; any other value must
+    be one of the known initializer names, otherwise ValueError is raised.
+    """
+    name = args.initializer
+    if name == 'default':
+        if args.network == 'alexnet':
+            # AlexNet will not converge using Xavier
+            return mx.init.Normal()
+        if 'vgg' in args.network:
+            # VGG will not trend to converge using Xavier-Gaussian
+            return mx.init.Xavier()
+        return mx.init.Xavier(
+            rnd_type='gaussian', factor_type="in", magnitude=2)
+    factories = {
+        'xavier': mx.init.Xavier,
+        'msra': mx.init.MSRAPrelu,
+        'orthogonal': mx.init.Orthogonal,
+        'normal': mx.init.Normal,
+        'uniform': mx.init.Uniform,
+        'one': mx.init.One,
+        'zero': mx.init.Zero,
+    }
+    if name not in factories:
+        # Fail with a clear message instead of leaving the initializer
+        # undefined until it is first used.
+        raise ValueError('unknown initializer %r, expected one of: default, %s'
+                         % (name, ', '.join(sorted(factories))))
+    return factories[name]()
+
+
+def fit(args, network, data_loader, **kwargs):
+    """
+    train a model
+    args : argparse returns
+    network : the symbol definition of the neural network
+    data_loader : function that returns the train and val data iterators
+    kwargs : optional extras
+        arg_params, aux_params : when both are given they are used as the
+            initial parameters and no checkpoint is loaded
+        batch_end_callback : a callable or a list of callables run after
+            each batch, in addition to the speedometer
+    raises ValueError when args.initializer is not a known initializer
+    returns None
+    """
+    # kvstore
+    kv = mx.kvstore.create(args.kv_store)
+    if args.gc_type != 'none':
+        kv.set_gradient_compression({'type': args.gc_type,
+                                     'threshold': args.gc_threshold})
+
+    # logging
+    head = '%(asctime)-15s Node[' + str(kv.rank) + '] %(message)s'
+    logging.basicConfig(level=logging.DEBUG, format=head)
+    logging.info('start with arguments %s', args)
+
+    # data iterators
+    (train, val) = data_loader(args, kv)
+    if args.test_io:
+        # Only measure how fast batches can be read; no training happens.
+        tic = time.time()
+        for i, batch in enumerate(train):
+            for j in batch.data:
+                j.wait_to_read()
+            if (i + 1) % args.disp_batches == 0:
+                logging.info('Batch [%d]\tSpeed: %.2f samples/sec', i,
+                             args.disp_batches * args.batch_size / (time.time() - tic))
+                tic = time.time()
+
+        return
+
+    # load model
+    if 'arg_params' in kwargs and 'aux_params' in kwargs:
+        arg_params = kwargs['arg_params']
+        aux_params = kwargs['aux_params']
+    else:
+        sym, arg_params, aux_params = _load_model(args, kv.rank)
+        if sym is not None:
+            # The checkpoint must describe the same network being trained.
+            assert sym.tojson() == network.tojson()
+
+    # save model
+    checkpoint = _save_model(args, kv.rank)
+
+    # devices for training
+    devs = mx.cpu() if args.gpus is None or args.gpus == "" else [
+        mx.gpu(int(i)) for i in args.gpus.split(',')]
+
+    # learning rate
+    lr, lr_scheduler = _get_lr_scheduler(args, kv)
+
+    # create model
+    model = mx.mod.Module(
+        context=devs,
+        symbol=network
+    )
+
+    optimizer_params = {
+        'learning_rate': lr,
+        'wd': args.wd,
+        'lr_scheduler': lr_scheduler,
+        'multi_precision': True}
+
+    # Only a limited number of optimizers have 'momentum' property
+    has_momentum = {'sgd', 'dcasgd', 'nag'}
+    if args.optimizer in has_momentum:
+        optimizer_params['momentum'] = args.mom
+
+    monitor = mx.mon.Monitor(
+        args.monitor, pattern=".*") if args.monitor > 0 else None
+
+    # A limited number of optimizers have a warmup period
+    has_warmup = {'lbsgd', 'lbnag'}
+    if args.optimizer in has_warmup:
+        if 'dist' in args.kv_store:
+            nworkers = kv.num_workers
+        else:
+            nworkers = 1
+        epoch_size = args.num_examples / args.batch_size / nworkers
+        if epoch_size < 1:
+            epoch_size = 1
+        # The effective batch is never smaller than one batch per worker.
+        macrobatch_size = args.macrobatch_size
+        if macrobatch_size < args.batch_size * nworkers:
+            macrobatch_size = args.batch_size * nworkers
+        batch_scale = math.ceil(
+            float(macrobatch_size) / args.batch_size / nworkers)
+        optimizer_params['updates_per_epoch'] = epoch_size
+        optimizer_params['begin_epoch'] = args.load_epoch if args.load_epoch else 0
+        optimizer_params['batch_scale'] = batch_scale
+        optimizer_params['warmup_strategy'] = args.warmup_strategy
+        optimizer_params['warmup_epochs'] = args.warmup_epochs
+        optimizer_params['num_epochs'] = args.num_epochs
+
+    initializer = _create_initializer(args)
+
+    # evaluation metrics
+    eval_metrics = ['accuracy']
+    if args.top_k > 0:
+        eval_metrics.append(mx.metric.create(
+            'top_k_accuracy', top_k=args.top_k))
+
+    supported_loss = ['ce', 'nll_loss']
+    if len(args.loss) > 0:
+        # ce or nll loss is only applicable to softmax output
+        loss_type_list = args.loss.split(',')
+        if 'softmax_output' in network.list_outputs():
+            for loss_type in loss_type_list:
+                loss_type = loss_type.strip()
+                if loss_type == 'nll':
+                    loss_type = 'nll_loss'
+                if loss_type not in supported_loss:
+                    logging.warning(loss_type + ' is not an valid loss type, only cross-entropy or ' \
+                                                'negative likelihood loss is supported!')
+                else:
+                    eval_metrics.append(mx.metric.create(loss_type))
+        else:
+            logging.warning("The output is not softmax_output, loss argument will be skipped!")
+
+    # callbacks that run after each batch
+    batch_end_callbacks = [mx.callback.Speedometer(
+        args.batch_size, args.disp_batches)]
+    if 'batch_end_callback' in kwargs:
+        cbs = kwargs['batch_end_callback']
+        batch_end_callbacks += cbs if isinstance(cbs, list) else [cbs]
+    # run
+    model.fit(train,
+              begin_epoch=args.load_epoch if args.load_epoch else 0,
+              num_epoch=args.num_epochs,
+              eval_data=val,
+              eval_metric=eval_metrics,
+              kvstore=kv,
+              optimizer=args.optimizer,
+              optimizer_params=optimizer_params,
+              initializer=initializer,
+              arg_params=arg_params,
+              aux_params=aux_params,
+              batch_end_callback=batch_end_callbacks,
+              epoch_end_callback=checkpoint,
+              allow_missing=True,
+              monitor=monitor)
--- a/mxnet/cifar10/common/modelzoo.py
+++ b/mxnet/cifar10/common/modelzoo.py
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import os
+from common.util import download_file
+
+# Root URL that every entry of the table below is built on.
+_base_model_url = 'http://data.mxnet.io/models/'
+# Default registry used by download_model(): model name -> URLs of its
+# 'symbol' (network definition, JSON) and 'params' (weights) files.
+_default_model_info = {
+    'imagenet1k-inception-bn': {
+        'symbol': _base_model_url + 'imagenet/inception-bn/Inception-BN-symbol.json',
+        'params': _base_model_url + 'imagenet/inception-bn/Inception-BN-0126.params'},
+    'imagenet1k-resnet-18': {
+        'symbol': _base_model_url + 'imagenet/resnet/18-layers/resnet-18-symbol.json',
+        'params': _base_model_url + 'imagenet/resnet/18-layers/resnet-18-0000.params'},
+    'imagenet1k-resnet-34': {
+        'symbol': _base_model_url + 'imagenet/resnet/34-layers/resnet-34-symbol.json',
+        'params': _base_model_url + 'imagenet/resnet/34-layers/resnet-34-0000.params'},
+    'imagenet1k-resnet-50': {
+        'symbol': _base_model_url + 'imagenet/resnet/50-layers/resnet-50-symbol.json',
+        'params': _base_model_url + 'imagenet/resnet/50-layers/resnet-50-0000.params'},
+    'imagenet1k-resnet-101': {
+        'symbol': _base_model_url + 'imagenet/resnet/101-layers/resnet-101-symbol.json',
+        'params': _base_model_url + 'imagenet/resnet/101-layers/resnet-101-0000.params'},
+    'imagenet1k-resnet-152': {
+        'symbol': _base_model_url + 'imagenet/resnet/152-layers/resnet-152-symbol.json',
+        'params': _base_model_url + 'imagenet/resnet/152-layers/resnet-152-0000.params'},
+    'imagenet1k-resnext-50': {
+        'symbol': _base_model_url + 'imagenet/resnext/50-layers/resnext-50-symbol.json',
+        'params': _base_model_url + 'imagenet/resnext/50-layers/resnext-50-0000.params'},
+    'imagenet1k-resnext-101': {
+        'symbol': _base_model_url + 'imagenet/resnext/101-layers/resnext-101-symbol.json',
+        'params': _base_model_url + 'imagenet/resnext/101-layers/resnext-101-0000.params'},
+    'imagenet1k-resnext-101-64x4d': {
+        'symbol': _base_model_url + 'imagenet/resnext/101-layers/resnext-101-64x4d-symbol.json',
+        'params': _base_model_url + 'imagenet/resnext/101-layers/resnext-101-64x4d-0000.params'},
+    'imagenet11k-resnet-152': {
+        'symbol': _base_model_url + 'imagenet-11k/resnet-152/resnet-152-symbol.json',
+        'params': _base_model_url + 'imagenet-11k/resnet-152/resnet-152-0000.params'},
+    'imagenet11k-place365ch-resnet-152': {
+        'symbol': _base_model_url + 'imagenet-11k-place365-ch/resnet-152-symbol.json',
+        'params': _base_model_url + 'imagenet-11k-place365-ch/resnet-152-0000.params'},
+    'imagenet11k-place365ch-resnet-50': {
+        'symbol': _base_model_url + 'imagenet-11k-place365-ch/resnet-50-symbol.json',
+        'params': _base_model_url + 'imagenet-11k-place365-ch/resnet-50-0000.params'},
+}
+
+
+def download_model(model_name, dst_dir='./', meta_info=None):
+    """Download the symbol and parameter files of a registered model.
+
+    Parameters
+    ----------
+    model_name : str
+        Key looked up in ``meta_info``.
+    dst_dir : str
+        Directory receiving the files; it is created, together with any
+        missing parent directories, when it does not exist yet.
+    meta_info : dict, optional
+        Mapping of model name to a dict holding the 'symbol' and 'params'
+        URLs. Defaults to ``_default_model_info``.
+
+    Returns
+    -------
+    tuple
+        ``(prefix, 0)`` where ``prefix`` is ``dst_dir`` joined with
+        ``model_name``; the files are stored as ``prefix + '-symbol.json'``
+        and ``prefix + '-0000.params'``. ``(None, 0)`` is returned when
+        ``model_name`` is not registered.
+
+    Raises
+    ------
+    AssertionError
+        If the registry entry lacks the 'symbol' or the 'params' URL.
+    """
+    if meta_info is None:
+        meta_info = _default_model_info
+    meta_info = dict(meta_info)
+    if model_name not in meta_info:
+        return (None, 0)
+    meta = dict(meta_info[model_name])
+    # Validate the whole entry before touching the disk or the network, so a
+    # malformed entry cannot leave a symbol file without its parameters.
+    assert 'symbol' in meta, "missing symbol url"
+    assert 'params' in meta, "missing parameter file url"
+    if not os.path.isdir(dst_dir):
+        # makedirs (not mkdir) so that nested destinations such as 'a/b/c' work
+        os.makedirs(dst_dir)
+    prefix = os.path.join(dst_dir, model_name)
+    download_file(meta['symbol'], prefix + '-symbol.json')
+    download_file(meta['params'], prefix + '-0000.params')
+    return (prefix, 0)
--- a/mxnet/cifar10/common/util.py
+++ b/mxnet/cifar10/common/util.py
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import subprocess
+import os
+import errno
+
+
+def download_file(url, local_fname=None, force_write=False):
+    """Download ``url`` to a local file and return the local file name.
+
+    Parameters
+    ----------
+    url : str
+        Address of the file to fetch.
+    local_fname : str, optional
+        Destination path. Defaults to the last '/'-separated component of
+        ``url``. Missing parent directories are created.
+    force_write : bool
+        When False (default) an already existing ``local_fname`` is returned
+        as is, without any network access.
+
+    Returns
+    -------
+    str
+        ``local_fname``.
+
+    Raises
+    ------
+    AssertionError
+        If the server does not answer with status code 200.
+    """
+    if local_fname is None:
+        local_fname = url.split('/')[-1]
+    if not force_write and os.path.exists(local_fname):
+        return local_fname
+
+    # requests is not default installed; import it only once a download is
+    # really needed so that cached files can be served without it.
+    import requests
+
+    dir_name = os.path.dirname(local_fname)
+    if dir_name != "" and not os.path.exists(dir_name):
+        try:  # try to create the directory if it doesn't exist
+            os.makedirs(dir_name)
+        except OSError as exc:
+            # EEXIST means the directory appeared in the meantime: not an error
+            if exc.errno != errno.EEXIST:
+                raise
+
+    r = requests.get(url, stream=True)
+    try:
+        # Raised explicitly instead of using ``assert`` so the check survives
+        # ``python -O``; the exception type seen by callers is unchanged.
+        if r.status_code != 200:
+            raise AssertionError("failed to open %s" % url)
+        out = open(local_fname, 'wb')
+        try:
+            with out:
+                for chunk in r.iter_content(chunk_size=1024):
+                    if chunk:  # filter out keep-alive new chunks
+                        out.write(chunk)
+        except BaseException:
+            # Drop the truncated file: otherwise the next call would find it
+            # on disk and return it as if the download had succeeded.
+            if os.path.exists(local_fname):
+                os.remove(local_fname)
+            raise
+    finally:
+        # a streamed response keeps its connection until it is closed
+        r.close()
+    return local_fname
+
+
+def get_gpus():
+    """
+    return a list of GPUs
+
+    Returns
+    -------
+    list of int
+        Indices ``0 .. n-1`` where ``n`` is the number of output lines of
+        ``nvidia-smi -L`` that contain 'GPU'. The list is empty when
+        ``nvidia-smi`` cannot be started or exits with an error.
+    """
+    try:
+        listing = subprocess.check_output(["nvidia-smi", "-L"], universal_newlines=True)
+    except (OSError, subprocess.CalledProcessError):
+        # OSError: the executable could not be started;
+        # CalledProcessError: it ran but returned a non-zero exit status.
+        return []
+    num_gpus = sum(1 for line in listing.split('\n') if 'GPU' in line)
+    # Materialise the range: on Python 3 a bare range object is not a list.
+    return list(range(num_gpus))
--- a/mxnet/cifar10/symbols/__init__.py
+++ b/mxnet/cifar10/symbols/__init__.py
--- a/mxnet/cifar10/symbols/__pycache__/__init__.cpython-36.pyc
+++ b/mxnet/cifar10/symbols/__pycache__/__init__.cpython-36.pyc
--- a/mxnet/cifar10/symbols/__pycache__/resnet.cpython-36.pyc
+++ b/mxnet/cifar10/symbols/__pycache__/resnet.cpython-36.pyc
--- a/mxnet/cifar10/symbols/resnet.py
+++ b/mxnet/cifar10/symbols/resnet.py
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+'''
+Adapted from https://github.com/tornadomeet/ResNet/blob/master/symbol_resnet.py
+Original author Wei Wu
+
+Implemented the following paper:
+
+Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. "Identity Mappings in Deep Residual Networks"
+'''
+import mxnet as mx
+import numpy as np
+
+
+def residual_unit(data, num_filter, stride, dim_match, name, bottle_neck=True, bn_mom=0.9,
+                  workspace=256, memonger=False):
+    """Return ResNet Unit symbol for building ResNet
+
+    Every convolution is preceded by BatchNorm and ReLU; the output of the
+    last convolution is added to a shortcut branch.
+
+    Parameters
+    ----------
+    data : Symbol
+        Input data
+    num_filter : int
+        Number of output channels
+    stride : tuple
+        Stride used in convolution (applied to the 3x3 convolution that
+        changes resolution and to the projection shortcut)
+    dim_match : Boolean
+        True means channel number between input and output is the same, so
+        the shortcut is ``data`` itself; otherwise the shortcut is a 1x1
+        convolution of the first activation
+    name : str
+        Base name of the operators
+    bottle_neck : Boolean
+        True builds a 1x1 -> 3x3 -> 1x1 stack whose first two convolutions
+        use int(num_filter * 0.25) filters; False builds two 3x3 convolutions
+    bn_mom : float
+        Momentum of the BatchNorm operators
+    workspace : int
+        Workspace used in convolution operator
+    memonger : Boolean
+        When True the attribute mirror_stage='True' is set on the shortcut
+        symbol (presumably consumed by the memonger memory-saving tool;
+        TODO confirm)
+
+    Returns
+    -------
+    Symbol
+        Sum of the last convolution and the shortcut
+    """
+    if bottle_neck:
+        # same layout as https://github.com/facebook/fb.resnet.torch#notes,
+        # which differs slightly from the original paper
+        bn1 = mx.sym.BatchNorm(data=data, fix_gamma=False, eps=2e-5, momentum=bn_mom,
+                               name=name + '_bn1')
+        act1 = mx.sym.Activation(data=bn1, act_type='relu', name=name + '_relu1')
+        conv1 = mx.sym.Convolution(data=act1, num_filter=int(num_filter * 0.25), kernel=(1, 1),
+                                   stride=(1, 1), pad=(0, 0),
+                                   no_bias=True, workspace=workspace, name=name + '_conv1')
+        bn2 = mx.sym.BatchNorm(data=conv1, fix_gamma=False, eps=2e-5, momentum=bn_mom,
+                               name=name + '_bn2')
+        act2 = mx.sym.Activation(data=bn2, act_type='relu', name=name + '_relu2')
+        # the 3x3 convolution is the only one of the stack that uses `stride`
+        conv2 = mx.sym.Convolution(data=act2, num_filter=int(num_filter * 0.25), kernel=(3, 3),
+                                   stride=stride, pad=(1, 1),
+                                   no_bias=True, workspace=workspace, name=name + '_conv2')
+        bn3 = mx.sym.BatchNorm(data=conv2, fix_gamma=False, eps=2e-5, momentum=bn_mom,
+                               name=name + '_bn3')
+        act3 = mx.sym.Activation(data=bn3, act_type='relu', name=name + '_relu3')
+        conv3 = mx.sym.Convolution(data=act3, num_filter=num_filter, kernel=(1, 1), stride=(1, 1),
+                                   pad=(0, 0), no_bias=True,
+                                   workspace=workspace, name=name + '_conv3')
+        if dim_match:
+            shortcut = data
+        else:
+            # projection shortcut: taken from act1 (after BatchNorm + ReLU), not from data
+            shortcut = mx.sym.Convolution(data=act1, num_filter=num_filter, kernel=(1, 1),
+                                          stride=stride, no_bias=True,
+                                          workspace=workspace, name=name + '_sc')
+        if memonger:
+            shortcut._set_attr(mirror_stage='True')
+        return conv3 + shortcut
+    else:
+        bn1 = mx.sym.BatchNorm(data=data, fix_gamma=False, momentum=bn_mom, eps=2e-5,
+                               name=name + '_bn1')
+        act1 = mx.sym.Activation(data=bn1, act_type='relu', name=name + '_relu1')
+        # here the first 3x3 convolution is the one that uses `stride`
+        conv1 = mx.sym.Convolution(data=act1, num_filter=num_filter, kernel=(3, 3), stride=stride,
+                                   pad=(1, 1),
+                                   no_bias=True, workspace=workspace, name=name + '_conv1')
+        bn2 = mx.sym.BatchNorm(data=conv1, fix_gamma=False, momentum=bn_mom, eps=2e-5,
+                               name=name + '_bn2')
+        act2 = mx.sym.Activation(data=bn2, act_type='relu', name=name + '_relu2')
+        conv2 = mx.sym.Convolution(data=act2, num_filter=num_filter, kernel=(3, 3), stride=(1, 1),
+                                   pad=(1, 1),
+                                   no_bias=True, workspace=workspace, name=name + '_conv2')
+        if dim_match:
+            shortcut = data
+        else:
+            # projection shortcut: taken from act1 (after BatchNorm + ReLU), not from data
+            shortcut = mx.sym.Convolution(data=act1, num_filter=num_filter, kernel=(1, 1),
+                                          stride=stride, no_bias=True,
+                                          workspace=workspace, name=name + '_sc')
+        if memonger:
+            shortcut._set_attr(mirror_stage='True')
+        return conv2 + shortcut
+
+
+def resnet(units, num_stages, filter_list, num_classes, image_shape, bottle_neck=True, bn_mom=0.9,
+           workspace=256, dtype='float32', memonger=False):
+    """Return the symbol of a complete ResNet classifier
+
+    Parameters
+    ----------
+    units : list
+        Number of units in each stage
+    num_stages : int
+        Number of stage; must equal len(units)
+    filter_list : list
+        Channel size of the first convolution followed by the channel size
+        of each stage (indices 0 .. num_stages are read)
+    num_classes : int
+        Output size of symbol
+    image_shape : sequence of three ints
+        (channels, height, width); only height is used. Inputs with
+        height <= 32 get a 3x3 stride-1 first convolution, larger inputs a
+        7x7 stride-2 convolution followed by BatchNorm, ReLU and max pooling
+    bottle_neck : Boolean
+        Forwarded to every residual_unit
+    bn_mom : float
+        Momentum of the BatchNorm operators created in this function; it is
+        not forwarded to residual_unit, which keeps its own default
+    workspace : int
+        Workspace used in convolution operator
+    dtype : str
+        Precision (float32 or float16)
+    memonger : Boolean
+        Forwarded to every residual_unit
+
+    Returns
+    -------
+    Symbol
+        SoftmaxOutput named 'softmax' applied to the final fully connected
+        layer
+    """
+    num_unit = len(units)
+    assert (num_unit == num_stages)
+    data = mx.sym.Variable(name='data')
+    # float32 passes through an identity operator, float16 is cast; any other
+    # dtype value leaves `data` untouched
+    if dtype == 'float32':
+        data = mx.sym.identity(data=data, name='id')
+    else:
+        if dtype == 'float16':
+            data = mx.sym.Cast(data=data, dtype=np.float16)
+    data = mx.sym.BatchNorm(data=data, fix_gamma=True, eps=2e-5, momentum=bn_mom, name='bn_data')
+    # nchannel and width are unpacked but not used below
+    (nchannel, height, width) = image_shape
+    if height <= 32:  # such as cifar10
+        body = mx.sym.Convolution(data=data, num_filter=filter_list[0], kernel=(3, 3),
+                                  stride=(1, 1), pad=(1, 1),
+                                  no_bias=True, name="conv0", workspace=workspace)
+    else:  # often expected to be 224 such as imagenet
+        body = mx.sym.Convolution(data=data, num_filter=filter_list[0], kernel=(7, 7),
+                                  stride=(2, 2), pad=(3, 3),
+                                  no_bias=True, name="conv0", workspace=workspace)
+        body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name='bn0')
+        body = mx.sym.Activation(data=body, act_type='relu', name='relu0')
+        body = mx.sym.Pooling(data=body, kernel=(3, 3), stride=(2, 2), pad=(1, 1), pool_type='max')
+
+    for i in range(num_stages):
+        # first unit of a stage: projection shortcut (dim_match=False); stride 1
+        # in the first stage and stride 2 in every later stage
+        body = residual_unit(body, filter_list[i + 1], (1 if i == 0 else 2, 1 if i == 0 else 2),
+                             False,
+                             name='stage%d_unit%d' % (i + 1, 1), bottle_neck=bottle_neck,
+                             workspace=workspace,
+                             memonger=memonger)
+        # remaining units of the stage: identity shortcut, stride 1
+        for j in range(units[i] - 1):
+            body = residual_unit(body, filter_list[i + 1], (1, 1), True,
+                                 name='stage%d_unit%d' % (i + 1, j + 2),
+                                 bottle_neck=bottle_neck, workspace=workspace, memonger=memonger)
+    bn1 = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name='bn1')
+    relu1 = mx.sym.Activation(data=bn1, act_type='relu', name='relu1')
+    # Although kernel is not used here when global_pool=True, we should put one
+    pool1 = mx.sym.Pooling(data=relu1, global_pool=True, kernel=(7, 7), pool_type='avg',
+                           name='pool1')
+    flat = mx.sym.Flatten(data=pool1)
+    fc1 = mx.sym.FullyConnected(data=flat, num_hidden=num_classes, name='fc1')
+    # cast float16 activations back to float32 before the softmax output
+    if dtype == 'float16':
+        fc1 = mx.sym.Cast(data=fc1, dtype=np.float32)
+    return mx.sym.SoftmaxOutput(data=fc1, name='softmax')
+
+
+def get_symbol(num_classes, num_layers, image_shape, conv_workspace=256, dtype='float32', **kwargs):
+    """
+    Adapted from https://github.com/tornadomeet/ResNet/blob/master/train_resnet.py
+    Original author Wei Wu
+
+    Translate a layer count and an input shape into a ResNet configuration
+    and build the symbol with ``resnet``.
+
+    Parameters
+    ----------
+    num_classes : int
+        Output size of the symbol
+    num_layers : int
+        Depth of the network; only the depths handled below are accepted
+    image_shape : str
+        Comma separated "channels,height,width"
+    conv_workspace : int
+        Workspace used in convolution operator
+    dtype : str
+        Precision, forwarded to ``resnet``
+    **kwargs
+        Accepted and ignored
+
+    Raises
+    ------
+    ValueError
+        If ``num_layers`` is not supported for the given input height
+    """
+    shape = [int(dim) for dim in image_shape.split(',')]
+    _, height, _ = shape
+
+    if height <= 28:
+        # small inputs: three stages that all hold the same number of units
+        num_stages = 3
+        depth = num_layers - 2
+        if depth % 9 == 0 and num_layers >= 164:
+            units_per_stage = depth // 9
+            filter_list = [16, 64, 128, 256]
+            bottle_neck = True
+        elif depth % 6 == 0 and num_layers < 164:
+            units_per_stage = depth // 6
+            filter_list = [16, 16, 32, 64]
+            bottle_neck = False
+        else:
+            raise ValueError(
+                "no tasks done on num_layers {}, you can do it yourself".format(num_layers))
+        units = [units_per_stage] * num_stages
+    else:
+        # larger inputs: four stages, units per stage looked up by depth
+        num_stages = 4
+        if num_layers >= 50:
+            bottle_neck = True
+            filter_list = [64, 256, 512, 1024, 2048]
+        else:
+            bottle_neck = False
+            filter_list = [64, 64, 128, 256, 512]
+        units_by_depth = {
+            18: [2, 2, 2, 2],
+            34: [3, 4, 6, 3],
+            50: [3, 4, 6, 3],
+            101: [3, 4, 23, 3],
+            152: [3, 8, 36, 3],
+            200: [3, 24, 36, 3],
+            269: [3, 30, 48, 8],
+        }
+        if num_layers not in units_by_depth:
+            raise ValueError(
+                "no tasks done on num_layers {}, you can do it yourself".format(num_layers))
+        units = units_by_depth[num_layers]
+
+    return resnet(units=units,
+                  num_stages=num_stages,
+                  filter_list=filter_list,
+                  num_classes=num_classes,
+                  image_shape=shape,
+                  bottle_neck=bottle_neck,
+                  workspace=conv_workspace,
+                  dtype=dtype)
--- a/mxnet/cifar10/test_cifar10.py
+++ b/mxnet/cifar10/test_cifar10.py
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import autocnn_helper as helper
+import mxnet as mx
+import time
+import logging
+
+
+def score(model_prefix, epoch, data_val, metrics, gpus, batch_size, rgb_mean=None, mean_img=None,
+          image_shape='3,224,224', data_nthreads=4, label_name='softmax_label', max_num_examples=None):
+    """Evaluate a saved checkpoint on a record file and return the throughput.
+
+    Parameters
+    ----------
+    model_prefix : str
+        Prefix of the checkpoint, passed to ``mx.model.load_checkpoint``
+    epoch : int
+        Epoch of the checkpoint, passed to ``mx.model.load_checkpoint``
+    data_val : str
+        Path of the image record file to read
+    metrics : metric or list of metrics
+        Updated in place with every evaluated batch
+    gpus : str or None
+        Comma separated GPU ids; '' or None selects the CPU
+    batch_size : int
+        Batch size of the data iterator
+    rgb_mean : str, optional
+        Comma separated "r,g,b" means; ignored when ``mean_img`` is given
+    mean_img : str, optional
+        Forwarded to the data iterator as ``mean_img``
+    image_shape : str
+        Comma separated data shape
+    data_nthreads : int
+        Number of preprocessing threads of the data iterator
+    label_name : str
+        Name of the label variable
+    max_num_examples : int, optional
+        Stop once more than this many examples have been counted
+
+    Returns
+    -------
+    float
+        Counted examples (batches times ``batch_size``) per second
+    """
+    # create data iterator
+    data_shape = tuple([int(i) for i in image_shape.split(',')])
+    # Start from "no mean arguments": previously, leaving both rgb_mean and
+    # mean_img at their None defaults left ``mean_args`` unbound below.
+    mean_args = {}
+    if mean_img is not None:
+        mean_args = {'mean_img': mean_img}
+    elif rgb_mean is not None:
+        rgb_mean = [float(i) for i in rgb_mean.split(',')]
+        mean_args = {'mean_r': rgb_mean[0], 'mean_g': rgb_mean[1],
+                     'mean_b': rgb_mean[2]}
+
+    data = mx.io.ImageRecordIter(
+        path_imgrec=data_val,
+        label_width=1,
+        preprocess_threads=data_nthreads,
+        batch_size=batch_size,
+        data_shape=data_shape,
+        label_name=label_name,
+        rand_crop=False,
+        rand_mirror=False,
+        **mean_args)
+    sym, arg_params, aux_params = mx.model.load_checkpoint(model_prefix, epoch)
+
+    # create module
+    if gpus is None or gpus == '':
+        # None is treated like '' instead of failing on ``None.split``
+        devs = mx.cpu()
+    else:
+        devs = [mx.gpu(int(i)) for i in gpus.split(',')]
+
+    mod = mx.mod.Module(symbol=sym, context=devs, label_names=[label_name, ])
+    mod.bind(for_training=False,
+             data_shapes=data.provide_data,
+             label_shapes=data.provide_label)
+    mod.set_params(arg_params, aux_params)
+    if not isinstance(metrics, list):
+        metrics = [metrics, ]
+    tic = time.time()
+    num = 0
+    for batch in data:
+        mod.forward(batch, is_train=False)
+        for m in metrics:
+            mod.update_metric(m, batch.label)
+        # every batch is counted as a full batch_size
+        num += batch_size
+        if max_num_examples is not None and num > max_num_examples:
+            break
+    return num / (time.time() - tic)
+
+
+if __name__ == '__main__':
+    # keyword arguments for score() are supplied by the autocnn helper
+    parameter = helper.get_parameter()
+    # let records down to DEBUG level through the root logger
+    logger = logging.getLogger()
+    logger.setLevel(logging.DEBUG)
+
+    top1_metric = mx.metric.create('acc')
+    top5_metric = mx.metric.create('top_k_accuracy', top_k=5)
+    metrics = [top1_metric, top5_metric]
+
+    speed = score(metrics=metrics, **parameter)
+    logging.info('Finished with %f images per second', speed)
+
+    # report the value accumulated by each metric during scoring
+    for metric in metrics:
+        logging.info(metric.get())
--- a/mxnet/cifar10/train_cifar10.py
+++ b/mxnet/cifar10/train_cifar10.py
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import os
+import logging
+import autocnn_helper as helper
+from common import data, fit
+logging.basicConfig(level=logging.DEBUG)
+
+
+if __name__ == '__main__':
+    # locations of the CIFAR-10 record files (not read anywhere in this block)
+    data_dir = "data"
+    train_fname, val_fname = (os.path.join(data_dir, rec_name)
+                              for rec_name in ("cifar10_train.rec", "cifar10_val.rec"))
+
+    # run parameters come from the autocnn helper; the object is used both
+    # through attribute access and as a ``**`` mapping below
+    parameter = helper.get_parameter()
+
+    # load network: the module symbols.<network> has to provide get_symbol()
+    import importlib
+
+    net = importlib.import_module('symbols.' + parameter.network)
+    sym = net.get_symbol(**parameter)
+
+    # train
+    fit.fit(parameter, sym, data.get_rec_iter)
--- a/pytorch/.autocnn/algorithm
+++ b/pytorch/.autocnn/algorithm
+{"name": "pytorch_cifar10", "user": "daiab", "unique_name": "daiab.pytorch_cifar10", "uuid": "c8568748e1c647f9bea6f1a08d9e12e2", "description": "", "is_public": false, "has_code": true, "created_at": "2018-05-04T03:39:58.840190+00:00", "updated_at": "2018-05-04T03:39:58.840240+00:00", "num_tasks": 0, "has_tensorboard": false, "has_notebook": false, "tasks": null, "framework": "pytorch", "tags": ""}
\ No newline at end of file
--- a/pytorch/.autocnnignore
+++ b/pytorch/.autocnnignore
+
+.git
+.eggs
+eggs
+lib
+lib64
+parts
+sdist
+var
+*.pyc
+*.swp
+.DS_Store
+./.autocnn
--- a/pytorch/LICENSE
+++ b/pytorch/LICENSE
+MIT License
+
+Copyright (c) 2017 liukuang
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/pytorch/README.md
+++ b/pytorch/README.md
+# Train CIFAR10 with PyTorch
+
+I'm playing with [PyTorch](http://pytorch.org/) on the CIFAR10 dataset.
+
+## Pros & cons
+Pros:
+- Built-in data loading and augmentation, very nice!
+- Training is fast, maybe even a little bit faster.
+- Very memory efficient!
+
+Cons:
+- No progress bar, sad :(
+- No built-in log.
+
+## Accuracy
+| Model             | Acc.        |
+| ----------------- | ----------- |
+| [VGG16](https://arxiv.org/abs/1409.1556)              | 92.64%      |
+| [ResNet18](https://arxiv.org/abs/1512.03385)          | 93.02%      |
+| [ResNet50](https://arxiv.org/abs/1512.03385)          | 93.62%      |
+| [ResNet101](https://arxiv.org/abs/1512.03385)         | 93.75%      |
+| [MobileNetV2](https://arxiv.org/abs/1801.04381)       | 94.43%      |
+| [ResNeXt29(32x4d)](https://arxiv.org/abs/1611.05431)  | 94.73%      |
+| [ResNeXt29(2x64d)](https://arxiv.org/abs/1611.05431)  | 94.82%      |
+| [DenseNet121](https://arxiv.org/abs/1608.06993)       | 95.04%      |
+| [PreActResNet18](https://arxiv.org/abs/1603.05027)    | 95.11%      |
+| [DPN92](https://arxiv.org/abs/1707.01629)             | 95.16%      |
+
+## Learning rate adjustment
+I manually change the `lr` during training:
+- `0.1` for epoch `[0,150)`
+- `0.01` for epoch `[150,250)`
+- `0.001` for epoch `[250,350)`
+
+Resume the training with `python main.py --resume --lr=0.01`
--- a/pytorch/autocnnfile.yml
+++ b/pytorch/autocnnfile.yml
+version: 1
+
+algorithm:
+  name: pytorch_cifar10
+
+resource:
+  default_resources:
+    cpu:
+      requests: 4
+      limits: 4
+    gpu:
+      requests: 2
+      limits: 2
+    memory:
+      requests: 10240
+      limits: 10240
+
+image:
+  from_image: pytorch/pytorch:0.4_cuda9_cudnn7
+  runs:
+    - pip install atc-beta-helper
+
+train:
+  mount:
+    data/daiab/pytorch_cifar10/: /data
+  output: /output
+  parameter:
+    lr: 0.01        # learning rate
+    resume: false   # resume from checkpoint
+    epoch: 10
+  cmd: python3 main.py
+
+
--- a/pytorch/main.py
+++ b/pytorch/main.py
+'''Train CIFAR10 with PyTorch.'''
+from __future__ import print_function
+
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torch.nn.functional as F
+import torch.backends.cudnn as cudnn
+
+import torchvision
+import torchvision.transforms as transforms
+
+import os
+import argparse
+
+from models import *
+from torch.autograd import Variable
+import autocnn_helper as helper
+args = helper.get_parameter()
+
+# Run on the GPU when torch reports one, otherwise on the CPU.
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+best_acc = 0  # best test accuracy
+start_epoch = 0  # start from epoch 0 or last checkpoint epoch
+
+# Data
+print('==> Preparing data..')
+# Training images are randomly cropped (after 4 pixels of padding) and
+# randomly flipped before normalisation.
+transform_train = transforms.Compose([
+    transforms.RandomCrop(32, padding=4),
+    transforms.RandomHorizontalFlip(),
+    transforms.ToTensor(),
+    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
+])
+
+# Test images are only converted and normalised with the same constants.
+transform_test = transforms.Compose([
+    transforms.ToTensor(),
+    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
+])
+
+# download=False: the dataset files have to be present under /data already.
+trainset = torchvision.datasets.CIFAR10(root='/data', train=True, download=False, transform=transform_train)
+trainloader = torch.utils.data.DataLoader(trainset, batch_size=128, shuffle=True, num_workers=2)
+
+testset = torchvision.datasets.CIFAR10(root='/data', train=False, download=False, transform=transform_test)
+testloader = torch.utils.data.DataLoader(testset, batch_size=100, shuffle=False, num_workers=2)
+
+classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
+
+# Model
+print('==> Building model..')
+# Exactly one architecture is active; the alternatives are kept for reference.
+# net = VGG('VGG19')
+net = ResNet18()
+# net = PreActResNet18()
+# net = GoogLeNet()
+# net = DenseNet121()
+# net = ResNeXt29_2x64d()
+# net = MobileNet()
+# net = MobileNetV2()
+# net = DPN92()
+# net = ShuffleNetG2()
+# net = SENet18()
+net = net.to(device)
+if device == 'cuda':
+    # wrap the model in DataParallel and enable the cudnn benchmark mode
+    net = torch.nn.DataParallel(net)
+    cudnn.benchmark = True
+
+# repeats the assignment made near the top of the script
+start_epoch = 0
+if args.resume:
+    # Load checkpoint.
+    print('==> Resuming from checkpoint..')
+    assert os.path.isdir('/output'), 'Error: no checkpoint directory found!'
+    # same path that test() writes its checkpoint to
+    checkpoint = torch.load('/output/ckpt.t7')
+    net.load_state_dict(checkpoint['net'])
+    best_acc = checkpoint['acc']
+    # NOTE(review): this restarts at the epoch stored in the checkpoint, so
+    # that epoch number is run again - confirm this is intended
+    start_epoch = checkpoint['epoch']
+
+criterion = nn.CrossEntropyLoss()
+optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4)
+
+
+# Training
+def train(epoch):
+    """Run one optimisation pass over ``trainloader``.
+
+    After every batch the running mean loss and the running accuracy are
+    printed. ``epoch`` is only used for the header line.
+    """
+    print('\nEpoch: %d' % epoch)
+    net.train()
+    running_loss, num_correct, num_seen = 0, 0, 0
+    for step, batch in enumerate(trainloader):
+        inputs, targets = (tensor.to(device) for tensor in batch)
+
+        # one optimisation step on this batch
+        optimizer.zero_grad()
+        logits = net(inputs)
+        batch_loss = criterion(logits, targets)
+        batch_loss.backward()
+        optimizer.step()
+
+        # running statistics; the predicted class is the arg-max over dim 1
+        running_loss += batch_loss.item()
+        predicted = logits.max(1)[1]
+        num_seen += targets.size(0)
+        num_correct += predicted.eq(targets).sum().item()
+
+        progress = '%s/%s' % (step, len(trainloader))
+        summary = 'Loss: %.3f | Acc: %.3f%% (%d/%d)' % (
+            running_loss / (step + 1), 100. * num_correct / num_seen, num_correct, num_seen)
+        print(progress, summary)
+
+
+def test(epoch):
+    """Evaluate ``net`` on ``testloader`` and checkpoint it on a new best.
+
+    Running loss and accuracy are printed after every batch. When the final
+    accuracy exceeds the global ``best_acc``, the model state, the accuracy
+    and ``epoch`` are saved to '/output/ckpt.t7' and ``best_acc`` is updated.
+    """
+    global best_acc
+    net.eval()
+    test_loss = 0
+    correct = 0
+    total = 0
+    with torch.no_grad():
+        for batch_idx, (inputs, targets) in enumerate(testloader):
+            inputs, targets = inputs.to(device), targets.to(device)
+            outputs = net(inputs)
+            loss = criterion(outputs, targets)
+
+            test_loss += loss.item()
+            _, predicted = outputs.max(1)
+            total += targets.size(0)
+            correct += predicted.eq(targets).sum().item()
+
+            # the denominator is the number of *test* batches (it used to be
+            # len(trainloader), which made the progress counter misleading)
+            print('%s/%s' % (batch_idx, len(testloader)),
+                  'Loss: %.3f | Acc: %.3f%% (%d/%d)' % (test_loss / (batch_idx + 1),
+                                                        100. * correct / total, correct, total))
+
+    # Save checkpoint.
+    acc = 100. * correct / total
+    if acc > best_acc:
+        print('Saving..')
+        state = {
+            'net': net.state_dict(),
+            'acc': acc,
+            'epoch': epoch,
+        }
+        torch.save(state, '/output/ckpt.t7')
+        best_acc = acc
+
+
+# Run args.epoch epochs starting at start_epoch; each epoch trains, then evaluates.
+last_epoch = start_epoch + args.epoch
+for epoch in range(start_epoch, last_epoch):
+    train(epoch)
+    test(epoch)
--- a/pytorch/models/__init__.py
+++ b/pytorch/models/__init__.py
+from models.vgg import *
+from models.dpn import *
+from models.lenet import *
+from models.senet import *
+from models.pnasnet import *
+from models.densenet import *
+from models.googlenet import *
+from models.shufflenet import *
+from models.resnet import *
+from models.resnext import *
+from models.preact_resnet import *
+from models.mobilenet import *
+from models.mobilenetv2 import *
--- a/pytorch/models/densenet.py
+++ b/pytorch/models/densenet.py
+'''DenseNet in PyTorch.'''
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from torch.autograd import Variable
+
+
+class Bottleneck(nn.Module):
+    """Dense layer: BN-ReLU-1x1 conv, then BN-ReLU-3x3 conv.
+
+    The ``growth_rate`` new feature maps are concatenated in front of the
+    input along dimension 1.
+    """
+
+    def __init__(self, in_planes, growth_rate):
+        super(Bottleneck, self).__init__()
+        # the 1x1 convolution produces 4 * growth_rate intermediate maps
+        inner_planes = 4 * growth_rate
+        self.bn1 = nn.BatchNorm2d(in_planes)
+        self.conv1 = nn.Conv2d(in_planes, inner_planes, kernel_size=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(inner_planes)
+        self.conv2 = nn.Conv2d(inner_planes, growth_rate, kernel_size=3, padding=1, bias=False)
+
+    def forward(self, x):
+        squeezed = self.conv1(F.relu(self.bn1(x)))
+        new_features = self.conv2(F.relu(self.bn2(squeezed)))
+        return torch.cat([new_features, x], 1)
+
+
+class Transition(nn.Module):
+    '''BN-ReLU-1x1 conv followed by 2x2 average pooling.'''
+
+    def __init__(self, in_planes, out_planes):
+        super().__init__()
+        self.bn = nn.BatchNorm2d(in_planes)
+        self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=1, bias=False)
+
+    def forward(self, x):
+        activated = F.relu(self.bn(x))
+        projected = self.conv(activated)
+        return F.avg_pool2d(projected, 2)
+
+
+class DenseNet(nn.Module):
+    '''DenseNet classifier: a 3x3 stem conv, four dense stages separated by
+    three Transition layers, then BN-ReLU, 4x4 average pooling and a linear layer.
+
+    Args:
+        block: layer class, called as ``block(in_planes, growth_rate)``.
+        nblocks: number of layers in each of the four dense stages.
+        growth_rate: channel increase accounted for per layer.
+        reduction: channel scaling applied by each transition (floored to an int).
+        num_classes: size of the output layer.
+    '''
+
+    def __init__(self, block, nblocks, growth_rate=12, reduction=0.5, num_classes=10):
+        super().__init__()
+        self.growth_rate = growth_rate
+
+        width = 2 * growth_rate
+        self.conv1 = nn.Conv2d(3, width, kernel_size=3, padding=1, bias=False)
+
+        # Stages 1-3: dense layers, then a transition that rescales the channels.
+        for stage in (1, 2, 3):
+            depth = nblocks[stage - 1]
+            setattr(self, 'dense%d' % stage, self._make_dense_layers(block, width, depth))
+            width += depth * growth_rate
+            reduced = int(math.floor(width * reduction))
+            setattr(self, 'trans%d' % stage, Transition(width, reduced))
+            width = reduced
+
+        # Stage 4 is not followed by a transition.
+        self.dense4 = self._make_dense_layers(block, width, nblocks[3])
+        width += nblocks[3] * growth_rate
+
+        self.bn = nn.BatchNorm2d(width)
+        self.linear = nn.Linear(width, num_classes)
+
+    def _make_dense_layers(self, block, in_planes, nblock):
+        '''Stack ``nblock`` layers; layer k is built for ``in_planes + k*growth_rate`` inputs.'''
+        stacked = [
+            block(in_planes + k * self.growth_rate, self.growth_rate)
+            for k in range(nblock)
+        ]
+        return nn.Sequential(*stacked)
+
+    def forward(self, x):
+        feats = self.conv1(x)
+        feats = self.dense1(feats)
+        feats = self.trans1(feats)
+        feats = self.dense2(feats)
+        feats = self.trans2(feats)
+        feats = self.dense3(feats)
+        feats = self.trans3(feats)
+        feats = self.dense4(feats)
+        # The flattened pooled map must match self.linear's input size;
+        # presumably sized for 32x32 inputs (4x4 map here) - TODO confirm.
+        pooled = F.avg_pool2d(F.relu(self.bn(feats)), 4)
+        flat = pooled.view(pooled.size(0), -1)
+        return self.linear(flat)
+
+def DenseNet121():
+    '''Build a DenseNet of Bottleneck layers with stage depths 6/12/24/16 and growth rate 32.'''
+    stage_depths = [6, 12, 24, 16]
+    return DenseNet(Bottleneck, stage_depths, growth_rate=32)
+
+def DenseNet169():
+    '''Build a DenseNet of Bottleneck layers with stage depths 6/12/32/32 and growth rate 32.'''
+    stage_depths = [6, 12, 32, 32]
+    return DenseNet(Bottleneck, stage_depths, growth_rate=32)
+
+def DenseNet201():
+    '''Build a DenseNet of Bottleneck layers with stage depths 6/12/48/32 and growth rate 32.'''
+    stage_depths = [6, 12, 48, 32]
+    return DenseNet(Bottleneck, stage_depths, growth_rate=32)
+
+def DenseNet161():
+    '''Build a DenseNet of Bottleneck layers with stage depths 6/12/36/24 and growth rate 48.'''
+    stage_depths = [6, 12, 36, 24]
+    return DenseNet(Bottleneck, stage_depths, growth_rate=48)
+
+def densenet_cifar():
+    '''Build a DenseNet of Bottleneck layers with stage depths 6/12/24/16 and growth rate 12.'''
+    stage_depths = [6, 12, 24, 16]
+    return DenseNet(Bottleneck, stage_depths, growth_rate=12)
+
+def test_densenet():
+    '''Smoke test: run one random 1x3x32x32 input through densenet_cifar() and print the result.'''
+    model = densenet_cifar()
+    sample = Variable(torch.randn(1,3,32,32))
+    print(model(sample))
+
+# test_densenet()
--- a/pytorch/models/dpn.py
+++ b/pytorch/models/dpn.py
+'''Dual Path Networks in PyTorch.'''
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from torch.autograd import Variable
+
+
+class Bottleneck(nn.Module):
+    '''Dual-path bottleneck: 1x1 conv, grouped 3x3 conv (32 groups), 1x1 conv.
+
+    The first ``out_planes`` channels of the branch output are added to the
+    shortcut; the remaining channels of both are concatenated after that sum.
+    '''
+
+    def __init__(self, last_planes, in_planes, out_planes, dense_depth, stride, first_layer):
+        super().__init__()
+        self.out_planes = out_planes
+        self.dense_depth = dense_depth
+        branch_width = out_planes + dense_depth
+
+        self.conv1 = nn.Conv2d(last_planes, in_planes, kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(in_planes)
+        self.conv2 = nn.Conv2d(in_planes, in_planes, kernel_size=3, stride=stride, padding=1, groups=32, bias=False)
+        self.bn2 = nn.BatchNorm2d(in_planes)
+        self.conv3 = nn.Conv2d(in_planes, branch_width, kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(branch_width)
+
+        if first_layer:
+            # Project the input so it matches the branch width and stride.
+            self.shortcut = nn.Sequential(
+                nn.Conv2d(last_planes, branch_width, kernel_size=1, stride=stride, bias=False),
+                nn.BatchNorm2d(branch_width)
+            )
+        else:
+            # Empty Sequential acts as the identity.
+            self.shortcut = nn.Sequential()
+
+    def forward(self, x):
+        branch = F.relu(self.bn1(self.conv1(x)))
+        branch = F.relu(self.bn2(self.conv2(branch)))
+        branch = self.bn3(self.conv3(branch))
+        skip = self.shortcut(x)
+        d = self.out_planes
+        summed = skip[:,:d,:,:] + branch[:,:d,:,:]
+        merged = torch.cat([summed, skip[:,d:,:,:], branch[:,d:,:,:]], 1)
+        return F.relu(merged)
+
+
+class DPN(nn.Module):
+    '''Dual Path Network: a 3x3 stem conv, four stages of Bottleneck blocks,
+    4x4 average pooling and a linear classifier.
+
+    Args:
+        cfg: dict with keys ``'in_planes'``, ``'out_planes'``, ``'num_blocks'``
+            and ``'dense_depth'``; each value is indexed 0..3, one entry per stage.
+        num_classes: size of the output layer. Defaults to 10, the value that
+            was previously hard-coded, so existing ``DPN(cfg)`` calls are unchanged.
+    '''
+
+    def __init__(self, cfg, num_classes=10):
+        super(DPN, self).__init__()
+        in_planes, out_planes = cfg['in_planes'], cfg['out_planes']
+        num_blocks, dense_depth = cfg['num_blocks'], cfg['dense_depth']
+
+        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(64)
+        # Running channel count fed to the next block; updated by _make_layer.
+        self.last_planes = 64
+        self.layer1 = self._make_layer(in_planes[0], out_planes[0], num_blocks[0], dense_depth[0], stride=1)
+        self.layer2 = self._make_layer(in_planes[1], out_planes[1], num_blocks[1], dense_depth[1], stride=2)
+        self.layer3 = self._make_layer(in_planes[2], out_planes[2], num_blocks[2], dense_depth[2], stride=2)
+        self.layer4 = self._make_layer(in_planes[3], out_planes[3], num_blocks[3], dense_depth[3], stride=2)
+        # Same value as self.last_planes after the final stage.
+        self.linear = nn.Linear(out_planes[3]+(num_blocks[3]+1)*dense_depth[3], num_classes)
+
+    def _make_layer(self, in_planes, out_planes, num_blocks, dense_depth, stride):
+        '''Build one stage; only its first block uses ``stride`` and a projection shortcut.'''
+        strides = [stride] + [1]*(num_blocks-1)
+        layers = []
+        for i, block_stride in enumerate(strides):
+            layers.append(Bottleneck(self.last_planes, in_planes, out_planes, dense_depth, block_stride, i==0))
+            # Block i outputs out_planes + (i+2)*dense_depth channels.
+            self.last_planes = out_planes + (i+2) * dense_depth
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        out = F.relu(self.bn1(self.conv1(x)))
+        out = self.layer1(out)
+        out = self.layer2(out)
+        out = self.layer3(out)
+        out = self.layer4(out)
+        # The flattened pooled map must match self.linear's input size;
+        # presumably sized for 32x32 inputs - TODO confirm.
+        out = F.avg_pool2d(out, 4)
+        out = out.view(out.size(0), -1)
+        out = self.linear(out)
+        return out
+
+
+def DPN26():
+    '''Build a DPN with two blocks in each of the four stages.'''
+    return DPN(dict(
+        in_planes=(96,192,384,768),
+        out_planes=(256,512,1024,2048),
+        num_blocks=(2,2,2,2),
+        dense_depth=(16,32,24,128),
+    ))
+
+def DPN92():
+    '''Build a DPN with 3, 4, 20 and 3 blocks in its four stages.'''
+    return DPN(dict(
+        in_planes=(96,192,384,768),
+        out_planes=(256,512,1024,2048),
+        num_blocks=(3,4,20,3),
+        dense_depth=(16,32,24,128),
+    ))
+
+
+def test():
+    '''Smoke test: run one random 1x3x32x32 input through DPN92() and print the result.'''
+    model = DPN92()
+    sample = Variable(torch.randn(1,3,32,32))
+    print(model(sample))
+
+# test()
--- a/pytorch/models/googlenet.py
+++ b/pytorch/models/googlenet.py
+'''GoogLeNet with PyTorch.'''
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from torch.autograd import Variable
+
+
+class Inception(nn.Module):
+    '''Inception module: four parallel branches whose outputs are concatenated
+    along dim 1, giving ``n1x1 + n3x3 + n5x5 + pool_planes`` output channels.
+    '''
+
+    def __init__(self, in_planes, n1x1, n3x3red, n3x3, n5x5red, n5x5, pool_planes):
+        super().__init__()
+
+        def unit(c_in, c_out, **conv_kwargs):
+            # Conv2d -> BatchNorm2d -> in-place ReLU, returned as a flat list
+            # so several units can share one nn.Sequential.
+            return [
+                nn.Conv2d(c_in, c_out, **conv_kwargs),
+                nn.BatchNorm2d(c_out),
+                nn.ReLU(True),
+            ]
+
+        # Branch 1: 1x1 conv.
+        self.b1 = nn.Sequential(*unit(in_planes, n1x1, kernel_size=1))
+
+        # Branch 2: 1x1 reduction, then 3x3 conv.
+        branch2 = unit(in_planes, n3x3red, kernel_size=1)
+        branch2 += unit(n3x3red, n3x3, kernel_size=3, padding=1)
+        self.b2 = nn.Sequential(*branch2)
+
+        # Branch 3 ("5x5"): 1x1 reduction, then two stacked 3x3 convs
+        # in place of a single 5x5 conv.
+        branch3 = unit(in_planes, n5x5red, kernel_size=1)
+        branch3 += unit(n5x5red, n5x5, kernel_size=3, padding=1)
+        branch3 += unit(n5x5, n5x5, kernel_size=3, padding=1)
+        self.b3 = nn.Sequential(*branch3)
+
+        # Branch 4: 3x3 max pool (stride 1), then 1x1 conv.
+        branch4 = [nn.MaxPool2d(3, stride=1, padding=1)]
+        branch4 += unit(in_planes, pool_planes, kernel_size=1)
+        self.b4 = nn.Sequential(*branch4)
+
+    def forward(self, x):
+        branches = (self.b1, self.b2, self.b3, self.b4)
+        return torch.cat([branch(x) for branch in branches], 1)
+
+
+class GoogLeNet(nn.Module):
+    '''GoogLeNet-style classifier: a 3x3 stem conv, nine Inception modules with
+    two stride-2 max pools in between, 8x8 average pooling and a linear layer.
+
+    Args:
+        num_classes: size of the output layer. Defaults to 10, the value that
+            was previously hard-coded, so existing ``GoogLeNet()`` calls are unchanged.
+    '''
+
+    def __init__(self, num_classes=10):
+        super(GoogLeNet, self).__init__()
+        self.pre_layers = nn.Sequential(
+            nn.Conv2d(3, 192, kernel_size=3, padding=1),
+            nn.BatchNorm2d(192),
+            nn.ReLU(True),
+        )
+
+        # Each module's first argument equals the channel total of the one before it.
+        self.a3 = Inception(192,  64,  96, 128, 16, 32, 32)
+        self.b3 = Inception(256, 128, 128, 192, 32, 96, 64)
+
+        # The same pooling module is applied twice in forward().
+        self.maxpool = nn.MaxPool2d(3, stride=2, padding=1)
+
+        self.a4 = Inception(480, 192,  96, 208, 16,  48,  64)
+        self.b4 = Inception(512, 160, 112, 224, 24,  64,  64)
+        self.c4 = Inception(512, 128, 128, 256, 24,  64,  64)
+        self.d4 = Inception(512, 112, 144, 288, 32,  64,  64)
+        self.e4 = Inception(528, 256, 160, 320, 32, 128, 128)
+
+        self.a5 = Inception(832, 256, 160, 320, 32, 128, 128)
+        self.b5 = Inception(832, 384, 192, 384, 48, 128, 128)
+
+        # The flattened pooled map must be 1024 wide for self.linear;
+        # presumably sized for 32x32 inputs (8x8 map here) - TODO confirm.
+        self.avgpool = nn.AvgPool2d(8, stride=1)
+        self.linear = nn.Linear(1024, num_classes)
+
+    def forward(self, x):
+        out = self.pre_layers(x)
+        out = self.a3(out)
+        out = self.b3(out)
+        out = self.maxpool(out)
+        out = self.a4(out)
+        out = self.b4(out)
+        out = self.c4(out)
+        out = self.d4(out)
+        out = self.e4(out)
+        out = self.maxpool(out)
+        out = self.a5(out)
+        out = self.b5(out)
+        out = self.avgpool(out)
+        out = out.view(out.size(0), -1)
+        out = self.linear(out)
+        return out
+
+# net = GoogLeNet()
+# x = torch.randn(1,3,32,32)
+# y = net(Variable(x))
+# print(y.size())
--- a/pytorch/models/lenet.py
+++ b/pytorch/models/lenet.py
+'''LeNet in PyTorch.'''
+import torch.nn as nn
+import torch.nn.functional as F
+
+class LeNet(nn.Module):
+    '''LeNet-style classifier: two 5x5 conv + 2x2 max-pool stages, then three
+    fully connected layers.
+
+    ``fc1`` expects 16*5*5 flattened features, which is what a 3x32x32 input
+    yields (32 -> 28 -> 14 -> 10 -> 5 per side).
+
+    Args:
+        num_classes: size of the output layer. Defaults to 10, the value that
+            was previously hard-coded, so existing ``LeNet()`` calls are unchanged.
+    '''
+
+    def __init__(self, num_classes=10):
+        super(LeNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1   = nn.Linear(16*5*5, 120)
+        self.fc2   = nn.Linear(120, 84)
+        self.fc3   = nn.Linear(84, num_classes)
+
+    def forward(self, x):
+        out = F.relu(self.conv1(x))
+        out = F.max_pool2d(out, 2)
+        out = F.relu(self.conv2(out))
+        out = F.max_pool2d(out, 2)
+        # Flatten everything except the batch dimension.
+        out = out.view(out.size(0), -1)
+        out = F.relu(self.fc1(out))
+        out = F.relu(self.fc2(out))
+        out = self.fc3(out)
+        return out
--- a/pytorch/models/mobilenet.py
+++ b/pytorch/models/mobilenet.py
+'''MobileNet in PyTorch.
+
+See the paper "MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications"
+for more details.
+'''
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from torch.autograd import Variable
+
+
+class Block(nn.Module):
+    '''Depthwise 3x3 conv followed by pointwise 1x1 conv, each with BN and ReLU.'''
+
+    def __init__(self, in_planes, out_planes, stride=1):
+        super().__init__()
+        # groups=in_planes: one 3x3 filter per input channel.
+        self.conv1 = nn.Conv2d(
+            in_planes, in_planes,
+            kernel_size=3, stride=stride, padding=1,
+            groups=in_planes, bias=False,
+        )
+        self.bn1 = nn.BatchNorm2d(in_planes)
+        self.conv2 = nn.Conv2d(
+            in_planes, out_planes,
+            kernel_size=1, stride=1, padding=0, bias=False,
+        )
+        self.bn2 = nn.BatchNorm2d(out_planes)
+
+    def forward(self, x):
+        depthwise = F.relu(self.bn1(self.conv1(x)))
+        pointwise = F.relu(self.bn2(self.conv2(depthwise)))
+        return pointwise
+
+
+class MobileNet(nn.Module):
+    '''MobileNet classifier: a 3x3 stem conv, a stack of depthwise-separable
+    Blocks described by ``cfg``, 2x2 average pooling and a linear layer.
+    '''
+
+    # An int entry is the block's output planes with stride 1;
+    # a tuple entry such as (128,2) is (output planes, stride).
+    cfg = [64, (128,2), 128, (256,2), 256, (512,2), 512, 512, 512, 512, 512, (1024,2), 1024]
+
+    def __init__(self, num_classes=10):
+        super().__init__()
+        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(32)
+        self.layers = self._make_layers(in_planes=32)
+        self.linear = nn.Linear(1024, num_classes)
+
+    def _make_layers(self, in_planes):
+        '''Build one Block per ``cfg`` entry, chaining the channel counts.'''
+        blocks = []
+        for entry in self.cfg:
+            if isinstance(entry, int):
+                out_planes, stride = entry, 1
+            else:
+                out_planes, stride = entry[0], entry[1]
+            blocks.append(Block(in_planes, out_planes, stride))
+            in_planes = out_planes
+        return nn.Sequential(*blocks)
+
+    def forward(self, x):
+        feats = F.relu(self.bn1(self.conv1(x)))
+        feats = self.layers(feats)
+        # cfg has four stride-2 entries; the flattened pooled map must be
+        # 1024 wide for self.linear (presumably 32x32 inputs - TODO confirm).
+        pooled = F.avg_pool2d(feats, 2)
+        flat = pooled.view(pooled.size(0), -1)
+        return self.linear(flat)
+
+
+def test():
+    '''Smoke test: run one random 1x3x32x32 input through MobileNet() and print the output size.'''
+    model = MobileNet()
+    sample = Variable(torch.randn(1,3,32,32))
+    print(model(sample).size())
+
+# test()
--- a/pytorch/models/mobilenetv2.py
+++ b/pytorch/models/mobilenetv2.py
+'''MobileNetV2 in PyTorch.
+
+See the paper "Inverted Residuals and Linear Bottlenecks:
+Mobile Networks for Classification, Detection and Segmentation" for more details.
+'''
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from torch.autograd import Variable
+
+
+class Block(nn.Module):
+    '''Inverted residual block: 1x1 expand, 3x3 depthwise, 1x1 project.
+
+    The shortcut is added only when ``stride`` is 1; it is a 1x1 conv + BN
+    when the channel counts differ and the identity otherwise.
+    '''
+
+    def __init__(self, in_planes, out_planes, expansion, stride):
+        super().__init__()
+        self.stride = stride
+
+        hidden = expansion * in_planes
+        self.conv1 = nn.Conv2d(in_planes, hidden, kernel_size=1, stride=1, padding=0, bias=False)
+        self.bn1 = nn.BatchNorm2d(hidden)
+        self.conv2 = nn.Conv2d(hidden, hidden, kernel_size=3, stride=stride, padding=1, groups=hidden, bias=False)
+        self.bn2 = nn.BatchNorm2d(hidden)
+        self.conv3 = nn.Conv2d(hidden, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
+        self.bn3 = nn.BatchNorm2d(out_planes)
+
+        needs_projection = stride == 1 and in_planes != out_planes
+        if needs_projection:
+            self.shortcut = nn.Sequential(
+                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False),
+                nn.BatchNorm2d(out_planes),
+            )
+        else:
+            # Empty Sequential acts as the identity.
+            self.shortcut = nn.Sequential()
+
+    def forward(self, x):
+        out = F.relu(self.bn1(self.conv1(x)))
+        out = F.relu(self.bn2(self.conv2(out)))
+        # No ReLU after the final projection.
+        out = self.bn3(self.conv3(out))
+        if self.stride == 1:
+            return out + self.shortcut(x)
+        return out
+
+
+class MobileNetV2(nn.Module):
+    '''MobileNetV2 classifier: a 3x3 stem conv, inverted-residual Blocks
+    described by ``cfg``, a 1x1 conv to 1280 channels, 4x4 average pooling
+    and a linear layer.
+    '''
+
+    # Each row: (expansion, out_planes, num_blocks, stride)
+    cfg = [(1,  16, 1, 1),
+           (6,  24, 2, 1),  # NOTE: change stride 2 -> 1 for CIFAR10
+           (6,  32, 3, 2),
+           (6,  64, 4, 2),
+           (6,  96, 3, 1),
+           (6, 160, 3, 2),
+           (6, 320, 1, 1)]
+
+    def __init__(self, num_classes=10):
+        super().__init__()
+        # NOTE: change conv1 stride 2 -> 1 for CIFAR10
+        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(32)
+        self.layers = self._make_layers(in_planes=32)
+        self.conv2 = nn.Conv2d(320, 1280, kernel_size=1, stride=1, padding=0, bias=False)
+        self.bn2 = nn.BatchNorm2d(1280)
+        self.linear = nn.Linear(1280, num_classes)
+
+    def _make_layers(self, in_planes):
+        '''Expand every ``cfg`` row into its Blocks; only a row's first Block uses the row's stride.'''
+        blocks = []
+        for expansion, out_planes, num_blocks, first_stride in self.cfg:
+            for block_stride in [first_stride] + [1]*(num_blocks-1):
+                blocks.append(Block(in_planes, out_planes, expansion, block_stride))
+                in_planes = out_planes
+        return nn.Sequential(*blocks)
+
+    def forward(self, x):
+        feats = F.relu(self.bn1(self.conv1(x)))
+        feats = self.layers(feats)
+        feats = F.relu(self.bn2(self.conv2(feats)))
+        # NOTE: change pooling kernel_size 7 -> 4 for CIFAR10
+        pooled = F.avg_pool2d(feats, 4)
+        flat = pooled.view(pooled.size(0), -1)
+        return self.linear(flat)
+
+
+def test():
+    '''Smoke test: run a random 2x3x32x32 batch through MobileNetV2() and print the output size.'''
+    model = MobileNetV2()
+    batch = Variable(torch.randn(2,3,32,32))
+    print(model(batch).size())
+
+# test()
--- a/pytorch/models/pnasnet.py
+++ b/pytorch/models/pnasnet.py
+'''PNASNet in PyTorch.
+
+Paper: Progressive Neural Architecture Search
+'''
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from torch.autograd import Variable
+
+
+class SepConv(nn.Module):
+    '''Grouped convolution (``groups=in_planes``) followed by batch norm.
+
+    Padding is ``(kernel_size-1)//2``, so an odd kernel with stride 1 keeps
+    the spatial size.
+    '''
+
+    def __init__(self, in_planes, out_planes, kernel_size, stride):
+        super().__init__()
+        same_padding = (kernel_size - 1) // 2
+        self.conv1 = nn.Conv2d(
+            in_planes,
+            out_planes,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=same_padding,
+            groups=in_planes,
+            bias=False,
+        )
+        self.bn1 = nn.BatchNorm2d(out_planes)
+
+    def forward(self, x):
+        convolved = self.conv1(x)
+        return self.bn1(convolved)
+
+
+class CellA(nn.Module):
+    '''PNASNet cell A: ReLU of (7x7 SepConv branch + 3x3 max-pool branch).
+
+    When ``stride`` is 2 the pooled branch also passes through a 1x1 conv + BN
+    that maps it to ``out_planes`` channels.
+    '''
+
+    def __init__(self, in_planes, out_planes, stride=1):
+        super().__init__()
+        self.stride = stride
+        self.sep_conv1 = SepConv(in_planes, out_planes, kernel_size=7, stride=stride)
+        # conv1/bn1 exist only on downsampling cells.
+        if stride == 2:
+            self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
+            self.bn1 = nn.BatchNorm2d(out_planes)
+
+    def forward(self, x):
+        conv_branch = self.sep_conv1(x)
+        pool_branch = F.max_pool2d(x, kernel_size=3, stride=self.stride, padding=1)
+        if self.stride == 2:
+            pool_branch = self.bn1(self.conv1(pool_branch))
+        return F.relu(conv_branch + pool_branch)
+
+class CellB(nn.Module):
+    '''PNASNet cell B: two summed branch pairs, concatenated and reduced by a 1x1 conv.
+
+    Left pair: 7x7 SepConv + 3x3 SepConv. Right pair: 3x3 max pool + 5x5 SepConv.
+    When ``stride`` is 2 the pooled tensor also passes through a 1x1 conv + BN.
+    '''
+
+    def __init__(self, in_planes, out_planes, stride=1):
+        super().__init__()
+        self.stride = stride
+        # Left pair
+        self.sep_conv1 = SepConv(in_planes, out_planes, kernel_size=7, stride=stride)
+        self.sep_conv2 = SepConv(in_planes, out_planes, kernel_size=3, stride=stride)
+        # Right pair
+        self.sep_conv3 = SepConv(in_planes, out_planes, kernel_size=5, stride=stride)
+        # conv1/bn1 exist only on downsampling cells.
+        if stride == 2:
+            self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
+            self.bn1 = nn.BatchNorm2d(out_planes)
+        # 1x1 conv that halves the concatenated channels back to out_planes
+        self.conv2 = nn.Conv2d(2*out_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
+        self.bn2 = nn.BatchNorm2d(out_planes)
+
+    def forward(self, x):
+        # Left pair
+        left_a = self.sep_conv1(x)
+        left_b = self.sep_conv2(x)
+        # Right pair
+        pooled = F.max_pool2d(x, kernel_size=3, stride=self.stride, padding=1)
+        if self.stride == 2:
+            pooled = self.bn1(self.conv1(pooled))
+        right_conv = self.sep_conv3(x)
+        # Sum each pair, concatenate, then reduce the channels
+        left = F.relu(left_a + left_b)
+        right = F.relu(pooled + right_conv)
+        stacked = torch.cat([left, right], 1)
+        return F.relu(self.bn2(self.conv2(stacked)))
+
+class PNASNet(nn.Module):
+    '''PNASNet classifier: a 3x3 stem conv, three stages of stride-1 cells
+    separated by two stride-2 cells that double the channels, 8x8 average
+    pooling and a linear layer.
+
+    Args:
+        cell_type: cell class, called as ``cell_type(in_planes, planes, stride=...)``.
+        num_cells: number of stride-1 cells in each of the three stages.
+            This argument used to be ignored in favour of a hard-coded 6;
+            both factories in this file pass 6, so their models are unchanged.
+        num_planes: channel count of the stem and the first stage.
+        num_classes: size of the output layer. Defaults to 10, the value that
+            was previously hard-coded.
+    '''
+
+    def __init__(self, cell_type, num_cells, num_planes, num_classes=10):
+        super(PNASNet, self).__init__()
+        # Running channel count fed to the next cell; updated by the builders below.
+        self.in_planes = num_planes
+        self.cell_type = cell_type
+
+        self.conv1 = nn.Conv2d(3, num_planes, kernel_size=3, stride=1, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(num_planes)
+
+        self.layer1 = self._make_layer(num_planes, num_cells=num_cells)
+        self.layer2 = self._downsample(num_planes*2)
+        self.layer3 = self._make_layer(num_planes*2, num_cells=num_cells)
+        self.layer4 = self._downsample(num_planes*4)
+        self.layer5 = self._make_layer(num_planes*4, num_cells=num_cells)
+
+        self.linear = nn.Linear(num_planes*4, num_classes)
+
+    def _make_layer(self, planes, num_cells):
+        '''Stack ``num_cells`` stride-1 cells that output ``planes`` channels.'''
+        layers = []
+        for _ in range(num_cells):
+            layers.append(self.cell_type(self.in_planes, planes, stride=1))
+            self.in_planes = planes
+        return nn.Sequential(*layers)
+
+    def _downsample(self, planes):
+        '''Build a single stride-2 cell that outputs ``planes`` channels.'''
+        layer = self.cell_type(self.in_planes, planes, stride=2)
+        self.in_planes = planes
+        return layer
+
+    def forward(self, x):
+        out = F.relu(self.bn1(self.conv1(x)))
+        out = self.layer1(out)
+        out = self.layer2(out)
+        out = self.layer3(out)
+        out = self.layer4(out)
+        out = self.layer5(out)
+        # The flattened pooled map must be num_planes*4 wide for self.linear;
+        # presumably sized for 32x32 inputs (8x8 map here) - TODO confirm.
+        out = F.avg_pool2d(out, 8)
+        out = self.linear(out.view(out.size(0), -1))
+        return out
+
+
+def PNASNetA():
+    '''Build a PNASNet from CellA cells with num_cells=6 and num_planes=44.'''
+    settings = dict(num_cells=6, num_planes=44)
+    return PNASNet(CellA, **settings)
+
+def PNASNetB():
+    '''Build a PNASNet from CellB cells with num_cells=6 and num_planes=32.'''
+    settings = dict(num_cells=6, num_planes=32)
+    return PNASNet(CellB, **settings)
+
+
+def test():
+    '''Smoke test: print PNASNetB(), then its output for one random 1x3x32x32 input.'''
+    model = PNASNetB()
+    print(model)
+    sample = Variable(torch.randn(1,3,32,32))
+    print(model(sample))
+
+# test()
--- a/pytorch/models/preact_resnet.py
+++ b/pytorch/models/preact_resnet.py
+'''Pre-activation ResNet in PyTorch.
+
+Reference:
+[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
+    Identity Mappings in Deep Residual Networks. arXiv:1603.05027
+'''
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from torch.autograd import Variable
+
+
+class PreActBlock(nn.Module):
+    '''Pre-activation basic block: BN-ReLU-conv3x3 twice, plus a shortcut.
+
+    A 1x1 conv ``shortcut`` attribute is created only when the stride or the
+    channel count changes; otherwise the raw input is added back.
+    '''
+    expansion = 1
+
+    def __init__(self, in_planes, planes, stride=1):
+        super().__init__()
+        out_planes = self.expansion * planes
+        self.bn1 = nn.BatchNorm2d(in_planes)
+        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
+
+        shape_changes = stride != 1 or in_planes != out_planes
+        if shape_changes:
+            self.shortcut = nn.Sequential(
+                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
+            )
+
+    def forward(self, x):
+        preact = F.relu(self.bn1(x))
+        # The projection takes the pre-activated tensor; the identity path takes x.
+        if hasattr(self, 'shortcut'):
+            residual = self.shortcut(preact)
+        else:
+            residual = x
+        out = self.conv1(preact)
+        out = self.conv2(F.relu(self.bn2(out)))
+        out += residual
+        return out
+
+
+class PreActBottleneck(nn.Module):
+    '''Pre-activation bottleneck: BN-ReLU before each of a 1x1, 3x3 and 1x1 conv.
+
+    The output has ``expansion * planes`` channels. A 1x1 conv ``shortcut``
+    attribute is created only when the stride or the channel count changes.
+    '''
+    expansion = 4
+
+    def __init__(self, in_planes, planes, stride=1):
+        super().__init__()
+        out_planes = self.expansion * planes
+        self.bn1 = nn.BatchNorm2d(in_planes)
+        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(planes)
+        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, bias=False)
+
+        shape_changes = stride != 1 or in_planes != out_planes
+        if shape_changes:
+            self.shortcut = nn.Sequential(
+                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
+            )
+
+    def forward(self, x):
+        preact = F.relu(self.bn1(x))
+        # The projection takes the pre-activated tensor; the identity path takes x.
+        if hasattr(self, 'shortcut'):
+            residual = self.shortcut(preact)
+        else:
+            residual = x
+        out = self.conv1(preact)
+        out = self.conv2(F.relu(self.bn2(out)))
+        out = self.conv3(F.relu(self.bn3(out)))
+        out += residual
+        return out
+
+
+class PreActResNet(nn.Module):
+    '''Pre-activation ResNet: a 3x3 stem conv, four stages of ``block`` (64, 128,
+    256, 512 planes), 4x4 average pooling and a linear layer.
+
+    Args:
+        block: block class with an ``expansion`` attribute, called as
+            ``block(in_planes, planes, stride)``.
+        num_blocks: number of blocks in each of the four stages.
+        num_classes: size of the output layer.
+    '''
+
+    def __init__(self, block, num_blocks, num_classes=10):
+        super().__init__()
+        # Running channel count fed to the next block; updated by _make_layer.
+        self.in_planes = 64
+
+        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
+        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
+        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
+        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
+        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
+        self.linear = nn.Linear(512*block.expansion, num_classes)
+
+    def _make_layer(self, block, planes, num_blocks, stride):
+        '''Build one stage; only its first block uses ``stride``, the rest use 1.'''
+        stage = []
+        for block_stride in [stride] + [1]*(num_blocks-1):
+            stage.append(block(self.in_planes, planes, block_stride))
+            self.in_planes = planes * block.expansion
+        return nn.Sequential(*stage)
+
+    def forward(self, x):
+        feats = self.conv1(x)
+        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
+            feats = stage(feats)
+        pooled = F.avg_pool2d(feats, 4)
+        flat = pooled.view(pooled.size(0), -1)
+        return self.linear(flat)
+
+
+def PreActResNet18():
+    '''Build a PreActResNet of PreActBlock with stage depths 2/2/2/2.'''
+    stage_depths = [2, 2, 2, 2]
+    return PreActResNet(PreActBlock, stage_depths)
+
+def PreActResNet34():
+    '''Build a PreActResNet of PreActBlock with stage depths 3/4/6/3.'''
+    stage_depths = [3, 4, 6, 3]
+    return PreActResNet(PreActBlock, stage_depths)
+
+def PreActResNet50():
+    '''Build a PreActResNet of PreActBottleneck with stage depths 3/4/6/3.'''
+    stage_depths = [3, 4, 6, 3]
+    return PreActResNet(PreActBottleneck, stage_depths)
+
+def PreActResNet101():
+    '''Build a PreActResNet of PreActBottleneck with stage depths 3/4/23/3.'''
+    stage_depths = [3, 4, 23, 3]
+    return PreActResNet(PreActBottleneck, stage_depths)
+
+def PreActResNet152():
+    '''Build a PreActResNet of PreActBottleneck with stage depths 3/8/36/3.'''
+    stage_depths = [3, 8, 36, 3]
+    return PreActResNet(PreActBottleneck, stage_depths)
+
+
+def test():
+    '''Smoke test: run one random 1x3x32x32 input through PreActResNet18() and print the output size.'''
+    model = PreActResNet18()
+    sample = Variable(torch.randn(1,3,32,32))
+    print(model(sample).size())
+
+# test()
--- a/pytorch/models/resnet.py
+++ b/pytorch/models/resnet.py
+'''ResNet in PyTorch.
+
+For Pre-activation ResNet, see 'preact_resnet.py'.
+
+Reference:
+[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
+    Deep Residual Learning for Image Recognition. arXiv:1512.03385
+'''
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from torch.autograd import Variable
+
+
+class BasicBlock(nn.Module):
+    '''Residual basic block: two 3x3 conv + BN layers with a shortcut added
+    before the final ReLU.
+
+    The shortcut is a 1x1 conv + BN when the stride or channel count changes,
+    and the identity otherwise.
+    '''
+    expansion = 1
+
+    def __init__(self, in_planes, planes, stride=1):
+        super().__init__()
+        out_planes = self.expansion * planes
+        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes)
+        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+
+        if stride != 1 or in_planes != out_planes:
+            self.shortcut = nn.Sequential(
+                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False),
+                nn.BatchNorm2d(out_planes)
+            )
+        else:
+            # Empty Sequential acts as the identity.
+            self.shortcut = nn.Sequential()
+
+    def forward(self, x):
+        residual = F.relu(self.bn1(self.conv1(x)))
+        residual = self.bn2(self.conv2(residual))
+        residual += self.shortcut(x)
+        return F.relu(residual)
+
+
+class Bottleneck(nn.Module):
+    '''Residual bottleneck: 1x1, 3x3 and 1x1 conv + BN layers with a shortcut
+    added before the final ReLU. The output has ``expansion * planes`` channels.
+
+    The shortcut is a 1x1 conv + BN when the stride or channel count changes,
+    and the identity otherwise.
+    '''
+    expansion = 4
+
+    def __init__(self, in_planes, planes, stride=1):
+        super().__init__()
+        out_planes = self.expansion * planes
+        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes)
+        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(out_planes)
+
+        if stride != 1 or in_planes != out_planes:
+            self.shortcut = nn.Sequential(
+                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False),
+                nn.BatchNorm2d(out_planes)
+            )
+        else:
+            # Empty Sequential acts as the identity.
+            self.shortcut = nn.Sequential()
+
+    def forward(self, x):
+        residual = F.relu(self.bn1(self.conv1(x)))
+        residual = F.relu(self.bn2(self.conv2(residual)))
+        residual = self.bn3(self.conv3(residual))
+        residual += self.shortcut(x)
+        return F.relu(residual)
+
+
+class ResNet(nn.Module):
+    '''ResNet: a 3x3 stem conv + BN + ReLU, four stages of ``block`` (64, 128,
+    256, 512 planes), 4x4 average pooling and a linear layer.
+
+    Args:
+        block: block class with an ``expansion`` attribute, called as
+            ``block(in_planes, planes, stride)``.
+        num_blocks: number of blocks in each of the four stages.
+        num_classes: size of the output layer.
+    '''
+
+    def __init__(self, block, num_blocks, num_classes=10):
+        super().__init__()
+        # Running channel count fed to the next block; updated by _make_layer.
+        self.in_planes = 64
+
+        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(64)
+        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
+        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
+        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
+        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
+        self.linear = nn.Linear(512*block.expansion, num_classes)
+
+    def _make_layer(self, block, planes, num_blocks, stride):
+        '''Build one stage; only its first block uses ``stride``, the rest use 1.'''
+        stage = []
+        for block_stride in [stride] + [1]*(num_blocks-1):
+            stage.append(block(self.in_planes, planes, block_stride))
+            self.in_planes = planes * block.expansion
+        return nn.Sequential(*stage)
+
+    def forward(self, x):
+        feats = F.relu(self.bn1(self.conv1(x)))
+        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
+            feats = stage(feats)
+        pooled = F.avg_pool2d(feats, 4)
+        flat = pooled.view(pooled.size(0), -1)
+        return self.linear(flat)
+
+
+def ResNet18():
+    '''Build a ResNet of BasicBlock with stage depths 2/2/2/2.'''
+    stage_depths = [2, 2, 2, 2]
+    return ResNet(BasicBlock, stage_depths)
+
+def ResNet34():
+    '''Build a ResNet of BasicBlock with stage depths 3/4/6/3.'''
+    stage_depths = [3, 4, 6, 3]
+    return ResNet(BasicBlock, stage_depths)
+
+def ResNet50():
+    '''Build a ResNet of Bottleneck with stage depths 3/4/6/3.'''
+    stage_depths = [3, 4, 6, 3]
+    return ResNet(Bottleneck, stage_depths)
+
+def ResNet101():
+    '''Build a ResNet of Bottleneck with stage depths 3/4/23/3.'''
+    stage_depths = [3, 4, 23, 3]
+    return ResNet(Bottleneck, stage_depths)
+
+def ResNet152():
+    '''Build a ResNet of Bottleneck with stage depths 3/8/36/3.'''
+    stage_depths = [3, 8, 36, 3]
+    return ResNet(Bottleneck, stage_depths)
+
+
+def test():
+    '''Smoke test: run one random 1x3x32x32 input through ResNet18() and print the output size.'''
+    model = ResNet18()
+    sample = Variable(torch.randn(1,3,32,32))
+    print(model(sample).size())
+
+# test()
--- a/pytorch/models/resnext.py
+++ b/pytorch/models/resnext.py
+'''ResNeXt in PyTorch.
+
+See the paper "Aggregated Residual Transformations for Deep Neural Networks" for more details.
+'''
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from torch.autograd import Variable
+
+
+class Block(nn.Module):
+    '''ResNeXt block: 1x1 conv, grouped 3x3 conv (``cardinality`` groups) and
+    1x1 conv, each with BN, plus a shortcut added before the final ReLU.
+
+    The grouped conv is ``cardinality * bottleneck_width`` channels wide and the
+    block outputs ``expansion`` times that. The shortcut is a 1x1 conv + BN when
+    the stride or channel count changes, and the identity otherwise.
+    '''
+    expansion = 2
+
+    def __init__(self, in_planes, cardinality=32, bottleneck_width=4, stride=1):
+        super().__init__()
+        group_width = cardinality * bottleneck_width
+        out_planes = self.expansion * group_width
+        self.conv1 = nn.Conv2d(in_planes, group_width, kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(group_width)
+        self.conv2 = nn.Conv2d(group_width, group_width, kernel_size=3, stride=stride, padding=1, groups=cardinality, bias=False)
+        self.bn2 = nn.BatchNorm2d(group_width)
+        self.conv3 = nn.Conv2d(group_width, out_planes, kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(out_planes)
+
+        if stride != 1 or in_planes != out_planes:
+            self.shortcut = nn.Sequential(
+                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False),
+                nn.BatchNorm2d(out_planes)
+            )
+        else:
+            # Empty Sequential acts as the identity.
+            self.shortcut = nn.Sequential()
+
+    def forward(self, x):
+        residual = F.relu(self.bn1(self.conv1(x)))
+        residual = F.relu(self.bn2(self.conv2(residual)))
+        residual = self.bn3(self.conv3(residual))
+        residual += self.shortcut(x)
+        return F.relu(residual)
+
+
+class ResNeXt(nn.Module):
+    '''ResNeXt classifier: a 1x1 stem conv + BN + ReLU, three stages of Block,
+    8x8 average pooling and a linear layer.
+
+    Args:
+        num_blocks: number of blocks per stage; indices 0..2 are used.
+        cardinality: number of groups in every block's 3x3 conv.
+        bottleneck_width: per-group width of the first stage; it doubles after
+            every stage, so ``self.bottleneck_width`` ends at 8x this value.
+        num_classes: size of the output layer.
+    '''
+
+    def __init__(self, num_blocks, cardinality, bottleneck_width, num_classes=10):
+        super().__init__()
+        self.cardinality = cardinality
+        self.bottleneck_width = bottleneck_width
+        # Running channel count fed to the next block; updated by _make_layer.
+        self.in_planes = 64
+
+        self.conv1 = nn.Conv2d(3, 64, kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(64)
+        self.layer1 = self._make_layer(num_blocks[0], 1)
+        self.layer2 = self._make_layer(num_blocks[1], 2)
+        self.layer3 = self._make_layer(num_blocks[2], 2)
+        # self.layer4 = self._make_layer(num_blocks[3], 2)
+        # Stage 3 runs at 4x the initial width and Block.expansion is 2,
+        # hence the factor 8 on the constructor argument.
+        self.linear = nn.Linear(cardinality*bottleneck_width*8, num_classes)
+
+    def _make_layer(self, num_blocks, stride):
+        '''Build one stage at the current width, then double ``self.bottleneck_width``.'''
+        stage = []
+        for block_stride in [stride] + [1]*(num_blocks-1):
+            stage.append(Block(self.in_planes, self.cardinality, self.bottleneck_width, block_stride))
+            self.in_planes = Block.expansion * self.cardinality * self.bottleneck_width
+        # Increase bottleneck_width by 2 after each stage.
+        self.bottleneck_width *= 2
+        return nn.Sequential(*stage)
+
+    def forward(self, x):
+        feats = F.relu(self.bn1(self.conv1(x)))
+        feats = self.layer1(feats)
+        feats = self.layer2(feats)
+        feats = self.layer3(feats)
+        # feats = self.layer4(feats)
+        pooled = F.avg_pool2d(feats, 8)
+        flat = pooled.view(pooled.size(0), -1)
+        return self.linear(flat)
+
+
+def ResNeXt29_2x64d():
+    return ResNeXt(num_blocks=[3,3,3], cardinality=2, bottleneck_width=64)
+
+def ResNeXt29_4x64d():
+    return ResNeXt(num_blocks=[3,3,3], cardinality=4, bottleneck_width=64)
+
+def ResNeXt29_8x64d():
+    return ResNeXt(num_blocks=[3,3,3], cardinality=8, bottleneck_width=64)
+
+def ResNeXt29_32x4d():
+    return ResNeXt(num_blocks=[3,3,3], cardinality=32, bottleneck_width=4)
+
+def test_resnext():
+    net = ResNeXt29_2x64d()
+    x = torch.randn(1,3,32,32)
+    y = net(Variable(x))
+    print(y.size())
+
+# test_resnext()
--- a/pytorch/models/senet.py
+++ b/pytorch/models/senet.py
+'''SENet in PyTorch.
+
+SENet is the winner of ImageNet-2017. The paper is not released yet.
+'''
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from torch.autograd import Variable
+
+
+class BasicBlock(nn.Module):
+    def __init__(self, in_planes, planes, stride=1):
+        super(BasicBlock, self).__init__()
+        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes)
+        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+
+        self.shortcut = nn.Sequential()
+        if stride != 1 or in_planes != planes:
+            self.shortcut = nn.Sequential(
+                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride, bias=False),
+                nn.BatchNorm2d(planes)
+            )
+
+        # SE layers
+        self.fc1 = nn.Conv2d(planes, planes//16, kernel_size=1)  # Use nn.Conv2d instead of nn.Linear
+        self.fc2 = nn.Conv2d(planes//16, planes, kernel_size=1)
+
+    def forward(self, x):
+        out = F.relu(self.bn1(self.conv1(x)))
+        out = self.bn2(self.conv2(out))
+
+        # Squeeze
+        w = F.avg_pool2d(out, out.size(2))
+        w = F.relu(self.fc1(w))
+        w = F.sigmoid(self.fc2(w))
+        # Excitation
+        out = out * w  # New broadcasting feature from v0.2!
+
+        out += self.shortcut(x)
+        out = F.relu(out)
+        return out
+
+
+class PreActBlock(nn.Module):
+    def __init__(self, in_planes, planes, stride=1):
+        super(PreActBlock, self).__init__()
+        self.bn1 = nn.BatchNorm2d(in_planes)
+        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
+
+        if stride != 1 or in_planes != planes:
+            self.shortcut = nn.Sequential(
+                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride, bias=False)
+            )
+
+        # SE layers
+        self.fc1 = nn.Conv2d(planes, planes//16, kernel_size=1)
+        self.fc2 = nn.Conv2d(planes//16, planes, kernel_size=1)
+
+    def forward(self, x):
+        out = F.relu(self.bn1(x))
+        shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x
+        out = self.conv1(out)
+        out = self.conv2(F.relu(self.bn2(out)))
+
+        # Squeeze
+        w = F.avg_pool2d(out, out.size(2))
+        w = F.relu(self.fc1(w))
+        w = F.sigmoid(self.fc2(w))
+        # Excitation
+        out = out * w
+
+        out += shortcut
+        return out
+
+
+class SENet(nn.Module):
+    def __init__(self, block, num_blocks, num_classes=10):
+        super(SENet, self).__init__()
+        self.in_planes = 64
+
+        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(64)
+        self.layer1 = self._make_layer(block,  64, num_blocks[0], stride=1)
+        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
+        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
+        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
+        self.linear = nn.Linear(512, num_classes)
+
+    def _make_layer(self, block, planes, num_blocks, stride):
+        strides = [stride] + [1]*(num_blocks-1)
+        layers = []
+        for stride in strides:
+            layers.append(block(self.in_planes, planes, stride))
+            self.in_planes = planes
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        out = F.relu(self.bn1(self.conv1(x)))
+        out = self.layer1(out)
+        out = self.layer2(out)
+        out = self.layer3(out)
+        out = self.layer4(out)
+        out = F.avg_pool2d(out, 4)
+        out = out.view(out.size(0), -1)
+        out = self.linear(out)
+        return out
+
+
+def SENet18():
+    return SENet(PreActBlock, [2,2,2,2])
+
+
+def test():
+    net = SENet18()
+    y = net(Variable(torch.randn(1,3,32,32)))
+    print(y.size())
+
+# test()
--- a/pytorch/models/shufflenet.py
+++ b/pytorch/models/shufflenet.py
+'''ShuffleNet in PyTorch.
+
+See the paper "ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices" for more details.
+'''
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from torch.autograd import Variable
+
+
+class ShuffleBlock(nn.Module):
+    def __init__(self, groups):
+        super(ShuffleBlock, self).__init__()
+        self.groups = groups
+
+    def forward(self, x):
+        '''Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,w] -> [N,C,H,W]'''
+        N,C,H,W = x.size()
+        g = self.groups
+        return x.view(N,g,C/g,H,W).permute(0,2,1,3,4).contiguous().view(N,C,H,W)
+
+
+class Bottleneck(nn.Module):
+    def __init__(self, in_planes, out_planes, stride, groups):
+        super(Bottleneck, self).__init__()
+        self.stride = stride
+
+        mid_planes = out_planes/4
+        g = 1 if in_planes==24 else groups
+        self.conv1 = nn.Conv2d(in_planes, mid_planes, kernel_size=1, groups=g, bias=False)
+        self.bn1 = nn.BatchNorm2d(mid_planes)
+        self.shuffle1 = ShuffleBlock(groups=g)
+        self.conv2 = nn.Conv2d(mid_planes, mid_planes, kernel_size=3, stride=stride, padding=1, groups=mid_planes, bias=False)
+        self.bn2 = nn.BatchNorm2d(mid_planes)
+        self.conv3 = nn.Conv2d(mid_planes, out_planes, kernel_size=1, groups=groups, bias=False)
+        self.bn3 = nn.BatchNorm2d(out_planes)
+
+        self.shortcut = nn.Sequential()
+        if stride == 2:
+            self.shortcut = nn.Sequential(nn.AvgPool2d(3, stride=2, padding=1))
+
+    def forward(self, x):
+        out = F.relu(self.bn1(self.conv1(x)))
+        out = self.shuffle1(out)
+        out = F.relu(self.bn2(self.conv2(out)))
+        out = self.bn3(self.conv3(out))
+        res = self.shortcut(x)
+        out = F.relu(torch.cat([out,res], 1)) if self.stride==2 else F.relu(out+res)
+        return out
+
+
+class ShuffleNet(nn.Module):
+    def __init__(self, cfg):
+        super(ShuffleNet, self).__init__()
+        out_planes = cfg['out_planes']
+        num_blocks = cfg['num_blocks']
+        groups = cfg['groups']
+
+        self.conv1 = nn.Conv2d(3, 24, kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(24)
+        self.in_planes = 24
+        self.layer1 = self._make_layer(out_planes[0], num_blocks[0], groups)
+        self.layer2 = self._make_layer(out_planes[1], num_blocks[1], groups)
+        self.layer3 = self._make_layer(out_planes[2], num_blocks[2], groups)
+        self.linear = nn.Linear(out_planes[2], 10)
+
+    def _make_layer(self, out_planes, num_blocks, groups):
+        layers = []
+        for i in range(num_blocks):
+            stride = 2 if i == 0 else 1
+            cat_planes = self.in_planes if i == 0 else 0
+            layers.append(Bottleneck(self.in_planes, out_planes-cat_planes, stride=stride, groups=groups))
+            self.in_planes = out_planes
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        out = F.relu(self.bn1(self.conv1(x)))
+        out = self.layer1(out)
+        out = self.layer2(out)
+        out = self.layer3(out)
+        out = F.avg_pool2d(out, 4)
+        out = out.view(out.size(0), -1)
+        out = self.linear(out)
+        return out
+
+
+def ShuffleNetG2():
+    cfg = {
+        'out_planes': [200,400,800],
+        'num_blocks': [4,8,4],
+        'groups': 2
+    }
+    return ShuffleNet(cfg)
+
+def ShuffleNetG3():
+    cfg = {
+        'out_planes': [240,480,960],
+        'num_blocks': [4,8,4],
+        'groups': 3
+    }
+    return ShuffleNet(cfg)
+
+
+def test():
+    net = ShuffleNetG2()
+    x = Variable(torch.randn(1,3,32,32))
+    y = net(x)
+    print(y)
+
+# test()
--- a/pytorch/models/vgg.py
+++ b/pytorch/models/vgg.py
+'''VGG11/13/16/19 in Pytorch.'''
+import torch
+import torch.nn as nn
+from torch.autograd import Variable
+
+
+cfg = {
+    'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
+    'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
+    'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
+    'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
+}
+
+
+class VGG(nn.Module):
+    def __init__(self, vgg_name):
+        super(VGG, self).__init__()
+        self.features = self._make_layers(cfg[vgg_name])
+        self.classifier = nn.Linear(512, 10)
+
+    def forward(self, x):
+        out = self.features(x)
+        out = out.view(out.size(0), -1)
+        out = self.classifier(out)
+        return out
+
+    def _make_layers(self, cfg):
+        layers = []
+        in_channels = 3
+        for x in cfg:
+            if x == 'M':
+                layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
+            else:
+                layers += [nn.Conv2d(in_channels, x, kernel_size=3, padding=1),
+                           nn.BatchNorm2d(x),
+                           nn.ReLU(inplace=True)]
+                in_channels = x
+        layers += [nn.AvgPool2d(kernel_size=1, stride=1)]
+        return nn.Sequential(*layers)
+
+# net = VGG('VGG11')
+# x = torch.randn(2,3,32,32)
+# print(net(Variable(x)).size())
--- a/pytorch/utils.py
+++ b/pytorch/utils.py
+'''Some helper functions for PyTorch, including:
+    - get_mean_and_std: calculate the mean and std value of dataset.
+    - init_params: net parameter initialization.
+    - progress_bar: progress bar mimic xlua.progress (not defined in this file; only a commented-out format_time helper remains).
+'''
+import os
+import sys
+import time
+import math
+
+import torch.nn as nn
+import torch.nn.init as init
+
+
+def get_mean_and_std(dataset):
+    '''Compute the mean and std value of dataset.'''
+    dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True, num_workers=2)
+    mean = torch.zeros(3)
+    std = torch.zeros(3)
+    print('==> Computing mean and std..')
+    for inputs, targets in dataloader:
+        for i in range(3):
+            mean[i] += inputs[:, i, :, :].mean()
+            std[i] += inputs[:, i, :, :].std()
+    mean.div_(len(dataset))
+    std.div_(len(dataset))
+    return mean, std
+
+
+def init_params(net):
+    '''Init layer parameters.'''
+    for m in net.modules():
+        if isinstance(m, nn.Conv2d):
+            init.kaiming_normal(m.weight, mode='fan_out')
+            if m.bias:
+                init.constant(m.bias, 0)
+        elif isinstance(m, nn.BatchNorm2d):
+            init.constant(m.weight, 1)
+            init.constant(m.bias, 0)
+        elif isinstance(m, nn.Linear):
+            init.normal(m.weight, std=1e-3)
+            if m.bias:
+                init.constant(m.bias, 0)
+
+
+# last_time = time.time()
+# begin_time = last_time
+#
+#
+# def format_time(seconds):
+#     days = int(seconds / 3600 / 24)
+#     seconds = seconds - days * 3600 * 24
+#     hours = int(seconds / 3600)
+#     seconds = seconds - hours * 3600
+#     minutes = int(seconds / 60)
+#     seconds = seconds - minutes * 60
+#     secondsf = int(seconds)
+#     seconds = seconds - secondsf
+#     millis = int(seconds * 1000)
+#
+#     f = ''
+#     i = 1
+#     if days > 0:
+#         f += str(days) + 'D'
+#         i += 1
+#     if hours > 0 and i <= 2:
+#         f += str(hours) + 'h'
+#         i += 1
+#     if minutes > 0 and i <= 2:
+#         f += str(minutes) + 'm'
+#         i += 1
+#     if secondsf > 0 and i <= 2:
+#         f += str(secondsf) + 's'
+#         i += 1
+#     if millis > 0 and i <= 2:
+#         f += str(millis) + 'ms'
+#         i += 1
+#     if f == '':
+#         f = '0ms'
+#     return f
--- a/tensorflow/cifare10/.autocnn/algorithm
+++ b/tensorflow/cifare10/.autocnn/algorithm
+{"name": "tf_cifar10", "user": "daiab", "unique_name": "daiab.tf_cifar10", "uuid": "a3ab488a5bf74828bc6b3bd8c4292324", "description": "", "is_public": false, "has_code": true, "created_at": "2018-05-03T11:49:24.829695+00:00", "updated_at": "2018-05-03T11:49:24.829781+00:00", "num_tasks": 0, "has_tensorboard": false, "has_notebook": false, "tasks": null, "framework": "tensorflow", "tags": ""}
\ No newline at end of file
--- a/tensorflow/cifare10/.autocnnignore
+++ b/tensorflow/cifare10/.autocnnignore
+
+.git
+.eggs
+eggs
+lib
+lib64
+parts
+sdist
+var
+*.pyc
+*.swp
+.DS_Store
+./.autocnn
--- a/tensorflow/cifare10/__init__.py
+++ b/tensorflow/cifare10/__init__.py
--- a/tensorflow/cifare10/__pycache__/cifar10.cpython-36.pyc
+++ b/tensorflow/cifare10/__pycache__/cifar10.cpython-36.pyc
--- a/tensorflow/cifare10/__pycache__/cifar10_input.cpython-36.pyc
+++ b/tensorflow/cifare10/__pycache__/cifar10_input.cpython-36.pyc
--- a/tensorflow/cifare10/autocnnfile.yml
+++ b/tensorflow/cifare10/autocnnfile.yml
+version: 1
+
+algorithm:
+  name: tf_cifar10
+
+resource:
+  default_resources:
+    cpu:
+      requests: 4
+      limits: 4
+    gpu:
+      requests: 2
+      limits: 2
+    memory:
+      requests: 10240
+      limits: 10240
+
+image:
+  from_image: tensorflow/tensorflow:1.5.0-devel-gpu-py3
+  # from_image: tensorflow/tensorflow:1.4.1
+  runs:
+    - apt-get -y update && apt-get -y install python3-pip && pip3 install atc-beta-helper
+
+train:
+  mount:
+    data/daiab/tf_cifar10/: /data
+  output: /output
+  parameter:
+    train_dir: /output   # Directory where to write event logs and checkpoint
+    max_steps: 1000000   # Number of batches to run.
+    log_device_placement: false  # Whether to log device placement.
+    log_frequency: 10    # How often to log results to the console.
+    batch_size: 128      # Number of images to process in a batch.
+    data_dir: /data     # Path to the CIFAR-10 data directory.
+    use_fp16: false      # Train the model using fp16.
+    num_gpus: 1
+  cmd: python3 cifar10_multi_gpu_train.py
+
+test:
+  mount:
+    data/daiab/tf_cifar10/: /data
+  output: /eval_output
+  ref_model:
+    10: /output  # make sure the model path saved in the checkpoint file matches the train output path
+  parameter:
+    data_dir: /data     # Path to the CIFAR-10 data directory.
+    eval_dir: /eval_output      # Directory where to write event logs.
+    eval_data:  test        # Either 'test' or 'train_eval'.
+    checkpoint_dir: /output      # Directory where to read model checkpoints.
+    eval_interval_secs: 1  # How often to run the eval.
+    num_examples: 10000
+    run_once: true
+    use_fp16: false
+    batch_size: 128
+  cmd: python3 cifar10_eval.py
--- a/tensorflow/cifare10/autocnnfile_distributed.yml
+++ b/tensorflow/cifare10/autocnnfile_distributed.yml
+---
+version: 1
+
+algorithm:
+  name: cifar10
+
+environment:
+  tensorflow:
+    n_workers: 3
+    n_ps: 1
+
+run:
+  image: tensorflow/tensorflow:1.4.1
+  steps:
+    - pip install --no-cache-dir -U autocnn-helper
+  cmd:  python run.py --train-steps=400 --sync
--- a/tensorflow/cifare10/cifar10.py
+++ b/tensorflow/cifare10/cifar10.py
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Builds the CIFAR-10 network.
+
+Summary of available functions:
+
+ # Compute input images and labels for training. If you would like to run
+ # evaluations, use inputs() instead.
+ inputs, labels = distorted_inputs()
+
+ # Compute inference on the model inputs to make a prediction.
+ predictions = inference(inputs)
+
+ # Compute the total loss of the prediction with respect to the labels.
+ loss = loss(predictions, labels)
+
+ # Create a graph to run one step of training with respect to the loss.
+ train_op = train(loss, global_step)
+"""
+# pylint: disable=missing-docstring
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import re
+import sys
+import tarfile
+
+from six.moves import urllib
+import tensorflow as tf
+
+import cifar10_input
+import autocnn_helper as helper
+# Run parameters read through attribute access below (use_fp16, data_dir,
+# batch_size).
+# NOTE(review): presumably populated from the `parameter` section of
+# autocnnfile.yml -- confirm against autocnn_helper.
+FLAGS = helper.get_parameter()
+
+# Global constants describing the CIFAR-10 data set.
+# All four are re-exported from cifar10_input so callers can use this module alone.
+IMAGE_SIZE = cifar10_input.IMAGE_SIZE
+NUM_CLASSES = cifar10_input.NUM_CLASSES
+NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = cifar10_input.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN
+NUM_EXAMPLES_PER_EPOCH_FOR_EVAL = cifar10_input.NUM_EXAMPLES_PER_EPOCH_FOR_EVAL
+
+# Constants describing the training process.
+MOVING_AVERAGE_DECAY = 0.9999  # The decay to use for the moving average.
+NUM_EPOCHS_PER_DECAY = 350.0  # Epochs after which learning rate decays.
+LEARNING_RATE_DECAY_FACTOR = 0.1  # Learning rate decay factor.
+INITIAL_LEARNING_RATE = 0.1  # Initial learning rate.
+
+# If a model is trained with multiple GPUs, prefix all Op names with tower_name
+# to differentiate the operations. Note that this prefix is removed from the
+# names of the summaries when visualizing a model.
+TOWER_NAME = 'tower'
+
+# Location of the binary CIFAR-10 archive; not referenced by any function
+# in this module.
+DATA_URL = 'https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz'
+
+
+def _activation_summary(x):
+    """Helper to create summaries for activations.
+
+    Creates a summary that provides a histogram of activations.
+    Creates a summary that measures the sparsity of activations.
+
+    Args:
+      x: Tensor
+    Returns:
+      nothing
+    """
+    # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
+    # session. This helps the clarity of presentation on tensorboard.
+    tensor_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', x.op.name)
+    tf.summary.histogram(tensor_name + '/activations', x)
+    tf.summary.scalar(tensor_name + '/sparsity',
+                      tf.nn.zero_fraction(x))
+
+
+def _variable_on_cpu(name, shape, initializer):
+    """Helper to create a Variable stored on CPU memory.
+
+    Args:
+      name: name of the variable
+      shape: list of ints
+      initializer: initializer for Variable
+
+    Returns:
+      Variable Tensor
+    """
+    with tf.device('/cpu:0'):
+        dtype = tf.float16 if FLAGS.use_fp16 else tf.float32
+        var = tf.get_variable(name, shape, initializer=initializer, dtype=dtype)
+    return var
+
+
+def _variable_with_weight_decay(name, shape, stddev, wd):
+    """Helper to create an initialized Variable with weight decay.
+
+    Note that the Variable is initialized with a truncated normal distribution.
+    A weight decay is added only if one is specified.
+
+    Args:
+      name: name of the variable
+      shape: list of ints
+      stddev: standard deviation of a truncated Gaussian
+      wd: add L2Loss weight decay multiplied by this float. If None, weight
+          decay is not added for this Variable.
+
+    Returns:
+      Variable Tensor
+    """
+    dtype = tf.float16 if FLAGS.use_fp16 else tf.float32
+    var = _variable_on_cpu(
+        name,
+        shape,
+        tf.truncated_normal_initializer(stddev=stddev, dtype=dtype))
+    if wd is not None:
+        weight_decay = tf.multiply(tf.nn.l2_loss(var), wd, name='weight_loss')
+        tf.add_to_collection('losses', weight_decay)
+    return var
+
+
+def distorted_inputs():
+    """Construct distorted input for CIFAR training using the Reader ops.
+
+    Returns:
+      images: Images. 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size.
+      labels: Labels. 1D tensor of [batch_size] size.
+
+    Raises:
+      ValueError: If no data_dir
+    """
+    if not FLAGS.data_dir:
+        raise ValueError('Please supply a data_dir')
+    data_dir = os.path.join(FLAGS.data_dir, 'cifar-10-batches-bin')
+    images, labels = cifar10_input.distorted_inputs(data_dir=data_dir,
+                                                    batch_size=FLAGS.batch_size)
+    if FLAGS.use_fp16:
+        images = tf.cast(images, tf.float16)
+        labels = tf.cast(labels, tf.float16)
+    return images, labels
+
+
+def inputs(eval_data):
+    """Construct input for CIFAR evaluation using the Reader ops.
+
+    Args:
+      eval_data: bool, indicating if one should use the train or eval data set.
+
+    Returns:
+      images: Images. 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size.
+      labels: Labels. 1D tensor of [batch_size] size.
+
+    Raises:
+      ValueError: If no data_dir
+    """
+    if not FLAGS.data_dir:
+        raise ValueError('Please supply a data_dir')
+    data_dir = os.path.join(FLAGS.data_dir, 'cifar-10-batches-bin')
+    images, labels = cifar10_input.inputs(eval_data=eval_data,
+                                          data_dir=data_dir,
+                                          batch_size=FLAGS.batch_size)
+    if FLAGS.use_fp16:
+        images = tf.cast(images, tf.float16)
+        labels = tf.cast(labels, tf.float16)
+    return images, labels
+
+
+def inference(images):
+    """Build the CIFAR-10 model.
+
+    The graph is: conv1 -> pool1 -> norm1 -> conv2 -> norm2 -> pool2 ->
+    local3 -> local4 -> softmax_linear. Only the two fully connected
+    layers (local3, local4) carry weight decay.
+
+    Args:
+      images: Images returned from distorted_inputs() or inputs(). The
+        batch dimension must be statically known, because it is used to
+        flatten the feature map before local3.
+
+    Returns:
+      Logits.
+    """
+    # We instantiate all variables using tf.get_variable() instead of
+    # tf.Variable() in order to share variables across multiple GPU training runs.
+    # If we only ran this model on a single GPU, we could simplify this function
+    # by replacing all instances of tf.get_variable() with tf.Variable().
+    #
+    # conv1: 5x5 kernel, 3 -> 64 channels, stride 1, no weight decay.
+    with tf.variable_scope('conv1') as scope:
+        kernel = _variable_with_weight_decay('weights',
+                                             shape=[5, 5, 3, 64],
+                                             stddev=5e-2,
+                                             wd=None)
+        conv = tf.nn.conv2d(images, kernel, [1, 1, 1, 1], padding='SAME')
+        biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.0))
+        pre_activation = tf.nn.bias_add(conv, biases)
+        conv1 = tf.nn.relu(pre_activation, name=scope.name)
+        _activation_summary(conv1)
+
+    # pool1: 3x3 max-pool with stride 2.
+    pool1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
+                           padding='SAME', name='pool1')
+    # norm1: local response normalization.
+    norm1 = tf.nn.lrn(pool1, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
+                      name='norm1')
+
+    # conv2: 5x5 kernel, 64 -> 64 channels, stride 1, biases start at 0.1.
+    with tf.variable_scope('conv2') as scope:
+        kernel = _variable_with_weight_decay('weights',
+                                             shape=[5, 5, 64, 64],
+                                             stddev=5e-2,
+                                             wd=None)
+        conv = tf.nn.conv2d(norm1, kernel, [1, 1, 1, 1], padding='SAME')
+        biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.1))
+        pre_activation = tf.nn.bias_add(conv, biases)
+        conv2 = tf.nn.relu(pre_activation, name=scope.name)
+        _activation_summary(conv2)
+
+    # norm2 (note: normalization comes before pooling here, unlike stage 1)
+    norm2 = tf.nn.lrn(conv2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
+                      name='norm2')
+    # pool2
+    pool2 = tf.nn.max_pool(norm2, ksize=[1, 3, 3, 1],
+                           strides=[1, 2, 2, 1], padding='SAME', name='pool2')
+
+    # local3: fully connected, flattened features -> 384, with weight decay.
+    with tf.variable_scope('local3') as scope:
+        # Move everything into depth so we can perform a single matrix multiply.
+        # The static batch size of `images` fixes the first dimension.
+        reshape = tf.reshape(pool2, [images.get_shape().as_list()[0], -1])
+        dim = reshape.get_shape()[1].value
+        weights = _variable_with_weight_decay('weights', shape=[dim, 384],
+                                              stddev=0.04, wd=0.004)
+        biases = _variable_on_cpu('biases', [384], tf.constant_initializer(0.1))
+        local3 = tf.nn.relu(tf.matmul(reshape, weights) + biases, name=scope.name)
+        _activation_summary(local3)
+
+    # local4: fully connected, 384 -> 192, with weight decay.
+    with tf.variable_scope('local4') as scope:
+        weights = _variable_with_weight_decay('weights', shape=[384, 192],
+                                              stddev=0.04, wd=0.004)
+        biases = _variable_on_cpu('biases', [192], tf.constant_initializer(0.1))
+        local4 = tf.nn.relu(tf.matmul(local3, weights) + biases, name=scope.name)
+        _activation_summary(local4)
+
+    # linear layer(WX + b),
+    # We don't apply softmax here because
+    # tf.nn.sparse_softmax_cross_entropy_with_logits accepts the unscaled logits
+    # and performs the softmax internally for efficiency.
+    with tf.variable_scope('softmax_linear') as scope:
+        weights = _variable_with_weight_decay('weights', [192, NUM_CLASSES],
+                                              stddev=1 / 192.0, wd=None)
+        biases = _variable_on_cpu('biases', [NUM_CLASSES],
+                                  tf.constant_initializer(0.0))
+        softmax_linear = tf.add(tf.matmul(local4, weights), biases, name=scope.name)
+        _activation_summary(softmax_linear)
+
+    return softmax_linear
+
+
+def loss(logits, labels):
+    """Add L2Loss to all the trainable variables.
+
+    Add summary for "Loss" and "Loss/avg".
+    Args:
+      logits: Logits from inference().
+      labels: Labels from distorted_inputs or inputs(). 1-D tensor
+              of shape [batch_size]
+
+    Returns:
+      Loss tensor of type float.
+    """
+    # Calculate the average cross entropy loss across the batch.
+    labels = tf.cast(labels, tf.int64)
+    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
+        labels=labels, logits=logits, name='cross_entropy_per_example')
+    cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
+    tf.add_to_collection('losses', cross_entropy_mean)
+
+    # The total loss is defined as the cross entropy loss plus all of the weight
+    # decay terms (L2 loss).
+    return tf.add_n(tf.get_collection('losses'), name='total_loss')
+
+
+def _add_loss_summaries(total_loss):
+    """Add summaries for losses in CIFAR-10 model.
+
+    Generates moving average for all losses and associated summaries for
+    visualizing the performance of the network.
+
+    Args:
+      total_loss: Total loss from loss().
+    Returns:
+      loss_averages_op: op for generating moving averages of losses.
+    """
+    # Compute the moving average of all individual losses and the total loss.
+    loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
+    losses = tf.get_collection('losses')
+    loss_averages_op = loss_averages.apply(losses + [total_loss])
+
+    # Attach a scalar summary to all individual losses and the total loss; do the
+    # same for the averaged version of the losses.
+    for l in losses + [total_loss]:
+        # Name each loss as '(raw)' and name the moving average version of the loss
+        # as the original loss name.
+        tf.summary.scalar(l.op.name + ' (raw)', l)
+        tf.summary.scalar(l.op.name, loss_averages.average(l))
+
+    return loss_averages_op
+
+
+def train(total_loss, global_step):
+    """Train CIFAR-10 model.
+
+    Create an optimizer and apply to all trainable variables. Add moving
+    average for all trainable variables.
+
+    The learning rate starts at INITIAL_LEARNING_RATE and is multiplied by
+    LEARNING_RATE_DECAY_FACTOR every NUM_EPOCHS_PER_DECAY epochs (staircase
+    schedule). Plain gradient descent is used.
+
+    Args:
+      total_loss: Total loss from loss().
+      global_step: Integer Variable counting the number of training steps
+        processed.
+    Returns:
+      train_op: op for training. Running it applies the gradients and then
+        updates the moving averages of the trainable variables.
+    """
+    # Variables that affect learning rate.
+    num_batches_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / FLAGS.batch_size
+    decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)
+
+    # Decay the learning rate exponentially based on the number of steps.
+    lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
+                                    global_step,
+                                    decay_steps,
+                                    LEARNING_RATE_DECAY_FACTOR,
+                                    staircase=True)
+    tf.summary.scalar('learning_rate', lr)
+
+    # Generate moving averages of all losses and associated summaries.
+    loss_averages_op = _add_loss_summaries(total_loss)
+
+    # Compute gradients.
+    # The control dependency forces the loss averages to update first.
+    with tf.control_dependencies([loss_averages_op]):
+        opt = tf.train.GradientDescentOptimizer(lr)
+        grads = opt.compute_gradients(total_loss)
+
+    # Apply gradients.
+    apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
+
+    # Add histograms for trainable variables.
+    for var in tf.trainable_variables():
+        tf.summary.histogram(var.op.name, var)
+
+    # Add histograms for gradients.
+    for grad, var in grads:
+        # Variables that do not influence the loss have no gradient.
+        if grad is not None:
+            tf.summary.histogram(var.op.name + '/gradients', grad)
+
+    # Track the moving averages of all trainable variables.
+    variable_averages = tf.train.ExponentialMovingAverage(
+        MOVING_AVERAGE_DECAY, global_step)
+    # The averages are only updated after the gradients have been applied.
+    with tf.control_dependencies([apply_gradient_op]):
+        variables_averages_op = variable_averages.apply(tf.trainable_variables())
+
+    return variables_averages_op
+
+
--- a/tensorflow/cifare10/cifar10_eval.py
+++ b/tensorflow/cifare10/cifar10_eval.py
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Evaluation for CIFAR-10.
+
+Accuracy:
+cifar10_train.py achieves 83.0% accuracy after 100K steps (256 epochs
+of data) as judged by cifar10_eval.py.
+
+Speed:
+On a single Tesla K40, cifar10_train.py processes a single batch of 128 images
+in 0.25-0.35 sec (i.e. 350 - 600 images /sec). The model reaches ~86%
+accuracy after 100K steps in 8 hours of training time.
+
+Usage:
+Please see the tutorial and website for how to download the CIFAR-10
+data set, compile the program and train the model.
+
+http://tensorflow.org/tutorials/deep_cnn/
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from datetime import datetime
+import math
+import time
+import numpy as np
+import tensorflow as tf
+import cifar10
+
+
+import autocnn_helper as helper
+# Run-time configuration object supplied by the autocnn helper. This script
+# reads checkpoint_dir, num_examples, batch_size, eval_data, eval_dir,
+# run_once and eval_interval_secs from it.
+FLAGS = helper.get_parameter()
+
+
+def eval_once(saver, summary_writer, top_k_op, summary_op):
+    """Run Eval once.
+
+    Args:
+      saver: Saver.
+      summary_writer: Summary writer.
+      top_k_op: Top K op.
+      summary_op: Summary op.
+    """
+    with tf.Session() as sess:
+        ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
+        if ckpt and ckpt.model_checkpoint_path:
+            # Restores from checkpoint
+            saver.restore(sess, ckpt.model_checkpoint_path)
+            # Assuming model_checkpoint_path looks something like:
+            #   /my-favorite-path/cifar10_train/model.ckpt-0,
+            # extract global_step from it.
+            # NOTE: the extracted value is a string; it is only used as the
+            # step argument of summary_writer.add_summary() below.
+            global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
+        else:
+            # Nothing to evaluate yet; the caller decides whether to retry.
+            print('No checkpoint file found')
+            return
+
+        # Start the queue runners.
+        coord = tf.train.Coordinator()
+        try:
+            threads = []
+            for qr in tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS):
+                threads.extend(qr.create_threads(sess, coord=coord, daemon=True,
+                                                 start=True))
+
+            # Round up to whole batches. Precision below is normalised by
+            # num_iter * batch_size rather than by FLAGS.num_examples.
+            num_iter = int(math.ceil(FLAGS.num_examples / FLAGS.batch_size))
+            true_count = 0  # Counts the number of correct predictions.
+            total_sample_count = num_iter * FLAGS.batch_size
+            step = 0
+            while step < num_iter and not coord.should_stop():
+                # Summing the fetched values adds this batch's hits to the
+                # running total (evaluate() passes a tf.nn.in_top_k op here).
+                predictions = sess.run([top_k_op])
+                true_count += np.sum(predictions)
+                step += 1
+
+            # Compute precision @ 1.
+            precision = true_count / total_sample_count
+            print('%s: precision @ 1 = %.3f' % (datetime.now(), precision))
+
+            # Run the merged summaries once more and append the precision
+            # as an extra scalar before writing the event.
+            summary = tf.Summary()
+            summary.ParseFromString(sess.run(summary_op))
+            summary.value.add(tag='Precision @ 1', simple_value=precision)
+            summary_writer.add_summary(summary, global_step)
+        except Exception as e:  # pylint: disable=broad-except
+            # Hand the error to the coordinator instead of raising here, so
+            # the queue-runner threads are asked to stop first.
+            coord.request_stop(e)
+
+        # Always stop and join the queue-runner threads before the session
+        # is closed by the enclosing `with`.
+        coord.request_stop()
+        coord.join(threads, stop_grace_period_secs=10)
+
+
+def evaluate():
+    """Eval CIFAR-10 for a number of steps."""
+    with tf.Graph().as_default() as g:
+        # Get images and labels for CIFAR-10.
+        # eval_data is True only when FLAGS.eval_data is the string 'test'.
+        eval_data = FLAGS.eval_data == 'test'
+        images, labels = cifar10.inputs(eval_data=eval_data)
+
+        # Build a Graph that computes the logits predictions from the
+        # inference model.
+        logits = cifar10.inference(images)
+
+        # Calculate predictions.
+        # k = 1: an example counts as correct only if its label is the
+        # top-scoring class.
+        top_k_op = tf.nn.in_top_k(logits, labels, 1)
+
+        # Restore the moving average version of the learned variables for eval.
+        variable_averages = tf.train.ExponentialMovingAverage(
+            cifar10.MOVING_AVERAGE_DECAY)
+        variables_to_restore = variable_averages.variables_to_restore()
+        saver = tf.train.Saver(variables_to_restore)
+
+        # Build the summary operation based on the TF collection of Summaries.
+        summary_op = tf.summary.merge_all()
+
+        summary_writer = tf.summary.FileWriter(FLAGS.eval_dir, g)
+
+        # Evaluate repeatedly, sleeping FLAGS.eval_interval_secs between
+        # passes, unless FLAGS.run_once asks for a single pass. eval_once()
+        # looks up the latest checkpoint again on every call.
+        while True:
+            eval_once(saver, summary_writer, top_k_op, summary_op)
+            if FLAGS.run_once:
+                break
+            time.sleep(FLAGS.eval_interval_secs)
+
+
+# Script entry point: run the evaluation loop when executed directly.
+if __name__ == '__main__':
+    evaluate()
+
--- a/tensorflow/cifare10/cifar10_input.py
+++ b/tensorflow/cifare10/cifar10_input.py
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Routine for decoding the CIFAR-10 binary file format."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from six.moves import xrange  # pylint: disable=redefined-builtin
+import tensorflow as tf
+
+# Process images of this size. Note that this differs from the original CIFAR
+# image size of 32 x 32. If one alters this number, then the entire model
+# architecture will change and any model would need to be retrained.
+# Both input pipelines in this file crop the decoded 32 x 32 image down to
+# IMAGE_SIZE x IMAGE_SIZE (random crop for training, central crop for eval).
+IMAGE_SIZE = 24
+
+# Global constants describing the CIFAR-10 data set.
+NUM_CLASSES = 10
+# The per-epoch counts are used in this file to size the example queue:
+# min_queue_examples is 40% of the relevant count.
+NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000
+NUM_EXAMPLES_PER_EPOCH_FOR_EVAL = 10000
+
+
+def read_cifar10(filename_queue):
+    """Reads and parses examples from CIFAR10 data files.
+
+    Recommendation: if you want N-way read parallelism, call this function
+    N times.  This will give you N independent Readers reading different
+    files & positions within those files, which will give better mixing of
+    examples.
+
+    Args:
+      filename_queue: A queue of strings with the filenames to read from.
+
+    Returns:
+      An object representing a single example, with the following fields:
+        height: number of rows in the result (32)
+        width: number of columns in the result (32)
+        depth: number of color channels in the result (3)
+        key: a scalar string Tensor describing the filename & record number
+          for this example.
+        label: an int32 Tensor with the label in the range 0..9.
+        uint8image: a [height, width, depth] uint8 Tensor with the image data
+    """
+
+    # Plain attribute container for the fields listed in the docstring.
+    class CIFAR10Record(object):
+        pass
+
+    result = CIFAR10Record()
+
+    # Dimensions of the images in the CIFAR-10 dataset.
+    # See http://www.cs.toronto.edu/~kriz/cifar.html for a description of the
+    # input format.
+    label_bytes = 1  # 2 for CIFAR-100
+    result.height = 32
+    result.width = 32
+    result.depth = 3
+    image_bytes = result.height * result.width * result.depth
+    # Every record consists of a label followed by the image, with a
+    # fixed number of bytes for each.
+    record_bytes = label_bytes + image_bytes
+
+    # Read a record, getting filenames from the filename_queue.  No
+    # header or footer in the CIFAR-10 format, so we leave header_bytes
+    # and footer_bytes at their default of 0.
+    reader = tf.FixedLengthRecordReader(record_bytes=record_bytes)
+    result.key, value = reader.read(filename_queue)
+
+    # Convert from a string to a vector of uint8 that is record_bytes long.
+    # NOTE: this rebinds record_bytes from the integer record length to the
+    # decoded uint8 tensor; every use below refers to the tensor.
+    record_bytes = tf.decode_raw(value, tf.uint8)
+
+    # The first bytes represent the label, which we convert from uint8->int32.
+    # The slice [0:label_bytes] keeps a leading dimension, so the label is a
+    # length-1 vector rather than a scalar.
+    result.label = tf.cast(
+        tf.strided_slice(record_bytes, [0], [label_bytes]), tf.int32)
+
+    # The remaining bytes after the label represent the image, which we reshape
+    # from [depth * height * width] to [depth, height, width].
+    depth_major = tf.reshape(
+        tf.strided_slice(record_bytes, [label_bytes],
+                         [label_bytes + image_bytes]),
+        [result.depth, result.height, result.width])
+    # Convert from [depth, height, width] to [height, width, depth].
+    result.uint8image = tf.transpose(depth_major, [1, 2, 0])
+
+    return result
+
+
+def _generate_image_and_label_batch(image, label, min_queue_examples,
+                                    batch_size, shuffle):
+    """Construct a queued batch of images and labels.
+
+    Args:
+      image: 3-D Tensor of [height, width, 3] of type.float32.
+      label: 1-D Tensor of type.int32
+      min_queue_examples: int32, minimum number of samples to retain
+        in the queue that provides of batches of examples.
+      batch_size: Number of images per batch.
+      shuffle: boolean indicating whether to use a shuffling queue.
+
+    Returns:
+      images: Images. 4D tensor of [batch_size, height, width, 3] size.
+      labels: Labels. 1D tensor of [batch_size] size.
+    """
+    # Create a queue that shuffles the examples, and then
+    # read 'batch_size' images + labels from the example queue.
+    num_preprocess_threads = 16
+    if shuffle:
+        # Shuffling queue: min_queue_examples is both the minimum left after
+        # a dequeue and the base of the capacity.
+        images, label_batch = tf.train.shuffle_batch(
+            [image, label],
+            batch_size=batch_size,
+            num_threads=num_preprocess_threads,
+            capacity=min_queue_examples + 3 * batch_size,
+            min_after_dequeue=min_queue_examples)
+    else:
+        # Non-shuffling queue: min_queue_examples only sizes the capacity.
+        images, label_batch = tf.train.batch(
+            [image, label],
+            batch_size=batch_size,
+            num_threads=num_preprocess_threads,
+            capacity=min_queue_examples + 3 * batch_size)
+
+    # Display the training images in the visualizer.
+    # This summary is added on both the shuffle and the non-shuffle path.
+    tf.summary.image('images', images)
+
+    # Flatten the batched labels to a vector of length batch_size.
+    return images, tf.reshape(label_batch, [batch_size])
+
+
+def distorted_inputs(data_dir, batch_size):
+    """Construct distorted input for CIFAR training using the Reader ops.
+
+    Args:
+      data_dir: Path to the CIFAR-10 data directory.
+      batch_size: Number of images per batch.
+
+    Returns:
+      images: Images. 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size.
+      labels: Labels. 1D tensor of [batch_size] size.
+    """
+    # The training set is split over data_batch_1.bin .. data_batch_5.bin.
+    filenames = [os.path.join(data_dir, 'data_batch_%d.bin' % i)
+                 for i in xrange(1, 6)]
+    # Fail early, at graph-construction time, if any file is missing.
+    for f in filenames:
+        if not tf.gfile.Exists(f):
+            raise ValueError('Failed to find file: ' + f)
+
+    # Create a queue that produces the filenames to read.
+    filename_queue = tf.train.string_input_producer(filenames)
+
+    with tf.name_scope('data_augmentation'):
+        # Read examples from files in the filename queue.
+        read_input = read_cifar10(filename_queue)
+        reshaped_image = tf.cast(read_input.uint8image, tf.float32)
+
+        height = IMAGE_SIZE
+        width = IMAGE_SIZE
+
+        # Image processing for training the network. Note the many random
+        # distortions applied to the image.
+
+        # Randomly crop a [height, width] section of the image.
+        distorted_image = tf.random_crop(reshaped_image, [height, width, 3])
+
+        # Randomly flip the image horizontally.
+        distorted_image = tf.image.random_flip_left_right(distorted_image)
+
+        # Because these operations are not commutative, consider randomizing
+        # the order of their operation.
+        # NOTE: since per_image_standardization zeros the mean and makes
+        # the stddev unit, this likely has no effect see tensorflow#1458.
+        distorted_image = tf.image.random_brightness(distorted_image,
+                                                     max_delta=63)
+        distorted_image = tf.image.random_contrast(distorted_image,
+                                                   lower=0.2, upper=1.8)
+
+        # Subtract off the mean and divide by the standard deviation of the
+        # pixels.
+        float_image = tf.image.per_image_standardization(distorted_image)
+
+        # Set the shapes of tensors.
+        float_image.set_shape([height, width, 3])
+        read_input.label.set_shape([1])
+
+        # Ensure that the random shuffling has good mixing properties.
+        min_fraction_of_examples_in_queue = 0.4
+        min_queue_examples = int(NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN *
+                                 min_fraction_of_examples_in_queue)
+        print('Filling queue with %d CIFAR images before starting to train. '
+              'This will take a few minutes.' % min_queue_examples)
+
+    # Generate a batch of images and labels by building up a queue of examples.
+    return _generate_image_and_label_batch(float_image, read_input.label,
+                                           min_queue_examples, batch_size,
+                                           shuffle=True)
+
+
+def inputs(eval_data, data_dir, batch_size):
+    """Construct input for CIFAR evaluation using the Reader ops.
+
+    Args:
+      eval_data: bool, indicating if one should use the train or eval data set.
+      data_dir: Path to the CIFAR-10 data directory.
+      batch_size: Number of images per batch.
+
+    Returns:
+      images: Images. 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size.
+      labels: Labels. 1D tensor of [batch_size] size.
+    """
+    # eval_data False -> the five training files; True -> test_batch.bin.
+    if not eval_data:
+        filenames = [os.path.join(data_dir, 'data_batch_%d.bin' % i)
+                     for i in xrange(1, 6)]
+        num_examples_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN
+    else:
+        filenames = [os.path.join(data_dir, 'test_batch.bin')]
+        num_examples_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_EVAL
+
+    # Fail early, at graph-construction time, if any file is missing.
+    for f in filenames:
+        if not tf.gfile.Exists(f):
+            raise ValueError('Failed to find file: ' + f)
+
+    with tf.name_scope('input'):
+        # Create a queue that produces the filenames to read.
+        filename_queue = tf.train.string_input_producer(filenames)
+
+        # Read examples from files in the filename queue.
+        read_input = read_cifar10(filename_queue)
+        reshaped_image = tf.cast(read_input.uint8image, tf.float32)
+
+        height = IMAGE_SIZE
+        width = IMAGE_SIZE
+
+        # Image processing for evaluation.
+        # Crop the central [height, width] of the image.
+        resized_image = tf.image.resize_image_with_crop_or_pad(reshaped_image,
+                                                               height, width)
+
+        # Subtract off the mean and divide by the standard deviation of the
+        # pixels.
+        float_image = tf.image.per_image_standardization(resized_image)
+
+        # Set the shapes of tensors.
+        float_image.set_shape([height, width, 3])
+        read_input.label.set_shape([1])
+
+        # No shuffling happens on this path (shuffle=False below), so this
+        # value only contributes to the batching queue's capacity.
+        min_fraction_of_examples_in_queue = 0.4
+        min_queue_examples = int(num_examples_per_epoch *
+                                 min_fraction_of_examples_in_queue)
+
+    # Generate a batch of images and labels by building up a queue of examples.
+    return _generate_image_and_label_batch(float_image, read_input.label,
+                                           min_queue_examples, batch_size,
+                                           shuffle=False)
--- a/tensorflow/cifare10/cifar10_input_test.py
+++ b/tensorflow/cifare10/cifar10_input_test.py
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tests for cifar10 input."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+import tensorflow as tf
+
+import cifar10_input
+
+
+class CIFAR10InputTest(tf.test.TestCase):
+    """Checks that read_cifar10 decodes hand-built CIFAR-10 records."""
+
+    def _record(self, label, red, green, blue):
+        """Build one raw record whose image is a single solid colour.
+
+        Args:
+          label: integer label stored in the first byte of the record.
+          red, green, blue: byte values used to fill each colour plane.
+
+        Returns:
+          (record, expected): the serialized bytes (the label byte followed
+          by the red, green and blue planes of 32 * 32 bytes each) and the
+          32 x 32 x 3 nested list the decoded image is compared against.
+        """
+        image_size = 32 * 32
+        record = bytes(bytearray([label] + [red] * image_size +
+                                 [green] * image_size + [blue] * image_size))
+        expected = [[[red, green, blue]] * 32] * 32
+        return record, expected
+
+    def testSimple(self):
+        """Read three records back, then expect the reader to run dry."""
+        labels = [9, 3, 0]
+        records = [self._record(labels[0], 0, 128, 255),
+                   self._record(labels[1], 255, 0, 1),
+                   self._record(labels[2], 254, 255, 0)]
+        contents = b"".join([record for record, _ in records])
+        expected = [expected for _, expected in records]
+        filename = os.path.join(self.get_temp_dir(), "cifar")
+        # Close the file deterministically so the data is flushed to disk
+        # before the reader opens it, instead of relying on garbage
+        # collection of the anonymous file object.
+        with open(filename, "wb") as f:
+            f.write(contents)
+
+        with self.test_session() as sess:
+            # A closed single-element queue makes the reader raise
+            # OutOfRangeError once the file is exhausted.
+            q = tf.FIFOQueue(99, [tf.string], shapes=())
+            q.enqueue([filename]).run()
+            q.close().run()
+            result = cifar10_input.read_cifar10(q)
+
+            for i in range(3):
+                key, label, uint8image = sess.run([
+                    result.key, result.label, result.uint8image])
+                self.assertEqual("%s:%d" % (filename, i), tf.compat.as_text(key))
+                self.assertEqual(labels[i], label)
+                self.assertAllEqual(expected[i], uint8image)
+
+            with self.assertRaises(tf.errors.OutOfRangeError):
+                sess.run([result.key, result.uint8image])
+
+
+# Script entry point: hand control to the TensorFlow test runner.
+if __name__ == "__main__":
+    tf.test.main()
--- a/tensorflow/cifare10/cifar10_multi_gpu_train.py
+++ b/tensorflow/cifare10/cifar10_multi_gpu_train.py
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""A binary to train CIFAR-10 using multiple GPUs with synchronous updates.
+
+Accuracy:
+cifar10_multi_gpu_train.py achieves ~86% accuracy after 100K steps (256
+epochs of data) as judged by cifar10_eval.py.
+
+Speed: With batch_size 128.
+
+System        | Step Time (sec/batch)  |     Accuracy
+--------------------------------------------------------------------
+1 Tesla K20m  | 0.35-0.60              | ~86% at 60K steps  (5 hours)
+1 Tesla K40m  | 0.25-0.35              | ~86% at 100K steps (4 hours)
+2 Tesla K20m  | 0.13-0.20              | ~84% at 30K steps  (2.5 hours)
+3 Tesla K20m  | 0.13-0.18              | ~84% at 30K steps
+4 Tesla K20m  | ~0.10                  | ~84% at 30K steps
+
+Usage:
+Please see the tutorial and website for how to download the CIFAR-10
+data set, compile the program and train the model.
+
+http://tensorflow.org/tutorials/deep_cnn/
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from datetime import datetime
+import os.path
+import re
+import time
+
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+import tensorflow as tf
+import cifar10
+import autocnn_helper as helper
+
+# Run-time configuration object supplied by the autocnn helper. This script
+# reads num_gpus, batch_size, max_steps, train_dir and log_device_placement
+# from it.
+FLAGS = helper.get_parameter()
+# Printed at import time, before any graph is built.
+print('Use GPU Num: ', FLAGS.num_gpus)
+
+
+def tower_loss(scope, images, labels):
+    """Calculate the total loss on a single tower running the CIFAR model.
+
+    Args:
+      scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'
+      images: Images. 4D tensor of shape [batch_size, height, width, 3].
+      labels: Labels. 1D tensor of shape [batch_size].
+
+    Returns:
+       Tensor of shape [] containing the total loss for a batch of data
+    """
+
+    # Build inference Graph.
+    logits = cifar10.inference(images)
+
+    # Build the portion of the Graph calculating the losses. Note that we will
+    # assemble the total_loss using a custom function below.
+    # The return value is discarded on purpose; presumably cifar10.loss()
+    # registers its terms in the 'losses' collection read below -- confirm
+    # against cifar10.py.
+    _ = cifar10.loss(logits, labels)
+
+    # Assemble all of the losses for the current tower only.
+    # Passing `scope` restricts the collection to ops created under it.
+    losses = tf.get_collection('losses', scope)
+
+    # Calculate the total loss for the current tower.
+    total_loss = tf.add_n(losses, name='total_loss')
+
+    # Attach a scalar summary to all individual losses and the total loss; do the
+    # same for the averaged version of the losses.
+    for l in losses + [total_loss]:
+        # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
+        # session. This helps the clarity of presentation on tensorboard.
+        loss_name = re.sub('%s_[0-9]*/' % cifar10.TOWER_NAME, '', l.op.name)
+        tf.summary.scalar(loss_name, l)
+
+    return total_loss
+
+
+def average_gradients(tower_grads):
+    """Calculate the average gradient for each shared variable across all towers.
+
+    Note that this function provides a synchronization point across all towers.
+
+    Args:
+      tower_grads: List of lists of (gradient, variable) tuples. The outer list
+        is over towers. The inner list is over the variables of that tower,
+        in the same order for every tower (as returned by
+        Optimizer.compute_gradients).
+    Returns:
+       List of pairs of (gradient, variable) where the gradient has been averaged
+       across all towers. The gradient is None for a variable that received
+       no gradient on any tower.
+    """
+    average_grads = []
+    for grad_and_vars in zip(*tower_grads):
+        # Note that each grad_and_vars looks like the following:
+        #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
+
+        # Keep in mind that the Variables are redundant because they are shared
+        # across towers. So .. we will just return the first tower's pointer to
+        # the Variable.
+        v = grad_and_vars[0][1]
+
+        # compute_gradients() yields None for a variable the loss does not
+        # depend on, and tf.expand_dims() cannot take None. Skip such entries;
+        # add a leading 'tower' dimension to the rest so they can be averaged.
+        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars
+                 if g is not None]
+        if not grads:
+            # No tower produced a gradient: pass None through, which the
+            # caller's summary loop and apply_gradients() already tolerate.
+            average_grads.append((None, v))
+            continue
+
+        # Average over the 'tower' dimension.
+        grad = tf.concat(axis=0, values=grads)
+        grad = tf.reduce_mean(grad, 0)
+
+        average_grads.append((grad, v))
+    return average_grads
+
+
+def train():
+    """Train CIFAR-10 for a number of steps."""
+    with tf.Graph().as_default(), tf.device('/cpu:0'):
+        # Create a variable to count the number of train() calls. This equals the
+        # number of batches processed * FLAGS.num_gpus.
+        global_step = tf.get_variable(
+            'global_step', [],
+            initializer=tf.constant_initializer(0), trainable=False)
+
+        # Calculate the learning rate schedule.
+        num_batches_per_epoch = (cifar10.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
+                                 FLAGS.batch_size)
+        decay_steps = int(num_batches_per_epoch * cifar10.NUM_EPOCHS_PER_DECAY)
+
+        # Decay the learning rate exponentially based on the number of steps.
+        lr = tf.train.exponential_decay(cifar10.INITIAL_LEARNING_RATE,
+                                        global_step,
+                                        decay_steps,
+                                        cifar10.LEARNING_RATE_DECAY_FACTOR,
+                                        staircase=True)
+
+        # Create an optimizer that performs gradient descent.
+        opt = tf.train.GradientDescentOptimizer(lr)
+
+        # Get images and labels for CIFAR-10.
+        images, labels = cifar10.distorted_inputs()
+        # Prefetch batches so each tower can dequeue its own one.
+        batch_queue = tf.contrib.slim.prefetch_queue.prefetch_queue(
+            [images, labels], capacity=2 * FLAGS.num_gpus)
+        # Calculate the gradients for each model tower.
+        tower_grads = []
+        with tf.variable_scope(tf.get_variable_scope()):
+            for i in xrange(FLAGS.num_gpus):
+                with tf.device('/gpu:%d' % i):
+                    with tf.name_scope('%s_%d' % (cifar10.TOWER_NAME, i)) as scope:
+                        # Dequeues one batch for the GPU
+                        image_batch, label_batch = batch_queue.dequeue()
+                        # Calculate the loss for one tower of the CIFAR model. This function
+                        # constructs the entire CIFAR model but shares the variables across
+                        # all towers.
+                        # NOTE: `loss` is overwritten on every iteration, so after
+                        # the loop it refers to the last tower's loss only.
+                        loss = tower_loss(scope, image_batch, label_batch)
+
+                        # Reuse variables for the next tower.
+                        tf.get_variable_scope().reuse_variables()
+
+                        # Retain the summaries from the final tower.
+                        summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)
+
+                        # Calculate the gradients for the batch of data on this CIFAR tower.
+                        grads = opt.compute_gradients(loss)
+
+                        # Keep track of the gradients across all towers.
+                        tower_grads.append(grads)
+
+        # We must calculate the mean of each gradient. Note that this is the
+        # synchronization point across all towers.
+        grads = average_gradients(tower_grads)
+
+        # Add a summary to track the learning rate.
+        summaries.append(tf.summary.scalar('learning_rate', lr))
+
+        # Add histograms for gradients.
+        for grad, var in grads:
+            if grad is not None:
+                summaries.append(tf.summary.histogram(var.op.name + '/gradients', grad))
+
+        # Apply the gradients to adjust the shared variables.
+        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
+
+        # Add histograms for trainable variables.
+        for var in tf.trainable_variables():
+            summaries.append(tf.summary.histogram(var.op.name, var))
+
+        # Track the moving averages of all trainable variables.
+        variable_averages = tf.train.ExponentialMovingAverage(
+            cifar10.MOVING_AVERAGE_DECAY, global_step)
+        variables_averages_op = variable_averages.apply(tf.trainable_variables())
+
+        # Group all updates into a single train op.
+        train_op = tf.group(apply_gradient_op, variables_averages_op)
+
+        # Create a saver.
+        saver = tf.train.Saver(tf.global_variables())
+
+        # Build the summary operation from the last tower summaries.
+        summary_op = tf.summary.merge(summaries)
+
+        # Build an initialization operation to run below.
+        init = tf.global_variables_initializer()
+
+        # Start running operations on the Graph. allow_soft_placement must be set to
+        # True to build towers on GPU, as some of the ops do not have GPU
+        # implementations.
+        sess = tf.Session(config=tf.ConfigProto(
+            allow_soft_placement=True,
+            log_device_placement=FLAGS.log_device_placement))
+        sess.run(init)
+
+        # Start the queue runners.
+        tf.train.start_queue_runners(sess=sess)
+
+        summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)
+
+        for step in xrange(FLAGS.max_steps):
+            start_time = time.time()
+            # The reported loss_value is the last tower's loss (see the loop
+            # above), not an average over towers.
+            _, loss_value = sess.run([train_op, loss])
+            duration = time.time() - start_time
+
+            # NOTE(review): assert statements are removed under `python -O`,
+            # so this divergence check is skipped in optimized mode.
+            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
+
+            if step % 10 == 0:
+                # One step consumes one batch per GPU.
+                num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
+                examples_per_sec = num_examples_per_step / duration
+                sec_per_batch = duration / FLAGS.num_gpus
+
+                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
+                              'sec/batch)')
+                print(format_str % (datetime.now(), step, loss_value,
+                                    examples_per_sec, sec_per_batch))
+
+            if step % 100 == 0:
+                # This is a separate sess.run(), executed in addition to the
+                # training step above.
+                summary_str = sess.run(summary_op)
+                summary_writer.add_summary(summary_str, step)
+
+            # Save the model checkpoint periodically.
+            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
+                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
+                saver.save(sess, checkpoint_path, global_step=step)
+
+
+# Script entry point: start multi-GPU training when executed directly.
+if __name__ == '__main__':
+    train()
+
--- a/tensorflow/cifare10/cifar10_train.py
+++ b/tensorflow/cifare10/cifar10_train.py
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""A binary to train CIFAR-10 using a single GPU.
+
+Accuracy:
+cifar10_train.py achieves ~86% accuracy after 100K steps (256 epochs of
+data) as judged by cifar10_eval.py.
+
+Speed: With batch_size 128.
+
+System        | Step Time (sec/batch)  |     Accuracy
+------------------------------------------------------------------
+1 Tesla K20m  | 0.35-0.60              | ~86% at 60K steps  (5 hours)
+1 Tesla K40m  | 0.25-0.35              | ~86% at 100K steps (4 hours)
+
+Usage:
+Please see the tutorial and website for how to download the CIFAR-10
+data set, compile the program and train the model.
+
+http://tensorflow.org/tutorials/deep_cnn/
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from datetime import datetime
+import time
+
+import tensorflow as tf
+
+import cifar10
+import autocnn_helper as helper
+
+# Run-time configuration object supplied by the autocnn helper. This script
+# reads log_frequency, batch_size, train_dir, max_steps and
+# log_device_placement from it.
+FLAGS = helper.get_parameter()
+
+
+def train():
+    """Train CIFAR-10 for a number of steps."""
+    with tf.Graph().as_default():
+        global_step = tf.train.get_or_create_global_step()
+
+        # Get images and labels for CIFAR-10.
+        # Force input pipeline to CPU:0 to avoid operations sometimes ending up on
+        # GPU and resulting in a slow down.
+        with tf.device('/cpu:0'):
+            images, labels = cifar10.distorted_inputs()
+
+        # Build a Graph that computes the logits predictions from the
+        # inference model.
+        logits = cifar10.inference(images)
+
+        # Calculate loss.
+        loss = cifar10.loss(logits, labels)
+
+        # Build a Graph that trains the model with one batch of examples and
+        # updates the model parameters.
+        train_op = cifar10.train(loss, global_step)
+
+        # Defined inside train() so the hook can close over `loss`.
+        class _LoggerHook(tf.train.SessionRunHook):
+            """Logs loss and runtime."""
+
+            def begin(self):
+                # Starts at -1 so the first before_run() makes it step 0.
+                self._step = -1
+                self._start_time = time.time()
+
+            def before_run(self, run_context):
+                self._step += 1
+                return tf.train.SessionRunArgs(loss)  # Asks for loss value.
+
+            def after_run(self, run_context, run_values):
+                if self._step % FLAGS.log_frequency == 0:
+                    # duration covers the steps since the previous log line;
+                    # the timer is reset so the next interval starts here.
+                    current_time = time.time()
+                    duration = current_time - self._start_time
+                    self._start_time = current_time
+
+                    # The loss value requested in before_run().
+                    loss_value = run_values.results
+                    examples_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration
+                    sec_per_batch = float(duration / FLAGS.log_frequency)
+
+                    format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
+                                  'sec/batch)')
+                    print(format_str % (datetime.now(), self._step, loss_value,
+                                        examples_per_sec, sec_per_batch))
+
+        # The hooks stop training at FLAGS.max_steps, watch the loss for NaN
+        # and print progress; checkpoints go to FLAGS.train_dir.
+        with tf.train.MonitoredTrainingSession(
+                checkpoint_dir=FLAGS.train_dir,
+                hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
+                       tf.train.NanTensorHook(loss),
+                       _LoggerHook()],
+                config=tf.ConfigProto(
+                    log_device_placement=FLAGS.log_device_placement)) as mon_sess:
+            while not mon_sess.should_stop():
+                mon_sess.run(train_op)
+
+
+# Script entry point: start single-GPU training when executed directly.
+if __name__ == '__main__':
+    train()
+