From 6fc0183d5671115f0c7f32eeb3c7d91925cb98c3 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Wed, 11 Jul 2018 21:29:54 -0700 Subject: [PATCH 01/40] Fix integer overflow when the array size is too large --- src/operator/convolution_v1-inl.h | 4 ++-- src/operator/nn/deconvolution-inl.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/operator/convolution_v1-inl.h b/src/operator/convolution_v1-inl.h index d8310e6f1fc0..1ac611d31185 100644 --- a/src/operator/convolution_v1-inl.h +++ b/src/operator/convolution_v1-inl.h @@ -337,10 +337,10 @@ class ConvolutionV1Op : public Operator { // if param_.workspace is set to zero the nstep_ equals ishape[0] (batch) nstep_ = std::max( std::min( - static_cast( + static_cast( param_.workspace / (shape_colunit_.Size() + shape_dstunit_.Size())), ishape[0]), - 1U); + 1UL); mshadow::Shape<2> scol = mshadow::Shape2(shape_colunit_[0], shape_colunit_[1] * nstep_); diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index b41ecf4aa41e..789633be388c 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -460,10 +460,10 @@ class DeconvolutionOp { // See convolution for workspace calculations. 
nstep_ will be the effective batch size nstep_ = std::max( std::min( - static_cast( + static_cast( param_.workspace / (shape_colunit_.Size() + shape_dstunit_.Size())), ishape[0]), - 1U); + 1UL); mshadow::Shape<2> scol = mshadow::Shape2(shape_colunit_[0], shape_colunit_[1] * nstep_); From 3c2379549eef856bff450b5f10da278f99b06fdc Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Mon, 16 Jul 2018 05:44:40 -0700 Subject: [PATCH 02/40] Update issue templates --- .github/ISSUE_TEMPLATE/bug_report.md | 35 ++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 000000000000..b735373365bc --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,35 @@ +--- +name: Bug report +about: Create a report to help us improve + +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**To Reproduce** +Steps to reproduce the behavior: +1. Go to '...' +2. Click on '....' +3. Scroll down to '....' +4. See error + +**Expected behavior** +A clear and concise description of what you expected to happen. + +**Screenshots** +If applicable, add screenshots to help explain your problem. + +**Desktop (please complete the following information):** + - OS: [e.g. iOS] + - Browser [e.g. chrome, safari] + - Version [e.g. 22] + +**Smartphone (please complete the following information):** + - Device: [e.g. iPhone6] + - OS: [e.g. iOS8.1] + - Browser [e.g. stock browser, safari] + - Version [e.g. 22] + +**Additional context** +Add any other context about the problem here. 
From a4ba87e7b12ac108ae5fdffae9adc318f4524a57 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Mon, 16 Jul 2018 05:44:40 -0700 Subject: [PATCH 03/40] Update issue templates --- .github/ISSUE_TEMPLATE/bug_report.md | 35 ++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 000000000000..b735373365bc --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,35 @@ +--- +name: Bug report +about: Create a report to help us improve + +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**To Reproduce** +Steps to reproduce the behavior: +1. Go to '...' +2. Click on '....' +3. Scroll down to '....' +4. See error + +**Expected behavior** +A clear and concise description of what you expected to happen. + +**Screenshots** +If applicable, add screenshots to help explain your problem. + +**Desktop (please complete the following information):** + - OS: [e.g. iOS] + - Browser [e.g. chrome, safari] + - Version [e.g. 22] + +**Smartphone (please complete the following information):** + - Device: [e.g. iPhone6] + - OS: [e.g. iOS8.1] + - Browser [e.g. stock browser, safari] + - Version [e.g. 22] + +**Additional context** +Add any other context about the problem here. 
From b3f94f24dbe902affb14f7da3d86467847f130a1 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Tue, 17 Jul 2018 11:07:38 -0700 Subject: [PATCH 04/40] Remove files added by mistake --- .github/ISSUE_TEMPLATE/bug_report.md | 35 ---------------------------- 1 file changed, 35 deletions(-) delete mode 100644 .github/ISSUE_TEMPLATE/bug_report.md diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md deleted file mode 100644 index b735373365bc..000000000000 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ /dev/null @@ -1,35 +0,0 @@ ---- -name: Bug report -about: Create a report to help us improve - ---- - -**Describe the bug** -A clear and concise description of what the bug is. - -**To Reproduce** -Steps to reproduce the behavior: -1. Go to '...' -2. Click on '....' -3. Scroll down to '....' -4. See error - -**Expected behavior** -A clear and concise description of what you expected to happen. - -**Screenshots** -If applicable, add screenshots to help explain your problem. - -**Desktop (please complete the following information):** - - OS: [e.g. iOS] - - Browser [e.g. chrome, safari] - - Version [e.g. 22] - -**Smartphone (please complete the following information):** - - Device: [e.g. iPhone6] - - OS: [e.g. iOS8.1] - - Browser [e.g. stock browser, safari] - - Version [e.g. 22] - -**Additional context** -Add any other context about the problem here. 
From 4ade1a59809c52357f6132f95b3eaabd2ba00073 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Tue, 17 Jul 2018 14:50:23 -0700 Subject: [PATCH 05/40] Fix compilation error after type index_t changed to int64_t --- src/c_api/c_api_function.cc | 2 +- src/io/iter_image_recordio_2.cc | 5 +++-- src/operator/convolution_v1-inl.h | 4 ++-- src/operator/custom/custom.cc | 2 +- src/operator/elemwise_op_common.h | 4 ++-- src/operator/nn/deconvolution-inl.h | 4 ++-- src/operator/operator_common.h | 4 ++-- src/operator/tensor/ordering_op-inl.h | 2 +- src/operator/tensor/ordering_op.cc | 4 ++-- 9 files changed, 16 insertions(+), 15 deletions(-) diff --git a/src/c_api/c_api_function.cc b/src/c_api/c_api_function.cc index cea8c9553ccf..83b77202f22e 100644 --- a/src/c_api/c_api_function.cc +++ b/src/c_api/c_api_function.cc @@ -56,7 +56,7 @@ std::vector Gradient( std::vector ret; for (index_t i = 0; i < g->num_outputs(); ++i) { - ret.emplace_back(nnvm::NodeEntry{g, i, 0}); + ret.emplace_back(nnvm::NodeEntry{g, static_cast(i), 0}); } return ret; diff --git a/src/io/iter_image_recordio_2.cc b/src/io/iter_image_recordio_2.cc index b6ff6e99b034..ac166dd2fba7 100644 --- a/src/io/iter_image_recordio_2.cc +++ b/src/io/iter_image_recordio_2.cc @@ -326,7 +326,8 @@ inline bool ImageRecordIOParser2::ParseNext(DataBatch *out) { n_to_out = 0; } } else { - int n_to_copy = std::min(n_parsed_, batch_param_.batch_size - current_size); + int n_to_copy = std::min(n_parsed_, + static_cast(batch_param_.batch_size) - current_size); n_parsed_ -= n_to_copy; // Copy #pragma omp parallel for num_threads(param_.preprocess_threads) @@ -604,7 +605,7 @@ inline unsigned ImageRecordIOParser2::ParseChunk(DType* data_dptr, real_t res.release(); } } - return (std::min(batch_param_.batch_size, gl_idx) - current_size); + return (std::min(static_cast(batch_param_.batch_size), gl_idx) - current_size); #else LOG(FATAL) << "Opencv is needed for image decoding and augmenting."; return 0; diff --git 
a/src/operator/convolution_v1-inl.h b/src/operator/convolution_v1-inl.h index 1ac611d31185..8435c15e1114 100644 --- a/src/operator/convolution_v1-inl.h +++ b/src/operator/convolution_v1-inl.h @@ -337,10 +337,10 @@ class ConvolutionV1Op : public Operator { // if param_.workspace is set to zero the nstep_ equals ishape[0] (batch) nstep_ = std::max( std::min( - static_cast( + static_cast( param_.workspace / (shape_colunit_.Size() + shape_dstunit_.Size())), ishape[0]), - 1UL); + 1LL); mshadow::Shape<2> scol = mshadow::Shape2(shape_colunit_[0], shape_colunit_[1] * nstep_); diff --git a/src/operator/custom/custom.cc b/src/operator/custom/custom.cc index d117a2842166..11e5e36dbeb0 100644 --- a/src/operator/custom/custom.cc +++ b/src/operator/custom/custom.cc @@ -239,7 +239,7 @@ std::vector Gradient( std::vector ret; for (index_t i = 0; i < params.num_args; ++i) { - ret.emplace_back(nnvm::NodeEntry{g, i, 0}); + ret.emplace_back(nnvm::NodeEntry{g, static_cast(i), 0}); } if (params.num_auxs) { nnvm::NodePtr ng = nnvm::Node::Create(); diff --git a/src/operator/elemwise_op_common.h b/src/operator/elemwise_op_common.h index 16aa0c388cd1..1377d2db2536 100644 --- a/src/operator/elemwise_op_common.h +++ b/src/operator/elemwise_op_common.h @@ -199,7 +199,7 @@ struct ElemwiseGradUseOut { std::vector heads; index_t n_out = n->num_outputs(); for (index_t i = 0; i < n_out; ++i) { - heads.emplace_back(nnvm::NodeEntry{n, i, 0}); + heads.emplace_back(nnvm::NodeEntry{n, static_cast(i), 0}); } return MakeNonlossGradNode(op_name, n, ograds, heads, n->attrs.dict); } @@ -216,7 +216,7 @@ struct ElemwiseGradUseInOut { } index_t n_out = n->num_outputs(); for (index_t i = 0; i < n_out; ++i) { - heads.emplace_back(nnvm::NodeEntry{n, i, 0}); + heads.emplace_back(nnvm::NodeEntry{n, static_cast(i), 0}); } return MakeGradNode(op_name, n, heads, n->attrs.dict); } diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index 789633be388c..f1837463f729 100644 --- 
a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -460,10 +460,10 @@ class DeconvolutionOp { // See convolution for workspace calculations. nstep_ will be the effective batch size nstep_ = std::max( std::min( - static_cast( + static_cast( param_.workspace / (shape_colunit_.Size() + shape_dstunit_.Size())), ishape[0]), - 1UL); + 1LL); mshadow::Shape<2> scol = mshadow::Shape2(shape_colunit_[0], shape_colunit_[1] * nstep_); diff --git a/src/operator/operator_common.h b/src/operator/operator_common.h index 02130eb32e51..912f30aa54ee 100644 --- a/src/operator/operator_common.h +++ b/src/operator/operator_common.h @@ -396,7 +396,7 @@ inline std::vector MakeGradNode( &inputs, &dict, &n); std::vector ret; for (index_t i = 0; i < p->num_outputs(); ++i) { - ret.emplace_back(nnvm::NodeEntry{p, i, 0}); + ret.emplace_back(nnvm::NodeEntry{p, static_cast(i), 0}); } return ret; } @@ -446,7 +446,7 @@ inline std::vector MakeNonlossGradNode( p->inputs.insert(p->inputs.end(), inputs.begin(), inputs.end()); std::vector ret; for (index_t i = 0; i < p->num_outputs(); ++i) { - ret.emplace_back(nnvm::NodeEntry{p, i, 0}); + ret.emplace_back(nnvm::NodeEntry{p, static_cast(i), 0}); } return ret; } diff --git a/src/operator/tensor/ordering_op-inl.h b/src/operator/tensor/ordering_op-inl.h index 105ee8b90db8..cd1e89e447c7 100644 --- a/src/operator/tensor/ordering_op-inl.h +++ b/src/operator/tensor/ordering_op-inl.h @@ -367,7 +367,7 @@ void TopKImpl(RunContext ctx, // Additional temp space for gpu full sorts for batch ids. temp_size += sizeof(int) * src.Size(); // Temp space for cpu sorts. 
- temp_size = std::max(temp_size, sizeof(real_t) * src.Size()); + temp_size = std::max(temp_size, sizeof(real_t) * static_cast(src.Size())); size_t workspace_size = temp_size + sizeof(real_t) * src.Size() + sizeof(int) * src.Size(); if (param.ret_typ == topk_enum::kReturnMask) { workspace_size += sizeof(int) * batch_size * k + sizeof(real_t) * batch_size * k; diff --git a/src/operator/tensor/ordering_op.cc b/src/operator/tensor/ordering_op.cc index ebd7c62ec886..0cfa095e5523 100644 --- a/src/operator/tensor/ordering_op.cc +++ b/src/operator/tensor/ordering_op.cc @@ -75,7 +75,7 @@ Examples:: std::vector inputs; index_t n_out = n->num_outputs(); for (index_t i = 0; i < n_out; ++i) { - inputs.emplace_back(nnvm::NodeEntry{ n, i, 0 }); + inputs.emplace_back(nnvm::NodeEntry{ n, static_cast(i), 0 }); } return MakeNonlossGradNode("_backward_topk", n, {ograds[0]}, inputs, n->attrs.dict); } else { @@ -137,7 +137,7 @@ Examples:: std::vector inputs; index_t n_out = n->num_outputs(); for (index_t i = 0; i < n_out; ++i) { - inputs.emplace_back(nnvm::NodeEntry{ n, i, 0 }); + inputs.emplace_back(nnvm::NodeEntry{ n, static_cast(i), 0 }); } return MakeNonlossGradNode("_backward_topk", n, {ograds[0]}, inputs, {{"axis", n->attrs.dict["axis"]}, From b2f4576eb956b0a94f1e67ddf3443a78924caab0 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 17 Jul 2018 23:18:10 +0000 Subject: [PATCH 06/40] Explicity specify type in std::max template to avoid platform dependent compilation error --- src/operator/convolution_v1-inl.h | 4 ++-- src/operator/nn/deconvolution-inl.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/operator/convolution_v1-inl.h b/src/operator/convolution_v1-inl.h index 8435c15e1114..fcb24915eb75 100644 --- a/src/operator/convolution_v1-inl.h +++ b/src/operator/convolution_v1-inl.h @@ -335,12 +335,12 @@ class ConvolutionV1Op : public Operator { oshape[2] * oshape[3]); // param_.workspace is in elements of sizeof(DType) // if param_.workspace is set to 
zero the nstep_ equals ishape[0] (batch) - nstep_ = std::max( + nstep_ = std::max( std::min( static_cast( param_.workspace / (shape_colunit_.Size() + shape_dstunit_.Size())), ishape[0]), - 1LL); + 1); mshadow::Shape<2> scol = mshadow::Shape2(shape_colunit_[0], shape_colunit_[1] * nstep_); diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index f1837463f729..027777ce5e0c 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -458,12 +458,12 @@ class DeconvolutionOp { oshape[1] / param_.num_group, oshape[2] * oshape[3]); // See convolution for workspace calculations. nstep_ will be the effective batch size - nstep_ = std::max( + nstep_ = std::max( std::min( static_cast( param_.workspace / (shape_colunit_.Size() + shape_dstunit_.Size())), ishape[0]), - 1LL); + 1); mshadow::Shape<2> scol = mshadow::Shape2(shape_colunit_[0], shape_colunit_[1] * nstep_); From 7cb661115c620597dfd5dcc0fe141f5e599fbb20 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Wed, 18 Jul 2018 09:56:16 -0700 Subject: [PATCH 07/40] Add nightly test for large array --- tests/nightly/test_large_array.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 tests/nightly/test_large_array.py diff --git a/tests/nightly/test_large_array.py b/tests/nightly/test_large_array.py new file mode 100644 index 000000000000..609f77ea291f --- /dev/null +++ b/tests/nightly/test_large_array.py @@ -0,0 +1,31 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import mxnet as mx +from mxnet import gluon, nd + +def test_ndarray2numpy(): + m = gluon.nn.Embedding(14000, 128) + m.initialize() + ind = nd.zeros((700000, 128)) + x = m(ind) + x.shape + test=x.asnumpy() + assert (x.shape == test.shape) + +if __name__ == '__main__': + test_ndarray2numpy() \ No newline at end of file From cc07f5ff531a401e4e9cbf685dc919a6466d0bc4 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Wed, 1 Aug 2018 11:15:34 -0700 Subject: [PATCH 08/40] Update submodule mshadow --- 3rdparty/mshadow | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/mshadow b/3rdparty/mshadow index 463c0dffe3ea..d68d3694fdfb 160000 --- a/3rdparty/mshadow +++ b/3rdparty/mshadow @@ -1 +1 @@ -Subproject commit 463c0dffe3eae8c39caf7989c85b7244823df27e +Subproject commit d68d3694fdfb44fdbb7c840c3591131ff2310a59 From 4b841be7c1d57263194a6df3fb4ac4a3d5b8ceef Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Wed, 1 Aug 2018 21:50:32 -0700 Subject: [PATCH 09/40] Fix compilation warning --- src/operator/channel_op_common.h | 4 ++-- src/operator/contrib/count_sketch-inl.h | 2 +- src/operator/contrib/deformable_convolution-inl.h | 4 ++-- src/operator/contrib/fft-inl.h | 2 +- src/operator/contrib/ifft-inl.h | 2 +- src/operator/contrib/sync_batch_norm-inl.h | 4 ++-- src/operator/custom/custom.cc | 4 ++-- src/operator/custom/native_op-inl.h | 4 ++-- src/operator/nn/batch_norm.cc | 2 +- src/operator/nn/convolution.cc | 2 +- src/operator/nn/deconvolution-inl.h | 4 ++-- src/operator/nn/deconvolution.cc | 2 +- src/operator/nn/lrn.cc | 2 +- 
src/operator/nn/pooling-inl.h | 2 +- src/operator/nn/upsampling.cc | 2 +- src/operator/softmax_output-inl.h | 8 ++++---- 16 files changed, 25 insertions(+), 25 deletions(-) diff --git a/src/operator/channel_op_common.h b/src/operator/channel_op_common.h index 00cd8ae084bb..1afc13ad2594 100644 --- a/src/operator/channel_op_common.h +++ b/src/operator/channel_op_common.h @@ -44,7 +44,7 @@ inline void concatenate_helper(const std::vector out = *output; size_t size = input.size(); index_t begin = 0; - for (index_t i = 0; i < size; ++i) { + for (size_t i = 0; i < size; ++i) { index_t end = begin + input[i].size(cdim); Assign(slice(out, begin, end), req, input[i]); begin = end; @@ -79,7 +79,7 @@ void split_helper(const mshadow::Tensor &input, std::vector > out = *output; size_t size = out.size(); index_t begin = 0; - for (index_t i = 0; i < size; ++i) { + for (size_t i = 0; i < size; ++i) { index_t end = begin + out[i].size(cdim); Assign(out[i], req[i], slice(input, begin, end)); begin = end; diff --git a/src/operator/contrib/count_sketch-inl.h b/src/operator/contrib/count_sketch-inl.h index 76d1a7efb876..dd3bf54ab6a6 100644 --- a/src/operator/contrib/count_sketch-inl.h +++ b/src/operator/contrib/count_sketch-inl.h @@ -185,7 +185,7 @@ class CountSketchProp : public OperatorProperty { CHECK_GE(in_type->size(), 1); int dtype = (*in_type)[0]; CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { + for (size_t i = 0; i < in_type->size(); ++i) { if ((*in_type)[i] == -1) { (*in_type)[i] = dtype; } else { diff --git a/src/operator/contrib/deformable_convolution-inl.h b/src/operator/contrib/deformable_convolution-inl.h index 480f675bdbff..7328eb38308f 100644 --- a/src/operator/contrib/deformable_convolution-inl.h +++ b/src/operator/contrib/deformable_convolution-inl.h @@ -129,7 +129,7 @@ class DeformableConvolutionOp : public Operator { // calculate the shape of col_buffer TShape col_buffer_shape(num_spatial_axes_ + 1); 
col_buffer_shape[0] = conv_in_channels_ * param_.kernel.Size(); - for (index_t i = 1; i < col_buffer_shape.ndim(); ++i) { + for (size_t i = 1; i < col_buffer_shape.ndim(); ++i) { col_buffer_shape[i] = out_data[0].shape_[i + 1]; } // create a column buffer using workspace and col_buffer_shape @@ -453,7 +453,7 @@ class DeformableConvolutionProp : public OperatorProperty { CHECK_GE(in_type->size(), 1U); int dtype = (*in_type)[0]; CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { + for (size_t i = 0; i < in_type->size(); ++i) { if ((*in_type)[i] == -1) { (*in_type)[i] = dtype; } else { diff --git a/src/operator/contrib/fft-inl.h b/src/operator/contrib/fft-inl.h index be7b64aeb0c6..c5c8574f19e7 100644 --- a/src/operator/contrib/fft-inl.h +++ b/src/operator/contrib/fft-inl.h @@ -258,7 +258,7 @@ class FFTProp : public OperatorProperty { CHECK_GE(in_type->size(), 1); int dtype = (*in_type)[0]; CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { + for (size_t i = 0; i < in_type->size(); ++i) { if ((*in_type)[i] == -1) { (*in_type)[i] = dtype; } else { diff --git a/src/operator/contrib/ifft-inl.h b/src/operator/contrib/ifft-inl.h index e48d653d9274..da560c3c5178 100644 --- a/src/operator/contrib/ifft-inl.h +++ b/src/operator/contrib/ifft-inl.h @@ -250,7 +250,7 @@ class IFFTProp : public OperatorProperty { CHECK_GE(in_type->size(), 1); int dtype = (*in_type)[0]; CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i=0; i < in_type->size(); ++i) { + for (size_t i=0; i < in_type->size(); ++i) { if ((*in_type)[i] == -1) { (*in_type)[i] = dtype; } else { diff --git a/src/operator/contrib/sync_batch_norm-inl.h b/src/operator/contrib/sync_batch_norm-inl.h index 1f548dbc7e5e..78f1c09dfe03 100644 --- a/src/operator/contrib/sync_batch_norm-inl.h +++ b/src/operator/contrib/sync_batch_norm-inl.h @@ -500,14 +500,14 @@ class SyncBatchNormProp 
: public OperatorProperty { // For other input types, these parameters have the same type as input // NOTE: This requirement is from cuDNN (v. 4 and 5) int dtype_param = (dtype == kFloat16) ? kFloat32 : dtype; - for (index_t i = 1; i < in_type->size(); ++i) { + for (size_t i = 1; i < in_type->size(); ++i) { if ((*in_type)[i] == -1) { (*in_type)[i] = dtype_param; } else { UNIFORM_TYPE_CHECK((*in_type)[i], dtype_param, ListArguments()[i]); } } - for (index_t i = 0; i < aux_type->size(); ++i) { + for (size_t i = 0; i < aux_type->size(); ++i) { if ((*aux_type)[i] != -1) { UNIFORM_TYPE_CHECK((*aux_type)[i], dtype_param, ListArguments()[i]); } diff --git a/src/operator/custom/custom.cc b/src/operator/custom/custom.cc index 11e5e36dbeb0..c6ae61feb2c0 100644 --- a/src/operator/custom/custom.cc +++ b/src/operator/custom/custom.cc @@ -238,14 +238,14 @@ std::vector Gradient( } std::vector ret; - for (index_t i = 0; i < params.num_args; ++i) { + for (size_t i = 0; i < params.num_args; ++i) { ret.emplace_back(nnvm::NodeEntry{g, static_cast(i), 0}); } if (params.num_auxs) { nnvm::NodePtr ng = nnvm::Node::Create(); ng->attrs.op = nnvm::Op::Get("_NoGradient"); ng->attrs.name = "NoGradient"; - for (index_t i = 0; i < params.num_auxs; ++i) { + for (size_t i = 0; i < params.num_auxs; ++i) { ret.emplace_back(nnvm::NodeEntry{ng, 0, 0}); } } diff --git a/src/operator/custom/native_op-inl.h b/src/operator/custom/native_op-inl.h index d2fb1149f7b5..f2eca05e78f0 100644 --- a/src/operator/custom/native_op-inl.h +++ b/src/operator/custom/native_op-inl.h @@ -77,7 +77,7 @@ class NativeOp : public Operator { s->Wait(); param_.pinfo->forward(ptrs.size(), ptrs.data(), ndims.data(), shapes.data(), tags.data(), param_.pinfo->p_forward); - for (index_t i = 0; i < out_data.size(); ++i) { + for (size_t i = 0; i < out_data.size(); ++i) { CHECK_NE(req[i], kAddTo) << "NativeOp doesn't support AddTo for output"; if (req[i] != kNullOp) { std::stringstream ss; @@ -111,7 +111,7 @@ class NativeOp : public 
Operator { s->Wait(); param_.pinfo->backward(ptrs.size(), ptrs.data(), ndims.data(), shapes.data(), tags.data(), param_.pinfo->p_backward); - for (index_t i = 0; i < in_grad.size(); ++i) { + for (size_t i = 0; i < in_grad.size(); ++i) { CHECK_NE(req[i], kAddTo) << "NativeOp doesn't support AddTo for output"; if (req[i] != kNullOp) { std::stringstream ss; diff --git a/src/operator/nn/batch_norm.cc b/src/operator/nn/batch_norm.cc index 30fb665dd05a..c11f98026865 100644 --- a/src/operator/nn/batch_norm.cc +++ b/src/operator/nn/batch_norm.cc @@ -362,7 +362,7 @@ static bool BatchNormType(const nnvm::NodeAttrs& attrs, dtype_param = mshadow::DataType::kFlag; }); std::vector args{"data", "gamma", "beta", "mean", "var"}; CHECK_LE(in_type->size(), args.size()); - for (index_t i = 1; i < in_type->size(); ++i) { + for (size_t i = 1; i < in_type->size(); ++i) { if ((*in_type)[i] == -1) { (*in_type)[i] = dtype_param; } else { diff --git a/src/operator/nn/convolution.cc b/src/operator/nn/convolution.cc index ef70ccd6ec1e..42a4ed240587 100644 --- a/src/operator/nn/convolution.cc +++ b/src/operator/nn/convolution.cc @@ -276,7 +276,7 @@ static bool ConvolutionType(const nnvm::NodeAttrs& attrs, CHECK_GE(in_type->size(), 1U); int dtype = (*in_type)[0]; CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { + for (size_t i = 0; i < in_type->size(); ++i) { if ((*in_type)[i] == -1) { (*in_type)[i] = dtype; } else { diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index 027777ce5e0c..b627fc441ffa 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -130,7 +130,7 @@ struct DeconvolutionParam : public dmlc::Parameter { if (bCal) { size_t input_ndim = input.ndim(); - for (index_t i = 0; i < ndim; i++) { + for (size_t i = 0; i < ndim; i++) { // input.ndim() can be larger than ndim, in case that the complete input // shape was passed and not only the ndim 
last ones o_pad[i] = stride[i] * (input[(input_ndim - ndim) + i] - 1) + DilatedKernelSize(i); @@ -140,7 +140,7 @@ struct DeconvolutionParam : public dmlc::Parameter { o_pad[i] = (o_pad[i] + 1) / 2; } } else { - for (index_t i = 0; i < ndim; i++) { + for (size_t i = 0; i < ndim; i++) { o_pad[i] = pad[i]; o_adj[i] = adj[i]; } diff --git a/src/operator/nn/deconvolution.cc b/src/operator/nn/deconvolution.cc index 9e0a70121bf9..8bbcc6780faf 100644 --- a/src/operator/nn/deconvolution.cc +++ b/src/operator/nn/deconvolution.cc @@ -244,7 +244,7 @@ static bool DeconvolutionType(const nnvm::NodeAttrs& attrs, CHECK_GE(in_type->size(), 1U); int dtype = (*in_type)[0]; CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { + for (size_t i = 0; i < in_type->size(); ++i) { if ((*in_type)[i] == -1) { (*in_type)[i] = dtype; } else { diff --git a/src/operator/nn/lrn.cc b/src/operator/nn/lrn.cc index 6b3d7c818378..056dbaa53621 100644 --- a/src/operator/nn/lrn.cc +++ b/src/operator/nn/lrn.cc @@ -56,7 +56,7 @@ bool LRNType(const nnvm::NodeAttrs& attrs, CHECK_GE(in_type->size(), 1U); int dtype = (*in_type)[0]; CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { + for (size_t i = 0; i < in_type->size(); ++i) { if ((*in_type)[i] == -1) { (*in_type)[i] = dtype; } else { diff --git a/src/operator/nn/pooling-inl.h b/src/operator/nn/pooling-inl.h index ad74a8feae39..d2a370cf6505 100644 --- a/src/operator/nn/pooling-inl.h +++ b/src/operator/nn/pooling-inl.h @@ -258,7 +258,7 @@ void PoolingCompute(const nnvm::NodeAttrs& attrs, const std::vector& outputs) { const PoolingParam& param = nnvm::get(attrs.parsed); CHECK_EQ(inputs.size(), 1U); - CHECK_EQ(outputs.size(), GetNumOutputs(param)); + CHECK_EQ(outputs.size(), static_cast(GetNumOutputs(param)); if (!param.global_pool) { // check if filter size assigned correctly CHECK_GT(param.kernel.ndim(), 0U) diff --git 
a/src/operator/nn/upsampling.cc b/src/operator/nn/upsampling.cc index 5aa111e26f75..b6b3d873df7d 100644 --- a/src/operator/nn/upsampling.cc +++ b/src/operator/nn/upsampling.cc @@ -92,7 +92,7 @@ static bool UpSamplingType(const nnvm::NodeAttrs& attrs, CHECK_GE(in_type->size(), 1U); int dtype = (*in_type)[0]; CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { + for (size_t i = 0; i < in_type->size(); ++i) { if ((*in_type)[i] == -1) { (*in_type)[i] = dtype; } else { diff --git a/src/operator/softmax_output-inl.h b/src/operator/softmax_output-inl.h index 9a4db2c9694a..06017853b078 100644 --- a/src/operator/softmax_output-inl.h +++ b/src/operator/softmax_output-inl.h @@ -185,8 +185,8 @@ class SoftmaxOutputOp : public Operator { ctx.requested[softmaxout_enum::kTempSpace].get_host_space_typed<2, DType>( label.shape_); Copy(workspace, label, label.stream_); - for (index_t i = 0; i < workspace.size(0); ++i) { - for (index_t j = 0; j < workspace.size(1); ++j) { + for (size_t i = 0; i < workspace.size(0); ++i) { + for (size_t j = 0; j < workspace.size(1); ++j) { if (static_cast(workspace[i][j]) == i_label) { valid_cnt--; } @@ -245,7 +245,7 @@ class SoftmaxOutputOp : public Operator { ctx.requested[softmaxout_enum::kTempSpace].get_host_space_typed<1, DType>( label.shape_); Copy(workspace, label, label.stream_); - for (index_t i = 0; i < label.size(0); ++i) { + for (size_t i = 0; i < label.size(0); ++i) { if (static_cast(workspace[i]) == i_label) { valid_cnt--; } @@ -333,7 +333,7 @@ class SoftmaxOutputProp : public OperatorProperty { CHECK_GE(in_type->size(), 1U); int dtype = (*in_type)[0]; CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { + for (size_t i = 0; i < in_type->size(); ++i) { if ((*in_type)[i] == -1) { (*in_type)[i] = dtype; } else { From 1d3eb3e9351bd4627c21b8c336dbe9a068c86083 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Wed, 1 Aug 2018 
23:19:10 -0700 Subject: [PATCH 10/40] Fix compilation warning --- src/executor/graph_executor.cc | 2 +- src/operator/nn/pooling-inl.h | 6 +++--- src/operator/nn/pooling.cc | 12 ++++++------ src/operator/sequence_last-inl.h | 2 +- src/operator/sequence_mask-inl.h | 2 +- src/operator/sequence_reverse-inl.h | 2 +- src/operator/softmax_output-inl.h | 6 +++--- src/operator/svm_output-inl.h | 2 +- src/operator/tensor/matrix_op-inl.h | 6 +++--- 9 files changed, 20 insertions(+), 20 deletions(-) diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index 7386de4d12e3..526d307c9940 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -1576,7 +1576,7 @@ void GraphExecutor::ExecuteMonCallback(size_t nid) { } } CHECK_EQ(opnode.exec->out_array.size(), output_names.size()); - for (index_t i = 0; i < opnode.exec->out_array.size(); ++i) { + for (size_t i = 0; i < opnode.exec->out_array.size(); ++i) { NDArray *cpy = new NDArray(opnode.exec->out_array[i]); std::string name = inode.source->attrs.name + "_" + output_names[i]; this->monitor_callback_(name.c_str(), reinterpret_cast(cpy)); diff --git a/src/operator/nn/pooling-inl.h b/src/operator/nn/pooling-inl.h index d2a370cf6505..38fd7fb4296b 100644 --- a/src/operator/nn/pooling-inl.h +++ b/src/operator/nn/pooling-inl.h @@ -135,8 +135,8 @@ namespace op { * When MKLDNN is enabled, we might want 2 outputs instead of one inputs, which * also changes the number of inputs for backward. 
*/ -int GetNumOutputs(const PoolingParam ¶m); -int GetNumBackInputs(const PoolingParam ¶m); +size_t GetNumOutputs(const PoolingParam ¶m); +size_t GetNumBackInputs(const PoolingParam ¶m); template class PoolingOp { @@ -258,7 +258,7 @@ void PoolingCompute(const nnvm::NodeAttrs& attrs, const std::vector& outputs) { const PoolingParam& param = nnvm::get(attrs.parsed); CHECK_EQ(inputs.size(), 1U); - CHECK_EQ(outputs.size(), static_cast(GetNumOutputs(param)); + CHECK_EQ(outputs.size(), GetNumOutputs(param)); if (!param.global_pool) { // check if filter size assigned correctly CHECK_GT(param.kernel.ndim(), 0U) diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc index 9b6996d0feb0..7982c0c3bd3e 100644 --- a/src/operator/nn/pooling.cc +++ b/src/operator/nn/pooling.cc @@ -57,19 +57,19 @@ void PoolingParamParser(nnvm::NodeAttrs *attrs) { attrs->parsed = std::move(param); } -int GetNumOutputs(const PoolingParam ¶m) { +size_t GetNumOutputs(const PoolingParam ¶m) { #if MXNET_USE_MKLDNN == 1 - return MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param) ? 2 : 1; + return MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param) ? 2U : 1U; #else - return 1; + return 1U; #endif } -int GetNumBackInputs(const PoolingParam ¶m) { +size_t GetNumBackInputs(const PoolingParam ¶m) { #if MXNET_USE_MKLDNN == 1 - return MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param) ? 5 : 3; + return MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param) ? 5U : 3U; #else - return 3; + return 3U; #endif } diff --git a/src/operator/sequence_last-inl.h b/src/operator/sequence_last-inl.h index 58562862a4e0..1a59473cfc3a 100644 --- a/src/operator/sequence_last-inl.h +++ b/src/operator/sequence_last-inl.h @@ -278,7 +278,7 @@ class SequenceLastProp : public OperatorProperty { CHECK_GE(in_type->size(), param_.use_sequence_length ? 
2U : 1U); int dtype = (*in_type)[0]; CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { + for (size_t i = 0; i < in_type->size(); ++i) { if ((*in_type)[i] == -1) { (*in_type)[i] = dtype; } else { diff --git a/src/operator/sequence_mask-inl.h b/src/operator/sequence_mask-inl.h index a34cea04965e..c93ffb5f17b6 100644 --- a/src/operator/sequence_mask-inl.h +++ b/src/operator/sequence_mask-inl.h @@ -267,7 +267,7 @@ class SequenceMaskProp : public OperatorProperty { CHECK_GE(in_type->size(), param_.use_sequence_length ? 2U : 1U); int dtype = (*in_type)[0]; CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { + for (size_t i = 0; i < in_type->size(); ++i) { if ((*in_type)[i] == -1) { (*in_type)[i] = dtype; } else { diff --git a/src/operator/sequence_reverse-inl.h b/src/operator/sequence_reverse-inl.h index 943ca6e933c9..5c48729e18ff 100644 --- a/src/operator/sequence_reverse-inl.h +++ b/src/operator/sequence_reverse-inl.h @@ -246,7 +246,7 @@ class SequenceReverseProp : public OperatorProperty { CHECK_GE(in_type->size(), param_.use_sequence_length ? 
2U : 1U); int dtype = (*in_type)[0]; CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { + for (size_t i = 0; i < in_type->size(); ++i) { if ((*in_type)[i] == -1) { (*in_type)[i] = dtype; } else { diff --git a/src/operator/softmax_output-inl.h b/src/operator/softmax_output-inl.h index 06017853b078..fec321b97e4c 100644 --- a/src/operator/softmax_output-inl.h +++ b/src/operator/softmax_output-inl.h @@ -185,8 +185,8 @@ class SoftmaxOutputOp : public Operator { ctx.requested[softmaxout_enum::kTempSpace].get_host_space_typed<2, DType>( label.shape_); Copy(workspace, label, label.stream_); - for (size_t i = 0; i < workspace.size(0); ++i) { - for (size_t j = 0; j < workspace.size(1); ++j) { + for (index_t i = 0; i < workspace.size(0); ++i) { + for (index_t j = 0; j < workspace.size(1); ++j) { if (static_cast(workspace[i][j]) == i_label) { valid_cnt--; } @@ -245,7 +245,7 @@ class SoftmaxOutputOp : public Operator { ctx.requested[softmaxout_enum::kTempSpace].get_host_space_typed<1, DType>( label.shape_); Copy(workspace, label, label.stream_); - for (size_t i = 0; i < label.size(0); ++i) { + for (index_t i = 0; i < label.size(0); ++i) { if (static_cast(workspace[i]) == i_label) { valid_cnt--; } diff --git a/src/operator/svm_output-inl.h b/src/operator/svm_output-inl.h index 9ae0ced7a74a..011b9ad10284 100644 --- a/src/operator/svm_output-inl.h +++ b/src/operator/svm_output-inl.h @@ -159,7 +159,7 @@ class SVMOutputProp : public OperatorProperty { CHECK_GE(in_type->size(), 1U); int dtype = (*in_type)[0]; CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { + for (size_t i = 0; i < in_type->size(); ++i) { if ((*in_type)[i] == -1) { (*in_type)[i] = dtype; } else { diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h index eec920555ed1..05a60eb345b3 100644 --- a/src/operator/tensor/matrix_op-inl.h +++ 
b/src/operator/tensor/matrix_op-inl.h @@ -86,7 +86,7 @@ inline TShape InferReshapeShape(const nnvm::Tuple& shape, } auto dshape_len = dshape_vec.size(); auto params_len = param_shape_vec.size(); - for (index_t i = 0; i < params_len; ++i) { + for (size_t i = 0; i < params_len; ++i) { IType proposed_dim = param_shape_vec[i]; if (proposed_dim == 0) { // keep same @@ -2061,7 +2061,7 @@ void StackOpForward(const nnvm::NodeAttrs& attrs, Shape<3> oshape = Shape3(leading, mid, trailing); out = outputs[0].get_with_shape(oshape, s); - for (index_t i = 0; i < inputs.size(); ++i) { + for (size_t i = 0; i < inputs.size(); ++i) { Shape<3> dshape = Shape3(leading, 1, trailing); data[i] = inputs[i].get_with_shape(dshape, s); } @@ -2095,7 +2095,7 @@ void StackOpBackward(const nnvm::NodeAttrs& attrs, Shape<3> oshape = Shape3(leading, mid, trailing); grad = inputs[0].get_with_shape(oshape, s); - for (index_t i = 0; i < outputs.size(); ++i) { + for (size_t i = 0; i < outputs.size(); ++i) { Shape<3> dshape = Shape3(leading, 1, trailing); grad_in[i] = outputs[i].get_with_shape(dshape, s); } From 081e11163db57170e906e8414adb215359c20c27 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Thu, 2 Aug 2018 11:06:12 -0700 Subject: [PATCH 11/40] Change index variable type to size_t --- src/io/image_iter_common.h | 4 ++-- src/ndarray/ndarray.cc | 12 ++++++------ src/operator/batch_norm_v1-inl.h | 4 ++-- src/operator/rnn-inl.h | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/io/image_iter_common.h b/src/io/image_iter_common.h index 8580ff8f9f9c..07e342579ec8 100644 --- a/src/io/image_iter_common.h +++ b/src/io/image_iter_common.h @@ -42,7 +42,7 @@ class ImageLabelMap { * \param label_width predefined label_width */ explicit ImageLabelMap(const char *path_imglist, - mshadow::index_t label_width, + index_t label_width, bool silent) { this->label_width = label_width; image_index_.clear(); @@ -58,7 +58,7 @@ class ImageLabelMap { // skip space while (isspace(*p) && p != end) ++p; 
image_index_.push_back(static_cast(atol(p))); - for (size_t i = 0; i < label_width; ++i) { + for (index_t i = 0; i < label_width; ++i) { // skip till space while (!isspace(*p) && p != end) ++p; // skip space diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 853838a87f4c..335587735084 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -2077,10 +2077,10 @@ void Imdecode(NDArray *ret, NDArray mean, size_t index, if (mean.is_none()) { MSHADOW_TYPE_SWITCH(buff.dtype(), DType, { mshadow::Tensor tensor = buff.data().get(); - for (index_t i = 0; i < y1-y0; i++) { + for (size_t i = 0; i < y1-y0; i++) { uchar* im_data = res.ptr(y0+i) + res.channels()*x0; - for (index_t j = 0; j < x1-x0; j++) { - for (index_t k = 0; k < n_channels; k++) { + for (size_t j = 0; j < x1-x0; j++) { + for (size_t k = 0; k < n_channels; k++) { tensor[0][k][i][j] = DType(im_data[k]); // NOLINT(*) } im_data += res.channels(); @@ -2097,10 +2097,10 @@ void Imdecode(NDArray *ret, NDArray mean, size_t index, MSHADOW_TYPE_SWITCH(buff.dtype(), DType, { mshadow::Tensor tensor = buff.data().get(); mshadow::Tensor tmean = mean.data().get(); - for (index_t i = 0; i < y1-y0; i++) { + for (size_t i = 0; i < y1-y0; i++) { uchar* im_data = res.ptr(y0+i) + res.channels()*x0; - for (index_t j = 0; j < x1-x0; j++) { - for (index_t k = 0; k < n_channels; k++) { + for (size_t j = 0; j < x1-x0; j++) { + for (size_t k = 0; k < n_channels; k++) { tensor[0][k][i][j] = DType(im_data[k]) - tmean[k][i][j]; // NOLINT(*) } im_data += res.channels(); diff --git a/src/operator/batch_norm_v1-inl.h b/src/operator/batch_norm_v1-inl.h index 1e048452275c..f4116e30186e 100644 --- a/src/operator/batch_norm_v1-inl.h +++ b/src/operator/batch_norm_v1-inl.h @@ -286,14 +286,14 @@ class BatchNormV1Prop : public OperatorProperty { // For other input types, these parameters have the same type as input // NOTE: This requirement is from cuDNN (v. 4 and 5) int dtype_param = (dtype == kFloat16) ? 
kFloat32 : dtype; - for (index_t i = 1; i < in_type->size(); ++i) { + for (size_t i = 1; i < in_type->size(); ++i) { if ((*in_type)[i] == -1) { (*in_type)[i] = dtype_param; } else { UNIFORM_TYPE_CHECK((*in_type)[i], dtype_param, ListArguments()[i]); } } - for (index_t i = 0; i < aux_type->size(); ++i) { + for (size_t i = 0; i < aux_type->size(); ++i) { if ((*aux_type)[i] != -1) { UNIFORM_TYPE_CHECK((*aux_type)[i], dtype_param, ListArguments()[i]); } diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 1f905eda4a92..c511cef515bb 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -688,7 +688,7 @@ class RNNProp : public OperatorProperty { CHECK_GE(in_type->size(), 1U); int dtype = (*in_type)[0]; CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { + for (size_t i = 0; i < in_type->size(); ++i) { if ((*in_type)[i] == -1) { (*in_type)[i] = dtype; } else { From 22327482006080d2bd2016f52a87326cbc540776 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Wed, 11 Jul 2018 21:29:54 -0700 Subject: [PATCH 12/40] Fix integer overflow when the array size is too large --- src/operator/convolution_v1-inl.h | 4 ++-- src/operator/nn/deconvolution-inl.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/operator/convolution_v1-inl.h b/src/operator/convolution_v1-inl.h index d8310e6f1fc0..1ac611d31185 100644 --- a/src/operator/convolution_v1-inl.h +++ b/src/operator/convolution_v1-inl.h @@ -337,10 +337,10 @@ class ConvolutionV1Op : public Operator { // if param_.workspace is set to zero the nstep_ equals ishape[0] (batch) nstep_ = std::max( std::min( - static_cast( + static_cast( param_.workspace / (shape_colunit_.Size() + shape_dstunit_.Size())), ishape[0]), - 1U); + 1UL); mshadow::Shape<2> scol = mshadow::Shape2(shape_colunit_[0], shape_colunit_[1] * nstep_); diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index 
b41ecf4aa41e..789633be388c 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -460,10 +460,10 @@ class DeconvolutionOp { // See convolution for workspace calculations. nstep_ will be the effective batch size nstep_ = std::max( std::min( - static_cast( + static_cast( param_.workspace / (shape_colunit_.Size() + shape_dstunit_.Size())), ishape[0]), - 1U); + 1UL); mshadow::Shape<2> scol = mshadow::Shape2(shape_colunit_[0], shape_colunit_[1] * nstep_); From a38ca56a1061a4e506ee712cc54200aafc4930c2 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Mon, 16 Jul 2018 05:44:40 -0700 Subject: [PATCH 13/40] Update issue templates --- .github/ISSUE_TEMPLATE/bug_report.md | 35 ++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 000000000000..b735373365bc --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,35 @@ +--- +name: Bug report +about: Create a report to help us improve + +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**To Reproduce** +Steps to reproduce the behavior: +1. Go to '...' +2. Click on '....' +3. Scroll down to '....' +4. See error + +**Expected behavior** +A clear and concise description of what you expected to happen. + +**Screenshots** +If applicable, add screenshots to help explain your problem. + +**Desktop (please complete the following information):** + - OS: [e.g. iOS] + - Browser [e.g. chrome, safari] + - Version [e.g. 22] + +**Smartphone (please complete the following information):** + - Device: [e.g. iPhone6] + - OS: [e.g. iOS8.1] + - Browser [e.g. stock browser, safari] + - Version [e.g. 22] + +**Additional context** +Add any other context about the problem here. 
From 18bfa716c096f81ea353ca2e2d0f366d146d4459 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Tue, 17 Jul 2018 11:07:38 -0700 Subject: [PATCH 14/40] Remove files added by mistake --- .github/ISSUE_TEMPLATE/bug_report.md | 35 ---------------------------- 1 file changed, 35 deletions(-) delete mode 100644 .github/ISSUE_TEMPLATE/bug_report.md diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md deleted file mode 100644 index b735373365bc..000000000000 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ /dev/null @@ -1,35 +0,0 @@ ---- -name: Bug report -about: Create a report to help us improve - ---- - -**Describe the bug** -A clear and concise description of what the bug is. - -**To Reproduce** -Steps to reproduce the behavior: -1. Go to '...' -2. Click on '....' -3. Scroll down to '....' -4. See error - -**Expected behavior** -A clear and concise description of what you expected to happen. - -**Screenshots** -If applicable, add screenshots to help explain your problem. - -**Desktop (please complete the following information):** - - OS: [e.g. iOS] - - Browser [e.g. chrome, safari] - - Version [e.g. 22] - -**Smartphone (please complete the following information):** - - Device: [e.g. iPhone6] - - OS: [e.g. iOS8.1] - - Browser [e.g. stock browser, safari] - - Version [e.g. 22] - -**Additional context** -Add any other context about the problem here. 
From 74ae95a872ffa388f05c29b021f47c8018cacc66 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Tue, 17 Jul 2018 14:50:23 -0700 Subject: [PATCH 15/40] Fix compilation error after type index_t changed to int64_t --- src/c_api/c_api_function.cc | 2 +- src/io/iter_image_recordio_2.cc | 5 +++-- src/operator/convolution_v1-inl.h | 4 ++-- src/operator/custom/custom.cc | 2 +- src/operator/elemwise_op_common.h | 4 ++-- src/operator/nn/deconvolution-inl.h | 4 ++-- src/operator/operator_common.h | 4 ++-- src/operator/tensor/ordering_op-inl.h | 2 +- src/operator/tensor/ordering_op.cc | 4 ++-- 9 files changed, 16 insertions(+), 15 deletions(-) diff --git a/src/c_api/c_api_function.cc b/src/c_api/c_api_function.cc index cea8c9553ccf..83b77202f22e 100644 --- a/src/c_api/c_api_function.cc +++ b/src/c_api/c_api_function.cc @@ -56,7 +56,7 @@ std::vector Gradient( std::vector ret; for (index_t i = 0; i < g->num_outputs(); ++i) { - ret.emplace_back(nnvm::NodeEntry{g, i, 0}); + ret.emplace_back(nnvm::NodeEntry{g, static_cast(i), 0}); } return ret; diff --git a/src/io/iter_image_recordio_2.cc b/src/io/iter_image_recordio_2.cc index b6ff6e99b034..ac166dd2fba7 100644 --- a/src/io/iter_image_recordio_2.cc +++ b/src/io/iter_image_recordio_2.cc @@ -326,7 +326,8 @@ inline bool ImageRecordIOParser2::ParseNext(DataBatch *out) { n_to_out = 0; } } else { - int n_to_copy = std::min(n_parsed_, batch_param_.batch_size - current_size); + int n_to_copy = std::min(n_parsed_, + static_cast(batch_param_.batch_size) - current_size); n_parsed_ -= n_to_copy; // Copy #pragma omp parallel for num_threads(param_.preprocess_threads) @@ -604,7 +605,7 @@ inline unsigned ImageRecordIOParser2::ParseChunk(DType* data_dptr, real_t res.release(); } } - return (std::min(batch_param_.batch_size, gl_idx) - current_size); + return (std::min(static_cast(batch_param_.batch_size), gl_idx) - current_size); #else LOG(FATAL) << "Opencv is needed for image decoding and augmenting."; return 0; diff --git 
a/src/operator/convolution_v1-inl.h b/src/operator/convolution_v1-inl.h index 1ac611d31185..8435c15e1114 100644 --- a/src/operator/convolution_v1-inl.h +++ b/src/operator/convolution_v1-inl.h @@ -337,10 +337,10 @@ class ConvolutionV1Op : public Operator { // if param_.workspace is set to zero the nstep_ equals ishape[0] (batch) nstep_ = std::max( std::min( - static_cast( + static_cast( param_.workspace / (shape_colunit_.Size() + shape_dstunit_.Size())), ishape[0]), - 1UL); + 1LL); mshadow::Shape<2> scol = mshadow::Shape2(shape_colunit_[0], shape_colunit_[1] * nstep_); diff --git a/src/operator/custom/custom.cc b/src/operator/custom/custom.cc index d117a2842166..11e5e36dbeb0 100644 --- a/src/operator/custom/custom.cc +++ b/src/operator/custom/custom.cc @@ -239,7 +239,7 @@ std::vector Gradient( std::vector ret; for (index_t i = 0; i < params.num_args; ++i) { - ret.emplace_back(nnvm::NodeEntry{g, i, 0}); + ret.emplace_back(nnvm::NodeEntry{g, static_cast(i), 0}); } if (params.num_auxs) { nnvm::NodePtr ng = nnvm::Node::Create(); diff --git a/src/operator/elemwise_op_common.h b/src/operator/elemwise_op_common.h index 16aa0c388cd1..1377d2db2536 100644 --- a/src/operator/elemwise_op_common.h +++ b/src/operator/elemwise_op_common.h @@ -199,7 +199,7 @@ struct ElemwiseGradUseOut { std::vector heads; index_t n_out = n->num_outputs(); for (index_t i = 0; i < n_out; ++i) { - heads.emplace_back(nnvm::NodeEntry{n, i, 0}); + heads.emplace_back(nnvm::NodeEntry{n, static_cast(i), 0}); } return MakeNonlossGradNode(op_name, n, ograds, heads, n->attrs.dict); } @@ -216,7 +216,7 @@ struct ElemwiseGradUseInOut { } index_t n_out = n->num_outputs(); for (index_t i = 0; i < n_out; ++i) { - heads.emplace_back(nnvm::NodeEntry{n, i, 0}); + heads.emplace_back(nnvm::NodeEntry{n, static_cast(i), 0}); } return MakeGradNode(op_name, n, heads, n->attrs.dict); } diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index 789633be388c..f1837463f729 100644 --- 
a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -460,10 +460,10 @@ class DeconvolutionOp { // See convolution for workspace calculations. nstep_ will be the effective batch size nstep_ = std::max( std::min( - static_cast( + static_cast( param_.workspace / (shape_colunit_.Size() + shape_dstunit_.Size())), ishape[0]), - 1UL); + 1LL); mshadow::Shape<2> scol = mshadow::Shape2(shape_colunit_[0], shape_colunit_[1] * nstep_); diff --git a/src/operator/operator_common.h b/src/operator/operator_common.h index 29112939a22f..a25bf1f203cc 100644 --- a/src/operator/operator_common.h +++ b/src/operator/operator_common.h @@ -396,7 +396,7 @@ inline std::vector MakeGradNode( &inputs, &dict, &n); std::vector ret; for (index_t i = 0; i < p->num_outputs(); ++i) { - ret.emplace_back(nnvm::NodeEntry{p, i, 0}); + ret.emplace_back(nnvm::NodeEntry{p, static_cast(i), 0}); } return ret; } @@ -446,7 +446,7 @@ inline std::vector MakeNonlossGradNode( p->inputs.insert(p->inputs.end(), inputs.begin(), inputs.end()); std::vector ret; for (index_t i = 0; i < p->num_outputs(); ++i) { - ret.emplace_back(nnvm::NodeEntry{p, i, 0}); + ret.emplace_back(nnvm::NodeEntry{p, static_cast(i), 0}); } return ret; } diff --git a/src/operator/tensor/ordering_op-inl.h b/src/operator/tensor/ordering_op-inl.h index 105ee8b90db8..cd1e89e447c7 100644 --- a/src/operator/tensor/ordering_op-inl.h +++ b/src/operator/tensor/ordering_op-inl.h @@ -367,7 +367,7 @@ void TopKImpl(RunContext ctx, // Additional temp space for gpu full sorts for batch ids. temp_size += sizeof(int) * src.Size(); // Temp space for cpu sorts. 
- temp_size = std::max(temp_size, sizeof(real_t) * src.Size()); + temp_size = std::max(temp_size, sizeof(real_t) * static_cast(src.Size())); size_t workspace_size = temp_size + sizeof(real_t) * src.Size() + sizeof(int) * src.Size(); if (param.ret_typ == topk_enum::kReturnMask) { workspace_size += sizeof(int) * batch_size * k + sizeof(real_t) * batch_size * k; diff --git a/src/operator/tensor/ordering_op.cc b/src/operator/tensor/ordering_op.cc index ebd7c62ec886..0cfa095e5523 100644 --- a/src/operator/tensor/ordering_op.cc +++ b/src/operator/tensor/ordering_op.cc @@ -75,7 +75,7 @@ Examples:: std::vector inputs; index_t n_out = n->num_outputs(); for (index_t i = 0; i < n_out; ++i) { - inputs.emplace_back(nnvm::NodeEntry{ n, i, 0 }); + inputs.emplace_back(nnvm::NodeEntry{ n, static_cast(i), 0 }); } return MakeNonlossGradNode("_backward_topk", n, {ograds[0]}, inputs, n->attrs.dict); } else { @@ -137,7 +137,7 @@ Examples:: std::vector inputs; index_t n_out = n->num_outputs(); for (index_t i = 0; i < n_out; ++i) { - inputs.emplace_back(nnvm::NodeEntry{ n, i, 0 }); + inputs.emplace_back(nnvm::NodeEntry{ n, static_cast(i), 0 }); } return MakeNonlossGradNode("_backward_topk", n, {ograds[0]}, inputs, {{"axis", n->attrs.dict["axis"]}, From 2f62006fe01e5b5cc162e33d30b9c0c13e88f680 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 17 Jul 2018 23:18:10 +0000 Subject: [PATCH 16/40] Explicity specify type in std::max template to avoid platform dependent compilation error --- src/operator/convolution_v1-inl.h | 4 ++-- src/operator/nn/deconvolution-inl.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/operator/convolution_v1-inl.h b/src/operator/convolution_v1-inl.h index 8435c15e1114..fcb24915eb75 100644 --- a/src/operator/convolution_v1-inl.h +++ b/src/operator/convolution_v1-inl.h @@ -335,12 +335,12 @@ class ConvolutionV1Op : public Operator { oshape[2] * oshape[3]); // param_.workspace is in elements of sizeof(DType) // if param_.workspace is set to 
zero the nstep_ equals ishape[0] (batch) - nstep_ = std::max( + nstep_ = std::max( std::min( static_cast( param_.workspace / (shape_colunit_.Size() + shape_dstunit_.Size())), ishape[0]), - 1LL); + 1); mshadow::Shape<2> scol = mshadow::Shape2(shape_colunit_[0], shape_colunit_[1] * nstep_); diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index f1837463f729..027777ce5e0c 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -458,12 +458,12 @@ class DeconvolutionOp { oshape[1] / param_.num_group, oshape[2] * oshape[3]); // See convolution for workspace calculations. nstep_ will be the effective batch size - nstep_ = std::max( + nstep_ = std::max( std::min( static_cast( param_.workspace / (shape_colunit_.Size() + shape_dstunit_.Size())), ishape[0]), - 1LL); + 1); mshadow::Shape<2> scol = mshadow::Shape2(shape_colunit_[0], shape_colunit_[1] * nstep_); From 5553cd06f72000587efc11bbc30d5fbcd332e845 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Wed, 18 Jul 2018 09:56:16 -0700 Subject: [PATCH 17/40] Add nightly test for large array --- tests/nightly/test_large_array.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 tests/nightly/test_large_array.py diff --git a/tests/nightly/test_large_array.py b/tests/nightly/test_large_array.py new file mode 100644 index 000000000000..609f77ea291f --- /dev/null +++ b/tests/nightly/test_large_array.py @@ -0,0 +1,31 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import mxnet as mx +from mxnet import gluon, nd + +def test_ndarray2numpy(): + m = gluon.nn.Embedding(14000, 128) + m.initialize() + ind = nd.zeros((700000, 128)) + x = m(ind) + x.shape + test=x.asnumpy() + assert (x.shape == test.shape) + +if __name__ == '__main__': + test_ndarray2numpy() \ No newline at end of file From 7a7e2390345135f0881d29ee670c7724518394d0 Mon Sep 17 00:00:00 2001 From: Lanking Date: Tue, 31 Jul 2018 10:53:32 -0700 Subject: [PATCH 18/40] [MXNET-531] NeuralStyle Example for Scala (#11621) * add initial neuralstyle and test coverage * Add two more test and README * kill comments * patch on memory leaks fix * fix formatting issues * remove redundant files * disable the Gan example for now * add ignore method * add new download scheme to match the changes --- .../neuralstyle/ModelVgg19.scala | 139 ++++----- .../neuralstyle/NeuralStyle.scala | 251 ++++++++-------- .../mxnetexamples/neuralstyle/README.md | 83 ++++++ .../neuralstyle/end2end/Basic.scala | 32 +-- .../neuralstyle/end2end/BoostInference.scala | 60 ++-- .../neuralstyle/end2end/BoostTrain.scala | 271 +++++++++--------- .../neuralstyle/end2end/DataProcessing.scala | 15 +- .../neuralstyle/end2end/GenV3.scala | 55 ++-- .../neuralstyle/end2end/GenV4.scala | 91 ++---- .../neuralstyle/end2end/ModelVgg19.scala | 111 ------- .../neuralstyle/end2end/Module.scala | 15 +- .../mxnetexamples/gan/GanExampleSuite.scala | 49 ++-- .../imclassification/MNISTExampleSuite.scala | 3 +- .../neuralstyle/NeuralStyleSuite.scala | 92 ++++++ 14 files changed, 642 insertions(+), 625 deletions(-) create 
mode 100644 scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/README.md delete mode 100644 scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/ModelVgg19.scala create mode 100644 scala-package/examples/src/test/scala/org/apache/mxnetexamples/neuralstyle/NeuralStyleSuite.scala diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/ModelVgg19.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/ModelVgg19.scala index 4d9aa35d21ff..ca4c242ab1ce 100644 --- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/ModelVgg19.scala +++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/ModelVgg19.scala @@ -17,92 +17,73 @@ package org.apache.mxnetexamples.neuralstyle -import org.apache.mxnet.Context -import org.apache.mxnet.Executor -import org.apache.mxnet.NDArray -import org.apache.mxnet.Symbol -import org.apache.mxnet.Shape +import org.apache.mxnet.{Context, Executor, NDArray, Shape, Symbol} /** - * Definition for the neuralstyle network and initialize it with pretrained weight - * @author Depeng Liang - */ + * Definition for the neuralstyle network and initialize it with pretrained weight + */ object ModelVgg19 { case class ConvExecutor(executor: Executor, data: NDArray, dataGrad: NDArray, - style: Array[NDArray], content: NDArray, argDict: Map[String, NDArray]) + style: Array[NDArray], content: NDArray, argDict: Map[String, NDArray]) + + def ConvRelu(data : Symbol, convName : String, reluName : String, + numFilter : Int, kernel : (Int, Int) = (3, 3), + stride : (Int, Int) = (1, 1)) : Symbol = { + val conv = Symbol.api.Convolution(data = Some(data), num_filter = numFilter, + pad = Some(Shape(1, 1)), kernel = Shape(kernel._1, kernel._2), + stride = Some(Shape(stride._1, stride._2)), no_bias = Some(false), + workspace = Some(1024), name = convName) + val relu = Symbol.api.relu(data = Some(conv), name = 
reluName) + conv.dispose() + relu + } def getSymbol: (Symbol, Symbol) = { + getVggSymbol() + } + + def getVggSymbol(prefix: String = "", contentOnly: Boolean = false): (Symbol, Symbol) = { // declare symbol - val data = Symbol.Variable("data") - val conv1_1 = Symbol.Convolution("conv1_1")()(Map("data" -> data , "num_filter" -> 64, - "pad" -> "(1,1)", "kernel" -> "(3,3)", "stride" -> "(1,1)", - "no_bias" -> false, "workspace" -> 1024)) - val relu1_1 = Symbol.Activation("relu1_1")()(Map("data" -> conv1_1 , "act_type" -> "relu")) - val conv1_2 = Symbol.Convolution("conv1_2")()(Map("data" -> relu1_1 , "num_filter" -> 64, - "pad" -> "(1,1)", "kernel" -> "(3,3)", "stride" -> "(1,1)", - "no_bias" -> false, "workspace" -> 1024)) - val relu1_2 = Symbol.Activation("relu1_2")()(Map("data" -> conv1_2 , "act_type" -> "relu")) - val pool1 = Symbol.Pooling("pool1")()(Map("data" -> relu1_2 , "pad" -> "(0,0)", - "kernel" -> "(2,2)", "stride" -> "(2,2)", "pool_type" -> "avg")) - val conv2_1 = Symbol.Convolution("conv2_1")()(Map("data" -> pool1 , "num_filter" -> 128, - "pad" -> "(1,1)", "kernel" -> "(3,3)", "stride" -> "(1,1)", - "no_bias" -> false, "workspace" -> 1024)) - val relu2_1 = Symbol.Activation("relu2_1")()(Map("data" -> conv2_1 , "act_type" -> "relu")) - val conv2_2 = Symbol.Convolution("conv2_2")()(Map("data" -> relu2_1 , "num_filter" -> 128, - "pad" -> "(1,1)", "kernel" -> "(3,3)", "stride" -> "(1,1)", - "no_bias" -> false, "workspace" -> 1024)) - val relu2_2 = Symbol.Activation("relu2_2")()(Map("data" -> conv2_2 , "act_type" -> "relu")) - val pool2 = Symbol.Pooling("pool2")()(Map("data" -> relu2_2 , "pad" -> "(0,0)", - "kernel" -> "(2,2)", "stride" -> "(2,2)", "pool_type" -> "avg")) - val conv3_1 = Symbol.Convolution("conv3_1")()(Map("data" -> pool2 , "num_filter" -> 256, - "pad" -> "(1,1)", "kernel" -> "(3,3)", "stride" -> "(1,1)", - "no_bias" -> false, "workspace" -> 1024)) - val relu3_1 = Symbol.Activation("relu3_1")()(Map("data" -> conv3_1 , "act_type" -> "relu")) - 
val conv3_2 = Symbol.Convolution("conv3_2")()(Map("data" -> relu3_1 , "num_filter" -> 256, - "pad" -> "(1,1)", "kernel" -> "(3,3)", "stride" -> "(1,1)", - "no_bias" -> false, "workspace" -> 1024)) - val relu3_2 = Symbol.Activation("'relu3_2")()(Map("data" -> conv3_2 , "act_type" -> "relu")) - val conv3_3 = Symbol.Convolution("conv3_3")()(Map("data" -> relu3_2 , "num_filter" -> 256, - "pad" -> "(1,1)", "kernel" -> "(3,3)", "stride" -> "(1,1)", - "no_bias" -> false, "workspace" -> 1024)) - val relu3_3 = Symbol.Activation("relu3_3")()(Map("data" -> conv3_3 , "act_type" -> "relu")) - val conv3_4 = Symbol.Convolution("conv3_4")()(Map("data" -> relu3_3 , "num_filter" -> 256, - "pad" -> "(1,1)", "kernel" -> "(3,3)", "stride" -> "(1,1)", - "no_bias" -> false, "workspace" -> 1024)) - val relu3_4 = Symbol.Activation("relu3_4")()(Map("data" -> conv3_4 , "act_type" -> "relu")) - val pool3 = Symbol.Pooling("pool3")()(Map("data" -> relu3_4 , "pad" -> "(0,0)", - "kernel" -> "(2,2)", "stride" -> "(2,2)", "pool_type" -> "avg")) - val conv4_1 = Symbol.Convolution("conv4_1")()(Map("data" -> pool3 , "num_filter" -> 512, - "pad" -> "(1,1)", "kernel" -> "(3,3)", "stride" -> "(1,1)", - "no_bias" -> false, "workspace" -> 1024)) - val relu4_1 = Symbol.Activation("relu4_1")()(Map("data" -> conv4_1 , "act_type" -> "relu")) - val conv4_2 = Symbol.Convolution("conv4_2")()(Map("data" -> relu4_1 , "num_filter" -> 512, - "pad" -> "(1,1)", "kernel" -> "(3,3)", "stride" -> "(1,1)", - "no_bias" -> false, "workspace" -> 1024)) - val relu4_2 = Symbol.Activation("relu4_2")()(Map("data" -> conv4_2 , "act_type" -> "relu")) - val conv4_3 = Symbol.Convolution("conv4_3")()(Map("data" -> relu4_2 , "num_filter" -> 512, - "pad" -> "(1,1)", "kernel" -> "(3,3)", "stride" -> "(1,1)", - "no_bias" -> false, "workspace" -> 1024)) - val relu4_3 = Symbol.Activation("relu4_3")()(Map("data" -> conv4_3 , "act_type" -> "relu")) - val conv4_4 = Symbol.Convolution("conv4_4")()(Map("data" -> relu4_3 , "num_filter" -> 512, - 
"pad" -> "(1,1)", "kernel" -> "(3,3)", "stride" -> "(1,1)", - "no_bias" -> false, "workspace" -> 1024)) - val relu4_4 = Symbol.Activation("relu4_4")()(Map("data" -> conv4_4 , "act_type" -> "relu")) - val pool4 = Symbol.Pooling("pool4")()(Map("data" -> relu4_4 , "pad" -> "(0,0)", - "kernel" -> "(2,2)", "stride" -> "(2,2)", "pool_type" -> "avg")) - val conv5_1 = Symbol.Convolution("conv5_1")()(Map("data" -> pool4 , "num_filter" -> 512, - "pad" -> "(1,1)", "kernel" -> "(3,3)", "stride" -> "(1,1)", - "no_bias" -> false, "workspace" -> 1024)) - val relu5_1 = Symbol.Activation("relu5_1")()(Map("data" -> conv5_1 , "act_type" -> "relu")) + val data = Symbol.Variable(s"${prefix}data") + + val relu1_1 = ConvRelu(data, s"${prefix}conv1_1", s"${prefix}relu1_1", 64) + val relu1_2 = ConvRelu(relu1_1, s"${prefix}conv1_2", s"${prefix}relu1_2", 64) + val pool1 = Symbol.api.Pooling(data = Some(relu1_2), pad = Some(Shape(0, 0)), + kernel = Some(Shape(2, 2)), stride = Some(Shape(2, 2)), pool_type = Some("avg"), + name = s"${prefix}pool1") + + val relu2_1 = ConvRelu(pool1, s"${prefix}conv2_1", s"${prefix}relu2_1", 128) + val relu2_2 = ConvRelu(relu2_1, s"${prefix}conv2_2", s"${prefix}relu2_2", 128) + val pool2 = Symbol.api.Pooling(data = Some(relu2_2), pad = Some(Shape(0, 0)), + kernel = Some(Shape(2, 2)), stride = Some(Shape(2, 2)), pool_type = Some("avg"), + name = s"${prefix}pool2") + + val relu3_1 = ConvRelu(pool2, s"${prefix}conv3_1", s"${prefix}relu3_1", 256) + val relu3_2 = ConvRelu(relu3_1, s"${prefix}conv3_2", s"${prefix}relu3_2", 256) + val relu3_3 = ConvRelu(relu3_2, s"${prefix}conv3_3", s"${prefix}relu3_3", 256) + val relu3_4 = ConvRelu(relu3_3, s"${prefix}conv3_4", s"${prefix}relu3_4", 256) + val pool3 = Symbol.api.Pooling(data = Some(relu3_4), pad = Some(Shape(0, 0)), + kernel = Some(Shape(2, 2)), stride = Some(Shape(2, 2)), pool_type = Some("avg"), + name = s"${prefix}pool3") + + val relu4_1 = ConvRelu(pool3, s"${prefix}conv4_1", s"${prefix}relu4_1", 512) + val relu4_2 = 
ConvRelu(relu4_1, s"${prefix}conv4_2", s"${prefix}relu4_2", 512) + val relu4_3 = ConvRelu(relu4_2, s"${prefix}conv4_3", s"${prefix}relu4_3", 512) + val relu4_4 = ConvRelu(relu4_3, s"${prefix}conv4_4", s"${prefix}relu4_4", 512) + val pool4 = Symbol.api.Pooling(data = Some(relu4_4), pad = Some(Shape(0, 0)), + kernel = Some(Shape(2, 2)), stride = Some(Shape(2, 2)), pool_type = Some("avg"), + name = s"${prefix}pool4") + + val relu5_1 = ConvRelu(pool4, s"${prefix}conv5_1", s"${prefix}relu5_1", 512) // style and content layers - val style = Symbol.Group(relu1_1, relu2_1, relu3_1, relu4_1, relu5_1) + val style = if (contentOnly) null else Symbol.Group(relu1_1, relu2_1, relu3_1, relu4_1, relu5_1) val content = Symbol.Group(relu4_2) (style, content) } def getExecutor(style: Symbol, content: Symbol, modelPath: String, - inputSize: (Int, Int), ctx: Context): ConvExecutor = { + inputSize: (Int, Int), ctx: Context): ConvExecutor = { val out = Symbol.Group(style, content) // make executor val (argShapes, outputShapes, auxShapes) = out.inferShape( @@ -116,15 +97,17 @@ object ModelVgg19 { val key = s"arg:$name" if (pretrained.contains(key)) argDict(name).set(pretrained(key)) } + pretrained.foreach(ele => ele._2.dispose()) val executor = out.bind(ctx, argDict, gradDict) + out.dispose() val outArray = executor.outputs ConvExecutor(executor = executor, - data = argDict("data"), - dataGrad = gradDict("data"), - style = outArray.take(outArray.length - 1), - content = outArray(outArray.length - 1), - argDict = argDict) - } + data = argDict("data"), + dataGrad = gradDict("data"), + style = outArray.take(outArray.length - 1), + content = outArray(outArray.length - 1), + argDict = argDict) + } def getModel(modelPath: String, inputSize: (Int, Int), ctx: Context): ConvExecutor = { val (style, content) = getSymbol diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/NeuralStyle.scala 
b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/NeuralStyle.scala index d99ea641b5d4..f98d725c2304 100644 --- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/NeuralStyle.scala +++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/NeuralStyle.scala @@ -17,22 +17,22 @@ package org.apache.mxnetexamples.neuralstyle -import org.apache.mxnet._ -import org.kohsuke.args4j.{CmdLineParser, Option} -import org.slf4j.LoggerFactory -import scala.collection.JavaConverters._ -import com.sksamuel.scrimage.Image import java.io.File -import com.sksamuel.scrimage.Pixel + +import com.sksamuel.scrimage.{Image, Pixel} import com.sksamuel.scrimage.filter.GaussianBlurFilter import com.sksamuel.scrimage.nio.JpegWriter +import org.apache.mxnet._ import org.apache.mxnet.optimizer.Adam +import org.kohsuke.args4j.{CmdLineParser, Option} +import org.slf4j.LoggerFactory + +import scala.collection.JavaConverters._ +import scala.collection.mutable.ListBuffer /** - * An Implementation of the paper A Neural Algorithm of Artistic Style - * by Leon A. Gatys, Alexander S. 
Ecker, and Matthias Bethge - * @author Depeng Liang - */ + * An Implementation of the paper A Neural Algorithm of Artistic Style + */ object NeuralStyle { case class NSExecutor(executor: Executor, data: NDArray, dataGrad: NDArray) @@ -109,11 +109,11 @@ object NeuralStyle { var gradScale = List[Int]() for (i <- 0 until style.listOutputs().length) { val shape = outputShape(i) - val x = Symbol.Reshape()()(Map("data" -> style.get(i), - "target_shape" -> Shape(shape(1), shape(2) * shape(3)))) - // use fully connected to quickly do dot(x, x^T) - val gram = Symbol.FullyConnected()()(Map("data" -> x, "weight" -> x, - "no_bias" -> true, "num_hidden" -> shape(1))) + val x = Symbol.api.Reshape(data = Some(style.get(i)), + target_shape = Some(Shape(shape(1), shape(2) * shape(3)))) + val gram = Symbol.api.FullyConnected(data = Some(x), weight = Some(x), + no_bias = Some(true), num_hidden = shape(1)) + x.dispose() gramList = gramList :+ gram gradScale = gradScale :+ (shape(1) * shape(2) * shape(3) * shape(1)) } @@ -121,13 +121,20 @@ object NeuralStyle { } def getLoss(gram: Symbol, content: Symbol): (Symbol, Symbol) = { - var gramLoss = List[Symbol]() + var gramLoss = ListBuffer[Symbol]() for (i <- 0 until gram.listOutputs().length) { val gvar = Symbol.Variable(s"target_gram_$i") - gramLoss = gramLoss :+ Symbol.sum()(Symbol.square()(gvar - gram.get(i))())() + Symbol.api.square(data = Some(gvar - gram.get(i))) + gramLoss += Symbol.api.sum( + Some(Symbol.api.square(data = Some(gvar - gram.get(i)))) + ) + gvar.dispose() } + gram.dispose() val cvar = Symbol.Variable("target_content") - val contentLoss = Symbol.sum()(Symbol.square()(cvar - content)())() + val contentLoss = Symbol.api.sum( + Some(Symbol.api.square(Some(cvar - content))) + ) (Symbol.Group(gramLoss: _*), contentLoss) } @@ -138,12 +145,13 @@ object NeuralStyle { val nChannel = img.shape(1) val sImg = Symbol.Variable("img") val sKernel = Symbol.Variable("kernel") - val channels = 
Symbol.SliceChannel()(sImg)(Map("num_outputs" -> nChannel)) - val out = Symbol.Concat()((0 until nChannel).map { i => - Symbol.Convolution()()(Map("data" -> channels.get(i), "weight" -> sKernel, - "num_filter" -> 1, "kernel" -> "(3,3)", "pad" -> "(1,1)", - "no_bias" -> true, "stride" -> "(1,1)")) - }: _*)() * tvWeight + val channels = Symbol.api.SliceChannel(data = Some(sImg), num_outputs = nChannel) + val result = (0 until nChannel).map { i => + Symbol.api.Convolution(data = Some(channels.get(i)), weight = Some(sKernel), + num_filter = 1, kernel = Shape(3, 3), pad = Some(Shape(1, 1)), no_bias = Some(true), + stride = Some(Shape(1, 1))) + }.toArray + val out = Symbol.api.Concat(result, result.length) * tvWeight val kernel = { val tmp = NDArray.empty(Shape(1, 1, 3, 3), ctx) tmp.set(Array[Float](0, -1, 0, -1, 4, -1, 0, -1, 0)) @@ -156,104 +164,123 @@ object NeuralStyle { Math.sqrt(array.map(x => x * x).sum.toDouble).toFloat } - def main(args: Array[String]): Unit = { - val alle = new NeuralStyle - val parser: CmdLineParser = new CmdLineParser(alle) - try { - parser.parseArgument(args.toList.asJava) - assert(alle.contentImage != null && alle.styleImage != null - && alle.modelPath != null && alle.outputDir != null) + //scalastyle:off + def runTraining(model : String, contentImage : String, styleImage: String, dev : Context, + modelPath : String, outputDir : String, styleWeight : Float, + contentWeight : Float, tvWeight : Float, gaussianRadius : Int, + lr: Float, maxNumEpochs: Int, maxLongEdge: Int, + saveEpochs : Int, stopEps: Float) : Unit = { - val dev = if (alle.gpu >= 0) Context.gpu(alle.gpu) else Context.cpu(0) - val contentNp = preprocessContentImage(alle.contentImage, alle.maxLongEdge, dev) - val styleNp = preprocessStyleImage(alle.styleImage, contentNp.shape, dev) - val size = (contentNp.shape(2), contentNp.shape(3)) + val contentNp = preprocessContentImage(contentImage, maxLongEdge, dev) + val styleNp = preprocessStyleImage(styleImage, contentNp.shape, dev) + 
val size = (contentNp.shape(2), contentNp.shape(3)) - val (style, content) = ModelVgg19.getSymbol - val (gram, gScale) = styleGramSymbol(size, style) - var modelExecutor = ModelVgg19.getExecutor(gram, content, alle.modelPath, size, dev) + val (style, content) = ModelVgg19.getSymbol + val (gram, gScale) = styleGramSymbol(size, style) + var modelExecutor = ModelVgg19.getExecutor(gram, content, modelPath, size, dev) - modelExecutor.data.set(styleNp) - modelExecutor.executor.forward() + modelExecutor.data.set(styleNp) + modelExecutor.executor.forward() - val styleArray = modelExecutor.style.map(_.copyTo(Context.cpu())) - modelExecutor.data.set(contentNp) - modelExecutor.executor.forward() - val contentArray = modelExecutor.content.copyTo(Context.cpu()) + val styleArray = modelExecutor.style.map(_.copyTo(Context.cpu())) + modelExecutor.data.set(contentNp) + modelExecutor.executor.forward() + val contentArray = modelExecutor.content.copyTo(Context.cpu()) - // delete the executor - modelExecutor = null + // delete the executor + modelExecutor.argDict.foreach(ele => ele._2.dispose()) + modelExecutor.content.dispose() + modelExecutor.data.dispose() + modelExecutor.dataGrad.dispose() + modelExecutor.style.foreach(_.dispose()) + modelExecutor.executor.dispose() + modelExecutor = null - val (styleLoss, contentLoss) = getLoss(gram, content) - modelExecutor = ModelVgg19.getExecutor( - styleLoss, contentLoss, alle.modelPath, size, dev) + val (styleLoss, contentLoss) = getLoss(gram, content) + modelExecutor = ModelVgg19.getExecutor( + styleLoss, contentLoss, modelPath, size, dev) - val gradArray = { - var tmpGA = Array[NDArray]() - for (i <- 0 until styleArray.length) { - modelExecutor.argDict(s"target_gram_$i").set(styleArray(i)) - tmpGA = tmpGA :+ NDArray.ones(Shape(1), dev) * (alle.styleWeight / gScale(i)) - } - tmpGA :+ NDArray.ones(Shape(1), dev) * alle.contentWeight + val gradArray = { + var tmpGA = Array[NDArray]() + for (i <- 0 until styleArray.length) { + 
modelExecutor.argDict(s"target_gram_$i").set(styleArray(i)) + tmpGA = tmpGA :+ NDArray.ones(Shape(1), dev) * (styleWeight / gScale(i)) } + tmpGA :+ NDArray.ones(Shape(1), dev) * contentWeight + } - modelExecutor.argDict("target_content").set(contentArray) - - // train - val img = Random.uniform(-0.1f, 0.1f, contentNp.shape, dev) - val lr = new FactorScheduler(step = 10, factor = 0.9f) - - saveImage(contentNp, s"${alle.outputDir}/input.jpg", alle.guassianRadius) - saveImage(styleNp, s"${alle.outputDir}/style.jpg", alle.guassianRadius) - - val optimizer = new Adam( - learningRate = alle.lr, - wd = 0.005f, - lrScheduler = lr) - val optimState = optimizer.createState(0, img) - - logger.info(s"start training arguments $alle") - - var oldImg = img.copyTo(dev) - val clipNorm = img.shape.toVector.reduce(_ * _) - val tvGradExecutor = getTvGradExecutor(img, dev, alle.tvWeight) - var eps = 0f - var trainingDone = false - var e = 0 - while (e < alle.maxNumEpochs && !trainingDone) { - modelExecutor.data.set(img) - modelExecutor.executor.forward() - modelExecutor.executor.backward(gradArray) - - val gNorm = NDArray.norm(modelExecutor.dataGrad).toScalar - if (gNorm > clipNorm) { - modelExecutor.dataGrad.set(modelExecutor.dataGrad * (clipNorm / gNorm)) - } - tvGradExecutor match { - case Some(executor) => { - executor.forward() - optimizer.update(0, img, - modelExecutor.dataGrad + executor.outputs(0), - optimState) - } - case None => - optimizer.update(0, img, modelExecutor.dataGrad, optimState) - } - eps = (NDArray.norm(oldImg - img) / NDArray.norm(img)).toScalar - oldImg.set(img) - logger.info(s"epoch $e, relative change $eps") + modelExecutor.argDict("target_content").set(contentArray) - if (eps < alle.stopEps) { - logger.info("eps < args.stop_eps, training finished") - trainingDone = true - } - if ((e + 1) % alle.saveEpochs == 0) { - saveImage(img, s"${alle.outputDir}/tmp_${e + 1}.jpg", alle.guassianRadius) + // train + val img = Random.uniform(-0.1f, 0.1f, contentNp.shape, 
dev) + val lrFS = new FactorScheduler(step = 10, factor = 0.9f) + + saveImage(contentNp, s"${outputDir}/input.jpg", gaussianRadius) + saveImage(styleNp, s"${outputDir}/style.jpg", gaussianRadius) + + val optimizer = new Adam( + learningRate = lr, + wd = 0.005f, + lrScheduler = lrFS) + val optimState = optimizer.createState(0, img) + + logger.info(s"start training arguments") + + var oldImg = img.copyTo(dev) + val clipNorm = img.shape.toVector.reduce(_ * _) + val tvGradExecutor = getTvGradExecutor(img, dev, tvWeight) + var eps = 0f + var trainingDone = false + var e = 0 + while (e < maxNumEpochs && !trainingDone) { + modelExecutor.data.set(img) + modelExecutor.executor.forward() + modelExecutor.executor.backward(gradArray) + + val gNorm = NDArray.norm(modelExecutor.dataGrad).toScalar + if (gNorm > clipNorm) { + modelExecutor.dataGrad.set(modelExecutor.dataGrad * (clipNorm / gNorm)) + } + tvGradExecutor match { + case Some(executor) => { + executor.forward() + optimizer.update(0, img, + modelExecutor.dataGrad + executor.outputs(0), + optimState) } - e = e + 1 + case None => + optimizer.update(0, img, modelExecutor.dataGrad, optimState) + } + eps = (NDArray.norm(oldImg - img) / NDArray.norm(img)).toScalar + oldImg.set(img) + logger.info(s"epoch $e, relative change $eps") + + if (eps < stopEps) { + logger.info("eps < args.stop_eps, training finished") + trainingDone = true + } + if ((e + 1) % saveEpochs == 0) { + saveImage(img, s"${outputDir}/tmp_${e + 1}.jpg", gaussianRadius) } - saveImage(img, s"${alle.outputDir}/out.jpg", alle.guassianRadius) - logger.info("Finish fit ...") + e = e + 1 + } + saveImage(img, s"${outputDir}/out.jpg", gaussianRadius) + logger.info("Finish fit ...") + } + + def main(args: Array[String]): Unit = { + val alle = new NeuralStyle + val parser: CmdLineParser = new CmdLineParser(alle) + try { + parser.parseArgument(args.toList.asJava) + assert(alle.contentImage != null && alle.styleImage != null + && alle.modelPath != null && alle.outputDir != 
null) + + val dev = if (alle.gpu >= 0) Context.gpu(alle.gpu) else Context.cpu(0) + runTraining(alle.model, alle.contentImage, alle.styleImage, dev, alle.modelPath, + alle.outputDir, alle.styleWeight, alle.contentWeight, alle.tvWeight, + alle.gaussianRadius, alle.lr, alle.maxNumEpochs, alle.maxLongEdge, + alle.saveEpochs, alle.stopEps) } catch { case ex: Exception => { logger.error(ex.getMessage, ex) @@ -293,6 +320,6 @@ class NeuralStyle { private val outputDir: String = null @Option(name = "--save-epochs", usage = "save the output every n epochs") private val saveEpochs: Int = 50 - @Option(name = "--guassian-radius", usage = "the gaussian blur filter radius") - private val guassianRadius: Int = 1 + @Option(name = "--gaussian-radius", usage = "the gaussian blur filter radius") + private val gaussianRadius: Int = 1 } diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/README.md b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/README.md new file mode 100644 index 000000000000..fe849343c9d7 --- /dev/null +++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/README.md @@ -0,0 +1,83 @@ +# Neural Style Example for Scala + +## Introduction +This model contains three important components: +- Boost Inference +- Boost Training +- Neural Style conversion + +You can use the prebuilt VGG model to do the conversion. +By adding a style image, you can create several interesting images. 
+ +Original Image | Style Image +:-------------------------:|:-------------------------: +![](https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/NeuralStyle/IMG_4343.jpg) | ![](https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/NeuralStyle/starry_night.jpg) + +Boost Inference Image (pretrained) | Epoch 150 Image +:-------------------------:|:-------------------------: +![](https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/NeuralStyle/out_3.jpg) | ![](https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/NeuralStyle/tmp_150.jpg) + +## Setup +Please download the input image and style image following the links below: + +Input image +```bash +https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/NeuralStyle/IMG_4343.jpg +``` +Style image +```bash +https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/NeuralStyle/starry_night.jpg +``` + +VGG model -- Boost Inference +```bash +https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/NeuralStyle/model.zip +``` + +VGG model -- Boost Training +```bash +https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/NeuralStyle/vgg19.params +``` + +Please unzip the model before you use it. + +## Boost Inference Example + +Please provide the corresponding arguments before you execute the program +```bash +--input-image +/IMG_4343.jpg +--model-path +/model +--output-path + +``` + +## Boost Training Example +Please download your own training data for boost training. +You can use 26k images sampled from [MIT Place dataset](http://places.csail.mit.edu/).
+```bash +--style-image +/starry_night.jpg +--data-path +/images +--vgg-model-path +/vgg19.params +--save-model-path + +``` + +## NeuralStyle Example +Please provide the corresponding arguments before you execute the program +```bash +--model-path +/vgg19.params +--content-image +/IMG_4343.jpg +--style-image +/starry_night.jpg +--gpu + +--output-dir + +``` \ No newline at end of file diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/Basic.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/Basic.scala index c604f842c4c2..56303253f33d 100644 --- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/Basic.scala +++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/Basic.scala @@ -17,16 +17,11 @@ package org.apache.mxnetexamples.neuralstyle.end2end -import org.apache.mxnet.Shape -import org.apache.mxnet.Context -import org.apache.mxnet.NDArray -import org.apache.mxnet.Symbol -import org.apache.mxnet.Initializer +import org.apache.mxnet.{Context, Initializer, NDArray, Shape, Symbol} +import org.apache.mxnetexamples.neuralstyle.ModelVgg19 import org.slf4j.LoggerFactory -/** - * @author Depeng Liang - */ + object Basic { class PretrainedInit(prefix: String, params: Map[String, NDArray], @@ -61,7 +56,7 @@ object Basic { def getStyleModule(prefix: String, dShape: Shape, ctx: Context, params: Map[String, NDArray]): Module = { val inputShape = Map(s"${prefix}_data" -> dShape) - val (style, content) = ModelVgg19.getVggSymbol(prefix) + val (style, content) = ModelVgg19.getVggSymbol(prefix + "_") val (gram, gScale) = styleGramSymbol(inputShape, style) val init = new PretrainedInit(prefix, params, true) new Module(symbol = gram, context = ctx, @@ -75,11 +70,10 @@ object Basic { var gradScale = List[Int]() for (i <- 0 until style.listOutputs().length) { val shape = outputShape(i) - val x = Symbol.Reshape()()(Map("data" -> 
style.get(i), - "shape" -> Shape(shape(1), shape(2) * shape(3)))) - // use fully connected to quickly do dot(x, x^T) - val gram = Symbol.FullyConnected()()(Map("data" -> x, "weight" -> x, - "no_bias" -> true, "num_hidden" -> shape(1))) + val x = Symbol.api.Reshape(data = Some(style.get(i)), + shape = Some(Shape(shape(1), shape(2) * shape(3)))) + val gram = Symbol.api.FullyConnected(data = Some(x), weight = Some(x), + no_bias = Some(true), num_hidden = shape(1)) gramList = gramList :+ gram gradScale = gradScale :+ (shape(1) * shape(2) * shape(3) * shape(1)) } @@ -90,16 +84,18 @@ object Basic { var gramLoss = List[Symbol]() for (i <- 0 until gram.listOutputs().length) { val gvar = Symbol.Variable(s"target_gram_$i") - gramLoss = gramLoss :+ Symbol.sum()(Symbol.square()(gvar - gram.get(i))())() + gramLoss = gramLoss :+ Symbol.api.sum(Some( + Symbol.api.square(Some(gvar - gram.get(i))) + )) } val cvar = Symbol.Variable("target_content") - val contentLoss = Symbol.sum()(Symbol.square()(cvar - content)())() + val contentLoss = Symbol.api.sum(Some(Symbol.api.square(Some(cvar - content)))) (Symbol.Group(gramLoss: _*), contentLoss) } def getContentModule(prefix: String, dShape: Shape, ctx: Context, params: Map[String, NDArray]): Module = { - val (_, sym) = ModelVgg19.getVggSymbol(prefix, true) + val (_, sym) = ModelVgg19.getVggSymbol(prefix + "_", true) val init = new PretrainedInit(prefix, params) new Module(symbol = sym, context = ctx, dataShapes = Map(s"${prefix}_data" -> dShape), @@ -109,7 +105,7 @@ object Basic { def getLossModule(prefix: String, dShape: Shape, ctx: Context, params: Map[String, NDArray]): (Module, List[Int]) = { val inputShape = Map(s"${prefix}_data" -> dShape) - val (style, content) = ModelVgg19.getVggSymbol(prefix) + val (style, content) = ModelVgg19.getVggSymbol(prefix + "_") val (gram, gScale) = styleGramSymbol(inputShape, style) val (styleLoss, contentLoss) = getLoss(gram, content) val sym = Symbol.Group(styleLoss, contentLoss) diff --git 
a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/BoostInference.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/BoostInference.scala index 0feb73d3036e..5410fb9edc7c 100644 --- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/BoostInference.scala +++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/BoostInference.scala @@ -17,19 +17,43 @@ package org.apache.mxnetexamples.neuralstyle.end2end -import org.slf4j.LoggerFactory +import org.apache.mxnet.{Context, Shape} import org.kohsuke.args4j.{CmdLineParser, Option} +import org.slf4j.LoggerFactory + import scala.collection.JavaConverters._ -import org.apache.mxnet.Shape -import org.apache.mxnet.Context -/** - * @author Depeng Liang - */ object BoostInference { private val logger = LoggerFactory.getLogger(classOf[BoostInference]) + def runInference(modelPath: String, outputPath: String, guassianRadius : Int, + inputImage : String, ctx : Context): Unit = { + val dShape = Shape(1, 3, 480, 640) + val clipNorm = 1.0f * dShape.product + // generator + val gens = Array( + GenV4.getModule("g0", dShape, ctx, isTrain = false), + GenV3.getModule("g1", dShape, ctx, isTrain = false), + GenV3.getModule("g2", dShape, ctx, isTrain = false), + GenV4.getModule("g3", dShape, ctx, isTrain = false) + ) + gens.zipWithIndex.foreach { case (gen, i) => + gen.loadParams(s"$modelPath/$i/v3_0002-0026000.params") + } + + val contentNp = + DataProcessing.preprocessContentImage(s"$inputImage", dShape, ctx) + var data = Array(contentNp) + for (i <- 0 until gens.length) { + gens(i).forward(data.takeRight(1)) + val newImg = gens(i).getOutputs()(0) + data :+= newImg + DataProcessing.saveImage(newImg, s"$outputPath/out_$i.jpg", guassianRadius) + logger.info(s"Converted image: $outputPath/out_$i.jpg") + } + } + def main(args: Array[String]): Unit = { val stce = new BoostInference val parser: CmdLineParser = 
new CmdLineParser(stce) @@ -39,30 +63,10 @@ object BoostInference { && stce.inputImage != null && stce.outputPath != null) - val dShape = Shape(1, 3, 480, 640) - val clipNorm = 1.0f * dShape.product val ctx = if (stce.gpu == -1) Context.cpu() else Context.gpu(stce.gpu) - // generator - val gens = Array( - GenV4.getModule("g0", dShape, ctx, isTrain = false), - GenV3.getModule("g1", dShape, ctx, isTrain = false), - GenV3.getModule("g2", dShape, ctx, isTrain = false), - GenV4.getModule("g3", dShape, ctx, isTrain = false) - ) - gens.zipWithIndex.foreach { case (gen, i) => - gen.loadParams(s"${stce.modelPath}/$i/v3_0002-0026000.params") - } + runInference(stce.modelPath, stce.outputPath, stce.guassianRadius, stce.inputImage, ctx) - val contentNp = - DataProcessing.preprocessContentImage(s"${stce.inputImage}", dShape, ctx) - var data = Array(contentNp) - for (i <- 0 until gens.length) { - gens(i).forward(data.takeRight(1)) - val newImg = gens(i).getOutputs()(0) - data :+= newImg - DataProcessing.saveImage(newImg, s"${stce.outputPath}/out_${i}.jpg", stce.guassianRadius) - } } catch { case ex: Exception => { logger.error(ex.getMessage, ex) @@ -74,7 +78,7 @@ object BoostInference { } class BoostInference { - @Option(name = "--model-path", usage = "the save model path") + @Option(name = "--model-path", usage = "the saved model path") private val modelPath: String = null @Option(name = "--input-image", usage = "the style image") private val inputImage: String = null diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/BoostTrain.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/BoostTrain.scala index 8b5549de4af1..08b4c85d2c55 100644 --- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/BoostTrain.scala +++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/BoostTrain.scala @@ -17,24 +17,17 @@ package 
org.apache.mxnetexamples.neuralstyle.end2end -import org.slf4j.LoggerFactory +import java.io.File + +import org.apache.mxnet.{Context, Executor, NDArray, Shape, Symbol} +import org.apache.mxnet.optimizer.SGD import org.kohsuke.args4j.{CmdLineParser, Option} +import org.slf4j.LoggerFactory + import scala.collection.JavaConverters._ -import org.apache.mxnet.NDArray -import org.apache.mxnet.Shape -import org.apache.mxnet.Context -import org.apache.mxnet.DataBatch -import org.apache.mxnet.Symbol -import org.apache.mxnet.Executor -import org.apache.mxnet.optimizer.SGD -import java.io.File -import javax.imageio.ImageIO import scala.util.Random -import org.apache.mxnet.optimizer.Adam -/** - * @author Depeng Liang - */ + object BoostTrain { private val logger = LoggerFactory.getLogger(classOf[BoostTrain]) @@ -46,12 +39,13 @@ object BoostTrain { val nChannel = img.shape(1) val sImg = Symbol.Variable("img") val sKernel = Symbol.Variable("kernel") - val channels = Symbol.SliceChannel()(sImg)(Map("num_outputs" -> nChannel)) - val out = Symbol.Concat()((0 until nChannel).map { i => - Symbol.Convolution()()(Map("data" -> channels.get(i), "weight" -> sKernel, - "num_filter" -> 1, "kernel" -> "(3,3)", "pad" -> "(1,1)", - "no_bias" -> true, "stride" -> "(1,1)")) - }.toArray: _*)() * tvWeight + val channels = Symbol.api.SliceChannel(data = Some(sImg), num_outputs = nChannel) + val toConcat = (0 until nChannel).map( i => + Symbol.api.Convolution(data = Some(channels.get(i)), weight = Some(sKernel), + num_filter = 1, kernel = Shape(3, 3), pad = Some(Shape(1, 1)), + no_bias = Some(true), stride = Some(Shape(1, 1))) + ).toArray + val out = Symbol.api.Concat(data = toConcat, num_args = toConcat.length) * tvWeight val kernel = { val tmp = NDArray.empty(Shape(1, 1, 3, 3), ctx) tmp.set(Array[Float](0, -1, 0, -1, 4, -1, 0, -1, 0)) @@ -60,130 +54,135 @@ object BoostTrain { out.bind(ctx, Map("img" -> img, "kernel" -> kernel)) } - def main(args: Array[String]): Unit = { - val stin = new 
BoostTrain - val parser: CmdLineParser = new CmdLineParser(stin) - try { - parser.parseArgument(args.toList.asJava) - assert(stin.dataPath != null - && stin.vggModelPath != null - && stin.saveModelPath != null - && stin.styleImage != null) - // params - val vggParams = NDArray.load2Map(stin.vggModelPath) - val styleWeight = 1.2f - val contentWeight = 10f - val dShape = Shape(1, 3, 384, 384) - val clipNorm = 0.05f * dShape.product - val modelPrefix = "v3" - val ctx = if (stin.gpu == -1) Context.cpu() else Context.gpu(stin.gpu) - - // init style - val styleNp = DataProcessing.preprocessStyleImage(stin.styleImage, dShape, ctx) - var styleMod = Basic.getStyleModule("style", dShape, ctx, vggParams) - styleMod.forward(Array(styleNp)) - val styleArray = styleMod.getOutputs().map(_.copyTo(Context.cpu())) - styleMod.dispose() - styleMod = null - - // content - val contentMod = Basic.getContentModule("content", dShape, ctx, vggParams) - - // loss - val (loss, gScale) = Basic.getLossModule("loss", dShape, ctx, vggParams) - val extraArgs = (0 until styleArray.length) - .map( i => s"target_gram_$i" -> styleArray(i)).toMap - loss.setParams(extraArgs) - var gradArray = Array[NDArray]() - for (i <- 0 until styleArray.length) { - gradArray = gradArray :+ (NDArray.ones(Shape(1), ctx) * (styleWeight / gScale(i))) - } - gradArray = gradArray :+ (NDArray.ones(Shape(1), ctx) * contentWeight) - - // generator - val gens = Array( - GenV4.getModule("g0", dShape, ctx), - GenV3.getModule("g1", dShape, ctx), - GenV3.getModule("g2", dShape, ctx), - GenV4.getModule("g3", dShape, ctx) - ) - gens.foreach { gen => - val opt = new SGD(learningRate = 1e-4f, - momentum = 0.9f, - wd = 5e-3f, - clipGradient = 5f) - gen.initOptimizer(opt) - } + def runTraining(dataPath : String, vggModelPath: String, ctx : Context, + styleImage : String, saveModelPath : String) : Unit = { + // params + val vggParams = NDArray.load2Map(vggModelPath) + val styleWeight = 1.2f + val contentWeight = 10f + val dShape = 
Shape(1, 3, 384, 384) + val clipNorm = 0.05f * dShape.product + val modelPrefix = "v3" + // init style + val styleNp = DataProcessing.preprocessStyleImage(styleImage, dShape, ctx) + var styleMod = Basic.getStyleModule("style", dShape, ctx, vggParams) + styleMod.forward(Array(styleNp)) + val styleArray = styleMod.getOutputs().map(_.copyTo(Context.cpu())) + styleMod.dispose() + styleMod = null + + // content + val contentMod = Basic.getContentModule("content", dShape, ctx, vggParams) + + // loss + val (loss, gScale) = Basic.getLossModule("loss", dShape, ctx, vggParams) + val extraArgs = (0 until styleArray.length) + .map( i => s"target_gram_$i" -> styleArray(i)).toMap + loss.setParams(extraArgs) + var gradArray = Array[NDArray]() + for (i <- 0 until styleArray.length) { + gradArray = gradArray :+ (NDArray.ones(Shape(1), ctx) * (styleWeight / gScale(i))) + } + gradArray = gradArray :+ (NDArray.ones(Shape(1), ctx) * contentWeight) + + // generator + val gens = Array( + GenV4.getModule("g0", dShape, ctx), + GenV3.getModule("g1", dShape, ctx), + GenV3.getModule("g2", dShape, ctx), + GenV4.getModule("g3", dShape, ctx) + ) + gens.foreach { gen => + val opt = new SGD(learningRate = 1e-4f, + momentum = 0.9f, + wd = 5e-3f, + clipGradient = 5f) + gen.initOptimizer(opt) + } - var filelist = new File(stin.dataPath).list().toList - val numImage = filelist.length - logger.info(s"Dataset size: $numImage") + var filelist = new File(dataPath).list().toList + val numImage = filelist.length + logger.info(s"Dataset size: $numImage") - val tvWeight = 1e-2f + val tvWeight = 1e-2f - val startEpoch = 0 - val endEpoch = 3 + val startEpoch = 0 + val endEpoch = 3 - for (k <- 0 until gens.length) { - val path = new File(s"${stin.saveModelPath}/$k") - if (!path.exists()) path.mkdir() - } + for (k <- 0 until gens.length) { + val path = new File(s"${saveModelPath}/$k") + if (!path.exists()) path.mkdir() + } - // train - for (i <- startEpoch until endEpoch) { - filelist = Random.shuffle(filelist) - 
for (idx <- filelist.indices) { - var dataArray = Array[NDArray]() - var lossGradArray = Array[NDArray]() - val data = - DataProcessing.preprocessContentImage(s"${stin.dataPath}/${filelist(idx)}", dShape, ctx) - dataArray = dataArray :+ data - // get content - contentMod.forward(Array(data)) - // set target content - loss.setParams(Map("target_content" -> contentMod.getOutputs()(0))) - // gen_forward - for (k <- 0 until gens.length) { - gens(k).forward(dataArray.takeRight(1)) - dataArray = dataArray :+ gens(k).getOutputs()(0) - // loss forward - loss.forward(dataArray.takeRight(1)) - loss.backward(gradArray) - lossGradArray = lossGradArray :+ loss.getInputGrads()(0) - } - val grad = NDArray.zeros(data.shape, ctx) - for (k <- gens.length - 1 to 0 by -1) { - val tvGradExecutor = getTvGradExecutor(gens(k).getOutputs()(0), ctx, tvWeight) - tvGradExecutor.forward() - grad += lossGradArray(k) + tvGradExecutor.outputs(0) - val gNorm = NDArray.norm(grad) - if (gNorm.toScalar > clipNorm) { - grad *= clipNorm / gNorm.toScalar - } - gens(k).backward(Array(grad)) - gens(k).update() - gNorm.dispose() - tvGradExecutor.dispose() + // train + for (i <- startEpoch until endEpoch) { + filelist = Random.shuffle(filelist) + for (idx <- filelist.indices) { + var dataArray = Array[NDArray]() + var lossGradArray = Array[NDArray]() + val data = + DataProcessing.preprocessContentImage(s"${dataPath}/${filelist(idx)}", dShape, ctx) + dataArray = dataArray :+ data + // get content + contentMod.forward(Array(data)) + // set target content + loss.setParams(Map("target_content" -> contentMod.getOutputs()(0))) + // gen_forward + for (k <- 0 until gens.length) { + gens(k).forward(dataArray.takeRight(1)) + dataArray = dataArray :+ gens(k).getOutputs()(0) + // loss forward + loss.forward(dataArray.takeRight(1)) + loss.backward(gradArray) + lossGradArray = lossGradArray :+ loss.getInputGrads()(0) + } + val grad = NDArray.zeros(data.shape, ctx) + for (k <- gens.length - 1 to 0 by -1) { + val 
tvGradExecutor = getTvGradExecutor(gens(k).getOutputs()(0), ctx, tvWeight) + tvGradExecutor.forward() + grad += lossGradArray(k) + tvGradExecutor.outputs(0) + val gNorm = NDArray.norm(grad) + if (gNorm.toScalar > clipNorm) { + grad *= clipNorm / gNorm.toScalar } - grad.dispose() - if (idx % 20 == 0) { - logger.info(s"Epoch $i: Image $idx") - for (k <- 0 until gens.length) { - val n = NDArray.norm(gens(k).getInputGrads()(0)) - logger.info(s"Data Norm : ${n.toScalar / dShape.product}") - n.dispose() - } + gens(k).backward(Array(grad)) + gens(k).update() + gNorm.dispose() + tvGradExecutor.dispose() + } + grad.dispose() + if (idx % 20 == 0) { + logger.info(s"Epoch $i: Image $idx") + for (k <- 0 until gens.length) { + val n = NDArray.norm(gens(k).getInputGrads()(0)) + logger.info(s"Data Norm : ${n.toScalar / dShape.product}") + n.dispose() } - if (idx % 1000 == 0) { - for (k <- 0 until gens.length) { - gens(k).saveParams( - s"${stin.saveModelPath}/$k/${modelPrefix}_" + - s"${"%04d".format(i)}-${"%07d".format(idx)}.params") - } + } + if (idx % 1000 == 0) { + for (k <- 0 until gens.length) { + gens(k).saveParams( + s"${saveModelPath}/$k/${modelPrefix}_" + + s"${"%04d".format(i)}-${"%07d".format(idx)}.params") } - data.dispose() } + data.dispose() } + } + } + + def main(args: Array[String]): Unit = { + val stin = new BoostTrain + val parser: CmdLineParser = new CmdLineParser(stin) + try { + parser.parseArgument(args.toList.asJava) + assert(stin.dataPath != null + && stin.vggModelPath != null + && stin.saveModelPath != null + && stin.styleImage != null) + + val ctx = if (stin.gpu == -1) Context.cpu() else Context.gpu(stin.gpu) + runTraining(stin.dataPath, stin.vggModelPath, ctx, stin.styleImage, stin.saveModelPath) } catch { case ex: Exception => { logger.error(ex.getMessage, ex) @@ -197,9 +196,9 @@ object BoostTrain { class BoostTrain { @Option(name = "--data-path", usage = "the input train data path") private val dataPath: String = null - @Option(name = 
"--vgg--model-path", usage = "the pretrained model to use: ['vgg']") + @Option(name = "--vgg-model-path", usage = "the pretrained model to use: ['vgg']") private val vggModelPath: String = null - @Option(name = "--save--model-path", usage = "the save model path") + @Option(name = "--save-model-path", usage = "the save model path") private val saveModelPath: String = null @Option(name = "--style-image", usage = "the style image") private val styleImage: String = null diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/DataProcessing.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/DataProcessing.scala index 94d05bb7d57c..80a009ea40c2 100644 --- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/DataProcessing.scala +++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/DataProcessing.scala @@ -17,19 +17,14 @@ package org.apache.mxnetexamples.neuralstyle.end2end -import com.sksamuel.scrimage.Image -import com.sksamuel.scrimage.Pixel +import java.io.File + +import com.sksamuel.scrimage.{Image, Pixel} import com.sksamuel.scrimage.filter.GaussianBlurFilter import com.sksamuel.scrimage.nio.JpegWriter -import org.apache.mxnet.Context -import org.apache.mxnet.NDArray -import java.io.File -import org.apache.mxnet.Shape -import scala.util.Random +import org.apache.mxnet.{Context, NDArray, Shape} + -/** - * @author Depeng Liang - */ object DataProcessing { def preprocessContentImage(path: String, diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/GenV3.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/GenV3.scala index b90e9f0e3171..d7ab59e28402 100644 --- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/GenV3.scala +++ 
b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/GenV3.scala @@ -17,34 +17,33 @@ package org.apache.mxnetexamples.neuralstyle.end2end -import org.apache.mxnet.Symbol -import org.apache.mxnet.Shape -import org.apache.mxnet.Context -import org.apache.mxnet.Xavier +import org.apache.mxnet.{Context, Shape, Symbol, Xavier} + -/** - * @author Depeng Liang - */ object GenV3 { def Conv(data: Symbol, numFilter: Int, kernel: (Int, Int) = (5, 5), - pad: (Int, Int) = (2, 2), stride: (Int, Int) = (2, 2)): Symbol = { - var sym = Symbol.Convolution()()(Map("data" -> data, "num_filter" -> numFilter, - "kernel" -> s"$kernel", "stride" -> s"$stride", "pad" -> s"$pad", "no_bias" -> false)) - sym = Symbol.BatchNorm()()(Map("data" -> sym, "fix_gamma" -> false)) - sym = Symbol.LeakyReLU()()(Map("data" -> sym, "act_type" -> "leaky")) - sym + pad: (Int, Int) = (2, 2), stride: (Int, Int) = (2, 2)): Symbol = { + val sym1 = Symbol.api.Convolution(data = Some(data), num_filter = numFilter, + kernel = Shape(kernel._1, kernel._2), stride = Some(Shape(stride._1, stride._2)), + pad = Some(Shape(pad._1, pad._2)), no_bias = Some(false)) + val sym2 = Symbol.api.BatchNorm(data = Some(sym1), fix_gamma = Some(false)) + val sym3 = Symbol.api.LeakyReLU(data = Some(sym2), act_type = Some("leaky")) + sym2.dispose() + sym1.dispose() + sym3 } def Deconv(data: Symbol, numFilter: Int, imHw: (Int, Int), - kernel: (Int, Int) = (7, 7), pad: (Int, Int) = (2, 2), stride: (Int, Int) = (2, 2), - crop: Boolean = true, out: Boolean = false): Symbol = { - var sym = Symbol.Deconvolution()()(Map("data" -> data, "num_filter" -> numFilter, - "kernel" -> s"$kernel", "stride" -> s"$stride", "pad" -> s"$pad", "no_bias" -> true)) - if (crop) sym = Symbol.Crop()(sym)( - Map("offset" -> "(1, 1)", "h_w" -> s"$imHw", "num_args" -> 1)) - sym = Symbol.BatchNorm()()(Map("data" -> sym, "fix_gamma" -> false)) - if (out == false) Symbol.LeakyReLU()()(Map("data" -> sym, "act_type" -> "leaky")) - else 
Symbol.Activation()()(Map("data" -> sym, "act_type" -> "tanh")) + kernel: (Int, Int) = (7, 7), pad: (Int, Int) = (2, 2), stride: (Int, Int) = (2, 2), + crop: Boolean = true, out: Boolean = false): Symbol = { + var sym = Symbol.api.Deconvolution(data = Some(data), num_filter = numFilter, + kernel = Shape(kernel._1, kernel._2), stride = Some(Shape(stride._1, stride._2)), + pad = Some(Shape(pad._1, pad._2)), no_bias = Some(true)) + if (crop) sym = Symbol.api.Crop(data = Array(sym), offset = Some(Shape(1, 1)), + h_w = Some(Shape(imHw._1, imHw._2)), num_args = 1) + sym = Symbol.api.BatchNorm(data = Some(sym), fix_gamma = Some(false)) + if (out == false) Symbol.api.LeakyReLU(data = Some(sym), act_type = Some("leaky")) + else Symbol.api.Activation(data = Some(sym), act_type = "tanh") } def getGenerator(prefix: String, imHw: (Int, Int)): Symbol = { @@ -61,12 +60,12 @@ object GenV3 { val conv5_1 = Conv(deconv2, 96, kernel = (3, 3), pad = (1, 1), stride = (1, 1)) val deconv3 = Deconv(conv5_1, 3, imHw, kernel = (8, 8), pad = (3, 3), out = true, crop = false) val rawOut = (deconv3 * 128) + 128 - val norm = Symbol.SliceChannel()(rawOut)(Map("num_outputs" -> 3)) + val norm = Symbol.api.SliceChannel(data = Some(rawOut), num_outputs = 3) val rCh = norm.get(0) - 123.68f val gCh = norm.get(1) - 116.779f val bCh = norm.get(2) - 103.939f - val normOut = Symbol.Concat()(rCh, gCh, bCh)() * 0.4f + data * 0.6f - normOut + val normOut = Symbol.api.Concat(data = Array(rCh, gCh, bCh), num_args = 3) + normOut * 0.4f + data * 0.6f } def getModule(prefix: String, dShape: Shape, ctx: Context, isTrain: Boolean = true): Module = { @@ -77,9 +76,9 @@ object GenV3 { else (dataShape, false, false) } val mod = new Module(symbol = sym, context = ctx, - dataShapes = dataShapes, - initializer = new Xavier(magnitude = 2f), - forTraining = forTraining, inputsNeedGrad = inputsNeedGrad) + dataShapes = dataShapes, + initializer = new Xavier(magnitude = 2f), + forTraining = forTraining, inputsNeedGrad = 
inputsNeedGrad) mod } } diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/GenV4.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/GenV4.scala index 876a0529b69e..82fc9b6ce109 100644 --- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/GenV4.scala +++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/GenV4.scala @@ -17,78 +17,43 @@ package org.apache.mxnetexamples.neuralstyle.end2end -import org.apache.mxnet.Symbol -import org.apache.mxnet.Shape -import org.apache.mxnet.Context -import org.apache.mxnet.Xavier +import org.apache.mxnet.{Context, Shape, Symbol, Xavier} -/** - * @author Depeng Liang - */ -object GenV4 { - def Conv(data: Symbol, numFilter: Int, kernel: (Int, Int) = (5, 5), - pad: (Int, Int) = (2, 2), stride: (Int, Int) = (2, 2)): Symbol = { - var sym = Symbol.Convolution()()(Map("data" -> data, "num_filter" -> numFilter, - "kernel" -> s"$kernel", "stride" -> s"$stride", "pad" -> s"$pad", "no_bias" -> false)) - sym = Symbol.BatchNorm()()(Map("data" -> sym, "fix_gamma" -> false)) - sym = Symbol.LeakyReLU()()(Map("data" -> sym, "act_type" -> "leaky")) - sym - } +object GenV4 { - def Deconv(data: Symbol, numFilter: Int, imHw: (Int, Int), kernel: (Int, Int) = (6, 6), - pad: (Int, Int) = (2, 2), stride: (Int, Int) = (2, 2), out: Boolean = false): Symbol = { - var sym = Symbol.Deconvolution()()(Map("data" -> data, "num_filter" -> numFilter, - "kernel" -> s"$kernel", "stride" -> s"$stride", "pad" -> s"$pad", "no_bias" -> true)) - sym = Symbol.BatchNorm()()(Map("data" -> sym, "fix_gamma" -> false)) - if (out == false) Symbol.LeakyReLU()()(Map("data" -> sym, "act_type" -> "leaky")) - else Symbol.Activation()()(Map("data" -> sym, "act_type" -> "tanh")) + def Conv(data: Symbol, numFilter: Int, workspace : Long, kernel: (Int, Int) = (5, 5), + pad: (Int, Int) = (2, 2)): Symbol = { + val sym1 = 
Symbol.api.Convolution(data = Some(data), num_filter = numFilter, + kernel = Shape(kernel._1, kernel._2), workspace = Some(workspace), + pad = Some(Shape(pad._1, pad._2)), no_bias = Some(false)) + val sym2 = Symbol.api.BatchNorm(data = Some(sym1), fix_gamma = Some(false)) + val sym3 = Symbol.api.LeakyReLU(data = Some(sym2), act_type = Some("leaky")) + sym2.dispose() + sym1.dispose() + sym3 } def getGenerator(prefix: String, imHw: (Int, Int)): Symbol = { val data = Symbol.Variable(s"${prefix}_data") - var conv1_1 = Symbol.Convolution()()(Map("data" -> data, "num_filter" -> 48, - "kernel" -> "(5, 5)", "pad" -> "(2, 2)", "no_bias" -> false, "workspace" -> 4096)) - conv1_1 = Symbol.BatchNorm()()(Map("data" -> conv1_1, "fix_gamma" -> false)) - conv1_1 = Symbol.LeakyReLU()()(Map("data" -> conv1_1, "act_type" -> "leaky")) - - var conv2_1 = Symbol.Convolution()()(Map("data" -> conv1_1, "num_filter" -> 32, - "kernel" -> "(5, 5)", "pad" -> "(2, 2)", "no_bias" -> false, "workspace" -> 4096)) - conv2_1 = Symbol.BatchNorm()()(Map("data" -> conv2_1, "fix_gamma" -> false)) - conv2_1 = Symbol.LeakyReLU()()(Map("data" -> conv2_1, "act_type" -> "leaky")) - - var conv3_1 = Symbol.Convolution()()(Map("data" -> conv2_1, "num_filter" -> 64, - "kernel" -> "(3, 3)", "pad" -> "(1, 1)", "no_bias" -> false, "workspace" -> 4096)) - conv3_1 = Symbol.BatchNorm()()(Map("data" -> conv3_1, "fix_gamma" -> false)) - conv3_1 = Symbol.LeakyReLU()()(Map("data" -> conv3_1, "act_type" -> "leaky")) - - var conv4_1 = Symbol.Convolution()()(Map("data" -> conv3_1, "num_filter" -> 32, - "kernel" -> "(5, 5)", "pad" -> "(2, 2)", "no_bias" -> false, "workspace" -> 4096)) - conv4_1 = Symbol.BatchNorm()()(Map("data" -> conv4_1, "fix_gamma" -> false)) - conv4_1 = Symbol.LeakyReLU()()(Map("data" -> conv4_1, "act_type" -> "leaky")) - - var conv5_1 = Symbol.Convolution()()(Map("data" -> conv4_1, "num_filter" -> 48, - "kernel" -> "(5, 5)", "pad" -> "(2, 2)", "no_bias" -> false, "workspace" -> 4096)) - conv5_1 = 
Symbol.BatchNorm()()(Map("data" -> conv5_1, "fix_gamma" -> false)) - conv5_1 = Symbol.LeakyReLU()()(Map("data" -> conv5_1, "act_type" -> "leaky")) - - var conv6_1 = Symbol.Convolution()()(Map("data" -> conv5_1, "num_filter" -> 32, - "kernel" -> "(5, 5)", "pad" -> "(2, 2)", "no_bias" -> true, "workspace" -> 4096)) - conv6_1 = Symbol.BatchNorm()()(Map("data" -> conv6_1, "fix_gamma" -> false)) - conv6_1 = Symbol.LeakyReLU()()(Map("data" -> conv6_1, "act_type" -> "leaky")) - - var out = Symbol.Convolution()()(Map("data" -> conv6_1, "num_filter" -> 3, "kernel" -> "(3, 3)", - "pad" -> "(1, 1)", "no_bias" -> true, "workspace" -> 4096)) - out = Symbol.BatchNorm()()(Map("data" -> out, "fix_gamma" -> false)) - out = Symbol.Activation()()(Map("data" -> out, "act_type" -> "tanh")) + var conv1_1 = Conv(data, 48, 4096) + val conv2_1 = Conv(conv1_1, 32, 4096) + var conv3_1 = Conv(conv2_1, 64, 4096, (3, 3), (1, 1)) + var conv4_1 = Conv(conv3_1, 32, 4096) + var conv5_1 = Conv(conv4_1, 48, 4096) + var conv6_1 = Conv(conv5_1, 32, 4096) + var out = Symbol.api.Convolution(data = Some(conv6_1), num_filter = 3, kernel = Shape(3, 3), + pad = Some(Shape(1, 1)), no_bias = Some(true), workspace = Some(4096)) + out = Symbol.api.BatchNorm(data = Some(out), fix_gamma = Some(false)) + out = Symbol.api.Activation(data = Some(out), act_type = "tanh") val rawOut = (out * 128) + 128 - val norm = Symbol.SliceChannel()(rawOut)(Map("num_outputs" -> 3)) + val norm = Symbol.api.SliceChannel(data = Some(rawOut), num_outputs = 3) val rCh = norm.get(0) - 123.68f val gCh = norm.get(1) - 116.779f val bCh = norm.get(2) - 103.939f - val normOut = Symbol.Concat()(rCh, gCh, bCh)() * 0.4f + data * 0.6f - normOut + val normOut = Symbol.api.Concat(data = Array(rCh, gCh, bCh), num_args = 3) + normOut * 0.4f + data * 0.6f } def getModule(prefix: String, dShape: Shape, ctx: Context, isTrain: Boolean = true): Module = { @@ -99,9 +64,9 @@ object GenV4 { else (dataShape, false, false) } val mod = new Module(symbol = sym, 
context = ctx, - dataShapes = dataShapes, - initializer = new Xavier(magnitude = 2f), - forTraining = forTraining, inputsNeedGrad = inputsNeedGrad) + dataShapes = dataShapes, + initializer = new Xavier(magnitude = 2f), + forTraining = forTraining, inputsNeedGrad = inputsNeedGrad) mod } } diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/ModelVgg19.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/ModelVgg19.scala deleted file mode 100644 index 6044847be4ad..000000000000 --- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/ModelVgg19.scala +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mxnetexamples.neuralstyle.end2end - -import org.apache.mxnet.Executor -import org.apache.mxnet.NDArray -import org.apache.mxnet.Symbol - - -object ModelVgg19 { - - def getVggSymbol(prefix: String, contentOnly: Boolean = false): (Symbol, Symbol) = { - // declare symbol - val data = Symbol.Variable(s"${prefix}_data") - val conv1_1 = Symbol.Convolution(s"${prefix}_conv1_1")()(Map("data" -> data, - "num_filter" -> 64, "pad" -> "(1,1)", "kernel" -> "(3,3)", - "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) - val relu1_1 = Symbol.Activation(s"${prefix}_relu1_1")()(Map("data" -> conv1_1, - "act_type" -> "relu")) - val conv1_2 = Symbol.Convolution(s"${prefix}_conv1_2")()(Map("data" -> relu1_1, - "num_filter" -> 64, "pad" -> "(1,1)", "kernel" -> "(3,3)", - "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) - val relu1_2 = Symbol.Activation(s"${prefix}_relu1_2")()(Map("data" -> conv1_2, - "act_type" -> "relu")) - val pool1 = Symbol.Pooling(s"${prefix}_pool1")()(Map("data" -> relu1_2 , "pad" -> "(0,0)", - "kernel" -> "(2,2)", "stride" -> "(2,2)", "pool_type" -> "avg")) - val conv2_1 = Symbol.Convolution(s"${prefix}_conv2_1")()(Map("data" -> pool1, - "num_filter" -> 128, "pad" -> "(1,1)", "kernel" -> "(3,3)", - "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) - val relu2_1 = Symbol.Activation(s"${prefix}_relu2_1")()(Map("data" -> conv2_1, - "act_type" -> "relu")) - val conv2_2 = Symbol.Convolution(s"${prefix}_conv2_2")()(Map("data" -> relu2_1, - "num_filter" -> 128, "pad" -> "(1,1)", "kernel" -> "(3,3)", - "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) - val relu2_2 = Symbol.Activation(s"${prefix}_relu2_2")()(Map("data" -> conv2_2, - "act_type" -> "relu")) - val pool2 = Symbol.Pooling("pool2")()(Map("data" -> relu2_2 , "pad" -> "(0,0)", - "kernel" -> "(2,2)", "stride" -> "(2,2)", "pool_type" -> "avg")) - val conv3_1 = Symbol.Convolution(s"${prefix}_conv3_1")()(Map("data" -> pool2, - 
"num_filter" -> 256, "pad" -> "(1,1)", "kernel" -> "(3,3)", - "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) - val relu3_1 = Symbol.Activation(s"${prefix}_relu3_1")()(Map("data" -> conv3_1, - "act_type" -> "relu")) - val conv3_2 = Symbol.Convolution(s"${prefix}_conv3_2")()(Map("data" -> relu3_1, - "num_filter" -> 256, "pad" -> "(1,1)", "kernel" -> "(3,3)", - "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) - val relu3_2 = Symbol.Activation(s"${prefix}_relu3_2")()(Map("data" -> conv3_2, - "act_type" -> "relu")) - val conv3_3 = Symbol.Convolution(s"${prefix}_conv3_3")()(Map("data" -> relu3_2, - "num_filter" -> 256, "pad" -> "(1,1)", "kernel" -> "(3,3)", - "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) - val relu3_3 = Symbol.Activation(s"${prefix}_relu3_3")()(Map("data" -> conv3_3, - "act_type" -> "relu")) - val conv3_4 = Symbol.Convolution(s"${prefix}_conv3_4")()(Map("data" -> relu3_3, - "num_filter" -> 256, "pad" -> "(1,1)", "kernel" -> "(3,3)", - "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) - val relu3_4 = Symbol.Activation(s"${prefix}_relu3_4")()(Map("data" -> conv3_4 , - "act_type" -> "relu")) - val pool3 = Symbol.Pooling(s"${prefix}_pool3")()(Map("data" -> relu3_4, - "pad" -> "(0,0)", "kernel" -> "(2,2)", "stride" -> "(2,2)", - "pool_type" -> "avg")) - val conv4_1 = Symbol.Convolution(s"${prefix}_conv4_1")()(Map("data" -> pool3, - "num_filter" -> 512, "pad" -> "(1,1)", "kernel" -> "(3,3)", - "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) - val relu4_1 = Symbol.Activation(s"${prefix}_relu4_1")()(Map("data" -> conv4_1, - "act_type" -> "relu")) - val conv4_2 = Symbol.Convolution(s"${prefix}_conv4_2")()(Map("data" -> relu4_1, - "num_filter" -> 512, "pad" -> "(1,1)", "kernel" -> "(3,3)", - "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) - val relu4_2 = Symbol.Activation(s"${prefix}_relu4_2")()(Map("data" -> conv4_2, - "act_type" -> "relu")) - val conv4_3 = 
Symbol.Convolution(s"${prefix}_conv4_3")()(Map("data" -> relu4_2, - "num_filter" -> 512, "pad" -> "(1,1)", "kernel" -> "(3,3)", - "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) - val relu4_3 = Symbol.Activation(s"${prefix}_relu4_3")()(Map("data" -> conv4_3, - "act_type" -> "relu")) - val conv4_4 = Symbol.Convolution(s"${prefix}_conv4_4")()(Map("data" -> relu4_3, - "num_filter" -> 512, "pad" -> "(1,1)", "kernel" -> "(3,3)", - "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) - val relu4_4 = Symbol.Activation(s"${prefix}_relu4_4")()(Map("data" -> conv4_4, - "act_type" -> "relu")) - val pool4 = Symbol.Pooling(s"${prefix}_pool4")()(Map("data" -> relu4_4, - "pad" -> "(0,0)", "kernel" -> "(2,2)", "stride" -> "(2,2)", - "pool_type" -> "avg")) - val conv5_1 = Symbol.Convolution(s"${prefix}_conv5_1")()(Map("data" -> pool4, - "num_filter" -> 512, "pad" -> "(1,1)", "kernel" -> "(3,3)", - "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) - val relu5_1 = Symbol.Activation(s"${prefix}_relu5_1")()(Map("data" -> conv5_1, - "act_type" -> "relu")) - - // style and content layers - val style = if (contentOnly) null else Symbol.Group(relu1_1, relu2_1, relu3_1, relu4_1, relu5_1) - val content = Symbol.Group(relu4_2) - (style, content) - } -} diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/Module.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/Module.scala index d681b16c5af8..1d11f8864063 100644 --- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/Module.scala +++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/Module.scala @@ -17,20 +17,9 @@ package org.apache.mxnetexamples.neuralstyle.end2end -import org.apache.mxnet.Context +import org.apache.mxnet.{Context, Initializer, NDArray, Optimizer, Shape, Symbol, Uniform} import org.slf4j.LoggerFactory -import org.apache.mxnet.Symbol 
-import org.apache.mxnet.NDArray -import org.apache.mxnet.Optimizer -import org.apache.mxnet.Executor -import org.apache.mxnet.Shape -import org.apache.mxnet.Uniform -import org.apache.mxnet.Initializer -import org.apache.mxnet.DataBatch - -/** - * @author Depeng Liang - */ + class Module(symbol: Symbol, context: Context, dataShapes: Map[String, Shape], diff --git a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/gan/GanExampleSuite.scala b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/gan/GanExampleSuite.scala index 8ab3a4b364a7..96820ce4e983 100644 --- a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/gan/GanExampleSuite.scala +++ b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/gan/GanExampleSuite.scala @@ -18,41 +18,38 @@ package org.apache.mxnetexamples.gan import java.io.File -import java.net.URL - -import org.apache.commons.io.FileUtils import org.apache.mxnet.Context import org.apache.mxnetexamples.Util -import org.scalatest.{BeforeAndAfterAll, FunSuite} +import org.scalatest.{BeforeAndAfterAll, FunSuite, Ignore} import org.slf4j.LoggerFactory import scala.sys.process.Process +@Ignore class GanExampleSuite extends FunSuite with BeforeAndAfterAll{ private val logger = LoggerFactory.getLogger(classOf[GanExampleSuite]) test("Example CI: Test GAN MNIST") { - if (System.getenv().containsKey("SCALA_TEST_ON_GPU") && - System.getenv("SCALA_TEST_ON_GPU").toInt == 1) { - logger.info("Downloading mnist model") - val baseUrl = "https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci" - val tempDirPath = System.getProperty("java.io.tmpdir") - val modelDirPath = tempDirPath + File.separator + "mnist/" - logger.info("tempDirPath: %s".format(tempDirPath)) - Util.downloadUrl(baseUrl + "/mnist/mnist.zip", - tempDirPath + "/mnist/mnist.zip") - // TODO: Need to confirm with Windows - Process("unzip " + tempDirPath + "/mnist/mnist.zip -d " - + tempDirPath + "/mnist/") ! 
- - val context = Context.gpu() - - val output = GanMnist.runTraining(modelDirPath, context, modelDirPath, 5) - Process("rm -rf " + modelDirPath) ! - - assert(output >= 0.0f) - } else { - logger.info("GPU test only, skipped...") - } + if (System.getenv().containsKey("SCALA_TEST_ON_GPU") && + System.getenv("SCALA_TEST_ON_GPU").toInt == 1) { + logger.info("Downloading mnist model") + val baseUrl = "https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci" + val tempDirPath = System.getProperty("java.io.tmpdir") + val modelDirPath = tempDirPath + File.separator + "mnist/" + logger.info("tempDirPath: %s".format(tempDirPath)) + Util.downloadUrl(baseUrl + "/mnist/mnist.zip", tempDirPath + "/mnist/mnist.zip") + // TODO: Need to confirm with Windows + Process("unzip " + tempDirPath + "/mnist/mnist.zip -d " + + tempDirPath + "/mnist/") ! + + val context = Context.gpu() + + val output = GanMnist.runTraining(modelDirPath, context, modelDirPath, 5) + Process("rm -rf " + modelDirPath) ! + + assert(output >= 0.0f) + } else { + logger.info("GPU test only, skipped...") + } } } diff --git a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/imclassification/MNISTExampleSuite.scala b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/imclassification/MNISTExampleSuite.scala index 7b1d6ddc38b5..0fd3af02d9cf 100644 --- a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/imclassification/MNISTExampleSuite.scala +++ b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/imclassification/MNISTExampleSuite.scala @@ -29,8 +29,7 @@ import org.slf4j.LoggerFactory import scala.sys.process.Process /** - * Integration test for imageClassifier example. - * This will run as a part of "make scalatest" + * Integration test for MNIST example. 
*/ class MNISTExampleSuite extends FunSuite with BeforeAndAfterAll { private val logger = LoggerFactory.getLogger(classOf[MNISTExampleSuite]) diff --git a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/neuralstyle/NeuralStyleSuite.scala b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/neuralstyle/NeuralStyleSuite.scala new file mode 100644 index 000000000000..dc8fc5b8c14d --- /dev/null +++ b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/neuralstyle/NeuralStyleSuite.scala @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.mxnetexamples.neuralstyle + +import org.apache.mxnet.Context +import org.apache.mxnetexamples.Util +import org.apache.mxnetexamples.neuralstyle.end2end.{BoostInference, BoostTrain} +import org.scalatest.{BeforeAndAfterAll, FunSuite} +import org.slf4j.LoggerFactory + +import scala.sys.process.Process + +/** + * Neural Suite Test package + * Currently there is no plan to run to test accuracy + * This test is just to verify the model is runnable + */ +class NeuralStyleSuite extends FunSuite with BeforeAndAfterAll { + private val logger = LoggerFactory.getLogger(classOf[NeuralStyleSuite]) + + + override def beforeAll(): Unit = { + logger.info("Downloading vgg model") + val tempDirPath = System.getProperty("java.io.tmpdir") + logger.info("tempDirPath: %s".format(tempDirPath)) + val baseUrl = "https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/NeuralStyle/" + Util.downloadUrl(baseUrl + "IMG_4343.jpg", tempDirPath + "/NS/IMG_4343.jpg") + Util.downloadUrl(baseUrl + "starry_night.jpg", tempDirPath + "/NS/starry_night.jpg") + Util.downloadUrl(baseUrl + "model.zip", tempDirPath + "/NS/model.zip") + Util.downloadUrl(baseUrl + "vgg19.params", tempDirPath + "/NS/vgg19.params") + // TODO: Need to confirm with Windows + Process(s"unzip $tempDirPath/NS/model.zip -d $tempDirPath/NS/") ! + + Process(s"mkdir $tempDirPath/NS/images") ! + + for (i <- 0 until 20) { + Process(s"cp $tempDirPath/NS/IMG_4343.jpg $tempDirPath/NS/images/img$i.jpg") ! 
+ } + } + + test("Example CI: Test Boost Inference") { + val tempDirPath = System.getProperty("java.io.tmpdir") + var ctx = Context.cpu() + if (System.getenv().containsKey("SCALA_TEST_ON_GPU") && + System.getenv("SCALA_TEST_ON_GPU").toInt == 1) { + ctx = Context.gpu() + } + BoostInference.runInference(tempDirPath + "/NS/model", tempDirPath + "/NS", 2, + tempDirPath + "/NS/IMG_4343.jpg", ctx) + } + + test("Example CI: Test Boost Training") { + val tempDirPath = System.getProperty("java.io.tmpdir") + if (System.getenv().containsKey("SCALA_TEST_ON_GPU") && + System.getenv("SCALA_TEST_ON_GPU").toInt == 1) { + val ctx = Context.gpu() + BoostTrain.runTraining(tempDirPath + "/NS/images", tempDirPath + "/NS/vgg19.params", ctx, + tempDirPath + "/NS/starry_night.jpg", tempDirPath + "/NS") + } else { + logger.info("GPU test only, skip CPU...") + } + } + + test("Example CI: Test Neural Style") { + val tempDirPath = System.getProperty("java.io.tmpdir") + if (System.getenv().containsKey("SCALA_TEST_ON_GPU") && + System.getenv("SCALA_TEST_ON_GPU").toInt == 1) { + val ctx = Context.gpu() + NeuralStyle.runTraining("vgg19", tempDirPath + "/NS/IMG_4343.jpg", + tempDirPath + "/NS/starry_night.jpg", + ctx, tempDirPath + "/NS/vgg19.params", tempDirPath + "/NS", + 1f, 20f, 0.01f, 1, 10f, 60, 600, 50, 0.0005f) + } else { + logger.info("GPU test only, skip CPU") + } + } +} From 9c3fd019a4349f7ae359c9d73761e70c12b7ebbd Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Wed, 1 Aug 2018 11:15:34 -0700 Subject: [PATCH 19/40] Update submodule mshadow --- 3rdparty/mshadow | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/mshadow b/3rdparty/mshadow index 463c0dffe3ea..d68d3694fdfb 160000 --- a/3rdparty/mshadow +++ b/3rdparty/mshadow @@ -1 +1 @@ -Subproject commit 463c0dffe3eae8c39caf7989c85b7244823df27e +Subproject commit d68d3694fdfb44fdbb7c840c3591131ff2310a59 From 6124ad87c3bdb317a05a633c4c89622afb82856d Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Wed, 1 Aug 2018 21:50:32 
-0700 Subject: [PATCH 20/40] Fix compilation warning --- src/operator/channel_op_common.h | 4 ++-- src/operator/contrib/count_sketch-inl.h | 2 +- src/operator/contrib/deformable_convolution-inl.h | 4 ++-- src/operator/contrib/fft-inl.h | 2 +- src/operator/contrib/ifft-inl.h | 2 +- src/operator/contrib/sync_batch_norm-inl.h | 4 ++-- src/operator/custom/custom.cc | 4 ++-- src/operator/custom/native_op-inl.h | 4 ++-- src/operator/nn/batch_norm.cc | 2 +- src/operator/nn/convolution.cc | 2 +- src/operator/nn/deconvolution-inl.h | 4 ++-- src/operator/nn/deconvolution.cc | 2 +- src/operator/nn/lrn.cc | 2 +- src/operator/nn/pooling-inl.h | 2 +- src/operator/nn/upsampling.cc | 2 +- src/operator/softmax_output-inl.h | 8 ++++---- 16 files changed, 25 insertions(+), 25 deletions(-) diff --git a/src/operator/channel_op_common.h b/src/operator/channel_op_common.h index 00cd8ae084bb..1afc13ad2594 100644 --- a/src/operator/channel_op_common.h +++ b/src/operator/channel_op_common.h @@ -44,7 +44,7 @@ inline void concatenate_helper(const std::vector out = *output; size_t size = input.size(); index_t begin = 0; - for (index_t i = 0; i < size; ++i) { + for (size_t i = 0; i < size; ++i) { index_t end = begin + input[i].size(cdim); Assign(slice(out, begin, end), req, input[i]); begin = end; @@ -79,7 +79,7 @@ void split_helper(const mshadow::Tensor &input, std::vector > out = *output; size_t size = out.size(); index_t begin = 0; - for (index_t i = 0; i < size; ++i) { + for (size_t i = 0; i < size; ++i) { index_t end = begin + out[i].size(cdim); Assign(out[i], req[i], slice(input, begin, end)); begin = end; diff --git a/src/operator/contrib/count_sketch-inl.h b/src/operator/contrib/count_sketch-inl.h index 76d1a7efb876..dd3bf54ab6a6 100644 --- a/src/operator/contrib/count_sketch-inl.h +++ b/src/operator/contrib/count_sketch-inl.h @@ -185,7 +185,7 @@ class CountSketchProp : public OperatorProperty { CHECK_GE(in_type->size(), 1); int dtype = (*in_type)[0]; CHECK_NE(dtype, -1) << "First input 
must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { + for (size_t i = 0; i < in_type->size(); ++i) { if ((*in_type)[i] == -1) { (*in_type)[i] = dtype; } else { diff --git a/src/operator/contrib/deformable_convolution-inl.h b/src/operator/contrib/deformable_convolution-inl.h index 480f675bdbff..7328eb38308f 100644 --- a/src/operator/contrib/deformable_convolution-inl.h +++ b/src/operator/contrib/deformable_convolution-inl.h @@ -129,7 +129,7 @@ class DeformableConvolutionOp : public Operator { // calculate the shape of col_buffer TShape col_buffer_shape(num_spatial_axes_ + 1); col_buffer_shape[0] = conv_in_channels_ * param_.kernel.Size(); - for (index_t i = 1; i < col_buffer_shape.ndim(); ++i) { + for (size_t i = 1; i < col_buffer_shape.ndim(); ++i) { col_buffer_shape[i] = out_data[0].shape_[i + 1]; } // create a column buffer using workspace and col_buffer_shape @@ -453,7 +453,7 @@ class DeformableConvolutionProp : public OperatorProperty { CHECK_GE(in_type->size(), 1U); int dtype = (*in_type)[0]; CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { + for (size_t i = 0; i < in_type->size(); ++i) { if ((*in_type)[i] == -1) { (*in_type)[i] = dtype; } else { diff --git a/src/operator/contrib/fft-inl.h b/src/operator/contrib/fft-inl.h index be7b64aeb0c6..c5c8574f19e7 100644 --- a/src/operator/contrib/fft-inl.h +++ b/src/operator/contrib/fft-inl.h @@ -258,7 +258,7 @@ class FFTProp : public OperatorProperty { CHECK_GE(in_type->size(), 1); int dtype = (*in_type)[0]; CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { + for (size_t i = 0; i < in_type->size(); ++i) { if ((*in_type)[i] == -1) { (*in_type)[i] = dtype; } else { diff --git a/src/operator/contrib/ifft-inl.h b/src/operator/contrib/ifft-inl.h index e48d653d9274..da560c3c5178 100644 --- a/src/operator/contrib/ifft-inl.h +++ b/src/operator/contrib/ifft-inl.h @@ -250,7 
+250,7 @@ class IFFTProp : public OperatorProperty { CHECK_GE(in_type->size(), 1); int dtype = (*in_type)[0]; CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i=0; i < in_type->size(); ++i) { + for (size_t i=0; i < in_type->size(); ++i) { if ((*in_type)[i] == -1) { (*in_type)[i] = dtype; } else { diff --git a/src/operator/contrib/sync_batch_norm-inl.h b/src/operator/contrib/sync_batch_norm-inl.h index 1f548dbc7e5e..78f1c09dfe03 100644 --- a/src/operator/contrib/sync_batch_norm-inl.h +++ b/src/operator/contrib/sync_batch_norm-inl.h @@ -500,14 +500,14 @@ class SyncBatchNormProp : public OperatorProperty { // For other input types, these parameters have the same type as input // NOTE: This requirement is from cuDNN (v. 4 and 5) int dtype_param = (dtype == kFloat16) ? kFloat32 : dtype; - for (index_t i = 1; i < in_type->size(); ++i) { + for (size_t i = 1; i < in_type->size(); ++i) { if ((*in_type)[i] == -1) { (*in_type)[i] = dtype_param; } else { UNIFORM_TYPE_CHECK((*in_type)[i], dtype_param, ListArguments()[i]); } } - for (index_t i = 0; i < aux_type->size(); ++i) { + for (size_t i = 0; i < aux_type->size(); ++i) { if ((*aux_type)[i] != -1) { UNIFORM_TYPE_CHECK((*aux_type)[i], dtype_param, ListArguments()[i]); } diff --git a/src/operator/custom/custom.cc b/src/operator/custom/custom.cc index 11e5e36dbeb0..c6ae61feb2c0 100644 --- a/src/operator/custom/custom.cc +++ b/src/operator/custom/custom.cc @@ -238,14 +238,14 @@ std::vector Gradient( } std::vector ret; - for (index_t i = 0; i < params.num_args; ++i) { + for (size_t i = 0; i < params.num_args; ++i) { ret.emplace_back(nnvm::NodeEntry{g, static_cast(i), 0}); } if (params.num_auxs) { nnvm::NodePtr ng = nnvm::Node::Create(); ng->attrs.op = nnvm::Op::Get("_NoGradient"); ng->attrs.name = "NoGradient"; - for (index_t i = 0; i < params.num_auxs; ++i) { + for (size_t i = 0; i < params.num_auxs; ++i) { ret.emplace_back(nnvm::NodeEntry{ng, 0, 0}); } } diff --git 
a/src/operator/custom/native_op-inl.h b/src/operator/custom/native_op-inl.h index d2fb1149f7b5..f2eca05e78f0 100644 --- a/src/operator/custom/native_op-inl.h +++ b/src/operator/custom/native_op-inl.h @@ -77,7 +77,7 @@ class NativeOp : public Operator { s->Wait(); param_.pinfo->forward(ptrs.size(), ptrs.data(), ndims.data(), shapes.data(), tags.data(), param_.pinfo->p_forward); - for (index_t i = 0; i < out_data.size(); ++i) { + for (size_t i = 0; i < out_data.size(); ++i) { CHECK_NE(req[i], kAddTo) << "NativeOp doesn't support AddTo for output"; if (req[i] != kNullOp) { std::stringstream ss; @@ -111,7 +111,7 @@ class NativeOp : public Operator { s->Wait(); param_.pinfo->backward(ptrs.size(), ptrs.data(), ndims.data(), shapes.data(), tags.data(), param_.pinfo->p_backward); - for (index_t i = 0; i < in_grad.size(); ++i) { + for (size_t i = 0; i < in_grad.size(); ++i) { CHECK_NE(req[i], kAddTo) << "NativeOp doesn't support AddTo for output"; if (req[i] != kNullOp) { std::stringstream ss; diff --git a/src/operator/nn/batch_norm.cc b/src/operator/nn/batch_norm.cc index 30fb665dd05a..c11f98026865 100644 --- a/src/operator/nn/batch_norm.cc +++ b/src/operator/nn/batch_norm.cc @@ -362,7 +362,7 @@ static bool BatchNormType(const nnvm::NodeAttrs& attrs, dtype_param = mshadow::DataType::kFlag; }); std::vector args{"data", "gamma", "beta", "mean", "var"}; CHECK_LE(in_type->size(), args.size()); - for (index_t i = 1; i < in_type->size(); ++i) { + for (size_t i = 1; i < in_type->size(); ++i) { if ((*in_type)[i] == -1) { (*in_type)[i] = dtype_param; } else { diff --git a/src/operator/nn/convolution.cc b/src/operator/nn/convolution.cc index ef70ccd6ec1e..42a4ed240587 100644 --- a/src/operator/nn/convolution.cc +++ b/src/operator/nn/convolution.cc @@ -276,7 +276,7 @@ static bool ConvolutionType(const nnvm::NodeAttrs& attrs, CHECK_GE(in_type->size(), 1U); int dtype = (*in_type)[0]; CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < 
in_type->size(); ++i) { + for (size_t i = 0; i < in_type->size(); ++i) { if ((*in_type)[i] == -1) { (*in_type)[i] = dtype; } else { diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index 027777ce5e0c..b627fc441ffa 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -130,7 +130,7 @@ struct DeconvolutionParam : public dmlc::Parameter { if (bCal) { size_t input_ndim = input.ndim(); - for (index_t i = 0; i < ndim; i++) { + for (size_t i = 0; i < ndim; i++) { // input.ndim() can be larger than ndim, in case that the complete input // shape was passed and not only the ndim last ones o_pad[i] = stride[i] * (input[(input_ndim - ndim) + i] - 1) + DilatedKernelSize(i); @@ -140,7 +140,7 @@ struct DeconvolutionParam : public dmlc::Parameter { o_pad[i] = (o_pad[i] + 1) / 2; } } else { - for (index_t i = 0; i < ndim; i++) { + for (size_t i = 0; i < ndim; i++) { o_pad[i] = pad[i]; o_adj[i] = adj[i]; } diff --git a/src/operator/nn/deconvolution.cc b/src/operator/nn/deconvolution.cc index 9e0a70121bf9..8bbcc6780faf 100644 --- a/src/operator/nn/deconvolution.cc +++ b/src/operator/nn/deconvolution.cc @@ -244,7 +244,7 @@ static bool DeconvolutionType(const nnvm::NodeAttrs& attrs, CHECK_GE(in_type->size(), 1U); int dtype = (*in_type)[0]; CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { + for (size_t i = 0; i < in_type->size(); ++i) { if ((*in_type)[i] == -1) { (*in_type)[i] = dtype; } else { diff --git a/src/operator/nn/lrn.cc b/src/operator/nn/lrn.cc index 6b3d7c818378..056dbaa53621 100644 --- a/src/operator/nn/lrn.cc +++ b/src/operator/nn/lrn.cc @@ -56,7 +56,7 @@ bool LRNType(const nnvm::NodeAttrs& attrs, CHECK_GE(in_type->size(), 1U); int dtype = (*in_type)[0]; CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { + for (size_t i = 0; i < in_type->size(); ++i) { if ((*in_type)[i] == 
-1) { (*in_type)[i] = dtype; } else { diff --git a/src/operator/nn/pooling-inl.h b/src/operator/nn/pooling-inl.h index ad74a8feae39..d2a370cf6505 100644 --- a/src/operator/nn/pooling-inl.h +++ b/src/operator/nn/pooling-inl.h @@ -258,7 +258,7 @@ void PoolingCompute(const nnvm::NodeAttrs& attrs, const std::vector& outputs) { const PoolingParam& param = nnvm::get(attrs.parsed); CHECK_EQ(inputs.size(), 1U); - CHECK_EQ(outputs.size(), GetNumOutputs(param)); + CHECK_EQ(outputs.size(), static_cast(GetNumOutputs(param)); if (!param.global_pool) { // check if filter size assigned correctly CHECK_GT(param.kernel.ndim(), 0U) diff --git a/src/operator/nn/upsampling.cc b/src/operator/nn/upsampling.cc index 5aa111e26f75..b6b3d873df7d 100644 --- a/src/operator/nn/upsampling.cc +++ b/src/operator/nn/upsampling.cc @@ -92,7 +92,7 @@ static bool UpSamplingType(const nnvm::NodeAttrs& attrs, CHECK_GE(in_type->size(), 1U); int dtype = (*in_type)[0]; CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { + for (size_t i = 0; i < in_type->size(); ++i) { if ((*in_type)[i] == -1) { (*in_type)[i] = dtype; } else { diff --git a/src/operator/softmax_output-inl.h b/src/operator/softmax_output-inl.h index 9a4db2c9694a..06017853b078 100644 --- a/src/operator/softmax_output-inl.h +++ b/src/operator/softmax_output-inl.h @@ -185,8 +185,8 @@ class SoftmaxOutputOp : public Operator { ctx.requested[softmaxout_enum::kTempSpace].get_host_space_typed<2, DType>( label.shape_); Copy(workspace, label, label.stream_); - for (index_t i = 0; i < workspace.size(0); ++i) { - for (index_t j = 0; j < workspace.size(1); ++j) { + for (size_t i = 0; i < workspace.size(0); ++i) { + for (size_t j = 0; j < workspace.size(1); ++j) { if (static_cast(workspace[i][j]) == i_label) { valid_cnt--; } @@ -245,7 +245,7 @@ class SoftmaxOutputOp : public Operator { ctx.requested[softmaxout_enum::kTempSpace].get_host_space_typed<1, DType>( label.shape_); Copy(workspace, label, 
label.stream_); - for (index_t i = 0; i < label.size(0); ++i) { + for (size_t i = 0; i < label.size(0); ++i) { if (static_cast(workspace[i]) == i_label) { valid_cnt--; } @@ -333,7 +333,7 @@ class SoftmaxOutputProp : public OperatorProperty { CHECK_GE(in_type->size(), 1U); int dtype = (*in_type)[0]; CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { + for (size_t i = 0; i < in_type->size(); ++i) { if ((*in_type)[i] == -1) { (*in_type)[i] = dtype; } else { From a76bb206df5ad0f40ecb023b4491df71a242a814 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Wed, 1 Aug 2018 23:19:10 -0700 Subject: [PATCH 21/40] Fix compilation warning --- src/executor/graph_executor.cc | 2 +- src/operator/nn/pooling-inl.h | 6 +++--- src/operator/nn/pooling.cc | 12 ++++++------ src/operator/sequence_last-inl.h | 2 +- src/operator/sequence_mask-inl.h | 2 +- src/operator/sequence_reverse-inl.h | 2 +- src/operator/softmax_output-inl.h | 6 +++--- src/operator/svm_output-inl.h | 2 +- src/operator/tensor/matrix_op-inl.h | 6 +++--- 9 files changed, 20 insertions(+), 20 deletions(-) diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index 7386de4d12e3..526d307c9940 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -1576,7 +1576,7 @@ void GraphExecutor::ExecuteMonCallback(size_t nid) { } } CHECK_EQ(opnode.exec->out_array.size(), output_names.size()); - for (index_t i = 0; i < opnode.exec->out_array.size(); ++i) { + for (size_t i = 0; i < opnode.exec->out_array.size(); ++i) { NDArray *cpy = new NDArray(opnode.exec->out_array[i]); std::string name = inode.source->attrs.name + "_" + output_names[i]; this->monitor_callback_(name.c_str(), reinterpret_cast(cpy)); diff --git a/src/operator/nn/pooling-inl.h b/src/operator/nn/pooling-inl.h index d2a370cf6505..38fd7fb4296b 100644 --- a/src/operator/nn/pooling-inl.h +++ b/src/operator/nn/pooling-inl.h @@ -135,8 +135,8 @@ namespace op { * When 
MKLDNN is enabled, we might want 2 outputs instead of one inputs, which * also changes the number of inputs for backward. */ -int GetNumOutputs(const PoolingParam ¶m); -int GetNumBackInputs(const PoolingParam ¶m); +size_t GetNumOutputs(const PoolingParam ¶m); +size_t GetNumBackInputs(const PoolingParam ¶m); template class PoolingOp { @@ -258,7 +258,7 @@ void PoolingCompute(const nnvm::NodeAttrs& attrs, const std::vector& outputs) { const PoolingParam& param = nnvm::get(attrs.parsed); CHECK_EQ(inputs.size(), 1U); - CHECK_EQ(outputs.size(), static_cast(GetNumOutputs(param)); + CHECK_EQ(outputs.size(), GetNumOutputs(param)); if (!param.global_pool) { // check if filter size assigned correctly CHECK_GT(param.kernel.ndim(), 0U) diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc index 9b6996d0feb0..7982c0c3bd3e 100644 --- a/src/operator/nn/pooling.cc +++ b/src/operator/nn/pooling.cc @@ -57,19 +57,19 @@ void PoolingParamParser(nnvm::NodeAttrs *attrs) { attrs->parsed = std::move(param); } -int GetNumOutputs(const PoolingParam ¶m) { +size_t GetNumOutputs(const PoolingParam ¶m) { #if MXNET_USE_MKLDNN == 1 - return MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param) ? 2 : 1; + return MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param) ? 2U : 1U; #else - return 1; + return 1U; #endif } -int GetNumBackInputs(const PoolingParam ¶m) { +size_t GetNumBackInputs(const PoolingParam ¶m) { #if MXNET_USE_MKLDNN == 1 - return MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param) ? 5 : 3; + return MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param) ? 5U : 3U; #else - return 3; + return 3U; #endif } diff --git a/src/operator/sequence_last-inl.h b/src/operator/sequence_last-inl.h index 58562862a4e0..1a59473cfc3a 100644 --- a/src/operator/sequence_last-inl.h +++ b/src/operator/sequence_last-inl.h @@ -278,7 +278,7 @@ class SequenceLastProp : public OperatorProperty { CHECK_GE(in_type->size(), param_.use_sequence_length ? 
2U : 1U); int dtype = (*in_type)[0]; CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { + for (size_t i = 0; i < in_type->size(); ++i) { if ((*in_type)[i] == -1) { (*in_type)[i] = dtype; } else { diff --git a/src/operator/sequence_mask-inl.h b/src/operator/sequence_mask-inl.h index a34cea04965e..c93ffb5f17b6 100644 --- a/src/operator/sequence_mask-inl.h +++ b/src/operator/sequence_mask-inl.h @@ -267,7 +267,7 @@ class SequenceMaskProp : public OperatorProperty { CHECK_GE(in_type->size(), param_.use_sequence_length ? 2U : 1U); int dtype = (*in_type)[0]; CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { + for (size_t i = 0; i < in_type->size(); ++i) { if ((*in_type)[i] == -1) { (*in_type)[i] = dtype; } else { diff --git a/src/operator/sequence_reverse-inl.h b/src/operator/sequence_reverse-inl.h index 943ca6e933c9..5c48729e18ff 100644 --- a/src/operator/sequence_reverse-inl.h +++ b/src/operator/sequence_reverse-inl.h @@ -246,7 +246,7 @@ class SequenceReverseProp : public OperatorProperty { CHECK_GE(in_type->size(), param_.use_sequence_length ? 
2U : 1U); int dtype = (*in_type)[0]; CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { + for (size_t i = 0; i < in_type->size(); ++i) { if ((*in_type)[i] == -1) { (*in_type)[i] = dtype; } else { diff --git a/src/operator/softmax_output-inl.h b/src/operator/softmax_output-inl.h index 06017853b078..fec321b97e4c 100644 --- a/src/operator/softmax_output-inl.h +++ b/src/operator/softmax_output-inl.h @@ -185,8 +185,8 @@ class SoftmaxOutputOp : public Operator { ctx.requested[softmaxout_enum::kTempSpace].get_host_space_typed<2, DType>( label.shape_); Copy(workspace, label, label.stream_); - for (size_t i = 0; i < workspace.size(0); ++i) { - for (size_t j = 0; j < workspace.size(1); ++j) { + for (index_t i = 0; i < workspace.size(0); ++i) { + for (index_t j = 0; j < workspace.size(1); ++j) { if (static_cast(workspace[i][j]) == i_label) { valid_cnt--; } @@ -245,7 +245,7 @@ class SoftmaxOutputOp : public Operator { ctx.requested[softmaxout_enum::kTempSpace].get_host_space_typed<1, DType>( label.shape_); Copy(workspace, label, label.stream_); - for (size_t i = 0; i < label.size(0); ++i) { + for (index_t i = 0; i < label.size(0); ++i) { if (static_cast(workspace[i]) == i_label) { valid_cnt--; } diff --git a/src/operator/svm_output-inl.h b/src/operator/svm_output-inl.h index 9ae0ced7a74a..011b9ad10284 100644 --- a/src/operator/svm_output-inl.h +++ b/src/operator/svm_output-inl.h @@ -159,7 +159,7 @@ class SVMOutputProp : public OperatorProperty { CHECK_GE(in_type->size(), 1U); int dtype = (*in_type)[0]; CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { + for (size_t i = 0; i < in_type->size(); ++i) { if ((*in_type)[i] == -1) { (*in_type)[i] = dtype; } else { diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h index eec920555ed1..05a60eb345b3 100644 --- a/src/operator/tensor/matrix_op-inl.h +++ 
b/src/operator/tensor/matrix_op-inl.h @@ -86,7 +86,7 @@ inline TShape InferReshapeShape(const nnvm::Tuple& shape, } auto dshape_len = dshape_vec.size(); auto params_len = param_shape_vec.size(); - for (index_t i = 0; i < params_len; ++i) { + for (size_t i = 0; i < params_len; ++i) { IType proposed_dim = param_shape_vec[i]; if (proposed_dim == 0) { // keep same @@ -2061,7 +2061,7 @@ void StackOpForward(const nnvm::NodeAttrs& attrs, Shape<3> oshape = Shape3(leading, mid, trailing); out = outputs[0].get_with_shape(oshape, s); - for (index_t i = 0; i < inputs.size(); ++i) { + for (size_t i = 0; i < inputs.size(); ++i) { Shape<3> dshape = Shape3(leading, 1, trailing); data[i] = inputs[i].get_with_shape(dshape, s); } @@ -2095,7 +2095,7 @@ void StackOpBackward(const nnvm::NodeAttrs& attrs, Shape<3> oshape = Shape3(leading, mid, trailing); grad = inputs[0].get_with_shape(oshape, s); - for (index_t i = 0; i < outputs.size(); ++i) { + for (size_t i = 0; i < outputs.size(); ++i) { Shape<3> dshape = Shape3(leading, 1, trailing); grad_in[i] = outputs[i].get_with_shape(dshape, s); } From 8e39744e0cd0e1e1ec1254ceb9b0020be13d05a0 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Thu, 2 Aug 2018 11:06:12 -0700 Subject: [PATCH 22/40] Change index variable type to size_t --- src/io/image_iter_common.h | 4 ++-- src/ndarray/ndarray.cc | 12 ++++++------ src/operator/batch_norm_v1-inl.h | 4 ++-- src/operator/rnn-inl.h | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/io/image_iter_common.h b/src/io/image_iter_common.h index 8580ff8f9f9c..07e342579ec8 100644 --- a/src/io/image_iter_common.h +++ b/src/io/image_iter_common.h @@ -42,7 +42,7 @@ class ImageLabelMap { * \param label_width predefined label_width */ explicit ImageLabelMap(const char *path_imglist, - mshadow::index_t label_width, + index_t label_width, bool silent) { this->label_width = label_width; image_index_.clear(); @@ -58,7 +58,7 @@ class ImageLabelMap { // skip space while (isspace(*p) && p != end) ++p; 
image_index_.push_back(static_cast(atol(p))); - for (size_t i = 0; i < label_width; ++i) { + for (index_t i = 0; i < label_width; ++i) { // skip till space while (!isspace(*p) && p != end) ++p; // skip space diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 853838a87f4c..335587735084 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -2077,10 +2077,10 @@ void Imdecode(NDArray *ret, NDArray mean, size_t index, if (mean.is_none()) { MSHADOW_TYPE_SWITCH(buff.dtype(), DType, { mshadow::Tensor tensor = buff.data().get(); - for (index_t i = 0; i < y1-y0; i++) { + for (size_t i = 0; i < y1-y0; i++) { uchar* im_data = res.ptr(y0+i) + res.channels()*x0; - for (index_t j = 0; j < x1-x0; j++) { - for (index_t k = 0; k < n_channels; k++) { + for (size_t j = 0; j < x1-x0; j++) { + for (size_t k = 0; k < n_channels; k++) { tensor[0][k][i][j] = DType(im_data[k]); // NOLINT(*) } im_data += res.channels(); @@ -2097,10 +2097,10 @@ void Imdecode(NDArray *ret, NDArray mean, size_t index, MSHADOW_TYPE_SWITCH(buff.dtype(), DType, { mshadow::Tensor tensor = buff.data().get(); mshadow::Tensor tmean = mean.data().get(); - for (index_t i = 0; i < y1-y0; i++) { + for (size_t i = 0; i < y1-y0; i++) { uchar* im_data = res.ptr(y0+i) + res.channels()*x0; - for (index_t j = 0; j < x1-x0; j++) { - for (index_t k = 0; k < n_channels; k++) { + for (size_t j = 0; j < x1-x0; j++) { + for (size_t k = 0; k < n_channels; k++) { tensor[0][k][i][j] = DType(im_data[k]) - tmean[k][i][j]; // NOLINT(*) } im_data += res.channels(); diff --git a/src/operator/batch_norm_v1-inl.h b/src/operator/batch_norm_v1-inl.h index 1e048452275c..f4116e30186e 100644 --- a/src/operator/batch_norm_v1-inl.h +++ b/src/operator/batch_norm_v1-inl.h @@ -286,14 +286,14 @@ class BatchNormV1Prop : public OperatorProperty { // For other input types, these parameters have the same type as input // NOTE: This requirement is from cuDNN (v. 4 and 5) int dtype_param = (dtype == kFloat16) ? 
kFloat32 : dtype; - for (index_t i = 1; i < in_type->size(); ++i) { + for (size_t i = 1; i < in_type->size(); ++i) { if ((*in_type)[i] == -1) { (*in_type)[i] = dtype_param; } else { UNIFORM_TYPE_CHECK((*in_type)[i], dtype_param, ListArguments()[i]); } } - for (index_t i = 0; i < aux_type->size(); ++i) { + for (size_t i = 0; i < aux_type->size(); ++i) { if ((*aux_type)[i] != -1) { UNIFORM_TYPE_CHECK((*aux_type)[i], dtype_param, ListArguments()[i]); } diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 1f905eda4a92..c511cef515bb 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -688,7 +688,7 @@ class RNNProp : public OperatorProperty { CHECK_GE(in_type->size(), 1U); int dtype = (*in_type)[0]; CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { + for (size_t i = 0; i < in_type->size(); ++i) { if ((*in_type)[i] == -1) { (*in_type)[i] = dtype; } else { From 53d5196146f5449ba61f8a656ce33f82bf86f734 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Thu, 2 Aug 2018 12:23:25 -0700 Subject: [PATCH 23/40] Change temp_size type from size_t to index_t --- src/ndarray/ndarray_function.cc | 2 +- src/operator/convolution_v1-inl.h | 2 +- src/operator/nn/pooling-inl.h | 4 ++-- src/operator/nn/pooling.cc | 12 ++++++------ src/operator/tensor/ordering_op-inl.h | 15 +++++++++------ 5 files changed, 19 insertions(+), 16 deletions(-) diff --git a/src/ndarray/ndarray_function.cc b/src/ndarray/ndarray_function.cc index 022302aca403..43295d6e1014 100644 --- a/src/ndarray/ndarray_function.cc +++ b/src/ndarray/ndarray_function.cc @@ -92,7 +92,7 @@ void ElementwiseSumRspImpl(mshadow::Stream* s, auto out_value_cur_row = out_values[irow]; const auto offset = row_idx_ptr - nd_indices_start; auto nd_value_cur_row = nd_values[offset]; - for (size_t j = 0; j < nd_value_cur_row.shape_[0]; ++j) { + for (index_t j = 0; j < nd_value_cur_row.shape_[0]; ++j) { out_value_cur_row[j] += nd_value_cur_row[j]; } ++irow; 
diff --git a/src/operator/convolution_v1-inl.h b/src/operator/convolution_v1-inl.h index fcb24915eb75..abda1ca6bee0 100644 --- a/src/operator/convolution_v1-inl.h +++ b/src/operator/convolution_v1-inl.h @@ -502,7 +502,7 @@ class ConvolutionV1Prop : public OperatorProperty { CHECK_GE(in_type->size(), 1); int dtype = (*in_type)[0]; CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { + for (size_t i = 0; i < in_type->size(); ++i) { if ((*in_type)[i] == -1) { (*in_type)[i] = dtype; } else { diff --git a/src/operator/nn/pooling-inl.h b/src/operator/nn/pooling-inl.h index 38fd7fb4296b..ad74a8feae39 100644 --- a/src/operator/nn/pooling-inl.h +++ b/src/operator/nn/pooling-inl.h @@ -135,8 +135,8 @@ namespace op { * When MKLDNN is enabled, we might want 2 outputs instead of one inputs, which * also changes the number of inputs for backward. */ -size_t GetNumOutputs(const PoolingParam ¶m); -size_t GetNumBackInputs(const PoolingParam ¶m); +int GetNumOutputs(const PoolingParam ¶m); +int GetNumBackInputs(const PoolingParam ¶m); template class PoolingOp { diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc index 7982c0c3bd3e..9b6996d0feb0 100644 --- a/src/operator/nn/pooling.cc +++ b/src/operator/nn/pooling.cc @@ -57,19 +57,19 @@ void PoolingParamParser(nnvm::NodeAttrs *attrs) { attrs->parsed = std::move(param); } -size_t GetNumOutputs(const PoolingParam ¶m) { +int GetNumOutputs(const PoolingParam ¶m) { #if MXNET_USE_MKLDNN == 1 - return MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param) ? 2U : 1U; + return MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param) ? 2 : 1; #else - return 1U; + return 1; #endif } -size_t GetNumBackInputs(const PoolingParam ¶m) { +int GetNumBackInputs(const PoolingParam ¶m) { #if MXNET_USE_MKLDNN == 1 - return MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param) ? 5U : 3U; + return MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param) ? 
5 : 3; #else - return 3U; + return 3; #endif } diff --git a/src/operator/tensor/ordering_op-inl.h b/src/operator/tensor/ordering_op-inl.h index cd1e89e447c7..6b43cf852078 100644 --- a/src/operator/tensor/ordering_op-inl.h +++ b/src/operator/tensor/ordering_op-inl.h @@ -359,16 +359,19 @@ void TopKImpl(RunContext ctx, ParseTopKParam(src.shape_, param, &target_shape, &batch_size, &element_num, &axis, &k, &do_transpose, &is_ascend); Tensor dat = src.FlatTo3D(axis, axis, s); - size_t temp_size = 0; + index_t temp_size = 0; // Temp space needed by the gpu-based full sorts. - temp_size = std::max(temp_size, mxnet::op::SortByKeyWorkspaceSize(src.Size())); - temp_size = std::max(temp_size, mxnet::op::SortByKeyWorkspaceSize(src.Size())); - temp_size = std::max(temp_size, mxnet::op::SortByKeyWorkspaceSize(src.Size())); + temp_size = std::max(temp_size, + static_cast(mxnet::op::SortByKeyWorkspaceSize(src.Size()))); + temp_size = std::max(temp_size, + static_cast(mxnet::op::SortByKeyWorkspaceSize(src.Size()))); + temp_size = std::max(temp_size, + static_cast(mxnet::op::SortByKeyWorkspaceSize(src.Size()))); // Additional temp space for gpu full sorts for batch ids. temp_size += sizeof(int) * src.Size(); // Temp space for cpu sorts. 
- temp_size = std::max(temp_size, sizeof(real_t) * static_cast(src.Size())); - size_t workspace_size = temp_size + sizeof(real_t) * src.Size() + sizeof(int) * src.Size(); + temp_size = std::max(temp_size, static_cast(sizeof(real_t)) * src.Size()); + index_t workspace_size = temp_size + sizeof(real_t) * src.Size() + sizeof(int) * src.Size(); if (param.ret_typ == topk_enum::kReturnMask) { workspace_size += sizeof(int) * batch_size * k + sizeof(real_t) * batch_size * k; } From 5c63dde32fa158e8644cb8a1e6a4ea32f78e0264 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Thu, 2 Aug 2018 14:13:54 -0700 Subject: [PATCH 24/40] Fix lint error --- src/operator/convolution_v1-inl.h | 8 +++----- src/operator/nn/deconvolution-inl.h | 8 +++----- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/src/operator/convolution_v1-inl.h b/src/operator/convolution_v1-inl.h index abda1ca6bee0..758ce12d8006 100644 --- a/src/operator/convolution_v1-inl.h +++ b/src/operator/convolution_v1-inl.h @@ -336,11 +336,9 @@ class ConvolutionV1Op : public Operator { // param_.workspace is in elements of sizeof(DType) // if param_.workspace is set to zero the nstep_ equals ishape[0] (batch) nstep_ = std::max( - std::min( - static_cast( - param_.workspace / (shape_colunit_.Size() + shape_dstunit_.Size())), - ishape[0]), - 1); + std::min(static_cast(param_.workspace) / + (shape_colunit_.Size() + shape_dstunit_.Size()), ishape[0]), + 1); mshadow::Shape<2> scol = mshadow::Shape2(shape_colunit_[0], shape_colunit_[1] * nstep_); diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index b627fc441ffa..53ef12fdbd95 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -459,11 +459,9 @@ class DeconvolutionOp { oshape[2] * oshape[3]); // See convolution for workspace calculations. 
nstep_ will be the effective batch size nstep_ = std::max( - std::min( - static_cast( - param_.workspace / (shape_colunit_.Size() + shape_dstunit_.Size())), - ishape[0]), - 1); + std::min(static_cast(param_.workspace) / + (shape_colunit_.Size() + shape_dstunit_.Size()), ishape[0]), + 1); mshadow::Shape<2> scol = mshadow::Shape2(shape_colunit_[0], shape_colunit_[1] * nstep_); From 8b496bfce0e0cf087a41d9e1d82e2fe41b529843 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Sun, 5 Aug 2018 02:58:02 +0000 Subject: [PATCH 25/40] Fix compilation error in GPU --- src/operator/random/shuffle_op.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/random/shuffle_op.cu b/src/operator/random/shuffle_op.cu index 5bf8320c0788..faa5e2c27f3c 100644 --- a/src/operator/random/shuffle_op.cu +++ b/src/operator/random/shuffle_op.cu @@ -60,7 +60,7 @@ void ShuffleForwardGPU(const nnvm::NodeAttrs& attrs, const index_t stride = size / first_axis_len; Stream *s = ctx.get_stream(); MSHADOW_TYPE_SWITCH(inputs[0].type_flag_, DType, { - using KeyType = index_t; + using KeyType = uint32_t; Tensor in = inputs[0].get_with_shape(Shape1(size), s); Tensor out = outputs[0].get_with_shape(Shape1(size), s); Random *prnd = ctx.requested[0].get_random(s); From 4dd8791a6f7938c855cc4cc0d096c343baf7bba5 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Tue, 7 Aug 2018 22:35:14 +0000 Subject: [PATCH 26/40] Fix compilation error on GPU --- src/operator/pad.cu | 20 ++++++++++---------- src/operator/random/shuffle_op.cu | 3 ++- src/operator/tensor/indexing_op-inl.cuh | 4 ++-- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/src/operator/pad.cu b/src/operator/pad.cu index 372683a2be81..1aab12a3a79f 100644 --- a/src/operator/pad.cu +++ b/src/operator/pad.cu @@ -56,9 +56,9 @@ __global__ void image_2d_pad_edge_kernel(Tensor dst, int oStartY = max(0, padT); int inputPointX = - min(max(padL, outputPointX), src.size(3) + padL - 1) - oStartX + iStartX; + min(max(padL, outputPointX), 
static_cast(src.size(3)) + padL - 1) - oStartX + iStartX; int inputPointY = - min(max(padT, outputPointY), src.size(2) + padT - 1) - oStartY + iStartY; + min(max(padT, outputPointY), static_cast(src.size(2)) + padT - 1) - oStartY + iStartY; DType valueToCopy = src[batch][plane][inputPointY][inputPointX]; dst[batch][plane][outputPointY][outputPointX] = valueToCopy; @@ -98,9 +98,9 @@ __global__ void image_2d_pad_edge_grad_kernel( int iStartY = max(0, -padT); int oStartX = max(0, padL); int oStartY = max(0, padT); - int inputPointX = min(max(padL, outputPointX), grad_in.size(3) + padL - 1) - + int inputPointX = min(max(padL, outputPointX), static_cast(grad_in.size(3)) + padL - 1) - oStartX + iStartX; - int inputPointY = min(max(padT, outputPointY), grad_in.size(2) + padT - 1) - + int inputPointY = min(max(padT, outputPointY), static_cast(grad_in.size(2)) + padT - 1) - oStartY + iStartY; DType valueToCopy = grad_out[batch][plane][outputPointY][outputPointX]; atomicAdd(&grad_in[batch][plane][inputPointY][inputPointX], valueToCopy); @@ -346,11 +346,11 @@ __global__ void image_3d_pad_edge_kernel(Tensor dst, int oStartZ = max(0, padF); int inputPointX = - min(max(padL, outputPointX), src.size(4) + padL - 1) - oStartX + iStartX; + min(max(padL, outputPointX), static_cast(src.size(4)) + padL - 1) - oStartX + iStartX; int inputPointY = - min(max(padT, outputPointY), src.size(3) + padT - 1) - oStartY + iStartY; + min(max(padT, outputPointY), static_cast(src.size(3)) + padT - 1) - oStartY + iStartY; int inputPointZ = - min(max(padF, outputPointZ), src.size(2) + padF - 1) - oStartZ + iStartZ; + min(max(padF, outputPointZ), static_cast(src.size(2)) + padF - 1) - oStartZ + iStartZ; DType valueToCopy = src[batch][plane][inputPointZ][inputPointY][inputPointX]; dst[batch][plane][outputPointZ][outputPointY][outputPointX] = valueToCopy; @@ -395,11 +395,11 @@ __global__ void image_3d_pad_edge_grad_kernel( int oStartY = max(0, padT); int oStartZ = max(0, padF); - int inputPointX = 
min(max(padL, outputPointX), grad_in.size(4) + padL - 1) - + int inputPointX = min(max(padL, outputPointX), static_cast(grad_in.size(4)) + padL - 1) - oStartX + iStartX; - int inputPointY = min(max(padT, outputPointY), grad_in.size(3) + padT - 1) - + int inputPointY = min(max(padT, outputPointY), static_cast(grad_in.size(3)) + padT - 1) - oStartY + iStartY; - int inputPointZ = min(max(padF, outputPointZ), grad_in.size(2) + padF - 1) - + int inputPointZ = min(max(padF, outputPointZ), static_cast(grad_in.size(2)) + padF - 1) - oStartZ + iStartZ; DType valueToCopy = grad_out[batch][plane][outputPointZ][outputPointY][outputPointX]; diff --git a/src/operator/random/shuffle_op.cu b/src/operator/random/shuffle_op.cu index faa5e2c27f3c..51588494a63c 100644 --- a/src/operator/random/shuffle_op.cu +++ b/src/operator/random/shuffle_op.cu @@ -82,7 +82,8 @@ void ShuffleForwardGPU(const nnvm::NodeAttrs& attrs, Tensor indices(reinterpret_cast(tmp_space_ptr), Shape1(first_axis_len), s); tmp_space_ptr += sizeof(index_t) * first_axis_len; - Kernel::Launch(s, first_axis_len, 1, 0U, 1U, kWriteTo, indices.dptr_); + Kernel::Launch(s, static_cast(first_axis_len), + 1, index_t(0), index_t(1), kWriteTo, indices.dptr_); Tensor keys(reinterpret_cast(tmp_space_ptr), Shape1(first_axis_len), s); tmp_space_ptr += sizeof(KeyType) * first_axis_len; diff --git a/src/operator/tensor/indexing_op-inl.cuh b/src/operator/tensor/indexing_op-inl.cuh index b2f514e20cd9..5c5236363a53 100644 --- a/src/operator/tensor/indexing_op-inl.cuh +++ b/src/operator/tensor/indexing_op-inl.cuh @@ -213,10 +213,10 @@ inline void AddTakeGradLargeBatchKernelLaunch(mshadow::Tensor dst cudaStream_t stream = mshadow::Stream::GetStream(dst.stream_); const int num_unique_est = min(num_rows, src.size(0)); const int max_nthread = 128; - const int num_y = max(src.size(0)/num_unique_est, 1); + const int num_y = max(static_cast(src.size(0))/num_unique_est, 1); const int block_dim_x = kWarpSize; const int block_dim_y = min(num_y, 
max_nthread/block_dim_x); - const int SZ = min((src.size(1) + block_dim_x - 1) / block_dim_x, 4); + const int SZ = min((static_cast(src.size(1)) + block_dim_x - 1) / block_dim_x, 4); const int grid_dim_x = (src.size(1) + block_dim_x * SZ - 1) / (block_dim_x * SZ); const int grid_dim_y = min(num_unique_est, mshadow::cuda::kBaseGridNum); dim3 dimBlock(block_dim_x, block_dim_y); From 7530494643f5d8207e61c543d6dd506cc965075a Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Tue, 7 Aug 2018 18:09:36 -0700 Subject: [PATCH 27/40] Fix compilation error in cpp-package --- src/operator/nn/upsampling-inl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/operator/nn/upsampling-inl.h b/src/operator/nn/upsampling-inl.h index 4b9159edd174..feb44c894a7a 100644 --- a/src/operator/nn/upsampling-inl.h +++ b/src/operator/nn/upsampling-inl.h @@ -48,8 +48,8 @@ enum UpSamplingMultiInputMode {kConcat, kSum}; } // namespace up_enum struct UpSamplingParam : public dmlc::Parameter { - index_t scale; - index_t num_filter; + int scale; + int num_filter; int sample_type; int num_args; int multi_input_mode; From b3bdc568218abc85d0e186ace13f9429fc750757 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Fri, 28 Sep 2018 23:01:59 +0000 Subject: [PATCH 28/40] Fix unit test in GPU --- src/operator/bilinear_sampler.cu | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/operator/bilinear_sampler.cu b/src/operator/bilinear_sampler.cu index 2e6be3e1ef3e..03734a61316b 100644 --- a/src/operator/bilinear_sampler.cu +++ b/src/operator/bilinear_sampler.cu @@ -51,8 +51,8 @@ __global__ void BilinearSamplerForwardKernel(const int i_c, const int i_h, int h = (index / o_w) % o_h; int c = (index / o_w / o_h) % o_c; int n = index / o_w / o_h / o_c; - index_t out_index = n * o_c * o_h * o_w + c * o_h * o_w + h * o_w + w; - index_t grid_index = n * o_h * o_w * 2 + h * o_w + w; + int out_index = n * o_c * o_h * o_w + c * o_h * o_w + h * o_w + w; + int grid_index = n * o_h 
* o_w * 2 + h * o_w + w; DType y_real = (*(grid + grid_index + o_h * o_w) + 1) * (i_h - 1) / 2; DType x_real = (*(grid + grid_index) + 1) * (i_w - 1) / 2; int top_left_y = static_cast(floor(y_real)); @@ -96,7 +96,7 @@ __global__ void BilinearSamplerBackwardKernel(const int i_c, const int i_h, int n = index / o_w / o_h; DType top_left_y_gw = 0.0; DType top_left_x_gw = 0.0; - index_t grid_src_index = n * o_h * o_w * 2 + h * o_w + w; + int grid_src_index = n * o_h * o_w * 2 + h * o_w + w; DType y_real = (*(grid_src + grid_src_index + o_h * o_w) + 1) * (i_h - 1) / 2; DType x_real = (*(grid_src + grid_src_index) + 1) * (i_w - 1) / 2; @@ -104,8 +104,8 @@ __global__ void BilinearSamplerBackwardKernel(const int i_c, const int i_h, int top_left_x = static_cast(floor(x_real)); DType top_left_y_w = 1.0 - (y_real - top_left_y); DType top_left_x_w = 1.0 - (x_real - top_left_x); - for (index_t c = 0; c < o_c; ++c) { - index_t grad_index = n * o_c * o_h * o_w + c * o_h * o_w + h * o_w + w; + for (int c = 0; c < o_c; ++c) { + int grad_index = n * o_c * o_h * o_w + c * o_h * o_w + h * o_w + w; int data_index = n * i_c * i_h * i_w + c * i_h * i_w + top_left_y * i_w + top_left_x; // calc 4 vertex value in input data DType top_left_v = 0; From c9ce7344ffbeb04a60456f494b3110478a03597a Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Fri, 28 Sep 2018 16:16:46 -0700 Subject: [PATCH 29/40] Change correct type for nnvmGraph --- src/c_api/c_api_function.cc | 4 ++-- src/operator/elemwise_op_common.h | 12 ++++++------ src/operator/operator_common.h | 10 +++++----- src/operator/tensor/ordering_op.cc | 12 ++++++------ 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/src/c_api/c_api_function.cc b/src/c_api/c_api_function.cc index 83b77202f22e..7091be2e72c5 100644 --- a/src/c_api/c_api_function.cc +++ b/src/c_api/c_api_function.cc @@ -55,8 +55,8 @@ std::vector Gradient( g->inputs = out_grads; std::vector ret; - for (index_t i = 0; i < g->num_outputs(); ++i) { - 
ret.emplace_back(nnvm::NodeEntry{g, static_cast(i), 0}); + for (uint32_t i = 0; i < g->num_outputs(); ++i) { + ret.emplace_back(nnvm::NodeEntry{g, i, 0}); } return ret; diff --git a/src/operator/elemwise_op_common.h b/src/operator/elemwise_op_common.h index 1377d2db2536..cf44da699156 100644 --- a/src/operator/elemwise_op_common.h +++ b/src/operator/elemwise_op_common.h @@ -197,9 +197,9 @@ struct ElemwiseGradUseOut { std::vector operator()(const nnvm::NodePtr& n, const std::vector& ograds) const { std::vector heads; - index_t n_out = n->num_outputs(); - for (index_t i = 0; i < n_out; ++i) { - heads.emplace_back(nnvm::NodeEntry{n, static_cast(i), 0}); + uint32_t n_out = n->num_outputs(); + for (uint32_t i = 0; i < n_out; ++i) { + heads.emplace_back(nnvm::NodeEntry{n, i, 0}); } return MakeNonlossGradNode(op_name, n, ograds, heads, n->attrs.dict); } @@ -214,9 +214,9 @@ struct ElemwiseGradUseInOut { for (auto& h : n->inputs) { heads.push_back(h); } - index_t n_out = n->num_outputs(); - for (index_t i = 0; i < n_out; ++i) { - heads.emplace_back(nnvm::NodeEntry{n, static_cast(i), 0}); + uint32_t n_out = n->num_outputs(); + for (uint32_t i = 0; i < n_out; ++i) { + heads.emplace_back(nnvm::NodeEntry{n, i, 0}); } return MakeGradNode(op_name, n, heads, n->attrs.dict); } diff --git a/src/operator/operator_common.h b/src/operator/operator_common.h index e351ad694819..d7c141724777 100644 --- a/src/operator/operator_common.h +++ b/src/operator/operator_common.h @@ -395,8 +395,8 @@ inline std::vector MakeGradNode( auto p = MakeNode(op_name, n->attrs.name + "_backward", &inputs, &dict, &n); std::vector ret; - for (index_t i = 0; i < p->num_outputs(); ++i) { - ret.emplace_back(nnvm::NodeEntry{p, static_cast(i), 0}); + for (uint32_t i = 0; i < p->num_outputs(); ++i) { + ret.emplace_back(nnvm::NodeEntry{p, i, 0}); } return ret; } @@ -406,7 +406,7 @@ inline std::vector MakeZeroGradNodes( const nnvm::NodePtr& n, const std::vector& ograds) { std::vector ret; - for (index_t i = 0; i < 
n->num_inputs(); ++i) { + for (uint32_t i = 0; i < n->num_inputs(); ++i) { std::ostringstream os; if (1 == n->num_inputs()) { os << n->attrs.name << "_backward"; @@ -445,8 +445,8 @@ inline std::vector MakeNonlossGradNode( p->inputs.insert(p->inputs.end(), ograds.begin(), ograds.end()); p->inputs.insert(p->inputs.end(), inputs.begin(), inputs.end()); std::vector ret; - for (index_t i = 0; i < p->num_outputs(); ++i) { - ret.emplace_back(nnvm::NodeEntry{p, static_cast(i), 0}); + for (uint32_t i = 0; i < p->num_outputs(); ++i) { + ret.emplace_back(nnvm::NodeEntry{p, i, 0}); } return ret; } diff --git a/src/operator/tensor/ordering_op.cc b/src/operator/tensor/ordering_op.cc index fc3aa9125efb..189ea19fa6f8 100644 --- a/src/operator/tensor/ordering_op.cc +++ b/src/operator/tensor/ordering_op.cc @@ -74,9 +74,9 @@ Examples:: const TopKParam& param = nnvm::get(n->attrs.parsed); if (param.ret_typ == topk_enum::kReturnValue || param.ret_typ == topk_enum::kReturnBoth) { std::vector inputs; - index_t n_out = n->num_outputs(); - for (index_t i = 0; i < n_out; ++i) { - inputs.emplace_back(nnvm::NodeEntry{ n, static_cast(i), 0 }); + uint32_t n_out = n->num_outputs(); + for (uint32_t i = 0; i < n_out; ++i) { + inputs.emplace_back(nnvm::NodeEntry{ n, i, 0 }); } return MakeNonlossGradNode("_backward_topk", n, {ograds[0]}, inputs, n->attrs.dict); } else { @@ -136,9 +136,9 @@ Examples:: [](const nnvm::NodePtr& n, const std::vector& ograds) { const SortParam& param = nnvm::get(n->attrs.parsed); std::vector inputs; - index_t n_out = n->num_outputs(); - for (index_t i = 0; i < n_out; ++i) { - inputs.emplace_back(nnvm::NodeEntry{ n, static_cast(i), 0 }); + uint32_t n_out = n->num_outputs(); + for (uint32_t i = 0; i < n_out; ++i) { + inputs.emplace_back(nnvm::NodeEntry{ n, i, 0 }); } return MakeNonlossGradNode("_backward_topk", n, {ograds[0]}, inputs, {{"axis", n->attrs.dict["axis"]}, From d61b366548d65c75983a2d7721375ade5f914757 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Sat, 29 Sep 
2018 21:19:11 -0700 Subject: [PATCH 30/40] update mshadow submodule to local repo to verify --- 3rdparty/mshadow | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/mshadow b/3rdparty/mshadow index 8a9e337f3c47..ffc76cb91b9d 160000 --- a/3rdparty/mshadow +++ b/3rdparty/mshadow @@ -1 +1 @@ -Subproject commit 8a9e337f3c4794876bd04d5351d967333bcabee3 +Subproject commit ffc76cb91b9d44383ebf0a40b9f76dee71124990 From 65670880d7313660fd736f3bb0c64d2b23eb267b Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Mon, 1 Oct 2018 16:37:22 +0000 Subject: [PATCH 31/40] update mshadow submodule --- 3rdparty/mshadow | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/mshadow b/3rdparty/mshadow index ffc76cb91b9d..d68d3694fdfb 160000 --- a/3rdparty/mshadow +++ b/3rdparty/mshadow @@ -1 +1 @@ -Subproject commit ffc76cb91b9d44383ebf0a40b9f76dee71124990 +Subproject commit d68d3694fdfb44fdbb7c840c3591131ff2310a59 From 7d7497c77bec7c41790f9f2946cebcca507ee222 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Tue, 2 Oct 2018 16:22:12 -0700 Subject: [PATCH 32/40] change some data type to size_t --- src/io/iter_image_recordio_2.cc | 41 +++++++++++++-------------- src/operator/tensor/ordering_op-inl.h | 10 +++---- 2 files changed, 25 insertions(+), 26 deletions(-) diff --git a/src/io/iter_image_recordio_2.cc b/src/io/iter_image_recordio_2.cc index 10216782cced..bd830740dd61 100644 --- a/src/io/iter_image_recordio_2.cc +++ b/src/io/iter_image_recordio_2.cc @@ -75,7 +75,7 @@ class ImageRecordIOParser2 { cv::Mat TJimdecode(cv::Mat buf, int color); #endif #endif - inline unsigned ParseChunk(DType* data_dptr, real_t* label_dptr, const unsigned current_size, + inline size_t ParseChunk(DType* data_dptr, real_t* label_dptr, const size_t current_size, dmlc::InputSplit::Blob * chunk); inline void CreateMeanImg(void); @@ -104,10 +104,10 @@ class ImageRecordIOParser2 { /*! \brief temp space */ mshadow::TensorContainer img_; /*! 
\brief internal instance order */ - std::vector > inst_order_; - unsigned inst_index_; + std::vector > inst_order_; + size_t inst_index_; /*! \brief internal counter tracking number of already parsed entries */ - unsigned n_parsed_; + size_t n_parsed_; /*! \brief overflow marker */ bool overflow; /*! \brief unit size */ @@ -200,7 +200,7 @@ inline void ImageRecordIOParser2::Init( "larger chunk size"; } // 1.1 ratio is for a bit more shuffle parts to avoid boundary issue - unsigned num_shuffle_parts = + size_t num_shuffle_parts = std::ceil(source_->GetTotalSize() * 1.1 / (param_.num_parts * (param_.shuffle_chunk_size << 20UL))); @@ -262,7 +262,7 @@ inline bool ImageRecordIOParser2::ParseNext(DataBatch *out) { } CHECK(source_ != nullptr); dmlc::InputSplit::Blob chunk; - unsigned current_size = 0; + size_t current_size = 0; out->index.resize(batch_param_.batch_size); // InitBatch @@ -295,7 +295,7 @@ inline bool ImageRecordIOParser2::ParseNext(DataBatch *out) { while (current_size < batch_param_.batch_size) { // int n_to_copy; - unsigned n_to_out = 0; + size_t n_to_out = 0; if (n_parsed_ == 0) { if (source_->NextBatch(&chunk, batch_param_.batch_size)) { inst_order_.clear(); @@ -328,16 +328,15 @@ inline bool ImageRecordIOParser2::ParseNext(DataBatch *out) { n_to_out = 0; } } else { - int n_to_copy = std::min(n_parsed_, - static_cast(batch_param_.batch_size) - current_size); + size_t n_to_copy = std::min(n_parsed_, static_cast(batch_param_.batch_size) - current_size); n_parsed_ -= n_to_copy; // Copy #pragma omp parallel for num_threads(param_.preprocess_threads) - for (int i = 0; i < n_to_copy; ++i) { + for (size_t i = 0; i < n_to_copy; ++i) { omp_exc_.Run([&] { - std::pair place = inst_order_[inst_index_ + i]; + std::pair place = inst_order_[inst_index_ + i]; const DataInst& batch = temp_[place.first][place.second]; - for (unsigned j = 0; j < batch.data.size(); ++j) { + for (size_t j = 0; j < batch.data.size(); ++j) { CHECK_EQ(unit_size_[j], batch.data[j].Size()); 
MSHADOW_TYPE_SWITCH(out->data[j].data().type_flag_, dtype, { mshadow::Copy( @@ -483,18 +482,18 @@ cv::Mat ImageRecordIOParser2::TJimdecode(cv::Mat image, int color) { // Returns the number of images that are put into output template -inline unsigned ImageRecordIOParser2::ParseChunk(DType* data_dptr, real_t* label_dptr, - const unsigned current_size, dmlc::InputSplit::Blob * chunk) { +inline size_t ImageRecordIOParser2::ParseChunk(DType* data_dptr, real_t* label_dptr, + const size_t current_size, dmlc::InputSplit::Blob * chunk) { temp_.resize(param_.preprocess_threads); #if MXNET_USE_OPENCV // save opencv out dmlc::RecordIOChunkReader reader(*chunk, 0, 1); - unsigned gl_idx = current_size; + size_t gl_idx = current_size; #pragma omp parallel num_threads(param_.preprocess_threads) { omp_exc_.Run([&] { CHECK(omp_get_num_threads() == param_.preprocess_threads); - unsigned int tid = omp_get_thread_num(); + int tid = omp_get_thread_num(); // dmlc::RecordIOChunkReader reader(*chunk, tid, param_.preprocess_threads); ImageRecordIO rec; dmlc::InputSplit::Blob blob; @@ -503,7 +502,7 @@ inline unsigned ImageRecordIOParser2::ParseChunk(DType* data_dptr, real_t out_tmp.Clear(); while (true) { bool reader_has_data; - unsigned idx; + size_t idx; #pragma omp critical { reader_has_data = reader.NextRecord(&blob); @@ -568,7 +567,7 @@ inline unsigned ImageRecordIOParser2::ParseChunk(DType* data_dptr, real_t data = mshadow::Tensor(data_dptr + idx*unit_size_[0], mshadow::Shape3(n_channels, res.rows, res.cols)); } else { - out_tmp.Push(static_cast(rec.image_index()), + out_tmp.Push(static_cast(rec.image_index()), mshadow::Shape3(n_channels, res.rows, res.cols), mshadow::Shape1(param_.label_width)); data = out_tmp.data().Back(); @@ -613,7 +612,7 @@ inline unsigned ImageRecordIOParser2::ParseChunk(DType* data_dptr, real_t }); } omp_exc_.Rethrow(); - return (std::min(batch_param_.batch_size, gl_idx) - current_size); + return (std::min(static_cast(batch_param_.batch_size), gl_idx) - 
current_size); #else LOG(FATAL) << "Opencv is needed for image decoding and augmenting."; return 0; @@ -634,8 +633,8 @@ inline void ImageRecordIOParser2::CreateMeanImg(void) { inst_order_.clear(); // Parse chunk w/o putting anything in out ParseChunk(nullptr, nullptr, batch_param_.batch_size, &chunk); - for (unsigned i = 0; i < inst_order_.size(); ++i) { - std::pair place = inst_order_[i]; + for (size_t i = 0; i < inst_order_.size(); ++i) { + std::pair place = inst_order_[i]; mshadow::Tensor outimg = temp_[place.first][place.second].data[0].template get(); if (imcnt == 0) { diff --git a/src/operator/tensor/ordering_op-inl.h b/src/operator/tensor/ordering_op-inl.h index 2ce27b306aec..18bd7608e4c1 100644 --- a/src/operator/tensor/ordering_op-inl.h +++ b/src/operator/tensor/ordering_op-inl.h @@ -378,18 +378,18 @@ void TopKImpl(const RunContext &ctx, << "The total element_num is " << element_num << ", but the selected IDType can only represent " << mxnet::common::MaxIntegerValue() << " elements"; Tensor dat = src.FlatTo3D(axis, axis, s); - index_t temp_size = 0; + size_t temp_size = 0; // Temp space needed by the gpu-based full sorts. - temp_size = std::max(temp_size, + temp_size = std::max(temp_size, mxnet::op::SortByKeyWorkspaceSize(src.Size())); - temp_size = std::max(temp_size, + temp_size = std::max(temp_size, mxnet::op::SortByKeyWorkspaceSize(src.Size())); - temp_size = std::max(temp_size, + temp_size = std::max(temp_size, mxnet::op::SortByKeyWorkspaceSize(src.Size())); // Additional temp space for gpu full sorts for batch ids. temp_size += sizeof(int) * src.Size(); // Temp space for cpu sorts. 
- temp_size = std::max(temp_size, static_cast(sizeof(DType)) * src.Size()); + temp_size = std::max(temp_size, sizeof(DType) * static_cast(src.Size())); index_t workspace_size = temp_size + sizeof(DType) * src.Size() + sizeof(int) * src.Size(); if (param.ret_typ == topk_enum::kReturnMask) { workspace_size += sizeof(int) * batch_size * k + sizeof(DType) * batch_size * k; From 9607772995791ce4bdde05283905afc6cd14c061 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Tue, 2 Oct 2018 20:43:27 -0700 Subject: [PATCH 33/40] change unit test style --- tests/nightly/test_large_array.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/tests/nightly/test_large_array.py b/tests/nightly/test_large_array.py index 609f77ea291f..121acc174b51 100644 --- a/tests/nightly/test_large_array.py +++ b/tests/nightly/test_large_array.py @@ -15,17 +15,20 @@ # specific language governing permissions and limitations # under the License. +import unittest import mxnet as mx from mxnet import gluon, nd -def test_ndarray2numpy(): - m = gluon.nn.Embedding(14000, 128) - m.initialize() - ind = nd.zeros((700000, 128)) - x = m(ind) - x.shape - test=x.asnumpy() - assert (x.shape == test.shape) + +class TestLargeArray(unittest.TestCase): + def test_ndarray2numpy(self): + m = gluon.nn.Embedding(14000, 128) + m.initialize() + ind = nd.zeros((700000, 128)) + x = m(ind) + x.shape + test = x.asnumpy() + assert (x.shape == test.shape) if __name__ == '__main__': - test_ndarray2numpy() \ No newline at end of file + unittest.main() From 6144565f33da603adee978da87655379d9e8b501 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Tue, 2 Oct 2018 21:17:12 -0700 Subject: [PATCH 34/40] fix lint --- src/io/iter_image_recordio_2.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/io/iter_image_recordio_2.cc b/src/io/iter_image_recordio_2.cc index bd830740dd61..a7037c77c26d 100644 --- a/src/io/iter_image_recordio_2.cc +++ b/src/io/iter_image_recordio_2.cc @@ -328,7 
+328,8 @@ inline bool ImageRecordIOParser2::ParseNext(DataBatch *out) { n_to_out = 0; } } else { - size_t n_to_copy = std::min(n_parsed_, static_cast(batch_param_.batch_size) - current_size); + size_t n_to_copy = std::min(n_parsed_, + static_cast(batch_param_.batch_size) - current_size); n_parsed_ -= n_to_copy; // Copy #pragma omp parallel for num_threads(param_.preprocess_threads) From 109278eb7cd4904d40cd3cd0045d6ce3f62be0de Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Tue, 2 Oct 2018 22:10:21 -0700 Subject: [PATCH 35/40] fix compilation error in Windows --- src/io/iter_image_recordio_2.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/io/iter_image_recordio_2.cc b/src/io/iter_image_recordio_2.cc index a7037c77c26d..1fb5accb860a 100644 --- a/src/io/iter_image_recordio_2.cc +++ b/src/io/iter_image_recordio_2.cc @@ -333,11 +333,11 @@ inline bool ImageRecordIOParser2::ParseNext(DataBatch *out) { n_parsed_ -= n_to_copy; // Copy #pragma omp parallel for num_threads(param_.preprocess_threads) - for (size_t i = 0; i < n_to_copy; ++i) { + for (uint32_t i = 0; i < n_to_copy; ++i) { omp_exc_.Run([&] { std::pair place = inst_order_[inst_index_ + i]; const DataInst& batch = temp_[place.first][place.second]; - for (size_t j = 0; j < batch.data.size(); ++j) { + for (uint32_t j = 0; j < batch.data.size(); ++j) { CHECK_EQ(unit_size_[j], batch.data[j].Size()); MSHADOW_TYPE_SWITCH(out->data[j].data().type_flag_, dtype, { mshadow::Copy( From 42e113db2d4d4410e7e7add1112b720127eee19e Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Tue, 2 Oct 2018 22:36:23 -0700 Subject: [PATCH 36/40] fix compilation error in Windows --- src/io/iter_image_recordio_2.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/io/iter_image_recordio_2.cc b/src/io/iter_image_recordio_2.cc index 1fb5accb860a..f41483955494 100644 --- a/src/io/iter_image_recordio_2.cc +++ b/src/io/iter_image_recordio_2.cc @@ -333,11 +333,11 @@ inline bool 
ImageRecordIOParser2::ParseNext(DataBatch *out) { n_parsed_ -= n_to_copy; // Copy #pragma omp parallel for num_threads(param_.preprocess_threads) - for (uint32_t i = 0; i < n_to_copy; ++i) { + for (int i = 0; i < static_cast(n_to_copy); ++i) { omp_exc_.Run([&] { std::pair place = inst_order_[inst_index_ + i]; const DataInst& batch = temp_[place.first][place.second]; - for (uint32_t j = 0; j < batch.data.size(); ++j) { + for (size_t j = 0; j < batch.data.size(); ++j) { CHECK_EQ(unit_size_[j], batch.data[j].Size()); MSHADOW_TYPE_SWITCH(out->data[j].data().type_flag_, dtype, { mshadow::Copy( From 8e732fdf3789ffdb15415a11208d16c8a90e74eb Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Thu, 4 Oct 2018 10:18:03 -0700 Subject: [PATCH 37/40] use forked submodule to verify --- 3rdparty/mshadow | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/mshadow b/3rdparty/mshadow index d68d3694fdfb..ffc76cb91b9d 160000 --- a/3rdparty/mshadow +++ b/3rdparty/mshadow @@ -1 +1 @@ -Subproject commit d68d3694fdfb44fdbb7c840c3591131ff2310a59 +Subproject commit ffc76cb91b9d44383ebf0a40b9f76dee71124990 From 239c18f0855b051e3cdd392a18eb549655cc53b9 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Thu, 4 Oct 2018 10:29:42 -0700 Subject: [PATCH 38/40] temporarily update submodule to verify the fix --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index 836d824a6f5a..0945e8f4a6c2 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,6 @@ [submodule "3rdparty/mshadow"] path = 3rdparty/mshadow - url = https://github.com/dmlc/mshadow.git + url = https://github.com/apeforest/mshadow.git [submodule "3rdparty/dmlc-core"] path = 3rdparty/dmlc-core url = https://github.com/dmlc/dmlc-core.git From ca081a724ab79e827665c552fda9325acd8c99e4 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Thu, 4 Oct 2018 13:24:17 -0700 Subject: [PATCH 39/40] update mshadow submodule to use remote --- .gitmodules | 2 +- 3rdparty/mshadow | 2 +- 2 files 
changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitmodules b/.gitmodules index 0945e8f4a6c2..836d824a6f5a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,6 @@ [submodule "3rdparty/mshadow"] path = 3rdparty/mshadow - url = https://github.com/apeforest/mshadow.git + url = https://github.com/dmlc/mshadow.git [submodule "3rdparty/dmlc-core"] path = 3rdparty/dmlc-core url = https://github.com/dmlc/dmlc-core.git diff --git a/3rdparty/mshadow b/3rdparty/mshadow index ffc76cb91b9d..696803bd7723 160000 --- a/3rdparty/mshadow +++ b/3rdparty/mshadow @@ -1 +1 @@ -Subproject commit ffc76cb91b9d44383ebf0a40b9f76dee71124990 +Subproject commit 696803bd7723ade8230af878460d96c68a550fbc From 2f6a24d4fa5bd736a660b5edf60efbdc0c7b0638 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Thu, 4 Oct 2018 13:47:01 -0700 Subject: [PATCH 40/40] add test to nightly test script --- tests/nightly/test_all.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/nightly/test_all.sh b/tests/nightly/test_all.sh index 04d895fecf21..73f0f588fe90 100755 --- a/tests/nightly/test_all.sh +++ b/tests/nightly/test_all.sh @@ -122,4 +122,7 @@ juLog -name=BuildWithoutCUDNN -error=Error build # python: multi gpus lenet + mnist juLog -name=Python.Multi.Lenet.Mnist -error=Error python multi_lenet.py +# python: large tensor +juLog -name=Python.LargeTensor -error=Fail python test_large_array.py + exit $errors