From 1beb9ac4b5dab1cd446ff06522f8689333596dc3 Mon Sep 17 00:00:00 2001
From: Luke Yeager <luke.yeager@gmail.com>
Date: Thu, 17 Dec 2015 11:01:40 -0800
Subject: [PATCH] Revert caffe-0.13 branch

---
 CMakeLists.txt                                |   5 -
 Makefile                                      |  46 +-
 Makefile.config.example                       |  16 +-
 cmake/ConfigGen.cmake                         |   6 -
 cmake/Cuda.cmake                              |  35 -
 cmake/Summary.cmake                           |   7 +-
 cmake/Targets.cmake                           |   5 +-
 cmake/Templates/CaffeConfig.cmake.in          |   7 +-
 cmake/Templates/caffe_config.h.in             |   4 -
 include/caffe/caffe.hpp                       |   1 -
 include/caffe/common.hpp                      |  92 +--
 include/caffe/common_layers.hpp               |   1 +
 include/caffe/data_layers.hpp                 |  39 +-
 include/caffe/data_reader.hpp                 |  82 ---
 include/caffe/internal_thread.hpp             |  25 +-
 include/caffe/layer_factory.hpp               |   4 +-
 include/caffe/neuron_layers.hpp               |   3 +
 include/caffe/parallel.hpp                    | 123 ----
 include/caffe/solver.hpp                      |  43 +-
 include/caffe/syncedmem.hpp                   |  42 +-
 .../caffe/test/test_gradient_check_util.hpp   |   5 +-
 include/caffe/util/blocking_queue.hpp         |  47 --
 include/caffe/util/device_alternate.hpp       |  11 -
 include/caffe/vision_layers.hpp               |  75 +--
 python/caffe/draw.py                          |   2 +-
 scripts/travis/travis_install.sh              |   2 +-
 .../travis/travis_setup_makefile_config.sh    |   1 -
 src/caffe/CMakeLists.txt                      |   1 -
 src/caffe/common.cpp                          | 147 +----
 src/caffe/data_reader.cpp                     | 119 ----
 src/caffe/data_transformer.cpp                |   4 +-
 src/caffe/internal_thread.cpp                 |  67 +-
 src/caffe/layer_factory.cpp                   |  45 +-
 src/caffe/layers/absval_layer.cu              |   1 +
 src/caffe/layers/base_data_layer.cpp          |  91 +--
 src/caffe/layers/base_data_layer.cu           |  15 +-
 src/caffe/layers/cudnn_conv_layer.cpp         | 206 +-----
 src/caffe/layers/cudnn_conv_layer.cu          |  90 +--
 src/caffe/layers/cudnn_lcn_layer.cpp          |  82 ---
 src/caffe/layers/cudnn_lcn_layer.cu           |  74 ---
 src/caffe/layers/cudnn_lrn_layer.cpp          |  57 --
 src/caffe/layers/cudnn_lrn_layer.cu           |  48 --
 src/caffe/layers/cudnn_pooling_layer.cpp      |   2 +
 src/caffe/layers/cudnn_pooling_layer.cu       |   4 +-
 src/caffe/layers/cudnn_relu_layer.cpp         |   2 +
 src/caffe/layers/cudnn_relu_layer.cu          |   4 +-
 src/caffe/layers/cudnn_sigmoid_layer.cpp      |   2 +
 src/caffe/layers/cudnn_sigmoid_layer.cu       |   4 +-
 src/caffe/layers/cudnn_softmax_layer.cpp      |   2 +
 src/caffe/layers/cudnn_softmax_layer.cu       |   5 +-
 src/caffe/layers/cudnn_tanh_layer.cpp         |   2 +
 src/caffe/layers/cudnn_tanh_layer.cu          |   4 +-
 src/caffe/layers/data_layer.cpp               | 112 ++--
 src/caffe/layers/dropout_layer.cu             |   2 +-
 src/caffe/layers/image_data_layer.cpp         |  27 +-
 src/caffe/layers/lrn_layer.cpp                |   2 +-
 src/caffe/layers/window_data_layer.cpp        |  20 +-
 src/caffe/net.cpp                             | 191 ++----
 src/caffe/parallel.cpp                        | 526 ----------------
 src/caffe/proto/caffe.proto                   |  32 -
 src/caffe/solver.cpp                          | 596 +++++++++---------
 src/caffe/syncedmem.cpp                       |  52 +-
 src/caffe/test/test_convolution_layer.cpp     |   3 -
 src/caffe/test/test_internal_thread.cpp       |  34 +-
 src/caffe/test/test_layer_factory.cpp         |  14 +-
 src/caffe/test/test_lrn_layer.cpp             | 209 ------
 src/caffe/test/test_upgrade_proto.cpp         |  12 -
 src/caffe/util/blocking_queue.cpp             |  96 ---
 tools/caffe.cpp                               | 116 +---
 69 files changed, 723 insertions(+), 3128 deletions(-)
 delete mode 100644 include/caffe/data_reader.hpp
 delete mode 100644 include/caffe/parallel.hpp
 delete mode 100644 include/caffe/util/blocking_queue.hpp
 delete mode 100644 src/caffe/data_reader.cpp
 delete mode 100644 src/caffe/layers/cudnn_lcn_layer.cpp
 delete mode 100644 src/caffe/layers/cudnn_lcn_layer.cu
 delete mode 100644 src/caffe/layers/cudnn_lrn_layer.cpp
 delete mode 100644 src/caffe/layers/cudnn_lrn_layer.cu
 delete mode 100644 src/caffe/parallel.cpp
 delete mode 100644 src/caffe/util/blocking_queue.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2b215da4dc3..74fa70c9d20 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,10 +3,6 @@ cmake_minimum_required(VERSION 2.8.7)
 # ---[ Caffe project
 project(Caffe C CXX)
 
-# ---[ Caffe version
-set(CAFFE_TARGET_VERSION "0.13.2")
-set(CAFFE_TARGET_SOVERSION "0.13")
-
 # ---[ Using cmake scripts and modules
 list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)
 
@@ -19,7 +15,6 @@ include(cmake/ConfigGen.cmake)
 # ---[ Options
 caffe_option(CPU_ONLY  "Build Caffe wihtout CUDA support" OFF) # TODO: rename to USE_CUDA
 caffe_option(USE_CUDNN "Build Caffe with cuDNN libary support" ON IF NOT CPU_ONLY)
-caffe_option(USE_CNMEM "Build Caffe with CNMeM memory pool support" OFF)
 caffe_option(BUILD_SHARED_LIBS "Build shared libraries" ON)
 caffe_option(BUILD_python "Build Python wrapper" ON)
 set(python_version "2" CACHE STRING "Specify which python version to use")
diff --git a/Makefile b/Makefile
index c0382e3c1df..d2e5e5720ed 100644
--- a/Makefile
+++ b/Makefile
@@ -29,16 +29,9 @@ SRC_DIRS := $(shell find * -type d -exec bash -c "find {} -maxdepth 1 \
 	\( -name '*.cpp' -o -name '*.proto' \) | grep -q ." \; -print)
 
 # The target shared library name
-LIBRARY_NAME := $(PROJECT)$(LIBRARY_NAME_SUFFIX)
 LIB_BUILD_DIR := $(BUILD_DIR)/lib
-STATIC_NAME := $(LIB_BUILD_DIR)/lib$(LIBRARY_NAME).a
-DYNAMIC_VERSION_MAJOR 		:= 0
-DYNAMIC_VERSION_MINOR 		:= 13
-DYNAMIC_VERSION_REVISION 	:= 2
-DYNAMIC_NAME_SHORT := lib$(LIBRARY_NAME).so
-DYNAMIC_SONAME_SHORT := $(DYNAMIC_NAME_SHORT).$(DYNAMIC_VERSION_MAJOR).$(DYNAMIC_VERSION_MINOR)
-DYNAMIC_VERSIONED_NAME_SHORT := $(DYNAMIC_NAME_SHORT).$(DYNAMIC_VERSION_MAJOR).$(DYNAMIC_VERSION_MINOR).$(DYNAMIC_VERSION_REVISION)
-DYNAMIC_NAME := $(LIB_BUILD_DIR)/$(DYNAMIC_VERSIONED_NAME_SHORT)
+STATIC_NAME := $(LIB_BUILD_DIR)/lib$(PROJECT).a
+DYNAMIC_NAME := $(LIB_BUILD_DIR)/lib$(PROJECT).so
 
 ##############################
 # Get all source files
@@ -176,7 +169,6 @@ ifneq ($(CPU_ONLY), 1)
 	LIBRARY_DIRS += $(CUDA_LIB_DIR)
 	LIBRARIES := cudart cublas curand
 endif
-
 LIBRARIES += glog gflags protobuf leveldb snappy \
 	lmdb boost_system hdf5_hl hdf5 m \
 	opencv_core opencv_highgui opencv_imgproc
@@ -242,7 +234,6 @@ ifeq ($(LINUX), 1)
 	# boost::thread is reasonably called boost_thread (compare OS X)
 	# We will also explicitly add stdc++ to the link target.
 	LIBRARIES += boost_thread stdc++
-	VERSIONFLAGS += -Wl,-soname,$(DYNAMIC_SONAME_SHORT) -Wl,-rpath,$(ORIGIN)/../lib
 endif
 
 # OS X:
@@ -266,7 +257,6 @@ ifeq ($(OSX), 1)
 	# we need to explicitly ask for the rpath to be obeyed
 	DYNAMIC_FLAGS := -install_name @rpath/libcaffe.so
 	ORIGIN := @loader_path
-	VERSIONFLAGS += -Wl,-install_name,$(DYNAMIC_SONAME_SHORT) -Wl,-rpath,$(ORIGIN)/../../build/lib
 else
 	ORIGIN := \$$ORIGIN
 endif
@@ -300,12 +290,6 @@ ifeq ($(USE_CUDNN), 1)
 	COMMON_FLAGS += -DUSE_CUDNN
 endif
 
-# cuMEM integration
-ifeq ($(USE_CNMEM), 1)
-	LIBRARIES += cnmem
-	COMMON_FLAGS += -DUSE_CNMEM
-endif
-
 # CPU-only configuration
 ifeq ($(CPU_ONLY), 1)
 	OBJS := $(PROTO_OBJS) $(CXX_OBJS)
@@ -316,14 +300,6 @@ ifeq ($(CPU_ONLY), 1)
 	COMMON_FLAGS += -DCPU_ONLY
 endif
 
-# Benchmarks
-ifeq ($(BENCHMARK_DATA), 1)
-	COMMON_FLAGS += -DBENCHMARK_DATA
-endif
-ifeq ($(BENCHMARK_SOLVER), 1)
-	COMMON_FLAGS += -DBENCHMARK_SOLVER
-endif
-
 # Python layer support
 ifeq ($(WITH_PYTHON_LAYER), 1)
 	COMMON_FLAGS += -DWITH_PYTHON_LAYER
@@ -466,7 +442,7 @@ py: $(PY$(PROJECT)_SO) $(PROTO_GEN_PY)
 $(PY$(PROJECT)_SO): $(PY$(PROJECT)_SRC) $(PY$(PROJECT)_HXX) | $(DYNAMIC_NAME)
 	@ echo CXX/LD -o $@ $<
 	$(Q)$(CXX) -shared -o $@ $(PY$(PROJECT)_SRC) \
-		-o $@ $(LINKFLAGS) -l$(LIBRARY_NAME) $(PYTHON_LDFLAGS) \
+		-o $@ $(LINKFLAGS) -l$(PROJECT) $(PYTHON_LDFLAGS) \
 		-Wl,-rpath,$(ORIGIN)/../../build/lib
 
 mat$(PROJECT): mat
@@ -524,9 +500,7 @@ $(ALL_BUILD_DIRS): | $(BUILD_DIR_LINK)
 
 $(DYNAMIC_NAME): $(OBJS) | $(LIB_BUILD_DIR)
 	@ echo LD -o $@
-	$(Q)$(CXX) -shared -o $@ $(OBJS) $(VERSIONFLAGS) $(LINKFLAGS) $(LDFLAGS) $(DYNAMIC_FLAGS)
-	@ cd $(BUILD_DIR)/lib; rm -f $(DYNAMIC_SONAME_SHORT); ln -s $(DYNAMIC_VERSIONED_NAME_SHORT) $(DYNAMIC_SONAME_SHORT)
-	@ cd $(BUILD_DIR)/lib; rm -f $(DYNAMIC_NAME_SHORT);   ln -s $(DYNAMIC_SONAME_SHORT) $(DYNAMIC_NAME_SHORT)
+	$(Q)$(CXX) -shared -o $@ $(OBJS) $(LINKFLAGS) $(LDFLAGS) $(DYNAMIC_FLAGS)
 
 $(STATIC_NAME): $(OBJS) | $(LIB_BUILD_DIR)
 	@ echo AR -o $@
@@ -557,19 +531,19 @@ $(TEST_ALL_BIN): $(TEST_MAIN_SRC) $(TEST_OBJS) $(GTEST_OBJ) \
 		| $(DYNAMIC_NAME) $(TEST_BIN_DIR)
 	@ echo CXX/LD -o $@ $<
 	$(Q)$(CXX) $(TEST_MAIN_SRC) $(TEST_OBJS) $(GTEST_OBJ) \
-		-o $@ $(LINKFLAGS) $(LDFLAGS) -l$(LIBRARY_NAME) -Wl,-rpath,$(ORIGIN)/../lib
+		-o $@ $(LINKFLAGS) $(LDFLAGS) -l$(PROJECT) -Wl,-rpath,$(ORIGIN)/../lib
 
 $(TEST_CU_BINS): $(TEST_BIN_DIR)/%.testbin: $(TEST_CU_BUILD_DIR)/%.o \
 	$(GTEST_OBJ) | $(DYNAMIC_NAME) $(TEST_BIN_DIR)
 	@ echo LD $<
 	$(Q)$(CXX) $(TEST_MAIN_SRC) $< $(GTEST_OBJ) \
-		-o $@ $(LINKFLAGS) $(LDFLAGS) -l$(LIBRARY_NAME) -Wl,-rpath,$(ORIGIN)/../lib
+		-o $@ $(LINKFLAGS) $(LDFLAGS) -l$(PROJECT) -Wl,-rpath,$(ORIGIN)/../lib
 
 $(TEST_CXX_BINS): $(TEST_BIN_DIR)/%.testbin: $(TEST_CXX_BUILD_DIR)/%.o \
 	$(GTEST_OBJ) | $(DYNAMIC_NAME) $(TEST_BIN_DIR)
 	@ echo LD $<
 	$(Q)$(CXX) $(TEST_MAIN_SRC) $< $(GTEST_OBJ) \
-		-o $@ $(LINKFLAGS) $(LDFLAGS) -l$(LIBRARY_NAME) -Wl,-rpath,$(ORIGIN)/../lib
+		-o $@ $(LINKFLAGS) $(LDFLAGS) -l$(PROJECT) -Wl,-rpath,$(ORIGIN)/../lib
 
 # Target for extension-less symlinks to tool binaries with extension '*.bin'.
 $(TOOL_BUILD_DIR)/%: $(TOOL_BUILD_DIR)/%.bin | $(TOOL_BUILD_DIR)
@@ -578,12 +552,12 @@ $(TOOL_BUILD_DIR)/%: $(TOOL_BUILD_DIR)/%.bin | $(TOOL_BUILD_DIR)
 
 $(TOOL_BINS): %.bin : %.o | $(DYNAMIC_NAME)
 	@ echo CXX/LD -o $@
-	$(Q)$(CXX) $< -o $@ $(LINKFLAGS) -l$(LIBRARY_NAME) $(LDFLAGS) \
+	$(Q)$(CXX) $< -o $@ $(LINKFLAGS) -l$(PROJECT) $(LDFLAGS) \
 		-Wl,-rpath,$(ORIGIN)/../lib
 
 $(EXAMPLE_BINS): %.bin : %.o | $(DYNAMIC_NAME)
 	@ echo CXX/LD -o $@
-	$(Q)$(CXX) $< -o $@ $(LINKFLAGS) -l$(LIBRARY_NAME) $(LDFLAGS) \
+	$(Q)$(CXX) $< -o $@ $(LINKFLAGS) -l$(PROJECT) $(LDFLAGS) \
 		-Wl,-rpath,$(ORIGIN)/../../lib
 
 proto: $(PROTO_GEN_CC) $(PROTO_GEN_HEADER)
@@ -645,8 +619,6 @@ $(DISTRIBUTE_DIR): all py | $(DISTRIBUTE_SUBDIRS)
 	# add libraries
 	cp $(STATIC_NAME) $(DISTRIBUTE_DIR)/lib
 	cp $(DYNAMIC_NAME) $(DISTRIBUTE_DIR)/lib
-	cd $(DISTRIBUTE_DIR)/lib; rm -f $(DYNAMIC_SONAME_SHORT); ln -s $(DYNAMIC_VERSIONED_NAME_SHORT) $(DYNAMIC_SONAME_SHORT)
-	cd $(DISTRIBUTE_DIR)/lib; rm -f $(DYNAMIC_NAME_SHORT);   ln -s $(DYNAMIC_SONAME_SHORT) $(DYNAMIC_NAME_SHORT)
 	# add python - it's not the standard way, indeed...
 	cp -r python $(DISTRIBUTE_DIR)/python
 
diff --git a/Makefile.config.example b/Makefile.config.example
index 0007f5324b7..7a8aafd7c9f 100644
--- a/Makefile.config.example
+++ b/Makefile.config.example
@@ -1,10 +1,8 @@
 ## Refer to http://caffe.berkeleyvision.org/installation.html
 # Contributions simplifying and improving our build system are welcome!
 
-# cuDNN acceleration switch (comment to build without cuDNN).
-USE_CUDNN := 1
-# CNMeM memory pool switch (github.com/NVIDIA/cnmem required)
-# USE_CNMEM := 1
+# cuDNN acceleration switch (uncomment to build with cuDNN).
+# USE_CUDNN := 1
 
 # CPU-only switch (uncomment to build without GPU support).
 # CPU_ONLY := 1
@@ -47,8 +45,7 @@ BLAS := atlas
 # NOTE: this is required only if you will compile the python interface.
 # We need to be able to find Python.h and numpy/arrayobject.h.
 PYTHON_INCLUDE := /usr/include/python2.7 \
-		/usr/lib/python2.7/dist-packages/numpy/core/include \
-		/usr/local/lib/python2.7/dist-packages/numpy/core/include
+		/usr/lib/python2.7/dist-packages/numpy/core/include
 # Anaconda Python distribution is quite popular. Include path:
 # Verify anaconda location, sometimes it's in root.
 # ANACONDA_HOME := $(HOME)/anaconda
@@ -82,10 +79,3 @@ TEST_GPUID := 0
 
 # enable pretty build (comment to see full commands)
 Q ?= @
-
-# Adds timing info in logs
-# BENCHMARK_DATA := 1
-# BENCHMARK_SOLVER := 1
-
-# shared object suffix name to differentiate branches
-LIBRARY_NAME_SUFFIX := -nv
diff --git a/cmake/ConfigGen.cmake b/cmake/ConfigGen.cmake
index 88b7b0ac953..c82047dcc5f 100644
--- a/cmake/ConfigGen.cmake
+++ b/cmake/ConfigGen.cmake
@@ -51,12 +51,6 @@ function(caffe_generate_export_configs)
     list(APPEND DEFINITIONS -DUSE_CUDNN)
   endif()
 
-  if(NOT HAVE_CNMEM)
-    set(HAVE_CNMEM FALSE)
-  else()
-    list(APPEND DEFINITIONS -DUSE_CNMEM)
-  endif()
-
   if(BLAS STREQUAL "MKL" OR BLAS STREQUAL "mkl")
     list(APPEND Caffe_DEFINITIONS -DUSE_MKL)
   endif()
diff --git a/cmake/Cuda.cmake b/cmake/Cuda.cmake
index 74f7a211796..ff58d31c166 100644
--- a/cmake/Cuda.cmake
+++ b/cmake/Cuda.cmake
@@ -188,32 +188,6 @@ function(detect_cuDNN)
   endif()
 endfunction()
 
-################################################################################################
-# Short command for cuDNN detection. Believe it soon will be a part of CUDA toolkit distribution.
-# That's why not FindcuDNN.cmake file, but just the macro
-# Usage:
-#   detect_cuDNN()
-function(detect_CNMeM)
-  set(CNMEM_ROOT "" CACHE PATH "CNMeM root folder")
-
-  find_path(CNMEM_INCLUDE cnmem.h
-    PATHS ${CNMEM_ROOT} $ENV{CNMEM_ROOT} ${CUDA_TOOLKIT_INCLUDE}
-    DOC "Path to CNMeM include directory." )
-
-  get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH)
-  find_library(CNMEM_LIBRARY NAMES libcnmem.so # libcudnn_static.a
-                             PATHS ${CNMEM_ROOT} $ENV{CNMEM_ROOT} ${CNMEM_INCLUDE} ${__libpath_hist}
-                             DOC "Path to CNMeM library.")
-
-  if(CNMEM_INCLUDE AND CNMEM_LIBRARY)
-    set(HAVE_CNMEM  TRUE PARENT_SCOPE)
-    set(CNMEM_FOUND TRUE PARENT_SCOPE)
-
-    mark_as_advanced(CNMEM_INCLUDE CNMEM_LIBRARY CNMEM_ROOT)
-    message(STATUS "Found CNMeM (include: ${CNMEM_INCLUDE}, library: ${CNMEM_LIBRARY})")
-  endif()
-endfunction()
-
 
 ################################################################################################
 ###  Non macro section
@@ -242,15 +216,6 @@ if(USE_CUDNN)
   endif()
 endif()
 
-if(USE_CNMEM)
-  detect_CNMeM()
-  if(HAVE_CNMEM)
-    add_definitions(-DUSE_CNMEM)
-    include_directories(SYSTEM ${CNMEM_INCLUDE})
-    list(APPEND Caffe_LINKER_LIBS ${CNMEM_LIBRARY})
-  endif()
-endif()
-
 # setting nvcc arch flags
 caffe_select_nvcc_arch_flags(NVCC_FLAGS_EXTRA)
 list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA})
diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake
index e5408e30509..32931942846 100644
--- a/cmake/Summary.cmake
+++ b/cmake/Summary.cmake
@@ -101,7 +101,7 @@ function(caffe_print_configuration_summary)
   caffe_status("")
   caffe_status("******************* Caffe Configuration Summary *******************")
   caffe_status("General:")
-  caffe_status("  Version           :   ${CAFFE_TARGET_VERSION}")
+  caffe_status("  Version           :   ${Caffe_VERSION}")
   caffe_status("  Git               :   ${Caffe_GIT_VERSION}")
   caffe_status("  System            :   ${CMAKE_SYSTEM_NAME}")
   caffe_status("  C++ compiler      :   ${CMAKE_CXX_COMPILER}")
@@ -136,11 +136,6 @@ function(caffe_print_configuration_summary)
     else()
       caffe_status("  cuDNN             :   Disabled")
     endif()
-    if (USE_CNMEM)
-      caffe_status("  CNMeM             : " HAVE_CNMEM THEN "Yes" ELSE "Not found")
-    else()
-      caffe_status("  CNMeM             :   Disabled")
-    endif()
     caffe_status("")
   endif()
   if(HAVE_PYTHON)
diff --git a/cmake/Targets.cmake b/cmake/Targets.cmake
index c0b8ec9abe3..e3ad872313b 100644
--- a/cmake/Targets.cmake
+++ b/cmake/Targets.cmake
@@ -109,10 +109,7 @@ function(caffe_default_properties target)
     DEBUG_POSTFIX ${Caffe_DEBUG_POSTFIX}
     ARCHIVE_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/lib"
     LIBRARY_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/lib"
-    RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/bin"
-    VERSION   ${CAFFE_TARGET_VERSION}
-    SOVERSION ${CAFFE_TARGET_SOVERSION}
-  )
+    RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/bin")
 endfunction()
 
 ################################################################################################
diff --git a/cmake/Templates/CaffeConfig.cmake.in b/cmake/Templates/CaffeConfig.cmake.in
index 99e194f086f..a4b03d961e0 100644
--- a/cmake/Templates/CaffeConfig.cmake.in
+++ b/cmake/Templates/CaffeConfig.cmake.in
@@ -15,10 +15,8 @@
 #
 #   Caffe_HAVE_CUDA    - signals about CUDA support
 #   Caffe_HAVE_CUDNN   - signals about cuDNN support
-#   Caffe_HAVE_CNMEM   - signals about CNMeM support
-# 
-#
-#
+ 
+
 # OpenCV dependency
 
 if(NOT OpenCV_FOUND)
@@ -58,4 +56,3 @@ set(Caffe_DEFINITIONS "@Caffe_DEFINITIONS@")
 set(Caffe_CPU_ONLY @CPU_ONLY@)
 set(Caffe_HAVE_CUDA @HAVE_CUDA@)
 set(Caffe_HAVE_CUDNN @HAVE_CUDNN@)
-set(Caffe_HAVE_CNMEM @HAVE_CNMEM@)
diff --git a/cmake/Templates/caffe_config.h.in b/cmake/Templates/caffe_config.h.in
index 3ea3bc41bf1..6039e8f6b21 100644
--- a/cmake/Templates/caffe_config.h.in
+++ b/cmake/Templates/caffe_config.h.in
@@ -11,10 +11,6 @@
 #cmakedefine HAVE_CUDNN
 #cmakedefine USE_CUDNN
 
-/* NVIDIA CNMeM */
-#cmakedefine HAVE_CNMEM
-#cmakedefine USE_CNMEM
- 
 /* NVIDA cuDNN */
 #cmakedefine CPU_ONLY
 
diff --git a/include/caffe/caffe.hpp b/include/caffe/caffe.hpp
index 68a5e1d1d1a..3c829f2f9b0 100644
--- a/include/caffe/caffe.hpp
+++ b/include/caffe/caffe.hpp
@@ -10,7 +10,6 @@
 #include "caffe/layer.hpp"
 #include "caffe/layer_factory.hpp"
 #include "caffe/net.hpp"
-#include "caffe/parallel.hpp"
 #include "caffe/proto/caffe.pb.h"
 #include "caffe/solver.hpp"
 #include "caffe/util/benchmark.hpp"
diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp
index d67d12f6ee3..5f86bc2625b 100644
--- a/include/caffe/common.hpp
+++ b/include/caffe/common.hpp
@@ -18,11 +18,6 @@
 
 #include "caffe/util/device_alternate.hpp"
 
-#ifdef USE_CNMEM
-// cuMEM integration
-#include <cnmem.h>
-#endif
-
 // gflags 2.1 issue: namespace google was changed to gflags without warning.
 // Luckily we will be able to use GFLAGS_GFLAGS_H_ to detect if it is version
 // 2.1. If yes, we will add a temporary solution to redirect the namespace.
@@ -103,12 +98,12 @@ void GlobalInit(int* pargc, char*** pargv);
 class Caffe {
  public:
   ~Caffe();
-
-  // Thread local context for Caffe. Moved to common.cpp instead of
-  // including boost/thread.hpp to avoid a boost/NVCC issues (#1009, #1010)
-  // on OSX. Also fails on Linux with CUDA 7.0.18.
-  static Caffe& Get();
-
+  inline static Caffe& Get() {
+    if (!singleton_.get()) {
+      singleton_.reset(new Caffe());
+    }
+    return *singleton_;
+  }
   enum Brew { CPU, GPU };
 
   // This random number generator facade hides boost and CUDA rng
@@ -137,9 +132,6 @@ class Caffe {
   inline static curandGenerator_t curand_generator() {
     return Get().curand_generator_;
   }
-#ifdef USE_CUDNN
-  inline static cudnnHandle_t cudnn_handle() { return Get().cudnn_handle_; }
-#endif
 #endif
 
   // Returns the mode: running on CPU or GPU.
@@ -157,25 +149,16 @@ class Caffe {
   static void SetDevice(const int device_id);
   // Prints the current GPU status.
   static void DeviceQuery();
-  // Parallel training info
-  inline static int solver_count() { return Get().solver_count_; }
-  inline static void set_solver_count(int val) { Get().solver_count_ = val; }
-  inline static bool root_solver() { return Get().root_solver_; }
-  inline static void set_root_solver(bool val) { Get().root_solver_ = val; }
 
  protected:
 #ifndef CPU_ONLY
   cublasHandle_t cublas_handle_;
   curandGenerator_t curand_generator_;
-#ifdef USE_CUDNN
-  cudnnHandle_t cudnn_handle_;
-#endif
 #endif
   shared_ptr<RNG> random_generator_;
 
   Brew mode_;
-  int solver_count_;
-  bool root_solver_;
+  static shared_ptr<Caffe> singleton_;
 
  private:
   // The private constructor to avoid duplicate instantiation.
@@ -184,67 +167,6 @@ class Caffe {
   DISABLE_COPY_AND_ASSIGN(Caffe);
 };
 
-class MemoryHandler {
- public:
-  static MemoryHandler& Get();
-#ifndef CPU_ONLY
-  static void mallocGPU(void **ptr, size_t size,
-                        cudaStream_t stream = cudaStreamDefault);
-  static void freeGPU(void *ptr, cudaStream_t = cudaStreamDefault);
-  static void registerStream(cudaStream_t stream);
-#endif
-  static void setGPUs(const std::vector<int>& gpus) { Get().gpus_ = gpus; }
-  static void usePool() { Get().using_pool_ = true; }
-  static bool usingPool() {
-#ifdef USE_CNMEM
-    return Get().using_pool_;
-#else
-    return false;
-#endif
-  }
-  static void getInfo(size_t *free_mem, size_t *used_mem);
-  static void destroy();
-  ~MemoryHandler() { }
-
- private:
-  MemoryHandler() : using_pool_(false), initialized_(false) {}
-  static void Init();
-  // static void Destroy();
-#ifndef CPU_ONLY
-  void allocate_memory(void **ptr, size_t size, cudaStream_t stream);
-  void free_memory(void *ptr, cudaStream_t stream);
-#endif
-  DISABLE_COPY_AND_ASSIGN(MemoryHandler);
-
-  bool using_pool_;
-  bool initialized_;
-  std::vector<int> gpus_;
-};
-
-class MemoryHandlerActivator {
- public:
-  explicit MemoryHandlerActivator(const std::vector<int>& gpus)
-            : using_pool_(false) {
-    if (gpus.size() > 0) {
-      using_pool_ = true;
-      MemoryHandler::usePool();
-      MemoryHandler::setGPUs(gpus);
-#ifndef CPU_ONLY
-      void* temp;
-      MemoryHandler::mallocGPU(&temp, 4);
-      MemoryHandler::freeGPU(temp);
-#endif
-    }
-  }
-  ~MemoryHandlerActivator() {
-    if (using_pool_) {
-      MemoryHandler::destroy();
-    }
-  }
- private:
-  int using_pool_;
-};
-
 }  // namespace caffe
 
 #endif  // CAFFE_COMMON_HPP_
diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp
index ecf516795fd..e6b42c14587 100644
--- a/include/caffe/common_layers.hpp
+++ b/include/caffe/common_layers.hpp
@@ -424,6 +424,7 @@ class CuDNNSoftmaxLayer : public SoftmaxLayer<Dtype> {
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
   bool handles_setup_;
+  cudnnHandle_t             handle_;
   cudnnTensorDescriptor_t bottom_desc_;
   cudnnTensorDescriptor_t top_desc_;
 };
diff --git a/include/caffe/data_layers.hpp b/include/caffe/data_layers.hpp
index 12e6c366620..3958cb7ecb0 100644
--- a/include/caffe/data_layers.hpp
+++ b/include/caffe/data_layers.hpp
@@ -5,17 +5,16 @@
 #include <utility>
 #include <vector>
 
+#include "boost/scoped_ptr.hpp"
 #include "hdf5.h"
 
 #include "caffe/blob.hpp"
 #include "caffe/common.hpp"
-#include "caffe/data_reader.hpp"
 #include "caffe/data_transformer.hpp"
 #include "caffe/filler.hpp"
 #include "caffe/internal_thread.hpp"
 #include "caffe/layer.hpp"
 #include "caffe/proto/caffe.pb.h"
-#include "caffe/util/blocking_queue.hpp"
 #include "caffe/util/db.hpp"
 
 namespace caffe {
@@ -51,17 +50,12 @@ class BaseDataLayer : public Layer<Dtype> {
   bool output_labels_;
 };
 
-template <typename Dtype>
-class Batch {
- public:
-  Blob<Dtype> data_, label_;
-};
-
 template <typename Dtype>
 class BasePrefetchingDataLayer :
     public BaseDataLayer<Dtype>, public InternalThread {
  public:
-  explicit BasePrefetchingDataLayer(const LayerParameter& param);
+  explicit BasePrefetchingDataLayer(const LayerParameter& param)
+      : BaseDataLayer<Dtype>(param) {}
   // LayerSetUp: implements common data layer setup functionality, and calls
   // DataLayerSetUp to do special data layer setup for individual layer types.
   // This method may not be overridden.
@@ -73,24 +67,22 @@ class BasePrefetchingDataLayer :
   virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
 
-  // Prefetches batches (asynchronously if to GPU memory)
-  static const int PREFETCH_COUNT = 3;
+  virtual void CreatePrefetchThread();
+  virtual void JoinPrefetchThread();
+  // The thread's function
+  virtual void InternalThreadEntry() {}
 
  protected:
-  virtual void InternalThreadEntry();
-  virtual void load_batch(Batch<Dtype>* batch) = 0;
-
-  Batch<Dtype> prefetch_[PREFETCH_COUNT];
-  BlockingQueue<Batch<Dtype>*> prefetch_free_;
-  BlockingQueue<Batch<Dtype>*> prefetch_full_;
-
+  Blob<Dtype> prefetch_data_;
+  Blob<Dtype> prefetch_label_;
   Blob<Dtype> transformed_data_;
 };
 
 template <typename Dtype>
 class DataLayer : public BasePrefetchingDataLayer<Dtype> {
  public:
-  explicit DataLayer(const LayerParameter& param);
+  explicit DataLayer(const LayerParameter& param)
+      : BasePrefetchingDataLayer<Dtype>(param) {}
   virtual ~DataLayer();
   virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
@@ -101,9 +93,10 @@ class DataLayer : public BasePrefetchingDataLayer<Dtype> {
   virtual inline int MaxTopBlobs() const { return 2; }
 
  protected:
-  virtual void load_batch(Batch<Dtype>* batch);
+  virtual void InternalThreadEntry();
 
-  DataReader reader_;
+  shared_ptr<db::DB> db_;
+  shared_ptr<db::Cursor> cursor_;
 };
 
 /**
@@ -242,7 +235,7 @@ class ImageDataLayer : public BasePrefetchingDataLayer<Dtype> {
  protected:
   shared_ptr<Caffe::RNG> prefetch_rng_;
   virtual void ShuffleImages();
-  virtual void load_batch(Batch<Dtype>* batch);
+  virtual void InternalThreadEntry();
 
   vector<std::pair<std::string, int> > lines_;
   int lines_id_;
@@ -314,7 +307,7 @@ class WindowDataLayer : public BasePrefetchingDataLayer<Dtype> {
 
  protected:
   virtual unsigned int PrefetchRand();
-  virtual void load_batch(Batch<Dtype>* batch);
+  virtual void InternalThreadEntry();
 
   shared_ptr<Caffe::RNG> prefetch_rng_;
   vector<std::pair<std::string, vector<int> > > image_database_;
diff --git a/include/caffe/data_reader.hpp b/include/caffe/data_reader.hpp
deleted file mode 100644
index 8ed5542cb8d..00000000000
--- a/include/caffe/data_reader.hpp
+++ /dev/null
@@ -1,82 +0,0 @@
-#ifndef CAFFE_DATA_READER_HPP_
-#define CAFFE_DATA_READER_HPP_
-
-#include <map>
-#include <string>
-#include <vector>
-
-#include "caffe/common.hpp"
-#include "caffe/internal_thread.hpp"
-#include "caffe/util/blocking_queue.hpp"
-#include "caffe/util/db.hpp"
-
-namespace caffe {
-
-/**
- * @brief Reads data from a source to queues available to data layers.
- * A single reading thread is created per source, even if multiple solvers
- * are running in parallel, e.g. for multi-GPU training. This makes sure
- * databases are read sequentially, and that each solver accesses a different
- * subset of the database. Data is distributed to solvers in a round-robin
- * way to keep parallel training deterministic.
- */
-class DataReader {
- public:
-  explicit DataReader(const LayerParameter& param);
-  ~DataReader();
-
-  inline BlockingQueue<Datum*>& free() const {
-    return queue_pair_->free_;
-  }
-  inline BlockingQueue<Datum*>& full() const {
-    return queue_pair_->full_;
-  }
-
- protected:
-  // Queue pairs are shared between a body and its readers
-  class QueuePair {
-   public:
-    explicit QueuePair(int size);
-    ~QueuePair();
-
-    BlockingQueue<Datum*> free_;
-    BlockingQueue<Datum*> full_;
-
-  DISABLE_COPY_AND_ASSIGN(QueuePair);
-  };
-
-  // A single body is created per source
-  class Body : public InternalThread {
-   public:
-    explicit Body(const LayerParameter& param);
-    virtual ~Body();
-
-   protected:
-    void InternalThreadEntry();
-    void read_one(db::Cursor* cursor, QueuePair* qp);
-
-    const LayerParameter param_;
-    BlockingQueue<shared_ptr<QueuePair> > new_queue_pairs_;
-
-    friend class DataReader;
-
-  DISABLE_COPY_AND_ASSIGN(Body);
-  };
-
-  // A source is uniquely identified by its layer name + path, in case
-  // the same database is read from two different locations in the net.
-  static inline string source_key(const LayerParameter& param) {
-    return param.name() + ":" + param.data_param().source();
-  }
-
-  const shared_ptr<QueuePair> queue_pair_;
-  shared_ptr<Body> body_;
-
-  static map<const string, boost::weak_ptr<DataReader::Body> > bodies_;
-
-DISABLE_COPY_AND_ASSIGN(DataReader);
-};
-
-}  // namespace caffe
-
-#endif  // CAFFE_DATA_READER_HPP_
diff --git a/include/caffe/internal_thread.hpp b/include/caffe/internal_thread.hpp
index 3c32a1d13b3..815ca54605e 100644
--- a/include/caffe/internal_thread.hpp
+++ b/include/caffe/internal_thread.hpp
@@ -14,22 +14,18 @@ namespace caffe {
 /**
  * Virtual class encapsulate boost::thread for use in base class
  * The child class will acquire the ability to run a single thread,
- * by reimplementing the virtual function InternalThreadEntry.
+ * by reimplementing the virutal function InternalThreadEntry.
  */
 class InternalThread {
  public:
-  InternalThread();
+  InternalThread() : thread_() {}
   virtual ~InternalThread();
 
-  /**
-   * Caffe's thread local state will be initialized using the current
-   * thread values, e.g. device id, solver index etc. The random seed
-   * is initialized using caffe_rng_rand.
-   */
-  void StartInternalThread();
+  /** Returns true if the thread was successfully started. **/
+  bool StartInternalThread();
 
   /** Will not return until the internal thread has exited. */
-  void StopInternalThread();
+  bool WaitForInternalThreadToExit();
 
   bool is_started() const;
 
@@ -38,18 +34,7 @@ class InternalThread {
       with the code you want your thread to run. */
   virtual void InternalThreadEntry() {}
 
-  /* Should be tested when running loops to exit when requested. */
-  bool must_stop();
-
- private:
-  void entry();
-
   shared_ptr<boost::thread> thread_;
-  int device_;
-  Caffe::Brew mode_;
-  int rand_seed_;
-  int solver_count_;
-  bool root_solver_;
 };
 
 }  // namespace caffe
diff --git a/include/caffe/layer_factory.hpp b/include/caffe/layer_factory.hpp
index 32e849de0d2..2fcd93869a0 100644
--- a/include/caffe/layer_factory.hpp
+++ b/include/caffe/layer_factory.hpp
@@ -71,9 +71,7 @@ class LayerRegistry {
 
   // Get a layer using a LayerParameter.
   static shared_ptr<Layer<Dtype> > CreateLayer(const LayerParameter& param) {
-    if (Caffe::root_solver()) {
-      LOG(INFO) << "Creating layer " << param.name();
-    }
+    LOG(INFO) << "Creating layer " << param.name();
     const string& type = param.type();
     CreatorRegistry& registry = Registry();
     CHECK_EQ(registry.count(type), 1) << "Unknown layer type: " << type
diff --git a/include/caffe/neuron_layers.hpp b/include/caffe/neuron_layers.hpp
index bf90257e9cc..9cf233f0eb3 100644
--- a/include/caffe/neuron_layers.hpp
+++ b/include/caffe/neuron_layers.hpp
@@ -431,6 +431,7 @@ class CuDNNReLULayer : public ReLULayer<Dtype> {
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
   bool handles_setup_;
+  cudnnHandle_t             handle_;
   cudnnTensorDescriptor_t bottom_desc_;
   cudnnTensorDescriptor_t top_desc_;
 };
@@ -513,6 +514,7 @@ class CuDNNSigmoidLayer : public SigmoidLayer<Dtype> {
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
   bool handles_setup_;
+  cudnnHandle_t             handle_;
   cudnnTensorDescriptor_t bottom_desc_;
   cudnnTensorDescriptor_t top_desc_;
 };
@@ -597,6 +599,7 @@ class CuDNNTanHLayer : public TanHLayer<Dtype> {
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
   bool handles_setup_;
+  cudnnHandle_t             handle_;
   cudnnTensorDescriptor_t bottom_desc_;
   cudnnTensorDescriptor_t top_desc_;
 };
diff --git a/include/caffe/parallel.hpp b/include/caffe/parallel.hpp
deleted file mode 100644
index 5f44a8702d3..00000000000
--- a/include/caffe/parallel.hpp
+++ /dev/null
@@ -1,123 +0,0 @@
-#ifndef CAFFE_PARALLEL_HPP_
-#define CAFFE_PARALLEL_HPP_
-
-#include <boost/date_time/posix_time/posix_time.hpp>
-
-#include <vector>
-
-#include "caffe/blob.hpp"
-#include "caffe/common.hpp"
-#include "caffe/internal_thread.hpp"
-#include "caffe/layer.hpp"
-#include "caffe/proto/caffe.pb.h"
-#include "caffe/solver.hpp"
-#include "caffe/syncedmem.hpp"
-#include "caffe/util/blocking_queue.hpp"
-
-namespace caffe {
-
-// Represents a net parameters. Once a net is created, its parameter buffers can
-// be replaced by ones from Params, to allow parallelization. Params ensures
-// parameters are allocated in one consecutive array.
-template<typename Dtype>
-class Params {
- public:
-  explicit Params(shared_ptr<Solver<Dtype> > root_solver);
-  virtual ~Params() {
-  }
-
-  inline size_t size() const {
-    return size_;
-  }
-  inline Dtype* data() const {
-    return data_;
-  }
-  inline Dtype* diff() const {
-    return diff_;
-  }
-
- protected:
-  const size_t size_;           // Size of buffers
-  Dtype* data_;                 // Network parameters
-  Dtype* diff_;                 // Gradient
-
-DISABLE_COPY_AND_ASSIGN(Params);
-};
-
-// Params stored in GPU memory.
-template<typename Dtype>
-class GPUParams : public Params<Dtype> {
- public:
-  GPUParams(shared_ptr<Solver<Dtype> > root_solver, int device);
-  virtual ~GPUParams();
-
-  void configure(Solver<Dtype>* solver) const;
-
- protected:
-  using Params<Dtype>::size_;
-  using Params<Dtype>::data_;
-  using Params<Dtype>::diff_;
- private:
-  int buffer_device_;
-};
-
-class DevicePair {
- public:
-  DevicePair(int parent, int device)
-      : parent_(parent),
-        device_(device) {
-  }
-  inline int parent() {
-    return parent_;
-  }
-  inline int device() {
-    return device_;
-  }
-
-  // Group GPUs in pairs, by proximity depending on machine's topology
-  static void compute(const vector<int> devices, vector<DevicePair>* pairs);
-
- protected:
-  int parent_;
-  int device_;
-};
-
-// Synchronous data parallelism using map-reduce between local GPUs.
-template<typename Dtype>
-class P2PSync : public GPUParams<Dtype>, public Solver<Dtype>::Callback,
-    public InternalThread {
- public:
-  explicit P2PSync(shared_ptr<Solver<Dtype> > root_solver,
-                   P2PSync<Dtype>* parent, const SolverParameter& param);
-  virtual ~P2PSync();
-
-  inline const shared_ptr<Solver<Dtype> >& solver() const {
-    return solver_;
-  }
-
-  static void run(shared_ptr<Solver<Dtype> > root, const vector<int>& gpus);
-
-  // Divide the batch size by the number of solvers
-  static void divide_batch_size(NetParameter* net);
-
- protected:
-  void on_start(Timer* timer, ostringstream* timing);
-  void on_gradients_ready(Timer* timer, ostringstream* timing);
-
-  void InternalThreadEntry();
-
-  P2PSync<Dtype>* parent_;
-  vector<P2PSync<Dtype>*> children_;
-  BlockingQueue<P2PSync<Dtype>*> queue_;
-  const int initial_iter_;
-  Dtype* parent_grads_;
-  shared_ptr<Solver<Dtype> > solver_;
-
-  using Params<Dtype>::size_;
-  using Params<Dtype>::data_;
-  using Params<Dtype>::diff_;
-};
-
-}  // namespace caffe
-
-#endif
diff --git a/include/caffe/solver.hpp b/include/caffe/solver.hpp
index b292ba25ae6..4dcdc3dc20b 100644
--- a/include/caffe/solver.hpp
+++ b/include/caffe/solver.hpp
@@ -5,7 +5,6 @@
 #include <vector>
 
 #include "caffe/net.hpp"
-#include "caffe/util/benchmark.hpp"
 
 namespace caffe {
 
@@ -33,30 +32,15 @@ class Solver {
   // function that restores the state from a SolverState protocol buffer.
   void Restore(const char* resume_file);
   virtual ~Solver() {}
-  inline const SolverParameter& param() const { return param_; }
   inline shared_ptr<Net<Dtype> > net() { return net_; }
   inline const vector<shared_ptr<Net<Dtype> > >& test_nets() {
     return test_nets_;
   }
   int iter() { return iter_; }
 
-  // Invoked at specific points during an iteration
-  class Callback {
-   protected:
-    virtual void on_start(Timer* timer, ostringstream* timing) = 0;
-    virtual void on_gradients_ready(Timer* timer, ostringstream* timing) = 0;
-
-    template <typename T>
-    friend class Solver;
-  };
-  const vector<Callback*>& callbacks() const { return callbacks_; }
-  void add_callback(Callback* value) {
-    callbacks_.push_back(value);
-  }
-
  protected:
-  // Get and apply the update value for the current iteration.
-  virtual void Iteration() {}
+  // Get the update value for the current iteration.
+  virtual void ComputeUpdateValue() = 0;
   // The Solver::Snapshot function implements the basic snapshotting utility
   // that stores the learned net. You should implement the SnapshotSolverState()
   // function that produces a SolverState protocol buffer that needs to be
@@ -65,12 +49,8 @@ class Solver {
   // The test routine
   void TestAll();
   void Test(const int test_net_id = 0);
-  virtual void SnapshotSolverState(SolverState* state) {
-    CHECK(false) << "Should be overriden";
-  }
-  virtual void RestoreSolverState(const SolverState& state) {
-    CHECK(false) << "Should be overriden";
-  }
+  virtual void SnapshotSolverState(SolverState* state) = 0;
+  virtual void RestoreSolverState(const SolverState& state) = 0;
   void DisplayOutputBlobs(const int net_id);
 
   SolverParameter param_;
@@ -78,10 +58,6 @@ class Solver {
   int current_step_;
   shared_ptr<Net<Dtype> > net_;
   vector<shared_ptr<Net<Dtype> > > test_nets_;
-  vector<Callback*> callbacks_;
-
-  Timer iteration_timer_;
-  float iterations_last_;
 
   DISABLE_COPY_AND_ASSIGN(Solver);
 };
@@ -104,9 +80,7 @@ class SGDSolver : public Solver<Dtype> {
  protected:
   void PreSolve();
   Dtype GetLearningRate();
-  virtual void Iteration();
-  virtual void Regularize(int param_id);
-  virtual void ComputeUpdateValue(int param_id, Dtype rate);
+  virtual void ComputeUpdateValue();
   virtual void ClipGradients();
   virtual void SnapshotSolverState(SolverState * state);
   virtual void RestoreSolverState(const SolverState& state);
@@ -116,9 +90,6 @@ class SGDSolver : public Solver<Dtype> {
   //   of gradients/updates and is not needed in snapshots
   vector<shared_ptr<Blob<Dtype> > > history_, update_, temp_;
 
-  using Solver<Dtype>::iteration_timer_;
-  using Solver<Dtype>::iterations_last_;
-
   DISABLE_COPY_AND_ASSIGN(SGDSolver);
 };
 
@@ -131,7 +102,7 @@ class NesterovSolver : public SGDSolver<Dtype> {
       : SGDSolver<Dtype>(param_file) {}
 
  protected:
-  virtual void ComputeUpdateValue(int param_id, Dtype rate);
+  virtual void ComputeUpdateValue();
 
   DISABLE_COPY_AND_ASSIGN(NesterovSolver);
 };
@@ -145,7 +116,7 @@ class AdaGradSolver : public SGDSolver<Dtype> {
       : SGDSolver<Dtype>(param_file) { constructor_sanity_check(); }
 
  protected:
-  virtual void ComputeUpdateValue(int param_id, Dtype rate);
+  virtual void ComputeUpdateValue();
   void constructor_sanity_check() {
     CHECK_EQ(0, this->param_.momentum())
         << "Momentum cannot be used with AdaGrad.";
diff --git a/include/caffe/syncedmem.hpp b/include/caffe/syncedmem.hpp
index 62aadef498d..1b726de9564 100644
--- a/include/caffe/syncedmem.hpp
+++ b/include/caffe/syncedmem.hpp
@@ -8,29 +8,26 @@
 
 namespace caffe {
 
-// If CUDA is available and in GPU mode, host memory will be allocated pinned,
-// using cudaMallocHost. It avoids dynamic pinning for transfers (DMA).
-// The improvement in performance seems negligible in the single GPU case,
-// but might be more significant for parallel training. Most importantly,
-// it improved stability for large models on many GPUs.
+// Theoretically, CaffeMallocHost and CaffeFreeHost should simply call the
+// cudaMallocHost and cudaFree functions in order to create pinned memory.
+// However, those codes rely on the existence of a cuda GPU (I don't know
+// why that is a must since allocating memory should not be accessing the
+// GPU resource, but it just creates an error as of Cuda 5.0) and will cause
+// problem when running on a machine without GPU. Thus, we simply define
+// these two functions for safety and possible future change if the problem
+// of calling cuda functions disappears in a future version.
+//
+// In practice, although we are creating unpinned memory here, as long as we
+// are constantly accessing them the memory pages almost always stays in
+// the physical memory (assuming we have large enough memory installed), and
+// does not seem to create a memory bottleneck here.
+
 inline void CaffeMallocHost(void** ptr, size_t size) {
-#ifndef CPU_ONLY
-  if (Caffe::mode() == Caffe::GPU) {
-    CUDA_CHECK(cudaMallocHost(ptr, size));
-    return;
-  }
-#endif
   *ptr = malloc(size);
   CHECK(*ptr) << "host allocation of size " << size << " failed";
 }
 
 inline void CaffeFreeHost(void* ptr) {
-#ifndef CPU_ONLY
-  if (Caffe::mode() == Caffe::GPU) {
-    CUDA_CHECK(cudaFreeHost(ptr));
-    return;
-  }
-#endif
   free(ptr);
 }
 
@@ -45,25 +42,20 @@ class SyncedMemory {
  public:
   SyncedMemory()
       : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED),
-        own_cpu_data_(false), own_gpu_data_(false), gpu_device_(-1) {}
+        own_cpu_data_(false) {}
   explicit SyncedMemory(size_t size)
       : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED),
-        own_cpu_data_(false), own_gpu_data_(false), gpu_device_(-1) {}
+        own_cpu_data_(false) {}
   ~SyncedMemory();
   const void* cpu_data();
   void set_cpu_data(void* data);
   const void* gpu_data();
-  void set_gpu_data(void* data);
   void* mutable_cpu_data();
   void* mutable_gpu_data();
   enum SyncedHead { UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED };
   SyncedHead head() { return head_; }
   size_t size() { return size_; }
 
-#ifndef CPU_ONLY
-  void async_gpu_push(const cudaStream_t& stream);
-#endif
-
  private:
   void to_cpu();
   void to_gpu();
@@ -72,8 +64,6 @@ class SyncedMemory {
   size_t size_;
   SyncedHead head_;
   bool own_cpu_data_;
-  bool own_gpu_data_;
-  int gpu_device_;
 
   DISABLE_COPY_AND_ASSIGN(SyncedMemory);
 };  // class SyncedMemory
diff --git a/include/caffe/test/test_gradient_check_util.hpp b/include/caffe/test/test_gradient_check_util.hpp
index 80fa9f26e11..22937711b58 100644
--- a/include/caffe/test/test_gradient_check_util.hpp
+++ b/include/caffe/test/test_gradient_check_util.hpp
@@ -84,10 +84,7 @@ void GradientChecker<Dtype>::CheckGradientSingle(Layer<Dtype>* layer,
   vector<Blob<Dtype>*> blobs_to_check;
   vector<bool> propagate_down(bottom.size(), check_bottom < 0);
   for (int i = 0; i < layer->blobs().size(); ++i) {
-    // blobs_to_check.push_back(layer->blobs()[i].get());
-    Blob<Dtype>* blob = layer->blobs()[i].get();
-    caffe_set(blob->count(), static_cast<Dtype>(0), blob->mutable_cpu_diff());
-    blobs_to_check.push_back(blob);
+    blobs_to_check.push_back(layer->blobs()[i].get());
   }
   if (check_bottom < 0) {
     for (int i = 0; i < bottom.size(); ++i) {
diff --git a/include/caffe/util/blocking_queue.hpp b/include/caffe/util/blocking_queue.hpp
deleted file mode 100644
index 955e12cc567..00000000000
--- a/include/caffe/util/blocking_queue.hpp
+++ /dev/null
@@ -1,47 +0,0 @@
-#ifndef CAFFE_UTIL_BLOCKING_QUEUE_HPP_
-#define CAFFE_UTIL_BLOCKING_QUEUE_HPP_
-
-#include <queue>
-#include <string>
-
-#include "caffe/common.hpp"
-
-namespace caffe {
-
-template<typename T>
-class BlockingQueue {
- public:
-  explicit BlockingQueue();
-
-  void push(const T& t);
-
-  bool try_pop(T* t);
-
-  // This logs a message if the threads needs to be blocked
-  // useful for detecting e.g. when data feeding is too slow
-  T pop(const string& log_on_wait = "");
-
-  bool try_peek(T* t);
-
-  // Return element without removing it
-  T peek();
-
-  size_t size() const;
-
- protected:
-  /**
-   Move synchronization fields out instead of including boost/thread.hpp
-   to avoid a boost/NVCC issues (#1009, #1010) on OSX. Also fails on
-   Linux CUDA 7.0.18.
-   */
-  class sync;
-
-  std::queue<T> queue_;
-  shared_ptr<sync> sync_;
-
-DISABLE_COPY_AND_ASSIGN(BlockingQueue);
-};
-
-}  // namespace caffe
-
-#endif
diff --git a/include/caffe/util/device_alternate.hpp b/include/caffe/util/device_alternate.hpp
index 55d616fbf71..6ea595dba2d 100644
--- a/include/caffe/util/device_alternate.hpp
+++ b/include/caffe/util/device_alternate.hpp
@@ -66,17 +66,6 @@ void classname<Dtype>::funcname##_##gpu(const vector<Blob<Dtype>*>& top, \
       << caffe::curandGetErrorString(status); \
   } while (0)
 
-#ifdef USE_CNMEM
-
-#define CNMEM_CHECK(condition) \
-  do { \
-    cnmemStatus_t status = condition; \
-    CHECK_EQ(status, CNMEM_STATUS_SUCCESS) << " " \
-      << cnmemGetErrorString(status); \
-  } while (0)
-
-#endif
-
 // CUDA: grid stride looping
 #define CUDA_KERNEL_LOOP(i, n) \
   for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp
index 09d3a10c52c..a6bd86a93f5 100644
--- a/include/caffe/vision_layers.hpp
+++ b/include/caffe/vision_layers.hpp
@@ -244,24 +244,15 @@ class CuDNNConvolutionLayer : public ConvolutionLayer<Dtype> {
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
   bool handles_setup_;
-
-  // algorithms for forward and backwards convolutions
-  cudnnConvolutionFwdAlgo_t *fwd_algo_;
-  cudnnConvolutionBwdFilterAlgo_t *bwd_filter_algo_;
-  cudnnConvolutionBwdDataAlgo_t *bwd_data_algo_;
-
+  cudnnHandle_t* handle_;
+  cudaStream_t*  stream_;
   vector<cudnnTensorDescriptor_t> bottom_descs_, top_descs_;
   cudnnTensorDescriptor_t    bias_desc_;
   cudnnFilterDescriptor_t      filter_desc_;
   vector<cudnnConvolutionDescriptor_t> conv_descs_;
   int bottom_offset_, top_offset_, weight_offset_, bias_offset_;
-
-  size_t *workspace_fwd_sizes_;
-  size_t *workspace_bwd_data_sizes_;
-  size_t *workspace_bwd_filter_sizes_;
-  size_t workspaceSizeInBytes;  // size of underlying storage
-  void *workspaceData;  // underlying storage
-  void **workspace;  // aliases into workspaceData
+  size_t workspaceSizeInBytes;
+  void *workspace;
 };
 #endif
 
@@ -382,63 +373,6 @@ class LRNLayer : public Layer<Dtype> {
   vector<Blob<Dtype>*> product_bottom_vec_;
 };
 
-#ifdef USE_CUDNN
-
-template <typename Dtype>
-class CuDNNLRNLayer : public LRNLayer<Dtype> {
- public:
-  explicit CuDNNLRNLayer(const LayerParameter& param)
-      : LRNLayer<Dtype>(param), handles_setup_(false) {}
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual ~CuDNNLRNLayer();
-
- protected:
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-  bool handles_setup_;
-  cudnnLRNDescriptor_t norm_desc_;
-  cudnnTensorDescriptor_t bottom_desc_, top_desc_;
-
-  int size_;
-  Dtype alpha_, beta_, k_;
-};
-
-template <typename Dtype>
-class CuDNNLCNLayer : public LRNLayer<Dtype> {
- public:
-  explicit CuDNNLCNLayer(const LayerParameter& param)
-      : LRNLayer<Dtype>(param), handles_setup_(false), tempDataSize(0),
-        tempData1(NULL), tempData2(NULL) {}
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual ~CuDNNLCNLayer();
-
- protected:
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-  bool handles_setup_;
-  cudnnLRNDescriptor_t norm_desc_;
-  cudnnTensorDescriptor_t bottom_desc_, top_desc_;
-
-  int size_, pre_pad_;
-  Dtype alpha_, beta_, k_;
-
-  size_t tempDataSize;
-  void *tempData1, *tempData2;
-};
-
-#endif
 
 /**
  * @brief Pools the input image by taking the max, average, etc. within regions.
@@ -512,6 +446,7 @@ class CuDNNPoolingLayer : public PoolingLayer<Dtype> {
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
   bool handles_setup_;
+  cudnnHandle_t             handle_;
   cudnnTensorDescriptor_t bottom_desc_, top_desc_;
   cudnnPoolingDescriptor_t  pooling_desc_;
   cudnnPoolingMode_t        mode_;
diff --git a/python/caffe/draw.py b/python/caffe/draw.py
index 8661f6d0648..08b7c1de14b 100644
--- a/python/caffe/draw.py
+++ b/python/caffe/draw.py
@@ -59,7 +59,7 @@ def determine_node_label_by_layertype(layer, layertype, rankdir):
     else:
         # If graph orientation is horizontal, vertical space is free and
         # horizontal space is not; separate words with newlines
-        separator = '\\n'
+        separator = '\n'
 
     if layertype == 'Convolution':
         # Outer double quotes needed or else colon characters don't parse
diff --git a/scripts/travis/travis_install.sh b/scripts/travis/travis_install.sh
index 04af643a5e2..0e8c37861b0 100755
--- a/scripts/travis/travis_install.sh
+++ b/scripts/travis/travis_install.sh
@@ -62,7 +62,7 @@ rm -f $LMDB_FILE
 # than using pip for everything).
 wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh
 chmod +x miniconda.sh
-./miniconda.sh -b -p /home/travis/miniconda
+./miniconda.sh -b
 export PATH=/home/travis/miniconda/bin:$PATH
 conda update --yes conda
 conda install --yes numpy scipy matplotlib scikit-image pip
diff --git a/scripts/travis/travis_setup_makefile_config.sh b/scripts/travis/travis_setup_makefile_config.sh
index 0b62bc2e0f9..ba326262bf8 100755
--- a/scripts/travis/travis_setup_makefile_config.sh
+++ b/scripts/travis/travis_setup_makefile_config.sh
@@ -20,5 +20,4 @@ PYTHON_LIB := $(ANACONDA_HOME)/lib
 INCLUDE_DIRS := $(PYTHON_INCLUDE) /usr/local/include
 LIBRARY_DIRS := $(PYTHON_LIB) /usr/local/lib /usr/lib
 WITH_PYTHON_LAYER := 1
-USE_CUDNN := 0
 EOF
diff --git a/src/caffe/CMakeLists.txt b/src/caffe/CMakeLists.txt
index 73ecd4496b1..40e6c11f5b0 100644
--- a/src/caffe/CMakeLists.txt
+++ b/src/caffe/CMakeLists.txt
@@ -20,7 +20,6 @@ endif()
 add_library(caffe ${srcs})
 target_link_libraries(caffe proto ${Caffe_LINKER_LIBS})
 caffe_default_properties(caffe)
-set_target_properties(caffe PROPERTIES OUTPUT_NAME "caffe-nv")
 
 # ---[ Tests
  add_subdirectory(test)
diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp
index 4075f1402d0..af96cac40aa 100644
--- a/src/caffe/common.cpp
+++ b/src/caffe/common.cpp
@@ -1,25 +1,13 @@
-#include <boost/thread.hpp>
 #include <glog/logging.h>
 #include <cstdio>
 #include <ctime>
-#include <vector>
 
 #include "caffe/common.hpp"
 #include "caffe/util/rng.hpp"
 
 namespace caffe {
 
-static boost::shared_ptr<MemoryHandler> mem_handler;
-// Make sure each thread can have different values.
-static boost::thread_specific_ptr<Caffe> thread_instance_;
-
-
-Caffe& Caffe::Get() {
-  if (!thread_instance_.get()) {
-    thread_instance_.reset(new Caffe());
-  }
-  return *(thread_instance_.get());
-}
+shared_ptr<Caffe> Caffe::singleton_;
 
 // random seeding
 int64_t cluster_seedgen(void) {
@@ -54,8 +42,7 @@ void GlobalInit(int* pargc, char*** pargv) {
 #ifdef CPU_ONLY  // CPU-only Caffe.
 
 Caffe::Caffe()
-    : random_generator_(), mode_(Caffe::CPU),
-      solver_count_(1), root_solver_(true) { }
+    : random_generator_(), mode_(Caffe::CPU) { }
 
 Caffe::~Caffe() { }
 
@@ -98,12 +85,8 @@ void* Caffe::RNG::generator() {
 #else  // Normal GPU + CPU Caffe.
 
 Caffe::Caffe()
-    : cublas_handle_(NULL), curand_generator_(NULL),
-#ifdef USE_CUDNN
-    cudnn_handle_(NULL),
-#endif
-    random_generator_(),
-    mode_(Caffe::CPU), solver_count_(1), root_solver_(true) {
+    : cublas_handle_(NULL), curand_generator_(NULL), random_generator_(),
+    mode_(Caffe::CPU) {
   // Try to create a cublas handler, and report an error if failed (but we will
   // keep the program running as one might just want to run CPU code).
   if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) {
@@ -116,11 +99,6 @@ Caffe::Caffe()
       != CURAND_STATUS_SUCCESS) {
     LOG(ERROR) << "Cannot create Curand generator. Curand won't be available.";
   }
-#ifdef USE_CUDNN
-  if (cudnnCreate(&cudnn_handle_) != CUDNN_STATUS_SUCCESS) {
-    LOG(ERROR) << "Cannot create cuDNN handle. cuDNN won't be available.";
-  }
-#endif
 }
 
 Caffe::~Caffe() {
@@ -128,9 +106,6 @@ Caffe::~Caffe() {
   if (curand_generator_) {
     CURAND_CHECK(curandDestroyGenerator(curand_generator_));
   }
-#ifdef USE_CUDNN
-  if (cudnn_handle_) CUDNN_CHECK(cudnnDestroy(cudnn_handle_));
-#endif
 }
 
 void Caffe::set_random_seed(const unsigned int seed) {
@@ -169,10 +144,6 @@ void Caffe::SetDevice(const int device_id) {
       CURAND_RNG_PSEUDO_DEFAULT));
   CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_,
       cluster_seedgen()));
-#ifdef USE_CUDNN
-  if (Get().cublas_handle_) CUDNN_CHECK(cudnnDestroy(Get().cudnn_handle_));
-  CUDNN_CHECK(cudnnCreate(&Get().cudnn_handle_));
-#endif
 }
 
 void Caffe::DeviceQuery() {
@@ -210,6 +181,7 @@ void Caffe::DeviceQuery() {
   return;
 }
 
+
 class Caffe::RNG::Generator {
  public:
   Generator() : rng_(new caffe::rng_t(cluster_seedgen())) {}
@@ -232,115 +204,6 @@ void* Caffe::RNG::generator() {
   return static_cast<void*>(generator_->rng());
 }
 
-static boost::mutex memHandlerMutex;
-
-MemoryHandler& MemoryHandler::Get() {
-  boost::mutex::scoped_lock lock(memHandlerMutex);
-  if (!mem_handler.get()) {
-    mem_handler.reset(new MemoryHandler());
-  }
-  return *(mem_handler.get());
-}
-
-void MemoryHandler::mallocGPU(void **ptr, size_t size, cudaStream_t stream) {
-  if (!Get().initialized_) {
-    Init();
-  }
-  Get().allocate_memory(ptr, size, stream);
-}
-
-void MemoryHandler::freeGPU(void *ptr, cudaStream_t stream) {
-  Get().free_memory(ptr, stream);
-}
-
-void MemoryHandler::allocate_memory(void **ptr, size_t size,
-                                    cudaStream_t stream) {
-  int initial_device;
-  cudaGetDevice(&initial_device);
-  if (size == 0) return;
-  if (using_pool_) {
-#ifdef USE_CNMEM
-    CNMEM_CHECK(cnmemMalloc(ptr, size, stream));
-#endif
-  } else {
-    CUDA_CHECK(cudaMalloc(ptr, size));
-  }
-  cudaSetDevice(initial_device);
-}
-
-void MemoryHandler::free_memory(void *ptr, cudaStream_t stream) {
-  // boost::mutex::scoped_lock lock(memHandlerMutex);
-  int initial_device;
-  cudaGetDevice(&initial_device);
-  if (using_pool_) {
-#ifdef USE_CNMEM
-    CNMEM_CHECK(cnmemFree(ptr, stream));
-#endif
-  } else {
-    CUDA_CHECK(cudaFree(ptr));
-  }
-  ptr = NULL;
-  cudaSetDevice(initial_device);
-}
-
-void MemoryHandler::registerStream(cudaStream_t stream) {
-  if (!Get().initialized_) {
-    Init();
-  }
-  if (Get().using_pool_) {
-#ifdef USE_CNMEM
-    CNMEM_CHECK(cnmemRegisterStream(stream));
-#endif
-  }
-}
-
-void MemoryHandler::destroy() {
-#ifdef USE_CNMEM
-  CNMEM_CHECK(cnmemFinalize());
-#endif
-}
-
-void MemoryHandler::Init() {
-  if (Get().using_pool_) {
-#ifdef USE_CNMEM
-    cnmemDevice_t *devs = new cnmemDevice_t[Get().gpus_.size()];
-
-    int initial_device;
-    CUDA_CHECK(cudaGetDevice(&initial_device));
-
-    for (int i = 0; i < Get().gpus_.size(); i++) {
-      CUDA_CHECK(cudaSetDevice(Get().gpus_[i]));
-
-      devs[i].device = Get().gpus_[i];
-
-      size_t free_mem, used_mem;
-      CUDA_CHECK(cudaMemGetInfo(&free_mem, &used_mem));
-
-      devs[i].size = size_t(0.95*free_mem);
-      devs[i].numStreams = 0;
-      devs[i].streams = NULL;
-    }
-    CNMEM_CHECK(cnmemInit(Get().gpus_.size(), devs, CNMEM_FLAGS_DEFAULT));
-    Get().initialized_ = true;
-
-    CUDA_CHECK(cudaSetDevice(initial_device));
-
-    delete [] devs;
-#endif
-  }
-  Get().initialized_ = true;
-}
-
-void MemoryHandler::getInfo(size_t *free_mem, size_t *total_mem) {
-  if (Get().using_pool_) {
-#ifdef USE_CNMEM
-    CNMEM_CHECK(cnmemMemGetInfo(free_mem, total_mem, cudaStreamDefault));
-#endif
-  } else {
-    CUDA_CHECK(cudaMemGetInfo(free_mem, total_mem));
-  }
-}
-
 const char* cublasGetErrorString(cublasStatus_t error) {
   switch (error) {
   case CUBLAS_STATUS_SUCCESS:
diff --git a/src/caffe/data_reader.cpp b/src/caffe/data_reader.cpp
deleted file mode 100644
index 16378203a88..00000000000
--- a/src/caffe/data_reader.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-#include <boost/thread.hpp>
-#include <map>
-#include <string>
-#include <vector>
-
-#include "caffe/common.hpp"
-#include "caffe/data_layers.hpp"
-#include "caffe/data_reader.hpp"
-#include "caffe/proto/caffe.pb.h"
-
-namespace caffe {
-
-using boost::weak_ptr;
-
-map<const string, weak_ptr<DataReader::Body> > DataReader::bodies_;
-static boost::mutex bodies_mutex_;
-
-DataReader::DataReader(const LayerParameter& param)
-    : queue_pair_(new QueuePair(  //
-        param.data_param().prefetch() * param.data_param().batch_size())) {
-  // Get or create a body
-  boost::mutex::scoped_lock lock(bodies_mutex_);
-  string key = source_key(param);
-  weak_ptr<Body>& weak = bodies_[key];
-  body_ = weak.lock();
-  if (!body_) {
-    body_.reset(new Body(param));
-    bodies_[key] = weak_ptr<Body>(body_);
-  }
-  body_->new_queue_pairs_.push(queue_pair_);
-}
-
-DataReader::~DataReader() {
-  string key = source_key(body_->param_);
-  body_.reset();
-  boost::mutex::scoped_lock lock(bodies_mutex_);
-  if (bodies_[key].expired()) {
-    bodies_.erase(key);
-  }
-}
-
-//
-
-DataReader::QueuePair::QueuePair(int size) {
-  // Initialize the free queue with requested number of datums
-  for (int i = 0; i < size; ++i) {
-    free_.push(new Datum());
-  }
-}
-
-DataReader::QueuePair::~QueuePair() {
-  Datum* datum;
-  while (free_.try_pop(&datum)) {
-    delete datum;
-  }
-  while (full_.try_pop(&datum)) {
-    delete datum;
-  }
-}
-
-//
-
-DataReader::Body::Body(const LayerParameter& param)
-    : param_(param),
-      new_queue_pairs_() {
-  StartInternalThread();
-}
-
-DataReader::Body::~Body() {
-  StopInternalThread();
-}
-
-void DataReader::Body::InternalThreadEntry() {
-  shared_ptr<db::DB> db(db::GetDB(param_.data_param().backend()));
-  db->Open(param_.data_param().source(), db::READ);
-  shared_ptr<db::Cursor> cursor(db->NewCursor());
-  vector<shared_ptr<QueuePair> > qps;
-  try {
-    int solver_count = param_.phase() == TRAIN ? Caffe::solver_count() : 1;
-
-    // To ensure deterministic runs, only start running once all solvers
-    // are ready. But solvers need to peek on one item during initialization,
-    // so read one item, then wait for the next solver.
-    for (int i = 0; i < solver_count; ++i) {
-      shared_ptr<QueuePair> qp(new_queue_pairs_.pop());
-      read_one(cursor.get(), qp.get());
-      qps.push_back(qp);
-    }
-    // Main loop
-    while (!must_stop()) {
-      for (int i = 0; i < solver_count; ++i) {
-        read_one(cursor.get(), qps[i].get());
-      }
-      // Check no additional readers have been created. This can happen if
-      // more than one net is trained at a time per process, whether single
-      // or multi solver. It might also happen if two data layers have same
-      // name and same source.
-      CHECK_EQ(new_queue_pairs_.size(), 0);
-    }
-  } catch (boost::thread_interrupted&) {
-    // Interrupted exception is expected on shutdown
-  }
-}
-
-void DataReader::Body::read_one(db::Cursor* cursor, QueuePair* qp) {
-  Datum* datum = qp->free_.pop();
-  // TODO deserialize in-place instead of copy?
-  datum->ParseFromString(cursor->value());
-  qp->full_.push(datum);
-
-  // go to the next iter
-  cursor->Next();
-  if (!cursor->valid()) {
-    DLOG(INFO) << "Restarting data prefetching from start.";
-    cursor->SeekToFirst();
-  }
-}
-
-}  // namespace caffe
diff --git a/src/caffe/data_transformer.cpp b/src/caffe/data_transformer.cpp
index 482b8c09d24..b0b98e478c1 100644
--- a/src/caffe/data_transformer.cpp
+++ b/src/caffe/data_transformer.cpp
@@ -19,9 +19,7 @@ DataTransformer<Dtype>::DataTransformer(const TransformationParameter& param,
     CHECK_EQ(param_.mean_value_size(), 0) <<
       "Cannot specify mean_file and mean_value at the same time";
     const string& mean_file = param.mean_file();
-    if (Caffe::root_solver()) {
-      LOG(INFO) << "Loading mean file from: " << mean_file;
-    }
+    LOG(INFO) << "Loading mean file from: " << mean_file;
     BlobProto blob_proto;
     ReadProtoFromBinaryFileOrDie(mean_file.c_str(), &blob_proto);
     data_mean_.FromProto(blob_proto);
diff --git a/src/caffe/internal_thread.cpp b/src/caffe/internal_thread.cpp
index 2402a192e7e..c2d19d433b4 100644
--- a/src/caffe/internal_thread.cpp
+++ b/src/caffe/internal_thread.cpp
@@ -1,75 +1,40 @@
 #include <boost/thread.hpp>
-#include <exception>
-
 #include "caffe/internal_thread.hpp"
-#include "caffe/util/math_functions.hpp"
 
 namespace caffe {
 
-InternalThread::InternalThread()
-    : thread_(),
-      device_(),
-      mode_(),
-      rand_seed_(),
-      solver_count_(),
-      root_solver_() {
-}
-
 InternalThread::~InternalThread() {
-  StopInternalThread();
+  WaitForInternalThreadToExit();
 }
 
 bool InternalThread::is_started() const {
-  return thread_ && thread_->joinable();
-}
-
-bool InternalThread::must_stop() {
-  return thread_ && thread_->interruption_requested();
+  return thread_.get() != NULL && thread_->joinable();
 }
 
-void InternalThread::StartInternalThread() {
-  // TODO switch to failing once Caffe prefetch thread is persistent.
-  // Threads should not be started and stopped repeatedly.
-  // CHECK(!is_started());
-  StopInternalThread();
-
-#ifndef CPU_ONLY
-  CUDA_CHECK(cudaGetDevice(&device_));
-#endif
-  mode_ = Caffe::mode();
-  rand_seed_ = caffe_rng_rand();
-  solver_count_ = Caffe::solver_count();
-  root_solver_ = Caffe::root_solver();
 
+bool InternalThread::StartInternalThread() {
+  if (!WaitForInternalThreadToExit()) {
+    return false;
+  }
   try {
-    thread_.reset(new boost::thread(&InternalThread::entry, this));
-  } catch (std::exception& e) {
-    CHECK(false) << e.what();
+    thread_.reset(
+        new boost::thread(&InternalThread::InternalThreadEntry, this));
+  } catch (...) {
+    return false;
   }
+  return true;
 }
 
-void InternalThread::entry() {
-#ifndef CPU_ONLY
-  CUDA_CHECK(cudaSetDevice(device_));
-#endif
-  Caffe::set_mode(mode_);
-  Caffe::set_random_seed(rand_seed_);
-  Caffe::set_solver_count(solver_count_);
-  Caffe::set_root_solver(root_solver_);
-
-  InternalThreadEntry();
-}
-
-void InternalThread::StopInternalThread() {
+/** Will not return until the internal thread has exited. */
+bool InternalThread::WaitForInternalThreadToExit() {
   if (is_started()) {
-    thread_->interrupt();
     try {
       thread_->join();
-    } catch (boost::thread_interrupted&) {
-    } catch (std::exception& e) {
-      CHECK(false) << e.what();
+    } catch (...) {
+      return false;
     }
   }
+  return true;
 }
 
 }  // namespace caffe
diff --git a/src/caffe/layer_factory.cpp b/src/caffe/layer_factory.cpp
index c7bb9643af6..d6a1cac5090 100644
--- a/src/caffe/layer_factory.cpp
+++ b/src/caffe/layer_factory.cpp
@@ -50,6 +50,12 @@ shared_ptr<Layer<Dtype> > GetPoolingLayer(const LayerParameter& param) {
 #ifdef USE_CUDNN
   } else if (engine == PoolingParameter_Engine_CUDNN) {
     PoolingParameter p_param = param.pooling_param();
+    if (p_param.pad() || p_param.pad_h() || p_param.pad_w() ||
+        param.top_size() > 1) {
+      LOG(INFO) << "CUDNN does not support padding or multiple tops. "
+                << "Using Caffe's own pooling layer.";
+      return shared_ptr<Layer<Dtype> >(new PoolingLayer<Dtype>(param));
+    }
     return shared_ptr<Layer<Dtype> >(new CuDNNPoolingLayer<Dtype>(param));
 #endif
   } else {
@@ -59,45 +65,6 @@ shared_ptr<Layer<Dtype> > GetPoolingLayer(const LayerParameter& param) {
 
 REGISTER_LAYER_CREATOR(Pooling, GetPoolingLayer);
 
-// Get LRN layer according to engine
-template <typename Dtype>
-shared_ptr<Layer<Dtype> > GetLRNLayer(const LayerParameter& param) {
-  LRNParameter_Engine engine = param.lrn_param().engine();
-
-  if (engine == LRNParameter_Engine_DEFAULT) {
-    engine = LRNParameter_Engine_CAFFE;
-#ifdef USE_CUDNN
-    engine = LRNParameter_Engine_CUDNN;
-#endif
-  }
-
-  if (engine == LRNParameter_Engine_CAFFE) {
-    return shared_ptr<Layer<Dtype> >(new LRNLayer<Dtype>(param));
-#ifdef USE_CUDNN
-  } else if (engine == LRNParameter_Engine_CUDNN) {
-    LRNParameter lrn_param = param.lrn_param();
-
-    if (lrn_param.norm_region() ==LRNParameter_NormRegion_WITHIN_CHANNEL) {
-      // not valid for cudnn
-      // return shared_ptr<Layer<Dtype> >(new LRNLayer<Dtype>(param));
-      return shared_ptr<Layer<Dtype> >(new CuDNNLCNLayer<Dtype>(param));
-    } else {
-      // local size is too big to be handled through cuDNN
-      if (param.lrn_param().local_size() > CUDNN_LRN_MAX_N) {
-        return shared_ptr<Layer<Dtype> >(new LRNLayer<Dtype>(param));
-      } else {
-        // return shared_ptr<Layer<Dtype> >(new LRNLayer<Dtype>(param));
-        return shared_ptr<Layer<Dtype> >(new CuDNNLRNLayer<Dtype>(param));
-      }
-    }
-#endif
-  } else {
-    LOG(FATAL) << "Layer " << param.name() << " has unknown engine.";
-  }
-}
-
-REGISTER_LAYER_CREATOR(LRN, GetLRNLayer);
-
 // Get relu layer according to engine.
 template <typename Dtype>
 shared_ptr<Layer<Dtype> > GetReLULayer(const LayerParameter& param) {
diff --git a/src/caffe/layers/absval_layer.cu b/src/caffe/layers/absval_layer.cu
index bb310e1afbb..91f3c77fe9a 100644
--- a/src/caffe/layers/absval_layer.cu
+++ b/src/caffe/layers/absval_layer.cu
@@ -18,6 +18,7 @@ template <typename Dtype>
 void AbsValLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
   const int count = top[0]->count();
+  const Dtype* top_data = top[0]->gpu_data();
   const Dtype* top_diff = top[0]->gpu_diff();
   if (propagate_down[0]) {
     const Dtype* bottom_data = bottom[0]->gpu_data();
diff --git a/src/caffe/layers/base_data_layer.cpp b/src/caffe/layers/base_data_layer.cpp
index 71504c11bfd..931e4a9c0ab 100644
--- a/src/caffe/layers/base_data_layer.cpp
+++ b/src/caffe/layers/base_data_layer.cpp
@@ -1,9 +1,7 @@
-#include <boost/thread.hpp>
 #include <string>
 #include <vector>
 
 #include "caffe/data_layers.hpp"
-#include "caffe/net.hpp"
 #include "caffe/util/io.hpp"
 
 namespace caffe {
@@ -29,91 +27,54 @@ void BaseDataLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
   data_transformer_->InitRand();
 }
 
-template <typename Dtype>
-BasePrefetchingDataLayer<Dtype>::BasePrefetchingDataLayer(
-    const LayerParameter& param)
-    : BaseDataLayer<Dtype>(param),
-      prefetch_free_(), prefetch_full_() {
-  for (int i = 0; i < PREFETCH_COUNT; ++i) {
-    prefetch_free_.push(&prefetch_[i]);
-  }
-}
-
 template <typename Dtype>
 void BasePrefetchingDataLayer<Dtype>::LayerSetUp(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
   BaseDataLayer<Dtype>::LayerSetUp(bottom, top);
-
-  // Before starting the prefetch thread, we make cpu_data and gpu_data
-  // calls so that the prefetch thread does not accidentally make simultaneous
-  // cudaMalloc calls when the main thread is running. In some GPUs this
-  // seems to cause failures if we do not so.
-  for (int i = 0; i < PREFETCH_COUNT; ++i) {
-    prefetch_[i].data_.mutable_cpu_data();
-    if (this->output_labels_) {
-      prefetch_[i].label_.mutable_cpu_data();
-    }
-  }
-#ifndef CPU_ONLY
-  if (Caffe::mode() == Caffe::GPU) {
-    for (int i = 0; i < PREFETCH_COUNT; ++i) {
-      prefetch_[i].data_.mutable_gpu_data();
-      if (this->output_labels_) {
-        prefetch_[i].label_.mutable_gpu_data();
-      }
-    }
+  // Now, start the prefetch thread. Before calling prefetch, we make two
+  // cpu_data calls so that the prefetch thread does not accidentally make
+  // simultaneous cudaMalloc calls when the main thread is running. In some
+  // GPUs this seems to cause failures if we do not so.
+  this->prefetch_data_.mutable_cpu_data();
+  if (this->output_labels_) {
+    this->prefetch_label_.mutable_cpu_data();
   }
-#endif
-
   DLOG(INFO) << "Initializing prefetch";
-  this->data_transformer_->InitRand();
-  StartInternalThread();
+  this->CreatePrefetchThread();
   DLOG(INFO) << "Prefetch initialized.";
 }
 
 template <typename Dtype>
-void BasePrefetchingDataLayer<Dtype>::InternalThreadEntry() {
-#ifndef CPU_ONLY
-  cudaStream_t stream;
-  if (Caffe::mode() == Caffe::GPU) {
-    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
-  }
-#endif
+void BasePrefetchingDataLayer<Dtype>::CreatePrefetchThread() {
+  this->data_transformer_->InitRand();
+  CHECK(StartInternalThread()) << "Thread execution failed";
+}
 
-  try {
-    while (!must_stop()) {
-      Batch<Dtype>* batch = prefetch_free_.pop();
-      load_batch(batch);
-#ifndef CPU_ONLY
-      if (Caffe::mode() == Caffe::GPU) {
-        batch->data_.data().get()->async_gpu_push(stream);
-        cudaStreamSynchronize(stream);
-      }
-#endif
-      prefetch_full_.push(batch);
-    }
-  } catch (boost::thread_interrupted&) {
-    // Interrupted exception is expected on shutdown
-  }
+template <typename Dtype>
+void BasePrefetchingDataLayer<Dtype>::JoinPrefetchThread() {
+  CHECK(WaitForInternalThreadToExit()) << "Thread joining failed";
 }
 
 template <typename Dtype>
 void BasePrefetchingDataLayer<Dtype>::Forward_cpu(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  Batch<Dtype>* batch = prefetch_full_.pop("Data layer prefetch queue empty");
+  // First, join the thread
+  JoinPrefetchThread();
+  DLOG(INFO) << "Thread joined";
   // Reshape to loaded data.
-  top[0]->Reshape(batch->data_.num(), batch->data_.channels(),
-      batch->data_.height(), batch->data_.width());
+  top[0]->Reshape(this->prefetch_data_.num(), this->prefetch_data_.channels(),
+      this->prefetch_data_.height(), this->prefetch_data_.width());
   // Copy the data
-  caffe_copy(batch->data_.count(), batch->data_.cpu_data(),
+  caffe_copy(prefetch_data_.count(), prefetch_data_.cpu_data(),
              top[0]->mutable_cpu_data());
   DLOG(INFO) << "Prefetch copied";
   if (this->output_labels_) {
-    caffe_copy(batch->label_.count(), batch->label_.cpu_data(),
-        top[1]->mutable_cpu_data());
+    caffe_copy(prefetch_label_.count(), prefetch_label_.cpu_data(),
+               top[1]->mutable_cpu_data());
   }
-
-  prefetch_free_.push(batch);
+  // Start a new prefetch thread
+  DLOG(INFO) << "CreatePrefetchThread";
+  CreatePrefetchThread();
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/base_data_layer.cu b/src/caffe/layers/base_data_layer.cu
index 52085d007a7..775f6c47f7e 100644
--- a/src/caffe/layers/base_data_layer.cu
+++ b/src/caffe/layers/base_data_layer.cu
@@ -7,19 +7,20 @@ namespace caffe {
 template <typename Dtype>
 void BasePrefetchingDataLayer<Dtype>::Forward_gpu(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  Batch<Dtype>* batch = prefetch_full_.pop("Data layer prefetch queue empty");
+  // First, join the thread
+  JoinPrefetchThread();
   // Reshape to loaded data.
-  top[0]->Reshape(batch->data_.num(), batch->data_.channels(),
-      batch->data_.height(), batch->data_.width());
+  top[0]->Reshape(this->prefetch_data_.num(), this->prefetch_data_.channels(),
+      this->prefetch_data_.height(), this->prefetch_data_.width());
   // Copy the data
-  caffe_copy(batch->data_.count(), batch->data_.gpu_data(),
+  caffe_copy(prefetch_data_.count(), prefetch_data_.cpu_data(),
       top[0]->mutable_gpu_data());
   if (this->output_labels_) {
-    caffe_copy(batch->label_.count(), batch->label_.gpu_data(),
+    caffe_copy(prefetch_label_.count(), prefetch_label_.cpu_data(),
         top[1]->mutable_gpu_data());
   }
-
-  prefetch_free_.push(batch);
+  // Start a new prefetch thread
+  CreatePrefetchThread();
 }
 
 INSTANTIATE_LAYER_GPU_FORWARD(BasePrefetchingDataLayer);
diff --git a/src/caffe/layers/cudnn_conv_layer.cpp b/src/caffe/layers/cudnn_conv_layer.cpp
index 1d89891f732..104d2b9d669 100644
--- a/src/caffe/layers/cudnn_conv_layer.cpp
+++ b/src/caffe/layers/cudnn_conv_layer.cpp
@@ -1,5 +1,4 @@
 #ifdef USE_CUDNN
-#include <algorithm>
 #include <vector>
 
 #include "caffe/filler.hpp"
@@ -13,53 +12,7 @@ namespace caffe {
 // Set to three for the benefit of the backward pass, which
 // can use separate streams for calculating the gradient w.r.t.
 // bias, filter weights, and bottom data for each group independently
-#define CUDNN_STREAMS_PER_GROUP 1
-
-cudnnConvolutionFwdAlgo_t
-GetCuDNNFwdAlgo(ConvolutionParameter_CuDNNFwdAlgorithm algo) {
-  switch (algo) {
-    case ConvolutionParameter_CuDNNFwdAlgorithm_IMPLICIT_GEMM:
-      return CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
-    case ConvolutionParameter_CuDNNFwdAlgorithm_IMPLICIT_PRECOMP_GEMM:
-      return CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
-    case ConvolutionParameter_CuDNNFwdAlgorithm_GEMM:
-      return CUDNN_CONVOLUTION_FWD_ALGO_GEMM;
-    case ConvolutionParameter_CuDNNFwdAlgorithm_DIRECT:
-      return CUDNN_CONVOLUTION_FWD_ALGO_DIRECT;
-    case ConvolutionParameter_CuDNNFwdAlgorithm_FWD_FFT:
-      return CUDNN_CONVOLUTION_FWD_ALGO_FFT;
-    default:
-      return CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
-  }
-}
-
-cudnnConvolutionBwdDataAlgo_t
-GetCuDNNBwdDataAlgo(ConvolutionParameter_CuDNNBwdDataAlgorithm algo) {
-  switch (algo) {
-    case ConvolutionParameter_CuDNNBwdDataAlgorithm_BWD_DATA_ALGO_0:
-      return CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
-    case ConvolutionParameter_CuDNNBwdDataAlgorithm_BWD_DATA_ALGO_1:
-      return CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
-    case ConvolutionParameter_CuDNNBwdDataAlgorithm_BWD_DATA_FFT:
-      return CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT;
-    default:
-      return CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
-  }
-}
-
-cudnnConvolutionBwdFilterAlgo_t
-GetCuDNNBwdFilterAlgo(ConvolutionParameter_CuDNNBwdFilterAlgorithm algo) {
-  switch (algo) {
-    case ConvolutionParameter_CuDNNBwdFilterAlgorithm_BWD_FILTER_ALGO_0:
-      return CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
-    case ConvolutionParameter_CuDNNBwdFilterAlgorithm_BWD_FILTER_ALGO_1:
-      return CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
-    case ConvolutionParameter_CuDNNBwdFilterAlgorithm_BWD_FILTER_FFT:
-      return CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT;
-    default:
-      return CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
-  }
-}
+#define CUDNN_STREAMS_PER_GROUP 3
 
 /**
  * TODO(dox) explain cuDNN interface
@@ -68,30 +21,16 @@ template <typename Dtype>
 void CuDNNConvolutionLayer<Dtype>::LayerSetUp(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
   ConvolutionLayer<Dtype>::LayerSetUp(bottom, top);
-  // Initialize algorithm arrays
-  fwd_algo_       = new cudnnConvolutionFwdAlgo_t[bottom.size()];
-  bwd_filter_algo_= new cudnnConvolutionBwdFilterAlgo_t[bottom.size()];
-  bwd_data_algo_  = new cudnnConvolutionBwdDataAlgo_t[bottom.size()];
-  // initialize size arrays
-  workspace_fwd_sizes_ = new size_t[bottom.size()];
-  workspace_bwd_filter_sizes_ = new size_t[bottom.size()];
-  workspace_bwd_data_sizes_ = new size_t[bottom.size()];
-  // workspace data
+  // Initialize CUDA streams and cuDNN.
+  stream_         = new cudaStream_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
+  handle_         = new cudnnHandle_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
   workspaceSizeInBytes = 0;
-  workspaceData = NULL;
-  workspace = new void*[this->group_ * CUDNN_STREAMS_PER_GROUP];
-
-  for (size_t i = 0; i < bottom.size(); i++) {
-    fwd_algo_[i] = (cudnnConvolutionFwdAlgo_t)0;
-    bwd_filter_algo_[i] = (cudnnConvolutionBwdFilterAlgo_t)0;
-    bwd_data_algo_[i] = (cudnnConvolutionBwdDataAlgo_t)0;
-    workspace_fwd_sizes_[i] = 0;
-    workspace_bwd_data_sizes_[i] = 0;
-    workspace_bwd_filter_sizes_[i] = 0;
-  }
+  workspace = NULL;
 
   for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) {
-    workspace[g] = NULL;
+    CUDA_CHECK(cudaStreamCreate(&stream_[g]));
+    CUDNN_CHECK(cudnnCreate(&handle_[g]));
+    CUDNN_CHECK(cudnnSetStream(handle_[g], stream_[g]));
   }
 
   // Set the indexing parameters.
@@ -134,16 +73,6 @@ void CuDNNConvolutionLayer<Dtype>::Reshape(
   top_offset_ = (this->num_output_ / this->group_)
       * this->height_out_ * this->width_out_;
 
-  // Specify workspace limit for kernels directly until we have a
-  // planning strategy and a rewrite of Caffe's GPU memory mangagement
-  size_t workspace_limit_bytes;
-  if (MemoryHandler::usingPool()) {
-    size_t total_memory;
-    MemoryHandler::getInfo(&workspace_limit_bytes, &total_memory);
-  } else {
-    workspace_limit_bytes = 8*1024*1024;
-  }
-
   for (int i = 0; i < bottom.size(); i++) {
     cudnn::setTensor4dDesc<Dtype>(&bottom_descs_[i],
         this->num_,
@@ -162,112 +91,7 @@ void CuDNNConvolutionLayer<Dtype>::Reshape(
     cudnn::setConvolutionDesc<Dtype>(&conv_descs_[i], bottom_descs_[i],
         filter_desc_, this->pad_h_, this->pad_w_,
         this->stride_h_, this->stride_w_);
-
-    // choose forward and backward algorithms + workspace(s)
-    if (!this->layer_param_.convolution_param().has_cudnnfwdalgo()) {
-      CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(Caffe::cudnn_handle(),
-        bottom_descs_[i],
-        filter_desc_,
-        conv_descs_[i],
-        top_descs_[i],
-        CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-        workspace_limit_bytes,
-        &fwd_algo_[i]));
-    } else {
-      fwd_algo_[i] = GetCuDNNFwdAlgo(
-                      this->layer_param_.convolution_param().cudnnfwdalgo());
-    }
-
-    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(Caffe::cudnn_handle(),
-      bottom_descs_[i],
-      filter_desc_,
-      conv_descs_[i],
-      top_descs_[i],
-      fwd_algo_[i],
-      &(workspace_fwd_sizes_[i])));
-
-    if (MemoryHandler::usingPool()) {
-      // restrict to only 1 convolution at a time for memory allocation purposes
-      size_t total_memory;
-      MemoryHandler::getInfo(&workspace_limit_bytes, &total_memory);
-    } else {
-      workspace_limit_bytes = 8*1024*1024;
-    }
-    //
-    // choose backward algorithm for filter
-    if (!this->layer_param_.convolution_param().has_cudnnbwdfilteralgo()) {
-      CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
-            Caffe::cudnn_handle(),
-            bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_,
-            CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
-            workspace_limit_bytes, &bwd_filter_algo_[i]) );
-    } else {
-      bwd_filter_algo_[i] = GetCuDNNBwdFilterAlgo(
-                  this->layer_param_.convolution_param().cudnnbwdfilteralgo());
-    }
-    // get workspace for backwards filter algorithm
-    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
-          Caffe::cudnn_handle(),
-          bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_,
-          bwd_filter_algo_[i], &workspace_bwd_filter_sizes_[i]));
-
-    // choose backward algo for data
-    if (!this->layer_param_.convolution_param().has_cudnnbwddataalgo()) {
-      CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
-            Caffe::cudnn_handle(),
-            filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i],
-            CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
-          workspace_limit_bytes, &bwd_data_algo_[i]));
-    } else {
-      bwd_data_algo_[i] = GetCuDNNBwdDataAlgo(
-                  this->layer_param_.convolution_param().cudnnbwddataalgo());
-    }
-
-    // get workspace size
-    CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
-          Caffe::cudnn_handle(),
-          filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i],
-          bwd_data_algo_[i], &workspace_bwd_data_sizes_[i]) );
-  }
-
-#ifndef USE_CNMEM
-  // reduce over all workspace sizes to get a maximum to allocate / reallocate
-  size_t total_workspace_fwd = 0;
-  size_t total_workspace_bwd_data = 0;
-  size_t total_workspace_bwd_filter = 0;
-
-  // sum? max?
-  for (size_t i = 0; i < bottom.size(); i++) {
-    total_workspace_fwd = std::max(total_workspace_fwd,
-                                   workspace_fwd_sizes_[i]);
-    total_workspace_bwd_data = std::max(total_workspace_bwd_data,
-                                        workspace_bwd_data_sizes_[i]);
-    total_workspace_bwd_filter = std::max(total_workspace_bwd_filter,
-                                          workspace_bwd_filter_sizes_[i]);
   }
-  size_t max_workspace = std::max(total_workspace_fwd,
-                                  total_workspace_bwd_data);
-  max_workspace = std::max(max_workspace, total_workspace_bwd_filter);
-  size_t total_max_workspace = max_workspace *
-                               (this->group_ * CUDNN_STREAMS_PER_GROUP);
-
-  // this is the total amount of storage needed over all groups + streams
-  if (total_max_workspace > workspaceSizeInBytes) {
-    LOG(INFO) << "Reallocating workspace storage: " << total_max_workspace;
-    workspaceSizeInBytes = total_max_workspace;
-
-    // free the existing workspace and allocate a new (larger) one
-    MemoryHandler::freeGPU(this->workspaceData);
-    this->workspaceData = NULL;
-
-    MemoryHandler::mallocGPU(&(this->workspaceData), workspaceSizeInBytes);
-
-    // if we succeed in the allocation, set pointer aliases for workspaces
-    for (int g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) {
-      workspace[g] = reinterpret_cast<char *>(workspaceData) + g*max_workspace;
-    }
-  }
-#endif
 
   // Tensor descriptor for bias.
   if (this->bias_term_) {
@@ -291,13 +115,13 @@ CuDNNConvolutionLayer<Dtype>::~CuDNNConvolutionLayer() {
   }
   cudnnDestroyFilterDescriptor(filter_desc_);
 
-  cudaFree(workspaceData);
-  delete [] fwd_algo_;
-  delete [] bwd_filter_algo_;
-  delete [] bwd_data_algo_;
-  delete [] workspace_fwd_sizes_;
-  delete [] workspace_bwd_data_sizes_;
-  delete [] workspace_bwd_filter_sizes_;
+  for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) {
+    cudaStreamDestroy(stream_[g]);
+    cudnnDestroy(handle_[g]);
+  }
+
+  delete [] stream_;
+  delete [] handle_;
 }
 
 INSTANTIATE_CLASS(CuDNNConvolutionLayer);
diff --git a/src/caffe/layers/cudnn_conv_layer.cu b/src/caffe/layers/cudnn_conv_layer.cu
index 61846f0dcb3..4a1a4c4f4f2 100644
--- a/src/caffe/layers/cudnn_conv_layer.cu
+++ b/src/caffe/layers/cudnn_conv_layer.cu
@@ -19,30 +19,66 @@ void CuDNNConvolutionLayer<Dtype>::Forward_gpu(
     Dtype* top_data = top[i]->mutable_gpu_data();
     const Dtype* weight = this->blobs_[0]->gpu_data();
 
+    size_t workspace_limit_bytes = this->kernel_h_ *
+                                   this->kernel_w_ *
+                                   this->channels_ *
+                                   sizeof(int) + 1;
+
     // Forward through cuDNN in parallel over groups.
     for (int g = 0; g < this->group_; g++) {
-#ifdef USE_CNMEM
-      MemoryHandler::mallocGPU(&workspace[0], workspace_fwd_sizes_[i]);
-#endif
+      cudnnConvolutionFwdAlgo_t algo;
+
+      // pick the convolution algorithm
+      // TODO(shelhamer) this should be done during reshape
+      // TODO(shelhamer) the choice of automatic or manual algorithm picking
+      // should be exposed in proto
+      CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(handle_[g],
+        bottom_descs_[i],
+        filter_desc_,
+        conv_descs_[i],
+        top_descs_[i],
+        CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+        workspace_limit_bytes,  // memoryLimitInBytes,
+        &algo));
+
+      // get minimum size of the workspace needed for the desired algorithm
+      size_t workspaceSizeInBytes_temp = 0;
+
+      CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(handle_[g],
+        bottom_descs_[i],
+        filter_desc_,
+        conv_descs_[i],
+        top_descs_[i],
+        algo,
+        &workspaceSizeInBytes_temp));
+
+      if (workspaceSizeInBytes_temp > workspaceSizeInBytes) {
+        workspaceSizeInBytes = workspaceSizeInBytes_temp;
+        // free the existing workspace and allocate a new (larger) one
+        cudaFree(this->workspace);
+        cudaError_t err = cudaMalloc(&(this->workspace), workspaceSizeInBytes);
+        if (err != cudaSuccess) {
+          // force zero memory path
+          algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
+          workspace = NULL;
+          workspaceSizeInBytes = 0;
+        }
+      }
+
       // Filters.
-      // CUDNN_CHECK(cudnnConvolutionForward(handle_[g],
-      CUDNN_CHECK(cudnnConvolutionForward(Caffe::cudnn_handle(),
+      CUDNN_CHECK(cudnnConvolutionForward(handle_[g],
             cudnn::dataType<Dtype>::one,
             bottom_descs_[i], bottom_data + bottom_offset_ * g,
             filter_desc_, weight + weight_offset_ * g,
             conv_descs_[i],
-            fwd_algo_[i], workspace[0], workspace_fwd_sizes_[i],
+            algo, workspace, workspaceSizeInBytes,
             cudnn::dataType<Dtype>::zero,
             top_descs_[i], top_data + top_offset_ * g));
 
-#ifdef USE_CNMEM
-      MemoryHandler::freeGPU(workspace[0]);
-      workspace[0] = NULL;
-#endif
       // Bias.
       if (this->bias_term_) {
         const Dtype* bias_data = this->blobs_[1]->gpu_data();
-        CUDNN_CHECK(cudnnAddTensor(Caffe::cudnn_handle(), CUDNN_ADD_SAME_C,
+        CUDNN_CHECK(cudnnAddTensor(handle_[g], CUDNN_ADD_SAME_C,
               cudnn::dataType<Dtype>::one,
               bias_desc_, bias_data + bias_offset_ * g,
               cudnn::dataType<Dtype>::one,
@@ -53,7 +89,7 @@ void CuDNNConvolutionLayer<Dtype>::Forward_gpu(
     // Synchronize the work across groups, each of which went into its own
     // stream, by launching an empty kernel into the default (null) stream.
     // NOLINT_NEXT_LINE(whitespace/operators)
-    CUDA_CHECK(cudaStreamSynchronize(cudaStreamLegacy));
+    sync_conv_groups<<<1, 1>>>();
   }
 }
 
@@ -65,12 +101,12 @@ void CuDNNConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   if (this->param_propagate_down_[0]) {
     weight = this->blobs_[0]->gpu_data();
     weight_diff = this->blobs_[0]->mutable_gpu_diff();
-    // caffe_gpu_set(this->blobs_[0]->count(), Dtype(0), weight_diff);
+    caffe_gpu_set(this->blobs_[0]->count(), Dtype(0), weight_diff);
   }
   Dtype* bias_diff = NULL;
   if (this->bias_term_ && this->param_propagate_down_[1]) {
     bias_diff = this->blobs_[1]->mutable_gpu_diff();
-    // caffe_gpu_set(this->blobs_[1]->count(), Dtype(0), bias_diff);
+    caffe_gpu_set(this->blobs_[1]->count(), Dtype(0), bias_diff);
   }
   for (int i = 0; i < top.size(); ++i) {
     const Dtype* top_diff = top[i]->gpu_diff();
@@ -78,7 +114,7 @@ void CuDNNConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     for (int g = 0; g < this->group_; g++) {
       // Gradient w.r.t. bias.
       if (this->bias_term_ && this->param_propagate_down_[1]) {
-        CUDNN_CHECK(cudnnConvolutionBackwardBias(Caffe::cudnn_handle(),
+        CUDNN_CHECK(cudnnConvolutionBackwardBias(handle_[0*this->group_ + g],
               cudnn::dataType<Dtype>::one,
               top_descs_[i],  top_diff + top_offset_ * g,
               cudnn::dataType<Dtype>::one,
@@ -87,23 +123,14 @@ void CuDNNConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
 
       // Gradient w.r.t. weights.
       if (this->param_propagate_down_[0]) {
-#ifdef USE_CNMEM
-        MemoryHandler::mallocGPU(&workspace[0], workspace_bwd_filter_sizes_[i]);
-#endif
         const Dtype* bottom_data = bottom[i]->gpu_data();
-        CUDNN_CHECK(cudnnConvolutionBackwardFilter_v3(
-              Caffe::cudnn_handle(),
+        CUDNN_CHECK(cudnnConvolutionBackwardFilter(handle_[1*this->group_ + g],
               cudnn::dataType<Dtype>::one,
               bottom_descs_[i], bottom_data + bottom_offset_ * g,
               top_descs_[i],    top_diff + top_offset_ * g,
               conv_descs_[i],
-              bwd_filter_algo_[i], workspace[0], workspace_bwd_filter_sizes_[i],
               cudnn::dataType<Dtype>::one,
               filter_desc_, weight_diff + weight_offset_ * g));
-#ifdef USE_CNMEM
-        MemoryHandler::freeGPU(workspace[0]);
-        workspace[0] = NULL;
-#endif
       }
 
       // Gradient w.r.t. bottom data.
@@ -112,29 +139,20 @@ void CuDNNConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
           weight = this->blobs_[0]->gpu_data();
         }
         Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
-#ifdef USE_CNMEM
-        MemoryHandler::mallocGPU(&workspace[0], workspace_bwd_data_sizes_[i]);
-#endif
-        CUDNN_CHECK(cudnnConvolutionBackwardData_v3(
-              Caffe::cudnn_handle(),
+        CUDNN_CHECK(cudnnConvolutionBackwardData(handle_[2*this->group_ + g],
               cudnn::dataType<Dtype>::one,
               filter_desc_, weight + weight_offset_ * g,
               top_descs_[i], top_diff + top_offset_ * g,
               conv_descs_[i],
-              bwd_data_algo_[i], workspace[0], workspace_bwd_data_sizes_[i],
               cudnn::dataType<Dtype>::zero,
               bottom_descs_[i], bottom_diff + bottom_offset_ * g));
-#ifdef USE_CNMEM
-        MemoryHandler::freeGPU(workspace[0]);
-        workspace[0] = NULL;
-#endif
       }
     }
 
     // Synchronize the work across groups, each of which went into its own
     // stream, by launching an empty kernel into the default (null) stream.
     // NOLINT_NEXT_LINE(whitespace/operators)
-    CUDA_CHECK(cudaStreamSynchronize(cudaStreamLegacy));
+    sync_conv_groups<<<1, 1>>>();
   }
 }
 
diff --git a/src/caffe/layers/cudnn_lcn_layer.cpp b/src/caffe/layers/cudnn_lcn_layer.cpp
deleted file mode 100644
index 4e104642a4b..00000000000
--- a/src/caffe/layers/cudnn_lcn_layer.cpp
+++ /dev/null
@@ -1,82 +0,0 @@
-#ifdef USE_CUDNN
-#include <vector>
-
-#include "caffe/filler.hpp"
-#include "caffe/layer.hpp"
-#include "caffe/util/im2col.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void CuDNNLCNLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  LRNLayer<Dtype>::LayerSetUp(bottom, top);
-
-  CUDNN_CHECK(cudnnCreateLRNDescriptor(&norm_desc_));
-  cudnn::createTensor4dDesc<Dtype>(&bottom_desc_);
-  cudnn::createTensor4dDesc<Dtype>(&top_desc_);
-
-  // create a LRN handle
-  handles_setup_ = true;
-
-  size_ = this->layer_param().lrn_param().local_size();
-  pre_pad_ = (size_ - 1) / 2;
-  alpha_ = this->layer_param().lrn_param().alpha();
-  beta_ = this->layer_param().lrn_param().beta();
-  k_ = this->layer_param().lrn_param().k();
-}
-
-template <typename Dtype>
-void CuDNNLCNLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  LRNLayer<Dtype>::Reshape(bottom, top);
-  cudnn::setTensor4dDesc<Dtype>(&bottom_desc_, bottom[0]->num(),
-      this->channels_, this->height_, this->width_);
-  cudnn::setTensor4dDesc<Dtype>(&top_desc_, bottom[0]->num(),
-      this->channels_, this->height_, this->width_);
-  CUDNN_CHECK(cudnnSetLRNDescriptor(norm_desc_, size_, alpha_, beta_, k_));
-
-  // allocate / reallocate tempData buffers
-  size_t totalSizeInBytes = sizeof(Dtype)*bottom[0]->num()* \
-                            this->channels_*this->height_*this->width_;
-
-#ifdef USE_CNMEM
-  this->tempDataSize = totalSizeInBytes;
-#else
-  if (totalSizeInBytes > tempDataSize) {
-    tempDataSize = totalSizeInBytes;
-
-    MemoryHandler::freeGPU(tempData1);
-    MemoryHandler::freeGPU(tempData2);
-    tempData1 = NULL;
-    tempData2 = NULL;
-
-    // allocate new buffers
-    MemoryHandler::mallocGPU(&tempData1, totalSizeInBytes);
-    MemoryHandler::mallocGPU(&tempData2, totalSizeInBytes);
-  }
-#endif
-}
-
-template <typename Dtype>
-CuDNNLCNLayer<Dtype>::~CuDNNLCNLayer() {
-  // Check that handles have been setup before destroying.
-  if (!handles_setup_) { return; }
-
-  CUDNN_CHECK(cudnnDestroyTensorDescriptor(bottom_desc_));
-  CUDNN_CHECK(cudnnDestroyTensorDescriptor(top_desc_));
-
-  // destroy LRN handle
-  CUDNN_CHECK(cudnnDestroyLRNDescriptor(norm_desc_));
-
-  // free temp buffers
-  if (tempData1 != NULL) cudaFree(tempData1);
-  if (tempData2 != NULL) cudaFree(tempData2);
-}
-
-INSTANTIATE_CLASS(CuDNNLCNLayer);
-
-}   // namespace caffe
-#endif
diff --git a/src/caffe/layers/cudnn_lcn_layer.cu b/src/caffe/layers/cudnn_lcn_layer.cu
deleted file mode 100644
index 245b967edd4..00000000000
--- a/src/caffe/layers/cudnn_lcn_layer.cu
+++ /dev/null
@@ -1,74 +0,0 @@
-#ifdef USE_CUDNN
-#include <vector>
-
-#include "caffe/filler.hpp"
-#include "caffe/layer.hpp"
-#include "caffe/util/im2col.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void CuDNNLCNLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-
-#ifdef USE_CNMEM
-  MemoryHandler::mallocGPU(&this->tempData1, this->tempDataSize);
-  MemoryHandler::mallocGPU(&this->tempData2, this->tempDataSize);
-#endif
-
-  CUDNN_CHECK(cudnnDivisiveNormalizationForward(
-        Caffe::cudnn_handle(), norm_desc_, CUDNN_DIVNORM_PRECOMPUTED_MEANS,
-        cudnn::dataType<Dtype>::one,
-        bottom_desc_, bottom_data,
-        NULL,  // srcMeansData
-        this->tempData1, this->tempData2,
-        cudnn::dataType<Dtype>::zero,
-        top_desc_, top_data) );
-
-#ifdef USE_CNMEM
-  MemoryHandler::freeGPU(this->tempData1);
-  MemoryHandler::freeGPU(this->tempData2);
-  this->tempData1 = NULL;
-  this->tempData2 = NULL;
-#endif
-}
-
-template <typename Dtype>
-void CuDNNLCNLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  const Dtype* top_diff = top[0]->gpu_diff();
-  const Dtype* top_data = top[0]->gpu_data();
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-
-#ifdef USE_CNMEM
-  MemoryHandler::mallocGPU(&this->tempData1, this->tempDataSize);
-  MemoryHandler::mallocGPU(&this->tempData2, this->tempDataSize);
-#endif
-
-  CUDNN_CHECK(cudnnDivisiveNormalizationBackward(
-        Caffe::cudnn_handle(), norm_desc_, CUDNN_DIVNORM_PRECOMPUTED_MEANS,
-        cudnn::dataType<Dtype>::one,
-        bottom_desc_, bottom_data,
-        NULL, top_diff,  // NULL - srcMeansData
-        this->tempData1, this->tempData2,
-        cudnn::dataType<Dtype>::zero,
-        bottom_desc_, bottom_diff,
-        NULL) );
-
-#ifdef USE_CNMEM
-  MemoryHandler::freeGPU(this->tempData1);
-  MemoryHandler::freeGPU(this->tempData2);
-  this->tempData1 = NULL;
-  this->tempData2 = NULL;
-#endif
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(CuDNNLCNLayer);
-
-}  // namespace caffe
-#endif
diff --git a/src/caffe/layers/cudnn_lrn_layer.cpp b/src/caffe/layers/cudnn_lrn_layer.cpp
deleted file mode 100644
index c263dae1c2d..00000000000
--- a/src/caffe/layers/cudnn_lrn_layer.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-#ifdef USE_CUDNN
-#include <vector>
-
-#include "caffe/filler.hpp"
-#include "caffe/layer.hpp"
-#include "caffe/util/im2col.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void CuDNNLRNLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  LRNLayer<Dtype>::LayerSetUp(bottom, top);
-
-  // CUDNN_CHECK(cudnnCreate(&handle_));
-  CUDNN_CHECK(cudnnCreateLRNDescriptor(&norm_desc_));
-  cudnn::createTensor4dDesc<Dtype>(&bottom_desc_);
-  cudnn::createTensor4dDesc<Dtype>(&top_desc_);
-
-  // create a LRN handle
-  handles_setup_ = true;
-
-  size_ = this->layer_param().lrn_param().local_size();
-  alpha_ = this->layer_param().lrn_param().alpha();
-  beta_ = this->layer_param().lrn_param().beta();
-  k_ = this->layer_param().lrn_param().k();
-}
-
-template <typename Dtype>
-void CuDNNLRNLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  LRNLayer<Dtype>::Reshape(bottom, top);
-  cudnn::setTensor4dDesc<Dtype>(&bottom_desc_, bottom[0]->num(),
-      this->channels_, this->height_, this->width_);
-  cudnn::setTensor4dDesc<Dtype>(&top_desc_, bottom[0]->num(),
-      this->channels_, this->height_, this->width_);
-  CUDNN_CHECK(cudnnSetLRNDescriptor(norm_desc_, size_, alpha_, beta_, k_));
-}
-
-template <typename Dtype>
-CuDNNLRNLayer<Dtype>::~CuDNNLRNLayer() {
-  // Check that handles have been setup before destroying.
-  if (!handles_setup_) { return; }
-
-  cudnnDestroyTensorDescriptor(bottom_desc_);
-  cudnnDestroyTensorDescriptor(top_desc_);
-
-  // destroy LRN handle
-  CUDNN_CHECK(cudnnDestroyLRNDescriptor(norm_desc_));
-}
-
-INSTANTIATE_CLASS(CuDNNLRNLayer);
-
-}   // namespace caffe
-#endif
diff --git a/src/caffe/layers/cudnn_lrn_layer.cu b/src/caffe/layers/cudnn_lrn_layer.cu
deleted file mode 100644
index 10b8a05250c..00000000000
--- a/src/caffe/layers/cudnn_lrn_layer.cu
+++ /dev/null
@@ -1,48 +0,0 @@
-#ifdef USE_CUDNN
-#include <vector>
-
-#include "caffe/filler.hpp"
-#include "caffe/layer.hpp"
-#include "caffe/util/im2col.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void CuDNNLRNLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-
-  CUDNN_CHECK(cudnnLRNCrossChannelForward(
-        Caffe::cudnn_handle(), norm_desc_, CUDNN_LRN_CROSS_CHANNEL_DIM1,
-        cudnn::dataType<Dtype>::one,
-        bottom_desc_, bottom_data,
-        cudnn::dataType<Dtype>::zero,
-        top_desc_, top_data) );
-}
-
-template <typename Dtype>
-void CuDNNLRNLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  const Dtype* top_diff = top[0]->gpu_diff();
-  const Dtype* top_data = top[0]->gpu_data();
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-
-  CUDNN_CHECK(cudnnLRNCrossChannelBackward(
-        Caffe::cudnn_handle(), norm_desc_, CUDNN_LRN_CROSS_CHANNEL_DIM1,
-        cudnn::dataType<Dtype>::one,
-        top_desc_, top_data,
-        top_desc_, top_diff,
-        bottom_desc_, bottom_data,
-        cudnn::dataType<Dtype>::zero,
-        bottom_desc_, bottom_diff) );
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(CuDNNLRNLayer);
-
-};  // namespace caffe
-
-#endif
diff --git a/src/caffe/layers/cudnn_pooling_layer.cpp b/src/caffe/layers/cudnn_pooling_layer.cpp
index d5b9cd0c179..c92c4e477b5 100644
--- a/src/caffe/layers/cudnn_pooling_layer.cpp
+++ b/src/caffe/layers/cudnn_pooling_layer.cpp
@@ -13,6 +13,7 @@ template <typename Dtype>
 void CuDNNPoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
   PoolingLayer<Dtype>::LayerSetUp(bottom, top);
+  CUDNN_CHECK(cudnnCreate(&handle_));
   cudnn::createTensor4dDesc<Dtype>(&bottom_desc_);
   cudnn::createTensor4dDesc<Dtype>(&top_desc_);
   cudnn::createPoolingDesc<Dtype>(&pooling_desc_,
@@ -40,6 +41,7 @@ CuDNNPoolingLayer<Dtype>::~CuDNNPoolingLayer() {
   cudnnDestroyTensorDescriptor(bottom_desc_);
   cudnnDestroyTensorDescriptor(top_desc_);
   cudnnDestroyPoolingDescriptor(pooling_desc_);
+  cudnnDestroy(handle_);
 }
 
 INSTANTIATE_CLASS(CuDNNPoolingLayer);
diff --git a/src/caffe/layers/cudnn_pooling_layer.cu b/src/caffe/layers/cudnn_pooling_layer.cu
index 9b8e6aee497..a952b855a48 100644
--- a/src/caffe/layers/cudnn_pooling_layer.cu
+++ b/src/caffe/layers/cudnn_pooling_layer.cu
@@ -14,7 +14,7 @@ void CuDNNPoolingLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
   const Dtype* bottom_data = bottom[0]->gpu_data();
   Dtype* top_data = top[0]->mutable_gpu_data();
-  CUDNN_CHECK(cudnnPoolingForward(Caffe::cudnn_handle(), pooling_desc_,
+  CUDNN_CHECK(cudnnPoolingForward(handle_, pooling_desc_,
         cudnn::dataType<Dtype>::one,
         bottom_desc_, bottom_data,
         cudnn::dataType<Dtype>::zero,
@@ -31,7 +31,7 @@ void CuDNNPoolingLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   const Dtype* top_data = top[0]->gpu_data();
   const Dtype* bottom_data = bottom[0]->gpu_data();
   Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-  CUDNN_CHECK(cudnnPoolingBackward(Caffe::cudnn_handle(), pooling_desc_,
+  CUDNN_CHECK(cudnnPoolingBackward(handle_, pooling_desc_,
         cudnn::dataType<Dtype>::one,
         top_desc_, top_data, top_desc_, top_diff,
         bottom_desc_, bottom_data,
diff --git a/src/caffe/layers/cudnn_relu_layer.cpp b/src/caffe/layers/cudnn_relu_layer.cpp
index 4dd9e6bfe8a..759d83984ef 100644
--- a/src/caffe/layers/cudnn_relu_layer.cpp
+++ b/src/caffe/layers/cudnn_relu_layer.cpp
@@ -12,6 +12,7 @@ void CuDNNReLULayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
   ReLULayer<Dtype>::LayerSetUp(bottom, top);
   // initialize cuDNN
+  CUDNN_CHECK(cudnnCreate(&handle_));
   cudnn::createTensor4dDesc<Dtype>(&bottom_desc_);
   cudnn::createTensor4dDesc<Dtype>(&top_desc_);
   handles_setup_ = true;
@@ -36,6 +37,7 @@ CuDNNReLULayer<Dtype>::~CuDNNReLULayer() {
 
   cudnnDestroyTensorDescriptor(this->bottom_desc_);
   cudnnDestroyTensorDescriptor(this->top_desc_);
+  cudnnDestroy(this->handle_);
 }
 
 INSTANTIATE_CLASS(CuDNNReLULayer);
diff --git a/src/caffe/layers/cudnn_relu_layer.cu b/src/caffe/layers/cudnn_relu_layer.cu
index 1664d649b9c..21d14857dd2 100644
--- a/src/caffe/layers/cudnn_relu_layer.cu
+++ b/src/caffe/layers/cudnn_relu_layer.cu
@@ -17,7 +17,7 @@ void CuDNNReLULayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 
   const Dtype* bottom_data = bottom[0]->gpu_data();
   Dtype* top_data = top[0]->mutable_gpu_data();
-  CUDNN_CHECK(cudnnActivationForward(Caffe::cudnn_handle(),
+  CUDNN_CHECK(cudnnActivationForward(this->handle_,
         CUDNN_ACTIVATION_RELU,
         cudnn::dataType<Dtype>::one,
         this->bottom_desc_, bottom_data,
@@ -42,7 +42,7 @@ void CuDNNReLULayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   const Dtype* top_diff = top[0]->gpu_diff();
   const Dtype* bottom_data = bottom[0]->gpu_data();
   Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-  CUDNN_CHECK(cudnnActivationBackward(Caffe::cudnn_handle(),
+  CUDNN_CHECK(cudnnActivationBackward(this->handle_,
         CUDNN_ACTIVATION_RELU,
         cudnn::dataType<Dtype>::one,
         this->top_desc_, top_data, this->top_desc_, top_diff,
diff --git a/src/caffe/layers/cudnn_sigmoid_layer.cpp b/src/caffe/layers/cudnn_sigmoid_layer.cpp
index b9ba8903ebb..32637873d46 100644
--- a/src/caffe/layers/cudnn_sigmoid_layer.cpp
+++ b/src/caffe/layers/cudnn_sigmoid_layer.cpp
@@ -12,6 +12,7 @@ void CuDNNSigmoidLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
   SigmoidLayer<Dtype>::LayerSetUp(bottom, top);
   // initialize cuDNN
+  CUDNN_CHECK(cudnnCreate(&handle_));
   cudnn::createTensor4dDesc<Dtype>(&bottom_desc_);
   cudnn::createTensor4dDesc<Dtype>(&top_desc_);
   handles_setup_ = true;
@@ -36,6 +37,7 @@ CuDNNSigmoidLayer<Dtype>::~CuDNNSigmoidLayer() {
 
   cudnnDestroyTensorDescriptor(this->bottom_desc_);
   cudnnDestroyTensorDescriptor(this->top_desc_);
+  cudnnDestroy(this->handle_);
 }
 
 INSTANTIATE_CLASS(CuDNNSigmoidLayer);
diff --git a/src/caffe/layers/cudnn_sigmoid_layer.cu b/src/caffe/layers/cudnn_sigmoid_layer.cu
index bcf38da6c4e..7a06cf721da 100644
--- a/src/caffe/layers/cudnn_sigmoid_layer.cu
+++ b/src/caffe/layers/cudnn_sigmoid_layer.cu
@@ -12,7 +12,7 @@ void CuDNNSigmoidLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
   const Dtype* bottom_data = bottom[0]->gpu_data();
   Dtype* top_data = top[0]->mutable_gpu_data();
-  CUDNN_CHECK(cudnnActivationForward(Caffe::cudnn_handle(),
+  CUDNN_CHECK(cudnnActivationForward(this->handle_,
         CUDNN_ACTIVATION_SIGMOID,
         cudnn::dataType<Dtype>::one,
         this->bottom_desc_, bottom_data,
@@ -32,7 +32,7 @@ void CuDNNSigmoidLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   const Dtype* top_diff = top[0]->gpu_diff();
   const Dtype* bottom_data = bottom[0]->gpu_data();
   Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-  CUDNN_CHECK(cudnnActivationBackward(Caffe::cudnn_handle(),
+  CUDNN_CHECK(cudnnActivationBackward(this->handle_,
         CUDNN_ACTIVATION_SIGMOID,
         cudnn::dataType<Dtype>::one,
         this->top_desc_, top_data, this->top_desc_, top_diff,
diff --git a/src/caffe/layers/cudnn_softmax_layer.cpp b/src/caffe/layers/cudnn_softmax_layer.cpp
index 20f9c4ed46f..77a3225adcd 100644
--- a/src/caffe/layers/cudnn_softmax_layer.cpp
+++ b/src/caffe/layers/cudnn_softmax_layer.cpp
@@ -16,6 +16,7 @@ void CuDNNSoftmaxLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
   SoftmaxLayer<Dtype>::LayerSetUp(bottom, top);
   // Initialize CUDNN.
+  CUDNN_CHECK(cudnnCreate(&handle_));
   cudnn::createTensor4dDesc<Dtype>(&bottom_desc_);
   cudnn::createTensor4dDesc<Dtype>(&top_desc_);
   handles_setup_ = true;
@@ -40,6 +41,7 @@ CuDNNSoftmaxLayer<Dtype>::~CuDNNSoftmaxLayer() {
 
   cudnnDestroyTensorDescriptor(bottom_desc_);
   cudnnDestroyTensorDescriptor(top_desc_);
+  cudnnDestroy(handle_);
 }
 
 INSTANTIATE_CLASS(CuDNNSoftmaxLayer);
diff --git a/src/caffe/layers/cudnn_softmax_layer.cu b/src/caffe/layers/cudnn_softmax_layer.cu
index 9a921ba7e96..a9e2fcefaf7 100644
--- a/src/caffe/layers/cudnn_softmax_layer.cu
+++ b/src/caffe/layers/cudnn_softmax_layer.cu
@@ -16,7 +16,7 @@ void CuDNNSoftmaxLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
   const Dtype* bottom_data = bottom[0]->gpu_data();
   Dtype* top_data = top[0]->mutable_gpu_data();
-  CUDNN_CHECK(cudnnSoftmaxForward(Caffe::cudnn_handle(), CUDNN_SOFTMAX_ACCURATE,
+  CUDNN_CHECK(cudnnSoftmaxForward(handle_, CUDNN_SOFTMAX_ACCURATE,
         CUDNN_SOFTMAX_MODE_CHANNEL,
         cudnn::dataType<Dtype>::one,
         bottom_desc_, bottom_data,
@@ -33,8 +33,7 @@ void CuDNNSoftmaxLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     const Dtype* bottom_data = bottom[0]->gpu_data();
     Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
 
-    CUDNN_CHECK(cudnnSoftmaxBackward(
-          Caffe::cudnn_handle(), CUDNN_SOFTMAX_ACCURATE,
+    CUDNN_CHECK(cudnnSoftmaxBackward(handle_, CUDNN_SOFTMAX_ACCURATE,
           CUDNN_SOFTMAX_MODE_CHANNEL,
           cudnn::dataType<Dtype>::one,
           top_desc_, top_data, top_desc_, top_diff,
diff --git a/src/caffe/layers/cudnn_tanh_layer.cpp b/src/caffe/layers/cudnn_tanh_layer.cpp
index 62afc6da7e4..376faad324d 100644
--- a/src/caffe/layers/cudnn_tanh_layer.cpp
+++ b/src/caffe/layers/cudnn_tanh_layer.cpp
@@ -12,6 +12,7 @@ void CuDNNTanHLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
   TanHLayer<Dtype>::LayerSetUp(bottom, top);
   // initialize cuDNN
+  CUDNN_CHECK(cudnnCreate(&handle_));
   cudnn::createTensor4dDesc<Dtype>(&bottom_desc_);
   cudnn::createTensor4dDesc<Dtype>(&top_desc_);
   handles_setup_ = true;
@@ -36,6 +37,7 @@ CuDNNTanHLayer<Dtype>::~CuDNNTanHLayer() {
 
   cudnnDestroyTensorDescriptor(this->bottom_desc_);
   cudnnDestroyTensorDescriptor(this->top_desc_);
+  cudnnDestroy(this->handle_);
 }
 
 INSTANTIATE_CLASS(CuDNNTanHLayer);
diff --git a/src/caffe/layers/cudnn_tanh_layer.cu b/src/caffe/layers/cudnn_tanh_layer.cu
index d4e6c8a08bc..d287f6fee85 100644
--- a/src/caffe/layers/cudnn_tanh_layer.cu
+++ b/src/caffe/layers/cudnn_tanh_layer.cu
@@ -12,7 +12,7 @@ void CuDNNTanHLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
   const Dtype* bottom_data = bottom[0]->gpu_data();
   Dtype* top_data = top[0]->mutable_gpu_data();
-  CUDNN_CHECK(cudnnActivationForward(Caffe::cudnn_handle(),
+  CUDNN_CHECK(cudnnActivationForward(this->handle_,
         CUDNN_ACTIVATION_TANH,
         cudnn::dataType<Dtype>::one,
         this->bottom_desc_, bottom_data,
@@ -33,7 +33,7 @@ void CuDNNTanHLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   const Dtype* bottom_data = bottom[0]->gpu_data();
   Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
 
-  CUDNN_CHECK(cudnnActivationBackward(Caffe::cudnn_handle(),
+  CUDNN_CHECK(cudnnActivationBackward(this->handle_,
         CUDNN_ACTIVATION_TANH,
         cudnn::dataType<Dtype>::one,
         this->top_desc_, top_data, this->top_desc_, top_diff,
diff --git a/src/caffe/layers/data_layer.cpp b/src/caffe/layers/data_layer.cpp
index 9c23ba0cd29..0f2d66776a9 100644
--- a/src/caffe/layers/data_layer.cpp
+++ b/src/caffe/layers/data_layer.cpp
@@ -11,25 +11,36 @@
 #include "caffe/proto/caffe.pb.h"
 #include "caffe/util/benchmark.hpp"
 #include "caffe/util/io.hpp"
+#include "caffe/util/math_functions.hpp"
+#include "caffe/util/rng.hpp"
 
 namespace caffe {
 
 template <typename Dtype>
-DataLayer<Dtype>::DataLayer(const LayerParameter& param)
-  : BasePrefetchingDataLayer<Dtype>(param),
-    reader_(param) {
-}
-
-template <typename Dtype>
-DataLayer<Dtype>::~DataLayer() {
-  this->StopInternalThread();
+DataLayer<Dtype>::~DataLayer<Dtype>() {
+  this->JoinPrefetchThread();
 }
 
 template <typename Dtype>
 void DataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
+  // Initialize DB
+  db_.reset(db::GetDB(this->layer_param_.data_param().backend()));
+  db_->Open(this->layer_param_.data_param().source(), db::READ);
+  cursor_.reset(db_->NewCursor());
+
+  // Check if we should randomly skip a few data points
+  if (this->layer_param_.data_param().rand_skip()) {
+    unsigned int skip = caffe_rng_rand() %
+                        this->layer_param_.data_param().rand_skip();
+    LOG(INFO) << "Skipping first " << skip << " data points.";
+    while (skip-- > 0) {
+      cursor_->Next();
+    }
+  }
   // Read a data point, and use it to initialize the top blob.
-  Datum& datum = *(reader_.full().peek());
+  Datum datum;
+  datum.ParseFromString(cursor_->value());
 
   bool force_color = this->layer_param_.data_param().force_encoded_color();
   if ((force_color && DecodeDatum(&datum, true)) ||
@@ -37,49 +48,42 @@ void DataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
     LOG(INFO) << "Decoding Datum";
   }
   // image
-  const int crop_size = this->layer_param_.transform_param().crop_size();
-  const int batch_size = this->layer_param_.data_param().batch_size();
+  int crop_size = this->layer_param_.transform_param().crop_size();
   if (crop_size > 0) {
-    top[0]->Reshape(batch_size, datum.channels(), crop_size, crop_size);
-    for (int i = 0; i < this->PREFETCH_COUNT; ++i) {
-      this->prefetch_[i].data_.Reshape(batch_size, datum.channels(),
-          crop_size, crop_size);
-    }
-    this->transformed_data_.Reshape(1, datum.channels(),
-        crop_size, crop_size);
+    top[0]->Reshape(this->layer_param_.data_param().batch_size(),
+        datum.channels(), crop_size, crop_size);
+    this->prefetch_data_.Reshape(this->layer_param_.data_param().batch_size(),
+        datum.channels(), crop_size, crop_size);
+    this->transformed_data_.Reshape(1, datum.channels(), crop_size, crop_size);
   } else {
-    top[0]->Reshape(batch_size, datum.channels(),
+    top[0]->Reshape(
+        this->layer_param_.data_param().batch_size(), datum.channels(),
         datum.height(), datum.width());
-    for (int i = 0; i < this->PREFETCH_COUNT; ++i) {
-      this->prefetch_[i].data_.Reshape(batch_size, datum.channels(),
-          datum.height(), datum.width());
-    }
+    this->prefetch_data_.Reshape(this->layer_param_.data_param().batch_size(),
+        datum.channels(), datum.height(), datum.width());
     this->transformed_data_.Reshape(1, datum.channels(),
-        datum.height(), datum.width());
+      datum.height(), datum.width());
   }
   LOG(INFO) << "output data size: " << top[0]->num() << ","
       << top[0]->channels() << "," << top[0]->height() << ","
       << top[0]->width();
   // label
   if (this->output_labels_) {
-    vector<int> label_shape(1, batch_size);
+    vector<int> label_shape(1, this->layer_param_.data_param().batch_size());
     top[1]->Reshape(label_shape);
-    for (int i = 0; i < this->PREFETCH_COUNT; ++i) {
-      this->prefetch_[i].label_.Reshape(label_shape);
-    }
+    this->prefetch_label_.Reshape(label_shape);
   }
 }
 
-// This function is called on prefetch thread
-template<typename Dtype>
-void DataLayer<Dtype>::load_batch(Batch<Dtype>* batch) {
+// This function is used to create a thread that prefetches the data.
+template <typename Dtype>
+void DataLayer<Dtype>::InternalThreadEntry() {
   CPUTimer batch_timer;
   batch_timer.Start();
-  double deque_time = 0;
-  double decod_time = 0;
+  double read_time = 0;
   double trans_time = 0;
   CPUTimer timer;
-  CHECK(batch->data_.count());
+  CHECK(this->prefetch_data_.count());
   CHECK(this->transformed_data_.count());
 
   // Reshape on single input batches for inputs of varying dimension.
@@ -87,7 +91,8 @@ void DataLayer<Dtype>::load_batch(Batch<Dtype>* batch) {
   const int crop_size = this->layer_param_.transform_param().crop_size();
   bool force_color = this->layer_param_.data_param().force_encoded_color();
   if (batch_size == 1 && crop_size == 0) {
-    Datum& datum = *(reader_.full().peek());
+    Datum datum;
+    datum.ParseFromString(cursor_->value());
     if (datum.encoded()) {
       if (force_color) {
         DecodeDatum(&datum, true);
@@ -95,25 +100,24 @@ void DataLayer<Dtype>::load_batch(Batch<Dtype>* batch) {
         DecodeDatumNative(&datum);
       }
     }
-    batch->data_.Reshape(1, datum.channels(),
+    this->prefetch_data_.Reshape(1, datum.channels(),
         datum.height(), datum.width());
     this->transformed_data_.Reshape(1, datum.channels(),
         datum.height(), datum.width());
   }
 
-  Dtype* top_data = batch->data_.mutable_cpu_data();
+  Dtype* top_data = this->prefetch_data_.mutable_cpu_data();
   Dtype* top_label = NULL;  // suppress warnings about uninitialized variables
 
   if (this->output_labels_) {
-    top_label = batch->label_.mutable_cpu_data();
+    top_label = this->prefetch_label_.mutable_cpu_data();
   }
   for (int item_id = 0; item_id < batch_size; ++item_id) {
-    // get a blob
     timer.Start();
-    Datum& datum = *(reader_.full().pop("Waiting for data"));
-    deque_time += timer.MicroSeconds();
+    // get a blob
+    Datum datum;
+    datum.ParseFromString(cursor_->value());
 
-    timer.Start();
     cv::Mat cv_img;
     if (datum.encoded()) {
       if (force_color) {
@@ -128,11 +132,11 @@ void DataLayer<Dtype>::load_batch(Batch<Dtype>* batch) {
         << "convert_imageset.";
       }
     }
-    decod_time += timer.MicroSeconds();
+    read_time += timer.MicroSeconds();
+    timer.Start();
 
     // Apply data transformations (mirror, scale, crop...)
-    timer.Start();
-    int offset = batch->data_.offset(item_id);
+    int offset = this->prefetch_data_.offset(item_id);
     this->transformed_data_.set_cpu_data(top_data + offset);
     if (datum.encoded()) {
       this->data_transformer_->Transform(cv_img, &(this->transformed_data_));
@@ -143,17 +147,17 @@ void DataLayer<Dtype>::load_batch(Batch<Dtype>* batch) {
       top_label[item_id] = datum.label();
     }
     trans_time += timer.MicroSeconds();
-
-    reader_.free().push(const_cast<Datum*>(&datum));
+    // go to the next iter
+    cursor_->Next();
+    if (!cursor_->valid()) {
+      DLOG(INFO) << "Restarting data prefetching from start.";
+      cursor_->SeekToFirst();
+    }
   }
   batch_timer.Stop();
-
-#ifdef BENCHMARK_DATA
-  LOG(INFO) << "Prefetch batch: " << batch_timer.MilliSeconds() << " ms.";
-  LOG(INFO) << "  Dequeue time: " << deque_time / 1000 << " ms.";
-  LOG(INFO) << "   Decode time: " << decod_time / 1000 << " ms.";
-  LOG(INFO) << "Transform time: " << trans_time / 1000 << " ms.";
-#endif
+  DLOG(INFO) << "Prefetch batch: " << batch_timer.MilliSeconds() << " ms.";
+  DLOG(INFO) << "     Read time: " << read_time / 1000 << " ms.";
+  DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms.";
 }
 
 INSTANTIATE_CLASS(DataLayer);
diff --git a/src/caffe/layers/dropout_layer.cu b/src/caffe/layers/dropout_layer.cu
index 552d1ff2cf7..f9ea04f4acf 100644
--- a/src/caffe/layers/dropout_layer.cu
+++ b/src/caffe/layers/dropout_layer.cu
@@ -30,11 +30,11 @@ void DropoutLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     unsigned int* mask =
         static_cast<unsigned int*>(rand_vec_.mutable_gpu_data());
     caffe_gpu_rng_uniform(count, mask);
-    CUDA_POST_KERNEL_CHECK;
     // set thresholds
     // NOLINT_NEXT_LINE(whitespace/operators)
     DropoutForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
         count, bottom_data, mask, uint_thres_, scale_, top_data);
+    CUDA_POST_KERNEL_CHECK;
   } else {
     caffe_copy(count, bottom_data, top_data);
   }
diff --git a/src/caffe/layers/image_data_layer.cpp b/src/caffe/layers/image_data_layer.cpp
index 50187bbe5ce..38ebbd5ec14 100644
--- a/src/caffe/layers/image_data_layer.cpp
+++ b/src/caffe/layers/image_data_layer.cpp
@@ -17,7 +17,7 @@ namespace caffe {
 
 template <typename Dtype>
 ImageDataLayer<Dtype>::~ImageDataLayer<Dtype>() {
-  this->StopInternalThread();
+  this->JoinPrefetchThread();
 }
 
 template <typename Dtype>
@@ -70,14 +70,11 @@ void ImageDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
   const int batch_size = this->layer_param_.image_data_param().batch_size();
   if (crop_size > 0) {
     top[0]->Reshape(batch_size, channels, crop_size, crop_size);
-    for (int i = 0; i < this->PREFETCH_COUNT; ++i)
-      this->prefetch_[i].data_.Reshape(batch_size, channels,
-          crop_size, crop_size);
+    this->prefetch_data_.Reshape(batch_size, channels, crop_size, crop_size);
     this->transformed_data_.Reshape(1, channels, crop_size, crop_size);
   } else {
     top[0]->Reshape(batch_size, channels, height, width);
-    for (int i = 0; i < this->PREFETCH_COUNT; ++i)
-      this->prefetch_[i].data_.Reshape(batch_size, channels, height, width);
+    this->prefetch_data_.Reshape(batch_size, channels, height, width);
     this->transformed_data_.Reshape(1, channels, height, width);
   }
   LOG(INFO) << "output data size: " << top[0]->num() << ","
@@ -86,9 +83,7 @@ void ImageDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
   // label
   vector<int> label_shape(1, batch_size);
   top[1]->Reshape(label_shape);
-  for (int i = 0; i < this->PREFETCH_COUNT; ++i) {
-    this->prefetch_[i].label_.Reshape(label_shape);
-  }
+  this->prefetch_label_.Reshape(label_shape);
 }
 
 template <typename Dtype>
@@ -98,15 +93,15 @@ void ImageDataLayer<Dtype>::ShuffleImages() {
   shuffle(lines_.begin(), lines_.end(), prefetch_rng);
 }
 
-// This function is called on prefetch thread
+// This function is used to create a thread that prefetches the data.
 template <typename Dtype>
-void ImageDataLayer<Dtype>::load_batch(Batch<Dtype>* batch) {
+void ImageDataLayer<Dtype>::InternalThreadEntry() {
   CPUTimer batch_timer;
   batch_timer.Start();
   double read_time = 0;
   double trans_time = 0;
   CPUTimer timer;
-  CHECK(batch->data_.count());
+  CHECK(this->prefetch_data_.count());
   CHECK(this->transformed_data_.count());
   ImageDataParameter image_data_param = this->layer_param_.image_data_param();
   const int batch_size = image_data_param.batch_size();
@@ -120,14 +115,14 @@ void ImageDataLayer<Dtype>::load_batch(Batch<Dtype>* batch) {
   if (batch_size == 1 && crop_size == 0 && new_height == 0 && new_width == 0) {
     cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first,
         0, 0, is_color);
-    batch->data_.Reshape(1, cv_img.channels(),
+    this->prefetch_data_.Reshape(1, cv_img.channels(),
         cv_img.rows, cv_img.cols);
     this->transformed_data_.Reshape(1, cv_img.channels(),
         cv_img.rows, cv_img.cols);
   }
 
-  Dtype* prefetch_data = batch->data_.mutable_cpu_data();
-  Dtype* prefetch_label = batch->label_.mutable_cpu_data();
+  Dtype* prefetch_data = this->prefetch_data_.mutable_cpu_data();
+  Dtype* prefetch_label = this->prefetch_label_.mutable_cpu_data();
 
   // datum scales
   const int lines_size = lines_.size();
@@ -141,7 +136,7 @@ void ImageDataLayer<Dtype>::load_batch(Batch<Dtype>* batch) {
     read_time += timer.MicroSeconds();
     timer.Start();
     // Apply transformations (mirror, crop...) to the image
-    int offset = batch->data_.offset(item_id);
+    int offset = this->prefetch_data_.offset(item_id);
     this->transformed_data_.set_cpu_data(prefetch_data + offset);
     this->data_transformer_->Transform(cv_img, &(this->transformed_data_));
     trans_time += timer.MicroSeconds();
diff --git a/src/caffe/layers/lrn_layer.cpp b/src/caffe/layers/lrn_layer.cpp
index ba51a5eecb5..36c1ace4c99 100644
--- a/src/caffe/layers/lrn_layer.cpp
+++ b/src/caffe/layers/lrn_layer.cpp
@@ -254,6 +254,6 @@ STUB_GPU_BACKWARD(LRNLayer, CrossChannelBackward);
 #endif
 
 INSTANTIATE_CLASS(LRNLayer);
-// REGISTER_LAYER_CLASS(LRN);
+REGISTER_LAYER_CLASS(LRN);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/window_data_layer.cpp b/src/caffe/layers/window_data_layer.cpp
index f637f2ec6d4..c127d56bc46 100644
--- a/src/caffe/layers/window_data_layer.cpp
+++ b/src/caffe/layers/window_data_layer.cpp
@@ -27,7 +27,7 @@ namespace caffe {
 
 template <typename Dtype>
 WindowDataLayer<Dtype>::~WindowDataLayer<Dtype>() {
-  this->StopInternalThread();
+  this->JoinPrefetchThread();
 }
 
 template <typename Dtype>
@@ -171,9 +171,7 @@ void WindowDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
   CHECK_GT(crop_size, 0);
   const int batch_size = this->layer_param_.window_data_param().batch_size();
   top[0]->Reshape(batch_size, channels, crop_size, crop_size);
-  for (int i = 0; i < this->PREFETCH_COUNT; ++i)
-    this->prefetch_[i].data_.Reshape(
-        batch_size, channels, crop_size, crop_size);
+  this->prefetch_data_.Reshape(batch_size, channels, crop_size, crop_size);
 
   LOG(INFO) << "output data size: " << top[0]->num() << ","
       << top[0]->channels() << "," << top[0]->height() << ","
@@ -181,9 +179,7 @@ void WindowDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
   // label
   vector<int> label_shape(1, batch_size);
   top[1]->Reshape(label_shape);
-  for (int i = 0; i < this->PREFETCH_COUNT; ++i) {
-    this->prefetch_[i].label_.Reshape(label_shape);
-  }
+  this->prefetch_label_.Reshape(label_shape);
 
   // data mean
   has_mean_file_ = this->transform_param_.has_mean_file();
@@ -221,9 +217,9 @@ unsigned int WindowDataLayer<Dtype>::PrefetchRand() {
   return (*prefetch_rng)();
 }
 
-// This function is called on prefetch thread
+// Thread fetching the data
 template <typename Dtype>
-void WindowDataLayer<Dtype>::load_batch(Batch<Dtype>* batch) {
+void WindowDataLayer<Dtype>::InternalThreadEntry() {
   // At each iteration, sample N windows where N*p are foreground (object)
   // windows and N*(1-p) are background (non-object) windows
   CPUTimer batch_timer;
@@ -231,8 +227,8 @@ void WindowDataLayer<Dtype>::load_batch(Batch<Dtype>* batch) {
   double read_time = 0;
   double trans_time = 0;
   CPUTimer timer;
-  Dtype* top_data = batch->data_.mutable_cpu_data();
-  Dtype* top_label = batch->label_.mutable_cpu_data();
+  Dtype* top_data = this->prefetch_data_.mutable_cpu_data();
+  Dtype* top_label = this->prefetch_label_.mutable_cpu_data();
   const Dtype scale = this->layer_param_.window_data_param().scale();
   const int batch_size = this->layer_param_.window_data_param().batch_size();
   const int context_pad = this->layer_param_.window_data_param().context_pad();
@@ -256,7 +252,7 @@ void WindowDataLayer<Dtype>::load_batch(Batch<Dtype>* batch) {
   bool use_square = (crop_mode == "square") ? true : false;
 
   // zero out batch
-  caffe_set(batch->data_.count(), Dtype(0), top_data);
+  caffe_set(this->prefetch_data_.count(), Dtype(0), top_data);
 
   const int num_fg = static_cast<int>(static_cast<float>(batch_size)
       * fg_fraction);
diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp
index aa266cd73d2..a18ee63818e 100644
--- a/src/caffe/net.cpp
+++ b/src/caffe/net.cpp
@@ -8,7 +8,6 @@
 #include "caffe/common.hpp"
 #include "caffe/layer.hpp"
 #include "caffe/net.hpp"
-#include "caffe/parallel.hpp"
 #include "caffe/proto/caffe.pb.h"
 #include "caffe/util/insert_splits.hpp"
 #include "caffe/util/io.hpp"
@@ -40,13 +39,8 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
   // the current NetState.
   NetParameter filtered_param;
   FilterNet(in_param, &filtered_param);
-  if (phase_ == TRAIN) {
-    caffe::P2PSync<Dtype>::divide_batch_size(&filtered_param);
-  }
-  if (Caffe::root_solver()) {
-    LOG(INFO) << "Initializing net from parameters: " << std::endl
-              << filtered_param.DebugString();
-  }
+  LOG(INFO) << "Initializing net from parameters: " << std::endl
+            << filtered_param.DebugString();
   // Create a copy of filtered_param with splits added where necessary.
   NetParameter param;
   InsertSplits(filtered_param, &param);
@@ -70,9 +64,7 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
     const int layer_id = -1;  // inputs have fake layer ID -1
     AppendTop(param, layer_id, input_id, &available_blobs, &blob_name_to_idx);
   }
-  if (Caffe::root_solver()) {
-    DLOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype);
-  }
+  DLOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype);
   // For each layer, set up its input and output
   bottom_vecs_.resize(param.layer_size());
   top_vecs_.resize(param.layer_size());
@@ -95,9 +87,7 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
     }
     layers_.push_back(LayerRegistry<Dtype>::CreateLayer(layer_param));
     layer_names_.push_back(layer_param.name());
-    if (Caffe::root_solver()) {
-      LOG(INFO) << "Creating Layer " << layer_param.name();
-    }
+    LOG(INFO) << "Creating Layer " << layer_param.name();
     bool need_backward = false;
 
     // Figure out this layer's input and output
@@ -127,30 +117,20 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
       }
     }
     // After this layer is connected, set it up.
-    if (Caffe::root_solver()) {
-      LOG(INFO) << "Setting up " << layer_names_[layer_id];
-    }
+    LOG(INFO) << "Setting up " << layer_names_[layer_id];
     layers_[layer_id]->SetUp(bottom_vecs_[layer_id], top_vecs_[layer_id]);
     for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) {
       if (blob_loss_weights_.size() <= top_id_vecs_[layer_id][top_id]) {
         blob_loss_weights_.resize(top_id_vecs_[layer_id][top_id] + 1, Dtype(0));
       }
       blob_loss_weights_[top_id_vecs_[layer_id][top_id]] = layer->loss(top_id);
-      if (Caffe::root_solver()) {
-        LOG(INFO) << "Top shape: "
-                  << top_vecs_[layer_id][top_id]->shape_string();
-      }
+      LOG(INFO) << "Top shape: " << top_vecs_[layer_id][top_id]->shape_string();
       if (layer->loss(top_id)) {
-        if (Caffe::root_solver()) {
-          LOG(INFO) << "    with loss weight " << layer->loss(top_id);
-        }
+        LOG(INFO) << "    with loss weight " << layer->loss(top_id);
       }
       memory_used_ += top_vecs_[layer_id][top_id]->count();
     }
-    if (Caffe::root_solver()) {
-      DLOG(INFO) << "Memory required for data: "
-                 << memory_used_ * sizeof(Dtype);
-    }
+    DLOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype);
     const int param_size = layer_param.param_size();
     const int num_param_blobs = layers_[layer_id]->blobs().size();
     CHECK_LE(param_size, num_param_blobs)
@@ -209,14 +189,10 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
     }
     if (!layer_contributes_loss) { layer_need_backward_[layer_id] = false; }
     if (layer_need_backward_[layer_id]) {
-      if (Caffe::root_solver()) {
-        LOG(INFO) << layer_names_[layer_id] << " needs backward computation.";
-      }
+      LOG(INFO) << layer_names_[layer_id] << " needs backward computation.";
     } else {
-      if (Caffe::root_solver()) {
-        LOG(INFO) << layer_names_[layer_id]
-                  << " does not need backward computation.";
-      }
+      LOG(INFO) << layer_names_[layer_id]
+                << " does not need backward computation.";
     }
     for (int bottom_id = 0; bottom_id < bottom_vecs_[layer_id].size();
          ++bottom_id) {
@@ -256,9 +232,7 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
   // In the end, all remaining blobs are considered output blobs.
   for (set<string>::iterator it = available_blobs.begin();
       it != available_blobs.end(); ++it) {
-    if (Caffe::root_solver()) {
-      LOG(INFO) << "This network produces output " << *it;
-    }
+    LOG(INFO) << "This network produces output " << *it;
     net_output_blobs_.push_back(blobs_[blob_name_to_idx[*it]].get());
     net_output_blob_indices_.push_back(blob_name_to_idx[*it]);
   }
@@ -270,10 +244,8 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
   }
   GetLearningRateAndWeightDecay();
   debug_info_ = param.debug_info();
-  if (Caffe::root_solver()) {
-    LOG(INFO) << "Network initialization done.";
-    LOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype);
-  }
+  LOG(INFO) << "Network initialization done.";
+  LOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype);
 }
 
 template <typename Dtype>
@@ -312,33 +284,27 @@ bool Net<Dtype>::StateMeetsRule(const NetState& state,
   // Check whether the rule is broken due to phase.
   if (rule.has_phase()) {
       if (rule.phase() != state.phase()) {
-        if (Caffe::root_solver()) {
-          LOG(INFO) << "The NetState phase (" << state.phase()
-                    << ") differed from the phase (" << rule.phase()
-                    << ") specified by a rule in layer " << layer_name;
-        }
+        LOG(INFO) << "The NetState phase (" << state.phase()
+          << ") differed from the phase (" << rule.phase()
+          << ") specified by a rule in layer " << layer_name;
         return false;
       }
   }
   // Check whether the rule is broken due to min level.
   if (rule.has_min_level()) {
     if (state.level() < rule.min_level()) {
-      if (Caffe::root_solver()) {
-        LOG(INFO) << "The NetState level (" << state.level()
-                  << ") is above the min_level (" << rule.min_level()
-                  << ") specified by a rule in layer " << layer_name;
-      }
+      LOG(INFO) << "The NetState level (" << state.level()
+          << ") is above the min_level (" << rule.min_level()
+          << ") specified by a rule in layer " << layer_name;
       return false;
     }
   }
   // Check whether the rule is broken due to max level.
   if (rule.has_max_level()) {
     if (state.level() > rule.max_level()) {
-      if (Caffe::root_solver()) {
-        LOG(INFO) << "The NetState level (" << state.level()
-                  << ") is above the max_level (" << rule.max_level()
-                  << ") specified by a rule in layer " << layer_name;
-      }
+      LOG(INFO) << "The NetState level (" << state.level()
+          << ") is above the max_level (" << rule.max_level()
+          << ") specified by a rule in layer " << layer_name;
       return false;
     }
   }
@@ -351,10 +317,8 @@ bool Net<Dtype>::StateMeetsRule(const NetState& state,
       if (rule.stage(i) == state.stage(j)) { has_stage = true; }
     }
     if (!has_stage) {
-      if (Caffe::root_solver()) {
-        LOG(INFO) << "The NetState did not contain stage '" << rule.stage(i)
-                  << "' specified by a rule in layer " << layer_name;
-      }
+      LOG(INFO) << "The NetState did not contain stage '" << rule.stage(i)
+                << "' specified by a rule in layer " << layer_name;
       return false;
     }
   }
@@ -367,10 +331,8 @@ bool Net<Dtype>::StateMeetsRule(const NetState& state,
       if (rule.not_stage(i) == state.stage(j)) { has_stage = true; }
     }
     if (has_stage) {
-      if (Caffe::root_solver()) {
-        LOG(INFO) << "The NetState contained a not_stage '" << rule.not_stage(i)
-                  << "' specified by a rule in layer " << layer_name;
-      }
+      LOG(INFO) << "The NetState contained a not_stage '" << rule.not_stage(i)
+                << "' specified by a rule in layer " << layer_name;
       return false;
     }
   }
@@ -392,9 +354,7 @@ void Net<Dtype>::AppendTop(const NetParameter& param, const int layer_id,
   if (blob_name_to_idx && layer_param && layer_param->bottom_size() > top_id &&
       blob_name == layer_param->bottom(top_id)) {
     // In-place computation
-    if (Caffe::root_solver()) {
-      LOG(INFO) << layer_param->name() << " -> " << blob_name << " (in-place)";
-    }
+    LOG(INFO) << layer_param->name() << " -> " << blob_name << " (in-place)";
     top_vecs_[layer_id].push_back(blobs_[(*blob_name_to_idx)[blob_name]].get());
     top_id_vecs_[layer_id].push_back((*blob_name_to_idx)[blob_name]);
   } else if (blob_name_to_idx &&
@@ -404,12 +364,10 @@ void Net<Dtype>::AppendTop(const NetParameter& param, const int layer_id,
     LOG(FATAL) << "Duplicate blobs produced by multiple sources.";
   } else {
     // Normal output.
-    if (Caffe::root_solver()) {
-      if (layer_param) {
-        LOG(INFO) << layer_param->name() << " -> " << blob_name;
-      } else {
-        LOG(INFO) << "Input " << top_id << " -> " << blob_name;
-      }
+    if (layer_param) {
+      LOG(INFO) << layer_param->name() << " -> " << blob_name;
+    } else {
+      LOG(INFO) << "Input " << top_id << " -> " << blob_name;
     }
     shared_ptr<Blob<Dtype> > blob_pointer(new Blob<Dtype>());
     const int blob_id = blobs_.size();
@@ -449,9 +407,7 @@ int Net<Dtype>::AppendBottom(const NetParameter& param, const int layer_id,
                << " (at index " << bottom_id << ") to layer " << layer_id;
   }
   const int blob_id = (*blob_name_to_idx)[blob_name];
-  if (Caffe::root_solver()) {
-    LOG(INFO) << layer_names_[layer_id] << " <- " << blob_name;
-  }
+  LOG(INFO) << layer_names_[layer_id] << " <- " << blob_name;
   bottom_vecs_[layer_id].push_back(blobs_[blob_id].get());
   bottom_id_vecs_[layer_id].push_back(blob_id);
   available_blobs->erase(blob_name);
@@ -500,11 +456,9 @@ void Net<Dtype>::AppendParam(const NetParameter& param, const int layer_id,
         param_layer_indices_[owner_net_param_id];
     const int owner_layer_id = owner_index.first;
     const int owner_param_id = owner_index.second;
-    if (Caffe::root_solver()) {
-      LOG(INFO) << "Sharing parameters '" << param_name << "' owned by "
-                << "layer '" << layer_names_[owner_layer_id] << "', param "
-                << "index " << owner_param_id;
-    }
+    LOG(INFO) << "Sharing parameters '" << param_name << "' owned by "
+              << "layer '" << layer_names_[owner_layer_id] << "', param "
+              << "index " << owner_param_id;
     Blob<Dtype>* this_blob = layers_[layer_id]->blobs()[param_id].get();
     Blob<Dtype>* owner_blob =
         layers_[owner_layer_id]->blobs()[owner_param_id].get();
@@ -525,9 +479,7 @@ void Net<Dtype>::AppendParam(const NetParameter& param, const int layer_id,
 
 template <typename Dtype>
 void Net<Dtype>::GetLearningRateAndWeightDecay() {
-  if (Caffe::root_solver()) {
-    LOG(INFO) << "Collecting Learning Rate and Weight Decay.";
-  }
+  LOG(INFO) << "Collecting Learning Rate and Weight Decay.";
   ParamSpec default_param_spec;
   for (int i = 0; i < layers_.size(); ++i) {
     vector<shared_ptr<Blob<Dtype> > >& layer_blobs = layers_[i]->blobs();
@@ -553,7 +505,6 @@ Dtype Net<Dtype>::ForwardFromTo(int start, int end) {
   }
   for (int i = start; i <= end; ++i) {
     // LOG(ERROR) << "Forwarding " << layer_names_[i];
-    layers_[i]->Reshape(bottom_vecs_[i], top_vecs_[i]);
     Dtype layer_loss = layers_[i]->Forward(bottom_vecs_[i], top_vecs_[i]);
     loss += layer_loss;
     if (debug_info_) { ForwardDebugInfo(i); }
@@ -618,7 +569,6 @@ void Net<Dtype>::BackwardFromTo(int start, int end) {
   CHECK_LT(start, layers_.size());
   for (int i = start; i >= end; --i) {
     if (layer_need_backward_[i]) {
-      layers_[i]->Reshape(bottom_vecs_[i], top_vecs_[i]);
       layers_[i]->Backward(
           top_vecs_[i], bottom_need_backward_[i], bottom_vecs_[i]);
       if (debug_info_) { BackwardDebugInfo(i); }
@@ -631,10 +581,8 @@ void Net<Dtype>::InputDebugInfo(const int input_id) {
   const Blob<Dtype>& blob = *net_input_blobs_[input_id];
   const string& blob_name = blob_names_[net_input_blob_indices_[input_id]];
   const Dtype data_abs_val_mean = blob.asum_data() / blob.count();
-  if (Caffe::root_solver()) {
-    LOG(INFO) << "    [Forward] "
-              << "Input " << blob_name << " data: " << data_abs_val_mean;
-  }
+  LOG(INFO) << "    [Forward] "
+     << "Input " << blob_name << " data: " << data_abs_val_mean;
 }
 
 template <typename Dtype>
@@ -643,12 +591,9 @@ void Net<Dtype>::ForwardDebugInfo(const int layer_id) {
     const Blob<Dtype>& blob = *top_vecs_[layer_id][top_id];
     const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]];
     const Dtype data_abs_val_mean = blob.asum_data() / blob.count();
-    if (Caffe::root_solver()) {
-      LOG(INFO) << "    [Forward] "
-                << "Layer " << layer_names_[layer_id]
-                << ", top blob " << blob_name
-                << " data: " << data_abs_val_mean;
-    }
+    LOG(INFO) << "    [Forward] "
+       << "Layer " << layer_names_[layer_id] << ", top blob " << blob_name
+       << " data: " << data_abs_val_mean;
   }
   for (int param_id = 0; param_id < layers_[layer_id]->blobs().size();
        ++param_id) {
@@ -656,12 +601,9 @@ void Net<Dtype>::ForwardDebugInfo(const int layer_id) {
     const int net_param_id = param_id_vecs_[layer_id][param_id];
     const string& blob_name = param_display_names_[net_param_id];
     const Dtype data_abs_val_mean = blob.asum_data() / blob.count();
-    if (Caffe::root_solver()) {
-      LOG(INFO) << "    [Forward] "
-                << "Layer " << layer_names_[layer_id]
-                << ", param blob " << blob_name
-                << " data: " << data_abs_val_mean;
-    }
+    LOG(INFO) << "    [Forward] "
+       << "Layer " << layer_names_[layer_id] << ", param blob " << blob_name
+       << " data: " << data_abs_val_mean;
   }
 }
 
@@ -673,24 +615,18 @@ void Net<Dtype>::BackwardDebugInfo(const int layer_id) {
     const Blob<Dtype>& blob = *bottom_vec[bottom_id];
     const string& blob_name = blob_names_[bottom_id_vecs_[layer_id][bottom_id]];
     const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count();
-    if (Caffe::root_solver()) {
-      LOG(INFO) << "    [Backward] "
-                << "Layer " << layer_names_[layer_id]
-                << ", bottom blob " << blob_name
-                << " diff: " << diff_abs_val_mean;
-    }
+    LOG(INFO) << "    [Backward] "
+        << "Layer " << layer_names_[layer_id] << ", bottom blob " << blob_name
+        << " diff: " << diff_abs_val_mean;
   }
   for (int param_id = 0; param_id < layers_[layer_id]->blobs().size();
        ++param_id) {
     if (!layers_[layer_id]->param_propagate_down(param_id)) { continue; }
     const Blob<Dtype>& blob = *layers_[layer_id]->blobs()[param_id];
     const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count();
-    if (Caffe::root_solver()) {
-      LOG(INFO) << "    [Backward] "
-                << "Layer " << layer_names_[layer_id]
-                << ", param blob " << param_id
-                << " diff: " << diff_abs_val_mean;
-    }
+    LOG(INFO) << "    [Backward] "
+        << "Layer " << layer_names_[layer_id] << ", param blob " << param_id
+        << " diff: " << diff_abs_val_mean;
   }
 }
 
@@ -703,22 +639,17 @@ void Net<Dtype>::UpdateDebugInfo(const int param_id) {
   const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count();
   if (param_owner < 0) {
     const Dtype data_abs_val_mean = blob.asum_data() / blob.count();
-    if (Caffe::root_solver()) {
-      LOG(INFO) << "    [Update] Layer " << layer_name
-                << ", param " << param_display_name
-                << " data: " << data_abs_val_mean
-                << "; diff: " << diff_abs_val_mean;
-    }
+    LOG(INFO) << "    [Update] Layer " << layer_name
+        << ", param " << param_display_name
+        << " data: " << data_abs_val_mean << "; diff: " << diff_abs_val_mean;
   } else {
     const string& owner_layer_name =
         layer_names_[param_layer_indices_[param_owner].first];
-    if (Caffe::root_solver()) {
-      LOG(INFO) << "    [Update] Layer " << layer_name
-                << ", param blob " << param_display_name
-                << " (owned by layer " << owner_layer_name << ", " << "param "
-                << param_display_names_[param_owners_[param_id]] << ")"
-                << " diff: " << diff_abs_val_mean;
-    }
+    LOG(INFO) << "    [Update] Layer " << layer_name
+        << ", param blob " << param_display_name
+        << " (owned by layer " << owner_layer_name << ", "
+        << "param " << param_display_names_[param_owners_[param_id]] << ")"
+        << " diff: " << diff_abs_val_mean;
   }
 }
 
@@ -775,8 +706,8 @@ void Net<Dtype>::Backward() {
     const Dtype l2norm_data = std::sqrt(sumsq_data);
     const Dtype l2norm_diff = std::sqrt(sumsq_diff);
     LOG(ERROR) << "    [Backward] All net params (data, diff): "
-               << "L1 norm = (" << asum_data << ", " << asum_diff << "); "
-               << "L2 norm = (" << l2norm_data << ", " << l2norm_diff << ")";
+        << "L1 norm = (" << asum_data << ", " << asum_diff << "); "
+        << "L2 norm = (" << l2norm_data << ", " << l2norm_diff << ")";
   }
 }
 
diff --git a/src/caffe/parallel.cpp b/src/caffe/parallel.cpp
deleted file mode 100644
index 6ac0ff0406d..00000000000
--- a/src/caffe/parallel.cpp
+++ /dev/null
@@ -1,526 +0,0 @@
-#ifndef CPU_ONLY
-#include <cuda_runtime.h>
-#endif
-#include <glog/logging.h>
-#include <stdio.h>
-#include <sys/ioctl.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-
-#include <cstdlib>
-#include <sstream>
-#include <string>
-#include <vector>
-
-#include "boost/thread.hpp"
-#include "caffe/caffe.hpp"
-#include "caffe/parallel.hpp"
-
-namespace caffe {
-
-enum Op {
-  copy,
-  replace_cpu,
-  replace_gpu,
-  replace_cpu_diff,
-  replace_gpu_diff
-};
-
-template<typename Dtype>
-static void apply_buffers(const vector<shared_ptr<Blob<Dtype> > >& blobs,
-                          Dtype* buffer, size_t total_size, Op op) {
-  Dtype* ptr = buffer;
-  for (int i = 0; i < blobs.size(); ++i) {
-    int size = blobs[i]->count();
-    switch (op) {
-      case copy: {
-        // Init buffer to current values of blobs
-        caffe_copy(size,
-                   reinterpret_cast<const Dtype*>(blobs[i]->data()->cpu_data()),
-                   ptr);
-        break;
-      }
-      case replace_cpu:
-        blobs[i]->data()->set_cpu_data(ptr);
-        break;
-      case replace_gpu:
-        blobs[i]->data()->set_gpu_data(ptr);
-        break;
-      case replace_cpu_diff:
-        blobs[i]->diff()->set_cpu_data(ptr);
-        break;
-      case replace_gpu_diff:
-        blobs[i]->diff()->set_gpu_data(ptr);
-        break;
-    }
-    ptr += size;
-  }
-  CHECK_EQ(total_size, ptr - buffer);
-}
-
-// Buffer size necessary to store given blobs
-template<typename Dtype>
-static size_t total_size(const vector<shared_ptr<Blob<Dtype> > >& params) {
-  size_t size = 0;
-  for (int i = 0; i < params.size(); ++i)
-    size += params[i]->count();
-  return size;
-}
-
-template<typename Dtype>
-Params<Dtype>::Params(shared_ptr<Solver<Dtype> > root_solver)
-    : size_(total_size<Dtype>(root_solver->net()->params())),
-      data_(),
-      diff_() {
-}
-
-template<typename Dtype>
-GPUParams<Dtype>::GPUParams(shared_ptr<Solver<Dtype> > root_solver, int device)
-    : Params<Dtype>(root_solver) {
-#ifndef CPU_ONLY
-  int initial_device;
-  CUDA_CHECK(cudaGetDevice(&initial_device));
-
-  // Allocate device buffers
-  CUDA_CHECK(cudaSetDevice(device));
-  buffer_device_ = device;
-  // CUDA_CHECK(cudaMalloc(&data_, size_ * sizeof(Dtype)));
-  MemoryHandler::mallocGPU(reinterpret_cast<void **>(&data_),
-                           size_ * sizeof(Dtype));
-
-  // Copy blob values
-  const vector<shared_ptr<Blob<Dtype> > >& net = root_solver->net()->params();
-  apply_buffers(net, data_, size_, copy);
-
-  // CUDA_CHECK(cudaMalloc(&diff_, size_ * sizeof(Dtype)));
-  MemoryHandler::mallocGPU(reinterpret_cast<void **>(&diff_),
-                           size_ * sizeof(Dtype));
-  caffe_gpu_set(size_, Dtype(0), diff_);
-
-  CUDA_CHECK(cudaSetDevice(initial_device));
-#else
-  NO_GPU;
-#endif
-}
-
-template<typename Dtype>
-GPUParams<Dtype>::~GPUParams() {
-#ifndef CPU_ONLY
-  int initial_device;
-  cudaGetDevice(&initial_device);
-  cudaSetDevice(buffer_device_);
-  MemoryHandler::freeGPU(data_);
-  MemoryHandler::freeGPU(diff_);
-  data_ = NULL;
-  diff_ = NULL;
-  cudaSetDevice(initial_device);
-#endif
-}
-
-template<typename Dtype>
-void GPUParams<Dtype>::configure(Solver<Dtype>* solver) const {
-  const vector<shared_ptr<Blob<Dtype> > >& net = solver->net()->params();
-  apply_buffers(net, data_, size_, replace_gpu);
-  apply_buffers(net, diff_, size_, replace_gpu_diff);
-}
-
-//
-
-void DevicePair::compute(const vector<int> devices, vector<DevicePair>* pairs) {
-#ifndef CPU_ONLY
-  vector<int> remaining(devices);
-
-  // Depth for reduction tree
-  int remaining_depth = static_cast<int>(ceil(log2(remaining.size())));
-
-  // Group GPUs by board
-  for (int d = 0; d < remaining_depth; ++d) {
-    for (int i = 0; i < remaining.size(); ++i) {
-      for (int j = i + 1; j < remaining.size(); ++j) {
-        cudaDeviceProp a, b;
-        CUDA_CHECK(cudaGetDeviceProperties(&a, remaining[i]));
-        CUDA_CHECK(cudaGetDeviceProperties(&b, remaining[j]));
-        if (a.isMultiGpuBoard && b.isMultiGpuBoard) {
-          if (a.multiGpuBoardGroupID == b.multiGpuBoardGroupID) {
-              pairs->push_back(DevicePair(remaining[i], remaining[j]));
-              DLOG(INFO) << "GPU board: " << remaining[i]
-                         << ":" << remaining[j];
-              remaining.erase(remaining.begin() + j);
-              break;
-          }
-        }
-      }
-    }
-  }
-  ostringstream s;
-  for (int i = 0; i < remaining.size(); ++i) {
-    s << (i ? ", " : "") << remaining[i];
-  }
-  DLOG(INFO) << "GPUs paired by boards, remaining: " << s.str();
-
-  // Group by P2P accessibility
-  remaining_depth = ceil(log2(remaining.size()));
-  for (int d = 0; d < remaining_depth; ++d) {
-    for (int i = 0; i < remaining.size(); ++i) {
-      for (int j = i + 1; j < remaining.size(); ++j) {
-        int access;
-        CUDA_CHECK(cudaDeviceCanAccessPeer(&access,
-                                           remaining[i],
-                                           remaining[j]));
-        if (access) {
-            pairs->push_back(DevicePair(remaining[i], remaining[j]));
-            DLOG(INFO) << "P2P pair: " << remaining[i]
-                       << ":" << remaining[j];
-            remaining.erase(remaining.begin() + j);
-            break;
-        }
-      }
-    }
-  }
-  s.str("");
-  for (int i = 0; i < remaining.size(); ++i) {
-      s << (i ? ", " : "") << remaining[i];
-  }
-  DLOG(INFO) << "GPUs paired by P2P access, remaining: " << s.str();
-
-  // Group remaining
-  remaining_depth = ceil(log2(remaining.size()));
-  for (int d = 0; d < remaining_depth; ++d) {
-    for (int i = 0; i < remaining.size(); ++i) {
-          pairs->push_back(DevicePair(remaining[i], remaining[i+1]));
-          DLOG(INFO) << "Remaining pair: " << remaining[i]
-                     << ":" << remaining[i+1];
-          remaining.erase(remaining.begin() + i+1);
-    }
-  }
-
-  // Should only be the parent node remaining
-  CHECK_EQ(remaining.size(), 1);
-
-  pairs->insert(pairs->begin(), DevicePair(-1, remaining[0]));
-
-  CHECK(pairs->size() == devices.size());
-  for (int i = 0; i < pairs->size(); ++i) {
-    CHECK((*pairs)[i].parent() != (*pairs)[i].device());
-    for (int j = i + 1; j < pairs->size(); ++j) {
-      CHECK((*pairs)[i].device() != (*pairs)[j].device());
-    }
-  }
-#else
-  NO_GPU;
-#endif
-}
-
-//
-
-template<typename Dtype>
-P2PSync<Dtype>::P2PSync(shared_ptr<Solver<Dtype> > root_solver,
-                        P2PSync<Dtype>* parent, const SolverParameter& param)
-    : GPUParams<Dtype>(root_solver, param.device_id()),
-      parent_(parent),
-      children_(),
-      queue_(),
-      initial_iter_(root_solver->iter()),
-      solver_() {
-#ifndef CPU_ONLY
-  int initial_device;
-  CUDA_CHECK(cudaGetDevice(&initial_device));
-  const int self = param.device_id();
-  CUDA_CHECK(cudaSetDevice(self));
-
-  if (parent == NULL) {
-    solver_ = root_solver;
-  } else {
-    Caffe::set_root_solver(false);
-    solver_.reset(new Solver<Dtype>(param));
-    Caffe::set_root_solver(true);
-  }
-  this->configure(solver_.get());
-  solver_->add_callback(this);
-
-  if (parent) {
-    // Enable p2p access between devices
-    const int peer = parent->solver_->param().device_id();
-    int access;
-    CUDA_CHECK(cudaDeviceCanAccessPeer(&access, self, peer));
-    if (access) {
-      CUDA_CHECK(cudaDeviceEnablePeerAccess(peer, 0));
-    } else {
-      LOG(INFO)<< "GPU " << self << " does not have p2p access to GPU " << peer;
-    }
-    // Allocate receiving buffer on parent
-    CUDA_CHECK(cudaSetDevice(peer));
-    MemoryHandler::mallocGPU(reinterpret_cast<void **>(&parent_grads_),
-                     size_ * sizeof(Dtype));
-    CUDA_CHECK(cudaSetDevice(self));
-  }
-
-  CUDA_CHECK(cudaSetDevice(initial_device));
-#else
-  NO_GPU;
-#endif
-}
-
-template<typename Dtype>
-P2PSync<Dtype>::~P2PSync() {
-#ifndef CPU_ONLY
-  int initial_device;
-  CUDA_CHECK(cudaGetDevice(&initial_device));
-  const int self = solver_->param().device_id();
-  CUDA_CHECK(cudaSetDevice(self));
-
-  if (parent_) {
-    const int peer = parent_->solver_->param().device_id();
-    cudaSetDevice(peer);
-    MemoryHandler::freeGPU(parent_grads_);
-    parent_grads_ = NULL;
-    cudaSetDevice(self);
-    int access;
-    CUDA_CHECK(cudaDeviceCanAccessPeer(&access, self, peer));
-    if (access) {
-      CUDA_CHECK(cudaDeviceDisablePeerAccess(peer));
-    }
-  }
-
-  CUDA_CHECK(cudaSetDevice(initial_device));
-#endif
-}
-
-template<typename Dtype>
-void P2PSync<Dtype>::InternalThreadEntry() {
-  Caffe::SetDevice(solver_->param().device_id());
-  CHECK(Caffe::root_solver());
-  Caffe::set_root_solver(false);
-  // See if there is a defined seed and reset random state if so
-  if (solver_->param().random_seed() >= 0) {
-    // Fetch random seed and modulate by device ID to make sure
-    // everyone doesn't have the same seed.  We seem to have some
-    // solver instability if we have everyone with the same seed
-    Caffe::set_random_seed(
-        solver_->param().random_seed() + solver_->param().device_id());
-  }
-  solver_->Step(solver_->param().max_iter() - initial_iter_);
-}
-
-template<typename Dtype>
-void P2PSync<Dtype>::on_start(Timer* timer, ostringstream* timing) {
-#ifndef CPU_ONLY
-#ifdef DEBUG
-  int device;
-  CUDA_CHECK(cudaGetDevice(&device));
-  CHECK(device == solver_->param().device_id());
-#else
-//  CHECK(false);
-#endif
-
-  // Wait for update from parent
-  if (parent_) {
-    timer->Start();
-    P2PSync<Dtype> *parent = queue_.pop();
-    CHECK(parent == parent_);
-    *timing << " recv_param: " << timer->MilliSeconds();
-  }
-
-  // Update children
-  if (children_.size()) {
-    timer->Start();
-  }
-  for (int i = children_.size() - 1; i >= 0; i--) {
-    Dtype* src = data_;
-    Dtype* dst = children_[i]->data_;
-
-#ifdef DEBUG
-    cudaPointerAttributes attributes;
-    CUDA_CHECK(cudaPointerGetAttributes(&attributes, src));
-    CHECK(attributes.device == device);
-    CUDA_CHECK(cudaPointerGetAttributes(&attributes, dst));
-    CHECK(attributes.device == children_[i]->solver_->param().device_id());
-#endif
-
-    CUDA_CHECK(cudaMemcpyAsync(dst, src, size_ * sizeof(Dtype),  //
-        cudaMemcpyDeviceToDevice, cudaStreamDefault));
-    CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault));
-    children_[i]->queue_.push(this);
-  }
-  if (children_.size()) {
-    *timing << " send_param: " << timer->MilliSeconds();
-  }
-#endif
-}
-
-template<typename Dtype>
-void P2PSync<Dtype>::on_gradients_ready(Timer* timer, ostringstream* timing) {
-#ifndef CPU_ONLY
-#ifdef DEBUG
-  int device;
-  CUDA_CHECK(cudaGetDevice(&device));
-  CHECK(device == solver_->param().device_id());
-#endif
-
-  // Sum children gradients as they appear in the queue
-  for (int i = 0; i < children_.size(); ++i) {
-    timer->Start();
-    P2PSync<Dtype> *child = queue_.pop();
-    Dtype* src = child->parent_grads_;
-    Dtype* dst = diff_;
-
-#ifdef DEBUG
-    bool ok = false;
-    for (int j = 0; j < children_.size(); ++j) {
-      if (child == children_[j]) {
-        ok = true;
-      }
-    }
-    CHECK(ok);
-    cudaPointerAttributes attributes;
-    CUDA_CHECK(cudaPointerGetAttributes(&attributes, src));
-    CHECK(attributes.device == device);
-    CUDA_CHECK(cudaPointerGetAttributes(&attributes, dst));
-    CHECK(attributes.device == device);
-#endif
-
-    caffe_gpu_add(size_, src, dst, dst);
-    *timing << " add_grad: " << timer->MilliSeconds();
-  }
-
-  // Send gradients to parent
-  if (parent_) {
-    timer->Start();
-    Dtype* src = diff_;
-    Dtype* dst = parent_grads_;
-
-#ifdef DEBUG
-    cudaPointerAttributes attributes;
-    CUDA_CHECK(cudaPointerGetAttributes(&attributes, src));
-    CHECK(attributes.device == device);
-    CUDA_CHECK(cudaPointerGetAttributes(&attributes, dst));
-    CHECK(attributes.device == parent_->solver_->param().device_id());
-#endif
-
-    CUDA_CHECK(cudaMemcpyAsync(dst, src, size_ * sizeof(Dtype),  //
-        cudaMemcpyDeviceToDevice, cudaStreamDefault));
-    CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault));
-    parent_->queue_.push(this);
-    *timing << " send_grad: " << timer->MilliSeconds();
-  } else {
-    // Loss functions divide gradients by the batch size, so to compensate
-    // for split batch, the root solver divides by number of solvers.
-    caffe_gpu_scal(size_, Dtype(1.0 / Caffe::solver_count()), diff_);
-  }
-#endif
-}
-
-template<typename Dtype>
-void P2PSync<Dtype>::run(shared_ptr<Solver<Dtype> > root,
-                         const vector<int>& gpus) {
-  // Pair devices for map-reduce synchronization
-  vector<DevicePair> pairs;
-  DevicePair::compute(gpus, &pairs);
-  ostringstream s;
-  for (int i = 1; i < pairs.size(); ++i) {
-    s << (i == 1 ? "" : ", ") << pairs[i].parent() << ":" << pairs[i].device();
-  }
-  LOG(INFO)<< "GPUs pairs " << s.str();
-
-  SolverParameter param(root->param());
-  vector<shared_ptr<P2PSync<Dtype> > > syncs(gpus.size());
-  syncs[0].reset(new P2PSync<Dtype>(root, NULL, param));
-
-  // Build the GPU tree by finding the parent for each solver
-  for (int attempts = 0; attempts < pairs.size(); ++attempts) {
-    for (int i = 1; i < pairs.size(); ++i) {
-      if (!syncs[i].get()) {
-        P2PSync<Dtype>* parent = NULL;
-        for (int j = 0; j < syncs.size(); ++j) {
-          if (syncs[j]) {
-            const SolverParameter& p = syncs[j]->solver()->param();
-            if (p.device_id() == pairs[i].parent()) {
-              parent = (P2PSync<Dtype>*) syncs[j].get();
-            }
-          }
-        }
-        if (parent) {
-          param.set_device_id(pairs[i].device());
-          syncs[i].reset(new P2PSync<Dtype>(root, parent, param));
-          parent->children_.push_back((P2PSync<Dtype>*) syncs[i].get());
-        }
-      }
-    }
-  }
-
-  LOG(INFO)<< "Starting Optimization";
-
-  for (int i = 1; i < syncs.size(); ++i) {
-    syncs[i]->StartInternalThread();
-  }
-
-  // Run root solver on current thread
-  syncs[0]->solver_->Solve();
-
-  for (int i = 1; i < syncs.size(); ++i) {
-    syncs[i]->StopInternalThread();
-  }
-}
-
-template<typename Dtype>
-void P2PSync<Dtype>::divide_batch_size(NetParameter* net) {
-  int solver_count = Caffe::solver_count();
-  for (int i = 0; i < net->layer_size(); ++i) {
-    string m = "Batch size must be divisible by the number of solvers (GPUs)";
-    if (net->layer(i).has_data_param()) {
-      if (net->layer(i).data_param().has_batch_size()) {
-        uint32_t total = net->layer(i).data_param().batch_size();
-        uint32_t batch = total / solver_count;
-        CHECK(batch * solver_count == total) << m;
-        net->mutable_layer(i)->mutable_data_param()->set_batch_size(batch);
-
-        // Also adjust the prefetch count, as it is shared by all solvers
-        uint32_t prefetch = net->layer(i).data_param().prefetch();
-        net->mutable_layer(i)->mutable_data_param()->set_prefetch(
-            prefetch * solver_count);
-      }
-    }
-    if (net->layer(i).has_hdf5_data_param()) {
-      if (net->layer(i).hdf5_data_param().has_batch_size()) {
-        uint32_t total = net->layer(i).hdf5_data_param().batch_size();
-        uint32_t batch = total / solver_count;
-        CHECK(batch * solver_count == total) << m;
-        net->mutable_layer(i)->mutable_hdf5_data_param()->set_batch_size(batch);
-      }
-    }
-    if (net->layer(i).has_image_data_param()) {
-      if (net->layer(i).image_data_param().has_batch_size()) {
-        uint32_t total = net->layer(i).image_data_param().batch_size();
-        uint32_t batch = total / solver_count;
-        CHECK(batch * solver_count == total) << m;
-        net->mutable_layer(i)->mutable_image_data_param()->set_batch_size(
-            batch);
-      }
-    }
-    if (net->layer(i).has_memory_data_param()) {
-      if (net->layer(i).memory_data_param().has_batch_size()) {
-        uint32_t total = net->layer(i).memory_data_param().batch_size();
-        uint32_t batch = total / solver_count;
-        CHECK(batch * solver_count == total) << m;
-        net->mutable_layer(i)->mutable_memory_data_param()->set_batch_size(
-            batch);
-      }
-    }
-    if (net->layer(i).has_window_data_param()) {
-      if (net->layer(i).window_data_param().has_batch_size()) {
-        uint32_t total = net->layer(i).window_data_param().batch_size();
-        uint32_t batch = total / solver_count;
-        CHECK(batch * solver_count == total) << m;
-        net->mutable_layer(i)->mutable_window_data_param()->set_batch_size(
-            batch);
-      }
-    }
-  }
-}
-
-INSTANTIATE_CLASS(Params);
-INSTANTIATE_CLASS(GPUParams);
-INSTANTIATE_CLASS(P2PSync);
-
-}  // namespace caffe
-
diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto
index 21a38d54ffc..307015f42c9 100644
--- a/src/caffe/proto/caffe.proto
+++ b/src/caffe/proto/caffe.proto
@@ -440,28 +440,6 @@ message ConvolutionParameter {
     CUDNN = 2;
   }
   optional Engine engine = 15 [default = DEFAULT];
-  enum CuDNNFwdAlgorithm {
-    IMPLICIT_GEMM = 0;
-    IMPLICIT_PRECOMP_GEMM = 1;
-    GEMM = 2;
-    DIRECT = 3;
-    FWD_FFT = 4;
-  }
-  optional CuDNNFwdAlgorithm CuDNNFwdAlgo = 16 [default = IMPLICIT_GEMM];
-  enum CuDNNBwdFilterAlgorithm {
-    BWD_FILTER_ALGO_0 = 0;
-    BWD_FILTER_ALGO_1 = 1;
-    BWD_FILTER_FFT = 2;
-  }
-  optional CuDNNBwdFilterAlgorithm CuDNNBwdFilterAlgo = 17 [default = BWD_FILTER_ALGO_0];
-  enum CuDNNBwdDataAlgorithm {
-    BWD_DATA_ALGO_0 = 0;
-    BWD_DATA_ALGO_1 = 1;
-    BWD_DATA_FFT = 2;
-    BWD_DATA_ALGO_2 = 3;
-  }
-  optional CuDNNBwdDataAlgorithm CuDNNBwdDataAlgo = 18 [default = BWD_DATA_ALGO_0];
-  optional uint32 CuDNNWorkspaceSize = 19 [default = 10485760]; // 10MB default
 }
 
 message DataParameter {
@@ -477,7 +455,6 @@ message DataParameter {
   // to avoid all asynchronous sgd clients to start at the same point. The skip
   // point would be set as rand_skip * rand(0,1). Note that rand_skip should not
   // be larger than the number of keys in the database.
-  // DEPRECATED. Each solver accesses a different subset of the database.
   optional uint32 rand_skip = 7 [default = 0];
   optional DB backend = 8 [default = LEVELDB];
   // DEPRECATED. See TransformationParameter. For data pre-processing, we can do
@@ -493,9 +470,6 @@ message DataParameter {
   optional bool mirror = 6 [default = false];
   // Force the encoded image to have 3 color channels
   optional bool force_encoded_color = 9 [default = false];
-  // Prefetch queue (Number of batches to prefetch to host memory, increase if
-  // data access bandwidth varies).
-  optional uint32 prefetch = 10 [default = 4];
 }
 
 message DropoutParameter {
@@ -629,12 +603,6 @@ message LRNParameter {
   }
   optional NormRegion norm_region = 4 [default = ACROSS_CHANNELS];
   optional float k = 5 [default = 1.];
-  enum Engine {
-    DEFAULT = 0;
-    CAFFE = 1;
-    CUDNN = 2;
-  }
-  optional Engine engine = 6 [default = DEFAULT];
 }
 
 message MemoryDataParameter {
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index 926490365ff..877b19b86f8 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -15,13 +15,13 @@ namespace caffe {
 
 template <typename Dtype>
 Solver<Dtype>::Solver(const SolverParameter& param)
-    : net_(), callbacks_(), iteration_timer_(), iterations_last_() {
+    : net_() {
   Init(param);
 }
 
 template <typename Dtype>
 Solver<Dtype>::Solver(const string& param_file)
-    : net_(), callbacks_(), iteration_timer_(), iterations_last_() {
+    : net_() {
   SolverParameter param;
   ReadProtoFromTextFileOrDie(param_file, &param);
   Init(param);
@@ -29,21 +29,17 @@ Solver<Dtype>::Solver(const string& param_file)
 
 template <typename Dtype>
 void Solver<Dtype>::Init(const SolverParameter& param) {
-  if (Caffe::root_solver()) {
-    LOG(INFO) << "Initializing solver from parameters: " << std::endl
-              << param.DebugString();
-  }
+  LOG(INFO) << "Initializing solver from parameters: " << std::endl
+            << param.DebugString();
   param_ = param;
   CHECK_GE(param_.average_loss(), 1) << "average_loss should be non-negative.";
-  if (Caffe::root_solver() && param_.random_seed() >= 0) {
+  if (param_.random_seed() >= 0) {
     Caffe::set_random_seed(param_.random_seed());
   }
   // Scaffolding code
   InitTrainNet();
-  if (Caffe::root_solver()) {
-    InitTestNets();
-    LOG(INFO) << "Solver scaffolding done.";
-  }
+  InitTestNets();
+  LOG(INFO) << "Solver scaffolding done.";
   iter_ = 0;
   current_step_ = 0;
 }
@@ -59,27 +55,19 @@ void Solver<Dtype>::InitTrainNet() {
       << "one of these fields specifying a train_net: " << field_names;
   NetParameter net_param;
   if (param_.has_train_net_param()) {
-    if (Caffe::root_solver()) {
-      LOG(INFO) << "Creating training net specified in train_net_param.";
-    }
+    LOG(INFO) << "Creating training net specified in train_net_param.";
     net_param.CopyFrom(param_.train_net_param());
   } else if (param_.has_train_net()) {
-    if (Caffe::root_solver()) {
-      LOG(INFO) << "Creating training net from train_net file: "
-                << param_.train_net();
-    }
+    LOG(INFO) << "Creating training net from train_net file: "
+              << param_.train_net();
     ReadNetParamsFromTextFileOrDie(param_.train_net(), &net_param);
   }
   if (param_.has_net_param()) {
-    if (Caffe::root_solver()) {
-      LOG(INFO) << "Creating training net specified in net_param.";
-    }
+    LOG(INFO) << "Creating training net specified in net_param.";
     net_param.CopyFrom(param_.net_param());
   }
   if (param_.has_net()) {
-    if (Caffe::root_solver()) {
-      LOG(INFO) << "Creating training net from net file: " << param_.net();
-    }
+    LOG(INFO) << "Creating training net from net file: " << param_.net();
     ReadNetParamsFromTextFileOrDie(param_.net(), &net_param);
   }
   // Set the correct NetState.  We start with the solver defaults (lowest
@@ -96,7 +84,6 @@ void Solver<Dtype>::InitTrainNet() {
 
 template <typename Dtype>
 void Solver<Dtype>::InitTestNets() {
-  CHECK(Caffe::root_solver());
   const bool has_net_param = param_.has_net_param();
   const bool has_net_file = param_.has_net();
   const int num_generic_nets = has_net_param + has_net_file;
@@ -180,26 +167,12 @@ void Solver<Dtype>::Step(int iters) {
   vector<Dtype> losses;
   Dtype smoothed_loss = 0;
 
-  iteration_timer_.Start();
-  Timer timer;
-  ostringstream timing;
-
   while (iter_ < stop_iter) {
     if (param_.test_interval() && iter_ % param_.test_interval() == 0
-        && (iter_ > 0 || param_.test_initialization())
-        && Caffe::root_solver()) {
+        && (iter_ > 0 || param_.test_initialization())) {
       TestAll();
     }
 
-    timer.Start();
-    timing.str("");
-    timing << "Timing ";
-    if (param().solver_mode() == SolverParameter_SolverMode_GPU) {
-      timing << "(device " << param().device_id() << ") ";
-    }
-    for (int i = 0; i < callbacks_.size(); ++i) {
-      callbacks_[i]->on_start(&timer, &timing);
-    }
     const bool display = param_.display() && iter_ % param_.display() == 0;
     net_->set_debug_info(display && param_.debug_info());
     Dtype loss = net_->ForwardBackward(bottom_vec);
@@ -213,9 +186,7 @@ void Solver<Dtype>::Step(int iters) {
       losses[idx] = loss;
     }
     if (display) {
-      if (Caffe::root_solver()) {
-        LOG(INFO) << "Iteration " << iter_ << ", loss = " << smoothed_loss;
-      }
+      LOG(INFO) << "Iteration " << iter_ << ", loss = " << smoothed_loss;
       const vector<Blob<Dtype>*>& result = net_->output_blobs();
       int score_index = 0;
       for (int j = 0; j < result.size(); ++j) {
@@ -230,34 +201,21 @@ void Solver<Dtype>::Step(int iters) {
             loss_msg_stream << " (* " << loss_weight
                             << " = " << loss_weight * result_vec[k] << " loss)";
           }
-          if (Caffe::root_solver()) {
-            LOG(INFO) << "    Train net output #"
-                << score_index++ << ": " << output_name << " = "
-                << result_vec[k] << loss_msg_stream.str();
-          }
+          LOG(INFO) << "    Train net output #"
+              << score_index++ << ": " << output_name << " = "
+              << result_vec[k] << loss_msg_stream.str();
         }
       }
     }
-    timing << " grads: " << timer.MilliSeconds();
-    for (int i = 0; i < callbacks_.size(); ++i) {
-      callbacks_[i]->on_gradients_ready(&timer, &timing);
-    }
-    timer.Start();
-    Iteration();
-    timing << " apply: " << timer.MilliSeconds();
-
-#ifdef BENCHMARK_SOLVER
-    LOG(INFO)<< timing.str();
-#endif
+    ComputeUpdateValue();
+    net_->Update();
 
     // Increment the internal iter_ counter -- its value should always indicate
     // the number of times the weights have been updated.
     ++iter_;
 
     // Save a snapshot if needed.
-    if (param_.snapshot()
-        && iter_ % param_.snapshot() == 0
-        && Caffe::root_solver()) {
+    if (param_.snapshot() && iter_ % param_.snapshot() == 0) {
       Snapshot();
     }
   }
@@ -265,7 +223,6 @@ void Solver<Dtype>::Step(int iters) {
 
 template <typename Dtype>
 void Solver<Dtype>::Solve(const char* resume_file) {
-  CHECK(Caffe::root_solver());
   LOG(INFO) << "Solving " << net_->name();
   LOG(INFO) << "Learning Rate Policy: " << param_.lr_policy();
 
@@ -310,7 +267,6 @@ void Solver<Dtype>::TestAll() {
 
 template <typename Dtype>
 void Solver<Dtype>::Test(const int test_net_id) {
-  CHECK(Caffe::root_solver());
   LOG(INFO) << "Iteration " << iter_
             << ", Testing net (#" << test_net_id << ")";
   CHECK_NOTNULL(test_nets_[test_net_id].get())->
@@ -361,14 +317,13 @@ void Solver<Dtype>::Test(const int test_net_id) {
                       << " = " << loss_weight * mean_score << " loss)";
     }
     LOG(INFO) << "    Test net output #" << i << ": " << output_name << " = "
-              << mean_score << loss_msg_stream.str();
+        << mean_score << loss_msg_stream.str();
   }
 }
 
 
 template <typename Dtype>
 void Solver<Dtype>::Snapshot() {
-  CHECK(Caffe::root_solver());
   NetParameter net_param;
   // For intermediate results, we will also dump the gradient values.
   net_->ToProto(&net_param, param_.snapshot_diff());
@@ -393,7 +348,6 @@ void Solver<Dtype>::Snapshot() {
 
 template <typename Dtype>
 void Solver<Dtype>::Restore(const char* state_file) {
-  CHECK(Caffe::root_solver());
   SolverState state;
   NetParameter net_param;
   ReadProtoFromBinaryFile(state_file, &state);
@@ -502,124 +456,95 @@ void SGDSolver<Dtype>::ClipGradients() {
 }
 
 template <typename Dtype>
-void SGDSolver<Dtype>::Iteration() {
-  CHECK(Caffe::root_solver());
+void SGDSolver<Dtype>::ComputeUpdateValue() {
+  const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
+  const vector<float>& net_params_lr = this->net_->params_lr();
+  const vector<float>& net_params_weight_decay =
+      this->net_->params_weight_decay();
+  // get the learning rate
   Dtype rate = GetLearningRate();
   if (this->param_.display() && this->iter_ % this->param_.display() == 0) {
-    float lapse = iteration_timer_.Seconds();
-    float per_s = (this->iter_ - iterations_last_) / (lapse ? lapse : 1);
-    LOG(INFO) << "Iteration " << this->iter_ << " (" << per_s << "/s), "
-              << "lr = " << rate;
-    iteration_timer_.Start();
-    iterations_last_ = this->iter_;
+    LOG(INFO) << "Iteration " << this->iter_ << ", lr = " << rate;
   }
   ClipGradients();
-  for (int param_id = 0; param_id < this->net_->params().size(); ++param_id) {
-    Regularize(param_id);
-    ComputeUpdateValue(param_id, rate);
-  }
-  this->net_->Update();
-}
-
-template <typename Dtype>
-void SGDSolver<Dtype>::Regularize(int param_id) {
-  const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
-  const vector<float>& net_params_weight_decay =
-      this->net_->params_weight_decay();
+  Dtype momentum = this->param_.momentum();
   Dtype weight_decay = this->param_.weight_decay();
   string regularization_type = this->param_.regularization_type();
   switch (Caffe::mode()) {
-  case Caffe::CPU: {
-    Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
-    if (local_decay) {
-      if (regularization_type == "L2") {
-        // add weight decay
-        caffe_axpy(net_params[param_id]->count(),
-            local_decay,
-            net_params[param_id]->cpu_data(),
-            net_params[param_id]->mutable_cpu_diff());
-      } else if (regularization_type == "L1") {
-        caffe_cpu_sign(net_params[param_id]->count(),
-            net_params[param_id]->cpu_data(),
-            temp_[param_id]->mutable_cpu_data());
-        caffe_axpy(net_params[param_id]->count(),
-            local_decay,
-            temp_[param_id]->cpu_data(),
-            net_params[param_id]->mutable_cpu_diff());
-      } else {
-        LOG(FATAL) << "Unknown regularization type: " << regularization_type;
+  case Caffe::CPU:
+    for (int param_id = 0; param_id < net_params.size(); ++param_id) {
+      // Compute the value to history, and then copy them to the blob's diff.
+      Dtype local_rate = rate * net_params_lr[param_id];
+      Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
+
+      if (local_decay) {
+        if (regularization_type == "L2") {
+          // add weight decay
+          caffe_axpy(net_params[param_id]->count(),
+              local_decay,
+              net_params[param_id]->cpu_data(),
+              net_params[param_id]->mutable_cpu_diff());
+        } else if (regularization_type == "L1") {
+          caffe_cpu_sign(net_params[param_id]->count(),
+              net_params[param_id]->cpu_data(),
+              temp_[param_id]->mutable_cpu_data());
+          caffe_axpy(net_params[param_id]->count(),
+              local_decay,
+              temp_[param_id]->cpu_data(),
+              net_params[param_id]->mutable_cpu_diff());
+        } else {
+          LOG(FATAL) << "Unknown regularization type: " << regularization_type;
+        }
       }
+
+      caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
+                net_params[param_id]->cpu_diff(), momentum,
+                history_[param_id]->mutable_cpu_data());
+      // copy
+      caffe_copy(net_params[param_id]->count(),
+          history_[param_id]->cpu_data(),
+          net_params[param_id]->mutable_cpu_diff());
     }
     break;
-  }
-  case Caffe::GPU: {
+  case Caffe::GPU:
 #ifndef CPU_ONLY
-    Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
-    if (local_decay) {
-      if (regularization_type == "L2") {
-        // add weight decay
-        caffe_gpu_axpy(net_params[param_id]->count(),
-            local_decay,
-            net_params[param_id]->gpu_data(),
-            net_params[param_id]->mutable_gpu_diff());
-      } else if (regularization_type == "L1") {
-        caffe_gpu_sign(net_params[param_id]->count(),
-            net_params[param_id]->gpu_data(),
-            temp_[param_id]->mutable_gpu_data());
-        caffe_gpu_axpy(net_params[param_id]->count(),
-            local_decay,
-            temp_[param_id]->gpu_data(),
-            net_params[param_id]->mutable_gpu_diff());
-      } else {
-        LOG(FATAL) << "Unknown regularization type: " << regularization_type;
+    for (int param_id = 0; param_id < net_params.size(); ++param_id) {
+      // Compute the value to history, and then copy them to the blob's diff.
+      Dtype local_rate = rate * net_params_lr[param_id];
+      Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
+
+      if (local_decay) {
+        if (regularization_type == "L2") {
+          // add weight decay
+          caffe_gpu_axpy(net_params[param_id]->count(),
+              local_decay,
+              net_params[param_id]->gpu_data(),
+              net_params[param_id]->mutable_gpu_diff());
+        } else if (regularization_type == "L1") {
+          caffe_gpu_sign(net_params[param_id]->count(),
+              net_params[param_id]->gpu_data(),
+              temp_[param_id]->mutable_gpu_data());
+          caffe_gpu_axpy(net_params[param_id]->count(),
+              local_decay,
+              temp_[param_id]->gpu_data(),
+              net_params[param_id]->mutable_gpu_diff());
+        } else {
+          LOG(FATAL) << "Unknown regularization type: " << regularization_type;
+        }
       }
-    }
-#else
-    NO_GPU;
-#endif
-    break;
-  }
-  default:
-    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
-  }
-}
 
-template <typename Dtype>
-void SGDSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
-  const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
-  const vector<float>& net_params_lr = this->net_->params_lr();
-  Dtype momentum = this->param_.momentum();
-  switch (Caffe::mode()) {
-  case Caffe::CPU: {
-    // Compute the value to history, and then copy them to the blob's diff.
-    Dtype local_rate = rate * net_params_lr[param_id];
-
-    caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
-              net_params[param_id]->cpu_diff(), momentum,
-              history_[param_id]->mutable_cpu_data());
-    // copy
-    caffe_copy(net_params[param_id]->count(),
-        history_[param_id]->cpu_data(),
-        net_params[param_id]->mutable_cpu_diff());
-    break;
-  }
-  case Caffe::GPU: {
-#ifndef CPU_ONLY
-    // Compute the value to history, and then copy them to the blob's diff.
-    Dtype local_rate = rate * net_params_lr[param_id];
-
-    caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
-              net_params[param_id]->gpu_diff(), momentum,
-              history_[param_id]->mutable_gpu_data());
-    // copy
-    caffe_copy(net_params[param_id]->count(),
-        history_[param_id]->gpu_data(),
-        net_params[param_id]->mutable_gpu_diff());
+      caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
+                net_params[param_id]->gpu_diff(), momentum,
+                history_[param_id]->mutable_gpu_data());
+      // copy
+      caffe_copy(net_params[param_id]->count(),
+          history_[param_id]->gpu_data(),
+          net_params[param_id]->mutable_gpu_diff());
+    }
 #else
     NO_GPU;
 #endif
     break;
-  }
   default:
     LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
   }
@@ -637,7 +562,6 @@ void SGDSolver<Dtype>::SnapshotSolverState(SolverState* state) {
 
 template <typename Dtype>
 void SGDSolver<Dtype>::RestoreSolverState(const SolverState& state) {
-  CHECK(Caffe::root_solver());
   CHECK_EQ(state.history_size(), history_.size())
       << "Incorrect length of history blobs.";
   LOG(INFO) << "SGDSolver: restoring history";
@@ -647,146 +571,252 @@ void SGDSolver<Dtype>::RestoreSolverState(const SolverState& state) {
 }
 
 template <typename Dtype>
-void NesterovSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
-  CHECK(Caffe::root_solver());
+void NesterovSolver<Dtype>::ComputeUpdateValue() {
   const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
   const vector<float>& net_params_lr = this->net_->params_lr();
+  const vector<float>& net_params_weight_decay =
+      this->net_->params_weight_decay();
+  // get the learning rate
+  Dtype rate = this->GetLearningRate();
+  if (this->param_.display() && this->iter_ % this->param_.display() == 0) {
+    LOG(INFO) << "Iteration " << this->iter_ << ", lr = " << rate;
+  }
+  SGDSolver<Dtype>::ClipGradients();
   Dtype momentum = this->param_.momentum();
+  Dtype weight_decay = this->param_.weight_decay();
+  string regularization_type = this->param_.regularization_type();
   switch (Caffe::mode()) {
-  case Caffe::CPU: {
-    // save history momentum for stepping back
-    caffe_copy(net_params[param_id]->count(),
-        this->history_[param_id]->cpu_data(),
-        this->update_[param_id]->mutable_cpu_data());
-
-    Dtype local_rate = rate * net_params_lr[param_id];
-
-    // update history
-    caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
-              net_params[param_id]->cpu_diff(), momentum,
-              this->history_[param_id]->mutable_cpu_data());
-
-    // compute update: step back then over step
-    caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum,
-        this->history_[param_id]->cpu_data(), -momentum,
-        this->update_[param_id]->mutable_cpu_data());
-
-    // copy
-    caffe_copy(net_params[param_id]->count(),
-        this->update_[param_id]->cpu_data(),
-        net_params[param_id]->mutable_cpu_diff());
+  case Caffe::CPU:
+    for (int param_id = 0; param_id < net_params.size(); ++param_id) {
+      // save history momentum for stepping back
+      caffe_copy(net_params[param_id]->count(),
+          this->history_[param_id]->cpu_data(),
+          this->update_[param_id]->mutable_cpu_data());
+
+      Dtype local_rate = rate * net_params_lr[param_id];
+      Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
+
+      if (local_decay) {
+        if (regularization_type == "L2") {
+          // add weight decay
+          caffe_axpy(net_params[param_id]->count(),
+              local_decay,
+              net_params[param_id]->cpu_data(),
+              net_params[param_id]->mutable_cpu_diff());
+        } else if (regularization_type == "L1") {
+          caffe_cpu_sign(net_params[param_id]->count(),
+              net_params[param_id]->cpu_data(),
+              this->temp_[param_id]->mutable_cpu_data());
+          caffe_axpy(net_params[param_id]->count(),
+              local_decay,
+              this->temp_[param_id]->cpu_data(),
+              net_params[param_id]->mutable_cpu_diff());
+        } else {
+          LOG(FATAL) << "Unknown regularization type: " << regularization_type;
+        }
+      }
+
+      // update history
+      caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
+                net_params[param_id]->cpu_diff(), momentum,
+                this->history_[param_id]->mutable_cpu_data());
+
+      // compute udpate: step back then over step
+      caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum,
+          this->history_[param_id]->cpu_data(), -momentum,
+          this->update_[param_id]->mutable_cpu_data());
+
+      // copy
+      caffe_copy(net_params[param_id]->count(),
+          this->update_[param_id]->cpu_data(),
+          net_params[param_id]->mutable_cpu_diff());
+    }
     break;
-  }
-  case Caffe::GPU: {
+  case Caffe::GPU:
 #ifndef CPU_ONLY
-    // save history momentum for stepping back
-    caffe_copy(net_params[param_id]->count(),
-        this->history_[param_id]->gpu_data(),
-        this->update_[param_id]->mutable_gpu_data());
-
-    Dtype local_rate = rate * net_params_lr[param_id];
-
-    // update history
-    caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
-              net_params[param_id]->gpu_diff(), momentum,
-              this->history_[param_id]->mutable_gpu_data());
-
-    // compute update: step back then over step
-    caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum,
-        this->history_[param_id]->gpu_data(), -momentum,
-        this->update_[param_id]->mutable_gpu_data());
-
-    // copy
-    caffe_copy(net_params[param_id]->count(),
-        this->update_[param_id]->gpu_data(),
-        net_params[param_id]->mutable_gpu_diff());
+    for (int param_id = 0; param_id < net_params.size(); ++param_id) {
+      // save history momentum for stepping back
+      caffe_copy(net_params[param_id]->count(),
+          this->history_[param_id]->gpu_data(),
+          this->update_[param_id]->mutable_gpu_data());
+
+      Dtype local_rate = rate * net_params_lr[param_id];
+      Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
+
+      if (local_decay) {
+        if (regularization_type == "L2") {
+          // add weight decay
+          caffe_gpu_axpy(net_params[param_id]->count(),
+              local_decay,
+              net_params[param_id]->gpu_data(),
+              net_params[param_id]->mutable_gpu_diff());
+        } else if (regularization_type == "L1") {
+          caffe_gpu_sign(net_params[param_id]->count(),
+              net_params[param_id]->gpu_data(),
+              this->temp_[param_id]->mutable_gpu_data());
+          caffe_gpu_axpy(net_params[param_id]->count(),
+              local_decay,
+              this->temp_[param_id]->gpu_data(),
+              net_params[param_id]->mutable_gpu_diff());
+        } else {
+          LOG(FATAL) << "Unknown regularization type: " << regularization_type;
+        }
+      }
+
+      // update history
+      caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
+                net_params[param_id]->gpu_diff(), momentum,
+                this->history_[param_id]->mutable_gpu_data());
+
+      // compute udpate: step back then over step
+      caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum,
+          this->history_[param_id]->gpu_data(), -momentum,
+          this->update_[param_id]->mutable_gpu_data());
+
+      // copy
+      caffe_copy(net_params[param_id]->count(),
+          this->update_[param_id]->gpu_data(),
+          net_params[param_id]->mutable_gpu_diff());
+    }
 #else
     NO_GPU;
 #endif
     break;
-  }
   default:
     LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
   }
 }
 
 template <typename Dtype>
-void AdaGradSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
-  CHECK(Caffe::root_solver());
+void AdaGradSolver<Dtype>::ComputeUpdateValue() {
   const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
   const vector<float>& net_params_lr = this->net_->params_lr();
+  const vector<float>& net_params_weight_decay =
+      this->net_->params_weight_decay();
+  // get the learning rate
+  Dtype rate = this->GetLearningRate();
   Dtype delta = this->param_.delta();
+  if (this->param_.display() && this->iter_ % this->param_.display() == 0) {
+    LOG(INFO) << "Iteration " << this->iter_ << ", lr = " << rate;
+  }
+  SGDSolver<Dtype>::ClipGradients();
+  Dtype weight_decay = this->param_.weight_decay();
+  string regularization_type = this->param_.regularization_type();
   switch (Caffe::mode()) {
-  case Caffe::CPU: {
-    Dtype local_rate = rate * net_params_lr[param_id];
-
-    // compute square of gradient in update
-    caffe_powx(net_params[param_id]->count(),
-        net_params[param_id]->cpu_diff(), Dtype(2),
-        this->update_[param_id]->mutable_cpu_data());
-
-    // update history
-    caffe_add(net_params[param_id]->count(),
-        this->update_[param_id]->cpu_data(),
-        this->history_[param_id]->cpu_data(),
-        this->history_[param_id]->mutable_cpu_data());
-
-    // prepare update
-    caffe_powx(net_params[param_id]->count(),
-              this->history_[param_id]->cpu_data(), Dtype(0.5),
-              this->update_[param_id]->mutable_cpu_data());
-
-    caffe_add_scalar(net_params[param_id]->count(),
-              delta, this->update_[param_id]->mutable_cpu_data());
-
-    caffe_div(net_params[param_id]->count(),
-              net_params[param_id]->cpu_diff(),
-              this->update_[param_id]->cpu_data(),
-              this->update_[param_id]->mutable_cpu_data());
-
-    // scale and copy
-    caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
-        this->update_[param_id]->cpu_data(), Dtype(0),
-        net_params[param_id]->mutable_cpu_diff());
+  case Caffe::CPU:
+    for (int param_id = 0; param_id < net_params.size(); ++param_id) {
+      Dtype local_rate = rate * net_params_lr[param_id];
+      Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
+
+      if (local_decay) {
+        if (regularization_type == "L2") {
+          // add weight decay
+          caffe_axpy(net_params[param_id]->count(),
+              local_decay,
+              net_params[param_id]->cpu_data(),
+              net_params[param_id]->mutable_cpu_diff());
+        } else if (regularization_type == "L1") {
+          caffe_cpu_sign(net_params[param_id]->count(),
+              net_params[param_id]->cpu_data(),
+              this->temp_[param_id]->mutable_cpu_data());
+          caffe_axpy(net_params[param_id]->count(),
+              local_decay,
+              this->temp_[param_id]->cpu_data(),
+              net_params[param_id]->mutable_cpu_diff());
+        } else {
+          LOG(FATAL) << "Unknown regularization type: " << regularization_type;
+        }
+      }
+
+      // compute square of gradient in update
+      caffe_powx(net_params[param_id]->count(),
+          net_params[param_id]->cpu_diff(), Dtype(2),
+          this->update_[param_id]->mutable_cpu_data());
+
+      // update history
+      caffe_add(net_params[param_id]->count(),
+          this->update_[param_id]->cpu_data(),
+          this->history_[param_id]->cpu_data(),
+          this->history_[param_id]->mutable_cpu_data());
+
+      // prepare update
+      caffe_powx(net_params[param_id]->count(),
+                this->history_[param_id]->cpu_data(), Dtype(0.5),
+                this->update_[param_id]->mutable_cpu_data());
+
+      caffe_add_scalar(net_params[param_id]->count(),
+                delta, this->update_[param_id]->mutable_cpu_data());
+
+      caffe_div(net_params[param_id]->count(),
+                net_params[param_id]->cpu_diff(),
+                this->update_[param_id]->cpu_data(),
+                this->update_[param_id]->mutable_cpu_data());
+
+      // scale and copy
+      caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
+          this->update_[param_id]->cpu_data(), Dtype(0),
+          net_params[param_id]->mutable_cpu_diff());
+    }
     break;
-  }
-  case Caffe::GPU: {
+  case Caffe::GPU:
 #ifndef CPU_ONLY
-    Dtype local_rate = rate * net_params_lr[param_id];
-
-    // compute square of gradient in update
-    caffe_gpu_powx(net_params[param_id]->count(),
-        net_params[param_id]->gpu_diff(), Dtype(2),
-        this->update_[param_id]->mutable_gpu_data());
-
-    // update history
-    caffe_gpu_add(net_params[param_id]->count(),
-        this->update_[param_id]->gpu_data(),
-        this->history_[param_id]->gpu_data(),
-        this->history_[param_id]->mutable_gpu_data());
-
-    // prepare update
-    caffe_gpu_powx(net_params[param_id]->count(),
-              this->history_[param_id]->gpu_data(), Dtype(0.5),
-              this->update_[param_id]->mutable_gpu_data());
-
-    caffe_gpu_add_scalar(net_params[param_id]->count(),
-              delta, this->update_[param_id]->mutable_gpu_data());
-
-    caffe_gpu_div(net_params[param_id]->count(),
-              net_params[param_id]->gpu_diff(),
-              this->update_[param_id]->gpu_data(),
-              this->update_[param_id]->mutable_gpu_data());
-
-    // scale and copy
-    caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
-        this->update_[param_id]->gpu_data(), Dtype(0),
-        net_params[param_id]->mutable_gpu_diff());
+    for (int param_id = 0; param_id < net_params.size(); ++param_id) {
+      Dtype local_rate = rate * net_params_lr[param_id];
+      Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
+
+      if (local_decay) {
+        if (regularization_type == "L2") {
+          // add weight decay
+          caffe_gpu_axpy(net_params[param_id]->count(),
+              local_decay,
+              net_params[param_id]->gpu_data(),
+              net_params[param_id]->mutable_gpu_diff());
+        } else if (regularization_type == "L1") {
+          caffe_gpu_sign(net_params[param_id]->count(),
+              net_params[param_id]->gpu_data(),
+              this->temp_[param_id]->mutable_gpu_data());
+          caffe_gpu_axpy(net_params[param_id]->count(),
+              local_decay,
+              this->temp_[param_id]->gpu_data(),
+              net_params[param_id]->mutable_gpu_diff());
+        } else {
+          LOG(FATAL) << "Unknown regularization type: " << regularization_type;
+        }
+      }
+
+      // compute square of gradient in update
+      caffe_gpu_powx(net_params[param_id]->count(),
+          net_params[param_id]->gpu_diff(), Dtype(2),
+          this->update_[param_id]->mutable_gpu_data());
+
+      // update history
+      caffe_gpu_add(net_params[param_id]->count(),
+          this->update_[param_id]->gpu_data(),
+          this->history_[param_id]->gpu_data(),
+          this->history_[param_id]->mutable_gpu_data());
+
+      // prepare update
+      caffe_gpu_powx(net_params[param_id]->count(),
+                this->history_[param_id]->gpu_data(), Dtype(0.5),
+                this->update_[param_id]->mutable_gpu_data());
+
+      caffe_gpu_add_scalar(net_params[param_id]->count(),
+                delta, this->update_[param_id]->mutable_gpu_data());
+
+      caffe_gpu_div(net_params[param_id]->count(),
+                net_params[param_id]->gpu_diff(),
+                this->update_[param_id]->gpu_data(),
+                this->update_[param_id]->mutable_gpu_data());
+
+      // scale and copy
+      caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
+          this->update_[param_id]->gpu_data(), Dtype(0),
+          net_params[param_id]->mutable_gpu_diff());
+    }
 #else
     NO_GPU;
 #endif
     break;
-  }
   default:
     LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
   }
diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp
index 5ae11ea983b..7617ccfb27f 100644
--- a/src/caffe/syncedmem.cpp
+++ b/src/caffe/syncedmem.cpp
@@ -12,14 +12,8 @@ SyncedMemory::~SyncedMemory() {
   }
 
 #ifndef CPU_ONLY
-  if (gpu_ptr_ && own_gpu_data_) {
-    int initial_device;
-    cudaGetDevice(&initial_device);
-    if (gpu_device_ != -1) {
-      CUDA_CHECK(cudaSetDevice(gpu_device_));
-    }
-    MemoryHandler::freeGPU(gpu_ptr_);
-    cudaSetDevice(initial_device);
+  if (gpu_ptr_) {
+    CUDA_CHECK(cudaFree(gpu_ptr_));
   }
 #endif  // CPU_ONLY
 }
@@ -54,17 +48,13 @@ inline void SyncedMemory::to_gpu() {
 #ifndef CPU_ONLY
   switch (head_) {
   case UNINITIALIZED:
-    CUDA_CHECK(cudaGetDevice(&gpu_device_));
-    MemoryHandler::mallocGPU(&gpu_ptr_, size_);
+    CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
     caffe_gpu_memset(size_, 0, gpu_ptr_);
     head_ = HEAD_AT_GPU;
-    own_gpu_data_ = true;
     break;
   case HEAD_AT_CPU:
     if (gpu_ptr_ == NULL) {
-      CUDA_CHECK(cudaGetDevice(&gpu_device_));
-      MemoryHandler::mallocGPU(&gpu_ptr_, size_);
-      own_gpu_data_ = true;
+      CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
     }
     caffe_gpu_memcpy(size_, cpu_ptr_, gpu_ptr_);
     head_ = SYNCED;
@@ -102,26 +92,6 @@ const void* SyncedMemory::gpu_data() {
 #endif
 }
 
-void SyncedMemory::set_gpu_data(void* data) {
-#ifndef CPU_ONLY
-  CHECK(data);
-  if (own_gpu_data_) {
-    int initial_device;
-    cudaGetDevice(&initial_device);
-    if (gpu_device_ != -1) {
-      CUDA_CHECK(cudaSetDevice(gpu_device_));
-    }
-    MemoryHandler::freeGPU(gpu_ptr_);
-    cudaSetDevice(initial_device);
-  }
-  gpu_ptr_ = data;
-  head_ = HEAD_AT_GPU;
-  own_gpu_data_ = false;
-#else
-  NO_GPU;
-#endif
-}
-
 void* SyncedMemory::mutable_cpu_data() {
   to_cpu();
   head_ = HEAD_AT_CPU;
@@ -138,20 +108,6 @@ void* SyncedMemory::mutable_gpu_data() {
 #endif
 }
 
-#ifndef CPU_ONLY
-void SyncedMemory::async_gpu_push(const cudaStream_t& stream) {
-  CHECK(head_ == HEAD_AT_CPU);
-  if (gpu_ptr_ == NULL) {
-    CUDA_CHECK(cudaGetDevice(&gpu_device_));
-    MemoryHandler::mallocGPU(&gpu_ptr_, size_);
-    own_gpu_data_ = true;
-  }
-  const cudaMemcpyKind put = cudaMemcpyHostToDevice;
-  CUDA_CHECK(cudaMemcpyAsync(gpu_ptr_, cpu_ptr_, size_, put, stream));
-  // Assume caller will synchronize on the stream before use
-  head_ = SYNCED;
-}
-#endif
 
 }  // namespace caffe
 
diff --git a/src/caffe/test/test_convolution_layer.cpp b/src/caffe/test/test_convolution_layer.cpp
index a8a2c9b9342..67d41fff844 100644
--- a/src/caffe/test/test_convolution_layer.cpp
+++ b/src/caffe/test/test_convolution_layer.cpp
@@ -603,7 +603,6 @@ TYPED_TEST(CuDNNConvolutionLayerTest, TestSobelConvolutionCuDNN) {
     weights[i +  8] =  1;
   }
   layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
-  cudaDeviceSynchronize();
   layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
   // Compute Sobel G_x operator as separable 3 x 1 and 1 x 3 convolutions.
   // (1) the [1 2 1] column filter
@@ -631,7 +630,6 @@ TYPED_TEST(CuDNNConvolutionLayerTest, TestSobelConvolutionCuDNN) {
     weights_1[i +  2] = 1;
   }
   layer->SetUp(sep_blob_bottom_vec, sep_blob_top_vec);
-  cudaDeviceSynchronize();
   layer->Forward(sep_blob_bottom_vec, sep_blob_top_vec);
   // (2) the [-1 0 1] row filter
   blob_sep->CopyFrom(*this->blob_top_2_, false, true);
@@ -654,7 +652,6 @@ TYPED_TEST(CuDNNConvolutionLayerTest, TestSobelConvolutionCuDNN) {
     weights_2[i +  2] =  1;
   }
   layer->SetUp(sep_blob_bottom_vec, sep_blob_top_vec);
-  cudaDeviceSynchronize();
   layer->Forward(sep_blob_bottom_vec, sep_blob_top_vec);
   // Test equivalence of full and separable filters.
   const TypeParam* top_data = this->blob_top_->cpu_data();
diff --git a/src/caffe/test/test_internal_thread.cpp b/src/caffe/test/test_internal_thread.cpp
index 93f1cc541cd..31882b6db1d 100644
--- a/src/caffe/test/test_internal_thread.cpp
+++ b/src/caffe/test/test_internal_thread.cpp
@@ -2,7 +2,6 @@
 #include "gtest/gtest.h"
 
 #include "caffe/internal_thread.hpp"
-#include "caffe/util/math_functions.hpp"
 
 #include "caffe/test/test_caffe_main.hpp"
 
@@ -14,40 +13,11 @@ class InternalThreadTest : public ::testing::Test {};
 TEST_F(InternalThreadTest, TestStartAndExit) {
   InternalThread thread;
   EXPECT_FALSE(thread.is_started());
-  thread.StartInternalThread();
+  EXPECT_TRUE(thread.StartInternalThread());
   EXPECT_TRUE(thread.is_started());
-  thread.StopInternalThread();
+  EXPECT_TRUE(thread.WaitForInternalThreadToExit());
   EXPECT_FALSE(thread.is_started());
 }
 
-class TestThreadA : public InternalThread {
-  void InternalThreadEntry() {
-    EXPECT_EQ(4244559767, caffe_rng_rand());
-  }
-};
-
-class TestThreadB : public InternalThread {
-  void InternalThreadEntry() {
-    EXPECT_EQ(1726478280, caffe_rng_rand());
-  }
-};
-
-TEST_F(InternalThreadTest, TestRandomSeed) {
-  TestThreadA t1;
-  Caffe::set_random_seed(9658361);
-  t1.StartInternalThread();
-  t1.StopInternalThread();
-
-  TestThreadA t2;
-  Caffe::set_random_seed(9658361);
-  t2.StartInternalThread();
-  t2.StopInternalThread();
-
-  TestThreadB t3;
-  Caffe::set_random_seed(3435563);
-  t3.StartInternalThread();
-  t3.StopInternalThread();
-}
-
 }  // namespace caffe
 
diff --git a/src/caffe/test/test_layer_factory.cpp b/src/caffe/test/test_layer_factory.cpp
index c86fafd000c..efb1b37ac42 100644
--- a/src/caffe/test/test_layer_factory.cpp
+++ b/src/caffe/test/test_layer_factory.cpp
@@ -1,14 +1,11 @@
 #include <map>
 #include <string>
 
-#include "boost/scoped_ptr.hpp"
 #include "gtest/gtest.h"
 
 #include "caffe/common.hpp"
 #include "caffe/layer.hpp"
 #include "caffe/layer_factory.hpp"
-#include "caffe/util/db.hpp"
-#include "caffe/util/io.hpp"
 
 #include "caffe/test/test_caffe_main.hpp"
 
@@ -24,20 +21,11 @@ TYPED_TEST(LayerFactoryTest, TestCreateLayer) {
   typename LayerRegistry<Dtype>::CreatorRegistry& registry =
       LayerRegistry<Dtype>::Registry();
   shared_ptr<Layer<Dtype> > layer;
+  LayerParameter layer_param;
   for (typename LayerRegistry<Dtype>::CreatorRegistry::iterator iter =
        registry.begin(); iter != registry.end(); ++iter) {
     // Special case: PythonLayer is checked by pytest
     if (iter->first == "Python") { continue; }
-    LayerParameter layer_param;
-    // Data layers expect a DB
-    if (iter->first == "Data") {
-      string tmp;
-      MakeTempDir(&tmp);
-      boost::scoped_ptr<db::DB> db(db::GetDB(DataParameter_DB_LEVELDB));
-      db->Open(tmp, db::NEW);
-      db->Close();
-      layer_param.mutable_data_param()->set_source(tmp);
-    }
     layer_param.set_type(iter->first);
     layer = LayerRegistry<Dtype>::CreateLayer(layer_param);
     EXPECT_EQ(iter->first, layer->type());
diff --git a/src/caffe/test/test_lrn_layer.cpp b/src/caffe/test/test_lrn_layer.cpp
index 012d5990cd4..c4e2f8ea7f2 100644
--- a/src/caffe/test/test_lrn_layer.cpp
+++ b/src/caffe/test/test_lrn_layer.cpp
@@ -246,214 +246,5 @@ TYPED_TEST(LRNLayerTest, TestGradientWithinChannel) {
       this->blob_top_vec_);
 }
 
-#ifdef USE_CUDNN
-template <typename Dtype>
-class CuDNNLRNLayerTest : public GPUDeviceTest<Dtype> {
- protected:
-  CuDNNLRNLayerTest()
-      : epsilon_(Dtype(1e-5)),
-        blob_bottom_(new Blob<Dtype>()),
-        blob_top_(new Blob<Dtype>()) {}
-  virtual void SetUp() {
-    Caffe::set_random_seed(1701);
-    blob_bottom_->Reshape(2, 7, 3, 3);
-    // fill the values
-    FillerParameter filler_param;
-    GaussianFiller<Dtype> filler(filler_param);
-    filler.Fill(this->blob_bottom_);
-    blob_bottom_vec_.push_back(blob_bottom_);
-    blob_top_vec_.push_back(blob_top_);
-  }
-  virtual ~CuDNNLRNLayerTest() { delete blob_bottom_; delete blob_top_; }
-  void ReferenceLRNForward(const Blob<Dtype>& blob_bottom,
-      const LayerParameter& layer_param, Blob<Dtype>* blob_top);
-
-  Dtype epsilon_;
-  Blob<Dtype>* const blob_bottom_;
-  Blob<Dtype>* const blob_top_;
-  vector<Blob<Dtype>*> blob_bottom_vec_;
-  vector<Blob<Dtype>*> blob_top_vec_;
-};
-
-template <typename TypeParam>
-void CuDNNLRNLayerTest<TypeParam>::ReferenceLRNForward(
-    const Blob<TypeParam>& blob_bottom, const LayerParameter& layer_param,
-    Blob<TypeParam>* blob_top) {
-  typedef TypeParam Dtype;
-  blob_top->Reshape(blob_bottom.num(), blob_bottom.channels(),
-      blob_bottom.height(), blob_bottom.width());
-  Dtype* top_data = blob_top->mutable_cpu_data();
-  LRNParameter lrn_param = layer_param.lrn_param();
-  Dtype alpha = lrn_param.alpha();
-  Dtype beta = lrn_param.beta();
-  int size = lrn_param.local_size();
-  switch (lrn_param.norm_region()) {
-  case LRNParameter_NormRegion_ACROSS_CHANNELS:
-    for (int n = 0; n < blob_bottom.num(); ++n) {
-      for (int c = 0; c < blob_bottom.channels(); ++c) {
-        for (int h = 0; h < blob_bottom.height(); ++h) {
-          for (int w = 0; w < blob_bottom.width(); ++w) {
-            int c_start = c - (size - 1) / 2;
-            int c_end = min(c_start + size, blob_bottom.channels());
-            c_start = max(c_start, 0);
-            Dtype scale = 1.;
-            for (int i = c_start; i < c_end; ++i) {
-              Dtype value = blob_bottom.data_at(n, i, h, w);
-              scale += value * value * alpha / size;
-            }
-            *(top_data + blob_top->offset(n, c, h, w)) =
-              blob_bottom.data_at(n, c, h, w) / pow(scale, beta);
-          }
-        }
-      }
-    }
-    break;
-  case LRNParameter_NormRegion_WITHIN_CHANNEL:
-    for (int n = 0; n < blob_bottom.num(); ++n) {
-      for (int c = 0; c < blob_bottom.channels(); ++c) {
-        for (int h = 0; h < blob_bottom.height(); ++h) {
-          int h_start = h - (size - 1) / 2;
-          int h_end = min(h_start + size, blob_bottom.height());
-          h_start = max(h_start, 0);
-          for (int w = 0; w < blob_bottom.width(); ++w) {
-            Dtype scale = 1.;
-            int w_start = w - (size - 1) / 2;
-            int w_end = min(w_start + size, blob_bottom.width());
-            w_start = max(w_start, 0);
-            for (int nh = h_start; nh < h_end; ++nh) {
-              for (int nw = w_start; nw < w_end; ++nw) {
-                Dtype value = blob_bottom.data_at(n, c, nh, nw);
-                scale += value * value * alpha / (size * size);
-              }
-            }
-            *(top_data + blob_top->offset(n, c, h, w)) =
-              blob_bottom.data_at(n, c, h, w) / pow(scale, beta);
-          }
-        }
-      }
-    }
-    break;
-  default:
-    LOG(FATAL) << "Unknown normalization region.";
-  }
-}
-
-TYPED_TEST_CASE(CuDNNLRNLayerTest, TestDtypes);
-
-TYPED_TEST(CuDNNLRNLayerTest, TestForwardAcrossChannelsCuDNN) {
-  // typedef typename TypeParam::Dtype Dtype;
-  Caffe::set_mode(Caffe::GPU);
-  LayerParameter layer_param;
-  CuDNNLRNLayer<TypeParam> layer(layer_param);
-  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
-  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
-  Blob<TypeParam> top_reference;
-  this->ReferenceLRNForward(*(this->blob_bottom_), layer_param,
-      &top_reference);
-  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
-    EXPECT_NEAR(this->blob_top_->cpu_data()[i], top_reference.cpu_data()[i],
-                this->epsilon_);
-  }
-}
-
-TYPED_TEST(CuDNNLRNLayerTest, TestForwardAcrossChannelsLargeRegionCuDNN) {
-  Caffe::set_mode(Caffe::GPU);
-  typedef TypeParam Dtype;
-  LayerParameter layer_param;
-  layer_param.mutable_lrn_param()->set_local_size(15);
-  CuDNNLRNLayer<Dtype> layer(layer_param);
-  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
-  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
-  Blob<Dtype> top_reference;
-  this->ReferenceLRNForward(*(this->blob_bottom_), layer_param,
-      &top_reference);
-  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
-    EXPECT_NEAR(this->blob_top_->cpu_data()[i], top_reference.cpu_data()[i],
-                this->epsilon_);
-  }
-}
-
-TYPED_TEST(CuDNNLRNLayerTest, TestGradientAcrossChannelsCuDNN) {
-  typedef TypeParam Dtype;
-  LayerParameter layer_param;
-  CuDNNLRNLayer<Dtype> layer(layer_param);
-  GradientChecker<Dtype> checker(1e-2, 1e-2);
-  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
-  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
-  for (int i = 0; i < this->blob_top_->count(); ++i) {
-    this->blob_top_->mutable_cpu_diff()[i] = 1.;
-  }
-  vector<bool> propagate_down(this->blob_bottom_vec_.size(), true);
-  layer.Backward(this->blob_top_vec_, propagate_down,
-                 this->blob_bottom_vec_);
-  // for (int i = 0; i < this->blob_bottom_->count(); ++i) {
-  //   std::cout << "CPU diff " << this->blob_bottom_->cpu_diff()[i]
-  //       << std::endl;
-  // }
-  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
-      this->blob_top_vec_);
-}
-
-/*
-TYPED_TEST(CuDNNLRNLayerTest, TestForwardWithinChannel) {
-  typedef TypeParam Dtype;
-  LayerParameter layer_param;
-  layer_param.mutable_lrn_param()->set_norm_region(
-      LRNParameter_NormRegion_WITHIN_CHANNEL);
-  layer_param.mutable_lrn_param()->set_local_size(3);
-  CuDNNLCNLayer<Dtype> layer(layer_param);
-  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
-  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
-  Blob<Dtype> top_reference;
-  this->ReferenceLRNForward(*(this->blob_bottom_), layer_param,
-      &top_reference);
-  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
-    EXPECT_NEAR(this->blob_top_->cpu_data()[i], top_reference.cpu_data()[i],
-                this->epsilon_);
-  }
-}
-
-TYPED_TEST(CuDNNLRNLayerTest, TestGradientWithinChannel) {
-  Caffe::set_mode(Caffe::GPU);
-  typedef TypeParam Dtype;
-  LayerParameter layer_param;
-  layer_param.mutable_lrn_param()->set_norm_region(
-      LRNParameter_NormRegion_WITHIN_CHANNEL);
-  layer_param.mutable_lrn_param()->set_local_size(3);
-  CuDNNLCNLayer<Dtype> layer(layer_param);
-  GradientChecker<Dtype> checker(1e-2, 1e-2);
-  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
-  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
-  for (int i = 0; i < this->blob_top_->count(); ++i) {
-    this->blob_top_->mutable_cpu_diff()[i] = 1.;
-  }
-  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
-      this->blob_top_vec_);
-}
-*/
-
-TYPED_TEST(CuDNNLRNLayerTest, TestGradientAcrossChannelsLargeRegionCuDNN) {
-  typedef TypeParam Dtype;
-  LayerParameter layer_param;
-  layer_param.mutable_lrn_param()->set_local_size(15);
-  CuDNNLRNLayer<Dtype> layer(layer_param);
-  GradientChecker<Dtype> checker(1e-2, 1e-2);
-  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
-  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
-  for (int i = 0; i < this->blob_top_->count(); ++i) {
-    this->blob_top_->mutable_cpu_diff()[i] = 1.;
-  }
-  vector<bool> propagate_down(this->blob_bottom_vec_.size(), true);
-  layer.Backward(this->blob_top_vec_, propagate_down,
-                 this->blob_bottom_vec_);
-  // for (int i = 0; i < this->blob_bottom_->count(); ++i) {
-  //   std::cout << "CPU diff " << this->blob_bottom_->cpu_diff()[i]
-  //       << std::endl;
-  // }
-  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
-      this->blob_top_vec_);
-}
-
-#endif
 
 }  // namespace caffe
diff --git a/src/caffe/test/test_upgrade_proto.cpp b/src/caffe/test/test_upgrade_proto.cpp
index 006720231a5..eec627656ef 100644
--- a/src/caffe/test/test_upgrade_proto.cpp
+++ b/src/caffe/test/test_upgrade_proto.cpp
@@ -2,15 +2,12 @@
 #include <string>
 #include <vector>
 
-#include "boost/scoped_ptr.hpp"
 #include "google/protobuf/text_format.h"
 #include "gtest/gtest.h"
 
 #include "caffe/blob.hpp"
 #include "caffe/common.hpp"
 #include "caffe/layer.hpp"
-#include "caffe/util/db.hpp"
-#include "caffe/util/io.hpp"
 #include "caffe/util/upgrade_proto.hpp"
 
 #include "caffe/test/test_caffe_main.hpp"
@@ -2904,15 +2901,6 @@ TEST_F(NetUpgradeTest, TestUpgradeV1LayerType) {
       continue;  // Empty string isn't actually a valid layer type.
     }
     layer_param.set_type(v2_layer_type);
-    // Data layers expect a DB
-    if (v2_layer_type == "Data") {
-      string tmp;
-      MakeTempDir(&tmp);
-      boost::scoped_ptr<db::DB> db(db::GetDB(DataParameter_DB_LEVELDB));
-      db->Open(tmp, db::NEW);
-      db->Close();
-      layer_param.mutable_data_param()->set_source(tmp);
-    }
     layer = LayerRegistry<float>::CreateLayer(layer_param);
     EXPECT_EQ(v2_layer_type, layer->type());
   }
diff --git a/src/caffe/util/blocking_queue.cpp b/src/caffe/util/blocking_queue.cpp
deleted file mode 100644
index 8a0e9306f18..00000000000
--- a/src/caffe/util/blocking_queue.cpp
+++ /dev/null
@@ -1,96 +0,0 @@
-#include <boost/thread.hpp>
-#include <string>
-
-#include "caffe/data_layers.hpp"
-#include "caffe/data_reader.hpp"
-#include "caffe/parallel.hpp"
-#include "caffe/util/blocking_queue.hpp"
-
-namespace caffe {
-
-template<typename T>
-class BlockingQueue<T>::sync {
- public:
-  mutable boost::mutex mutex_;
-  boost::condition_variable condition_;
-};
-
-template<typename T>
-BlockingQueue<T>::BlockingQueue()
-    : sync_(new sync()) {
-}
-
-template<typename T>
-void BlockingQueue<T>::push(const T& t) {
-  boost::mutex::scoped_lock lock(sync_->mutex_);
-  queue_.push(t);
-  lock.unlock();
-  sync_->condition_.notify_one();
-}
-
-template<typename T>
-bool BlockingQueue<T>::try_pop(T* t) {
-  boost::mutex::scoped_lock lock(sync_->mutex_);
-
-  if (queue_.empty()) {
-    return false;
-  }
-
-  *t = queue_.front();
-  queue_.pop();
-  return true;
-}
-
-template<typename T>
-T BlockingQueue<T>::pop(const string& log_on_wait) {
-  boost::mutex::scoped_lock lock(sync_->mutex_);
-
-  while (queue_.empty()) {
-    if (!log_on_wait.empty()) {
-      LOG(INFO)<< log_on_wait;
-    }
-    sync_->condition_.wait(lock);
-  }
-
-  T t = queue_.front();
-  queue_.pop();
-  return t;
-}
-
-template<typename T>
-bool BlockingQueue<T>::try_peek(T* t) {
-  boost::mutex::scoped_lock lock(sync_->mutex_);
-
-  if (queue_.empty()) {
-    return false;
-  }
-
-  *t = queue_.front();
-  return true;
-}
-
-template<typename T>
-T BlockingQueue<T>::peek() {
-  boost::mutex::scoped_lock lock(sync_->mutex_);
-
-  while (queue_.empty()) {
-    sync_->condition_.wait(lock);
-  }
-
-  return queue_.front();
-}
-
-template<typename T>
-size_t BlockingQueue<T>::size() const {
-  boost::mutex::scoped_lock lock(sync_->mutex_);
-  return queue_.size();
-}
-
-template class BlockingQueue<Batch<float>*>;
-template class BlockingQueue<Batch<double>*>;
-template class BlockingQueue<Datum*>;
-template class BlockingQueue<shared_ptr<DataReader::QueuePair> >;
-template class BlockingQueue<P2PSync<float>*>;
-template class BlockingQueue<P2PSync<double>*>;
-
-}  // namespace caffe
diff --git a/tools/caffe.cpp b/tools/caffe.cpp
index e9c5a01ccbc..0b7523fccf9 100644
--- a/tools/caffe.cpp
+++ b/tools/caffe.cpp
@@ -12,19 +12,13 @@ using caffe::Blob;
 using caffe::Caffe;
 using caffe::Net;
 using caffe::Layer;
-using caffe::Solver;
 using caffe::shared_ptr;
-using caffe::string;
 using caffe::Timer;
 using caffe::vector;
-using std::ostringstream;
-using caffe::MemoryHandlerActivator;
+
 
 DEFINE_int32(gpu, -1,
-    "Run in GPU mode on given device ID (Legacy switch, use -gpus).");
-DEFINE_string(gpus, "",
-    "Run in GPU mode on given device IDs separated by ','."
-    "Use '-gpus all' to run on all available GPUs.");
+    "Run in GPU mode on given device ID.");
 DEFINE_string(solver, "",
     "The solver definition protocol buffer text file.");
 DEFINE_string(model, "",
@@ -32,8 +26,8 @@ DEFINE_string(model, "",
 DEFINE_string(snapshot, "",
     "Optional; the snapshot solver state to resume training.");
 DEFINE_string(weights, "",
-    "Optional; the pretrained weights to initialize finetuning, "
-    "separated by ','. Cannot be set simultaneously with snapshot.");
+    "Optional; the pretrained weights to initialize finetuning. "
+    "Cannot be set simultaneously with snapshot.");
 DEFINE_int32(iterations, 50,
     "The number of iterations to run.");
 
@@ -67,32 +61,6 @@ static BrewFunction GetBrewFunction(const caffe::string& name) {
   }
 }
 
-// Parse GPU ids or use all available devices
-static void get_gpus(vector<int>* gpus) {
-  if (FLAGS_gpu >= 0) {
-    FLAGS_gpus = "" + boost::lexical_cast<string>(FLAGS_gpu);
-  }
-  if (FLAGS_gpus == "all") {
-    int count = 0;
-#ifndef CPU_ONLY
-    CUDA_CHECK(cudaGetDeviceCount(&count));
-#else
-    NO_GPU;
-#endif
-    for (int i = 0; i < count; ++i) {
-      gpus->push_back(i);
-    }
-  } else if (FLAGS_gpus.size()) {
-    vector<string> strings;
-    boost::split(strings, FLAGS_gpus, boost::is_any_of(","));
-    for (int i = 0; i < strings.size(); ++i) {
-      gpus->push_back(boost::lexical_cast<int>(strings[i]));
-    }
-  } else {
-    CHECK_EQ(gpus->size(), 0);
-  }
-}
-
 // caffe commands to call by
 //     caffe <command> <args>
 //
@@ -101,13 +69,10 @@ static void get_gpus(vector<int>* gpus) {
 
 // Device Query: show diagnostic information for a GPU device.
 int device_query() {
-  LOG(INFO) << "Querying GPUs " << FLAGS_gpus;
-  vector<int> gpus;
-  get_gpus(&gpus);
-  for (int i = 0; i < gpus.size(); ++i) {
-    caffe::Caffe::SetDevice(gpus[i]);
-    caffe::Caffe::DeviceQuery();
-  }
+  CHECK_GT(FLAGS_gpu, -1) << "Need a device ID to query.";
+  LOG(INFO) << "Querying device ID = " << FLAGS_gpu;
+  caffe::Caffe::SetDevice(FLAGS_gpu);
+  caffe::Caffe::DeviceQuery();
   return 0;
 }
 RegisterBrewFunction(device_query);
@@ -136,52 +101,37 @@ int train() {
   caffe::SolverParameter solver_param;
   caffe::ReadProtoFromTextFileOrDie(FLAGS_solver, &solver_param);
 
-  // If the gpus flag is not provided, allow the mode and device to be set
+  // If the gpu flag is not provided, allow the mode and device to be set
   // in the solver prototxt.
-  if (FLAGS_gpu < 0 && FLAGS_gpus.size() == 0
-      && solver_param.solver_mode() == caffe::SolverParameter_SolverMode_GPU
-      && solver_param.has_device_id()) {
-    FLAGS_gpus = "" + boost::lexical_cast<string>(solver_param.device_id());
+  if (FLAGS_gpu < 0
+      && solver_param.solver_mode() == caffe::SolverParameter_SolverMode_GPU) {
+    FLAGS_gpu = solver_param.device_id();
   }
 
-  vector<int> gpus;
-  get_gpus(&gpus);
-  if (gpus.size() == 0) {
-    Caffe::set_mode(Caffe::CPU);
-  } else {
-    ostringstream s;
-    for (int i = 0; i < gpus.size(); ++i) {
-      s << (i ? ", " : "") << gpus[i];
-    }
-    LOG(INFO) << "Using GPUs " << s.str();
-
-    solver_param.set_device_id(gpus[0]);
-    Caffe::SetDevice(gpus[0]);
+  // Set device id and mode
+  if (FLAGS_gpu >= 0) {
+    LOG(INFO) << "Use GPU with device ID " << FLAGS_gpu;
+    Caffe::SetDevice(FLAGS_gpu);
     Caffe::set_mode(Caffe::GPU);
-    Caffe::set_solver_count(gpus.size());
+  } else {
+    LOG(INFO) << "Use CPU.";
+    Caffe::set_mode(Caffe::CPU);
   }
-#ifdef USE_CNMEM
-  MemoryHandlerActivator handler(gpus);
-#endif
 
-  shared_ptr<Solver<float> > solver(caffe::GetSolver<float>(solver_param));
+  LOG(INFO) << "Starting Optimization";
+  shared_ptr<caffe::Solver<float> >
+    solver(caffe::GetSolver<float>(solver_param));
 
   if (FLAGS_snapshot.size()) {
     LOG(INFO) << "Resuming from " << FLAGS_snapshot;
-    solver->Restore(FLAGS_snapshot.c_str());
+    solver->Solve(FLAGS_snapshot);
   } else if (FLAGS_weights.size()) {
-    CopyLayers(solver.get(), FLAGS_weights);
-  }
-
-  if (gpus.size() > 1) {
-    caffe::P2PSync<float>::run(solver, gpus);
+    CopyLayers(&*solver, FLAGS_weights);
+    solver->Solve();
   } else {
-    LOG(INFO) << "Starting Optimization";
     solver->Solve();
   }
   LOG(INFO) << "Optimization Done.";
-
-  // solver.reset();
   return 0;
 }
 RegisterBrewFunction(train);
@@ -193,11 +143,9 @@ int test() {
   CHECK_GT(FLAGS_weights.size(), 0) << "Need model weights to score.";
 
   // Set device id and mode
-  vector<int> gpus;
-  get_gpus(&gpus);
-  if (gpus.size() != 0) {
-    LOG(INFO) << "Use GPU with device ID " << gpus[0];
-    Caffe::SetDevice(gpus[0]);
+  if (FLAGS_gpu >= 0) {
+    LOG(INFO) << "Use GPU with device ID " << FLAGS_gpu;
+    Caffe::SetDevice(FLAGS_gpu);
     Caffe::set_mode(Caffe::GPU);
   } else {
     LOG(INFO) << "Use CPU.";
@@ -260,11 +208,9 @@ int time() {
   CHECK_GT(FLAGS_model.size(), 0) << "Need a model definition to time.";
 
   // Set device id and mode
-  vector<int> gpus;
-  get_gpus(&gpus);
-  if (gpus.size() != 0) {
-    LOG(INFO) << "Use GPU with device ID " << gpus[0];
-    Caffe::SetDevice(gpus[0]);
+  if (FLAGS_gpu >= 0) {
+    LOG(INFO) << "Use GPU with device ID " << FLAGS_gpu;
+    Caffe::SetDevice(FLAGS_gpu);
     Caffe::set_mode(Caffe::GPU);
   } else {
     LOG(INFO) << "Use CPU.";