From 1beb9ac4b5dab1cd446ff06522f8689333596dc3 Mon Sep 17 00:00:00 2001 From: Luke Yeager Date: Thu, 17 Dec 2015 11:01:40 -0800 Subject: [PATCH] Revert caffe-0.13 branch --- CMakeLists.txt | 5 - Makefile | 46 +- Makefile.config.example | 16 +- cmake/ConfigGen.cmake | 6 - cmake/Cuda.cmake | 35 - cmake/Summary.cmake | 7 +- cmake/Targets.cmake | 5 +- cmake/Templates/CaffeConfig.cmake.in | 7 +- cmake/Templates/caffe_config.h.in | 4 - include/caffe/caffe.hpp | 1 - include/caffe/common.hpp | 92 +-- include/caffe/common_layers.hpp | 1 + include/caffe/data_layers.hpp | 39 +- include/caffe/data_reader.hpp | 82 --- include/caffe/internal_thread.hpp | 25 +- include/caffe/layer_factory.hpp | 4 +- include/caffe/neuron_layers.hpp | 3 + include/caffe/parallel.hpp | 123 ---- include/caffe/solver.hpp | 43 +- include/caffe/syncedmem.hpp | 42 +- .../caffe/test/test_gradient_check_util.hpp | 5 +- include/caffe/util/blocking_queue.hpp | 47 -- include/caffe/util/device_alternate.hpp | 11 - include/caffe/vision_layers.hpp | 75 +-- python/caffe/draw.py | 2 +- scripts/travis/travis_install.sh | 2 +- .../travis/travis_setup_makefile_config.sh | 1 - src/caffe/CMakeLists.txt | 1 - src/caffe/common.cpp | 147 +---- src/caffe/data_reader.cpp | 119 ---- src/caffe/data_transformer.cpp | 4 +- src/caffe/internal_thread.cpp | 67 +- src/caffe/layer_factory.cpp | 45 +- src/caffe/layers/absval_layer.cu | 1 + src/caffe/layers/base_data_layer.cpp | 91 +-- src/caffe/layers/base_data_layer.cu | 15 +- src/caffe/layers/cudnn_conv_layer.cpp | 206 +----- src/caffe/layers/cudnn_conv_layer.cu | 90 +-- src/caffe/layers/cudnn_lcn_layer.cpp | 82 --- src/caffe/layers/cudnn_lcn_layer.cu | 74 --- src/caffe/layers/cudnn_lrn_layer.cpp | 57 -- src/caffe/layers/cudnn_lrn_layer.cu | 48 -- src/caffe/layers/cudnn_pooling_layer.cpp | 2 + src/caffe/layers/cudnn_pooling_layer.cu | 4 +- src/caffe/layers/cudnn_relu_layer.cpp | 2 + src/caffe/layers/cudnn_relu_layer.cu | 4 +- src/caffe/layers/cudnn_sigmoid_layer.cpp | 2 + src/caffe/layers/cudnn_sigmoid_layer.cu | 4 +- src/caffe/layers/cudnn_softmax_layer.cpp | 2 + src/caffe/layers/cudnn_softmax_layer.cu | 5 +- src/caffe/layers/cudnn_tanh_layer.cpp | 2 + src/caffe/layers/cudnn_tanh_layer.cu | 4 +- src/caffe/layers/data_layer.cpp | 112 ++-- src/caffe/layers/dropout_layer.cu | 2 +- src/caffe/layers/image_data_layer.cpp | 27 +- src/caffe/layers/lrn_layer.cpp | 2 +- src/caffe/layers/window_data_layer.cpp | 20 +- src/caffe/net.cpp | 191 ++---- src/caffe/parallel.cpp | 526 ---------------- src/caffe/proto/caffe.proto | 32 - src/caffe/solver.cpp | 596 +++++++++--------- src/caffe/syncedmem.cpp | 52 +- src/caffe/test/test_convolution_layer.cpp | 3 - src/caffe/test/test_internal_thread.cpp | 34 +- src/caffe/test/test_layer_factory.cpp | 14 +- src/caffe/test/test_lrn_layer.cpp | 209 ------ src/caffe/test/test_upgrade_proto.cpp | 12 - src/caffe/util/blocking_queue.cpp | 96 --- tools/caffe.cpp | 116 +--- 69 files changed, 723 insertions(+), 3128 deletions(-) delete mode 100644 include/caffe/data_reader.hpp delete mode 100644 include/caffe/parallel.hpp delete mode 100644 include/caffe/util/blocking_queue.hpp delete mode 100644 src/caffe/data_reader.cpp delete mode 100644 src/caffe/layers/cudnn_lcn_layer.cpp delete mode 100644 src/caffe/layers/cudnn_lcn_layer.cu delete mode 100644 src/caffe/layers/cudnn_lrn_layer.cpp delete mode 100644 src/caffe/layers/cudnn_lrn_layer.cu delete mode 100644 src/caffe/parallel.cpp delete mode 100644 src/caffe/util/blocking_queue.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 2b215da4dc3..74fa70c9d20 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,10 +3,6 @@ cmake_minimum_required(VERSION 2.8.7) # ---[ Caffe project project(Caffe C CXX) -# ---[ Caffe version -set(CAFFE_TARGET_VERSION "0.13.2") -set(CAFFE_TARGET_SOVERSION "0.13") - # ---[ Using cmake scripts and modules list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules) @@ -19,7 +15,6 @@ include(cmake/ConfigGen.cmake) # ---[ Options caffe_option(CPU_ONLY "Build Caffe wihtout CUDA support" OFF) # TODO: rename to USE_CUDA caffe_option(USE_CUDNN "Build Caffe with cuDNN libary support" ON IF NOT CPU_ONLY) -caffe_option(USE_CNMEM "Build Caffe with CNMeM memory pool support" OFF) caffe_option(BUILD_SHARED_LIBS "Build shared libraries" ON) caffe_option(BUILD_python "Build Python wrapper" ON) set(python_version "2" CACHE STRING "Specify which python version to use") diff --git a/Makefile b/Makefile index c0382e3c1df..d2e5e5720ed 100644 --- a/Makefile +++ b/Makefile @@ -29,16 +29,9 @@ SRC_DIRS := $(shell find * -type d -exec bash -c "find {} -maxdepth 1 \ \( -name '*.cpp' -o -name '*.proto' \) | grep -q ." \; -print) # The target shared library name -LIBRARY_NAME := $(PROJECT)$(LIBRARY_NAME_SUFFIX) LIB_BUILD_DIR := $(BUILD_DIR)/lib -STATIC_NAME := $(LIB_BUILD_DIR)/lib$(LIBRARY_NAME).a -DYNAMIC_VERSION_MAJOR := 0 -DYNAMIC_VERSION_MINOR := 13 -DYNAMIC_VERSION_REVISION := 2 -DYNAMIC_NAME_SHORT := lib$(LIBRARY_NAME).so -DYNAMIC_SONAME_SHORT := $(DYNAMIC_NAME_SHORT).$(DYNAMIC_VERSION_MAJOR).$(DYNAMIC_VERSION_MINOR) -DYNAMIC_VERSIONED_NAME_SHORT := $(DYNAMIC_NAME_SHORT).$(DYNAMIC_VERSION_MAJOR).$(DYNAMIC_VERSION_MINOR).$(DYNAMIC_VERSION_REVISION) -DYNAMIC_NAME := $(LIB_BUILD_DIR)/$(DYNAMIC_VERSIONED_NAME_SHORT) +STATIC_NAME := $(LIB_BUILD_DIR)/lib$(PROJECT).a +DYNAMIC_NAME := $(LIB_BUILD_DIR)/lib$(PROJECT).so ############################## # Get all source files @@ -176,7 +169,6 @@ ifneq ($(CPU_ONLY), 1) LIBRARY_DIRS += $(CUDA_LIB_DIR) LIBRARIES := cudart cublas curand endif - LIBRARIES += glog gflags protobuf leveldb snappy \ lmdb boost_system hdf5_hl hdf5 m \ opencv_core opencv_highgui opencv_imgproc @@ -242,7 +234,6 @@ ifeq ($(LINUX), 1) # boost::thread is reasonably called boost_thread (compare OS X) # We will also explicitly add stdc++ to the link target. LIBRARIES += boost_thread stdc++ - VERSIONFLAGS += -Wl,-soname,$(DYNAMIC_SONAME_SHORT) -Wl,-rpath,$(ORIGIN)/../lib endif # OS X: @@ -266,7 +257,6 @@ ifeq ($(OSX), 1) # we need to explicitly ask for the rpath to be obeyed DYNAMIC_FLAGS := -install_name @rpath/libcaffe.so ORIGIN := @loader_path - VERSIONFLAGS += -Wl,-install_name,$(DYNAMIC_SONAME_SHORT) -Wl,-rpath,$(ORIGIN)/../../build/lib else ORIGIN := \$$ORIGIN endif @@ -300,12 +290,6 @@ ifeq ($(USE_CUDNN), 1) COMMON_FLAGS += -DUSE_CUDNN endif -# cuMEM integration -ifeq ($(USE_CNMEM), 1) - LIBRARIES += cnmem - COMMON_FLAGS += -DUSE_CNMEM -endif - # CPU-only configuration ifeq ($(CPU_ONLY), 1) OBJS := $(PROTO_OBJS) $(CXX_OBJS) @@ -316,14 +300,6 @@ ifeq ($(CPU_ONLY), 1) COMMON_FLAGS += -DCPU_ONLY endif -# Benchmarks -ifeq ($(BENCHMARK_DATA), 1) - COMMON_FLAGS += -DBENCHMARK_DATA -endif -ifeq ($(BENCHMARK_SOLVER), 1) - COMMON_FLAGS += -DBENCHMARK_SOLVER -endif - # Python layer support ifeq ($(WITH_PYTHON_LAYER), 1) COMMON_FLAGS += -DWITH_PYTHON_LAYER @@ -466,7 +442,7 @@ py: $(PY$(PROJECT)_SO) $(PROTO_GEN_PY) $(PY$(PROJECT)_SO): $(PY$(PROJECT)_SRC) $(PY$(PROJECT)_HXX) | $(DYNAMIC_NAME) @ echo CXX/LD -o $@ $< $(Q)$(CXX) -shared -o $@ $(PY$(PROJECT)_SRC) \ - -o $@ $(LINKFLAGS) -l$(LIBRARY_NAME) $(PYTHON_LDFLAGS) \ + -o $@ $(LINKFLAGS) -l$(PROJECT) $(PYTHON_LDFLAGS) \ -Wl,-rpath,$(ORIGIN)/../../build/lib mat$(PROJECT): mat @@ -524,9 +500,7 @@ $(ALL_BUILD_DIRS): | $(BUILD_DIR_LINK) $(DYNAMIC_NAME): $(OBJS) | $(LIB_BUILD_DIR) @ echo LD -o $@ - $(Q)$(CXX) -shared -o $@ $(OBJS) $(VERSIONFLAGS) $(LINKFLAGS) $(LDFLAGS) $(DYNAMIC_FLAGS) - @ cd $(BUILD_DIR)/lib; rm -f $(DYNAMIC_SONAME_SHORT); ln -s $(DYNAMIC_VERSIONED_NAME_SHORT) $(DYNAMIC_SONAME_SHORT) - @ cd $(BUILD_DIR)/lib; rm -f $(DYNAMIC_NAME_SHORT); ln -s $(DYNAMIC_SONAME_SHORT) $(DYNAMIC_NAME_SHORT) + $(Q)$(CXX) -shared -o $@ $(OBJS) $(LINKFLAGS) $(LDFLAGS) $(DYNAMIC_FLAGS) $(STATIC_NAME): $(OBJS) | $(LIB_BUILD_DIR) @ echo AR -o $@ @@ -557,19 +531,19 @@ $(TEST_ALL_BIN): $(TEST_MAIN_SRC) $(TEST_OBJS) $(GTEST_OBJ) \ | $(DYNAMIC_NAME) $(TEST_BIN_DIR) @ echo CXX/LD -o $@ $< $(Q)$(CXX) $(TEST_MAIN_SRC) $(TEST_OBJS) $(GTEST_OBJ) \ - -o $@ $(LINKFLAGS) $(LDFLAGS) -l$(LIBRARY_NAME) -Wl,-rpath,$(ORIGIN)/../lib + -o $@ $(LINKFLAGS) $(LDFLAGS) -l$(PROJECT) -Wl,-rpath,$(ORIGIN)/../lib $(TEST_CU_BINS): $(TEST_BIN_DIR)/%.testbin: $(TEST_CU_BUILD_DIR)/%.o \ $(GTEST_OBJ) | $(DYNAMIC_NAME) $(TEST_BIN_DIR) @ echo LD $< $(Q)$(CXX) $(TEST_MAIN_SRC) $< $(GTEST_OBJ) \ - -o $@ $(LINKFLAGS) $(LDFLAGS) -l$(LIBRARY_NAME) -Wl,-rpath,$(ORIGIN)/../lib + -o $@ $(LINKFLAGS) $(LDFLAGS) -l$(PROJECT) -Wl,-rpath,$(ORIGIN)/../lib $(TEST_CXX_BINS): $(TEST_BIN_DIR)/%.testbin: $(TEST_CXX_BUILD_DIR)/%.o \ $(GTEST_OBJ) | $(DYNAMIC_NAME) $(TEST_BIN_DIR) @ echo LD $< $(Q)$(CXX) $(TEST_MAIN_SRC) $< $(GTEST_OBJ) \ - -o $@ $(LINKFLAGS) $(LDFLAGS) -l$(LIBRARY_NAME) -Wl,-rpath,$(ORIGIN)/../lib + -o $@ $(LINKFLAGS) $(LDFLAGS) -l$(PROJECT) -Wl,-rpath,$(ORIGIN)/../lib # Target for extension-less symlinks to tool binaries with extension '*.bin'. $(TOOL_BUILD_DIR)/%: $(TOOL_BUILD_DIR)/%.bin | $(TOOL_BUILD_DIR) @@ -578,12 +552,12 @@ $(TOOL_BUILD_DIR)/%: $(TOOL_BUILD_DIR)/%.bin | $(TOOL_BUILD_DIR) $(TOOL_BINS): %.bin : %.o | $(DYNAMIC_NAME) @ echo CXX/LD -o $@ - $(Q)$(CXX) $< -o $@ $(LINKFLAGS) -l$(LIBRARY_NAME) $(LDFLAGS) \ + $(Q)$(CXX) $< -o $@ $(LINKFLAGS) -l$(PROJECT) $(LDFLAGS) \ -Wl,-rpath,$(ORIGIN)/../lib $(EXAMPLE_BINS): %.bin : %.o | $(DYNAMIC_NAME) @ echo CXX/LD -o $@ - $(Q)$(CXX) $< -o $@ $(LINKFLAGS) -l$(LIBRARY_NAME) $(LDFLAGS) \ + $(Q)$(CXX) $< -o $@ $(LINKFLAGS) -l$(PROJECT) $(LDFLAGS) \ -Wl,-rpath,$(ORIGIN)/../../lib proto: $(PROTO_GEN_CC) $(PROTO_GEN_HEADER) @@ -645,8 +619,6 @@ $(DISTRIBUTE_DIR): all py | $(DISTRIBUTE_SUBDIRS) # add libraries cp $(STATIC_NAME) $(DISTRIBUTE_DIR)/lib cp $(DYNAMIC_NAME) $(DISTRIBUTE_DIR)/lib - cd $(DISTRIBUTE_DIR)/lib; rm -f $(DYNAMIC_SONAME_SHORT); ln -s $(DYNAMIC_VERSIONED_NAME_SHORT) $(DYNAMIC_SONAME_SHORT) - cd $(DISTRIBUTE_DIR)/lib; rm -f $(DYNAMIC_NAME_SHORT); ln -s $(DYNAMIC_SONAME_SHORT) $(DYNAMIC_NAME_SHORT) # add python - it's not the standard way, indeed... cp -r python $(DISTRIBUTE_DIR)/python diff --git a/Makefile.config.example b/Makefile.config.example index 0007f5324b7..7a8aafd7c9f 100644 --- a/Makefile.config.example +++ b/Makefile.config.example @@ -1,10 +1,8 @@ ## Refer to http://caffe.berkeleyvision.org/installation.html # Contributions simplifying and improving our build system are welcome! -# cuDNN acceleration switch (comment to build without cuDNN). -USE_CUDNN := 1 -# CNMeM memory pool switch (github.com/NVIDIA/cnmem required) -# USE_CNMEM := 1 +# cuDNN acceleration switch (uncomment to build with cuDNN). +# USE_CUDNN := 1 # CPU-only switch (uncomment to build without GPU support). # CPU_ONLY := 1 @@ -47,8 +45,7 @@ BLAS := atlas # NOTE: this is required only if you will compile the python interface. # We need to be able to find Python.h and numpy/arrayobject.h. PYTHON_INCLUDE := /usr/include/python2.7 \ - /usr/lib/python2.7/dist-packages/numpy/core/include \ - /usr/local/lib/python2.7/dist-packages/numpy/core/include + /usr/lib/python2.7/dist-packages/numpy/core/include # Anaconda Python distribution is quite popular. Include path: # Verify anaconda location, sometimes it's in root. # ANACONDA_HOME := $(HOME)/anaconda @@ -82,10 +79,3 @@ TEST_GPUID := 0 # enable pretty build (comment to see full commands) Q ?= @ - -# Adds timing info in logs -# BENCHMARK_DATA := 1 -# BENCHMARK_SOLVER := 1 - -# shared object suffix name to differentiate branches -LIBRARY_NAME_SUFFIX := -nv diff --git a/cmake/ConfigGen.cmake b/cmake/ConfigGen.cmake index 88b7b0ac953..c82047dcc5f 100644 --- a/cmake/ConfigGen.cmake +++ b/cmake/ConfigGen.cmake @@ -51,12 +51,6 @@ function(caffe_generate_export_configs) list(APPEND DEFINITIONS -DUSE_CUDNN) endif() - if(NOT HAVE_CNMEM) - set(HAVE_CNMEM FALSE) - else() - list(APPEND DEFINITIONS -DUSE_CNMEM) - endif() - if(BLAS STREQUAL "MKL" OR BLAS STREQUAL "mkl") list(APPEND Caffe_DEFINITIONS -DUSE_MKL) endif() diff --git a/cmake/Cuda.cmake b/cmake/Cuda.cmake index 74f7a211796..ff58d31c166 100644 --- a/cmake/Cuda.cmake +++ b/cmake/Cuda.cmake @@ -188,32 +188,6 @@ function(detect_cuDNN) endif() endfunction() -################################################################################################ -# Short command for cuDNN detection. Believe it soon will be a part of CUDA toolkit distribution. -# That's why not FindcuDNN.cmake file, but just the macro -# Usage: -# detect_cuDNN() -function(detect_CNMeM) - set(CNMEM_ROOT "" CACHE PATH "CNMeM root folder") - - find_path(CNMEM_INCLUDE cnmem.h - PATHS ${CNMEM_ROOT} $ENV{CNMEM_ROOT} ${CUDA_TOOLKIT_INCLUDE} - DOC "Path to CNMeM include directory." ) - - get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH) - find_library(CNMEM_LIBRARY NAMES libcnmem.so # libcudnn_static.a - PATHS ${CNMEM_ROOT} $ENV{CNMEM_ROOT} ${CNMEM_INCLUDE} ${__libpath_hist} - DOC "Path to CNMeM library.") - - if(CNMEM_INCLUDE AND CNMEM_LIBRARY) - set(HAVE_CNMEM TRUE PARENT_SCOPE) - set(CNMEM_FOUND TRUE PARENT_SCOPE) - - mark_as_advanced(CNMEM_INCLUDE CNMEM_LIBRARY CNMEM_ROOT) - message(STATUS "Found CNMeM (include: ${CNMEM_INCLUDE}, library: ${CNMEM_LIBRARY})") - endif() -endfunction() - ################################################################################################ ### Non macro section @@ -242,15 +216,6 @@ if(USE_CUDNN) endif() endif() -if(USE_CNMEM) - detect_CNMeM() - if(HAVE_CNMEM) - add_definitions(-DUSE_CNMEM) - include_directories(SYSTEM ${CNMEM_INCLUDE}) - list(APPEND Caffe_LINKER_LIBS ${CNMEM_LIBRARY}) - endif() -endif() - # setting nvcc arch flags caffe_select_nvcc_arch_flags(NVCC_FLAGS_EXTRA) list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA}) diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index e5408e30509..32931942846 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -101,7 +101,7 @@ function(caffe_print_configuration_summary) caffe_status("") caffe_status("******************* Caffe Configuration Summary *******************") caffe_status("General:") - caffe_status(" Version : ${CAFFE_TARGET_VERSION}") + caffe_status(" Version : ${Caffe_VERSION}") caffe_status(" Git : ${Caffe_GIT_VERSION}") caffe_status(" System : ${CMAKE_SYSTEM_NAME}") caffe_status(" C++ compiler : ${CMAKE_CXX_COMPILER}") @@ -136,11 +136,6 @@ function(caffe_print_configuration_summary) else() caffe_status(" cuDNN : Disabled") endif() - if (USE_CNMEM) - caffe_status(" CNMeM : " HAVE_CNMEM THEN "Yes" ELSE "Not found") - else() - caffe_status(" CNMeM : Disabled") - endif() caffe_status("") endif() if(HAVE_PYTHON) diff --git a/cmake/Targets.cmake b/cmake/Targets.cmake index c0b8ec9abe3..e3ad872313b 100644 --- a/cmake/Targets.cmake +++ b/cmake/Targets.cmake @@ -109,10 +109,7 @@ function(caffe_default_properties target) DEBUG_POSTFIX ${Caffe_DEBUG_POSTFIX} ARCHIVE_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/lib" LIBRARY_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/lib" - RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/bin" - VERSION ${CAFFE_TARGET_VERSION} - SOVERSION ${CAFFE_TARGET_SOVERSION} - ) + RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/bin") endfunction() ################################################################################################ diff --git a/cmake/Templates/CaffeConfig.cmake.in b/cmake/Templates/CaffeConfig.cmake.in index 99e194f086f..a4b03d961e0 100644 --- a/cmake/Templates/CaffeConfig.cmake.in +++ b/cmake/Templates/CaffeConfig.cmake.in @@ -15,10 +15,8 @@ # # Caffe_HAVE_CUDA - signals about CUDA support # Caffe_HAVE_CUDNN - signals about cuDNN support -# Caffe_HAVE_CNMEM - signals about CNMeM support -# -# -# + + # OpenCV dependency if(NOT OpenCV_FOUND) @@ -58,4 +56,3 @@ set(Caffe_DEFINITIONS "@Caffe_DEFINITIONS@") set(Caffe_CPU_ONLY @CPU_ONLY@) set(Caffe_HAVE_CUDA @HAVE_CUDA@) set(Caffe_HAVE_CUDNN @HAVE_CUDNN@) -set(Caffe_HAVE_CNMEM @HAVE_CNMEM@) diff --git a/cmake/Templates/caffe_config.h.in b/cmake/Templates/caffe_config.h.in index 3ea3bc41bf1..6039e8f6b21 100644 --- a/cmake/Templates/caffe_config.h.in +++ b/cmake/Templates/caffe_config.h.in @@ -11,10 +11,6 @@ #cmakedefine HAVE_CUDNN #cmakedefine USE_CUDNN -/* NVIDIA CNMeM */ -#cmakedefine HAVE_CNMEM -#cmakedefine USE_CNMEM - /* NVIDA cuDNN */ #cmakedefine CPU_ONLY diff --git a/include/caffe/caffe.hpp b/include/caffe/caffe.hpp index 68a5e1d1d1a..3c829f2f9b0 100644 --- a/include/caffe/caffe.hpp +++ b/include/caffe/caffe.hpp @@ -10,7 +10,6 @@ #include "caffe/layer.hpp" #include "caffe/layer_factory.hpp" #include "caffe/net.hpp" -#include "caffe/parallel.hpp" #include "caffe/proto/caffe.pb.h" #include "caffe/solver.hpp" #include "caffe/util/benchmark.hpp" diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp index d67d12f6ee3..5f86bc2625b 100644 --- a/include/caffe/common.hpp +++ b/include/caffe/common.hpp @@ -18,11 +18,6 @@ #include "caffe/util/device_alternate.hpp" -#ifdef USE_CNMEM -// cuMEM integration -#include -#endif - // gflags 2.1 issue: namespace google was changed to gflags without warning. // Luckily we will be able to use GFLAGS_GFLAGS_H_ to detect if it is version // 2.1. If yes, we will add a temporary solution to redirect the namespace. @@ -103,12 +98,12 @@ void GlobalInit(int* pargc, char*** pargv); class Caffe { public: ~Caffe(); - - // Thread local context for Caffe. Moved to common.cpp instead of - // including boost/thread.hpp to avoid a boost/NVCC issues (#1009, #1010) - // on OSX. Also fails on Linux with CUDA 7.0.18. - static Caffe& Get(); - + inline static Caffe& Get() { + if (!singleton_.get()) { + singleton_.reset(new Caffe()); + } + return *singleton_; + } enum Brew { CPU, GPU }; // This random number generator facade hides boost and CUDA rng @@ -137,9 +132,6 @@ class Caffe { inline static curandGenerator_t curand_generator() { return Get().curand_generator_; } -#ifdef USE_CUDNN - inline static cudnnHandle_t cudnn_handle() { return Get().cudnn_handle_; } -#endif #endif // Returns the mode: running on CPU or GPU. @@ -157,25 +149,16 @@ class Caffe { static void SetDevice(const int device_id); // Prints the current GPU status. static void DeviceQuery(); - // Parallel training info - inline static int solver_count() { return Get().solver_count_; } - inline static void set_solver_count(int val) { Get().solver_count_ = val; } - inline static bool root_solver() { return Get().root_solver_; } - inline static void set_root_solver(bool val) { Get().root_solver_ = val; } protected: #ifndef CPU_ONLY cublasHandle_t cublas_handle_; curandGenerator_t curand_generator_; -#ifdef USE_CUDNN - cudnnHandle_t cudnn_handle_; -#endif #endif shared_ptr random_generator_; Brew mode_; - int solver_count_; - bool root_solver_; + static shared_ptr singleton_; private: // The private constructor to avoid duplicate instantiation. @@ -184,67 +167,6 @@ class Caffe { DISABLE_COPY_AND_ASSIGN(Caffe); }; -class MemoryHandler { - public: - static MemoryHandler& Get(); -#ifndef CPU_ONLY - static void mallocGPU(void **ptr, size_t size, - cudaStream_t stream = cudaStreamDefault); - static void freeGPU(void *ptr, cudaStream_t = cudaStreamDefault); - static void registerStream(cudaStream_t stream); -#endif - static void setGPUs(const std::vector& gpus) { Get().gpus_ = gpus; } - static void usePool() { Get().using_pool_ = true; } - static bool usingPool() { -#ifdef USE_CNMEM - return Get().using_pool_; -#else - return false; -#endif - } - static void getInfo(size_t *free_mem, size_t *used_mem); - static void destroy(); - ~MemoryHandler() { } - - private: - MemoryHandler() : using_pool_(false), initialized_(false) {} - static void Init(); - // static void Destroy(); -#ifndef CPU_ONLY - void allocate_memory(void **ptr, size_t size, cudaStream_t stream); - void free_memory(void *ptr, cudaStream_t stream); -#endif - DISABLE_COPY_AND_ASSIGN(MemoryHandler); - - bool using_pool_; - bool initialized_; - std::vector gpus_; -}; - -class MemoryHandlerActivator { - public: - explicit MemoryHandlerActivator(const std::vector& gpus) - : using_pool_(false) { - if (gpus.size() > 0) { - using_pool_ = true; - MemoryHandler::usePool(); - MemoryHandler::setGPUs(gpus); -#ifndef CPU_ONLY - void* temp; - MemoryHandler::mallocGPU(&temp, 4); - MemoryHandler::freeGPU(temp); -#endif - } - } - ~MemoryHandlerActivator() { - if (using_pool_) { - MemoryHandler::destroy(); - } - } - private: - int using_pool_; -}; - } // namespace caffe #endif // CAFFE_COMMON_HPP_ diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp index ecf516795fd..e6b42c14587 100644 --- a/include/caffe/common_layers.hpp +++ b/include/caffe/common_layers.hpp @@ -424,6 +424,7 @@ class CuDNNSoftmaxLayer : public SoftmaxLayer { const vector& propagate_down, const vector*>& bottom); bool handles_setup_; + cudnnHandle_t handle_; cudnnTensorDescriptor_t bottom_desc_; cudnnTensorDescriptor_t top_desc_; }; diff --git a/include/caffe/data_layers.hpp b/include/caffe/data_layers.hpp index 12e6c366620..3958cb7ecb0 100644 --- a/include/caffe/data_layers.hpp +++ b/include/caffe/data_layers.hpp @@ -5,17 +5,16 @@ #include #include +#include "boost/scoped_ptr.hpp" #include "hdf5.h" #include "caffe/blob.hpp" #include "caffe/common.hpp" -#include "caffe/data_reader.hpp" #include "caffe/data_transformer.hpp" #include "caffe/filler.hpp" #include "caffe/internal_thread.hpp" #include "caffe/layer.hpp" #include "caffe/proto/caffe.pb.h" -#include "caffe/util/blocking_queue.hpp" #include "caffe/util/db.hpp" namespace caffe { @@ -51,17 +50,12 @@ class BaseDataLayer : public Layer { bool output_labels_; }; -template -class Batch { - public: - Blob data_, label_; -}; - template class BasePrefetchingDataLayer : public BaseDataLayer, public InternalThread { public: - explicit BasePrefetchingDataLayer(const LayerParameter& param); + explicit BasePrefetchingDataLayer(const LayerParameter& param) + : BaseDataLayer(param) {} // LayerSetUp: implements common data layer setup functionality, and calls // DataLayerSetUp to do special data layer setup for individual layer types. // This method may not be overridden. @@ -73,24 +67,22 @@ class BasePrefetchingDataLayer : virtual void Forward_gpu(const vector*>& bottom, const vector*>& top); - // Prefetches batches (asynchronously if to GPU memory) - static const int PREFETCH_COUNT = 3; + virtual void CreatePrefetchThread(); + virtual void JoinPrefetchThread(); + // The thread's function + virtual void InternalThreadEntry() {} protected: - virtual void InternalThreadEntry(); - virtual void load_batch(Batch* batch) = 0; - - Batch prefetch_[PREFETCH_COUNT]; - BlockingQueue*> prefetch_free_; - BlockingQueue*> prefetch_full_; - + Blob prefetch_data_; + Blob prefetch_label_; Blob transformed_data_; }; template class DataLayer : public BasePrefetchingDataLayer { public: - explicit DataLayer(const LayerParameter& param); + explicit DataLayer(const LayerParameter& param) + : BasePrefetchingDataLayer(param) {} virtual ~DataLayer(); virtual void DataLayerSetUp(const vector*>& bottom, const vector*>& top); @@ -101,9 +93,10 @@ class DataLayer : public BasePrefetchingDataLayer { virtual inline int MaxTopBlobs() const { return 2; } protected: - virtual void load_batch(Batch* batch); + virtual void InternalThreadEntry(); - DataReader reader_; + shared_ptr db_; + shared_ptr cursor_; }; /** @@ -242,7 +235,7 @@ class ImageDataLayer : public BasePrefetchingDataLayer { protected: shared_ptr prefetch_rng_; virtual void ShuffleImages(); - virtual void load_batch(Batch* batch); + virtual void InternalThreadEntry(); vector > lines_; int lines_id_; @@ -314,7 +307,7 @@ class WindowDataLayer : public BasePrefetchingDataLayer { protected: virtual unsigned int PrefetchRand(); - virtual void load_batch(Batch* batch); + virtual void InternalThreadEntry(); shared_ptr prefetch_rng_; vector > > image_database_; diff --git a/include/caffe/data_reader.hpp b/include/caffe/data_reader.hpp deleted file mode 100644 index 8ed5542cb8d..00000000000 --- a/include/caffe/data_reader.hpp +++ /dev/null @@ -1,82 +0,0 @@ -#ifndef CAFFE_DATA_READER_HPP_ -#define CAFFE_DATA_READER_HPP_ - -#include -#include -#include - -#include "caffe/common.hpp" -#include "caffe/internal_thread.hpp" -#include "caffe/util/blocking_queue.hpp" -#include "caffe/util/db.hpp" - -namespace caffe { - -/** - * @brief Reads data from a source to queues available to data layers. - * A single reading thread is created per source, even if multiple solvers - * are running in parallel, e.g. for multi-GPU training. This makes sure - * databases are read sequentially, and that each solver accesses a different - * subset of the database. Data is distributed to solvers in a round-robin - * way to keep parallel training deterministic. - */ -class DataReader { - public: - explicit DataReader(const LayerParameter& param); - ~DataReader(); - - inline BlockingQueue& free() const { - return queue_pair_->free_; - } - inline BlockingQueue& full() const { - return queue_pair_->full_; - } - - protected: - // Queue pairs are shared between a body and its readers - class QueuePair { - public: - explicit QueuePair(int size); - ~QueuePair(); - - BlockingQueue free_; - BlockingQueue full_; - - DISABLE_COPY_AND_ASSIGN(QueuePair); - }; - - // A single body is created per source - class Body : public InternalThread { - public: - explicit Body(const LayerParameter& param); - virtual ~Body(); - - protected: - void InternalThreadEntry(); - void read_one(db::Cursor* cursor, QueuePair* qp); - - const LayerParameter param_; - BlockingQueue > new_queue_pairs_; - - friend class DataReader; - - DISABLE_COPY_AND_ASSIGN(Body); - }; - - // A source is uniquely identified by its layer name + path, in case - // the same database is read from two different locations in the net. - static inline string source_key(const LayerParameter& param) { - return param.name() + ":" + param.data_param().source(); - } - - const shared_ptr queue_pair_; - shared_ptr body_; - - static map > bodies_; - -DISABLE_COPY_AND_ASSIGN(DataReader); -}; - -} // namespace caffe - -#endif // CAFFE_DATA_READER_HPP_ diff --git a/include/caffe/internal_thread.hpp b/include/caffe/internal_thread.hpp index 3c32a1d13b3..815ca54605e 100644 --- a/include/caffe/internal_thread.hpp +++ b/include/caffe/internal_thread.hpp @@ -14,22 +14,18 @@ namespace caffe { /** * Virtual class encapsulate boost::thread for use in base class * The child class will acquire the ability to run a single thread, - * by reimplementing the virtual function InternalThreadEntry. + * by reimplementing the virutal function InternalThreadEntry. */ class InternalThread { public: - InternalThread(); + InternalThread() : thread_() {} virtual ~InternalThread(); - /** - * Caffe's thread local state will be initialized using the current - * thread values, e.g. device id, solver index etc. The random seed - * is initialized using caffe_rng_rand. - */ - void StartInternalThread(); + /** Returns true if the thread was successfully started. **/ + bool StartInternalThread(); /** Will not return until the internal thread has exited. */ - void StopInternalThread(); + bool WaitForInternalThreadToExit(); bool is_started() const; @@ -38,18 +34,7 @@ class InternalThread { with the code you want your thread to run. */ virtual void InternalThreadEntry() {} - /* Should be tested when running loops to exit when requested. */ - bool must_stop(); - - private: - void entry(); - shared_ptr thread_; - int device_; - Caffe::Brew mode_; - int rand_seed_; - int solver_count_; - bool root_solver_; }; } // namespace caffe diff --git a/include/caffe/layer_factory.hpp b/include/caffe/layer_factory.hpp index 32e849de0d2..2fcd93869a0 100644 --- a/include/caffe/layer_factory.hpp +++ b/include/caffe/layer_factory.hpp @@ -71,9 +71,7 @@ class LayerRegistry { // Get a layer using a LayerParameter. static shared_ptr > CreateLayer(const LayerParameter& param) { - if (Caffe::root_solver()) { - LOG(INFO) << "Creating layer " << param.name(); - } + LOG(INFO) << "Creating layer " << param.name(); const string& type = param.type(); CreatorRegistry& registry = Registry(); CHECK_EQ(registry.count(type), 1) << "Unknown layer type: " << type diff --git a/include/caffe/neuron_layers.hpp b/include/caffe/neuron_layers.hpp index bf90257e9cc..9cf233f0eb3 100644 --- a/include/caffe/neuron_layers.hpp +++ b/include/caffe/neuron_layers.hpp @@ -431,6 +431,7 @@ class CuDNNReLULayer : public ReLULayer { const vector& propagate_down, const vector*>& bottom); bool handles_setup_; + cudnnHandle_t handle_; cudnnTensorDescriptor_t bottom_desc_; cudnnTensorDescriptor_t top_desc_; }; @@ -513,6 +514,7 @@ class CuDNNSigmoidLayer : public SigmoidLayer { const vector& propagate_down, const vector*>& bottom); bool handles_setup_; + cudnnHandle_t handle_; cudnnTensorDescriptor_t bottom_desc_; cudnnTensorDescriptor_t top_desc_; }; @@ -597,6 +599,7 @@ class CuDNNTanHLayer : public TanHLayer { const vector& propagate_down, const vector*>& bottom); bool handles_setup_; + cudnnHandle_t handle_; cudnnTensorDescriptor_t bottom_desc_; cudnnTensorDescriptor_t top_desc_; }; diff --git a/include/caffe/parallel.hpp b/include/caffe/parallel.hpp deleted file mode 100644 index 5f44a8702d3..00000000000 --- a/include/caffe/parallel.hpp +++ /dev/null @@ -1,123 +0,0 @@ -#ifndef CAFFE_PARALLEL_HPP_ -#define CAFFE_PARALLEL_HPP_ - -#include - -#include - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/internal_thread.hpp" -#include "caffe/layer.hpp" -#include "caffe/proto/caffe.pb.h" -#include "caffe/solver.hpp" -#include "caffe/syncedmem.hpp" -#include "caffe/util/blocking_queue.hpp" - -namespace caffe { - -// Represents a net parameters. Once a net is created, its parameter buffers can -// be replaced by ones from Params, to allow parallelization. Params ensures -// parameters are allocated in one consecutive array. -template -class Params { - public: - explicit Params(shared_ptr > root_solver); - virtual ~Params() { - } - - inline size_t size() const { - return size_; - } - inline Dtype* data() const { - return data_; - } - inline Dtype* diff() const { - return diff_; - } - - protected: - const size_t size_; // Size of buffers - Dtype* data_; // Network parameters - Dtype* diff_; // Gradient - -DISABLE_COPY_AND_ASSIGN(Params); -}; - -// Params stored in GPU memory. -template -class GPUParams : public Params { - public: - GPUParams(shared_ptr > root_solver, int device); - virtual ~GPUParams(); - - void configure(Solver* solver) const; - - protected: - using Params::size_; - using Params::data_; - using Params::diff_; - private: - int buffer_device_; -}; - -class DevicePair { - public: - DevicePair(int parent, int device) - : parent_(parent), - device_(device) { - } - inline int parent() { - return parent_; - } - inline int device() { - return device_; - } - - // Group GPUs in pairs, by proximity depending on machine's topology - static void compute(const vector devices, vector* pairs); - - protected: - int parent_; - int device_; -}; - -// Synchronous data parallelism using map-reduce between local GPUs. -template -class P2PSync : public GPUParams, public Solver::Callback, - public InternalThread { - public: - explicit P2PSync(shared_ptr > root_solver, - P2PSync* parent, const SolverParameter& param); - virtual ~P2PSync(); - - inline const shared_ptr >& solver() const { - return solver_; - } - - static void run(shared_ptr > root, const vector& gpus); - - // Divide the batch size by the number of solvers - static void divide_batch_size(NetParameter* net); - - protected: - void on_start(Timer* timer, ostringstream* timing); - void on_gradients_ready(Timer* timer, ostringstream* timing); - - void InternalThreadEntry(); - - P2PSync* parent_; - vector*> children_; - BlockingQueue*> queue_; - const int initial_iter_; - Dtype* parent_grads_; - shared_ptr > solver_; - - using Params::size_; - using Params::data_; - using Params::diff_; -}; - -} // namespace caffe - -#endif diff --git a/include/caffe/solver.hpp b/include/caffe/solver.hpp index b292ba25ae6..4dcdc3dc20b 100644 --- a/include/caffe/solver.hpp +++ b/include/caffe/solver.hpp @@ -5,7 +5,6 @@ #include #include "caffe/net.hpp" -#include "caffe/util/benchmark.hpp" namespace caffe { @@ -33,30 +32,15 @@ class Solver { // function that restores the state from a SolverState protocol buffer. void Restore(const char* resume_file); virtual ~Solver() {} - inline const SolverParameter& param() const { return param_; } inline shared_ptr > net() { return net_; } inline const vector > >& test_nets() { return test_nets_; } int iter() { return iter_; } - // Invoked at specific points during an iteration - class Callback { - protected: - virtual void on_start(Timer* timer, ostringstream* timing) = 0; - virtual void on_gradients_ready(Timer* timer, ostringstream* timing) = 0; - - template - friend class Solver; - }; - const vector& callbacks() const { return callbacks_; } - void add_callback(Callback* value) { - callbacks_.push_back(value); - } - protected: - // Get and apply the update value for the current iteration. - virtual void Iteration() {} + // Get the update value for the current iteration. + virtual void ComputeUpdateValue() = 0; // The Solver::Snapshot function implements the basic snapshotting utility // that stores the learned net. You should implement the SnapshotSolverState() // function that produces a SolverState protocol buffer that needs to be @@ -65,12 +49,8 @@ class Solver { // The test routine void TestAll(); void Test(const int test_net_id = 0); - virtual void SnapshotSolverState(SolverState* state) { - CHECK(false) << "Should be overriden"; - } - virtual void RestoreSolverState(const SolverState& state) { - CHECK(false) << "Should be overriden"; - } + virtual void SnapshotSolverState(SolverState* state) = 0; + virtual void RestoreSolverState(const SolverState& state) = 0; void DisplayOutputBlobs(const int net_id); SolverParameter param_; @@ -78,10 +58,6 @@ class Solver { int current_step_; shared_ptr > net_; vector > > test_nets_; - vector callbacks_; - - Timer iteration_timer_; - float iterations_last_; DISABLE_COPY_AND_ASSIGN(Solver); }; @@ -104,9 +80,7 @@ class SGDSolver : public Solver { protected: void PreSolve(); Dtype GetLearningRate(); - virtual void Iteration(); - virtual void Regularize(int param_id); - virtual void ComputeUpdateValue(int param_id, Dtype rate); + virtual void ComputeUpdateValue(); virtual void ClipGradients(); virtual void SnapshotSolverState(SolverState * state); virtual void RestoreSolverState(const SolverState& state); @@ -116,9 +90,6 @@ class SGDSolver : public Solver { // of gradients/updates and is not needed in snapshots vector > > history_, update_, temp_; - using Solver::iteration_timer_; - using Solver::iterations_last_; - DISABLE_COPY_AND_ASSIGN(SGDSolver); }; @@ -131,7 +102,7 @@ class NesterovSolver : public SGDSolver { : SGDSolver(param_file) {} protected: - virtual void ComputeUpdateValue(int param_id, Dtype rate); + virtual void ComputeUpdateValue(); DISABLE_COPY_AND_ASSIGN(NesterovSolver); }; @@ -145,7 +116,7 @@ class AdaGradSolver : public SGDSolver { : SGDSolver(param_file) { constructor_sanity_check(); } protected: - virtual void ComputeUpdateValue(int param_id, Dtype rate); + virtual void ComputeUpdateValue(); void constructor_sanity_check() { CHECK_EQ(0, this->param_.momentum()) << "Momentum cannot be used with AdaGrad."; diff --git a/include/caffe/syncedmem.hpp b/include/caffe/syncedmem.hpp index 62aadef498d..1b726de9564 100644 --- a/include/caffe/syncedmem.hpp +++ b/include/caffe/syncedmem.hpp @@ -8,29 +8,26 @@ namespace caffe { -// If CUDA is available and in GPU mode, host memory will be allocated pinned, -// using cudaMallocHost. It avoids dynamic pinning for transfers (DMA). -// The improvement in performance seems negligible in the single GPU case, -// but might be more significant for parallel training. Most importantly, -// it improved stability for large models on many GPUs. +// Theoretically, CaffeMallocHost and CaffeFreeHost should simply call the +// cudaMallocHost and cudaFree functions in order to create pinned memory. +// However, those codes rely on the existence of a cuda GPU (I don't know +// why that is a must since allocating memory should not be accessing the +// GPU resource, but it just creates an error as of Cuda 5.0) and will cause +// problem when running on a machine without GPU. Thus, we simply define +// these two functions for safety and possible future change if the problem +// of calling cuda functions disappears in a future version. +// +// In practice, although we are creating unpinned memory here, as long as we +// are constantly accessing them the memory pages almost always stays in +// the physical memory (assuming we have large enough memory installed), and +// does not seem to create a memory bottleneck here. + inline void CaffeMallocHost(void** ptr, size_t size) { -#ifndef CPU_ONLY - if (Caffe::mode() == Caffe::GPU) { - CUDA_CHECK(cudaMallocHost(ptr, size)); - return; - } -#endif *ptr = malloc(size); CHECK(*ptr) << "host allocation of size " << size << " failed"; } inline void CaffeFreeHost(void* ptr) { -#ifndef CPU_ONLY - if (Caffe::mode() == Caffe::GPU) { - CUDA_CHECK(cudaFreeHost(ptr)); - return; - } -#endif free(ptr); } @@ -45,25 +42,20 @@ class SyncedMemory { public: SyncedMemory() : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED), - own_cpu_data_(false), own_gpu_data_(false), gpu_device_(-1) {} + own_cpu_data_(false) {} explicit SyncedMemory(size_t size) : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED), - own_cpu_data_(false), own_gpu_data_(false), gpu_device_(-1) {} + own_cpu_data_(false) {} ~SyncedMemory(); const void* cpu_data(); void set_cpu_data(void* data); const void* gpu_data(); - void set_gpu_data(void* data); void* mutable_cpu_data(); void* mutable_gpu_data(); enum SyncedHead { UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED }; SyncedHead head() { return head_; } size_t size() { return size_; } -#ifndef CPU_ONLY - void async_gpu_push(const cudaStream_t& stream); -#endif - private: void to_cpu(); void to_gpu(); @@ -72,8 +64,6 @@ class SyncedMemory { size_t size_; SyncedHead head_; bool own_cpu_data_; - bool own_gpu_data_; - int gpu_device_; DISABLE_COPY_AND_ASSIGN(SyncedMemory); }; // class SyncedMemory diff --git a/include/caffe/test/test_gradient_check_util.hpp b/include/caffe/test/test_gradient_check_util.hpp index 80fa9f26e11..22937711b58 100644 --- a/include/caffe/test/test_gradient_check_util.hpp +++ b/include/caffe/test/test_gradient_check_util.hpp @@ -84,10 +84,7 @@ void GradientChecker::CheckGradientSingle(Layer* layer, vector*> blobs_to_check; vector propagate_down(bottom.size(), check_bottom < 0); for (int i = 0; i < layer->blobs().size(); ++i) { - // blobs_to_check.push_back(layer->blobs()[i].get()); - Blob* blob = layer->blobs()[i].get(); - caffe_set(blob->count(), static_cast(0), blob->mutable_cpu_diff()); - blobs_to_check.push_back(blob); + blobs_to_check.push_back(layer->blobs()[i].get()); } if (check_bottom < 0) { for (int i = 0; i < bottom.size(); ++i) { diff --git a/include/caffe/util/blocking_queue.hpp b/include/caffe/util/blocking_queue.hpp deleted file mode 100644 index 955e12cc567..00000000000 --- a/include/caffe/util/blocking_queue.hpp +++ /dev/null @@ -1,47 +0,0 @@ -#ifndef CAFFE_UTIL_BLOCKING_QUEUE_HPP_ -#define CAFFE_UTIL_BLOCKING_QUEUE_HPP_ - -#include -#include - -#include "caffe/common.hpp" - -namespace caffe { - -template -class BlockingQueue { - public: - explicit BlockingQueue(); - - void push(const T& t); - - bool try_pop(T* t); - - // This logs a message if the threads needs to be blocked - // useful for detecting e.g. when data feeding is too slow - T pop(const string& log_on_wait = ""); - - bool try_peek(T* t); - - // Return element without removing it - T peek(); - - size_t size() const; - - protected: - /** - Move synchronization fields out instead of including boost/thread.hpp - to avoid a boost/NVCC issues (#1009, #1010) on OSX. Also fails on - Linux CUDA 7.0.18. - */ - class sync; - - std::queue queue_; - shared_ptr sync_; - -DISABLE_COPY_AND_ASSIGN(BlockingQueue); -}; - -} // namespace caffe - -#endif diff --git a/include/caffe/util/device_alternate.hpp b/include/caffe/util/device_alternate.hpp index 55d616fbf71..6ea595dba2d 100644 --- a/include/caffe/util/device_alternate.hpp +++ b/include/caffe/util/device_alternate.hpp @@ -66,17 +66,6 @@ void classname::funcname##_##gpu(const vector*>& top, \ << caffe::curandGetErrorString(status); \ } while (0) -#ifdef USE_CNMEM - -#define CNMEM_CHECK(condition) \ - do { \ - cnmemStatus_t status = condition; \ - CHECK_EQ(status, CNMEM_STATUS_SUCCESS) << " " \ - << cnmemGetErrorString(status); \ - } while (0) - -#endif - // CUDA: grid stride looping #define CUDA_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index 09d3a10c52c..a6bd86a93f5 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -244,24 +244,15 @@ class CuDNNConvolutionLayer : public ConvolutionLayer { const vector& propagate_down, const vector*>& bottom); bool handles_setup_; - - // algorithms for forward and backwards convolutions - cudnnConvolutionFwdAlgo_t *fwd_algo_; - cudnnConvolutionBwdFilterAlgo_t *bwd_filter_algo_; - cudnnConvolutionBwdDataAlgo_t *bwd_data_algo_; - + cudnnHandle_t* handle_; + cudaStream_t* stream_; vector bottom_descs_, top_descs_; cudnnTensorDescriptor_t bias_desc_; cudnnFilterDescriptor_t filter_desc_; vector conv_descs_; int bottom_offset_, top_offset_, weight_offset_, bias_offset_; - - size_t *workspace_fwd_sizes_; - size_t *workspace_bwd_data_sizes_; - size_t *workspace_bwd_filter_sizes_; - size_t workspaceSizeInBytes; // size of underlying storage - void *workspaceData; // underlying storage - void **workspace; // aliases into workspaceData + size_t workspaceSizeInBytes; + void *workspace; }; #endif @@ -382,63 +373,6 @@ class LRNLayer : public Layer { vector*> product_bottom_vec_; }; -#ifdef USE_CUDNN - -template -class CuDNNLRNLayer : public LRNLayer { - public: - explicit CuDNNLRNLayer(const LayerParameter& param) - : LRNLayer(param), handles_setup_(false) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - virtual ~CuDNNLRNLayer(); - - protected: - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - bool handles_setup_; - cudnnLRNDescriptor_t norm_desc_; - cudnnTensorDescriptor_t bottom_desc_, top_desc_; - - int size_; - Dtype alpha_, beta_, k_; -}; - -template -class CuDNNLCNLayer : public LRNLayer { - public: - explicit CuDNNLCNLayer(const LayerParameter& param) - : LRNLayer(param), handles_setup_(false), tempDataSize(0), - tempData1(NULL), tempData2(NULL) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - virtual ~CuDNNLCNLayer(); - - protected: - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - bool handles_setup_; - cudnnLRNDescriptor_t norm_desc_; - cudnnTensorDescriptor_t bottom_desc_, top_desc_; - - int size_, pre_pad_; - Dtype alpha_, beta_, k_; - - size_t tempDataSize; - void *tempData1, *tempData2; -}; - -#endif /** * @brief Pools the input image by taking the max, average, etc. within regions. @@ -512,6 +446,7 @@ class CuDNNPoolingLayer : public PoolingLayer { const vector& propagate_down, const vector*>& bottom); bool handles_setup_; + cudnnHandle_t handle_; cudnnTensorDescriptor_t bottom_desc_, top_desc_; cudnnPoolingDescriptor_t pooling_desc_; cudnnPoolingMode_t mode_; diff --git a/python/caffe/draw.py b/python/caffe/draw.py index 8661f6d0648..08b7c1de14b 100644 --- a/python/caffe/draw.py +++ b/python/caffe/draw.py @@ -59,7 +59,7 @@ def determine_node_label_by_layertype(layer, layertype, rankdir): else: # If graph orientation is horizontal, vertical space is free and # horizontal space is not; separate words with newlines - separator = '\\n' + separator = '\n' if layertype == 'Convolution': # Outer double quotes needed or else colon characters don't parse diff --git a/scripts/travis/travis_install.sh b/scripts/travis/travis_install.sh index 04af643a5e2..0e8c37861b0 100755 --- a/scripts/travis/travis_install.sh +++ b/scripts/travis/travis_install.sh @@ -62,7 +62,7 @@ rm -f $LMDB_FILE # than using pip for everything). wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh chmod +x miniconda.sh -./miniconda.sh -b -p /home/travis/miniconda +./miniconda.sh -b export PATH=/home/travis/miniconda/bin:$PATH conda update --yes conda conda install --yes numpy scipy matplotlib scikit-image pip diff --git a/scripts/travis/travis_setup_makefile_config.sh b/scripts/travis/travis_setup_makefile_config.sh index 0b62bc2e0f9..ba326262bf8 100755 --- a/scripts/travis/travis_setup_makefile_config.sh +++ b/scripts/travis/travis_setup_makefile_config.sh @@ -20,5 +20,4 @@ PYTHON_LIB := $(ANACONDA_HOME)/lib INCLUDE_DIRS := $(PYTHON_INCLUDE) /usr/local/include LIBRARY_DIRS := $(PYTHON_LIB) /usr/local/lib /usr/lib WITH_PYTHON_LAYER := 1 -USE_CUDNN := 0 EOF diff --git a/src/caffe/CMakeLists.txt b/src/caffe/CMakeLists.txt index 73ecd4496b1..40e6c11f5b0 100644 --- a/src/caffe/CMakeLists.txt +++ b/src/caffe/CMakeLists.txt @@ -20,7 +20,6 @@ endif() add_library(caffe ${srcs}) target_link_libraries(caffe proto ${Caffe_LINKER_LIBS}) caffe_default_properties(caffe) -set_target_properties(caffe PROPERTIES OUTPUT_NAME "caffe-nv") # ---[ Tests add_subdirectory(test) diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp index 4075f1402d0..af96cac40aa 100644 --- a/src/caffe/common.cpp +++ b/src/caffe/common.cpp @@ -1,25 +1,13 @@ -#include #include #include #include -#include #include "caffe/common.hpp" #include "caffe/util/rng.hpp" namespace caffe { -static boost::shared_ptr mem_handler; -// Make sure each thread can have different values. -static boost::thread_specific_ptr thread_instance_; - - -Caffe& Caffe::Get() { - if (!thread_instance_.get()) { - thread_instance_.reset(new Caffe()); - } - return *(thread_instance_.get()); -} +shared_ptr Caffe::singleton_; // random seeding int64_t cluster_seedgen(void) { @@ -54,8 +42,7 @@ void GlobalInit(int* pargc, char*** pargv) { #ifdef CPU_ONLY // CPU-only Caffe. Caffe::Caffe() - : random_generator_(), mode_(Caffe::CPU), - solver_count_(1), root_solver_(true) { } + : random_generator_(), mode_(Caffe::CPU) { } Caffe::~Caffe() { } @@ -98,12 +85,8 @@ void* Caffe::RNG::generator() { #else // Normal GPU + CPU Caffe. Caffe::Caffe() - : cublas_handle_(NULL), curand_generator_(NULL), -#ifdef USE_CUDNN - cudnn_handle_(NULL), -#endif - random_generator_(), - mode_(Caffe::CPU), solver_count_(1), root_solver_(true) { + : cublas_handle_(NULL), curand_generator_(NULL), random_generator_(), + mode_(Caffe::CPU) { // Try to create a cublas handler, and report an error if failed (but we will // keep the program running as one might just want to run CPU code). if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) { @@ -116,11 +99,6 @@ Caffe::Caffe() != CURAND_STATUS_SUCCESS) { LOG(ERROR) << "Cannot create Curand generator. Curand won't be available."; } -#ifdef USE_CUDNN - if (cudnnCreate(&cudnn_handle_) != CUDNN_STATUS_SUCCESS) { - LOG(ERROR) << "Cannot create cuDNN handle. cuDNN won't be available."; - } -#endif } Caffe::~Caffe() { @@ -128,9 +106,6 @@ Caffe::~Caffe() { if (curand_generator_) { CURAND_CHECK(curandDestroyGenerator(curand_generator_)); } -#ifdef USE_CUDNN - if (cudnn_handle_) CUDNN_CHECK(cudnnDestroy(cudnn_handle_)); -#endif } void Caffe::set_random_seed(const unsigned int seed) { @@ -169,10 +144,6 @@ void Caffe::SetDevice(const int device_id) { CURAND_RNG_PSEUDO_DEFAULT)); CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_, cluster_seedgen())); -#ifdef USE_CUDNN - if (Get().cublas_handle_) CUDNN_CHECK(cudnnDestroy(Get().cudnn_handle_)); - CUDNN_CHECK(cudnnCreate(&Get().cudnn_handle_)); -#endif } void Caffe::DeviceQuery() { @@ -210,6 +181,7 @@ void Caffe::DeviceQuery() { return; } + class Caffe::RNG::Generator { public: Generator() : rng_(new caffe::rng_t(cluster_seedgen())) {} @@ -232,115 +204,6 @@ void* Caffe::RNG::generator() { return static_cast(generator_->rng()); } -static boost::mutex memHandlerMutex; - -MemoryHandler& MemoryHandler::Get() { - boost::mutex::scoped_lock lock(memHandlerMutex); - if (!mem_handler.get()) { - mem_handler.reset(new MemoryHandler()); - } - return *(mem_handler.get()); -} - -void MemoryHandler::mallocGPU(void **ptr, size_t size, cudaStream_t stream) { - if (!Get().initialized_) { - Init(); - } - Get().allocate_memory(ptr, size, stream); -} - -void MemoryHandler::freeGPU(void *ptr, cudaStream_t stream) { - Get().free_memory(ptr, stream); -} - -void MemoryHandler::allocate_memory(void **ptr, size_t size, - cudaStream_t stream) { - int initial_device; - cudaGetDevice(&initial_device); - if (size == 0) return; - if (using_pool_) { -#ifdef USE_CNMEM - CNMEM_CHECK(cnmemMalloc(ptr, size, stream)); -#endif - } else { - CUDA_CHECK(cudaMalloc(ptr, size)); - } - cudaSetDevice(initial_device); -} - -void MemoryHandler::free_memory(void *ptr, cudaStream_t stream) { - // boost::mutex::scoped_lock lock(memHandlerMutex); - int initial_device; - cudaGetDevice(&initial_device); - if (using_pool_) { -#ifdef USE_CNMEM - CNMEM_CHECK(cnmemFree(ptr, stream)); -#endif - } else { - CUDA_CHECK(cudaFree(ptr)); - } - ptr = NULL; - cudaSetDevice(initial_device); -} - -void MemoryHandler::registerStream(cudaStream_t stream) { - if (!Get().initialized_) { - Init(); - } - if (Get().using_pool_) { -#ifdef USE_CNMEM - CNMEM_CHECK(cnmemRegisterStream(stream)); -#endif - } -} - -void MemoryHandler::destroy() { -#ifdef USE_CNMEM - CNMEM_CHECK(cnmemFinalize()); -#endif -} - -void MemoryHandler::Init() { - if (Get().using_pool_) { -#ifdef USE_CNMEM - cnmemDevice_t *devs = new cnmemDevice_t[Get().gpus_.size()]; - - int initial_device; - CUDA_CHECK(cudaGetDevice(&initial_device)); - - for (int i = 0; i < Get().gpus_.size(); i++) { - CUDA_CHECK(cudaSetDevice(Get().gpus_[i])); - - devs[i].device = Get().gpus_[i]; - - size_t free_mem, used_mem; - CUDA_CHECK(cudaMemGetInfo(&free_mem, &used_mem)); - - devs[i].size = size_t(0.95*free_mem); - devs[i].numStreams = 0; - devs[i].streams = NULL; - } - CNMEM_CHECK(cnmemInit(Get().gpus_.size(), devs, CNMEM_FLAGS_DEFAULT)); - Get().initialized_ = true; - - CUDA_CHECK(cudaSetDevice(initial_device)); - - delete [] devs; -#endif - } - Get().initialized_ = true; -} - -void MemoryHandler::getInfo(size_t *free_mem, size_t *total_mem) { - if (Get().using_pool_) { -#ifdef USE_CNMEM - CNMEM_CHECK(cnmemMemGetInfo(free_mem, total_mem, cudaStreamDefault)); -#endif - } else { - CUDA_CHECK(cudaMemGetInfo(free_mem, total_mem)); - } -} - const char* cublasGetErrorString(cublasStatus_t error) { switch (error) { case CUBLAS_STATUS_SUCCESS: diff --git a/src/caffe/data_reader.cpp b/src/caffe/data_reader.cpp deleted file mode 100644 index 16378203a88..00000000000 --- a/src/caffe/data_reader.cpp +++ /dev/null @@ -1,119 +0,0 @@ -#include -#include -#include -#include - -#include "caffe/common.hpp" -#include "caffe/data_layers.hpp" -#include "caffe/data_reader.hpp" -#include "caffe/proto/caffe.pb.h" - -namespace caffe { - -using boost::weak_ptr; - -map > DataReader::bodies_; -static boost::mutex bodies_mutex_; - -DataReader::DataReader(const LayerParameter& param) - : queue_pair_(new QueuePair( // - param.data_param().prefetch() * param.data_param().batch_size())) { - // Get or create a body - boost::mutex::scoped_lock lock(bodies_mutex_); - string key = source_key(param); - weak_ptr& weak = bodies_[key]; - body_ = weak.lock(); - if (!body_) { - body_.reset(new Body(param)); - bodies_[key] = weak_ptr(body_); - } - body_->new_queue_pairs_.push(queue_pair_); -} - -DataReader::~DataReader() { - string key = source_key(body_->param_); - body_.reset(); - boost::mutex::scoped_lock lock(bodies_mutex_); - if (bodies_[key].expired()) { - bodies_.erase(key); - } -} - -// - -DataReader::QueuePair::QueuePair(int size) { - // Initialize the free queue with requested number of datums - for (int i = 0; i < size; ++i) { - free_.push(new Datum()); - } -} - -DataReader::QueuePair::~QueuePair() { - Datum* datum; - while (free_.try_pop(&datum)) { - delete datum; - } - while (full_.try_pop(&datum)) { - delete datum; - } -} - -// - -DataReader::Body::Body(const LayerParameter& param) - : param_(param), - new_queue_pairs_() { - StartInternalThread(); -} - -DataReader::Body::~Body() { - StopInternalThread(); -} - -void DataReader::Body::InternalThreadEntry() { - shared_ptr db(db::GetDB(param_.data_param().backend())); - db->Open(param_.data_param().source(), db::READ); - shared_ptr cursor(db->NewCursor()); - vector > qps; - try { - int solver_count = param_.phase() == TRAIN ? Caffe::solver_count() : 1; - - // To ensure deterministic runs, only start running once all solvers - // are ready. But solvers need to peek on one item during initialization, - // so read one item, then wait for the next solver. - for (int i = 0; i < solver_count; ++i) { - shared_ptr qp(new_queue_pairs_.pop()); - read_one(cursor.get(), qp.get()); - qps.push_back(qp); - } - // Main loop - while (!must_stop()) { - for (int i = 0; i < solver_count; ++i) { - read_one(cursor.get(), qps[i].get()); - } - // Check no additional readers have been created. This can happen if - // more than one net is trained at a time per process, whether single - // or multi solver. It might also happen if two data layers have same - // name and same source. - CHECK_EQ(new_queue_pairs_.size(), 0); - } - } catch (boost::thread_interrupted&) { - // Interrupted exception is expected on shutdown - } -} - -void DataReader::Body::read_one(db::Cursor* cursor, QueuePair* qp) { - Datum* datum = qp->free_.pop(); - // TODO deserialize in-place instead of copy? - datum->ParseFromString(cursor->value()); - qp->full_.push(datum); - - // go to the next iter - cursor->Next(); - if (!cursor->valid()) { - DLOG(INFO) << "Restarting data prefetching from start."; - cursor->SeekToFirst(); - } -} - -} // namespace caffe diff --git a/src/caffe/data_transformer.cpp b/src/caffe/data_transformer.cpp index 482b8c09d24..b0b98e478c1 100644 --- a/src/caffe/data_transformer.cpp +++ b/src/caffe/data_transformer.cpp @@ -19,9 +19,7 @@ DataTransformer::DataTransformer(const TransformationParameter& param, CHECK_EQ(param_.mean_value_size(), 0) << "Cannot specify mean_file and mean_value at the same time"; const string& mean_file = param.mean_file(); - if (Caffe::root_solver()) { - LOG(INFO) << "Loading mean file from: " << mean_file; - } + LOG(INFO) << "Loading mean file from: " << mean_file; BlobProto blob_proto; ReadProtoFromBinaryFileOrDie(mean_file.c_str(), &blob_proto); data_mean_.FromProto(blob_proto); diff --git a/src/caffe/internal_thread.cpp b/src/caffe/internal_thread.cpp index 2402a192e7e..c2d19d433b4 100644 --- a/src/caffe/internal_thread.cpp +++ b/src/caffe/internal_thread.cpp @@ -1,75 +1,40 @@ #include -#include - #include "caffe/internal_thread.hpp" -#include "caffe/util/math_functions.hpp" namespace caffe { -InternalThread::InternalThread() - : thread_(), - device_(), - mode_(), - rand_seed_(), - solver_count_(), - root_solver_() { -} - InternalThread::~InternalThread() { - StopInternalThread(); + WaitForInternalThreadToExit(); } bool InternalThread::is_started() const { - return thread_ && thread_->joinable(); -} - -bool InternalThread::must_stop() { - return thread_ && thread_->interruption_requested(); + return thread_.get() != NULL && thread_->joinable(); } -void InternalThread::StartInternalThread() { - // TODO switch to failing once Caffe prefetch thread is persistent. - // Threads should not be started and stopped repeatedly. - // CHECK(!is_started()); - StopInternalThread(); - -#ifndef CPU_ONLY - CUDA_CHECK(cudaGetDevice(&device_)); -#endif - mode_ = Caffe::mode(); - rand_seed_ = caffe_rng_rand(); - solver_count_ = Caffe::solver_count(); - root_solver_ = Caffe::root_solver(); +bool InternalThread::StartInternalThread() { + if (!WaitForInternalThreadToExit()) { + return false; + } try { - thread_.reset(new boost::thread(&InternalThread::entry, this)); - } catch (std::exception& e) { - CHECK(false) << e.what(); + thread_.reset( + new boost::thread(&InternalThread::InternalThreadEntry, this)); + } catch (...) { + return false; } + return true; } -void InternalThread::entry() { -#ifndef CPU_ONLY - CUDA_CHECK(cudaSetDevice(device_)); -#endif - Caffe::set_mode(mode_); - Caffe::set_random_seed(rand_seed_); - Caffe::set_solver_count(solver_count_); - Caffe::set_root_solver(root_solver_); - - InternalThreadEntry(); -} - -void InternalThread::StopInternalThread() { +/** Will not return until the internal thread has exited. */ +bool InternalThread::WaitForInternalThreadToExit() { if (is_started()) { - thread_->interrupt(); try { thread_->join(); - } catch (boost::thread_interrupted&) { - } catch (std::exception& e) { - CHECK(false) << e.what(); + } catch (...) { + return false; } } + return true; } } // namespace caffe diff --git a/src/caffe/layer_factory.cpp b/src/caffe/layer_factory.cpp index c7bb9643af6..d6a1cac5090 100644 --- a/src/caffe/layer_factory.cpp +++ b/src/caffe/layer_factory.cpp @@ -50,6 +50,12 @@ shared_ptr > GetPoolingLayer(const LayerParameter& param) { #ifdef USE_CUDNN } else if (engine == PoolingParameter_Engine_CUDNN) { PoolingParameter p_param = param.pooling_param(); + if (p_param.pad() || p_param.pad_h() || p_param.pad_w() || + param.top_size() > 1) { + LOG(INFO) << "CUDNN does not support padding or multiple tops. " + << "Using Caffe's own pooling layer."; + return shared_ptr >(new PoolingLayer(param)); + } return shared_ptr >(new CuDNNPoolingLayer(param)); #endif } else { @@ -59,45 +65,6 @@ shared_ptr > GetPoolingLayer(const LayerParameter& param) { REGISTER_LAYER_CREATOR(Pooling, GetPoolingLayer); -// Get LRN layer according to engine -template -shared_ptr > GetLRNLayer(const LayerParameter& param) { - LRNParameter_Engine engine = param.lrn_param().engine(); - - if (engine == LRNParameter_Engine_DEFAULT) { - engine = LRNParameter_Engine_CAFFE; -#ifdef USE_CUDNN - engine = LRNParameter_Engine_CUDNN; -#endif - } - - if (engine == LRNParameter_Engine_CAFFE) { - return shared_ptr >(new LRNLayer(param)); -#ifdef USE_CUDNN - } else if (engine == LRNParameter_Engine_CUDNN) { - LRNParameter lrn_param = param.lrn_param(); - - if (lrn_param.norm_region() ==LRNParameter_NormRegion_WITHIN_CHANNEL) { - // not valid for cudnn - // return shared_ptr >(new LRNLayer(param)); - return shared_ptr >(new CuDNNLCNLayer(param)); - } else { - // local size is too big to be handled through cuDNN - if (param.lrn_param().local_size() > CUDNN_LRN_MAX_N) { - return shared_ptr >(new LRNLayer(param)); - } else { - // return shared_ptr >(new LRNLayer(param)); - return shared_ptr >(new CuDNNLRNLayer(param)); - } - } -#endif - } else { - LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; - } -} - -REGISTER_LAYER_CREATOR(LRN, GetLRNLayer); - // Get relu layer according to engine. template shared_ptr > GetReLULayer(const LayerParameter& param) { diff --git a/src/caffe/layers/absval_layer.cu b/src/caffe/layers/absval_layer.cu index bb310e1afbb..91f3c77fe9a 100644 --- a/src/caffe/layers/absval_layer.cu +++ b/src/caffe/layers/absval_layer.cu @@ -18,6 +18,7 @@ template void AbsValLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { const int count = top[0]->count(); + const Dtype* top_data = top[0]->gpu_data(); const Dtype* top_diff = top[0]->gpu_diff(); if (propagate_down[0]) { const Dtype* bottom_data = bottom[0]->gpu_data(); diff --git a/src/caffe/layers/base_data_layer.cpp b/src/caffe/layers/base_data_layer.cpp index 71504c11bfd..931e4a9c0ab 100644 --- a/src/caffe/layers/base_data_layer.cpp +++ b/src/caffe/layers/base_data_layer.cpp @@ -1,9 +1,7 @@ -#include #include #include #include "caffe/data_layers.hpp" -#include "caffe/net.hpp" #include "caffe/util/io.hpp" namespace caffe { @@ -29,91 +27,54 @@ void BaseDataLayer::LayerSetUp(const vector*>& bottom, data_transformer_->InitRand(); } -template -BasePrefetchingDataLayer::BasePrefetchingDataLayer( - const LayerParameter& param) - : BaseDataLayer(param), - prefetch_free_(), prefetch_full_() { - for (int i = 0; i < PREFETCH_COUNT; ++i) { - prefetch_free_.push(&prefetch_[i]); - } -} - template void BasePrefetchingDataLayer::LayerSetUp( const vector*>& bottom, const vector*>& top) { BaseDataLayer::LayerSetUp(bottom, top); - - // Before starting the prefetch thread, we make cpu_data and gpu_data - // calls so that the prefetch thread does not accidentally make simultaneous - // cudaMalloc calls when the main thread is running. In some GPUs this - // seems to cause failures if we do not so. - for (int i = 0; i < PREFETCH_COUNT; ++i) { - prefetch_[i].data_.mutable_cpu_data(); - if (this->output_labels_) { - prefetch_[i].label_.mutable_cpu_data(); - } - } -#ifndef CPU_ONLY - if (Caffe::mode() == Caffe::GPU) { - for (int i = 0; i < PREFETCH_COUNT; ++i) { - prefetch_[i].data_.mutable_gpu_data(); - if (this->output_labels_) { - prefetch_[i].label_.mutable_gpu_data(); - } - } + // Now, start the prefetch thread. Before calling prefetch, we make two + // cpu_data calls so that the prefetch thread does not accidentally make + // simultaneous cudaMalloc calls when the main thread is running. In some + // GPUs this seems to cause failures if we do not so. + this->prefetch_data_.mutable_cpu_data(); + if (this->output_labels_) { + this->prefetch_label_.mutable_cpu_data(); } -#endif - DLOG(INFO) << "Initializing prefetch"; - this->data_transformer_->InitRand(); - StartInternalThread(); + this->CreatePrefetchThread(); DLOG(INFO) << "Prefetch initialized."; } template -void BasePrefetchingDataLayer::InternalThreadEntry() { -#ifndef CPU_ONLY - cudaStream_t stream; - if (Caffe::mode() == Caffe::GPU) { - cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); - } -#endif +void BasePrefetchingDataLayer::CreatePrefetchThread() { + this->data_transformer_->InitRand(); + CHECK(StartInternalThread()) << "Thread execution failed"; +} - try { - while (!must_stop()) { - Batch* batch = prefetch_free_.pop(); - load_batch(batch); -#ifndef CPU_ONLY - if (Caffe::mode() == Caffe::GPU) { - batch->data_.data().get()->async_gpu_push(stream); - cudaStreamSynchronize(stream); - } -#endif - prefetch_full_.push(batch); - } - } catch (boost::thread_interrupted&) { - // Interrupted exception is expected on shutdown - } +template +void BasePrefetchingDataLayer::JoinPrefetchThread() { + CHECK(WaitForInternalThreadToExit()) << "Thread joining failed"; } template void BasePrefetchingDataLayer::Forward_cpu( const vector*>& bottom, const vector*>& top) { - Batch* batch = prefetch_full_.pop("Data layer prefetch queue empty"); + // First, join the thread + JoinPrefetchThread(); + DLOG(INFO) << "Thread joined"; // Reshape to loaded data. - top[0]->Reshape(batch->data_.num(), batch->data_.channels(), - batch->data_.height(), batch->data_.width()); + top[0]->Reshape(this->prefetch_data_.num(), this->prefetch_data_.channels(), + this->prefetch_data_.height(), this->prefetch_data_.width()); // Copy the data - caffe_copy(batch->data_.count(), batch->data_.cpu_data(), + caffe_copy(prefetch_data_.count(), prefetch_data_.cpu_data(), top[0]->mutable_cpu_data()); DLOG(INFO) << "Prefetch copied"; if (this->output_labels_) { - caffe_copy(batch->label_.count(), batch->label_.cpu_data(), - top[1]->mutable_cpu_data()); + caffe_copy(prefetch_label_.count(), prefetch_label_.cpu_data(), + top[1]->mutable_cpu_data()); } - - prefetch_free_.push(batch); + // Start a new prefetch thread + DLOG(INFO) << "CreatePrefetchThread"; + CreatePrefetchThread(); } #ifdef CPU_ONLY diff --git a/src/caffe/layers/base_data_layer.cu b/src/caffe/layers/base_data_layer.cu index 52085d007a7..775f6c47f7e 100644 --- a/src/caffe/layers/base_data_layer.cu +++ b/src/caffe/layers/base_data_layer.cu @@ -7,19 +7,20 @@ namespace caffe { template void BasePrefetchingDataLayer::Forward_gpu( const vector*>& bottom, const vector*>& top) { - Batch* batch = prefetch_full_.pop("Data layer prefetch queue empty"); + // First, join the thread + JoinPrefetchThread(); // Reshape to loaded data. - top[0]->Reshape(batch->data_.num(), batch->data_.channels(), - batch->data_.height(), batch->data_.width()); + top[0]->Reshape(this->prefetch_data_.num(), this->prefetch_data_.channels(), + this->prefetch_data_.height(), this->prefetch_data_.width()); // Copy the data - caffe_copy(batch->data_.count(), batch->data_.gpu_data(), + caffe_copy(prefetch_data_.count(), prefetch_data_.cpu_data(), top[0]->mutable_gpu_data()); if (this->output_labels_) { - caffe_copy(batch->label_.count(), batch->label_.gpu_data(), + caffe_copy(prefetch_label_.count(), prefetch_label_.cpu_data(), top[1]->mutable_gpu_data()); } - - prefetch_free_.push(batch); + // Start a new prefetch thread + CreatePrefetchThread(); } INSTANTIATE_LAYER_GPU_FORWARD(BasePrefetchingDataLayer); diff --git a/src/caffe/layers/cudnn_conv_layer.cpp b/src/caffe/layers/cudnn_conv_layer.cpp index 1d89891f732..104d2b9d669 100644 --- a/src/caffe/layers/cudnn_conv_layer.cpp +++ b/src/caffe/layers/cudnn_conv_layer.cpp @@ -1,5 +1,4 @@ #ifdef USE_CUDNN -#include #include #include "caffe/filler.hpp" @@ -13,53 +12,7 @@ namespace caffe { // Set to three for the benefit of the backward pass, which // can use separate streams for calculating the gradient w.r.t. // bias, filter weights, and bottom data for each group independently -#define CUDNN_STREAMS_PER_GROUP 1 - -cudnnConvolutionFwdAlgo_t -GetCuDNNFwdAlgo(ConvolutionParameter_CuDNNFwdAlgorithm algo) { - switch (algo) { - case ConvolutionParameter_CuDNNFwdAlgorithm_IMPLICIT_GEMM: - return CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; - case ConvolutionParameter_CuDNNFwdAlgorithm_IMPLICIT_PRECOMP_GEMM: - return CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; - case ConvolutionParameter_CuDNNFwdAlgorithm_GEMM: - return CUDNN_CONVOLUTION_FWD_ALGO_GEMM; - case ConvolutionParameter_CuDNNFwdAlgorithm_DIRECT: - return CUDNN_CONVOLUTION_FWD_ALGO_DIRECT; - case ConvolutionParameter_CuDNNFwdAlgorithm_FWD_FFT: - return CUDNN_CONVOLUTION_FWD_ALGO_FFT; - default: - return CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; - } -} - -cudnnConvolutionBwdDataAlgo_t -GetCuDNNBwdDataAlgo(ConvolutionParameter_CuDNNBwdDataAlgorithm algo) { - switch (algo) { - case ConvolutionParameter_CuDNNBwdDataAlgorithm_BWD_DATA_ALGO_0: - return CUDNN_CONVOLUTION_BWD_DATA_ALGO_0; - case ConvolutionParameter_CuDNNBwdDataAlgorithm_BWD_DATA_ALGO_1: - return CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; - case ConvolutionParameter_CuDNNBwdDataAlgorithm_BWD_DATA_FFT: - return CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT; - default: - return CUDNN_CONVOLUTION_BWD_DATA_ALGO_0; - } -} - -cudnnConvolutionBwdFilterAlgo_t -GetCuDNNBwdFilterAlgo(ConvolutionParameter_CuDNNBwdFilterAlgorithm algo) { - switch (algo) { - case ConvolutionParameter_CuDNNBwdFilterAlgorithm_BWD_FILTER_ALGO_0: - return CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0; - case ConvolutionParameter_CuDNNBwdFilterAlgorithm_BWD_FILTER_ALGO_1: - return CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; - case ConvolutionParameter_CuDNNBwdFilterAlgorithm_BWD_FILTER_FFT: - return CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT; - default: - return CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0; - } -} +#define CUDNN_STREAMS_PER_GROUP 3 /** * TODO(dox) explain cuDNN interface @@ -68,30 +21,16 @@ template void CuDNNConvolutionLayer::LayerSetUp( const vector*>& bottom, const vector*>& top) { ConvolutionLayer::LayerSetUp(bottom, top); - // Initialize algorithm arrays - fwd_algo_ = new cudnnConvolutionFwdAlgo_t[bottom.size()]; - bwd_filter_algo_= new cudnnConvolutionBwdFilterAlgo_t[bottom.size()]; - bwd_data_algo_ = new cudnnConvolutionBwdDataAlgo_t[bottom.size()]; - // initialize size arrays - workspace_fwd_sizes_ = new size_t[bottom.size()]; - workspace_bwd_filter_sizes_ = new size_t[bottom.size()]; - workspace_bwd_data_sizes_ = new size_t[bottom.size()]; - // workspace data + // Initialize CUDA streams and cuDNN. + stream_ = new cudaStream_t[this->group_ * CUDNN_STREAMS_PER_GROUP]; + handle_ = new cudnnHandle_t[this->group_ * CUDNN_STREAMS_PER_GROUP]; workspaceSizeInBytes = 0; - workspaceData = NULL; - workspace = new void*[this->group_ * CUDNN_STREAMS_PER_GROUP]; - - for (size_t i = 0; i < bottom.size(); i++) { - fwd_algo_[i] = (cudnnConvolutionFwdAlgo_t)0; - bwd_filter_algo_[i] = (cudnnConvolutionBwdFilterAlgo_t)0; - bwd_data_algo_[i] = (cudnnConvolutionBwdDataAlgo_t)0; - workspace_fwd_sizes_[i] = 0; - workspace_bwd_data_sizes_[i] = 0; - workspace_bwd_filter_sizes_[i] = 0; - } + workspace = NULL; for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) { - workspace[g] = NULL; + CUDA_CHECK(cudaStreamCreate(&stream_[g])); + CUDNN_CHECK(cudnnCreate(&handle_[g])); + CUDNN_CHECK(cudnnSetStream(handle_[g], stream_[g])); } // Set the indexing parameters. @@ -134,16 +73,6 @@ void CuDNNConvolutionLayer::Reshape( top_offset_ = (this->num_output_ / this->group_) * this->height_out_ * this->width_out_; - // Specify workspace limit for kernels directly until we have a - // planning strategy and a rewrite of Caffe's GPU memory mangagement - size_t workspace_limit_bytes; - if (MemoryHandler::usingPool()) { - size_t total_memory; - MemoryHandler::getInfo(&workspace_limit_bytes, &total_memory); - } else { - workspace_limit_bytes = 8*1024*1024; - } - for (int i = 0; i < bottom.size(); i++) { cudnn::setTensor4dDesc(&bottom_descs_[i], this->num_, @@ -162,112 +91,7 @@ void CuDNNConvolutionLayer::Reshape( cudnn::setConvolutionDesc(&conv_descs_[i], bottom_descs_[i], filter_desc_, this->pad_h_, this->pad_w_, this->stride_h_, this->stride_w_); - - // choose forward and backward algorithms + workspace(s) - if (!this->layer_param_.convolution_param().has_cudnnfwdalgo()) { - CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(Caffe::cudnn_handle(), - bottom_descs_[i], - filter_desc_, - conv_descs_[i], - top_descs_[i], - CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - workspace_limit_bytes, - &fwd_algo_[i])); - } else { - fwd_algo_[i] = GetCuDNNFwdAlgo( - this->layer_param_.convolution_param().cudnnfwdalgo()); - } - - CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(Caffe::cudnn_handle(), - bottom_descs_[i], - filter_desc_, - conv_descs_[i], - top_descs_[i], - fwd_algo_[i], - &(workspace_fwd_sizes_[i]))); - - if (MemoryHandler::usingPool()) { - // restrict to only 1 convolution at a time for memory allocation purposes - size_t total_memory; - MemoryHandler::getInfo(&workspace_limit_bytes, &total_memory); - } else { - workspace_limit_bytes = 8*1024*1024; - } - // - // choose backward algorithm for filter - if (!this->layer_param_.convolution_param().has_cudnnbwdfilteralgo()) { - CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm( - Caffe::cudnn_handle(), - bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_, - CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, - workspace_limit_bytes, &bwd_filter_algo_[i]) ); - } else { - bwd_filter_algo_[i] = GetCuDNNBwdFilterAlgo( - this->layer_param_.convolution_param().cudnnbwdfilteralgo()); - } - // get workspace for backwards filter algorithm - CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize( - Caffe::cudnn_handle(), - bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_, - bwd_filter_algo_[i], &workspace_bwd_filter_sizes_[i])); - - // choose backward algo for data - if (!this->layer_param_.convolution_param().has_cudnnbwddataalgo()) { - CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm( - Caffe::cudnn_handle(), - filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i], - CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, - workspace_limit_bytes, &bwd_data_algo_[i])); - } else { - bwd_data_algo_[i] = GetCuDNNBwdDataAlgo( - this->layer_param_.convolution_param().cudnnbwddataalgo()); - } - - // get workspace size - CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize( - Caffe::cudnn_handle(), - filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i], - bwd_data_algo_[i], &workspace_bwd_data_sizes_[i]) ); - } - -#ifndef USE_CNMEM - // reduce over all workspace sizes to get a maximum to allocate / reallocate - size_t total_workspace_fwd = 0; - size_t total_workspace_bwd_data = 0; - size_t total_workspace_bwd_filter = 0; - - // sum? max? - for (size_t i = 0; i < bottom.size(); i++) { - total_workspace_fwd = std::max(total_workspace_fwd, - workspace_fwd_sizes_[i]); - total_workspace_bwd_data = std::max(total_workspace_bwd_data, - workspace_bwd_data_sizes_[i]); - total_workspace_bwd_filter = std::max(total_workspace_bwd_filter, - workspace_bwd_filter_sizes_[i]); } - size_t max_workspace = std::max(total_workspace_fwd, - total_workspace_bwd_data); - max_workspace = std::max(max_workspace, total_workspace_bwd_filter); - size_t total_max_workspace = max_workspace * - (this->group_ * CUDNN_STREAMS_PER_GROUP); - - // this is the total amount of storage needed over all groups + streams - if (total_max_workspace > workspaceSizeInBytes) { - LOG(INFO) << "Reallocating workspace storage: " << total_max_workspace; - workspaceSizeInBytes = total_max_workspace; - - // free the existing workspace and allocate a new (larger) one - MemoryHandler::freeGPU(this->workspaceData); - this->workspaceData = NULL; - - MemoryHandler::mallocGPU(&(this->workspaceData), workspaceSizeInBytes); - - // if we succeed in the allocation, set pointer aliases for workspaces - for (int g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) { - workspace[g] = reinterpret_cast(workspaceData) + g*max_workspace; - } - } -#endif // Tensor descriptor for bias. if (this->bias_term_) { @@ -291,13 +115,13 @@ CuDNNConvolutionLayer::~CuDNNConvolutionLayer() { } cudnnDestroyFilterDescriptor(filter_desc_); - cudaFree(workspaceData); - delete [] fwd_algo_; - delete [] bwd_filter_algo_; - delete [] bwd_data_algo_; - delete [] workspace_fwd_sizes_; - delete [] workspace_bwd_data_sizes_; - delete [] workspace_bwd_filter_sizes_; + for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) { + cudaStreamDestroy(stream_[g]); + cudnnDestroy(handle_[g]); + } + + delete [] stream_; + delete [] handle_; } INSTANTIATE_CLASS(CuDNNConvolutionLayer); diff --git a/src/caffe/layers/cudnn_conv_layer.cu b/src/caffe/layers/cudnn_conv_layer.cu index 61846f0dcb3..4a1a4c4f4f2 100644 --- a/src/caffe/layers/cudnn_conv_layer.cu +++ b/src/caffe/layers/cudnn_conv_layer.cu @@ -19,30 +19,66 @@ void CuDNNConvolutionLayer::Forward_gpu( Dtype* top_data = top[i]->mutable_gpu_data(); const Dtype* weight = this->blobs_[0]->gpu_data(); + size_t workspace_limit_bytes = this->kernel_h_ * + this->kernel_w_ * + this->channels_ * + sizeof(int) + 1; + // Forward through cuDNN in parallel over groups. for (int g = 0; g < this->group_; g++) { -#ifdef USE_CNMEM - MemoryHandler::mallocGPU(&workspace[0], workspace_fwd_sizes_[i]); -#endif + cudnnConvolutionFwdAlgo_t algo; + + // pick the convolution algorithm + // TODO(shelhamer) this should be done during reshape + // TODO(shelhamer) the choice of automatic or manual algorithm picking + // should be exposed in proto + CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(handle_[g], + bottom_descs_[i], + filter_desc_, + conv_descs_[i], + top_descs_[i], + CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, + workspace_limit_bytes, // memoryLimitInBytes, + &algo)); + + // get minimum size of the workspace needed for the desired algorithm + size_t workspaceSizeInBytes_temp = 0; + + CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(handle_[g], + bottom_descs_[i], + filter_desc_, + conv_descs_[i], + top_descs_[i], + algo, + &workspaceSizeInBytes_temp)); + + if (workspaceSizeInBytes_temp > workspaceSizeInBytes) { + workspaceSizeInBytes = workspaceSizeInBytes_temp; + // free the existing workspace and allocate a new (larger) one + cudaFree(this->workspace); + cudaError_t err = cudaMalloc(&(this->workspace), workspaceSizeInBytes); + if (err != cudaSuccess) { + // force zero memory path + algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; + workspace = NULL; + workspaceSizeInBytes = 0; + } + } + // Filters. - // CUDNN_CHECK(cudnnConvolutionForward(handle_[g], - CUDNN_CHECK(cudnnConvolutionForward(Caffe::cudnn_handle(), + CUDNN_CHECK(cudnnConvolutionForward(handle_[g], cudnn::dataType::one, bottom_descs_[i], bottom_data + bottom_offset_ * g, filter_desc_, weight + weight_offset_ * g, conv_descs_[i], - fwd_algo_[i], workspace[0], workspace_fwd_sizes_[i], + algo, workspace, workspaceSizeInBytes, cudnn::dataType::zero, top_descs_[i], top_data + top_offset_ * g)); -#ifdef USE_CNMEM - MemoryHandler::freeGPU(workspace[0]); - workspace[0] = NULL; -#endif // Bias. if (this->bias_term_) { const Dtype* bias_data = this->blobs_[1]->gpu_data(); - CUDNN_CHECK(cudnnAddTensor(Caffe::cudnn_handle(), CUDNN_ADD_SAME_C, + CUDNN_CHECK(cudnnAddTensor(handle_[g], CUDNN_ADD_SAME_C, cudnn::dataType::one, bias_desc_, bias_data + bias_offset_ * g, cudnn::dataType::one, @@ -53,7 +89,7 @@ void CuDNNConvolutionLayer::Forward_gpu( // Synchronize the work across groups, each of which went into its own // stream, by launching an empty kernel into the default (null) stream. // NOLINT_NEXT_LINE(whitespace/operators) - CUDA_CHECK(cudaStreamSynchronize(cudaStreamLegacy)); + sync_conv_groups<<<1, 1>>>(); } } @@ -65,12 +101,12 @@ void CuDNNConvolutionLayer::Backward_gpu(const vector*>& top, if (this->param_propagate_down_[0]) { weight = this->blobs_[0]->gpu_data(); weight_diff = this->blobs_[0]->mutable_gpu_diff(); - // caffe_gpu_set(this->blobs_[0]->count(), Dtype(0), weight_diff); + caffe_gpu_set(this->blobs_[0]->count(), Dtype(0), weight_diff); } Dtype* bias_diff = NULL; if (this->bias_term_ && this->param_propagate_down_[1]) { bias_diff = this->blobs_[1]->mutable_gpu_diff(); - // caffe_gpu_set(this->blobs_[1]->count(), Dtype(0), bias_diff); + caffe_gpu_set(this->blobs_[1]->count(), Dtype(0), bias_diff); } for (int i = 0; i < top.size(); ++i) { const Dtype* top_diff = top[i]->gpu_diff(); @@ -78,7 +114,7 @@ void CuDNNConvolutionLayer::Backward_gpu(const vector*>& top, for (int g = 0; g < this->group_; g++) { // Gradient w.r.t. bias. if (this->bias_term_ && this->param_propagate_down_[1]) { - CUDNN_CHECK(cudnnConvolutionBackwardBias(Caffe::cudnn_handle(), + CUDNN_CHECK(cudnnConvolutionBackwardBias(handle_[0*this->group_ + g], cudnn::dataType::one, top_descs_[i], top_diff + top_offset_ * g, cudnn::dataType::one, @@ -87,23 +123,14 @@ void CuDNNConvolutionLayer::Backward_gpu(const vector*>& top, // Gradient w.r.t. weights. if (this->param_propagate_down_[0]) { -#ifdef USE_CNMEM - MemoryHandler::mallocGPU(&workspace[0], workspace_bwd_filter_sizes_[i]); -#endif const Dtype* bottom_data = bottom[i]->gpu_data(); - CUDNN_CHECK(cudnnConvolutionBackwardFilter_v3( - Caffe::cudnn_handle(), + CUDNN_CHECK(cudnnConvolutionBackwardFilter(handle_[1*this->group_ + g], cudnn::dataType::one, bottom_descs_[i], bottom_data + bottom_offset_ * g, top_descs_[i], top_diff + top_offset_ * g, conv_descs_[i], - bwd_filter_algo_[i], workspace[0], workspace_bwd_filter_sizes_[i], cudnn::dataType::one, filter_desc_, weight_diff + weight_offset_ * g)); -#ifdef USE_CNMEM - MemoryHandler::freeGPU(workspace[0]); - workspace[0] = NULL; -#endif } // Gradient w.r.t. bottom data. @@ -112,29 +139,20 @@ void CuDNNConvolutionLayer::Backward_gpu(const vector*>& top, weight = this->blobs_[0]->gpu_data(); } Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); -#ifdef USE_CNMEM - MemoryHandler::mallocGPU(&workspace[0], workspace_bwd_data_sizes_[i]); -#endif - CUDNN_CHECK(cudnnConvolutionBackwardData_v3( - Caffe::cudnn_handle(), + CUDNN_CHECK(cudnnConvolutionBackwardData(handle_[2*this->group_ + g], cudnn::dataType::one, filter_desc_, weight + weight_offset_ * g, top_descs_[i], top_diff + top_offset_ * g, conv_descs_[i], - bwd_data_algo_[i], workspace[0], workspace_bwd_data_sizes_[i], cudnn::dataType::zero, bottom_descs_[i], bottom_diff + bottom_offset_ * g)); -#ifdef USE_CNMEM - MemoryHandler::freeGPU(workspace[0]); - workspace[0] = NULL; -#endif } } // Synchronize the work across groups, each of which went into its own // stream, by launching an empty kernel into the default (null) stream. // NOLINT_NEXT_LINE(whitespace/operators) - CUDA_CHECK(cudaStreamSynchronize(cudaStreamLegacy)); + sync_conv_groups<<<1, 1>>>(); } } diff --git a/src/caffe/layers/cudnn_lcn_layer.cpp b/src/caffe/layers/cudnn_lcn_layer.cpp deleted file mode 100644 index 4e104642a4b..00000000000 --- a/src/caffe/layers/cudnn_lcn_layer.cpp +++ /dev/null @@ -1,82 +0,0 @@ -#ifdef USE_CUDNN -#include - -#include "caffe/filler.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/im2col.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void CuDNNLCNLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - LRNLayer::LayerSetUp(bottom, top); - - CUDNN_CHECK(cudnnCreateLRNDescriptor(&norm_desc_)); - cudnn::createTensor4dDesc(&bottom_desc_); - cudnn::createTensor4dDesc(&top_desc_); - - // create a LRN handle - handles_setup_ = true; - - size_ = this->layer_param().lrn_param().local_size(); - pre_pad_ = (size_ - 1) / 2; - alpha_ = this->layer_param().lrn_param().alpha(); - beta_ = this->layer_param().lrn_param().beta(); - k_ = this->layer_param().lrn_param().k(); -} - -template -void CuDNNLCNLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - LRNLayer::Reshape(bottom, top); - cudnn::setTensor4dDesc(&bottom_desc_, bottom[0]->num(), - this->channels_, this->height_, this->width_); - cudnn::setTensor4dDesc(&top_desc_, bottom[0]->num(), - this->channels_, this->height_, this->width_); - CUDNN_CHECK(cudnnSetLRNDescriptor(norm_desc_, size_, alpha_, beta_, k_)); - - // allocate / reallocate tempData buffers - size_t totalSizeInBytes = sizeof(Dtype)*bottom[0]->num()* \ - this->channels_*this->height_*this->width_; - -#ifdef USE_CNMEM - this->tempDataSize = totalSizeInBytes; -#else - if (totalSizeInBytes > tempDataSize) { - tempDataSize = totalSizeInBytes; - - MemoryHandler::freeGPU(tempData1); - MemoryHandler::freeGPU(tempData2); - tempData1 = NULL; - tempData2 = NULL; - - // allocate new buffers - MemoryHandler::mallocGPU(&tempData1, totalSizeInBytes); - MemoryHandler::mallocGPU(&tempData2, totalSizeInBytes); - } -#endif -} - -template -CuDNNLCNLayer::~CuDNNLCNLayer() { - // Check that handles have been setup before destroying. - if (!handles_setup_) { return; } - - CUDNN_CHECK(cudnnDestroyTensorDescriptor(bottom_desc_)); - CUDNN_CHECK(cudnnDestroyTensorDescriptor(top_desc_)); - - // destroy LRN handle - CUDNN_CHECK(cudnnDestroyLRNDescriptor(norm_desc_)); - - // free temp buffers - if (tempData1 != NULL) cudaFree(tempData1); - if (tempData2 != NULL) cudaFree(tempData2); -} - -INSTANTIATE_CLASS(CuDNNLCNLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_lcn_layer.cu b/src/caffe/layers/cudnn_lcn_layer.cu deleted file mode 100644 index 245b967edd4..00000000000 --- a/src/caffe/layers/cudnn_lcn_layer.cu +++ /dev/null @@ -1,74 +0,0 @@ -#ifdef USE_CUDNN -#include - -#include "caffe/filler.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/im2col.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void CuDNNLCNLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - -#ifdef USE_CNMEM - MemoryHandler::mallocGPU(&this->tempData1, this->tempDataSize); - MemoryHandler::mallocGPU(&this->tempData2, this->tempDataSize); -#endif - - CUDNN_CHECK(cudnnDivisiveNormalizationForward( - Caffe::cudnn_handle(), norm_desc_, CUDNN_DIVNORM_PRECOMPUTED_MEANS, - cudnn::dataType::one, - bottom_desc_, bottom_data, - NULL, // srcMeansData - this->tempData1, this->tempData2, - cudnn::dataType::zero, - top_desc_, top_data) ); - -#ifdef USE_CNMEM - MemoryHandler::freeGPU(this->tempData1); - MemoryHandler::freeGPU(this->tempData2); - this->tempData1 = NULL; - this->tempData2 = NULL; -#endif -} - -template -void CuDNNLCNLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - -#ifdef USE_CNMEM - MemoryHandler::mallocGPU(&this->tempData1, this->tempDataSize); - MemoryHandler::mallocGPU(&this->tempData2, this->tempDataSize); -#endif - - CUDNN_CHECK(cudnnDivisiveNormalizationBackward( - Caffe::cudnn_handle(), norm_desc_, CUDNN_DIVNORM_PRECOMPUTED_MEANS, - cudnn::dataType::one, - bottom_desc_, bottom_data, - NULL, top_diff, // NULL - srcMeansData - this->tempData1, this->tempData2, - cudnn::dataType::zero, - bottom_desc_, bottom_diff, - NULL) ); - -#ifdef USE_CNMEM - MemoryHandler::freeGPU(this->tempData1); - MemoryHandler::freeGPU(this->tempData2); - this->tempData1 = NULL; - this->tempData2 = NULL; -#endif -} - -INSTANTIATE_LAYER_GPU_FUNCS(CuDNNLCNLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_lrn_layer.cpp b/src/caffe/layers/cudnn_lrn_layer.cpp deleted file mode 100644 index c263dae1c2d..00000000000 --- a/src/caffe/layers/cudnn_lrn_layer.cpp +++ /dev/null @@ -1,57 +0,0 @@ -#ifdef USE_CUDNN -#include - -#include "caffe/filler.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/im2col.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void CuDNNLRNLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - LRNLayer::LayerSetUp(bottom, top); - - // CUDNN_CHECK(cudnnCreate(&handle_)); - CUDNN_CHECK(cudnnCreateLRNDescriptor(&norm_desc_)); - cudnn::createTensor4dDesc(&bottom_desc_); - cudnn::createTensor4dDesc(&top_desc_); - - // create a LRN handle - handles_setup_ = true; - - size_ = this->layer_param().lrn_param().local_size(); - alpha_ = this->layer_param().lrn_param().alpha(); - beta_ = this->layer_param().lrn_param().beta(); - k_ = this->layer_param().lrn_param().k(); -} - -template -void CuDNNLRNLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - LRNLayer::Reshape(bottom, top); - cudnn::setTensor4dDesc(&bottom_desc_, bottom[0]->num(), - this->channels_, this->height_, this->width_); - cudnn::setTensor4dDesc(&top_desc_, bottom[0]->num(), - this->channels_, this->height_, this->width_); - CUDNN_CHECK(cudnnSetLRNDescriptor(norm_desc_, size_, alpha_, beta_, k_)); -} - -template -CuDNNLRNLayer::~CuDNNLRNLayer() { - // Check that handles have been setup before destroying. - if (!handles_setup_) { return; } - - cudnnDestroyTensorDescriptor(bottom_desc_); - cudnnDestroyTensorDescriptor(top_desc_); - - // destroy LRN handle - CUDNN_CHECK(cudnnDestroyLRNDescriptor(norm_desc_)); -} - -INSTANTIATE_CLASS(CuDNNLRNLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_lrn_layer.cu b/src/caffe/layers/cudnn_lrn_layer.cu deleted file mode 100644 index 10b8a05250c..00000000000 --- a/src/caffe/layers/cudnn_lrn_layer.cu +++ /dev/null @@ -1,48 +0,0 @@ -#ifdef USE_CUDNN -#include - -#include "caffe/filler.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/im2col.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void CuDNNLRNLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - - CUDNN_CHECK(cudnnLRNCrossChannelForward( - Caffe::cudnn_handle(), norm_desc_, CUDNN_LRN_CROSS_CHANNEL_DIM1, - cudnn::dataType::one, - bottom_desc_, bottom_data, - cudnn::dataType::zero, - top_desc_, top_data) ); -} - -template -void CuDNNLRNLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - - CUDNN_CHECK(cudnnLRNCrossChannelBackward( - Caffe::cudnn_handle(), norm_desc_, CUDNN_LRN_CROSS_CHANNEL_DIM1, - cudnn::dataType::one, - top_desc_, top_data, - top_desc_, top_diff, - bottom_desc_, bottom_data, - cudnn::dataType::zero, - bottom_desc_, bottom_diff) ); -} - -INSTANTIATE_LAYER_GPU_FUNCS(CuDNNLRNLayer); - -}; // namespace caffe - -#endif diff --git a/src/caffe/layers/cudnn_pooling_layer.cpp b/src/caffe/layers/cudnn_pooling_layer.cpp index d5b9cd0c179..c92c4e477b5 100644 --- a/src/caffe/layers/cudnn_pooling_layer.cpp +++ b/src/caffe/layers/cudnn_pooling_layer.cpp @@ -13,6 +13,7 @@ template void CuDNNPoolingLayer::LayerSetUp(const vector*>& bottom, const vector*>& top) { PoolingLayer::LayerSetUp(bottom, top); + CUDNN_CHECK(cudnnCreate(&handle_)); cudnn::createTensor4dDesc(&bottom_desc_); cudnn::createTensor4dDesc(&top_desc_); cudnn::createPoolingDesc(&pooling_desc_, @@ -40,6 +41,7 @@ CuDNNPoolingLayer::~CuDNNPoolingLayer() { cudnnDestroyTensorDescriptor(bottom_desc_); cudnnDestroyTensorDescriptor(top_desc_); cudnnDestroyPoolingDescriptor(pooling_desc_); + cudnnDestroy(handle_); } INSTANTIATE_CLASS(CuDNNPoolingLayer); diff --git a/src/caffe/layers/cudnn_pooling_layer.cu b/src/caffe/layers/cudnn_pooling_layer.cu index 9b8e6aee497..a952b855a48 100644 --- a/src/caffe/layers/cudnn_pooling_layer.cu +++ b/src/caffe/layers/cudnn_pooling_layer.cu @@ -14,7 +14,7 @@ void CuDNNPoolingLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = top[0]->mutable_gpu_data(); - CUDNN_CHECK(cudnnPoolingForward(Caffe::cudnn_handle(), pooling_desc_, + CUDNN_CHECK(cudnnPoolingForward(handle_, pooling_desc_, cudnn::dataType::one, bottom_desc_, bottom_data, cudnn::dataType::zero, @@ -31,7 +31,7 @@ void CuDNNPoolingLayer::Backward_gpu(const vector*>& top, const Dtype* top_data = top[0]->gpu_data(); const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - CUDNN_CHECK(cudnnPoolingBackward(Caffe::cudnn_handle(), pooling_desc_, + CUDNN_CHECK(cudnnPoolingBackward(handle_, pooling_desc_, cudnn::dataType::one, top_desc_, top_data, top_desc_, top_diff, bottom_desc_, bottom_data, diff --git a/src/caffe/layers/cudnn_relu_layer.cpp b/src/caffe/layers/cudnn_relu_layer.cpp index 4dd9e6bfe8a..759d83984ef 100644 --- a/src/caffe/layers/cudnn_relu_layer.cpp +++ b/src/caffe/layers/cudnn_relu_layer.cpp @@ -12,6 +12,7 @@ void CuDNNReLULayer::LayerSetUp(const vector*>& bottom, const vector*>& top) { ReLULayer::LayerSetUp(bottom, top); // initialize cuDNN + CUDNN_CHECK(cudnnCreate(&handle_)); cudnn::createTensor4dDesc(&bottom_desc_); cudnn::createTensor4dDesc(&top_desc_); handles_setup_ = true; @@ -36,6 +37,7 @@ CuDNNReLULayer::~CuDNNReLULayer() { cudnnDestroyTensorDescriptor(this->bottom_desc_); cudnnDestroyTensorDescriptor(this->top_desc_); + cudnnDestroy(this->handle_); } INSTANTIATE_CLASS(CuDNNReLULayer); diff --git a/src/caffe/layers/cudnn_relu_layer.cu b/src/caffe/layers/cudnn_relu_layer.cu index 1664d649b9c..21d14857dd2 100644 --- a/src/caffe/layers/cudnn_relu_layer.cu +++ b/src/caffe/layers/cudnn_relu_layer.cu @@ -17,7 +17,7 @@ void CuDNNReLULayer::Forward_gpu(const vector*>& bottom, const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = top[0]->mutable_gpu_data(); - CUDNN_CHECK(cudnnActivationForward(Caffe::cudnn_handle(), + CUDNN_CHECK(cudnnActivationForward(this->handle_, CUDNN_ACTIVATION_RELU, cudnn::dataType::one, this->bottom_desc_, bottom_data, @@ -42,7 +42,7 @@ void CuDNNReLULayer::Backward_gpu(const vector*>& top, const Dtype* top_diff = top[0]->gpu_diff(); const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - CUDNN_CHECK(cudnnActivationBackward(Caffe::cudnn_handle(), + CUDNN_CHECK(cudnnActivationBackward(this->handle_, CUDNN_ACTIVATION_RELU, cudnn::dataType::one, this->top_desc_, top_data, this->top_desc_, top_diff, diff --git a/src/caffe/layers/cudnn_sigmoid_layer.cpp b/src/caffe/layers/cudnn_sigmoid_layer.cpp index b9ba8903ebb..32637873d46 100644 --- a/src/caffe/layers/cudnn_sigmoid_layer.cpp +++ b/src/caffe/layers/cudnn_sigmoid_layer.cpp @@ -12,6 +12,7 @@ void CuDNNSigmoidLayer::LayerSetUp(const vector*>& bottom, const vector*>& top) { SigmoidLayer::LayerSetUp(bottom, top); // initialize cuDNN + CUDNN_CHECK(cudnnCreate(&handle_)); cudnn::createTensor4dDesc(&bottom_desc_); cudnn::createTensor4dDesc(&top_desc_); handles_setup_ = true; @@ -36,6 +37,7 @@ CuDNNSigmoidLayer::~CuDNNSigmoidLayer() { cudnnDestroyTensorDescriptor(this->bottom_desc_); cudnnDestroyTensorDescriptor(this->top_desc_); + cudnnDestroy(this->handle_); } INSTANTIATE_CLASS(CuDNNSigmoidLayer); diff --git a/src/caffe/layers/cudnn_sigmoid_layer.cu b/src/caffe/layers/cudnn_sigmoid_layer.cu index bcf38da6c4e..7a06cf721da 100644 --- a/src/caffe/layers/cudnn_sigmoid_layer.cu +++ b/src/caffe/layers/cudnn_sigmoid_layer.cu @@ -12,7 +12,7 @@ void CuDNNSigmoidLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = top[0]->mutable_gpu_data(); - CUDNN_CHECK(cudnnActivationForward(Caffe::cudnn_handle(), + CUDNN_CHECK(cudnnActivationForward(this->handle_, CUDNN_ACTIVATION_SIGMOID, cudnn::dataType::one, this->bottom_desc_, bottom_data, @@ -32,7 +32,7 @@ void CuDNNSigmoidLayer::Backward_gpu(const vector*>& top, const Dtype* top_diff = top[0]->gpu_diff(); const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - CUDNN_CHECK(cudnnActivationBackward(Caffe::cudnn_handle(), + CUDNN_CHECK(cudnnActivationBackward(this->handle_, CUDNN_ACTIVATION_SIGMOID, cudnn::dataType::one, this->top_desc_, top_data, this->top_desc_, top_diff, diff --git a/src/caffe/layers/cudnn_softmax_layer.cpp b/src/caffe/layers/cudnn_softmax_layer.cpp index 20f9c4ed46f..77a3225adcd 100644 --- a/src/caffe/layers/cudnn_softmax_layer.cpp +++ b/src/caffe/layers/cudnn_softmax_layer.cpp @@ -16,6 +16,7 @@ void CuDNNSoftmaxLayer::LayerSetUp(const vector*>& bottom, const vector*>& top) { SoftmaxLayer::LayerSetUp(bottom, top); // Initialize CUDNN. + CUDNN_CHECK(cudnnCreate(&handle_)); cudnn::createTensor4dDesc(&bottom_desc_); cudnn::createTensor4dDesc(&top_desc_); handles_setup_ = true; @@ -40,6 +41,7 @@ CuDNNSoftmaxLayer::~CuDNNSoftmaxLayer() { cudnnDestroyTensorDescriptor(bottom_desc_); cudnnDestroyTensorDescriptor(top_desc_); + cudnnDestroy(handle_); } INSTANTIATE_CLASS(CuDNNSoftmaxLayer); diff --git a/src/caffe/layers/cudnn_softmax_layer.cu b/src/caffe/layers/cudnn_softmax_layer.cu index 9a921ba7e96..a9e2fcefaf7 100644 --- a/src/caffe/layers/cudnn_softmax_layer.cu +++ b/src/caffe/layers/cudnn_softmax_layer.cu @@ -16,7 +16,7 @@ void CuDNNSoftmaxLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = top[0]->mutable_gpu_data(); - CUDNN_CHECK(cudnnSoftmaxForward(Caffe::cudnn_handle(), CUDNN_SOFTMAX_ACCURATE, + CUDNN_CHECK(cudnnSoftmaxForward(handle_, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL, cudnn::dataType::one, bottom_desc_, bottom_data, @@ -33,8 +33,7 @@ void CuDNNSoftmaxLayer::Backward_gpu(const vector*>& top, const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - CUDNN_CHECK(cudnnSoftmaxBackward( - Caffe::cudnn_handle(), CUDNN_SOFTMAX_ACCURATE, + CUDNN_CHECK(cudnnSoftmaxBackward(handle_, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL, cudnn::dataType::one, top_desc_, top_data, top_desc_, top_diff, diff --git a/src/caffe/layers/cudnn_tanh_layer.cpp b/src/caffe/layers/cudnn_tanh_layer.cpp index 62afc6da7e4..376faad324d 100644 --- a/src/caffe/layers/cudnn_tanh_layer.cpp +++ b/src/caffe/layers/cudnn_tanh_layer.cpp @@ -12,6 +12,7 @@ void CuDNNTanHLayer::LayerSetUp(const vector*>& bottom, const vector*>& top) { TanHLayer::LayerSetUp(bottom, top); // initialize cuDNN + CUDNN_CHECK(cudnnCreate(&handle_)); cudnn::createTensor4dDesc(&bottom_desc_); cudnn::createTensor4dDesc(&top_desc_); handles_setup_ = true; @@ -36,6 +37,7 @@ CuDNNTanHLayer::~CuDNNTanHLayer() { cudnnDestroyTensorDescriptor(this->bottom_desc_); cudnnDestroyTensorDescriptor(this->top_desc_); + cudnnDestroy(this->handle_); } INSTANTIATE_CLASS(CuDNNTanHLayer); diff --git a/src/caffe/layers/cudnn_tanh_layer.cu b/src/caffe/layers/cudnn_tanh_layer.cu index d4e6c8a08bc..d287f6fee85 100644 --- a/src/caffe/layers/cudnn_tanh_layer.cu +++ b/src/caffe/layers/cudnn_tanh_layer.cu @@ -12,7 +12,7 @@ void CuDNNTanHLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = top[0]->mutable_gpu_data(); - CUDNN_CHECK(cudnnActivationForward(Caffe::cudnn_handle(), + CUDNN_CHECK(cudnnActivationForward(this->handle_, CUDNN_ACTIVATION_TANH, cudnn::dataType::one, this->bottom_desc_, bottom_data, @@ -33,7 +33,7 @@ void CuDNNTanHLayer::Backward_gpu(const vector*>& top, const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - CUDNN_CHECK(cudnnActivationBackward(Caffe::cudnn_handle(), + CUDNN_CHECK(cudnnActivationBackward(this->handle_, CUDNN_ACTIVATION_TANH, cudnn::dataType::one, this->top_desc_, top_data, this->top_desc_, top_diff, diff --git a/src/caffe/layers/data_layer.cpp b/src/caffe/layers/data_layer.cpp index 9c23ba0cd29..0f2d66776a9 100644 --- a/src/caffe/layers/data_layer.cpp +++ b/src/caffe/layers/data_layer.cpp @@ -11,25 +11,36 @@ #include "caffe/proto/caffe.pb.h" #include "caffe/util/benchmark.hpp" #include "caffe/util/io.hpp" +#include "caffe/util/math_functions.hpp" +#include "caffe/util/rng.hpp" namespace caffe { template -DataLayer::DataLayer(const LayerParameter& param) - : BasePrefetchingDataLayer(param), - reader_(param) { -} - -template -DataLayer::~DataLayer() { - this->StopInternalThread(); +DataLayer::~DataLayer() { + this->JoinPrefetchThread(); } template void DataLayer::DataLayerSetUp(const vector*>& bottom, const vector*>& top) { + // Initialize DB + db_.reset(db::GetDB(this->layer_param_.data_param().backend())); + db_->Open(this->layer_param_.data_param().source(), db::READ); + cursor_.reset(db_->NewCursor()); + + // Check if we should randomly skip a few data points + if (this->layer_param_.data_param().rand_skip()) { + unsigned int skip = caffe_rng_rand() % + this->layer_param_.data_param().rand_skip(); + LOG(INFO) << "Skipping first " << skip << " data points."; + while (skip-- > 0) { + cursor_->Next(); + } + } // Read a data point, and use it to initialize the top blob. - Datum& datum = *(reader_.full().peek()); + Datum datum; + datum.ParseFromString(cursor_->value()); bool force_color = this->layer_param_.data_param().force_encoded_color(); if ((force_color && DecodeDatum(&datum, true)) || @@ -37,49 +48,42 @@ void DataLayer::DataLayerSetUp(const vector*>& bottom, LOG(INFO) << "Decoding Datum"; } // image - const int crop_size = this->layer_param_.transform_param().crop_size(); - const int batch_size = this->layer_param_.data_param().batch_size(); + int crop_size = this->layer_param_.transform_param().crop_size(); if (crop_size > 0) { - top[0]->Reshape(batch_size, datum.channels(), crop_size, crop_size); - for (int i = 0; i < this->PREFETCH_COUNT; ++i) { - this->prefetch_[i].data_.Reshape(batch_size, datum.channels(), - crop_size, crop_size); - } - this->transformed_data_.Reshape(1, datum.channels(), - crop_size, crop_size); + top[0]->Reshape(this->layer_param_.data_param().batch_size(), + datum.channels(), crop_size, crop_size); + this->prefetch_data_.Reshape(this->layer_param_.data_param().batch_size(), + datum.channels(), crop_size, crop_size); + this->transformed_data_.Reshape(1, datum.channels(), crop_size, crop_size); } else { - top[0]->Reshape(batch_size, datum.channels(), + top[0]->Reshape( + this->layer_param_.data_param().batch_size(), datum.channels(), datum.height(), datum.width()); - for (int i = 0; i < this->PREFETCH_COUNT; ++i) { - this->prefetch_[i].data_.Reshape(batch_size, datum.channels(), - datum.height(), datum.width()); - } + this->prefetch_data_.Reshape(this->layer_param_.data_param().batch_size(), + datum.channels(), datum.height(), datum.width()); this->transformed_data_.Reshape(1, datum.channels(), - datum.height(), datum.width()); + datum.height(), datum.width()); } LOG(INFO) << "output data size: " << top[0]->num() << "," << top[0]->channels() << "," << top[0]->height() << "," << top[0]->width(); // label if (this->output_labels_) { - vector label_shape(1, batch_size); + vector label_shape(1, this->layer_param_.data_param().batch_size()); top[1]->Reshape(label_shape); - for (int i = 0; i < this->PREFETCH_COUNT; ++i) { - this->prefetch_[i].label_.Reshape(label_shape); - } + this->prefetch_label_.Reshape(label_shape); } } -// This function is called on prefetch thread -template -void DataLayer::load_batch(Batch* batch) { +// This function is used to create a thread that prefetches the data. +template +void DataLayer::InternalThreadEntry() { CPUTimer batch_timer; batch_timer.Start(); - double deque_time = 0; - double decod_time = 0; + double read_time = 0; double trans_time = 0; CPUTimer timer; - CHECK(batch->data_.count()); + CHECK(this->prefetch_data_.count()); CHECK(this->transformed_data_.count()); // Reshape on single input batches for inputs of varying dimension. @@ -87,7 +91,8 @@ void DataLayer::load_batch(Batch* batch) { const int crop_size = this->layer_param_.transform_param().crop_size(); bool force_color = this->layer_param_.data_param().force_encoded_color(); if (batch_size == 1 && crop_size == 0) { - Datum& datum = *(reader_.full().peek()); + Datum datum; + datum.ParseFromString(cursor_->value()); if (datum.encoded()) { if (force_color) { DecodeDatum(&datum, true); @@ -95,25 +100,24 @@ void DataLayer::load_batch(Batch* batch) { DecodeDatumNative(&datum); } } - batch->data_.Reshape(1, datum.channels(), + this->prefetch_data_.Reshape(1, datum.channels(), datum.height(), datum.width()); this->transformed_data_.Reshape(1, datum.channels(), datum.height(), datum.width()); } - Dtype* top_data = batch->data_.mutable_cpu_data(); + Dtype* top_data = this->prefetch_data_.mutable_cpu_data(); Dtype* top_label = NULL; // suppress warnings about uninitialized variables if (this->output_labels_) { - top_label = batch->label_.mutable_cpu_data(); + top_label = this->prefetch_label_.mutable_cpu_data(); } for (int item_id = 0; item_id < batch_size; ++item_id) { - // get a blob timer.Start(); - Datum& datum = *(reader_.full().pop("Waiting for data")); - deque_time += timer.MicroSeconds(); + // get a blob + Datum datum; + datum.ParseFromString(cursor_->value()); - timer.Start(); cv::Mat cv_img; if (datum.encoded()) { if (force_color) { @@ -128,11 +132,11 @@ void DataLayer::load_batch(Batch* batch) { << "convert_imageset."; } } - decod_time += timer.MicroSeconds(); + read_time += timer.MicroSeconds(); + timer.Start(); // Apply data transformations (mirror, scale, crop...) - timer.Start(); - int offset = batch->data_.offset(item_id); + int offset = this->prefetch_data_.offset(item_id); this->transformed_data_.set_cpu_data(top_data + offset); if (datum.encoded()) { this->data_transformer_->Transform(cv_img, &(this->transformed_data_)); @@ -143,17 +147,17 @@ void DataLayer::load_batch(Batch* batch) { top_label[item_id] = datum.label(); } trans_time += timer.MicroSeconds(); - - reader_.free().push(const_cast(&datum)); + // go to the next iter + cursor_->Next(); + if (!cursor_->valid()) { + DLOG(INFO) << "Restarting data prefetching from start."; + cursor_->SeekToFirst(); + } } batch_timer.Stop(); - -#ifdef BENCHMARK_DATA - LOG(INFO) << "Prefetch batch: " << batch_timer.MilliSeconds() << " ms."; - LOG(INFO) << " Dequeue time: " << deque_time / 1000 << " ms."; - LOG(INFO) << " Decode time: " << decod_time / 1000 << " ms."; - LOG(INFO) << "Transform time: " << trans_time / 1000 << " ms."; -#endif + DLOG(INFO) << "Prefetch batch: " << batch_timer.MilliSeconds() << " ms."; + DLOG(INFO) << " Read time: " << read_time / 1000 << " ms."; + DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms."; } INSTANTIATE_CLASS(DataLayer); diff --git a/src/caffe/layers/dropout_layer.cu b/src/caffe/layers/dropout_layer.cu index 552d1ff2cf7..f9ea04f4acf 100644 --- a/src/caffe/layers/dropout_layer.cu +++ b/src/caffe/layers/dropout_layer.cu @@ -30,11 +30,11 @@ void DropoutLayer::Forward_gpu(const vector*>& bottom, unsigned int* mask = static_cast(rand_vec_.mutable_gpu_data()); caffe_gpu_rng_uniform(count, mask); - CUDA_POST_KERNEL_CHECK; // set thresholds // NOLINT_NEXT_LINE(whitespace/operators) DropoutForward<<>>( count, bottom_data, mask, uint_thres_, scale_, top_data); + CUDA_POST_KERNEL_CHECK; } else { caffe_copy(count, bottom_data, top_data); } diff --git a/src/caffe/layers/image_data_layer.cpp b/src/caffe/layers/image_data_layer.cpp index 50187bbe5ce..38ebbd5ec14 100644 --- a/src/caffe/layers/image_data_layer.cpp +++ b/src/caffe/layers/image_data_layer.cpp @@ -17,7 +17,7 @@ namespace caffe { template ImageDataLayer::~ImageDataLayer() { - this->StopInternalThread(); + this->JoinPrefetchThread(); } template @@ -70,14 +70,11 @@ void ImageDataLayer::DataLayerSetUp(const vector*>& bottom, const int batch_size = this->layer_param_.image_data_param().batch_size(); if (crop_size > 0) { top[0]->Reshape(batch_size, channels, crop_size, crop_size); - for (int i = 0; i < this->PREFETCH_COUNT; ++i) - this->prefetch_[i].data_.Reshape(batch_size, channels, - crop_size, crop_size); + this->prefetch_data_.Reshape(batch_size, channels, crop_size, crop_size); this->transformed_data_.Reshape(1, channels, crop_size, crop_size); } else { top[0]->Reshape(batch_size, channels, height, width); - for (int i = 0; i < this->PREFETCH_COUNT; ++i) - this->prefetch_[i].data_.Reshape(batch_size, channels, height, width); + this->prefetch_data_.Reshape(batch_size, channels, height, width); this->transformed_data_.Reshape(1, channels, height, width); } LOG(INFO) << "output data size: " << top[0]->num() << "," @@ -86,9 +83,7 @@ void ImageDataLayer::DataLayerSetUp(const vector*>& bottom, // label vector label_shape(1, batch_size); top[1]->Reshape(label_shape); - for (int i = 0; i < this->PREFETCH_COUNT; ++i) { - this->prefetch_[i].label_.Reshape(label_shape); - } + this->prefetch_label_.Reshape(label_shape); } template @@ -98,15 +93,15 @@ void ImageDataLayer::ShuffleImages() { shuffle(lines_.begin(), lines_.end(), prefetch_rng); } -// This function is called on prefetch thread +// This function is used to create a thread that prefetches the data. template -void ImageDataLayer::load_batch(Batch* batch) { +void ImageDataLayer::InternalThreadEntry() { CPUTimer batch_timer; batch_timer.Start(); double read_time = 0; double trans_time = 0; CPUTimer timer; - CHECK(batch->data_.count()); + CHECK(this->prefetch_data_.count()); CHECK(this->transformed_data_.count()); ImageDataParameter image_data_param = this->layer_param_.image_data_param(); const int batch_size = image_data_param.batch_size(); @@ -120,14 +115,14 @@ void ImageDataLayer::load_batch(Batch* batch) { if (batch_size == 1 && crop_size == 0 && new_height == 0 && new_width == 0) { cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first, 0, 0, is_color); - batch->data_.Reshape(1, cv_img.channels(), + this->prefetch_data_.Reshape(1, cv_img.channels(), cv_img.rows, cv_img.cols); this->transformed_data_.Reshape(1, cv_img.channels(), cv_img.rows, cv_img.cols); } - Dtype* prefetch_data = batch->data_.mutable_cpu_data(); - Dtype* prefetch_label = batch->label_.mutable_cpu_data(); + Dtype* prefetch_data = this->prefetch_data_.mutable_cpu_data(); + Dtype* prefetch_label = this->prefetch_label_.mutable_cpu_data(); // datum scales const int lines_size = lines_.size(); @@ -141,7 +136,7 @@ void ImageDataLayer::load_batch(Batch* batch) { read_time += timer.MicroSeconds(); timer.Start(); // Apply transformations (mirror, crop...) to the image - int offset = batch->data_.offset(item_id); + int offset = this->prefetch_data_.offset(item_id); this->transformed_data_.set_cpu_data(prefetch_data + offset); this->data_transformer_->Transform(cv_img, &(this->transformed_data_)); trans_time += timer.MicroSeconds(); diff --git a/src/caffe/layers/lrn_layer.cpp b/src/caffe/layers/lrn_layer.cpp index ba51a5eecb5..36c1ace4c99 100644 --- a/src/caffe/layers/lrn_layer.cpp +++ b/src/caffe/layers/lrn_layer.cpp @@ -254,6 +254,6 @@ STUB_GPU_BACKWARD(LRNLayer, CrossChannelBackward); #endif INSTANTIATE_CLASS(LRNLayer); -// REGISTER_LAYER_CLASS(LRN); +REGISTER_LAYER_CLASS(LRN); } // namespace caffe diff --git a/src/caffe/layers/window_data_layer.cpp b/src/caffe/layers/window_data_layer.cpp index f637f2ec6d4..c127d56bc46 100644 --- a/src/caffe/layers/window_data_layer.cpp +++ b/src/caffe/layers/window_data_layer.cpp @@ -27,7 +27,7 @@ namespace caffe { template WindowDataLayer::~WindowDataLayer() { - this->StopInternalThread(); + this->JoinPrefetchThread(); } template @@ -171,9 +171,7 @@ void WindowDataLayer::DataLayerSetUp(const vector*>& bottom, CHECK_GT(crop_size, 0); const int batch_size = this->layer_param_.window_data_param().batch_size(); top[0]->Reshape(batch_size, channels, crop_size, crop_size); - for (int i = 0; i < this->PREFETCH_COUNT; ++i) - this->prefetch_[i].data_.Reshape( - batch_size, channels, crop_size, crop_size); + this->prefetch_data_.Reshape(batch_size, channels, crop_size, crop_size); LOG(INFO) << "output data size: " << top[0]->num() << "," << top[0]->channels() << "," << top[0]->height() << "," @@ -181,9 +179,7 @@ void WindowDataLayer::DataLayerSetUp(const vector*>& bottom, // label vector label_shape(1, batch_size); top[1]->Reshape(label_shape); - for (int i = 0; i < this->PREFETCH_COUNT; ++i) { - this->prefetch_[i].label_.Reshape(label_shape); - } + this->prefetch_label_.Reshape(label_shape); // data mean has_mean_file_ = this->transform_param_.has_mean_file(); @@ -221,9 +217,9 @@ unsigned int WindowDataLayer::PrefetchRand() { return (*prefetch_rng)(); } -// This function is called on prefetch thread +// Thread fetching the data template -void WindowDataLayer::load_batch(Batch* batch) { +void WindowDataLayer::InternalThreadEntry() { // At each iteration, sample N windows where N*p are foreground (object) // windows and N*(1-p) are background (non-object) windows CPUTimer batch_timer; @@ -231,8 +227,8 @@ void WindowDataLayer::load_batch(Batch* batch) { double read_time = 0; double trans_time = 0; CPUTimer timer; - Dtype* top_data = batch->data_.mutable_cpu_data(); - Dtype* top_label = batch->label_.mutable_cpu_data(); + Dtype* top_data = this->prefetch_data_.mutable_cpu_data(); + Dtype* top_label = this->prefetch_label_.mutable_cpu_data(); const Dtype scale = this->layer_param_.window_data_param().scale(); const int batch_size = this->layer_param_.window_data_param().batch_size(); const int context_pad = this->layer_param_.window_data_param().context_pad(); @@ -256,7 +252,7 @@ void WindowDataLayer::load_batch(Batch* batch) { bool use_square = (crop_mode == "square") ? true : false; // zero out batch - caffe_set(batch->data_.count(), Dtype(0), top_data); + caffe_set(this->prefetch_data_.count(), Dtype(0), top_data); const int num_fg = static_cast(static_cast(batch_size) * fg_fraction); diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index aa266cd73d2..a18ee63818e 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -8,7 +8,6 @@ #include "caffe/common.hpp" #include "caffe/layer.hpp" #include "caffe/net.hpp" -#include "caffe/parallel.hpp" #include "caffe/proto/caffe.pb.h" #include "caffe/util/insert_splits.hpp" #include "caffe/util/io.hpp" @@ -40,13 +39,8 @@ void Net::Init(const NetParameter& in_param) { // the current NetState. NetParameter filtered_param; FilterNet(in_param, &filtered_param); - if (phase_ == TRAIN) { - caffe::P2PSync::divide_batch_size(&filtered_param); - } - if (Caffe::root_solver()) { - LOG(INFO) << "Initializing net from parameters: " << std::endl - << filtered_param.DebugString(); - } + LOG(INFO) << "Initializing net from parameters: " << std::endl + << filtered_param.DebugString(); // Create a copy of filtered_param with splits added where necessary. NetParameter param; InsertSplits(filtered_param, ¶m); @@ -70,9 +64,7 @@ void Net::Init(const NetParameter& in_param) { const int layer_id = -1; // inputs have fake layer ID -1 AppendTop(param, layer_id, input_id, &available_blobs, &blob_name_to_idx); } - if (Caffe::root_solver()) { - DLOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype); - } + DLOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype); // For each layer, set up its input and output bottom_vecs_.resize(param.layer_size()); top_vecs_.resize(param.layer_size()); @@ -95,9 +87,7 @@ void Net::Init(const NetParameter& in_param) { } layers_.push_back(LayerRegistry::CreateLayer(layer_param)); layer_names_.push_back(layer_param.name()); - if (Caffe::root_solver()) { - LOG(INFO) << "Creating Layer " << layer_param.name(); - } + LOG(INFO) << "Creating Layer " << layer_param.name(); bool need_backward = false; // Figure out this layer's input and output @@ -127,30 +117,20 @@ void Net::Init(const NetParameter& in_param) { } } // After this layer is connected, set it up. - if (Caffe::root_solver()) { - LOG(INFO) << "Setting up " << layer_names_[layer_id]; - } + LOG(INFO) << "Setting up " << layer_names_[layer_id]; layers_[layer_id]->SetUp(bottom_vecs_[layer_id], top_vecs_[layer_id]); for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) { if (blob_loss_weights_.size() <= top_id_vecs_[layer_id][top_id]) { blob_loss_weights_.resize(top_id_vecs_[layer_id][top_id] + 1, Dtype(0)); } blob_loss_weights_[top_id_vecs_[layer_id][top_id]] = layer->loss(top_id); - if (Caffe::root_solver()) { - LOG(INFO) << "Top shape: " - << top_vecs_[layer_id][top_id]->shape_string(); - } + LOG(INFO) << "Top shape: " << top_vecs_[layer_id][top_id]->shape_string(); if (layer->loss(top_id)) { - if (Caffe::root_solver()) { - LOG(INFO) << " with loss weight " << layer->loss(top_id); - } + LOG(INFO) << " with loss weight " << layer->loss(top_id); } memory_used_ += top_vecs_[layer_id][top_id]->count(); } - if (Caffe::root_solver()) { - DLOG(INFO) << "Memory required for data: " - << memory_used_ * sizeof(Dtype); - } + DLOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype); const int param_size = layer_param.param_size(); const int num_param_blobs = layers_[layer_id]->blobs().size(); CHECK_LE(param_size, num_param_blobs) @@ -209,14 +189,10 @@ void Net::Init(const NetParameter& in_param) { } if (!layer_contributes_loss) { layer_need_backward_[layer_id] = false; } if (layer_need_backward_[layer_id]) { - if (Caffe::root_solver()) { - LOG(INFO) << layer_names_[layer_id] << " needs backward computation."; - } + LOG(INFO) << layer_names_[layer_id] << " needs backward computation."; } else { - if (Caffe::root_solver()) { - LOG(INFO) << layer_names_[layer_id] - << " does not need backward computation."; - } + LOG(INFO) << layer_names_[layer_id] + << " does not need backward computation."; } for (int bottom_id = 0; bottom_id < bottom_vecs_[layer_id].size(); ++bottom_id) { @@ -256,9 +232,7 @@ void Net::Init(const NetParameter& in_param) { // In the end, all remaining blobs are considered output blobs. for (set::iterator it = available_blobs.begin(); it != available_blobs.end(); ++it) { - if (Caffe::root_solver()) { - LOG(INFO) << "This network produces output " << *it; - } + LOG(INFO) << "This network produces output " << *it; net_output_blobs_.push_back(blobs_[blob_name_to_idx[*it]].get()); net_output_blob_indices_.push_back(blob_name_to_idx[*it]); } @@ -270,10 +244,8 @@ void Net::Init(const NetParameter& in_param) { } GetLearningRateAndWeightDecay(); debug_info_ = param.debug_info(); - if (Caffe::root_solver()) { - LOG(INFO) << "Network initialization done."; - LOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype); - } + LOG(INFO) << "Network initialization done."; + LOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype); } template @@ -312,33 +284,27 @@ bool Net::StateMeetsRule(const NetState& state, // Check whether the rule is broken due to phase. if (rule.has_phase()) { if (rule.phase() != state.phase()) { - if (Caffe::root_solver()) { - LOG(INFO) << "The NetState phase (" << state.phase() - << ") differed from the phase (" << rule.phase() - << ") specified by a rule in layer " << layer_name; - } + LOG(INFO) << "The NetState phase (" << state.phase() + << ") differed from the phase (" << rule.phase() + << ") specified by a rule in layer " << layer_name; return false; } } // Check whether the rule is broken due to min level. if (rule.has_min_level()) { if (state.level() < rule.min_level()) { - if (Caffe::root_solver()) { - LOG(INFO) << "The NetState level (" << state.level() - << ") is above the min_level (" << rule.min_level() - << ") specified by a rule in layer " << layer_name; - } + LOG(INFO) << "The NetState level (" << state.level() + << ") is above the min_level (" << rule.min_level() + << ") specified by a rule in layer " << layer_name; return false; } } // Check whether the rule is broken due to max level. if (rule.has_max_level()) { if (state.level() > rule.max_level()) { - if (Caffe::root_solver()) { - LOG(INFO) << "The NetState level (" << state.level() - << ") is above the max_level (" << rule.max_level() - << ") specified by a rule in layer " << layer_name; - } + LOG(INFO) << "The NetState level (" << state.level() + << ") is above the max_level (" << rule.max_level() + << ") specified by a rule in layer " << layer_name; return false; } } @@ -351,10 +317,8 @@ bool Net::StateMeetsRule(const NetState& state, if (rule.stage(i) == state.stage(j)) { has_stage = true; } } if (!has_stage) { - if (Caffe::root_solver()) { - LOG(INFO) << "The NetState did not contain stage '" << rule.stage(i) - << "' specified by a rule in layer " << layer_name; - } + LOG(INFO) << "The NetState did not contain stage '" << rule.stage(i) + << "' specified by a rule in layer " << layer_name; return false; } } @@ -367,10 +331,8 @@ bool Net::StateMeetsRule(const NetState& state, if (rule.not_stage(i) == state.stage(j)) { has_stage = true; } } if (has_stage) { - if (Caffe::root_solver()) { - LOG(INFO) << "The NetState contained a not_stage '" << rule.not_stage(i) - << "' specified by a rule in layer " << layer_name; - } + LOG(INFO) << "The NetState contained a not_stage '" << rule.not_stage(i) + << "' specified by a rule in layer " << layer_name; return false; } } @@ -392,9 +354,7 @@ void Net::AppendTop(const NetParameter& param, const int layer_id, if (blob_name_to_idx && layer_param && layer_param->bottom_size() > top_id && blob_name == layer_param->bottom(top_id)) { // In-place computation - if (Caffe::root_solver()) { - LOG(INFO) << layer_param->name() << " -> " << blob_name << " (in-place)"; - } + LOG(INFO) << layer_param->name() << " -> " << blob_name << " (in-place)"; top_vecs_[layer_id].push_back(blobs_[(*blob_name_to_idx)[blob_name]].get()); top_id_vecs_[layer_id].push_back((*blob_name_to_idx)[blob_name]); } else if (blob_name_to_idx && @@ -404,12 +364,10 @@ void Net::AppendTop(const NetParameter& param, const int layer_id, LOG(FATAL) << "Duplicate blobs produced by multiple sources."; } else { // Normal output. - if (Caffe::root_solver()) { - if (layer_param) { - LOG(INFO) << layer_param->name() << " -> " << blob_name; - } else { - LOG(INFO) << "Input " << top_id << " -> " << blob_name; - } + if (layer_param) { + LOG(INFO) << layer_param->name() << " -> " << blob_name; + } else { + LOG(INFO) << "Input " << top_id << " -> " << blob_name; } shared_ptr > blob_pointer(new Blob()); const int blob_id = blobs_.size(); @@ -449,9 +407,7 @@ int Net::AppendBottom(const NetParameter& param, const int layer_id, << " (at index " << bottom_id << ") to layer " << layer_id; } const int blob_id = (*blob_name_to_idx)[blob_name]; - if (Caffe::root_solver()) { - LOG(INFO) << layer_names_[layer_id] << " <- " << blob_name; - } + LOG(INFO) << layer_names_[layer_id] << " <- " << blob_name; bottom_vecs_[layer_id].push_back(blobs_[blob_id].get()); bottom_id_vecs_[layer_id].push_back(blob_id); available_blobs->erase(blob_name); @@ -500,11 +456,9 @@ void Net::AppendParam(const NetParameter& param, const int layer_id, param_layer_indices_[owner_net_param_id]; const int owner_layer_id = owner_index.first; const int owner_param_id = owner_index.second; - if (Caffe::root_solver()) { - LOG(INFO) << "Sharing parameters '" << param_name << "' owned by " - << "layer '" << layer_names_[owner_layer_id] << "', param " - << "index " << owner_param_id; - } + LOG(INFO) << "Sharing parameters '" << param_name << "' owned by " + << "layer '" << layer_names_[owner_layer_id] << "', param " + << "index " << owner_param_id; Blob* this_blob = layers_[layer_id]->blobs()[param_id].get(); Blob* owner_blob = layers_[owner_layer_id]->blobs()[owner_param_id].get(); @@ -525,9 +479,7 @@ void Net::AppendParam(const NetParameter& param, const int layer_id, template void Net::GetLearningRateAndWeightDecay() { - if (Caffe::root_solver()) { - LOG(INFO) << "Collecting Learning Rate and Weight Decay."; - } + LOG(INFO) << "Collecting Learning Rate and Weight Decay."; ParamSpec default_param_spec; for (int i = 0; i < layers_.size(); ++i) { vector > >& layer_blobs = layers_[i]->blobs(); @@ -553,7 +505,6 @@ Dtype Net::ForwardFromTo(int start, int end) { } for (int i = start; i <= end; ++i) { // LOG(ERROR) << "Forwarding " << layer_names_[i]; - layers_[i]->Reshape(bottom_vecs_[i], top_vecs_[i]); Dtype layer_loss = layers_[i]->Forward(bottom_vecs_[i], top_vecs_[i]); loss += layer_loss; if (debug_info_) { ForwardDebugInfo(i); } @@ -618,7 +569,6 @@ void Net::BackwardFromTo(int start, int end) { CHECK_LT(start, layers_.size()); for (int i = start; i >= end; --i) { if (layer_need_backward_[i]) { - layers_[i]->Reshape(bottom_vecs_[i], top_vecs_[i]); layers_[i]->Backward( top_vecs_[i], bottom_need_backward_[i], bottom_vecs_[i]); if (debug_info_) { BackwardDebugInfo(i); } @@ -631,10 +581,8 @@ void Net::InputDebugInfo(const int input_id) { const Blob& blob = *net_input_blobs_[input_id]; const string& blob_name = blob_names_[net_input_blob_indices_[input_id]]; const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); - if (Caffe::root_solver()) { - LOG(INFO) << " [Forward] " - << "Input " << blob_name << " data: " << data_abs_val_mean; - } + LOG(INFO) << " [Forward] " + << "Input " << blob_name << " data: " << data_abs_val_mean; } template @@ -643,12 +591,9 @@ void Net::ForwardDebugInfo(const int layer_id) { const Blob& blob = *top_vecs_[layer_id][top_id]; const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]]; const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); - if (Caffe::root_solver()) { - LOG(INFO) << " [Forward] " - << "Layer " << layer_names_[layer_id] - << ", top blob " << blob_name - << " data: " << data_abs_val_mean; - } + LOG(INFO) << " [Forward] " + << "Layer " << layer_names_[layer_id] << ", top blob " << blob_name + << " data: " << data_abs_val_mean; } for (int param_id = 0; param_id < layers_[layer_id]->blobs().size(); ++param_id) { @@ -656,12 +601,9 @@ void Net::ForwardDebugInfo(const int layer_id) { const int net_param_id = param_id_vecs_[layer_id][param_id]; const string& blob_name = param_display_names_[net_param_id]; const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); - if (Caffe::root_solver()) { - LOG(INFO) << " [Forward] " - << "Layer " << layer_names_[layer_id] - << ", param blob " << blob_name - << " data: " << data_abs_val_mean; - } + LOG(INFO) << " [Forward] " + << "Layer " << layer_names_[layer_id] << ", param blob " << blob_name + << " data: " << data_abs_val_mean; } } @@ -673,24 +615,18 @@ void Net::BackwardDebugInfo(const int layer_id) { const Blob& blob = *bottom_vec[bottom_id]; const string& blob_name = blob_names_[bottom_id_vecs_[layer_id][bottom_id]]; const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count(); - if (Caffe::root_solver()) { - LOG(INFO) << " [Backward] " - << "Layer " << layer_names_[layer_id] - << ", bottom blob " << blob_name - << " diff: " << diff_abs_val_mean; - } + LOG(INFO) << " [Backward] " + << "Layer " << layer_names_[layer_id] << ", bottom blob " << blob_name + << " diff: " << diff_abs_val_mean; } for (int param_id = 0; param_id < layers_[layer_id]->blobs().size(); ++param_id) { if (!layers_[layer_id]->param_propagate_down(param_id)) { continue; } const Blob& blob = *layers_[layer_id]->blobs()[param_id]; const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count(); - if (Caffe::root_solver()) { - LOG(INFO) << " [Backward] " - << "Layer " << layer_names_[layer_id] - << ", param blob " << param_id - << " diff: " << diff_abs_val_mean; - } + LOG(INFO) << " [Backward] " + << "Layer " << layer_names_[layer_id] << ", param blob " << param_id + << " diff: " << diff_abs_val_mean; } } @@ -703,22 +639,17 @@ void Net::UpdateDebugInfo(const int param_id) { const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count(); if (param_owner < 0) { const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); - if (Caffe::root_solver()) { - LOG(INFO) << " [Update] Layer " << layer_name - << ", param " << param_display_name - << " data: " << data_abs_val_mean - << "; diff: " << diff_abs_val_mean; - } + LOG(INFO) << " [Update] Layer " << layer_name + << ", param " << param_display_name + << " data: " << data_abs_val_mean << "; diff: " << diff_abs_val_mean; } else { const string& owner_layer_name = layer_names_[param_layer_indices_[param_owner].first]; - if (Caffe::root_solver()) { - LOG(INFO) << " [Update] Layer " << layer_name - << ", param blob " << param_display_name - << " (owned by layer " << owner_layer_name << ", " << "param " - << param_display_names_[param_owners_[param_id]] << ")" - << " diff: " << diff_abs_val_mean; - } + LOG(INFO) << " [Update] Layer " << layer_name + << ", param blob " << param_display_name + << " (owned by layer " << owner_layer_name << ", " + << "param " << param_display_names_[param_owners_[param_id]] << ")" + << " diff: " << diff_abs_val_mean; } } @@ -775,8 +706,8 @@ void Net::Backward() { const Dtype l2norm_data = std::sqrt(sumsq_data); const Dtype l2norm_diff = std::sqrt(sumsq_diff); LOG(ERROR) << " [Backward] All net params (data, diff): " - << "L1 norm = (" << asum_data << ", " << asum_diff << "); " - << "L2 norm = (" << l2norm_data << ", " << l2norm_diff << ")"; + << "L1 norm = (" << asum_data << ", " << asum_diff << "); " + << "L2 norm = (" << l2norm_data << ", " << l2norm_diff << ")"; } } diff --git a/src/caffe/parallel.cpp b/src/caffe/parallel.cpp deleted file mode 100644 index 6ac0ff0406d..00000000000 --- a/src/caffe/parallel.cpp +++ /dev/null @@ -1,526 +0,0 @@ -#ifndef CPU_ONLY -#include -#endif -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "boost/thread.hpp" -#include "caffe/caffe.hpp" -#include "caffe/parallel.hpp" - -namespace caffe { - -enum Op { - copy, - replace_cpu, - replace_gpu, - replace_cpu_diff, - replace_gpu_diff -}; - -template -static void apply_buffers(const vector > >& blobs, - Dtype* buffer, size_t total_size, Op op) { - Dtype* ptr = buffer; - for (int i = 0; i < blobs.size(); ++i) { - int size = blobs[i]->count(); - switch (op) { - case copy: { - // Init buffer to current values of blobs - caffe_copy(size, - reinterpret_cast(blobs[i]->data()->cpu_data()), - ptr); - break; - } - case replace_cpu: - blobs[i]->data()->set_cpu_data(ptr); - break; - case replace_gpu: - blobs[i]->data()->set_gpu_data(ptr); - break; - case replace_cpu_diff: - blobs[i]->diff()->set_cpu_data(ptr); - break; - case replace_gpu_diff: - blobs[i]->diff()->set_gpu_data(ptr); - break; - } - ptr += size; - } - CHECK_EQ(total_size, ptr - buffer); -} - -// Buffer size necessary to store given blobs -template -static size_t total_size(const vector > >& params) { - size_t size = 0; - for (int i = 0; i < params.size(); ++i) - size += params[i]->count(); - return size; -} - -template -Params::Params(shared_ptr > root_solver) - : size_(total_size(root_solver->net()->params())), - data_(), - diff_() { -} - -template -GPUParams::GPUParams(shared_ptr > root_solver, int device) - : Params(root_solver) { -#ifndef CPU_ONLY - int initial_device; - CUDA_CHECK(cudaGetDevice(&initial_device)); - - // Allocate device buffers - CUDA_CHECK(cudaSetDevice(device)); - buffer_device_ = device; - // CUDA_CHECK(cudaMalloc(&data_, size_ * sizeof(Dtype))); - MemoryHandler::mallocGPU(reinterpret_cast(&data_), - size_ * sizeof(Dtype)); - - // Copy blob values - const vector > >& net = root_solver->net()->params(); - apply_buffers(net, data_, size_, copy); - - // CUDA_CHECK(cudaMalloc(&diff_, size_ * sizeof(Dtype))); - MemoryHandler::mallocGPU(reinterpret_cast(&diff_), - size_ * sizeof(Dtype)); - caffe_gpu_set(size_, Dtype(0), diff_); - - CUDA_CHECK(cudaSetDevice(initial_device)); -#else - NO_GPU; -#endif -} - -template -GPUParams::~GPUParams() { -#ifndef CPU_ONLY - int initial_device; - cudaGetDevice(&initial_device); - cudaSetDevice(buffer_device_); - MemoryHandler::freeGPU(data_); - MemoryHandler::freeGPU(diff_); - data_ = NULL; - diff_ = NULL; - cudaSetDevice(initial_device); -#endif -} - -template -void GPUParams::configure(Solver* solver) const { - const vector > >& net = solver->net()->params(); - apply_buffers(net, data_, size_, replace_gpu); - apply_buffers(net, diff_, size_, replace_gpu_diff); -} - -// - -void DevicePair::compute(const vector devices, vector* pairs) { -#ifndef CPU_ONLY - vector remaining(devices); - - // Depth for reduction tree - int remaining_depth = static_cast(ceil(log2(remaining.size()))); - - // Group GPUs by board - for (int d = 0; d < remaining_depth; ++d) { - for (int i = 0; i < remaining.size(); ++i) { - for (int j = i + 1; j < remaining.size(); ++j) { - cudaDeviceProp a, b; - CUDA_CHECK(cudaGetDeviceProperties(&a, remaining[i])); - CUDA_CHECK(cudaGetDeviceProperties(&b, remaining[j])); - if (a.isMultiGpuBoard && b.isMultiGpuBoard) { - if (a.multiGpuBoardGroupID == b.multiGpuBoardGroupID) { - pairs->push_back(DevicePair(remaining[i], remaining[j])); - DLOG(INFO) << "GPU board: " << remaining[i] - << ":" << remaining[j]; - remaining.erase(remaining.begin() + j); - break; - } - } - } - } - } - ostringstream s; - for (int i = 0; i < remaining.size(); ++i) { - s << (i ? ", " : "") << remaining[i]; - } - DLOG(INFO) << "GPUs paired by boards, remaining: " << s.str(); - - // Group by P2P accessibility - remaining_depth = ceil(log2(remaining.size())); - for (int d = 0; d < remaining_depth; ++d) { - for (int i = 0; i < remaining.size(); ++i) { - for (int j = i + 1; j < remaining.size(); ++j) { - int access; - CUDA_CHECK(cudaDeviceCanAccessPeer(&access, - remaining[i], - remaining[j])); - if (access) { - pairs->push_back(DevicePair(remaining[i], remaining[j])); - DLOG(INFO) << "P2P pair: " << remaining[i] - << ":" << remaining[j]; - remaining.erase(remaining.begin() + j); - break; - } - } - } - } - s.str(""); - for (int i = 0; i < remaining.size(); ++i) { - s << (i ? ", " : "") << remaining[i]; - } - DLOG(INFO) << "GPUs paired by P2P access, remaining: " << s.str(); - - // Group remaining - remaining_depth = ceil(log2(remaining.size())); - for (int d = 0; d < remaining_depth; ++d) { - for (int i = 0; i < remaining.size(); ++i) { - pairs->push_back(DevicePair(remaining[i], remaining[i+1])); - DLOG(INFO) << "Remaining pair: " << remaining[i] - << ":" << remaining[i+1]; - remaining.erase(remaining.begin() + i+1); - } - } - - // Should only be the parent node remaining - CHECK_EQ(remaining.size(), 1); - - pairs->insert(pairs->begin(), DevicePair(-1, remaining[0])); - - CHECK(pairs->size() == devices.size()); - for (int i = 0; i < pairs->size(); ++i) { - CHECK((*pairs)[i].parent() != (*pairs)[i].device()); - for (int j = i + 1; j < pairs->size(); ++j) { - CHECK((*pairs)[i].device() != (*pairs)[j].device()); - } - } -#else - NO_GPU; -#endif -} - -// - -template -P2PSync::P2PSync(shared_ptr > root_solver, - P2PSync* parent, const SolverParameter& param) - : GPUParams(root_solver, param.device_id()), - parent_(parent), - children_(), - queue_(), - initial_iter_(root_solver->iter()), - solver_() { -#ifndef CPU_ONLY - int initial_device; - CUDA_CHECK(cudaGetDevice(&initial_device)); - const int self = param.device_id(); - CUDA_CHECK(cudaSetDevice(self)); - - if (parent == NULL) { - solver_ = root_solver; - } else { - Caffe::set_root_solver(false); - solver_.reset(new Solver(param)); - Caffe::set_root_solver(true); - } - this->configure(solver_.get()); - solver_->add_callback(this); - - if (parent) { - // Enable p2p access between devices - const int peer = parent->solver_->param().device_id(); - int access; - CUDA_CHECK(cudaDeviceCanAccessPeer(&access, self, peer)); - if (access) { - CUDA_CHECK(cudaDeviceEnablePeerAccess(peer, 0)); - } else { - LOG(INFO)<< "GPU " << self << " does not have p2p access to GPU " << peer; - } - // Allocate receiving buffer on parent - CUDA_CHECK(cudaSetDevice(peer)); - MemoryHandler::mallocGPU(reinterpret_cast(&parent_grads_), - size_ * sizeof(Dtype)); - CUDA_CHECK(cudaSetDevice(self)); - } - - CUDA_CHECK(cudaSetDevice(initial_device)); -#else - NO_GPU; -#endif -} - -template -P2PSync::~P2PSync() { -#ifndef CPU_ONLY - int initial_device; - CUDA_CHECK(cudaGetDevice(&initial_device)); - const int self = solver_->param().device_id(); - CUDA_CHECK(cudaSetDevice(self)); - - if (parent_) { - const int peer = parent_->solver_->param().device_id(); - cudaSetDevice(peer); - MemoryHandler::freeGPU(parent_grads_); - parent_grads_ = NULL; - cudaSetDevice(self); - int access; - CUDA_CHECK(cudaDeviceCanAccessPeer(&access, self, peer)); - if (access) { - CUDA_CHECK(cudaDeviceDisablePeerAccess(peer)); - } - } - - CUDA_CHECK(cudaSetDevice(initial_device)); -#endif -} - -template -void P2PSync::InternalThreadEntry() { - Caffe::SetDevice(solver_->param().device_id()); - CHECK(Caffe::root_solver()); - Caffe::set_root_solver(false); - // See if there is a defined seed and reset random state if so - if (solver_->param().random_seed() >= 0) { - // Fetch random seed and modulate by device ID to make sure - // everyone doesn't have the same seed. We seem to have some - // solver instability if we have everyone with the same seed - Caffe::set_random_seed( - solver_->param().random_seed() + solver_->param().device_id()); - } - solver_->Step(solver_->param().max_iter() - initial_iter_); -} - -template -void P2PSync::on_start(Timer* timer, ostringstream* timing) { -#ifndef CPU_ONLY -#ifdef DEBUG - int device; - CUDA_CHECK(cudaGetDevice(&device)); - CHECK(device == solver_->param().device_id()); -#else -// CHECK(false); -#endif - - // Wait for update from parent - if (parent_) { - timer->Start(); - P2PSync *parent = queue_.pop(); - CHECK(parent == parent_); - *timing << " recv_param: " << timer->MilliSeconds(); - } - - // Update children - if (children_.size()) { - timer->Start(); - } - for (int i = children_.size() - 1; i >= 0; i--) { - Dtype* src = data_; - Dtype* dst = children_[i]->data_; - -#ifdef DEBUG - cudaPointerAttributes attributes; - CUDA_CHECK(cudaPointerGetAttributes(&attributes, src)); - CHECK(attributes.device == device); - CUDA_CHECK(cudaPointerGetAttributes(&attributes, dst)); - CHECK(attributes.device == children_[i]->solver_->param().device_id()); -#endif - - CUDA_CHECK(cudaMemcpyAsync(dst, src, size_ * sizeof(Dtype), // - cudaMemcpyDeviceToDevice, cudaStreamDefault)); - CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault)); - children_[i]->queue_.push(this); - } - if (children_.size()) { - *timing << " send_param: " << timer->MilliSeconds(); - } -#endif -} - -template -void P2PSync::on_gradients_ready(Timer* timer, ostringstream* timing) { -#ifndef CPU_ONLY -#ifdef DEBUG - int device; - CUDA_CHECK(cudaGetDevice(&device)); - CHECK(device == solver_->param().device_id()); -#endif - - // Sum children gradients as they appear in the queue - for (int i = 0; i < children_.size(); ++i) { - timer->Start(); - P2PSync *child = queue_.pop(); - Dtype* src = child->parent_grads_; - Dtype* dst = diff_; - -#ifdef DEBUG - bool ok = false; - for (int j = 0; j < children_.size(); ++j) { - if (child == children_[j]) { - ok = true; - } - } - CHECK(ok); - cudaPointerAttributes attributes; - CUDA_CHECK(cudaPointerGetAttributes(&attributes, src)); - CHECK(attributes.device == device); - CUDA_CHECK(cudaPointerGetAttributes(&attributes, dst)); - CHECK(attributes.device == device); -#endif - - caffe_gpu_add(size_, src, dst, dst); - *timing << " add_grad: " << timer->MilliSeconds(); - } - - // Send gradients to parent - if (parent_) { - timer->Start(); - Dtype* src = diff_; - Dtype* dst = parent_grads_; - -#ifdef DEBUG - cudaPointerAttributes attributes; - CUDA_CHECK(cudaPointerGetAttributes(&attributes, src)); - CHECK(attributes.device == device); - CUDA_CHECK(cudaPointerGetAttributes(&attributes, dst)); - CHECK(attributes.device == parent_->solver_->param().device_id()); -#endif - - CUDA_CHECK(cudaMemcpyAsync(dst, src, size_ * sizeof(Dtype), // - cudaMemcpyDeviceToDevice, cudaStreamDefault)); - CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault)); - parent_->queue_.push(this); - *timing << " send_grad: " << timer->MilliSeconds(); - } else { - // Loss functions divide gradients by the batch size, so to compensate - // for split batch, the root solver divides by number of solvers. - caffe_gpu_scal(size_, Dtype(1.0 / Caffe::solver_count()), diff_); - } -#endif -} - -template -void P2PSync::run(shared_ptr > root, - const vector& gpus) { - // Pair devices for map-reduce synchronization - vector pairs; - DevicePair::compute(gpus, &pairs); - ostringstream s; - for (int i = 1; i < pairs.size(); ++i) { - s << (i == 1 ? "" : ", ") << pairs[i].parent() << ":" << pairs[i].device(); - } - LOG(INFO)<< "GPUs pairs " << s.str(); - - SolverParameter param(root->param()); - vector > > syncs(gpus.size()); - syncs[0].reset(new P2PSync(root, NULL, param)); - - // Build the GPU tree by finding the parent for each solver - for (int attempts = 0; attempts < pairs.size(); ++attempts) { - for (int i = 1; i < pairs.size(); ++i) { - if (!syncs[i].get()) { - P2PSync* parent = NULL; - for (int j = 0; j < syncs.size(); ++j) { - if (syncs[j]) { - const SolverParameter& p = syncs[j]->solver()->param(); - if (p.device_id() == pairs[i].parent()) { - parent = (P2PSync*) syncs[j].get(); - } - } - } - if (parent) { - param.set_device_id(pairs[i].device()); - syncs[i].reset(new P2PSync(root, parent, param)); - parent->children_.push_back((P2PSync*) syncs[i].get()); - } - } - } - } - - LOG(INFO)<< "Starting Optimization"; - - for (int i = 1; i < syncs.size(); ++i) { - syncs[i]->StartInternalThread(); - } - - // Run root solver on current thread - syncs[0]->solver_->Solve(); - - for (int i = 1; i < syncs.size(); ++i) { - syncs[i]->StopInternalThread(); - } -} - -template -void P2PSync::divide_batch_size(NetParameter* net) { - int solver_count = Caffe::solver_count(); - for (int i = 0; i < net->layer_size(); ++i) { - string m = "Batch size must be divisible by the number of solvers (GPUs)"; - if (net->layer(i).has_data_param()) { - if (net->layer(i).data_param().has_batch_size()) { - uint32_t total = net->layer(i).data_param().batch_size(); - uint32_t batch = total / solver_count; - CHECK(batch * solver_count == total) << m; - net->mutable_layer(i)->mutable_data_param()->set_batch_size(batch); - - // Also adjust the prefetch count, as it is shared by all solvers - uint32_t prefetch = net->layer(i).data_param().prefetch(); - net->mutable_layer(i)->mutable_data_param()->set_prefetch( - prefetch * solver_count); - } - } - if (net->layer(i).has_hdf5_data_param()) { - if (net->layer(i).hdf5_data_param().has_batch_size()) { - uint32_t total = net->layer(i).hdf5_data_param().batch_size(); - uint32_t batch = total / solver_count; - CHECK(batch * solver_count == total) << m; - net->mutable_layer(i)->mutable_hdf5_data_param()->set_batch_size(batch); - } - } - if (net->layer(i).has_image_data_param()) { - if (net->layer(i).image_data_param().has_batch_size()) { - uint32_t total = net->layer(i).image_data_param().batch_size(); - uint32_t batch = total / solver_count; - CHECK(batch * solver_count == total) << m; - net->mutable_layer(i)->mutable_image_data_param()->set_batch_size( - batch); - } - } - if (net->layer(i).has_memory_data_param()) { - if (net->layer(i).memory_data_param().has_batch_size()) { - uint32_t total = net->layer(i).memory_data_param().batch_size(); - uint32_t batch = total / solver_count; - CHECK(batch * solver_count == total) << m; - net->mutable_layer(i)->mutable_memory_data_param()->set_batch_size( - batch); - } - } - if (net->layer(i).has_window_data_param()) { - if (net->layer(i).window_data_param().has_batch_size()) { - uint32_t total = net->layer(i).window_data_param().batch_size(); - uint32_t batch = total / solver_count; - CHECK(batch * solver_count == total) << m; - net->mutable_layer(i)->mutable_window_data_param()->set_batch_size( - batch); - } - } - } -} - -INSTANTIATE_CLASS(Params); -INSTANTIATE_CLASS(GPUParams); -INSTANTIATE_CLASS(P2PSync); - -} // namespace caffe - diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index 21a38d54ffc..307015f42c9 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -440,28 +440,6 @@ message ConvolutionParameter { CUDNN = 2; } optional Engine engine = 15 [default = DEFAULT]; - enum CuDNNFwdAlgorithm { - IMPLICIT_GEMM = 0; - IMPLICIT_PRECOMP_GEMM = 1; - GEMM = 2; - DIRECT = 3; - FWD_FFT = 4; - } - optional CuDNNFwdAlgorithm CuDNNFwdAlgo = 16 [default = IMPLICIT_GEMM]; - enum CuDNNBwdFilterAlgorithm { - BWD_FILTER_ALGO_0 = 0; - BWD_FILTER_ALGO_1 = 1; - BWD_FILTER_FFT = 2; - } - optional CuDNNBwdFilterAlgorithm CuDNNBwdFilterAlgo = 17 [default = BWD_FILTER_ALGO_0]; - enum CuDNNBwdDataAlgorithm { - BWD_DATA_ALGO_0 = 0; - BWD_DATA_ALGO_1 = 1; - BWD_DATA_FFT = 2; - BWD_DATA_ALGO_2 = 3; - } - optional CuDNNBwdDataAlgorithm CuDNNBwdDataAlgo = 18 [default = BWD_DATA_ALGO_0]; - optional uint32 CuDNNWorkspaceSize = 19 [default = 10485760]; // 10MB default } message DataParameter { @@ -477,7 +455,6 @@ message DataParameter { // to avoid all asynchronous sgd clients to start at the same point. The skip // point would be set as rand_skip * rand(0,1). Note that rand_skip should not // be larger than the number of keys in the database. - // DEPRECATED. Each solver accesses a different subset of the database. optional uint32 rand_skip = 7 [default = 0]; optional DB backend = 8 [default = LEVELDB]; // DEPRECATED. See TransformationParameter. For data pre-processing, we can do @@ -493,9 +470,6 @@ message DataParameter { optional bool mirror = 6 [default = false]; // Force the encoded image to have 3 color channels optional bool force_encoded_color = 9 [default = false]; - // Prefetch queue (Number of batches to prefetch to host memory, increase if - // data access bandwidth varies). - optional uint32 prefetch = 10 [default = 4]; } message DropoutParameter { @@ -629,12 +603,6 @@ message LRNParameter { } optional NormRegion norm_region = 4 [default = ACROSS_CHANNELS]; optional float k = 5 [default = 1.]; - enum Engine { - DEFAULT = 0; - CAFFE = 1; - CUDNN = 2; - } - optional Engine engine = 6 [default = DEFAULT]; } message MemoryDataParameter { diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index 926490365ff..877b19b86f8 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -15,13 +15,13 @@ namespace caffe { template Solver::Solver(const SolverParameter& param) - : net_(), callbacks_(), iteration_timer_(), iterations_last_() { + : net_() { Init(param); } template Solver::Solver(const string& param_file) - : net_(), callbacks_(), iteration_timer_(), iterations_last_() { + : net_() { SolverParameter param; ReadProtoFromTextFileOrDie(param_file, ¶m); Init(param); @@ -29,21 +29,17 @@ Solver::Solver(const string& param_file) template void Solver::Init(const SolverParameter& param) { - if (Caffe::root_solver()) { - LOG(INFO) << "Initializing solver from parameters: " << std::endl - << param.DebugString(); - } + LOG(INFO) << "Initializing solver from parameters: " << std::endl + << param.DebugString(); param_ = param; CHECK_GE(param_.average_loss(), 1) << "average_loss should be non-negative."; - if (Caffe::root_solver() && param_.random_seed() >= 0) { + if (param_.random_seed() >= 0) { Caffe::set_random_seed(param_.random_seed()); } // Scaffolding code InitTrainNet(); - if (Caffe::root_solver()) { - InitTestNets(); - LOG(INFO) << "Solver scaffolding done."; - } + InitTestNets(); + LOG(INFO) << "Solver scaffolding done."; iter_ = 0; current_step_ = 0; } @@ -59,27 +55,19 @@ void Solver::InitTrainNet() { << "one of these fields specifying a train_net: " << field_names; NetParameter net_param; if (param_.has_train_net_param()) { - if (Caffe::root_solver()) { - LOG(INFO) << "Creating training net specified in train_net_param."; - } + LOG(INFO) << "Creating training net specified in train_net_param."; net_param.CopyFrom(param_.train_net_param()); } else if (param_.has_train_net()) { - if (Caffe::root_solver()) { - LOG(INFO) << "Creating training net from train_net file: " - << param_.train_net(); - } + LOG(INFO) << "Creating training net from train_net file: " + << param_.train_net(); ReadNetParamsFromTextFileOrDie(param_.train_net(), &net_param); } if (param_.has_net_param()) { - if (Caffe::root_solver()) { - LOG(INFO) << "Creating training net specified in net_param."; - } + LOG(INFO) << "Creating training net specified in net_param."; net_param.CopyFrom(param_.net_param()); } if (param_.has_net()) { - if (Caffe::root_solver()) { - LOG(INFO) << "Creating training net from net file: " << param_.net(); - } + LOG(INFO) << "Creating training net from net file: " << param_.net(); ReadNetParamsFromTextFileOrDie(param_.net(), &net_param); } // Set the correct NetState. We start with the solver defaults (lowest @@ -96,7 +84,6 @@ void Solver::InitTrainNet() { template void Solver::InitTestNets() { - CHECK(Caffe::root_solver()); const bool has_net_param = param_.has_net_param(); const bool has_net_file = param_.has_net(); const int num_generic_nets = has_net_param + has_net_file; @@ -180,26 +167,12 @@ void Solver::Step(int iters) { vector losses; Dtype smoothed_loss = 0; - iteration_timer_.Start(); - Timer timer; - ostringstream timing; - while (iter_ < stop_iter) { if (param_.test_interval() && iter_ % param_.test_interval() == 0 - && (iter_ > 0 || param_.test_initialization()) - && Caffe::root_solver()) { + && (iter_ > 0 || param_.test_initialization())) { TestAll(); } - timer.Start(); - timing.str(""); - timing << "Timing "; - if (param().solver_mode() == SolverParameter_SolverMode_GPU) { - timing << "(device " << param().device_id() << ") "; - } - for (int i = 0; i < callbacks_.size(); ++i) { - callbacks_[i]->on_start(&timer, &timing); - } const bool display = param_.display() && iter_ % param_.display() == 0; net_->set_debug_info(display && param_.debug_info()); Dtype loss = net_->ForwardBackward(bottom_vec); @@ -213,9 +186,7 @@ void Solver::Step(int iters) { losses[idx] = loss; } if (display) { - if (Caffe::root_solver()) { - LOG(INFO) << "Iteration " << iter_ << ", loss = " << smoothed_loss; - } + LOG(INFO) << "Iteration " << iter_ << ", loss = " << smoothed_loss; const vector*>& result = net_->output_blobs(); int score_index = 0; for (int j = 0; j < result.size(); ++j) { @@ -230,34 +201,21 @@ void Solver::Step(int iters) { loss_msg_stream << " (* " << loss_weight << " = " << loss_weight * result_vec[k] << " loss)"; } - if (Caffe::root_solver()) { - LOG(INFO) << " Train net output #" - << score_index++ << ": " << output_name << " = " - << result_vec[k] << loss_msg_stream.str(); - } + LOG(INFO) << " Train net output #" + << score_index++ << ": " << output_name << " = " + << result_vec[k] << loss_msg_stream.str(); } } } - timing << " grads: " << timer.MilliSeconds(); - for (int i = 0; i < callbacks_.size(); ++i) { - callbacks_[i]->on_gradients_ready(&timer, &timing); - } - timer.Start(); - Iteration(); - timing << " apply: " << timer.MilliSeconds(); - -#ifdef BENCHMARK_SOLVER - LOG(INFO)<< timing.str(); -#endif + ComputeUpdateValue(); + net_->Update(); // Increment the internal iter_ counter -- its value should always indicate // the number of times the weights have been updated. ++iter_; // Save a snapshot if needed. - if (param_.snapshot() - && iter_ % param_.snapshot() == 0 - && Caffe::root_solver()) { + if (param_.snapshot() && iter_ % param_.snapshot() == 0) { Snapshot(); } } @@ -265,7 +223,6 @@ void Solver::Step(int iters) { template void Solver::Solve(const char* resume_file) { - CHECK(Caffe::root_solver()); LOG(INFO) << "Solving " << net_->name(); LOG(INFO) << "Learning Rate Policy: " << param_.lr_policy(); @@ -310,7 +267,6 @@ void Solver::TestAll() { template void Solver::Test(const int test_net_id) { - CHECK(Caffe::root_solver()); LOG(INFO) << "Iteration " << iter_ << ", Testing net (#" << test_net_id << ")"; CHECK_NOTNULL(test_nets_[test_net_id].get())-> @@ -361,14 +317,13 @@ void Solver::Test(const int test_net_id) { << " = " << loss_weight * mean_score << " loss)"; } LOG(INFO) << " Test net output #" << i << ": " << output_name << " = " - << mean_score << loss_msg_stream.str(); + << mean_score << loss_msg_stream.str(); } } template void Solver::Snapshot() { - CHECK(Caffe::root_solver()); NetParameter net_param; // For intermediate results, we will also dump the gradient values. net_->ToProto(&net_param, param_.snapshot_diff()); @@ -393,7 +348,6 @@ void Solver::Snapshot() { template void Solver::Restore(const char* state_file) { - CHECK(Caffe::root_solver()); SolverState state; NetParameter net_param; ReadProtoFromBinaryFile(state_file, &state); @@ -502,124 +456,95 @@ void SGDSolver::ClipGradients() { } template -void SGDSolver::Iteration() { - CHECK(Caffe::root_solver()); +void SGDSolver::ComputeUpdateValue() { + const vector > >& net_params = this->net_->params(); + const vector& net_params_lr = this->net_->params_lr(); + const vector& net_params_weight_decay = + this->net_->params_weight_decay(); + // get the learning rate Dtype rate = GetLearningRate(); if (this->param_.display() && this->iter_ % this->param_.display() == 0) { - float lapse = iteration_timer_.Seconds(); - float per_s = (this->iter_ - iterations_last_) / (lapse ? lapse : 1); - LOG(INFO) << "Iteration " << this->iter_ << " (" << per_s << "/s), " - << "lr = " << rate; - iteration_timer_.Start(); - iterations_last_ = this->iter_; + LOG(INFO) << "Iteration " << this->iter_ << ", lr = " << rate; } ClipGradients(); - for (int param_id = 0; param_id < this->net_->params().size(); ++param_id) { - Regularize(param_id); - ComputeUpdateValue(param_id, rate); - } - this->net_->Update(); -} - -template -void SGDSolver::Regularize(int param_id) { - const vector > >& net_params = this->net_->params(); - const vector& net_params_weight_decay = - this->net_->params_weight_decay(); + Dtype momentum = this->param_.momentum(); Dtype weight_decay = this->param_.weight_decay(); string regularization_type = this->param_.regularization_type(); switch (Caffe::mode()) { - case Caffe::CPU: { - Dtype local_decay = weight_decay * net_params_weight_decay[param_id]; - if (local_decay) { - if (regularization_type == "L2") { - // add weight decay - caffe_axpy(net_params[param_id]->count(), - local_decay, - net_params[param_id]->cpu_data(), - net_params[param_id]->mutable_cpu_diff()); - } else if (regularization_type == "L1") { - caffe_cpu_sign(net_params[param_id]->count(), - net_params[param_id]->cpu_data(), - temp_[param_id]->mutable_cpu_data()); - caffe_axpy(net_params[param_id]->count(), - local_decay, - temp_[param_id]->cpu_data(), - net_params[param_id]->mutable_cpu_diff()); - } else { - LOG(FATAL) << "Unknown regularization type: " << regularization_type; + case Caffe::CPU: + for (int param_id = 0; param_id < net_params.size(); ++param_id) { + // Compute the value to history, and then copy them to the blob's diff. + Dtype local_rate = rate * net_params_lr[param_id]; + Dtype local_decay = weight_decay * net_params_weight_decay[param_id]; + + if (local_decay) { + if (regularization_type == "L2") { + // add weight decay + caffe_axpy(net_params[param_id]->count(), + local_decay, + net_params[param_id]->cpu_data(), + net_params[param_id]->mutable_cpu_diff()); + } else if (regularization_type == "L1") { + caffe_cpu_sign(net_params[param_id]->count(), + net_params[param_id]->cpu_data(), + temp_[param_id]->mutable_cpu_data()); + caffe_axpy(net_params[param_id]->count(), + local_decay, + temp_[param_id]->cpu_data(), + net_params[param_id]->mutable_cpu_diff()); + } else { + LOG(FATAL) << "Unknown regularization type: " << regularization_type; + } } + + caffe_cpu_axpby(net_params[param_id]->count(), local_rate, + net_params[param_id]->cpu_diff(), momentum, + history_[param_id]->mutable_cpu_data()); + // copy + caffe_copy(net_params[param_id]->count(), + history_[param_id]->cpu_data(), + net_params[param_id]->mutable_cpu_diff()); } break; - } - case Caffe::GPU: { + case Caffe::GPU: #ifndef CPU_ONLY - Dtype local_decay = weight_decay * net_params_weight_decay[param_id]; - if (local_decay) { - if (regularization_type == "L2") { - // add weight decay - caffe_gpu_axpy(net_params[param_id]->count(), - local_decay, - net_params[param_id]->gpu_data(), - net_params[param_id]->mutable_gpu_diff()); - } else if (regularization_type == "L1") { - caffe_gpu_sign(net_params[param_id]->count(), - net_params[param_id]->gpu_data(), - temp_[param_id]->mutable_gpu_data()); - caffe_gpu_axpy(net_params[param_id]->count(), - local_decay, - temp_[param_id]->gpu_data(), - net_params[param_id]->mutable_gpu_diff()); - } else { - LOG(FATAL) << "Unknown regularization type: " << regularization_type; + for (int param_id = 0; param_id < net_params.size(); ++param_id) { + // Compute the value to history, and then copy them to the blob's diff. + Dtype local_rate = rate * net_params_lr[param_id]; + Dtype local_decay = weight_decay * net_params_weight_decay[param_id]; + + if (local_decay) { + if (regularization_type == "L2") { + // add weight decay + caffe_gpu_axpy(net_params[param_id]->count(), + local_decay, + net_params[param_id]->gpu_data(), + net_params[param_id]->mutable_gpu_diff()); + } else if (regularization_type == "L1") { + caffe_gpu_sign(net_params[param_id]->count(), + net_params[param_id]->gpu_data(), + temp_[param_id]->mutable_gpu_data()); + caffe_gpu_axpy(net_params[param_id]->count(), + local_decay, + temp_[param_id]->gpu_data(), + net_params[param_id]->mutable_gpu_diff()); + } else { + LOG(FATAL) << "Unknown regularization type: " << regularization_type; + } } - } -#else - NO_GPU; -#endif - break; - } - default: - LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); - } -} -template -void SGDSolver::ComputeUpdateValue(int param_id, Dtype rate) { - const vector > >& net_params = this->net_->params(); - const vector& net_params_lr = this->net_->params_lr(); - Dtype momentum = this->param_.momentum(); - switch (Caffe::mode()) { - case Caffe::CPU: { - // Compute the value to history, and then copy them to the blob's diff. - Dtype local_rate = rate * net_params_lr[param_id]; - - caffe_cpu_axpby(net_params[param_id]->count(), local_rate, - net_params[param_id]->cpu_diff(), momentum, - history_[param_id]->mutable_cpu_data()); - // copy - caffe_copy(net_params[param_id]->count(), - history_[param_id]->cpu_data(), - net_params[param_id]->mutable_cpu_diff()); - break; - } - case Caffe::GPU: { -#ifndef CPU_ONLY - // Compute the value to history, and then copy them to the blob's diff. - Dtype local_rate = rate * net_params_lr[param_id]; - - caffe_gpu_axpby(net_params[param_id]->count(), local_rate, - net_params[param_id]->gpu_diff(), momentum, - history_[param_id]->mutable_gpu_data()); - // copy - caffe_copy(net_params[param_id]->count(), - history_[param_id]->gpu_data(), - net_params[param_id]->mutable_gpu_diff()); + caffe_gpu_axpby(net_params[param_id]->count(), local_rate, + net_params[param_id]->gpu_diff(), momentum, + history_[param_id]->mutable_gpu_data()); + // copy + caffe_copy(net_params[param_id]->count(), + history_[param_id]->gpu_data(), + net_params[param_id]->mutable_gpu_diff()); + } #else NO_GPU; #endif break; - } default: LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); } @@ -637,7 +562,6 @@ void SGDSolver::SnapshotSolverState(SolverState* state) { template void SGDSolver::RestoreSolverState(const SolverState& state) { - CHECK(Caffe::root_solver()); CHECK_EQ(state.history_size(), history_.size()) << "Incorrect length of history blobs."; LOG(INFO) << "SGDSolver: restoring history"; @@ -647,146 +571,252 @@ void SGDSolver::RestoreSolverState(const SolverState& state) { } template -void NesterovSolver::ComputeUpdateValue(int param_id, Dtype rate) { - CHECK(Caffe::root_solver()); +void NesterovSolver::ComputeUpdateValue() { const vector > >& net_params = this->net_->params(); const vector& net_params_lr = this->net_->params_lr(); + const vector& net_params_weight_decay = + this->net_->params_weight_decay(); + // get the learning rate + Dtype rate = this->GetLearningRate(); + if (this->param_.display() && this->iter_ % this->param_.display() == 0) { + LOG(INFO) << "Iteration " << this->iter_ << ", lr = " << rate; + } + SGDSolver::ClipGradients(); Dtype momentum = this->param_.momentum(); + Dtype weight_decay = this->param_.weight_decay(); + string regularization_type = this->param_.regularization_type(); switch (Caffe::mode()) { - case Caffe::CPU: { - // save history momentum for stepping back - caffe_copy(net_params[param_id]->count(), - this->history_[param_id]->cpu_data(), - this->update_[param_id]->mutable_cpu_data()); - - Dtype local_rate = rate * net_params_lr[param_id]; - - // update history - caffe_cpu_axpby(net_params[param_id]->count(), local_rate, - net_params[param_id]->cpu_diff(), momentum, - this->history_[param_id]->mutable_cpu_data()); - - // compute update: step back then over step - caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum, - this->history_[param_id]->cpu_data(), -momentum, - this->update_[param_id]->mutable_cpu_data()); - - // copy - caffe_copy(net_params[param_id]->count(), - this->update_[param_id]->cpu_data(), - net_params[param_id]->mutable_cpu_diff()); + case Caffe::CPU: + for (int param_id = 0; param_id < net_params.size(); ++param_id) { + // save history momentum for stepping back + caffe_copy(net_params[param_id]->count(), + this->history_[param_id]->cpu_data(), + this->update_[param_id]->mutable_cpu_data()); + + Dtype local_rate = rate * net_params_lr[param_id]; + Dtype local_decay = weight_decay * net_params_weight_decay[param_id]; + + if (local_decay) { + if (regularization_type == "L2") { + // add weight decay + caffe_axpy(net_params[param_id]->count(), + local_decay, + net_params[param_id]->cpu_data(), + net_params[param_id]->mutable_cpu_diff()); + } else if (regularization_type == "L1") { + caffe_cpu_sign(net_params[param_id]->count(), + net_params[param_id]->cpu_data(), + this->temp_[param_id]->mutable_cpu_data()); + caffe_axpy(net_params[param_id]->count(), + local_decay, + this->temp_[param_id]->cpu_data(), + net_params[param_id]->mutable_cpu_diff()); + } else { + LOG(FATAL) << "Unknown regularization type: " << regularization_type; + } + } + + // update history + caffe_cpu_axpby(net_params[param_id]->count(), local_rate, + net_params[param_id]->cpu_diff(), momentum, + this->history_[param_id]->mutable_cpu_data()); + + // compute udpate: step back then over step + caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum, + this->history_[param_id]->cpu_data(), -momentum, + this->update_[param_id]->mutable_cpu_data()); + + // copy + caffe_copy(net_params[param_id]->count(), + this->update_[param_id]->cpu_data(), + net_params[param_id]->mutable_cpu_diff()); + } break; - } - case Caffe::GPU: { + case Caffe::GPU: #ifndef CPU_ONLY - // save history momentum for stepping back - caffe_copy(net_params[param_id]->count(), - this->history_[param_id]->gpu_data(), - this->update_[param_id]->mutable_gpu_data()); - - Dtype local_rate = rate * net_params_lr[param_id]; - - // update history - caffe_gpu_axpby(net_params[param_id]->count(), local_rate, - net_params[param_id]->gpu_diff(), momentum, - this->history_[param_id]->mutable_gpu_data()); - - // compute update: step back then over step - caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum, - this->history_[param_id]->gpu_data(), -momentum, - this->update_[param_id]->mutable_gpu_data()); - - // copy - caffe_copy(net_params[param_id]->count(), - this->update_[param_id]->gpu_data(), - net_params[param_id]->mutable_gpu_diff()); + for (int param_id = 0; param_id < net_params.size(); ++param_id) { + // save history momentum for stepping back + caffe_copy(net_params[param_id]->count(), + this->history_[param_id]->gpu_data(), + this->update_[param_id]->mutable_gpu_data()); + + Dtype local_rate = rate * net_params_lr[param_id]; + Dtype local_decay = weight_decay * net_params_weight_decay[param_id]; + + if (local_decay) { + if (regularization_type == "L2") { + // add weight decay + caffe_gpu_axpy(net_params[param_id]->count(), + local_decay, + net_params[param_id]->gpu_data(), + net_params[param_id]->mutable_gpu_diff()); + } else if (regularization_type == "L1") { + caffe_gpu_sign(net_params[param_id]->count(), + net_params[param_id]->gpu_data(), + this->temp_[param_id]->mutable_gpu_data()); + caffe_gpu_axpy(net_params[param_id]->count(), + local_decay, + this->temp_[param_id]->gpu_data(), + net_params[param_id]->mutable_gpu_diff()); + } else { + LOG(FATAL) << "Unknown regularization type: " << regularization_type; + } + } + + // update history + caffe_gpu_axpby(net_params[param_id]->count(), local_rate, + net_params[param_id]->gpu_diff(), momentum, + this->history_[param_id]->mutable_gpu_data()); + + // compute udpate: step back then over step + caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum, + this->history_[param_id]->gpu_data(), -momentum, + this->update_[param_id]->mutable_gpu_data()); + + // copy + caffe_copy(net_params[param_id]->count(), + this->update_[param_id]->gpu_data(), + net_params[param_id]->mutable_gpu_diff()); + } #else NO_GPU; #endif break; - } default: LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); } } template -void AdaGradSolver::ComputeUpdateValue(int param_id, Dtype rate) { - CHECK(Caffe::root_solver()); +void AdaGradSolver::ComputeUpdateValue() { const vector > >& net_params = this->net_->params(); const vector& net_params_lr = this->net_->params_lr(); + const vector& net_params_weight_decay = + this->net_->params_weight_decay(); + // get the learning rate + Dtype rate = this->GetLearningRate(); Dtype delta = this->param_.delta(); + if (this->param_.display() && this->iter_ % this->param_.display() == 0) { + LOG(INFO) << "Iteration " << this->iter_ << ", lr = " << rate; + } + SGDSolver::ClipGradients(); + Dtype weight_decay = this->param_.weight_decay(); + string regularization_type = this->param_.regularization_type(); switch (Caffe::mode()) { - case Caffe::CPU: { - Dtype local_rate = rate * net_params_lr[param_id]; - - // compute square of gradient in update - caffe_powx(net_params[param_id]->count(), - net_params[param_id]->cpu_diff(), Dtype(2), - this->update_[param_id]->mutable_cpu_data()); - - // update history - caffe_add(net_params[param_id]->count(), - this->update_[param_id]->cpu_data(), - this->history_[param_id]->cpu_data(), - this->history_[param_id]->mutable_cpu_data()); - - // prepare update - caffe_powx(net_params[param_id]->count(), - this->history_[param_id]->cpu_data(), Dtype(0.5), - this->update_[param_id]->mutable_cpu_data()); - - caffe_add_scalar(net_params[param_id]->count(), - delta, this->update_[param_id]->mutable_cpu_data()); - - caffe_div(net_params[param_id]->count(), - net_params[param_id]->cpu_diff(), - this->update_[param_id]->cpu_data(), - this->update_[param_id]->mutable_cpu_data()); - - // scale and copy - caffe_cpu_axpby(net_params[param_id]->count(), local_rate, - this->update_[param_id]->cpu_data(), Dtype(0), - net_params[param_id]->mutable_cpu_diff()); + case Caffe::CPU: + for (int param_id = 0; param_id < net_params.size(); ++param_id) { + Dtype local_rate = rate * net_params_lr[param_id]; + Dtype local_decay = weight_decay * net_params_weight_decay[param_id]; + + if (local_decay) { + if (regularization_type == "L2") { + // add weight decay + caffe_axpy(net_params[param_id]->count(), + local_decay, + net_params[param_id]->cpu_data(), + net_params[param_id]->mutable_cpu_diff()); + } else if (regularization_type == "L1") { + caffe_cpu_sign(net_params[param_id]->count(), + net_params[param_id]->cpu_data(), + this->temp_[param_id]->mutable_cpu_data()); + caffe_axpy(net_params[param_id]->count(), + local_decay, + this->temp_[param_id]->cpu_data(), + net_params[param_id]->mutable_cpu_diff()); + } else { + LOG(FATAL) << "Unknown regularization type: " << regularization_type; + } + } + + // compute square of gradient in update + caffe_powx(net_params[param_id]->count(), + net_params[param_id]->cpu_diff(), Dtype(2), + this->update_[param_id]->mutable_cpu_data()); + + // update history + caffe_add(net_params[param_id]->count(), + this->update_[param_id]->cpu_data(), + this->history_[param_id]->cpu_data(), + this->history_[param_id]->mutable_cpu_data()); + + // prepare update + caffe_powx(net_params[param_id]->count(), + this->history_[param_id]->cpu_data(), Dtype(0.5), + this->update_[param_id]->mutable_cpu_data()); + + caffe_add_scalar(net_params[param_id]->count(), + delta, this->update_[param_id]->mutable_cpu_data()); + + caffe_div(net_params[param_id]->count(), + net_params[param_id]->cpu_diff(), + this->update_[param_id]->cpu_data(), + this->update_[param_id]->mutable_cpu_data()); + + // scale and copy + caffe_cpu_axpby(net_params[param_id]->count(), local_rate, + this->update_[param_id]->cpu_data(), Dtype(0), + net_params[param_id]->mutable_cpu_diff()); + } break; - } - case Caffe::GPU: { + case Caffe::GPU: #ifndef CPU_ONLY - Dtype local_rate = rate * net_params_lr[param_id]; - - // compute square of gradient in update - caffe_gpu_powx(net_params[param_id]->count(), - net_params[param_id]->gpu_diff(), Dtype(2), - this->update_[param_id]->mutable_gpu_data()); - - // update history - caffe_gpu_add(net_params[param_id]->count(), - this->update_[param_id]->gpu_data(), - this->history_[param_id]->gpu_data(), - this->history_[param_id]->mutable_gpu_data()); - - // prepare update - caffe_gpu_powx(net_params[param_id]->count(), - this->history_[param_id]->gpu_data(), Dtype(0.5), - this->update_[param_id]->mutable_gpu_data()); - - caffe_gpu_add_scalar(net_params[param_id]->count(), - delta, this->update_[param_id]->mutable_gpu_data()); - - caffe_gpu_div(net_params[param_id]->count(), - net_params[param_id]->gpu_diff(), - this->update_[param_id]->gpu_data(), - this->update_[param_id]->mutable_gpu_data()); - - // scale and copy - caffe_gpu_axpby(net_params[param_id]->count(), local_rate, - this->update_[param_id]->gpu_data(), Dtype(0), - net_params[param_id]->mutable_gpu_diff()); + for (int param_id = 0; param_id < net_params.size(); ++param_id) { + Dtype local_rate = rate * net_params_lr[param_id]; + Dtype local_decay = weight_decay * net_params_weight_decay[param_id]; + + if (local_decay) { + if (regularization_type == "L2") { + // add weight decay + caffe_gpu_axpy(net_params[param_id]->count(), + local_decay, + net_params[param_id]->gpu_data(), + net_params[param_id]->mutable_gpu_diff()); + } else if (regularization_type == "L1") { + caffe_gpu_sign(net_params[param_id]->count(), + net_params[param_id]->gpu_data(), + this->temp_[param_id]->mutable_gpu_data()); + caffe_gpu_axpy(net_params[param_id]->count(), + local_decay, + this->temp_[param_id]->gpu_data(), + net_params[param_id]->mutable_gpu_diff()); + } else { + LOG(FATAL) << "Unknown regularization type: " << regularization_type; + } + } + + // compute square of gradient in update + caffe_gpu_powx(net_params[param_id]->count(), + net_params[param_id]->gpu_diff(), Dtype(2), + this->update_[param_id]->mutable_gpu_data()); + + // update history + caffe_gpu_add(net_params[param_id]->count(), + this->update_[param_id]->gpu_data(), + this->history_[param_id]->gpu_data(), + this->history_[param_id]->mutable_gpu_data()); + + // prepare update + caffe_gpu_powx(net_params[param_id]->count(), + this->history_[param_id]->gpu_data(), Dtype(0.5), + this->update_[param_id]->mutable_gpu_data()); + + caffe_gpu_add_scalar(net_params[param_id]->count(), + delta, this->update_[param_id]->mutable_gpu_data()); + + caffe_gpu_div(net_params[param_id]->count(), + net_params[param_id]->gpu_diff(), + this->update_[param_id]->gpu_data(), + this->update_[param_id]->mutable_gpu_data()); + + // scale and copy + caffe_gpu_axpby(net_params[param_id]->count(), local_rate, + this->update_[param_id]->gpu_data(), Dtype(0), + net_params[param_id]->mutable_gpu_diff()); + } #else NO_GPU; #endif break; - } default: LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); } diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp index 5ae11ea983b..7617ccfb27f 100644 --- a/src/caffe/syncedmem.cpp +++ b/src/caffe/syncedmem.cpp @@ -12,14 +12,8 @@ SyncedMemory::~SyncedMemory() { } #ifndef CPU_ONLY - if (gpu_ptr_ && own_gpu_data_) { - int initial_device; - cudaGetDevice(&initial_device); - if (gpu_device_ != -1) { - CUDA_CHECK(cudaSetDevice(gpu_device_)); - } - MemoryHandler::freeGPU(gpu_ptr_); - cudaSetDevice(initial_device); + if (gpu_ptr_) { + CUDA_CHECK(cudaFree(gpu_ptr_)); } #endif // CPU_ONLY } @@ -54,17 +48,13 @@ inline void SyncedMemory::to_gpu() { #ifndef CPU_ONLY switch (head_) { case UNINITIALIZED: - CUDA_CHECK(cudaGetDevice(&gpu_device_)); - MemoryHandler::mallocGPU(&gpu_ptr_, size_); + CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_)); caffe_gpu_memset(size_, 0, gpu_ptr_); head_ = HEAD_AT_GPU; - own_gpu_data_ = true; break; case HEAD_AT_CPU: if (gpu_ptr_ == NULL) { - CUDA_CHECK(cudaGetDevice(&gpu_device_)); - MemoryHandler::mallocGPU(&gpu_ptr_, size_); - own_gpu_data_ = true; + CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_)); } caffe_gpu_memcpy(size_, cpu_ptr_, gpu_ptr_); head_ = SYNCED; @@ -102,26 +92,6 @@ const void* SyncedMemory::gpu_data() { #endif } -void SyncedMemory::set_gpu_data(void* data) { -#ifndef CPU_ONLY - CHECK(data); - if (own_gpu_data_) { - int initial_device; - cudaGetDevice(&initial_device); - if (gpu_device_ != -1) { - CUDA_CHECK(cudaSetDevice(gpu_device_)); - } - MemoryHandler::freeGPU(gpu_ptr_); - cudaSetDevice(initial_device); - } - gpu_ptr_ = data; - head_ = HEAD_AT_GPU; - own_gpu_data_ = false; -#else - NO_GPU; -#endif -} - void* SyncedMemory::mutable_cpu_data() { to_cpu(); head_ = HEAD_AT_CPU; @@ -138,20 +108,6 @@ void* SyncedMemory::mutable_gpu_data() { #endif } -#ifndef CPU_ONLY -void SyncedMemory::async_gpu_push(const cudaStream_t& stream) { - CHECK(head_ == HEAD_AT_CPU); - if (gpu_ptr_ == NULL) { - CUDA_CHECK(cudaGetDevice(&gpu_device_)); - MemoryHandler::mallocGPU(&gpu_ptr_, size_); - own_gpu_data_ = true; - } - const cudaMemcpyKind put = cudaMemcpyHostToDevice; - CUDA_CHECK(cudaMemcpyAsync(gpu_ptr_, cpu_ptr_, size_, put, stream)); - // Assume caller will synchronize on the stream before use - head_ = SYNCED; -} -#endif } // namespace caffe diff --git a/src/caffe/test/test_convolution_layer.cpp b/src/caffe/test/test_convolution_layer.cpp index a8a2c9b9342..67d41fff844 100644 --- a/src/caffe/test/test_convolution_layer.cpp +++ b/src/caffe/test/test_convolution_layer.cpp @@ -603,7 +603,6 @@ TYPED_TEST(CuDNNConvolutionLayerTest, TestSobelConvolutionCuDNN) { weights[i + 8] = 1; } layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - cudaDeviceSynchronize(); layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); // Compute Sobel G_x operator as separable 3 x 1 and 1 x 3 convolutions. // (1) the [1 2 1] column filter @@ -631,7 +630,6 @@ TYPED_TEST(CuDNNConvolutionLayerTest, TestSobelConvolutionCuDNN) { weights_1[i + 2] = 1; } layer->SetUp(sep_blob_bottom_vec, sep_blob_top_vec); - cudaDeviceSynchronize(); layer->Forward(sep_blob_bottom_vec, sep_blob_top_vec); // (2) the [-1 0 1] row filter blob_sep->CopyFrom(*this->blob_top_2_, false, true); @@ -654,7 +652,6 @@ TYPED_TEST(CuDNNConvolutionLayerTest, TestSobelConvolutionCuDNN) { weights_2[i + 2] = 1; } layer->SetUp(sep_blob_bottom_vec, sep_blob_top_vec); - cudaDeviceSynchronize(); layer->Forward(sep_blob_bottom_vec, sep_blob_top_vec); // Test equivalence of full and separable filters. const TypeParam* top_data = this->blob_top_->cpu_data(); diff --git a/src/caffe/test/test_internal_thread.cpp b/src/caffe/test/test_internal_thread.cpp index 93f1cc541cd..31882b6db1d 100644 --- a/src/caffe/test/test_internal_thread.cpp +++ b/src/caffe/test/test_internal_thread.cpp @@ -2,7 +2,6 @@ #include "gtest/gtest.h" #include "caffe/internal_thread.hpp" -#include "caffe/util/math_functions.hpp" #include "caffe/test/test_caffe_main.hpp" @@ -14,40 +13,11 @@ class InternalThreadTest : public ::testing::Test {}; TEST_F(InternalThreadTest, TestStartAndExit) { InternalThread thread; EXPECT_FALSE(thread.is_started()); - thread.StartInternalThread(); + EXPECT_TRUE(thread.StartInternalThread()); EXPECT_TRUE(thread.is_started()); - thread.StopInternalThread(); + EXPECT_TRUE(thread.WaitForInternalThreadToExit()); EXPECT_FALSE(thread.is_started()); } -class TestThreadA : public InternalThread { - void InternalThreadEntry() { - EXPECT_EQ(4244559767, caffe_rng_rand()); - } -}; - -class TestThreadB : public InternalThread { - void InternalThreadEntry() { - EXPECT_EQ(1726478280, caffe_rng_rand()); - } -}; - -TEST_F(InternalThreadTest, TestRandomSeed) { - TestThreadA t1; - Caffe::set_random_seed(9658361); - t1.StartInternalThread(); - t1.StopInternalThread(); - - TestThreadA t2; - Caffe::set_random_seed(9658361); - t2.StartInternalThread(); - t2.StopInternalThread(); - - TestThreadB t3; - Caffe::set_random_seed(3435563); - t3.StartInternalThread(); - t3.StopInternalThread(); -} - } // namespace caffe diff --git a/src/caffe/test/test_layer_factory.cpp b/src/caffe/test/test_layer_factory.cpp index c86fafd000c..efb1b37ac42 100644 --- a/src/caffe/test/test_layer_factory.cpp +++ b/src/caffe/test/test_layer_factory.cpp @@ -1,14 +1,11 @@ #include #include -#include "boost/scoped_ptr.hpp" #include "gtest/gtest.h" #include "caffe/common.hpp" #include "caffe/layer.hpp" #include "caffe/layer_factory.hpp" -#include "caffe/util/db.hpp" -#include "caffe/util/io.hpp" #include "caffe/test/test_caffe_main.hpp" @@ -24,20 +21,11 @@ TYPED_TEST(LayerFactoryTest, TestCreateLayer) { typename LayerRegistry::CreatorRegistry& registry = LayerRegistry::Registry(); shared_ptr > layer; + LayerParameter layer_param; for (typename LayerRegistry::CreatorRegistry::iterator iter = registry.begin(); iter != registry.end(); ++iter) { // Special case: PythonLayer is checked by pytest if (iter->first == "Python") { continue; } - LayerParameter layer_param; - // Data layers expect a DB - if (iter->first == "Data") { - string tmp; - MakeTempDir(&tmp); - boost::scoped_ptr db(db::GetDB(DataParameter_DB_LEVELDB)); - db->Open(tmp, db::NEW); - db->Close(); - layer_param.mutable_data_param()->set_source(tmp); - } layer_param.set_type(iter->first); layer = LayerRegistry::CreateLayer(layer_param); EXPECT_EQ(iter->first, layer->type()); diff --git a/src/caffe/test/test_lrn_layer.cpp b/src/caffe/test/test_lrn_layer.cpp index 012d5990cd4..c4e2f8ea7f2 100644 --- a/src/caffe/test/test_lrn_layer.cpp +++ b/src/caffe/test/test_lrn_layer.cpp @@ -246,214 +246,5 @@ TYPED_TEST(LRNLayerTest, TestGradientWithinChannel) { this->blob_top_vec_); } -#ifdef USE_CUDNN -template -class CuDNNLRNLayerTest : public GPUDeviceTest { - protected: - CuDNNLRNLayerTest() - : epsilon_(Dtype(1e-5)), - blob_bottom_(new Blob()), - blob_top_(new Blob()) {} - virtual void SetUp() { - Caffe::set_random_seed(1701); - blob_bottom_->Reshape(2, 7, 3, 3); - // fill the values - FillerParameter filler_param; - GaussianFiller filler(filler_param); - filler.Fill(this->blob_bottom_); - blob_bottom_vec_.push_back(blob_bottom_); - blob_top_vec_.push_back(blob_top_); - } - virtual ~CuDNNLRNLayerTest() { delete blob_bottom_; delete blob_top_; } - void ReferenceLRNForward(const Blob& blob_bottom, - const LayerParameter& layer_param, Blob* blob_top); - - Dtype epsilon_; - Blob* const blob_bottom_; - Blob* const blob_top_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; -}; - -template -void CuDNNLRNLayerTest::ReferenceLRNForward( - const Blob& blob_bottom, const LayerParameter& layer_param, - Blob* blob_top) { - typedef TypeParam Dtype; - blob_top->Reshape(blob_bottom.num(), blob_bottom.channels(), - blob_bottom.height(), blob_bottom.width()); - Dtype* top_data = blob_top->mutable_cpu_data(); - LRNParameter lrn_param = layer_param.lrn_param(); - Dtype alpha = lrn_param.alpha(); - Dtype beta = lrn_param.beta(); - int size = lrn_param.local_size(); - switch (lrn_param.norm_region()) { - case LRNParameter_NormRegion_ACROSS_CHANNELS: - for (int n = 0; n < blob_bottom.num(); ++n) { - for (int c = 0; c < blob_bottom.channels(); ++c) { - for (int h = 0; h < blob_bottom.height(); ++h) { - for (int w = 0; w < blob_bottom.width(); ++w) { - int c_start = c - (size - 1) / 2; - int c_end = min(c_start + size, blob_bottom.channels()); - c_start = max(c_start, 0); - Dtype scale = 1.; - for (int i = c_start; i < c_end; ++i) { - Dtype value = blob_bottom.data_at(n, i, h, w); - scale += value * value * alpha / size; - } - *(top_data + blob_top->offset(n, c, h, w)) = - blob_bottom.data_at(n, c, h, w) / pow(scale, beta); - } - } - } - } - break; - case LRNParameter_NormRegion_WITHIN_CHANNEL: - for (int n = 0; n < blob_bottom.num(); ++n) { - for (int c = 0; c < blob_bottom.channels(); ++c) { - for (int h = 0; h < blob_bottom.height(); ++h) { - int h_start = h - (size - 1) / 2; - int h_end = min(h_start + size, blob_bottom.height()); - h_start = max(h_start, 0); - for (int w = 0; w < blob_bottom.width(); ++w) { - Dtype scale = 1.; - int w_start = w - (size - 1) / 2; - int w_end = min(w_start + size, blob_bottom.width()); - w_start = max(w_start, 0); - for (int nh = h_start; nh < h_end; ++nh) { - for (int nw = w_start; nw < w_end; ++nw) { - Dtype value = blob_bottom.data_at(n, c, nh, nw); - scale += value * value * alpha / (size * size); - } - } - *(top_data + blob_top->offset(n, c, h, w)) = - blob_bottom.data_at(n, c, h, w) / pow(scale, beta); - } - } - } - } - break; - default: - LOG(FATAL) << "Unknown normalization region."; - } -} - -TYPED_TEST_CASE(CuDNNLRNLayerTest, TestDtypes); - -TYPED_TEST(CuDNNLRNLayerTest, TestForwardAcrossChannelsCuDNN) { - // typedef typename TypeParam::Dtype Dtype; - Caffe::set_mode(Caffe::GPU); - LayerParameter layer_param; - CuDNNLRNLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - Blob top_reference; - this->ReferenceLRNForward(*(this->blob_bottom_), layer_param, - &top_reference); - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - EXPECT_NEAR(this->blob_top_->cpu_data()[i], top_reference.cpu_data()[i], - this->epsilon_); - } -} - -TYPED_TEST(CuDNNLRNLayerTest, TestForwardAcrossChannelsLargeRegionCuDNN) { - Caffe::set_mode(Caffe::GPU); - typedef TypeParam Dtype; - LayerParameter layer_param; - layer_param.mutable_lrn_param()->set_local_size(15); - CuDNNLRNLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - Blob top_reference; - this->ReferenceLRNForward(*(this->blob_bottom_), layer_param, - &top_reference); - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - EXPECT_NEAR(this->blob_top_->cpu_data()[i], top_reference.cpu_data()[i], - this->epsilon_); - } -} - -TYPED_TEST(CuDNNLRNLayerTest, TestGradientAcrossChannelsCuDNN) { - typedef TypeParam Dtype; - LayerParameter layer_param; - CuDNNLRNLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-2); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int i = 0; i < this->blob_top_->count(); ++i) { - this->blob_top_->mutable_cpu_diff()[i] = 1.; - } - vector propagate_down(this->blob_bottom_vec_.size(), true); - layer.Backward(this->blob_top_vec_, propagate_down, - this->blob_bottom_vec_); - // for (int i = 0; i < this->blob_bottom_->count(); ++i) { - // std::cout << "CPU diff " << this->blob_bottom_->cpu_diff()[i] - // << std::endl; - // } - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -/* -TYPED_TEST(CuDNNLRNLayerTest, TestForwardWithinChannel) { - typedef TypeParam Dtype; - LayerParameter layer_param; - layer_param.mutable_lrn_param()->set_norm_region( - LRNParameter_NormRegion_WITHIN_CHANNEL); - layer_param.mutable_lrn_param()->set_local_size(3); - CuDNNLCNLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - Blob top_reference; - this->ReferenceLRNForward(*(this->blob_bottom_), layer_param, - &top_reference); - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - EXPECT_NEAR(this->blob_top_->cpu_data()[i], top_reference.cpu_data()[i], - this->epsilon_); - } -} - -TYPED_TEST(CuDNNLRNLayerTest, TestGradientWithinChannel) { - Caffe::set_mode(Caffe::GPU); - typedef TypeParam Dtype; - LayerParameter layer_param; - layer_param.mutable_lrn_param()->set_norm_region( - LRNParameter_NormRegion_WITHIN_CHANNEL); - layer_param.mutable_lrn_param()->set_local_size(3); - CuDNNLCNLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-2); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int i = 0; i < this->blob_top_->count(); ++i) { - this->blob_top_->mutable_cpu_diff()[i] = 1.; - } - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} -*/ - -TYPED_TEST(CuDNNLRNLayerTest, TestGradientAcrossChannelsLargeRegionCuDNN) { - typedef TypeParam Dtype; - LayerParameter layer_param; - layer_param.mutable_lrn_param()->set_local_size(15); - CuDNNLRNLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-2); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int i = 0; i < this->blob_top_->count(); ++i) { - this->blob_top_->mutable_cpu_diff()[i] = 1.; - } - vector propagate_down(this->blob_bottom_vec_.size(), true); - layer.Backward(this->blob_top_vec_, propagate_down, - this->blob_bottom_vec_); - // for (int i = 0; i < this->blob_bottom_->count(); ++i) { - // std::cout << "CPU diff " << this->blob_bottom_->cpu_diff()[i] - // << std::endl; - // } - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -#endif } // namespace caffe diff --git a/src/caffe/test/test_upgrade_proto.cpp b/src/caffe/test/test_upgrade_proto.cpp index 006720231a5..eec627656ef 100644 --- a/src/caffe/test/test_upgrade_proto.cpp +++ b/src/caffe/test/test_upgrade_proto.cpp @@ -2,15 +2,12 @@ #include #include -#include "boost/scoped_ptr.hpp" #include "google/protobuf/text_format.h" #include "gtest/gtest.h" #include "caffe/blob.hpp" #include "caffe/common.hpp" #include "caffe/layer.hpp" -#include "caffe/util/db.hpp" -#include "caffe/util/io.hpp" #include "caffe/util/upgrade_proto.hpp" #include "caffe/test/test_caffe_main.hpp" @@ -2904,15 +2901,6 @@ TEST_F(NetUpgradeTest, TestUpgradeV1LayerType) { continue; // Empty string isn't actually a valid layer type. } layer_param.set_type(v2_layer_type); - // Data layers expect a DB - if (v2_layer_type == "Data") { - string tmp; - MakeTempDir(&tmp); - boost::scoped_ptr db(db::GetDB(DataParameter_DB_LEVELDB)); - db->Open(tmp, db::NEW); - db->Close(); - layer_param.mutable_data_param()->set_source(tmp); - } layer = LayerRegistry::CreateLayer(layer_param); EXPECT_EQ(v2_layer_type, layer->type()); } diff --git a/src/caffe/util/blocking_queue.cpp b/src/caffe/util/blocking_queue.cpp deleted file mode 100644 index 8a0e9306f18..00000000000 --- a/src/caffe/util/blocking_queue.cpp +++ /dev/null @@ -1,96 +0,0 @@ -#include -#include - -#include "caffe/data_layers.hpp" -#include "caffe/data_reader.hpp" -#include "caffe/parallel.hpp" -#include "caffe/util/blocking_queue.hpp" - -namespace caffe { - -template -class BlockingQueue::sync { - public: - mutable boost::mutex mutex_; - boost::condition_variable condition_; -}; - -template -BlockingQueue::BlockingQueue() - : sync_(new sync()) { -} - -template -void BlockingQueue::push(const T& t) { - boost::mutex::scoped_lock lock(sync_->mutex_); - queue_.push(t); - lock.unlock(); - sync_->condition_.notify_one(); -} - -template -bool BlockingQueue::try_pop(T* t) { - boost::mutex::scoped_lock lock(sync_->mutex_); - - if (queue_.empty()) { - return false; - } - - *t = queue_.front(); - queue_.pop(); - return true; -} - -template -T BlockingQueue::pop(const string& log_on_wait) { - boost::mutex::scoped_lock lock(sync_->mutex_); - - while (queue_.empty()) { - if (!log_on_wait.empty()) { - LOG(INFO)<< log_on_wait; - } - sync_->condition_.wait(lock); - } - - T t = queue_.front(); - queue_.pop(); - return t; -} - -template -bool BlockingQueue::try_peek(T* t) { - boost::mutex::scoped_lock lock(sync_->mutex_); - - if (queue_.empty()) { - return false; - } - - *t = queue_.front(); - return true; -} - -template -T BlockingQueue::peek() { - boost::mutex::scoped_lock lock(sync_->mutex_); - - while (queue_.empty()) { - sync_->condition_.wait(lock); - } - - return queue_.front(); -} - -template -size_t BlockingQueue::size() const { - boost::mutex::scoped_lock lock(sync_->mutex_); - return queue_.size(); -} - -template class BlockingQueue*>; -template class BlockingQueue*>; -template class BlockingQueue; -template class BlockingQueue >; -template class BlockingQueue*>; -template class BlockingQueue*>; - -} // namespace caffe diff --git a/tools/caffe.cpp b/tools/caffe.cpp index e9c5a01ccbc..0b7523fccf9 100644 --- a/tools/caffe.cpp +++ b/tools/caffe.cpp @@ -12,19 +12,13 @@ using caffe::Blob; using caffe::Caffe; using caffe::Net; using caffe::Layer; -using caffe::Solver; using caffe::shared_ptr; -using caffe::string; using caffe::Timer; using caffe::vector; -using std::ostringstream; -using caffe::MemoryHandlerActivator; + DEFINE_int32(gpu, -1, - "Run in GPU mode on given device ID (Legacy switch, use -gpus)."); -DEFINE_string(gpus, "", - "Run in GPU mode on given device IDs separated by ','." - "Use '-gpus all' to run on all available GPUs."); + "Run in GPU mode on given device ID."); DEFINE_string(solver, "", "The solver definition protocol buffer text file."); DEFINE_string(model, "", @@ -32,8 +26,8 @@ DEFINE_string(model, "", DEFINE_string(snapshot, "", "Optional; the snapshot solver state to resume training."); DEFINE_string(weights, "", - "Optional; the pretrained weights to initialize finetuning, " - "separated by ','. Cannot be set simultaneously with snapshot."); + "Optional; the pretrained weights to initialize finetuning. " + "Cannot be set simultaneously with snapshot."); DEFINE_int32(iterations, 50, "The number of iterations to run."); @@ -67,32 +61,6 @@ static BrewFunction GetBrewFunction(const caffe::string& name) { } } -// Parse GPU ids or use all available devices -static void get_gpus(vector* gpus) { - if (FLAGS_gpu >= 0) { - FLAGS_gpus = "" + boost::lexical_cast(FLAGS_gpu); - } - if (FLAGS_gpus == "all") { - int count = 0; -#ifndef CPU_ONLY - CUDA_CHECK(cudaGetDeviceCount(&count)); -#else - NO_GPU; -#endif - for (int i = 0; i < count; ++i) { - gpus->push_back(i); - } - } else if (FLAGS_gpus.size()) { - vector strings; - boost::split(strings, FLAGS_gpus, boost::is_any_of(",")); - for (int i = 0; i < strings.size(); ++i) { - gpus->push_back(boost::lexical_cast(strings[i])); - } - } else { - CHECK_EQ(gpus->size(), 0); - } -} - // caffe commands to call by // caffe // @@ -101,13 +69,10 @@ static void get_gpus(vector* gpus) { // Device Query: show diagnostic information for a GPU device. int device_query() { - LOG(INFO) << "Querying GPUs " << FLAGS_gpus; - vector gpus; - get_gpus(&gpus); - for (int i = 0; i < gpus.size(); ++i) { - caffe::Caffe::SetDevice(gpus[i]); - caffe::Caffe::DeviceQuery(); - } + CHECK_GT(FLAGS_gpu, -1) << "Need a device ID to query."; + LOG(INFO) << "Querying device ID = " << FLAGS_gpu; + caffe::Caffe::SetDevice(FLAGS_gpu); + caffe::Caffe::DeviceQuery(); return 0; } RegisterBrewFunction(device_query); @@ -136,52 +101,37 @@ int train() { caffe::SolverParameter solver_param; caffe::ReadProtoFromTextFileOrDie(FLAGS_solver, &solver_param); - // If the gpus flag is not provided, allow the mode and device to be set + // If the gpu flag is not provided, allow the mode and device to be set // in the solver prototxt. - if (FLAGS_gpu < 0 && FLAGS_gpus.size() == 0 - && solver_param.solver_mode() == caffe::SolverParameter_SolverMode_GPU - && solver_param.has_device_id()) { - FLAGS_gpus = "" + boost::lexical_cast(solver_param.device_id()); + if (FLAGS_gpu < 0 + && solver_param.solver_mode() == caffe::SolverParameter_SolverMode_GPU) { + FLAGS_gpu = solver_param.device_id(); } - vector gpus; - get_gpus(&gpus); - if (gpus.size() == 0) { - Caffe::set_mode(Caffe::CPU); - } else { - ostringstream s; - for (int i = 0; i < gpus.size(); ++i) { - s << (i ? ", " : "") << gpus[i]; - } - LOG(INFO) << "Using GPUs " << s.str(); - - solver_param.set_device_id(gpus[0]); - Caffe::SetDevice(gpus[0]); + // Set device id and mode + if (FLAGS_gpu >= 0) { + LOG(INFO) << "Use GPU with device ID " << FLAGS_gpu; + Caffe::SetDevice(FLAGS_gpu); Caffe::set_mode(Caffe::GPU); - Caffe::set_solver_count(gpus.size()); + } else { + LOG(INFO) << "Use CPU."; + Caffe::set_mode(Caffe::CPU); } -#ifdef USE_CNMEM - MemoryHandlerActivator handler(gpus); -#endif - shared_ptr > solver(caffe::GetSolver(solver_param)); + LOG(INFO) << "Starting Optimization"; + shared_ptr > + solver(caffe::GetSolver(solver_param)); if (FLAGS_snapshot.size()) { LOG(INFO) << "Resuming from " << FLAGS_snapshot; - solver->Restore(FLAGS_snapshot.c_str()); + solver->Solve(FLAGS_snapshot); } else if (FLAGS_weights.size()) { - CopyLayers(solver.get(), FLAGS_weights); - } - - if (gpus.size() > 1) { - caffe::P2PSync::run(solver, gpus); + CopyLayers(&*solver, FLAGS_weights); + solver->Solve(); } else { - LOG(INFO) << "Starting Optimization"; solver->Solve(); } LOG(INFO) << "Optimization Done."; - - // solver.reset(); return 0; } RegisterBrewFunction(train); @@ -193,11 +143,9 @@ int test() { CHECK_GT(FLAGS_weights.size(), 0) << "Need model weights to score."; // Set device id and mode - vector gpus; - get_gpus(&gpus); - if (gpus.size() != 0) { - LOG(INFO) << "Use GPU with device ID " << gpus[0]; - Caffe::SetDevice(gpus[0]); + if (FLAGS_gpu >= 0) { + LOG(INFO) << "Use GPU with device ID " << FLAGS_gpu; + Caffe::SetDevice(FLAGS_gpu); Caffe::set_mode(Caffe::GPU); } else { LOG(INFO) << "Use CPU."; @@ -260,11 +208,9 @@ int time() { CHECK_GT(FLAGS_model.size(), 0) << "Need a model definition to time."; // Set device id and mode - vector gpus; - get_gpus(&gpus); - if (gpus.size() != 0) { - LOG(INFO) << "Use GPU with device ID " << gpus[0]; - Caffe::SetDevice(gpus[0]); + if (FLAGS_gpu >= 0) { + LOG(INFO) << "Use GPU with device ID " << FLAGS_gpu; + Caffe::SetDevice(FLAGS_gpu); Caffe::set_mode(Caffe::GPU); } else { LOG(INFO) << "Use CPU.";