[GrCUDA-HOTFIX] fix install and benchmarks (NVIDIA#20)
* fixed install dir (GRCUDA-67)
* fixed python benchmarks not creating nested folders and not using experimental options (GRCUDA-68)
* updated make for cuda benchmarks and readme (GRCUDA-68)
AlbertoParravicini committed Sep 26, 2021
1 parent d63678d commit 9c9a357
Showing 5 changed files with 30 additions and 48 deletions.
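As a minimal sketch of picking up this hotfix locally, assuming `$GRCUDA_HOME` and `$GRAAL_HOME` are already exported as `install.sh` expects:

```console
cd $GRCUDA_HOME
git checkout 9c9a357
./install.sh   # runs "mx build" and copies grcuda.jar into $GRAAL_HOME/languages/grcuda
```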
10 changes: 9 additions & 1 deletion README.md
@@ -288,7 +288,7 @@ mx unittest com.nvidia
mx unittest com.nvidia.grcuda.test.BuildKernelTest#testBuildKernelwithNFILegacytSignature
```

- 5. **Setup the grcuda-data sumbodule**
+ 5. **Setup the grcuda-data submodule**
The `grcuda-data` repository is used as a `git` submodule to store data, results, and plots for demos, benchmarks, and publications. You will need this submodule to run the full benchmark suite, and some of the demos. To setup the submodule, follow this [`README`](https://github.com/AlbertoParravicini/grcuda-data/tree/master).
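As a minimal sketch of the usual `git` workflow (the linked `grcuda-data` README remains the authoritative reference), assuming the submodule is already registered in `.gitmodules`:

```console
cd $GRCUDA_HOME
git submodule update --init --recursive
```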

### Setup your IDE
@@ -325,6 +325,7 @@ Here, we explain how to setup IntelliJ Idea.

To measure the performance of GrCUDA on complex GPU applications, we have developed a custom benchmark suite, found in `projects/resources/python/benchmark`.
These are the same benchmarks used in the [DAG-based Scheduling with Resource Sharing for Multi-task Applications in a Polyglot GPU Runtime](https://ieeexplore.ieee.org/abstract/document/9460491) paper.
+ All commands are executed from `$GRCUDA_HOME/projects/resources/python/benchmark`;

Run a single benchmark with custom settings
```console
@@ -336,6 +337,13 @@ Run all benchmarks
graalpython --jvm --polyglot benchmark_wrapper.py -d -i 30
```
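The single-benchmark command itself is collapsed in this diff; as a purely hypothetical sketch built from flags that appear in the wrapper's command templates further down (`-b`, `-i`, `-n`, the block-size options; meanings and values here are guesses), a direct run might look like:

```console
graalpython --jvm --polyglot --experimental-options benchmark_main.py -b b1 -i 10 -n 1000 --block_size_1d 256 --block_size_2d 8
```

The `--experimental-options` flag is harmless here and becomes required as soon as any `--grcuda.*` option is added, which is what this commit fixes in the wrappers.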

+ To run the CUDA version of all benchmarks, build it as follows. You might want to update the GPU architecture (the `-arch` flag) inside `$GRCUDA_HOME/projects/resources/cuda/Makefile` to reflect the hardware at your disposal.
+ ```console
+ cd $GRCUDA_HOME/projects/resources/cuda;
+ make
+ cd -;
+ ```
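For instance, on an Ampere A100 (compute capability 8.0) the architecture flag would become `sm_80`; one way to patch the Makefile in place (adjust `sm_80` to your own GPU):

```console
sed -i 's/-arch=sm_70/-arch=sm_80/' $GRCUDA_HOME/projects/resources/cuda/Makefile
```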

Run the CUDA version of all benchmarks
```console
graalpython --jvm --polyglot benchmark_wrapper.py -d -i 30 -c
2 changes: 1 addition & 1 deletion install.sh
@@ -3,5 +3,5 @@
mx build;

# Install for Java 8+;
- mkdir -p $GRAAL_HOME/jre/languages/grcuda;
+ mkdir -p $GRAAL_HOME/languages/grcuda;
cp $GRCUDA_HOME/mxbuild/dists/jdk1.8/grcuda.jar $GRAAL_HOME/languages/grcuda/.;
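A quick check that the corrected destination is in place, assuming `$GRAAL_HOME` points at the GraalVM installation used by `install.sh`:

```console
./install.sh
ls $GRAAL_HOME/languages/grcuda/
# grcuda.jar
```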
48 changes: 10 additions & 38 deletions projects/resources/cuda/Makefile
@@ -27,51 +27,23 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

- # Use NVCC;
+ # Use NVCC.
+ # Set the appropriate GPU architecture, check https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
CXX=nvcc
FLAGS = -std=c++11 -O3 -arch=sm_70

- # Use Clang;
- CXX=$(CLANG_DIR)/clang++
- FLAGS = --cuda-gpu-arch=sm_70 -L/usr/local/cuda/lib64 -lcudart_static -ldl -lrt -pthread -std=c++11 -O3
+ # (Experimental) Use Clang;
+ # CXX=$(CLANG_DIR)/clang++
+ # FLAGS = --cuda-gpu-arch=sm_70 -L/usr/local/cuda/lib64 -lcudart_static -ldl -lrt -pthread -std=c++11 -O3

BIN_FOLDER=bin
FILES=main.cu benchmark.cu b1.cu b5.cu b6.cu b7.cu b8.cu b10.cu

- .PHONY: full all b1 b5 b6 b7 b8 b10
+ .PHONY: all clean

- full:
+ all:
mkdir -p $(BIN_FOLDER);
$(CXX) $(FILES) $(FLAGS) -o $(BIN_FOLDER)/b;

- all: \
- full \
- b1 \
- b5 \
- b6 \
- b7 \
- b8 \
- b10
-
- b1: b1*
- $(CXX) old/b1_default.cu $(FLAGS) -o $(BIN_FOLDER)/b1_default;
- $(CXX) old/b1_sync.cu $(FLAGS) -o $(BIN_FOLDER)/b1_sync
-
- b5: b5*
- $(CXX) old/b5_default.cu $(FLAGS) -o $(BIN_FOLDER)/b5_default;
- $(CXX) old/b5_sync.cu $(FLAGS) -o $(BIN_FOLDER)/b5_sync
-
- b6: b6*
- $(CXX) old/b6_default.cu $(FLAGS) -o $(BIN_FOLDER)/b6_default;
- $(CXX) old/b6_sync.cu $(FLAGS) -o $(BIN_FOLDER)/b6_sync
-
- b7: b7*
- $(CXX) old/b7_default.cu $(FLAGS) -o $(BIN_FOLDER)/b7_default;
- $(CXX) old/b7_sync.cu $(FLAGS) -o $(BIN_FOLDER)/b7_sync
-
- b8: b8*
- $(CXX) old/b8_default.cu $(FLAGS) -o $(BIN_FOLDER)/b8_default;
- $(CXX) old/b8_sync.cu $(FLAGS) -o $(BIN_FOLDER)/b8_sync
-
- b10: b10*
- $(CXX) old/b10_default.cu $(FLAGS) -o $(BIN_FOLDER)/b10_default;
- $(CXX) old/b10_sync.cu $(FLAGS) -o $(BIN_FOLDER)/b10_sync
+ clean:
+ rm $(BIN_FOLDER)/*;
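With the simplified Makefile, all benchmarks are compiled into a single `bin/b` executable; a typical build-and-clean cycle, assuming `nvcc` is on the `PATH` and `-arch` matches your GPU:

```console
cd $GRCUDA_HOME/projects/resources/cuda
make        # creates bin/ and builds bin/b from the files listed in FILES
ls bin      # -> b
make clean  # removes everything under bin/
```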
9 changes: 5 additions & 4 deletions
@@ -35,6 +35,7 @@
from benchmark_result import BenchmarkResult
from benchmark_main import create_block_size_list
from java.lang import System
+ from pathlib import Path

##############################
##############################
@@ -124,18 +125,18 @@

if POST_TURING:
GRAALPYTHON_CMD_METRICS = """/usr/local/cuda/bin/ncu -f --print-units base --csv --log-file "{}" --profile-from-start off --target-processes all {} \
- {}/graalpython --vm.XX:MaxHeapSize={}G --jvm --polyglot --grcuda.RetrieveNewStreamPolicy={} {} --grcuda.ForceStreamAttach \
+ {}/graalpython --vm.XX:MaxHeapSize={}G --jvm --polyglot --experimental-options --grcuda.RetrieveNewStreamPolicy={} {} --grcuda.ForceStreamAttach \
--grcuda.ExecutionPolicy={} --grcuda.DependencyPolicy={} --grcuda.RetrieveParentStreamPolicy={} benchmark_main.py \
-i {} -n {} --reinit false --realloc false -g {} -b {} --block_size_1d {} --block_size_2d {} --no_cpu_validation {} {} --nvprof
"""
GRAALPYTHON_CMD_TRACE = """/usr/local/cuda/bin/nvprof --csv --log-file "{}" --print-gpu-trace {} --profile-from-start off --profile-child-processes \
- {}/graalpython --vm.XX:MaxHeapSize={}G --jvm --polyglot --grcuda.RetrieveNewStreamPolicy={} {} --grcuda.ForceStreamAttach \
+ {}/graalpython --vm.XX:MaxHeapSize={}G --jvm --polyglot --experimental-options --grcuda.RetrieveNewStreamPolicy={} {} --grcuda.ForceStreamAttach \
--grcuda.ExecutionPolicy={} --grcuda.DependencyPolicy={} --grcuda.RetrieveParentStreamPolicy={} benchmark_main.py \
-i {} -n {} --reinit false --realloc false -g {} -b {} --block_size_1d {} --block_size_2d {} --no_cpu_validation {} {} --nvprof
"""
else:
GRAALPYTHON_CMD = """/usr/local/cuda/bin/nvprof --csv --log-file "{}" --print-gpu-trace {} --profile-from-start off --profile-child-processes \
- {}/graalpython --vm.XX:MaxHeapSize={}G --jvm --polyglot --grcuda.RetrieveNewStreamPolicy={} {} --grcuda.ForceStreamAttach \
+ {}/graalpython --vm.XX:MaxHeapSize={}G --jvm --polyglot --experimental-options --grcuda.RetrieveNewStreamPolicy={} {} --grcuda.ForceStreamAttach \
--grcuda.ExecutionPolicy={} --grcuda.DependencyPolicy={} --grcuda.RetrieveParentStreamPolicy={} benchmark_main.py \
-i {} -n {} --reinit false --realloc false -g {} -b {} --block_size_1d {} --block_size_2d {} --no_cpu_validation {} {} --nvprof
"""
@@ -169,7 +170,7 @@ def execute_grcuda_benchmark(benchmark, size, exec_policy, new_stream_policy,
if not os.path.exists(output_folder_path):
if debug:
BenchmarkResult.log_message(f"creating result folder: {output_folder_path}")
- os.mkdir(output_folder_path)
+ Path(output_folder_path).mkdir(parents=True, exist_ok=True)
file_name = f"{b}_{exec_policy}_{'metric' if m else 'nometric'}_{prefetch}{'' if (POST_TURING and m) else '_%p'}.csv"
output_path = os.path.join(output_folder_path, file_name)

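The last hunk above is the GRCUDA-68 fix for nested result folders: `os.mkdir` can only create the final path component and raises `FileNotFoundError` when intermediate directories are missing, while the `pathlib` call creates the whole chain and is a no-op if the folder already exists. A small sketch of the difference, run with plain CPython and a hypothetical path:

```console
python3 - <<'EOF'
import os
from pathlib import Path

# os.mkdir("results/b1/metrics") would raise FileNotFoundError if "results/b1" is missing.
Path("results/b1/metrics").mkdir(parents=True, exist_ok=True)  # creates every missing parent, idempotent
print(os.path.isdir("results/b1/metrics"))  # True
EOF
```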
9 changes: 5 additions & 4 deletions projects/resources/python/benchmark/benchmark_wrapper.py
@@ -35,6 +35,7 @@
from benchmark_result import BenchmarkResult
from benchmark_main import create_block_size_list
from java.lang import System
+ from pathlib import Path

##############################
##############################
@@ -160,15 +161,15 @@ def execute_cuda_benchmark(benchmark, size, block_size, exec_policy, num_iter, d
if not os.path.exists(output_folder_path):
if debug:
BenchmarkResult.log_message(f"creating result folder: {output_folder_path}")
- os.mkdir(output_folder_path)
+ Path(output_folder_path).mkdir(parents=True, exist_ok=True)
output_path = os.path.join(output_folder_path, file_name)

benchmark_cmd = CUDA_CMD.format(benchmark, exec_policy, size, block_size["block_size_1d"],
block_size["block_size_2d"], num_iter, num_blocks, "-r" if prefetch else "", "-a", output_path)
start = System.nanoTime()
result = subprocess.run(benchmark_cmd,
shell=True,
- stdout=subprocess.STDOUT,
+ stdout=None,
cwd=f"{os.getenv('GRCUDA_HOME')}/projects/resources/cuda/bin")
result.check_returncode()
end = System.nanoTime()
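Besides the same folder fix, the hunk above swaps `stdout=subprocess.STDOUT` for `stdout=None`: `subprocess.STDOUT` is meant for the `stderr` argument (merge stderr into stdout), and passing it as `stdout` typically fails with a bad-file-descriptor error, whereas `None` simply lets the child inherit the parent's stdout. A small sketch, again with plain CPython:

```console
python3 - <<'EOF'
import subprocess

# Inherit the parent's stdout, as the fixed wrapper does:
result = subprocess.run("echo hello from the child", shell=True, stdout=None)
result.check_returncode()

# subprocess.STDOUT belongs on the stderr side, to merge it into stdout:
merged = subprocess.run("echo oops 1>&2", shell=True,
                        stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
print(merged.stdout.decode().strip())  # -> oops
EOF
```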
@@ -179,7 +180,7 @@
##############################
##############################

- GRAALPYTHON_CMD = "graalpython --vm.XX:MaxHeapSize={}G --jvm --polyglot " \
+ GRAALPYTHON_CMD = "graalpython --vm.XX:MaxHeapSize={}G --jvm --polyglot --experimental-options " \
"--grcuda.RetrieveNewStreamPolicy={} {} --grcuda.ForceStreamAttach --grcuda.ExecutionPolicy={} --grcuda.DependencyPolicy={} " \
"--grcuda.RetrieveParentStreamPolicy={} benchmark_main.py -i {} -n {} -g {} " \
"--reinit false --realloc false -b {} --block_size_1d {} --block_size_2d {} --no_cpu_validation {} {} -o {}"
@@ -214,7 +215,7 @@ def execute_grcuda_benchmark(benchmark, size, block_sizes, exec_policy, new_stre
if not os.path.exists(output_folder_path):
if debug:
BenchmarkResult.log_message(f"creating result folder: {output_folder_path}")
- os.mkdir(output_folder_path)
+ Path(output_folder_path).mkdir(parents=True, exist_ok=True)
output_path = os.path.join(output_folder_path, file_name)
b1d_size = " ".join([str(b['block_size_1d']) for b in block_sizes])
b2d_size = " ".join([str(b['block_size_2d']) for b in block_sizes])
