[GrCUDA-HOTFIX] fix install and benchmarks (NVIDIA#20)
* fixed install dir (GRCUDA-67)
* fixed python benchmarks not creating nested folders and not using experimental options (GRCUDA-68)
* updated make for cuda benchmarks and readme (GRCUDA-68)
AlbertoParravicini committed Sep 26, 2021
1 parent d63678d commit 9c9a357
Showing 5 changed files with 30 additions and 48 deletions.
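As a minimal sketch of picking up this hotfix locally, assuming `$GRCUDA_HOME` and `$GRAAL_HOME` are already exported as `install.sh` expects:

```console
cd $GRCUDA_HOME
git checkout 9c9a357
./install.sh   # runs "mx build" and copies grcuda.jar into $GRAAL_HOME/languages/grcuda
```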
10 changes: 9 additions & 1 deletion README.md
@@ -288,7 +288,7 @@ mx unittest com.nvidia
mx unittest com.nvidia.grcuda.test.BuildKernelTest#testBuildKernelwithNFILegacytSignature
```

- 5. **Setup the grcuda-data sumbodule**
+ 5. **Setup the grcuda-data submodule**
The `grcuda-data` repository is used as a `git` submodule to store data, results, and plots for demos, benchmarks, and publications. You will need this submodule to run the full benchmark suite, and some of the demos. To setup the submodule, follow this [`README`](https://github.com/AlbertoParravicini/grcuda-data/tree/master).
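As a minimal sketch of the usual `git` workflow (the linked `grcuda-data` README remains the authoritative reference), assuming the submodule is already registered in `.gitmodules`:

```console
cd $GRCUDA_HOME
git submodule update --init --recursive
```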

### Setup your IDE
@@ -325,6 +325,7 @@ Here, we explain how to setup IntelliJ Idea.

To measure the performance of GrCUDA on complex GPU applications, we have developed a custom benchmark suite, found in `projects/resources/python/benchmark`.
These are the same benchmarks used in the [DAG-based Scheduling with Resource Sharing for Multi-task Applications in a Polyglot GPU Runtime](https://ieeexplore.ieee.org/abstract/document/9460491) paper.
+ All commands are executed from `$GRCUDA_HOME/projects/resources/python/benchmark`;

Run a single benchmark with custom settings
```console
@@ -336,6 +337,13 @@ Run all benchmarks
graalpython --jvm --polyglot benchmark_wrapper.py -d -i 30
```
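The single-benchmark command itself is collapsed in this diff; as a purely hypothetical sketch built from flags that appear in the wrapper's command templates further down (`-b`, `-i`, `-n`, the block-size options; meanings and values here are guesses), a direct run might look like:

```console
graalpython --jvm --polyglot --experimental-options benchmark_main.py -b b1 -i 10 -n 1000 --block_size_1d 256 --block_size_2d 8
```

The `--experimental-options` flag is harmless here and becomes required as soon as any `--grcuda.*` option is added, which is what this commit fixes in the wrappers.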

+ To run the CUDA version of all benchmarks, build it as follows. You might want to update the GPU architecture (the `-arch` flag) inside `$GRCUDA_HOME/projects/resources/cuda/Makefile` to reflect the hardware at your disposal.
+ ```console
+ cd $GRCUDA_HOME/projects/resources/cuda;
+ make
+ cd -;
+ ```
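For instance, on an Ampere A100 (compute capability 8.0) the architecture flag would become `sm_80`; one way to patch the Makefile in place (adjust `sm_80` to your own GPU):

```console
sed -i 's/-arch=sm_70/-arch=sm_80/' $GRCUDA_HOME/projects/resources/cuda/Makefile
```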

Run the CUDA version of all benchmarks
```console
graalpython --jvm --polyglot benchmark_wrapper.py -d -i 30 -c
2 changes: 1 addition & 1 deletion install.sh
@@ -3,5 +3,5 @@
mx build;

# Install for Java 8+;
- mkdir -p $GRAAL_HOME/jre/languages/grcuda;
+ mkdir -p $GRAAL_HOME/languages/grcuda;
cp $GRCUDA_HOME/mxbuild/dists/jdk1.8/grcuda.jar $GRAAL_HOME/languages/grcuda/.;
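A quick check that the corrected destination is in place, assuming `$GRAAL_HOME` points at the GraalVM installation used by `install.sh`:

```console
./install.sh
ls $GRAAL_HOME/languages/grcuda/
# grcuda.jar
```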
48 changes: 10 additions & 38 deletions projects/resources/cuda/Makefile
@@ -27,51 +27,23 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

- # Use NVCC;
+ # Use NVCC.
+ # Set the appropriate GPU architecture, check https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
CXX=nvcc
FLAGS = -std=c++11 -O3 -arch=sm_70

- # Use Clang;
- CXX=$(CLANG_DIR)/clang++
- FLAGS = --cuda-gpu-arch=sm_70 -L/usr/local/cuda/lib64 -lcudart_static -ldl -lrt -pthread -std=c++11 -O3
+ # (Experimental) Use Clang;
+ # CXX=$(CLANG_DIR)/clang++
+ # FLAGS = --cuda-gpu-arch=sm_70 -L/usr/local/cuda/lib64 -lcudart_static -ldl -lrt -pthread -std=c++11 -O3

BIN_FOLDER=bin
FILES=main.cu benchmark.cu b1.cu b5.cu b6.cu b7.cu b8.cu b10.cu

- .PHONY: full all b1 b5 b6 b7 b8 b10
+ .PHONY: all clean

- full:
+ all:
mkdir -p $(BIN_FOLDER);
$(CXX) $(FILES) $(FLAGS) -o $(BIN_FOLDER)/b;

- all: \
- full \
- b1 \
- b5 \
- b6 \
- b7 \
- b8 \
- b10
-
- b1: b1*
- $(CXX) old/b1_default.cu $(FLAGS) -o $(BIN_FOLDER)/b1_default;
- $(CXX) old/b1_sync.cu $(FLAGS) -o $(BIN_FOLDER)/b1_sync
-
- b5: b5*
- $(CXX) old/b5_default.cu $(FLAGS) -o $(BIN_FOLDER)/b5_default;
- $(CXX) old/b5_sync.cu $(FLAGS) -o $(BIN_FOLDER)/b5_sync
-
- b6: b6*
- $(CXX) old/b6_default.cu $(FLAGS) -o $(BIN_FOLDER)/b6_default;
- $(CXX) old/b6_sync.cu $(FLAGS) -o $(BIN_FOLDER)/b6_sync
-
- b7: b7*
- $(CXX) old/b7_default.cu $(FLAGS) -o $(BIN_FOLDER)/b7_default;
- $(CXX) old/b7_sync.cu $(FLAGS) -o $(BIN_FOLDER)/b7_sync
-
- b8: b8*
- $(CXX) old/b8_default.cu $(FLAGS) -o $(BIN_FOLDER)/b8_default;
- $(CXX) old/b8_sync.cu $(FLAGS) -o $(BIN_FOLDER)/b8_sync
-
- b10: b10*
- $(CXX) old/b10_default.cu $(FLAGS) -o $(BIN_FOLDER)/b10_default;
- $(CXX) old/b10_sync.cu $(FLAGS) -o $(BIN_FOLDER)/b10_sync
+ clean:
+ rm $(BIN_FOLDER)/*;
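With the simplified Makefile, all benchmarks are compiled into a single `bin/b` executable; a typical build-and-clean cycle, assuming `nvcc` is on the `PATH` and `-arch` matches your GPU:

```console
cd $GRCUDA_HOME/projects/resources/cuda
make        # creates bin/ and builds bin/b from the files listed in FILES
ls bin      # -> b
make clean  # removes everything under bin/
```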
9 changes: 5 additions & 4 deletions
@@ -35,6 +35,7 @@
from benchmark_result import BenchmarkResult
from benchmark_main import create_block_size_list
from java.lang import System
+ from pathlib import Path

##############################
##############################
@@ -124,18 +125,18 @@

if POST_TURING:
GRAALPYTHON_CMD_METRICS = """/usr/local/cuda/bin/ncu -f --print-units base --csv --log-file "{}" --profile-from-start off --target-processes all {} \
- {}/graalpython --vm.XX:MaxHeapSize={}G --jvm --polyglot --grcuda.RetrieveNewStreamPolicy={} {} --grcuda.ForceStreamAttach \
+ {}/graalpython --vm.XX:MaxHeapSize={}G --jvm --polyglot --experimental-options --grcuda.RetrieveNewStreamPolicy={} {} --grcuda.ForceStreamAttach \
--grcuda.ExecutionPolicy={} --grcuda.DependencyPolicy={} --grcuda.RetrieveParentStreamPolicy={} benchmark_main.py \
-i {} -n {} --reinit false --realloc false -g {} -b {} --block_size_1d {} --block_size_2d {} --no_cpu_validation {} {} --nvprof
"""
GRAALPYTHON_CMD_TRACE = """/usr/local/cuda/bin/nvprof --csv --log-file "{}" --print-gpu-trace {} --profile-from-start off --profile-child-processes \
- {}/graalpython --vm.XX:MaxHeapSize={}G --jvm --polyglot --grcuda.RetrieveNewStreamPolicy={} {} --grcuda.ForceStreamAttach \
+ {}/graalpython --vm.XX:MaxHeapSize={}G --jvm --polyglot --experimental-options --grcuda.RetrieveNewStreamPolicy={} {} --grcuda.ForceStreamAttach \
--grcuda.ExecutionPolicy={} --grcuda.DependencyPolicy={} --grcuda.RetrieveParentStreamPolicy={} benchmark_main.py \
-i {} -n {} --reinit false --realloc false -g {} -b {} --block_size_1d {} --block_size_2d {} --no_cpu_validation {} {} --nvprof
"""
else:
GRAALPYTHON_CMD = """/usr/local/cuda/bin/nvprof --csv --log-file "{}" --print-gpu-trace {} --profile-from-start off --profile-child-processes \
- {}/graalpython --vm.XX:MaxHeapSize={}G --jvm --polyglot --grcuda.RetrieveNewStreamPolicy={} {} --grcuda.ForceStreamAttach \
+ {}/graalpython --vm.XX:MaxHeapSize={}G --jvm --polyglot --experimental-options --grcuda.RetrieveNewStreamPolicy={} {} --grcuda.ForceStreamAttach \
--grcuda.ExecutionPolicy={} --grcuda.DependencyPolicy={} --grcuda.RetrieveParentStreamPolicy={} benchmark_main.py \
-i {} -n {} --reinit false --realloc false -g {} -b {} --block_size_1d {} --block_size_2d {} --no_cpu_validation {} {} --nvprof
"""
@@ -169,7 +170,7 @@ def execute_grcuda_benchmark(benchmark, size, exec_policy, new_stream_policy,
if not os.path.exists(output_folder_path):
if debug:
BenchmarkResult.log_message(f"creating result folder: {output_folder_path}")
- os.mkdir(output_folder_path)
+ Path(output_folder_path).mkdir(parents=True, exist_ok=True)
file_name = f"{b}_{exec_policy}_{'metric' if m else 'nometric'}_{prefetch}{'' if (POST_TURING and m) else '_%p'}.csv"
output_path = os.path.join(output_folder_path, file_name)

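The last hunk above is the GRCUDA-68 fix for nested result folders: `os.mkdir` can only create the final path component and raises `FileNotFoundError` when intermediate directories are missing, while the `pathlib` call creates the whole chain and is a no-op if the folder already exists. A small sketch of the difference, run with plain CPython and a hypothetical path:

```console
python3 - <<'EOF'
import os
from pathlib import Path

# os.mkdir("results/b1/metrics") would raise FileNotFoundError if "results/b1" is missing.
Path("results/b1/metrics").mkdir(parents=True, exist_ok=True)  # creates every missing parent, idempotent
print(os.path.isdir("results/b1/metrics"))  # True
EOF
```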
9 changes: 5 additions & 4 deletions projects/resources/python/benchmark/benchmark_wrapper.py
@@ -35,6 +35,7 @@
from benchmark_result import BenchmarkResult
from benchmark_main import create_block_size_list
from java.lang import System
+ from pathlib import Path

##############################
##############################
@@ -160,15 +161,15 @@ def execute_cuda_benchmark(benchmark, size, block_size, exec_policy, num_iter, d
if not os.path.exists(output_folder_path):
if debug:
BenchmarkResult.log_message(f"creating result folder: {output_folder_path}")
- os.mkdir(output_folder_path)
+ Path(output_folder_path).mkdir(parents=True, exist_ok=True)
output_path = os.path.join(output_folder_path, file_name)

benchmark_cmd = CUDA_CMD.format(benchmark, exec_policy, size, block_size["block_size_1d"],
block_size["block_size_2d"], num_iter, num_blocks, "-r" if prefetch else "", "-a", output_path)
start = System.nanoTime()
result = subprocess.run(benchmark_cmd,
shell=True,
- stdout=subprocess.STDOUT,
+ stdout=None,
cwd=f"{os.getenv('GRCUDA_HOME')}/projects/resources/cuda/bin")
result.check_returncode()
end = System.nanoTime()
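Besides the same folder fix, the hunk above swaps `stdout=subprocess.STDOUT` for `stdout=None`: `subprocess.STDOUT` is meant for the `stderr` argument (merge stderr into stdout), and passing it as `stdout` typically fails with a bad-file-descriptor error, whereas `None` simply lets the child inherit the parent's stdout. A small sketch, again with plain CPython:

```console
python3 - <<'EOF'
import subprocess

# Inherit the parent's stdout, as the fixed wrapper does:
result = subprocess.run("echo hello from the child", shell=True, stdout=None)
result.check_returncode()

# subprocess.STDOUT belongs on the stderr side, to merge it into stdout:
merged = subprocess.run("echo oops 1>&2", shell=True,
                        stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
print(merged.stdout.decode().strip())  # -> oops
EOF
```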
@@ -179,7 +180,7 @@
##############################
##############################

- GRAALPYTHON_CMD = "graalpython --vm.XX:MaxHeapSize={}G --jvm --polyglot " \
+ GRAALPYTHON_CMD = "graalpython --vm.XX:MaxHeapSize={}G --jvm --polyglot --experimental-options " \
"--grcuda.RetrieveNewStreamPolicy={} {} --grcuda.ForceStreamAttach --grcuda.ExecutionPolicy={} --grcuda.DependencyPolicy={} " \
"--grcuda.RetrieveParentStreamPolicy={} benchmark_main.py -i {} -n {} -g {} " \
"--reinit false --realloc false -b {} --block_size_1d {} --block_size_2d {} --no_cpu_validation {} {} -o {}"
@@ -214,7 +215,7 @@ def execute_grcuda_benchmark(benchmark, size, block_sizes, exec_policy, new_stre
if not os.path.exists(output_folder_path):
if debug:
BenchmarkResult.log_message(f"creating result folder: {output_folder_path}")
- os.mkdir(output_folder_path)
+ Path(output_folder_path).mkdir(parents=True, exist_ok=True)
output_path = os.path.join(output_folder_path, file_name)
b1d_size = " ".join([str(b['block_size_1d']) for b in block_sizes])
b2d_size = " ".join([str(b['block_size_2d']) for b in block_sizes])
