
Simplified p2pBandwidthLatencyTest, removed Dockerfile, updated README

Anish Saxena · 2 years ago · commit 24eda865ed
22 changed files with 90 additions and 648 deletions
  1. .gitignore (+3 -0)
  2. hpc/multi_gpu_nways/Dockerfile (+0 -1)
  3. hpc/multi_gpu_nways/README.md (+1 -2)
  4. hpc/multi_gpu_nways/Singularity (+0 -1)
  5. hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/nccl/nccl.ipynb (+57 -2)
  6. hpc/multi_gpu_nways/labs/CFD/English/C/source_code/cuda/jacobi_streams_events.cu (+19 -0)
  7. hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/CT_skull_512x512_8u.raw (BIN)
  8. hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/CT_skull_512x512_8u_Gray.raw (BIN)
  9. hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/Lena_512x512_8u_Gray.raw (+0 -1)
  10. hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/PCB2_1024x683_8u.raw (BIN)
  11. hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/PCB_1280x720_8u.raw (BIN)
  12. hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/PCB_METAL_509x335_8u.raw (BIN)
  13. hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/Rocks_512x512_8u_Gray.raw (+0 -285)
  14. hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/lena_512x512_8u.raw (BIN)
  15. hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/lib/x64/freeglut.lib (BIN)
  16. hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/lib/x64/glew64.lib (BIN)
  17. hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Makefile (+9 -273)
  18. hpc/multi_gpu_nways/labs/CFD/English/introduction.ipynb (+1 -1)
  19. hpc/multi_gpu_nways/slurm_pmi_config/lib/libpmi.a (BIN)
  20. hpc/multi_gpu_nways/slurm_pmi_config/lib/libpmi.la (+0 -41)
  21. hpc/multi_gpu_nways/slurm_pmi_config/lib/libpmi2.a (BIN)
  22. hpc/multi_gpu_nways/slurm_pmi_config/lib/libpmi2.la (+0 -41)

+ 3 - 0
.gitignore

@@ -3,3 +3,6 @@
 alk.traj.dcd
 *.simg
 *.so*
+*.a
+*.la
+mgpm

+ 0 - 1
hpc/multi_gpu_nways/Dockerfile

@@ -1 +0,0 @@
-# To be populated

+ 1 - 2
hpc/multi_gpu_nways/README.md

@@ -6,7 +6,6 @@ Scaling applications to multiple GPUs across multiple nodes requires one to be a
 
 ## Bootcamp Outline
 
-
 * Overview of single-GPU code and Nsight Systems Profiler
 * Single Node Multi-GPU:
   - CUDA Memcpy and Peer-to-Peer Memory Access
@@ -87,4 +86,4 @@ After running Jupyter Lab, open [http://localhost:8888](http://localhost:8888/)
 
 Please join [OpenACC Slack Channel](https://openacclang.slack.com/messages/openaccusergroup) to raise questions.
 
-If you observe any errors or issues, please file an issue on [GPUBootcamp GitHuB repository](https://github.com/Anish-Saxena/gpubootcamp/tree/hpc-multi-gpu).
+If you observe any errors or issues, please file an issue on the [GPUBootcamp GitHub repository](https://github.com/gpuhackathons-org/gpubootcamp).

+ 0 - 1
hpc/multi_gpu_nways/Singularity

@@ -1,7 +1,6 @@
 # Copyright (c) 2021 NVIDIA Corporation.  All rights reserved. 
 
 Bootstrap: docker
-#FROM: nvcr.io/nvidia/nvhpc:20.11-devel-cuda_multi-ubuntu20.04
 FROM: nvcr.io/nvidia/nvhpc:21.5-devel-cuda_multi-ubuntu20.04
 
 %environment
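The recipe now bootstraps directly from the NVHPC 21.5 CUDA container. A typical build and launch of this definition file would look like the following sketch (the image name is illustrative, and *.simg files are git-ignored per the .gitignore change above; building from a definition file generally needs root or --fakeroot, depending on your site's Singularity setup):

    singularity build multi_gpu_nways.simg Singularity
    singularity run --nv multi_gpu_nways.simg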

+ 57 - 2
hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/nccl/nccl.ipynb

@@ -10,10 +10,65 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "6ca7ab3b-aef8-41d6-a568-8458bce7c7d6",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Sun Aug  1 16:12:04 2021       \n",
+      "+-----------------------------------------------------------------------------+\n",
+      "| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |\n",
+      "|-------------------------------+----------------------+----------------------+\n",
+      "| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |\n",
+      "| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |\n",
+      "|                               |                      |               MIG M. |\n",
+      "|===============================+======================+======================|\n",
+      "|   0  Tesla V100-SXM2...  On   | 00000000:06:00.0 Off |                    0 |\n",
+      "| N/A   28C    P0    40W / 300W |      0MiB / 16160MiB |      0%      Default |\n",
+      "|                               |                      |                  N/A |\n",
+      "+-------------------------------+----------------------+----------------------+\n",
+      "|   1  Tesla V100-SXM2...  On   | 00000000:07:00.0 Off |                    0 |\n",
+      "| N/A   28C    P0    41W / 300W |      0MiB / 16160MiB |      0%      Default |\n",
+      "|                               |                      |                  N/A |\n",
+      "+-------------------------------+----------------------+----------------------+\n",
+      "|   2  Tesla V100-SXM2...  On   | 00000000:0A:00.0 Off |                    0 |\n",
+      "| N/A   29C    P0    42W / 300W |      0MiB / 16160MiB |      0%      Default |\n",
+      "|                               |                      |                  N/A |\n",
+      "+-------------------------------+----------------------+----------------------+\n",
+      "|   3  Tesla V100-SXM2...  On   | 00000000:0B:00.0 Off |                    0 |\n",
+      "| N/A   27C    P0    41W / 300W |      0MiB / 16160MiB |      0%      Default |\n",
+      "|                               |                      |                  N/A |\n",
+      "+-------------------------------+----------------------+----------------------+\n",
+      "|   4  Tesla V100-SXM2...  On   | 00000000:85:00.0 Off |                    0 |\n",
+      "| N/A   28C    P0    42W / 300W |      0MiB / 16160MiB |      0%      Default |\n",
+      "|                               |                      |                  N/A |\n",
+      "+-------------------------------+----------------------+----------------------+\n",
+      "|   5  Tesla V100-SXM2...  On   | 00000000:86:00.0 Off |                    0 |\n",
+      "| N/A   29C    P0    43W / 300W |      0MiB / 16160MiB |      0%      Default |\n",
+      "|                               |                      |                  N/A |\n",
+      "+-------------------------------+----------------------+----------------------+\n",
+      "|   6  Tesla V100-SXM2...  On   | 00000000:89:00.0 Off |                    0 |\n",
+      "| N/A   31C    P0    42W / 300W |      0MiB / 16160MiB |      0%      Default |\n",
+      "|                               |                      |                  N/A |\n",
+      "+-------------------------------+----------------------+----------------------+\n",
+      "|   7  Tesla V100-SXM2...  On   | 00000000:8A:00.0 Off |                    0 |\n",
+      "| N/A   28C    P0    42W / 300W |      0MiB / 16160MiB |      0%      Default |\n",
+      "|                               |                      |                  N/A |\n",
+      "+-------------------------------+----------------------+----------------------+\n",
+      "                                                                               \n",
+      "+-----------------------------------------------------------------------------+\n",
+      "| Processes:                                                                  |\n",
+      "|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |\n",
+      "|        ID   ID                                                   Usage      |\n",
+      "|=============================================================================|\n",
+      "|  No running processes found                                                 |\n",
+      "+-----------------------------------------------------------------------------+\n"
+     ]
+    }
+   ],
    "source": [
     "!nvidia-smi"
    ]

+ 19 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/cuda/jacobi_streams_events.cu

@@ -261,6 +261,11 @@ int main(int argc, char* argv[]) {
             CUDA_RT_CALL(
                 cudaMemsetAsync(l2_norm_d[dev_id], 0, sizeof(float), compute_stream[dev_id]));
 
+            // TODO: Part 4- Block the "compute_stream" until the top and bottom halos from the
+            // neighbours have been copied to "dev_id". The "push_top_done" and "push_bottom_done"
+            // events are to be monitored for the "bottom" and "top" neighbours, respectively, for
+            // the previous iteration, denoted by "iter % 2".
+            // Note that there should be 2 distinct cudaStreamWaitEvent calls.
             CUDA_RT_CALL(
                 cudaStreamWaitEvent(compute_stream[dev_id], push_top_done[(iter % 2)][bottom], 0));
             CUDA_RT_CALL(
@@ -273,6 +278,8 @@ int main(int argc, char* argv[]) {
                     a_new[dev_id], a[dev_id], l2_norm_d[dev_id], iy_start[dev_id], iy_end[dev_id],
                     nx);
 
+            // TODO: Part 4- Record that the Jacobi computation on "compute_stream" is done by
+            // calling cudaEventRecord on the "compute_done" event of "dev_id"
             CUDA_RT_CALL(cudaEventRecord(compute_done[dev_id], compute_stream[dev_id]));
 
             CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h[dev_id], l2_norm_d[dev_id], sizeof(float),
@@ -284,17 +291,29 @@ int main(int argc, char* argv[]) {
             CUDA_RT_CALL(cudaSetDevice(dev_id));
 
             // Apply periodic boundary conditions
+            // TODO: Part 4- Wait for the Jacobi computation of "dev_id" to complete by using the
+            // "compute_done" event on "push_top_stream" so that the top halo isn't copied to the
+            // neighbour before computation is done
             CUDA_RT_CALL(cudaStreamWaitEvent(push_top_stream[dev_id], compute_done[dev_id], 0));
             CUDA_RT_CALL(cudaMemcpyAsync(a_new[top] + (iy_end[top] * nx),
                                          a_new[dev_id] + iy_start[dev_id] * nx, nx * sizeof(float),
                                          cudaMemcpyDeviceToDevice, push_top_stream[dev_id]));
+            // TODO: Part 4- Record completion of the top halo copy from "dev_id" to its neighbour
+            // for use in the next iteration. Record the "push_top_done" event of "dev_id" on
+            // "push_top_stream" for the next iteration, which is "(iter+1) % 2"
             CUDA_RT_CALL(
                 cudaEventRecord(push_top_done[((iter + 1) % 2)][dev_id], push_top_stream[dev_id]));
 
+            // TODO: Part 4- Wait for the Jacobi computation of "dev_id" to complete by using the
+            // "compute_done" event on "push_bottom_stream" so that the bottom halo isn't copied to
+            // the neighbour before computation is done
             CUDA_RT_CALL(cudaStreamWaitEvent(push_bottom_stream[dev_id], compute_done[dev_id], 0));
             CUDA_RT_CALL(cudaMemcpyAsync(a_new[bottom], a_new[dev_id] + (iy_end[dev_id] - 1) * nx,
                                          nx * sizeof(float), cudaMemcpyDeviceToDevice,
                                          push_bottom_stream[dev_id]));
+            // TODO: Part 4- Record completion of the bottom halo copy from "dev_id" to its
+            // neighbour for use in the next iteration. Record the "push_bottom_done" event of
+            // "dev_id" on "push_bottom_stream" for the next iteration, which is "(iter+1) % 2"
             CUDA_RT_CALL(cudaEventRecord(push_bottom_done[((iter + 1) % 2)][dev_id],
                                          push_bottom_stream[dev_id]));
         }

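The TODO markers added above all drive one pattern: record an event on the stream that produced data, and make the consuming stream wait on that event before touching the data. A minimal, self-contained sketch of that cudaEventRecord/cudaStreamWaitEvent handshake follows; the producer/consumer kernels are illustrative only and not part of the lab code.

    // halo_sync_sketch.cu - illustrative only; mirrors the Part 4 pattern:
    // a producer runs on streamA, an event records its completion, and
    // streamB waits on that event before consuming the result.
    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void produce(float* x) { *x = 42.0f; }
    __global__ void consume(const float* x, float* y) { *y = *x + 1.0f; }

    int main() {
        float *x, *y;
        cudaMalloc(&x, sizeof(float));
        cudaMalloc(&y, sizeof(float));

        cudaStream_t streamA, streamB;
        cudaStreamCreate(&streamA);
        cudaStreamCreate(&streamB);
        cudaEvent_t done;
        cudaEventCreate(&done);

        produce<<<1, 1, 0, streamA>>>(x);       // "compute" on streamA
        cudaEventRecord(done, streamA);         // like cudaEventRecord(compute_done[dev_id], ...)
        cudaStreamWaitEvent(streamB, done, 0);  // like cudaStreamWaitEvent(push_top_stream[dev_id], ...)
        consume<<<1, 1, 0, streamB>>>(x, y);    // "halo copy" consumer, now safely ordered

        float out = 0.0f;
        cudaMemcpy(&out, y, sizeof(float), cudaMemcpyDeviceToHost);  // implicit sync
        printf("out = %f\n", out);              // prints 43.000000

        cudaEventDestroy(done);
        cudaStreamDestroy(streamA);
        cudaStreamDestroy(streamB);
        cudaFree(x);
        cudaFree(y);
        return 0;
    }

The double-buffered indexing in the lab code ("iter % 2" versus "(iter+1) % 2") extends this pattern across iterations, so that waiting on the previous iteration's halo events never races with recording the current iteration's.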
BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/CT_skull_512x512_8u.raw


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/CT_skull_512x512_8u_Gray.raw


File diff suppressed because it is too large
+ 0 - 1
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/Lena_512x512_8u_Gray.raw


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/PCB2_1024x683_8u.raw


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/PCB_1280x720_8u.raw


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/PCB_METAL_509x335_8u.raw


File diff suppressed because it is too large
+ 0 - 285
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/Rocks_512x512_8u_Gray.raw


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/lena_512x512_8u.raw


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/lib/x64/freeglut.lib


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/lib/x64/glew64.lib


+ 9 - 273
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Makefile

@@ -32,279 +32,28 @@
 ################################################################################
 
 # Location of the CUDA Toolkit
-CUDA_PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/21.5/cuda
-GCC=
-##############################
-# start deprecated interface #
-##############################
-ifeq ($(x86_64),1)
-    $(info WARNING - x86_64 variable has been deprecated)
-    $(info WARNING - please use TARGET_ARCH=x86_64 instead)
-    TARGET_ARCH ?= x86_64
-endif
-ifeq ($(ARMv7),1)
-    $(info WARNING - ARMv7 variable has been deprecated)
-    $(info WARNING - please use TARGET_ARCH=armv7l instead)
-    TARGET_ARCH ?= armv7l
-endif
-ifeq ($(aarch64),1)
-    $(info WARNING - aarch64 variable has been deprecated)
-    $(info WARNING - please use TARGET_ARCH=aarch64 instead)
-    TARGET_ARCH ?= aarch64
-endif
-ifeq ($(ppc64le),1)
-    $(info WARNING - ppc64le variable has been deprecated)
-    $(info WARNING - please use TARGET_ARCH=ppc64le instead)
-    TARGET_ARCH ?= ppc64le
-endif
-ifneq ($(GCC),)
-    $(info WARNING - GCC variable has been deprecated)
-    $(info WARNING - please use HOST_COMPILER=$(GCC) instead)
-    HOST_COMPILER ?= $(GCC)
-endif
-ifneq ($(abi),)
-    $(error ERROR - abi variable has been removed)
-endif
-############################
-# end deprecated interface #
-############################
 
-# architecture
-HOST_ARCH   := $(shell uname -m)
-TARGET_ARCH ?= $(HOST_ARCH)
-ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
-    ifneq ($(TARGET_ARCH),$(HOST_ARCH))
-        ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
-            TARGET_SIZE := 64
-        else ifneq (,$(filter $(TARGET_ARCH),armv7l))
-            TARGET_SIZE := 32
-        endif
-    else
-        TARGET_SIZE := $(shell getconf LONG_BIT)
-    endif
-else
-    $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
-endif
-
-# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
-ifeq ($(HOST_ARCH),aarch64)
-    ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
-        HOST_ARCH := sbsa
-        TARGET_ARCH := sbsa
-    endif
-endif
-
-ifneq ($(TARGET_ARCH),$(HOST_ARCH))
-    ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
-        $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
-    endif
-endif
-
-# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
-ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
-    TARGET_ARCH = armv7l
-endif
-
-# operating system
-HOST_OS   := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
-TARGET_OS ?= $(HOST_OS)
-ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
-    $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
-endif
-
-# host compiler
-ifeq ($(TARGET_OS),darwin)
-    ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
-        HOST_COMPILER ?= clang++
-    endif
-else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
-    ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
-        ifeq ($(TARGET_OS),linux)
-            HOST_COMPILER ?= arm-linux-gnueabihf-g++
-        else ifeq ($(TARGET_OS),qnx)
-            ifeq ($(QNX_HOST),)
-                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
-            endif
-            ifeq ($(QNX_TARGET),)
-                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
-            endif
-            export QNX_HOST
-            export QNX_TARGET
-            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
-        else ifeq ($(TARGET_OS),android)
-            HOST_COMPILER ?= arm-linux-androideabi-g++
-        endif
-    else ifeq ($(TARGET_ARCH),aarch64)
-        ifeq ($(TARGET_OS), linux)
-            HOST_COMPILER ?= aarch64-linux-gnu-g++
-        else ifeq ($(TARGET_OS),qnx)
-            ifeq ($(QNX_HOST),)
-                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
-            endif
-            ifeq ($(QNX_TARGET),)
-                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
-            endif
-            export QNX_HOST
-            export QNX_TARGET
-            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
-        else ifeq ($(TARGET_OS), android)
-            HOST_COMPILER ?= aarch64-linux-android-clang++
-        endif
-    else ifeq ($(TARGET_ARCH),sbsa)
-        HOST_COMPILER ?= aarch64-linux-gnu-g++
-    else ifeq ($(TARGET_ARCH),ppc64le)
-        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
-    endif
-endif
 HOST_COMPILER ?= g++
-NVCC          := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
+NVCC          := nvcc -ccbin $(HOST_COMPILER)
 
 # internal flags
-NVCCFLAGS   := -m${TARGET_SIZE}
+NVCCFLAGS   :=
 CCFLAGS     :=
 LDFLAGS     :=
 
-# build flags
-ifeq ($(TARGET_OS),darwin)
-    LDFLAGS += -rpath $(CUDA_PATH)/lib
-    CCFLAGS += -arch $(HOST_ARCH)
-else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
-    LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
-    CCFLAGS += -mfloat-abi=hard
-else ifeq ($(TARGET_OS),android)
-    LDFLAGS += -pie
-    CCFLAGS += -fpie -fpic -fexceptions
-endif
-
-ifneq ($(TARGET_ARCH),$(HOST_ARCH))
-    ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
-        ifneq ($(TARGET_FS),)
-            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
-            ifeq ($(GCCVERSIONLTEQ46),1)
-                CCFLAGS += --sysroot=$(TARGET_FS)
-            endif
-            LDFLAGS += --sysroot=$(TARGET_FS)
-            LDFLAGS += -rpath-link=$(TARGET_FS)/lib
-            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
-            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
-        endif
-    endif
-    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
-        ifneq ($(TARGET_FS),)
-            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
-            ifeq ($(GCCVERSIONLTEQ46),1)
-                CCFLAGS += --sysroot=$(TARGET_FS)
-            endif
-            LDFLAGS += --sysroot=$(TARGET_FS)
-            LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
-            LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
-            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
-            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
-            LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
-            CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
-            CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
-        endif
-    endif
-    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
-        NVCCFLAGS += --qpp-config 5.4.0,gcc_ntoaarch64le
-        CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
-        LDFLAGS += -lsocket
-        LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
-        CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
-        ifdef TARGET_OVERRIDE
-            LDFLAGS += -lslog2
-        endif
-
-        ifneq ($(TARGET_FS),)
-            LDFLAGS += -L$(TARGET_FS)/usr/lib
-            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
-            LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
-            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
-            CCFLAGS += -I$(TARGET_FS)/../include
-        endif
-    endif
-endif
-
-ifdef TARGET_OVERRIDE # cuda toolkit targets override
-    NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
-endif
-
-# Install directory of different arch
-CUDA_INSTALL_TARGET_DIR :=
-ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
-    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
-else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
-    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
-else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
-    CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
-else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
-    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
-else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
-    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
-else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
-    CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
-else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
-    CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
-else ifeq ($(TARGET_ARCH),ppc64le)
-    CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
-endif
-
-# Debug build flags
-ifeq ($(dbg),1)
-      NVCCFLAGS += -g -G
-      BUILD_TYPE := debug
-else
-      BUILD_TYPE := release
-endif
-
-ALL_CCFLAGS :=
-ALL_CCFLAGS += $(NVCCFLAGS)
-ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
-ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
-ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
-
 SAMPLE_ENABLED := 1
 
-ALL_LDFLAGS :=
-ALL_LDFLAGS += $(ALL_CCFLAGS)
-ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
-ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
-
 # Common includes and paths for CUDA
 INCLUDES  := -I./Common
 LIBRARIES :=
 
 ################################################################################
+GENCODE_SM70    := -gencode arch=compute_70,code=sm_70
+GENCODE_SM80    := -gencode arch=compute_80,code=sm_80 
+GENCODE_FLAGS	:= $(GENCODE_SM70) $(GENCODE_SM80)
 
-# Gencode arguments
-ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64))
-SMS ?= 70 72 75 80 86
-else
-SMS ?= 70 75 80 86
-endif
-
-ifeq ($(SMS),)
-$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
-SAMPLE_ENABLED := 0
-endif
-
-ifeq ($(GENCODE_FLAGS),)
-# Generate SASS code for each SM architecture listed in $(SMS)
-$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
-
-# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
-HIGHEST_SM := $(lastword $(sort $(SMS)))
-ifneq ($(HIGHEST_SM),)
-GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
-endif
-endif
-
-ALL_CCFLAGS += --threads 0
-
-ifeq ($(SAMPLE_ENABLED),0)
-EXEC ?= @echo "[@]"
-endif
-
+NVCC_FLAGS += -std=c++14
+LD_FLAGS += -lcudart
 ################################################################################
 
 # Target rules
@@ -312,26 +61,13 @@ all: build
 
 build: p2pBandwidthLatencyTest
 
-check.deps:
-ifeq ($(SAMPLE_ENABLED),0)
-	@echo "Sample will be waived due to the above missing dependencies"
-else
-	@echo "Sample is ready - all dependencies have been met"
-endif
-
 p2pBandwidthLatencyTest.o:p2pBandwidthLatencyTest.cu
-	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
+	$(NVCC) $(INCLUDES) $(NVCC_FLAGS) $(GENCODE_FLAGS) -o $@ -c $<
 
 p2pBandwidthLatencyTest: p2pBandwidthLatencyTest.o
-	$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
-	$(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
-	$(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
-
-run: build
-	$(EXEC) ./p2pBandwidthLatencyTest
+	$(NVCC) $(LD_FLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
 
 clean:
 	rm -f p2pBandwidthLatencyTest p2pBandwidthLatencyTest.o
-	rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/p2pBandwidthLatencyTest
 
 clobber: clean
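With the simplified Makefile, nvcc is taken from PATH and device code is generated only for sm_70 (V100) and sm_80 (A100); the per-arch bin/ install step and the `run` target are gone, so the binary is built and run in place. A typical cycle, assuming a CUDA 11.x nvcc on PATH and a matching GPU:

    make                        # builds ./p2pBandwidthLatencyTest in the source directory
    ./p2pBandwidthLatencyTest
    make clean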

+ 1 - 1
hpc/multi_gpu_nways/labs/CFD/English/introduction.ipynb

@@ -35,7 +35,7 @@
     "We will take up the Jacobi Solver, an iterative technique for solving system of linear equations, in this tutorial. To begin, click on the first link below:\n",
     "\n",
     "1. [Overview of single-GPU code and Nsight Systems Profiler](C/jupyter_notebook/single_gpu/single_gpu_overview.ipynb)\n",
-    "2. Single Node :Multi-GPU:\n",
+    "2. Single Node Multi-GPU:\n",
     "    * [CUDA Memcpy and Peer-to-Peer Memory Access](C/jupyter_notebook/cuda/memcpy.ipynb)\n",
     "    * [Intra-node topology](C/jupyter_notebook/advanced_concepts/single_node_topology.ipynb)\n",
     "    * [CUDA Streams and Events](C/jupyter_notebook/cuda/streams.ipynb)\n",

BIN
hpc/multi_gpu_nways/slurm_pmi_config/lib/libpmi.a


+ 0 - 41
hpc/multi_gpu_nways/slurm_pmi_config/lib/libpmi.la

@@ -1,41 +0,0 @@
-# libpmi.la - a libtool library file
-# Generated by libtool (GNU libtool) 2.4.6 Debian-2.4.6-10
-#
-# Please DO NOT delete this file!
-# It is necessary for linking the library.
-
-# The name that we can dlopen(3).
-dlname='libpmi.so.0'
-
-# Names of this library.
-library_names='libpmi.so.0.0.0 libpmi.so.0 libpmi.so'
-
-# The name of the static archive.
-old_library='libpmi.a'
-
-# Linker flags that cannot go in dependency_libs.
-inherited_linker_flags=' -pthread'
-
-# Libraries that this one depends upon.
-dependency_libs=' -ldl -lresolv'
-
-# Names of additional weak libraries provided by this library
-weak_library_names=''
-
-# Version information for libpmi.
-current=0
-age=0
-revision=0
-
-# Is this an already installed library?
-installed=yes
-
-# Should we warn about portability when linking against -modules?
-shouldnotlink=no
-
-# Files to dlopen/dlpreopen
-dlopen=''
-dlpreopen=''
-
-# Directory that this library needs to be installed in:
-libdir='/usr/local/lib'

BIN
hpc/multi_gpu_nways/slurm_pmi_config/lib/libpmi2.a


+ 0 - 41
hpc/multi_gpu_nways/slurm_pmi_config/lib/libpmi2.la

@@ -1,41 +0,0 @@
-# libpmi2.la - a libtool library file
-# Generated by libtool (GNU libtool) 2.4.6 Debian-2.4.6-10
-#
-# Please DO NOT delete this file!
-# It is necessary for linking the library.
-
-# The name that we can dlopen(3).
-dlname='libpmi2.so.0'
-
-# Names of this library.
-library_names='libpmi2.so.0.0.0 libpmi2.so.0 libpmi2.so'
-
-# The name of the static archive.
-old_library='libpmi2.a'
-
-# Linker flags that cannot go in dependency_libs.
-inherited_linker_flags=' -pthread'
-
-# Libraries that this one depends upon.
-dependency_libs=' -lresolv'
-
-# Names of additional weak libraries provided by this library
-weak_library_names=''
-
-# Version information for libpmi2.
-current=0
-age=0
-revision=0
-
-# Is this an already installed library?
-installed=yes
-
-# Should we warn about portability when linking against -modules?
-shouldnotlink=no
-
-# Files to dlopen/dlpreopen
-dlopen=''
-dlpreopen=''
-
-# Directory that this library needs to be installed in:
-libdir='/usr/local/lib'