
Simplified p2pBandwidthLatencyTest, removed Dockerfile, updated README

Anish Saxena · 2 years ago · commit 24eda865ed
22 changed files with 90 additions and 648 deletions
  1. .gitignore (+3 -0)
  2. hpc/multi_gpu_nways/Dockerfile (+0 -1)
  3. hpc/multi_gpu_nways/README.md (+1 -2)
  4. hpc/multi_gpu_nways/Singularity (+0 -1)
  5. hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/nccl/nccl.ipynb (+57 -2)
  6. hpc/multi_gpu_nways/labs/CFD/English/C/source_code/cuda/jacobi_streams_events.cu (+19 -0)
  7. hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/CT_skull_512x512_8u.raw (BIN)
  8. hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/CT_skull_512x512_8u_Gray.raw (BIN)
  9. hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/Lena_512x512_8u_Gray.raw (+0 -1)
  10. hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/PCB2_1024x683_8u.raw (BIN)
  11. hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/PCB_1280x720_8u.raw (BIN)
  12. hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/PCB_METAL_509x335_8u.raw (BIN)
  13. hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/Rocks_512x512_8u_Gray.raw (+0 -285)
  14. hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/lena_512x512_8u.raw (BIN)
  15. hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/lib/x64/freeglut.lib (BIN)
  16. hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/lib/x64/glew64.lib (BIN)
  17. hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Makefile (+9 -273)
  18. hpc/multi_gpu_nways/labs/CFD/English/introduction.ipynb (+1 -1)
  19. hpc/multi_gpu_nways/slurm_pmi_config/lib/libpmi.a (BIN)
  20. hpc/multi_gpu_nways/slurm_pmi_config/lib/libpmi.la (+0 -41)
  21. hpc/multi_gpu_nways/slurm_pmi_config/lib/libpmi2.a (BIN)
  22. hpc/multi_gpu_nways/slurm_pmi_config/lib/libpmi2.la (+0 -41)

+ 3 - 0
.gitignore

@@ -3,3 +3,6 @@
 alk.traj.dcd
 *.simg
 *.so*
+*.a
+*.la
+mgpm

+ 0 - 1
hpc/multi_gpu_nways/Dockerfile

@@ -1 +0,0 @@
-# To be populated

+ 1 - 2
hpc/multi_gpu_nways/README.md

@@ -6,7 +6,6 @@ Scaling applications to multiple GPUs across multiple nodes requires one to be a
 
 ## Bootcamp Outline
 
-
 * Overview of single-GPU code and Nsight Systems Profiler
 * Single Node Multi-GPU:
   - CUDA Memcpy and Peer-to-Peer Memory Access
@@ -87,4 +86,4 @@ After running Jupyter Lab, open [http://localhost:8888](http://localhost:8888/)
 
 Please join [OpenACC Slack Channel](https://openacclang.slack.com/messages/openaccusergroup) to raise questions.
 
-If you observe any errors or issues, please file an issue on [GPUBootcamp GitHuB repository](https://github.com/Anish-Saxena/gpubootcamp/tree/hpc-multi-gpu).
+If you observe any errors or issues, please file an issue on the [GPUBootcamp GitHub repository](https://github.com/gpuhackathons-org/gpubootcamp).

+ 0 - 1
hpc/multi_gpu_nways/Singularity

@@ -1,7 +1,6 @@
 # Copyright (c) 2021 NVIDIA Corporation.  All rights reserved. 
 
 Bootstrap: docker
-#FROM: nvcr.io/nvidia/nvhpc:20.11-devel-cuda_multi-ubuntu20.04
 FROM: nvcr.io/nvidia/nvhpc:21.5-devel-cuda_multi-ubuntu20.04
 
 %environment
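The recipe now bootstraps directly from the NVHPC 21.5 CUDA container. A typical build and launch of this definition file would look like the following sketch (the image name is illustrative, and *.simg files are git-ignored per the .gitignore change above; building from a definition file generally needs root or --fakeroot, depending on your site's Singularity setup):

    singularity build multi_gpu_nways.simg Singularity
    singularity run --nv multi_gpu_nways.simg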

+ 57 - 2
hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/nccl/nccl.ipynb

@@ -10,10 +10,65 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "6ca7ab3b-aef8-41d6-a568-8458bce7c7d6",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Sun Aug  1 16:12:04 2021       \n",
+      "+-----------------------------------------------------------------------------+\n",
+      "| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |\n",
+      "|-------------------------------+----------------------+----------------------+\n",
+      "| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |\n",
+      "| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |\n",
+      "|                               |                      |               MIG M. |\n",
+      "|===============================+======================+======================|\n",
+      "|   0  Tesla V100-SXM2...  On   | 00000000:06:00.0 Off |                    0 |\n",
+      "| N/A   28C    P0    40W / 300W |      0MiB / 16160MiB |      0%      Default |\n",
+      "|                               |                      |                  N/A |\n",
+      "+-------------------------------+----------------------+----------------------+\n",
+      "|   1  Tesla V100-SXM2...  On   | 00000000:07:00.0 Off |                    0 |\n",
+      "| N/A   28C    P0    41W / 300W |      0MiB / 16160MiB |      0%      Default |\n",
+      "|                               |                      |                  N/A |\n",
+      "+-------------------------------+----------------------+----------------------+\n",
+      "|   2  Tesla V100-SXM2...  On   | 00000000:0A:00.0 Off |                    0 |\n",
+      "| N/A   29C    P0    42W / 300W |      0MiB / 16160MiB |      0%      Default |\n",
+      "|                               |                      |                  N/A |\n",
+      "+-------------------------------+----------------------+----------------------+\n",
+      "|   3  Tesla V100-SXM2...  On   | 00000000:0B:00.0 Off |                    0 |\n",
+      "| N/A   27C    P0    41W / 300W |      0MiB / 16160MiB |      0%      Default |\n",
+      "|                               |                      |                  N/A |\n",
+      "+-------------------------------+----------------------+----------------------+\n",
+      "|   4  Tesla V100-SXM2...  On   | 00000000:85:00.0 Off |                    0 |\n",
+      "| N/A   28C    P0    42W / 300W |      0MiB / 16160MiB |      0%      Default |\n",
+      "|                               |                      |                  N/A |\n",
+      "+-------------------------------+----------------------+----------------------+\n",
+      "|   5  Tesla V100-SXM2...  On   | 00000000:86:00.0 Off |                    0 |\n",
+      "| N/A   29C    P0    43W / 300W |      0MiB / 16160MiB |      0%      Default |\n",
+      "|                               |                      |                  N/A |\n",
+      "+-------------------------------+----------------------+----------------------+\n",
+      "|   6  Tesla V100-SXM2...  On   | 00000000:89:00.0 Off |                    0 |\n",
+      "| N/A   31C    P0    42W / 300W |      0MiB / 16160MiB |      0%      Default |\n",
+      "|                               |                      |                  N/A |\n",
+      "+-------------------------------+----------------------+----------------------+\n",
+      "|   7  Tesla V100-SXM2...  On   | 00000000:8A:00.0 Off |                    0 |\n",
+      "| N/A   28C    P0    42W / 300W |      0MiB / 16160MiB |      0%      Default |\n",
+      "|                               |                      |                  N/A |\n",
+      "+-------------------------------+----------------------+----------------------+\n",
+      "                                                                               \n",
+      "+-----------------------------------------------------------------------------+\n",
+      "| Processes:                                                                  |\n",
+      "|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |\n",
+      "|        ID   ID                                                   Usage      |\n",
+      "|=============================================================================|\n",
+      "|  No running processes found                                                 |\n",
+      "+-----------------------------------------------------------------------------+\n"
+     ]
+    }
+   ],
    "source": [
     "!nvidia-smi"
    ]

+ 19 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/cuda/jacobi_streams_events.cu

@@ -261,6 +261,11 @@ int main(int argc, char* argv[]) {
             CUDA_RT_CALL(
                 cudaMemsetAsync(l2_norm_d[dev_id], 0, sizeof(float), compute_stream[dev_id]));
 
+            // TODO: Part 4- Block the "compute_stream" until the top and bottom halos from the
+            // neighbours have been copied to "dev_id". The "push_top_done" and "push_bottom_done"
+            // events are to be monitored for the "bottom" and "top" neighbours, respectively, for
+            // the previous iteration, denoted by "iter % 2".
+            // Note that there should be 2 distinct cudaStreamWaitEvent calls.
             CUDA_RT_CALL(
                 cudaStreamWaitEvent(compute_stream[dev_id], push_top_done[(iter % 2)][bottom], 0));
             CUDA_RT_CALL(
@@ -273,6 +278,8 @@ int main(int argc, char* argv[]) {
                     a_new[dev_id], a[dev_id], l2_norm_d[dev_id], iy_start[dev_id], iy_end[dev_id],
                     nx);
 
+            // TODO: Part 4- Record that the Jacobi computation on "compute_stream" is done by
+            // calling cudaEventRecord on the "compute_done" event of "dev_id"
             CUDA_RT_CALL(cudaEventRecord(compute_done[dev_id], compute_stream[dev_id]));
 
             CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h[dev_id], l2_norm_d[dev_id], sizeof(float),
@@ -284,17 +291,29 @@ int main(int argc, char* argv[]) {
             CUDA_RT_CALL(cudaSetDevice(dev_id));
 
             // Apply periodic boundary conditions
+            // TODO: Part 4- Wait for the Jacobi computation of "dev_id" to complete by using the
+            // "compute_done" event on "push_top_stream" so that the top halo isn't copied to the
+            // neighbour before computation is done
             CUDA_RT_CALL(cudaStreamWaitEvent(push_top_stream[dev_id], compute_done[dev_id], 0));
             CUDA_RT_CALL(cudaMemcpyAsync(a_new[top] + (iy_end[top] * nx),
                                          a_new[dev_id] + iy_start[dev_id] * nx, nx * sizeof(float),
                                          cudaMemcpyDeviceToDevice, push_top_stream[dev_id]));
+            // TODO: Part 4- Record completion of the top halo copy from "dev_id" to its neighbour
+            // for use in the next iteration. Record the "push_top_done" event of "dev_id" on
+            // "push_top_stream" for the next iteration, which is "(iter+1) % 2"
             CUDA_RT_CALL(
                 cudaEventRecord(push_top_done[((iter + 1) % 2)][dev_id], push_top_stream[dev_id]));
 
+            // TODO: Part 4- Wait for the Jacobi computation of "dev_id" to complete by using the
+            // "compute_done" event on "push_bottom_stream" so that the bottom halo isn't copied to
+            // the neighbour before computation is done
             CUDA_RT_CALL(cudaStreamWaitEvent(push_bottom_stream[dev_id], compute_done[dev_id], 0));
             CUDA_RT_CALL(cudaMemcpyAsync(a_new[bottom], a_new[dev_id] + (iy_end[dev_id] - 1) * nx,
                                          nx * sizeof(float), cudaMemcpyDeviceToDevice,
                                          push_bottom_stream[dev_id]));
+            // TODO: Part 4- Record completion of the bottom halo copy from "dev_id" to its
+            // neighbour for use in the next iteration. Record the "push_bottom_done" event of
+            // "dev_id" on "push_bottom_stream" for the next iteration, which is "(iter+1) % 2"
             CUDA_RT_CALL(cudaEventRecord(push_bottom_done[((iter + 1) % 2)][dev_id],
                                          push_bottom_stream[dev_id]));
         }

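The TODO markers added above all drive one pattern: record an event on the stream that produced data, and make the consuming stream wait on that event before touching the data. A minimal, self-contained sketch of that cudaEventRecord/cudaStreamWaitEvent handshake follows; the producer/consumer kernels are illustrative only and not part of the lab code.

    // halo_sync_sketch.cu - illustrative only; mirrors the Part 4 pattern:
    // a producer runs on streamA, an event records its completion, and
    // streamB waits on that event before consuming the result.
    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void produce(float* x) { *x = 42.0f; }
    __global__ void consume(const float* x, float* y) { *y = *x + 1.0f; }

    int main() {
        float *x, *y;
        cudaMalloc(&x, sizeof(float));
        cudaMalloc(&y, sizeof(float));

        cudaStream_t streamA, streamB;
        cudaStreamCreate(&streamA);
        cudaStreamCreate(&streamB);
        cudaEvent_t done;
        cudaEventCreate(&done);

        produce<<<1, 1, 0, streamA>>>(x);       // "compute" on streamA
        cudaEventRecord(done, streamA);         // like cudaEventRecord(compute_done[dev_id], ...)
        cudaStreamWaitEvent(streamB, done, 0);  // like cudaStreamWaitEvent(push_top_stream[dev_id], ...)
        consume<<<1, 1, 0, streamB>>>(x, y);    // "halo copy" consumer, now safely ordered

        float out = 0.0f;
        cudaMemcpy(&out, y, sizeof(float), cudaMemcpyDeviceToHost);  // implicit sync
        printf("out = %f\n", out);              // prints 43.000000

        cudaEventDestroy(done);
        cudaStreamDestroy(streamA);
        cudaStreamDestroy(streamB);
        cudaFree(x);
        cudaFree(y);
        return 0;
    }

The double-buffered indexing in the lab code ("iter % 2" versus "(iter+1) % 2") extends this pattern across iterations, so that waiting on the previous iteration's halo events never races with recording the current iteration's.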
BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/CT_skull_512x512_8u.raw


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/CT_skull_512x512_8u_Gray.raw


File diff suppressed because it is too large
+ 0 - 1
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/Lena_512x512_8u_Gray.raw


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/PCB2_1024x683_8u.raw


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/PCB_1280x720_8u.raw


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/PCB_METAL_509x335_8u.raw


File diff suppressed because it is too large
+ 0 - 285
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/Rocks_512x512_8u_Gray.raw


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/lena_512x512_8u.raw


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/lib/x64/freeglut.lib


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/lib/x64/glew64.lib


+ 9 - 273
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Makefile

@@ -32,279 +32,28 @@
 ################################################################################
 
 # Location of the CUDA Toolkit
-CUDA_PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/21.5/cuda
-GCC=
-##############################
-# start deprecated interface #
-##############################
-ifeq ($(x86_64),1)
-    $(info WARNING - x86_64 variable has been deprecated)
-    $(info WARNING - please use TARGET_ARCH=x86_64 instead)
-    TARGET_ARCH ?= x86_64
-endif
-ifeq ($(ARMv7),1)
-    $(info WARNING - ARMv7 variable has been deprecated)
-    $(info WARNING - please use TARGET_ARCH=armv7l instead)
-    TARGET_ARCH ?= armv7l
-endif
-ifeq ($(aarch64),1)
-    $(info WARNING - aarch64 variable has been deprecated)
-    $(info WARNING - please use TARGET_ARCH=aarch64 instead)
-    TARGET_ARCH ?= aarch64
-endif
-ifeq ($(ppc64le),1)
-    $(info WARNING - ppc64le variable has been deprecated)
-    $(info WARNING - please use TARGET_ARCH=ppc64le instead)
-    TARGET_ARCH ?= ppc64le
-endif
-ifneq ($(GCC),)
-    $(info WARNING - GCC variable has been deprecated)
-    $(info WARNING - please use HOST_COMPILER=$(GCC) instead)
-    HOST_COMPILER ?= $(GCC)
-endif
-ifneq ($(abi),)
-    $(error ERROR - abi variable has been removed)
-endif
-############################
-# end deprecated interface #
-############################
 
-# architecture
-HOST_ARCH   := $(shell uname -m)
-TARGET_ARCH ?= $(HOST_ARCH)
-ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
-    ifneq ($(TARGET_ARCH),$(HOST_ARCH))
-        ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
-            TARGET_SIZE := 64
-        else ifneq (,$(filter $(TARGET_ARCH),armv7l))
-            TARGET_SIZE := 32
-        endif
-    else
-        TARGET_SIZE := $(shell getconf LONG_BIT)
-    endif
-else
-    $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
-endif
-
-# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
-ifeq ($(HOST_ARCH),aarch64)
-    ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
-        HOST_ARCH := sbsa
-        TARGET_ARCH := sbsa
-    endif
-endif
-
-ifneq ($(TARGET_ARCH),$(HOST_ARCH))
-    ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
-        $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
-    endif
-endif
-
-# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
-ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
-    TARGET_ARCH = armv7l
-endif
-
-# operating system
-HOST_OS   := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
-TARGET_OS ?= $(HOST_OS)
-ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
-    $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
-endif
-
-# host compiler
-ifeq ($(TARGET_OS),darwin)
-    ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
-        HOST_COMPILER ?= clang++
-    endif
-else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
-    ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
-        ifeq ($(TARGET_OS),linux)
-            HOST_COMPILER ?= arm-linux-gnueabihf-g++
-        else ifeq ($(TARGET_OS),qnx)
-            ifeq ($(QNX_HOST),)
-                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
-            endif
-            ifeq ($(QNX_TARGET),)
-                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
-            endif
-            export QNX_HOST
-            export QNX_TARGET
-            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
-        else ifeq ($(TARGET_OS),android)
-            HOST_COMPILER ?= arm-linux-androideabi-g++
-        endif
-    else ifeq ($(TARGET_ARCH),aarch64)
-        ifeq ($(TARGET_OS), linux)
-            HOST_COMPILER ?= aarch64-linux-gnu-g++
-        else ifeq ($(TARGET_OS),qnx)
-            ifeq ($(QNX_HOST),)
-                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
-            endif
-            ifeq ($(QNX_TARGET),)
-                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
-            endif
-            export QNX_HOST
-            export QNX_TARGET
-            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
-        else ifeq ($(TARGET_OS), android)
-            HOST_COMPILER ?= aarch64-linux-android-clang++
-        endif
-    else ifeq ($(TARGET_ARCH),sbsa)
-        HOST_COMPILER ?= aarch64-linux-gnu-g++
-    else ifeq ($(TARGET_ARCH),ppc64le)
-        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
-    endif
-endif
 HOST_COMPILER ?= g++
-NVCC          := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
+NVCC          := nvcc -ccbin $(HOST_COMPILER)
 
 # internal flags
-NVCCFLAGS   := -m${TARGET_SIZE}
+NVCCFLAGS   :=
 CCFLAGS     :=
 LDFLAGS     :=
 
-# build flags
-ifeq ($(TARGET_OS),darwin)
-    LDFLAGS += -rpath $(CUDA_PATH)/lib
-    CCFLAGS += -arch $(HOST_ARCH)
-else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
-    LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
-    CCFLAGS += -mfloat-abi=hard
-else ifeq ($(TARGET_OS),android)
-    LDFLAGS += -pie
-    CCFLAGS += -fpie -fpic -fexceptions
-endif
-
-ifneq ($(TARGET_ARCH),$(HOST_ARCH))
-    ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
-        ifneq ($(TARGET_FS),)
-            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
-            ifeq ($(GCCVERSIONLTEQ46),1)
-                CCFLAGS += --sysroot=$(TARGET_FS)
-            endif
-            LDFLAGS += --sysroot=$(TARGET_FS)
-            LDFLAGS += -rpath-link=$(TARGET_FS)/lib
-            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
-            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
-        endif
-    endif
-    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
-        ifneq ($(TARGET_FS),)
-            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
-            ifeq ($(GCCVERSIONLTEQ46),1)
-                CCFLAGS += --sysroot=$(TARGET_FS)
-            endif
-            LDFLAGS += --sysroot=$(TARGET_FS)
-            LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
-            LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
-            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
-            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
-            LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
-            CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
-            CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
-        endif
-    endif
-    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
-        NVCCFLAGS += --qpp-config 5.4.0,gcc_ntoaarch64le
-        CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
-        LDFLAGS += -lsocket
-        LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
-        CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
-        ifdef TARGET_OVERRIDE
-            LDFLAGS += -lslog2
-        endif
-
-        ifneq ($(TARGET_FS),)
-            LDFLAGS += -L$(TARGET_FS)/usr/lib
-            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
-            LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
-            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
-            CCFLAGS += -I$(TARGET_FS)/../include
-        endif
-    endif
-endif
-
-ifdef TARGET_OVERRIDE # cuda toolkit targets override
-    NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
-endif
-
-# Install directory of different arch
-CUDA_INSTALL_TARGET_DIR :=
-ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
-    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
-else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
-    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
-else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
-    CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
-else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
-    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
-else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
-    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
-else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
-    CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
-else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
-    CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
-else ifeq ($(TARGET_ARCH),ppc64le)
-    CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
-endif
-
-# Debug build flags
-ifeq ($(dbg),1)
-      NVCCFLAGS += -g -G
-      BUILD_TYPE := debug
-else
-      BUILD_TYPE := release
-endif
-
-ALL_CCFLAGS :=
-ALL_CCFLAGS += $(NVCCFLAGS)
-ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
-ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
-ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
-
 SAMPLE_ENABLED := 1
 
-ALL_LDFLAGS :=
-ALL_LDFLAGS += $(ALL_CCFLAGS)
-ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
-ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
-
 # Common includes and paths for CUDA
 INCLUDES  := -I./Common
 LIBRARIES :=
 
 ################################################################################
+GENCODE_SM70    := -gencode arch=compute_70,code=sm_70
+GENCODE_SM80    := -gencode arch=compute_80,code=sm_80 
+GENCODE_FLAGS	:= $(GENCODE_SM70) $(GENCODE_SM80)
 
-# Gencode arguments
-ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64))
-SMS ?= 70 72 75 80 86
-else
-SMS ?= 70 75 80 86
-endif
-
-ifeq ($(SMS),)
-$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
-SAMPLE_ENABLED := 0
-endif
-
-ifeq ($(GENCODE_FLAGS),)
-# Generate SASS code for each SM architecture listed in $(SMS)
-$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
-
-# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
-HIGHEST_SM := $(lastword $(sort $(SMS)))
-ifneq ($(HIGHEST_SM),)
-GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
-endif
-endif
-
-ALL_CCFLAGS += --threads 0
-
-ifeq ($(SAMPLE_ENABLED),0)
-EXEC ?= @echo "[@]"
-endif
-
+NVCC_FLAGS += -std=c++14
+LD_FLAGS += -lcudart
 ################################################################################
 
 # Target rules
@@ -312,26 +61,13 @@ all: build
 
 build: p2pBandwidthLatencyTest
 
-check.deps:
-ifeq ($(SAMPLE_ENABLED),0)
-	@echo "Sample will be waived due to the above missing dependencies"
-else
-	@echo "Sample is ready - all dependencies have been met"
-endif
-
 p2pBandwidthLatencyTest.o:p2pBandwidthLatencyTest.cu
-	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
+	$(NVCC) $(INCLUDES) $(NVCC_FLAGS) $(GENCODE_FLAGS) -o $@ -c $<
 
 p2pBandwidthLatencyTest: p2pBandwidthLatencyTest.o
-	$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
-	$(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
-	$(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
-
-run: build
-	$(EXEC) ./p2pBandwidthLatencyTest
+	$(NVCC) $(LD_FLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
 
 clean:
 	rm -f p2pBandwidthLatencyTest p2pBandwidthLatencyTest.o
-	rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/p2pBandwidthLatencyTest
 
 clobber: clean
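With the simplified Makefile, nvcc is taken from PATH and device code is generated only for sm_70 (V100) and sm_80 (A100); the per-arch bin/ install step and the `run` target are gone, so the binary is built and run in place. A typical cycle, assuming a CUDA 11.x nvcc on PATH and a matching GPU:

    make                        # builds ./p2pBandwidthLatencyTest in the source directory
    ./p2pBandwidthLatencyTest
    make clean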

+ 1 - 1
hpc/multi_gpu_nways/labs/CFD/English/introduction.ipynb

@@ -35,7 +35,7 @@
     "We will take up the Jacobi Solver, an iterative technique for solving system of linear equations, in this tutorial. To begin, click on the first link below:\n",
     "\n",
     "1. [Overview of single-GPU code and Nsight Systems Profiler](C/jupyter_notebook/single_gpu/single_gpu_overview.ipynb)\n",
-    "2. Single Node :Multi-GPU:\n",
+    "2. Single Node Multi-GPU:\n",
     "    * [CUDA Memcpy and Peer-to-Peer Memory Access](C/jupyter_notebook/cuda/memcpy.ipynb)\n",
     "    * [Intra-node topology](C/jupyter_notebook/advanced_concepts/single_node_topology.ipynb)\n",
     "    * [CUDA Streams and Events](C/jupyter_notebook/cuda/streams.ipynb)\n",

BIN
hpc/multi_gpu_nways/slurm_pmi_config/lib/libpmi.a


+ 0 - 41
hpc/multi_gpu_nways/slurm_pmi_config/lib/libpmi.la

@@ -1,41 +0,0 @@
-# libpmi.la - a libtool library file
-# Generated by libtool (GNU libtool) 2.4.6 Debian-2.4.6-10
-#
-# Please DO NOT delete this file!
-# It is necessary for linking the library.
-
-# The name that we can dlopen(3).
-dlname='libpmi.so.0'
-
-# Names of this library.
-library_names='libpmi.so.0.0.0 libpmi.so.0 libpmi.so'
-
-# The name of the static archive.
-old_library='libpmi.a'
-
-# Linker flags that cannot go in dependency_libs.
-inherited_linker_flags=' -pthread'
-
-# Libraries that this one depends upon.
-dependency_libs=' -ldl -lresolv'
-
-# Names of additional weak libraries provided by this library
-weak_library_names=''
-
-# Version information for libpmi.
-current=0
-age=0
-revision=0
-
-# Is this an already installed library?
-installed=yes
-
-# Should we warn about portability when linking against -modules?
-shouldnotlink=no
-
-# Files to dlopen/dlpreopen
-dlopen=''
-dlpreopen=''
-
-# Directory that this library needs to be installed in:
-libdir='/usr/local/lib'

BIN
hpc/multi_gpu_nways/slurm_pmi_config/lib/libpmi2.a


+ 0 - 41
hpc/multi_gpu_nways/slurm_pmi_config/lib/libpmi2.la

@@ -1,41 +0,0 @@
-# libpmi2.la - a libtool library file
-# Generated by libtool (GNU libtool) 2.4.6 Debian-2.4.6-10
-#
-# Please DO NOT delete this file!
-# It is necessary for linking the library.
-
-# The name that we can dlopen(3).
-dlname='libpmi2.so.0'
-
-# Names of this library.
-library_names='libpmi2.so.0.0.0 libpmi2.so.0 libpmi2.so'
-
-# The name of the static archive.
-old_library='libpmi2.a'
-
-# Linker flags that cannot go in dependency_libs.
-inherited_linker_flags=' -pthread'
-
-# Libraries that this one depends upon.
-dependency_libs=' -lresolv'
-
-# Names of additional weak libraries provided by this library
-weak_library_names=''
-
-# Version information for libpmi2.
-current=0
-age=0
-revision=0
-
-# Is this an already installed library?
-installed=yes
-
-# Should we warn about portability when linking against -modules?
-shouldnotlink=no
-
-# Files to dlopen/dlpreopen
-dlopen=''
-dlpreopen=''
-
-# Directory that this library needs to be installed in:
-libdir='/usr/local/lib'