
Added multi-node multi-GPU bootcamp code and notebooks

Anish Saxena 3 years ago
parent
commit
15c80111fa
100 changed files with 6040 additions and 1066 deletions
  1. 1 0
      .gitignore
  2. 16 4
      hpc/multi_gpu_nways/Singularity
  3. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/bin/x86_64/linux/release/p2pBandwidthLatencyTest
  4. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/cuda_streams_overview.png
  5. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/gpudirect_p2p.png
  6. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/gpudirect_rdma.png
  7. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/jacobi_memcpy_p2p_report.png
  8. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/jacobi_memcpy_report_events.png
  9. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/jacobi_memcpy_report_overview.png
  10. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/jacobi_memcpy_streams_events_p2p_report.png
  11. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/jupyter_lab_navigation.png
  12. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/memcpy_gpu_util.png
  13. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/memcpy_util_selection.png
  14. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_container_setup.png
  15. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_cuda_aware_gdr_latency.png
  16. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_cuda_aware_halo_exchange_latency.png
  17. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_cuda_aware_p2p_metrics.png
  18. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_host_staging_throughput_latency.png
  19. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_host_staging_time.png
  20. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_memcpy_halo_exchange_latency.png
  21. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_memcpy_large_time.png
  22. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_memcpy_nvtx_stats.png
  23. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_memcpy_overview.png
  24. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_overview.png
  25. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/streams_util_selection.png
  26. 272 0
      hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/advanced_concepts/single_node_topology.ipynb
  27. 40 166
      hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/memcpy/streams.ipynb
  28. 397 0
      hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/cuda/streams.ipynb
  29. 0 149
      hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
  30. 247 0
      hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/mpi/containers_and_mpi.ipynb
  31. 380 0
      hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/mpi/cuda_aware.ipynb
  32. 430 0
      hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/mpi/memcpy.ipynb
  33. 262 0
      hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/mpi/multi_node_intro.ipynb
  34. 72 0
      hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/nccl/nccl.ipynb
  35. 37 9
      hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/single_gpu/single_gpu_overview.ipynb
  36. 6 2
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/memcpy/Makefile
  37. 26 25
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/memcpy/jacobi_memcpy.cu
  38. 455 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/cuda/jacobi_streams.cu
  39. 14 9
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/memcpy/jacobi_streams.cu
  40. 463 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/cuda/solutions/jacobi_memcpy.cu
  41. 455 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/cuda/solutions/jacobi_streams.cu
  42. 138 193
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/memcpy/jacobi.cu
  43. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/memcpy/jacobi
  44. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/memcpy/jacobi_memcpy
  45. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/memcpy/jacobi_memcpy_nvlink_report.qdrep
  46. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/memcpy/jacobi_memcpy_nvlink_report.sqlite
  47. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/memcpy/jacobi_memcpy_sys_report.qdrep
  48. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/memcpy/jacobi_memcpy_sys_report.sqlite
  49. 0 13
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/memcpy/mgpm
  50. 16 28
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/Makefile
  51. 24 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/containerization/Makefile
  52. 358 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/containerization/jacobi_cuda_aware_mpi.cpp
  53. 97 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/containerization/jacobi_kernels.cu
  54. 27 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/hello_world.c
  55. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/jacobi
  56. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/jacobi.o
  57. 358 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/jacobi_cuda_aware_mpi.cpp
  58. 37 53
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/jacobi_kernels.cu
  59. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/jacobi_kernels.o
  60. 378 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/jacobi_memcpy_mpi.cpp
  61. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/reports.zip
  62. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/reports/report1.qdrep
  63. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/reports/report2.qdrep
  64. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/reports/report3.qdrep
  65. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/reports/report4.qdrep
  66. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/reports/report5.qdrep
  67. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/reports/report6.qdrep
  68. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/reports/report7.qdrep
  69. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/reports/report8.qdrep
  70. 358 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/solutions/jacobi_cuda_aware_mpi.cpp
  71. 378 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/solutions/jacobi_memcpy_mpi.cpp
  72. 0 1
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/multi-gpu-programming-models
  73. 42 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/nccl/Makefile
  74. 117 36
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/jacobi.cpp
  75. 113 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/nccl/jacobi_kernels.cu
  76. 2 2
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Makefile
  77. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest
  78. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.o
  79. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/single_gpu/jacobi
  80. 2 2
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/single_gpu/jacobi.cu
  81. 0 315
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/single_gpu/jacobi.cu.old
  82. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/single_gpu/jacobi_report.qdrep
  83. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/single_gpu/jacobi_report.sqlite
  84. 0 13
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/single_gpu/temp
  85. 22 23
      hpc/multi_gpu_nways/labs/CFD/English/introduction.ipynb
  86. 0 23
      hpc/multi_gpu_nways/labs/profiler/English/LICENSE
  87. BIN
      hpc/multi_gpu_nways/labs/profiler/English/jupyter_notebook/images/Nsight Diagram.png
  88. BIN
      hpc/multi_gpu_nways/labs/profiler/English/jupyter_notebook/images/Optimization_Cycle.jpg
  89. BIN
      hpc/multi_gpu_nways/labs/profiler/English/jupyter_notebook/images/UM.png
  90. BIN
      hpc/multi_gpu_nways/labs/profiler/English/jupyter_notebook/images/allsection-compute.png
  91. BIN
      hpc/multi_gpu_nways/labs/profiler/English/jupyter_notebook/images/baseline-compute.png
  92. BIN
      hpc/multi_gpu_nways/labs/profiler/English/jupyter_notebook/images/charts-compute.png
  93. BIN
      hpc/multi_gpu_nways/labs/profiler/English/jupyter_notebook/images/cli-out.png
  94. BIN
      hpc/multi_gpu_nways/labs/profiler/English/jupyter_notebook/images/collapse_feedback.png
  95. BIN
      hpc/multi_gpu_nways/labs/profiler/English/jupyter_notebook/images/collapse_pre.png
  96. BIN
      hpc/multi_gpu_nways/labs/profiler/English/jupyter_notebook/images/collapse_thread.png
  97. BIN
      hpc/multi_gpu_nways/labs/profiler/English/jupyter_notebook/images/compute.png
  98. BIN
      hpc/multi_gpu_nways/labs/profiler/English/jupyter_notebook/images/compute_analyz.png
  99. BIN
      hpc/multi_gpu_nways/labs/profiler/English/jupyter_notebook/images/compute_command.png
  100. 0 0
      hpc/multi_gpu_nways/labs/profiler/English/jupyter_notebook/images/compute_command_line.png

+ 1 - 0
.gitignore

@@ -2,3 +2,4 @@
 */.ipynb_checkpoints/*
 alk.traj.dcd
 *.simg
+*.so*

+ 16 - 4
hpc/multi_gpu_nways/Singularity

@@ -2,13 +2,12 @@
 
 Bootstrap: docker
 #FROM: nvcr.io/nvidia/nvhpc:20.11-devel-cuda_multi-ubuntu20.04
-FROM: nvcr.io/nvidia/nvhpc:21.3-devel-cuda_multi-ubuntu20.04
+FROM: nvcr.io/nvidia/nvhpc:21.5-devel-cuda_multi-ubuntu20.04
 
 %environment
     export XDG_RUNTIME_DIR=
-    export PATH="$PATH:/usr/local/bin:/opt/anaconda3/bin:/usr/bin"
-    export PATH=/opt/nvidia/nsight-systems/2020.5.1/bin:/opt/nvidia/nsight-compute/2020.2.1:$PATH
-    export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib:/opt/nvidia/hpc_sdk/Linux_x86_64/21.3/cuda/11.2/lib64/"
+    export PATH="/opt/openmpi/ompi/bin/:/usr/local/bin:/opt/anaconda3/bin:/usr/bin:/opt/nvidia/nsight-systems/2020.5.1/bin:/opt/nvidia/nsight-compute/2020.2.1:$PATH"
+    export LD_LIBRARY_PATH="/opt/openmpi/ompi/lib:/pmi_utils/lib/:/usr/local/lib:/opt/nvidia/hpc_sdk/Linux_x86_64/21.5/cuda/lib64/:$LD_LIBRARY_PATH"
 
 %post
     build_tmp=$(mktemp -d) && cd ${build_tmp}
@@ -24,6 +23,7 @@ FROM: nvcr.io/nvidia/nvhpc:21.3-devel-cuda_multi-ubuntu20.04
 
     pip3 install --upgrade pip
     pip3 install --no-cache-dir jupyter
+    pip3 install --no-cache-dir jupyterlab
     pip3 install gdown
 
     apt-get install --no-install-recommends -y build-essential 
@@ -40,12 +40,24 @@ FROM: nvcr.io/nvidia/nvhpc:21.3-devel-cuda_multi-ubuntu20.04
     wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh 
     bash Miniconda3-latest-Linux-x86_64.sh -b -p /opt/anaconda3 
     rm Miniconda3-latest-Linux-x86_64.sh 
+
+# Install CUDA-aware OpenMPI with UCX and PMI
+    mkdir -p /opt/openmpi && cd /opt/openmpi
+    wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.1.tar.gz
+    tar -xvzf openmpi-4.1.1.tar.gz
+    mkdir -p /opt/openmpi/ompi/
+    cd /opt/openmpi/openmpi-4.1.1/
+    ./configure --prefix=/opt/openmpi/ompi/ --with-libevent=internal --with-xpmem --with-cuda=/opt/nvidia/hpc_sdk/Linux_x86_64/21.5/cuda/ --with-slurm --with-pmix=internal --with-pmi=/pmi_utils/ --enable-mpi1-compatibility --with-verbs --with-hcoll=/opt/nvidia/hpc_sdk/Linux_x86_64/21.5/comm_libs/hpcx/hpcx-2.8.1/hcoll/ --with-ucx=/opt/nvidia/hpc_sdk/Linux_x86_64/21.5/comm_libs/hpcx/hpcx-2.8.1/ucx/
+    export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/pmi_utils/lib/"
+    make all install
     
     cd /
     rm -rf ${build_tmp}
 
 %files
     labs/ /labs
+    slurm_pmi_config/ /pmi_utils
+
 %runscript
     "$@"
 

BIN
hpc/multi_gpu_nways/labs/CFD/English/C/bin/x86_64/linux/release/p2pBandwidthLatencyTest


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/cuda_streams_overview.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/gpudirect_p2p.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/gpudirect_rdma.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/jacobi_memcpy_p2p_report.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/jacobi_memcpy_report_events.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/jacobi_memcpy_report_overview.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/jacobi_memcpy_streams_events_p2p_report.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/jupyter_lab_navigation.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/memcpy_gpu_util.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/memcpy_util_selection.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_container_setup.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_cuda_aware_gdr_latency.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_cuda_aware_halo_exchange_latency.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_cuda_aware_p2p_metrics.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_host_staging_throughput_latency.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_host_staging_time.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_memcpy_halo_exchange_latency.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_memcpy_large_time.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_memcpy_nvtx_stats.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_memcpy_overview.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_overview.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/streams_util_selection.png


+ 272 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/advanced_concepts/single_node_topology.ipynb

@@ -0,0 +1,272 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "251d3000",
+   "metadata": {},
+   "source": [
+    "Before we begin, let's get an overview of the CUDA driver version and the GPUs running on the server by executing the `nvidia-smi` command below. Highlight the cell below by clicking on it and then either hit `Ctrl+Enter` on the keyboard or click on the `Run` button on the toolbar above. The output will be visible below the cell."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e6fa8e78",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "!nvidia-smi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "790904cd",
+   "metadata": {},
+   "source": [
+    "# Learning Objectives\n",
+    "\n",
+    "In this lab, we will learn about:\n",
+    "\n",
+    "* Understanding intra-node GPU topology and interconnections like PCIe and NVLink\n",
+    "* Architecture overview of NVIDIA DGX 1 Tesla V100 system\n",
+    "* Comparison of communication links and their impact on application performance\n",
+    "* p2pBandwidthLatencyTest micro-benchmark for P2P performance analysis.\n",
+    "\n",
+    "# Intra-Node Communication Topology\n",
+    "\n",
+    "Let's dive deeper into how the underlying communication architecture of our system affects program performance. Run the command below to display your node's GPU and NIC communication topology:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4bf585d6",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "!nvidia-smi topo -m"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "da57e0dd",
+   "metadata": {},
+   "source": [
+    "If the output is unclear, you can launch a Terminal session by clicking on `File` $\\rightarrow$ Open and following the steps as shown:\n",
+    "\n",
+    "![open_terminal_session](../../images/open_terminal_session.png)\n",
+    "\n",
+    "On our DGX-1 system, the output is as follows:\n",
+    "\n",
+    "![nvidia_smi_topo_output](../../images/nvidia_smi_topo_output.png)\n",
+    "\n",
+    "Focus one a particular row, say GPU 0. The output states that GPUs 1 through 4 are connected to it via NVLink (in addition to PCIe) and GPUs 5 through 7 are connected to it via PCIe as well as an \"SMP\" interconnect. We have a dual-socket system and the CPUs in these sockets are connected by an interconnect known as SMP interconnect.\n",
+    "\n",
+    "Thus, GPU 0 to GPU 5 communication happens via not just PCIe, but also over the inter-socket interconnect within the same node. Clearly, this is a longer path than say the one between GPU 0 and GPU 1, which are connected via NVLink directly. We will discuss the NIC to GPU connection in the inter-node section of this bootcamp.\n",
+    "\n",
+    "Even within the GPUs connected via NVLink, we see different annotations such as `NV1` and `NV2` that affect the communication bandwidth and hence the performance. In this section, we will explore the nuances associated with a diverse intra-node GPU communication topology like in the output above. Specifically, in our system, the communication topology is as follows:\n",
+    "\n",
+    "![dgx1_8x_tesla_v100_topo](../../images/dgx1_8x_tesla_v100_topo.png)\n",
+    "\n",
+    "Qualitatively, the bandwidth and latency vary with the topology as follows:\n",
+    "\n",
+    "![intra_node_topology_map](../../images/intra_node_topology_map.png)\n",
+    "\n",
+    "Host staging implies traversing through the CPU and the travel path taken is one of PHB, NODE, and SYS. In contrast, if the path taken is either NV1, NV2, or PIX, then P2P is available. PXB implies that the GPUs belong to different PCIe hubs and P2P is usually not supported in this case.\n",
+    "\n",
+    "A double NVLink connection provides twice the bandwidth compared to a single NVLink. \n",
+    "\n",
+    "For a pair of 2 GPUs, the peak bidirectional bandwidth are as follows:\n",
+    "* PCIe: Using PIX topology, 15.75GB/s for PCIe Gen 3.0 and 31.5GB/s for PCIe Gen 4.0.\n",
+    "* NVLink: Using NV# topology, 50GB/s per connection. So a double NVLink connection has 100GB/s peak bidirectional bandwidth.\n",
+    "\n",
+    "Let us understand what difference the underlying communication topology can make to the application performance in the following sub-section.\n",
+    "\n",
+    "**Note:** If your command output doesn't show any NVLink connection or if there's no difference in connection type (PIX, PXB, PHB, NODE, SYS, NV#) between any 2 pair of GPUs, then the communication bandwidth and latency will likely be the same between any pair and the following sub-sections will not display any performance difference.\n",
+    "\n",
+    "## Performance variation due to system topology\n",
+    "\n",
+    "So far, the code runs the multi-GPU version on all available GPUs in a node (8 in our case). We can supply the `-gpus` runtime flag to the binary to run our code on specific GPUs. If we want to run on only 2 GPUs, namely GPU 0 and GPU 3, we use the `-gpus 0,3` argument. \n",
+    "\n",
+    "Try to find the GPU pair with highest bandwidth available as per the table above and replace `0,3` with those GPUs, and then run the command below:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "93961dbc",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "!cd ../../source_code/cuda && ./jacobi_memcpy -p2p -gpus 0,3"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f0f5d159",
+   "metadata": {},
+   "source": [
+    "The efficiency would likely be higher than before due to less inter-GPU communication (each GPU does more wok instead). Our output is as follows:\n",
+    "\n",
+    "```bash\n",
+    "Num GPUs: 2. Using GPU ID: 0, 3, \n",
+    "16384x16384: 1 GPU:   4.4513 s, 2 GPUs:   2.2664 s, speedup:     1.96, efficiency:    98.20  \n",
+    "```\n",
+    "\n",
+    "Now, run the binary a pair of GPUs that have the lowest available bandwidth. In our case, we use GPU 0 and GPU 7. Our output is:\n",
+    "\n",
+    "```bash\n",
+    "Num GPUs: 2. Using GPU ID: 0, 7, \n",
+    "16384x16384: 1 GPU:   4.4529 s, 2 GPUs:   2.3454 s, speedup:     1.90, efficiency:    94.93  \n",
+    "```\n",
+    "\n",
+    "Now remove the `-p2p` flag and run the command again for GPUs 0 and 7. We didn't get any difference in performance. As you may recall, P2P is not possible between GPUs 0 and 7, so the underlying communication path doesn't change, resulting in same performance with and without the `-p2p` flag. \n",
+    "\n",
+    "The same can be confirmed by profiling the application and looking at the operations performed in the Nsight Systems timeline. \n",
+    "\n",
+    "![p2p_2_gpu_memcpy_nsys](../../images/p2p_2_gpu_memcpy_nsys.png)\n",
+    "\n",
+    "Try a few other GPU combinations and toggle P2P so see if the performance variation correlates with the table above. Also try reducing the grid size using `-nx` and `-ny` flags (to say 8192$\\times$8192) and see the effect on efficiency. \n",
+    "\n",
+    "## Benchmarking the system topology\n",
+    "\n",
+    "Our application is not very memory intensive. As is visible from the profiler output, $\\gt95\\%$ of the time in GPU is spent on computation. Therefore, to get a quantitative measure of latency and bandwidth impact due to topology, we run a micro-benchmark.\n",
+    "\n",
+    "### p2pBandwidthLatencyTest micro-benchmark\n",
+    "\n",
+    "p2pBandwidthLatencyTest is a part of [CUDA Samples GitHub repository](https://github.com/NVIDIA/cuda-samples) available to help CUDA developers. \n",
+    "\n",
+    "As the name suggests, this test measures the bandwidth and latency impact of P2P and underlying communication topology. Let's compile the benchmark:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "212a8dfc",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "!cd ../../source_code/p2pBandwidthLatencyTest/ && make clean && make"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "83369c1b",
+   "metadata": {},
+   "source": [
+    "Now, let's run the benchmark:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "59eeb793",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "!cd ../../source_code/p2pBandwidthLatencyTest/ && ./p2pBandwidthLatencyTest"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b584f5ef",
+   "metadata": {},
+   "source": [
+    "The first part of the benchmark gives device information and P2P access available from each GPU (similar to `nvidia-smi topo -m` command). Next, the benchmark measures the unidirectional and bidirectional bandwidth and latency with P2P disabled and enabled.\n",
+    "\n",
+    "We share partial results obtained in our DGX-1 system:\n",
+    "\n",
+    "```bash\n",
+    "Bidirectional P2P=Disabled Bandwidth Matrix (GB/s)\n",
+    "   D\\D     0      1      2      3      4      5      6      7 \n",
+    "     0 783.95   9.56  14.43  14.46  14.47  14.24  14.51  14.43 \n",
+    "\n",
+    "Bidirectional P2P=Enabled Bandwidth Matrix (GB/s)\n",
+    "   D\\D     0      1      2      3      4      5      6      7 \n",
+    "     0 784.87  48.49  48.49  96.85  96.90  14.25  14.54  14.49 \n",
+    "     \n",
+    "P2P=Disabled Latency Matrix (us)\n",
+    "   GPU     0      1      2      3      4      5      6      7 \n",
+    "     0   1.78  17.52  16.41  16.43  17.35  16.88  17.34  16.85 \n",
+    "     \n",
+    "P2P=Enabled Latency (P2P Writes) Matrix (us)\n",
+    "   GPU     0      1      2      3      4      5      6      7 \n",
+    "     0   1.76   1.62   1.61   2.01   2.02  18.44  19.15  19.34\n",
+    "```\n",
+    "\n",
+    "Our system is based on PCIe gen 3.0 with a peak maximum GPU-GPU PCIe banwidth of 15.75 GB/s. Let us analyze and understand these results:\n",
+    "\n",
+    "* GPU 0 and GPU 1/2: Connected by a single NVLink connection. By enabling P2P-\n",
+    "  - Bandwidth reaches close to the maximum peak of 50 GB/s.\n",
+    "  - Latency decreases by an order of magnitude.\n",
+    "* GPU 0 and GPU 3/4: Connected by a double NVLink connection. By enabling P2P-\n",
+    "  - Bandwidth reaches close to the maximum peak of 100 GB/s.\n",
+    "  - Latency decreases by an order of magnitude.\n",
+    "* GPU 0 and GPU 5/6/7: Connected by PCIe and SMP interconnect. By enabling P2P- \n",
+    "  - Bandwidth is unchanged.\n",
+    "  - Latency increases a marginally.\n",
+    "  \n",
+    "Correlate these results with the communication topology that can be displayed by usng `nvidia-smi topo -m` command and the qualtitative table in the previous section. They should be consistent with one another.\n",
+    "\n",
+    "In general, we should try to set the GPUs in an application such that a GPU can share data with its neighbours using a high-bandwidth, low-latency communication topology. Enabling P2P, when possible, usually improves the performance by eliminating host staging.\n",
+    "\n",
+    "We now have an in-depth understanding of intra-node topology and its effects on performance. Let us now analyze our P2P-enabled application again to uncover opportunities to extract more performance.\n",
+    "\n",
+    "Click on the link below to access the next lab where we discuss the need for CUDA streams and then implement them in our application.\n",
+    "\n",
+    "# [Next: CUDA Streams](../cuda/streams.ipynb)\n",
+    "\n",
+    "Here's a link to the home notebook through which all other notebooks are accessible:\n",
+    "\n",
+    "# [HOME](../../../introduction.ipynb)\n",
+    "\n",
+    "---\n",
+    "## Links and Resources\n",
+    "\n",
+    "* [Documentation: NVIDIA DGX 1 Tesla V100 Whitepaper](https://images.nvidia.com/content/pdf/dgx1-v100-system-architecture-whitepaper.pdf)\n",
+    "* [Concepts: NVLink](https://www.nvidia.com/en-in/data-center/nvlink/)\n",
+    "* [Research: Effect of topology-awareness on communication](https://ieeexplore.ieee.org/abstract/document/7529932)\n",
+    "* [Code: p2pBandwidthLatencyTest](https://github.com/NVIDIA/cuda-samples/tree/master/Samples/p2pBandwidthLatencyTest)\n",
+    "* [Code: Multi-GPU Programming Models](https://github.com/NVIDIA/multi-gpu-programming-models)\n",
+    "* [Code: GPU Bootcamp](https://github.com/gpuhackathons-org/gpubootcamp/)\n",
+    "\n",
+    "Don't forget to check out additional [OpenACC Resources](https://www.openacc.org/resources) and join our [OpenACC Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
+    "\n",
+    "## Licensing \n",
+    "\n",
+    "This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0)."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
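
The notebook above quantifies topology effects with the p2pBandwidthLatencyTest micro-benchmark. A minimal sketch of the same measurement idea, timing repeated peer copies between two GPUs with CUDA events; the GPU IDs, buffer size, and iteration count are illustrative assumptions, and error checking is omitted for brevity.

```c
// p2p_probe.cu -- hypothetical sketch, not part of this commit.
// Build: nvcc p2p_probe.cu -o p2p_probe
#include <cstdio>
#include <cuda_runtime.h>

int main() {
    const int src = 0, dst = 1;        // assumed GPU IDs; pick a pair from `nvidia-smi topo -m`
    const size_t bytes = 64UL << 20;   // 64 MiB transfer buffer
    const int iters = 100;

    int can_access = 0;
    cudaDeviceCanAccessPeer(&can_access, src, dst);
    printf("GPU %d -> GPU %d P2P capable: %s\n", src, dst, can_access ? "yes" : "no");

    void *buf_src = nullptr, *buf_dst = nullptr;
    cudaSetDevice(src);
    cudaMalloc(&buf_src, bytes);
    if (can_access) cudaDeviceEnablePeerAccess(dst, 0);  // allow direct src -> dst copies
    cudaSetDevice(dst);
    cudaMalloc(&buf_dst, bytes);

    // Time the copies from the source device's default stream.
    cudaSetDevice(src);
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    for (int i = 0; i < iters; ++i)
        cudaMemcpyPeerAsync(buf_dst, dst, buf_src, src, bytes, 0);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    printf("Observed bandwidth: %.2f GB/s (%s)\n",
           (double)bytes * iters / 1e9 / (ms / 1e3),
           can_access ? "P2P enabled" : "host staging");

    cudaSetDevice(dst);
    cudaFree(buf_dst);
    cudaSetDevice(src);
    cudaFree(buf_src);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return 0;
}
```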

+ 40 - 166
hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/memcpy/streams.ipynb

@@ -142,7 +142,7 @@
     "\n",
     "## Implementation exercise: Part 1\n",
     "\n",
-    "Now, let's parallelize our code across multiple GPUs by using `cudaSetDevice` and `cudaMemcpyAsync` operations. Open the [jacobi_memcpy.cu](../../source_code/memcpy/jacobi_memcpy.cu) file by using the `File` $\\rightarrow$ `Open...` option.\n",
+    "Now, let's parallelize our code across multiple GPUs by using `cudaSetDevice` and `cudaMemcpyAsync` operations. Open the [jacobi_memcpy.cu](../../source_code/cuda/jacobi_memcpy.cu) file.\n",
     "\n",
     "Understand the flow of the program from within the `main` function. Review the following pre-Jacobi-computation steps:\n",
     "\n",
@@ -158,7 +158,7 @@
     "2. Asynchronously copy GPU-local L2 norm back to CPU and implement top and bottom halo exchanges.\n",
     "3. Synchronize the devices at the end of each iteration using `cudaDeviceSynchronize` function.\n",
     "\n",
-    "Review the topic on Asynchronous Operations above if in doubt. Recall the utility of using separate `for` loops for launching device kernels and initiating copy operations.\n",
+    "Review the topic above on Asynchronous Operations if in doubt. Recall the utility of using separate `for` loops for launching device kernels and initiating copy operations.\n",
     "\n",
     "After implementing these, let's compile the code:"
    ]
@@ -170,7 +170,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!cd ../../source_code/memcpy && make clean && make jacobi_memcpy"
+    "!cd ../../source_code/cuda && make clean && make jacobi_memcpy"
    ]
   },
   {
@@ -188,7 +188,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!cd ../../source_code/memcpy && ./jacobi_memcpy"
+    "!cd ../../source_code/cuda && ./jacobi_memcpy"
    ]
   },
   {
@@ -200,10 +200,10 @@
     "\n",
     "```bash\n",
     "Num GPUs: 8. Using GPU ID: 0, 1, 2, 3, 4, 5, 6, 7, \n",
-    "16384x16384: 1 GPU:   5.0272 s, 8 GPUs:   1.1376 s, speedup:     4.42, efficiency:    55.24\n",
+    "16384x16384: 1 GPU:   4.4485 s, 8 GPUs:   1.0951 s, speedup:     4.06, efficiency:    50.78 \n",
     "```\n",
     "\n",
-    "Notice that we got a speed-up of $4.42\\times$ using 8 GPUs and a corresponding efficiency of $55.24\\%$. The numbers will vary depending on number of available GPUs in your system, the communication topology, GPU type, etc.\n",
+    "Notice that we got a speed-up of $4.06\\times$ using 8 GPUs and a corresponding efficiency of $50.78\\%$. The numbers will vary depending on number of available GPUs in your system, the communication topology, GPU type, etc.\n",
     "\n",
     "### Profiling\n",
     "\n",
@@ -217,7 +217,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!cd ../../source_code/memcpy/ && nsys profile --trace=cuda,nvtx --stats=true -o jacobi_memcpy_sys_report --force-overwrite true ./jacobi_memcpy -gpus 0,7"
+    "!cd ../../source_code/cuda/ && nsys profile --trace=cuda,nvtx --stats=true -o jacobi_memcpy_report --force-overwrite true ./jacobi_memcpy"
    ]
   },
   {
@@ -225,13 +225,15 @@
    "id": "c4ac727d",
    "metadata": {},
    "source": [
-    "In the profiler timeline, the first few seconds denote the single-GPU code running on one of the GPUs. This version is executed so we can compare the multi-GPU version with it and we have already analyzed it. Let's analyze the multi-GPU timeline:\n",
+    "In the profiler timeline, the first few seconds denote the single-GPU code running on one of the GPUs. This version is executed so we can compare the multi-GPU version with it and we have already analyzed it. Let's analyze the multi-GPU timeline.\n",
     "\n",
-    "IMAGE LINK HERE\n",
+    "![jacobi_memcpy_report_overview](../../images/jacobi_memcpy_report_overview.png)\n",
     "\n",
-    "NSYS DESCRIPTION HERE\n",
+    "The next iteration of the device kernel is not run till all inter-GPU copy operations are complete because we need to synchronize all GPUs at the end of each iteration. The total time taken by the Jacobi Solver loop (`jacobi_solve` NVTX annotatation) is visible and is 1.278 seconds. Also, notice the we have labelled halo exchanges as Device-to-Host (DtoH) and Host-to-Device) copies. Now, right click on `CUDA HW` tab and select `Show in Events View` option. \n",
     "\n",
-    "The solution for this exercise is present in `source_code/memcpy/solution` directory: [jacobi_memcpy.cu](../../source_code/memcpy/solution/jacobi_memcpy.cu)\n",
+    "![jacobi_memcpy_report_events](../../images/jacobi_memcpy_report_events.png)\n",
+    "\n",
+    "The \"Source Memory Kind\" and \"Destination Memory Kind\" of the selected DtoH operation are both \"Device\". However the copy operation is marked as \"Memcpy DtoH\". By default, the device-to-device copy operation uses a temporary CPU buffer internally. Let us understand more about this CPU buffer and how we can eliminate it to improve performance.\n",
     "\n",
     "## CUDA concepts: Part 2\n",
     "\n",
@@ -326,7 +328,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!cd ../../source_code/memcpy && make clean && make jacobi_memcpy"
+    "!cd ../../source_code/cuda && make clean && make jacobi_memcpy"
    ]
   },
   {
@@ -344,7 +346,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!cd ../../source_code/memcpy && ./jacobi_memcpy -p2p"
+    "!cd ../../source_code/cuda && ./jacobi_memcpy -p2p"
    ]
   },
   {
@@ -363,185 +365,57 @@
     "\n",
     "### Profiling\n",
     "\n",
-    "IMAGE LINK HERE\n",
-    "\n",
-    "NSYS DESCRIPTION HERE\n",
-    "\n",
-    "## Intra-Node Communication Topology\n",
-    "\n",
-    "Run the command below to display your node's GPU and NIC communication topology:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "5be59a7a",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!nvidia-smi topo -m"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "a81fa29e",
-   "metadata": {},
-   "source": [
-    "If the output is unclear, you can launch a Terminal session by clicking on `File` $\\rightarrow$ Open and following the steps as shown:\n",
-    "\n",
-    "![open_terminal_session](../../images/open_terminal_session.png)\n",
-    "\n",
-    "On our DGX-1 system, the output is as follows:\n",
-    "\n",
-    "![nvidia_smi_topo_output](../../images/nvidia_smi_topo_output.png)\n",
-    "\n",
-    "Focus one a particular row, say GPU 0. The output states that GPUs 1 through 4 are connected to it via NVLink (in addition to PCIe) and GPUs 5 through 7 are connected to it via PCIe as well as an \"SMP\" interconnect. We have a dual-socket system and the CPUs in these sockets are connected by an interconnect known as SMP interconnect.\n",
-    "\n",
-    "Thus, GPU 0 to GPU 5 communication happens via not just PCIe, but also over the inter-socket interconnect within the same node. Clearly, this is a longer path than say the one between GPU 0 and GPU 1, which are connected via NVLink directly. We will discuss the NIC to GPU connection in the inter-node section of this bootcamp.\n",
-    "\n",
-    "Even within the GPUs connected via NVLink, we see different annotations such as `NV1` and `NV2` that affect the communication bandwidth and hence the performance. In this section, we will explore the nuances associated with a diverse intra-node GPU communication topology like in the output above. Specifically, in our system, the communication topology is as follows:\n",
-    "\n",
-    "![dgx1_8x_tesla_v100_topo](../../images/dgx1_8x_tesla_v100_topo.png)\n",
-    "\n",
-    "Qualitatively, the bandwidth and latency vary with the topology as follows:\n",
-    "\n",
-    "![intra_node_topology_map](../../images/intra_node_topology_map.png)\n",
-    "\n",
-    "Host staging implies traversing through the CPU and the travel path taken is one of PHB, NODE, and SYS. In contrast, if the path taken is either NV1, NV2, or PIX, then P2P is available. PXB implies that the GPUs belong to different PCIe hubs and P2P is usually not supported in this case.\n",
-    "\n",
-    "A double NVLink connection provides twice the bandwidth compared to a single NVLink. \n",
-    "\n",
-    "For a pair of 2 GPUs, the peak bidirectional bandwidth are as follows:\n",
-    "* PCIe: Using PIX topology, 15.75GB/s for PCIe Gen 3.0 and 31.5GB/s for PCIe Gen 4.0.\n",
-    "* NVLink: Using NV# topology, 50GB/s per connection. So a double NVLink connection has 100GB/s peak bidirectional bandwidth.\n",
-    "\n",
-    "Let us understand what difference the underlying communication topology can make to the application performance in the following sub-section.\n",
-    "\n",
-    "**Note:** If your command output doesn't show any NVLink connection or if there's no difference in connection type (PIX, PXB, PHB, NODE, SYS, NV#) between any 2 pair of GPUs, then the communication bandwidth and latency will likely be the same between any pair and the following sub-sections will not display any performance difference.\n",
-    "\n",
-    "### Performance variation due to system topology\n",
-    "\n",
-    "So far, the code runs the multi-GPU version on all available GPUs in a node (8 in our case). We can supply the `-gpus` runtime flag to the binary to run our code on specific GPUs. If we want to run on only 2 GPUs, namely GPU 0 and GPU 3, we use the `-gpus 0,3` argument. \n",
-    "\n",
-    "Try to find the GPU pair with highest bandwidth available as per the table above and replace `0,3` with those GPUs, and then run the command below:"
+    "Let us profile the execution with `nsys`:"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "ccd50a44",
+   "id": "adf3e8fb",
    "metadata": {},
    "outputs": [],
    "source": [
-    "!cd ../../source_code/memcpy && ./jacobi_memcpy -p2p -gpus 0,7"
+    "!cd ../../source_code/cuda/ && nsys profile --trace=cuda,nvtx --stats=true -o jacobi_memcpy_p2p_report --force-overwrite true ./jacobi_memcpy -p2p"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "21c4eb06",
+   "id": "4b801eb0",
    "metadata": {},
    "source": [
-    "The efficiency would likely be higher than before due to less inter-GPU communication (each GPU does more wok instead). Our output is as follows:\n",
+    "The output we obtain is shared below:\n",
     "\n",
-    "```bash\n",
-    "Num GPUs: 2. Using GPU ID: 0, 3, \n",
-    "16384x16384: 1 GPU:   4.4513 s, 2 GPUs:   2.2664 s, speedup:     1.96, efficiency:    98.20  \n",
-    "```\n",
+    "![jacobi_memcpy_p2p_report](../../images/jacobi_memcpy_p2p_report.png)\n",
     "\n",
-    "Now, run the binary a pair of GPUs that have the lowest available bandwidth. In our case, we use GPU 0 and GPU 7. Our output is:\n",
+    "For GPU 0, P2P is only possible with GPU 1 and the profiler output indeed shows only one set of P2P operations. Host-staging is used between GPU 0 and GPU 7. In contrast, GPU 2 can use P2P with both its neighbours, GPU 1 and GPU 3 and the profiler output verifies that. The events view of GPU 1 is shown. The selected operation's description shows a P2P copy operation from GPU 0 to GPU 1. Also, the total time taken for the solver loop has decreased to 1.052 seconds.\n",
     "\n",
-    "```bash\n",
-    "Num GPUs: 2. Using GPU ID: 0, 7, \n",
-    "16384x16384: 1 GPU:   4.4529 s, 2 GPUs:   2.3454 s, speedup:     1.90, efficiency:    94.93  \n",
-    "```\n",
+    "**Solution:** The solution for this exercise is present in `source_code/memcpy/solutions` directory: [jacobi_memcpy.cu](../../source_code/cuda/solutions/jacobi_memcpy.cu)\n",
     "\n",
-    "Now remove the `-p2p` flag and run the command again for GPUs 0 and 7. We didn't get any difference in performance. As you may recall, P2P is not possible between GPUs 0 and 7, so the underlying communication path doesn't change, resulting in same performance with and without the `-p2p` flag. The same can be confirmed by profiling the application and looking at the operations performed in the Nsight Systems timeline. \n",
+    "Let us dive deeper into the communication architecture to better understand the impact of P2P memory access. Click on the link below to access the next lab.\n",
     "\n",
-    "![p2p_2_gpu_memcpy_nsys](../../images/p2p_2_gpu_memcpy_nsys.png)\n",
+    "# [Next: Intra-node topology](../advanced_concepts/single_node_topology.ipynb)\n",
     "\n",
-    "Try a few other GPU combinations and toggle P2P so see if the performance variation correlates with the table above. Also try reducing the grid size using `-nx` and `-ny` flags (to say 8192$\\times$8192) and see the effect on efficiency. \n",
+    "Here's a link to the home notebook through which all other notebooks are accessible:\n",
     "\n",
-    "### Benchmarking the system topology\n",
+    "# [HOME](../../../introduction.ipynb)\n",
     "\n",
-    "Our application is not very memory intensive. As is visible from the profiler output, $\\gt95\\%$ of the time in GPU is spent on computation. Therefore, to get a quantitative measure of latency and bandwidth impact due to topology, we run a micro-benchmark.\n",
+    "---\n",
     "\n",
-    "**The p2pBandwidthLatencyTest micro-benchmark**\n",
+    "## Links and Resources\n",
     "\n",
-    "p2pBandwidthLatencyTest is a part of [CUDA Samples GitHub repository](https://github.com/NVIDIA/cuda-samples) available to help CUDA developers. \n",
+    "* [Programming: Optimized data transfers in CUDA](https://developer.nvidia.com/blog/how-optimize-data-transfers-cuda-cc/)\n",
+    "* [Documentation: CUDA Memory Management APIs](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html)\n",
+    "* [Documentation: nvidia-smi Command](https://developer.download.nvidia.com/compute/DCGM/docs/nvidia-smi-367.38.pdf)\n",
+    "* [Programming Concepts: Peer-to-Peer and Unified Virtual Addressing (UVA)](https://developer.download.nvidia.com/CUDA/training/cuda_webinars_GPUDirect_uva.pdf)\n",
+    "* [Programming Concepts: CUDA Peer-to-Peer Memory Access](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#peer-to-peer-memory-access)\n",
+    "* [Code: Multi-GPU Programming Models](https://github.com/NVIDIA/multi-gpu-programming-models)\n",
+    "* [Code: GPU Bootcamp](https://github.com/gpuhackathons-org/gpubootcamp/)\n",
     "\n",
-    "As the name suggests, this test measures the bandwidth and latency impact of P2P and underlying communication topology. Let's compile the benchmark:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "93fa162c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!cd ../../source_code/p2pBandwidthLatencyTest/ && make clean && make"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "429bc0cf",
-   "metadata": {},
-   "source": [
-    "Now, let's run the benchmark:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f607f88d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!cd ../../source_code/p2pBandwidthLatencyTest/ && ./p2pBandwidthLatencyTest"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "dacdaacc",
-   "metadata": {},
-   "source": [
-    "The first part of the benchmark gives device information and P2P access available from each GPU (similar to `nvidia-smi topo -m` command). Next, the benchmark measures the unidirectional and bidirectional bandwidth and latency with P2P disabled and enabled.\n",
-    "\n",
-    "We share partial results obtained in our DGX-1 system:\n",
+    "Don't forget to check out additional [OpenACC Resources](https://www.openacc.org/resources) and join our [OpenACC Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
     "\n",
-    "```bash\n",
-    "Bidirectional P2P=Disabled Bandwidth Matrix (GB/s)\n",
-    "   D\\D     0      1      2      3      4      5      6      7 \n",
-    "     0 783.95   9.56  14.43  14.46  14.47  14.24  14.51  14.43 \n",
-    "\n",
-    "Bidirectional P2P=Enabled Bandwidth Matrix (GB/s)\n",
-    "   D\\D     0      1      2      3      4      5      6      7 \n",
-    "     0 784.87  48.49  48.49  96.85  96.90  14.25  14.54  14.49 \n",
-    "     \n",
-    "P2P=Disabled Latency Matrix (us)\n",
-    "   GPU     0      1      2      3      4      5      6      7 \n",
-    "     0   1.78  17.52  16.41  16.43  17.35  16.88  17.34  16.85 \n",
-    "     \n",
-    "P2P=Enabled Latency (P2P Writes) Matrix (us)\n",
-    "   GPU     0      1      2      3      4      5      6      7 \n",
-    "     0   1.76   1.62   1.61   2.01   2.02  18.44  19.15  19.34\n",
-    "```\n",
+    "## Licensing \n",
     "\n",
-    "Our system is based on PCIe gen 3.0 with a peak maximum GPU-GPU PCIe banwidth of 15.75 GB/s. Let us analyze and understand these results:\n",
-    "\n",
-    "* GPU 0 and GPU 1/2: Connected by a single NVLink connection. By enabling P2P-\n",
-    "  - Bandwidth reaches close to the maximum peak of 50 GB/s.\n",
-    "  - Latency decreases by an order of magnitude.\n",
-    "* GPU 0 and GPU 3/4: Connected by a double NVLink connection. By enabling P2P-\n",
-    "  - Bandwidth reaches close to the maximum peak of 100 GB/s.\n",
-    "  - Latency decreases by an order of magnitude.\n",
-    "* GPU 0 and GPU 5/6/7: Connected by PCIe and SMP interconnect. By enabling P2P- \n",
-    "  - Bandwidth is unchanged.\n",
-    "  - Latency increases a marginally.\n",
-    "  \n",
-    "Correlate these results with the communication topology that can be displayed by usng `nvidia-smi topo -m` command and the qualtitative table in the previous section. They should be consistent with one another.\n",
-    "\n",
-    "In general, we should try to set the GPUs in an application such that a GPU can share data with its neighbours using a high-bandwidth, low-latency communication topology. Enabling P2P, when possible, usually improves the performance by eliminating host staging."
+    "This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0)."
    ]
   }
  ],
@@ -561,7 +435,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.5"
+   "version": "3.9.5"
   }
  },
  "nbformat": 4,

+ 397 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/cuda/streams.ipynb

@@ -0,0 +1,397 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "18638d64",
+   "metadata": {},
+   "source": [
+    "Before we begin, let's get an overview of the CUDA driver version and the GPUs running on the server by executing the `nvidia-smi` command below. Highlight the cell below by clicking on it and then either hit `Ctrl+Enter` on the keyboard or click on the `Run` button on the toolbar above. The output will be visible below the cell."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6ddeeccc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!nvidia-smi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a7c63ff6",
+   "metadata": {},
+   "source": [
+    "# Learning Objectives\n",
+    "\n",
+    "We will learn about the following in this lab:\n",
+    "\n",
+    "* Concept of overlapping computation withEventEvent communication\n",
+    "* CUDA Streams overview and implementation\n",
+    "* CUDA Events overview and implementation\n",
+    "* Synchronization primitives in CUDA for the whole device, stream, event, etc.\n",
+    "\n",
+    "# Improving Application Performance\n",
+    "\n",
+    "### Analysis\n",
+    "\n",
+    "The $(i+1)^{th}$ Jacobi iteration on any GPU cannot begin until all memory operations between all GPUs at the end of $i^{th}$ iteration are complete. The GPU is idle after its memory and compute operations are completed, as is visible in the profiler output below. The white space between the blue device kernel and the orange/ green/ pink memory operations is when the GPU is idle.\n",
+    "\n",
+    "![memcpy_gpu_util](../../images/memcpy_gpu_util.png)\n",
+    "\n",
+    "Let us quantify the time loss from the profiler output. \n",
+    "\n",
+    "![memcpy_util_selection](../../images/memcpy_util_selection.png)\n",
+    "\n",
+    "On average, one iteration of `jacobi_kernel` takes about 600$\\mu$s. The copy operations take about 50$\\mu$s. The total time between Jacobi iterations is about 450$\\mu$s. So the idle time is about $450-50=400\\mu$s. \n",
+    "\n",
+    "We cannot recover all of the idle time as we are currently only considering the device timeline. Launching device kernels and copy operations has host-side overhead as well. Still, there is a significant opportunity to improve performance by minimizing the idle time.\n",
+    "\n",
+    "### Optimization\n",
+    "\n",
+    "Notice that the copy operations take place serially after the Jacobi iteration. The kernel computation must be complete before copying the updated halos from the GPU of interest (source) to its neighbours (destination).\n",
+    "\n",
+    "However, we can perform the copy operation from the neighbouring GPUs (source) to the GPU of interest (destination) concurrently with the kernel computation as it will only be required in the next iteration.\n",
+    "\n",
+    "An important optimization is to overlap computation and communication so that these operations can take place concurrently, whenever possible. We also need to keep track of dependencies so that the $(i+1)^{th}$ iteration on a GPU cannot begin until it sends and receives halos to and from its neighbours at the end of $i^{th}$ iteration.\n",
+    "\n",
+    "\n",
+    "## CUDA Concepts: Part 3\n",
+    "\n",
+    "A CUDA device has multiple \"engines\" that can concurrently manage kernel execution(s) and data transfer(s). That is, we can overlap computation and communication in our application by utilizing these engines. This requires the use of CUDA Streams.\n",
+    "\n",
+    "### Streams\n",
+    "\n",
+    "A stream in CUDA is a sequence of operations that execute on the device in the order in which they are issued by the host code. While operations within a stream are guaranteed to execute in the prescribed order, operations in different streams can be interleaved and, when possible, they can even run concurrently.\n",
+    "\n",
+    "#### The default stream\n",
+    "\n",
+    "All device operations (kernels and data transfers) in CUDA run in a stream. When no stream is specified, the default stream (also called the “null stream”) is used. All of our codes till now have implicitly used the default stream. \n",
+    "\n",
+    "The default stream is different from other streams because it is a synchronizing stream with respect to operations on the device: no operation in the default stream will begin until all previously issued operations in any stream on the device have completed, and an operation in the default stream must complete before any other operation (in any stream on the device) will begin.\n",
+    "\n",
+    "We need to use non-default streams to achieve concurrency as showcased in the image below.\n",
+    "\n",
+    "![cuda_streams_overview](../../images/cuda_streams_overview.png)\n",
+    "\n",
+    "#### Non-default streams\n",
+    "\n",
+    "Let us first learn to create and destroy non-default CUDA streams:\n",
+    "\n",
+    "```c\n",
+    "cudaStream_t stream1;\n",
+    "cudaError_t result;\n",
+    "result = cudaStreamCreate(&stream1);\n",
+    "result = cudaStreamDestroy(stream1);\n",
+    "```\n",
+    "\n",
+    "To issue a data transfer to a non-default stream we use the `cudaMemcpyAsync()` function, which takes a stream identifier as an optional fifth argument.\n",
+    "\n",
+    "```c\n",
+    "result = cudaMemcpyAsync(TopNeighbour, myTopRow, size, cudaMemcpyDeviceToDevice, stream1);\n",
+    "```\n",
+    "\n",
+    "To issue a kernel to a non-default stream we specify the stream identifier as a fourth configuration parameter. The third configuration parameter allocates shared device memory, use 0 for that. \n",
+    "\n",
+    "```c\n",
+    "jacobi_kernel<<<dim_grid, dim_block, 0, stream1>>>(...);\n",
+    "```\n",
+    "\n",
+    "#### Synchronization\n",
+    "\n",
+    "We have already encountered `cudaDeviceSynchronize()` function which blocks the host code until all previously issued operations on the device have completed. There are more fine-grained ways to synchronize codes that use streams.\n",
+    "\n",
+    "The function `cudaStreamSynchronize(stream)` can instead be used to block the host until all previously issued operations in the specified stream have completed.\n",
+    "\n",
+    "## Implementation exercise: Part 3\n",
+    "\n",
+    "Now, let's implement CUDA streams in our application. Open the [jacobi_streams.cu](../../source_code/cuda/jacobi_streams.cu) file.\n",
+    "\n",
+    "Note that we create 3 streams- `compute_stream`, `push_top_stream`, and `push_bottom_stream` for each GPU. We will compute the Jacobi iteration and perform GPU-local L2 norm copy operation on the `compute_stream`. Each GPU will perform its top and bottom halo copy operation to its neighbours using the `push_top_stream` and `push_bottom_stream` streams, respectively. \n",
+    "\n",
+    "Now, within the iterative Jacobi loop (the `while` loop), implement the following marked as `TODO: Part 3-`:\n",
+    "\n",
+    "1. Synchronize `push_top_stream` and `push_bottom_stream` streams to ensure \"top\" and \"bottom\" neighbours have shared updated halos from the previous iteration.\n",
+    "2. Call device kernel on `compute_stream` stream with correct device arrays in function arguments.\n",
+    "3. Asynchronously copy GPU-local L2 norm back to CPU on `compute_stream` stream.\n",
+    "4. Ensure the computation is complete by synchronizing \"compute_stream\" stream before copying the updated halos to neighbours.\n",
+    "5. Implement top and bottom halo exchanges on the correct stream.\n",
+    "\n",
+    "Review the topic above on Non-default streams if in doubt. Recall the utility of using separate `for` loops for launching device kernels and initiating copy operations.\n",
+    "\n",
+    "After implementing these, let's compile the code:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "003cf80a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cd ../../source_code/cuda && make clean && make jacobi_streams"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1cc33f19",
+   "metadata": {},
+   "source": [
+    "Validate the implementation by running the binary:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b71d5a98",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cd ../../source_code/cuda && ./jacobi_streams -p2p"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "442c832a",
+   "metadata": {},
+   "source": [
+    "We tested the code on a DGX-1 system with 8 Tesla V100 16GB GPUs, and we got the following output:\n",
+    "\n",
+    "```bash\n",
+    "Num GPUs: 8.\n",
+    "16384x16384: 1 GPU:   4.4481 s, 8 GPUs:   0.7401 s, speedup:     6.01, efficiency:    75.13 \n",
+    "```\n",
+    "\n",
+    "Recall that the P2P-enabled application using only `cudaMemcpy` functions achieved an efficiency of about $63\\%$ on our system. We get a significant increase of efficiency to about $75\\%$ by achieving compute-communication concurrency.\n",
+    "\n",
+    "Now, enable P2P on our current program by using the `-p2p` runtime flag. On our system, the efficiency increased to $82\\%$. Your efficiency numbers and improvement in performance may differ depending on the system topology, GPU type, etc.\n",
+    "\n",
+    "### Profiling\n",
+    "\n",
+    "Now, profile the P2P-enabled version of the program with `nsys`:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9d0fd7b2",
+   "metadata": {
+    "scrolled": true,
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "!cd ../../source_code/cuda/ && nsys profile --trace=cuda,nvtx --stats=true -o jacobi_streams_p2p_report --force-overwrite true ./jacobi_streams -p2p"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a6f5bf5e",
+   "metadata": {},
+   "source": [
+    "Open the report in GUI and measure the total time between two Jacobi iterations as shown below.\n",
+    "\n",
+    "![streams_util_selection](../../images/streams_util_selection.png)\n",
+    "\n",
+    "The copy operations take same time as before, about 50$\\mu$s. Thus, the idle time is $200-50=150\\mu$s. Compare this idle time with the idle time for non-streams version of the application, which in our case is abour 400$\\mu$s. Concurrency improves GPU utilization and consequently speedup and efficiency.\n",
+    "\n",
+    "**Solution:** The solution for this exercise is present in `source_code/memcpy/solutions` directory: [jacobi_streams.cu](../../source_code/cuda/solutions/jacobi_streams.cu)\n",
+    "\n",
+    "#### Analysis\n",
+    "\n",
+    "Can we improve our program further? Yes! Can you think of any bottleneck that we have mentioned implicitly but haven't addressed yet? \n",
+    "\n",
+    "Recall that `cudaStreamSynchronize` function blocks the \"host\" until all previously issued operations in the specified stream have completed. Do we need to block the host?\n",
+    "\n",
+    "The utility of this function in our application is that it ensures the dependencies between iterations and between computation and communication are respected. We don't need to block the host for this purpose. \n",
+    "\n",
+    "## CUDA Concepts: Part 4\n",
+    "\n",
+    "### CUDA Events\n",
+    "\n",
+    "CUDA Events are synchronization markers that provide a mechanism to signal when operations have occurred \n",
+    "in a stream. They allow fine grained synchronization within a stream and also inter stream synchronization, e.g. let a stream wait for an event in another stream. \n",
+    "\n",
+    "Let us first learn to create and destroy CUDA events:\n",
+    "\n",
+    "```c\n",
+    "cudaEvent_t event1;\n",
+    "cudaError_t result;\n",
+    "result = cudaEventCreate(&event1);\n",
+    "result = cudaEventDestroy(&event1);\n",
+    "```\n",
+    "\n",
+    "#### Recording Events\n",
+    "\n",
+    "Events have a boolean state- Occurred or Not Occurred. The default state is Occurred. We record an event as follows:\n",
+    "\n",
+    "```c\n",
+    "cudaEventRecord(&event1, stream1); \n",
+    "```\n",
+    "\n",
+    "This function sets the event state of `event1` to Not Occurred, enqueues `event1` into queue at `stream1`, and the event state is set to Occurred when it reaches the front of the queue at `stream1`.\n",
+    "\n",
+    "#### Synchronizing Stream with Events\n",
+    "\n",
+    "`cudaEventSynchronize` acts similar to `cudaStreamSynchronize` and blocks the host until the recorded event has \"Occured\". But we do not wish to block the host thread. Thus, we use `cudaStreamWaitEvent`:\n",
+    "\n",
+    "```c\n",
+    "cudaStreamWaitEvent(stream1, event1, 0);\n",
+    "```\n",
+    "\n",
+    "This function blocks the stream until `event1` has Occured and it does not block the host. It works even if the event is recorded in a different stream or on a different device.\n",
+    "\n",
+    "Thus, fine-grained synchronization that doesn't block the host is achieved by first using `cudaEventRecord` on the independent operation, for example, halo copy from GPU 0 to GPU 1 at the end of $i^{th}$ iteration. Then, before issuing the dependent operation, for example, Jacobi computation for $(i+1)^{th}$ iteration on GPU 1, we block the stream using `cudaStreamWaitEvent`.  \n",
+    "\n",
+    "## Implementation Exercise: Part 4\n",
+    "\n",
+    "Let's implement CUDA Events with Streams in our application. Open the [jacobi_streams_events.cu](../../source_code/cuda/jacobi_streams_events.cu) file.\n",
+    "\n",
+    "Note that we create 5 events for each device, `compute_done`, `push_top_done[0]`, `push_top_done[1]`, `push_bottom_done[0]`, and `push_bottom_done[1]`. We need 2 events for each halo on every device:\n",
+    "\n",
+    "1. To synchronize \"top\" and \"bottom\" neighbour's `push_bottom_stream` and `push_top_stream` copy operations of $(i-1)^{th}$ iteration, respectively, before computing $i^{th}$ Jacobi iteration in `compute_stream`.\n",
+    "2. To record current device's `push_top_stream` and `push_bottom_stream` copy operations at the end of $i^{th}$ iteration.\n",
+    "\n",
+    "Now, within the iterative Jacobi loop (the `while` loop), implement the following marked as `TODO: Part 4-`:\n",
+    "\n",
+    "* Block the \"compute_stream\" as long as the top and bottom halos from the neighbours are not copied to `dev_id`. The `push_top_done` and `push_bottom_done` events are to monitored for `bottom` and `top` neighbours, respectively for the previous iteration denoted by `iter % 2`. Note that there should be 2 distinct `cudaStreamWaitEvent` function calls.\n",
+    "* Record that Jacobi computation on `compute_stream` is done by using `cudaEventRecord` for `compute_done` event for `dev_id`.\n",
+    "* Wait for the Jacobi computation of `dev_id` to complete by using the `compute_done` event on `push_top_stream` so that the top halo isn't copied to the neighbour before computation is done.\n",
+    "* Record completion of top halo copy from `dev_id` to its neighbour to be used in next iteration. Record the event for `push_top_done` stream of `dev_id` for next iteration which is `(iter+1) % 2`.\n",
+    "* Repeat the same procedure as described in previous two points for bottom halo copy with `push_bottom_stream` and `push_bottom_done` event.\n",
+    "\n",
+    "After implementing these, compile the code:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4b299e4e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cd ../../source_code/cuda && make clean && make jacobi_streams_events"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d9b942c7",
+   "metadata": {},
+   "source": [
+    "Validate the implementation by running the binary with and without P2P:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "35e57643",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cd ../../source_code/cuda && ./jacobi_streams_events -p2p"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c70e07a9",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "We share the partial output from our DGX-1 8 Tesla V100 system for the binary without using P2P:\n",
+    "\n",
+    "```bash\n",
+    "Num GPUs: 8.\n",
+    "16384x16384: 1 GPU:   4.4485 s, 8 GPUs:   0.6640 s, speedup:     6.70, efficiency:    83.75 \n",
+    "```\n",
+    "\n",
+    "With using P2P, the efficiency increases marginally:\n",
+    "\n",
+    "```bash\n",
+    "Num GPUs: 8.\n",
+    "16384x16384: 1 GPU:   4.4486 s, 8 GPUs:   0.6528 s, speedup:     6.81, efficiency:    85.18 \n",
+    "```\n",
+    "\n",
+    "Let us profile the code to verify that using events indeed overlaps computation with communication within each GPU.\n",
+    "\n",
+    "## Profiling\n",
+    "\n",
+    "Profile the binary with P2P enabled using `nsys`:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4c910f6f-f58c-4d3b-ab37-49dbc4112751",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! cd ../../source_code/cuda/ && nsys profile --trace=cuda,nvtx --stats=true -o jacobi_streams_events_p2p_report --force-overwrite true ./jacobi_streams_events -p2p"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0e330889-77e3-4fe3-9782-b4a13425c9bb",
+   "metadata": {},
+   "source": [
+    "Download the `.qdrep` report file and open it in the Nsight Systems GUI application:\n",
+    "\n",
+    "![jacobi_memcpy_streams_events_p2p_report](../../images/jacobi_memcpy_streams_events_p2p_report.png)\n",
+    "\n",
+    "Observe that the computation is now overlapped with communication within each GPU. Moreover, we have decreased the total idle time between two Jacobi iterations to about $175\\mu$s. Therefore, the GPU idle time is $175-50=125\\mu$s, which is lesser than the $150\\mu$s idle time achieved using just streams.\n",
+    "\n",
+    "**Solution:** The solution for this exercise is present in `source_code/memcpy/solutions` directory: [jacobi_streams_events.cu](../../source_code/cuda/solutions/jacobi_streams_events.cu)\n",
+    "\n",
+    "We have now covered implementing computation and communication overlap using CUDA Streams and then fine-tuning it using CUDA Events. Note that all of our codes currently are confined to a single node. We would like to scale our codes across nodes.\n",
+    "\n",
+    "Therefore, let us learn about multi-node multi-GPU programming with MPI. Click bellow to access the next lab:\n",
+    "\n",
+    "# [Next: Multi-Node programming with MPI](../mpi/multi_node_intro.ipynb)\n",
+    "\n",
+    "Here's a link to the home notebook through which all other notebooks are accessible:\n",
+    "\n",
+    "# [HOME](../../../introduction.ipynb)\n",
+    "\n",
+    "---\n",
+    "## Links and Resources\n",
+    "\n",
+    "* [Programming Concepts: CUDA Streams and Concurrency](https://developer.download.nvidia.com/CUDA/training/StreamsAndConcurrencyWebinar.pdf)\n",
+    "* [Programming Concepts: CUDA Events and Performance Monitoring](https://developer.nvidia.com/blog/how-implement-performance-metrics-cuda-cc/)\n",
+    "* [Programming: CUDA Streams Programming Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#streams-cdp)\n",
+    "* [Concepts: Overlapping Computation and Communication](https://developer.nvidia.com/blog/how-overlap-data-transfers-cuda-cc/)\n",
+    "* [Documentation: CUDA Stream Management API](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html)\n",
+    "* [Documentation: CUDA Events Management API](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html)\n",
+    "* [Code: Multi-GPU Programming Models](https://github.com/NVIDIA/multi-gpu-programming-models)\n",
+    "* [Code: GPU Bootcamp](https://github.com/gpuhackathons-org/gpubootcamp/)\n",
+    "\n",
+    "Don't forget to check out additional [OpenACC Resources](https://www.openacc.org/resources) and join our [OpenACC Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
+    "\n",
+    "## Licensing \n",
+    "\n",
+    "This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0)."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

+ 0 - 149
hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/jacobi/overview.ipynb

@@ -1,149 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "39ad569e",
-   "metadata": {},
-   "source": [
-    "# Laplace Equation\n",
-    "\n",
-    "Laplace Equation is a well-studied linear partial differential equation that governs steady state heat conduction, irrotational fluid flow, and many other phenomena. \n",
-    "\n",
-    "In this lab, we will consider the 2D Laplace Equation on a rectangle with Dirichlet boundary conditions on the left and right boundary and period boundary conditions on top and bottom boundary. We wish to solve the following equation:\n",
-    "\n",
-    "$\\Delta u(x,y) = 0\\;\\forall\\;(x,y)\\in\\Omega,\\delta\\Omega$\n",
-    "\n",
-    "# Jacobi Method\n",
-    "\n",
-    "The Jacobi method is an iterative algorithm to solve a linear system of strictly diagonally dominant equations. The governing equation is discretized and converted to a matrix amenable to Jacobi-method based solver.\n",
-    "\n",
-    "## The Code\n",
-    "\n",
-    "Let's understand the single-GPU code first. The source code file is available here: [jacobi.cu](../../source_code/single_gpu/jacobi.cu).\n",
-    "\n",
-    "Alternatively, you can open the `File` menu and click on the `Open...` option which opens Jupyter's file explorer in a new tab. Then, navigate to `CFD/English/C/source_code/single_gpu/` directory in which you can view the `jacobi.cu` file. \n",
-    "\n",
-    "Similarly, have look at the [Makefile](../../source_code/single_gpu/Makefile). \n",
-    "\n",
-    "Refer to the `single_gpu(...)` function. The important steps at iteration of the Jacobi Solver (that is, the `while` loop) are:\n",
-    "1. The norm is set to 0.\n",
-    "2. The device kernel is called to update the interier points.\n",
-    "3. The norm is copied back to the host, and\n",
-    "4. The boundary conditions are re-applied for the next iteration.\n",
-    "\n",
-    "Note that we run the Jacobi solver for 1000 iterations over the grid.\n",
-    "\n",
-    "## Compilation and Execution\n",
-    "\n",
-    "Let's compile the single-GPU code:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 44,
-   "id": "eac2daf7",
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "rm -f jacobi jacobi.qdrep\r\n",
-      "nvcc -DHAVE_CUB -Xcompiler -fopenmp -lineinfo -DUSE_NVTX -lnvToolsExt -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80 -std=c++14 jacobi.cu -o jacobi\r\n"
-     ]
-    }
-   ],
-   "source": [
-    "!cd ../../source_code/single_gpu && make clean && make"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "33345661",
-   "metadata": {},
-   "source": [
-    "Now, let us execute the program: "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 45,
-   "id": "e234f430",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Single GPU jacobi relaxation: 1000 iterations on 16384 x 16384 mesh with norm check every 1 iterations\n",
-      "    0, 31.999022\n",
-      "  100, 0.897983\n",
-      "  200, 0.535684\n",
-      "  300, 0.395651\n",
-      "  400, 0.319039\n",
-      "  500, 0.269961\n",
-      "  600, 0.235509\n",
-      "  700, 0.209829\n",
-      "  800, 0.189854\n",
-      "  900, 0.173818\n",
-      "16384x16384: 1 GPU:   3.3650 s\n"
-     ]
-    }
-   ],
-   "source": [
-    "!cd ../../source_code/single_gpu && ./jacobi"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "14bb863e",
-   "metadata": {},
-   "source": [
-    "The output reports the norm value every 100 iterations and the total execution time of the Jacobi Solver. We would like to decrease the overall execution time of the program. To quantify the performance gain, we denote the single-GPU execution time as $T_s$ and multi-GPU execution time for $P$ GPUs as $T_p$. using this, we obtain the figures-of-merit, speedup $S = T_s/T_p$ (optimal is $P$) and efficiency $E = S/P$ (optimal is $1$). "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b0c6f16a",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    " !cd ../../source_code/mpi && make clean && make"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0979d23b",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!cd ../../source_code/mpi && mpirun -np 8 nsys profile --trace=mpi,cuda,nvtx ./jacobi"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.5"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}

File diff suppressed because it is too large
+ 247 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/mpi/containers_and_mpi.ipynb


File diff suppressed because it is too large
+ 380 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/mpi/cuda_aware.ipynb


+ 430 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/mpi/memcpy.ipynb

@@ -0,0 +1,430 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "4ecc207b-52c7-463a-8731-19203d384a30",
+   "metadata": {},
+   "source": [
+    "Before we begin, let's get an overview of the CUDA driver version and the GPUs running on the server by executing the `nvidia-smi` command below. Highlight the cell below by clicking on it and then either hit `Ctrl+Enter` on the keyboard or click on the `Run` button on the toolbar above. The output will be visible below the cell."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4d6d1387-f525-40d4-bf3a-f7403bdce2b5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!nvidia-smi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ed9d6f0d-cfa6-4ffd-b970-bee700bf1a90",
+   "metadata": {},
+   "source": [
+    "**Note:** Execution results can vary significantly based on the MPI installation, supporting libraries, workload manager, and underlying CPU and GPU hardware configuration and topology. The codes in this lab have been tested on DGX-1 8 Tesla V100 16 GB nodes connected by Mellanox InfiniBand NICs running OpenMPI v4.1.1 with HPCX 2.8.1 and CUDA v11.3.0.0.\n",
+    "\n",
+    "# Learning Objectives\n",
+    "\n",
+    "We will learn about the following in this lab:\n",
+    "\n",
+    "* Point-to-point and collective MPI communication routines.\n",
+    "* Managing the two-level hierarchy created by global and local rank of a process and how it accesses GPU(s).\n",
+    "* OpenMPI process mappings and its effect on application performance.\n",
+    "\n",
+    "## MPI Inter-Process Communication\n",
+    "\n",
+    "Let us learn more about how MPI communicates between processes.\n",
+    "\n",
+    "### Point-to-Point communication\n",
+    "\n",
+    "Two MPI processes can communicate directly (point-to-point) by sending and receiving data packets to and from each other. Both the sender and receivers processes must acknowledge the transaction using `MPI_Send` and `MPI_Recv` functions. MPI allows tagging messages to differenciate between various messages that processes may send to each other.\n",
+    "\n",
+    "The function syntax for `MPI_Send` is:\n",
+    "\n",
+    "```c\n",
+    "int MPI_Send(void* data, int count, MPI_Datatype datatype, int destination, \n",
+    "         int tag, MPI_Comm communicator);\n",
+    "```\n",
+    "\n",
+    "Similarly, the syntax for `MPI_Recv` is:\n",
+    "\n",
+    "```c\n",
+    "int MPI_Recv(void* data, int count, MPI_Datatype datatype, int source, int tag,\n",
+    "         MPI_Comm communicator, MPI_Status* status);\n",
+    "```\n",
+    "   \n",
+    "A simple 2-process send-receive code is as follows:\n",
+    "\n",
+    "```c\n",
+    "int data;\n",
+    "if (rank == 0) {\n",
+    "    data = -1;\n",
+    "    MPI_Send(&data, 1, MPI_INT, 1, 0, MPI_COMM_WORLD);\n",
+    "} else if (rank == 1) {\n",
+    "    MPI_Recv(&data, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n",
+    "}\n",
+    "```\n",
+    "\n",
+    "There are several other functions to send and receive data synchronously and asynchronously. In particular, we will make use of `MPI_SendRecv` function which sends and receives a message, and whose syntax is as follows:\n",
+    "\n",
+    "```c\n",
+    "int MPI_Sendrecv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,\n",
+    "                int dest, int sendtag,\n",
+    "                void *recvbuf, int recvcount, MPI_Datatype recvtype,\n",
+    "                int source, int recvtag,\n",
+    "                MPI_Comm comm, MPI_Status *status);\n",
+    "```\n",
+    "\n",
+    "### Collective communication\n",
+    "\n",
+    "Collective communication involves participation of all processes in a communicator. It implies an implicit synchronization point among processes. Depending on the requirement, we can peform broadcast, scatter, gather, reduce, and other operations between the participating processes. \n",
+    "\n",
+    "In our application, we would like to reduce all the rank-local norms to a single global norm using the sum operation. We use the `MPI_Allreduce` function for it which combines and reduces values from all processes and distributes the result back to all processes, and whose syntax is as follows:\n",
+    "\n",
+    "```c\n",
+    "int MPI_Allreduce(const void *sendbuf, void *recvbuf, int count,\n",
+    "                  MPI_Datatype datatype, MPI_Op op, MPI_Comm comm);\n",
+    "```\n",
+    "\n",
+    "The `op` in our case will be `MPI_SUM`.\n",
+    "\n",
+    "## Communication Models\n",
+    "\n",
+    "We will use multiple ranks within our program as we will use multiple nodes. There are three major approaches to handle GPUs within a node:\n",
+    "\n",
+    "1. Single GPU per rank\n",
+    "  * One process controls one GPU.\n",
+    "  * Easier to program and understand.\n",
+    "  * We can re-use our domain decomposition approach.\n",
+    "\n",
+    "\n",
+    "2. Multiple GPUs per rank\n",
+    "  * Usually, all GPUs within a node are handled by one process.\n",
+    "  * Coordinating between GPUs is quite tricky as CUDA-based communication is intertwined with MPI communication.\n",
+    "  * Requires a new decomposition for the two-tier communication hierarchy (MPI and CUDA).\n",
+    "\n",
+    "\n",
+    "3. Single GPU per multiple ranks\n",
+    "  * Multiple processes use the same GPU and number of processes in a node is usually equal to number of cores.\n",
+    "  * Intended for heterogeneous codes where both CPU and GPU accelerate the application.\n",
+    "  * CUDA Multi-Process-Service (MPS) is required to allow multiple CUDA processes to share a single GPU context.\n",
+    "  \n",
+    "We will take the first approach due to its simplicity (which eliminates approach #2) and because our application doesn't utilize CPU for compute (which eliminates approach #3). Thus our rank (core) to GPU mapping is one-to-one, as follows:\n",
+    "\n",
+    "![mpi_overview](../../images/mpi_overview.png)\n",
+    "\n",
+    "### Nodel-Level Local Rank\n",
+    "\n",
+    "As we will run on multiple nodes, for example 2, the number of processes launched, 16, will not map one-to-one with GPU Device ID, which runs from 0 to 7 on each node. Thus, we need to create a local rank at the node level.\n",
+    "\n",
+    "To achieve this, we split the `MPI_COMM_WORLD` communicator between the nodes and store it in a `local_comm` communicator. Then, we get the local rank by calling the familiar `MPI_Comm_rank` function. Finally, we free the `local_comm` communicator as we don't require it anymore. \n",
+    "\n",
+    "The code snippet to obtain the `local_rank` at each node level is as follows:\n",
+    "\n",
+    "```c\n",
+    "int local_rank = -1;\n",
+    "MPI_Comm local_comm;\n",
+    "MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL, &local_comm);\n",
+    "MPI_Comm_rank(local_comm, &local_rank);\n",
+    "MPI_Comm_free(&local_comm);\n",
+    "```\n",
+    "\n",
+    "## Implementation Exercise: Part 1\n",
+    "\n",
+    "### Code Structure\n",
+    "\n",
+    "Open the [jacobi_memcpy_mpi.cpp](../../source_code/mpi/jacobi_memcpy_mpi.cpp) file and the [jacobi_kernels.cu](../../source_code/mpi/jacobi_kernels.cu) files from the `source_code/mpi` directory.\n",
+    "\n",
+    "We separate the device kernels from other CUDA and MPI functions as `nvc++` compiler is required to compile CUDA C++ which may not be installed on some platforms Note that NVIDIA's HPC SDK includes the `nvc++` compiler.\n",
+    "\n",
+    "Review the [Makefile](../../source_code/mpi/Makefile) to see that we compile the CUDA kernels using `nvcc` and link the object file with `jacobi_memcpy_mpi.cpp` using `mpicxx` compiler as follows:\n",
+    "\n",
+    "```bash\n",
+    "# Compiling jacobi_kernels.cu\n",
+    "nvcc -gencode arch=compute_80,code=sm_80 -std=c++14 jacobi_kernels.cu -c\n",
+    "# Compiling and linking with jacobi_cuda_aware_mpi.cpp\n",
+    "mpicxx -I${CUDA_HOME}/include -fopenmp -std=c++14 jacobi_cuda_aware_mpi.cpp jacobi_kernels.o \\\n",
+    "        -L${CUDA_HOME}/lib64 -lcudart -lnvToolsExt -o jacobi_cuda_aware_mpi\n",
+    "```\n",
+    "\n",
+    "The device kernels are same as in previous labs. Open `jacobi_memcpy_mpi.cpp` file and understand the flow of the program. In particular, observe the following:\n",
+    "\n",
+    "1. `local_rank` is used to set the current GPU device.\n",
+    "2. Device kernel calls have been replaced with function wrappers for ease of compilation.\n",
+    "3. Rank 0 is used to calculate efficiency and other metrics, even though all ranks compute `single_gpu` function to verify multi-GPU implementation's correctness.\n",
+    "4. In the first set of halo exchanges, `top_halo_buf` stores the top halo copied from the device on the host which is then sent to top neighbour. Whereas `bot_halo_buf` stores the updated bottom halo received from bottom neighbour that is then copied to the device from the host.\n",
+    "5. In the second set of halo exchanges, `top_halo_buf` stores the updated top halo received from the top neighbour that is then copied to the device from the host. Whereas `bot_halo_buf` stores the bottom halo copied from the device to the host that is then sent to the bottom neighbour.\n",
+    "6. Each halo exchange is wrapped in NVTX \"Halo exchange Memcpy+MPI\" for ease of viewing in profiler.\n",
+    "\n",
+    "### To-Do\n",
+    "\n",
+    "Now, implement the following marked as `TODO: Part 1-`:\n",
+    "\n",
+    "* Obtain the node-level local rank by splitting the global communicator.\n",
+    "* Implement the MPI portion of first set of halo exchanges using `MPI_SendRecv` as explained above.\n",
+    "* Implement the Memcpy operations and MPI calls for the second set of halo exchanges. Recall why `cudaMemcpyAsync` is not the correct way of implementing this MPI program.\n",
+    "* Reduce the rank-local L2 Norm to a global L2 norm using `MPI_Allreduce` function.\n",
+    "\n",
+    "After implementing these, compile the program:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "57db8407-a720-4f19-9666-d0b1b37c6a1d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! cd ../../source_code/mpi && make clean && make jacobi_memcpy_mpi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9bd0ae58-b7bf-432d-8ced-367daaefbc7d",
+   "metadata": {},
+   "source": [
+    "Ensure there are no compilation errors. Now, let us validate the program. \n",
+    "\n",
+    "The grid-size of 16384$\\times$16384 has been selected such that all 8 GPUs are fully utilized. To test with 16 GPUs, we increase the grid size to 16384$\\times$32768 to maintain the invariant that GPUs are not under-utilized. Observe that the halo exchange copy size remains the same as before (16K elements * size of float (4B) = 64KB).\n",
+    "\n",
+    "Run the program with 16 processes across 2 nodes as follows:\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9ddb18d3-868f-4dc8-b3c6-7225cc367135",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! cd ../../source_code/mpi && mpirun -np 16 -npersocket 4 ./jacobi_memcpy_mpi -ny 32768"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e9bc18c9-a836-4503-a874-327293fd7d0b",
+   "metadata": {},
+   "source": [
+    "We share the partial output from 2 DGX-1 nodes with 8 Tesla V100-16GB each connected by InfiniBand (IB) NICs:\n",
+    "\n",
+    "```bash\n",
+    "Num GPUs: 16.\n",
+    "16384x32768: 1 GPU:   8.9057 s, 16 GPUs:   0.7695 s, speedup:    11.57, efficiency:    72.34 \n",
+    "```\n",
+    "For reference, we also share the output from 4 DGX-1 nodes with 16K$\\times$64K grid size ($4\\times$ the single-node's grid size):\n",
+    "\n",
+    "```bash\n",
+    "Num GPUs: 32.\n",
+    "16384x65536: 1 GPU:  17.6316 s, 32 GPUs:   0.8526 s, speedup:    20.68, efficiency:    64.62\n",
+    "```\n",
+    "\n",
+    "As the communication overhead increases due to more inter-node communication, the speed-up obtained and thus the efficiency of the application decreases. Nonetheless, our program can scale across mutliple nodes.\n",
+    "\n",
+    "### OpenMPI Process Mappings\n",
+    "\n",
+    "As we mentioned in previous labs, there are multiple ways to specify the number of processes to be run on each socket, node, etc. One such way is to use `--map-by` option. Mapping assigns a default location to each process.  To specify that we want each socket to run 4 processes, we use `--map-by ppr:4:socket` flag. Here, `ppr` stands for processes-per-resource, where the spcified resource is `socket` and the spcified number of processes is `4`. \n",
+    "\n",
+    "It is similar to using the `-npersocket 4` option. Run the following command and validate that the results obtained is the same:\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f874e966-53ad-4251-8059-76697ef6862e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! cd ../../source_code/mpi && mpirun -np 16 --map-by ppr:4:socket ./jacobi_memcpy_mpi -ny 32768"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0765be26-807b-4ee5-ae75-9e9a6a14c293",
+   "metadata": {},
+   "source": [
+    "We can also use the `--map-by ppr:8:node:4:socket` flag. Here, in addition to specifying the number of processes per socket, we also specify the number of processes per node. This should result in the same execution and results. So, run the following command:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ec11f8bf-1948-48d5-8eac-f6a655e1d369",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! cd ../../source_code/mpi && mpirun -np 16 --map-by ppr:8:node:4:socket ./jacobi_memcpy_mpi -ny 32768"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "07f62d8e-3584-414e-9203-e4d24961c2bc",
+   "metadata": {},
+   "source": [
+    "Notice that our efficiency has decreased. We share our partial results:\n",
+    "\n",
+    "```bash\n",
+    "Num GPUs: 16.\n",
+    "16384x32768: 1 GPU:   8.9050 s, 16 GPUs:   0.8150 s, speedup:    10.93, efficiency:    68.2\n",
+    "```\n",
+    "\n",
+    "Compare it with the previous result and notice the increase in multi-node execution time and corresponding decrease in efficiency. Let us check what cores or sockets or nodes each process (or MPI rank) is bound to. Binding constrains each process to run on specific processors. We use the `--report-bindings` option to check this:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b79fbfcf-ca8a-4e86-9ef2-9c6f3387ddea",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! cd ../../source_code/mpi && mpirun -np 16 --map-by ppr:8:node:4:socket --report-bindings ./jacobi_memcpy_mpi -ny 32768"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "37547338-af62-4329-b65c-9b2c0b45130f",
+   "metadata": {},
+   "source": [
+    "The output may seem cluttered, so let us focus on partial output from ranks 0 and 1:\n",
+    "\n",
+    "```bash\n",
+    "[<node_0_name>:<proc_id>] MCW rank 0 bound to socket 0 ... [BB/BB/BB/BB/BB/BB/BB/BB/BB/BB/BB/BB/BB/BB/BB/BB/BB/BB/BB/BB][../../../../../../../../../../../../../../../../../../../..]\n",
+    "[<node_0_name>:<proc_id>] MCW rank 1 bound to socket 1 ... [../../../../../../../../../../../../../../../../../../../..][BB/BB/BB/BB/BB/BB/BB/BB/BB/BB/BB/BB/BB/BB/BB/BB/BB/BB/BB/BB]\n",
+    "```\n",
+    "\n",
+    "Rank 0 is bound to all cores on socket 0 on node 0 while rank 1 is bound to all cores on socket 1 on node 0. Clearly, this is not an optimal arrangement as halo exchanges have to cross socket boundaries for process. Now, check the process bindings in the previous case:\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "24c54f2b-28a4-482d-9d53-4cea6aca1f00",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! cd ../../source_code/mpi && mpirun -np 16 --map-by ppr:4:socket --report-bindings ./jacobi_memcpy_mpi -ny 32768"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2c5791f2-169a-4394-a640-226b365b0ff8",
+   "metadata": {},
+   "source": [
+    "Now, ranks 0 and 1 are bound to the same socket in the same node. Moreover, ranks 3 and 4 are bound to different sockets (as `<procs_per_socket>` is 4) but bound to the same node, as desired.\n",
+    "\n",
+    "It is quite easy to end up in a sub-optimal process mapping by using simple OpenMPI flags and options. Thus, it is always advisible to double-check the process-to-core and process-to-socket bindings.  \n",
+    "\n",
+    "Moving forward, we will use the `--map-by ppr:4:socket` option as evidently it results in desired process-to-core, socket, and node mapping.\n",
+    "\n",
+    "### Profiling\n",
+    "\n",
+    "We can profile an MPI program in two ways. To profile everything, putting the data in one file:\n",
+    "\n",
+    "```bash\n",
+    "nsys [nsys options] mpirun [mpi options] <program>\n",
+    "```\n",
+    "\n",
+    "To profile everything putting the data from each rank into a separate file:\n",
+    "\n",
+    "```bash\n",
+    "mpirun [mpi options] nsys profile [nsys options] <program>\n",
+    "```\n",
+    "\n",
+    "We will use the latter approach as it produces a single report and is more convenient to view. The host compute nodes need a working installation of Nsight Systems.\n",
+    "\n",
+    "Let's profile the application using `nsys`: "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "49c461fa-777d-47ab-94d9-e1ac418b9711",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! cd ../../source_code/mpi && nsys profile --trace=mpi,cuda,nvtx --stats=true --force-overwrite true -o jacobi_memcpy_mpi_report \\\n",
+    "                                 mpirun -np 16 --map-by ppr:4:socket ./jacobi_memcpy_mpi -ny 32768"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c89cc9bd-aed4-4ae4-bd5c-ae4698d44d92",
+   "metadata": {},
+   "source": [
+    "Download the report and view it via the GUI. \n",
+    "\n",
+    "You may notice that only 8 MPI processes are visible even though we launched 16 MPI processes. Nsight Systems displays the output from a single node and inter-node transactions (copy operations) are visible. This is for ease of viewing and doesn't impede our analysis.\n",
+    "\n",
+    "We share the partial output below:\n",
+    "\n",
+    "![mpi_memcpy_overview](../../images/mpi_memcpy_overview.png)\n",
+    "\n",
+    "Observe the following in the Timeline snapshot:\n",
+    "\n",
+    "* Two sets of halo exchanges take place, each consisting of DtoH and HtoD CUDA Memcpy with an `MPI_Sendrecv` call in between for inter-process communication followed by an `MPI_Allreduce` call. \n",
+    "* Each halo exchange takes about $45\\mu$s in hardware and about $60\\mu$s overall including the software overhead.\n",
+    "* The time between two Jacobi kernel iterations is about $200\\mu$s.\n",
+    "\n",
+    "However, if you scroll back in time, you might notice that not all halo exchanges take $60\\mu$s. For example, here's a snapshot from near the beginning of the multi-GPU Jacobi iteration loop:\n",
+    "\n",
+    "![mpi_memcpy_large_time](../../images/mpi_memcpy_large_time.png)\n",
+    "\n",
+    "Here, the halo exchange takes about $1100\\mu$s. MPI uses a lot of heuristics to fine-tune its call-stack and communication protocol to enhance performance. Therefore, we observe the behavior shown above where initially MPI calls take significant time but it improves in subsequent iterations.\n",
+    "\n",
+    "**Solution:** The solution for this exercise is present in `source_code/mpi/solutions` directory: [jacobi_memcpy_mpi.cpp](../../source_code/mpi/solutions/jacobi_memcpy_mpi.cpp).\n",
+    "\n",
+    "Note that our current implementation uses explicit host-staging for every halo copy operation. From our previous labs, we know that within a node, GPU-to-GPU communication can bypass host-staging and we implemented it using DtoD CUDA Memcpy with P2P enabled. Certainly, eliminating host-staging should improve performance. There are also inter-node communication optimizations that we can employ. \n",
+    "\n",
+    "We will learn more about both intra-node and inter-node GPU-centric MPI communication optimizations in the next lab where we will work with CUDA-aware MPI. Click below to move to the next lab:\n",
+    "\n",
+    "# [Next: CUDA-aware MPI](../mpi/cuda_aware.ipynb)\n",
+    "\n",
+    "Here's a link to the home notebook through which all other notebooks are accessible:\n",
+    "\n",
+    "# [HOME](../../../introduction.ipynb)\n",
+    "\n",
+    "---\n",
+    "## Links and Resources\n",
+    "\n",
+    "* [Programming Concepts: MPI Point-to-Point Communication](https://cvw.cac.cornell.edu/mpip2p/p2pdef)\n",
+    "* [Programming Concepts: MPI Collective Communication](https://wgropp.cs.illinois.edu/courses/cs598-s15/lectures/lecture29.pdf)\n",
+    "* [Programming Concepts: NVIDIA Multi-Process Service](https://docs.nvidia.com/deploy/pdf/CUDA_Multi_Process_Service_Overview.pdf)\n",
+    "* [Documentation: MPI Processing Mapping, Ranking, and Binding](https://www.open-mpi.org/doc/current/man1/mpirun.1.php#sect12)\n",
+    "* [Code: Multi-GPU Programming Models](https://github.com/NVIDIA/multi-gpu-programming-models)\n",
+    "* [Code: GPU Bootcamp](https://github.com/gpuhackathons-org/gpubootcamp/)\n",
+    "\n",
+    "Don't forget to check out additional [OpenACC Resources](https://www.openacc.org/resources) and join our [OpenACC Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
+    "\n",
+    "## Licensing \n",
+    "\n",
+    "This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0)."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

File diff suppressed because it is too large
+ 262 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/mpi/multi_node_intro.ipynb


File diff suppressed because it is too large
+ 72 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/nccl/nccl.ipynb


+ 37 - 9
hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/single_gpu/single_gpu_overview.ipynb

@@ -5,6 +5,14 @@
    "id": "39ad569e",
    "metadata": {},
    "source": [
+    "# Learning Objectives\n",
+    "\n",
+    "The goal of this lab is to:\n",
+    "\n",
+    "* Review the scientific problem for which the Jacobi solver application has been developed.\n",
+    "* Understand the run the single-GPU code of the application.\n",
+    "* Learn about NVIDIA Nsight Systems profiler and how to use it to analyze our application.\n",
+    "\n",
     "# The Application\n",
     "\n",
     "This section provides an overview of the scientific problem we focus on and the solver we employ. Then, we execute the single GPU version of the application program.\n",
@@ -31,9 +39,13 @@
     "\n",
     "![gpu_programming_process](../../images/gpu_programming_process.png)\n",
     "\n",
-    "Let's understand the single-GPU code first. The source code file is available here: [jacobi.cu](../../source_code/single_gpu/jacobi.cu).\n",
+    "Let's understand the single-GPU code first. \n",
+    "\n",
+    "The source code file, [jacobi.cu](../../source_code/single_gpu/jacobi.cu) (click to open), is present in `CFD/English/C/source_code/single_gpu/` directory. \n",
     "\n",
-    "Alternatively, you can open the `File` menu and click on the `Open...` option which opens Jupyter's file explorer in a new tab. Then, navigate to `CFD/English/C/source_code/single_gpu/` directory in which you can view the `jacobi.cu` file. \n",
+    "Alternatively, you can navigate to `CFD/English/C/source_code/single_gpu/` directory in Jupyter's file browser in the left pane. Then, click to open the `jacobi.cu` file as shown below:\n",
+    "\n",
+    "![jupyter_lab_navigation](../../images/jupyter_lab_navigation.png)\n",
     "\n",
     "Similarly, have look at the [Makefile](../../source_code/single_gpu/Makefile). \n",
     "\n",
@@ -201,21 +213,37 @@
   {
    "cell_type": "markdown",
    "id": "6db3c3c7",
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "source": [
     "Now, download the report and view it via the GUI. This is the analysis step. Right click on the NVTX tab and select the Events View.\n",
     "\n",
     "![nsys single_gpu_analysis](../../images/nsys_single_gpu_analysis.png)\n",
     "\n",
-    "Clearly, we need to parallelize the \"Jacobi Solve\" routine, which is essentially the iterative Jacobi solver loop. Click on the link to continue to the next lab where we parallelize the code using cudaMemcpy and CUDA streams:\n",
+    "Clearly, we need to parallelize the \"Jacobi Solve\" routine, which is essentially the iterative Jacobi solver loop. Click on the link to continue to the next lab where we parallelize the code using cudaMemcpy and understand concepts like Peer-to-Peer Memory Access.\n",
+    "\n",
+    "# [Next: CUDA Memcpy and Peer-to-Peer Memory Access](../cuda/memcpy.ipynb)\n",
+    "\n",
+    "Here's a link to the home notebook through which all other notebooks are accessible:\n",
+    "\n",
+    "# [HOME](../../../introduction.ipynb)\n",
+    "\n",
+    "---\n",
+    "\n",
+    "## Links and Resources\n",
+    "\n",
+    "* [Science: Laplace Equation](https://mathworld.wolfram.com/LaplacesEquation.html)\n",
+    "* [Science: Jacobi Method](https://en.wikipedia.org/wiki/Jacobi_method)\n",
+    "* [Programming: CUDA C/C++ Programming Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html)\n",
+    "* [Programming: NVTX Documentation](https://docs.nvidia.com/nsight-visual-studio-edition/2020.1/nvtx/index.html)\n",
+    "* [Tools: NVIDIA NSight Systems profiler](https://developer.nvidia.com/nsight-systems)\n",
+    "* [Code: Multi-GPU Programming Models](https://github.com/NVIDIA/multi-gpu-programming-models)\n",
+    "* [Code: GPU Bootcamp](https://github.com/gpuhackathons-org/gpubootcamp/)\n",
     "\n",
-    "# [Multi-GPU: CUDA Streams](../memcpy/streams.ipynb)\n",
+    "Don't forget to check out additional [OpenACC Resources](https://www.openacc.org/resources) and join our [OpenACC Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
     "\n",
     "## Licensing \n",
     "\n",
-    "This material is released by NVIDIA Corporation under the Creative Commons Attribution 4.0 International (CC BY 4.0). "
+    "This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0)."
    ]
   }
  ],
@@ -235,7 +263,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.5"
+   "version": "3.9.5"
   }
  },
  "nbformat": 4,

+ 6 - 2
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/memcpy/Makefile

@@ -1,5 +1,6 @@
 # Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
 NVCC=nvcc
+CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/21.5/cuda/
 GENCODE_SM70    := -gencode arch=compute_70,code=sm_70
 GENCODE_SM80    := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80
 GENCODE_FLAGS	:= $(GENCODE_SM70) $(GENCODE_SM80)
@@ -11,9 +12,12 @@ jacobi_memcpy: jacobi_memcpy.cu
 jacobi_streams: jacobi_streams.cu
 	$(NVCC) $(NVCC_FLAGS) jacobi_streams.cu -o jacobi_streams
 
-all: jacobi_memcpy jacobi_streams
+jacobi_streams_events: jacobi_streams_events.cu
+	$(NVCC) $(NVCC_FLAGS) jacobi_streams_events.cu -o jacobi_streams_events
+
+all: jacobi_memcpy jacobi_streams jacobi_streams_events
 
 .PHONY: clean
 clean:
-	rm -f jacobi_memcpy jacobi_streams *.qdrep *.sqlite
+	rm -f jacobi_memcpy jacobi_streams jacobi_streams_events *.qdrep *.sqlite
 

+ 26 - 25
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/memcpy/jacobi_memcpy.cu

@@ -171,7 +171,7 @@ int main(int argc, char* argv[]) {
         CUDA_RT_CALL(cudaFree(0));
 
         if (0 == dev_id) {
-	    // Allocate memory on host and record single-GPU timings
+	        // Allocate memory on host and record single-GPU timings
             CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(float)));
             CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(float)));
             runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h);
@@ -188,12 +188,12 @@ int main(int argc, char* argv[]) {
         // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = (ny - 2)
         int num_ranks_low = num_devices * chunk_size_low + num_devices - (ny - 2);  
 
-    if (dev_id < num_ranks_low)
+        if (dev_id < num_ranks_low)
             chunk_size[dev_id] = chunk_size_low;
         else
             chunk_size[dev_id] = chunk_size_high;
 
-	// Allocate memory on GPU
+	    // Allocate memory on GPU
         CUDA_RT_CALL(cudaMalloc(a + dev_id, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
         CUDA_RT_CALL(cudaMalloc(a_new + dev_id, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
 
@@ -227,14 +227,14 @@ int main(int argc, char* argv[]) {
 	    // TODO: Part 2- Check whether GPU "devices[dev_id]" can access peer "devices[top]"
             CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, devices[dev_id], devices[top]));
             if (canAccessPeer) {
-		// TODO: Part 2- Enable peer access from GPU "devices[dev_id]" to "devices[top"
+		// TODO: Part 2- Enable peer access from GPU "devices[dev_id]" to "devices[top]"
                 CUDA_RT_CALL(cudaDeviceEnablePeerAccess(devices[top], 0));
             }
             const int bottom = (dev_id + 1) % num_devices;
             if (top != bottom) {
                 canAccessPeer = 0;
 		// TODO: Part 2- Check and enable peer access from GPU "devices[dev_id]" to
-		// "devices[bottom", whenever possible
+		// "devices[bottom]", whenever possible
                 CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, 
 					devices[dev_id], devices[bottom]));
                 if (canAccessPeer) {
@@ -262,7 +262,7 @@ int main(int argc, char* argv[]) {
         CUDA_RT_CALL(cudaDeviceSynchronize());
     }
 
-    printf("Jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, ny, nx);
+    printf("Jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
 
     dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
     int iter = 0;
@@ -271,48 +271,49 @@ int main(int argc, char* argv[]) {
     double start = omp_get_wtime();
     nvtxRangePush("Jacobi solve");
     while (l2_norm > tol && iter < iter_max) {
-	// Launch device kernel on each GPU
+	    // Launch device kernel on each GPU
         for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
-	    // TODO: Part 1- Set current GPU to be "devices[dev_id]"
+            // TODO: Part 1- Set current GPU to be "devices[dev_id]"
             CUDA_RT_CALL(cudaSetDevice(devices[dev_id]));
 
             CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d[dev_id], 0, sizeof(float)));
             dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X,
                           (chunk_size[dev_id] + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
 	    
-	    // TODO: Part 1- call Jacobi kernel with "dim_grid" blocks in grid and "dim_block"
-	    // blocks per thread. "dev_id" variable points to corresponding memory allocated 
-	    // for the current GPU.
+            // TODO: Part 1- Call Jacobi kernel with "dim_grid" blocks in the grid and "dim_block"
+            // threads per block. The "dev_id" variable points to the corresponding memory allocated 
+            // for the current GPU.
             jacobi_kernel<<<dim_grid, dim_block>>>(
                     a_new[dev_id], a[dev_id], l2_norm_d[dev_id], iy_start[dev_id], iy_end[dev_id],
                     nx);
+
+            // TODO: Part 1- Copy GPU-local L2 norm "l2_norm_d" back to CPU "l2_norm_h"
+            CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h[dev_id], l2_norm_d[dev_id], sizeof(float),
+                     cudaMemcpyDeviceToHost));
 	}
-        // Launch async memory copy operations for halo exchange and 
+    // Launch async memory copy operations for halo exchange and 
 	// for copying local-grid L2 norm from each GPU to host
 	for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
             const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
             const int bottom = (dev_id + 1) % num_devices;
-	    // TODO: Part 1- Set current GPU
+            
+            // TODO: Part 1- Set current GPU
             CUDA_RT_CALL(cudaSetDevice(devices[dev_id]));
 
-	    // TODO: Part 1- Copy GPU-local L2 norm "l2_norm_d" back to CPU "l2_norm_h"
-            CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h[dev_id], l2_norm_d[dev_id], sizeof(float),
-                     cudaMemcpyDeviceToHost));
-
-	    // TODO: Part 1- Implement halo exchange with top neighbour "top"
+            // TODO: Part 1- Implement halo exchange with top neighbour "top"
             CUDA_RT_CALL(cudaMemcpyAsync(a_new[top] + (iy_end[top] * nx),
                                          a_new[dev_id] + iy_start[dev_id] * nx, nx * sizeof(float),
                                          cudaMemcpyDeviceToDevice));
 	    
-	    // TODO: Part 1- Implement halo exchange with bottom neighbour "bottom"
+            // TODO: Part 1- Implement halo exchange with bottom neighbour "bottom"
             CUDA_RT_CALL(cudaMemcpyAsync(a_new[bottom], a_new[dev_id] + (iy_end[dev_id] - 1) * nx,
                                          nx * sizeof(float), cudaMemcpyDeviceToDevice));
         }
         l2_norm = 0.0;
-	// Synchronize devices and compute global L2 norm
+        // Synchronize devices and compute global L2 norm
         for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
             // TODO: part 1- Set current GPU and call cudaDeviceSynchronize()
-	    CUDA_RT_CALL(cudaSetDevice(devices[dev_id]));
+	        CUDA_RT_CALL(cudaSetDevice(devices[dev_id]));
             CUDA_RT_CALL(cudaDeviceSynchronize());
 
             l2_norm += *(l2_norm_h[dev_id]);
@@ -413,7 +414,7 @@ double single_gpu(const int nx, const int ny, const int iter_max, float* const a
 
     CUDA_RT_CALL(cudaDeviceSynchronize());
 
-    printf("Single GPU jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, ny, nx);
+    printf("Single GPU jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
 
     dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X, (ny + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
     dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
@@ -437,9 +438,9 @@ double single_gpu(const int nx, const int ny, const int iter_max, float* const a
         CUDA_RT_CALL(cudaMemcpy(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(float),
                                      cudaMemcpyDeviceToDevice));
 
-	CUDA_RT_CALL(cudaDeviceSynchronize());
-	l2_norm = *l2_norm_h;
-	l2_norm = std::sqrt(l2_norm);
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+        l2_norm = *l2_norm_h;
+        l2_norm = std::sqrt(l2_norm);
 
     	iter++;
     	if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm);

+ 455 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/cuda/jacobi_streams.cu

@@ -0,0 +1,455 @@
+/* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <iostream>
+#include <sstream>
+
+#include <omp.h>
+#include <nvToolsExt.h>
+
+#define BLOCK_DIM_X 32
+#define BLOCK_DIM_Y 32
+
+#define CUDA_RT_CALL(call)                                                                  \
+    {                                                                                       \
+        cudaError_t cudaStatus = call;                                                      \
+        if (cudaSuccess != cudaStatus)                                                      \
+            fprintf(stderr,                                                                 \
+                    "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
+                    "with "                                                                 \
+                    "%s (%d).\n",                                                           \
+                    #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
+    }
+
+constexpr int MAX_NUM_DEVICES = 32;
+
+constexpr float tol = 1.0e-8;
+
+const float PI = 2.0 * std::asin(1.0);
+
+__global__ void initialize_boundaries(float*  a_new, float*  a, const float pi, const int offset, 
+                    const int nx, const int my_ny, const int ny) {
+    for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) {
+        const float y0 = sin(2.0 * pi * (offset + iy) / (ny - 1));
+        a[iy * nx + 0] = y0;
+        a[iy * nx + (nx - 1)] = y0;
+        a_new[iy * nx + 0] = y0;
+        a_new[iy * nx + (nx - 1)] = y0;
+    }
+}
+
+__global__ void jacobi_kernel(float*  a_new, const float*  a, float*  l2_norm, const int iy_start,
+                              const int iy_end, const int nx) {
+    int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start;
+    int ix = blockIdx.x * blockDim.x + threadIdx.x + 1;
+    __shared__ float block_l2_sum[BLOCK_DIM_X*BLOCK_DIM_Y];
+    unsigned thread_index = threadIdx.y*BLOCK_DIM_X + threadIdx.x;
+
+    if (iy < iy_end && ix < (nx - 1)) {
+        // Update grid point
+        const float new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] +
+                                      a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]);
+        a_new[iy * nx + ix] = new_val;
+        float residue = new_val - a[iy * nx + ix];
+        // Set block-level L2 norm value for this grid point
+        block_l2_sum[thread_index] = residue * residue;
+    } else {
+        block_l2_sum[thread_index] = 0;
+    }
+    // Reduce L2 norm for the block in parallel
+    for (unsigned stride = 1; stride < BLOCK_DIM_X * BLOCK_DIM_Y; stride *= 2) {
+        __syncthreads();
+        if ((thread_index) % (2 * stride) == 0) {
+            block_l2_sum[thread_index] += block_l2_sum[thread_index + stride];
+        }
+    }
+    // Atomically update global L2 norm with block-reduced L2 norm
+    if (thread_index == 0) {
+        atomicAdd(l2_norm, block_l2_sum[0]);
+    }
+}
+
+int get_argval(char** begin, char** end, const std::string& arg, const int default_val) {
+    int argval = default_val;
+    char** itr = std::find(begin, end, arg);
+    if (itr != end && ++itr != end) {
+        std::istringstream inbuf(*itr);
+        inbuf >> argval;
+    }
+    return argval;
+}
+
+bool get_arg(char** begin, char** end, const std::string& arg) {
+    char** itr = std::find(begin, end, arg);
+    if (itr != end) {
+        return true;
+    }
+    return false;
+}
+
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h);
+
+int main(int argc, char* argv[]) {
+    const int iter_max = get_argval(argv, argv + argc, "-niter", 1000);
+    const int nx = get_argval(argv, argv + argc, "-nx", 16384);
+    const int ny = get_argval(argv, argv + argc, "-ny", 16384);
+    const bool p2p = get_arg(argv, argv + argc, "-p2p");
+
+    float* a[MAX_NUM_DEVICES];
+    float* a_new[MAX_NUM_DEVICES];
+    float* a_ref_h;
+    float* a_h;
+    double runtime_serial = 0.0;
+
+    // Declare compute and halo exchange streams
+    cudaStream_t compute_stream[MAX_NUM_DEVICES];
+    cudaStream_t push_top_stream[MAX_NUM_DEVICES];
+    cudaStream_t push_bottom_stream[MAX_NUM_DEVICES];
+
+    float* l2_norm_d[MAX_NUM_DEVICES];
+    float* l2_norm_h[MAX_NUM_DEVICES];
+
+    int iy_start[MAX_NUM_DEVICES];
+    int iy_end[MAX_NUM_DEVICES];
+
+    int chunk_size[MAX_NUM_DEVICES];
+
+    int num_devices = 0;
+    CUDA_RT_CALL(cudaGetDeviceCount(&num_devices));
+
+    // Compute chunk size and allocate memory on GPUs
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(dev_id));
+        CUDA_RT_CALL(cudaFree(0));
+
+        if (0 == dev_id) {
+            // Allocate memory on host and record single-GPU timings
+            CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(float)));
+            CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(float)));
+            runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h);
+        }
+
+        // ny - 2 rows are distributed amongst `size` ranks in such a way
+        // that each rank gets either (ny - 2) / size or (ny - 2) / size + 1 rows.
+        // This optimizes load balancing when (ny - 2) % size != 0
+        int chunk_size_low = (ny - 2) / num_devices;
+        int chunk_size_high = chunk_size_low + 1;
+
+        // To calculate the number of ranks that need to compute an extra row,
+        // the following formula is derived from this equation:
+        // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = (ny - 2)
+        int num_ranks_low = num_devices * chunk_size_low + num_devices - (ny - 2);  
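+        // Illustrative example (hypothetical sizes, not the defaults): with ny - 2 = 14 rows
+        // and num_devices = 4, chunk_size_low = 3, chunk_size_high = 4 and
+        // num_ranks_low = 4 * 3 + 4 - 14 = 2, so devices 0-1 compute 3 rows each and
+        // devices 2-3 compute 4 rows each (2 * 3 + 2 * 4 = 14).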
+
+        if (dev_id < num_ranks_low)
+            chunk_size[dev_id] = chunk_size_low;
+        else
+            chunk_size[dev_id] = chunk_size_high;
+
+        // Allocate memory on GPU
+        CUDA_RT_CALL(cudaMalloc(a + dev_id, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+        CUDA_RT_CALL(cudaMalloc(a_new + dev_id, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+
+        CUDA_RT_CALL(cudaMemset(a[dev_id], 0, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+        CUDA_RT_CALL(cudaMemset(a_new[dev_id], 0, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+
+        // Calculate local domain boundaries
+        int iy_start_global;  // My start index in the global array
+        if (dev_id < num_ranks_low) {
+            iy_start_global = dev_id * chunk_size_low + 1;
+        } else {
+            iy_start_global =
+                num_ranks_low * chunk_size_low + (dev_id - num_ranks_low) * chunk_size_high + 1;
+        }
+
+        iy_start[dev_id] = 1;
+        iy_end[dev_id] = iy_start[dev_id] + chunk_size[dev_id];
+
+        // Set Dirichlet boundary conditions on left and right border
+        initialize_boundaries<<<(ny / num_devices) / 128 + 1, 128>>>(
+            a[dev_id], a_new[dev_id], PI, iy_start_global - 1, nx, (chunk_size[dev_id] + 2), ny);
+        CUDA_RT_CALL(cudaGetLastError());
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+
+        // Create streams
+        CUDA_RT_CALL(cudaStreamCreate(compute_stream + dev_id));
+        CUDA_RT_CALL(cudaStreamCreate(push_top_stream + dev_id));
+        CUDA_RT_CALL(cudaStreamCreate(push_bottom_stream + dev_id));
+
+        CUDA_RT_CALL(cudaMalloc(l2_norm_d + dev_id, sizeof(float)));
+        CUDA_RT_CALL(cudaMallocHost(l2_norm_h + dev_id, sizeof(float)));
+
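+        // With the -p2p flag, enable direct peer access between neighbouring GPUs wherever
+        // the topology allows it, so the device-to-device halo copies below can move data
+        // directly over the GPU interconnect instead of being staged through host memory.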
+        if (p2p == true) {
+            const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+            int canAccessPeer = 0;
+            CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, dev_id, top));
+            if (canAccessPeer) {
+                CUDA_RT_CALL(cudaDeviceEnablePeerAccess(top, 0));
+            }
+            const int bottom = (dev_id + 1) % num_devices;
+            if (top != bottom) {
+                canAccessPeer = 0;
+                CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, dev_id, bottom));
+                if (canAccessPeer) {
+                    CUDA_RT_CALL(cudaDeviceEnablePeerAccess(bottom, 0));
+                }
+            }
+        }
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+    }
+
+    // Share initial top and bottom local grid-point values between neighbours
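+    // Each chunk was allocated with two extra rows: row 0 is the halo received from the
+    // "top" neighbour and row iy_end is the halo received from the "bottom" neighbour.
+    // The copies below place this device's first and last interior rows into the
+    // corresponding halo slots of its neighbours.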
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(dev_id));
+        const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+        const int bottom = (dev_id + 1) % num_devices;
+        CUDA_RT_CALL(cudaMemcpy(a_new[top] + (iy_end[top] * nx),
+                     a_new[dev_id] + iy_start[dev_id] * nx, nx * sizeof(float),
+                     cudaMemcpyDeviceToDevice));
+        CUDA_RT_CALL(cudaMemcpy(a_new[bottom], a_new[dev_id] + (iy_end[dev_id] - 1) * nx,
+                     nx * sizeof(float), cudaMemcpyDeviceToDevice));
+    }
+
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(dev_id));
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+    }
+
+    printf("Jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+
+    dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    double start = omp_get_wtime();
+    nvtxRangePush("Jacobi solve");
+    while (l2_norm > tol && iter < iter_max) {
+        // Launch device kernel on each GPU
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+            const int bottom = (dev_id + 1) % num_devices;
+            CUDA_RT_CALL(cudaSetDevice(dev_id));
+
+            CUDA_RT_CALL(
+                cudaMemsetAsync(l2_norm_d[dev_id], 0, sizeof(float), compute_stream[dev_id]));
+
+            // TODO: Part 3- Ensure "top" and "bottom" neighbours have shared updated halos
+            // from the previous iteration by synchronizing "push_top_stream" and
+            // "push_bottom_stream" streams. Be careful with which neighbour's top stream and
+            // which neighbour's bottom stream needs to be synchronized.
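+            // This device's bottom halo row was written by its bottom neighbour's
+            // "push_top_stream" and its top halo row by its top neighbour's
+            // "push_bottom_stream", so those are the two streams to wait on here.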
+            CUDA_RT_CALL(cudaStreamSynchronize(push_top_stream[bottom]));
+            CUDA_RT_CALL(cudaStreamSynchronize(push_bottom_stream[top]));
+
+            dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X,
+                          (chunk_size[dev_id] + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
+
+            // TODO: Part 3- Launch Jacobi kernel on "compute_stream[dev_id]" and all other
+            // functional arguments
+            jacobi_kernel<<<dim_grid, dim_block, 0, compute_stream[dev_id]>>>(
+                    a_new[dev_id], a[dev_id], l2_norm_d[dev_id], iy_start[dev_id], iy_end[dev_id],
+                    nx);
+
+            // TODO: Part 3- Copy GPU-local L2 norm "l2_norm_d" back to CPU "l2_norm_h" on
+            // "compute_stream[dev_id]"
+            CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h[dev_id], l2_norm_d[dev_id], sizeof(float),
+                     cudaMemcpyDeviceToHost, compute_stream[dev_id]));
+        }    
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+            const int bottom = (dev_id + 1) % num_devices;
+            CUDA_RT_CALL(cudaSetDevice(dev_id));
+
+            // TODO: Part 3- Before copying the updated halos to neighbours, ensure the 
+            // computation is complete by synchronizing "compute_stream[dev_id]" stream
+            CUDA_RT_CALL(cudaStreamSynchronize(compute_stream[dev_id]));
+
+            // Apply periodic boundary conditions
+            // TODO: Part 3- Implement halo exchange with top neighbour on current device's 
+            // "push_top_stream"
+            CUDA_RT_CALL(cudaMemcpyAsync(a_new[top] + (iy_end[top] * nx),
+                                         a_new[dev_id] + iy_start[dev_id] * nx, nx * sizeof(float),
+                                         cudaMemcpyDeviceToDevice, push_top_stream[dev_id]));
+
+            // TODO: Part 3- Implement halo exchange with "bottom" neighbour on current device's 
+            // "push_bottom_stream"
+            CUDA_RT_CALL(cudaMemcpyAsync(a_new[bottom], a_new[dev_id] + (iy_end[dev_id] - 1) * nx,
+                                         nx * sizeof(float), cudaMemcpyDeviceToDevice,
+                                         push_bottom_stream[dev_id]));
+        }
+        l2_norm = 0.0;
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            l2_norm += *(l2_norm_h[dev_id]);
+        }
+
+        l2_norm = std::sqrt(l2_norm);
+        
+        iter++;
+        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm);
+
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            std::swap(a_new[dev_id], a[dev_id]);
+        }
+    }
+
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(dev_id));
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+    }
+
+    nvtxRangePop();
+    double stop = omp_get_wtime();
+
+    int offset = nx;
+    // Copy computed grid back to host from each GPU
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(
+            cudaMemcpy(a_h + offset, a[dev_id] + nx,
+                       std::min((nx * ny) - offset, nx * chunk_size[dev_id]) * sizeof(float),
+                       cudaMemcpyDeviceToHost));
+        offset += std::min(chunk_size[dev_id] * nx, (nx * ny) - offset);
+    }
+
+    // Compare against single GPU execution for correctness
+    bool result_correct = true;
+    for (int iy = 1; result_correct && (iy < (ny - 1)); ++iy) {
+        for (int ix = 1; result_correct && (ix < (nx - 1)); ++ix) {
+            if (std::fabs(a_ref_h[iy * nx + ix] - a_h[iy * nx + ix]) > tol) {
+                fprintf(stderr,
+                        "ERROR: a[%d * %d + %d] = %f does not match %f "
+                        "(reference)\n",
+                        iy, nx, ix, a_h[iy * nx + ix], a_ref_h[iy * nx + ix]);
+                result_correct = false;
+            }
+        }
+    }
+
+    if (result_correct) {
+        printf("Num GPUs: %d.\n", num_devices);
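+        // Speedup is the single-GPU runtime divided by the multi-GPU runtime; efficiency
+        // divides that speedup by the number of GPUs and reports it as a percentage.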
+        printf(
+            "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, "
+            "efficiency: %8.2f \n",
+            ny, nx, runtime_serial, num_devices, (stop - start),
+            runtime_serial / (stop - start),
+            runtime_serial / (num_devices * (stop - start)) * 100);
+    }
+
+    for (int dev_id = (num_devices - 1); dev_id >= 0; --dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(dev_id));
+        CUDA_RT_CALL(cudaStreamDestroy(push_bottom_stream[dev_id]));
+        CUDA_RT_CALL(cudaStreamDestroy(push_top_stream[dev_id]));
+        CUDA_RT_CALL(cudaStreamDestroy(compute_stream[dev_id]));
+
+        CUDA_RT_CALL(cudaFreeHost(l2_norm_h[dev_id]));
+        CUDA_RT_CALL(cudaFree(l2_norm_d[dev_id]));
+
+        CUDA_RT_CALL(cudaFree(a_new[dev_id]));
+        CUDA_RT_CALL(cudaFree(a[dev_id]));
+        if (0 == dev_id) {
+            CUDA_RT_CALL(cudaFreeHost(a_h));
+            CUDA_RT_CALL(cudaFreeHost(a_ref_h));
+        }
+    }
+
+    return result_correct ? 0 : 1;
+}
+
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h) {
+    float* a;
+    float* a_new;
+
+    float* l2_norm_d;
+    float* l2_norm_h;
+
+    int iy_start = 1;
+    int iy_end = (ny - 1);
+
+    CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(float)));
+
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(float)));
+
+    // Set Dirichlet boundary conditions on left and right border
+    nvtxRangePush("Init boundaries");
+    initialize_boundaries<<<ny / 128 + 1, 128>>>(a, a_new, PI, 0, nx, ny, ny);
+    CUDA_RT_CALL(cudaGetLastError());
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+    nvtxRangePop();
+
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    printf("Single GPU jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+
+    dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X, (ny + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
+    dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    double start = omp_get_wtime();
+    nvtxRangePush("Jacobi Solve");
+    while (l2_norm > tol && iter < iter_max) {
+        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(float)));
+
+        // Compute grid points for this iteration
+        jacobi_kernel<<<dim_grid, dim_block>>>(a_new, a, l2_norm_d, iy_start, iy_end, nx);
+        CUDA_RT_CALL(cudaGetLastError());
+        CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));
+
+        // Apply periodic boundary conditions
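+        // (row 0 receives the last interior row and row iy_end receives the first interior
+        //  row, making the single-GPU domain periodic in the y-direction)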
+        CUDA_RT_CALL(cudaMemcpy(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+        CUDA_RT_CALL(cudaMemcpy(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+        l2_norm = *l2_norm_h;
+        l2_norm = std::sqrt(l2_norm);
+
+        iter++;
+        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm);
+
+        std::swap(a_new, a);
+    }
+    nvtxRangePop();
+    double stop = omp_get_wtime();
+
+    CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(float), cudaMemcpyDeviceToHost));
+
+    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
+    CUDA_RT_CALL(cudaFree(l2_norm_d));
+
+    CUDA_RT_CALL(cudaFree(a_new));
+    CUDA_RT_CALL(cudaFree(a));
+    return (stop - start);
+}
+

+ 14 - 9
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/memcpy/jacobi_streams.cu

@@ -198,13 +198,13 @@ int main(int argc, char* argv[]) {
         CUDA_RT_CALL(cudaStreamCreate(compute_stream + dev_id));
         CUDA_RT_CALL(cudaStreamCreate(push_top_stream + dev_id));
         CUDA_RT_CALL(cudaStreamCreate(push_bottom_stream + dev_id));
-        CUDA_RT_CALL(cudaEventCreateWithFlags(compute_done + dev_id, cudaEventDisableTiming));
-        CUDA_RT_CALL(cudaEventCreateWithFlags(push_top_done[0] + dev_id, cudaEventDisableTiming));
+        CUDA_RT_CALL(cudaEventCreate(compute_done + dev_id));
+        CUDA_RT_CALL(cudaEventCreate(push_top_done[0] + dev_id));
         CUDA_RT_CALL(
-            cudaEventCreateWithFlags(push_bottom_done[0] + dev_id, cudaEventDisableTiming));
-        CUDA_RT_CALL(cudaEventCreateWithFlags(push_top_done[1] + dev_id, cudaEventDisableTiming));
+            cudaEventCreate(push_bottom_done[0] + dev_id));
+        CUDA_RT_CALL(cudaEventCreate(push_top_done[1] + dev_id));
         CUDA_RT_CALL(
-            cudaEventCreateWithFlags(push_bottom_done[1] + dev_id, cudaEventDisableTiming));
+            cudaEventCreate(push_bottom_done[1] + dev_id));
 
         CUDA_RT_CALL(cudaMalloc(l2_norm_d + dev_id, sizeof(float)));
         CUDA_RT_CALL(cudaMallocHost(l2_norm_h + dev_id, sizeof(float)));
@@ -241,10 +241,10 @@ int main(int argc, char* argv[]) {
 
     for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
         CUDA_RT_CALL(cudaSetDevice(dev_id));
-    CUDA_RT_CALL(cudaDeviceSynchronize());
+        CUDA_RT_CALL(cudaDeviceSynchronize());
     }
 
-    printf("Jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, ny, nx);
+    printf("Jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
 
     dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
     int iter = 0;
@@ -272,11 +272,16 @@ int main(int argc, char* argv[]) {
             jacobi_kernel<<<dim_grid, dim_block, 0, compute_stream[dev_id]>>>(
                     a_new[dev_id], a[dev_id], l2_norm_d[dev_id], iy_start[dev_id], iy_end[dev_id],
                     nx);
-            CUDA_RT_CALL(cudaGetLastError());
+
             CUDA_RT_CALL(cudaEventRecord(compute_done[dev_id], compute_stream[dev_id]));
 
             CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h[dev_id], l2_norm_d[dev_id], sizeof(float),
                      cudaMemcpyDeviceToHost, compute_stream[dev_id]));
+        }    
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+            const int bottom = (dev_id + 1) % num_devices;
+            CUDA_RT_CALL(cudaSetDevice(dev_id));
 
             // Apply periodic boundary conditions
             CUDA_RT_CALL(cudaStreamWaitEvent(push_top_stream[dev_id], compute_done[dev_id], 0));
@@ -402,7 +407,7 @@ double single_gpu(const int nx, const int ny, const int iter_max, float* const a
 
     CUDA_RT_CALL(cudaDeviceSynchronize());
 
-    printf("Single GPU jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, ny, nx);
+    printf("Single GPU jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
 
     dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X, (ny + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
     dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);

+ 463 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/cuda/solutions/jacobi_memcpy.cu

@@ -0,0 +1,463 @@
+/* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <iostream>
+#include <sstream>
+
+#include <omp.h>
+#include <nvToolsExt.h>
+
+#define BLOCK_DIM_X 32
+#define BLOCK_DIM_Y 32
+
+#define CUDA_RT_CALL(call)                                                                  \
+    {                                                                                       \
+        cudaError_t cudaStatus = call;                                                      \
+        if (cudaSuccess != cudaStatus)                                                      \
+            fprintf(stderr,                                                                 \
+                    "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
+                    "with "                                                                 \
+                    "%s (%d).\n",                                                           \
+                    #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
+    }
+
+constexpr int MAX_NUM_DEVICES = 32;
+
+constexpr float tol = 1.0e-8;
+
+const float PI = 2.0 * std::asin(1.0);
+
+__global__ void initialize_boundaries(float*  a_new, float*  a, const float pi, const int offset, 
+                    const int nx, const int my_ny, const int ny) {
+    for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) {
+        const float y0 = sin(2.0 * pi * (offset + iy) / (ny - 1));
+        a[iy * nx + 0] = y0;
+        a[iy * nx + (nx - 1)] = y0;
+        a_new[iy * nx + 0] = y0;
+        a_new[iy * nx + (nx - 1)] = y0;
+    }
+}
+
+__global__ void jacobi_kernel(float*  a_new, const float*  a, float*  l2_norm, const int iy_start,
+                              const int iy_end, const int nx) {
+    int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start;
+    int ix = blockIdx.x * blockDim.x + threadIdx.x + 1;
+    __shared__ float block_l2_sum[BLOCK_DIM_X*BLOCK_DIM_Y];
+    unsigned thread_index = threadIdx.y*BLOCK_DIM_X + threadIdx.x;
+
+    if (iy < iy_end && ix < (nx - 1)) {
+        // Update grid point
+        const float new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] +
+                                      a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]);
+        a_new[iy * nx + ix] = new_val;
+        float residue = new_val - a[iy * nx + ix];
+        // Set block-level L2 norm value for this grid point
+        block_l2_sum[thread_index] = residue * residue;
+    } else {
+        block_l2_sum[thread_index] = 0;
+    }
+    // Reduce L2 norm for the block in parallel
+    for (unsigned stride = 1; stride < BLOCK_DIM_X * BLOCK_DIM_Y; stride *= 2) {
+        __syncthreads();
+        if ((thread_index) % (2 * stride) == 0) {
+            block_l2_sum[thread_index] += block_l2_sum[thread_index + stride];
+        }
+    }
+    // Atomically update global L2 norm with block-reduced L2 norm
+    if (thread_index == 0) {
+        atomicAdd(l2_norm, block_l2_sum[0]);
+    }
+}
+
+int get_argval(char** begin, char** end, const std::string& arg, const int default_val) {
+    int argval = default_val;
+    char** itr = std::find(begin, end, arg);
+    if (itr != end && ++itr != end) {
+        std::istringstream inbuf(*itr);
+        inbuf >> argval;
+    }
+    return argval;
+}
+
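+// Parse a comma-separated list of GPU IDs passed via an argument such as "-gpus"
+// (for example, "-gpus 0,2,3" fills devices with {0, 2, 3} and returns 3); if the
+// argument is absent, devices defaults to {0, 1, ..., default_val - 1}.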
+int get_parsed_vals(char** begin, char** end, int* devices,
+                    const std::string& arg, const int default_val) {
+    int numGPUs = default_val;
+    char** itr = std::find(begin, end, arg);
+    if (itr != end && ++itr != end) {
+        numGPUs = 0;
+        std::string dev_ids(*itr);
+        int currpos = 0, nextpos = 0;
+        do {
+            nextpos = dev_ids.find_first_of(",", currpos);
+            devices[numGPUs] = std::stoi(dev_ids.substr(currpos, nextpos));
+            numGPUs++;
+            currpos = nextpos + 1;
+        } while (nextpos != std::string::npos);
+    } else {
+        for (int i = 0; i < numGPUs; i++) {
+            devices[i] = i;
+        }
+    }
+    return numGPUs;
+}
+
+bool get_arg(char** begin, char** end, const std::string& arg) {
+    char** itr = std::find(begin, end, arg);
+    if (itr != end) {
+        return true;
+    }
+    return false;
+}
+
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h);
+
+int main(int argc, char* argv[]) {
+    const int iter_max = get_argval(argv, argv + argc, "-niter", 1000);
+    const int nx = get_argval(argv, argv + argc, "-nx", 16384);
+    const int ny = get_argval(argv, argv + argc, "-ny", 16384);
+    const bool p2p = get_arg(argv, argv + argc, "-p2p");
+    
+    // Get GPU mapping from runtime arguments
+    int available_devices = 0;
+    CUDA_RT_CALL(cudaGetDeviceCount(&available_devices));
+    int devices[MAX_NUM_DEVICES];
+    int num_devices = get_parsed_vals(argv, argv + argc, devices, "-gpus", available_devices);
+
+    float* a[MAX_NUM_DEVICES];
+    float* a_new[MAX_NUM_DEVICES];
+    float* a_ref_h;
+    float* a_h;
+    double runtime_serial = 0.0;
+
+    float* l2_norm_d[MAX_NUM_DEVICES];
+    float* l2_norm_h[MAX_NUM_DEVICES];
+
+    int iy_start[MAX_NUM_DEVICES];
+    int iy_end[MAX_NUM_DEVICES];
+
+    int chunk_size[MAX_NUM_DEVICES];
+
+    // Compute chunk size and allocate memory on GPUs
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(devices[dev_id]));
+        CUDA_RT_CALL(cudaFree(0));
+
+        if (0 == dev_id) {
+            // Allocate memory on host and record single-GPU timings
+            CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(float)));
+            CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(float)));
+            runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h);
+        }
+
+        // ny - 2 rows are distributed amongst `size` ranks in such a way
+        // that each rank gets either (ny - 2) / size or (ny - 2) / size + 1 rows.
+        // This optimizes load balancing when (ny - 2) % size != 0
+        int chunk_size_low = (ny - 2) / num_devices;
+        int chunk_size_high = chunk_size_low + 1;
+
+        // To calculate the number of ranks that need to compute an extra row,
+        // the following formula is derived from this equation:
+        // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = (ny - 2)
+        int num_ranks_low = num_devices * chunk_size_low + num_devices - (ny - 2);  
+
+        if (dev_id < num_ranks_low)
+            chunk_size[dev_id] = chunk_size_low;
+        else
+            chunk_size[dev_id] = chunk_size_high;
+
+        // Allocate memory on GPU
+        CUDA_RT_CALL(cudaMalloc(a + dev_id, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+        CUDA_RT_CALL(cudaMalloc(a_new + dev_id, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+
+        CUDA_RT_CALL(cudaMemset(a[dev_id], 0, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+        CUDA_RT_CALL(cudaMemset(a_new[dev_id], 0, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+
+        // Calculate local domain boundaries
+        int iy_start_global;  // My start index in the global array
+        if (dev_id < num_ranks_low) {
+            iy_start_global = dev_id * chunk_size_low + 1;
+        } else {
+            iy_start_global =
+                num_ranks_low * chunk_size_low + (dev_id - num_ranks_low) * chunk_size_high + 1;
+        }
+
+        iy_start[dev_id] = 1;
+        iy_end[dev_id] = iy_start[dev_id] + chunk_size[dev_id];
+
+        // Set Dirichlet boundary conditions on left and right border
+        initialize_boundaries<<<(ny / num_devices) / 128 + 1, 128>>>(
+            a[dev_id], a_new[dev_id], PI, iy_start_global - 1, nx, (chunk_size[dev_id] + 2), ny);
+        CUDA_RT_CALL(cudaGetLastError());
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+
+        CUDA_RT_CALL(cudaMalloc(l2_norm_d + dev_id, sizeof(float)));
+        CUDA_RT_CALL(cudaMallocHost(l2_norm_h + dev_id, sizeof(float)));
+
+        if (p2p == true) {
+            const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+            int canAccessPeer = 0;
+            // TODO: Part 2- Check whether GPU "devices[dev_id]" can access peer "devices[top]"
+            CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, devices[dev_id], devices[top]));
+            if (canAccessPeer) {
+                // TODO: Part 2- Enable peer access from GPU "devices[dev_id]" to "devices[top]"
+                CUDA_RT_CALL(cudaDeviceEnablePeerAccess(devices[top], 0));
+            }
+            const int bottom = (dev_id + 1) % num_devices;
+            if (top != bottom) {
+                canAccessPeer = 0;
+                // TODO: Part 2- Check and enable peer access from GPU "devices[dev_id]" to
+                // "devices[bottom]", whenever possible
+                CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer,
+                                                     devices[dev_id], devices[bottom]));
+                if (canAccessPeer) {
+                    CUDA_RT_CALL(cudaDeviceEnablePeerAccess(devices[bottom], 0));
+                }
+            }
+        }
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+    }
+
+    // Share initial top and bottom local grid-point values between neighbours
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(devices[dev_id]));
+        const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+        const int bottom = (dev_id + 1) % num_devices;
+        CUDA_RT_CALL(cudaMemcpy(a_new[top] + (iy_end[top] * nx),
+                     a_new[dev_id] + iy_start[dev_id] * nx, nx * sizeof(float),
+                     cudaMemcpyDeviceToDevice));
+        CUDA_RT_CALL(cudaMemcpy(a_new[bottom], a_new[dev_id] + (iy_end[dev_id] - 1) * nx,
+                     nx * sizeof(float), cudaMemcpyDeviceToDevice));
+    }
+
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(devices[dev_id]));
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+    }
+
+    printf("Jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+
+    dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    double start = omp_get_wtime();
+    nvtxRangePush("Jacobi solve");
+    while (l2_norm > tol && iter < iter_max) {
+        // Launch device kernel on each GPU
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            // TODO: Part 1- Set current GPU to be "devices[dev_id]"
+            CUDA_RT_CALL(cudaSetDevice(devices[dev_id]));
+
+            CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d[dev_id], 0, sizeof(float)));
+            dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X,
+                          (chunk_size[dev_id] + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
+	    
+            // TODO: Part 1- Call Jacobi kernel with "dim_grid" blocks in grid and "dim_block"
+            // blocks per thread. "dev_id" variable points to corresponding memory allocated 
+            // for the current GPU.
+            jacobi_kernel<<<dim_grid, dim_block>>>(
+                    a_new[dev_id], a[dev_id], l2_norm_d[dev_id], iy_start[dev_id], iy_end[dev_id],
+                    nx);
+
+            // TODO: Part 1- Copy GPU-local L2 norm "l2_norm_d" back to CPU "l2_norm_h"
+            CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h[dev_id], l2_norm_d[dev_id], sizeof(float),
+                     cudaMemcpyDeviceToHost));
+        }
+        // Launch async memory copy operations for the halo exchanges between
+        // neighbouring GPUs (the per-GPU L2 norm was already copied back above)
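+        // Note that no explicit stream is passed to these copies: each one is issued to the
+        // current device's default stream and is therefore ordered after the Jacobi kernel
+        // launched on that same device above.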
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+            const int bottom = (dev_id + 1) % num_devices;
+            
+            // TODO: Part 1- Set current GPU
+            CUDA_RT_CALL(cudaSetDevice(devices[dev_id]));
+
+            // TODO: Part 1- Implement halo exchange with top neighbour "top"
+            CUDA_RT_CALL(cudaMemcpyAsync(a_new[top] + (iy_end[top] * nx),
+                                         a_new[dev_id] + iy_start[dev_id] * nx, nx * sizeof(float),
+                                         cudaMemcpyDeviceToDevice));
+	    
+            // TODO: Part 1- Implement halo exchange with bottom neighbour "bottom"
+            CUDA_RT_CALL(cudaMemcpyAsync(a_new[bottom], a_new[dev_id] + (iy_end[dev_id] - 1) * nx,
+                                         nx * sizeof(float), cudaMemcpyDeviceToDevice));
+        }
+        l2_norm = 0.0;
+        // Synchronize devices and compute global L2 norm
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            // TODO: part 1- Set current GPU and call cudaDeviceSynchronize()
+            CUDA_RT_CALL(cudaSetDevice(devices[dev_id]));
+            CUDA_RT_CALL(cudaDeviceSynchronize());
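+            // cudaDeviceSynchronize() waits for all work issued to devices[dev_id] (the
+            // Jacobi kernel, both halo copies and the L2-norm copy), so l2_norm_h[dev_id]
+            // is valid to read here and, once every device has been synchronized, the
+            // halos are in place for the next iteration.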
+
+            l2_norm += *(l2_norm_h[dev_id]);
+        }
+
+        l2_norm = std::sqrt(l2_norm);
+        
+        iter++;
+        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm);
+
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            std::swap(a_new[dev_id], a[dev_id]);
+        }
+    }
+
+    nvtxRangePop();
+    double stop = omp_get_wtime();
+
+    int offset = nx;
+    // Copy computed grid back to host from each GPU
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(
+            cudaMemcpy(a_h + offset, a[dev_id] + nx,
+                       std::min((nx * ny) - offset, nx * chunk_size[dev_id]) * sizeof(float),
+                       cudaMemcpyDeviceToHost));
+        offset += std::min(chunk_size[dev_id] * nx, (nx * ny) - offset);
+    }
+
+    // Compare against single GPU execution for correctness
+    bool result_correct = true;
+    for (int iy = 1; result_correct && (iy < (ny - 1)); ++iy) {
+        for (int ix = 1; result_correct && (ix < (nx - 1)); ++ix) {
+            if (std::fabs(a_ref_h[iy * nx + ix] - a_h[iy * nx + ix]) > tol) {
+                fprintf(stderr,
+                        "ERROR: a[%d * %d + %d] = %f does not match %f "
+                        "(reference)\n",
+                        iy, nx, ix, a_h[iy * nx + ix], a_ref_h[iy * nx + ix]);
+                result_correct = false;
+            }
+        }
+    }
+
+    if (result_correct) {
+        printf("Num GPUs: %d. Using GPU ID: ", num_devices);
+        for (int i = 0; i < num_devices; i++) {
+            printf("%d, ", devices[i]);
+        }
+        printf(
+            "\n%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, "
+            "efficiency: %8.2f \n",
+            ny, nx, runtime_serial, num_devices, (stop - start),
+            runtime_serial / (stop - start),
+            runtime_serial / (num_devices * (stop - start)) * 100);
+    }
+
+    for (int dev_id = (num_devices - 1); dev_id >= 0; --dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(devices[dev_id]));
+
+        CUDA_RT_CALL(cudaFreeHost(l2_norm_h[dev_id]));
+        CUDA_RT_CALL(cudaFree(l2_norm_d[dev_id]));
+
+        CUDA_RT_CALL(cudaFree(a_new[dev_id]));
+        CUDA_RT_CALL(cudaFree(a[dev_id]));
+        if (0 == dev_id) {
+            CUDA_RT_CALL(cudaFreeHost(a_h));
+            CUDA_RT_CALL(cudaFreeHost(a_ref_h));
+        }
+    }
+
+    return result_correct ? 0 : 1;
+}
+
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h) {
+    float* a;
+    float* a_new;
+
+    float* l2_norm_d;
+    float* l2_norm_h;
+
+    int iy_start = 1;
+    int iy_end = (ny - 1);
+
+    CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(float)));
+
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(float)));
+
+    // Set Dirichlet boundary conditions on left and right border
+    nvtxRangePush("Init boundaries");
+    initialize_boundaries<<<ny / 128 + 1, 128>>>(a, a_new, PI, 0, nx, ny, ny);
+    CUDA_RT_CALL(cudaGetLastError());
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+    nvtxRangePop();
+
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    printf("Single GPU jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+
+    dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X, (ny + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
+    dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    double start = omp_get_wtime();
+    nvtxRangePush("Jacobi Solve");
+    while (l2_norm > tol && iter < iter_max) {
+        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(float)));
+
+       // Compute grid points for this iteration
+        jacobi_kernel<<<dim_grid, dim_block>>>(a_new, a, l2_norm_d, iy_start, iy_end, nx);
+        CUDA_RT_CALL(cudaGetLastError());
+        CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));
+
+        // Apply periodic boundary conditions
+        CUDA_RT_CALL(cudaMemcpy(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+        CUDA_RT_CALL(cudaMemcpy(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+        l2_norm = *l2_norm_h;
+        l2_norm = std::sqrt(l2_norm);
+
+        iter++;
+        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm);
+
+        std::swap(a_new, a);
+
+    }
+    nvtxRangePop();
+    double stop = omp_get_wtime();
+
+    CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(float), cudaMemcpyDeviceToHost));
+
+    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
+    CUDA_RT_CALL(cudaFree(l2_norm_d));
+
+    CUDA_RT_CALL(cudaFree(a_new));
+    CUDA_RT_CALL(cudaFree(a));
+    return (stop - start);
+}
+

+ 455 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/cuda/solutions/jacobi_streams.cu

@@ -0,0 +1,455 @@
+/* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <iostream>
+#include <sstream>
+
+#include <omp.h>
+#include <nvToolsExt.h>
+
+#define BLOCK_DIM_X 32
+#define BLOCK_DIM_Y 32
+
+#define CUDA_RT_CALL(call)                                                                  \
+    {                                                                                       \
+        cudaError_t cudaStatus = call;                                                      \
+        if (cudaSuccess != cudaStatus)                                                      \
+            fprintf(stderr,                                                                 \
+                    "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
+                    "with "                                                                 \
+                    "%s (%d).\n",                                                           \
+                    #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
+    }
+
+constexpr int MAX_NUM_DEVICES = 32;
+
+constexpr float tol = 1.0e-8;
+
+const float PI = 2.0 * std::asin(1.0);
+
+__global__ void initialize_boundaries(float*  a_new, float*  a, const float pi, const int offset, 
+                    const int nx, const int my_ny, const int ny) {
+    for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) {
+        const float y0 = sin(2.0 * pi * (offset + iy) / (ny - 1));
+        a[iy * nx + 0] = y0;
+        a[iy * nx + (nx - 1)] = y0;
+        a_new[iy * nx + 0] = y0;
+        a_new[iy * nx + (nx - 1)] = y0;
+    }
+}
+
+__global__ void jacobi_kernel(float*  a_new, const float*  a, float*  l2_norm, const int iy_start,
+                              const int iy_end, const int nx) {
+    int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start;
+    int ix = blockIdx.x * blockDim.x + threadIdx.x + 1;
+    __shared__ float block_l2_sum[BLOCK_DIM_X*BLOCK_DIM_Y];
+    unsigned thread_index = threadIdx.y*BLOCK_DIM_X + threadIdx.x;
+
+    if (iy < iy_end && ix < (nx - 1)) {
+        // Update grid point
+        const float new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] +
+                                      a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]);
+        a_new[iy * nx + ix] = new_val;
+        float residue = new_val - a[iy * nx + ix];
+        // Set block-level L2 norm value for this grid point
+        block_l2_sum[thread_index] = residue * residue;
+    } else {
+        block_l2_sum[thread_index] = 0;
+    }
+    // Reduce L2 norm for the block in parallel
+    for (unsigned stride = 1; stride < BLOCK_DIM_X * BLOCK_DIM_Y; stride *= 2) {
+        __syncthreads();
+        if ((thread_index) % (2 * stride) == 0) {
+            block_l2_sum[thread_index] += block_l2_sum[thread_index + stride];
+        }
+    }
+    // Atomically update global L2 norm with block-reduced L2 norm
+    if (thread_index == 0) {
+        atomicAdd(l2_norm, block_l2_sum[0]);
+    }
+}
+
+int get_argval(char** begin, char** end, const std::string& arg, const int default_val) {
+    int argval = default_val;
+    char** itr = std::find(begin, end, arg);
+    if (itr != end && ++itr != end) {
+        std::istringstream inbuf(*itr);
+        inbuf >> argval;
+    }
+    return argval;
+}
+
+bool get_arg(char** begin, char** end, const std::string& arg) {
+    char** itr = std::find(begin, end, arg);
+    if (itr != end) {
+        return true;
+    }
+    return false;
+}
+
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h);
+
+int main(int argc, char* argv[]) {
+    const int iter_max = get_argval(argv, argv + argc, "-niter", 1000);
+    const int nx = get_argval(argv, argv + argc, "-nx", 16384);
+    const int ny = get_argval(argv, argv + argc, "-ny", 16384);
+    const bool p2p = get_arg(argv, argv + argc, "-p2p");
+
+    float* a[MAX_NUM_DEVICES];
+    float* a_new[MAX_NUM_DEVICES];
+    float* a_ref_h;
+    float* a_h;
+    double runtime_serial = 0.0;
+
+    // Declare compute and halo exchange streams
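+    // One compute stream plus two copy streams per device lets the Jacobi kernel and the
+    // two halo copies be ordered explicitly with per-stream synchronization instead of
+    // synchronizing the whole device on every iteration.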
+    cudaStream_t compute_stream[MAX_NUM_DEVICES];
+    cudaStream_t push_top_stream[MAX_NUM_DEVICES];
+    cudaStream_t push_bottom_stream[MAX_NUM_DEVICES];
+
+    float* l2_norm_d[MAX_NUM_DEVICES];
+    float* l2_norm_h[MAX_NUM_DEVICES];
+
+    int iy_start[MAX_NUM_DEVICES];
+    int iy_end[MAX_NUM_DEVICES];
+
+    int chunk_size[MAX_NUM_DEVICES];
+
+    int num_devices = 0;
+    CUDA_RT_CALL(cudaGetDeviceCount(&num_devices));
+
+    // Compute chunk size and allocate memory on GPUs
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(dev_id));
+        CUDA_RT_CALL(cudaFree(0));
+
+        if (0 == dev_id) {
+            // Allocate memory on host and record single-GPU timings
+            CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(float)));
+            CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(float)));
+            runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h);
+        }
+
+        // ny - 2 rows are distributed amongst `size` ranks in such a way
+        // that each rank gets either (ny - 2) / size or (ny - 2) / size + 1 rows.
+        // This optimizes load balancing when (ny - 2) % size != 0
+        int chunk_size_low = (ny - 2) / num_devices;
+        int chunk_size_high = chunk_size_low + 1;
+
+        // To calculate the number of ranks that need to compute an extra row,
+        // the following formula is derived from this equation:
+        // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = (ny - 2)
+        int num_ranks_low = num_devices * chunk_size_low + num_devices - (ny - 2);  
+
+        if (dev_id < num_ranks_low)
+            chunk_size[dev_id] = chunk_size_low;
+        else
+            chunk_size[dev_id] = chunk_size_high;
+
+        // Allocate memory on GPU
+        CUDA_RT_CALL(cudaMalloc(a + dev_id, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+        CUDA_RT_CALL(cudaMalloc(a_new + dev_id, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+
+        CUDA_RT_CALL(cudaMemset(a[dev_id], 0, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+        CUDA_RT_CALL(cudaMemset(a_new[dev_id], 0, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+
+        // Calculate local domain boundaries
+        int iy_start_global;  // My start index in the global array
+        if (dev_id < num_ranks_low) {
+            iy_start_global = dev_id * chunk_size_low + 1;
+        } else {
+            iy_start_global =
+                num_ranks_low * chunk_size_low + (dev_id - num_ranks_low) * chunk_size_high + 1;
+        }
+
+        iy_start[dev_id] = 1;
+        iy_end[dev_id] = iy_start[dev_id] + chunk_size[dev_id];
+
+        // Set Dirichlet boundary conditions on left and right border
+        initialize_boundaries<<<(ny / num_devices) / 128 + 1, 128>>>(
+            a[dev_id], a_new[dev_id], PI, iy_start_global - 1, nx, (chunk_size[dev_id] + 2), ny);
+        CUDA_RT_CALL(cudaGetLastError());
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+
+        // Create streams
+        CUDA_RT_CALL(cudaStreamCreate(compute_stream + dev_id));
+        CUDA_RT_CALL(cudaStreamCreate(push_top_stream + dev_id));
+        CUDA_RT_CALL(cudaStreamCreate(push_bottom_stream + dev_id));
+
+        CUDA_RT_CALL(cudaMalloc(l2_norm_d + dev_id, sizeof(float)));
+        CUDA_RT_CALL(cudaMallocHost(l2_norm_h + dev_id, sizeof(float)));
+
+        if (p2p == true) {
+            const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+            int canAccessPeer = 0;
+            CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, dev_id, top));
+            if (canAccessPeer) {
+                CUDA_RT_CALL(cudaDeviceEnablePeerAccess(top, 0));
+            }
+            const int bottom = (dev_id + 1) % num_devices;
+            if (top != bottom) {
+                canAccessPeer = 0;
+                CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, dev_id, bottom));
+                if (canAccessPeer) {
+                    CUDA_RT_CALL(cudaDeviceEnablePeerAccess(bottom, 0));
+                }
+            }
+        }
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+    }
+
+    // Share initial top and bottom local grid-point values between neighbours
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(dev_id));
+        const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+        const int bottom = (dev_id + 1) % num_devices;
+        CUDA_RT_CALL(cudaMemcpy(a_new[top] + (iy_end[top] * nx),
+                     a_new[dev_id] + iy_start[dev_id] * nx, nx * sizeof(float),
+                     cudaMemcpyDeviceToDevice));
+        CUDA_RT_CALL(cudaMemcpy(a_new[bottom], a_new[dev_id] + (iy_end[dev_id] - 1) * nx,
+                     nx * sizeof(float), cudaMemcpyDeviceToDevice));
+    }
+
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(dev_id));
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+    }
+
+    printf("Jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+
+    dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    double start = omp_get_wtime();
+    nvtxRangePush("Jacobi solve");
+    while (l2_norm > tol && iter < iter_max) {
+        // Launch device kernel on each GPU
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+            const int bottom = (dev_id + 1) % num_devices;
+            CUDA_RT_CALL(cudaSetDevice(dev_id));
+
+            CUDA_RT_CALL(
+                cudaMemsetAsync(l2_norm_d[dev_id], 0, sizeof(float), compute_stream[dev_id]));
+
+            // TODO: Part 3- Ensure "top" and "bottom" neighbours have shared updated halos
+            // from the previous iteration by synchronizing "push_top_stream" and
+            // "push_bottom_stream" streams. Be careful with which neighbour's top stream and
+            // which neighbour's bottom stream needs to be synchronized.
+            CUDA_RT_CALL(cudaStreamSynchronize(push_top_stream[bottom]));
+            CUDA_RT_CALL(cudaStreamSynchronize(push_bottom_stream[top]));
+
+            dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X,
+                          (chunk_size[dev_id] + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
+
+            // TODO: Part 3- Launch Jacobi kernel on "compute_stream[dev_id]" and all other
+            // functional arguments
+            jacobi_kernel<<<dim_grid, dim_block, 0, compute_stream[dev_id]>>>(
+                    a_new[dev_id], a[dev_id], l2_norm_d[dev_id], iy_start[dev_id], iy_end[dev_id],
+                    nx);
+
+            // TODO: Part 3- Copy GPU-local L2 norm "l2_norm_d" back to CPU "l2_norm_h" on
+            // "compute_stream[dev_id]"
+            CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h[dev_id], l2_norm_d[dev_id], sizeof(float),
+                     cudaMemcpyDeviceToHost, compute_stream[dev_id]));
+        }    
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+            const int bottom = (dev_id + 1) % num_devices;
+            CUDA_RT_CALL(cudaSetDevice(dev_id));
+
+            // TODO: Part 3- Before copying the updated halos to neighbours, ensure the 
+            // computation is complete by synchronizing "compute_stream[dev_id]" stream
+            CUDA_RT_CALL(cudaStreamSynchronize(compute_stream[dev_id]));
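+            // (this also guarantees that the asynchronous L2-norm copy issued on the same
+            //  stream has completed, so l2_norm_h[dev_id] is safe to read below)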
+
+            // Apply periodic boundary conditions
+            // TODO: Part 3- Implement halo exchange with top neighbour on current device's 
+            // "push_top_stream"
+            CUDA_RT_CALL(cudaMemcpyAsync(a_new[top] + (iy_end[top] * nx),
+                                         a_new[dev_id] + iy_start[dev_id] * nx, nx * sizeof(float),
+                                         cudaMemcpyDeviceToDevice, push_top_stream[dev_id]));
+
+            // TODO: Part 3- Implement halo exchange with "bottom" neighbour on current device's 
+            // "push_bottom_stream"
+            CUDA_RT_CALL(cudaMemcpyAsync(a_new[bottom], a_new[dev_id] + (iy_end[dev_id] - 1) * nx,
+                                         nx * sizeof(float), cudaMemcpyDeviceToDevice,
+                                         push_bottom_stream[dev_id]));
+        }
+        l2_norm = 0.0;
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            l2_norm += *(l2_norm_h[dev_id]);
+        }
+
+        l2_norm = std::sqrt(l2_norm);
+        
+        iter++;
+        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm);
+
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            std::swap(a_new[dev_id], a[dev_id]);
+        }
+    }
+
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(dev_id));
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+    }
+
+    nvtxRangePop();
+    double stop = omp_get_wtime();
+
+    int offset = nx;
+    // Copy computed grid back to host from each GPU
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(
+            cudaMemcpy(a_h + offset, a[dev_id] + nx,
+                       std::min((nx * ny) - offset, nx * chunk_size[dev_id]) * sizeof(float),
+                       cudaMemcpyDeviceToHost));
+        offset += std::min(chunk_size[dev_id] * nx, (nx * ny) - offset);
+    }
+
+    // Compare against single GPU execution for correctness
+    bool result_correct = true;
+    for (int iy = 1; result_correct && (iy < (ny - 1)); ++iy) {
+        for (int ix = 1; result_correct && (ix < (nx - 1)); ++ix) {
+            if (std::fabs(a_ref_h[iy * nx + ix] - a_h[iy * nx + ix]) > tol) {
+                fprintf(stderr,
+                        "ERROR: a[%d * %d + %d] = %f does not match %f "
+                        "(reference)\n",
+                        iy, nx, ix, a_h[iy * nx + ix], a_ref_h[iy * nx + ix]);
+                result_correct = false;
+            }
+        }
+    }
+
+    if (result_correct) {
+        printf("Num GPUs: %d.\n", num_devices);
+        printf(
+            "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, "
+            "efficiency: %8.2f \n",
+            ny, nx, runtime_serial, num_devices, (stop - start),
+            runtime_serial / (stop - start),
+            runtime_serial / (num_devices * (stop - start)) * 100);
+    }
+
+    for (int dev_id = (num_devices - 1); dev_id >= 0; --dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(dev_id));
+        CUDA_RT_CALL(cudaStreamDestroy(push_bottom_stream[dev_id]));
+        CUDA_RT_CALL(cudaStreamDestroy(push_top_stream[dev_id]));
+        CUDA_RT_CALL(cudaStreamDestroy(compute_stream[dev_id]));
+
+        CUDA_RT_CALL(cudaFreeHost(l2_norm_h[dev_id]));
+        CUDA_RT_CALL(cudaFree(l2_norm_d[dev_id]));
+
+        CUDA_RT_CALL(cudaFree(a_new[dev_id]));
+        CUDA_RT_CALL(cudaFree(a[dev_id]));
+        if (0 == dev_id) {
+            CUDA_RT_CALL(cudaFreeHost(a_h));
+            CUDA_RT_CALL(cudaFreeHost(a_ref_h));
+        }
+    }
+
+    return result_correct ? 0 : 1;
+}
+
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h) {
+    float* a;
+    float* a_new;
+
+    float* l2_norm_d;
+    float* l2_norm_h;
+
+    int iy_start = 1;
+    int iy_end = (ny - 1);
+
+    CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(float)));
+
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(float)));
+
+    // Set Dirichlet boundary conditions on left and right border
+    nvtxRangePush("Init boundaries");
+    initialize_boundaries<<<ny / 128 + 1, 128>>>(a, a_new, PI, 0, nx, ny, ny);
+    CUDA_RT_CALL(cudaGetLastError());
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+    nvtxRangePop();
+
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    printf("Single GPU jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+
+    dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X, (ny + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
+    dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    double start = omp_get_wtime();
+    nvtxRangePush("Jacobi Solve");
+    while (l2_norm > tol && iter < iter_max) {
+        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(float)));
+
+        // Compute grid points for this iteration
+        jacobi_kernel<<<dim_grid, dim_block>>>(a_new, a, l2_norm_d, iy_start, iy_end, nx);
+        CUDA_RT_CALL(cudaGetLastError());
+        CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));
+
+        // Apply periodic boundary conditions
+        CUDA_RT_CALL(cudaMemcpy(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+        CUDA_RT_CALL(cudaMemcpy(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+        l2_norm = *l2_norm_h;
+        l2_norm = std::sqrt(l2_norm);
+
+        iter++;
+        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm);
+
+        std::swap(a_new, a);
+    }
+    nvtxRangePop();
+    double stop = omp_get_wtime();
+
+    CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(float), cudaMemcpyDeviceToHost));
+
+    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
+    CUDA_RT_CALL(cudaFree(l2_norm_d));
+
+    CUDA_RT_CALL(cudaFree(a_new));
+    CUDA_RT_CALL(cudaFree(a));
+    return (stop - start);
+}
+

+ 138 - 193
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/memcpy/jacobi.cu

@@ -31,36 +31,10 @@
 #include <sstream>
 
 #include <omp.h>
-
-#ifdef HAVE_CUB
-#include <cub/block/block_reduce.cuh>
-#endif  // HAVE_CUB
-
-#ifdef USE_NVTX
 #include <nvToolsExt.h>
 
-const uint32_t colors[] = {0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff,
-                           0x0000ffff, 0x00ff0000, 0x00ffffff};
-const int num_colors = sizeof(colors) / sizeof(uint32_t);
-
-#define PUSH_RANGE(name, cid)                              \
-    {                                                      \
-        int color_id = cid;                                \
-        color_id = color_id % num_colors;                  \
-        nvtxEventAttributes_t eventAttrib = {0};           \
-        eventAttrib.version = NVTX_VERSION;                \
-        eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;  \
-        eventAttrib.colorType = NVTX_COLOR_ARGB;           \
-        eventAttrib.color = colors[color_id];              \
-        eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
-        eventAttrib.message.ascii = name;                  \
-        nvtxRangePushEx(&eventAttrib);                     \
-    }
-#define POP_RANGE nvtxRangePop();
-#else
-#define PUSH_RANGE(name, cid)
-#define POP_RANGE
-#endif
+#define BLOCK_DIM_X 32
+#define BLOCK_DIM_Y 32
 
 #define CUDA_RT_CALL(call)                                                                  \
     {                                                                                       \
@@ -75,16 +49,14 @@ const int num_colors = sizeof(colors) / sizeof(uint32_t);
 
 constexpr int MAX_NUM_DEVICES = 32;
 
-typedef float real;
-constexpr real tol = 1.0e-8;
+constexpr float tol = 1.0e-8;
 
-const real PI = 2.0 * std::asin(1.0);
+const float PI = 2.0 * std::asin(1.0);
 
-__global__ void initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a,
-                                      const real pi, const int offset, const int nx,
-                                      const int my_ny, const int ny) {
+__global__ void initialize_boundaries(float*  a_new, float*  a, const float pi, const int offset, 
+                    const int nx, const int my_ny, const int ny) {
     for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) {
-        const real y0 = sin(2.0 * pi * (offset + iy) / (ny - 1));
+        const float y0 = sin(2.0 * pi * (offset + iy) / (ny - 1));
         a[iy * nx + 0] = y0;
         a[iy * nx + (nx - 1)] = y0;
         a_new[iy * nx + 0] = y0;
@@ -92,44 +64,40 @@ __global__ void initialize_boundaries(real* __restrict__ const a_new, real* __re
     }
 }
 
-template <int BLOCK_DIM_X, int BLOCK_DIM_Y>
-__global__ void jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a,
-                              real* __restrict__ const l2_norm, const int iy_start,
-                              const int iy_end, const int nx, const bool calculate_norm) {
-#ifdef HAVE_CUB
-    typedef cub::BlockReduce<real, BLOCK_DIM_X, cub::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y>
-        BlockReduce;
-    __shared__ typename BlockReduce::TempStorage temp_storage;
-#endif  // HAVE_CUB
+__global__ void jacobi_kernel(float*  a_new, const float*  a, float*  l2_norm, const int iy_start,
+                              const int iy_end, const int nx) {
     int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start;
     int ix = blockIdx.x * blockDim.x + threadIdx.x + 1;
-    real local_l2_norm = 0.0;
+    __shared__ float block_l2_sum[BLOCK_DIM_X*BLOCK_DIM_Y];
+    unsigned thread_index = threadIdx.y*BLOCK_DIM_X + threadIdx.x;
 
     if (iy < iy_end && ix < (nx - 1)) {
-        const real new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] +
+        // Update grid point
+        const float new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] +
                                      a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]);
         a_new[iy * nx + ix] = new_val;
-        if (calculate_norm) {
-            real residue = new_val - a[iy * nx + ix];
-            local_l2_norm += residue * residue;
-        }
+        float residue = new_val - a[iy * nx + ix];
+        // Set block-level L2 norm value for this grid point
+        block_l2_sum[thread_index] = residue * residue;
+    }
+    else {
+        block_l2_sum[thread_index] = 0;
     }
-    if (calculate_norm) {
-#ifdef HAVE_CUB
-        real block_l2_norm = BlockReduce(temp_storage).Sum(local_l2_norm);
-        if (0 == threadIdx.y && 0 == threadIdx.x) atomicAdd(l2_norm, block_l2_norm);
-#else
-        atomicAdd(l2_norm, local_l2_norm);
-#endif  // HAVE_CUB
+    // Reduce L2 norm for the block in parallel
+    for (unsigned stride = 1; stride < BLOCK_DIM_X*BLOCK_DIM_Y; stride *= 2) {
+        __syncthreads();
+        if ((thread_index) % (2*stride) == 0) {
+            block_l2_sum[thread_index] += block_l2_sum[thread_index + stride];
+        }
+    }
+    // Atomically update global L2 norm with block-reduced L2 norm
+    if (thread_index == 0) {
+        atomicAdd(l2_norm, block_l2_sum[0]);
     }
 }
 
-double single_gpu(const int nx, const int ny, const int iter_max, real* const a_ref_h,
-                  const int nccheck, const bool print);
-
-template <typename T>
-T get_argval(char** begin, char** end, const std::string& arg, const T default_val) {
-    T argval = default_val;
+int get_argval(char** begin, char** end, const std::string& arg, const int default_val) {
+    int argval = default_val;
     char** itr = std::find(begin, end, arg);
     if (itr != end && ++itr != end) {
         std::istringstream inbuf(*itr);
@@ -146,18 +114,18 @@ bool get_arg(char** begin, char** end, const std::string& arg) {
     return false;
 }
 
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h);
+
 int main(int argc, char* argv[]) {
-    const int iter_max = get_argval<int>(argv, argv + argc, "-niter", 1000);
-    const int nccheck = get_argval<int>(argv, argv + argc, "-nccheck", 1);
-    const int nx = get_argval<int>(argv, argv + argc, "-nx", 16384);
-    const int ny = get_argval<int>(argv, argv + argc, "-ny", 16384);
-    const bool csv = get_arg(argv, argv + argc, "-csv");
-    const bool nop2p = get_arg(argv, argv + argc, "-nop2p");
-
-    real* a[MAX_NUM_DEVICES];
-    real* a_new[MAX_NUM_DEVICES];
-    real* a_ref_h;
-    real* a_h;
+    const int iter_max = get_argval(argv, argv + argc, "-niter", 1000);
+    const int nx = get_argval(argv, argv + argc, "-nx", 16384);
+    const int ny = get_argval(argv, argv + argc, "-ny", 16384);
+    const bool p2p = get_arg(argv, argv + argc, "-p2p");
+
+    float* a[MAX_NUM_DEVICES];
+    float* a_new[MAX_NUM_DEVICES];
+    float* a_ref_h;
+    float* a_h;
     double runtime_serial = 0.0;
 
     cudaStream_t compute_stream[MAX_NUM_DEVICES];
@@ -167,8 +135,8 @@ int main(int argc, char* argv[]) {
     cudaEvent_t push_top_done[2][MAX_NUM_DEVICES];
     cudaEvent_t push_bottom_done[2][MAX_NUM_DEVICES];
 
-    real* l2_norm_d[MAX_NUM_DEVICES];
-    real* l2_norm_h[MAX_NUM_DEVICES];
+    float* l2_norm_d[MAX_NUM_DEVICES];
+    float* l2_norm_h[MAX_NUM_DEVICES];
 
     int iy_start[MAX_NUM_DEVICES];
     int iy_end[MAX_NUM_DEVICES];
@@ -182,9 +150,9 @@ int main(int argc, char* argv[]) {
         CUDA_RT_CALL(cudaFree(0));
 
         if (0 == dev_id) {
-            CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(real)));
-            CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(real)));
-            runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h, nccheck, !csv);
+            CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(float)));
+            CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(float)));
+            runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h);
         }
 
         // ny - 2 rows are distributed amongst `size` ranks in such a way
@@ -192,21 +160,22 @@ int main(int argc, char* argv[]) {
         // This optimizes load balancing when (ny - 2) % size != 0
         int chunk_size_low = (ny - 2) / num_devices;
         int chunk_size_high = chunk_size_low + 1;
+
         // To calculate the number of ranks that need to compute an extra row,
         // the following formula is derived from this equation:
-        // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = ny - 2
-        int num_ranks_low = num_devices * chunk_size_low + num_devices -
-                            (ny - 2);  // Number of ranks with chunk_size = chunk_size_low
-        if (dev_id < num_ranks_low)
+        // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = (ny - 2)
+        int num_ranks_low = num_devices * chunk_size_low + num_devices - (ny - 2);  // Number of ranks with chunk_size = chunk_size_low
+
+        if (dev_id < num_ranks_low)
             chunk_size[dev_id] = chunk_size_low;
         else
             chunk_size[dev_id] = chunk_size_high;
 
-        CUDA_RT_CALL(cudaMalloc(a + dev_id, nx * (chunk_size[dev_id] + 2) * sizeof(real)));
-        CUDA_RT_CALL(cudaMalloc(a_new + dev_id, nx * (chunk_size[dev_id] + 2) * sizeof(real)));
+        CUDA_RT_CALL(cudaMalloc(a + dev_id, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+        CUDA_RT_CALL(cudaMalloc(a_new + dev_id, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
 
-        CUDA_RT_CALL(cudaMemset(a[dev_id], 0, nx * (chunk_size[dev_id] + 2) * sizeof(real)));
-        CUDA_RT_CALL(cudaMemset(a_new[dev_id], 0, nx * (chunk_size[dev_id] + 2) * sizeof(real)));
+        CUDA_RT_CALL(cudaMemset(a[dev_id], 0, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+        CUDA_RT_CALL(cudaMemset(a_new[dev_id], 0, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
 
         // Calculate local domain boundaries
         int iy_start_global;  // My start index in the global array
@@ -229,18 +198,18 @@ int main(int argc, char* argv[]) {
         CUDA_RT_CALL(cudaStreamCreate(compute_stream + dev_id));
         CUDA_RT_CALL(cudaStreamCreate(push_top_stream + dev_id));
         CUDA_RT_CALL(cudaStreamCreate(push_bottom_stream + dev_id));
-        CUDA_RT_CALL(cudaEventCreateWithFlags(compute_done + dev_id, cudaEventDisableTiming));
-        CUDA_RT_CALL(cudaEventCreateWithFlags(push_top_done[0] + dev_id, cudaEventDisableTiming));
+        CUDA_RT_CALL(cudaEventCreate(compute_done + dev_id));
+        CUDA_RT_CALL(cudaEventCreate(push_top_done[0] + dev_id));
         CUDA_RT_CALL(
-            cudaEventCreateWithFlags(push_bottom_done[0] + dev_id, cudaEventDisableTiming));
-        CUDA_RT_CALL(cudaEventCreateWithFlags(push_top_done[1] + dev_id, cudaEventDisableTiming));
+            cudaEventCreate(push_bottom_done[0] + dev_id));
+        CUDA_RT_CALL(cudaEventCreate(push_top_done[1] + dev_id));
         CUDA_RT_CALL(
-            cudaEventCreateWithFlags(push_bottom_done[1] + dev_id, cudaEventDisableTiming));
+            cudaEventCreate(push_bottom_done[1] + dev_id));
 
-        CUDA_RT_CALL(cudaMalloc(l2_norm_d + dev_id, sizeof(real)));
-        CUDA_RT_CALL(cudaMallocHost(l2_norm_h + dev_id, sizeof(real)));
+        CUDA_RT_CALL(cudaMalloc(l2_norm_d + dev_id, sizeof(float)));
+        CUDA_RT_CALL(cudaMallocHost(l2_norm_h + dev_id, sizeof(float)));
 
-        if (!nop2p) {
+        if (p2p == true) {
             const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
             int canAccessPeer = 0;
             CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, dev_id, top));
@@ -264,31 +233,25 @@ int main(int argc, char* argv[]) {
         const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
         const int bottom = (dev_id + 1) % num_devices;
         CUDA_RT_CALL(cudaMemcpy(a_new[top] + (iy_end[top] * nx),
-    				 a_new[dev_id] + iy_start[dev_id] * nx, nx * sizeof(real),
-    				 cudaMemcpyDeviceToDevice));
+                     a_new[dev_id] + iy_start[dev_id] * nx, nx * sizeof(float),
+                     cudaMemcpyDeviceToDevice));
         CUDA_RT_CALL(cudaMemcpy(a_new[bottom], a_new[dev_id] + (iy_end[dev_id] - 1) * nx,
-    				 nx * sizeof(real), cudaMemcpyDeviceToDevice));
+                     nx * sizeof(float), cudaMemcpyDeviceToDevice));
     }
 
     for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
         CUDA_RT_CALL(cudaSetDevice(dev_id));
-	CUDA_RT_CALL(cudaDeviceSynchronize());
+        CUDA_RT_CALL(cudaDeviceSynchronize());
     }
 
-    if (!csv)
-        printf(
-            "Jacobi relaxation: %d iterations on %d x %d mesh with norm check "
-            "every %d iterations\n",
-            iter_max, ny, nx, nccheck);
+    printf("Jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
 
-    constexpr int dim_block_x = 32;
-    constexpr int dim_block_y = 32;
+    dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
     int iter = 0;
-    bool calculate_norm;
-    real l2_norm = 1.0;
+    float l2_norm = 1.0;
 
     double start = omp_get_wtime();
-    PUSH_RANGE("Jacobi solve", 0)
+    nvtxRangePush("Jacobi solve");
     while (l2_norm > tol && iter < iter_max) {
         for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
             const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
@@ -296,59 +259,59 @@ int main(int argc, char* argv[]) {
             CUDA_RT_CALL(cudaSetDevice(dev_id));
 
             CUDA_RT_CALL(
-                cudaMemsetAsync(l2_norm_d[dev_id], 0, sizeof(real), compute_stream[dev_id]));
+                cudaMemsetAsync(l2_norm_d[dev_id], 0, sizeof(float), compute_stream[dev_id]));
 
             CUDA_RT_CALL(
                 cudaStreamWaitEvent(compute_stream[dev_id], push_top_done[(iter % 2)][bottom], 0));
             CUDA_RT_CALL(
                 cudaStreamWaitEvent(compute_stream[dev_id], push_bottom_done[(iter % 2)][top], 0));
 
-            calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0);
-            dim3 dim_grid((nx + dim_block_x - 1) / dim_block_x,
-                          (chunk_size[dev_id] + dim_block_y - 1) / dim_block_y, 1);
+            dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X,
+                          (chunk_size[dev_id] + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
 
-            jacobi_kernel<dim_block_x, dim_block_y>
-                <<<dim_grid, {dim_block_x, dim_block_y, 1}, 0, compute_stream[dev_id]>>>(
+            jacobi_kernel<<<dim_grid, dim_block, 0, compute_stream[dev_id]>>>(
                     a_new[dev_id], a[dev_id], l2_norm_d[dev_id], iy_start[dev_id], iy_end[dev_id],
-                    nx, calculate_norm);
-            CUDA_RT_CALL(cudaGetLastError());
+                    nx);
+
             CUDA_RT_CALL(cudaEventRecord(compute_done[dev_id], compute_stream[dev_id]));
 
-            if (calculate_norm) {
-                CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h[dev_id], l2_norm_d[dev_id], sizeof(real),
-                                             cudaMemcpyDeviceToHost, compute_stream[dev_id]));
-            }
+            CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h[dev_id], l2_norm_d[dev_id], sizeof(float),
+                                         cudaMemcpyDeviceToHost, compute_stream[dev_id]));
+        }
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+            const int bottom = (dev_id + 1) % num_devices;
+            CUDA_RT_CALL(cudaSetDevice(dev_id));
 
             // Apply periodic boundary conditions
             CUDA_RT_CALL(cudaStreamWaitEvent(push_top_stream[dev_id], compute_done[dev_id], 0));
             CUDA_RT_CALL(cudaMemcpyAsync(a_new[top] + (iy_end[top] * nx),
-                                         a_new[dev_id] + iy_start[dev_id] * nx, nx * sizeof(real),
+                                         a_new[dev_id] + iy_start[dev_id] * nx, nx * sizeof(float),
                                          cudaMemcpyDeviceToDevice, push_top_stream[dev_id]));
             CUDA_RT_CALL(
                 cudaEventRecord(push_top_done[((iter + 1) % 2)][dev_id], push_top_stream[dev_id]));
 
             CUDA_RT_CALL(cudaStreamWaitEvent(push_bottom_stream[dev_id], compute_done[dev_id], 0));
             CUDA_RT_CALL(cudaMemcpyAsync(a_new[bottom], a_new[dev_id] + (iy_end[dev_id] - 1) * nx,
-                                         nx * sizeof(real), cudaMemcpyDeviceToDevice,
+                                         nx * sizeof(float), cudaMemcpyDeviceToDevice,
                                          push_bottom_stream[dev_id]));
             CUDA_RT_CALL(cudaEventRecord(push_bottom_done[((iter + 1) % 2)][dev_id],
                                          push_bottom_stream[dev_id]));
         }
-        if (calculate_norm) {
-            l2_norm = 0.0;
-            for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
-                CUDA_RT_CALL(cudaStreamSynchronize(compute_stream[dev_id]));
-                l2_norm += *(l2_norm_h[dev_id]);
-            }
-
-            l2_norm = std::sqrt(l2_norm);
-            if (!csv && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm);
+        l2_norm = 0.0;
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            CUDA_RT_CALL(cudaStreamSynchronize(compute_stream[dev_id]));
+            l2_norm += *(l2_norm_h[dev_id]);
         }
 
+        l2_norm = std::sqrt(l2_norm);
+
+        iter++;
+        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm);
+
         for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
             std::swap(a_new[dev_id], a[dev_id]);
         }
-        iter++;
     }
 
     for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
@@ -356,14 +319,14 @@ int main(int argc, char* argv[]) {
         CUDA_RT_CALL(cudaDeviceSynchronize());
     }
 
-    POP_RANGE
+    nvtxRangePop();
     double stop = omp_get_wtime();
 
     int offset = nx;
     for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
         CUDA_RT_CALL(
             cudaMemcpy(a_h + offset, a[dev_id] + nx,
-                       std::min((nx * ny) - offset, nx * chunk_size[dev_id]) * sizeof(real),
+                       std::min((nx * ny) - offset, nx * chunk_size[dev_id]) * sizeof(float),
                        cudaMemcpyDeviceToHost));
         offset += std::min(chunk_size[dev_id] * nx, (nx * ny) - offset);
     }
@@ -382,18 +345,13 @@ int main(int argc, char* argv[]) {
     }
 
     if (result_correct) {
-        if (csv) {
-            printf("single_threaded_copy, %d, %d, %d, %d, %d, %d, %f, %f\n", nx, ny, iter_max,
-                   nccheck, num_devices, nop2p ? 0 : 1, (stop - start), runtime_serial);
-        } else {
-            printf("Num GPUs: %d.\n", num_devices);
-            printf(
-                "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, "
-                "efficiency: %8.2f \n",
-                ny, nx, runtime_serial, num_devices, (stop - start),
-                runtime_serial / (stop - start),
-                runtime_serial / (num_devices * (stop - start)) * 100);
-        }
+        printf("Num GPUs: %d.\n", num_devices);
+        printf(
+            "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, "
+            "efficiency: %8.2f \n",
+            ny, nx, runtime_serial, num_devices, (stop - start),
+            runtime_serial / (stop - start),
+            runtime_serial / (num_devices * (stop - start)) * 100);
     }
 
     for (int dev_id = (num_devices - 1); dev_id >= 0; --dev_id) {
@@ -421,85 +379,71 @@ int main(int argc, char* argv[]) {
     return result_correct ? 0 : 1;
 }
 
-double single_gpu(const int nx, const int ny, const int iter_max, real* const a_ref_h,
-                  const int nccheck, const bool print) {
-    real* a;
-    real* a_new;
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h) {
+    float* a;
+    float* a_new;
 
-    real* l2_norm_d;
-    real* l2_norm_h;
+    float* l2_norm_d;
+    float* l2_norm_h;
 
     int iy_start = 1;
     int iy_end = (ny - 1);
 
-    CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(real)));
-    CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(real)));
+    CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(float)));
 
-    CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(real)));
-    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(real)));
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(float)));
 
     // Set Dirichlet boundary conditions on left and right border
+    nvtxRangePush("Init boundaries");
     initialize_boundaries<<<ny / 128 + 1, 128>>>(a, a_new, PI, 0, nx, ny, ny);
     CUDA_RT_CALL(cudaGetLastError());
     CUDA_RT_CALL(cudaDeviceSynchronize());
+    nvtxRangePop();
 
-    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(real)));
-    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real)));
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
 
     CUDA_RT_CALL(cudaDeviceSynchronize());
 
-    if (print)
-        printf(
-            "Single GPU jacobi relaxation: %d iterations on %d x %d mesh with "
-            "norm "
-            "check every %d iterations\n",
-            iter_max, ny, nx, nccheck);
+    printf("Single GPU jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
 
-    constexpr int dim_block_x = 32;
-    constexpr int dim_block_y = 32;
-    dim3 dim_grid((nx + dim_block_x - 1) / dim_block_x, (ny + dim_block_y - 1) / dim_block_y, 1);
+    dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X, (ny + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
+    dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
 
     int iter = 0;
-    bool calculate_norm;
-    real l2_norm = 1.0;
+    float l2_norm = 1.0;
 
     double start = omp_get_wtime();
-    PUSH_RANGE("Jacobi solve", 0)
+    nvtxRangePush("Jacobi Solve");
     while (l2_norm > tol && iter < iter_max) {
-        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(real)));
+        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(float)));
 
-        calculate_norm = (iter % nccheck) == 0 || (print && ((iter % 100) == 0));
-        jacobi_kernel<dim_block_x, dim_block_y>
-            <<<dim_grid, {dim_block_x, dim_block_y, 1}, 0, 0>>>(
-                a_new, a, l2_norm_d, iy_start, iy_end, nx, calculate_norm);
+        // Compute grid points for this iteration
+        jacobi_kernel<<<dim_grid, dim_block>>>(a_new, a, l2_norm_d, iy_start, iy_end, nx);
         CUDA_RT_CALL(cudaGetLastError());
-
-        if (calculate_norm) {
-            CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost));
-        }
+        CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));
 
         // Apply periodic boundary conditions
-
-        CUDA_RT_CALL(cudaMemcpy(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(real),
+        CUDA_RT_CALL(cudaMemcpy(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(float),
                                      cudaMemcpyDeviceToDevice));
-        CUDA_RT_CALL(cudaMemcpy(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(real),
+        CUDA_RT_CALL(cudaMemcpy(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(float),
                                      cudaMemcpyDeviceToDevice));
 
-        if (calculate_norm) {
-	    CUDA_RT_CALL(cudaDeviceSynchronize());
-            //CUDA_RT_CALL(cudaStreamSynchronize(compute_stream));
-            l2_norm = *l2_norm_h;
-            l2_norm = std::sqrt(l2_norm);
-            if (print && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm);
-        }
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+        l2_norm = *l2_norm_h;
+        l2_norm = std::sqrt(l2_norm);
 
-        std::swap(a_new, a);
         iter++;
+        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm);
+
+        std::swap(a_new, a);
     }
-    POP_RANGE
+    nvtxRangePop();
     double stop = omp_get_wtime();
 
-    CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(real), cudaMemcpyDeviceToHost));
+    CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(float), cudaMemcpyDeviceToHost));
 
     CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
     CUDA_RT_CALL(cudaFree(l2_norm_d));
@@ -508,3 +452,4 @@ double single_gpu(const int nx, const int ny, const int iter_max, real* const a_
     CUDA_RT_CALL(cudaFree(a));
     return (stop - start);
 }
+
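A quick numeric check of the load-balancing formula used in main() above (a standalone sketch with illustrative sizes): with ny = 12 there are 10 interior rows; across 4 GPUs, chunk_size_low = 2 and num_ranks_low = 4*2 + 4 - 10 = 2, so devices 0 and 1 compute 2 rows each while devices 2 and 3 compute 3 rows each, covering all 10 rows.

#include <cstdio>

int main() {
    const int ny = 12, num_devices = 4;                  // 10 interior rows over 4 GPUs
    const int chunk_size_low  = (ny - 2) / num_devices;  // 2
    const int chunk_size_high = chunk_size_low + 1;      // 3
    // Number of ranks with the smaller chunk, from:
    // num_ranks_low*low + (num_devices - num_ranks_low)*(low + 1) = ny - 2
    const int num_ranks_low = num_devices * chunk_size_low + num_devices - (ny - 2);
    int total = 0;
    for (int dev = 0; dev < num_devices; ++dev)
        total += (dev < num_ranks_low) ? chunk_size_low : chunk_size_high;
    printf("num_ranks_low = %d, total rows = %d (expect %d)\n",
           num_ranks_low, total, ny - 2);
    return 0;
}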

BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/memcpy/jacobi


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/memcpy/jacobi_memcpy


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/memcpy/jacobi_memcpy_nvlink_report.qdrep


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/memcpy/jacobi_memcpy_nvlink_report.sqlite


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/memcpy/jacobi_memcpy_sys_report.qdrep


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/memcpy/jacobi_memcpy_sys_report.sqlite


+ 0 - 13
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/memcpy/mgpm

@@ -1,13 +0,0 @@
-#!/bin/bash
-#SBATCH --nodes=2
-
-NPROCS=16
-NPPERSOC=$(($NPROCS>>2))
-source ~/init.sh
-make clean && make
-rm -rf hpctoolkit*
-hpcrun -e CPUTIME -e IO -e gpu=nvidia -t ./jacobi
-hpcstruct --gpucfg yes hpctoolkit*
-hpcstruct jacobi
-hpcprof -S jacobi.hpcstruct -I jacobi.cu -I $CUDA_HOME/+ hpctoolkit*
-echo "===DONE==="

+ 16 - 28
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/Makefile

@@ -1,42 +1,30 @@
 # Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
-NP ?= 1
 NVCC=nvcc
 MPICXX=mpicxx
-MPIRUN ?= mpirun
-CUDA_HOME ?= /opt/nvidia/hpc_sdk/Linux_x86_64/21.3/cuda/11.2/
-GENCODE_SM30	:= -gencode arch=compute_30,code=sm_30
-GENCODE_SM35	:= -gencode arch=compute_35,code=sm_35
-GENCODE_SM37	:= -gencode arch=compute_37,code=sm_37
-GENCODE_SM50	:= -gencode arch=compute_50,code=sm_50
-GENCODE_SM52	:= -gencode arch=compute_52,code=sm_52
-GENCODE_SM60    := -gencode arch=compute_60,code=sm_60
+#CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/21.5/cuda/
 GENCODE_SM70    := -gencode arch=compute_70,code=sm_70
-GENCODE_SM80    := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80
+GENCODE_SM80    := -gencode arch=compute_80,code=sm_80 
 GENCODE_FLAGS	:= $(GENCODE_SM70) $(GENCODE_SM80)
-ifdef DISABLE_CUB
-        NVCC_FLAGS = -Xptxas --optimize-float-atomics
-else
-        NVCC_FLAGS = -DHAVE_CUB
-endif
+
 NVCC_FLAGS += -lineinfo $(GENCODE_FLAGS) -std=c++14
-MPICXX_FLAGS = -DUSE_NVTX -I$(CUDA_HOME)/include -std=c++14
-MPICXX_FLAGS = -DUSE_NVTX -g -I$(CUDA_HOME)/include -std=c++14
+MPICXX_FLAGS = -g -I$(CUDA_HOME)/include  -fopenmp -std=c++14
 LD_FLAGS = -L$(CUDA_HOME)/lib64 -lcudart -lnvToolsExt
-jacobi: Makefile jacobi.cpp jacobi_kernels.o
-	$(MPICXX) $(MPICXX_FLAGS) jacobi.cpp jacobi_kernels.o $(LD_FLAGS) -o jacobi
+
+hello_world: Makefile hello_world.c
+	$(MPICXX) $(MPICXX_FLAGS) hello_world.c $(LD_FLAGS) -o hello_world
+
+jacobi_memcpy_mpi: Makefile jacobi_memcpy_mpi.cpp jacobi_kernels.o
+	$(MPICXX) $(MPICXX_FLAGS) jacobi_memcpy_mpi.cpp jacobi_kernels.o $(LD_FLAGS) -o jacobi_memcpy_mpi
+
+jacobi_cuda_aware_mpi: Makefile jacobi_cuda_aware_mpi.cpp jacobi_kernels.o
+	$(MPICXX) $(MPICXX_FLAGS) jacobi_cuda_aware_mpi.cpp jacobi_kernels.o $(LD_FLAGS) -o jacobi_cuda_aware_mpi
 
 jacobi_kernels.o: Makefile jacobi_kernels.cu
 	$(NVCC) $(NVCC_FLAGS) jacobi_kernels.cu -c
 
+all: hello_world jacobi_memcpy_mpi jacobi_cuda_aware_mpi
+
 .PHONY: clean
 clean:
-	rm -f jacobi jacobi_kernels.o *.qdrep jacobi.*.compute-sanitizer.log
-
-sanitize: jacobi
-	$(MPIRUN) -np $(NP) compute-sanitizer --log-file jacobi.%q{OMPI_COMM_WORLD_RANK}.compute-sanitizer.log ./jacobi -niter 10
-
-run: jacobi
-	$(MPIRUN) -np $(NP) ./jacobi
+	rm -rf hello_world jacobi_memcpy_mpi jacobi_cuda_aware_mpi *.o *.qdrep *.sqlite
 
-profile: jacobi
-	$(MPIRUN) -np $(NP) nsys profile --trace=mpi,cuda,nvtx -o jacobi.%q{OMPI_COMM_WORLD_RANK} ./jacobi -niter 10
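The Makefile splits compilation: nvcc builds the CUDA kernels into jacobi_kernels.o, mpicxx builds the MPI host sources, and the link step pulls in libcudart and libnvToolsExt. A minimal sketch of that two-compiler pattern, using hypothetical files kernels.cu and main.cpp that are not part of the lab sources:

// kernels.cu -- compiled by nvcc into an object file (nvcc -c kernels.cu)
__global__ void scale(float* x, float s, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] *= s;
}
// Plain C++ wrapper so the MPI translation unit never sees <<<...>>> syntax
void launch_scale(float* x, float s, int n) {
    scale<<<(n + 255) / 256, 256>>>(x, s, n);
}

// main.cpp -- compiled by mpicxx and linked with -lcudart
#include <mpi.h>
#include <cuda_runtime.h>
void launch_scale(float* x, float s, int n);  // defined in kernels.cu
int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);
    float* d_x;
    cudaMalloc(&d_x, 256 * sizeof(float));
    cudaMemset(d_x, 0, 256 * sizeof(float));
    launch_scale(d_x, 2.0f, 256);
    cudaDeviceSynchronize();
    cudaFree(d_x);
    MPI_Finalize();
    return 0;
}

Because both objects are compiled as C++, the wrapper's mangled name matches across the two compilers; jacobi_kernels.cu exposes launch_initialize_boundaries and launch_jacobi_kernel the same way.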

+ 24 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/containerization/Makefile

@@ -0,0 +1,24 @@
+# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+NVCC=nvcc
+MPICXX=mpicxx
+#CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/21.5/cuda/
+GENCODE_SM70    := -gencode arch=compute_70,code=sm_70
+GENCODE_SM80    := -gencode arch=compute_80,code=sm_80 
+GENCODE_FLAGS	:= $(GENCODE_SM70) $(GENCODE_SM80)
+
+NVCC_FLAGS += -lineinfo $(GENCODE_FLAGS) -std=c++14
+MPICXX_FLAGS = -g -I$(CUDA_HOME)/include  -fopenmp -std=c++14
+LD_FLAGS = -L$(CUDA_HOME)/lib64 -lcudart -lnvToolsExt
+
+jacobi_cuda_aware_mpi: Makefile jacobi_cuda_aware_mpi.cpp jacobi_kernels.o
+	$(MPICXX) $(MPICXX_FLAGS) jacobi_cuda_aware_mpi.cpp jacobi_kernels.o $(LD_FLAGS) -o jacobi_cuda_aware_mpi
+
+jacobi_kernels.o: Makefile jacobi_kernels.cu
+	$(NVCC) $(NVCC_FLAGS) jacobi_kernels.cu -c
+
+all: jacobi_cuda_aware_mpi
+
+.PHONY: clean
+clean:
+	rm -rf jacobi_cuda_aware_mpi *.o *.qdrep *.sqlite
+

+ 358 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/containerization/jacobi_cuda_aware_mpi.cpp

@@ -0,0 +1,358 @@
+/* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <iostream>
+#include <sstream>
+#include <cuda_runtime.h>
+#include <nvToolsExt.h>
+#include <mpi.h>
+#include <omp.h>
+
+#define MPI_CALL(call)                                                                \
+    {                                                                                 \
+        int mpi_status = call;                                                        \
+        if (0 != mpi_status) {                                                        \
+            char mpi_error_string[MPI_MAX_ERROR_STRING];                              \
+            int mpi_error_string_length = 0;                                          \
+            MPI_Error_string(mpi_status, mpi_error_string, &mpi_error_string_length); \
+            if (NULL != mpi_error_string)                                             \
+                fprintf(stderr,                                                       \
+                        "ERROR: MPI call \"%s\" in line %d of file %s failed "        \
+                        "with %s "                                                    \
+                        "(%d).\n",                                                    \
+                        #call, __LINE__, __FILE__, mpi_error_string, mpi_status);     \
+            else                                                                      \
+                fprintf(stderr,                                                       \
+                        "ERROR: MPI call \"%s\" in line %d of file %s failed "        \
+                        "with %d.\n",                                                 \
+                        #call, __LINE__, __FILE__, mpi_status);                       \
+        }                                                                             \
+    }
+
+#define CUDA_RT_CALL(call)                                                                  \
+    {                                                                                       \
+        cudaError_t cudaStatus = call;                                                      \
+        if (cudaSuccess != cudaStatus)                                                      \
+            fprintf(stderr,                                                                 \
+                    "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
+                    "with "                                                                 \
+                    "%s (%d).\n",                                                           \
+                    #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
+    }
+
+constexpr float tol = 1.0e-8;
+
+const float PI = 2.0 * std::asin(1.0);
+
+void launch_initialize_boundaries(float* __restrict__ const a_new, float* __restrict__ const a,
+                                  const float pi, const int offset, const int nx, const int my_ny,
+                                  const int ny);
+
+void launch_jacobi_kernel(float* __restrict__ const a_new, const float* __restrict__ const a,
+                          float* __restrict__ const l2_norm, const int iy_start, const int iy_end,
+                          const int nx);
+
+double single_gpu(const int nx, const int ny, const int iter_max, 
+                    float* const a_ref_h, bool print);
+
+int get_argval(char** begin, char** end, const std::string& arg, const int default_val) {
+    int argval = default_val;
+    char** itr = std::find(begin, end, arg);
+    if (itr != end && ++itr != end) {
+        std::istringstream inbuf(*itr);
+        inbuf >> argval;
+    }
+    return argval;
+}
+
+bool get_arg(char** begin, char** end, const std::string& arg) {
+    char** itr = std::find(begin, end, arg);
+    if (itr != end) {
+        return true;
+    }
+    return false;
+}
+
+int main(int argc, char* argv[]) {
+    MPI_CALL(MPI_Init(&argc, &argv));
+    int rank;
+    MPI_CALL(MPI_Comm_rank(MPI_COMM_WORLD, &rank));
+    int size;
+    MPI_CALL(MPI_Comm_size(MPI_COMM_WORLD, &size));
+    int num_devices = 0;
+    cudaGetDeviceCount(&num_devices);
+    
+    const int iter_max = get_argval(argv, argv + argc, "-niter", 1000);
+    const int nx = get_argval(argv, argv + argc, "-nx", 16384);
+    const int ny = get_argval(argv, argv + argc, "-ny", 16384);
+    const bool skip_single_gpu = get_arg(argv, argv + argc, "-skip_single_gpu");
+
+    int local_rank = -1;
+    MPI_Comm local_comm;
+    MPI_CALL(MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL,
+                                    &local_comm));
+
+    MPI_CALL(MPI_Comm_rank(local_comm, &local_rank));
+
+    MPI_CALL(MPI_Comm_free(&local_comm));
+
+    CUDA_RT_CALL(cudaSetDevice(local_rank % num_devices));
+    CUDA_RT_CALL(cudaFree(0));
+
+    float* a_ref_h;
+    CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(float)));
+    float* a_h;
+    CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(float)));
+    
+    double runtime_serial = 1;
+    if (!skip_single_gpu){
+        runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h, rank == 0);
+    }
+
+    // ny - 2 rows are distributed amongst `size` ranks in such a way
+    // that each rank gets either (ny - 2) / size or (ny - 2) / size + 1 rows.
+    // This optimizes load balancing when (ny - 2) % size != 0
+    int chunk_size;
+    int chunk_size_low = (ny - 2) / size;
+    int chunk_size_high = chunk_size_low + 1;
+    // To calculate the number of ranks that need to compute an extra row,
+    // the following formula is derived from this equation:
+    // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = ny - 2
+    int num_ranks_low = size * chunk_size_low + size -
+                        (ny - 2);  // Number of ranks with chunk_size = chunk_size_low
+    if (rank < num_ranks_low)
+        chunk_size = chunk_size_low;
+    else
+        chunk_size = chunk_size_high;
+
+    float* a;
+    CUDA_RT_CALL(cudaMalloc(&a, nx * (chunk_size + 2) * sizeof(float)));
+    float* a_new;
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * (chunk_size + 2) * sizeof(float)));
+
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * (chunk_size + 2) * sizeof(float)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * (chunk_size + 2) * sizeof(float)));
+
+    // Calculate local domain boundaries
+    int iy_start_global;  // My start index in the global array
+    if (rank < num_ranks_low) {
+        iy_start_global = rank * chunk_size_low + 1;
+    } else {
+        iy_start_global =
+            num_ranks_low * chunk_size_low + (rank - num_ranks_low) * chunk_size_high + 1;
+    }
+    int iy_end_global = iy_start_global + chunk_size - 1;  // My last index in the global array
+
+    int iy_start = 1;
+    int iy_end = iy_start + chunk_size;
+
+    // Set Dirichlet boundary conditions on left and right border
+    launch_initialize_boundaries(a, a_new, PI, iy_start_global - 1, nx, (chunk_size + 2), ny);
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    float* l2_norm_d;
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
+    float* l2_norm_h;
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    if (0 == rank) {
+        printf("Jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+    }
+
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    MPI_CALL(MPI_Barrier(MPI_COMM_WORLD));
+    double start = MPI_Wtime();
+    nvtxRangePush("Jacobi Solve Multi-GPU");
+    while (l2_norm > tol && iter < iter_max) {
+        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(float)));
+
+        launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, iy_end, nx);
+
+        CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));
+
+        const int top = rank > 0 ? rank - 1 : (size - 1);
+        const int bottom = (rank + 1) % size;
+
+        // Apply periodic boundary conditions
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+
+        nvtxRangePush("Halo exchange CUDA-aware MPI");
+        // TODO: Part 2- 
+        MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_FLOAT, top, 0,
+                              a_new + (iy_end * nx), nx, MPI_FLOAT, bottom, 0, MPI_COMM_WORLD,
+                              MPI_STATUS_IGNORE));
+        nvtxRangePop(); 
+
+        nvtxRangePush("Halo exchange CUDA-aware MPI");
+        // TODO: Part 2- 
+        MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_FLOAT, bottom, 0, a_new, nx,
+                              MPI_FLOAT, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
+        nvtxRangePop(); 
+
+        // TODO: Part 2- 
+        MPI_CALL(MPI_Allreduce(l2_norm_h, &l2_norm, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD));
+        l2_norm = std::sqrt(l2_norm);
+        
+        iter++;
+        if (0 == rank && (iter % 100) == 0) {
+            printf("%5d, %0.6f\n", iter, l2_norm);
+        }
+
+        std::swap(a_new, a);
+    }
+    double stop = MPI_Wtime();
+    nvtxRangePop();
+
+    CUDA_RT_CALL(cudaMemcpy(a_h + iy_start_global * nx, a + nx,
+                            std::min((ny - iy_start_global) * nx, chunk_size * nx) * sizeof(float),
+                            cudaMemcpyDeviceToHost));
+
+    int result_correct = 1;
+    if (!skip_single_gpu) {
+        for (int iy = iy_start_global; result_correct && (iy < iy_end_global); ++iy) {
+            for (int ix = 1; result_correct && (ix < (nx - 1)); ++ix) {
+                if (std::fabs(a_ref_h[iy * nx + ix] - a_h[iy * nx + ix]) > tol) {
+                    fprintf(stderr,
+                            "ERROR on rank %d: a[%d * %d + %d] = %f does not match %f "
+                            "(reference)\n",
+                            rank, iy, nx, ix, a_h[iy * nx + ix], a_ref_h[iy * nx + ix]);
+                    result_correct = 0;
+                }
+            }
+        }
+        int global_result_correct = 1;
+        MPI_CALL(MPI_Allreduce(&result_correct, &global_result_correct, 1, MPI_INT, MPI_MIN,
+                            MPI_COMM_WORLD));
+        result_correct = global_result_correct;
+    }
+
+    if (rank == 0 && result_correct) {
+        printf("Num GPUs: %d.\n", size);
+        if (!skip_single_gpu) {
+            printf(
+                "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, "
+                "efficiency: %8.2f \n",
+                nx, ny, runtime_serial, size, (stop - start), runtime_serial / (stop - start),
+                runtime_serial / (size * (stop - start)) * 100);
+        }
+        else {
+            printf("%dx%d: %d GPUs: %8.4f s \n", nx, ny, size, (stop - start)); 
+        }
+    }
+
+    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
+    CUDA_RT_CALL(cudaFree(l2_norm_d));
+
+    CUDA_RT_CALL(cudaFree(a_new));
+    CUDA_RT_CALL(cudaFree(a));
+
+    CUDA_RT_CALL(cudaFreeHost(a_h));
+    CUDA_RT_CALL(cudaFreeHost(a_ref_h));
+
+    MPI_CALL(MPI_Finalize());
+    return (result_correct == 1) ? 0 : 1;
+}
+
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h, bool print) {
+    float* a;
+    float* a_new;
+
+    float* l2_norm_d;
+    float* l2_norm_h;
+
+    int iy_start = 1;
+    int iy_end = (ny - 1);
+
+    CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(float)));
+
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(float)));
+
+    // Set Dirichlet boundary conditions on left and right border
+    nvtxRangePush("Init boundaries");
+    launch_initialize_boundaries(a, a_new, PI, 0, nx, ny, ny);
+    CUDA_RT_CALL(cudaGetLastError());
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+    nvtxRangePop();
+
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    if (print) {
+        printf("Single GPU jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+    }
+
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    double start = omp_get_wtime();
+    nvtxRangePush("Jacobi Solve Single GPU");
+    while (l2_norm > tol && iter < iter_max) {
+        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(float)));
+
+        // Compute grid points for this iteration
+        launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, iy_end, nx);
+        CUDA_RT_CALL(cudaGetLastError());
+        CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));
+
+        // Apply periodic boundary conditions
+        CUDA_RT_CALL(cudaMemcpy(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+        CUDA_RT_CALL(cudaMemcpy(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+        l2_norm = *l2_norm_h;
+        l2_norm = std::sqrt(l2_norm);
+
+        iter++;
+        if ((iter % 100) == 0 && print) {
+            printf("%5d, %0.6f\n", iter, l2_norm);
+        }
+        std::swap(a_new, a);
+    }
+    nvtxRangePop();
+    double stop = omp_get_wtime();
+
+    CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(float), cudaMemcpyDeviceToHost));
+
+    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
+    CUDA_RT_CALL(cudaFree(l2_norm_d));
+
+    CUDA_RT_CALL(cudaFree(a_new));
+    CUDA_RT_CALL(cudaFree(a));
+    return (stop - start);
+}
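In the solver above, MPI_Sendrecv is handed device pointers directly (a_new lives in GPU memory); a CUDA-aware MPI moves the halos without an explicit detour through host memory. For contrast, a hedged sketch of both variants as stand-alone helpers (the helper names and the host buffers h_send/h_recv are illustrative; the staged form is what a non-CUDA-aware build has to do and, as the name suggests, is the approach of the jacobi_memcpy_mpi variant):

#include <mpi.h>
#include <cuda_runtime.h>

// Exchange one halo row of width nx with the `top` and `bottom` ranks.
// d_send/d_recv are device pointers.
void halo_exchange_cuda_aware(float* d_send, float* d_recv, int nx,
                              int top, int bottom) {
    // CUDA-aware MPI: device pointers go directly into the MPI call.
    MPI_Sendrecv(d_send, nx, MPI_FLOAT, top, 0,
                 d_recv, nx, MPI_FLOAT, bottom, 0,
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}

// h_send/h_recv are host staging buffers (ideally pinned with cudaMallocHost).
void halo_exchange_host_staged(float* d_send, float* d_recv,
                               float* h_send, float* h_recv, int nx,
                               int top, int bottom) {
    // Stage through host memory when MPI is not CUDA-aware.
    cudaMemcpy(h_send, d_send, nx * sizeof(float), cudaMemcpyDeviceToHost);
    MPI_Sendrecv(h_send, nx, MPI_FLOAT, top, 0,
                 h_recv, nx, MPI_FLOAT, bottom, 0,
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    cudaMemcpy(d_recv, h_recv, nx * sizeof(float), cudaMemcpyHostToDevice);
}

The staged version adds two host/device copies of nx floats per halo per iteration, which is exactly the traffic the CUDA-aware path avoids.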

+ 97 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/containerization/jacobi_kernels.cu

@@ -0,0 +1,97 @@
+/* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <cstdio>
+
+#define BLOCK_DIM_X 32
+#define BLOCK_DIM_Y 32
+
+#define CUDA_RT_CALL(call)                                                                  \
+    {                                                                                       \
+        cudaError_t cudaStatus = call;                                                      \
+        if (cudaSuccess != cudaStatus)                                                      \
+            fprintf(stderr,                                                                 \
+                    "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
+                    "with "                                                                 \
+                    "%s (%d).\n",                                                           \
+                    #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
+    }
+
+__global__ void initialize_boundaries(float*  a_new, float*  a, const float pi, const int offset, 
+                    const int nx, const int my_ny, const int ny) {
+    for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) {
+        const float y0 = sin(2.0 * pi * (offset + iy) / (ny - 1));
+        a[iy * nx + 0] = y0;
+        a[iy * nx + (nx - 1)] = y0;
+        a_new[iy * nx + 0] = y0;
+        a_new[iy * nx + (nx - 1)] = y0;
+    }
+}
+
+__global__ void jacobi_kernel(float*  a_new, const float*  a, float*  l2_norm, const int iy_start,
+                              const int iy_end, const int nx) {
+    int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start;
+    int ix = blockIdx.x * blockDim.x + threadIdx.x + 1;
+    __shared__ float block_l2_sum[BLOCK_DIM_X*BLOCK_DIM_Y];
+    unsigned thread_index = threadIdx.y*BLOCK_DIM_X + threadIdx.x;
+
+    if (iy < iy_end && ix < (nx - 1)) {
+        // Update grid point
+        const float new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] +
+                                     a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]);
+        a_new[iy * nx + ix] = new_val;
+        float residue = new_val - a[iy * nx + ix];
+        // Set block-level L2 norm value for this grid point
+        block_l2_sum[thread_index] = residue * residue;
+    }
+    else {
+        block_l2_sum[thread_index] = 0;
+    }
+    // Reduce L2 norm for the block in parallel
+    for (unsigned stride = 1; stride < BLOCK_DIM_X*BLOCK_DIM_Y; stride *= 2) {
+        __syncthreads();
+        if ((thread_index) % (2*stride) == 0) {
+            block_l2_sum[thread_index] += block_l2_sum[thread_index + stride];
+        }
+    }
+    // Atomically update global L2 norm with block-reduced L2 norm
+    if (thread_index == 0) {
+        atomicAdd(l2_norm, block_l2_sum[0]);
+    }
+}
+
+void launch_initialize_boundaries(float*  a_new, float*  a, const float pi, const int offset, 
+                                    const int nx, const int my_ny, const int ny){
+    initialize_boundaries<<<my_ny / 128 + 1, 128>>>(a_new, a, pi, offset, nx, my_ny, ny);
+}
+
+void launch_jacobi_kernel(float*  a_new, const float*  a, float*  l2_norm, const int iy_start,
+                              const int iy_end, const int nx) {
+    dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+    dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X,
+                  ((iy_end - iy_start) + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
+    jacobi_kernel<<<dim_grid, dim_block>>>(a_new, a, l2_norm, iy_start, iy_end, nx);
+}
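The loop in jacobi_kernel reduces the per-thread residues by stride doubling: after the pass with a given stride, every element whose index is a multiple of 2*stride holds the sum of that many neighbours, so after log2(BLOCK_DIM_X*BLOCK_DIM_Y) passes block_l2_sum[0] holds the whole block's sum, which one thread then adds to the global norm with atomicAdd. A small host-side model of the same loop, where 8 elements stand in for the 1024 per-thread values:

#include <cstdio>

int main() {
    const unsigned N = 8;                       // stands in for BLOCK_DIM_X*BLOCK_DIM_Y
    float sum[N] = {1, 2, 3, 4, 5, 6, 7, 8};    // per-thread partial residues
    for (unsigned stride = 1; stride < N; stride *= 2) {
        // On the GPU a __syncthreads() here makes the previous pass visible.
        for (unsigned t = 0; t < N; ++t)
            if (t % (2 * stride) == 0)
                sum[t] += sum[t + stride];
    }
    printf("block total = %g (expect 36)\n", sum[0]);   // 1+2+...+8
    return 0;
}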

+ 27 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/hello_world.c

@@ -0,0 +1,27 @@
+#include <mpi.h>
+#include <stdio.h>
+
+int main(int argc, char** argv) {
+    // Initialize the MPI environment
+    MPI_Init(NULL, NULL);
+
+    // Get the number of processes
+    int size;
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    // Get the rank of the process
+    int rank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    // Get the name of the processor
+    char processor_name[MPI_MAX_PROCESSOR_NAME];
+    int name_len;
+    MPI_Get_processor_name(processor_name, &name_len);
+
+    // Print a hello world message
+    printf("Hello world from processor %s, rank %d out of %d processors\n",
+           processor_name, rank, size);
+
+    // Finalize the MPI environment.
+    MPI_Finalize();
+}
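hello_world is started with whatever MPI launcher the system provides (for example, mpirun -np 4 ./hello_world), and each rank prints one line. On a GPU node the natural next step, used by the Jacobi codes in this lab, is to map each rank to a device via its node-local rank; a minimal sketch extending the hello world accordingly (error checking omitted):

#include <mpi.h>
#include <cuda_runtime.h>
#include <stdio.h>

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    // Node-local rank via a shared-memory sub-communicator,
    // the same pattern used in jacobi_cuda_aware_mpi.cpp.
    MPI_Comm local_comm;
    int local_rank;
    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank,
                        MPI_INFO_NULL, &local_comm);
    MPI_Comm_rank(local_comm, &local_rank);
    MPI_Comm_free(&local_comm);

    int num_devices = 0;
    cudaGetDeviceCount(&num_devices);
    int dev = (num_devices > 0) ? local_rank % num_devices : -1;
    if (dev >= 0) cudaSetDevice(dev);

    printf("Rank %d of %d uses GPU %d\n", rank, size, dev);
    MPI_Finalize();
    return 0;
}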

BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/jacobi


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/jacobi.o


+ 358 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/jacobi_cuda_aware_mpi.cpp

@@ -0,0 +1,358 @@
+/* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <iostream>
+#include <sstream>
+#include <cuda_runtime.h>
+#include <nvToolsExt.h>
+#include <mpi.h>
+#include <omp.h>
+
+#define MPI_CALL(call)                                                                \
+    {                                                                                 \
+        int mpi_status = call;                                                        \
+        if (0 != mpi_status) {                                                        \
+            char mpi_error_string[MPI_MAX_ERROR_STRING];                              \
+            int mpi_error_string_length = 0;                                          \
+            MPI_Error_string(mpi_status, mpi_error_string, &mpi_error_string_length); \
+            if (NULL != mpi_error_string)                                             \
+                fprintf(stderr,                                                       \
+                        "ERROR: MPI call \"%s\" in line %d of file %s failed "        \
+                        "with %s "                                                    \
+                        "(%d).\n",                                                    \
+                        #call, __LINE__, __FILE__, mpi_error_string, mpi_status);     \
+            else                                                                      \
+                fprintf(stderr,                                                       \
+                        "ERROR: MPI call \"%s\" in line %d of file %s failed "        \
+                        "with %d.\n",                                                 \
+                        #call, __LINE__, __FILE__, mpi_status);                       \
+        }                                                                             \
+    }
+
+#define CUDA_RT_CALL(call)                                                                  \
+    {                                                                                       \
+        cudaError_t cudaStatus = call;                                                      \
+        if (cudaSuccess != cudaStatus)                                                      \
+            fprintf(stderr,                                                                 \
+                    "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
+                    "with "                                                                 \
+                    "%s (%d).\n",                                                           \
+                    #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
+    }
+
+constexpr float tol = 1.0e-8;
+
+const float PI = 2.0 * std::asin(1.0);
+
+void launch_initialize_boundaries(float* __restrict__ const a_new, float* __restrict__ const a,
+                                  const float pi, const int offset, const int nx, const int my_ny,
+                                  const int ny);
+
+void launch_jacobi_kernel(float* __restrict__ const a_new, const float* __restrict__ const a,
+                          float* __restrict__ const l2_norm, const int iy_start, const int iy_end,
+                          const int nx);
+
+double single_gpu(const int nx, const int ny, const int iter_max, 
+                    float* const a_ref_h, bool print);
+
+int get_argval(char** begin, char** end, const std::string& arg, const int default_val) {
+    int argval = default_val;
+    char** itr = std::find(begin, end, arg);
+    if (itr != end && ++itr != end) {
+        std::istringstream inbuf(*itr);
+        inbuf >> argval;
+    }
+    return argval;
+}
+
+bool get_arg(char** begin, char** end, const std::string& arg) {
+    char** itr = std::find(begin, end, arg);
+    if (itr != end) {
+        return true;
+    }
+    return false;
+}
+
+int main(int argc, char* argv[]) {
+    MPI_CALL(MPI_Init(&argc, &argv));
+    int rank;
+    MPI_CALL(MPI_Comm_rank(MPI_COMM_WORLD, &rank));
+    int size;
+    MPI_CALL(MPI_Comm_size(MPI_COMM_WORLD, &size));
+    int num_devices = 0;
+    cudaGetDeviceCount(&num_devices);
+    
+    const int iter_max = get_argval(argv, argv + argc, "-niter", 1000);
+    const int nx = get_argval(argv, argv + argc, "-nx", 16384);
+    const int ny = get_argval(argv, argv + argc, "-ny", 16384);
+    const bool skip_single_gpu = get_arg(argv, argv + argc, "-skip_single_gpu");
+
+    int local_rank = -1;
+    MPI_Comm local_comm;
+    MPI_CALL(MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL,
+                                    &local_comm));
+
+    MPI_CALL(MPI_Comm_rank(local_comm, &local_rank));
+
+    MPI_CALL(MPI_Comm_free(&local_comm));
+
+    CUDA_RT_CALL(cudaSetDevice(local_rank % num_devices));
+    CUDA_RT_CALL(cudaFree(0));
+
+    float* a_ref_h;
+    CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(float)));
+    float* a_h;
+    CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(float)));
+    
+    double runtime_serial = 1;
+    if (!skip_single_gpu){
+        runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h, rank == 0);
+    }
+
+    // ny - 2 rows are distributed amongst `size` ranks in such a way
+    // that each rank gets either (ny - 2) / size or (ny - 2) / size + 1 rows.
+    // This optimizes load balancing when (ny - 2) % size != 0
+    int chunk_size;
+    int chunk_size_low = (ny - 2) / size;
+    int chunk_size_high = chunk_size_low + 1;
+    // To calculate the number of ranks that need to compute an extra row,
+    // the following formula is derived from this equation:
+    // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = ny - 2
+    int num_ranks_low = size * chunk_size_low + size -
+                        (ny - 2);  // Number of ranks with chunk_size = chunk_size_low
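+    // Worked example: ny - 2 = 10 rows over size = 4 ranks gives chunk_size_low = 2
+    // and num_ranks_low = 4 * 2 + 4 - 10 = 2, so ranks 0-1 compute 2 rows each and
+    // ranks 2-3 compute 3 rows each (2 + 2 + 3 + 3 = 10).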
+    if (rank < num_ranks_low)
+        chunk_size = chunk_size_low;
+    else
+        chunk_size = chunk_size_high;
+
+    float* a;
+    CUDA_RT_CALL(cudaMalloc(&a, nx * (chunk_size + 2) * sizeof(float)));
+    float* a_new;
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * (chunk_size + 2) * sizeof(float)));
+
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * (chunk_size + 2) * sizeof(float)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * (chunk_size + 2) * sizeof(float)));
+
+    // Calculate local domain boundaries
+    int iy_start_global;  // My start index in the global array
+    if (rank < num_ranks_low) {
+        iy_start_global = rank * chunk_size_low + 1;
+    } else {
+        iy_start_global =
+            num_ranks_low * chunk_size_low + (rank - num_ranks_low) * chunk_size_high + 1;
+    }
+    int iy_end_global = iy_start_global + chunk_size - 1;  // My last index in the global array
+
+    int iy_start = 1;
+    int iy_end = iy_start + chunk_size;
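+    // Local rows 0 and chunk_size + 1 are halo rows; this rank updates rows
+    // iy_start .. iy_end - 1 of its (chunk_size + 2)-row slab.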
+
+    // Set Dirichlet boundary conditions on the left and right borders
+    launch_initialize_boundaries(a, a_new, PI, iy_start_global - 1, nx, (chunk_size + 2), ny);
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    float* l2_norm_d;
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
+    float* l2_norm_h;
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    if (0 == rank) {
+        printf("Jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+    }
+
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    MPI_CALL(MPI_Barrier(MPI_COMM_WORLD));
+    double start = MPI_Wtime();
+    nvtxRangePush("Jacobi Solve Multi-GPU");
+    while (l2_norm > tol && iter < iter_max) {
+        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(float)));
+
+        launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, iy_end, nx);
+
+        CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));
+
+        const int top = rank > 0 ? rank - 1 : (size - 1);
+        const int bottom = (rank + 1) % size;
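+        // Periodic wrap-around: rank 0's top neighbour is rank size - 1 and the
+        // last rank's bottom neighbour is rank 0.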
+
+        // Apply periodic boundary conditions
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+
+        nvtxRangePush("Halo exchange CUDA-aware MPI");
+        // TODO: Part 2- 
+        MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_FLOAT, top, 0,
+                              a_new + (iy_end * nx), nx, MPI_FLOAT, bottom, 0, MPI_COMM_WORLD,
+                              MPI_STATUS_IGNORE));
+        nvtxRangePop(); 
+
+        nvtxRangePush("Halo exchange CUDA-aware MPI");
+        // TODO: Part 2- 
+        MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_FLOAT, bottom, 0, a_new, nx,
+                              MPI_FLOAT, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
+        nvtxRangePop(); 
+
+        // TODO: Part 2- 
+        MPI_CALL(MPI_Allreduce(l2_norm_h, &l2_norm, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD));
+        l2_norm = std::sqrt(l2_norm);
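+        // Each rank contributed the sum of its local squared residues; the global
+        // L2 norm is the square root of the reduced total.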
+        
+        iter++;
+        if (0 == rank && (iter % 100) == 0) {
+            printf("%5d, %0.6f\n", iter, l2_norm);
+        }
+
+        std::swap(a_new, a);
+    }
+    double stop = MPI_Wtime();
+    nvtxRangePop();
+
+    CUDA_RT_CALL(cudaMemcpy(a_h + iy_start_global * nx, a + nx,
+                            std::min((ny - iy_start_global) * nx, chunk_size * nx) * sizeof(float),
+                            cudaMemcpyDeviceToHost));
+
+    int result_correct = 1;
+    if (!skip_single_gpu) {
+        for (int iy = iy_start_global; result_correct && (iy < iy_end_global); ++iy) {
+            for (int ix = 1; result_correct && (ix < (nx - 1)); ++ix) {
+                if (std::fabs(a_ref_h[iy * nx + ix] - a_h[iy * nx + ix]) > tol) {
+                    fprintf(stderr,
+                            "ERROR on rank %d: a[%d * %d + %d] = %f does not match %f "
+                            "(reference)\n",
+                            rank, iy, nx, ix, a_h[iy * nx + ix], a_ref_h[iy * nx + ix]);
+                    result_correct = 0;
+                }
+            }
+        }
+        int global_result_correct = 1;
+        MPI_CALL(MPI_Allreduce(&result_correct, &global_result_correct, 1, MPI_INT, MPI_MIN,
+                            MPI_COMM_WORLD));
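+        // MPI_MIN leaves 0 in global_result_correct if any rank saw a mismatch,
+        // so every rank agrees on the final verdict.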
+        result_correct = global_result_correct;
+    }
+
+    if (rank == 0 && result_correct) {
+        printf("Num GPUs: %d.\n", size);
+        if (!skip_single_gpu) {
+            printf(
+                "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, "
+                "efficiency: %8.2f \n",
+                nx, ny, runtime_serial, size, (stop - start), runtime_serial / (stop - start),
+                runtime_serial / (size * (stop - start)) * 100);
+        }
+        else {
+            printf("%dx%d: %d GPUs: %8.4f s \n", nx, ny, size, (stop - start)); 
+        }
+    }
+
+    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
+    CUDA_RT_CALL(cudaFree(l2_norm_d));
+
+    CUDA_RT_CALL(cudaFree(a_new));
+    CUDA_RT_CALL(cudaFree(a));
+
+    CUDA_RT_CALL(cudaFreeHost(a_h));
+    CUDA_RT_CALL(cudaFreeHost(a_ref_h));
+
+    MPI_CALL(MPI_Finalize());
+    return (result_correct == 1) ? 0 : 1;
+}
+
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h, bool print) {
+    float* a;
+    float* a_new;
+
+    float* l2_norm_d;
+    float* l2_norm_h;
+
+    int iy_start = 1;
+    int iy_end = (ny - 1);
+
+    CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(float)));
+
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(float)));
+
+    // Set Dirichlet boundary conditions on the left and right borders
+    nvtxRangePush("Init boundaries");
+    launch_initialize_boundaries(a, a_new, PI, 0, nx, ny, ny);
+    CUDA_RT_CALL(cudaGetLastError());
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+    nvtxRangePop();
+
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    if (print) {
+        printf("Single GPU jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+    }
+
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    double start = omp_get_wtime();
+    nvtxRangePush("Jacobi Solve Single GPU");
+    while (l2_norm > tol && iter < iter_max) {
+        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(float)));
+
+        // Compute grid points for this iteration
+        launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, iy_end, nx);
+        CUDA_RT_CALL(cudaGetLastError());
+        CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));
+
+        // Apply periodic boundary conditions
+        CUDA_RT_CALL(cudaMemcpy(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+        CUDA_RT_CALL(cudaMemcpy(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
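+        // Row 0 receives the last interior row and row iy_end receives the first
+        // interior row, closing the domain periodically in y.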
+
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+        l2_norm = *l2_norm_h;
+        l2_norm = std::sqrt(l2_norm);
+
+        iter++;
+        if ((iter % 100) == 0 && print) {
+            printf("%5d, %0.6f\n", iter, l2_norm);
+        }
+        std::swap(a_new, a);
+    }
+    nvtxRangePop();
+    double stop = omp_get_wtime();
+
+    CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(float), cudaMemcpyDeviceToHost));
+
+    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
+    CUDA_RT_CALL(cudaFree(l2_norm_d));
+
+    CUDA_RT_CALL(cudaFree(a_new));
+    CUDA_RT_CALL(cudaFree(a));
+    return (stop - start);
+}

+ 37 - 53
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/jacobi_kernels.cu

@@ -26,9 +26,8 @@
  */
 #include <cstdio>
 
-#ifdef HAVE_CUB
-#include <cub/block/block_reduce.cuh>
-#endif  // HAVE_CUB
+#define BLOCK_DIM_X 32
+#define BLOCK_DIM_Y 32
 
 #define CUDA_RT_CALL(call)                                                                  \
     {                                                                                       \
@@ -41,19 +40,10 @@
                     #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
     }
 
-#ifdef USE_DOUBLE
-typedef double real;
-#define MPI_REAL_TYPE MPI_DOUBLE
-#else
-typedef float real;
-#define MPI_REAL_TYPE MPI_FLOAT
-#endif
-
-__global__ void initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a,
-                                      const real pi, const int offset, const int nx,
-                                      const int my_ny, const int ny) {
+__global__ void initialize_boundaries(float*  a_new, float*  a, const float pi, const int offset, 
+                    const int nx, const int my_ny, const int ny) {
     for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) {
-        const real y0 = sin(2.0 * pi * (offset + iy) / (ny - 1));
+        const float y0 = sin(2.0 * pi * (offset + iy) / (ny - 1));
         a[iy * nx + 0] = y0;
         a[iy * nx + (nx - 1)] = y0;
         a_new[iy * nx + 0] = y0;
@@ -61,53 +51,47 @@ __global__ void initialize_boundaries(real* __restrict__ const a_new, real* __re
     }
 }
 
-void launch_initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a,
-                                  const real pi, const int offset, const int nx, const int my_ny,
-                                  const int ny) {
-    initialize_boundaries<<<my_ny / 128 + 1, 128>>>(a_new, a, pi, offset, nx, my_ny, ny);
-    CUDA_RT_CALL(cudaGetLastError());
-}
-
-template <int BLOCK_DIM_X, int BLOCK_DIM_Y>
-__global__ void jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a,
-                              real* __restrict__ const l2_norm, const int iy_start,
-                              const int iy_end, const int nx, const bool calculate_norm) {
-#ifdef HAVE_CUB
-    typedef cub::BlockReduce<real, BLOCK_DIM_X, cub::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y>
-        BlockReduce;
-    __shared__ typename BlockReduce::TempStorage temp_storage;
-#endif  // HAVE_CUB
+__global__ void jacobi_kernel(float*  a_new, const float*  a, float*  l2_norm, const int iy_start,
+                              const int iy_end, const int nx) {
     int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start;
     int ix = blockIdx.x * blockDim.x + threadIdx.x + 1;
-    real local_l2_norm = 0.0;
+    __shared__ float block_l2_sum[BLOCK_DIM_X*BLOCK_DIM_Y];
+    unsigned thread_index = threadIdx.y*BLOCK_DIM_X + threadIdx.x;
 
     if (iy < iy_end && ix < (nx - 1)) {
-        const real new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] +
+        // Update grid point
+        const float new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] +
                                      a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]);
         a_new[iy * nx + ix] = new_val;
-        if (calculate_norm) {
-            real residue = new_val - a[iy * nx + ix];
-            local_l2_norm += residue * residue;
+        float residue = new_val - a[iy * nx + ix];
+        // Set block-level L2 norm value for this grid point
+        block_l2_sum[thread_index] = residue * residue;
+    }
+    else {
+        block_l2_sum[thread_index] = 0;
+    }
+    // Reduce L2 norm for the block in parallel
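+    // Interleaved tree reduction in shared memory: after log2(BLOCK_DIM_X * BLOCK_DIM_Y)
+    // steps, block_l2_sum[0] holds the block's total squared residue.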
+    for (unsigned stride = 1; stride < BLOCK_DIM_X*BLOCK_DIM_Y; stride *= 2) {
+        __syncthreads();
+        if ((thread_index) % (2*stride) == 0) {
+            block_l2_sum[thread_index] += block_l2_sum[thread_index + stride];
         }
     }
-    if (calculate_norm) {
-#ifdef HAVE_CUB
-        real block_l2_norm = BlockReduce(temp_storage).Sum(local_l2_norm);
-        if (0 == threadIdx.y && 0 == threadIdx.x) atomicAdd(l2_norm, block_l2_norm);
-#else
-        atomicAdd(l2_norm, local_l2_norm);
-#endif  // HAVE_CUB
+    // Atomically update global L2 norm with block-reduced L2 norm
+    if (thread_index == 0) {
+        atomicAdd(l2_norm, block_l2_sum[0]);
     }
 }
 
-void launch_jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a,
-                          real* __restrict__ const l2_norm, const int iy_start, const int iy_end,
-                          const int nx, const bool calculate_norm) {
-    constexpr int dim_block_x = 32;
-    constexpr int dim_block_y = 32;
-    dim3 dim_grid((nx + dim_block_x - 1) / dim_block_x,
-                  ((iy_end - iy_start) + dim_block_y - 1) / dim_block_y, 1);
-    jacobi_kernel<dim_block_x, dim_block_y><<<dim_grid, {dim_block_x, dim_block_y, 1}, 0, 0>>>(
-        a_new, a, l2_norm, iy_start, iy_end, nx, calculate_norm);
-    CUDA_RT_CALL(cudaGetLastError());
+void launch_initialize_boundaries(float*  a_new, float*  a, const float pi, const int offset, 
+                                    const int nx, const int my_ny, const int ny){
+    initialize_boundaries<<<my_ny / 128 + 1, 128>>>(a_new, a, pi, offset, nx, my_ny, ny);
+}
+
+void launch_jacobi_kernel(float*  a_new, const float*  a, float*  l2_norm, const int iy_start,
+                              const int iy_end, const int nx) {
+    dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+    dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X,
+                  ((iy_end - iy_start) + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
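+    // Ceiling division ensures the grid covers every interior column and row.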
+    jacobi_kernel<<<dim_grid, dim_block>>>(a_new, a, l2_norm, iy_start, iy_end, nx);
 }

BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/jacobi_kernels.o


+ 378 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/jacobi_memcpy_mpi.cpp

@@ -0,0 +1,378 @@
+/* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <iostream>
+#include <sstream>
+#include <cuda_runtime.h>
+#include <nvToolsExt.h>
+#include <mpi.h>
+#include <omp.h>
+
+#define MPI_CALL(call)                                                                \
+    {                                                                                 \
+        int mpi_status = call;                                                        \
+        if (0 != mpi_status) {                                                        \
+            char mpi_error_string[MPI_MAX_ERROR_STRING];                              \
+            int mpi_error_string_length = 0;                                          \
+            MPI_Error_string(mpi_status, mpi_error_string, &mpi_error_string_length); \
+            if (NULL != mpi_error_string)                                             \
+                fprintf(stderr,                                                       \
+                        "ERROR: MPI call \"%s\" in line %d of file %s failed "        \
+                        "with %s "                                                    \
+                        "(%d).\n",                                                    \
+                        #call, __LINE__, __FILE__, mpi_error_string, mpi_status);     \
+            else                                                                      \
+                fprintf(stderr,                                                       \
+                        "ERROR: MPI call \"%s\" in line %d of file %s failed "        \
+                        "with %d.\n",                                                 \
+                        #call, __LINE__, __FILE__, mpi_status);                       \
+        }                                                                             \
+    }
+
+#define CUDA_RT_CALL(call)                                                                  \
+    {                                                                                       \
+        cudaError_t cudaStatus = call;                                                      \
+        if (cudaSuccess != cudaStatus)                                                      \
+            fprintf(stderr,                                                                 \
+                    "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
+                    "with "                                                                 \
+                    "%s (%d).\n",                                                           \
+                    #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
+    }
+
+constexpr float tol = 1.0e-8;
+
+const float PI = 2.0 * std::asin(1.0);
+
+void launch_initialize_boundaries(float* __restrict__ const a_new, float* __restrict__ const a,
+                                  const float pi, const int offset, const int nx, const int my_ny,
+                                  const int ny);
+
+void launch_jacobi_kernel(float* __restrict__ const a_new, const float* __restrict__ const a,
+                          float* __restrict__ const l2_norm, const int iy_start, const int iy_end,
+                          const int nx);
+
+double single_gpu(const int nx, const int ny, const int iter_max, 
+                    float* const a_ref_h, bool print);
+
+int get_argval(char** begin, char** end, const std::string& arg, const int default_val) {
+    int argval = default_val;
+    char** itr = std::find(begin, end, arg);
+    if (itr != end && ++itr != end) {
+        std::istringstream inbuf(*itr);
+        inbuf >> argval;
+    }
+    return argval;
+}
+
+bool get_arg(char** begin, char** end, const std::string& arg) {
+    char** itr = std::find(begin, end, arg);
+    if (itr != end) {
+        return true;
+    }
+    return false;
+}
+
+int main(int argc, char* argv[]) {
+    MPI_CALL(MPI_Init(&argc, &argv));
+    int rank;
+    MPI_CALL(MPI_Comm_rank(MPI_COMM_WORLD, &rank));
+    int size;
+    MPI_CALL(MPI_Comm_size(MPI_COMM_WORLD, &size));
+    int num_devices = 0;
+    cudaGetDeviceCount(&num_devices);
+    
+    const int iter_max = get_argval(argv, argv + argc, "-niter", 1000);
+    const int nx = get_argval(argv, argv + argc, "-nx", 16384);
+    const int ny = get_argval(argv, argv + argc, "-ny", 16384);
+    const bool skip_single_gpu = get_arg(argv, argv + argc, "-skip_single_gpu");
+
+    int local_rank = -1;
+    // TODO: Part 1- Obtain the node-level local rank by splitting the global communicator
+    // Make sure to free the local communicator after its use
+    MPI_Comm local_comm;
+    MPI_CALL(MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL,
+                                    &local_comm));
+
+    MPI_CALL(MPI_Comm_rank(local_comm, &local_rank));
+
+    MPI_CALL(MPI_Comm_free(&local_comm));
+
+    CUDA_RT_CALL(cudaSetDevice(local_rank % num_devices));
+    CUDA_RT_CALL(cudaFree(0));
+
+    float* a_ref_h;
+    CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(float)));
+    float* a_h;
+    CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(float)));
+    float* top_halo_buf;
+    CUDA_RT_CALL(cudaMallocHost(&top_halo_buf, nx * sizeof(float)));
+    float* bot_halo_buf;
+    CUDA_RT_CALL(cudaMallocHost(&bot_halo_buf, nx * sizeof(float)));
+
+    double runtime_serial = 1;
+    if (!skip_single_gpu){
+        runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h, rank == 0);
+    }
+
+    // ny - 2 rows are distributed amongst `size` ranks in such a way
+    // that each rank gets either (ny - 2) / size or (ny - 2) / size + 1 rows.
+    // This optimizes load balancing when (ny - 2) % size != 0
+    int chunk_size;
+    int chunk_size_low = (ny - 2) / size;
+    int chunk_size_high = chunk_size_low + 1;
+    // To calculate the number of ranks that need to compute an extra row,
+    // the following formula is derived from this equation:
+    // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = ny - 2
+    int num_ranks_low = size * chunk_size_low + size -
+                        (ny - 2);  // Number of ranks with chunk_size = chunk_size_low
+    if (rank < num_ranks_low)
+        chunk_size = chunk_size_low;
+    else
+        chunk_size = chunk_size_high;
+
+    float* a;
+    CUDA_RT_CALL(cudaMalloc(&a, nx * (chunk_size + 2) * sizeof(float)));
+    float* a_new;
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * (chunk_size + 2) * sizeof(float)));
+
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * (chunk_size + 2) * sizeof(float)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * (chunk_size + 2) * sizeof(float)));
+
+    // Calculate local domain boundaries
+    int iy_start_global;  // My start index in the global array
+    if (rank < num_ranks_low) {
+        iy_start_global = rank * chunk_size_low + 1;
+    } else {
+        iy_start_global =
+            num_ranks_low * chunk_size_low + (rank - num_ranks_low) * chunk_size_high + 1;
+    }
+    int iy_end_global = iy_start_global + chunk_size - 1;  // My last index in the global array
+
+    int iy_start = 1;
+    int iy_end = iy_start + chunk_size;
+
+    // Set Dirichlet boundary conditions on the left and right borders
+    launch_initialize_boundaries(a, a_new, PI, iy_start_global - 1, nx, (chunk_size + 2), ny);
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    float* l2_norm_d;
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
+    float* l2_norm_h;
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    if (0 == rank) {
+        printf("Jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+    }
+
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    MPI_CALL(MPI_Barrier(MPI_COMM_WORLD));
+    double start = MPI_Wtime();
+    nvtxRangePush("Jacobi Solve Multi-GPU");
+    while (l2_norm > tol && iter < iter_max) {
+        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(float)));
+
+        launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, iy_end, nx);
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+
+        CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));
+
+        const int top = rank > 0 ? rank - 1 : (size - 1);
+        const int bottom = (rank + 1) % size;
+
+        // Apply periodic boundary conditions
+
+        nvtxRangePush("Halo exchange Memcpy+MPI");
+        // First set of halo exchanges
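+        // Staging pattern: copy the boundary row to a pinned host buffer (D2H),
+        // exchange host buffers with MPI_Sendrecv, then copy the received row into
+        // the halo row on the device (H2D).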
+        CUDA_RT_CALL(cudaMemcpy(top_halo_buf, a_new + (iy_start * nx), nx * sizeof(float), 
+                                cudaMemcpyDeviceToHost));
+        // TODO: Part 1- Implement the first set of halo exchanges using MPI_Sendrecv as explained
+        // in the Jupyter Notebook. Observe the Memcpy operations above and below this comment
+        MPI_CALL(MPI_Sendrecv(top_halo_buf, nx, MPI_FLOAT, top, 0,
+                              bot_halo_buf, nx, MPI_FLOAT, bottom, 0, MPI_COMM_WORLD,
+                              MPI_STATUS_IGNORE));
+        CUDA_RT_CALL(cudaMemcpy(a_new + (iy_end * nx), bot_halo_buf, nx * sizeof(float), 
+                                cudaMemcpyHostToDevice));
+        nvtxRangePop();                        
+
+        nvtxRangePush("Halo exchange Memcpy+MPI");
+        // Second set of halo exchanges
+        // TODO: Part 1- Implement the Memcpy operations and MPI calls for the second set of
+        // halo exchanges
+        CUDA_RT_CALL(cudaMemcpy(bot_halo_buf, a_new + (iy_end - 1) * nx, nx * sizeof(float), 
+                                cudaMemcpyDeviceToHost));
+        MPI_CALL(MPI_Sendrecv(bot_halo_buf, nx, MPI_FLOAT, bottom, 0, 
+                                top_halo_buf, nx, MPI_FLOAT, top, 0, MPI_COMM_WORLD, 
+                                MPI_STATUS_IGNORE));
+        CUDA_RT_CALL(cudaMemcpy(a_new, top_halo_buf, nx * sizeof(float), 
+                                cudaMemcpyHostToDevice));
+        nvtxRangePop();                        
+
+        // TODO: Part 1- Reduce the rank-local L2 Norm to a global L2 norm using MPI_Allreduce
+        MPI_CALL(MPI_Allreduce(l2_norm_h, &l2_norm, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD));
+        
+        l2_norm = std::sqrt(l2_norm);
+
+        iter++;
+        if (0 == rank && (iter % 100) == 0) {
+            printf("%5d, %0.6f\n", iter, l2_norm);
+        }
+
+        std::swap(a_new, a);
+    }
+    double stop = MPI_Wtime();
+    nvtxRangePop();
+
+    CUDA_RT_CALL(cudaMemcpy(a_h + iy_start_global * nx, a + nx,
+                            std::min((ny - iy_start_global) * nx, chunk_size * nx) * sizeof(float),
+                            cudaMemcpyDeviceToHost));
+
+    int result_correct = 1;
+    if (!skip_single_gpu) {
+        for (int iy = iy_start_global; result_correct && (iy < iy_end_global); ++iy) {
+            for (int ix = 1; result_correct && (ix < (nx - 1)); ++ix) {
+                if (std::fabs(a_ref_h[iy * nx + ix] - a_h[iy * nx + ix]) > tol) {
+                    fprintf(stderr,
+                            "ERROR on rank %d: a[%d * %d + %d] = %f does not match %f "
+                            "(reference)\n",
+                            rank, iy, nx, ix, a_h[iy * nx + ix], a_ref_h[iy * nx + ix]);
+                    result_correct = 0;
+                }
+            }
+        }
+        int global_result_correct = 1;
+        MPI_CALL(MPI_Allreduce(&result_correct, &global_result_correct, 1, MPI_INT, MPI_MIN,
+                            MPI_COMM_WORLD));
+        result_correct = global_result_correct;
+    }
+
+    if (rank == 0 && result_correct) {
+        printf("Num GPUs: %d.\n", size);
+        if (!skip_single_gpu) {
+            printf(
+                "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, "
+                "efficiency: %8.2f \n",
+                nx, ny, runtime_serial, size, (stop - start), runtime_serial / (stop - start),
+                runtime_serial / (size * (stop - start)) * 100);
+        }
+        else {
+            printf("%dx%d: %d GPUs: %8.4f s \n", nx, ny, size, (stop - start)); 
+        }
+    }
+
+    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
+    CUDA_RT_CALL(cudaFree(l2_norm_d));
+
+    CUDA_RT_CALL(cudaFree(a_new));
+    CUDA_RT_CALL(cudaFree(a));
+
+    CUDA_RT_CALL(cudaFreeHost(a_h));
+    CUDA_RT_CALL(cudaFreeHost(a_ref_h));
+
+    MPI_CALL(MPI_Finalize());
+    return (result_correct == 1) ? 0 : 1;
+}
+
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h, bool print) {
+    float* a;
+    float* a_new;
+
+    float* l2_norm_d;
+    float* l2_norm_h;
+
+    int iy_start = 1;
+    int iy_end = (ny - 1);
+
+    CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(float)));
+
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(float)));
+
+    // Set Dirichlet boundary conditions on the left and right borders
+    nvtxRangePush("Init boundaries");
+    launch_initialize_boundaries(a, a_new, PI, 0, nx, ny, ny);
+    CUDA_RT_CALL(cudaGetLastError());
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+    nvtxRangePop();
+
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    if (print) {
+        printf("Single GPU jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+    }
+
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    double start = omp_get_wtime();
+    nvtxRangePush("Jacobi Solve Single GPU");
+    while (l2_norm > tol && iter < iter_max) {
+        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(float)));
+
+        // Compute grid points for this iteration
+        launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, iy_end, nx);
+        CUDA_RT_CALL(cudaGetLastError());
+        CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));
+
+        // Apply periodic boundary conditions
+        CUDA_RT_CALL(cudaMemcpy(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+        CUDA_RT_CALL(cudaMemcpy(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+        l2_norm = *l2_norm_h;
+        l2_norm = std::sqrt(l2_norm);
+
+        iter++;
+        if ((iter % 100) == 0 && print) {
+            printf("%5d, %0.6f\n", iter, l2_norm);
+        }
+        std::swap(a_new, a);
+    }
+    nvtxRangePop();
+    double stop = omp_get_wtime();
+
+    CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(float), cudaMemcpyDeviceToHost));
+
+    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
+    CUDA_RT_CALL(cudaFree(l2_norm_d));
+
+    CUDA_RT_CALL(cudaFree(a_new));
+    CUDA_RT_CALL(cudaFree(a));
+    return (stop - start);
+}

BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/reports.zip


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/reports/report1.qdrep


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/reports/report2.qdrep


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/reports/report3.qdrep


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/reports/report4.qdrep


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/reports/report5.qdrep


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/reports/report6.qdrep


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/reports/report7.qdrep


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/reports/report8.qdrep


+ 358 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/solutions/jacobi_cuda_aware_mpi.cpp

@@ -0,0 +1,358 @@
+/* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <iostream>
+#include <sstream>
+#include <cuda_runtime.h>
+#include <nvToolsExt.h>
+#include <mpi.h>
+#include <omp.h>
+
+#define MPI_CALL(call)                                                                \
+    {                                                                                 \
+        int mpi_status = call;                                                        \
+        if (0 != mpi_status) {                                                        \
+            char mpi_error_string[MPI_MAX_ERROR_STRING];                              \
+            int mpi_error_string_length = 0;                                          \
+            MPI_Error_string(mpi_status, mpi_error_string, &mpi_error_string_length); \
+            if (NULL != mpi_error_string)                                             \
+                fprintf(stderr,                                                       \
+                        "ERROR: MPI call \"%s\" in line %d of file %s failed "        \
+                        "with %s "                                                    \
+                        "(%d).\n",                                                    \
+                        #call, __LINE__, __FILE__, mpi_error_string, mpi_status);     \
+            else                                                                      \
+                fprintf(stderr,                                                       \
+                        "ERROR: MPI call \"%s\" in line %d of file %s failed "        \
+                        "with %d.\n",                                                 \
+                        #call, __LINE__, __FILE__, mpi_status);                       \
+        }                                                                             \
+    }
+
+#define CUDA_RT_CALL(call)                                                                  \
+    {                                                                                       \
+        cudaError_t cudaStatus = call;                                                      \
+        if (cudaSuccess != cudaStatus)                                                      \
+            fprintf(stderr,                                                                 \
+                    "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
+                    "with "                                                                 \
+                    "%s (%d).\n",                                                           \
+                    #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
+    }
+
+constexpr float tol = 1.0e-8;
+
+const float PI = 2.0 * std::asin(1.0);
+
+void launch_initialize_boundaries(float* __restrict__ const a_new, float* __restrict__ const a,
+                                  const float pi, const int offset, const int nx, const int my_ny,
+                                  const int ny);
+
+void launch_jacobi_kernel(float* __restrict__ const a_new, const float* __restrict__ const a,
+                          float* __restrict__ const l2_norm, const int iy_start, const int iy_end,
+                          const int nx);
+
+double single_gpu(const int nx, const int ny, const int iter_max, 
+                    float* const a_ref_h, bool print);
+
+int get_argval(char** begin, char** end, const std::string& arg, const int default_val) {
+    int argval = default_val;
+    char** itr = std::find(begin, end, arg);
+    if (itr != end && ++itr != end) {
+        std::istringstream inbuf(*itr);
+        inbuf >> argval;
+    }
+    return argval;
+}
+
+bool get_arg(char** begin, char** end, const std::string& arg) {
+    char** itr = std::find(begin, end, arg);
+    if (itr != end) {
+        return true;
+    }
+    return false;
+}
+
+int main(int argc, char* argv[]) {
+    MPI_CALL(MPI_Init(&argc, &argv));
+    int rank;
+    MPI_CALL(MPI_Comm_rank(MPI_COMM_WORLD, &rank));
+    int size;
+    MPI_CALL(MPI_Comm_size(MPI_COMM_WORLD, &size));
+    int num_devices = 0;
+    cudaGetDeviceCount(&num_devices);
+    
+    const int iter_max = get_argval(argv, argv + argc, "-niter", 1000);
+    const int nx = get_argval(argv, argv + argc, "-nx", 16384);
+    const int ny = get_argval(argv, argv + argc, "-ny", 16384);
+    const bool skip_single_gpu = get_arg(argv, argv + argc, "-skip_single_gpu");
+
+    int local_rank = -1;
+    MPI_Comm local_comm;
+    MPI_CALL(MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL,
+                                    &local_comm));
+
+    MPI_CALL(MPI_Comm_rank(local_comm, &local_rank));
+
+    MPI_CALL(MPI_Comm_free(&local_comm));
+
+    CUDA_RT_CALL(cudaSetDevice(local_rank % num_devices));
+    CUDA_RT_CALL(cudaFree(0));
+
+    float* a_ref_h;
+    CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(float)));
+    float* a_h;
+    CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(float)));
+    
+    double runtime_serial = 1;
+    if (!skip_single_gpu){
+        runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h, rank == 0);
+    }
+
+    // ny - 2 rows are distributed amongst `size` ranks in such a way
+    // that each rank gets either (ny - 2) / size or (ny - 2) / size + 1 rows.
+    // This optimizes load balancing when (ny - 2) % size != 0
+    int chunk_size;
+    int chunk_size_low = (ny - 2) / size;
+    int chunk_size_high = chunk_size_low + 1;
+    // To calculate the number of ranks that need to compute an extra row,
+    // the following formula is derived from this equation:
+    // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = ny - 2
+    int num_ranks_low = size * chunk_size_low + size -
+                        (ny - 2);  // Number of ranks with chunk_size = chunk_size_low
+    if (rank < num_ranks_low)
+        chunk_size = chunk_size_low;
+    else
+        chunk_size = chunk_size_high;
+
+    float* a;
+    CUDA_RT_CALL(cudaMalloc(&a, nx * (chunk_size + 2) * sizeof(float)));
+    float* a_new;
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * (chunk_size + 2) * sizeof(float)));
+
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * (chunk_size + 2) * sizeof(float)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * (chunk_size + 2) * sizeof(float)));
+
+    // Calculate local domain boundaries
+    int iy_start_global;  // My start index in the global array
+    if (rank < num_ranks_low) {
+        iy_start_global = rank * chunk_size_low + 1;
+    } else {
+        iy_start_global =
+            num_ranks_low * chunk_size_low + (rank - num_ranks_low) * chunk_size_high + 1;
+    }
+    int iy_end_global = iy_start_global + chunk_size - 1;  // My last index in the global array
+
+    int iy_start = 1;
+    int iy_end = iy_start + chunk_size;
+
+    // Set Dirichlet boundary conditions on the left and right borders
+    launch_initialize_boundaries(a, a_new, PI, iy_start_global - 1, nx, (chunk_size + 2), ny);
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    float* l2_norm_d;
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
+    float* l2_norm_h;
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    if (0 == rank) {
+        printf("Jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+    }
+
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    MPI_CALL(MPI_Barrier(MPI_COMM_WORLD));
+    double start = MPI_Wtime();
+    nvtxRangePush("Jacobi Solve Multi-GPU");
+    while (l2_norm > tol && iter < iter_max) {
+        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(float)));
+
+        launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, iy_end, nx);
+
+        CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));
+
+        const int top = rank > 0 ? rank - 1 : (size - 1);
+        const int bottom = (rank + 1) % size;
+
+        // Apply periodic boundary conditions
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+
+        nvtxRangePush("Halo exchange CUDA-aware MPI");
+        // TODO: Part 2- 
+        MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_FLOAT, top, 0,
+                              a_new + (iy_end * nx), nx, MPI_FLOAT, bottom, 0, MPI_COMM_WORLD,
+                              MPI_STATUS_IGNORE));
+        nvtxRangePop(); 
+
+        nvtxRangePush("Halo exchange CUDA-aware MPI");
+        // TODO: Part 2- 
+        MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_FLOAT, bottom, 0, a_new, nx,
+                              MPI_FLOAT, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
+        nvtxRangePop(); 
+
+        // TODO: Part 2- 
+        MPI_CALL(MPI_Allreduce(l2_norm_h, &l2_norm, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD));
+        l2_norm = std::sqrt(l2_norm);
+        
+        iter++;
+        if (0 == rank && (iter % 100) == 0) {
+            printf("%5d, %0.6f\n", iter, l2_norm);
+        }
+
+        std::swap(a_new, a);
+    }
+    double stop = MPI_Wtime();
+    nvtxRangePop();
+
+    CUDA_RT_CALL(cudaMemcpy(a_h + iy_start_global * nx, a + nx,
+                            std::min((ny - iy_start_global) * nx, chunk_size * nx) * sizeof(float),
+                            cudaMemcpyDeviceToHost));
+
+    int result_correct = 1;
+    if (!skip_single_gpu) {
+        for (int iy = iy_start_global; result_correct && (iy < iy_end_global); ++iy) {
+            for (int ix = 1; result_correct && (ix < (nx - 1)); ++ix) {
+                if (std::fabs(a_ref_h[iy * nx + ix] - a_h[iy * nx + ix]) > tol) {
+                    fprintf(stderr,
+                            "ERROR on rank %d: a[%d * %d + %d] = %f does not match %f "
+                            "(reference)\n",
+                            rank, iy, nx, ix, a_h[iy * nx + ix], a_ref_h[iy * nx + ix]);
+                    result_correct = 0;
+                }
+            }
+        }
+        int global_result_correct = 1;
+        MPI_CALL(MPI_Allreduce(&result_correct, &global_result_correct, 1, MPI_INT, MPI_MIN,
+                            MPI_COMM_WORLD));
+        result_correct = global_result_correct;
+    }
+
+    if (rank == 0 && result_correct) {
+        printf("Num GPUs: %d.\n", size);
+        if (!skip_single_gpu) {
+            printf(
+                "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, "
+                "efficiency: %8.2f \n",
+                nx, ny, runtime_serial, size, (stop - start), runtime_serial / (stop - start),
+                runtime_serial / (size * (stop - start)) * 100);
+        }
+        else {
+            printf("%dx%d: %d GPUs: %8.4f s \n", nx, ny, size, (stop - start)); 
+        }
+    }
+
+    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
+    CUDA_RT_CALL(cudaFree(l2_norm_d));
+
+    CUDA_RT_CALL(cudaFree(a_new));
+    CUDA_RT_CALL(cudaFree(a));
+
+    CUDA_RT_CALL(cudaFreeHost(a_h));
+    CUDA_RT_CALL(cudaFreeHost(a_ref_h));
+
+    MPI_CALL(MPI_Finalize());
+    return (result_correct == 1) ? 0 : 1;
+}
+
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h, bool print) {
+    float* a;
+    float* a_new;
+
+    float* l2_norm_d;
+    float* l2_norm_h;
+
+    int iy_start = 1;
+    int iy_end = (ny - 1);
+
+    CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(float)));
+
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(float)));
+
+    // Set Dirichlet boundary conditions on the left and right borders
+    nvtxRangePush("Init boundaries");
+    launch_initialize_boundaries(a, a_new, PI, 0, nx, ny, ny);
+    CUDA_RT_CALL(cudaGetLastError());
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+    nvtxRangePop();
+
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    if (print) {
+        printf("Single GPU jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+    }
+
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    double start = omp_get_wtime();
+    nvtxRangePush("Jacobi Solve Single GPU");
+    while (l2_norm > tol && iter < iter_max) {
+        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(float)));
+
+        // Compute grid points for this iteration
+        launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, iy_end, nx);
+        CUDA_RT_CALL(cudaGetLastError());
+        CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));
+
+        // Apply periodic boundary conditions
+        CUDA_RT_CALL(cudaMemcpy(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+        CUDA_RT_CALL(cudaMemcpy(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+        l2_norm = *l2_norm_h;
+        l2_norm = std::sqrt(l2_norm);
+
+        iter++;
+        if ((iter % 100) == 0 && print) {
+            printf("%5d, %0.6f\n", iter, l2_norm);
+        }
+        std::swap(a_new, a);
+    }
+    nvtxRangePop();
+    double stop = omp_get_wtime();
+
+    CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(float), cudaMemcpyDeviceToHost));
+
+    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
+    CUDA_RT_CALL(cudaFree(l2_norm_d));
+
+    CUDA_RT_CALL(cudaFree(a_new));
+    CUDA_RT_CALL(cudaFree(a));
+    return (stop - start);
+}

+ 378 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/solutions/jacobi_memcpy_mpi.cpp

@@ -0,0 +1,378 @@
+/* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <iostream>
+#include <sstream>
+#include <cuda_runtime.h>
+#include <nvToolsExt.h>
+#include <mpi.h>
+#include <omp.h>
+
+#define MPI_CALL(call)                                                                \
+    {                                                                                 \
+        int mpi_status = call;                                                        \
+        if (0 != mpi_status) {                                                        \
+            char mpi_error_string[MPI_MAX_ERROR_STRING];                              \
+            int mpi_error_string_length = 0;                                          \
+            MPI_Error_string(mpi_status, mpi_error_string, &mpi_error_string_length); \
+            if (NULL != mpi_error_string)                                             \
+                fprintf(stderr,                                                       \
+                        "ERROR: MPI call \"%s\" in line %d of file %s failed "        \
+                        "with %s "                                                    \
+                        "(%d).\n",                                                    \
+                        #call, __LINE__, __FILE__, mpi_error_string, mpi_status);     \
+            else                                                                      \
+                fprintf(stderr,                                                       \
+                        "ERROR: MPI call \"%s\" in line %d of file %s failed "        \
+                        "with %d.\n",                                                 \
+                        #call, __LINE__, __FILE__, mpi_status);                       \
+        }                                                                             \
+    }
+
+#define CUDA_RT_CALL(call)                                                                  \
+    {                                                                                       \
+        cudaError_t cudaStatus = call;                                                      \
+        if (cudaSuccess != cudaStatus)                                                      \
+            fprintf(stderr,                                                                 \
+                    "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
+                    "with "                                                                 \
+                    "%s (%d).\n",                                                           \
+                    #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
+    }
+
+constexpr float tol = 1.0e-8;
+
+const float PI = 2.0 * std::asin(1.0);
+
+void launch_initialize_boundaries(float* __restrict__ const a_new, float* __restrict__ const a,
+                                  const float pi, const int offset, const int nx, const int my_ny,
+                                  const int ny);
+
+void launch_jacobi_kernel(float* __restrict__ const a_new, const float* __restrict__ const a,
+                          float* __restrict__ const l2_norm, const int iy_start, const int iy_end,
+                          const int nx);
+
+double single_gpu(const int nx, const int ny, const int iter_max, 
+                    float* const a_ref_h, bool print);
+
+int get_argval(char** begin, char** end, const std::string& arg, const int default_val) {
+    int argval = default_val;
+    char** itr = std::find(begin, end, arg);
+    if (itr != end && ++itr != end) {
+        std::istringstream inbuf(*itr);
+        inbuf >> argval;
+    }
+    return argval;
+}
+
+bool get_arg(char** begin, char** end, const std::string& arg) {
+    char** itr = std::find(begin, end, arg);
+    if (itr != end) {
+        return true;
+    }
+    return false;
+}
+
+int main(int argc, char* argv[]) {
+    MPI_CALL(MPI_Init(&argc, &argv));
+    int rank;
+    MPI_CALL(MPI_Comm_rank(MPI_COMM_WORLD, &rank));
+    int size;
+    MPI_CALL(MPI_Comm_size(MPI_COMM_WORLD, &size));
+    int num_devices = 0;
+    cudaGetDeviceCount(&num_devices);
+    
+    const int iter_max = get_argval(argv, argv + argc, "-niter", 1000);
+    const int nx = get_argval(argv, argv + argc, "-nx", 16384);
+    const int ny = get_argval(argv, argv + argc, "-ny", 16384);
+    const bool skip_single_gpu = get_arg(argv, argv + argc, "-skip_single_gpu");
+
+    int local_rank = -1;
+    // TODO: Part 1- Obtain the node-level local rank by splitting the global communicator
+    // Make sure to free the local communicator after its use
+    MPI_Comm local_comm;
+    MPI_CALL(MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL,
+                                    &local_comm));
+
+    MPI_CALL(MPI_Comm_rank(local_comm, &local_rank));
+
+    MPI_CALL(MPI_Comm_free(&local_comm));
+
+    CUDA_RT_CALL(cudaSetDevice(local_rank % num_devices));
+    CUDA_RT_CALL(cudaFree(0));
+
+    float* a_ref_h;
+    CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(float)));
+    float* a_h;
+    CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(float)));
+    float* top_halo_buf;
+    CUDA_RT_CALL(cudaMallocHost(&top_halo_buf, nx * sizeof(float)));
+    float* bot_halo_buf;
+    CUDA_RT_CALL(cudaMallocHost(&bot_halo_buf, nx * sizeof(float)));
+
+    double runtime_serial = 1;
+    if (!skip_single_gpu){
+        runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h, rank == 0);
+    }
+
+    // ny - 2 rows are distributed amongst `size` ranks in such a way
+    // that each rank gets either (ny - 2) / size or (ny - 2) / size + 1 rows.
+    // This optimizes load balancing when (ny - 2) % size != 0
+    int chunk_size;
+    int chunk_size_low = (ny - 2) / size;
+    int chunk_size_high = chunk_size_low + 1;
+    // To calculate the number of ranks that need to compute an extra row,
+    // the following formula is derived from this equation:
+    // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = ny - 2
+    int num_ranks_low = size * chunk_size_low + size -
+                        (ny - 2);  // Number of ranks with chunk_size = chunk_size_low
+    if (rank < num_ranks_low)
+        chunk_size = chunk_size_low;
+    else
+        chunk_size = chunk_size_high;
+
+    float* a;
+    CUDA_RT_CALL(cudaMalloc(&a, nx * (chunk_size + 2) * sizeof(float)));
+    float* a_new;
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * (chunk_size + 2) * sizeof(float)));
+
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * (chunk_size + 2) * sizeof(float)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * (chunk_size + 2) * sizeof(float)));
+
+    // Calculate local domain boundaries
+    int iy_start_global;  // My start index in the global array
+    if (rank < num_ranks_low) {
+        iy_start_global = rank * chunk_size_low + 1;
+    } else {
+        iy_start_global =
+            num_ranks_low * chunk_size_low + (rank - num_ranks_low) * chunk_size_high + 1;
+    }
+    int iy_end_global = iy_start_global + chunk_size - 1;  // My last index in the global array
+
+    int iy_start = 1;
+    int iy_end = iy_start + chunk_size;
+
+    // Set Dirichlet boundary conditions on the left and right borders
+    launch_initialize_boundaries(a, a_new, PI, iy_start_global - 1, nx, (chunk_size + 2), ny);
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    float* l2_norm_d;
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
+    float* l2_norm_h;
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    if (0 == rank) {
+        printf("Jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+    }
+
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    MPI_CALL(MPI_Barrier(MPI_COMM_WORLD));
+    double start = MPI_Wtime();
+    nvtxRangePush("Jacobi Solve Multi-GPU");
+    while (l2_norm > tol && iter < iter_max) {
+        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(float)));
+
+        launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, iy_end, nx);
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+
+        CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));
+
+        const int top = rank > 0 ? rank - 1 : (size - 1);
+        const int bottom = (rank + 1) % size;
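+        // With periodic boundaries the ranks form a ring: rank 0's "top" neighbour is
+        // rank size - 1 and rank size - 1's "bottom" neighbour is rank 0, so every rank
+        // always has exactly one neighbour in each direction.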
+
+        // Apply periodic boundary conditions
+
+        nvtxRangePush("Halo exchange Memcpy+MPI");
+        // First set of halo exchanges
+        CUDA_RT_CALL(cudaMemcpy(top_halo_buf, a_new + (iy_start * nx), nx * sizeof(float), 
+                                cudaMemcpyDeviceToHost));
+        // TODO: Part 1- Implement the first set of halo exchanges using MPI_Sendrecv, as explained
+        // in the Jupyter notebook. Observe the Memcpy operations above and below this comment
+        MPI_CALL(MPI_Sendrecv(top_halo_buf, nx, MPI_FLOAT, top, 0,
+                              bot_halo_buf, nx, MPI_FLOAT, bottom, 0, MPI_COMM_WORLD,
+                              MPI_STATUS_IGNORE));
+        CUDA_RT_CALL(cudaMemcpy(a_new + (iy_end * nx), bot_halo_buf, nx * sizeof(float), 
+                                cudaMemcpyHostToDevice));
+        nvtxRangePop();
+
+        nvtxRangePush("Halo exchange Memcpy+MPI");
+        // Second set of halo exchanges
+        // TODO: Part 1- Implement the Memcpy operations and MPI calls for the second set of
+        // halo exchanges
+        CUDA_RT_CALL(cudaMemcpy(bot_halo_buf, a_new + (iy_end - 1) * nx, nx * sizeof(float), 
+                                cudaMemcpyDeviceToHost));
+        MPI_CALL(MPI_Sendrecv(bot_halo_buf, nx, MPI_FLOAT, bottom, 0, 
+                                top_halo_buf, nx, MPI_FLOAT, top, 0, MPI_COMM_WORLD, 
+                                MPI_STATUS_IGNORE));
+        CUDA_RT_CALL(cudaMemcpy(a_new, top_halo_buf, nx * sizeof(float), 
+                                cudaMemcpyHostToDevice));
+        nvtxRangePop();
+
+        // TODO: Part 1- Reduce the rank-local L2 Norm to a global L2 norm using MPI_Allreduce
+        MPI_CALL(MPI_Allreduce(l2_norm_h, &l2_norm, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD));
+        
+        l2_norm = std::sqrt(l2_norm);
+
+        iter++;
+        if (0 == rank && (iter % 100) == 0) {
+            printf("%5d, %0.6f\n", iter, l2_norm);
+        }
+
+        std::swap(a_new, a);
+    }
+    double stop = MPI_Wtime();
+    nvtxRangePop();
+
+    CUDA_RT_CALL(cudaMemcpy(a_h + iy_start_global * nx, a + nx,
+                            std::min((ny - iy_start_global) * nx, chunk_size * nx) * sizeof(float),
+                            cudaMemcpyDeviceToHost));
+
+    int result_correct = 1;
+    if (!skip_single_gpu) {
+        for (int iy = iy_start_global; result_correct && (iy < iy_end_global); ++iy) {
+            for (int ix = 1; result_correct && (ix < (nx - 1)); ++ix) {
+                if (std::fabs(a_ref_h[iy * nx + ix] - a_h[iy * nx + ix]) > tol) {
+                    fprintf(stderr,
+                            "ERROR on rank %d: a[%d * %d + %d] = %f does not match %f "
+                            "(reference)\n",
+                            rank, iy, nx, ix, a_h[iy * nx + ix], a_ref_h[iy * nx + ix]);
+                    result_correct = 0;
+                }
+            }
+        }
+        int global_result_correct = 1;
+        MPI_CALL(MPI_Allreduce(&result_correct, &global_result_correct, 1, MPI_INT, MPI_MIN,
+                            MPI_COMM_WORLD));
+        result_correct = global_result_correct;
+    }
+
+    if (rank == 0 && result_correct) {
+        printf("Num GPUs: %d.\n", size);
+        if (!skip_single_gpu) {
+            printf(
+                "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, "
+                "efficiency: %8.2f \n",
+                nx, ny, runtime_serial, size, (stop - start), runtime_serial / (stop - start),
+                runtime_serial / (size * (stop - start)) * 100);
+        }
+        else {
+            printf("%dx%d: %d GPUs: %8.4f s \n", nx, ny, size, (stop - start)); 
+        }
+    }
+
+    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
+    CUDA_RT_CALL(cudaFree(l2_norm_d));
+
+    CUDA_RT_CALL(cudaFree(a_new));
+    CUDA_RT_CALL(cudaFree(a));
+
+    CUDA_RT_CALL(cudaFreeHost(a_h));
+    CUDA_RT_CALL(cudaFreeHost(a_ref_h));
+
+    MPI_CALL(MPI_Finalize());
+    return (result_correct == 1) ? 0 : 1;
+}
+
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h, bool print) {
+    float* a;
+    float* a_new;
+
+    float* l2_norm_d;
+    float* l2_norm_h;
+
+    int iy_start = 1;
+    int iy_end = (ny - 1);
+
+    CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(float)));
+
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(float)));
+
+    // Set Dirichlet boundary conditions on the left and right border
+    nvtxRangePush("Init boundaries");
+    launch_initialize_boundaries(a, a_new, PI, 0, nx, ny, ny);
+    CUDA_RT_CALL(cudaGetLastError());
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+    nvtxRangePop();
+
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    if (print) {
+        printf("Single GPU jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+    }
+
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    double start = omp_get_wtime();
+    nvtxRangePush("Jacobi Solve Single GPU");
+    while (l2_norm > tol && iter < iter_max) {
+        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(float)));
+
+        // Compute grid points for this iteration
+        launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, iy_end, nx);
+        CUDA_RT_CALL(cudaGetLastError());
+        CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));
+
+        // Apply periodic boundary conditions
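+        // (the last computed interior row wraps to the top boundary row 0, and the
+        // first interior row wraps to the bottom boundary row iy_end)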
+        CUDA_RT_CALL(cudaMemcpy(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+        CUDA_RT_CALL(cudaMemcpy(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+        l2_norm = *l2_norm_h;
+        l2_norm = std::sqrt(l2_norm);
+
+        iter++;
+        if ((iter % 100) == 0 && print) {
+            printf("%5d, %0.6f\n", iter, l2_norm);
+        }
+        std::swap(a_new, a);
+    }
+    nvtxRangePop();
+    double stop = omp_get_wtime();
+
+    CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(float), cudaMemcpyDeviceToHost));
+
+    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
+    CUDA_RT_CALL(cudaFree(l2_norm_d));
+
+    CUDA_RT_CALL(cudaFree(a_new));
+    CUDA_RT_CALL(cudaFree(a));
+    return (stop - start);
+}

+ 0 - 1
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/multi-gpu-programming-models

@@ -1 +0,0 @@
-Subproject commit 5c21895b8fa7ec36adc1f38f454a5e95dde9ae48

+ 42 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/nccl/Makefile

@@ -0,0 +1,42 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+NP ?= 1
+NVCC=nvcc
+MPICXX=mpicxx
+MPIRUN ?= mpirun
+#CUDA_HOME ?= /usr/local/cuda
+#NCCL_HOME ?= /usr
+GENCODE_SM30	:= -gencode arch=compute_30,code=sm_30
+GENCODE_SM35	:= -gencode arch=compute_35,code=sm_35
+GENCODE_SM37	:= -gencode arch=compute_37,code=sm_37
+GENCODE_SM50	:= -gencode arch=compute_50,code=sm_50
+GENCODE_SM52	:= -gencode arch=compute_52,code=sm_52
+GENCODE_SM60    := -gencode arch=compute_60,code=sm_60
+GENCODE_SM70    := -gencode arch=compute_70,code=sm_70
+GENCODE_SM80    := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80
+GENCODE_FLAGS	:= $(GENCODE_SM70) $(GENCODE_SM80)
+ifdef DISABLE_CUB
+        NVCC_FLAGS = -Xptxas --optimize-float-atomics
+else
+        NVCC_FLAGS = -DHAVE_CUB
+endif
+NVCC_FLAGS += -lineinfo $(GENCODE_FLAGS) -std=c++14
+MPICXX_FLAGS = -DUSE_NVTX -I$(CUDA_HOME)/include -I$(NCCL_HOME)/include -std=c++14
+LD_FLAGS = -L$(CUDA_HOME)/lib64 -L$(NCCL_HOME)/lib -lcudart -lnvToolsExt -lnccl
+jacobi: Makefile jacobi.cpp jacobi_kernels.o
+	$(MPICXX) $(MPICXX_FLAGS) jacobi.cpp jacobi_kernels.o $(LD_FLAGS) -o jacobi
+
+jacobi_kernels.o: Makefile jacobi_kernels.cu
+	$(NVCC) $(NVCC_FLAGS) jacobi_kernels.cu -c
+
+.PHONY: clean
+clean:
+	rm -f jacobi jacobi_kernels.o *.qdrep jacobi.*.compute-sanitizer.log
+
+sanitize: jacobi
+	$(MPIRUN) -np $(NP) compute-sanitizer --log-file jacobi.%q{OMPI_COMM_WORLD_RANK}.compute-sanitizer.log ./jacobi -niter 10
+
+run: jacobi
+	$(MPIRUN) -np $(NP) ./jacobi
+
+profile: jacobi
+	$(MPIRUN) -np $(NP) nsys profile --trace=mpi,cuda,nvtx -o jacobi.%q{OMPI_COMM_WORLD_RANK} ./jacobi -niter 10
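+
+# Illustrative usage (assumes a working MPI + NCCL installation and that CUDA_HOME and
+# NCCL_HOME are set in the environment):
+#   make jacobi          # build
+#   make run NP=2        # run on 2 ranks/GPUs
+#   make profile NP=2    # collect one Nsight Systems report per rank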

+ 117 - 36
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/jacobi.cpp

@@ -1,4 +1,4 @@
-/* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -92,12 +92,27 @@ const int num_colors = sizeof(colors) / sizeof(uint32_t);
                     #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
     }
 
+#include <nccl.h>
+
+#define NCCL_CALL(call)                                                                     \
+    {                                                                                       \
+        ncclResult_t  ncclStatus = call;                                                    \
+        if (ncclSuccess != ncclStatus)                                                      \
+            fprintf(stderr,                                                                 \
+                    "ERROR: NCCL call \"%s\" in line %d of file %s failed "                 \
+                    "with "                                                                 \
+                    "%s (%d).\n",                                                           \
+                    #call, __LINE__, __FILE__, ncclGetErrorString(ncclStatus), ncclStatus); \
+    }
+
 #ifdef USE_DOUBLE
 typedef double real;
 #define MPI_REAL_TYPE MPI_DOUBLE
+#define NCCL_REAL_TYPE ncclDouble
 #else
 typedef float real;
 #define MPI_REAL_TYPE MPI_FLOAT
+#define NCCL_REAL_TYPE ncclFloat
 #endif
 
 constexpr real tol = 1.0e-8;
@@ -110,7 +125,7 @@ void launch_initialize_boundaries(real* __restrict__ const a_new, real* __restri
 
 void launch_jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a,
                           real* __restrict__ const l2_norm, const int iy_start, const int iy_end,
-                          const int nx, const bool calculate_norm);
+                          const int nx, const bool calculate_norm, cudaStream_t stream);
 
 double single_gpu(const int nx, const int ny, const int iter_max, real* const a_ref_h,
                   const int nccheck, const bool print);
@@ -140,8 +155,10 @@ int main(int argc, char* argv[]) {
     MPI_CALL(MPI_Comm_rank(MPI_COMM_WORLD, &rank));
     int size;
     MPI_CALL(MPI_Comm_size(MPI_COMM_WORLD, &size));
-    int num_devices = 0;
-    cudaGetDeviceCount(&num_devices);
+
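+    // A single ncclUniqueId is created on rank 0 and broadcast over MPI so that every
+    // rank can join the same NCCL communicator in ncclCommInitRank below.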
+    ncclUniqueId nccl_uid;
+    if (rank == 0) NCCL_CALL(ncclGetUniqueId(&nccl_uid));
+    MPI_CALL(MPI_Bcast(&nccl_uid, sizeof(ncclUniqueId), MPI_BYTE, 0, MPI_COMM_WORLD));
 
     const int iter_max = get_argval<int>(argv, argv + argc, "-niter", 1000);
     const int nccheck = get_argval<int>(argv, argv + argc, "-nccheck", 1);
@@ -160,9 +177,20 @@ int main(int argc, char* argv[]) {
         MPI_CALL(MPI_Comm_free(&local_comm));
     }
 
-    CUDA_RT_CALL(cudaSetDevice(local_rank % num_devices));
+    CUDA_RT_CALL(cudaSetDevice(local_rank));
     CUDA_RT_CALL(cudaFree(0));
 
+    ncclComm_t nccl_comm;
+    NCCL_CALL(ncclCommInitRank(&nccl_comm, size, nccl_uid, rank));
+    int nccl_version = 0;
+    NCCL_CALL(ncclGetVersion(&nccl_version));
+    if ( nccl_version < 2800 ) {
+        fprintf(stderr,"ERROR NCCL 2.8 or newer is required.\n");
+        NCCL_CALL(ncclCommDestroy(nccl_comm));
+        MPI_CALL(MPI_Finalize());
+        return 1;
+    }
+
     real* a_ref_h;
     CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(real)));
     real* a_h;
@@ -210,18 +238,37 @@ int main(int argc, char* argv[]) {
     launch_initialize_boundaries(a, a_new, PI, iy_start_global - 1, nx, (chunk_size + 2), ny);
     CUDA_RT_CALL(cudaDeviceSynchronize());
 
+    cudaStream_t compute_stream;
+    CUDA_RT_CALL(cudaStreamCreate(&compute_stream));
+    cudaEvent_t compute_done;
+    CUDA_RT_CALL(cudaEventCreateWithFlags(&compute_done, cudaEventDisableTiming));
+
     real* l2_norm_d;
     CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(real)));
     real* l2_norm_h;
     CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real)));
 
+    PUSH_RANGE("NCCL_Warmup", 5)
+    for (int i = 0; i < 10; ++i) {
+        const int top = rank > 0 ? rank - 1 : (size - 1);
+        const int bottom = (rank + 1) % size;
+        NCCL_CALL(ncclGroupStart());
+        NCCL_CALL(ncclRecv(a_new,                     nx, NCCL_REAL_TYPE, top,    nccl_comm, compute_stream));
+        NCCL_CALL(ncclSend(a_new + (iy_end - 1) * nx, nx, NCCL_REAL_TYPE, bottom, nccl_comm, compute_stream));
+        NCCL_CALL(ncclRecv(a_new + (iy_end * nx),     nx, NCCL_REAL_TYPE, bottom, nccl_comm, compute_stream));
+        NCCL_CALL(ncclSend(a_new + iy_start * nx,     nx, NCCL_REAL_TYPE, top,    nccl_comm, compute_stream));
+        NCCL_CALL(ncclGroupEnd());
+        CUDA_RT_CALL(cudaStreamSynchronize(compute_stream));
+    }
+    POP_RANGE
+
     CUDA_RT_CALL(cudaDeviceSynchronize());
 
     if (!csv && 0 == rank) {
         printf(
             "Jacobi relaxation: %d iterations on %d x %d mesh with norm check "
             "every %d iterations\n",
-            iter_max, ny, nx, nccheck);
+            iter_max, nx, ny, nccheck);
     }
 
     int iter = 0;
@@ -233,30 +280,34 @@ int main(int argc, char* argv[]) {
     double start = MPI_Wtime();
     PUSH_RANGE("Jacobi solve", 0)
     while (l2_norm > tol && iter < iter_max) {
-        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(real)));
+        CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d, 0, sizeof(real), compute_stream));
 
         calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0);
 
-        launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, iy_end, nx, calculate_norm);
+        launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, iy_end, nx, calculate_norm,
+                             compute_stream);
+        CUDA_RT_CALL(cudaEventRecord(compute_done, compute_stream));
 
         if (calculate_norm) {
-            CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost));
+            CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost,
+                                         compute_stream));
         }
 
         const int top = rank > 0 ? rank - 1 : (size - 1);
         const int bottom = (rank + 1) % size;
 
         // Apply periodic boundary conditions
-	CUDA_RT_CALL(cudaDeviceSynchronize());
-        PUSH_RANGE("MPI", 5)
-        MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0,
-                              a_new + (iy_end * nx), nx, MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD,
-                              MPI_STATUS_IGNORE));
-        MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx,
-                              MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
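+        // NCCL sends/receives operate directly on device memory and are enqueued on
+        // compute_stream, so no host staging copies or explicit device synchronization
+        // are needed before the halo exchange.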
+        PUSH_RANGE("NCCL_LAUNCH", 5)
+        NCCL_CALL(ncclGroupStart());
+        NCCL_CALL(ncclRecv(a_new,                     nx, NCCL_REAL_TYPE, top,    nccl_comm, compute_stream));
+        NCCL_CALL(ncclSend(a_new + (iy_end - 1) * nx, nx, NCCL_REAL_TYPE, bottom, nccl_comm, compute_stream));
+        NCCL_CALL(ncclRecv(a_new + (iy_end * nx),     nx, NCCL_REAL_TYPE, bottom, nccl_comm, compute_stream));
+        NCCL_CALL(ncclSend(a_new + iy_start * nx,     nx, NCCL_REAL_TYPE, top,    nccl_comm, compute_stream));
+        NCCL_CALL(ncclGroupEnd());
         POP_RANGE
 
         if (calculate_norm) {
+            CUDA_RT_CALL(cudaStreamSynchronize(compute_stream));
             MPI_CALL(MPI_Allreduce(l2_norm_h, &l2_norm, 1, MPI_REAL_TYPE, MPI_SUM, MPI_COMM_WORLD));
             l2_norm = std::sqrt(l2_norm);
 
@@ -268,6 +319,7 @@ int main(int argc, char* argv[]) {
         std::swap(a_new, a);
         iter++;
     }
+    CUDA_RT_CALL(cudaDeviceSynchronize());
     double stop = MPI_Wtime();
     POP_RANGE
 
@@ -295,17 +347,19 @@ int main(int argc, char* argv[]) {
 
     if (rank == 0 && result_correct) {
         if (csv) {
-            printf("mpi, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
+            printf("nccl, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
                    (stop - start), runtime_serial);
         } else {
             printf("Num GPUs: %d.\n", size);
             printf(
                 "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, "
                 "efficiency: %8.2f \n",
-                ny, nx, runtime_serial, size, (stop - start), runtime_serial / (stop - start),
+                nx, ny, runtime_serial, size, (stop - start), runtime_serial / (stop - start),
                 runtime_serial / (size * (stop - start)) * 100);
         }
     }
+    CUDA_RT_CALL(cudaEventDestroy(compute_done));
+    CUDA_RT_CALL(cudaStreamDestroy(compute_stream));
 
     CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
     CUDA_RT_CALL(cudaFree(l2_norm_d));
@@ -316,6 +370,8 @@ int main(int argc, char* argv[]) {
     CUDA_RT_CALL(cudaFreeHost(a_h));
     CUDA_RT_CALL(cudaFreeHost(a_ref_h));
 
+    NCCL_CALL(ncclCommDestroy(nccl_comm));
+
     MPI_CALL(MPI_Finalize());
     return (result_correct == 1) ? 0 : 1;
 }
@@ -325,6 +381,13 @@ double single_gpu(const int nx, const int ny, const int iter_max, real* const a_
     real* a;
     real* a_new;
 
+    cudaStream_t compute_stream;
+    cudaStream_t push_top_stream;
+    cudaStream_t push_bottom_stream;
+    cudaEvent_t compute_done;
+    cudaEvent_t push_top_done;
+    cudaEvent_t push_bottom_done;
+
     real* l2_norm_d;
     real* l2_norm_h;
 
@@ -339,9 +402,15 @@ double single_gpu(const int nx, const int ny, const int iter_max, real* const a_
 
     // Set diriclet boundary conditions on left and right boarder
     launch_initialize_boundaries(a, a_new, PI, 0, nx, ny, ny);
-    CUDA_RT_CALL(cudaGetLastError());
     CUDA_RT_CALL(cudaDeviceSynchronize());
 
+    CUDA_RT_CALL(cudaStreamCreate(&compute_stream));
+    CUDA_RT_CALL(cudaStreamCreate(&push_top_stream));
+    CUDA_RT_CALL(cudaStreamCreate(&push_bottom_stream));
+    CUDA_RT_CALL(cudaEventCreateWithFlags(&compute_done, cudaEventDisableTiming));
+    CUDA_RT_CALL(cudaEventCreateWithFlags(&push_top_done, cudaEventDisableTiming));
+    CUDA_RT_CALL(cudaEventCreateWithFlags(&push_bottom_done, cudaEventDisableTiming));
+
     CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(real)));
     CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real)));
 
@@ -352,39 +421,44 @@ double single_gpu(const int nx, const int ny, const int iter_max, real* const a_
             "Single GPU jacobi relaxation: %d iterations on %d x %d mesh with "
             "norm "
             "check every %d iterations\n",
-            iter_max, ny, nx, nccheck);
-
-    constexpr int dim_block_x = 32;
-    constexpr int dim_block_y = 32;
-    dim3 dim_grid((nx + dim_block_x - 1) / dim_block_x, (ny + dim_block_y - 1) / dim_block_y, 1);
+            iter_max, nx, ny, nccheck);
 
     int iter = 0;
-    bool calculate_norm;
     real l2_norm = 1.0;
+    bool calculate_norm;
 
     double start = MPI_Wtime();
     PUSH_RANGE("Jacobi solve", 0)
     while (l2_norm > tol && iter < iter_max) {
-        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(real)));
+        CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d, 0, sizeof(real), compute_stream));
 
-        calculate_norm = (iter % nccheck) == 0 || (print && ((iter % 100) == 0));
-        launch_jacobi_kernel(
-                a_new, a, l2_norm_d, iy_start, iy_end, nx, calculate_norm);
-        CUDA_RT_CALL(cudaGetLastError());
+        CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_top_done, 0));
+        CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_bottom_done, 0));
+
+        calculate_norm = (iter % nccheck) == 0 || (iter % 100) == 0;
+        launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, iy_end, nx, calculate_norm,
+                             compute_stream);
+        CUDA_RT_CALL(cudaEventRecord(compute_done, compute_stream));
 
         if (calculate_norm) {
-            CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost));
+            CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost,
+                                         compute_stream));
         }
 
         // Apply periodic boundary conditions
 
-        CUDA_RT_CALL(cudaMemcpy(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(real),
-                                     cudaMemcpyDeviceToDevice));
-        CUDA_RT_CALL(cudaMemcpy(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(real),
-                                     cudaMemcpyDeviceToDevice));
+        CUDA_RT_CALL(cudaStreamWaitEvent(push_top_stream, compute_done, 0));
+        CUDA_RT_CALL(cudaMemcpyAsync(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(real),
+                                     cudaMemcpyDeviceToDevice, push_top_stream));
+        CUDA_RT_CALL(cudaEventRecord(push_top_done, push_top_stream));
+
+        CUDA_RT_CALL(cudaStreamWaitEvent(push_bottom_stream, compute_done, 0));
+        CUDA_RT_CALL(cudaMemcpyAsync(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(real),
+                                     cudaMemcpyDeviceToDevice, push_bottom_stream));
+        CUDA_RT_CALL(cudaEventRecord(push_bottom_done, push_bottom_stream));
 
         if (calculate_norm) {
-	    CUDA_RT_CALL(cudaDeviceSynchronize());
+            CUDA_RT_CALL(cudaStreamSynchronize(compute_stream));
             l2_norm = *l2_norm_h;
             l2_norm = std::sqrt(l2_norm);
             if (print && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm);
@@ -398,6 +472,13 @@ double single_gpu(const int nx, const int ny, const int iter_max, real* const a_
 
     CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(real), cudaMemcpyDeviceToHost));
 
+    CUDA_RT_CALL(cudaEventDestroy(push_bottom_done));
+    CUDA_RT_CALL(cudaEventDestroy(push_top_done));
+    CUDA_RT_CALL(cudaEventDestroy(compute_done));
+    CUDA_RT_CALL(cudaStreamDestroy(push_bottom_stream));
+    CUDA_RT_CALL(cudaStreamDestroy(push_top_stream));
+    CUDA_RT_CALL(cudaStreamDestroy(compute_stream));
+
     CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
     CUDA_RT_CALL(cudaFree(l2_norm_d));
 

+ 113 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/nccl/jacobi_kernels.cu

@@ -0,0 +1,113 @@
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <cstdio>
+
+#ifdef HAVE_CUB
+#include <cub/block/block_reduce.cuh>
+#endif  // HAVE_CUB
+
+#define CUDA_RT_CALL(call)                                                                  \
+    {                                                                                       \
+        cudaError_t cudaStatus = call;                                                      \
+        if (cudaSuccess != cudaStatus)                                                      \
+            fprintf(stderr,                                                                 \
+                    "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
+                    "with "                                                                 \
+                    "%s (%d).\n",                                                           \
+                    #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
+    }
+
+#ifdef USE_DOUBLE
+typedef double real;
+#define MPI_REAL_TYPE MPI_DOUBLE
+#else
+typedef float real;
+#define MPI_REAL_TYPE MPI_FLOAT
+#endif
+
+__global__ void initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a,
+                                      const real pi, const int offset, const int nx,
+                                      const int my_ny, const int ny) {
+    for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) {
+        const real y0 = sin(2.0 * pi * (offset + iy) / (ny - 1));
+        a[iy * nx + 0] = y0;
+        a[iy * nx + (nx - 1)] = y0;
+        a_new[iy * nx + 0] = y0;
+        a_new[iy * nx + (nx - 1)] = y0;
+    }
+}
+
+void launch_initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a,
+                                  const real pi, const int offset, const int nx, const int my_ny,
+                                  const int ny) {
+    initialize_boundaries<<<my_ny / 128 + 1, 128>>>(a_new, a, pi, offset, nx, my_ny, ny);
+    CUDA_RT_CALL(cudaGetLastError());
+}
+
+template <int BLOCK_DIM_X, int BLOCK_DIM_Y>
+__global__ void jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a,
+                              real* __restrict__ const l2_norm, const int iy_start,
+                              const int iy_end, const int nx, const bool calculate_norm) {
+#ifdef HAVE_CUB
+    typedef cub::BlockReduce<real, BLOCK_DIM_X, cub::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y>
+        BlockReduce;
+    __shared__ typename BlockReduce::TempStorage temp_storage;
+#endif  // HAVE_CUB
+    int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start;
+    int ix = blockIdx.x * blockDim.x + threadIdx.x + 1;
+    real local_l2_norm = 0.0;
+
+    if (iy < iy_end && ix < (nx - 1)) {
+        const real new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] +
+                                     a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]);
+        a_new[iy * nx + ix] = new_val;
+        if (calculate_norm) {
+            real residue = new_val - a[iy * nx + ix];
+            local_l2_norm += residue * residue;
+        }
+    }
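+    // Accumulate the squared residuals: with CUB, a block-wide reduction followed by a
+    // single atomicAdd per block; without CUB, one atomicAdd per thread.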
+    if (calculate_norm) {
+#ifdef HAVE_CUB
+        real block_l2_norm = BlockReduce(temp_storage).Sum(local_l2_norm);
+        if (0 == threadIdx.y && 0 == threadIdx.x) atomicAdd(l2_norm, block_l2_norm);
+#else
+        atomicAdd(l2_norm, local_l2_norm);
+#endif  // HAVE_CUB
+    }
+}
+
+void launch_jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a,
+                          real* __restrict__ const l2_norm, const int iy_start, const int iy_end,
+                          const int nx, const bool calculate_norm, cudaStream_t stream) {
+    constexpr int dim_block_x = 32;
+    constexpr int dim_block_y = 32;
+    dim3 dim_grid((nx + dim_block_x - 1) / dim_block_x,
+                  ((iy_end - iy_start) + dim_block_y - 1) / dim_block_y, 1);
+    jacobi_kernel<dim_block_x, dim_block_y><<<dim_grid, {dim_block_x, dim_block_y, 1}, 0, stream>>>(
+        a_new, a, l2_norm, iy_start, iy_end, nx, calculate_norm);
+    CUDA_RT_CALL(cudaGetLastError());
+}

+ 2 - 2
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Makefile

@@ -32,8 +32,8 @@
 ################################################################################
 
 # Location of the CUDA Toolkit
-CUDA_PATH ?= /opt/nvidia/hpc_sdk/Linux_x86_64/21.3/cuda/11.2/
-
+CUDA_PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/21.5/cuda
+GCC=
 ##############################
 # start deprecated interface #
 ##############################

BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.o


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/single_gpu/jacobi


+ 2 - 2
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/single_gpu/jacobi.cu

@@ -122,7 +122,7 @@ int main(int argc, char* argv[]) {
     
     double runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h);
 
-    printf("%dx%d: 1 GPU: %8.4f s\n", ny, nx, runtime_serial);
+    printf("%dx%d: 1 GPU: %8.4f s\n", nx, ny, runtime_serial);
 
     return 0;
 }
@@ -155,7 +155,7 @@ double single_gpu(const int nx, const int ny, const int iter_max, float* const a
 
     CUDA_RT_CALL(cudaDeviceSynchronize());
 
-    printf("Single GPU jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, ny, nx);
+    printf("Single GPU jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
 
     dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X, (ny + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
     dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);

+ 0 - 315
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/single_gpu/jacobi.cu.old

@@ -1,315 +0,0 @@
-/* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *  * Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  * Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *  * Neither the name of NVIDIA CORPORATION nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-#include <algorithm>
-#include <array>
-#include <climits>
-#include <cmath>
-#include <cstdio>
-#include <iostream>
-#include <iterator>
-#include <sstream>
-
-#include <omp.h>
-
-#ifdef HAVE_CUB
-#include <cub/block/block_reduce.cuh>
-#endif  // HAVE_CUB
-
-#ifdef USE_NVTX
-#include <nvToolsExt.h>
-
-const uint32_t colors[] = {0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff,
-                           0x0000ffff, 0x00ff0000, 0x00ffffff};
-const int num_colors = sizeof(colors) / sizeof(uint32_t);
-
-#define PUSH_RANGE(name, cid)                              \
-    {                                                      \
-        int color_id = cid;                                \
-        color_id = color_id % num_colors;                  \
-        nvtxEventAttributes_t eventAttrib = {0};           \
-        eventAttrib.version = NVTX_VERSION;                \
-        eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;  \
-        eventAttrib.colorType = NVTX_COLOR_ARGB;           \
-        eventAttrib.color = colors[color_id];              \
-        eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
-        eventAttrib.message.ascii = name;                  \
-        nvtxRangePushEx(&eventAttrib);                     \
-    }
-#define POP_RANGE nvtxRangePop();
-#else
-#define PUSH_RANGE(name, cid)
-#define POP_RANGE
-#endif
-
-#define CUDA_RT_CALL(call)                                                                  \
-    {                                                                                       \
-        cudaError_t cudaStatus = call;                                                      \
-        if (cudaSuccess != cudaStatus)                                                      \
-            fprintf(stderr,                                                                 \
-                    "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
-                    "with "                                                                 \
-                    "%s (%d).\n",                                                           \
-                    #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
-    }
-
-typedef float real;
-constexpr real tol = 1.0e-8;
-
-const real PI = 2.0 * std::asin(1.0);
-
-__global__ void initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a,
-                                      const real pi, const int nx, const int ny) {
-    for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < ny; iy += blockDim.x * gridDim.x) {
-        const real y0 = sin(2.0 * pi * iy / (ny - 1));
-        a[iy * nx + 0] = y0;
-        a[iy * nx + (nx - 1)] = y0;
-        a_new[iy * nx + 0] = y0;
-        a_new[iy * nx + (nx - 1)] = y0;
-    }
-}
-
-template <int BLOCK_DIM_X, int BLOCK_DIM_Y>
-__global__ void jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a,
-                              real* __restrict__ const l2_norm, const int iy_start,
-                              const int iy_end, const int nx) {
-#ifdef HAVE_CUB
-    typedef cub::BlockReduce<real, BLOCK_DIM_X, cub::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y>
-        BlockReduce;
-    __shared__ typename BlockReduce::TempStorage temp_storage;
-#endif  // HAVE_CUB
-    const int iy = blockIdx.y * blockDim.y + threadIdx.y + 1;
-    const int ix = blockIdx.x * blockDim.x + threadIdx.x;
-    real local_l2_norm = 0.0;
-
-    if (iy < iy_end) {
-        if (ix >= 1 && ix < (nx - 1)) {
-            const real new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] +
-                                         a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]);
-            a_new[iy * nx + ix] = new_val;
-
-            // apply boundary conditions
-            if (iy_start == iy) {
-                a_new[iy_end * nx + ix] = new_val;
-            }
-
-            if ((iy_end - 1) == iy) {
-                a_new[(iy_start - 1) * nx + ix] = new_val;
-            }
-
-            real residue = new_val - a[iy * nx + ix];
-            local_l2_norm = residue * residue;
-        }
-    }
-#ifdef HAVE_CUB
-    real block_l2_norm = BlockReduce(temp_storage).Sum(local_l2_norm);
-    if (0 == threadIdx.y && 0 == threadIdx.x) atomicAdd(l2_norm, block_l2_norm);
-#else
-    atomicAdd(l2_norm, local_l2_norm);
-#endif  // HAVE_CUB
-}
-
-double noopt(const int nx, const int ny, const int iter_max, real* const a_ref_h, const int nccheck,
-             const bool print);
-
-template <typename T>
-T get_argval(char** begin, char** end, const std::string& arg, const T default_val) {
-    T argval = default_val;
-    char** itr = std::find(begin, end, arg);
-    if (itr != end && ++itr != end) {
-        std::istringstream inbuf(*itr);
-        inbuf >> argval;
-    }
-    return argval;
-}
-
-bool get_arg(char** begin, char** end, const std::string& arg) {
-    char** itr = std::find(begin, end, arg);
-    if (itr != end) {
-        return true;
-    }
-    return false;
-}
-
-struct l2_norm_buf {
-    cudaEvent_t copy_done;
-    real* d;
-    real* h;
-};
-
-int main(int argc, char* argv[]) {
-    const int iter_max = get_argval<int>(argv, argv + argc, "-niter", 1000);
-    const int nccheck = get_argval<int>(argv, argv + argc, "-nccheck", 1);
-    const int nx = get_argval<int>(argv, argv + argc, "-nx", 16384);
-    const int ny = get_argval<int>(argv, argv + argc, "-ny", 16384);
-    const bool csv = get_arg(argv, argv + argc, "-csv");
-
-    if (nccheck != 1) {
-        fprintf(stderr, "Only nccheck = 1 is supported\n");
-        return -1;
-    }
-
-    real* a;
-    real* a_new;
-
-    cudaStream_t compute_stream;
-    cudaStream_t copy_l2_norm_stream;
-    cudaStream_t reset_l2_norm_stream;
-
-    cudaEvent_t compute_done;
-    cudaEvent_t reset_l2_norm_done[2];
-
-    real l2_norms[2];
-    l2_norm_buf l2_norm_bufs[2];
-
-    int iy_start = 1;
-    int iy_end = (ny - 1);
-
-    CUDA_RT_CALL(cudaSetDevice(0));
-    CUDA_RT_CALL(cudaFree(0));
-
-    CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(real)));
-    CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(real)));
-
-    CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(real)));
-    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(real)));
-
-    // Set diriclet boundary conditions on left and right boarder
-    initialize_boundaries<<<ny / 128 + 1, 128>>>(a, a_new, PI, nx, ny);
-    CUDA_RT_CALL(cudaGetLastError());
-    CUDA_RT_CALL(cudaDeviceSynchronize());
-
-    CUDA_RT_CALL(cudaStreamCreate(&compute_stream));
-    CUDA_RT_CALL(cudaStreamCreate(&copy_l2_norm_stream));
-    CUDA_RT_CALL(cudaStreamCreate(&reset_l2_norm_stream));
-    CUDA_RT_CALL(cudaEventCreateWithFlags(&compute_done, cudaEventDisableTiming));
-    CUDA_RT_CALL(cudaEventCreateWithFlags(&reset_l2_norm_done[0], cudaEventDisableTiming));
-    CUDA_RT_CALL(cudaEventCreateWithFlags(&reset_l2_norm_done[1], cudaEventDisableTiming));
-
-    for (int i = 0; i < 2; ++i) {
-        CUDA_RT_CALL(cudaEventCreateWithFlags(&l2_norm_bufs[i].copy_done, cudaEventDisableTiming));
-        CUDA_RT_CALL(cudaMalloc(&l2_norm_bufs[i].d, sizeof(real)));
-        CUDA_RT_CALL(cudaMemset(l2_norm_bufs[i].d, 0, sizeof(real)));
-        CUDA_RT_CALL(cudaMallocHost(&l2_norm_bufs[i].h, sizeof(real)));
-        (*l2_norm_bufs[i].h) = 1.0;
-    }
-
-    CUDA_RT_CALL(cudaDeviceSynchronize());
-
-    if (!csv)
-        printf(
-            "Jacobi relaxation: %d iterations on %d x %d mesh with norm check "
-            "every %d iterations\n",
-            iter_max, ny, nx, nccheck);
-
-    constexpr int dim_block_x = 32;
-    constexpr int dim_block_y = 32;
-    dim3 dim_grid((nx + dim_block_x - 1) / dim_block_x, (ny + dim_block_y - 1) / dim_block_y, 1);
-
-    int iter = 0;
-    for (int i = 0; i < 2; ++i) {
-        l2_norms[i] = 0.0;
-    }
-
-    double start = omp_get_wtime();
-
-    PUSH_RANGE("Jacobi solve", 0)
-
-    bool l2_norm_greater_than_tol = true;
-    while (l2_norm_greater_than_tol && iter < iter_max) {
-        // on new iteration: old current vars are now previous vars, old
-        // previous vars are no longer needed
-        int prev = iter % 2;
-        int curr = (iter + 1) % 2;
-
-        // wait for memset from old previous iteration to complete
-        CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, reset_l2_norm_done[curr], 0));
-
-        jacobi_kernel<dim_block_x, dim_block_y>
-            <<<dim_grid, {dim_block_x, dim_block_y, 1}, 0, compute_stream>>>(
-                a_new, a, l2_norm_bufs[curr].d, iy_start, iy_end, nx);
-        CUDA_RT_CALL(cudaGetLastError());
-        CUDA_RT_CALL(cudaEventRecord(compute_done, compute_stream));
-
-        // perform L2 norm calculation
-        if ((iter % nccheck) == 0 || (!csv && (iter % 100) == 0)) {
-            CUDA_RT_CALL(cudaStreamWaitEvent(copy_l2_norm_stream, compute_done, 0));
-            CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_bufs[curr].h, l2_norm_bufs[curr].d, sizeof(real),
-                                         cudaMemcpyDeviceToHost, copy_l2_norm_stream));
-            CUDA_RT_CALL(cudaEventRecord(l2_norm_bufs[curr].copy_done, copy_l2_norm_stream));
-
-            // make sure D2H copy is complete before using the data for
-            // calculation
-            CUDA_RT_CALL(cudaEventSynchronize(l2_norm_bufs[prev].copy_done));
-
-            l2_norms[prev] = *(l2_norm_bufs[prev].h);
-            l2_norms[prev] = std::sqrt(l2_norms[prev]);
-            l2_norm_greater_than_tol = (l2_norms[prev] > tol);
-
-            if (!csv && (iter % 100) == 0) {
-                printf("%5d, %0.6f\n", iter, l2_norms[prev]);
-            }
-
-            // reset everything for next iteration
-            l2_norms[prev] = 0.0;
-            *(l2_norm_bufs[prev].h) = 0.0;
-            CUDA_RT_CALL(
-                cudaMemsetAsync(l2_norm_bufs[prev].d, 0, sizeof(real), reset_l2_norm_stream));
-            CUDA_RT_CALL(cudaEventRecord(reset_l2_norm_done[prev], reset_l2_norm_stream));
-        }
-
-        std::swap(a_new, a);
-        iter++;
-    }
-    CUDA_RT_CALL(cudaDeviceSynchronize());
-    POP_RANGE
-    double stop = omp_get_wtime();
-
-    if (csv) {
-        printf("single_gpu, %d, %d, %d, %d, %f\n", nx, ny, iter_max, nccheck, (stop - start));
-    } else {
-        printf("%dx%d: 1 GPU: %8.4f s\n", ny, nx, (stop - start));
-    }
-
-    for (int i = 0; i < 2; ++i) {
-        CUDA_RT_CALL(cudaFreeHost(l2_norm_bufs[i].h));
-        CUDA_RT_CALL(cudaFree(l2_norm_bufs[i].d));
-        CUDA_RT_CALL(cudaEventDestroy(l2_norm_bufs[i].copy_done));
-    }
-
-    CUDA_RT_CALL(cudaEventDestroy(reset_l2_norm_done[1]));
-    CUDA_RT_CALL(cudaEventDestroy(reset_l2_norm_done[0]));
-    CUDA_RT_CALL(cudaEventDestroy(compute_done));
-
-    CUDA_RT_CALL(cudaStreamDestroy(reset_l2_norm_stream));
-    CUDA_RT_CALL(cudaStreamDestroy(copy_l2_norm_stream));
-    CUDA_RT_CALL(cudaStreamDestroy(compute_stream));
-
-    CUDA_RT_CALL(cudaFree(a_new));
-    CUDA_RT_CALL(cudaFree(a));
-
-    return 0;
-}

BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/single_gpu/jacobi_report.qdrep


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/single_gpu/jacobi_report.sqlite


+ 0 - 13
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/single_gpu/temp

@@ -1,13 +0,0 @@
-Single GPU jacobi relaxation: 100 iterations on 8192 x 8192 mesh
-    0, 22.626005
-   10, 3.374940
-   20, 2.069380
-   30, 1.542849
-   40, 1.250118
-   50, 1.060773
-   60, 0.927187
-   70, 0.827260
-   80, 0.749264
-   90, 0.686587
-8192x8192: 1 GPU:  16.0760 s
-

+ 22 - 23
hpc/multi_gpu_nways/labs/CFD/English/introduction.ipynb

@@ -13,18 +13,14 @@
     "By the end of this bootcamp session, participants will be adept at:\n",
     "* Reviewing communication architecture and topology\n",
     "* Developing CUDA-aware multi-node multi-GPU MPI applications\n",
-    "* Profiling the application using Nsight Systems and HPCToolkit\n",
-    "* Applying optimizations like CUDA streams and overlapping compute and communication\n",
+    "* Profiling the application using NVIDIA Nsight Systems\n",
+    "* Applying optimizations like CUDA streams, events, and overlapping compute and communication\n",
     "* Understanding GPUDirect technologies like P2P and RDMA\n",
-    "* Learning and using NVIDIA NCCL and NVSHMEM libraries"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Tutorial Duration\n",
-    "The lab will take 8 hours to complete. Link to download all materials will be available at the end of the lab.\n",
+    "* Learning to use NVIDIA NCCL and NVSHMEM libraries\n",
+    "\n",
+    "### Bootcamp Duration\n",
+    "\n",
+    "The bootcamp will take 8 hours to complete. Link to download all materials will be available at the end of the lab.\n",
     "\n",
     "### Content Level\n",
     "Intermediate, Advanced\n",
@@ -38,21 +34,24 @@
     "\n",
     "We will take up the Jacobi Solver, an iterative technique for solving system of linear equations, in this tutorial. To begin, click on the first link below:\n",
     "\n",
-    "1. Single Node:\n",
-    "    * [Overview of single-GPU code and Nsight Systems Profiler](C/jupyter_notebook/single_gpu/single_gpu_overview.ipynb)\n",
-    "    * [Multi-GPU using CUDA streams](C/jupyter_notebook/memcpy/streams.ipynb)\n",
-    "    * Multi-GPU using normal and CUDA-aware MPI\n",
-    "2. Multi Node:\n",
-    "    * CUDA-aware MPI and introduction to HPCToolkit\n",
-    "    * Optimizations: computation-communication overlap\n",
-    "5. NCCL Library \n",
-    "6. NVHSMEM Library\n",
-    "7. Final remarks\n",
+    "1. [Overview of single-GPU code and Nsight Systems Profiler](C/jupyter_notebook/single_gpu/single_gpu_overview.ipynb)\n",
+    "2. Single Node :Multi-GPU:\n",
+    "    * [CUDA Memcpy and Peer-to-Peer Memory Access](C/jupyter_notebook/cuda/memcpy.ipynb)\n",
+    "    * [Intra-node topology](C/jupyter_notebook/advanced_concepts/single_node_topology.ipynb)\n",
+    "    * [CUDA Streams and Events](C/jupyter_notebook/cuda/streams.ipynb)\n",
+    "3. Multi-Node Multi-GPU:\n",
+    "    * [Introduction to MPI and Multi-Node execution overview](C/jupyter_notebook/mpi/multi_node_intro.ipynb)\n",
+    "    * [MPI with CUDA Memcpy](C/jupyter_notebook/mpi/memcpy.ipynb)\n",
+    "    * [CUDA-aware MPI](C/jupyter_notebook/mpi/cuda_aware.ipynb)\n",
+    "    * [Supplemental: Configuring MPI in a containerized environment](C/jupyter_notebook/mpi/containers_and_mpi.ipynb)\n",
+    "4. [NVIDIA Collectives Communications Library (NCCL)](C/jupyter_notebook/nccl/nccl/ipynb)\n",
+    "5. NVHSMEM Library\n",
+    "6. Final remarks\n",
     "--- \n",
     "\n",
     "## Licensing \n",
     "\n",
-    "This material is released by NVIDIA Corporation under the Creative Commons Attribution 4.0 International (CC BY 4.0). "
+    "This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0)."
    ]
   }
  ],
@@ -72,7 +71,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.5"
+   "version": "3.9.5"
   }
  },
  "nbformat": 4,

+ 0 - 23
hpc/multi_gpu_nways/labs/profiler/English/LICENSE

@@ -1,23 +0,0 @@
-Copyright (c) 2018, National Center for Computational Sciences, Oak Ridge National Laboratory
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-* Redistributions of source code must retain the above copyright notice, this
-  list of conditions and the following disclaimer.
-
-* Redistributions in binary form must reproduce the above copyright notice,
-  this list of conditions and the following disclaimer in the documentation
-  and/or other materials provided with the distribution.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

BIN
hpc/multi_gpu_nways/labs/profiler/English/jupyter_notebook/images/Nsight Diagram.png


BIN
hpc/multi_gpu_nways/labs/profiler/English/jupyter_notebook/images/Optimization_Cycle.jpg


BIN
hpc/multi_gpu_nways/labs/profiler/English/jupyter_notebook/images/UM.png


BIN
hpc/multi_gpu_nways/labs/profiler/English/jupyter_notebook/images/allsection-compute.png


BIN
hpc/multi_gpu_nways/labs/profiler/English/jupyter_notebook/images/baseline-compute.png


BIN
hpc/multi_gpu_nways/labs/profiler/English/jupyter_notebook/images/charts-compute.png


BIN
hpc/multi_gpu_nways/labs/profiler/English/jupyter_notebook/images/cli-out.png


BIN
hpc/multi_gpu_nways/labs/profiler/English/jupyter_notebook/images/collapse_feedback.png


BIN
hpc/multi_gpu_nways/labs/profiler/English/jupyter_notebook/images/collapse_pre.png


BIN
hpc/multi_gpu_nways/labs/profiler/English/jupyter_notebook/images/collapse_thread.png


BIN
hpc/multi_gpu_nways/labs/profiler/English/jupyter_notebook/images/compute.png


BIN
hpc/multi_gpu_nways/labs/profiler/English/jupyter_notebook/images/compute_analyz.png


BIN
hpc/multi_gpu_nways/labs/profiler/English/jupyter_notebook/images/compute_command.png


+ 0 - 0
hpc/multi_gpu_nways/labs/profiler/English/jupyter_notebook/images/compute_command_line.png


Some files were not shown because too many files changed in this diff